chronicle-etl 0.1.1 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +8 -0
  3. data/.ruby-version +1 -0
  4. data/.yardopts +1 -0
  5. data/CHANGELOG.md +23 -0
  6. data/Gemfile.lock +42 -10
  7. data/README.md +64 -11
  8. data/bin/console +16 -4
  9. data/chronicle-etl.gemspec +9 -7
  10. data/exe/chronicle-etl +2 -2
  11. data/lib/chronicle/etl.rb +5 -2
  12. data/lib/chronicle/etl/catalog.rb +62 -0
  13. data/lib/chronicle/etl/cli/connectors.rb +32 -0
  14. data/lib/chronicle/etl/cli/jobs.rb +111 -0
  15. data/lib/chronicle/etl/cli/main.rb +83 -0
  16. data/lib/chronicle/etl/cli/subcommand_base.rb +37 -0
  17. data/lib/chronicle/etl/config.rb +32 -0
  18. data/lib/chronicle/etl/extractors/{csv.rb → csv_extractor.rb} +3 -3
  19. data/lib/chronicle/etl/extractors/extractor.rb +23 -12
  20. data/lib/chronicle/etl/extractors/file_extractor.rb +52 -0
  21. data/lib/chronicle/etl/extractors/stdin_extractor.rb +11 -0
  22. data/lib/chronicle/etl/loaders/csv_loader.rb +29 -0
  23. data/lib/chronicle/etl/loaders/loader.rb +23 -16
  24. data/lib/chronicle/etl/loaders/rest_loader.rb +30 -0
  25. data/lib/chronicle/etl/loaders/stdout_loader.rb +9 -0
  26. data/lib/chronicle/etl/loaders/table_loader.rb +21 -0
  27. data/lib/chronicle/etl/runner.rb +33 -11
  28. data/lib/chronicle/etl/transformers/json_transformer.rb +11 -0
  29. data/lib/chronicle/etl/transformers/null_transformer.rb +10 -0
  30. data/lib/chronicle/etl/transformers/transformer.rb +27 -11
  31. data/lib/chronicle/etl/utils/progress_bar.rb +76 -0
  32. data/lib/chronicle/etl/version.rb +2 -2
  33. metadata +69 -30
  34. data/lib/chronicle/etl/cli.rb +0 -38
  35. data/lib/chronicle/etl/extractors/stdin.rb +0 -13
  36. data/lib/chronicle/etl/loaders/csv.rb +0 -31
  37. data/lib/chronicle/etl/loaders/stdout.rb +0 -11
  38. data/lib/chronicle/etl/loaders/table.rb +0 -22
  39. data/lib/chronicle/etl/transformers/json.rb +0 -13
  40. data/lib/chronicle/etl/transformers/null.rb +0 -11
  41. data/lib/chronicle/etl/utils/progress_bar_wrapper.rb +0 -43
@@ -0,0 +1,83 @@
1
+ require 'thor'
2
+ require 'chronicle/etl'
3
+ require 'colorize'
4
+
5
+ require 'chronicle/etl/cli/subcommand_base'
6
+ require 'chronicle/etl/cli/connectors'
7
+ require 'chronicle/etl/cli/jobs'
8
+
9
module Chronicle
  module ETL
    module CLI
      # Main entrypoint for the CLI app. Registers the `connectors` and `jobs`
      # subcommands and renders the top-level help screen.
      class Main < Thor
        class_option "verbose", type: :boolean, default: false
        default_task "jobs"

        desc 'connectors:COMMAND', 'Connectors available for ETL jobs', hide: true
        subcommand 'connectors', Connectors

        desc 'jobs:COMMAND', 'Configure and run jobs', hide: true
        subcommand 'jobs', Jobs

        # Entrypoint for the CLI. Aborts with a hint when no arguments are given.
        #
        # given_args:: command-line arguments (ARGV by default)
        # config::     configuration hash forwarded unchanged to Thor's `start`
        def self.start(given_args = ARGV, config = {})
          if given_args.empty?
            abort "No command entered or job specified. To see commands, run `chronicle-etl help`".red
          end

          # Takes a leading `subcommand:command` token and splits it so Thor knows
          # how to hand off to the subcommand class. A new array is built instead
          # of shifting/unshifting in place, so the caller's array (ARGV by
          # default) is left untouched.
          first, *rest = given_args
          given_args = first.split(':') + rest if first.include?(':')

          super(given_args, config)
        end

        # Displays help options for chronicle-etl.
        #
        # When +meth+ is given and this instance does not respond to it, help is
        # delegated to the Thor class resolved from the namespace "meth:meth".
        # Otherwise the full help screen is printed to the shell.
        #
        # meth::       optional name of the command/subcommand to describe
        # subcommand:: accepted for compatibility with Thor's signature; unused here
        def help(meth = nil, subcommand = false)
          if meth && !respond_to?(meth)
            klass, task = Thor::Util.find_class_and_task_by_namespace("#{meth}:#{meth}")
            klass.start(['-h', task].compact, shell: shell)
          else
            shell.say "ABOUT".bold
            shell.say " #{'chronicle-etl'.italic} is a utility tool for #{'extracting'.underline}, #{'transforming'.underline}, and #{'loading'.underline} personal data."
            shell.say
            shell.say "USAGE".bold
            shell.say " $ chronicle-etl COMMAND"
            shell.say
            shell.say "EXAMPLES".bold
            shell.say " Show available connectors:".italic.light_black
            shell.say " $ chronicle-etl connectors:list"
            shell.say
            shell.say " Run a simple job:".italic.light_black
            shell.say " $ chronicle-etl jobs:start --extractor stdin --transformer null --loader stdout"
            shell.say
            shell.say " Show full job options:".italic.light_black
            shell.say " $ chronicle-etl jobs help start"

            # Collect the printable tasks of every Thor class under the CLI namespace
            list = []

            Thor::Util.thor_classes_in(Chronicle::ETL::CLI).each do |thor_class|
              list += thor_class.printable_tasks(false)
            end
            list.sort! { |a, b| a[0] <=> b[0] }
            list.unshift ["help", "# This help menu"]

            shell.say
            shell.say 'ALL COMMANDS'.bold
            shell.print_table(list, indent: 2, truncate: true)
            shell.say
            shell.say "VERSION".bold
            shell.say " #{Chronicle::ETL::VERSION}"
            shell.say
            shell.say "FULL DOCUMENTATION".bold
            shell.say " https://github.com/chronicle-app/chronicle-etl".blue
            shell.say
          end
        end
      end
    end
  end
end
@@ -0,0 +1,37 @@
1
module Chronicle
  module ETL
    module CLI
      # Shared parent for CLI subcommand classes. Overrides Thor class methods so
      # commands are displayed with the command:subcommand syntax.
      class SubcommandBase < Thor
        class << self
          # Prints usage instructions for a subcommand: a sorted table of this
          # class's commands (plus those of nested Thor classes), followed by
          # the class options.
          def help(shell, subcommand = false)
            commands = printable_commands(true, subcommand)
            Thor::Util.thor_classes_in(self).each do |nested|
              commands += nested.printable_commands(false)
            end
            commands.sort! { |left, right| left[0] <=> right[0] }

            shell.say "COMMANDS".bold
            shell.print_table(commands, indent: 2, truncate: true)
            shell.say
            class_options_help(shell)
          end

          # Builds the usage banner using the command:subcommand pattern.
          # The `help` command is joined with a space instead of a colon.
          def banner(command, namespace = nil, subcommand = false)
            separator = command.name == 'help' ? ' ' : ':'
            "#{subcommand_prefix}#{separator}#{command.usage}"
          end

          # Derives the display name of the subcommand from the class name:
          # strips the namespace, lowercases the first letter, and turns each
          # remaining capital into "-<lowercase>".
          def subcommand_prefix
            base_name = name.gsub(%r{.*::}, '')
            lowered = base_name.gsub(%r{^[A-Z]}) { |match| match[0].downcase }
            lowered.gsub(%r{[A-Z]}) { |match| "-#{match[0].downcase}" }
          end
        end
      end
    end
  end
end
@@ -0,0 +1,32 @@
1
+ require 'runcom'
2
+
3
module Chronicle
  module ETL
    # Helpers to read, write, and enumerate config files through Runcom
    module Config
      class << self
        # Loads the config at +path+ and returns it as a hash with deeply
        # symbolized keys.
        def load(path)
          raw = Runcom::Config.new(path).to_h
          # FIXME: hack to deeply symbolize keys
          JSON.parse(raw.to_json, symbolize_names: true)
        end

        # Serializes +data+ as YAML into "<first Runcom path for +path+>.yml"
        def write(path, data)
          target = "#{Runcom::Config.new(path).all[0]}.yml"
          File.open(target, 'w') { |file| file << data.to_yaml }
        end

        # Returns the names (without extension) of the *.yml job files found in
        # Runcom's current 'chronicle/etl/jobs' config location
        def jobs
          job_directory = Runcom::Config.new('chronicle/etl/jobs').current
          pattern = File.join(job_directory, "*.yml")
          Dir.glob(pattern).map { |job_file| File.basename(job_file, ".*") }
        end
      end
    end
  end
end
@@ -1,5 +1,5 @@
1
1
  require 'csv'
2
- class Chronicle::Etl::Extractors::Csv < Chronicle::Etl::Extractors::Extractor
2
+ class Chronicle::ETL::CsvExtractor < Chronicle::ETL::Extractor
3
3
  DEFAULT_OPTIONS = {
4
4
  headers: true,
5
5
  filename: $stdin
@@ -18,7 +18,7 @@ class Chronicle::Etl::Extractors::Csv < Chronicle::Etl::Extractors::Extractor
18
18
  end
19
19
 
20
20
  def results_count
21
- CSV.read(@options[:filename], { headers: @options[:headers] }).count if read_from_file?
21
+ CSV.read(@options[:filename], headers: @options[:headers]).count if read_from_file?
22
22
  end
23
23
 
24
24
  private
@@ -33,7 +33,7 @@ class Chronicle::Etl::Extractors::Csv < Chronicle::Etl::Extractors::Extractor
33
33
  }
34
34
 
35
35
  stream = read_from_file? ? File.open(@options[:filename]) : @options[:filename]
36
- CSV.new(stream, csv_options)
36
+ CSV.new(stream, **csv_options)
37
37
  end
38
38
 
39
39
  def read_from_file?
@@ -1,20 +1,31 @@
1
+ require 'chronicle/etl'
2
+
1
3
  module Chronicle
2
- module Etl
3
- module Extractors
4
- class Extractor
5
- def initialize(options = {})
6
- @options = options.transform_keys!(&:to_sym)
7
- end
4
+ module ETL
5
+ # Abstract class representing an Extractor for an ETL job
6
+ class Extractor
7
+ extend Chronicle::ETL::Catalog
8
8
 
9
- def extract
10
- raise NotImplementedError
11
- end
9
+ # Construct a new instance of this extractor. Options are passed in from a Runner
10
+ # == Parameters:
11
+ # options::
12
+ # Options for configuring this Extractor
13
+ def initialize(options = {})
14
+ @options = options.transform_keys!(&:to_sym)
15
+ end
12
16
 
13
- def results_count; end
17
+ # Entrypoint for this Extractor. Called by a Runner. Expects a series of records to be yielded
18
+ def extract
19
+ raise NotImplementedError
14
20
  end
21
+
22
+ # An optional method to calculate how many records there are to extract. Used primarily for
23
+ # building the progress bar
24
+ def results_count; end
15
25
  end
16
26
  end
17
27
  end
18
28
 
19
- require_relative 'stdin'
20
- require_relative 'csv'
29
+ require_relative 'csv_extractor'
30
+ require_relative 'file_extractor'
31
+ require_relative 'stdin_extractor'
@@ -0,0 +1,52 @@
1
+ require 'pathname'
2
+
3
module Chronicle
  module ETL
    # Extractor that reads either a single file or every `.eml` file found
    # (recursively) under a directory. The path comes from @options[:filename].
    class FileExtractor < Chronicle::ETL::Extractor
      # Yields (contents, metadata) once per file. Yields nothing when the path
      # is neither a regular file nor a directory.
      def extract(&block)
        if file?
          extract_file(&block)
        elsif directory?
          extract_from_directory(&block)
        end
      end

      # Number of records that #extract will yield: 1 for a single file,
      # otherwise the number of matching files under the directory.
      def results_count
        return 1 if file?

        directory_filenames.count
      end

      private

      # Paths of all `.eml` files below the configured directory
      def directory_filenames
        Dir.glob(File.join(@options[:filename], '**/*.eml'))
      end

      # Yields the contents of each matching file. File.read closes the handle
      # after reading, and the metadata carries the path string (not an open
      # File object), matching #extract_file.
      def extract_from_directory
        directory_filenames.each do |filename|
          yield(File.read(filename), { filename: filename })
        end
      end

      # Yields the contents of the single configured file
      def extract_file
        yield(File.read(@options[:filename]), { filename: @options[:filename] })
      end

      def directory?
        Pathname.new(@options[:filename]).directory?
      end

      def file?
        Pathname.new(@options[:filename]).file?
      end
    end
  end
end
@@ -0,0 +1,11 @@
1
module Chronicle
  module ETL
    # Extractor that yields each line read from standard input as one record
    class StdinExtractor < Chronicle::ETL::Extractor
      # Streams $stdin line by line instead of reading the whole input into
      # memory first, so records are yielded as soon as they arrive.
      def extract
        $stdin.each_line do |line|
          yield line
        end
      end
    end
  end
end
@@ -0,0 +1,29 @@
1
+ require 'csv'
2
+
3
module Chronicle
  module ETL
    # Loader that buffers every record in memory and writes them all to
    # standard output as CSV when the job finishes
    class CsvLoader < Chronicle::ETL::Loader
      def initialize(options={})
        super(options)
        @rows = []
      end

      # Buffers one record; a Hash contributes its values, anything else is
      # stored as given
      def load(result)
        row = result.is_a?(Hash) ? result.values : result
        @rows << row
      end

      # Emits all buffered rows through a CSV writer wrapped around $stdout
      def finish
        CSV($stdout) do |csv|
          @rows.each { |row| csv << row }
        end
      end
    end
  end
end
@@ -1,25 +1,32 @@
1
1
  module Chronicle
2
- module Etl
3
- module Loaders
4
- class Loader
5
- def initialize(options = {})
6
- @options = options
7
- end
2
+ module ETL
3
+ # Abstract class representing a Loader for an ETL job
4
+ class Loader
5
+ extend Chronicle::ETL::Catalog
8
6
 
9
- def start; end
10
-
11
- def first_load; end
7
+ # Construct a new instance of this loader. Options are passed in from a Runner
8
+ # == Parameters:
9
+ # options::
10
+ # Options for configuring this Loader
11
+ def initialize(options = {})
12
+ @options = options
13
+ end
12
14
 
13
- def load
14
- raise NotImplementedError
15
- end
15
+ # Called once before processing records
16
+ def start; end
16
17
 
17
- def finish; end
18
+ # Load a single record
19
+ def load
20
+ raise NotImplementedError
18
21
  end
22
+
23
+ # Called once there are no more records to process
24
+ def finish; end
19
25
  end
20
26
  end
21
27
  end
22
28
 
23
- require_relative 'stdout'
24
- require_relative 'csv'
25
- require_relative 'table'
29
+ require_relative 'csv_loader'
30
+ require_relative 'rest_loader'
31
+ require_relative 'stdout_loader'
32
+ require_relative 'table_loader'
@@ -0,0 +1,30 @@
1
+ require 'net/http'
2
+ require 'uri'
3
+ require 'json'
4
+
5
module Chronicle
  module ETL
    # Loader that POSTs each record as a JSON document to a REST endpoint.
    #
    # Options read from @options:
    # hostname::     scheme + host (and optional port) of the target server
    # endpoint::     request path appended to the hostname
    # access_token:: sent as a Bearer token in the Authorization header
    class RestLoader < Chronicle::ETL::Loader
      # Sends one record and returns the HTTP response.
      #
      # A record that is a Hash already carrying a truthy :data key is sent as
      # is; any other record is wrapped as { data: record }. (Previously an
      # already-wrapped record produced a "null" request body.)
      def load(result)
        uri = URI.parse("#{@options[:hostname]}#{@options[:endpoint]}")

        headers = {
          'Authorization' => "Bearer #{@options[:access_token]}",
          'Content-Type' => 'application/json'
        }

        http = Net::HTTP.new(uri.host, uri.port)
        # Enable TLS for https hostnames; otherwise the request goes out in plain HTTP
        http.use_ssl = uri.scheme == 'https'
        request = Net::HTTP::Post.new(uri.request_uri, headers)

        already_wrapped = result.is_a?(Hash) && result[:data]
        payload = already_wrapped ? result : { data: result }
        request.body = payload.to_json

        http.request(request)
      end
    end
  end
end
@@ -0,0 +1,9 @@
1
module Chronicle
  module ETL
    # Loader that prints the inspected form of each record to standard output
    class StdoutLoader < Chronicle::ETL::Loader
      # Writes one record, followed by a newline
      def load(result)
        $stdout.puts(result.inspect)
      end
    end
  end
end
@@ -0,0 +1,21 @@
1
+ require 'tty/table'
2
+
3
module Chronicle
  module ETL
    # Loader that collects records into a TTY::Table and prints it as an ASCII
    # table when the job finishes. The constructor is inherited from Loader,
    # so the options argument is optional like in the other loaders.
    class TableLoader < Chronicle::ETL::Loader
      # Maximum number of characters kept from each cell value
      MAX_CELL_LENGTH = 31

      # Adds one record as a table row. The header is taken from the keys of
      # the first record; each value is stringified and truncated.
      def load(result)
        @table ||= TTY::Table.new(header: result.keys)
        @table << result.values.map { |value| value.to_s[0, MAX_CELL_LENGTH] }
      end

      # Prints the table. Does nothing when no record was loaded, since there
      # is no table to render in that case.
      def finish
        return unless @table

        puts @table.render(:ascii, padding: [0, 1])
      end
    end
  end
end
@@ -1,4 +1,10 @@
1
- class Chronicle::Etl::Runner
1
+ class Chronicle::ETL::Runner
2
+ BUILTIN = {
3
+ extractor: ['stdin', 'json', 'csv', 'file'],
4
+ transformer: ['null'],
5
+ loader: ['stdout', 'csv', 'table']
6
+ }.freeze
7
+
2
8
  def initialize(options)
3
9
  @options = options
4
10
 
@@ -6,16 +12,18 @@ class Chronicle::Etl::Runner
6
12
  end
7
13
 
8
14
  def run!
9
- progress_bar = Chronicle::Etl::Utils::ProgressBarWrapper.new(@extractor.results_count)
10
- @loader.start
15
+ total = @extractor.results_count
16
+ progress_bar = Chronicle::ETL::Utils::ProgressBar.new(title: 'Running job', total: total)
17
+ count = 0
11
18
 
12
- @extractor.extract do |result, i|
13
- @loader.first_load(result) if i == 0
19
+ @loader.start
14
20
 
15
- transformed_data = @transformer.transform(result)
21
+ @extractor.extract do |data, metadata|
22
+ transformed_data = @transformer.transform(data)
16
23
  @loader.load(transformed_data)
17
24
 
18
25
  progress_bar.increment
26
+ count += 1
19
27
  end
20
28
 
21
29
  progress_bar.finish
@@ -25,13 +33,27 @@ class Chronicle::Etl::Runner
25
33
  private
26
34
 
27
35
  def instantiate_etl_classes
28
- @extractor = get_etl_class(:extractor, @options[:extractor][:name]).new(@options[:extractor][:options])
29
- @transformer = get_etl_class(:transformer, @options[:transformer][:name]).new(@options[:transformer][:options])
30
- @loader = get_etl_class(:loader, @options[:loader][:name]).new(@options[:loader][:options])
36
+ @extractor = load_etl_class(:extractor, @options[:extractor][:name]).new(@options[:extractor][:options])
37
+ @transformer = load_etl_class(:transformer, @options[:transformer][:name]).new(@options[:transformer][:options])
38
+ @loader = load_etl_class(:loader, @options[:loader][:name]).new(@options[:loader][:options])
31
39
  end
32
40
 
33
- def get_etl_class(phase, name)
34
- klass_name = "Chronicle::Etl::#{phase.to_s.capitalize}s::#{name.capitalize}"
41
+ def load_etl_class(phase, x)
42
+ if BUILTIN[phase].include? x
43
+ klass_name = "Chronicle::ETL::#{x.capitalize}#{phase.to_s.capitalize}"
44
+ else
45
+ # TODO: come up with syntax for specifying a particular extractor in a provider library
46
+ provider, name = x.split(":")
47
+ provider = x unless provider
48
+ begin
49
+ require "chronicle/#{provider}"
50
+ rescue LoadError => e
51
+ warn("Error loading #{phase} '#{provider}'".red)
52
+ warn(" Perhaps you haven't installed it yet: `$ gem install chronicle-#{provider}`")
53
+ exit(false)
54
+ end
55
+ klass_name = "Chronicle::#{provider.capitalize}::#{name&.capitalize}#{phase.capitalize}"
56
+ end
35
57
  Object.const_get(klass_name)
36
58
  end
37
59
  end