chronicle-etl 0.1.1 → 0.2.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (41) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +8 -0
  3. data/.ruby-version +1 -0
  4. data/.yardopts +1 -0
  5. data/CHANGELOG.md +23 -0
  6. data/Gemfile.lock +42 -10
  7. data/README.md +64 -11
  8. data/bin/console +16 -4
  9. data/chronicle-etl.gemspec +9 -7
  10. data/exe/chronicle-etl +2 -2
  11. data/lib/chronicle/etl.rb +5 -2
  12. data/lib/chronicle/etl/catalog.rb +62 -0
  13. data/lib/chronicle/etl/cli/connectors.rb +32 -0
  14. data/lib/chronicle/etl/cli/jobs.rb +111 -0
  15. data/lib/chronicle/etl/cli/main.rb +83 -0
  16. data/lib/chronicle/etl/cli/subcommand_base.rb +37 -0
  17. data/lib/chronicle/etl/config.rb +32 -0
  18. data/lib/chronicle/etl/extractors/{csv.rb → csv_extractor.rb} +3 -3
  19. data/lib/chronicle/etl/extractors/extractor.rb +23 -12
  20. data/lib/chronicle/etl/extractors/file_extractor.rb +52 -0
  21. data/lib/chronicle/etl/extractors/stdin_extractor.rb +11 -0
  22. data/lib/chronicle/etl/loaders/csv_loader.rb +29 -0
  23. data/lib/chronicle/etl/loaders/loader.rb +23 -16
  24. data/lib/chronicle/etl/loaders/rest_loader.rb +30 -0
  25. data/lib/chronicle/etl/loaders/stdout_loader.rb +9 -0
  26. data/lib/chronicle/etl/loaders/table_loader.rb +21 -0
  27. data/lib/chronicle/etl/runner.rb +33 -11
  28. data/lib/chronicle/etl/transformers/json_transformer.rb +11 -0
  29. data/lib/chronicle/etl/transformers/null_transformer.rb +10 -0
  30. data/lib/chronicle/etl/transformers/transformer.rb +27 -11
  31. data/lib/chronicle/etl/utils/progress_bar.rb +76 -0
  32. data/lib/chronicle/etl/version.rb +2 -2
  33. metadata +69 -30
  34. data/lib/chronicle/etl/cli.rb +0 -38
  35. data/lib/chronicle/etl/extractors/stdin.rb +0 -13
  36. data/lib/chronicle/etl/loaders/csv.rb +0 -31
  37. data/lib/chronicle/etl/loaders/stdout.rb +0 -11
  38. data/lib/chronicle/etl/loaders/table.rb +0 -22
  39. data/lib/chronicle/etl/transformers/json.rb +0 -13
  40. data/lib/chronicle/etl/transformers/null.rb +0 -11
  41. data/lib/chronicle/etl/utils/progress_bar_wrapper.rb +0 -43
@@ -0,0 +1,83 @@
1
+ require 'thor'
2
+ require 'chronicle/etl'
3
+ require 'colorize'
4
+
5
+ require 'chronicle/etl/cli/subcommand_base'
6
+ require 'chronicle/etl/cli/connectors'
7
+ require 'chronicle/etl/cli/jobs'
8
+
9
module Chronicle
  module ETL
    module CLI
      # Main entrypoint for CLI app
      class Main < Thor
        class_option "verbose", type: :boolean, default: false
        default_task "jobs"

        desc 'connectors:COMMAND', 'Connectors available for ETL jobs', hide: true
        subcommand 'connectors', Connectors

        desc 'jobs:COMMAND', 'Configure and run jobs', hide: true
        subcommand 'jobs', Jobs

        # Entrypoint for the CLI.
        #
        # Aborts with a usage hint when no arguments are given. A leading
        # `subcommand:command` argument is split into separate `subcommand` and
        # `command` arguments so Thor knows how to hand off to the subcommand class.
        #
        # @param given_args [Array<String>] command-line arguments (defaults to ARGV)
        # @param config [Hash] configuration forwarded to Thor's `start`
        def self.start(given_args = ARGV, config = {})
          if given_args.empty?
            abort "No command entered or job specified. To see commands, run `chronicle-etl help`".red
          end

          # Work on a copy so the caller's array (ARGV by default) is left untouched
          args = given_args.dup
          args = args.shift.split(':') + args if args[0].include?(':')

          super(args, config)
        end

        # Displays help options for chronicle-etl
        #
        # When `meth` names something this class does not respond to, help is
        # delegated to the Thor class found under the `meth:meth` namespace.
        # Otherwise the general help screen is printed.
        def help(meth = nil, subcommand = false)
          if meth && !respond_to?(meth)
            klass, task = Thor::Util.find_class_and_task_by_namespace("#{meth}:#{meth}")
            klass.start(['-h', task].compact, shell: shell)
          else
            shell.say "ABOUT".bold
            shell.say " #{'chronicle-etl'.italic} is a utility tool for #{'extracting'.underline}, #{'transforming'.underline}, and #{'loading'.underline} personal data."
            shell.say
            shell.say "USAGE".bold
            shell.say " $ chronicle-etl COMMAND"
            shell.say
            shell.say "EXAMPLES".bold
            shell.say " Show available connectors:".italic.light_black
            shell.say " $ chronicle-etl connectors:list"
            shell.say
            shell.say " Run a simple job:".italic.light_black
            shell.say " $ chronicle-etl jobs:start --extractor stdin --transformer null --loader stdout"
            shell.say
            shell.say " Show full job options:".italic.light_black
            shell.say " $ chronicle-etl jobs help start"

            # Collect the printable tasks of every Thor class in the CLI namespace
            list = []

            Thor::Util.thor_classes_in(Chronicle::ETL::CLI).each do |thor_class|
              list += thor_class.printable_tasks(false)
            end
            list.sort! { |a, b| a[0] <=> b[0] }
            list.unshift ["help", "# This help menu"]

            shell.say
            shell.say 'ALL COMMANDS'.bold
            shell.print_table(list, indent: 2, truncate: true)
            shell.say
            shell.say "VERSION".bold
            shell.say " #{Chronicle::ETL::VERSION}"
            shell.say
            shell.say "FULL DOCUMENTATION".bold
            shell.say " https://github.com/chronicle-app/chronicle-etl".blue
            shell.say
          end
        end
      end
    end
  end
end
@@ -0,0 +1,37 @@
1
module Chronicle
  module ETL
    module CLI
      # Shared parent class for CLI subcommands. Overrides Thor's help and banner
      # class methods so commands are shown with the command:subcommand syntax.
      class SubcommandBase < Thor
        # Prints the table of commands for this subcommand, then its class options
        def self.help(shell, subcommand = false)
          rows = printable_commands(true, subcommand)
          Thor::Util.thor_classes_in(self).each { |nested| rows += nested.printable_commands(false) }
          rows.sort! { |left, right| left[0] <=> right[0] }

          shell.say "COMMANDS".bold
          shell.print_table(rows, indent: 2, truncate: true)
          shell.say
          class_options_help(shell)
        end

        # Builds the usage banner for a command. Commands are joined to the
        # subcommand prefix with a colon, except `help`, which uses a space.
        def self.banner(command, namespace = nil, subcommand = false)
          separator = command.name == 'help' ? ' ' : ':'
          "#{subcommand_prefix}#{separator}#{command.usage}"
        end

        # Derives the display name from the class name: the namespace is dropped,
        # the first letter is lowercased, and every remaining capital becomes a
        # dash followed by its lowercase form.
        def self.subcommand_prefix
          bare_name = name.gsub(%r{.*::}, '')
          lower_first = bare_name.gsub(%r{^[A-Z]}) { |char| char[0].downcase }
          lower_first.gsub(%r{[A-Z]}) { |char| "-#{char[0].downcase}" }
        end
      end
    end
  end
end
@@ -0,0 +1,32 @@
1
require 'json'
require 'yaml'
require 'runcom'

module Chronicle
  module ETL
    # Utility methods to read, write, and access config files
    module Config
      # Loads a yml config file
      #
      # @param path [String] path handed to Runcom::Config
      # @return [Hash] the config contents with keys symbolized at every level
      def self.load(path)
        config = Runcom::Config.new(path)
        # FIXME: hack to deeply symbolize keys by round-tripping through JSON.
        # NOTE(review): values JSON cannot represent natively presumably come
        # back as strings — confirm this is acceptable for callers.
        JSON.parse(config.to_h.to_json, symbolize_names: true)
      end

      # Writes a hash as a yml config file
      #
      # The target is the first entry of Runcom's `all` list with a `.yml`
      # extension appended.
      #
      # @param path [String] path handed to Runcom::Config
      # @param data [Hash] data serialized with `to_yaml`
      def self.write(path, data)
        config = Runcom::Config.new(path)
        filename = "#{config.all[0]}.yml"
        File.open(filename, 'w') do |f|
          f << data.to_yaml
        end
      end

      # Returns all jobs available in ~/.config/chronicle/etl/jobs/*.yml
      #
      # @return [Array<String>] job names (file basenames without extension)
      def self.jobs
        job_directory = Runcom::Config.new('chronicle/etl/jobs').current
        Dir.glob(File.join(job_directory, "*.yml")).map do |filename|
          File.basename(filename, ".*")
        end
      end
    end
  end
end
@@ -1,5 +1,5 @@
1
1
  require 'csv'
2
- class Chronicle::Etl::Extractors::Csv < Chronicle::Etl::Extractors::Extractor
2
+ class Chronicle::ETL::CsvExtractor < Chronicle::ETL::Extractor
3
3
  DEFAULT_OPTIONS = {
4
4
  headers: true,
5
5
  filename: $stdin
@@ -18,7 +18,7 @@ class Chronicle::Etl::Extractors::Csv < Chronicle::Etl::Extractors::Extractor
18
18
  end
19
19
 
20
20
  def results_count
21
- CSV.read(@options[:filename], { headers: @options[:headers] }).count if read_from_file?
21
+ CSV.read(@options[:filename], headers: @options[:headers]).count if read_from_file?
22
22
  end
23
23
 
24
24
  private
@@ -33,7 +33,7 @@ class Chronicle::Etl::Extractors::Csv < Chronicle::Etl::Extractors::Extractor
33
33
  }
34
34
 
35
35
  stream = read_from_file? ? File.open(@options[:filename]) : @options[:filename]
36
- CSV.new(stream, csv_options)
36
+ CSV.new(stream, **csv_options)
37
37
  end
38
38
 
39
39
  def read_from_file?
@@ -1,20 +1,31 @@
1
+ require 'chronicle/etl'
2
+
1
3
  module Chronicle
2
- module Etl
3
- module Extractors
4
- class Extractor
5
- def initialize(options = {})
6
- @options = options.transform_keys!(&:to_sym)
7
- end
4
+ module ETL
5
+ # Abstract class representing an Extractor for an ETL job
6
+ class Extractor
7
+ extend Chronicle::ETL::Catalog
8
8
 
9
- def extract
10
- raise NotImplementedError
11
- end
9
+ # Construct a new instance of this extractor. Options are passed in from a Runner
10
+ # == Parameters:
11
+ # options::
12
+ # Options for configuring this Extractor
13
+ def initialize(options = {})
14
+ @options = options.transform_keys!(&:to_sym)
15
+ end
12
16
 
13
- def results_count; end
17
+ # Entrypoint for this Extractor. Called by a Runner. Expects a series of records to be yielded
18
+ def extract
19
+ raise NotImplementedError
14
20
  end
21
+
22
+ # An optional method to calculate how many records there are to extract. Used primarily for
23
+ # building the progress bar
24
+ def results_count; end
15
25
  end
16
26
  end
17
27
  end
18
28
 
19
- require_relative 'stdin'
20
- require_relative 'csv'
29
+ require_relative 'csv_extractor'
30
+ require_relative 'file_extractor'
31
+ require_relative 'stdin_extractor'
@@ -0,0 +1,52 @@
1
+ require 'pathname'
2
+
3
module Chronicle
  module ETL
    # Extractor that reads the path given in the `:filename` option. The path may
    # be a single file, or a directory that is searched recursively for `.eml` files.
    class FileExtractor < Chronicle::ETL::Extractor
      # Yields the contents of each file together with a metadata hash holding
      # the file's path under `:filename`. Yields nothing when the path is
      # neither a file nor a directory.
      def extract(&block)
        if file?
          extract_file(&block)
        elsif directory?
          extract_from_directory(&block)
        end
      end

      # Number of records `extract` will yield: 1 for a single file, otherwise
      # the number of `.eml` files matched under the path.
      def results_count
        return 1 if file?

        directory_filenames.count
      end

      private

      # Paths of every `.eml` file found recursively under the configured path
      def directory_filenames
        Dir.glob(File.join(@options[:filename], '**/*.eml'))
      end

      def extract_from_directory
        directory_filenames.each do |filename|
          # File.read closes the handle; metadata carries the path, as in extract_file
          yield(File.read(filename), { filename: filename })
        end
      end

      def extract_file
        yield(File.read(@options[:filename]), { filename: @options[:filename] })
      end

      def directory?
        Pathname.new(@options[:filename]).directory?
      end

      def file?
        Pathname.new(@options[:filename]).file?
      end
    end
  end
end
@@ -0,0 +1,11 @@
1
module Chronicle
  module ETL
    # Extractor that yields standard input one line at a time
    class StdinExtractor < Chronicle::ETL::Extractor
      # Yields each line read from $stdin. Lines are streamed as they are read
      # rather than buffering the whole input in memory first.
      def extract
        $stdin.each_line do |line|
          yield line
        end
      end
    end
  end
end
@@ -0,0 +1,29 @@
1
+ require 'csv'
2
+
3
module Chronicle
  module ETL
    # Loader that collects every record in memory and writes them all to
    # standard output as CSV once the job finishes.
    class CsvLoader < Chronicle::ETL::Loader
      def initialize(options={})
        super(options)
        @rows = []
      end

      # Buffers one record. A Hash contributes its values; anything else is
      # stored unchanged.
      def load(result)
        row = result.is_a?(Hash) ? result.values : result
        @rows << row
      end

      # Writes all buffered rows to $stdout as CSV
      def finish
        CSV($stdout) do |csv|
          @rows.each { |row| csv << row }
        end
      end
    end
  end
end
@@ -1,25 +1,32 @@
1
1
  module Chronicle
2
- module Etl
3
- module Loaders
4
- class Loader
5
- def initialize(options = {})
6
- @options = options
7
- end
2
+ module ETL
3
+ # Abstract class representing a Loader for an ETL job
4
+ class Loader
5
+ extend Chronicle::ETL::Catalog
8
6
 
9
- def start; end
10
-
11
- def first_load; end
7
+ # Construct a new instance of this loader. Options are passed in from a Runner
8
+ # == Parameters:
9
+ # options::
10
+ # Options for configuring this Loader
11
+ def initialize(options = {})
12
+ @options = options
13
+ end
12
14
 
13
- def load
14
- raise NotImplementedError
15
- end
15
+ # Called once before processing records
16
+ def start; end
16
17
 
17
- def finish; end
18
+ # Load a single record
19
+ def load
20
+ raise NotImplementedError
18
21
  end
22
+
23
+ # Called once there are no more records to process
24
+ def finish; end
19
25
  end
20
26
  end
21
27
  end
22
28
 
23
- require_relative 'stdout'
24
- require_relative 'csv'
25
- require_relative 'table'
29
+ require_relative 'csv_loader'
30
+ require_relative 'rest_loader'
31
+ require_relative 'stdout_loader'
32
+ require_relative 'table_loader'
@@ -0,0 +1,30 @@
1
+ require 'net/http'
2
+ require 'uri'
3
+ require 'json'
4
+
5
module Chronicle
  module ETL
    # Loader that POSTs each record as JSON to a REST endpoint. The target URL
    # is built from the `:hostname` and `:endpoint` options, and the
    # `:access_token` option is sent as a bearer token.
    class RestLoader < Chronicle::ETL::Loader
      # Sends a single record to the configured endpoint.
      #
      # @param result [Object] the record to send; it is wrapped as
      #   `{ data: result }` unless it is a Hash that already has a `:data` value
      # @return [Net::HTTPResponse] the response from the server
      def load(result)
        uri = URI.parse("#{@options[:hostname]}#{@options[:endpoint]}")

        headers = {
          'Authorization' => "Bearer #{@options[:access_token]}",
          'Content-Type' => 'application/json'
        }

        http = Net::HTTP.new(uri.host, uri.port)
        # Without this, requests to https URLs are sent unencrypted and fail
        http.use_ssl = uri.scheme == 'https'

        request = Net::HTTP::Post.new(uri.request_uri, headers)
        request.body = payload_for(result).to_json

        http.request(request)
      end

      private

      # Returns the record as-is when it is already wrapped in a `:data` key,
      # otherwise wraps it. Previously an already-wrapped record produced a
      # nil payload, so the request body was the JSON literal `null`.
      def payload_for(result)
        already_wrapped = result.is_a?(Hash) && result[:data]
        already_wrapped ? result : { data: result }
      end
    end
  end
end
@@ -0,0 +1,9 @@
1
module Chronicle
  module ETL
    # Loader that prints each record to standard output
    class StdoutLoader < Chronicle::ETL::Loader
      # Writes the `inspect` representation of the record, followed by a newline
      def load(result)
        $stdout.puts(result.inspect)
      end
    end
  end
end
@@ -0,0 +1,21 @@
1
+ require 'tty/table'
2
+
3
module Chronicle
  module ETL
    # Loader that collects records into a TTY::Table and prints it as an ASCII
    # table when the job finishes.
    class TableLoader < Chronicle::ETL::Loader
      # @param options [Hash] options for configuring this loader; defaults to
      #   an empty hash, matching the base Loader and the other loaders
      def initialize(options = {})
        super(options)
      end

      # Adds one record as a table row. The first record's keys become the
      # table header. Each value is converted to a string and cut to its first
      # 31 characters.
      #
      # @param result [Hash] the record to add
      def load(result)
        @table ||= TTY::Table.new(header: result.keys)
        @table << result.values.map { |value| value.to_s[0..30] }
      end

      # Prints the table. Does nothing when no record was ever loaded, since no
      # table exists in that case.
      def finish
        return unless @table

        puts @table.render(:ascii, padding: [0, 1])
      end
    end
  end
end
@@ -1,4 +1,10 @@
1
- class Chronicle::Etl::Runner
1
+ class Chronicle::ETL::Runner
2
+ BUILTIN = {
3
+ extractor: ['stdin', 'json', 'csv', 'file'],
4
+ transformer: ['null'],
5
+ loader: ['stdout', 'csv', 'table']
6
+ }.freeze
7
+
2
8
  def initialize(options)
3
9
  @options = options
4
10
 
@@ -6,16 +12,18 @@ class Chronicle::Etl::Runner
6
12
  end
7
13
 
8
14
  def run!
9
- progress_bar = Chronicle::Etl::Utils::ProgressBarWrapper.new(@extractor.results_count)
10
- @loader.start
15
+ total = @extractor.results_count
16
+ progress_bar = Chronicle::ETL::Utils::ProgressBar.new(title: 'Running job', total: total)
17
+ count = 0
11
18
 
12
- @extractor.extract do |result, i|
13
- @loader.first_load(result) if i == 0
19
+ @loader.start
14
20
 
15
- transformed_data = @transformer.transform(result)
21
+ @extractor.extract do |data, metadata|
22
+ transformed_data = @transformer.transform(data)
16
23
  @loader.load(transformed_data)
17
24
 
18
25
  progress_bar.increment
26
+ count += 1
19
27
  end
20
28
 
21
29
  progress_bar.finish
@@ -25,13 +33,27 @@ class Chronicle::Etl::Runner
25
33
  private
26
34
 
27
35
  def instantiate_etl_classes
28
- @extractor = get_etl_class(:extractor, @options[:extractor][:name]).new(@options[:extractor][:options])
29
- @transformer = get_etl_class(:transformer, @options[:transformer][:name]).new(@options[:transformer][:options])
30
- @loader = get_etl_class(:loader, @options[:loader][:name]).new(@options[:loader][:options])
36
+ @extractor = load_etl_class(:extractor, @options[:extractor][:name]).new(@options[:extractor][:options])
37
+ @transformer = load_etl_class(:transformer, @options[:transformer][:name]).new(@options[:transformer][:options])
38
+ @loader = load_etl_class(:loader, @options[:loader][:name]).new(@options[:loader][:options])
31
39
  end
32
40
 
33
- def get_etl_class(phase, name)
34
- klass_name = "Chronicle::Etl::#{phase.to_s.capitalize}s::#{name.capitalize}"
41
+ def load_etl_class(phase, x)
42
+ if BUILTIN[phase].include? x
43
+ klass_name = "Chronicle::ETL::#{x.capitalize}#{phase.to_s.capitalize}"
44
+ else
45
+ # TODO: come up with syntax for specifying a particular extractor in a provider library
46
+ provider, name = x.split(":")
47
+ provider = x unless provider
48
+ begin
49
+ require "chronicle/#{provider}"
50
+ rescue LoadError => e
51
+ warn("Error loading #{phase} '#{provider}'".red)
52
+ warn(" Perhaps you haven't installed it yet: `$ gem install chronicle-#{provider}`")
53
+ exit(false)
54
+ end
55
+ klass_name = "Chronicle::#{provider.capitalize}::#{name&.capitalize}#{phase.capitalize}"
56
+ end
35
57
  Object.const_get(klass_name)
36
58
  end
37
59
  end