chronicle-etl 0.1.2 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +8 -0
  3. data/.ruby-version +1 -0
  4. data/.yardopts +1 -0
  5. data/CHANGELOG.md +11 -0
  6. data/Gemfile.lock +42 -10
  7. data/README.md +64 -11
  8. data/bin/console +16 -4
  9. data/chronicle-etl.gemspec +8 -6
  10. data/exe/chronicle-etl +2 -2
  11. data/lib/chronicle/etl.rb +6 -2
  12. data/lib/chronicle/etl/catalog.rb +102 -0
  13. data/lib/chronicle/etl/cli/connectors.rb +32 -0
  14. data/lib/chronicle/etl/cli/jobs.rb +110 -0
  15. data/lib/chronicle/etl/cli/main.rb +83 -0
  16. data/lib/chronicle/etl/cli/subcommand_base.rb +37 -0
  17. data/lib/chronicle/etl/config.rb +32 -0
  18. data/lib/chronicle/etl/exceptions.rb +17 -0
  19. data/lib/chronicle/etl/extractors/{csv.rb → csv_extractor.rb} +3 -3
  20. data/lib/chronicle/etl/extractors/extractor.rb +23 -12
  21. data/lib/chronicle/etl/extractors/file_extractor.rb +52 -0
  22. data/lib/chronicle/etl/extractors/stdin_extractor.rb +11 -0
  23. data/lib/chronicle/etl/loaders/csv_loader.rb +29 -0
  24. data/lib/chronicle/etl/loaders/loader.rb +23 -16
  25. data/lib/chronicle/etl/loaders/rest_loader.rb +30 -0
  26. data/lib/chronicle/etl/loaders/stdout_loader.rb +9 -0
  27. data/lib/chronicle/etl/loaders/table_loader.rb +21 -0
  28. data/lib/chronicle/etl/runner.rb +27 -38
  29. data/lib/chronicle/etl/transformers/json_transformer.rb +11 -0
  30. data/lib/chronicle/etl/transformers/null_transformer.rb +10 -0
  31. data/lib/chronicle/etl/transformers/transformer.rb +28 -11
  32. data/lib/chronicle/etl/utils/progress_bar.rb +76 -0
  33. data/lib/chronicle/etl/version.rb +2 -2
  34. metadata +68 -29
  35. data/lib/chronicle/etl/cli.rb +0 -38
  36. data/lib/chronicle/etl/extractors/stdin.rb +0 -13
  37. data/lib/chronicle/etl/loaders/csv.rb +0 -31
  38. data/lib/chronicle/etl/loaders/stdout.rb +0 -11
  39. data/lib/chronicle/etl/loaders/table.rb +0 -22
  40. data/lib/chronicle/etl/transformers/json.rb +0 -13
  41. data/lib/chronicle/etl/transformers/null.rb +0 -11
  42. data/lib/chronicle/etl/utils/progress_bar_wrapper.rb +0 -43
@@ -0,0 +1,110 @@
1
+ require 'pp'
2
+
3
module Chronicle
  module ETL
    module CLI
      # CLI commands for working with ETL jobs
      class Jobs < SubcommandBase
        default_task "start"
        namespace :jobs

        class_option :extractor, aliases: '-e', desc: 'Extractor class (available: stdin, csv, file)', default: 'stdin', banner: 'extractor-name'
        class_option :'extractor-opts', desc: 'Extractor options', type: :hash, default: {}
        class_option :transformer, aliases: '-t', desc: 'Transformer class (available: null)', default: 'null', banner: 'transformer-name'
        class_option :'transformer-opts', desc: 'Transformer options', type: :hash, default: {}
        class_option :loader, aliases: '-l', desc: 'Loader class (available: stdout, csv, table)', default: 'stdout', banner: 'loader-name'
        class_option :'loader-opts', desc: 'Loader options', type: :hash, default: {}
        class_option :job, aliases: '-j', desc: 'Job configuration name (or filename)'

        map run: :start # Thor doesn't like `run` as a command name
        desc "run", "Start a job"
        long_desc <<-LONG_DESC
          This will run an ETL job. Each job needs three parts:

            1. #{'Extractor'.underline}: pulls data from an external source. By default, this is stdin. Other common options including pulling data from an API or reading JSON from a file.

            2. #{'Transformer'.underline}: transforms data into a new format. If none is specified, we use the `null` transformer which does nothing to the data.

            3. #{'Loader'.underline}: takes that transformed data and loads it externally. This can be an API, flat files, (or by default), stdout.

          If you do not want to use the command line flags, you can also configure a job with a .yml config file. You can either specify the path to this file or use the filename and place the file in ~/.config/chronicle/etl/jobs/NAME.yml and call it with `--job NAME`
        LONG_DESC
        # Run an ETL job: builds the runner options and hands them to a Runner
        def start
          runner_options = build_runner_options(options)
          runner = Chronicle::ETL::Runner.new(runner_options)
          runner.run!
        end

        desc "create", "Create a job"
        # Create an ETL job by writing the assembled runner options to a config file
        # named after the `--job` option
        def create
          # Without a job name there is nothing to name the file after; File.join
          # would otherwise fail with a TypeError on nil.
          abort "No job name specified. Provide one with `--job NAME`".red unless options[:job]

          runner_options = build_runner_options(options)
          path = File.join('chronicle', 'etl', 'jobs', options[:job])
          Chronicle::ETL::Config.write(path, runner_options)
        end

        desc "show", "Show details about a job"
        # Show an ETL job: pretty-prints the assembled runner options
        def show
          runner_options = build_runner_options(options)
          pp runner_options
        end

        desc "list", "List all available jobs"
        # List available ETL jobs as a table of name, extractor, transformer and loader
        def list
          jobs = Chronicle::ETL::Config.jobs

          job_details = jobs.map do |job|
            r = Chronicle::ETL::Config.load("chronicle/etl/jobs/#{job}.yml")

            # Any of the three sections may be missing from a job file
            extractor = r[:extractor][:name] if r[:extractor]
            transformer = r[:transformer][:name] if r[:transformer]
            loader = r[:loader][:name] if r[:loader]

            [job, extractor, transformer, loader]
          end

          headers = ['name', 'extractor', 'transformer', 'loader'].map { |h| h.upcase.bold }

          table = TTY::Table.new(headers, job_details)
          puts table.render(indent: 0, padding: [0, 2])
        end

        private

        # Create runner options from the command line flags, then overlay the job
        # config file on top. The merge is shallow: a top-level key present in the
        # job file (:extractor, :transformer, :loader) replaces the flag-derived
        # value for that key. Flags always carry defaults, so they act as the
        # fallback for sections the job file leaves out.
        def build_runner_options(options)
          flag_options = process_flag_options(options)
          job_options = load_job(options[:job])
          flag_options.merge(job_options)
        end

        # Loads the config for the named job. Returns an empty hash when no job
        # was given, so that no lookup is attempted for a nameless ".yml" file.
        # Config.load already returns deeply symbolized keys.
        def load_job(job)
          return {} unless job

          Chronicle::ETL::Config.load("chronicle/etl/jobs/#{job}.yml")
        end

        # Takes flag options and turns them into a runner config
        def process_flag_options(options)
          {
            extractor: {
              name: options[:extractor],
              options: options[:'extractor-opts']
            },
            transformer: {
              name: options[:transformer],
              options: options[:'transformer-opts']
            },
            loader: {
              name: options[:loader],
              options: options[:'loader-opts']
            }
          }
        end
      end
    end
  end
end
@@ -0,0 +1,83 @@
1
+ require 'thor'
2
+ require 'chronicle/etl'
3
+ require 'colorize'
4
+
5
+ require 'chronicle/etl/cli/subcommand_base'
6
+ require 'chronicle/etl/cli/connectors'
7
+ require 'chronicle/etl/cli/jobs'
8
+
9
module Chronicle
  module ETL
    module CLI
      # Main entrypoint for CLI app
      class Main < Thor
        class_option "verbose", type: :boolean, default: false
        default_task "jobs"

        desc 'connectors:COMMAND', 'Connectors available for ETL jobs', hide: true
        subcommand 'connectors', Connectors

        desc 'jobs:COMMAND', 'Configure and run jobs', hide: true
        subcommand 'jobs', Jobs

        # Entrypoint for the CLI. Aborts when no arguments are given; otherwise
        # rewrites a leading `subcommand:command` argument into separate
        # arguments before delegating to Thor.
        def self.start(given_args = ARGV, config = {})
          abort "No command entered or job specified. To see commands, run `chronicle-etl help`".red if given_args.none?

          # take a subcommand:command and splits them so Thor knows how to hand off to the subcommand class
          if given_args[0].include?(':')
            namespace_parts = given_args.shift.split(':')
            given_args = given_args.unshift(namespace_parts).flatten
          end

          super(given_args, config)
        end

        # Displays help options for chronicle-etl. When `meth` names something
        # this class does not respond to, help is delegated to the Thor class
        # found under that namespace; otherwise the general overview is printed.
        def help(meth = nil, subcommand = false)
          return print_overview unless meth && !respond_to?(meth)

          klass, task = Thor::Util.find_class_and_task_by_namespace("#{meth}:#{meth}")
          klass.start(['-h', task].compact, shell: shell)
        end

        private

        # Prints the about/usage/examples sections, the table of all commands,
        # the version and the documentation link.
        def print_overview
          shell.say "ABOUT".bold
          shell.say " #{'chronicle-etl'.italic} is a utility tool for #{'extracting'.underline}, #{'transforming'.underline}, and #{'loading'.underline} personal data."
          shell.say
          shell.say "USAGE".bold
          shell.say " $ chronicle-etl COMMAND"
          shell.say
          shell.say "EXAMPLES".bold
          shell.say " Show available connectors:".italic.light_black
          shell.say " $ chronicle-etl connectors:list"
          shell.say
          shell.say " Run a simple job:".italic.light_black
          shell.say " $ chronicle-etl jobs:start --extractor stdin --transformer null --loader stdout"
          shell.say
          shell.say " Show full job options:".italic.light_black
          shell.say " $ chronicle-etl jobs help start"

          # Collect the printable tasks of every Thor class in the CLI namespace
          command_rows = []
          Thor::Util.thor_classes_in(Chronicle::ETL::CLI).each do |thor_class|
            command_rows += thor_class.printable_tasks(false)
          end
          command_rows.sort! { |left, right| left[0] <=> right[0] }
          command_rows.unshift ["help", "# This help menu"]

          shell.say
          shell.say 'ALL COMMANDS'.bold
          shell.print_table(command_rows, indent: 2, truncate: true)
          shell.say
          shell.say "VERSION".bold
          shell.say " #{Chronicle::ETL::VERSION}"
          shell.say
          shell.say "FULL DOCUMENTATION".bold
          shell.say " https://github.com/chronicle-app/chronicle-etl".blue
          shell.say
        end
      end
    end
  end
end
@@ -0,0 +1,37 @@
1
module Chronicle
  module ETL
    module CLI
      # Base class for CLI subcommands. Overrides Thor methods so we can use command:subcommand syntax
      class SubcommandBase < Thor
        # Print usage instructions for a subcommand: a sorted table of this
        # class's commands (plus those of Thor classes found inside it),
        # followed by the class options help.
        def self.help(shell, subcommand = false)
          rows = printable_commands(true, subcommand)
          Thor::Util.thor_classes_in(self).each do |nested_class|
            rows += nested_class.printable_commands(false)
          end
          rows.sort! { |left, right| left[0] <=> right[0] }

          shell.say "COMMANDS".bold
          shell.print_table(rows, indent: 2, truncate: true)
          shell.say
          class_options_help(shell)
        end

        # Show docs with command:subcommand pattern.
        # For `help` command, join with a space instead of a colon
        def self.banner(command, namespace = nil, subcommand = false)
          separator = command.name == 'help' ? ' ' : ':'
          "#{subcommand_prefix}#{separator}#{command.usage}"
        end

        # Use subcommand classname to derive display name for subcommand:
        # strip the namespace, lowercase the leading capital, and turn every
        # remaining capital into a dash followed by its lowercase form
        def self.subcommand_prefix
          base_name = self.name.gsub(%r{.*::}, '')
          leading_downcased = base_name.gsub(%r{^[A-Z]}) { |match| match[0].downcase }
          leading_downcased.gsub(%r{[A-Z]}) { |match| "-#{match[0].downcase}" }
        end
      end
    end
  end
end
@@ -0,0 +1,32 @@
1
+ require 'runcom'
2
+
3
module Chronicle
  module ETL
    # Utility methods to read, write, and access config files
    module Config
      # Loads a yml config file through Runcom and returns its contents as a
      # hash with deeply symbolized keys.
      def self.load(path)
        config = Runcom::Config.new(path)
        # FIXME: hack to deeply symbolize keys
        JSON.parse(config.to_h.to_json, symbolize_names: true)
      end

      # Writes a hash as a yml config file. The target is the first path that
      # Runcom reports for `path`, with a `.yml` extension appended.
      def self.write(path, data)
        config = Runcom::Config.new(path)
        filename = config.all[0].to_s + '.yml'
        File.open(filename, 'w') do |f|
          f << data.to_yaml
        end
      end

      # Returns the names (file basenames without extension) of all jobs
      # available in ~/.config/chronicle/etl/jobs/*.yml.
      # Returns an empty array when Runcom reports no current jobs directory
      # (assumed to be the case when none exists yet), instead of failing in
      # File.join on nil.
      def self.jobs
        job_directory = Runcom::Config.new('chronicle/etl/jobs').current
        return [] unless job_directory

        Dir.glob(File.join(job_directory, "*.yml")).map do |filename|
          File.basename(filename, ".*")
        end
      end
    end
  end
end
@@ -0,0 +1,17 @@
1
module Chronicle
  module ETL
    # Base class for the error classes defined in this module
    class Error < StandardError
    end

    # Error carrying an optional provider and connector name in addition to
    # its message
    class ConnectorNotAvailableError < Error
      attr_reader :name, :provider

      # @param message [String] the error message
      # @param provider [Object, nil] stored as-is and exposed via #provider
      # @param name [Object, nil] stored as-is and exposed via #name
      def initialize(message, provider: nil, name: nil)
        super(message)
        @name = name
        @provider = provider
      end
    end

    # Specialisations of ConnectorNotAvailableError; they add no behavior
    class ProviderNotAvailableError < ConnectorNotAvailableError
    end

    class ProviderConnectorNotAvailableError < ConnectorNotAvailableError
    end
  end
end
@@ -1,5 +1,5 @@
1
1
  require 'csv'
2
- class Chronicle::Etl::Extractors::Csv < Chronicle::Etl::Extractors::Extractor
2
+ class Chronicle::ETL::CsvExtractor < Chronicle::ETL::Extractor
3
3
  DEFAULT_OPTIONS = {
4
4
  headers: true,
5
5
  filename: $stdin
@@ -18,7 +18,7 @@ class Chronicle::Etl::Extractors::Csv < Chronicle::Etl::Extractors::Extractor
18
18
  end
19
19
 
20
20
  def results_count
21
- CSV.read(@options[:filename], { headers: @options[:headers] }).count if read_from_file?
21
+ CSV.read(@options[:filename], headers: @options[:headers]).count if read_from_file?
22
22
  end
23
23
 
24
24
  private
@@ -33,7 +33,7 @@ class Chronicle::Etl::Extractors::Csv < Chronicle::Etl::Extractors::Extractor
33
33
  }
34
34
 
35
35
  stream = read_from_file? ? File.open(@options[:filename]) : @options[:filename]
36
- CSV.new(stream, csv_options)
36
+ CSV.new(stream, **csv_options)
37
37
  end
38
38
 
39
39
  def read_from_file?
@@ -1,20 +1,31 @@
1
+ require 'chronicle/etl'
2
+
1
3
  module Chronicle
2
- module Etl
3
- module Extractors
4
- class Extractor
5
- def initialize(options = {})
6
- @options = options.transform_keys!(&:to_sym)
7
- end
4
+ module ETL
5
+ # Abstract class representing an Extractor for an ETL job
6
+ class Extractor
7
+ extend Chronicle::ETL::Catalog
8
8
 
9
- def extract
10
- raise NotImplementedError
11
- end
9
+ # Construct a new instance of this extractor. Options are passed in from a Runner
10
+ # == Parameters:
11
+ # options::
12
+ # Options for configuring this Extractor
13
+ def initialize(options = {})
14
+ @options = options.transform_keys!(&:to_sym)
15
+ end
12
16
 
13
- def results_count; end
17
+ # Entrypoint for this Extractor. Called by a Runner. Expects a series of records to be yielded
18
+ def extract
19
+ raise NotImplementedError
14
20
  end
21
+
22
+ # An optional method to calculate how many records there are to extract. Used primarily for
23
+ # building the progress bar
24
+ def results_count; end
15
25
  end
16
26
  end
17
27
  end
18
28
 
19
- require_relative 'stdin'
20
- require_relative 'csv'
29
+ require_relative 'csv_extractor'
30
+ require_relative 'file_extractor'
31
+ require_relative 'stdin_extractor'
@@ -0,0 +1,52 @@
1
+ require 'pathname'
2
+
3
module Chronicle
  module ETL
    # Extractor that reads the path given in the :filename option. A regular
    # file is yielded as a single record; a directory is searched recursively
    # for `.eml` files and each one is yielded as its own record.
    class FileExtractor < Chronicle::ETL::Extractor
      # Yields `(contents, metadata)` for each file, where metadata is a hash
      # with a :filename key holding the path that was read. Yields nothing
      # when the path is neither a file nor a directory.
      def extract(&block)
        if file?
          extract_file(&block)
        elsif directory?
          extract_from_directory(&block)
        end
      end

      # Number of records #extract will yield: 1 for a single file, otherwise
      # the number of matching files under the directory.
      def results_count
        return 1 if file?

        directory_filenames.count
      end

      private

      # Paths of all `.eml` files found recursively under the :filename directory
      def directory_filenames
        Dir.glob(File.join(@options[:filename], '**/*.eml'))
      end

      # Yields the contents of every matching file in the directory. File.read
      # closes each file after reading, and the metadata carries the path
      # string, matching what #extract_file reports.
      def extract_from_directory
        directory_filenames.each do |filename|
          yield(File.read(filename), { filename: filename })
        end
      end

      # Yields the contents of the single configured file
      def extract_file
        yield(File.read(@options[:filename]), { filename: @options[:filename] })
      end

      def directory?
        Pathname.new(@options[:filename]).directory?
      end

      def file?
        Pathname.new(@options[:filename]).file?
      end
    end
  end
end
@@ -0,0 +1,11 @@
1
module Chronicle
  module ETL
    # Extractor that yields each line of standard input as a record
    class StdinExtractor < Chronicle::ETL::Extractor
      # Yields every line read from $stdin, one at a time. Lines are streamed
      # as they are read rather than buffering all input in memory first.
      def extract
        $stdin.each_line do |line|
          yield line
        end
      end
    end
  end
end
@@ -0,0 +1,29 @@
1
+ require 'csv'
2
+
3
module Chronicle
  module ETL
    # Loader that collects rows in memory and writes them to $stdout as CSV
    # when the job finishes
    class CsvLoader < Chronicle::ETL::Loader
      def initialize(options={})
        super(options)
        @rows = []
      end

      # Buffers one result as a row. For a Hash only its values are kept;
      # anything else is stored as given.
      def load(result)
        row = result.is_a?(Hash) ? result.values : result
        @rows << row
      end

      # Writes all buffered rows to $stdout in CSV format
      def finish
        CSV($stdout) do |csv|
          @rows.each { |row| csv << row }
        end
      end
    end
  end
end