chronicle-etl 0.1.2 → 0.2.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (42) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +8 -0
  3. data/.ruby-version +1 -0
  4. data/.yardopts +1 -0
  5. data/CHANGELOG.md +11 -0
  6. data/Gemfile.lock +42 -10
  7. data/README.md +64 -11
  8. data/bin/console +16 -4
  9. data/chronicle-etl.gemspec +8 -6
  10. data/exe/chronicle-etl +2 -2
  11. data/lib/chronicle/etl.rb +6 -2
  12. data/lib/chronicle/etl/catalog.rb +102 -0
  13. data/lib/chronicle/etl/cli/connectors.rb +32 -0
  14. data/lib/chronicle/etl/cli/jobs.rb +110 -0
  15. data/lib/chronicle/etl/cli/main.rb +83 -0
  16. data/lib/chronicle/etl/cli/subcommand_base.rb +37 -0
  17. data/lib/chronicle/etl/config.rb +32 -0
  18. data/lib/chronicle/etl/exceptions.rb +17 -0
  19. data/lib/chronicle/etl/extractors/{csv.rb → csv_extractor.rb} +3 -3
  20. data/lib/chronicle/etl/extractors/extractor.rb +23 -12
  21. data/lib/chronicle/etl/extractors/file_extractor.rb +52 -0
  22. data/lib/chronicle/etl/extractors/stdin_extractor.rb +11 -0
  23. data/lib/chronicle/etl/loaders/csv_loader.rb +29 -0
  24. data/lib/chronicle/etl/loaders/loader.rb +23 -16
  25. data/lib/chronicle/etl/loaders/rest_loader.rb +30 -0
  26. data/lib/chronicle/etl/loaders/stdout_loader.rb +9 -0
  27. data/lib/chronicle/etl/loaders/table_loader.rb +21 -0
  28. data/lib/chronicle/etl/runner.rb +27 -38
  29. data/lib/chronicle/etl/transformers/json_transformer.rb +11 -0
  30. data/lib/chronicle/etl/transformers/null_transformer.rb +10 -0
  31. data/lib/chronicle/etl/transformers/transformer.rb +28 -11
  32. data/lib/chronicle/etl/utils/progress_bar.rb +76 -0
  33. data/lib/chronicle/etl/version.rb +2 -2
  34. metadata +68 -29
  35. data/lib/chronicle/etl/cli.rb +0 -38
  36. data/lib/chronicle/etl/extractors/stdin.rb +0 -13
  37. data/lib/chronicle/etl/loaders/csv.rb +0 -31
  38. data/lib/chronicle/etl/loaders/stdout.rb +0 -11
  39. data/lib/chronicle/etl/loaders/table.rb +0 -22
  40. data/lib/chronicle/etl/transformers/json.rb +0 -13
  41. data/lib/chronicle/etl/transformers/null.rb +0 -11
  42. data/lib/chronicle/etl/utils/progress_bar_wrapper.rb +0 -43
@@ -0,0 +1,110 @@
1
+ require 'pp'
2
+
3
module Chronicle
  module ETL
    module CLI
      # CLI commands for working with ETL jobs.
      #
      # Every command builds a "runner options" hash with the top-level keys
      # :extractor, :transformer and :loader, each holding a :name and an
      # :options hash, out of the command line flags and an optional job file.
      class Jobs < SubcommandBase
        default_task "start"
        namespace :jobs

        class_option :extractor, aliases: '-e', desc: 'Extractor class (available: stdin, csv, file)', default: 'stdin', banner: 'extractor-name'
        class_option :'extractor-opts', desc: 'Extractor options', type: :hash, default: {}
        class_option :transformer, aliases: '-t', desc: 'Transformer class (available: null)', default: 'null', banner: 'transformer-name'
        class_option :'transformer-opts', desc: 'Transformer options', type: :hash, default: {}
        class_option :loader, aliases: '-l', desc: 'Loader class (available: stdout, csv, table)', default: 'stdout', banner: 'loader-name'
        class_option :'loader-opts', desc: 'Loader options', type: :hash, default: {}
        class_option :job, aliases: '-j', desc: 'Job configuration name (or filename)'

        map run: :start # Thor doesn't like `run` as a command name
        desc "run", "Start a job"
        long_desc <<-LONG_DESC
          This will run an ETL job. Each job needs three parts:

          1. #{'Extractor'.underline}: pulls data from an external source. By default, this is stdin. Other common options include pulling data from an API or reading JSON from a file.

          2. #{'Transformer'.underline}: transforms data into a new format. If none is specified, we use the `null` transformer which does nothing to the data.

          3. #{'Loader'.underline}: takes that transformed data and loads it externally. This can be an API, flat files, (or by default), stdout.

          If you do not want to use the command line flags, you can also configure a job with a .yml config file. You can either specify the path to this file or use the filename and place the file in ~/.config/chronicle/etl/jobs/NAME.yml and call it with `--job NAME`
        LONG_DESC
        # Run an ETL job
        def start
          runner_options = build_runner_options(options)
          runner = Chronicle::ETL::Runner.new(runner_options)
          runner.run!
        end

        desc "create", "Create a job"
        # Create an ETL job: saves the runner options under the name given with --job
        def create
          job_name = options[:job]
          # Without a name there is nowhere to save to; fail with a usage error
          # instead of the TypeError File.join would raise on nil.
          if job_name.nil? || job_name.empty?
            raise Thor::RequiredArgumentMissingError, "No job name specified. Use `--job NAME` to name the job to create"
          end

          runner_options = build_runner_options(options)
          path = File.join('chronicle', 'etl', 'jobs', job_name)
          Chronicle::ETL::Config.write(path, runner_options)
        end

        desc "show", "Show details about a job"
        # Show an ETL job: pretty-prints the runner options that would be used
        def show
          runner_options = build_runner_options(options)
          pp runner_options
        end

        desc "list", "List all available jobs"
        # List available ETL jobs as a table of job name and connector names
        def list
          jobs = Chronicle::ETL::Config.jobs

          job_details = jobs.map do |job|
            r = Chronicle::ETL::Config.load("chronicle/etl/jobs/#{job}.yml")

            # A job file may leave any of the three sections out; show a blank cell then
            [job, r.dig(:extractor, :name), r.dig(:transformer, :name), r.dig(:loader, :name)]
          end

          headers = ['name', 'extractor', 'transformer', 'loader'].map { |h| h.upcase.bold }

          table = TTY::Table.new(headers, job_details)
          puts table.render(indent: 0, padding: [0, 2])
        end

        private

        # Create runner options from the flag options and the job config file.
        #
        # The merge is shallow and the job file is merged last, so a top-level
        # section (:extractor, :transformer, :loader) present in the job file
        # replaces the whole section built from the flags.
        # NOTE(review): the previous comment described the opposite precedence
        # (flags overwriting the file); behavior is left unchanged here - confirm
        # which order is intended.
        def build_runner_options(options)
          flag_options = process_flag_options(options)
          job_options = load_job(options[:job])
          flag_options.merge(job_options)
        end

        # Loads the config of the named job; returns an empty hash when no job
        # was requested, so the flag options are used as they are.
        def load_job(job)
          return {} if job.nil?

          # Config.load already returns a hash with deeply symbolized keys
          Chronicle::ETL::Config.load("chronicle/etl/jobs/#{job}.yml")
        end

        # Takes flag options and turns them into a runner config
        def process_flag_options(options)
          {
            extractor: {
              name: options[:extractor],
              options: options[:'extractor-opts']
            },
            transformer: {
              name: options[:transformer],
              options: options[:'transformer-opts']
            },
            loader: {
              name: options[:loader],
              options: options[:'loader-opts']
            }
          }
        end
      end
    end
  end
end
@@ -0,0 +1,83 @@
1
+ require 'thor'
2
+ require 'chronicle/etl'
3
+ require 'colorize'
4
+
5
+ require 'chronicle/etl/cli/subcommand_base'
6
+ require 'chronicle/etl/cli/connectors'
7
+ require 'chronicle/etl/cli/jobs'
8
+
9
module Chronicle
  module ETL
    module CLI
      # Main entrypoint for CLI app
      class Main < Thor
        class_option "verbose", type: :boolean, default: false
        default_task "jobs"

        desc 'connectors:COMMAND', 'Connectors available for ETL jobs', hide: true
        subcommand 'connectors', Connectors

        desc 'jobs:COMMAND', 'Configure and run jobs', hide: true
        subcommand 'jobs', Jobs

        # Entrypoint for the CLI.
        #
        # Aborts with a hint when no arguments are given. A first argument of
        # the form `subcommand:command` is split into two arguments so Thor
        # knows how to hand off to the subcommand class.
        #
        # given_args:: command line arguments (ARGV by default); not modified
        # config:: passed through to Thor unchanged
        def self.start(given_args = ARGV, config = {})
          if given_args.empty?
            abort "No command entered or job specified. To see commands, run `chronicle-etl help`".red
          end

          # Build a new argument list instead of shifting/unshifting the
          # caller's array, which would silently rewrite ARGV.
          first, *rest = given_args
          args = first.include?(':') ? first.split(':') + rest : given_args.dup

          super(args, config)
        end

        # Displays help options for chronicle-etl.
        #
        # When `meth` is not a command of this class, the lookup below tries to
        # find a Thor class for it and delegates to that class's help; otherwise
        # the general help screen is printed.
        def help(meth = nil, subcommand = false)
          klass, task = Thor::Util.find_class_and_task_by_namespace("#{meth}:#{meth}") if meth && !respond_to?(meth)
          # The lookup may find no class at all; fall through to the general
          # help in that case rather than calling `start` on nil.
          return klass.start(['-h', task].compact, shell: shell) if klass

          shell.say "ABOUT".bold
          shell.say " #{'chronicle-etl'.italic} is a utility tool for #{'extracting'.underline}, #{'transforming'.underline}, and #{'loading'.underline} personal data."
          shell.say
          shell.say "USAGE".bold
          shell.say " $ chronicle-etl COMMAND"
          shell.say
          shell.say "EXAMPLES".bold
          shell.say " Show available connectors:".italic.light_black
          shell.say " $ chronicle-etl connectors:list"
          shell.say
          shell.say " Run a simple job:".italic.light_black
          shell.say " $ chronicle-etl jobs:start --extractor stdin --transformer null --loader stdout"
          shell.say
          shell.say " Show full job options:".italic.light_black
          shell.say " $ chronicle-etl jobs help start"

          # Collect the printable commands of every Thor class in the CLI namespace
          list = []

          Thor::Util.thor_classes_in(Chronicle::ETL::CLI).each do |thor_class|
            list += thor_class.printable_tasks(false)
          end
          list.sort! { |a, b| a[0] <=> b[0] }
          list.unshift ["help", "# This help menu"]

          shell.say
          shell.say 'ALL COMMANDS'.bold
          shell.print_table(list, indent: 2, truncate: true)
          shell.say
          shell.say "VERSION".bold
          shell.say " #{Chronicle::ETL::VERSION}"
          shell.say
          shell.say "FULL DOCUMENTATION".bold
          shell.say " https://github.com/chronicle-app/chronicle-etl".blue
          shell.say
        end
      end
    end
  end
end
@@ -0,0 +1,37 @@
1
module Chronicle
  module ETL
    module CLI
      # Shared parent of the CLI subcommand classes. Replaces a few Thor class
      # methods so that usage is displayed with the command:subcommand syntax
      class SubcommandBase < Thor
        # Prints the usage table for a subcommand followed by its class options
        def self.help(shell, subcommand = false)
          own_rows = printable_commands(true, subcommand)
          nested_rows = Thor::Util.thor_classes_in(self).flat_map do |klass|
            klass.printable_commands(false)
          end
          rows = (own_rows + nested_rows).sort { |left, right| left[0] <=> right[0] }

          shell.say "COMMANDS".bold
          shell.print_table(rows, indent: 2, truncate: true)
          shell.say
          class_options_help(shell)
        end

        # Usage banner for a command: prefix and usage are joined with a colon,
        # except for the `help` command, which is joined with a space
        def self.banner(command, namespace = nil, subcommand = false)
          separator = command.name == 'help' ? ' ' : ':'
          "#{subcommand_prefix}#{separator}#{command.usage}"
        end

        # Display name of the subcommand, derived from the class name: the
        # namespace is dropped, the leading capital is lowercased and every
        # remaining capital becomes a dash plus its lowercase letter
        def self.subcommand_prefix
          unqualified = name.gsub(/.*::/, '')
          lower_first = unqualified.gsub(/^[A-Z]/) { |initial| initial[0].downcase }
          lower_first.gsub(/[A-Z]/) { |capital| "-#{capital[0].downcase}" }
        end
      end
    end
  end
end
@@ -0,0 +1,32 @@
1
+ require 'runcom'
2
+
3
require 'fileutils'
require 'json'
require 'yaml'

module Chronicle
  module ETL
    # Utility methods to read, write, and access config files
    module Config
      # Loads a yml config file
      #
      # path:: config path handed to Runcom::Config
      # Returns the config as a hash whose keys are symbolized at every level
      def self.load(path)
        config = Runcom::Config.new(path)
        # FIXME: hack to deeply symbolize keys
        JSON.parse(config.to_h.to_json, symbolize_names: true)
      end

      # Writes a hash as a yml config file
      #
      # path:: config path (without extension) handed to Runcom::Config; the
      #        file is written to the first path Runcom reports, plus `.yml`
      # data:: hash to serialize with `to_yaml`
      def self.write(path, data)
        config = Runcom::Config.new(path)
        filename = "#{config.all[0]}.yml"
        # The target directory does not exist until the first config has been
        # written; create it so opening the file cannot fail with ENOENT.
        FileUtils.mkdir_p(File.dirname(filename))
        File.open(filename, 'w') do |f|
          f << data.to_yaml
        end
      end

      # Returns all jobs available in ~/.config/chronicle/etl/jobs/*.yml
      #
      # Returns the job names (file names without extension); an empty array
      # when Runcom reports no current jobs directory.
      def self.jobs
        job_directory = Runcom::Config.new('chronicle/etl/jobs').current
        # NOTE(review): `current` is assumed to be nil when no jobs directory
        # exists yet - confirm against the Runcom version in use.
        return [] unless job_directory

        Dir.glob(File.join(job_directory, "*.yml")).map do |filename|
          File.basename(filename, ".*")
        end
      end
    end
  end
end
@@ -0,0 +1,17 @@
1
module Chronicle
  module ETL
    # Base class of the errors defined in this namespace
    class Error < StandardError; end

    # Error that records which provider and connector name it concerns
    class ConnectorNotAvailableError < Error
      attr_reader :name, :provider

      # message:: error message; optional so the class-only form
      #           `raise ConnectorNotAvailableError` works and falls back to
      #           Ruby's default message (the class name)
      # provider:: provider the error relates to (exposed via #provider)
      # name:: connector name the error relates to (exposed via #name)
      def initialize(message = nil, provider: nil, name: nil)
        super(message)
        @provider = provider
        @name = name
      end
    end

    class ProviderNotAvailableError < ConnectorNotAvailableError; end
    class ProviderConnectorNotAvailableError < ConnectorNotAvailableError; end
  end
end
@@ -1,5 +1,5 @@
1
1
  require 'csv'
2
- class Chronicle::Etl::Extractors::Csv < Chronicle::Etl::Extractors::Extractor
2
+ class Chronicle::ETL::CsvExtractor < Chronicle::ETL::Extractor
3
3
  DEFAULT_OPTIONS = {
4
4
  headers: true,
5
5
  filename: $stdin
@@ -18,7 +18,7 @@ class Chronicle::Etl::Extractors::Csv < Chronicle::Etl::Extractors::Extractor
18
18
  end
19
19
 
20
20
  def results_count
21
- CSV.read(@options[:filename], { headers: @options[:headers] }).count if read_from_file?
21
+ CSV.read(@options[:filename], headers: @options[:headers]).count if read_from_file?
22
22
  end
23
23
 
24
24
  private
@@ -33,7 +33,7 @@ class Chronicle::Etl::Extractors::Csv < Chronicle::Etl::Extractors::Extractor
33
33
  }
34
34
 
35
35
  stream = read_from_file? ? File.open(@options[:filename]) : @options[:filename]
36
- CSV.new(stream, csv_options)
36
+ CSV.new(stream, **csv_options)
37
37
  end
38
38
 
39
39
  def read_from_file?
@@ -1,20 +1,31 @@
1
+ require 'chronicle/etl'
2
+
1
3
  module Chronicle
2
- module Etl
3
- module Extractors
4
- class Extractor
5
- def initialize(options = {})
6
- @options = options.transform_keys!(&:to_sym)
7
- end
4
+ module ETL
5
+ # Abstract class representing an Extractor for an ETL job
6
+ class Extractor
7
+ extend Chronicle::ETL::Catalog
8
8
 
9
- def extract
10
- raise NotImplementedError
11
- end
9
+ # Construct a new instance of this extractor. Options are passed in from a Runner
10
+ # == Paramters:
11
+ # options::
12
+ # Options for configuring this Extractor
13
+ def initialize(options = {})
14
+ @options = options.transform_keys!(&:to_sym)
15
+ end
12
16
 
13
- def results_count; end
17
+ # Entrypoint for this Extractor. Called by a Runner. Expects a series of records to be yielded
18
+ def extract
19
+ raise NotImplementedError
14
20
  end
21
+
22
+ # An optional method to calculate how many records there are to extract. Used primarily for
23
+ # building the progress bar
24
+ def results_count; end
15
25
  end
16
26
  end
17
27
  end
18
28
 
19
- require_relative 'stdin'
20
- require_relative 'csv'
29
+ require_relative 'csv_extractor'
30
+ require_relative 'file_extractor'
31
+ require_relative 'stdin_extractor'
@@ -0,0 +1,52 @@
1
+ require 'pathname'
2
+
3
module Chronicle
  module ETL
    # Extractor that yields the contents of a single file, or of every `.eml`
    # file found below a directory. The path is taken from @options[:filename].
    class FileExtractor < Chronicle::ETL::Extractor
      # Yields `(contents, metadata)` once per extracted file, where metadata
      # is a hash with the file's path under :filename. Yields nothing when
      # the path is neither a file nor a directory.
      def extract(&block)
        if file?
          extract_file(&block)
        elsif directory?
          extract_from_directory(&block)
        end
      end

      # Number of files that will be extracted: 1 for a single file, otherwise
      # the number of matches of the directory search
      def results_count
        return 1 if file?

        directory_filenames.count
      end

      private

      # Paths of all `.eml` files below the configured directory (recursive)
      def directory_filenames
        Dir.glob(File.join(@options[:filename], '**/*.eml'))
      end

      # Yields every matching file of the directory. File.read closes each
      # file right away, and the metadata carries the path string, the same
      # shape #extract_file produces.
      def extract_from_directory
        directory_filenames.each do |filename|
          yield(File.read(filename), { filename: filename })
        end
      end

      # Yields the single configured file
      def extract_file
        yield(File.read(@options[:filename]), { filename: @options[:filename] })
      end

      def directory?
        Pathname.new(@options[:filename]).directory?
      end

      def file?
        Pathname.new(@options[:filename]).file?
      end
    end
  end
end
@@ -0,0 +1,11 @@
1
module Chronicle
  module ETL
    # Extractor that yields standard input one line at a time
    class StdinExtractor < Chronicle::ETL::Extractor
      # Yields each line of $stdin (line terminator included). Lines are read
      # as they arrive instead of buffering all input first, so piped input of
      # any size can be processed without holding it in memory.
      def extract
        $stdin.each_line do |line|
          yield line
        end
      end
    end
  end
end
@@ -0,0 +1,29 @@
1
+ require 'csv'
2
+
3
module Chronicle
  module ETL
    # Loader that collects rows in memory and prints them as CSV to $stdout
    # once #finish is called
    class CsvLoader < Chronicle::ETL::Loader
      def initialize(options={})
        super(options)
        @rows = []
      end

      # Buffers one result; for a Hash only its values are kept as the row
      def load(result)
        row = result.is_a?(Hash) ? result.values : result
        @rows << row
      end

      # Writes every buffered row to $stdout as CSV
      def finish
        CSV($stdout) do |writer|
          @rows.each { |row| writer << row }
        end
      end
    end
  end
end