chronicle-etl 0.1.3 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +8 -0
  3. data/.yardopts +1 -0
  4. data/Gemfile.lock +15 -1
  5. data/README.md +62 -11
  6. data/chronicle-etl.gemspec +6 -1
  7. data/exe/chronicle-etl +2 -2
  8. data/lib/chronicle/etl.rb +9 -2
  9. data/lib/chronicle/etl/catalog.rb +68 -18
  10. data/lib/chronicle/etl/cli/connectors.rb +32 -0
  11. data/lib/chronicle/etl/cli/jobs.rb +116 -0
  12. data/lib/chronicle/etl/cli/main.rb +83 -0
  13. data/lib/chronicle/etl/cli/subcommand_base.rb +37 -0
  14. data/lib/chronicle/etl/config.rb +53 -0
  15. data/lib/chronicle/etl/exceptions.rb +17 -0
  16. data/lib/chronicle/etl/extractors/csv_extractor.rb +1 -1
  17. data/lib/chronicle/etl/extractors/extractor.rb +18 -5
  18. data/lib/chronicle/etl/extractors/file_extractor.rb +2 -2
  19. data/lib/chronicle/etl/extractors/stdin_extractor.rb +2 -2
  20. data/lib/chronicle/etl/job.rb +62 -0
  21. data/lib/chronicle/etl/job_definition.rb +51 -0
  22. data/lib/chronicle/etl/job_log.rb +79 -0
  23. data/lib/chronicle/etl/job_logger.rb +76 -0
  24. data/lib/chronicle/etl/loaders/csv_loader.rb +2 -2
  25. data/lib/chronicle/etl/loaders/loader.rb +13 -6
  26. data/lib/chronicle/etl/loaders/rest_loader.rb +30 -0
  27. data/lib/chronicle/etl/loaders/stdout_loader.rb +2 -2
  28. data/lib/chronicle/etl/loaders/table_loader.rb +6 -10
  29. data/lib/chronicle/etl/runner.rb +19 -51
  30. data/lib/chronicle/etl/transformers/json_transformer.rb +2 -2
  31. data/lib/chronicle/etl/transformers/null_transformer.rb +4 -4
  32. data/lib/chronicle/etl/transformers/transformer.rb +21 -4
  33. data/lib/chronicle/etl/utils/progress_bar.rb +1 -1
  34. data/lib/chronicle/etl/version.rb +2 -2
  35. metadata +85 -4
  36. data/CHANGELOG.md +0 -18
  37. data/lib/chronicle/etl/cli.rb +0 -48
@@ -0,0 +1,83 @@
1
+ require 'thor'
2
+ require 'chronicle/etl'
3
+ require 'colorize'
4
+
5
+ require 'chronicle/etl/cli/subcommand_base'
6
+ require 'chronicle/etl/cli/connectors'
7
+ require 'chronicle/etl/cli/jobs'
8
+
9
+ module Chronicle
10
+ module ETL
11
+ module CLI
12
+ # Main entrypoint for CLI app
13
+ class Main < Thor
14
+ class_option "verbose", type: :boolean, default: false
15
+ default_task "jobs"
16
+
17
+ desc 'connectors:COMMAND', 'Connectors available for ETL jobs', hide: true
18
+ subcommand 'connectors', Connectors
19
+
20
+ desc 'jobs:COMMAND', 'Configure and run jobs', hide: true
21
+ subcommand 'jobs', Jobs
22
+
23
+ # Entrypoint for the CLI
24
+ def self.start(given_args = ARGV, config = {})
25
+ if given_args.none?
26
+ abort "No command entered or job specified. To see commands, run `chronicle-etl help`".red
27
+ end
28
+
29
+ # Takes a subcommand:command argument and splits it so Thor knows how to hand off to the subcommand class
30
+ if given_args.any? && given_args[0].include?(':')
31
+ commands = given_args.shift.split(':')
32
+ given_args = given_args.unshift(commands).flatten
33
+ end
34
+
35
+ super(given_args, config)
36
+ end
37
+
38
+ # Displays help options for chronicle-etl
39
+ def help(meth = nil, subcommand = false)
40
+ if meth && !respond_to?(meth)
41
+ klass, task = Thor::Util.find_class_and_task_by_namespace("#{meth}:#{meth}")
42
+ klass.start(['-h', task].compact, shell: shell)
43
+ else
44
+ shell.say "ABOUT".bold
45
+ shell.say " #{'chronicle-etl'.italic} is a utility tool for #{'extracting'.underline}, #{'transforming'.underline}, and #{'loading'.underline} personal data."
46
+ shell.say
47
+ shell.say "USAGE".bold
48
+ shell.say " $ chronicle-etl COMMAND"
49
+ shell.say
50
+ shell.say "EXAMPLES".bold
51
+ shell.say " Show available connectors:".italic.light_black
52
+ shell.say " $ chronicle-etl connectors:list"
53
+ shell.say
54
+ shell.say " Run a simple job:".italic.light_black
55
+ shell.say " $ chronicle-etl jobs:start --extractor stdin --transformer null --loader stdout"
56
+ shell.say
57
+ shell.say " Show full job options:".italic.light_black
58
+ shell.say " $ chronicle-etl jobs help start"
59
+
60
+ list = []
61
+
62
+ Thor::Util.thor_classes_in(Chronicle::ETL::CLI).each do |thor_class|
63
+ list += thor_class.printable_tasks(false)
64
+ end
65
+ list.sort! { |a, b| a[0] <=> b[0] }
66
+ list.unshift ["help", "# This help menu"]
67
+
68
+ shell.say
69
+ shell.say 'ALL COMMANDS'.bold
70
+ shell.print_table(list, indent: 2, truncate: true)
71
+ shell.say
72
+ shell.say "VERSION".bold
73
+ shell.say " #{Chronicle::ETL::VERSION}"
74
+ shell.say
75
+ shell.say "FULL DOCUMENTATION".bold
76
+ shell.say " https://github.com/chronicle-app/chronicle-etl".blue
77
+ shell.say
78
+ end
79
+ end
80
+ end
81
+ end
82
+ end
83
+ end
@@ -0,0 +1,37 @@
1
+ module Chronicle
2
+ module ETL
3
+ module CLI
4
+ # Base class for CLI subcommands. Overrides Thor methods so we can use command:subcommand syntax
5
+ class SubcommandBase < Thor
6
+ # Print usage instructions for a subcommand
7
+ def self.help(shell, subcommand = false)
8
+ list = printable_commands(true, subcommand)
9
+ Thor::Util.thor_classes_in(self).each do |klass|
10
+ list += klass.printable_commands(false)
11
+ end
12
+ list.sort! { |a, b| a[0] <=> b[0] }
13
+
14
+ shell.say "COMMANDS".bold
15
+ shell.print_table(list, indent: 2, truncate: true)
16
+ shell.say
17
+ class_options_help(shell)
18
+ end
19
+
20
+ # Show docs with command:subcommand pattern.
21
+ # For `help` command, don't use colon
22
+ def self.banner(command, namespace = nil, subcommand = false)
23
+ if command.name == 'help'
24
+ "#{subcommand_prefix} #{command.usage}"
25
+ else
26
+ "#{subcommand_prefix}:#{command.usage}"
27
+ end
28
+ end
29
+
30
+ # Use subcommand classname to derive display name for subcommand
31
+ def self.subcommand_prefix
32
+ self.name.gsub(%r{.*::}, '').gsub(%r{^[A-Z]}) { |match| match[0].downcase }.gsub(%r{[A-Z]}) { |match| "-#{match[0].downcase}" }
33
+ end
34
+ end
35
+ end
36
+ end
37
+ end
@@ -0,0 +1,53 @@
1
+ require 'runcom'
2
+
3
+ module Chronicle
4
+ module ETL
5
+ # Utility methods to read, write, and access config files
6
+ module Config
7
+ module_function
8
+
9
+ # Loads a yml config file
10
+ def load(path)
11
+ config = Runcom::Config.new(path)
12
+ # FIXME: hack to deeply symbolize keys
13
+ JSON.parse(config.to_h.to_json, symbolize_names: true)
14
+ end
15
+
16
+ # Writes a hash as a yml config file
17
+ def write(path, data)
18
+ config = Runcom::Config.new(path)
19
+ filename = config.all[0].to_s + '.yml'
20
+ File.open(filename, 'w') do |f|
21
+ f << data.to_yaml
22
+ end
23
+ end
24
+
25
+ # Returns all jobs available in ~/.config/chronicle/etl/jobs/*.yml
26
+ def available_jobs
27
+ job_directory = Runcom::Config.new('chronicle/etl/jobs').current
28
+ Dir.glob(File.join(job_directory, "*.yml")).map do |filename|
29
+ File.basename(filename, ".*")
30
+ end
31
+ end
32
+
33
+ # Returns all credentials available in ~/.config/chronicle/etl/credentials/*.yml
34
+ def available_credentials
35
+ job_directory = Runcom::Config.new('chronicle/etl/credentials').current
36
+ Dir.glob(File.join(job_directory, "*.yml")).map do |filename|
37
+ File.basename(filename, ".*")
38
+ end
39
+ end
40
+
41
+ # Load a job definition from job config directory
42
+ def load_job_from_config(job_name)
43
+ definition = self.load("chronicle/etl/jobs/#{job_name}.yml")
44
+ definition[:name] = job_name
45
+ definition
46
+ end
47
+
48
+ def load_credentials(name)
49
+ config = self.load("chronicle/etl/credentials/#{name}.yml")
50
+ end
51
+ end
52
+ end
53
+ end
@@ -0,0 +1,17 @@
1
+ module Chronicle
2
+ module ETL
3
+ class Error < StandardError; end;
4
+
5
+ class ConnectorNotAvailableError < Error
6
+ def initialize(message, provider: nil, name: nil)
7
+ super(message)
8
+ @provider = provider
9
+ @name = name
10
+ end
11
+ attr_reader :name, :provider
12
+ end
13
+
14
+ class ProviderNotAvailableError < ConnectorNotAvailableError; end
15
+ class ProviderConnectorNotAvailableError < ConnectorNotAvailableError; end
16
+ end
17
+ end
@@ -1,5 +1,5 @@
1
1
  require 'csv'
2
- class Chronicle::Etl::CsvExtractor < Chronicle::Etl::Extractor
2
+ class Chronicle::ETL::CsvExtractor < Chronicle::ETL::Extractor
3
3
  DEFAULT_OPTIONS = {
4
4
  headers: true,
5
5
  filename: $stdin
@@ -1,21 +1,34 @@
1
1
  require 'chronicle/etl'
2
2
 
3
3
  module Chronicle
4
- module Etl
4
+ module ETL
5
+ # Abstract class representing an Extractor for an ETL job
5
6
  class Extractor
6
- extend Chronicle::Etl::Catalog
7
-
8
- ETL_PHASE = :extractor
7
+ extend Chronicle::ETL::Catalog
9
8
 
9
+ # Construct a new instance of this extractor. Options are passed in from a Runner
10
+ # == Parameters:
11
+ # options::
12
+ # Options for configuring this Extractor
10
13
  def initialize(options = {})
11
- @options = options.transform_keys!(&:to_sym)
14
+ @options = options.transform_keys!(&:to_sym)
15
+ handle_continuation
12
16
  end
13
17
 
18
+ # Entrypoint for this Extractor. Called by a Runner. Expects a series of records to be yielded
14
19
  def extract
15
20
  raise NotImplementedError
16
21
  end
17
22
 
23
+ # An optional method to calculate how many records there are to extract. Used primarily for
24
+ # building the progress bar
18
25
  def results_count; end
26
+
27
+ private
28
+
29
+ def handle_continuation
30
+ @options[:load_since] = @options[:continuation].highest_timestamp if @options[:continuation] && @options[:continuation].highest_timestamp
31
+ end
19
32
  end
20
33
  end
21
34
  end
@@ -1,8 +1,8 @@
1
1
  require 'pathname'
2
2
 
3
3
  module Chronicle
4
- module Etl
5
- class FileExtractor < Chronicle::Etl::Extractor
4
+ module ETL
5
+ class FileExtractor < Chronicle::ETL::Extractor
6
6
  def extract
7
7
  if file?
8
8
  extract_file do |data, metadata|
@@ -1,6 +1,6 @@
1
1
  module Chronicle
2
- module Etl
3
- class StdinExtractor < Chronicle::Etl::Extractor
2
+ module ETL
3
+ class StdinExtractor < Chronicle::ETL::Extractor
4
4
  def extract
5
5
  $stdin.read.each_line do |line|
6
6
  yield line
@@ -0,0 +1,62 @@
1
+ module Chronicle
2
+ module ETL
3
+ class Job
4
+ attr_accessor :name,
5
+ :extractor_klass,
6
+ :extractor_options,
7
+ :transformer_klass,
8
+ :transformer_options,
9
+ :loader_klass,
10
+ :loader_options
11
+
12
+ # TODO: build a proper id system
13
+ alias id name
14
+
15
+ def initialize(definition)
16
+ definition = definition.definition # FIXME
17
+ @name = definition[:name]
18
+ @extractor_klass = load_klass(:extractor, definition[:extractor][:name])
19
+ @extractor_options = definition[:extractor][:options] || {}
20
+
21
+ @transformer_klass = load_klass(:transformer, definition[:transformer][:name])
22
+ @transformer_options = definition[:transformer][:options] || {}
23
+
24
+ @loader_klass = load_klass(:loader, definition[:loader][:name])
25
+ @loader_options = definition[:loader][:options] || {}
26
+
27
+ set_continuation
28
+ yield self if block_given?
29
+ end
30
+
31
+ def instantiate_extractor
32
+ instantiate_klass(:extractor)
33
+ end
34
+
35
+ def instantiate_transformer data
36
+ instantiate_klass(:transformer, data)
37
+ end
38
+
39
+ def instantiate_loader
40
+ instantiate_klass(:loader)
41
+ end
42
+
43
+ private
44
+
45
+ def instantiate_klass(phase, *args)
46
+ options = self.send("#{phase.to_s}_options")
47
+ args = args.unshift(options)
48
+ klass = self.send("#{phase.to_s}_klass")
49
+ klass.new(*args)
50
+ end
51
+
52
+ def load_klass phase, identifier
53
+ Chronicle::ETL::Catalog.phase_and_identifier_to_klass(phase, identifier)
54
+ end
55
+
56
+ def set_continuation
57
+ continuation = Chronicle::ETL::JobLogger.load_latest(@job_id)
58
+ @extractor_options[:continuation] = continuation
59
+ end
60
+ end
61
+ end
62
+ end
@@ -0,0 +1,51 @@
1
+ require 'deep_merge'
2
+
3
+ module Chronicle
4
+ module ETL
5
+ class JobDefinition
6
+ SKELETON_DEFINITION = {
7
+ extractor: {
8
+ name: nil,
9
+ options: {}
10
+ },
11
+ transformer: {
12
+ name: nil,
13
+ options: {}
14
+ },
15
+ loader: {
16
+ name: nil,
17
+ options: {}
18
+ }
19
+ }.freeze
20
+
21
+ attr_accessor :definition
22
+
23
+ def initialize()
24
+ @definition = SKELETON_DEFINITION
25
+ end
26
+
27
+ # Add config hash to this definition
28
+ def add_config(config = {})
29
+ @definition = config.deep_merge(@definition)
30
+ load_credentials
31
+ validate
32
+ end
33
+
34
+ private
35
+
36
+ def load_credentials
37
+ Chronicle::ETL::Catalog::PHASES.each do |phase|
38
+ credentials_name = @definition[phase][:options][:credentials]
39
+ if credentials_name
40
+ credentials = Chronicle::ETL::Config.load_credentials(credentials_name)
41
+ @definition[phase][:options].deep_merge(credentials)
42
+ end
43
+ end
44
+ end
45
+
46
+ def validate
47
+ return true # TODO
48
+ end
49
+ end
50
+ end
51
+ end
@@ -0,0 +1,79 @@
1
+ require 'pry'
2
+
3
+ module Chronicle
4
+ module ETL
5
+ # A record of what happened in the running of a job. We're interested in
6
+ # tracking when it ran, if it was successful, and what the latest record
7
+ # we found is (to use as a cursor for the next time)
8
+ class JobLog
9
+ attr_accessor :job,
10
+ :job_id,
11
+ :last_id,
12
+ :highest_timestamp,
13
+ :num_records_processed,
14
+ :started_at,
15
+ :finished_at,
16
+ :success
17
+
18
+ # Create a new JobLog for a given Job
19
+ def initialize
20
+ @num_records_processed = 0
21
+ @success = false
22
+ yield self if block_given?
23
+ end
24
+
25
+ # Log the result of a single transformation in a job
26
+ # @param transformer [Chronicle::ETL::Transformer] The transformer that ran
27
+ def log_transformation(transformer)
28
+ @last_id = transformer.id if transformer.id
29
+
30
+ # Save the highest timestamp that we've encountered so far
31
+ @highest_timestamp = [transformer.timestamp, @highest_timestamp].compact.max if transformer.timestamp
32
+
33
+ # TODO: a transformer might yield nil. We might also want certain transformers to explode
34
+ # records into multiple new ones. Therefore, this variable will need more subtle behaviour
35
+ @num_records_processed += 1
36
+ end
37
+
38
+ # Indicate that a job has started
39
+ def start
40
+ @started_at = Time.now
41
+ end
42
+
43
+ # Indicate that a job has finished
44
+ def finish
45
+ @finished_at = Time.now
46
+ @success = true
47
+ end
48
+
49
+ def job= job
50
+ @job = job
51
+ @job_id = job.id
52
+ end
53
+
54
+ # Take a JobLog's instance variables and turn them into a hash representation
55
+ def serialize
56
+ {
57
+ job_id: @job_id,
58
+ last_id: @last_id,
59
+ highest_timestamp: @highest_timestamp,
60
+ num_records_processed: @num_records_processed,
61
+ started_at: @started_at,
62
+ finished_at: @finished_at,
63
+ success: @success
64
+ }
65
+ end
66
+
67
+ # Create a new JobLog and set its instance variables from a serialized hash
68
+ def self.build_from_serialized attrs
69
+ attrs.delete(:id)
70
+ new do |job_log|
71
+ attrs.each do |key, value|
72
+ setter = "#{key.to_s}=".to_sym
73
+ job_log.send(setter, value)
74
+ end
75
+ end
76
+ end
77
+ end
78
+ end
79
+ end