chronicle-etl 0.1.4 → 0.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +8 -0
  3. data/.yardopts +1 -0
  4. data/Gemfile.lock +15 -1
  5. data/README.md +31 -13
  6. data/chronicle-etl.gemspec +6 -1
  7. data/exe/chronicle-etl +2 -2
  8. data/lib/chronicle/etl.rb +15 -2
  9. data/lib/chronicle/etl/catalog.rb +67 -17
  10. data/lib/chronicle/etl/cli/connectors.rb +32 -0
  11. data/lib/chronicle/etl/cli/jobs.rb +116 -0
  12. data/lib/chronicle/etl/cli/main.rb +83 -0
  13. data/lib/chronicle/etl/cli/subcommand_base.rb +37 -0
  14. data/lib/chronicle/etl/config.rb +53 -0
  15. data/lib/chronicle/etl/exceptions.rb +19 -0
  16. data/lib/chronicle/etl/extractors/csv_extractor.rb +2 -3
  17. data/lib/chronicle/etl/extractors/extractor.rb +21 -5
  18. data/lib/chronicle/etl/extractors/file_extractor.rb +2 -2
  19. data/lib/chronicle/etl/extractors/stdin_extractor.rb +2 -2
  20. data/lib/chronicle/etl/job.rb +71 -0
  21. data/lib/chronicle/etl/job_definition.rb +51 -0
  22. data/lib/chronicle/etl/job_log.rb +85 -0
  23. data/lib/chronicle/etl/job_logger.rb +78 -0
  24. data/lib/chronicle/etl/loaders/csv_loader.rb +4 -8
  25. data/lib/chronicle/etl/loaders/loader.rb +11 -2
  26. data/lib/chronicle/etl/loaders/rest_loader.rb +33 -0
  27. data/lib/chronicle/etl/loaders/stdout_loader.rb +5 -5
  28. data/lib/chronicle/etl/loaders/table_loader.rb +7 -6
  29. data/lib/chronicle/etl/models/activity.rb +15 -0
  30. data/lib/chronicle/etl/models/base.rb +103 -0
  31. data/lib/chronicle/etl/models/entity.rb +15 -0
  32. data/lib/chronicle/etl/models/generic.rb +23 -0
  33. data/lib/chronicle/etl/runner.rb +24 -46
  34. data/lib/chronicle/etl/transformers/null_transformer.rb +5 -6
  35. data/lib/chronicle/etl/transformers/transformer.rb +23 -7
  36. data/lib/chronicle/etl/utils/hash_utilities.rb +19 -0
  37. data/lib/chronicle/etl/utils/jsonapi.rb +28 -0
  38. data/lib/chronicle/etl/utils/progress_bar.rb +2 -2
  39. data/lib/chronicle/etl/version.rb +2 -2
  40. metadata +91 -5
  41. data/CHANGELOG.md +0 -23
  42. data/lib/chronicle/etl/cli.rb +0 -56
  43. data/lib/chronicle/etl/transformers/json_transformer.rb +0 -11
@@ -0,0 +1,83 @@
1
+ require 'thor'
2
+ require 'chronicle/etl'
3
+ require 'colorize'
4
+
5
+ require 'chronicle/etl/cli/subcommand_base'
6
+ require 'chronicle/etl/cli/connectors'
7
+ require 'chronicle/etl/cli/jobs'
8
+
9
module Chronicle
  module ETL
    module CLI
      # Top-level Thor application behind the `chronicle-etl` executable
      class Main < Thor
        class_option "verbose", type: :boolean, default: false
        default_task "jobs"

        desc 'connectors:COMMAND', 'Connectors available for ETL jobs', hide: true
        subcommand 'connectors', Connectors

        desc 'jobs:COMMAND', 'Configure and run jobs', hide: true
        subcommand 'jobs', Jobs

        # Entrypoint for the CLI. Aborts when no arguments were supplied;
        # otherwise normalizes the arguments and hands them to Thor.
        def self.start(given_args = ARGV, config = {})
          abort "No command entered or job specified. To see commands, run `chronicle-etl help`".red if given_args.none?

          # A leading `subcommand:command` argument is broken into two arguments
          # so Thor knows how to hand off to the subcommand class
          if given_args.any? && given_args[0].include?(':')
            namespaced = given_args.shift.split(':')
            given_args = given_args.unshift(namespaced).flatten
          end

          super(given_args, config)
        end

        # Displays help options for chronicle-etl. When `meth` is given and is
        # not a method on this class, the matching Thor class is started with `-h`.
        def help(meth = nil, subcommand = false)
          if meth && !respond_to?(meth)
            klass, task = Thor::Util.find_class_and_task_by_namespace("#{meth}:#{meth}")
            return klass.start(['-h', task].compact, shell: shell)
          end

          shell.say "ABOUT".bold
          shell.say " #{'chronicle-etl'.italic} is a utility tool for #{'extracting'.underline}, #{'transforming'.underline}, and #{'loading'.underline} personal data."
          shell.say
          shell.say "USAGE".bold
          shell.say " $ chronicle-etl COMMAND"
          shell.say
          shell.say "EXAMPLES".bold
          shell.say " Show available connectors:".italic.light_black
          shell.say " $ chronicle-etl connectors:list"
          shell.say
          shell.say " Run a simple job:".italic.light_black
          shell.say " $ chronicle-etl jobs:start --extractor stdin --transformer null --loader stdout"
          shell.say
          shell.say " Show full job options:".italic.light_black
          shell.say " $ chronicle-etl jobs help start"

          # Collect the printable tasks of every Thor class in the CLI namespace
          rows = Thor::Util.thor_classes_in(Chronicle::ETL::CLI).flat_map do |thor_class|
            thor_class.printable_tasks(false)
          end
          rows.sort! { |left, right| left[0] <=> right[0] }
          rows.unshift ["help", "# This help menu"]

          shell.say
          shell.say 'ALL COMMANDS'.bold
          shell.print_table(rows, indent: 2, truncate: true)
          shell.say
          shell.say "VERSION".bold
          shell.say " #{Chronicle::ETL::VERSION}"
          shell.say
          shell.say "FULL DOCUMENTATION".bold
          shell.say " https://github.com/chronicle-app/chronicle-etl".blue
          shell.say
        end
      end
    end
  end
end
@@ -0,0 +1,37 @@
1
module Chronicle
  module ETL
    module CLI
      # Base class for CLI subcommands. Overrides Thor methods so we can use command:subcommand syntax
      class SubcommandBase < Thor
        # Print usage instructions for a subcommand: a sorted command table
        # followed by the class options help
        def self.help(shell, subcommand = false)
          rows = printable_commands(true, subcommand)
          rows += Thor::Util.thor_classes_in(self).flat_map { |klass| klass.printable_commands(false) }
          rows.sort! { |left, right| left[0] <=> right[0] }

          shell.say "COMMANDS".bold
          shell.print_table(rows, indent: 2, truncate: true)
          shell.say
          class_options_help(shell)
        end

        # Show docs with command:subcommand pattern.
        # For `help` command, don't use colon
        def self.banner(command, namespace = nil, subcommand = false)
          return "#{subcommand_prefix} #{command.usage}" if command.name == 'help'

          "#{subcommand_prefix}:#{command.usage}"
        end

        # Use subcommand classname to derive display name for subcommand:
        # strip the namespace, lowercase a leading capital, and turn every
        # remaining capital into a dash followed by its lowercase form
        def self.subcommand_prefix
          class_name = name.gsub(%r{.*::}, '')
          lower_first = class_name.gsub(%r{^[A-Z]}) { |match| match[0].downcase }
          lower_first.gsub(%r{[A-Z]}) { |match| "-#{match[0].downcase}" }
        end
      end
    end
  end
end
@@ -0,0 +1,53 @@
1
+ require 'runcom'
2
+
3
module Chronicle
  module ETL
    # Utility methods to read, write, and access config files
    module Config
      module_function

      # Loads a yml config file through Runcom
      # @param path [String] config path handed to Runcom::Config
      # @return [Hash] the config contents with all keys deeply symbolized
      def load(path)
        config = Runcom::Config.new(path)
        # FIXME: hack to deeply symbolize keys
        JSON.parse(config.to_h.to_json, symbolize_names: true)
      end

      # Writes a hash as a yml config file
      # @param path [String] config path handed to Runcom::Config; '.yml' is appended
      #   to the first path Runcom reports for it
      # @param data [#to_yaml] the data to serialize
      def write(path, data)
        config = Runcom::Config.new(path)
        filename = config.all[0].to_s + '.yml'
        File.open(filename, 'w') do |f|
          f << data.to_yaml
        end
      end

      # Returns all jobs available in ~/.config/chronicle/etl/jobs/*.yml
      # @return [Array<String>] job names (file names without extension)
      def available_jobs
        config_names_in('chronicle/etl/jobs')
      end

      # Returns all credentials available in ~/.config/chronicle/etl/credentials/*.yml
      # @return [Array<String>] credential names (file names without extension)
      def available_credentials
        config_names_in('chronicle/etl/credentials')
      end

      # Load a job definition from job config directory
      # @param job_name [String] name of the job; also stored under the :name key
      # @return [Hash] the symbolized job definition
      def load_job_from_config(job_name)
        definition = self.load("chronicle/etl/jobs/#{job_name}.yml")
        definition[:name] = job_name
        definition
      end

      # Load a set of credentials from the credentials config directory
      # @param name [String] name of the credentials file (without extension)
      # @return [Hash] the symbolized credentials
      def load_credentials(name)
        self.load("chronicle/etl/credentials/#{name}.yml")
      end

      # Lists the base names (without extension) of the *.yml files found in
      # the directory Runcom reports as current for the given config path
      def config_names_in(config_path)
        directory = Runcom::Config.new(config_path).current
        Dir.glob(File.join(directory, "*.yml")).map do |filename|
          File.basename(filename, ".*")
        end
      end
      private_class_method :config_names_in
    end
  end
end
@@ -0,0 +1,19 @@
1
module Chronicle
  module ETL
    # Base class for all errors raised by chronicle-etl
    class Error < StandardError; end

    # Error class for an invalid transformed record
    class InvalidTransformedRecordError < Error; end

    # Error class for an unavailable connector. Carries the provider and
    # connector name that were passed in when it was constructed.
    class ConnectorNotAvailableError < Error
      attr_reader :name, :provider

      # @param message [String, nil] error message; optional so that a bare
      #   `raise ConnectorNotAvailableError` works like any other exception class
      # @param provider [Object, nil] stored and exposed through #provider
      # @param name [Object, nil] stored and exposed through #name
      def initialize(message = nil, provider: nil, name: nil)
        super(message)
        @provider = provider
        @name = name
      end
    end

    class ProviderNotAvailableError < ConnectorNotAvailableError; end
    class ProviderConnectorNotAvailableError < ConnectorNotAvailableError; end
  end
end
@@ -1,5 +1,5 @@
1
1
  require 'csv'
2
- class Chronicle::Etl::CsvExtractor < Chronicle::Etl::Extractor
2
+ class Chronicle::ETL::CsvExtractor < Chronicle::ETL::Extractor
3
3
  DEFAULT_OPTIONS = {
4
4
  headers: true,
5
5
  filename: $stdin
@@ -28,8 +28,7 @@ class Chronicle::Etl::CsvExtractor < Chronicle::Etl::Extractor
28
28
 
29
29
  csv_options = {
30
30
  headers: headers,
31
- header_converters: :symbol,
32
- converters: [:all]
31
+ converters: :all
33
32
  }
34
33
 
35
34
  stream = read_from_file? ? File.open(@options[:filename]) : @options[:filename]
@@ -1,21 +1,37 @@
1
1
  require 'chronicle/etl'
2
2
 
3
3
  module Chronicle
4
- module Etl
4
+ module ETL
5
+ # Abstract class representing an Extractor for an ETL job
5
6
  class Extractor
6
- extend Chronicle::Etl::Catalog
7
-
8
- ETL_PHASE = :extractor
7
+ extend Chronicle::ETL::Catalog
9
8
 
9
+ # Construct a new instance of this extractor. Options are passed in from a Runner
10
+ # == Parameters:
11
+ # options::
12
+ # Options for configuring this Extractor
10
13
  def initialize(options = {})
11
- @options = options.transform_keys!(&:to_sym)
14
+ @options = options.transform_keys!(&:to_sym)
15
+ handle_continuation
12
16
  end
13
17
 
18
+ # Entrypoint for this Extractor. Called by a Runner. Expects a series of records to be yielded
14
19
  def extract
15
20
  raise NotImplementedError
16
21
  end
17
22
 
23
+ # An optional method to calculate how many records there are to extract. Used primarily for
24
+ # building the progress bar
18
25
  def results_count; end
26
+
27
+ private
28
+
29
+ def handle_continuation
30
+ return unless @options[:continuation]
31
+
32
+ @options[:load_since] = @options[:continuation].highest_timestamp if @options[:continuation].highest_timestamp
33
+ @options[:load_after_id] = @options[:continuation].last_id if @options[:continuation].last_id
34
+ end
19
35
  end
20
36
  end
21
37
  end
@@ -1,8 +1,8 @@
1
1
  require 'pathname'
2
2
 
3
3
  module Chronicle
4
- module Etl
5
- class FileExtractor < Chronicle::Etl::Extractor
4
+ module ETL
5
+ class FileExtractor < Chronicle::ETL::Extractor
6
6
  def extract
7
7
  if file?
8
8
  extract_file do |data, metadata|
@@ -1,6 +1,6 @@
1
1
  module Chronicle
2
- module Etl
3
- class StdinExtractor < Chronicle::Etl::Extractor
2
+ module ETL
3
+ class StdinExtractor < Chronicle::ETL::Extractor
4
4
  def extract
5
5
  $stdin.read.each_line do |line|
6
6
  yield line
@@ -0,0 +1,71 @@
1
module Chronicle
  module ETL
    # A job built from a definition: holds the extractor, transformer and
    # loader classes resolved through the Catalog, plus the options each one
    # is instantiated with.
    class Job
      attr_accessor :name,
                    :extractor_klass,
                    :extractor_options,
                    :transformer_klass,
                    :transformer_options,
                    :loader_klass,
                    :loader_options

      # TODO: build a proper id system
      alias id name

      # @param definition [#definition] object whose #definition returns a hash
      #   with :name and, for each of :extractor, :transformer and :loader,
      #   a hash containing :name and optional :options
      # @yield [Job] the new job, when a block is given
      def initialize(definition)
        definition = definition.definition # FIXME
        @name = definition[:name]
        @extractor_klass = load_klass(:extractor, definition[:extractor][:name])
        @extractor_options = definition[:extractor][:options] || {}

        @transformer_klass = load_klass(:transformer, definition[:transformer][:name])
        @transformer_options = definition[:transformer][:options] || {}

        @loader_klass = load_klass(:loader, definition[:loader][:name])
        @loader_options = definition[:loader][:options] || {}

        set_continuation if load_continuation?
        yield self if block_given?
      end

      # @return [Object] a new extractor built with the extractor options
      def instantiate_extractor
        instantiate_klass(:extractor)
      end

      # @param data [Object] passed to the transformer after its options
      # @return [Object] a new transformer built with the transformer options
      def instantiate_transformer(data)
        instantiate_klass(:transformer, data)
      end

      # @return [Object] a new loader built with the loader options
      def instantiate_loader
        instantiate_klass(:loader)
      end

      # Whether a log should be kept for this job; currently true whenever
      # the job has an id
      def save_log?
        # TODO: this needs more nuance
        !id.nil?
      end

      private

      # Builds an instance of the class for the given phase, passing that
      # phase's options first and any extra arguments after them
      def instantiate_klass(phase, *args)
        options = public_send("#{phase}_options")
        klass = public_send("#{phase}_klass")
        klass.new(options, *args)
      end

      # Resolves a connector class through the Catalog
      def load_klass(phase, identifier)
        Chronicle::ETL::Catalog.phase_and_identifier_to_klass(phase, identifier)
      end

      # Looks up the latest log for this job and hands it to the extractor
      # through its :continuation option. The lookup uses #id: this class
      # never assigns a @job_id, so reading that ivar always yielded nil.
      def set_continuation
        continuation = Chronicle::ETL::JobLogger.load_latest(id)
        @extractor_options[:continuation] = continuation
      end

      # A continuation is loaded exactly when a log is saved for this job
      def load_continuation?
        save_log?
      end
    end
  end
end
@@ -0,0 +1,51 @@
1
+ require 'deep_merge'
2
+
3
module Chronicle
  module ETL
    # Holds the configuration hash describing a job: a name and options for
    # each of the extractor, transformer and loader phases.
    class JobDefinition
      SKELETON_DEFINITION = {
        extractor: {
          name: nil,
          options: {}
        },
        transformer: {
          name: nil,
          options: {}
        },
        loader: {
          name: nil,
          options: {}
        }
      }.freeze

      attr_accessor :definition

      # Starts from a private copy of the skeleton. `freeze` on the constant is
      # shallow, so handing out the constant itself would let one definition's
      # option changes leak into the constant and every other definition.
      def initialize
        @definition = Marshal.load(Marshal.dump(SKELETON_DEFINITION))
      end

      # Add config hash to this definition
      # @param config [Hash] configuration; the current definition is merged
      #   into it with Hash#deep_merge and the result becomes the definition
      # @return [true] the result of #validate
      def add_config(config = {})
        @definition = config.deep_merge(@definition)
        load_credentials
        validate
      end

      private

      # For each phase whose options name a :credentials entry, loads those
      # credentials from Config and deep-merges them into the phase options.
      # NOTE(review): the merge result is discarded, so this relies on
      # Hash#deep_merge (deep_merge gem) updating the receiver in place — confirm.
      def load_credentials
        Chronicle::ETL::Catalog::PHASES.each do |phase|
          phase_options = @definition[phase][:options]
          credentials_name = phase_options[:credentials]
          next unless credentials_name

          credentials = Chronicle::ETL::Config.load_credentials(credentials_name)
          phase_options.deep_merge(credentials)
        end
      end

      # Placeholder validation; always passes for now
      def validate
        true # TODO
      end
    end
  end
end
@@ -0,0 +1,85 @@
1
+ require 'forwardable'
2
+
3
module Chronicle
  module ETL
    # A record of what happened in the running of a job. We're interested in
    # tracking when it ran, if it was successful, and what the latest record
    # we found is (to use as a cursor for the next time)
    class JobLog
      extend Forwardable

      attr_reader :job
      attr_accessor :job_id,
                    :last_id,
                    :highest_timestamp,
                    :num_records_processed,
                    :started_at,
                    :finished_at,
                    :success

      def_delegators :@job, :save_log?

      # Create a new JobLog and set its instance variables from a serialized hash.
      # Kept public: a `private` section does not apply to `def self.` methods,
      # so this was always callable from outside the class.
      # @param attrs [Hash] attribute names mapped to values; an :id entry is
      #   ignored and the hash itself is left untouched
      # @return [JobLog]
      def self.build_from_serialized(attrs)
        new do |job_log|
          attrs.each do |key, value|
            next if key == :id

            job_log.public_send("#{key}=", value)
          end
        end
      end

      # Create a new JobLog with zero processed records and success unset
      # @yield [JobLog] the new log, when a block is given
      def initialize
        @num_records_processed = 0
        @success = false
        yield self if block_given?
      end

      # Log the result of a single transformation in a job
      # @param transformer [Chronicle::ETL::Transformer] The transformer that ran
      def log_transformation(transformer)
        @last_id = transformer.id if transformer.id

        # Save the highest timestamp that we've encountered so far
        @highest_timestamp = [transformer.timestamp, @highest_timestamp].compact.max if transformer.timestamp

        # TODO: a transformer might yield nil. We might also want certain transformers to explode
        # records into multiple new ones. Therefore, this variable will need more subtle behaviour
        @num_records_processed += 1
      end

      # Indicate that a job has started
      def start
        @started_at = Time.now
      end

      # Indicate that a job has finished
      def finish
        @finished_at = Time.now
        @success = true
      end

      # Attach a job to this log and remember its id
      def job=(job)
        @job = job
        @job_id = job.id
      end

      # Take a JobLog's instance variables and turn them into a hash representation
      def serialize
        {
          job_id: @job_id,
          last_id: @last_id,
          highest_timestamp: @highest_timestamp,
          num_records_processed: @num_records_processed,
          started_at: @started_at,
          finished_at: @finished_at,
          success: @success
        }
      end
    end
  end
end