chronicle-etl 0.1.4 → 0.2.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (43) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +8 -0
  3. data/.yardopts +1 -0
  4. data/Gemfile.lock +15 -1
  5. data/README.md +31 -13
  6. data/chronicle-etl.gemspec +6 -1
  7. data/exe/chronicle-etl +2 -2
  8. data/lib/chronicle/etl.rb +15 -2
  9. data/lib/chronicle/etl/catalog.rb +67 -17
  10. data/lib/chronicle/etl/cli/connectors.rb +32 -0
  11. data/lib/chronicle/etl/cli/jobs.rb +116 -0
  12. data/lib/chronicle/etl/cli/main.rb +83 -0
  13. data/lib/chronicle/etl/cli/subcommand_base.rb +37 -0
  14. data/lib/chronicle/etl/config.rb +53 -0
  15. data/lib/chronicle/etl/exceptions.rb +19 -0
  16. data/lib/chronicle/etl/extractors/csv_extractor.rb +2 -3
  17. data/lib/chronicle/etl/extractors/extractor.rb +21 -5
  18. data/lib/chronicle/etl/extractors/file_extractor.rb +2 -2
  19. data/lib/chronicle/etl/extractors/stdin_extractor.rb +2 -2
  20. data/lib/chronicle/etl/job.rb +71 -0
  21. data/lib/chronicle/etl/job_definition.rb +51 -0
  22. data/lib/chronicle/etl/job_log.rb +85 -0
  23. data/lib/chronicle/etl/job_logger.rb +78 -0
  24. data/lib/chronicle/etl/loaders/csv_loader.rb +4 -8
  25. data/lib/chronicle/etl/loaders/loader.rb +11 -2
  26. data/lib/chronicle/etl/loaders/rest_loader.rb +33 -0
  27. data/lib/chronicle/etl/loaders/stdout_loader.rb +5 -5
  28. data/lib/chronicle/etl/loaders/table_loader.rb +7 -6
  29. data/lib/chronicle/etl/models/activity.rb +15 -0
  30. data/lib/chronicle/etl/models/base.rb +103 -0
  31. data/lib/chronicle/etl/models/entity.rb +15 -0
  32. data/lib/chronicle/etl/models/generic.rb +23 -0
  33. data/lib/chronicle/etl/runner.rb +24 -46
  34. data/lib/chronicle/etl/transformers/null_transformer.rb +5 -6
  35. data/lib/chronicle/etl/transformers/transformer.rb +23 -7
  36. data/lib/chronicle/etl/utils/hash_utilities.rb +19 -0
  37. data/lib/chronicle/etl/utils/jsonapi.rb +28 -0
  38. data/lib/chronicle/etl/utils/progress_bar.rb +2 -2
  39. data/lib/chronicle/etl/version.rb +2 -2
  40. metadata +91 -5
  41. data/CHANGELOG.md +0 -23
  42. data/lib/chronicle/etl/cli.rb +0 -56
  43. data/lib/chronicle/etl/transformers/json_transformer.rb +0 -11
@@ -0,0 +1,83 @@
1
+ require 'thor'
2
+ require 'chronicle/etl'
3
+ require 'colorize'
4
+
5
+ require 'chronicle/etl/cli/subcommand_base'
6
+ require 'chronicle/etl/cli/connectors'
7
+ require 'chronicle/etl/cli/jobs'
8
+
9
+ module Chronicle
10
+ module ETL
11
+ module CLI
12
+ # Main entrypoint for CLI app
13
+ class Main < Thor
14
+ class_option "verbose", type: :boolean, default: false
15
+ default_task "jobs"
16
+
17
+ desc 'connectors:COMMAND', 'Connectors available for ETL jobs', hide: true
18
+ subcommand 'connectors', Connectors
19
+
20
+ desc 'jobs:COMMAND', 'Configure and run jobs', hide: true
21
+ subcommand 'jobs', Jobs
22
+
23
+ # Entrypoint for the CLI
24
+ def self.start(given_args = ARGV, config = {})
25
+ if given_args.none?
26
+ abort "No command entered or job specified. To see commands, run `chronicle-etl help`".red
27
+ end
28
+
29
+ # Takes a subcommand:command argument and splits it so Thor knows how to hand off to the subcommand class
30
+ if given_args.any? && given_args[0].include?(':')
31
+ commands = given_args.shift.split(':')
32
+ given_args = given_args.unshift(commands).flatten
33
+ end
34
+
35
+ super(given_args, config)
36
+ end
37
+
38
+ # Displays help options for chronicle-etl
39
+ def help(meth = nil, subcommand = false)
40
+ if meth && !respond_to?(meth)
41
+ klass, task = Thor::Util.find_class_and_task_by_namespace("#{meth}:#{meth}")
42
+ klass.start(['-h', task].compact, shell: shell)
43
+ else
44
+ shell.say "ABOUT".bold
45
+ shell.say " #{'chronicle-etl'.italic} is a utility tool for #{'extracting'.underline}, #{'transforming'.underline}, and #{'loading'.underline} personal data."
46
+ shell.say
47
+ shell.say "USAGE".bold
48
+ shell.say " $ chronicle-etl COMMAND"
49
+ shell.say
50
+ shell.say "EXAMPLES".bold
51
+ shell.say " Show available connectors:".italic.light_black
52
+ shell.say " $ chronicle-etl connectors:list"
53
+ shell.say
54
+ shell.say " Run a simple job:".italic.light_black
55
+ shell.say " $ chronicle-etl jobs:start --extractor stdin --transformer null --loader stdout"
56
+ shell.say
57
+ shell.say " Show full job options:".italic.light_black
58
+ shell.say " $ chronicle-etl jobs help start"
59
+
60
+ list = []
61
+
62
+ Thor::Util.thor_classes_in(Chronicle::ETL::CLI).each do |thor_class|
63
+ list += thor_class.printable_tasks(false)
64
+ end
65
+ list.sort! { |a, b| a[0] <=> b[0] }
66
+ list.unshift ["help", "# This help menu"]
67
+
68
+ shell.say
69
+ shell.say 'ALL COMMANDS'.bold
70
+ shell.print_table(list, indent: 2, truncate: true)
71
+ shell.say
72
+ shell.say "VERSION".bold
73
+ shell.say " #{Chronicle::ETL::VERSION}"
74
+ shell.say
75
+ shell.say "FULL DOCUMENTATION".bold
76
+ shell.say " https://github.com/chronicle-app/chronicle-etl".blue
77
+ shell.say
78
+ end
79
+ end
80
+ end
81
+ end
82
+ end
83
+ end
@@ -0,0 +1,37 @@
1
+ module Chronicle
2
+ module ETL
3
+ module CLI
4
+ # Base class for CLI subcommands. Overrides Thor methods so we can use command:subcommand syntax
5
+ class SubcommandBase < Thor
6
+ # Print usage instructions for a subcommand
7
+ def self.help(shell, subcommand = false)
8
+ list = printable_commands(true, subcommand)
9
+ Thor::Util.thor_classes_in(self).each do |klass|
10
+ list += klass.printable_commands(false)
11
+ end
12
+ list.sort! { |a, b| a[0] <=> b[0] }
13
+
14
+ shell.say "COMMANDS".bold
15
+ shell.print_table(list, indent: 2, truncate: true)
16
+ shell.say
17
+ class_options_help(shell)
18
+ end
19
+
20
+ # Show docs with command:subcommand pattern.
21
+ # For `help` command, don't use colon
22
+ def self.banner(command, namespace = nil, subcommand = false)
23
+ if command.name == 'help'
24
+ "#{subcommand_prefix} #{command.usage}"
25
+ else
26
+ "#{subcommand_prefix}:#{command.usage}"
27
+ end
28
+ end
29
+
30
+ # Use subcommand classname to derive display name for subcommand
31
+ def self.subcommand_prefix
32
+ self.name.gsub(%r{.*::}, '').gsub(%r{^[A-Z]}) { |match| match[0].downcase }.gsub(%r{[A-Z]}) { |match| "-#{match[0].downcase}" }
33
+ end
34
+ end
35
+ end
36
+ end
37
+ end
@@ -0,0 +1,53 @@
1
+ require 'runcom'
2
+
3
+ module Chronicle
4
+ module ETL
5
+ # Utility methods to read, write, and access config files
6
+ module Config
7
+ module_function
8
+
9
+ # Loads a yml config file
10
+ def load(path)
11
+ config = Runcom::Config.new(path)
12
+ # FIXME: hack to deeply symbolize keys
13
+ JSON.parse(config.to_h.to_json, symbolize_names: true)
14
+ end
15
+
16
+ # Writes a hash as a yml config file
17
+ def write(path, data)
18
+ config = Runcom::Config.new(path)
19
+ filename = config.all[0].to_s + '.yml'
20
+ File.open(filename, 'w') do |f|
21
+ f << data.to_yaml
22
+ end
23
+ end
24
+
25
+ # Returns all jobs available in ~/.config/chronicle/etl/jobs/*.yml
26
+ def available_jobs
27
+ job_directory = Runcom::Config.new('chronicle/etl/jobs').current
28
+ Dir.glob(File.join(job_directory, "*.yml")).map do |filename|
29
+ File.basename(filename, ".*")
30
+ end
31
+ end
32
+
33
+ # Returns all credentials available in ~/.config/chronicle/etl/credentials/*.yml
34
+ def available_credentials
35
+ job_directory = Runcom::Config.new('chronicle/etl/credentials').current
36
+ Dir.glob(File.join(job_directory, "*.yml")).map do |filename|
37
+ File.basename(filename, ".*")
38
+ end
39
+ end
40
+
41
+ # Load a job definition from job config directory
42
+ def load_job_from_config(job_name)
43
+ definition = self.load("chronicle/etl/jobs/#{job_name}.yml")
44
+ definition[:name] = job_name
45
+ definition
46
+ end
47
+
48
+ def load_credentials(name)
49
+ config = self.load("chronicle/etl/credentials/#{name}.yml")
50
+ end
51
+ end
52
+ end
53
+ end
@@ -0,0 +1,19 @@
1
+ module Chronicle
2
+ module ETL
3
+ class Error < StandardError; end;
4
+
5
+ class InvalidTransformedRecordError < Error; end
6
+
7
+ class ConnectorNotAvailableError < Error
8
+ def initialize(message, provider: nil, name: nil)
9
+ super(message)
10
+ @provider = provider
11
+ @name = name
12
+ end
13
+ attr_reader :name, :provider
14
+ end
15
+
16
+ class ProviderNotAvailableError < ConnectorNotAvailableError; end
17
+ class ProviderConnectorNotAvailableError < ConnectorNotAvailableError; end
18
+ end
19
+ end
@@ -1,5 +1,5 @@
1
1
  require 'csv'
2
- class Chronicle::Etl::CsvExtractor < Chronicle::Etl::Extractor
2
+ class Chronicle::ETL::CsvExtractor < Chronicle::ETL::Extractor
3
3
  DEFAULT_OPTIONS = {
4
4
  headers: true,
5
5
  filename: $stdin
@@ -28,8 +28,7 @@ class Chronicle::Etl::CsvExtractor < Chronicle::Etl::Extractor
28
28
 
29
29
  csv_options = {
30
30
  headers: headers,
31
- header_converters: :symbol,
32
- converters: [:all]
31
+ converters: :all
33
32
  }
34
33
 
35
34
  stream = read_from_file? ? File.open(@options[:filename]) : @options[:filename]
@@ -1,21 +1,37 @@
1
1
  require 'chronicle/etl'
2
2
 
3
3
  module Chronicle
4
- module Etl
4
+ module ETL
5
+ # Abstract class representing an Extractor for an ETL job
5
6
  class Extractor
6
- extend Chronicle::Etl::Catalog
7
-
8
- ETL_PHASE = :extractor
7
+ extend Chronicle::ETL::Catalog
9
8
 
9
+ # Construct a new instance of this extractor. Options are passed in from a Runner
10
+ # == Parameters:
11
+ # options::
12
+ # Options for configuring this Extractor
10
13
  def initialize(options = {})
11
- @options = options.transform_keys!(&:to_sym)
14
+ @options = options.transform_keys!(&:to_sym)
15
+ handle_continuation
12
16
  end
13
17
 
18
+ # Entrypoint for this Extractor. Called by a Runner. Expects a series of records to be yielded
14
19
  def extract
15
20
  raise NotImplementedError
16
21
  end
17
22
 
23
+ # An optional method to calculate how many records there are to extract. Used primarily for
24
+ # building the progress bar
18
25
  def results_count; end
26
+
27
+ private
28
+
29
+ def handle_continuation
30
+ return unless @options[:continuation]
31
+
32
+ @options[:load_since] = @options[:continuation].highest_timestamp if @options[:continuation].highest_timestamp
33
+ @options[:load_after_id] = @options[:continuation].last_id if @options[:continuation].last_id
34
+ end
19
35
  end
20
36
  end
21
37
  end
@@ -1,8 +1,8 @@
1
1
  require 'pathname'
2
2
 
3
3
  module Chronicle
4
- module Etl
5
- class FileExtractor < Chronicle::Etl::Extractor
4
+ module ETL
5
+ class FileExtractor < Chronicle::ETL::Extractor
6
6
  def extract
7
7
  if file?
8
8
  extract_file do |data, metadata|
@@ -1,6 +1,6 @@
1
1
  module Chronicle
2
- module Etl
3
- class StdinExtractor < Chronicle::Etl::Extractor
2
+ module ETL
3
+ class StdinExtractor < Chronicle::ETL::Extractor
4
4
  def extract
5
5
  $stdin.read.each_line do |line|
6
6
  yield line
@@ -0,0 +1,71 @@
1
+ module Chronicle
2
+ module ETL
3
+ class Job
4
+ attr_accessor :name,
5
+ :extractor_klass,
6
+ :extractor_options,
7
+ :transformer_klass,
8
+ :transformer_options,
9
+ :loader_klass,
10
+ :loader_options
11
+
12
+ # TODO: build a proper id system
13
+ alias id name
14
+
15
+ def initialize(definition)
16
+ definition = definition.definition # FIXME
17
+ @name = definition[:name]
18
+ @extractor_klass = load_klass(:extractor, definition[:extractor][:name])
19
+ @extractor_options = definition[:extractor][:options] || {}
20
+
21
+ @transformer_klass = load_klass(:transformer, definition[:transformer][:name])
22
+ @transformer_options = definition[:transformer][:options] || {}
23
+
24
+ @loader_klass = load_klass(:loader, definition[:loader][:name])
25
+ @loader_options = definition[:loader][:options] || {}
26
+
27
+ set_continuation if load_continuation?
28
+ yield self if block_given?
29
+ end
30
+
31
+ def instantiate_extractor
32
+ instantiate_klass(:extractor)
33
+ end
34
+
35
+ def instantiate_transformer(data)
36
+ instantiate_klass(:transformer, data)
37
+ end
38
+
39
+ def instantiate_loader
40
+ instantiate_klass(:loader)
41
+ end
42
+
43
+ def save_log?
44
+ # TODO: this needs more nuance
45
+ return !id.nil?
46
+ end
47
+
48
+ private
49
+
50
+ def instantiate_klass(phase, *args)
51
+ options = self.send("#{phase.to_s}_options")
52
+ args = args.unshift(options)
53
+ klass = self.send("#{phase.to_s}_klass")
54
+ klass.new(*args)
55
+ end
56
+
57
+ def load_klass phase, identifier
58
+ Chronicle::ETL::Catalog.phase_and_identifier_to_klass(phase, identifier)
59
+ end
60
+
61
+ def set_continuation
62
+ continuation = Chronicle::ETL::JobLogger.load_latest(@job_id)
63
+ @extractor_options[:continuation] = continuation
64
+ end
65
+
66
+ def load_continuation?
67
+ save_log?
68
+ end
69
+ end
70
+ end
71
+ end
@@ -0,0 +1,51 @@
1
+ require 'deep_merge'
2
+
3
+ module Chronicle
4
+ module ETL
5
+ class JobDefinition
6
+ SKELETON_DEFINITION = {
7
+ extractor: {
8
+ name: nil,
9
+ options: {}
10
+ },
11
+ transformer: {
12
+ name: nil,
13
+ options: {}
14
+ },
15
+ loader: {
16
+ name: nil,
17
+ options: {}
18
+ }
19
+ }.freeze
20
+
21
+ attr_accessor :definition
22
+
23
+ def initialize()
24
+ @definition = SKELETON_DEFINITION
25
+ end
26
+
27
+ # Add config hash to this definition
28
+ def add_config(config = {})
29
+ @definition = config.deep_merge(@definition)
30
+ load_credentials
31
+ validate
32
+ end
33
+
34
+ private
35
+
36
+ def load_credentials
37
+ Chronicle::ETL::Catalog::PHASES.each do |phase|
38
+ credentials_name = @definition[phase][:options][:credentials]
39
+ if credentials_name
40
+ credentials = Chronicle::ETL::Config.load_credentials(credentials_name)
41
+ @definition[phase][:options].deep_merge(credentials)
42
+ end
43
+ end
44
+ end
45
+
46
+ def validate
47
+ return true # TODO
48
+ end
49
+ end
50
+ end
51
+ end
@@ -0,0 +1,85 @@
1
+ require 'forwardable'
2
+
3
+ module Chronicle
4
+ module ETL
5
+ # A record of what happened in the running of a job. We're interested in
6
+ # tracking when it ran, if it was successful, and what the latest record
7
+ # we found is (to use as a cursor for the next time)
8
+ class JobLog
9
+ extend Forwardable
10
+
11
+ attr_accessor :job,
12
+ :job_id,
13
+ :last_id,
14
+ :highest_timestamp,
15
+ :num_records_processed,
16
+ :started_at,
17
+ :finished_at,
18
+ :success
19
+
20
+ def_delegators :@job, :save_log?
21
+
22
+ # Create a new JobLog for a given Job
23
+ def initialize
24
+ @num_records_processed = 0
25
+ @success = false
26
+ yield self if block_given?
27
+ end
28
+
29
+ # Log the result of a single transformation in a job
30
+ # @param transformer [Chronicle::ETL::Transformer] The transformer that ran
31
+ def log_transformation(transformer)
32
+ @last_id = transformer.id if transformer.id
33
+
34
+ # Save the highest timestamp that we've encountered so far
35
+ @highest_timestamp = [transformer.timestamp, @highest_timestamp].compact.max if transformer.timestamp
36
+
37
+ # TODO: a transformer might yield nil. We might also want certain transformers to explode
38
+ # records into multiple new ones. Therefore, this variable will need more subtle behaviour
39
+ @num_records_processed += 1
40
+ end
41
+
42
+ # Indicate that a job has started
43
+ def start
44
+ @started_at = Time.now
45
+ end
46
+
47
+ # Indicate that a job has finished
48
+ def finish
49
+ @finished_at = Time.now
50
+ @success = true
51
+ end
52
+
53
+ def job= job
54
+ @job = job
55
+ @job_id = job.id
56
+ end
57
+
58
+ # Take a JobLog's instance variables and turn them into a hash representation
59
+ def serialize
60
+ {
61
+ job_id: @job_id,
62
+ last_id: @last_id,
63
+ highest_timestamp: @highest_timestamp,
64
+ num_records_processed: @num_records_processed,
65
+ started_at: @started_at,
66
+ finished_at: @finished_at,
67
+ success: @success
68
+ }
69
+ end
70
+
71
+ private
72
+
73
+ # Create a new JobLog and set its instance variables from a serialized hash
74
+ def self.build_from_serialized attrs
75
+ attrs.delete(:id)
76
+ new do |job_log|
77
+ attrs.each do |key, value|
78
+ setter = "#{key.to_s}=".to_sym
79
+ job_log.send(setter, value)
80
+ end
81
+ end
82
+ end
83
+ end
84
+ end
85
+ end