chronicle-etl 0.1.3 → 0.2.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (37) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +8 -0
  3. data/.yardopts +1 -0
  4. data/Gemfile.lock +15 -1
  5. data/README.md +62 -11
  6. data/chronicle-etl.gemspec +6 -1
  7. data/exe/chronicle-etl +2 -2
  8. data/lib/chronicle/etl.rb +9 -2
  9. data/lib/chronicle/etl/catalog.rb +68 -18
  10. data/lib/chronicle/etl/cli/connectors.rb +32 -0
  11. data/lib/chronicle/etl/cli/jobs.rb +116 -0
  12. data/lib/chronicle/etl/cli/main.rb +83 -0
  13. data/lib/chronicle/etl/cli/subcommand_base.rb +37 -0
  14. data/lib/chronicle/etl/config.rb +53 -0
  15. data/lib/chronicle/etl/exceptions.rb +17 -0
  16. data/lib/chronicle/etl/extractors/csv_extractor.rb +1 -1
  17. data/lib/chronicle/etl/extractors/extractor.rb +18 -5
  18. data/lib/chronicle/etl/extractors/file_extractor.rb +2 -2
  19. data/lib/chronicle/etl/extractors/stdin_extractor.rb +2 -2
  20. data/lib/chronicle/etl/job.rb +62 -0
  21. data/lib/chronicle/etl/job_definition.rb +51 -0
  22. data/lib/chronicle/etl/job_log.rb +79 -0
  23. data/lib/chronicle/etl/job_logger.rb +76 -0
  24. data/lib/chronicle/etl/loaders/csv_loader.rb +2 -2
  25. data/lib/chronicle/etl/loaders/loader.rb +13 -6
  26. data/lib/chronicle/etl/loaders/rest_loader.rb +30 -0
  27. data/lib/chronicle/etl/loaders/stdout_loader.rb +2 -2
  28. data/lib/chronicle/etl/loaders/table_loader.rb +6 -10
  29. data/lib/chronicle/etl/runner.rb +19 -51
  30. data/lib/chronicle/etl/transformers/json_transformer.rb +2 -2
  31. data/lib/chronicle/etl/transformers/null_transformer.rb +4 -4
  32. data/lib/chronicle/etl/transformers/transformer.rb +21 -4
  33. data/lib/chronicle/etl/utils/progress_bar.rb +1 -1
  34. data/lib/chronicle/etl/version.rb +2 -2
  35. metadata +85 -4
  36. data/CHANGELOG.md +0 -18
  37. data/lib/chronicle/etl/cli.rb +0 -48
@@ -0,0 +1,83 @@
1
+ require 'thor'
2
+ require 'chronicle/etl'
3
+ require 'colorize'
4
+
5
+ require 'chronicle/etl/cli/subcommand_base'
6
+ require 'chronicle/etl/cli/connectors'
7
+ require 'chronicle/etl/cli/jobs'
8
+
9
module Chronicle
  module ETL
    module CLI
      # Main entrypoint for CLI app. Registers the `connectors` and `jobs`
      # subcommands and renders the top-level help screen.
      class Main < Thor
        class_option "verbose", type: :boolean, default: false
        default_task "jobs"

        desc 'connectors:COMMAND', 'Connectors available for ETL jobs', hide: true
        subcommand 'connectors', Connectors

        desc 'jobs:COMMAND', 'Configure and run jobs', hide: true
        subcommand 'jobs', Jobs

        # Entrypoint for the CLI. Aborts with a hint when no arguments are given.
        #
        # @param given_args [Array<String>] command-line arguments; left unmodified
        # @param config [Hash] passed straight through to Thor's `start`
        def self.start(given_args = ARGV, config = {})
          if given_args.none?
            abort "No command entered or job specified. To see commands, run `chronicle-etl help`".red
          end

          # Takes a subcommand:command and splits it so Thor knows how to hand off
          # to the subcommand class. A new array is built instead of shifting and
          # unshifting the caller's array, which by default is ARGV itself.
          first, *rest = given_args
          given_args = first.split(':') + rest if first.include?(':')

          super(given_args, config)
        end

        # Displays help options for chronicle-etl
        #
        # @param meth [String, nil] command to show help for. When it is given and
        #   this class does not respond to it, help is delegated to the class Thor
        #   resolves for the `meth:meth` namespace; otherwise the general help
        #   screen is printed.
        # @param subcommand [Boolean] accepted for compatibility with Thor's
        #   `help` signature; not used here
        def help(meth = nil, subcommand = false)
          if meth && !respond_to?(meth)
            klass, task = Thor::Util.find_class_and_task_by_namespace("#{meth}:#{meth}")
            klass.start(['-h', task].compact, shell: shell)
          else
            shell.say "ABOUT".bold
            shell.say " #{'chronicle-etl'.italic} is a utility tool for #{'extracting'.underline}, #{'transforming'.underline}, and #{'loading'.underline} personal data."
            shell.say
            shell.say "USAGE".bold
            shell.say " $ chronicle-etl COMMAND"
            shell.say
            shell.say "EXAMPLES".bold
            shell.say " Show available connectors:".italic.light_black
            shell.say " $ chronicle-etl connectors:list"
            shell.say
            shell.say " Run a simple job:".italic.light_black
            shell.say " $ chronicle-etl jobs:start --extractor stdin --transformer null --loader stdout"
            shell.say
            shell.say " Show full job options:".italic.light_black
            shell.say " $ chronicle-etl jobs help start"

            # Collect the printable tasks of every Thor class in the CLI namespace
            list = Thor::Util.thor_classes_in(Chronicle::ETL::CLI).flat_map do |thor_class|
              thor_class.printable_tasks(false)
            end
            list.sort! { |a, b| a[0] <=> b[0] }
            list.unshift ["help", "# This help menu"]

            shell.say
            shell.say 'ALL COMMANDS'.bold
            shell.print_table(list, indent: 2, truncate: true)
            shell.say
            shell.say "VERSION".bold
            shell.say " #{Chronicle::ETL::VERSION}"
            shell.say
            shell.say "FULL DOCUMENTATION".bold
            shell.say " https://github.com/chronicle-app/chronicle-etl".blue
            shell.say
          end
        end
      end
    end
  end
end
@@ -0,0 +1,37 @@
1
module Chronicle
  module ETL
    module CLI
      # Shared parent for CLI subcommand classes. Overrides a few Thor class
      # methods so commands are displayed using command:subcommand syntax.
      class SubcommandBase < Thor
        # Prints the command table for a subcommand, followed by class options.
        #
        # @param shell [Object] Thor shell used for output
        # @param subcommand [Boolean] forwarded to `printable_commands`
        def self.help(shell, subcommand = false)
          rows = printable_commands(true, subcommand)
          rows += Thor::Util.thor_classes_in(self).flat_map do |klass|
            klass.printable_commands(false)
          end
          rows.sort! { |left, right| left[0] <=> right[0] }

          shell.say "COMMANDS".bold
          shell.print_table(rows, indent: 2, truncate: true)
          shell.say
          class_options_help(shell)
        end

        # Builds the usage banner for a command. `help` is joined to the prefix
        # with a space; every other command is joined with a colon.
        def self.banner(command, namespace = nil, subcommand = false)
          separator = command.name == 'help' ? ' ' : ':'
          "#{subcommand_prefix}#{separator}#{command.usage}"
        end

        # Derives the display name of the subcommand from the class name:
        # the namespace is stripped, the leading capital is lowercased, and each
        # remaining capital becomes a dash followed by its lowercase form.
        def self.subcommand_prefix
          unqualified = name.gsub(%r{.*::}, '')
          leading_lowered = unqualified.gsub(%r{^[A-Z]}) { |first| first[0].downcase }
          leading_lowered.gsub(%r{[A-Z]}) { |capital| "-#{capital[0].downcase}" }
        end
      end
    end
  end
end
@@ -0,0 +1,53 @@
1
+ require 'runcom'
2
+
3
module Chronicle
  module ETL
    # Utility methods to read, write, and access config files
    module Config
      module_function

      # Loads a yml config file
      #
      # @param path [String] config path handed to Runcom::Config
      # @return [Hash] the config contents with keys symbolized at every level
      def load(path)
        config = Runcom::Config.new(path)
        # FIXME: hack to deeply symbolize keys
        JSON.parse(config.to_h.to_json, symbolize_names: true)
      end

      # Writes a hash as a yml config file
      #
      # @param path [String] config path handed to Runcom::Config; the file
      #   written is the first entry of the config's `all` list plus `.yml`
      # @param data [Hash] contents, serialized with `to_yaml`
      def write(path, data)
        config = Runcom::Config.new(path)
        filename = "#{config.all[0]}.yml"
        File.open(filename, 'w') do |f|
          f << data.to_yaml
        end
      end

      # Returns all jobs available in ~/.config/chronicle/etl/jobs/*.yml
      #
      # @return [Array<String>] job names (file basenames without extension)
      def available_jobs
        job_directory = Runcom::Config.new('chronicle/etl/jobs').current
        # NOTE(review): presumably `current` is nil when no such directory
        # exists (confirm against Runcom); File.join would raise on nil.
        return [] unless job_directory

        Dir.glob(File.join(job_directory, "*.yml")).map do |filename|
          File.basename(filename, ".*")
        end
      end

      # Returns all credentials available in ~/.config/chronicle/etl/credentials/*.yml
      #
      # @return [Array<String>] credential names (file basenames without extension)
      def available_credentials
        credentials_directory = Runcom::Config.new('chronicle/etl/credentials').current
        # NOTE(review): same nil guard as for the jobs directory.
        return [] unless credentials_directory

        Dir.glob(File.join(credentials_directory, "*.yml")).map do |filename|
          File.basename(filename, ".*")
        end
      end

      # Load a job definition from job config directory
      #
      # @param job_name [String] name of the job config, without extension
      # @return [Hash] the loaded definition with `:name` set to `job_name`
      def load_job_from_config(job_name)
        definition = self.load("chronicle/etl/jobs/#{job_name}.yml")
        definition[:name] = job_name
        definition
      end

      # Load a set of credentials from the credentials config directory
      #
      # @param name [String] name of the credentials config, without extension
      # @return [Hash] the loaded credentials with symbolized keys
      def load_credentials(name)
        self.load("chronicle/etl/credentials/#{name}.yml")
      end
    end
  end
end
@@ -0,0 +1,17 @@
1
module Chronicle
  module ETL
    # Base class for errors raised by chronicle-etl
    class Error < StandardError; end

    # Raised when a connector cannot be found. Carries the provider and the
    # connector name that were being looked up, when known.
    class ConnectorNotAvailableError < Error
      attr_reader :name, :provider

      # @param message [String, nil] error message; when omitted, the exception
      #   falls back to Ruby's default message (the class name), so a bare
      #   `raise ProviderNotAvailableError` works
      # @param provider [Object, nil] provider that was requested
      # @param name [Object, nil] connector name that was requested
      def initialize(message = nil, provider: nil, name: nil)
        super(message)
        @provider = provider
        @name = name
      end
    end

    # Connector lookup failed at the provider level
    class ProviderNotAvailableError < ConnectorNotAvailableError; end

    # Connector lookup failed for a connector of a provider
    class ProviderConnectorNotAvailableError < ConnectorNotAvailableError; end
  end
end
@@ -1,5 +1,5 @@
1
1
  require 'csv'
2
- class Chronicle::Etl::CsvExtractor < Chronicle::Etl::Extractor
2
+ class Chronicle::ETL::CsvExtractor < Chronicle::ETL::Extractor
3
3
  DEFAULT_OPTIONS = {
4
4
  headers: true,
5
5
  filename: $stdin
@@ -1,21 +1,34 @@
1
1
  require 'chronicle/etl'
2
2
 
3
3
  module Chronicle
4
- module Etl
4
+ module ETL
5
+ # Abstract class representing an Extractor for an ETL job
5
6
  class Extractor
6
- extend Chronicle::Etl::Catalog
7
-
8
- ETL_PHASE = :extractor
7
+ extend Chronicle::ETL::Catalog
9
8
 
9
+ # Construct a new instance of this extractor. Options are passed in from a Runner
10
+ # == Parameters:
11
+ # options::
12
+ # Options for configuring this Extractor
10
13
  def initialize(options = {})
11
- @options = options.transform_keys!(&:to_sym)
14
+ @options = options.transform_keys!(&:to_sym)
15
+ handle_continuation
12
16
  end
13
17
 
18
+ # Entrypoint for this Extractor. Called by a Runner. Expects a series of records to be yielded
14
19
  def extract
15
20
  raise NotImplementedError
16
21
  end
17
22
 
23
+ # An optional method to calculate how many records there are to extract. Used primarily for
24
+ # building the progress bar
18
25
  def results_count; end
26
+
27
+ private
28
+
29
+ def handle_continuation
30
+ @options[:load_since] = @options[:continuation].highest_timestamp if @options[:continuation] && @options[:continuation].highest_timestamp
31
+ end
19
32
  end
20
33
  end
21
34
  end
@@ -1,8 +1,8 @@
1
1
  require 'pathname'
2
2
 
3
3
  module Chronicle
4
- module Etl
5
- class FileExtractor < Chronicle::Etl::Extractor
4
+ module ETL
5
+ class FileExtractor < Chronicle::ETL::Extractor
6
6
  def extract
7
7
  if file?
8
8
  extract_file do |data, metadata|
@@ -1,6 +1,6 @@
1
1
  module Chronicle
2
- module Etl
3
- class StdinExtractor < Chronicle::Etl::Extractor
2
+ module ETL
3
+ class StdinExtractor < Chronicle::ETL::Extractor
4
4
  def extract
5
5
  $stdin.read.each_line do |line|
6
6
  yield line
@@ -0,0 +1,62 @@
1
module Chronicle
  module ETL
    # A configured ETL job: the extractor, transformer, and loader classes
    # resolved from a job definition, together with the options for each.
    class Job
      attr_accessor :name,
                    :extractor_klass,
                    :extractor_options,
                    :transformer_klass,
                    :transformer_options,
                    :loader_klass,
                    :loader_options

      # TODO: build a proper id system
      alias id name

      # Builds a job from a definition and attaches the continuation for it.
      # Yields the new job when a block is given.
      #
      # @param definition [#definition] object whose `definition` returns a hash
      #   with `:name` and, for each phase, a hash holding `:name` and `:options`
      def initialize(definition)
        definition = definition.definition # FIXME
        @name = definition[:name]
        @extractor_klass = load_klass(:extractor, definition[:extractor][:name])
        @extractor_options = definition[:extractor][:options] || {}

        @transformer_klass = load_klass(:transformer, definition[:transformer][:name])
        @transformer_options = definition[:transformer][:options] || {}

        @loader_klass = load_klass(:loader, definition[:loader][:name])
        @loader_options = definition[:loader][:options] || {}

        set_continuation
        yield self if block_given?
      end

      # Instantiate the extractor with the extractor options
      def instantiate_extractor
        instantiate_klass(:extractor)
      end

      # Instantiate the transformer with the transformer options and the data
      # to transform
      def instantiate_transformer(data)
        instantiate_klass(:transformer, data)
      end

      # Instantiate the loader with the loader options
      def instantiate_loader
        instantiate_klass(:loader)
      end

      private

      # Instantiates the class for a phase, passing that phase's options first
      # and any extra arguments after them.
      def instantiate_klass(phase, *args)
        options = public_send("#{phase}_options")
        klass = public_send("#{phase}_klass")
        klass.new(options, *args)
      end

      # Resolves a connector class through the catalog
      def load_klass(phase, identifier)
        Chronicle::ETL::Catalog.phase_and_identifier_to_klass(phase, identifier)
      end

      # Looks up the latest log for this job and hands it to the extractor as
      # its `:continuation` option. The lookup uses `id`; the previous code read
      # `@job_id`, which this class never assigns, so it always passed nil.
      def set_continuation
        continuation = Chronicle::ETL::JobLogger.load_latest(id)
        @extractor_options[:continuation] = continuation
      end
    end
  end
end
@@ -0,0 +1,51 @@
1
+ require 'deep_merge'
2
+
3
module Chronicle
  module ETL
    # Holds the configuration hash describing a job: a name and options for
    # each of the extractor, transformer, and loader phases.
    class JobDefinition
      SKELETON_DEFINITION = {
        extractor: {
          name: nil,
          options: {}
        },
        transformer: {
          name: nil,
          options: {}
        },
        loader: {
          name: nil,
          options: {}
        }
      }.freeze

      attr_accessor :definition

      # Starts from a private deep copy of the skeleton. The constant is only
      # frozen at the top level, so sharing it would let writes to the nested
      # hashes leak into the constant and into every other definition.
      def initialize
        @definition = Marshal.load(Marshal.dump(SKELETON_DEFINITION))
      end

      # Add config hash to this definition
      #
      # @param config [Hash] job configuration; the current definition is
      #   deep-merged into it
      #   NOTE(review): with the deep_merge gem this presumably modifies
      #   `config` in place and keeps its existing values — confirm
      def add_config(config = {})
        @definition = config.deep_merge(@definition)
        load_credentials
        validate
      end

      private

      # For each phase whose options name a `:credentials` entry, loads those
      # credentials and deep-merges them into the phase's options.
      def load_credentials
        Chronicle::ETL::Catalog::PHASES.each do |phase|
          options = @definition[phase][:options]
          credentials_name = options[:credentials]
          next unless credentials_name

          credentials = Chronicle::ETL::Config.load_credentials(credentials_name)
          # Store the result so the merge takes effect whether or not
          # `deep_merge` modifies its receiver.
          @definition[phase][:options] = options.deep_merge(credentials)
        end
      end

      def validate
        true # TODO
      end
    end
  end
end
@@ -0,0 +1,79 @@
1
+ require 'pry'
2
+
3
module Chronicle
  module ETL
    # A record of what happened in the running of a job. We're interested in
    # tracking when it ran, if it was successful, and what the latest record
    # we found is (to use as a cursor for the next time)
    class JobLog
      # `job` has a custom writer below, so only its reader is generated here
      attr_reader :job
      attr_accessor :job_id,
                    :last_id,
                    :highest_timestamp,
                    :num_records_processed,
                    :started_at,
                    :finished_at,
                    :success

      # Create a new JobLog with no records processed and success unset.
      # Yields the new log when a block is given so it can be configured.
      def initialize
        @num_records_processed = 0
        @success = false
        yield self if block_given?
      end

      # Log the result of a single transformation in a job
      # @param transformer [Chronicle::ETL::Transformer] The transformer that ran
      def log_transformation(transformer)
        @last_id = transformer.id if transformer.id

        # Save the highest timestamp that we've encountered so far
        @highest_timestamp = [transformer.timestamp, @highest_timestamp].compact.max if transformer.timestamp

        # TODO: a transformer might yield nil. We might also want certain transformers to explode
        # records into multiple new ones. Therefore, this variable will need more subtle behaviour
        @num_records_processed += 1
      end

      # Indicate that a job has started
      def start
        @started_at = Time.now
      end

      # Indicate that a job has finished
      def finish
        @finished_at = Time.now
        @success = true
      end

      # Attach a job to this log and record its id
      # @param job [#id] the job this log belongs to
      def job=(job)
        @job = job
        @job_id = job.id
      end

      # Take a JobLog's instance variables and turn them into a hash representation
      # @return [Hash] every tracked attribute except the job object itself
      def serialize
        {
          job_id: @job_id,
          last_id: @last_id,
          highest_timestamp: @highest_timestamp,
          num_records_processed: @num_records_processed,
          started_at: @started_at,
          finished_at: @finished_at,
          success: @success
        }
      end

      # Create a new JobLog and set its instance variables from a serialized hash
      #
      # @param attrs [Hash] attribute names mapped to values; an `:id` entry is
      #   ignored. The hash is not modified.
      # @return [JobLog]
      def self.build_from_serialized(attrs)
        new do |job_log|
          attrs.each do |key, value|
            next if key == :id

            job_log.public_send("#{key}=", value)
          end
        end
      end
    end
  end
end