chronicle-etl 0.1.3 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +8 -0
- data/.yardopts +1 -0
- data/Gemfile.lock +15 -1
- data/README.md +62 -11
- data/chronicle-etl.gemspec +6 -1
- data/exe/chronicle-etl +2 -2
- data/lib/chronicle/etl.rb +9 -2
- data/lib/chronicle/etl/catalog.rb +68 -18
- data/lib/chronicle/etl/cli/connectors.rb +32 -0
- data/lib/chronicle/etl/cli/jobs.rb +116 -0
- data/lib/chronicle/etl/cli/main.rb +83 -0
- data/lib/chronicle/etl/cli/subcommand_base.rb +37 -0
- data/lib/chronicle/etl/config.rb +53 -0
- data/lib/chronicle/etl/exceptions.rb +17 -0
- data/lib/chronicle/etl/extractors/csv_extractor.rb +1 -1
- data/lib/chronicle/etl/extractors/extractor.rb +18 -5
- data/lib/chronicle/etl/extractors/file_extractor.rb +2 -2
- data/lib/chronicle/etl/extractors/stdin_extractor.rb +2 -2
- data/lib/chronicle/etl/job.rb +62 -0
- data/lib/chronicle/etl/job_definition.rb +51 -0
- data/lib/chronicle/etl/job_log.rb +79 -0
- data/lib/chronicle/etl/job_logger.rb +76 -0
- data/lib/chronicle/etl/loaders/csv_loader.rb +2 -2
- data/lib/chronicle/etl/loaders/loader.rb +13 -6
- data/lib/chronicle/etl/loaders/rest_loader.rb +30 -0
- data/lib/chronicle/etl/loaders/stdout_loader.rb +2 -2
- data/lib/chronicle/etl/loaders/table_loader.rb +6 -10
- data/lib/chronicle/etl/runner.rb +19 -51
- data/lib/chronicle/etl/transformers/json_transformer.rb +2 -2
- data/lib/chronicle/etl/transformers/null_transformer.rb +4 -4
- data/lib/chronicle/etl/transformers/transformer.rb +21 -4
- data/lib/chronicle/etl/utils/progress_bar.rb +1 -1
- data/lib/chronicle/etl/version.rb +2 -2
- metadata +85 -4
- data/CHANGELOG.md +0 -18
- data/lib/chronicle/etl/cli.rb +0 -48
@@ -0,0 +1,83 @@
|
|
1
|
+
require 'thor'
|
2
|
+
require 'chronicle/etl'
|
3
|
+
require 'colorize'
|
4
|
+
|
5
|
+
require 'chronicle/etl/cli/subcommand_base'
|
6
|
+
require 'chronicle/etl/cli/connectors'
|
7
|
+
require 'chronicle/etl/cli/jobs'
|
8
|
+
|
9
|
+
module Chronicle
|
10
|
+
module ETL
|
11
|
+
module CLI
|
12
|
+
# Main entrypoint for the CLI app.
#
# Commands that live in a subcommand class are addressed on the command line
# as `subcommand:command` (for example `jobs:start`). {Main.start} rewrites
# that form into the separate `subcommand command` arguments that Thor expects.
class Main < Thor
  class_option "verbose", type: :boolean, default: false
  default_task "jobs"

  desc 'connectors:COMMAND', 'Connectors available for ETL jobs', hide: true
  subcommand 'connectors', Connectors

  desc 'jobs:COMMAND', 'Configure and run jobs', hide: true
  subcommand 'jobs', Jobs

  # Entrypoint for the CLI.
  #
  # Aborts with a usage hint when no arguments are given. Otherwise, a leading
  # `subcommand:command` argument is split on ':' before handing off to Thor.
  #
  # @param given_args [Array<String>] command-line arguments (defaults to ARGV)
  # @param config [Hash] passed through unchanged to Thor's `start`
  def self.start(given_args = ARGV, config = {})
    if given_args.none?
      abort "No command entered or job specified. To see commands, run `chronicle-etl help`".red
    end

    # Split `subcommand:command` so Thor knows how to hand off to the
    # subcommand class. A new array is built rather than shifting/unshifting
    # in place, so the caller's array (ARGV by default) is left untouched.
    first, *rest = given_args
    given_args = first.split(':') + rest if first.include?(':')

    super(given_args, config)
  end

  # Displays help options for chronicle-etl.
  #
  # When +meth+ is given and is not something this instance responds to, help
  # is delegated to the Thor class found for the `meth:meth` namespace.
  # Otherwise the general overview (about, usage, examples, all commands,
  # version, documentation link) is printed through the Thor shell.
  #
  # @param meth [String, nil] name of the command or subcommand to show help for
  # @param subcommand [Boolean] accepted for compatibility with Thor's
  #   signature; not used by this method
  def help(meth = nil, subcommand = false)
    if meth && !respond_to?(meth)
      klass, task = Thor::Util.find_class_and_task_by_namespace("#{meth}:#{meth}")
      klass.start(['-h', task].compact, shell: shell)
    else
      shell.say "ABOUT".bold
      shell.say " #{'chronicle-etl'.italic} is a utility tool for #{'extracting'.underline}, #{'transforming'.underline}, and #{'loading'.underline} personal data."
      shell.say
      shell.say "USAGE".bold
      shell.say " $ chronicle-etl COMMAND"
      shell.say
      shell.say "EXAMPLES".bold
      shell.say " Show available connectors:".italic.light_black
      shell.say " $ chronicle-etl connectors:list"
      shell.say
      shell.say " Run a simple job:".italic.light_black
      shell.say " $ chronicle-etl jobs:start --extractor stdin --transformer null --loader stdout"
      shell.say
      shell.say " Show full job options:".italic.light_black
      shell.say " $ chronicle-etl jobs help start"

      # Collect the printable tasks of every Thor class in the CLI namespace,
      # sorted by their usage string, with the help entry pinned to the top.
      list = Thor::Util.thor_classes_in(Chronicle::ETL::CLI).flat_map do |thor_class|
        thor_class.printable_tasks(false)
      end
      list.sort! { |a, b| a[0] <=> b[0] }
      list.unshift ["help", "# This help menu"]

      shell.say
      shell.say 'ALL COMMANDS'.bold
      shell.print_table(list, indent: 2, truncate: true)
      shell.say
      shell.say "VERSION".bold
      shell.say " #{Chronicle::ETL::VERSION}"
      shell.say
      shell.say "FULL DOCUMENTATION".bold
      shell.say " https://github.com/chronicle-app/chronicle-etl".blue
      shell.say
    end
  end
end
|
81
|
+
end
|
82
|
+
end
|
83
|
+
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
module Chronicle
|
2
|
+
module ETL
|
3
|
+
module CLI
|
4
|
+
# Parent class for the CLI's subcommand classes. It overrides a few Thor class
# methods so that commands are displayed using the `command:subcommand` syntax.
class SubcommandBase < Thor
  class << self
    # Print usage instructions for a subcommand: a sorted table of this
    # class's commands (plus those of any nested Thor classes), followed by
    # the class-level options.
    def help(shell, subcommand = false)
      own_rows = printable_commands(true, subcommand)
      nested_rows = Thor::Util.thor_classes_in(self).flat_map do |klass|
        klass.printable_commands(false)
      end
      rows = (own_rows + nested_rows).sort { |x, y| x.first <=> y.first }

      shell.say "COMMANDS".bold
      shell.print_table(rows, indent: 2, truncate: true)
      shell.say
      class_options_help(shell)
    end

    # Build the usage banner for a command using the command:subcommand
    # pattern. The `help` command is joined with a space instead of a colon.
    def banner(command, namespace = nil, subcommand = false)
      separator = command.name == 'help' ? ' ' : ':'
      "#{subcommand_prefix}#{separator}#{command.usage}"
    end

    # Derive the display name of the subcommand from the class name: the
    # namespace is stripped, the first capital is lowercased and every other
    # capital becomes "-" followed by its lowercase form.
    def subcommand_prefix
      base_name = name.gsub(%r{.*::}, '')
      lower_first = base_name.gsub(%r{^[A-Z]}) { |match| match[0].downcase }
      lower_first.gsub(%r{[A-Z]}) { |match| "-#{match[0].downcase}" }
    end
  end
end
|
35
|
+
end
|
36
|
+
end
|
37
|
+
end
|
@@ -0,0 +1,53 @@
|
|
1
|
+
require 'runcom'
|
2
|
+
|
3
|
+
module Chronicle
|
4
|
+
module ETL
|
5
|
+
# Utility methods to read, write, and access config files.
#
# Paths are handed to Runcom::Config, which resolves where the files live
# (presumably under ~/.config, per the method comments below — confirm against
# Runcom's documentation).
module Config
  module_function

  # Loads a yml config file.
  #
  # @param path [String] config path passed to Runcom::Config
  # @return [Hash] the config contents with all keys symbolized
  def load(path)
    config = Runcom::Config.new(path)
    # FIXME: hack to deeply symbolize keys (round-trip through JSON)
    JSON.parse(config.to_h.to_json, symbolize_names: true)
  end

  # Writes a hash as a yml config file.
  #
  # The target filename is the first entry of Runcom's `all` list for +path+
  # with '.yml' appended.
  #
  # @param path [String] config path passed to Runcom::Config
  # @param data [Hash] data to serialize with `to_yaml`
  def write(path, data)
    config = Runcom::Config.new(path)
    filename = config.all[0].to_s + '.yml'
    File.open(filename, 'w') do |f|
      f << data.to_yaml
    end
  end

  # Returns all jobs available in ~/.config/chronicle/etl/jobs/*.yml
  #
  # @return [Array<String>] job names (file basenames without extension)
  def available_jobs
    job_directory = Runcom::Config.new('chronicle/etl/jobs').current
    Dir.glob(File.join(job_directory, "*.yml")).map do |filename|
      File.basename(filename, ".*")
    end
  end

  # Returns all credentials available in ~/.config/chronicle/etl/credentials/*.yml
  #
  # @return [Array<String>] credential names (file basenames without extension)
  def available_credentials
    credentials_directory = Runcom::Config.new('chronicle/etl/credentials').current
    Dir.glob(File.join(credentials_directory, "*.yml")).map do |filename|
      File.basename(filename, ".*")
    end
  end

  # Load a job definition from the job config directory and tag it with its name.
  #
  # @param job_name [String] name of the job (filename without extension)
  # @return [Hash] the job definition, with :name set to +job_name+
  def load_job_from_config(job_name)
    # Explicit receiver makes it obvious this is Config.load, not Kernel#load
    definition = self.load("chronicle/etl/jobs/#{job_name}.yml")
    definition[:name] = job_name
    definition
  end

  # Load a named set of credentials from the credentials config directory.
  #
  # @param name [String] name of the credentials file (without extension)
  # @return [Hash] the credentials with symbolized keys
  def load_credentials(name)
    self.load("chronicle/etl/credentials/#{name}.yml")
  end
end
|
52
|
+
end
|
53
|
+
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
module Chronicle
  module ETL
    # Base class for errors defined by chronicle-etl, so callers can rescue
    # everything in this hierarchy with a single class.
    class Error < StandardError; end

    # Error carrying the provider and connector name it relates to.
    class ConnectorNotAvailableError < Error
      attr_reader :name, :provider

      # @param message [String, nil] error message; optional so that a bare
      #   `raise ConnectorNotAvailableError` works (the message then falls
      #   back to the class name, as with any Ruby exception)
      # @param provider [Object, nil] provider the error relates to
      # @param name [Object, nil] connector name the error relates to
      def initialize(message = nil, provider: nil, name: nil)
        super(message)
        @provider = provider
        @name = name
      end
    end

    # Specialisations of ConnectorNotAvailableError (presumably for a missing
    # provider vs. a missing connector within a provider — confirm at raise sites).
    class ProviderNotAvailableError < ConnectorNotAvailableError; end
    class ProviderConnectorNotAvailableError < ConnectorNotAvailableError; end
  end
end
|
@@ -1,21 +1,34 @@
|
|
1
1
|
require 'chronicle/etl'
|
2
2
|
|
3
3
|
module Chronicle
|
4
|
-
module
|
4
|
+
module ETL
|
5
|
+
# Abstract class representing an Extractor for an ETL job
|
5
6
|
class Extractor
|
6
|
-
extend Chronicle::
|
7
|
-
|
8
|
-
ETL_PHASE = :extractor
|
7
|
+
extend Chronicle::ETL::Catalog
|
9
8
|
|
9
|
+
# Build a new instance of this extractor. Per the original note, options are
# passed in from a Runner.
# == Parameters:
# options::
#   Options for configuring this Extractor. Its keys are converted to symbols
#   in place, so the hash passed in is modified.
def initialize(options = {})
  options.transform_keys!(&:to_sym)
  @options = options
  handle_continuation
end
|
13
17
|
|
18
|
+
# Entrypoint for this Extractor, called by a Runner. Implementations are
# expected to yield a series of records; this abstract version always raises.
def extract
  raise(NotImplementedError)
end
|
17
22
|
|
23
|
+
# Optional hook reporting how many records there are to extract; used
# primarily for building the progress bar. This default implementation
# returns nil.
def results_count
  nil
end
|
26
|
+
|
27
|
+
private
|
28
|
+
|
29
|
+
# When a :continuation option is present and reports a highest_timestamp,
# copy that timestamp into the :load_since option.
def handle_continuation
  continuation = @options[:continuation]
  return unless continuation

  timestamp = continuation.highest_timestamp
  @options[:load_since] = timestamp if timestamp
end
|
19
32
|
end
|
20
33
|
end
|
21
34
|
end
|
@@ -0,0 +1,62 @@
|
|
1
|
+
module Chronicle
|
2
|
+
module ETL
|
3
|
+
# A configured ETL job: for each phase (extractor, transformer, loader) it
# holds the class to instantiate and the options to instantiate it with.
class Job
  attr_accessor :name,
                :extractor_klass,
                :extractor_options,
                :transformer_klass,
                :transformer_options,
                :loader_klass,
                :loader_options

  # TODO: build a proper id system
  alias id name

  # Build a job from a job definition.
  #
  # @param definition [#definition] object whose `definition` method returns
  #   a hash with :name and, for each of :extractor, :transformer and
  #   :loader, a hash with :name and optional :options
  # @yield [job] the new job, once fully configured (if a block is given)
  def initialize(definition)
    definition = definition.definition # FIXME
    @name = definition[:name]
    @extractor_klass = load_klass(:extractor, definition[:extractor][:name])
    @extractor_options = definition[:extractor][:options] || {}

    @transformer_klass = load_klass(:transformer, definition[:transformer][:name])
    @transformer_options = definition[:transformer][:options] || {}

    @loader_klass = load_klass(:loader, definition[:loader][:name])
    @loader_options = definition[:loader][:options] || {}

    set_continuation
    yield self if block_given?
  end

  # @return [Object] a new extractor built with the extractor options
  def instantiate_extractor
    instantiate_klass(:extractor)
  end

  # @param data [Object] passed to the transformer after its options
  # @return [Object] a new transformer built with the transformer options and +data+
  def instantiate_transformer(data)
    instantiate_klass(:transformer, data)
  end

  # @return [Object] a new loader built with the loader options
  def instantiate_loader
    instantiate_klass(:loader)
  end

  private

  # Instantiate the class configured for +phase+, passing that phase's
  # options first, followed by any extra arguments.
  def instantiate_klass(phase, *args)
    options = public_send("#{phase}_options")
    klass = public_send("#{phase}_klass")
    klass.new(options, *args)
  end

  # Resolve a phase/identifier pair to a connector class via the Catalog.
  def load_klass(phase, identifier)
    Chronicle::ETL::Catalog.phase_and_identifier_to_klass(phase, identifier)
  end

  # Look up the latest log for this job and hand it to the extractor as its
  # :continuation option. The lookup uses this job's id; the previous
  # `@job_id` was never assigned in this class, so it was always nil.
  def set_continuation
    continuation = Chronicle::ETL::JobLogger.load_latest(id)
    @extractor_options[:continuation] = continuation
  end
end
|
61
|
+
end
|
62
|
+
end
|
@@ -0,0 +1,51 @@
|
|
1
|
+
require 'deep_merge'
|
2
|
+
|
3
|
+
module Chronicle
|
4
|
+
module ETL
|
5
|
+
# Holds the definition hash for a job: for each phase (extractor,
# transformer, loader) a connector name and an options hash.
class JobDefinition
  # Template every definition starts from. Only the outer hash is frozen;
  # instances work on their own copy (see #initialize).
  SKELETON_DEFINITION = {
    extractor: {
      name: nil,
      options: {}
    },
    transformer: {
      name: nil,
      options: {}
    },
    loader: {
      name: nil,
      options: {}
    }
  }.freeze

  attr_accessor :definition

  def initialize
    # Copy the skeleton down to the options hashes so that changes made to one
    # definition (e.g. merged credentials) cannot leak into the shared
    # constant or into other JobDefinition instances.
    @definition = SKELETON_DEFINITION.transform_values do |phase|
      phase.merge(options: phase[:options].dup)
    end
  end

  # Add config hash to this definition.
  #
  # The current definition is merged into +config+ via Hash#deep_merge
  # (provided by the deep_merge gem required by this file), then credentials
  # are loaded and the result validated.
  # NOTE(review): with the deep_merge gem, `deep_merge` appears to merge into
  # the receiver and keep the receiver's existing values — confirm.
  #
  # @param config [Hash] definition values, keyed like SKELETON_DEFINITION
  # @return [true] result of validation
  def add_config(config = {})
    @definition = config.deep_merge(@definition)
    load_credentials
    validate
  end

  private

  # For each phase whose options name a :credentials entry, load those
  # credentials from config and merge them into the phase's options.
  def load_credentials
    Chronicle::ETL::Catalog::PHASES.each do |phase|
      options = @definition[phase][:options]
      credentials_name = options[:credentials]
      next unless credentials_name

      credentials = Chronicle::ETL::Config.load_credentials(credentials_name)
      # Assign the merge result explicitly instead of relying on deep_merge
      # mutating its receiver.
      @definition[phase][:options] = options.deep_merge(credentials)
    end
  end

  def validate
    true # TODO
  end
end
|
50
|
+
end
|
51
|
+
end
|
@@ -0,0 +1,79 @@
|
|
1
|
+
require 'pry'
|
2
|
+
|
3
|
+
module Chronicle
|
4
|
+
module ETL
|
5
|
+
# A record of what happened in the running of a job. We're interested in
# tracking when it ran, if it was successful, and what the latest record
# we found is (to use as a cursor for the next time)
class JobLog
  # `job` has a custom writer below, so only its reader is generated here.
  attr_reader :job
  attr_accessor :job_id,
                :last_id,
                :highest_timestamp,
                :num_records_processed,
                :started_at,
                :finished_at,
                :success

  # Create a new JobLog with zero processed records and success set to false.
  #
  # @yield [job_log] the new log, for further setup (if a block is given)
  def initialize
    @num_records_processed = 0
    @success = false
    yield self if block_given?
  end

  # Log the result of a single transformation in a job
  # @param transformer [Chronicle::ETL::Transformer] The transformer that ran
  def log_transformation(transformer)
    @last_id = transformer.id if transformer.id

    # Save the highest timestamp that we've encountered so far
    @highest_timestamp = [transformer.timestamp, @highest_timestamp].compact.max if transformer.timestamp

    # TODO: a transformer might yield nil. We might also want certain transformers to explode
    # records into multiple new ones. Therefore, this variable will need more subtle behaviour
    @num_records_processed += 1
  end

  # Indicate that a job has started
  def start
    @started_at = Time.now
  end

  # Indicate that a job has finished. Also marks the log as successful.
  def finish
    @finished_at = Time.now
    @success = true
  end

  # Attach a job to this log and record its id.
  #
  # @param job [#id] the job this log belongs to
  def job=(job)
    @job = job
    @job_id = job.id
  end

  # Take a JobLog's instance variables and turn them into a hash representation
  #
  # @return [Hash] the log's fields keyed by symbol (the job object itself is
  #   not included, only its id)
  def serialize
    {
      job_id: @job_id,
      last_id: @last_id,
      highest_timestamp: @highest_timestamp,
      num_records_processed: @num_records_processed,
      started_at: @started_at,
      finished_at: @finished_at,
      success: @success
    }
  end

  # Create a new JobLog and set its instance variables from a serialized hash.
  #
  # Every key except :id is assigned through the matching writer. The hash
  # passed in is left unmodified.
  #
  # @param attrs [Hash{Symbol => Object}] serialized fields
  # @return [JobLog]
  def self.build_from_serialized(attrs)
    fields = attrs.reject { |key, _value| key == :id }
    new do |job_log|
      fields.each do |key, value|
        job_log.send("#{key}=", value)
      end
    end
  end
end
|
78
|
+
end
|
79
|
+
end
|