chronicle-etl 0.1.3 → 0.2.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rubocop.yml +8 -0
- data/.yardopts +1 -0
- data/Gemfile.lock +15 -1
- data/README.md +62 -11
- data/chronicle-etl.gemspec +6 -1
- data/exe/chronicle-etl +2 -2
- data/lib/chronicle/etl.rb +9 -2
- data/lib/chronicle/etl/catalog.rb +68 -18
- data/lib/chronicle/etl/cli/connectors.rb +32 -0
- data/lib/chronicle/etl/cli/jobs.rb +116 -0
- data/lib/chronicle/etl/cli/main.rb +83 -0
- data/lib/chronicle/etl/cli/subcommand_base.rb +37 -0
- data/lib/chronicle/etl/config.rb +53 -0
- data/lib/chronicle/etl/exceptions.rb +17 -0
- data/lib/chronicle/etl/extractors/csv_extractor.rb +1 -1
- data/lib/chronicle/etl/extractors/extractor.rb +18 -5
- data/lib/chronicle/etl/extractors/file_extractor.rb +2 -2
- data/lib/chronicle/etl/extractors/stdin_extractor.rb +2 -2
- data/lib/chronicle/etl/job.rb +62 -0
- data/lib/chronicle/etl/job_definition.rb +51 -0
- data/lib/chronicle/etl/job_log.rb +79 -0
- data/lib/chronicle/etl/job_logger.rb +76 -0
- data/lib/chronicle/etl/loaders/csv_loader.rb +2 -2
- data/lib/chronicle/etl/loaders/loader.rb +13 -6
- data/lib/chronicle/etl/loaders/rest_loader.rb +30 -0
- data/lib/chronicle/etl/loaders/stdout_loader.rb +2 -2
- data/lib/chronicle/etl/loaders/table_loader.rb +6 -10
- data/lib/chronicle/etl/runner.rb +19 -51
- data/lib/chronicle/etl/transformers/json_transformer.rb +2 -2
- data/lib/chronicle/etl/transformers/null_transformer.rb +4 -4
- data/lib/chronicle/etl/transformers/transformer.rb +21 -4
- data/lib/chronicle/etl/utils/progress_bar.rb +1 -1
- data/lib/chronicle/etl/version.rb +2 -2
- metadata +85 -4
- data/CHANGELOG.md +0 -18
- data/lib/chronicle/etl/cli.rb +0 -48
@@ -0,0 +1,83 @@
|
|
1
|
+
require 'thor'
|
2
|
+
require 'chronicle/etl'
|
3
|
+
require 'colorize'
|
4
|
+
|
5
|
+
require 'chronicle/etl/cli/subcommand_base'
|
6
|
+
require 'chronicle/etl/cli/connectors'
|
7
|
+
require 'chronicle/etl/cli/jobs'
|
8
|
+
|
9
|
+
module Chronicle
|
10
|
+
module ETL
|
11
|
+
module CLI
|
12
|
+
# Main entrypoint for CLI app.
#
# Registers the `connectors` and `jobs` subcommands and lets users address
# them with `subcommand:command` syntax (e.g. `jobs:start`).
class Main < Thor
  class_option "verbose", type: :boolean, default: false
  default_task "jobs"

  desc 'connectors:COMMAND', 'Connectors available for ETL jobs', hide: true
  subcommand 'connectors', Connectors

  desc 'jobs:COMMAND', 'Configure and run jobs', hide: true
  subcommand 'jobs', Jobs

  # Entrypoint for the CLI
  #
  # Aborts with a usage hint when no arguments are given. Otherwise a leading
  # `subcommand:command` argument is split on ':' so Thor knows how to hand
  # off to the subcommand class.
  #
  # @param given_args [Array<String>] command-line arguments (defaults to ARGV)
  # @param config [Hash] passed through unchanged to Thor's `start`
  def self.start(given_args = ARGV, config = {})
    if given_args.none?
      abort "No command entered or job specified. To see commands, run `chronicle-etl help`".red
    end

    # Build a new argument list rather than shifting/unshifting the caller's
    # array: `given_args` is ARGV by default and must not be left modified.
    if given_args[0].include?(':')
      given_args = given_args[0].split(':') + given_args.drop(1)
    end

    super(given_args, config)
  end

  # Displays help options for chronicle-etl
  #
  # When `meth` names something this class does not respond to, help is
  # delegated to the Thor class found for the "meth:meth" namespace.
  # Otherwise the full overview (about, usage, examples, all commands,
  # version, documentation link) is printed to the shell.
  #
  # @param meth [String, nil] command to show help for
  # @param subcommand [Boolean] unused here; kept for Thor's `help` signature
  def help(meth = nil, subcommand = false)
    if meth && !respond_to?(meth)
      klass, task = Thor::Util.find_class_and_task_by_namespace("#{meth}:#{meth}")
      klass.start(['-h', task].compact, shell: shell)
    else
      shell.say "ABOUT".bold
      shell.say " #{'chronicle-etl'.italic} is a utility tool for #{'extracting'.underline}, #{'transforming'.underline}, and #{'loading'.underline} personal data."
      shell.say
      shell.say "USAGE".bold
      shell.say " $ chronicle-etl COMMAND"
      shell.say
      shell.say "EXAMPLES".bold
      shell.say " Show available connectors:".italic.light_black
      shell.say " $ chronicle-etl connectors:list"
      shell.say
      shell.say " Run a simple job:".italic.light_black
      shell.say " $ chronicle-etl jobs:start --extractor stdin --transformer null --loader stdout"
      shell.say
      shell.say " Show full job options:".italic.light_black
      shell.say " $ chronicle-etl jobs help start"

      # Collect the printable tasks of every Thor class in the CLI namespace
      list = Thor::Util.thor_classes_in(Chronicle::ETL::CLI).flat_map do |thor_class|
        thor_class.printable_tasks(false)
      end
      list.sort! { |a, b| a[0] <=> b[0] }
      list.unshift ["help", "# This help menu"]

      shell.say
      shell.say 'ALL COMMANDS'.bold
      shell.print_table(list, indent: 2, truncate: true)
      shell.say
      shell.say "VERSION".bold
      shell.say " #{Chronicle::ETL::VERSION}"
      shell.say
      shell.say "FULL DOCUMENTATION".bold
      shell.say " https://github.com/chronicle-app/chronicle-etl".blue
      shell.say
    end
  end
end
|
81
|
+
end
|
82
|
+
end
|
83
|
+
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
module Chronicle
|
2
|
+
module ETL
|
3
|
+
module CLI
|
4
|
+
# Shared parent for CLI subcommand classes. Replaces Thor's help and banner
# output so commands are displayed with command:subcommand syntax
class SubcommandBase < Thor
  class << self
    # Print usage instructions for a subcommand: a sorted table of this
    # class's commands (plus those of nested Thor classes) followed by the
    # class options help
    def help(shell, subcommand = false)
      commands = printable_commands(true, subcommand)
      Thor::Util.thor_classes_in(self).each do |nested|
        commands += nested.printable_commands(false)
      end
      commands.sort! { |left, right| left[0] <=> right[0] }

      shell.say "COMMANDS".bold
      shell.print_table(commands, indent: 2, truncate: true)
      shell.say
      class_options_help(shell)
    end

    # Show docs with command:subcommand pattern.
    # The `help` command is joined with a space instead of a colon
    def banner(command, namespace = nil, subcommand = false)
      separator = command.name == 'help' ? ' ' : ':'
      "#{subcommand_prefix}#{separator}#{command.usage}"
    end

    # Derive the display name for a subcommand from its class name: strip
    # the namespace, lowercase the leading capital, and turn every other
    # capital into "-" plus its lowercase form
    def subcommand_prefix
      demodulized = name.gsub(%r{.*::}, '')
      lower_first = demodulized.gsub(%r{^[A-Z]}) { |initial| initial[0].downcase }
      lower_first.gsub(%r{[A-Z]}) { |capital| "-#{capital[0].downcase}" }
    end
  end
end
|
35
|
+
end
|
36
|
+
end
|
37
|
+
end
|
@@ -0,0 +1,53 @@
|
|
1
|
+
require 'runcom'
|
2
|
+
|
3
|
+
module Chronicle
|
4
|
+
module ETL
|
5
|
+
# Utility methods to read, write, and access config files
module Config
  module_function

  # Loads a yml config file
  #
  # @param path [String] config path handed to Runcom::Config
  # @return [Hash] the config contents with keys symbolized at every level
  def load(path)
    config = Runcom::Config.new(path)
    # FIXME: hack to deeply symbolize keys
    JSON.parse(config.to_h.to_json, symbolize_names: true)
  end

  # Writes a hash as a yml config file
  #
  # The target filename is the first entry of Runcom's `all` list for `path`
  # with ".yml" appended.
  #
  # @param path [String] config path handed to Runcom::Config
  # @param data [Hash] data serialized with `to_yaml`
  def write(path, data)
    config = Runcom::Config.new(path)
    filename = "#{config.all[0]}.yml"
    File.open(filename, 'w') do |f|
      f << data.to_yaml
    end
  end

  # Returns all jobs available in ~/.config/chronicle/etl/jobs/*.yml
  #
  # @return [Array<String>] job file names without their extension
  def available_jobs
    job_directory = Runcom::Config.new('chronicle/etl/jobs').current
    Dir.glob(File.join(job_directory, "*.yml")).map do |filename|
      File.basename(filename, ".*")
    end
  end

  # Returns all credentials available in ~/.config/chronicle/etl/credentials/*.yml
  #
  # @return [Array<String>] credentials file names without their extension
  def available_credentials
    credentials_directory = Runcom::Config.new('chronicle/etl/credentials').current
    Dir.glob(File.join(credentials_directory, "*.yml")).map do |filename|
      File.basename(filename, ".*")
    end
  end

  # Load a job definition from job config directory
  #
  # @param job_name [String] name of the job file, without extension
  # @return [Hash] the loaded definition with :name set to `job_name`
  def load_job_from_config(job_name)
    definition = self.load("chronicle/etl/jobs/#{job_name}.yml")
    definition[:name] = job_name
    definition
  end

  # Load a set of credentials from the credentials config directory
  #
  # @param name [String] name of the credentials file, without extension
  # @return [Hash] the loaded credentials with symbolized keys
  def load_credentials(name)
    self.load("chronicle/etl/credentials/#{name}.yml")
  end
end
|
52
|
+
end
|
53
|
+
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
module Chronicle
|
2
|
+
module ETL
|
3
|
+
# Root of the error hierarchy defined in this file
class Error < StandardError
end

# Error that additionally records the `provider` and `name` keyword
# arguments it was created with (both default to nil)
class ConnectorNotAvailableError < Error
  attr_reader :name, :provider

  def initialize(message, provider: nil, name: nil)
    @provider = provider
    @name = name
    super(message)
  end
end

# Specialization of ConnectorNotAvailableError; adds no behaviour
class ProviderNotAvailableError < ConnectorNotAvailableError
end

# Specialization of ConnectorNotAvailableError; adds no behaviour
class ProviderConnectorNotAvailableError < ConnectorNotAvailableError
end
|
16
|
+
end
|
17
|
+
end
|
@@ -1,21 +1,34 @@
|
|
1
1
|
require 'chronicle/etl'
|
2
2
|
|
3
3
|
module Chronicle
|
4
|
-
module
|
4
|
+
module ETL
|
5
|
+
# Abstract base for the extraction phase of an ETL job
class Extractor
  extend Chronicle::ETL::Catalog

  # Build an extractor instance. Options are passed in from a Runner
  # == Parameters:
  # options::
  #   Options for configuring this Extractor. Keys are converted to symbols
  #   in place, so the hash passed in is the hash stored on the instance
  def initialize(options = {})
    options.transform_keys!(&:to_sym)
    @options = options
    handle_continuation
  end

  # Entrypoint for this Extractor. Called by a Runner. Implementations are
  # expected to yield a series of records; the base version always raises
  def extract
    raise NotImplementedError
  end

  # Optional hook reporting how many records there are to extract. Used
  # primarily for building the progress bar. Returns nil unless overridden
  def results_count; end

  private

  # Copy the continuation's highest timestamp into the :load_since option
  # when a continuation is present and has one
  def handle_continuation
    continuation = @options[:continuation]
    return unless continuation

    timestamp = continuation.highest_timestamp
    @options[:load_since] = timestamp if timestamp
  end
end
|
20
33
|
end
|
21
34
|
end
|
@@ -0,0 +1,62 @@
|
|
1
|
+
module Chronicle
|
2
|
+
module ETL
|
3
|
+
# A configured ETL job: the extractor, transformer and loader classes to
# use, plus the options each one is instantiated with.
class Job
  attr_accessor :name,
                :extractor_klass,
                :extractor_options,
                :transformer_klass,
                :transformer_options,
                :loader_klass,
                :loader_options

  # TODO: build a proper id system
  alias id name

  # Build a job from a definition object.
  #
  # @param definition [#definition] object whose `definition` hash has
  #   :name plus :extractor, :transformer and :loader entries, each with a
  #   :name and optional :options
  # @yield [self] the new job, for further configuration
  def initialize(definition)
    definition = definition.definition # FIXME
    @name = definition[:name]
    @extractor_klass = load_klass(:extractor, definition[:extractor][:name])
    @extractor_options = definition[:extractor][:options] || {}

    @transformer_klass = load_klass(:transformer, definition[:transformer][:name])
    @transformer_options = definition[:transformer][:options] || {}

    @loader_klass = load_klass(:loader, definition[:loader][:name])
    @loader_options = definition[:loader][:options] || {}

    set_continuation
    yield self if block_given?
  end

  # @return [Object] new extractor built with the extractor options
  def instantiate_extractor
    instantiate_klass(:extractor)
  end

  # @param data [Object] passed to the transformer after its options
  # @return [Object] new transformer built with the transformer options
  def instantiate_transformer(data)
    instantiate_klass(:transformer, data)
  end

  # @return [Object] new loader built with the loader options
  def instantiate_loader
    instantiate_klass(:loader)
  end

  private

  # Instantiate the class configured for `phase`, passing that phase's
  # options first and any extra `args` after them.
  def instantiate_klass(phase, *args)
    options = public_send("#{phase}_options")
    klass = public_send("#{phase}_klass")
    klass.new(options, *args)
  end

  # Resolve a connector class through the catalog.
  def load_klass(phase, identifier)
    Chronicle::ETL::Catalog.phase_and_identifier_to_klass(phase, identifier)
  end

  # Look up the latest log for this job and pass it to the extractor as its
  # :continuation option. The lookup uses `id`; the previous `@job_id` was
  # never assigned in this class, so the lookup always received nil.
  def set_continuation
    continuation = Chronicle::ETL::JobLogger.load_latest(id)
    @extractor_options[:continuation] = continuation
  end
end
|
61
|
+
end
|
62
|
+
end
|
@@ -0,0 +1,51 @@
|
|
1
|
+
require 'deep_merge'
|
2
|
+
|
3
|
+
module Chronicle
|
4
|
+
module ETL
|
5
|
+
# Holds the configuration hash describing a job: one entry per phase
# (extractor, transformer, loader), each with a name and options.
class JobDefinition
  SKELETON_DEFINITION = {
    extractor: {
      name: nil,
      options: {}
    },
    transformer: {
      name: nil,
      options: {}
    },
    loader: {
      name: nil,
      options: {}
    }
  }.freeze

  attr_accessor :definition

  # Start from a private deep copy of the skeleton. `freeze` on the constant
  # is shallow, so sharing it would let one definition's edits to the nested
  # hashes leak into the constant and every later definition.
  def initialize
    @definition = Marshal.load(Marshal.dump(SKELETON_DEFINITION))
  end

  # Add config hash to this definition
  #
  # @param config [Hash] configuration to combine with the current definition
  #   NOTE(review): the deep_merge gem's Hash#deep_merge appears to merge
  #   into its receiver, which would modify `config` - confirm
  def add_config(config = {})
    @definition = config.deep_merge(@definition)
    load_credentials
    validate
  end

  private

  # For each phase whose options name a :credentials entry, load those
  # credentials and merge them into the phase's options.
  def load_credentials
    Chronicle::ETL::Catalog::PHASES.each do |phase|
      options = @definition[phase][:options]
      credentials_name = options[:credentials]
      next unless credentials_name

      credentials = Chronicle::ETL::Config.load_credentials(credentials_name)
      # Store the result so the merge is kept whether or not deep_merge
      # modifies its receiver.
      @definition[phase][:options] = options.deep_merge(credentials)
    end
  end

  # TODO: validation is not implemented yet; every definition passes
  def validate
    true
  end
end
|
50
|
+
end
|
51
|
+
end
|
@@ -0,0 +1,79 @@
|
|
1
|
+
require 'pry'
|
2
|
+
|
3
|
+
module Chronicle
|
4
|
+
module ETL
|
5
|
+
# A record of what happened in the running of a job. We're interested in
# tracking when it ran, if it was successful, and what the latest record
# we found is (to use as a cursor for the next time)
class JobLog
  # `job` has a custom writer below, so only its reader is generated here
  attr_reader :job
  attr_accessor :job_id,
                :last_id,
                :highest_timestamp,
                :num_records_processed,
                :started_at,
                :finished_at,
                :success

  # Create a new JobLog with no records processed and success set to false
  # @yield [self] the new log, for further configuration
  def initialize
    @num_records_processed = 0
    @success = false
    yield self if block_given?
  end

  # Log the result of a single transformation in a job
  # @param transformer [Chronicle::ETL::Transformer] The transformer that ran
  def log_transformation(transformer)
    @last_id = transformer.id if transformer.id

    # Save the highest timestamp that we've encountered so far
    @highest_timestamp = [transformer.timestamp, @highest_timestamp].compact.max if transformer.timestamp

    # TODO: a transformer might yield nil. We might also want certain transformers to explode
    # records into multiple new ones. Therefore, this variable will need more subtle behaviour
    @num_records_processed += 1
  end

  # Indicate that a job has started
  def start
    @started_at = Time.now
  end

  # Indicate that a job has finished; also marks the log as successful
  def finish
    @finished_at = Time.now
    @success = true
  end

  # Attach a job to this log and record its id as job_id
  # @param job [#id] the job this log belongs to
  def job=(job)
    @job = job
    @job_id = job.id
  end

  # Take a JobLog's instance variables and turn them into a hash representation
  # @return [Hash] every tracked attribute except `job`
  def serialize
    {
      job_id: @job_id,
      last_id: @last_id,
      highest_timestamp: @highest_timestamp,
      num_records_processed: @num_records_processed,
      started_at: @started_at,
      finished_at: @finished_at,
      success: @success
    }
  end

  # Create a new JobLog and set its instance variables from a serialized hash
  #
  # An `id` entry is skipped because JobLog has no `id=` setter. The hash
  # passed in is left untouched, so frozen hashes are accepted.
  #
  # @param attrs [Hash] attribute names mapped to values
  # @return [JobLog]
  def self.build_from_serialized(attrs)
    new do |job_log|
      attrs.each do |key, value|
        next if key.to_s == 'id'

        job_log.public_send("#{key}=", value)
      end
    end
  end
end
|
78
|
+
end
|
79
|
+
end
|