chronicle-etl 0.1.0 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rubocop.yml +8 -0
- data/.ruby-version +1 -0
- data/.yardopts +1 -0
- data/CHANGELOG.md +23 -0
- data/Gemfile.lock +42 -10
- data/README.md +64 -11
- data/bin/console +16 -4
- data/chronicle-etl.gemspec +9 -7
- data/exe/chronicle-etl +2 -2
- data/lib/chronicle/etl.rb +5 -2
- data/lib/chronicle/etl/catalog.rb +62 -0
- data/lib/chronicle/etl/cli/connectors.rb +32 -0
- data/lib/chronicle/etl/cli/jobs.rb +111 -0
- data/lib/chronicle/etl/cli/main.rb +83 -0
- data/lib/chronicle/etl/cli/subcommand_base.rb +37 -0
- data/lib/chronicle/etl/config.rb +32 -0
- data/lib/chronicle/etl/extractors/{csv.rb → csv_extractor.rb} +3 -5
- data/lib/chronicle/etl/extractors/extractor.rb +23 -12
- data/lib/chronicle/etl/extractors/file_extractor.rb +52 -0
- data/lib/chronicle/etl/extractors/stdin_extractor.rb +11 -0
- data/lib/chronicle/etl/loaders/csv_loader.rb +29 -0
- data/lib/chronicle/etl/loaders/loader.rb +22 -16
- data/lib/chronicle/etl/loaders/stdout_loader.rb +9 -0
- data/lib/chronicle/etl/loaders/table_loader.rb +21 -0
- data/lib/chronicle/etl/runner.rb +33 -11
- data/lib/chronicle/etl/transformers/json_transformer.rb +11 -0
- data/lib/chronicle/etl/transformers/null_transformer.rb +10 -0
- data/lib/chronicle/etl/transformers/transformer.rb +27 -11
- data/lib/chronicle/etl/utils/progress_bar.rb +76 -0
- data/lib/chronicle/etl/version.rb +2 -2
- metadata +68 -30
- data/lib/chronicle/etl/cli.rb +0 -38
- data/lib/chronicle/etl/extractors/stdin.rb +0 -13
- data/lib/chronicle/etl/loaders/csv.rb +0 -31
- data/lib/chronicle/etl/loaders/stdout.rb +0 -13
- data/lib/chronicle/etl/loaders/table.rb +0 -22
- data/lib/chronicle/etl/transformers/json.rb +0 -13
- data/lib/chronicle/etl/transformers/null.rb +0 -11
- data/lib/chronicle/etl/utils/progress_bar_wrapper.rb +0 -43
@@ -0,0 +1,83 @@
|
|
1
|
+
require 'thor'
|
2
|
+
require 'chronicle/etl'
|
3
|
+
require 'colorize'
|
4
|
+
|
5
|
+
require 'chronicle/etl/cli/subcommand_base'
|
6
|
+
require 'chronicle/etl/cli/connectors'
|
7
|
+
require 'chronicle/etl/cli/jobs'
|
8
|
+
|
9
|
+
module Chronicle
  module ETL
    module CLI
      # Main entrypoint for the CLI app. Registers the `connectors` and `jobs`
      # subcommands and renders the top-level help screen.
      class Main < Thor
        class_option "verbose", type: :boolean, default: false
        default_task "jobs"

        desc 'connectors:COMMAND', 'Connectors available for ETL jobs', hide: true
        subcommand 'connectors', Connectors

        desc 'jobs:COMMAND', 'Configure and run jobs', hide: true
        subcommand 'jobs', Jobs

        # Entrypoint for the CLI.
        #
        # Aborts with a usage hint when no arguments were given. Otherwise a
        # leading `subcommand:command` token is split on ':' so that Thor can
        # hand off to the subcommand class, and the call is forwarded to
        # Thor's own `start`.
        #
        # given_args:: argument list (defaults to ARGV); it is not modified
        # config::     passed through to Thor unchanged
        def self.start(given_args = ARGV, config = {})
          if given_args.none?
            abort "No command entered or job specified. To see commands, run `chronicle-etl help`".red
          end

          # Takes a subcommand:command token and splits it so Thor knows how to
          # hand off to the subcommand class. A new array is built instead of
          # shifting/unshifting in place, so the caller's array (ARGV by
          # default) is left untouched.
          first, *rest = given_args
          given_args = first.split(':') + rest if first.include?(':')

          super(given_args, config)
        end

        # Displays help options for chronicle-etl.
        #
        # When +meth+ names something this class does not respond to, help is
        # delegated to the Thor class found for the "meth:meth" namespace;
        # otherwise the full top-level help screen is printed.
        #
        # meth::       optional command name to show help for
        # subcommand:: accepted for compatibility with Thor's signature; unused here
        def help(meth = nil, subcommand = false)
          if meth && !respond_to?(meth)
            klass, task = Thor::Util.find_class_and_task_by_namespace("#{meth}:#{meth}")
            klass.start(['-h', task].compact, shell: shell)
          else
            shell.say "ABOUT".bold
            shell.say " #{'chronicle-etl'.italic} is a utility tool for #{'extracting'.underline}, #{'transforming'.underline}, and #{'loading'.underline} personal data."
            shell.say
            shell.say "USAGE".bold
            shell.say " $ chronicle-etl COMMAND"
            shell.say
            shell.say "EXAMPLES".bold
            shell.say " Show available connectors:".italic.light_black
            shell.say " $ chronicle-etl connectors:list"
            shell.say
            shell.say " Run a simple job:".italic.light_black
            shell.say " $ chronicle-etl jobs:start --extractor stdin --transformer null --loader stdout"
            shell.say
            shell.say " Show full job options:".italic.light_black
            shell.say " $ chronicle-etl jobs help start"

            # Collect the printable commands of every Thor class in the CLI
            # namespace, sorted by their usage string, with `help` pinned first.
            list = Thor::Util.thor_classes_in(Chronicle::ETL::CLI).flat_map do |thor_class|
              thor_class.printable_tasks(false)
            end
            list.sort! { |a, b| a[0] <=> b[0] }
            list.unshift ["help", "# This help menu"]

            shell.say
            shell.say 'ALL COMMANDS'.bold
            shell.print_table(list, indent: 2, truncate: true)
            shell.say
            shell.say "VERSION".bold
            shell.say " #{Chronicle::ETL::VERSION}"
            shell.say
            shell.say "FULL DOCUMENTATION".bold
            shell.say " https://github.com/chronicle-app/chronicle-etl".blue
            shell.say
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
module Chronicle
  module ETL
    module CLI
      # Shared parent class for CLI subcommands. Overrides Thor's class-level
      # `help` and `banner` so usage is shown in command:subcommand form.
      class SubcommandBase < Thor
        # Prints the command table for a subcommand, followed by the class options.
        #
        # shell::      Thor shell used for output
        # subcommand:: forwarded to `printable_commands`
        def self.help(shell, subcommand = false)
          own_commands = printable_commands(true, subcommand)
          every_command = Thor::Util.thor_classes_in(self).inject(own_commands) do |rows, nested|
            rows + nested.printable_commands(false)
          end
          every_command.sort! { |left, right| left[0] <=> right[0] }

          shell.say "COMMANDS".bold
          shell.print_table(every_command, indent: 2, truncate: true)
          shell.say
          class_options_help(shell)
        end

        # Builds the usage banner in command:subcommand form.
        # The `help` command is joined with a space instead of a colon.
        def self.banner(command, namespace = nil, subcommand = false)
          separator = command.name == 'help' ? ' ' : ':'
          "#{subcommand_prefix}#{separator}#{command.usage}"
        end

        # Derives the display name of the subcommand from the class name:
        # the namespace is stripped, the leading capital is lowercased and
        # every remaining capital becomes "-<lowercase letter>".
        def self.subcommand_prefix
          bare_name = self.name.gsub(%r{.*::}, '')
          head_lowered = bare_name.gsub(%r{^[A-Z]}) { |letter| letter[0].downcase }
          head_lowered.gsub(%r{[A-Z]}) { |letter| "-#{letter[0].downcase}" }
        end
      end
    end
  end
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
require 'runcom'
|
2
|
+
|
3
|
+
module Chronicle
  module ETL
    # Utility methods to read, write, and list config files (backed by Runcom)
    module Config
      # Loads a yml config file through Runcom and returns its contents as a
      # hash whose keys are symbols at every nesting level.
      def self.load(path)
        raw = Runcom::Config.new(path).to_h
        # FIXME: hack to deeply symbolize keys (round-trip through JSON)
        JSON.parse(raw.to_json, symbolize_names: true)
      end

      # Writes a hash as a yml config file.
      # The target is the first entry of Runcom's `all` list for +path+ with
      # ".yml" appended (NOTE(review): what `all[0]` resolves to depends on
      # Runcom - confirm against its docs).
      def self.write(path, data)
        target = "#{Runcom::Config.new(path).all[0]}.yml"
        File.open(target, 'w') { |io| io << data.to_yaml }
      end

      # Returns the names (file basenames without extension) of all *.yml job
      # files in Runcom's current directory for 'chronicle/etl/jobs'
      # (presumably ~/.config/chronicle/etl/jobs - verify against Runcom).
      def self.jobs
        job_directory = Runcom::Config.new('chronicle/etl/jobs').current
        pattern = File.join(job_directory, "*.yml")
        Dir.glob(pattern).map { |job_file| File.basename(job_file, ".*") }
      end
    end
  end
end
|
@@ -1,7 +1,5 @@
|
|
1
1
|
require 'csv'
|
2
|
-
|
3
|
-
|
4
|
-
class Chronicle::Etl::Extractors::Csv < Chronicle::Etl::Extractors::Extractor
|
2
|
+
class Chronicle::ETL::CsvExtractor < Chronicle::ETL::Extractor
|
5
3
|
DEFAULT_OPTIONS = {
|
6
4
|
headers: true,
|
7
5
|
filename: $stdin
|
@@ -20,7 +18,7 @@ class Chronicle::Etl::Extractors::Csv < Chronicle::Etl::Extractors::Extractor
|
|
20
18
|
end
|
21
19
|
|
22
20
|
def results_count
|
23
|
-
CSV.read(@options[:filename],
|
21
|
+
CSV.read(@options[:filename], headers: @options[:headers]).count if read_from_file?
|
24
22
|
end
|
25
23
|
|
26
24
|
private
|
@@ -35,7 +33,7 @@ class Chronicle::Etl::Extractors::Csv < Chronicle::Etl::Extractors::Extractor
|
|
35
33
|
}
|
36
34
|
|
37
35
|
stream = read_from_file? ? File.open(@options[:filename]) : @options[:filename]
|
38
|
-
CSV.new(stream, csv_options)
|
36
|
+
CSV.new(stream, **csv_options)
|
39
37
|
end
|
40
38
|
|
41
39
|
def read_from_file?
|
@@ -1,20 +1,31 @@
|
|
1
|
+
require 'chronicle/etl'
|
2
|
+
|
1
3
|
module Chronicle
|
2
|
-
module
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
@options = options.transform_keys!(&:to_sym)
|
7
|
-
end
|
4
|
+
module ETL
|
5
|
+
# Abstract class representing an Extractor for an ETL job
|
6
|
+
class Extractor
|
7
|
+
extend Chronicle::ETL::Catalog
|
8
8
|
|
9
|
-
|
10
|
-
|
11
|
-
|
9
|
+
# Construct a new instance of this extractor. Options are passed in from a Runner
|
10
|
+
# == Parameters:
|
11
|
+
# options::
|
12
|
+
# Options for configuring this Extractor
|
13
|
+
def initialize(options = {})
|
14
|
+
@options = options.transform_keys!(&:to_sym)
|
15
|
+
end
|
12
16
|
|
13
|
-
|
17
|
+
# Entrypoint for this Extractor. Called by a Runner. Expects a series of records to be yielded
|
18
|
+
def extract
|
19
|
+
raise NotImplementedError
|
14
20
|
end
|
21
|
+
|
22
|
+
# An optional method to calculate how many records there are to extract. Used primarily for
|
23
|
+
# building the progress bar
|
24
|
+
def results_count; end
|
15
25
|
end
|
16
26
|
end
|
17
27
|
end
|
18
28
|
|
19
|
-
require_relative '
|
20
|
-
require_relative '
|
29
|
+
require_relative 'csv_extractor'
|
30
|
+
require_relative 'file_extractor'
|
31
|
+
require_relative 'stdin_extractor'
|
@@ -0,0 +1,52 @@
|
|
1
|
+
require 'pathname'
|
2
|
+
|
3
|
+
module Chronicle
  module ETL
    # Extractor that yields the contents of a single file or, when
    # `@options[:filename]` is a directory, of every `.eml` file found
    # anywhere beneath it.
    class FileExtractor < Chronicle::ETL::Extractor
      # Yields `(contents, metadata)` once per extracted file, where metadata
      # is `{ filename: <path> }`. Nothing is yielded when the configured
      # path is neither a file nor a directory.
      def extract(&block)
        if file?
          extract_file(&block)
        elsif directory?
          extract_from_directory(&block)
        end
      end

      # Number of records `extract` will yield: 1 for a single file,
      # otherwise the number of `.eml` files matched under the path.
      def results_count
        return 1 if file?

        Dir.glob(search_pattern).count
      end

      private

      # Glob pattern matching every `.eml` file below the configured path
      def search_pattern
        File.join(@options[:filename], '**/*.eml')
      end

      # Yields each matching file's contents along with its path.
      # `File.read` is used so that no file handle is left open, and the
      # metadata carries the path string (not a File object), consistent
      # with `extract_file`.
      def extract_from_directory
        Dir.glob(search_pattern).each do |filename|
          yield(File.read(filename), { filename: filename })
        end
      end

      # Yields the contents of the single configured file along with its path
      def extract_file
        yield(File.read(@options[:filename]), { filename: @options[:filename] })
      end

      def directory?
        Pathname.new(@options[:filename]).directory?
      end

      def file?
        Pathname.new(@options[:filename]).file?
      end
    end
  end
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
require 'csv'
|
2
|
+
|
3
|
+
module Chronicle
  module ETL
    # Loader that buffers every record in memory and writes them all to
    # $stdout as CSV when the job finishes.
    class CsvLoader < Chronicle::ETL::Loader
      def initialize(options={})
        super(options)
        @rows = []
      end

      # Buffers one record. A Hash contributes only its values; anything
      # else is stored as-is.
      def load(result)
        row = result.is_a?(Hash) ? result.values : result
        @rows << row
      end

      # Writes all buffered rows to $stdout as CSV
      def finish
        CSV($stdout) do |csv|
          @rows.each { |row| csv << row }
        end
      end
    end
  end
end
|
@@ -1,25 +1,31 @@
|
|
1
1
|
module Chronicle
|
2
|
-
module
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
@options = options
|
7
|
-
end
|
2
|
+
module ETL
|
3
|
+
# Abstract class representing a Loader for an ETL job
|
4
|
+
class Loader
|
5
|
+
extend Chronicle::ETL::Catalog
|
8
6
|
|
9
|
-
|
10
|
-
|
11
|
-
|
7
|
+
# Construct a new instance of this loader. Options are passed in from a Runner
|
8
|
+
# == Parameters:
|
9
|
+
# options::
|
10
|
+
# Options for configuring this Loader
|
11
|
+
def initialize(options = {})
|
12
|
+
@options = options
|
13
|
+
end
|
12
14
|
|
13
|
-
|
14
|
-
|
15
|
-
end
|
15
|
+
# Called once before processing records
|
16
|
+
def start; end
|
16
17
|
|
17
|
-
|
18
|
+
# Load a single record
|
19
|
+
def load
|
20
|
+
raise NotImplementedError
|
18
21
|
end
|
22
|
+
|
23
|
+
# Called once there are no more records to process
|
24
|
+
def finish; end
|
19
25
|
end
|
20
26
|
end
|
21
27
|
end
|
22
28
|
|
23
|
-
require_relative '
|
24
|
-
require_relative '
|
25
|
-
require_relative '
|
29
|
+
require_relative 'csv_loader'
|
30
|
+
require_relative 'stdout_loader'
|
31
|
+
require_relative 'table_loader'
|
@@ -0,0 +1,21 @@
|
|
1
|
+
require 'tty/table'
|
2
|
+
|
3
|
+
module Chronicle
  module ETL
    # Loader that collects records into a TTY::Table and prints it as an
    # ASCII table when the job finishes.
    class TableLoader < Chronicle::ETL::Loader
      # options:: Options for configuring this Loader (optional, like the
      #           other loaders)
      def initialize(options = {})
        super(options)
      end

      # Adds one record as a table row. The table is created lazily from the
      # first record's keys; each value is stringified and cut to its first
      # 31 characters.
      def load(result)
        @table ||= TTY::Table.new(header: result.keys)
        values = result.values.map { |value| value.to_s[0..30] }
        @table << values
      end

      # Prints the table. Does nothing when no record was ever loaded, since
      # the table is only created by the first `load` call.
      def finish
        return unless @table

        puts @table.render(:ascii, padding: [0, 1])
      end
    end
  end
end
|
data/lib/chronicle/etl/runner.rb
CHANGED
@@ -1,4 +1,10 @@
|
|
1
|
-
class Chronicle::
|
1
|
+
class Chronicle::ETL::Runner
|
2
|
+
BUILTIN = {
|
3
|
+
extractor: ['stdin', 'json', 'csv', 'file'],
|
4
|
+
transformer: ['null'],
|
5
|
+
loader: ['stdout', 'csv', 'table']
|
6
|
+
}.freeze
|
7
|
+
|
2
8
|
def initialize(options)
|
3
9
|
@options = options
|
4
10
|
|
@@ -6,16 +12,18 @@ class Chronicle::Etl::Runner
|
|
6
12
|
end
|
7
13
|
|
8
14
|
def run!
|
9
|
-
|
10
|
-
|
15
|
+
total = @extractor.results_count
|
16
|
+
progress_bar = Chronicle::ETL::Utils::ProgressBar.new(title: 'Running job', total: total)
|
17
|
+
count = 0
|
11
18
|
|
12
|
-
@
|
13
|
-
@loader.first_load(result) if i == 0
|
19
|
+
@loader.start
|
14
20
|
|
15
|
-
|
21
|
+
@extractor.extract do |data, metadata|
|
22
|
+
transformed_data = @transformer.transform(data)
|
16
23
|
@loader.load(transformed_data)
|
17
24
|
|
18
25
|
progress_bar.increment
|
26
|
+
count += 1
|
19
27
|
end
|
20
28
|
|
21
29
|
progress_bar.finish
|
@@ -25,13 +33,27 @@ class Chronicle::Etl::Runner
|
|
25
33
|
private
|
26
34
|
|
27
35
|
def instantiate_etl_classes
|
28
|
-
@extractor =
|
29
|
-
@transformer =
|
30
|
-
@loader =
|
36
|
+
@extractor = load_etl_class(:extractor, @options[:extractor][:name]).new(@options[:extractor][:options])
|
37
|
+
@transformer = load_etl_class(:transformer, @options[:transformer][:name]).new(@options[:transformer][:options])
|
38
|
+
@loader = load_etl_class(:loader, @options[:loader][:name]).new(@options[:loader][:options])
|
31
39
|
end
|
32
40
|
|
33
|
-
def
|
34
|
-
|
41
|
+
# Resolves the class implementing the given phase (:extractor, :transformer
# or :loader) for connector name +x+.
#
# Names listed in BUILTIN map to Chronicle::ETL::<Name><Phase>. Any other
# name is treated as "provider:name": `chronicle/<provider>` is required
# (on LoadError a hint is printed and the process exits with failure) and
# the class is looked up as Chronicle::<Provider>::<Name><Phase>.
def load_etl_class(phase, x)
  klass_name =
    if BUILTIN[phase].include?(x)
      "Chronicle::ETL::#{x.capitalize}#{phase.to_s.capitalize}"
    else
      # TODO: come up with syntax for specifying a particular extractor in a provider library
      provider, name = x.split(":")
      provider ||= x
      begin
        require "chronicle/#{provider}"
      rescue LoadError
        warn("Error loading #{phase} '#{provider}'".red)
        warn(" Perhaps you haven't installed it yet: `$ gem install chronicle-#{provider}`")
        exit(false)
      end
      "Chronicle::#{provider.capitalize}::#{name&.capitalize}#{phase.to_s.capitalize}"
    end

  Object.const_get(klass_name)
end
|
37
59
|
end
|