chronicle-etl 0.1.2 → 0.2.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rubocop.yml +8 -0
- data/.ruby-version +1 -0
- data/.yardopts +1 -0
- data/CHANGELOG.md +11 -0
- data/Gemfile.lock +42 -10
- data/README.md +64 -11
- data/bin/console +16 -4
- data/chronicle-etl.gemspec +8 -6
- data/exe/chronicle-etl +2 -2
- data/lib/chronicle/etl.rb +6 -2
- data/lib/chronicle/etl/catalog.rb +102 -0
- data/lib/chronicle/etl/cli/connectors.rb +32 -0
- data/lib/chronicle/etl/cli/jobs.rb +110 -0
- data/lib/chronicle/etl/cli/main.rb +83 -0
- data/lib/chronicle/etl/cli/subcommand_base.rb +37 -0
- data/lib/chronicle/etl/config.rb +32 -0
- data/lib/chronicle/etl/exceptions.rb +17 -0
- data/lib/chronicle/etl/extractors/{csv.rb → csv_extractor.rb} +3 -3
- data/lib/chronicle/etl/extractors/extractor.rb +23 -12
- data/lib/chronicle/etl/extractors/file_extractor.rb +52 -0
- data/lib/chronicle/etl/extractors/stdin_extractor.rb +11 -0
- data/lib/chronicle/etl/loaders/csv_loader.rb +29 -0
- data/lib/chronicle/etl/loaders/loader.rb +23 -16
- data/lib/chronicle/etl/loaders/rest_loader.rb +30 -0
- data/lib/chronicle/etl/loaders/stdout_loader.rb +9 -0
- data/lib/chronicle/etl/loaders/table_loader.rb +21 -0
- data/lib/chronicle/etl/runner.rb +27 -38
- data/lib/chronicle/etl/transformers/json_transformer.rb +11 -0
- data/lib/chronicle/etl/transformers/null_transformer.rb +10 -0
- data/lib/chronicle/etl/transformers/transformer.rb +28 -11
- data/lib/chronicle/etl/utils/progress_bar.rb +76 -0
- data/lib/chronicle/etl/version.rb +2 -2
- metadata +68 -29
- data/lib/chronicle/etl/cli.rb +0 -38
- data/lib/chronicle/etl/extractors/stdin.rb +0 -13
- data/lib/chronicle/etl/loaders/csv.rb +0 -31
- data/lib/chronicle/etl/loaders/stdout.rb +0 -11
- data/lib/chronicle/etl/loaders/table.rb +0 -22
- data/lib/chronicle/etl/transformers/json.rb +0 -13
- data/lib/chronicle/etl/transformers/null.rb +0 -11
- data/lib/chronicle/etl/utils/progress_bar_wrapper.rb +0 -43
@@ -0,0 +1,110 @@
|
|
1
|
+
require 'pp'
|
2
|
+
|
3
|
+
module Chronicle
|
4
|
+
module ETL
|
5
|
+
module CLI
|
6
|
+
# CLI commands for working with ETL jobs
|
7
|
+
class Jobs < SubcommandBase
|
8
|
+
default_task "start"
|
9
|
+
namespace :jobs
|
10
|
+
|
11
|
+
class_option :extractor, aliases: '-e', desc: 'Extractor class (available: stdin, csv, file)', default: 'stdin', banner: 'extractor-name'
|
12
|
+
class_option :'extractor-opts', desc: 'Extractor options', type: :hash, default: {}
|
13
|
+
class_option :transformer, aliases: '-t', desc: 'Transformer class (available: null)', default: 'null', banner: 'transformer-name'
|
14
|
+
class_option :'transformer-opts', desc: 'Transformer options', type: :hash, default: {}
|
15
|
+
class_option :loader, aliases: '-l', desc: 'Loader class (available: stdout, csv, table)', default: 'stdout', banner: 'loader-name'
|
16
|
+
class_option :'loader-opts', desc: 'Loader options', type: :hash, default: {}
|
17
|
+
class_option :job, aliases: '-j', desc: 'Job configuration name (or filename)'
|
18
|
+
|
19
|
+
map run: :start # Thor doesn't like `run` as a command name
|
20
|
+
desc "run", "Start a job"
|
21
|
+
# Long-form help text for the command defined next. The extractor default named
# here must agree with the `--extractor` class option declared above (stdin).
long_desc <<-LONG_DESC
This will run an ETL job. Each job needs three parts:

1. #{'Extractor'.underline}: pulls data from an external source. By default, this is stdin. Other common options including pulling data from an API or reading JSON from a file.

2. #{'Transformer'.underline}: transforms data into a new format. If none is specified, we use the `null` transformer which does nothing to the data.

3. #{'Loader'.underline}: takes that transformed data and loads it externally. This can be an API, flat files, (or by default), stdout.

If you do not want to use the command line flags, you can also configure a job with a .yml config file. You can either specify the path to this file or use the filename and place the file in ~/.config/chronicle/etl/jobs/NAME.yml and call it with `--job NAME`
LONG_DESC
|
32
|
+
# Run an ETL job: resolve the runner configuration from the CLI options,
# hand it to a new Runner and execute it
def start
  job_config = build_runner_options(options)
  Chronicle::ETL::Runner.new(job_config).run!
end
|
38
|
+
|
39
|
+
desc "create", "Create a job"
|
40
|
+
# Create an ETL job: resolve the runner options from the CLI options and write
# them to the config file named after the `--job` option.
#
# Raises Thor::Error when no job name was supplied, because the target file
# name is derived from it.
def create
  job_name = options[:job]
  if job_name.nil? || job_name.to_s.strip.empty?
    raise Thor::Error, 'A job name is required to create a job. Specify one with `--job NAME`'
  end

  runner_options = build_runner_options(options)
  path = File.join('chronicle', 'etl', 'jobs', job_name)
  Chronicle::ETL::Config.write(path, runner_options)
end
|
46
|
+
|
47
|
+
desc "show", "Show details about a job"
|
48
|
+
# Show an ETL job by pretty-printing the runner options resolved from the CLI options
def show
  pp build_runner_options(options)
end
|
53
|
+
|
54
|
+
desc "list", "List all available jobs"
|
55
|
+
# List available ETL jobs as a table: one row per job holding its name and the
# configured extractor, transformer and loader names (nil when a section is absent)
def list
  rows = Chronicle::ETL::Config.jobs.map do |job_name|
    job_config = Chronicle::ETL::Config.load("chronicle/etl/jobs/#{job_name}.yml")

    connector_names = %i[extractor transformer loader].map do |phase|
      job_config[phase][:name] if job_config[phase]
    end

    [job_name, *connector_names]
  end

  column_titles = %w[name extractor transformer loader].map { |title| title.upcase.bold }

  puts TTY::Table.new(column_titles, rows).render(indent: 0, padding: [0, 2])
end
|
74
|
+
|
75
|
+
private
|
76
|
+
|
77
|
+
# Create runner options. The values derived from the command line flags form the
# base hash; any top-level section present in the job file then replaces the
# corresponding flag-derived section (shallow merge)
def build_runner_options(options)
  process_flag_options(options).merge(load_job(options[:job]))
end
|
83
|
+
|
84
|
+
# Load the configuration of a named job from chronicle/etl/jobs/NAME.yml.
#
# Returns an empty hash when no job name was given, so no lookup is attempted
# for a file called `.yml`. Config.load already returns a hash with deeply
# symbolized keys, so its result is passed through unchanged.
def load_job(job)
  return {} if job.nil? || job.to_s.empty?

  Chronicle::ETL::Config.load("chronicle/etl/jobs/#{job}.yml")
end
|
89
|
+
|
90
|
+
# Takes flag options and turns them into a runner config: one section per phase
# (extractor, transformer, loader), each holding the connector name and the
# matching `<phase>-opts` hash
def process_flag_options(options)
  %i[extractor transformer loader].each_with_object({}) do |phase, runner_config|
    runner_config[phase] = {
      name: options[phase],
      options: options[:"#{phase}-opts"]
    }
  end
end
|
107
|
+
end
|
108
|
+
end
|
109
|
+
end
|
110
|
+
end
|
@@ -0,0 +1,83 @@
|
|
1
|
+
require 'thor'
|
2
|
+
require 'chronicle/etl'
|
3
|
+
require 'colorize'
|
4
|
+
|
5
|
+
require 'chronicle/etl/cli/subcommand_base'
|
6
|
+
require 'chronicle/etl/cli/connectors'
|
7
|
+
require 'chronicle/etl/cli/jobs'
|
8
|
+
|
9
|
+
module Chronicle
|
10
|
+
module ETL
|
11
|
+
module CLI
|
12
|
+
# Main entrypoint for CLI app
|
13
|
+
class Main < Thor
|
14
|
+
class_option "verbose", type: :boolean, default: false
|
15
|
+
default_task "jobs"
|
16
|
+
|
17
|
+
desc 'connectors:COMMAND', 'Connectors available for ETL jobs', hide: true
|
18
|
+
subcommand 'connectors', Connectors
|
19
|
+
|
20
|
+
desc 'jobs:COMMAND', 'Configure and run jobs', hide: true
|
21
|
+
subcommand 'jobs', Jobs
|
22
|
+
|
23
|
+
# Entrypoint for the CLI. Aborts with a hint when no arguments were given
def self.start(given_args = ARGV, config = {})
  abort "No command entered or job specified. To see commands, run `chronicle-etl help`".red if given_args.none?

  # A leading `subcommand:command` argument is split on the colon so Thor
  # knows how to hand off to the subcommand class
  if given_args[0].include?(':')
    namespaced = given_args.shift.split(':')
    given_args = given_args.unshift(namespaced).flatten
  end

  super(given_args, config)
end
|
37
|
+
|
38
|
+
# Displays help options for chronicle-etl. When +meth+ names something this
# class does not respond to, help is delegated to the class Thor finds for that
# namespace; otherwise the general overview is printed
def help(meth = nil, subcommand = false)
  if meth && !respond_to?(meth)
    klass, task = Thor::Util.find_class_and_task_by_namespace("#{meth}:#{meth}")
    return klass.start(['-h', task].compact, shell: shell)
  end

  shell.say "ABOUT".bold
  shell.say " #{'chronicle-etl'.italic} is a utility tool for #{'extracting'.underline}, #{'transforming'.underline}, and #{'loading'.underline} personal data."
  shell.say
  shell.say "USAGE".bold
  shell.say " $ chronicle-etl COMMAND"
  shell.say
  shell.say "EXAMPLES".bold
  shell.say " Show available connectors:".italic.light_black
  shell.say " $ chronicle-etl connectors:list"
  shell.say
  shell.say " Run a simple job:".italic.light_black
  shell.say " $ chronicle-etl jobs:start --extractor stdin --transformer null --loader stdout"
  shell.say
  shell.say " Show full job options:".italic.light_black
  shell.say " $ chronicle-etl jobs help start"

  # Collect the printable tasks of every Thor class in the CLI namespace,
  # sorted by usage string, with the help entry pinned to the top
  command_rows = Thor::Util.thor_classes_in(Chronicle::ETL::CLI).flat_map do |thor_class|
    thor_class.printable_tasks(false)
  end
  command_rows.sort! { |a, b| a[0] <=> b[0] }
  command_rows.unshift ["help", "# This help menu"]

  shell.say
  shell.say 'ALL COMMANDS'.bold
  shell.print_table(command_rows, indent: 2, truncate: true)
  shell.say
  shell.say "VERSION".bold
  shell.say " #{Chronicle::ETL::VERSION}"
  shell.say
  shell.say "FULL DOCUMENTATION".bold
  shell.say " https://github.com/chronicle-app/chronicle-etl".blue
  shell.say
end
|
80
|
+
end
|
81
|
+
end
|
82
|
+
end
|
83
|
+
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
module Chronicle
|
2
|
+
module ETL
|
3
|
+
module CLI
|
4
|
+
# Base class for CLI subcommands. Overrides Thor methods so we can use command:subcommand syntax
|
5
|
+
class SubcommandBase < Thor
|
6
|
+
# Print usage instructions for a subcommand: a sorted table of this class's
# commands plus those of Thor classes found inside it, then the class options
def self.help(shell, subcommand = false)
  own_commands = printable_commands(true, subcommand)
  nested_commands = Thor::Util.thor_classes_in(self).flat_map do |klass|
    klass.printable_commands(false)
  end
  command_rows = (own_commands + nested_commands).sort { |a, b| a[0] <=> b[0] }

  shell.say "COMMANDS".bold
  shell.print_table(command_rows, indent: 2, truncate: true)
  shell.say
  class_options_help(shell)
end
|
19
|
+
|
20
|
+
# Show docs with command:subcommand pattern.
# The `help` command is joined to the prefix with a space instead of a colon
def self.banner(command, namespace = nil, subcommand = false)
  separator = command.name == 'help' ? ' ' : ':'
  "#{subcommand_prefix}#{separator}#{command.usage}"
end
|
29
|
+
|
30
|
+
# Use subcommand classname to derive display name for subcommand:
# drop the namespace, lowercase the leading capital, and turn every remaining
# capital into a dash followed by its lowercase form
def self.subcommand_prefix
  class_name = name.gsub(%r{.*::}, '')
  class_name = class_name.gsub(%r{^[A-Z]}) { |initial| initial[0].downcase }
  class_name.gsub(%r{[A-Z]}) { |capital| "-#{capital[0].downcase}" }
end
|
34
|
+
end
|
35
|
+
end
|
36
|
+
end
|
37
|
+
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
require 'fileutils'
require 'runcom'
|
2
|
+
|
3
|
+
module Chronicle
|
4
|
+
module ETL
|
5
|
+
# Utility methods to read, write, and access config files
|
6
|
+
module Config
|
7
|
+
# Loads a yml config file and returns its contents as a hash whose keys are
# symbolized at every nesting level
def self.load(path)
  raw_settings = Runcom::Config.new(path).to_h
  # FIXME: hack to deeply symbolize keys (round trip through JSON)
  JSON.parse(raw_settings.to_json, symbolize_names: true)
end
|
13
|
+
|
14
|
+
# Writes a hash as a yml config file.
#
# The target is the first path Runcom reports for +path+ with `.yml` appended.
# Its parent directory is created first when it does not exist yet, so the
# very first write into a fresh config directory succeeds.
def self.write(path, data)
  config = Runcom::Config.new(path)
  filename = "#{config.all[0]}.yml"
  FileUtils.mkdir_p(File.dirname(filename))
  File.open(filename, 'w') do |f|
    f << data.to_yaml
  end
end
|
22
|
+
|
23
|
+
# Returns the names (file basenames without extension) of all jobs available
# in ~/.config/chronicle/etl/jobs/*.yml
def self.jobs
  jobs_root = Runcom::Config.new('chronicle/etl/jobs').current
  job_files = Dir.glob(File.join(jobs_root, "*.yml"))
  job_files.map { |job_file| File.basename(job_file, ".*") }
end
|
30
|
+
end
|
31
|
+
end
|
32
|
+
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
module Chronicle
  module ETL
    # Root of the chronicle-etl error hierarchy
    class Error < StandardError
    end

    # Error about a connector, carrying the provider and connector name it
    # was raised for (both optional)
    class ConnectorNotAvailableError < Error
      attr_reader :name, :provider

      def initialize(message, provider: nil, name: nil)
        @provider = provider
        @name = name
        super(message)
      end
    end

    # Specialisation of ConnectorNotAvailableError with no extra behaviour
    class ProviderNotAvailableError < ConnectorNotAvailableError
    end

    # Specialisation of ConnectorNotAvailableError with no extra behaviour
    class ProviderConnectorNotAvailableError < ConnectorNotAvailableError
    end
  end
end
|
@@ -1,5 +1,5 @@
|
|
1
1
|
require 'csv'
|
2
|
-
class Chronicle::
|
2
|
+
class Chronicle::ETL::CsvExtractor < Chronicle::ETL::Extractor
|
3
3
|
DEFAULT_OPTIONS = {
|
4
4
|
headers: true,
|
5
5
|
filename: $stdin
|
@@ -18,7 +18,7 @@ class Chronicle::Etl::Extractors::Csv < Chronicle::Etl::Extractors::Extractor
|
|
18
18
|
end
|
19
19
|
|
20
20
|
def results_count
|
21
|
-
CSV.read(@options[:filename],
|
21
|
+
CSV.read(@options[:filename], headers: @options[:headers]).count if read_from_file?
|
22
22
|
end
|
23
23
|
|
24
24
|
private
|
@@ -33,7 +33,7 @@ class Chronicle::Etl::Extractors::Csv < Chronicle::Etl::Extractors::Extractor
|
|
33
33
|
}
|
34
34
|
|
35
35
|
stream = read_from_file? ? File.open(@options[:filename]) : @options[:filename]
|
36
|
-
CSV.new(stream, csv_options)
|
36
|
+
CSV.new(stream, **csv_options)
|
37
37
|
end
|
38
38
|
|
39
39
|
def read_from_file?
|
@@ -1,20 +1,31 @@
|
|
1
|
+
require 'chronicle/etl'
|
2
|
+
|
1
3
|
module Chronicle
|
2
|
-
module
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
@options = options.transform_keys!(&:to_sym)
|
7
|
-
end
|
4
|
+
module ETL
|
5
|
+
# Abstract class representing an Extractor for an ETL job
|
6
|
+
class Extractor
|
7
|
+
extend Chronicle::ETL::Catalog
|
8
8
|
|
9
|
-
|
10
|
-
|
11
|
-
|
9
|
+
# Construct a new instance of this extractor. Options are passed in from a Runner
# == Parameters:
# options::
#   Options for configuring this Extractor. Keys are converted to symbols on a
#   copy, so the hash supplied by the caller is not modified and may be frozen
def initialize(options = {})
  @options = options.transform_keys(&:to_sym)
end
|
12
16
|
|
13
|
-
|
17
|
+
# Entrypoint for this Extractor. Called by a Runner. Expects a series of records to be yielded
|
18
|
+
def extract
|
19
|
+
raise NotImplementedError
|
14
20
|
end
|
21
|
+
|
22
|
+
# An optional method to calculate how many records there are to extract. Used primarily for
|
23
|
+
# building the progress bar
|
24
|
+
def results_count; end
|
15
25
|
end
|
16
26
|
end
|
17
27
|
end
|
18
28
|
|
19
|
-
require_relative '
|
20
|
-
require_relative '
|
29
|
+
require_relative 'csv_extractor'
|
30
|
+
require_relative 'file_extractor'
|
31
|
+
require_relative 'stdin_extractor'
|
@@ -0,0 +1,52 @@
|
|
1
|
+
require 'pathname'
|
2
|
+
|
3
|
+
module Chronicle
|
4
|
+
module ETL
|
5
|
+
class FileExtractor < Chronicle::ETL::Extractor
|
6
|
+
# Yields (data, metadata) pairs for the path in the :filename option: once when
# it is a regular file, once per matching file when it is a directory, and not
# at all when it is neither
def extract
  return extract_file { |data, metadata| yield(data, metadata) } if file?
  return unless directory?

  extract_from_directory { |data, metadata| yield(data, metadata) }
end
|
17
|
+
|
18
|
+
# Number of records to extract: 1 for a single file, otherwise the number of
# `.eml` files found recursively under the :filename option
def results_count
  return 1 if file?

  Dir.glob(File.join(@options[:filename], '**/*.eml')).count
end
|
26
|
+
|
27
|
+
private
|
28
|
+
|
29
|
+
# Yields the contents of every `.eml` file found recursively under the
# :filename option, together with metadata holding that file's path.
#
# Each file is read with File.read so no handle stays open, and the metadata
# :filename is the path string, matching what extract_file yields.
def extract_from_directory
  search_pattern = File.join(@options[:filename], '**/*.eml')
  Dir.glob(search_pattern).each do |filename|
    yield(File.read(filename), { filename: filename })
  end
end
|
37
|
+
|
38
|
+
# Yields the contents of the file named by the :filename option, together with
# metadata holding that filename. File.read is used so the handle is closed
# as soon as the contents have been read.
def extract_file
  path = @options[:filename]
  yield(File.read(path), { filename: path })
end
|
42
|
+
|
43
|
+
# True when the path in the :filename option is an existing directory
def directory?
  target = Pathname(@options[:filename])
  target.directory?
end
|
46
|
+
|
47
|
+
# True when the path in the :filename option is an existing regular file
def file?
  target = Pathname(@options[:filename])
  target.file?
end
|
50
|
+
end
|
51
|
+
end
|
52
|
+
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
require 'csv'
|
2
|
+
|
3
|
+
module Chronicle
|
4
|
+
module ETL
|
5
|
+
class CsvLoader < Chronicle::ETL::Loader
|
6
|
+
# Pass the options on to the parent loader and start with an empty buffer of
# rows; rows are collected by #load and written out by #finish
def initialize(options = {})
  super
  @rows = []
end
|
10
|
+
|
11
|
+
# Buffer one result as a row. A Hash contributes its values; anything else is
# appended as given
def load(result)
  row = result.is_a?(Hash) ? result.values : result
  @rows << row
end
|
18
|
+
|
19
|
+
# Write every buffered row to standard output as CSV
def finish
  CSV($stdout) do |csv|
    @rows.each { |row| csv << row }
  end
end
|
27
|
+
end
|
28
|
+
end
|
29
|
+
end
|