chronicle-etl 0.2.4 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/ruby.yml +35 -0
- data/.gitignore +3 -0
- data/.rubocop.yml +31 -1
- data/Guardfile +7 -0
- data/README.md +21 -14
- data/Rakefile +4 -2
- data/chronicle-etl.gemspec +18 -10
- data/exe/chronicle-etl +1 -1
- data/lib/chronicle/etl/cli/connectors.rb +53 -7
- data/lib/chronicle/etl/cli/jobs.rb +59 -24
- data/lib/chronicle/etl/cli/main.rb +18 -16
- data/lib/chronicle/etl/cli/subcommand_base.rb +2 -2
- data/lib/chronicle/etl/cli.rb +7 -0
- data/lib/chronicle/etl/config.rb +1 -1
- data/lib/chronicle/etl/configurable.rb +150 -0
- data/lib/chronicle/etl/exceptions.rb +14 -1
- data/lib/chronicle/etl/extraction.rb +12 -0
- data/lib/chronicle/etl/extractors/csv_extractor.rb +32 -31
- data/lib/chronicle/etl/extractors/extractor.rb +25 -13
- data/lib/chronicle/etl/extractors/file_extractor.rb +17 -32
- data/lib/chronicle/etl/extractors/helpers/filesystem_reader.rb +104 -0
- data/lib/chronicle/etl/extractors/json_extractor.rb +37 -0
- data/lib/chronicle/etl/extractors/stdin_extractor.rb +6 -1
- data/lib/chronicle/etl/job.rb +30 -29
- data/lib/chronicle/etl/job_definition.rb +45 -7
- data/lib/chronicle/etl/job_log.rb +10 -0
- data/lib/chronicle/etl/job_logger.rb +23 -20
- data/lib/chronicle/etl/loaders/csv_loader.rb +5 -1
- data/lib/chronicle/etl/loaders/loader.rb +5 -2
- data/lib/chronicle/etl/loaders/rest_loader.rb +9 -5
- data/lib/chronicle/etl/loaders/stdout_loader.rb +6 -1
- data/lib/chronicle/etl/loaders/table_loader.rb +51 -7
- data/lib/chronicle/etl/logger.rb +48 -0
- data/lib/chronicle/etl/models/attachment.rb +14 -0
- data/lib/chronicle/etl/models/base.rb +23 -7
- data/lib/chronicle/etl/models/entity.rb +9 -3
- data/lib/chronicle/etl/registry/connector_registration.rb +62 -0
- data/lib/chronicle/etl/registry/registry.rb +52 -0
- data/lib/chronicle/etl/registry/self_registering.rb +25 -0
- data/lib/chronicle/etl/runner.rb +58 -7
- data/lib/chronicle/etl/serializers/jsonapi_serializer.rb +25 -0
- data/lib/chronicle/etl/serializers/serializer.rb +27 -0
- data/lib/chronicle/etl/transformers/image_file_transformer.rb +247 -0
- data/lib/chronicle/etl/transformers/null_transformer.rb +10 -1
- data/lib/chronicle/etl/transformers/transformer.rb +41 -10
- data/lib/chronicle/etl/utils/binary_attachments.rb +21 -0
- data/lib/chronicle/etl/utils/progress_bar.rb +3 -1
- data/lib/chronicle/etl/utils/text_recognition.rb +15 -0
- data/lib/chronicle/etl/version.rb +1 -1
- data/lib/chronicle/etl.rb +8 -2
- metadata +146 -34
- data/.ruby-version +0 -1
- data/Gemfile.lock +0 -91
- data/lib/chronicle/etl/catalog.rb +0 -108
- data/lib/chronicle/etl/utils/jsonapi.rb +0 -28
@@ -0,0 +1,150 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "ostruct"
require "time"
|
4
|
+
|
5
|
+
module Chronicle
  module ETL
    # A mixin that gives a class a
    # {Chronicle::ETL::Configurable::ClassMethods#setting} macro to define
    # settings and their properties (required, default, type).
    #
    # Settings declared on a class are merged with those declared on any
    # Configurable superclass, with the subclass taking precedence.
    #
    # @example Basic usage
    #   class Test < Chronicle::ETL::Extractor
    #     include Chronicle::ETL::Configurable
    #     setting :when, type: :date, required: true
    #   end
    #
    #   t = Test.new(when: '2022-02-24')
    #   t.config.when
    module Configurable
      # An individual setting for this Configurable
      Setting = Struct.new(:default, :required, :type)
      private_constant :Setting

      # Collection of user-supplied options for this Configurable
      class Config < OpenStruct
        # Config values that aren't nil, as a hash
        #
        # @return [Hash{Symbol => Object}]
        def compacted_h
          to_h.compact
        end
      end

      # @private
      def self.included(klass)
        klass.extend(ClassMethods)
        klass.include(InstanceMethods)
        klass.prepend(Initializer)
      end

      # Initializer method for classes that have Configurable mixed in.
      # It is prepended so that it runs before the class's own #initialize.
      module Initializer
        # Make sure this class has a default @config ready to use, then
        # continue with the class's own initializer (same arguments).
        def initialize(*args)
          @config = initialize_default_config
          super
        end
      end

      # Instance methods for classes that have Configurable mixed in
      module InstanceMethods
        attr_reader :config

        # Take given options, apply them to this class's settings (making
        # them available in @config) and validate that they conform to the
        # setting rules.
        #
        # @param [Hash] options setting names (String or Symbol) to values
        # @return [Hash] the options with symbolized keys
        # @raise [Chronicle::ETL::ConfigurationError] if an option is not a
        #   declared setting, has an unsupported type, or a required
        #   setting is missing
        def apply_options(options)
          # Keys are normalised in place, as before, so callers that keep
          # using their hash still see symbol keys. A frozen hash cannot be
          # mutated, so it is copied instead of raising FrozenError.
          options = if options.frozen?
                      options.transform_keys(&:to_sym)
                    else
                      options.transform_keys!(&:to_sym)
                    end

          options.each do |name, value|
            setting = self.class.all_settings[name]
            raise(Chronicle::ETL::ConfigurationError, "Unrecognized setting: #{name}") unless setting

            @config[name] = coerced_value(setting, value)
          end
          validate_config
          options
        end

        # Names of all settings available to this object's class
        #
        # @return [Array<Symbol>]
        def settings
          self.class.all_settings.keys
        end

        private

        # Build a fresh Config populated with each setting's default
        def initialize_default_config
          self.class.config_with_defaults
        end

        # Raise unless every required setting has a non-nil value
        def validate_config
          missing = (self.class.all_required_settings.keys - @config.compacted_h.keys)
          raise Chronicle::ETL::ConfigurationError, "Missing options: #{missing}" if missing.any?
        end

        # Convert a raw value according to the setting's declared type.
        # Untyped settings are passed through unchanged.
        def coerced_value(setting, value)
          return value unless setting.type

          coercer = "coerce_#{setting.type}"
          unless respond_to?(coercer, true)
            raise Chronicle::ETL::ConfigurationError, "Unsupported setting type: #{setting.type}"
          end

          __send__(coercer, value)
        end

        def coerce_string(value)
          value.to_s
        end

        # Strings are parsed with Time.parse; anything else is returned as is.
        def coerce_time(value)
          # TODO: handle durations like '3h'
          value.is_a?(String) ? Time.parse(value) : value
        end

        # `:date` is accepted as a point-in-time type and parsed the same way
        # as `:time`, so values stay comparable with Time objects such as
        # file modification times.
        def coerce_date(value)
          coerce_time(value)
        end
      end

      # Class methods for classes that have Configurable mixed in
      module ClassMethods
        # Macro for creating a setting on a class {::Chronicle::ETL::Configurable}
        #
        # @param [Symbol, String] name Name of the setting
        # @param [Boolean] required whether setting is required
        # @param [Object] default Default value
        # @param [Symbol] type Type used to coerce supplied values
        #
        # @example Basic usage
        #   setting :when, type: :date, required: true
        #
        # @see ::Chronicle::ETL::Configurable
        def setting(name, default: nil, required: false, type: nil)
          # Options are looked up by symbol, so store the name as one
          settings[name.to_sym] = Setting.new(default, required, type)
        end

        # Collect all settings defined on this class and its ancestors (that
        # have Configurable mixin included)
        #
        # @return [Hash{Symbol => Setting}]
        def all_settings
          return settings unless superclass.include?(Chronicle::ETL::Configurable)

          superclass.all_settings.merge(settings)
        end

        # Filters settings to those that are required.
        def all_required_settings
          all_settings.select { |_name, setting| setting.required }
        end

        # Settings declared directly on this class (not inherited ones)
        def settings
          @settings ||= {}
        end

        # Whether a setting with this name exists on this class or an ancestor
        def setting_exists?(name)
          all_settings.key?(name)
        end

        # A new Config holding the default value of every available setting
        def config_with_defaults
          Config.new(all_settings.transform_values(&:default))
        end
      end
    end
  end
end
|
@@ -2,7 +2,9 @@ module Chronicle
|
|
2
2
|
module ETL
|
3
3
|
class Error < StandardError; end;
|
4
4
|
|
5
|
-
class
|
5
|
+
class ConfigurationError < Error; end;
|
6
|
+
|
7
|
+
class RunnerTypeError < Error; end
|
6
8
|
|
7
9
|
class ConnectorNotAvailableError < Error
|
8
10
|
def initialize(message, provider: nil, name: nil)
|
@@ -15,5 +17,16 @@ module Chronicle
|
|
15
17
|
|
16
18
|
class ProviderNotAvailableError < ConnectorNotAvailableError; end
|
17
19
|
class ProviderConnectorNotAvailableError < ConnectorNotAvailableError; end
|
20
|
+
|
21
|
+
class TransformationError < Error
|
22
|
+
attr_reader :transformation
|
23
|
+
|
24
|
+
def initialize(message=nil, transformation:)
|
25
|
+
super(message)
|
26
|
+
@transformation = transformation
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
30
|
+
class UntransformableRecordError < TransformationError; end
|
18
31
|
end
|
19
32
|
end
|
@@ -1,41 +1,42 @@
|
|
1
1
|
require 'csv'
|
2
|
-
class Chronicle::ETL::CsvExtractor < Chronicle::ETL::Extractor
|
3
|
-
DEFAULT_OPTIONS = {
|
4
|
-
headers: true,
|
5
|
-
filename: $stdin
|
6
|
-
}.freeze
|
7
|
-
|
8
|
-
def initialize(options = {})
|
9
|
-
super(DEFAULT_OPTIONS.merge(options))
|
10
|
-
end
|
11
2
|
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
yield result
|
17
|
-
end
|
18
|
-
end
|
3
|
+
module Chronicle
|
4
|
+
module ETL
|
5
|
+
class CSVExtractor < Chronicle::ETL::Extractor
|
6
|
+
include Extractors::Helpers::FilesystemReader
|
19
7
|
|
20
|
-
|
21
|
-
|
22
|
-
|
8
|
+
register_connector do |r|
|
9
|
+
r.description = 'input as CSV'
|
10
|
+
end
|
23
11
|
|
24
|
-
|
12
|
+
setting :headers, default: true
|
13
|
+
setting :filename, default: $stdin
|
25
14
|
|
26
|
-
|
27
|
-
|
15
|
+
def extract
|
16
|
+
csv = initialize_csv
|
17
|
+
csv.each do |row|
|
18
|
+
yield Chronicle::ETL::Extraction.new(data: row.to_h)
|
19
|
+
end
|
20
|
+
end
|
28
21
|
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
}
|
22
|
+
def results_count
|
23
|
+
CSV.read(@config.filename, headers: @config.headers).count unless stdin?(@config.filename)
|
24
|
+
end
|
33
25
|
|
34
|
-
|
35
|
-
|
36
|
-
|
26
|
+
private
|
27
|
+
|
28
|
+
def initialize_csv
|
29
|
+
headers = @config.headers.is_a?(String) ? @config.headers.split(',') : @config.headers
|
37
30
|
|
38
|
-
|
39
|
-
|
31
|
+
csv_options = {
|
32
|
+
headers: headers,
|
33
|
+
converters: :all
|
34
|
+
}
|
35
|
+
|
36
|
+
open_from_filesystem(filename: @config.filename) do |file|
|
37
|
+
return CSV.new(file, **csv_options)
|
38
|
+
end
|
39
|
+
end
|
40
|
+
end
|
40
41
|
end
|
41
42
|
end
|
@@ -4,38 +4,50 @@ module Chronicle
|
|
4
4
|
module ETL
|
5
5
|
# Abstract class representing an Extractor for an ETL job
|
6
6
|
class Extractor
|
7
|
-
extend Chronicle::ETL::
|
7
|
+
extend Chronicle::ETL::Registry::SelfRegistering
|
8
|
+
include Chronicle::ETL::Configurable
|
9
|
+
|
10
|
+
setting :since, type: :date
|
11
|
+
setting :until, type: :date
|
12
|
+
setting :limit
|
13
|
+
setting :load_after_id
|
14
|
+
setting :filename
|
8
15
|
|
9
16
|
# Construct a new instance of this extractor. Options are passed in from a Runner
|
10
|
-
# ==
|
17
|
+
# == Parameters:
|
11
18
|
# options::
|
12
19
|
# Options for configuring this Extractor
|
13
20
|
def initialize(options = {})
|
14
|
-
|
15
|
-
handle_continuation
|
21
|
+
apply_options(options)
|
16
22
|
end
|
17
23
|
|
18
|
-
#
|
19
|
-
def
|
20
|
-
raise NotImplementedError
|
21
|
-
end
|
24
|
+
# Hook called before #extract. Useful for gathering data, initializing proxies, etc
|
25
|
+
def prepare; end
|
22
26
|
|
23
27
|
# An optional method to calculate how many records there are to extract. Used primarily for
|
24
28
|
# building the progress bar
|
25
29
|
def results_count; end
|
26
30
|
|
31
|
+
# Entrypoint for this Extractor. Called by a Runner. Expects a series of records to be yielded
|
32
|
+
def extract
|
33
|
+
raise NotImplementedError
|
34
|
+
end
|
35
|
+
|
27
36
|
private
|
28
37
|
|
29
|
-
|
30
|
-
|
38
|
+
# TODO: reimplement this
|
39
|
+
# def handle_continuation
|
40
|
+
# return unless @config.continuation
|
31
41
|
|
32
|
-
|
33
|
-
|
34
|
-
end
|
42
|
+
# @config.since = @config.continuation.highest_timestamp if @config.continuation.highest_timestamp
|
43
|
+
# @config.load_after_id = @config.continuation.last_id if @config.continuation.last_id
|
44
|
+
# end
|
35
45
|
end
|
36
46
|
end
|
37
47
|
end
|
38
48
|
|
49
|
+
require_relative 'helpers/filesystem_reader'
|
39
50
|
require_relative 'csv_extractor'
|
40
51
|
require_relative 'file_extractor'
|
52
|
+
require_relative 'json_extractor'
|
41
53
|
require_relative 'stdin_extractor'
|
@@ -3,49 +3,34 @@ require 'pathname'
|
|
3
3
|
module Chronicle
|
4
4
|
module ETL
|
5
5
|
class FileExtractor < Chronicle::ETL::Extractor
|
6
|
-
|
7
|
-
if file?
|
8
|
-
extract_file do |data, metadata|
|
9
|
-
yield(data, metadata)
|
10
|
-
end
|
11
|
-
elsif directory?
|
12
|
-
extract_from_directory do |data, metadata|
|
13
|
-
yield(data, metadata)
|
14
|
-
end
|
15
|
-
end
|
16
|
-
end
|
6
|
+
include Extractors::Helpers::FilesystemReader
|
17
7
|
|
18
|
-
|
19
|
-
|
20
|
-
return 1
|
21
|
-
else
|
22
|
-
search_pattern = File.join(@options[:filename], '**/*.eml')
|
23
|
-
Dir.glob(search_pattern).count
|
24
|
-
end
|
8
|
+
register_connector do |r|
|
9
|
+
r.description = 'file or directory of files'
|
25
10
|
end
|
26
11
|
|
27
|
-
|
12
|
+
# TODO: consolidate this with @config.filename
|
13
|
+
setting :dir_glob_pattern
|
28
14
|
|
29
|
-
def
|
30
|
-
search_pattern = File.join(@options[:filename], '**/*.eml')
|
31
|
-
filenames = Dir.glob(search_pattern)
|
15
|
+
def extract
|
32
16
|
filenames.each do |filename|
|
33
|
-
|
34
|
-
yield(file.read, {filename: file})
|
17
|
+
yield Chronicle::ETL::Extraction.new(data: filename)
|
35
18
|
end
|
36
19
|
end
|
37
20
|
|
38
|
-
def
|
39
|
-
|
40
|
-
yield(file.read, {filename: @options[:filename]})
|
21
|
+
def results_count
|
22
|
+
filenames.count
|
41
23
|
end
|
42
24
|
|
43
|
-
|
44
|
-
Pathname.new(@options[:filename]).directory?
|
45
|
-
end
|
25
|
+
private
|
46
26
|
|
47
|
-
def
|
48
|
-
|
27
|
+
def filenames
|
28
|
+
@filenames ||= filenames_in_directory(
|
29
|
+
path: @config.filename,
|
30
|
+
dir_glob_pattern: @config.dir_glob_pattern,
|
31
|
+
load_since: @config.since,
|
32
|
+
load_until: @config.until
|
33
|
+
)
|
49
34
|
end
|
50
35
|
end
|
51
36
|
end
|
@@ -0,0 +1,104 @@
|
|
1
|
+
require 'pathname'
|
2
|
+
|
3
|
+
module Chronicle
  module ETL
    module Extractors
      module Helpers
        # Helpers for extractors that read their input from stdin, a single
        # file, or the files inside a directory.
        module FilesystemReader
          # List the files found under a directory.
          #
          # Accepts the same keyword arguments as #gather_files. When a block
          # is given each filename is also yielded.
          #
          # @return [Array<String>] matching filenames, oldest mtime first
          def filenames_in_directory(...)
            filenames = gather_files(...)
            filenames.each { |filename| yield filename } if block_given?
            filenames
          end

          # Read the content of stdin, a file, or every file in a directory.
          #
          # @param filename [String, IO] a path, or $stdin
          # @param yield_each_line [Boolean] yield line by line when truthy,
          #   otherwise yield each file's whole content once
          # @param dir_glob_pattern [String] glob applied when filename is a directory
          # @yield [String] a line or a whole file's content
          def read_from_filesystem(filename:, yield_each_line: true, dir_glob_pattern: '**/*')
            open_files(filename: filename, dir_glob_pattern: dir_glob_pattern) do |file|
              if yield_each_line
                file.each_line { |line| yield line }
              else
                yield file.read
              end
            ensure
              # Only strings leave this method, so the handle can be released
              # here. $stdin is left open.
              file.close unless stdin?(file)
            end
          end

          # Yield an open IO for stdin, a file, or every file in a directory.
          #
          # @param filename [String, IO] a path, or $stdin
          # @param dir_glob_pattern [String] glob applied when filename is a directory
          # @yield [IO]
          def open_from_filesystem(filename:, dir_glob_pattern: '**/*')
            open_files(filename: filename, dir_glob_pattern: dir_glob_pattern) do |file|
              yield file
            end
          end

          # Including classes are expected to provide their own count.
          #
          # @raise [NotImplementedError] always
          def results_count
            raise NotImplementedError
          end

          private

          # Find files below `path`, filtered by modification time and size.
          #
          # @param path [String] directory to search
          # @param dir_glob_pattern [String, nil] glob relative to path; nil
          #   falls back to '**/*'
          # @param load_since [Time, nil] keep files modified after this time
          # @param load_until [Time, nil] keep files modified before this time
          # @param smaller_than [Integer, nil] keep files below this size in bytes
          # @param larger_than [Integer, nil] keep files above this size in bytes
          # @param sort [Symbol] currently unused; results are sorted by mtime
          # @return [Array<String>]
          def gather_files(path:, dir_glob_pattern: '**/*', load_since: nil, load_until: nil,
                           smaller_than: nil, larger_than: nil, sort: :mtime)
            # Callers may pass an explicit nil (an unset setting); File.join
            # would raise TypeError on it.
            search_pattern = File.join(path, '**', dir_glob_pattern || '**/*')
            files = Dir.glob(search_pattern)

            files = files.select { |f| File.mtime(f) > load_since } if load_since
            files = files.select { |f| File.mtime(f) < load_until } if load_until

            # pass in file sizes in bytes
            files = files.select { |f| File.size(f) < smaller_than } if smaller_than
            files = files.select { |f| File.size(f) > larger_than } if larger_than

            # TODO: incorporate sort argument
            files.sort_by { |f| File.mtime(f) }
          end

          # Yield every path matching the glob inside a directory.
          #
          # @raise [IOError] if path is not a directory
          def select_files_in_directory(path:, dir_glob_pattern: '**/*')
            raise IOError, "#{path} is not a directory." unless directory?(path)

            search_pattern = File.join(path, dir_glob_pattern || '**/*')
            Dir.glob(search_pattern).each do |filename|
              yield(filename)
            end
          end

          # Yield an IO for stdin, for each regular file in a directory, or
          # for a single file. Nothing is yielded if the path is none of these.
          #
          # Handles are deliberately not closed here: callers of
          # #open_from_filesystem may keep reading from the yielded IO after
          # the block exits (e.g. after wrapping it in a CSV parser).
          def open_files(filename:, dir_glob_pattern:)
            if stdin?(filename)
              yield $stdin
            elsif directory?(filename)
              search_pattern = File.join(filename, dir_glob_pattern || '**/*')
              # The glob also matches subdirectories, which cannot be read
              Dir.glob(search_pattern).select { |path| File.file?(path) }.each do |path|
                yield File.open(path)
              end
            elsif file?(filename)
              yield File.open(filename)
            end
          end

          def stdin?(filename)
            filename == $stdin
          end

          def directory?(filename)
            Pathname.new(filename).directory?
          end

          def file?(filename)
            Pathname.new(filename).file?
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
module Chronicle
  module ETL
    # Extractor that reads JSON from stdin, a file, or the files in a
    # directory and yields one Extraction per successfully parsed document.
    class JsonExtractor < Chronicle::ETL::Extractor
      include Extractors::Helpers::FilesystemReader

      register_connector do |r|
        r.description = 'input as JSON'
      end

      setting :filename, default: $stdin
      # When truthy (the default) every input line is parsed as its own JSON
      # document; otherwise each file is parsed as a single document.
      setting :jsonl, default: true

      # Parse the input and yield an Extraction for each parsed document.
      # Input that is not valid JSON is skipped.
      #
      # @yield [Chronicle::ETL::Extraction]
      def extract
        load_input do |input|
          parsed_data = parse_data(input)
          yield Chronicle::ETL::Extraction.new(data: parsed_data) if parsed_data
        end
      end

      # The number of records is not known up front.
      #
      # @return [nil]
      def results_count; end

      private

      # @return [Object, nil] the parsed JSON, or nil when data is not valid JSON
      def parse_data(data)
        JSON.parse(data)
      rescue JSON::ParserError
        nil
      end

      # Yield raw JSON strings read from the configured filename.
      # Options live in @config (populated by Configurable); there is no
      # @options instance variable on extractors.
      def load_input(&block)
        read_from_filesystem(filename: @config.filename, yield_each_line: @config.jsonl, &block)
      end
    end
  end
end
|
@@ -1,9 +1,14 @@
|
|
1
1
|
module Chronicle
|
2
2
|
module ETL
|
3
3
|
class StdinExtractor < Chronicle::ETL::Extractor
|
4
|
+
register_connector do |r|
|
5
|
+
r.description = 'stdin'
|
6
|
+
end
|
7
|
+
|
4
8
|
def extract
|
5
9
|
$stdin.read.each_line do |line|
|
6
|
-
|
10
|
+
data = { line: line.strip }
|
11
|
+
yield Chronicle::ETL::Extraction.new(data: data)
|
7
12
|
end
|
8
13
|
end
|
9
14
|
end
|
data/lib/chronicle/etl/job.rb
CHANGED
@@ -1,6 +1,11 @@
|
|
1
|
+
require 'forwardable'
|
1
2
|
module Chronicle
|
2
3
|
module ETL
|
3
4
|
class Job
|
5
|
+
extend Forwardable
|
6
|
+
|
7
|
+
def_delegators :@job_definition, :dry_run?
|
8
|
+
|
4
9
|
attr_accessor :name,
|
5
10
|
:extractor_klass,
|
6
11
|
:extractor_options,
|
@@ -12,32 +17,30 @@ module Chronicle
|
|
12
17
|
# TODO: build a proper id system
|
13
18
|
alias id name
|
14
19
|
|
15
|
-
def initialize(
|
16
|
-
|
17
|
-
@name = definition[:name]
|
18
|
-
@
|
19
|
-
@
|
20
|
-
|
21
|
-
@transformer_klass = load_klass(:transformer, definition[:transformer][:name])
|
22
|
-
@transformer_options = definition[:transformer][:options] || {}
|
23
|
-
|
24
|
-
@loader_klass = load_klass(:loader, definition[:loader][:name])
|
25
|
-
@loader_options = definition[:loader][:options] || {}
|
20
|
+
def initialize(job_definition)
|
21
|
+
@job_definition = job_definition
|
22
|
+
@name = @job_definition.definition[:name]
|
23
|
+
@extractor_options = @job_definition.extractor_options
|
24
|
+
@transformer_options = @job_definition.transformer_options
|
25
|
+
@loader_options = @job_definition.loader_options
|
26
26
|
|
27
|
-
set_continuation if
|
27
|
+
set_continuation if use_continuation?
|
28
28
|
yield self if block_given?
|
29
29
|
end
|
30
30
|
|
31
31
|
def instantiate_extractor
|
32
|
-
|
32
|
+
@extractor_klass = @job_definition.extractor_klass
|
33
|
+
@extractor_klass.new(@extractor_options)
|
33
34
|
end
|
34
35
|
|
35
|
-
def instantiate_transformer(
|
36
|
-
|
36
|
+
def instantiate_transformer(extraction)
|
37
|
+
@transformer_klass = @job_definition.transformer_klass
|
38
|
+
@transformer_klass.new(extraction, @transformer_options)
|
37
39
|
end
|
38
40
|
|
39
41
|
def instantiate_loader
|
40
|
-
|
42
|
+
@loader_klass = @job_definition.loader_klass
|
43
|
+
@loader_klass.new(@loader_options)
|
41
44
|
end
|
42
45
|
|
43
46
|
def save_log?
|
@@ -45,26 +48,24 @@ module Chronicle
|
|
45
48
|
return !id.nil?
|
46
49
|
end
|
47
50
|
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
51
|
+
def to_s
|
52
|
+
output = "Job"
|
53
|
+
output += " '#{name}'".bold if name
|
54
|
+
output += "\n"
|
55
|
+
output += " → Extracting from #{@job_definition.extractor_klass.description}\n"
|
56
|
+
output += " → Transforming #{@job_definition.transformer_klass.description}\n"
|
57
|
+
output += " → Loading to #{@job_definition.loader_klass.description}\n"
|
55
58
|
end
|
56
59
|
|
57
|
-
|
58
|
-
Chronicle::ETL::Catalog.phase_and_identifier_to_klass(phase, identifier)
|
59
|
-
end
|
60
|
+
private
|
60
61
|
|
61
62
|
# Fetch whatever JobLogger.load_latest returns for this job and pass it to
# the extractor through its :continuation option.
def set_continuation
  # Use the `id` reader (an alias of `name`); the @id instance variable is
  # never assigned, so it was always nil.
  continuation = Chronicle::ETL::JobLogger.load_latest(id)
  @extractor_options[:continuation] = continuation
end
|
65
66
|
|
66
|
-
def
|
67
|
-
|
67
|
+
def use_continuation?
|
68
|
+
@job_definition.incremental?
|
68
69
|
end
|
69
70
|
end
|
70
71
|
end
|