chronicle-etl 0.2.4 → 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.github/workflows/ruby.yml +35 -0
- data/.gitignore +3 -0
- data/.rubocop.yml +31 -1
- data/Guardfile +7 -0
- data/README.md +21 -14
- data/Rakefile +4 -2
- data/chronicle-etl.gemspec +18 -10
- data/exe/chronicle-etl +1 -1
- data/lib/chronicle/etl/cli/connectors.rb +53 -7
- data/lib/chronicle/etl/cli/jobs.rb +59 -24
- data/lib/chronicle/etl/cli/main.rb +18 -16
- data/lib/chronicle/etl/cli/subcommand_base.rb +2 -2
- data/lib/chronicle/etl/cli.rb +7 -0
- data/lib/chronicle/etl/config.rb +1 -1
- data/lib/chronicle/etl/configurable.rb +150 -0
- data/lib/chronicle/etl/exceptions.rb +14 -1
- data/lib/chronicle/etl/extraction.rb +12 -0
- data/lib/chronicle/etl/extractors/csv_extractor.rb +32 -31
- data/lib/chronicle/etl/extractors/extractor.rb +25 -13
- data/lib/chronicle/etl/extractors/file_extractor.rb +17 -32
- data/lib/chronicle/etl/extractors/helpers/filesystem_reader.rb +104 -0
- data/lib/chronicle/etl/extractors/json_extractor.rb +37 -0
- data/lib/chronicle/etl/extractors/stdin_extractor.rb +6 -1
- data/lib/chronicle/etl/job.rb +30 -29
- data/lib/chronicle/etl/job_definition.rb +45 -7
- data/lib/chronicle/etl/job_log.rb +10 -0
- data/lib/chronicle/etl/job_logger.rb +23 -20
- data/lib/chronicle/etl/loaders/csv_loader.rb +5 -1
- data/lib/chronicle/etl/loaders/loader.rb +5 -2
- data/lib/chronicle/etl/loaders/rest_loader.rb +9 -5
- data/lib/chronicle/etl/loaders/stdout_loader.rb +6 -1
- data/lib/chronicle/etl/loaders/table_loader.rb +51 -7
- data/lib/chronicle/etl/logger.rb +48 -0
- data/lib/chronicle/etl/models/attachment.rb +14 -0
- data/lib/chronicle/etl/models/base.rb +23 -7
- data/lib/chronicle/etl/models/entity.rb +9 -3
- data/lib/chronicle/etl/registry/connector_registration.rb +62 -0
- data/lib/chronicle/etl/registry/registry.rb +52 -0
- data/lib/chronicle/etl/registry/self_registering.rb +25 -0
- data/lib/chronicle/etl/runner.rb +58 -7
- data/lib/chronicle/etl/serializers/jsonapi_serializer.rb +25 -0
- data/lib/chronicle/etl/serializers/serializer.rb +27 -0
- data/lib/chronicle/etl/transformers/image_file_transformer.rb +247 -0
- data/lib/chronicle/etl/transformers/null_transformer.rb +10 -1
- data/lib/chronicle/etl/transformers/transformer.rb +41 -10
- data/lib/chronicle/etl/utils/binary_attachments.rb +21 -0
- data/lib/chronicle/etl/utils/progress_bar.rb +3 -1
- data/lib/chronicle/etl/utils/text_recognition.rb +15 -0
- data/lib/chronicle/etl/version.rb +1 -1
- data/lib/chronicle/etl.rb +8 -2
- metadata +146 -34
- data/.ruby-version +0 -1
- data/Gemfile.lock +0 -91
- data/lib/chronicle/etl/catalog.rb +0 -108
- data/lib/chronicle/etl/utils/jsonapi.rb +0 -28
@@ -0,0 +1,150 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "ostruct"
require "time"
|
4
|
+
|
5
|
+
module Chronicle
|
6
|
+
module ETL
|
7
|
+
# A mixin that gives a class
|
8
|
+
# a {Chronicle::ETL::Configurable::ClassMethods#setting} macro to define
|
9
|
+
# settings and their properties (require, type, etc)
|
10
|
+
#
|
11
|
+
# @example Basic usage
|
12
|
+
# class Test < Chronicle::ETL::Extractor
|
13
|
+
# include Chronicle::ETL::Configurable
|
14
|
+
# setting :when, type: :time, required: true
|
15
|
+
# end
|
16
|
+
#
|
17
|
+
# t = Test.new(when: '2022-02-24')
|
18
|
+
# t.config.when
|
19
|
+
module Configurable
|
20
|
+
# An individual setting for this Configurable
|
21
|
+
Setting = Struct.new(:default, :required, :type)
|
22
|
+
private_constant :Setting
|
23
|
+
|
24
|
+
# Holds the option values of a Configurable instance; every setting is
# readable as an attribute (behaviour inherited from OpenStruct)
class Config < OpenStruct
  # @return [Hash] the stored values as a hash, leaving out every entry
  #   whose value is nil
  def compacted_h
    to_h.reject { |_name, value| value.nil? }
  end
end
|
31
|
+
|
32
|
+
# @private
# Hook invoked when Configurable is included into +klass+: adds the
# class-level macros, the instance helpers and the prepended initializer
def self.included(klass)
  klass.extend(ClassMethods)
  klass.class_eval do
    include InstanceMethods
    prepend Initializer
  end
end
|
38
|
+
|
39
|
+
# Constructor wrapper for classes that have Configurable mixed in. It is
# prepended, so it runs ahead of the class's own #initialize
module Initializer
  # Seeds @config with the class's default values, then passes every
  # argument through to the original constructor
  def initialize(*args)
    @config = self.class.config_with_defaults
    super
  end
end
|
47
|
+
|
48
|
+
# Instance methods for classes that have Configurable mixed in
|
49
|
+
module InstanceMethods
|
50
|
+
attr_reader :config
|
51
|
+
|
52
|
+
# Applies user-supplied options to this instance's settings: keys are
# symbolized, each value is coerced according to its setting and stored in
# @config, and the resulting config is checked against the required
# settings.
#
# @param options [Hash] option names (String or Symbol) mapped to values;
#   the hash passed in is left untouched
# @return [Hash] a copy of +options+ with symbolized keys
# @raise [Chronicle::ETL::ConfigurationError] if an option does not match a
#   known setting, or a required setting ends up without a value
def apply_options(options)
  # Non-destructive transform: the bang variant rewrote the caller's hash
  # in place and raised FrozenError when handed a frozen hash
  symbolized = options.transform_keys(&:to_sym)

  symbolized.each do |name, value|
    setting = self.class.all_settings[name]
    raise(Chronicle::ETL::ConfigurationError, "Unrecognized setting: #{name}") unless setting

    @config[name] = coerced_value(setting, value)
  end
  validate_config
  symbolized
end
|
67
|
+
|
68
|
+
# Names of all settings available to this object's class.
#
# Defined as an instance method: written as `def self.settings` it was
# attached to the InstanceMethods module itself, so it was never mixed into
# including classes, and `self.class` resolved to Module (which has no
# all_settings).
#
# @return [Array] setting names, as returned by the class's all_settings
def settings
  self.class.all_settings.keys
end
|
72
|
+
|
73
|
+
private
|
74
|
+
|
75
|
+
# Builds the starting config for this instance by asking its class for a
# config populated with the declared default values
def initialize_default_config
  klass = self.class
  klass.config_with_defaults
end
|
78
|
+
|
79
|
+
# Checks that every required setting has a non-nil value in @config.
#
# @return [nil] when nothing is missing
# @raise [Chronicle::ETL::ConfigurationError] listing the missing settings
def validate_config
  present = @config.compacted_h.keys
  missing = self.class.all_required_settings.keys.reject { |name| present.include?(name) }
  return if missing.empty?

  raise Chronicle::ETL::ConfigurationError, "Missing options: #{missing}"
end
|
83
|
+
|
84
|
+
# Converts a raw option value to the type declared on its setting by
# dispatching to the matching private coerce_<type> method.
#
# @param setting [#type] setting definition; a nil type means no coercion
# @param value [Object] raw value supplied by the user
# @return [Object] the coerced value, or +value+ itself for untyped settings
# @raise [Chronicle::ETL::ConfigurationError] when the declared type has no
#   coerce_<type> method (previously a bare NoMethodError from __send__)
def coerced_value(setting, value)
  return value unless setting.type

  coercer = "coerce_#{setting.type}"
  # Coercers are private, so private methods must be included in the check
  unless respond_to?(coercer, true)
    raise Chronicle::ETL::ConfigurationError, "Unsupported setting type: #{setting.type}"
  end

  __send__(coercer, value)
end
|
87
|
+
|
88
|
+
# Coercion used for settings declared with `type: :string`
#
# @param value [Object] raw option value
# @return [String] the result of calling to_s on +value+
def coerce_string(value)
  value.to_s
end
|
91
|
+
|
92
|
+
# Coercion used for settings declared with `type: :time`. Strings are
# parsed with Time.parse, which comes from the stdlib `time` extension (it
# must be required by the file); any other value is passed through.
#
# @param value [String, Object] raw option value
# @return [Time, Object] a Time for String input, otherwise +value+ unchanged
# @raise [ArgumentError] if Time.parse cannot parse the string
def coerce_time(value)
  # TODO: handle durations like '3h'
  return value unless value.is_a?(String)

  Time.parse(value)
end
|
100
|
+
end
|
101
|
+
|
102
|
+
# Class methods for classes that have Configurable mixed in
|
103
|
+
module ClassMethods
|
104
|
+
# Macro that declares a setting on a class that includes
# {::Chronicle::ETL::Configurable}
#
# @param [Symbol] name Name of the setting
# @param [Object] default Value used when the option is not supplied
# @param [Boolean] required Whether a value must be present
# @param [Symbol, nil] type Coercion applied to supplied values (the
#   matching private coerce_<type> instance method is called); nil for none
# @return [Setting] the stored setting definition
#
# @example Basic usage
#   setting :when, type: :time, required: true
#
# @see ::Chronicle::ETL::Configurable
def setting(name, default: nil, required: false, type: nil)
  settings[name] = Setting.new(default, required, type)
end
|
119
|
+
|
120
|
+
# Settings declared on this class merged over those of its ancestors that
# include Configurable; a definition on this class replaces an inherited
# one with the same name
#
# @return [Hash] setting name => setting definition
def all_settings
  return settings unless superclass.include?(Chronicle::ETL::Configurable)

  superclass.all_settings.merge(settings)
end
|
129
|
+
|
130
|
+
# The subset of all_settings whose definition is flagged as required
#
# @return [Hash] setting name => setting definition (empty when none)
def all_required_settings
  all_settings.select { |_name, definition| definition.required }
end
|
134
|
+
|
135
|
+
# Settings declared directly on this class, keyed by name; the backing hash
# is created on first access
def settings
  @settings = {} unless @settings
  @settings
end
|
138
|
+
|
139
|
+
# Whether a setting called +name+ is defined on this class or inherited
#
# @param name [Symbol] setting name to look up
# @return [Boolean]
def setting_exists?(name)
  all_settings.key?(name)
end
|
142
|
+
|
143
|
+
# Builds a Config holding the default value of every setting (own and
# inherited). Default objects are stored as-is, not copied.
#
# @return [Config]
def config_with_defaults
  defaults = all_settings.each_with_object({}) do |(name, definition), memo|
    memo[name] = definition.default
  end
  Config.new(defaults)
end
|
147
|
+
end
|
148
|
+
end
|
149
|
+
end
|
150
|
+
end
|
@@ -2,7 +2,9 @@ module Chronicle
|
|
2
2
|
module ETL
|
3
3
|
class Error < StandardError; end;
|
4
4
|
|
5
|
-
class
|
5
|
+
class ConfigurationError < Error; end;
|
6
|
+
|
7
|
+
class RunnerTypeError < Error; end
|
6
8
|
|
7
9
|
class ConnectorNotAvailableError < Error
|
8
10
|
def initialize(message, provider: nil, name: nil)
|
@@ -15,5 +17,16 @@ module Chronicle
|
|
15
17
|
|
16
18
|
class ProviderNotAvailableError < ConnectorNotAvailableError; end
|
17
19
|
class ProviderConnectorNotAvailableError < ConnectorNotAvailableError; end
|
20
|
+
|
21
|
+
# Error that carries the transformation it relates to, exposed through
# #transformation
class TransformationError < Error
  attr_reader :transformation

  # @param message [String, nil] error message handed to the superclass
  # @param transformation [Object] transformation to attach (required keyword)
  def initialize(message = nil, transformation:)
    @transformation = transformation
    super(message)
  end
end
|
29
|
+
|
30
|
+
class UntransformableRecordError < TransformationError; end
|
18
31
|
end
|
19
32
|
end
|
@@ -1,41 +1,42 @@
|
|
1
1
|
require 'csv'
|
2
|
-
class Chronicle::ETL::CsvExtractor < Chronicle::ETL::Extractor
|
3
|
-
DEFAULT_OPTIONS = {
|
4
|
-
headers: true,
|
5
|
-
filename: $stdin
|
6
|
-
}.freeze
|
7
|
-
|
8
|
-
def initialize(options = {})
|
9
|
-
super(DEFAULT_OPTIONS.merge(options))
|
10
|
-
end
|
11
2
|
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
yield result
|
17
|
-
end
|
18
|
-
end
|
3
|
+
module Chronicle
|
4
|
+
module ETL
|
5
|
+
class CSVExtractor < Chronicle::ETL::Extractor
|
6
|
+
include Extractors::Helpers::FilesystemReader
|
19
7
|
|
20
|
-
|
21
|
-
|
22
|
-
|
8
|
+
register_connector do |r|
|
9
|
+
r.description = 'input as CSV'
|
10
|
+
end
|
23
11
|
|
24
|
-
|
12
|
+
setting :headers, default: true
|
13
|
+
setting :filename, default: $stdin
|
25
14
|
|
26
|
-
|
27
|
-
|
15
|
+
# Yields one Extraction per CSV row, with the row converted to a Hash as
# the extraction's data
def extract
  initialize_csv.each do |row|
    yield Chronicle::ETL::Extraction.new(data: row.to_h)
  end
end
|
28
21
|
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
}
|
22
|
+
# Number of rows in the configured CSV file, obtained by reading the whole
# file. Returns nil when the input is $stdin.
def results_count
  source = @config.filename
  return if stdin?(source)

  CSV.read(source, headers: @config.headers).count
end
|
33
25
|
|
34
|
-
|
35
|
-
|
36
|
-
|
26
|
+
private
|
27
|
+
|
28
|
+
# Builds a CSV reader over the configured input. A String `headers` setting
# is split on commas into a list of column names; any other value is handed
# to CSV as-is. All built-in converters are enabled.
#
# @return [CSV, nil] reader over the first handle yielded by
#   open_from_filesystem; nil if nothing is yielded
def initialize_csv
  raw_headers = @config.headers
  column_names = raw_headers.is_a?(String) ? raw_headers.split(',') : raw_headers

  open_from_filesystem(filename: @config.filename) do |file|
    # Non-local return: leaves the method as soon as one handle is available
    return CSV.new(file, headers: column_names, converters: :all)
  end
end
|
40
|
+
end
|
40
41
|
end
|
41
42
|
end
|
@@ -4,38 +4,50 @@ module Chronicle
|
|
4
4
|
module ETL
|
5
5
|
# Abstract class representing an Extractor for an ETL job
|
6
6
|
class Extractor
|
7
|
-
extend Chronicle::ETL::
|
7
|
+
extend Chronicle::ETL::Registry::SelfRegistering
|
8
|
+
include Chronicle::ETL::Configurable
|
9
|
+
|
10
|
+
# Bounds of the extraction window. Declared as :time because Configurable
# only defines coerce_string and coerce_time; a :date type has no coercer,
# so supplying either option failed during coercion
setting :since, type: :time
setting :until, type: :time
|
12
|
+
setting :limit
|
13
|
+
setting :load_after_id
|
14
|
+
setting :filename
|
8
15
|
|
9
16
|
# Construct a new instance of this extractor. Options are passed in from a Runner
|
10
|
-
# ==
|
17
|
+
# == Parameters:
|
11
18
|
# options::
|
12
19
|
# Options for configuring this Extractor
|
13
20
|
def initialize(options = {})
|
14
|
-
|
15
|
-
handle_continuation
|
21
|
+
apply_options(options)
|
16
22
|
end
|
17
23
|
|
18
|
-
#
|
19
|
-
def
|
20
|
-
raise NotImplementedError
|
21
|
-
end
|
24
|
+
# Hook called before #extract. Useful for gathering data, initializing proxies, etc
|
25
|
+
def prepare; end
|
22
26
|
|
23
27
|
# An optional method to calculate how many records there are to extract. Used primarily for
|
24
28
|
# building the progress bar
|
25
29
|
def results_count; end
|
26
30
|
|
31
|
+
# Entrypoint for this Extractor. Called by a Runner. Expects a series of records to be yielded
|
32
|
+
def extract
|
33
|
+
raise NotImplementedError
|
34
|
+
end
|
35
|
+
|
27
36
|
private
|
28
37
|
|
29
|
-
|
30
|
-
|
38
|
+
# TODO: reimplement this
|
39
|
+
# def handle_continuation
|
40
|
+
# return unless @config.continuation
|
31
41
|
|
32
|
-
|
33
|
-
|
34
|
-
end
|
42
|
+
# @config.since = @config.continuation.highest_timestamp if @config.continuation.highest_timestamp
|
43
|
+
# @config.load_after_id = @config.continuation.last_id if @config.continuation.last_id
|
44
|
+
# end
|
35
45
|
end
|
36
46
|
end
|
37
47
|
end
|
38
48
|
|
49
|
+
require_relative 'helpers/filesystem_reader'
|
39
50
|
require_relative 'csv_extractor'
|
40
51
|
require_relative 'file_extractor'
|
52
|
+
require_relative 'json_extractor'
|
41
53
|
require_relative 'stdin_extractor'
|
@@ -3,49 +3,34 @@ require 'pathname'
|
|
3
3
|
module Chronicle
|
4
4
|
module ETL
|
5
5
|
class FileExtractor < Chronicle::ETL::Extractor
|
6
|
-
|
7
|
-
if file?
|
8
|
-
extract_file do |data, metadata|
|
9
|
-
yield(data, metadata)
|
10
|
-
end
|
11
|
-
elsif directory?
|
12
|
-
extract_from_directory do |data, metadata|
|
13
|
-
yield(data, metadata)
|
14
|
-
end
|
15
|
-
end
|
16
|
-
end
|
6
|
+
include Extractors::Helpers::FilesystemReader
|
17
7
|
|
18
|
-
|
19
|
-
|
20
|
-
return 1
|
21
|
-
else
|
22
|
-
search_pattern = File.join(@options[:filename], '**/*.eml')
|
23
|
-
Dir.glob(search_pattern).count
|
24
|
-
end
|
8
|
+
register_connector do |r|
|
9
|
+
r.description = 'file or directory of files'
|
25
10
|
end
|
26
11
|
|
27
|
-
|
12
|
+
# TODO: consolidate this with @config.filename
|
13
|
+
# Defaults to every file below the path; without a default, nil was
# forwarded to gather_files, where File.join raises TypeError
setting :dir_glob_pattern, default: '**/*'
|
28
14
|
|
29
|
-
def
|
30
|
-
search_pattern = File.join(@options[:filename], '**/*.eml')
|
31
|
-
filenames = Dir.glob(search_pattern)
|
15
|
+
def extract
|
32
16
|
filenames.each do |filename|
|
33
|
-
|
34
|
-
yield(file.read, {filename: file})
|
17
|
+
yield Chronicle::ETL::Extraction.new(data: filename)
|
35
18
|
end
|
36
19
|
end
|
37
20
|
|
38
|
-
def
|
39
|
-
|
40
|
-
yield(file.read, {filename: @options[:filename]})
|
21
|
+
def results_count
|
22
|
+
filenames.count
|
41
23
|
end
|
42
24
|
|
43
|
-
|
44
|
-
Pathname.new(@options[:filename]).directory?
|
45
|
-
end
|
25
|
+
private
|
46
26
|
|
47
|
-
def
|
48
|
-
|
27
|
+
def filenames
|
28
|
+
@filenames ||= filenames_in_directory(
|
29
|
+
path: @config.filename,
|
30
|
+
dir_glob_pattern: @config.dir_glob_pattern,
|
31
|
+
load_since: @config.since,
|
32
|
+
load_until: @config.until
|
33
|
+
)
|
49
34
|
end
|
50
35
|
end
|
51
36
|
end
|
@@ -0,0 +1,104 @@
|
|
1
|
+
require 'pathname'
|
2
|
+
|
3
|
+
module Chronicle
|
4
|
+
module ETL
|
5
|
+
module Extractors
|
6
|
+
module Helpers
|
7
|
+
module FilesystemReader
|
8
|
+
|
9
|
+
# Collects the files matching the given criteria (all arguments are
# forwarded to gather_files). With a block, each filename is yielded.
#
# @return [Array<String>] the matching filenames, with or without a block
def filenames_in_directory(...)
  matches = gather_files(...)
  matches.each { |match| yield match } if block_given?
  matches
end
|
19
|
+
|
20
|
+
# Reads the content behind +filename+ (stdin, a single file, or the files
# of a directory matched by +dir_glob_pattern+) and yields it to the block.
#
# @param filename [String, IO] path, or $stdin
# @param yield_each_line [Boolean] yield line by line when true, otherwise
#   yield each handle's whole content once
# @param dir_glob_pattern [String] glob applied when +filename+ is a directory
def read_from_filesystem(filename:, yield_each_line: true, dir_glob_pattern: '**/*')
  open_files(filename: filename, dir_glob_pattern: dir_glob_pattern) do |handle|
    next yield(handle.read) unless yield_each_line

    handle.each_line { |line| yield line }
  end
end
|
31
|
+
|
32
|
+
# Yields an open handle for +filename+: $stdin itself, the file, or each
# file of a directory matched by +dir_glob_pattern+. Reading is left to the
# caller.
def open_from_filesystem(filename:, dir_glob_pattern: '**/*')
  open_files(filename: filename, dir_glob_pattern: dir_glob_pattern) { |handle| yield handle }
end
|
37
|
+
|
38
|
+
# Not implemented in this helper.
#
# @raise [NotImplementedError] always
def results_count
  # Earlier draft, kept for reference:
  #   if file?
  #     return 1
  #   else
  #     search_pattern = File.join(@options[:filename], '**/*')
  #     Dir.glob(search_pattern).count
  #   end
  raise NotImplementedError
end
|
47
|
+
|
48
|
+
private
|
49
|
+
|
50
|
+
# Finds files under +path+ (searched recursively) whose path matches
# +dir_glob_pattern+, applies the optional modification-time and size
# filters, and returns them ordered by modification time, oldest first.
#
# @param path [String] directory to search
# @param dir_glob_pattern [String, nil] glob relative to +path+; nil falls
#   back to '**/*'
# @param load_since [Time, nil] keep files modified strictly after this time
# @param load_until [Time, nil] keep files modified strictly before this time
# @param smaller_than [Integer, nil] keep files strictly smaller than this many bytes
# @param larger_than [Integer, nil] keep files strictly larger than this many bytes
# @param sort [Symbol] currently ignored; results are always sorted by mtime
# @return [Array<String>] matching paths
def gather_files(path:, dir_glob_pattern: '**/*', load_since: nil, load_until: nil, smaller_than: nil, larger_than: nil, sort: :mtime)
  # Callers may forward an unset (nil) pattern; File.join would raise
  # TypeError on it, so fall back to the default
  pattern = dir_glob_pattern || '**/*'
  files = Dir.glob(File.join(path, '**', pattern))

  files = files.select { |f| File.mtime(f) > load_since } if load_since
  files = files.select { |f| File.mtime(f) < load_until } if load_until

  # pass in file sizes in bytes
  files = files.select { |f| File.size(f) < smaller_than } if smaller_than
  files = files.select { |f| File.size(f) > larger_than } if larger_than

  # TODO: incorporate sort argument
  files.sort_by { |f| File.mtime(f) }
end
|
64
|
+
|
65
|
+
# Yields every path inside +path+ that matches +dir_glob_pattern+.
#
# @param path [String] directory to search
# @param dir_glob_pattern [String] glob relative to +path+
# @return [Array<String>] the matched paths
# @raise [IOError] if +path+ is not a directory
def select_files_in_directory(path:, dir_glob_pattern: '**/*')
  raise IOError, "#{path} is not a directory." unless directory?(path)

  Dir.glob(File.join(path, dir_glob_pattern)).each { |match| yield(match) }
end
|
73
|
+
|
74
|
+
# Yields an IO for the given input: $stdin itself, one handle per file
# matched by +dir_glob_pattern+ when +filename+ is a directory, or a single
# handle when it is a regular file. Yields nothing for any other path.
#
# Handles are opened without a block and are not closed here.
def open_files(filename:, dir_glob_pattern:)
  return yield($stdin) if stdin?(filename)

  if directory?(filename)
    Dir.glob(File.join(filename, dir_glob_pattern)).each do |entry|
      yield(File.open(entry))
    end
  elsif file?(filename)
    yield File.open(filename)
  end
end
|
88
|
+
|
89
|
+
# Whether the given input is the $stdin stream rather than a path
def stdin?(filename)
  filename == $stdin
end

# Whether +filename+ is the path of an existing directory
def directory?(filename)
  Pathname(filename).directory?
end

# Whether +filename+ is the path of an existing regular file
def file?(filename)
  Pathname(filename).file?
end
|
100
|
+
end
|
101
|
+
end
|
102
|
+
end
|
103
|
+
end
|
104
|
+
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
module Chronicle
|
2
|
+
module ETL
|
3
|
+
class JsonExtractor < Chronicle::ETL::Extractor
|
4
|
+
include Extractors::Helpers::FilesystemReader
|
5
|
+
|
6
|
+
register_connector do |r|
|
7
|
+
r.description = 'input as JSON'
|
8
|
+
end
|
9
|
+
|
10
|
+
setting :filename, default: $stdin
|
11
|
+
setting :jsonl, default: true
|
12
|
+
|
13
|
+
# Yields an Extraction for every chunk of input that parses as JSON; chunks
# for which parse_data returns nil are skipped
def extract
  load_input do |raw|
    record = parse_data(raw)
    next unless record

    yield Chronicle::ETL::Extraction.new(data: record)
  end
end
|
19
|
+
|
20
|
+
# No record count is computed for JSON input
#
# @return [nil]
def results_count
  nil
end
|
22
|
+
|
23
|
+
private
|
24
|
+
|
25
|
+
# Parses a chunk of input as JSON.
#
# @param data [String] raw JSON text
# @return [Object, nil] the parsed value, or nil when the text is not valid
#   JSON (the parse error is swallowed)
def parse_data(data)
  JSON.parse(data)
rescue JSON::ParserError
  nil
end
|
29
|
+
|
30
|
+
# Yields the configured input chunk by chunk (read_from_filesystem yields
# line by line by default).
#
# The filename is read from @config: options are applied through
# Configurable#apply_options, and @options is not assigned by the
# Extractor constructor, so `@options[:filename]` failed on nil.
def load_input
  read_from_filesystem(filename: @config.filename) do |data|
    yield data
  end
end
|
35
|
+
end
|
36
|
+
end
|
37
|
+
end
|
@@ -1,9 +1,14 @@
|
|
1
1
|
module Chronicle
|
2
2
|
module ETL
|
3
3
|
class StdinExtractor < Chronicle::ETL::Extractor
|
4
|
+
register_connector do |r|
|
5
|
+
r.description = 'stdin'
|
6
|
+
end
|
7
|
+
|
4
8
|
def extract
|
5
9
|
$stdin.read.each_line do |line|
|
6
|
-
|
10
|
+
data = { line: line.strip }
|
11
|
+
yield Chronicle::ETL::Extraction.new(data: data)
|
7
12
|
end
|
8
13
|
end
|
9
14
|
end
|
data/lib/chronicle/etl/job.rb
CHANGED
@@ -1,6 +1,11 @@
|
|
1
|
+
require 'forwardable'
|
1
2
|
module Chronicle
|
2
3
|
module ETL
|
3
4
|
class Job
|
5
|
+
extend Forwardable
|
6
|
+
|
7
|
+
def_delegators :@job_definition, :dry_run?
|
8
|
+
|
4
9
|
attr_accessor :name,
|
5
10
|
:extractor_klass,
|
6
11
|
:extractor_options,
|
@@ -12,32 +17,30 @@ module Chronicle
|
|
12
17
|
# TODO: build a proper id system
|
13
18
|
alias id name
|
14
19
|
|
15
|
-
def initialize(
|
16
|
-
|
17
|
-
@name = definition[:name]
|
18
|
-
@
|
19
|
-
@
|
20
|
-
|
21
|
-
@transformer_klass = load_klass(:transformer, definition[:transformer][:name])
|
22
|
-
@transformer_options = definition[:transformer][:options] || {}
|
23
|
-
|
24
|
-
@loader_klass = load_klass(:loader, definition[:loader][:name])
|
25
|
-
@loader_options = definition[:loader][:options] || {}
|
20
|
+
def initialize(job_definition)
|
21
|
+
@job_definition = job_definition
|
22
|
+
@name = @job_definition.definition[:name]
|
23
|
+
@extractor_options = @job_definition.extractor_options
|
24
|
+
@transformer_options = @job_definition.transformer_options
|
25
|
+
@loader_options = @job_definition.loader_options
|
26
26
|
|
27
|
-
set_continuation if
|
27
|
+
set_continuation if use_continuation?
|
28
28
|
yield self if block_given?
|
29
29
|
end
|
30
30
|
|
31
31
|
def instantiate_extractor
|
32
|
-
|
32
|
+
@extractor_klass = @job_definition.extractor_klass
|
33
|
+
@extractor_klass.new(@extractor_options)
|
33
34
|
end
|
34
35
|
|
35
|
-
def instantiate_transformer(
|
36
|
-
|
36
|
+
def instantiate_transformer(extraction)
|
37
|
+
@transformer_klass = @job_definition.transformer_klass
|
38
|
+
@transformer_klass.new(extraction, @transformer_options)
|
37
39
|
end
|
38
40
|
|
39
41
|
def instantiate_loader
|
40
|
-
|
42
|
+
@loader_klass = @job_definition.loader_klass
|
43
|
+
@loader_klass.new(@loader_options)
|
41
44
|
end
|
42
45
|
|
43
46
|
def save_log?
|
@@ -45,26 +48,24 @@ module Chronicle
|
|
45
48
|
return !id.nil?
|
46
49
|
end
|
47
50
|
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
51
|
+
# Multi-line, human-readable summary of the job: a title line (with the
# bolded job name when one is set) followed by one line each describing the
# extractor, transformer and loader classes
def to_s
  title = "Job"
  title += " '#{name}'".bold if name
  steps = [
    " → Extracting from #{@job_definition.extractor_klass.description}",
    " → Transforming #{@job_definition.transformer_klass.description}",
    " → Loading to #{@job_definition.loader_klass.description}"
  ]
  "#{[title, *steps].join("\n")}\n"
end
|
56
59
|
|
57
|
-
|
58
|
-
Chronicle::ETL::Catalog.phase_and_identifier_to_klass(phase, identifier)
|
59
|
-
end
|
60
|
+
private
|
60
61
|
|
61
62
|
def set_continuation
|
62
|
-
continuation = Chronicle::ETL::JobLogger.load_latest(@
|
63
|
+
# Use the `id` reader (an alias of `name`); @id is never assigned, so nil was passed
continuation = Chronicle::ETL::JobLogger.load_latest(id)
|
63
64
|
@extractor_options[:continuation] = continuation
|
64
65
|
end
|
65
66
|
|
66
|
-
def
|
67
|
-
|
67
|
+
def use_continuation?
|
68
|
+
@job_definition.incremental?
|
68
69
|
end
|
69
70
|
end
|
70
71
|
end
|