chronicle-etl 0.2.2 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +3 -0
- data/.rubocop.yml +3 -0
- data/README.md +22 -15
- data/chronicle-etl.gemspec +13 -7
- data/lib/chronicle/etl/cli/connectors.rb +19 -7
- data/lib/chronicle/etl/cli/jobs.rb +38 -26
- data/lib/chronicle/etl/cli/main.rb +10 -2
- data/lib/chronicle/etl/config.rb +24 -3
- data/lib/chronicle/etl/exceptions.rb +13 -0
- data/lib/chronicle/etl/extraction.rb +12 -0
- data/lib/chronicle/etl/extractors/csv_extractor.rb +43 -37
- data/lib/chronicle/etl/extractors/extractor.rb +25 -4
- data/lib/chronicle/etl/extractors/file_extractor.rb +15 -33
- data/lib/chronicle/etl/extractors/helpers/filesystem_reader.rb +104 -0
- data/lib/chronicle/etl/extractors/json_extractor.rb +45 -0
- data/lib/chronicle/etl/extractors/stdin_extractor.rb +6 -1
- data/lib/chronicle/etl/job.rb +72 -0
- data/lib/chronicle/etl/job_definition.rb +89 -0
- data/lib/chronicle/etl/job_log.rb +95 -0
- data/lib/chronicle/etl/job_logger.rb +81 -0
- data/lib/chronicle/etl/loaders/csv_loader.rb +6 -6
- data/lib/chronicle/etl/loaders/loader.rb +2 -2
- data/lib/chronicle/etl/loaders/rest_loader.rb +16 -9
- data/lib/chronicle/etl/loaders/stdout_loader.rb +8 -3
- data/lib/chronicle/etl/loaders/table_loader.rb +58 -7
- data/lib/chronicle/etl/logger.rb +48 -0
- data/lib/chronicle/etl/models/activity.rb +15 -0
- data/lib/chronicle/etl/models/attachment.rb +14 -0
- data/lib/chronicle/etl/models/base.rb +119 -0
- data/lib/chronicle/etl/models/entity.rb +21 -0
- data/lib/chronicle/etl/models/generic.rb +23 -0
- data/lib/chronicle/etl/registry/connector_registration.rb +61 -0
- data/lib/chronicle/etl/registry/registry.rb +52 -0
- data/lib/chronicle/etl/registry/self_registering.rb +25 -0
- data/lib/chronicle/etl/runner.rb +66 -24
- data/lib/chronicle/etl/serializers/jsonapi_serializer.rb +25 -0
- data/lib/chronicle/etl/serializers/serializer.rb +27 -0
- data/lib/chronicle/etl/transformers/image_file_transformer.rb +253 -0
- data/lib/chronicle/etl/transformers/null_transformer.rb +11 -3
- data/lib/chronicle/etl/transformers/transformer.rb +42 -13
- data/lib/chronicle/etl/utils/binary_attachments.rb +21 -0
- data/lib/chronicle/etl/utils/hash_utilities.rb +19 -0
- data/lib/chronicle/etl/utils/progress_bar.rb +3 -1
- data/lib/chronicle/etl/utils/text_recognition.rb +15 -0
- data/lib/chronicle/etl/version.rb +1 -1
- data/lib/chronicle/etl.rb +16 -1
- metadata +139 -36
- data/CHANGELOG.md +0 -23
- data/Gemfile.lock +0 -85
- data/lib/chronicle/etl/catalog.rb +0 -102
- data/lib/chronicle/etl/transformers/json_transformer.rb +0 -11
@@ -3,49 +3,31 @@ require 'pathname'
|
|
3
3
|
module Chronicle
|
4
4
|
module ETL
|
5
5
|
class FileExtractor < Chronicle::ETL::Extractor
|
6
|
-
|
7
|
-
if file?
|
8
|
-
extract_file do |data, metadata|
|
9
|
-
yield(data, metadata)
|
10
|
-
end
|
11
|
-
elsif directory?
|
12
|
-
extract_from_directory do |data, metadata|
|
13
|
-
yield(data, metadata)
|
14
|
-
end
|
15
|
-
end
|
16
|
-
end
|
6
|
+
include Extractors::Helpers::FilesystemReader
|
17
7
|
|
18
|
-
|
19
|
-
|
20
|
-
return 1
|
21
|
-
else
|
22
|
-
search_pattern = File.join(@options[:filename], '**/*.eml')
|
23
|
-
Dir.glob(search_pattern).count
|
24
|
-
end
|
8
|
+
register_connector do |r|
|
9
|
+
r.description = 'file or directory of files'
|
25
10
|
end
|
26
11
|
|
27
|
-
|
28
|
-
|
29
|
-
def extract_from_directory
|
30
|
-
search_pattern = File.join(@options[:filename], '**/*.eml')
|
31
|
-
filenames = Dir.glob(search_pattern)
|
12
|
+
def extract
|
32
13
|
filenames.each do |filename|
|
33
|
-
|
34
|
-
yield(file.read, {filename: file})
|
14
|
+
yield Chronicle::ETL::Extraction.new(data: filename)
|
35
15
|
end
|
36
16
|
end
|
37
17
|
|
38
|
-
def
|
39
|
-
|
40
|
-
yield(file.read, {filename: @options[:filename]})
|
18
|
+
def results_count
|
19
|
+
filenames.count
|
41
20
|
end
|
42
21
|
|
43
|
-
|
44
|
-
Pathname.new(@options[:filename]).directory?
|
45
|
-
end
|
22
|
+
private
|
46
23
|
|
47
|
-
def
|
48
|
-
|
24
|
+
def filenames
|
25
|
+
@filenames ||= filenames_in_directory(
|
26
|
+
path: @options[:filename],
|
27
|
+
dir_glob_pattern: @options[:dir_glob_pattern],
|
28
|
+
load_since: @options[:load_since],
|
29
|
+
load_until: @options[:load_until]
|
30
|
+
)
|
49
31
|
end
|
50
32
|
end
|
51
33
|
end
|
@@ -0,0 +1,104 @@
|
|
1
|
+
require 'pathname'
|
2
|
+
|
3
|
+
module Chronicle
  module ETL
    module Extractors
      module Helpers
        # Mixin with helpers for listing and reading input that comes from a
        # single file, a directory of files, or $stdin.
        module FilesystemReader
          # Lists the paths found under a directory, ordered by modification
          # time (oldest first). Every argument is forwarded to #gather_files.
          #
          # @yield [String] each matching path, when a block is given
          # @return [Array<String>] the matching paths
          def filenames_in_directory(...)
            filenames = gather_files(...)
            filenames.each { |filename| yield filename } if block_given?
            filenames
          end

          # Reads the content of $stdin, a single file, or every file matched
          # inside a directory.
          #
          # @param filename [String, IO] a path, or $stdin itself
          # @param yield_each_line [Boolean] yield line by line when true,
          #   otherwise yield the whole content of each input at once
          # @param dir_glob_pattern [String, nil] glob applied when +filename+
          #   is a directory; nil falls back to '**/*'
          # @yield [String] a line, or the full content of one input
          def read_from_filesystem(filename:, yield_each_line: true, dir_glob_pattern: '**/*')
            open_files(filename: filename, dir_glob_pattern: dir_glob_pattern) do |file|
              if yield_each_line
                file.each_line { |line| yield line }
              else
                yield file.read
              end
            ensure
              # The content has been consumed, so release the handle. $stdin is
              # not ours to close.
              file.close unless file.equal?($stdin)
            end
          end

          # Yields an open IO for $stdin, a single file, or every file matched
          # inside a directory. Handles are intentionally left open: the caller
          # may keep using them after the block returns and is responsible for
          # closing them.
          #
          # @param filename [String, IO] a path, or $stdin itself
          # @param dir_glob_pattern [String, nil] glob applied when +filename+
          #   is a directory; nil falls back to '**/*'
          # @yield [IO] the opened input
          def open_from_filesystem(filename:, dir_glob_pattern: '**/*')
            open_files(filename: filename, dir_glob_pattern: dir_glob_pattern) do |file|
              yield file
            end
          end

          # Placeholder only: this helper cannot count results on its own, so
          # classes that include it are expected to define their own
          # #results_count.
          #
          # @raise [NotImplementedError] always
          def results_count
            raise NotImplementedError
          end

          private

          # Collects paths under +path+ and applies the optional filters.
          #
          # @param path [String] directory to search
          # @param dir_glob_pattern [String, nil] glob appended to "path/**";
          #   nil falls back to '**/*' so callers can pass an unset option through
          # @param load_since [Time, nil] keep entries modified after this time
          # @param load_until [Time, nil] keep entries modified before this time
          # @param smaller_than [Integer, nil] keep entries below this size in bytes
          # @param larger_than [Integer, nil] keep entries above this size in bytes
          # @param sort [Symbol] currently ignored; results are always sorted by mtime
          # @return [Array<String>] matching paths, oldest modification time first
          def gather_files(path:, dir_glob_pattern: '**/*', load_since: nil, load_until: nil,
                           smaller_than: nil, larger_than: nil, sort: :mtime)
            search_pattern = File.join(path, '**', dir_glob_pattern || '**/*')
            files = Dir.glob(search_pattern)

            files = files.select { |f| File.mtime(f) > load_since } if load_since
            files = files.select { |f| File.mtime(f) < load_until } if load_until

            # pass in file sizes in bytes
            files = files.select { |f| File.size(f) < smaller_than } if smaller_than
            files = files.select { |f| File.size(f) > larger_than } if larger_than

            # TODO: incorporate sort argument
            files.sort_by { |f| File.mtime(f) }
          end

          # Yields every path matched by +dir_glob_pattern+ inside +path+.
          #
          # @raise [IOError] when +path+ is not a directory
          def select_files_in_directory(path:, dir_glob_pattern: '**/*')
            raise IOError, "#{path} is not a directory." unless directory?(path)

            search_pattern = File.join(path, dir_glob_pattern || '**/*')
            Dir.glob(search_pattern).each do |filename|
              yield(filename)
            end
          end

          # Yields an IO for each input described by +filename+: $stdin, every
          # regular file matched inside a directory, or a single file. Anything
          # else yields nothing.
          def open_files(filename:, dir_glob_pattern:)
            if stdin?(filename)
              yield $stdin
            elsif directory?(filename)
              search_pattern = File.join(filename, dir_glob_pattern || '**/*')
              Dir.glob(search_pattern).each do |path|
                # The glob also matches sub-directories, which cannot be read.
                next unless File.file?(path)

                yield File.open(path)
              end
            elsif file?(filename)
              yield File.open(filename)
            end
          end

          def stdin?(filename)
            filename == $stdin
          end

          def directory?(filename)
            Pathname.new(filename).directory?
          end

          def file?(filename)
            Pathname.new(filename).file?
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,45 @@
|
|
1
|
+
module Chronicle
  module ETL
    # Extractor that parses JSON read through FilesystemReader and yields one
    # Extraction per successfully parsed document.
    class JsonExtractor < Chronicle::ETL::Extractor
      include Extractors::Helpers::FilesystemReader

      register_connector do |r|
        r.description = 'input as JSON'
      end

      DEFAULT_OPTIONS = {
        filename: $stdin,

        # We're expecting line-separated json objects
        jsonl: true
      }.freeze

      # @param options [Hash] overrides for DEFAULT_OPTIONS
      def initialize(options = {})
        super(DEFAULT_OPTIONS.merge(options))
      end

      # Parses each chunk of input and yields it wrapped in an Extraction.
      # Chunks that are not valid JSON are skipped.
      #
      # @yield [Chronicle::ETL::Extraction]
      def extract
        load_input do |input|
          parsed_data = parse_data(input)
          yield Chronicle::ETL::Extraction.new(data: parsed_data) if parsed_data
        end
      end

      # The number of results is not computed for this extractor.
      #
      # @return [nil]
      def results_count; end

      private

      # @param data [String] raw JSON text
      # @return [Object, nil] the parsed value, or nil when +data+ is not valid JSON
      def parse_data(data)
        JSON.parse(data)
      rescue JSON::ParserError
        # Best effort: an unparseable chunk is dropped rather than aborting the run.
        nil
      end

      # Yields raw input chunks: one per line by default, or the whole content
      # of each input when the :jsonl option is explicitly false.
      def load_input(&block)
        line_delimited = @options[:jsonl] != false
        read_from_filesystem(filename: @options[:filename], yield_each_line: line_delimited, &block)
      end
    end
  end
end
|
@@ -1,9 +1,14 @@
|
|
1
1
|
module Chronicle
|
2
2
|
module ETL
|
3
3
|
class StdinExtractor < Chronicle::ETL::Extractor
|
4
|
+
register_connector do |r|
|
5
|
+
r.description = 'stdin'
|
6
|
+
end
|
7
|
+
|
4
8
|
def extract
|
5
9
|
$stdin.read.each_line do |line|
|
6
|
-
|
10
|
+
data = { line: line.strip }
|
11
|
+
yield Chronicle::ETL::Extraction.new(data: data)
|
7
12
|
end
|
8
13
|
end
|
9
14
|
end
|
@@ -0,0 +1,72 @@
|
|
1
|
+
require 'forwardable'
|
2
|
+
module Chronicle
  module ETL
    # A runnable job built from a job definition. It holds the options for
    # each phase and instantiates the extractor, transformer and loader
    # classes that the definition resolves.
    class Job
      extend Forwardable

      def_delegators :@job_definition, :dry_run?

      attr_accessor :name,
                    :extractor_klass,
                    :extractor_options,
                    :transformer_klass,
                    :transformer_options,
                    :loader_klass,
                    :loader_options

      # TODO: build a proper id system
      alias id name

      # @param job_definition [Chronicle::ETL::JobDefinition] source of the
      #   job's name, per-phase options and connector classes
      # @yield [Job] the new job, when a block is given
      def initialize(job_definition)
        @job_definition = job_definition
        @name = @job_definition.definition[:name]
        @extractor_options = @job_definition.extractor_options
        @transformer_options = @job_definition.transformer_options
        @loader_options = @job_definition.loader_options

        set_continuation if use_continuation?
        yield self if block_given?
      end

      # @return [Object] a new extractor built with this job's extractor options
      def instantiate_extractor
        @extractor_klass = @job_definition.extractor_klass
        @extractor_klass.new(@extractor_options)
      end

      # @param extraction [Object] the extraction handed to the transformer
      # @return [Object] a new transformer for +extraction+
      def instantiate_transformer(extraction)
        @transformer_klass = @job_definition.transformer_klass
        @transformer_klass.new(@transformer_options, extraction)
      end

      # @return [Object] a new loader built with this job's loader options
      def instantiate_loader
        @loader_klass = @job_definition.loader_klass
        @loader_klass.new(@loader_options)
      end

      # @return [Boolean] true when the job has an id (currently its name)
      def save_log?
        # TODO: this needs more nuance
        !id.nil?
      end

      # @return [String] a multi-line, human-readable summary of the job
      def to_s
        output = "Job"
        output += " '#{name}'".bold if name
        output += "\n"
        output += " → Extracting from #{@job_definition.extractor_klass.description}\n"
        output += " → Transforming #{@job_definition.transformer_klass.description}\n"
        output += " → Loading to #{@job_definition.loader_klass.description}\n"
        output
      end

      private

      # Looks up the latest log for this job and passes it to the extractor
      # through its options. The lookup uses #id: there is no @id instance
      # variable, because id is an alias of name.
      def set_continuation
        continuation = Chronicle::ETL::JobLogger.load_latest(id)
        @extractor_options[:continuation] = continuation
      end

      def use_continuation?
        @job_definition.incremental?
      end
    end
  end
end
|
@@ -0,0 +1,89 @@
|
|
1
|
+
require 'active_support/core_ext/hash/deep_merge'
|
2
|
+
|
3
|
+
module Chronicle
  module ETL
    # Describes a job: which connector to use for each phase and with which
    # options. It starts from SKELETON_DEFINITION and is refined through
    # #add_config.
    class JobDefinition
      SKELETON_DEFINITION = {
        incremental: false,
        extractor: {
          name: 'stdin',
          options: {}
        },
        transformer: {
          name: 'null',
          options: {}
        },
        loader: {
          name: 'stdout',
          options: {}
        }
      }.freeze

      attr_accessor :definition

      def initialize
        # Deep copy: the constant is only shallowly frozen, so reusing it
        # directly would share the nested option hashes between definitions
        # (and let them be mutated through any one of them).
        @definition = Marshal.load(Marshal.dump(SKELETON_DEFINITION))
      end

      # Add config hash to this definition
      #
      # @param config [Hash] values deep-merged over the current definition
      # @return [true] the result of #validate
      def add_config(config = {})
        @definition = @definition.deep_merge(config)
        load_credentials
        validate
      end

      # Is this job continuing from a previous run?
      def incremental?
        @definition[:incremental]
      end

      # @return [Object, nil] the :dry_run entry of the definition
      def dry_run?
        @definition[:dry_run]
      end

      def extractor_klass
        load_klass(:extractor, @definition[:extractor][:name])
      end

      def transformer_klass
        load_klass(:transformer, @definition[:transformer][:name])
      end

      def loader_klass
        load_klass(:loader, @definition[:loader][:name])
      end

      def extractor_options
        @definition[:extractor][:options]
      end

      def transformer_options
        @definition[:transformer][:options]
      end

      def loader_options
        @definition[:loader][:options]
      end

      private

      # Resolves the connector class registered for a phase and identifier.
      def load_klass(phase, identifier)
        Chronicle::ETL::Registry.find_by_phase_and_identifier(phase, identifier).klass
      end

      # For every phase whose options name a :credentials entry, loads those
      # credentials and merges them into the phase's options.
      def load_credentials
        Chronicle::ETL::Registry::PHASES.each do |phase|
          credentials_name = @definition[phase].dig(:options, :credentials)
          next unless credentials_name

          credentials = Chronicle::ETL::Config.load_credentials(credentials_name)
          # deep_merge returns a new hash, so the result has to be stored back;
          # otherwise the loaded credentials are silently dropped.
          @definition[phase][:options] = @definition[phase][:options].deep_merge(credentials)
        end
      end

      def validate
        true # TODO
      end
    end
  end
end
|
@@ -0,0 +1,95 @@
|
|
1
|
+
require 'forwardable'
|
2
|
+
|
3
|
+
module Chronicle
  module ETL
    # A record of what happened in the running of a job. We're interested in
    # tracking when it ran, if it was successful, and what the latest record
    # we found is (to use as a cursor for the next time)
    class JobLog
      extend Forwardable

      attr_reader :job
      attr_accessor :job_id,
                    :last_id,
                    :highest_timestamp,
                    :num_records_processed,
                    :started_at,
                    :finished_at,
                    :success

      def_delegators :@job, :save_log?

      # Create a new JobLog from a serialized hash by calling the matching
      # setter for every key. The :id key is ignored, and +attrs+ itself is
      # left untouched.
      #
      # @param attrs [Hash{Symbol => Object}] attribute names and values
      # @return [JobLog]
      def self.build_from_serialized(attrs)
        new do |job_log|
          attrs.each do |key, value|
            next if key == :id

            job_log.public_send("#{key}=", value)
          end
        end
      end

      # Create a new JobLog with no records processed and success unset.
      #
      # @yield [JobLog] the new log, when a block is given
      def initialize
        @num_records_processed = 0
        @success = false
        yield self if block_given?
      end

      # Log the result of a single transformation in a job
      # @param transformer [Chronicle::ETL::Transformer] The transformer that ran
      def log_transformation(transformer)
        transformed_id = transformer.id
        @last_id = transformed_id if transformed_id

        # Save the highest timestamp that we've encountered so far
        timestamp = transformer.timestamp
        @highest_timestamp = [timestamp, @highest_timestamp].compact.max if timestamp

        # TODO: a transformer might yield nil. We might also want certain transformers to explode
        # records into multiple new ones. Therefore, this variable will need more subtle behaviour
        @num_records_processed += 1
      end

      # Indicate that a job has started
      def start
        @started_at = Time.now
      end

      # Indicate that a job has finished
      def finish
        @finished_at = Time.now
        @success = true
      end

      # Indicate that a job stopped because of an error; success is left as is
      def error
        @finished_at = Time.now
      end

      # Attach the job this log belongs to and remember its id
      def job=(job)
        @job = job
        @job_id = job.id
      end

      # @return [Float, nil] seconds between start and finish, or nil when
      #   either timestamp is missing
      def duration
        return unless @started_at && @finished_at

        @finished_at - @started_at
      end

      # Take a JobLog's instance variables and turn them into a hash representation
      def serialize
        {
          job_id: @job_id,
          last_id: @last_id,
          highest_timestamp: @highest_timestamp,
          num_records_processed: @num_records_processed,
          started_at: @started_at,
          finished_at: @finished_at,
          success: @success
        }
      end
    end
  end
end
|
@@ -0,0 +1,81 @@
|
|
1
|
+
require 'sequel'
|
2
|
+
require 'forwardable'
|
3
|
+
|
4
|
+
module Chronicle
  module ETL
    # Saves JobLogs to db and loads previous ones
    class JobLogger
      extend Forwardable

      def_delegators :@job_log, :start, :finish, :error, :log_transformation, :duration, :success
      attr_accessor :job_log

      # For a given `job_id`, return the last successful log
      #
      # @param job_id [String] id of the job whose history is searched
      # @return [JobLog, nil] the most recently finished successful log of
      #   that job, or nil when there is none
      def self.load_latest(job_id)
        with_db_connection do |db|
          # Restrict to this job; otherwise another job's log would be returned.
          attrs = db[:job_logs]
                  .where(job_id: job_id, success: true)
                  .reverse_order(:finished_at)
                  .first
          JobLog.build_from_serialized(attrs) if attrs
        end
      end

      # Opens the sqlite database, creating its directory and the job_logs
      # table when they are missing.
      #
      # @yield [db] the open Sequel connection
      def self.with_db_connection
        initialize_db unless db_exists?
        Sequel.connect("sqlite://#{db_filename}") do |db|
          initialize_schema(db) unless schema_exists?(db)
          yield db
        end
      end

      def self.db_exists?
        File.exist?(db_filename)
      end

      def self.schema_exists?(db)
        db.tables.include?(:job_logs)
      end

      # @return [String] the first path reported by Runcom::Data for the job log db
      def self.db_filename
        data = Runcom::Data.new "chronicle/etl/job_log.db"
        data.all[0].to_s
      end

      # Make sure the directory that holds the db file exists
      def self.initialize_db
        FileUtils.mkdir_p(File.dirname(db_filename))
      end

      # Create the job_logs table
      def self.initialize_schema(db)
        db.create_table :job_logs do
          primary_key :id
          String :job_id, null: false
          String :last_id
          Time :highest_timestamp
          Integer :num_records_processed
          boolean :success, default: false
          Time :started_at
          Time :finished_at
        end
      end

      # Create a new JobLogger
      #
      # @param job [Chronicle::ETL::Job] the job whose run is being logged
      def initialize(job)
        @job_log = JobLog.new do |job_log|
          job_log.job = job
        end
      end

      # Save this JobLogger's JobLog to db
      def save
        return unless @job_log.save_log?

        JobLogger.with_db_connection do |db|
          db[:job_logs].insert(@job_log.serialize)
        end
      end

      # @return [String] the inspect output of the underlying JobLog
      def summarize
        @job_log.inspect
      end
    end
  end
end
|
@@ -3,17 +3,17 @@ require 'csv'
|
|
3
3
|
module Chronicle
|
4
4
|
module ETL
|
5
5
|
class CsvLoader < Chronicle::ETL::Loader
|
6
|
+
register_connector do |r|
|
7
|
+
r.description = 'CSV'
|
8
|
+
end
|
9
|
+
|
6
10
|
def initialize(options={})
|
7
11
|
super(options)
|
8
12
|
@rows = []
|
9
13
|
end
|
10
14
|
|
11
|
-
def load(
|
12
|
-
|
13
|
-
@rows << result.values
|
14
|
-
else
|
15
|
-
@rows << result
|
16
|
-
end
|
15
|
+
def load(record)
|
16
|
+
@rows << record.to_h_flattened.values
|
17
17
|
end
|
18
18
|
|
19
19
|
def finish
|
@@ -2,10 +2,10 @@ module Chronicle
|
|
2
2
|
module ETL
|
3
3
|
# Abstract class representing a Loader for an ETL job
|
4
4
|
class Loader
|
5
|
-
extend Chronicle::ETL::
|
5
|
+
extend Chronicle::ETL::Registry::SelfRegistering
|
6
6
|
|
7
7
|
# Construct a new instance of this loader. Options are passed in from a Runner
|
8
|
-
# ==
|
8
|
+
# == Parameters:
|
9
9
|
# options::
|
10
10
|
# Options for configuring this Loader
|
11
11
|
def initialize(options = {})
|
@@ -5,25 +5,32 @@ require 'json'
|
|
5
5
|
module Chronicle
|
6
6
|
module ETL
|
7
7
|
class RestLoader < Chronicle::ETL::Loader
|
8
|
-
|
8
|
+
register_connector do |r|
|
9
|
+
r.description = 'a REST endpoint'
|
10
|
+
end
|
11
|
+
|
12
|
+
def initialize( options={} )
|
9
13
|
super(options)
|
10
14
|
end
|
11
15
|
|
12
|
-
def load(
|
16
|
+
def load(record)
|
17
|
+
payload = Chronicle::ETL::JSONAPISerializer.serialize(record)
|
18
|
+
# have the outer data key that json-api expects
|
19
|
+
payload = { data: payload } unless payload[:data]
|
20
|
+
|
13
21
|
uri = URI.parse("#{@options[:hostname]}#{@options[:endpoint]}")
|
14
22
|
|
15
23
|
header = {
|
16
24
|
"Authorization" => "Bearer #{@options[:access_token]}",
|
17
25
|
"Content-Type": 'application/json'
|
18
26
|
}
|
27
|
+
use_ssl = uri.scheme == 'https'
|
19
28
|
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
response = http.request(request)
|
29
|
+
Net::HTTP.start(uri.host, uri.port, use_ssl: use_ssl) do |http|
|
30
|
+
request = Net::HTTP::Post.new(uri.request_uri, header)
|
31
|
+
request.body = payload.to_json
|
32
|
+
http.request(request)
|
33
|
+
end
|
27
34
|
end
|
28
35
|
end
|
29
36
|
end
|
@@ -1,9 +1,14 @@
|
|
1
1
|
module Chronicle
|
2
2
|
module ETL
|
3
3
|
class StdoutLoader < Chronicle::ETL::Loader
|
4
|
-
|
5
|
-
|
4
|
+
register_connector do |r|
|
5
|
+
r.description = 'stdout'
|
6
|
+
end
|
7
|
+
|
8
|
+
def load(record)
|
9
|
+
serializer = Chronicle::ETL::JSONAPISerializer.new(record)
|
10
|
+
puts serializer.serializable_hash.to_json
|
6
11
|
end
|
7
12
|
end
|
8
13
|
end
|
9
|
-
end
|
14
|
+
end
|