chronicle-etl 0.3.0 → 0.4.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.github/workflows/ruby.yml +35 -0
- data/.rubocop.yml +28 -1
- data/Guardfile +7 -0
- data/README.md +149 -85
- data/Rakefile +4 -2
- data/chronicle-etl.gemspec +10 -5
- data/exe/chronicle-etl +1 -1
- data/lib/chronicle/etl/cli/connectors.rb +34 -0
- data/lib/chronicle/etl/cli/jobs.rb +44 -12
- data/lib/chronicle/etl/cli/main.rb +13 -19
- data/lib/chronicle/etl/cli/subcommand_base.rb +2 -2
- data/lib/chronicle/etl/cli.rb +7 -0
- data/lib/chronicle/etl/configurable.rb +158 -0
- data/lib/chronicle/etl/exceptions.rb +7 -1
- data/lib/chronicle/etl/extractors/csv_extractor.rb +24 -23
- data/lib/chronicle/etl/extractors/extractor.rb +23 -19
- data/lib/chronicle/etl/extractors/file_extractor.rb +34 -11
- data/lib/chronicle/etl/extractors/helpers/input_reader.rb +76 -0
- data/lib/chronicle/etl/extractors/json_extractor.rb +19 -18
- data/lib/chronicle/etl/job.rb +1 -1
- data/lib/chronicle/etl/job_definition.rb +1 -1
- data/lib/chronicle/etl/loaders/csv_loader.rb +1 -1
- data/lib/chronicle/etl/loaders/json_loader.rb +44 -0
- data/lib/chronicle/etl/loaders/loader.rb +5 -2
- data/lib/chronicle/etl/loaders/rest_loader.rb +5 -5
- data/lib/chronicle/etl/loaders/table_loader.rb +21 -24
- data/lib/chronicle/etl/logger.rb +1 -0
- data/lib/chronicle/etl/models/base.rb +3 -0
- data/lib/chronicle/etl/models/entity.rb +8 -2
- data/lib/chronicle/etl/models/raw.rb +26 -0
- data/lib/chronicle/etl/registry/connector_registration.rb +1 -0
- data/lib/chronicle/etl/runner.rb +6 -4
- data/lib/chronicle/etl/serializers/jsonapi_serializer.rb +6 -0
- data/lib/chronicle/etl/serializers/raw_serializer.rb +10 -0
- data/lib/chronicle/etl/serializers/serializer.rb +2 -1
- data/lib/chronicle/etl/transformers/image_file_transformer.rb +22 -28
- data/lib/chronicle/etl/transformers/null_transformer.rb +1 -1
- data/lib/chronicle/etl/transformers/transformer.rb +3 -2
- data/lib/chronicle/etl/version.rb +1 -1
- data/lib/chronicle/etl.rb +12 -4
- metadata +80 -19
- data/.ruby-version +0 -1
- data/lib/chronicle/etl/extractors/helpers/filesystem_reader.rb +0 -104
- data/lib/chronicle/etl/loaders/stdout_loader.rb +0 -14
- data/lib/chronicle/etl/models/generic.rb +0 -23
@@ -1,17 +1,10 @@
|
|
1
|
-
require 'thor'
|
2
|
-
require 'chronicle/etl'
|
3
1
|
require 'colorize'
|
4
2
|
|
5
|
-
require 'chronicle/etl/cli/subcommand_base'
|
6
|
-
require 'chronicle/etl/cli/connectors'
|
7
|
-
require 'chronicle/etl/cli/jobs'
|
8
|
-
|
9
3
|
module Chronicle
|
10
4
|
module ETL
|
11
5
|
module CLI
|
12
6
|
# Main entrypoint for CLI app
|
13
|
-
class Main < Thor
|
14
|
-
class_option "verbose", type: :boolean, default: false
|
7
|
+
class Main < ::Thor
|
15
8
|
default_task "jobs"
|
16
9
|
|
17
10
|
desc 'connectors:COMMAND', 'Connectors available for ETL jobs', hide: true
|
@@ -22,15 +15,6 @@ module Chronicle
|
|
22
15
|
|
23
16
|
# Entrypoint for the CLI
|
24
17
|
def self.start(given_args = ARGV, config = {})
|
25
|
-
if given_args[0] == "--version"
|
26
|
-
puts "#{Chronicle::ETL::VERSION}"
|
27
|
-
exit
|
28
|
-
end
|
29
|
-
|
30
|
-
if given_args.none?
|
31
|
-
abort "No command entered or job specified. To see commands, run `chronicle-etl help`".red
|
32
|
-
end
|
33
|
-
|
34
18
|
# Takes a subcommand:command argument and splits it so Thor knows how to hand off to the subcommand class
|
35
19
|
if given_args.any? && given_args[0].include?(':')
|
36
20
|
commands = given_args.shift.split(':')
|
@@ -40,10 +24,20 @@ module Chronicle
|
|
40
24
|
super(given_args, config)
|
41
25
|
end
|
42
26
|
|
27
|
+
def self.exit_on_failure?
|
28
|
+
true
|
29
|
+
end
|
30
|
+
|
31
|
+
desc "version", "Show version"
|
32
|
+
map %w(--version -v) => :version
|
33
|
+
def version
|
34
|
+
shell.say "chronicle-etl #{Chronicle::ETL::VERSION}"
|
35
|
+
end
|
36
|
+
|
43
37
|
# Displays help options for chronicle-etl
|
44
38
|
def help(meth = nil, subcommand = false)
|
45
39
|
if meth && !respond_to?(meth)
|
46
|
-
klass, task = Thor::Util.find_class_and_task_by_namespace("#{meth}:#{meth}")
|
40
|
+
klass, task = ::Thor::Util.find_class_and_task_by_namespace("#{meth}:#{meth}")
|
47
41
|
klass.start(['-h', task].compact, shell: shell)
|
48
42
|
else
|
49
43
|
shell.say "ABOUT".bold
|
@@ -64,7 +58,7 @@ module Chronicle
|
|
64
58
|
|
65
59
|
list = []
|
66
60
|
|
67
|
-
Thor::Util.thor_classes_in(Chronicle::ETL::CLI).each do |thor_class|
|
61
|
+
::Thor::Util.thor_classes_in(Chronicle::ETL::CLI).each do |thor_class|
|
68
62
|
list += thor_class.printable_tasks(false)
|
69
63
|
end
|
70
64
|
list.sort! { |a, b| a[0] <=> b[0] }
|
@@ -2,11 +2,11 @@ module Chronicle
|
|
2
2
|
module ETL
|
3
3
|
module CLI
|
4
4
|
# Base class for CLI subcommands. Overrides Thor methods so we can use command:subcommand syntax
|
5
|
-
class SubcommandBase < Thor
|
5
|
+
class SubcommandBase < ::Thor
|
6
6
|
# Print usage instructions for a subcommand
|
7
7
|
def self.help(shell, subcommand = false)
|
8
8
|
list = printable_commands(true, subcommand)
|
9
|
-
Thor::Util.thor_classes_in(self).each do |klass|
|
9
|
+
::Thor::Util.thor_classes_in(self).each do |klass|
|
10
10
|
list += klass.printable_commands(false)
|
11
11
|
end
|
12
12
|
list.sort! { |a, b| a[0] <=> b[0] }
|
@@ -0,0 +1,158 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "ostruct"
|
4
|
+
|
5
|
+
module Chronicle
|
6
|
+
module ETL
|
7
|
+
# A mixin that gives a class
|
8
|
+
# a {Chronicle::ETL::Configurable::ClassMethods#setting} macro to define
|
9
|
+
# settings and their properties (required, type, etc.)
|
10
|
+
#
|
11
|
+
# @example Basic usage
|
12
|
+
# class Test < Chronicle::ETL::Extractor
|
13
|
+
# include Chronicle::ETL::Configurable
|
14
|
+
# setting :when, type: :time, required: true
|
15
|
+
# end
|
16
|
+
#
|
17
|
+
# t = Test.new(when: '2022-02-24')
|
18
|
+
# t.config.when
|
19
|
+
module Configurable
|
20
|
+
# An individual setting for this Configurable
|
21
|
+
Setting = Struct.new(:default, :required, :type)
|
22
|
+
private_constant :Setting
|
23
|
+
|
24
|
+
# Collection of user-supplied options for this Configurable
|
25
|
+
class Config < OpenStruct
|
26
|
+
# Config values that aren't nil, as a hash
|
27
|
+
def compacted_h
|
28
|
+
to_h.compact
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
32
|
+
# @private
|
33
|
+
def self.included(klass)
|
34
|
+
klass.extend(ClassMethods)
|
35
|
+
klass.include(InstanceMethods)
|
36
|
+
klass.prepend(Initializer)
|
37
|
+
end
|
38
|
+
|
39
|
+
# Initializer method for classes that have Configurable mixed in
|
40
|
+
module Initializer
|
41
|
+
# Make sure this class has a default @config ready to use
|
42
|
+
def initialize(*args)
|
43
|
+
@config = initialize_default_config
|
44
|
+
super
|
45
|
+
end
|
46
|
+
end
|
47
|
+
|
48
|
+
# Instance methods for classes that have Configurable mixed in
|
49
|
+
module InstanceMethods
|
50
|
+
attr_reader :config
|
51
|
+
|
52
|
+
# Take given options and apply them to this class's settings
|
53
|
+
# and make them available in @config, validating that they
|
54
|
+
# conform to setting rules
|
55
|
+
def apply_options(options)
|
56
|
+
options.transform_keys!(&:to_sym)
|
57
|
+
|
58
|
+
options.each do |name, value|
|
59
|
+
setting = self.class.all_settings[name]
|
60
|
+
raise(Chronicle::ETL::ConfigurationError, "Unrecognized setting: #{name}") unless setting
|
61
|
+
|
62
|
+
@config[name] = coerced_value(setting, value)
|
63
|
+
end
|
64
|
+
validate_config
|
65
|
+
options
|
66
|
+
end
|
67
|
+
|
68
|
+
# Names of all settings available to this class
|
69
|
+
def self.settings
|
70
|
+
self.class.all_settings.keys
|
71
|
+
end
|
72
|
+
|
73
|
+
private
|
74
|
+
|
75
|
+
def initialize_default_config
|
76
|
+
self.class.config_with_defaults
|
77
|
+
end
|
78
|
+
|
79
|
+
def validate_config
|
80
|
+
missing = (self.class.all_required_settings.keys - @config.compacted_h.keys)
|
81
|
+
raise Chronicle::ETL::ConfigurationError, "Missing options: #{missing}" if missing.count.positive?
|
82
|
+
end
|
83
|
+
|
84
|
+
def coerced_value(setting, value)
|
85
|
+
setting.type ? __send__("coerce_#{setting.type}", value) : value
|
86
|
+
end
|
87
|
+
|
88
|
+
def coerce_string(value)
|
89
|
+
value.to_s
|
90
|
+
end
|
91
|
+
|
92
|
+
def coerce_boolean(value)
|
93
|
+
if value.is_a?(String)
|
94
|
+
value.downcase == "true"
|
95
|
+
else
|
96
|
+
value
|
97
|
+
end
|
98
|
+
end
|
99
|
+
|
100
|
+
def coerce_time(value)
|
101
|
+
# TODO: handle durations like '3h'
|
102
|
+
if value.is_a?(String)
|
103
|
+
Time.parse(value)
|
104
|
+
else
|
105
|
+
value
|
106
|
+
end
|
107
|
+
end
|
108
|
+
end
|
109
|
+
|
110
|
+
# Class methods for classes that have Configurable mixed in
|
111
|
+
module ClassMethods
|
112
|
+
# Macro for creating a setting on a class {::Chronicle::ETL::Configurable}
|
113
|
+
#
|
114
|
+
# @param [String] name Name of the setting
|
115
|
+
# @param [Boolean] required whether setting is required
|
116
|
+
# @param [Object] default Default value
|
117
|
+
# @param [Symbol] type Type
|
118
|
+
#
|
119
|
+
# @example Basic usage
|
120
|
+
# setting :when, type: :time, required: true
|
121
|
+
#
|
122
|
+
# @see ::Chronicle::ETL::Configurable
|
123
|
+
def setting(name, default: nil, required: false, type: nil)
|
124
|
+
s = Setting.new(default, required, type)
|
125
|
+
settings[name] = s
|
126
|
+
end
|
127
|
+
|
128
|
+
# Collect all settings defined on this class and its ancestors (that
|
129
|
+
# have Configurable mixin included)
|
130
|
+
def all_settings
|
131
|
+
if superclass.include?(Chronicle::ETL::Configurable)
|
132
|
+
superclass.all_settings.merge(settings)
|
133
|
+
else
|
134
|
+
settings
|
135
|
+
end
|
136
|
+
end
|
137
|
+
|
138
|
+
# Filters settings to those that are required.
|
139
|
+
def all_required_settings
|
140
|
+
all_settings.select { |_name, setting| setting.required } || {}
|
141
|
+
end
|
142
|
+
|
143
|
+
def settings
|
144
|
+
@settings ||= {}
|
145
|
+
end
|
146
|
+
|
147
|
+
def setting_exists?(name)
|
148
|
+
all_settings.keys.include? name
|
149
|
+
end
|
150
|
+
|
151
|
+
def config_with_defaults
|
152
|
+
s = all_settings.transform_values(&:default)
|
153
|
+
Config.new(s)
|
154
|
+
end
|
155
|
+
end
|
156
|
+
end
|
157
|
+
end
|
158
|
+
end
|
@@ -1,6 +1,8 @@
|
|
1
1
|
module Chronicle
|
2
2
|
module ETL
|
3
|
-
class Error < StandardError; end
|
3
|
+
class Error < StandardError; end
|
4
|
+
|
5
|
+
class ConfigurationError < Error; end
|
4
6
|
|
5
7
|
class RunnerTypeError < Error; end
|
6
8
|
|
@@ -16,6 +18,10 @@ module Chronicle
|
|
16
18
|
class ProviderNotAvailableError < ConnectorNotAvailableError; end
|
17
19
|
class ProviderConnectorNotAvailableError < ConnectorNotAvailableError; end
|
18
20
|
|
21
|
+
class ExtractionError < Error; end
|
22
|
+
|
23
|
+
class SerializationError < Error; end
|
24
|
+
|
19
25
|
class TransformationError < Error
|
20
26
|
attr_reader :transformation
|
21
27
|
|
@@ -2,46 +2,47 @@ require 'csv'
|
|
2
2
|
|
3
3
|
module Chronicle
|
4
4
|
module ETL
|
5
|
-
class
|
6
|
-
include Extractors::Helpers::
|
5
|
+
class CSVExtractor < Chronicle::ETL::Extractor
|
6
|
+
include Extractors::Helpers::InputReader
|
7
7
|
|
8
8
|
register_connector do |r|
|
9
|
-
r.description = '
|
9
|
+
r.description = 'CSV'
|
10
10
|
end
|
11
11
|
|
12
|
-
|
13
|
-
headers: true,
|
14
|
-
filename: $stdin
|
15
|
-
}.freeze
|
12
|
+
setting :headers, default: true
|
16
13
|
|
17
|
-
def
|
18
|
-
|
14
|
+
def prepare
|
15
|
+
@csvs = prepare_sources
|
19
16
|
end
|
20
17
|
|
21
18
|
def extract
|
22
|
-
|
23
|
-
|
24
|
-
|
19
|
+
@csvs.each do |csv|
|
20
|
+
csv.read.each do |row|
|
21
|
+
yield Chronicle::ETL::Extraction.new(data: row.to_h)
|
22
|
+
end
|
25
23
|
end
|
26
24
|
end
|
27
25
|
|
28
26
|
def results_count
|
29
|
-
|
27
|
+
@csvs.reduce(0) do |total_rows, csv|
|
28
|
+
row_count = csv.readlines.size
|
29
|
+
csv.rewind
|
30
|
+
total_rows + row_count
|
31
|
+
end
|
30
32
|
end
|
31
33
|
|
32
34
|
private
|
33
35
|
|
34
|
-
def
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
open_from_filesystem(filename: @options[:filename]) do |file|
|
43
|
-
return CSV.new(file, **csv_options)
|
36
|
+
def prepare_sources
|
37
|
+
@csvs = []
|
38
|
+
read_input do |csv_data|
|
39
|
+
csv_options = {
|
40
|
+
headers: @config.headers.is_a?(String) ? @config.headers.split(',') : @config.headers,
|
41
|
+
converters: :all
|
42
|
+
}
|
43
|
+
@csvs << CSV.new(csv_data, **csv_options)
|
44
44
|
end
|
45
|
+
@csvs
|
45
46
|
end
|
46
47
|
end
|
47
48
|
end
|
@@ -5,44 +5,48 @@ module Chronicle
|
|
5
5
|
# Abstract class representing an Extractor for an ETL job
|
6
6
|
class Extractor
|
7
7
|
extend Chronicle::ETL::Registry::SelfRegistering
|
8
|
+
include Chronicle::ETL::Configurable
|
9
|
+
|
10
|
+
setting :since, type: :time
|
11
|
+
setting :until, type: :time
|
12
|
+
setting :limit
|
13
|
+
setting :load_after_id
|
14
|
+
setting :input
|
8
15
|
|
9
16
|
# Construct a new instance of this extractor. Options are passed in from a Runner
|
10
|
-
# ==
|
17
|
+
# == Parameters:
|
11
18
|
# options::
|
12
19
|
# Options for configuring this Extractor
|
13
20
|
def initialize(options = {})
|
14
|
-
|
15
|
-
sanitize_options
|
16
|
-
handle_continuation
|
21
|
+
apply_options(options)
|
17
22
|
end
|
18
23
|
|
19
|
-
#
|
20
|
-
def
|
21
|
-
raise NotImplementedError
|
22
|
-
end
|
24
|
+
# Hook called before #extract. Useful for gathering data, initializing proxies, etc.
|
25
|
+
def prepare; end
|
23
26
|
|
24
27
|
# An optional method to calculate how many records there are to extract. Used primarily for
|
25
28
|
# building the progress bar
|
26
29
|
def results_count; end
|
27
30
|
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
@options[:load_since] = Time.parse(@options[:load_since]) if @options[:load_since] && @options[:load_since].is_a?(String)
|
32
|
-
@options[:load_until] = Time.parse(@options[:load_until]) if @options[:load_until] && @options[:load_until].is_a?(String)
|
31
|
+
# Entrypoint for this Extractor. Called by a Runner. Expects a series of records to be yielded
|
32
|
+
def extract
|
33
|
+
raise NotImplementedError
|
33
34
|
end
|
34
35
|
|
35
|
-
|
36
|
-
return unless @options[:continuation]
|
36
|
+
private
|
37
37
|
|
38
|
-
|
39
|
-
|
40
|
-
|
38
|
+
# TODO: reimplement this
|
39
|
+
# def handle_continuation
|
40
|
+
# return unless @config.continuation
|
41
|
+
|
42
|
+
# @config.since = @config.continuation.highest_timestamp if @config.continuation.highest_timestamp
|
43
|
+
# @config.load_after_id = @config.continuation.last_id if @config.continuation.last_id
|
44
|
+
# end
|
41
45
|
end
|
42
46
|
end
|
43
47
|
end
|
44
48
|
|
45
|
-
require_relative 'helpers/
|
49
|
+
require_relative 'helpers/input_reader'
|
46
50
|
require_relative 'csv_extractor'
|
47
51
|
require_relative 'file_extractor'
|
48
52
|
require_relative 'json_extractor'
|
@@ -2,32 +2,55 @@ require 'pathname'
|
|
2
2
|
|
3
3
|
module Chronicle
|
4
4
|
module ETL
|
5
|
+
# Return filenames that match a pattern in a directory
|
5
6
|
class FileExtractor < Chronicle::ETL::Extractor
|
6
|
-
include Extractors::Helpers::FilesystemReader
|
7
7
|
|
8
8
|
register_connector do |r|
|
9
9
|
r.description = 'file or directory of files'
|
10
10
|
end
|
11
11
|
|
12
|
+
setting :input, default: ['.']
|
13
|
+
setting :dir_glob_pattern, default: "**/*"
|
14
|
+
setting :larger_than
|
15
|
+
setting :smaller_than
|
16
|
+
|
17
|
+
def prepare
|
18
|
+
@pathnames = gather_files
|
19
|
+
end
|
20
|
+
|
12
21
|
def extract
|
13
|
-
|
14
|
-
yield Chronicle::ETL::Extraction.new(data:
|
22
|
+
@pathnames.each do |pathname|
|
23
|
+
yield Chronicle::ETL::Extraction.new(data: pathname.to_path)
|
15
24
|
end
|
16
25
|
end
|
17
26
|
|
18
27
|
def results_count
|
19
|
-
|
28
|
+
@pathnames.count
|
20
29
|
end
|
21
30
|
|
22
31
|
private
|
23
32
|
|
24
|
-
def
|
25
|
-
@
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
33
|
+
def gather_files
|
34
|
+
roots = [@config.input].flatten.map { |filename| Pathname.new(filename) }
|
35
|
+
raise(ExtractionError, "Input must exist") unless roots.all?(&:exist?)
|
36
|
+
|
37
|
+
directories, files = roots.partition(&:directory?)
|
38
|
+
|
39
|
+
directories.each do |directory|
|
40
|
+
files += Dir.glob(File.join(directory, @config.dir_glob_pattern)).map { |filename| Pathname.new(filename) }
|
41
|
+
end
|
42
|
+
|
43
|
+
files = files.uniq
|
44
|
+
|
45
|
+
files = files.keep_if { |f| (f.mtime > @config.since) } if @config.since
|
46
|
+
files = files.keep_if { |f| (f.mtime < @config.until) } if @config.until
|
47
|
+
|
48
|
+
# pass in file sizes in bytes
|
49
|
+
files = files.keep_if { |f| (f.size < @config.smaller_than) } if @config.smaller_than
|
50
|
+
files = files.keep_if { |f| (f.size > @config.larger_than) } if @config.larger_than
|
51
|
+
|
52
|
+
# # TODO: incorporate sort argument
|
53
|
+
files.sort_by(&:mtime)
|
31
54
|
end
|
32
55
|
end
|
33
56
|
end
|
@@ -0,0 +1,76 @@
|
|
1
|
+
require 'pathname'
|
2
|
+
|
3
|
+
module Chronicle
|
4
|
+
module ETL
|
5
|
+
module Extractors
|
6
|
+
module Helpers
|
7
|
+
module InputReader
|
8
|
+
# Return an array of input filenames; converts a single string
|
9
|
+
# to an array if necessary
|
10
|
+
def filenames
|
11
|
+
[@config.input].flatten.map
|
12
|
+
end
|
13
|
+
|
14
|
+
# Filenames as an array of pathnames
|
15
|
+
def pathnames
|
16
|
+
filenames.map { |filename| Pathname.new(filename) }
|
17
|
+
end
|
18
|
+
|
19
|
+
# Whether we're reading from files
|
20
|
+
def read_from_files?
|
21
|
+
filenames.any?
|
22
|
+
end
|
23
|
+
|
24
|
+
# Whether we're reading input from stdin
|
25
|
+
def read_from_stdin?
|
26
|
+
!read_from_files? && $stdin.stat.pipe?
|
27
|
+
end
|
28
|
+
|
29
|
+
# Read input sources and yield each content
|
30
|
+
def read_input
|
31
|
+
if read_from_files?
|
32
|
+
pathnames.each do |pathname|
|
33
|
+
File.open(pathname) do |file|
|
34
|
+
yield file.read, pathname.to_path
|
35
|
+
end
|
36
|
+
end
|
37
|
+
elsif read_from_stdin?
|
38
|
+
yield $stdin.read, $stdin
|
39
|
+
else
|
40
|
+
raise ExtractionError, "No input files or stdin provided"
|
41
|
+
end
|
42
|
+
end
|
43
|
+
|
44
|
+
# Read input sources line by line
|
45
|
+
def read_input_as_lines(&block)
|
46
|
+
if read_from_files?
|
47
|
+
lines_from_files(&block)
|
48
|
+
elsif read_from_stdin?
|
49
|
+
lines_from_stdin(&block)
|
50
|
+
else
|
51
|
+
raise ExtractionError, "No input files or stdin provided"
|
52
|
+
end
|
53
|
+
end
|
54
|
+
|
55
|
+
private
|
56
|
+
|
57
|
+
def lines_from_files(&block)
|
58
|
+
pathnames.each do |pathname|
|
59
|
+
File.open(pathname) do |file|
|
60
|
+
lines_from_io(file, &block)
|
61
|
+
end
|
62
|
+
end
|
63
|
+
end
|
64
|
+
|
65
|
+
def lines_from_stdin(&block)
|
66
|
+
lines_from_io($stdin, &block)
|
67
|
+
end
|
68
|
+
|
69
|
+
def lines_from_io(io, &block)
|
70
|
+
io.each_line(&block)
|
71
|
+
end
|
72
|
+
end
|
73
|
+
end
|
74
|
+
end
|
75
|
+
end
|
76
|
+
end
|
@@ -1,43 +1,44 @@
|
|
1
1
|
module Chronicle
|
2
2
|
module ETL
|
3
|
-
class
|
4
|
-
include Extractors::Helpers::
|
3
|
+
class JSONExtractor < Chronicle::ETL::Extractor
|
4
|
+
include Extractors::Helpers::InputReader
|
5
5
|
|
6
6
|
register_connector do |r|
|
7
|
-
r.description = '
|
7
|
+
r.description = 'JSON'
|
8
8
|
end
|
9
9
|
|
10
|
-
|
11
|
-
filename: $stdin,
|
10
|
+
setting :jsonl, default: true, type: :boolean
|
12
11
|
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
super(DEFAULT_OPTIONS.merge(options))
|
12
|
+
def prepare
|
13
|
+
@jsons = []
|
14
|
+
load_input do |input|
|
15
|
+
@jsons << parse_data(input)
|
16
|
+
end
|
19
17
|
end
|
20
18
|
|
21
19
|
def extract
|
22
|
-
|
23
|
-
|
24
|
-
yield Chronicle::ETL::Extraction.new(data: parsed_data) if parsed_data
|
20
|
+
@jsons.each do |json|
|
21
|
+
yield Chronicle::ETL::Extraction.new(data: json)
|
25
22
|
end
|
26
23
|
end
|
27
24
|
|
28
25
|
def results_count
|
26
|
+
@jsons.count
|
29
27
|
end
|
30
28
|
|
31
29
|
private
|
32
30
|
|
33
31
|
def parse_data data
|
34
32
|
JSON.parse(data)
|
35
|
-
rescue JSON::ParserError
|
33
|
+
rescue JSON::ParserError
|
34
|
+
raise Chronicle::ETL::ExtractionError, "Could not parse JSON"
|
36
35
|
end
|
37
36
|
|
38
|
-
def load_input
|
39
|
-
|
40
|
-
|
37
|
+
def load_input(&block)
|
38
|
+
if @config.jsonl
|
39
|
+
read_input_as_lines(&block)
|
40
|
+
else
|
41
|
+
read_input(&block)
|
41
42
|
end
|
42
43
|
end
|
43
44
|
end
|
data/lib/chronicle/etl/job.rb
CHANGED
@@ -35,7 +35,7 @@ module Chronicle
|
|
35
35
|
|
36
36
|
def instantiate_transformer(extraction)
|
37
37
|
@transformer_klass = @job_definition.transformer_klass
|
38
|
-
@transformer_klass.new(@transformer_options
|
38
|
+
@transformer_klass.new(extraction, @transformer_options)
|
39
39
|
end
|
40
40
|
|
41
41
|
def instantiate_loader
|