chronicle-etl 0.3.0 → 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/ruby.yml +35 -0
- data/.rubocop.yml +28 -1
- data/Guardfile +7 -0
- data/README.md +149 -85
- data/Rakefile +4 -2
- data/chronicle-etl.gemspec +10 -5
- data/exe/chronicle-etl +1 -1
- data/lib/chronicle/etl/cli/connectors.rb +34 -0
- data/lib/chronicle/etl/cli/jobs.rb +44 -12
- data/lib/chronicle/etl/cli/main.rb +13 -19
- data/lib/chronicle/etl/cli/subcommand_base.rb +2 -2
- data/lib/chronicle/etl/cli.rb +7 -0
- data/lib/chronicle/etl/configurable.rb +158 -0
- data/lib/chronicle/etl/exceptions.rb +7 -1
- data/lib/chronicle/etl/extractors/csv_extractor.rb +24 -23
- data/lib/chronicle/etl/extractors/extractor.rb +23 -19
- data/lib/chronicle/etl/extractors/file_extractor.rb +34 -11
- data/lib/chronicle/etl/extractors/helpers/input_reader.rb +76 -0
- data/lib/chronicle/etl/extractors/json_extractor.rb +19 -18
- data/lib/chronicle/etl/job.rb +1 -1
- data/lib/chronicle/etl/job_definition.rb +1 -1
- data/lib/chronicle/etl/loaders/csv_loader.rb +1 -1
- data/lib/chronicle/etl/loaders/json_loader.rb +44 -0
- data/lib/chronicle/etl/loaders/loader.rb +5 -2
- data/lib/chronicle/etl/loaders/rest_loader.rb +5 -5
- data/lib/chronicle/etl/loaders/table_loader.rb +21 -24
- data/lib/chronicle/etl/logger.rb +1 -0
- data/lib/chronicle/etl/models/base.rb +3 -0
- data/lib/chronicle/etl/models/entity.rb +8 -2
- data/lib/chronicle/etl/models/raw.rb +26 -0
- data/lib/chronicle/etl/registry/connector_registration.rb +1 -0
- data/lib/chronicle/etl/runner.rb +6 -4
- data/lib/chronicle/etl/serializers/jsonapi_serializer.rb +6 -0
- data/lib/chronicle/etl/serializers/raw_serializer.rb +10 -0
- data/lib/chronicle/etl/serializers/serializer.rb +2 -1
- data/lib/chronicle/etl/transformers/image_file_transformer.rb +22 -28
- data/lib/chronicle/etl/transformers/null_transformer.rb +1 -1
- data/lib/chronicle/etl/transformers/transformer.rb +3 -2
- data/lib/chronicle/etl/version.rb +1 -1
- data/lib/chronicle/etl.rb +12 -4
- metadata +80 -19
- data/.ruby-version +0 -1
- data/lib/chronicle/etl/extractors/helpers/filesystem_reader.rb +0 -104
- data/lib/chronicle/etl/loaders/stdout_loader.rb +0 -14
- data/lib/chronicle/etl/models/generic.rb +0 -23
@@ -1,17 +1,10 @@
|
|
1
|
-
require 'thor'
|
2
|
-
require 'chronicle/etl'
|
3
1
|
require 'colorize'
|
4
2
|
|
5
|
-
require 'chronicle/etl/cli/subcommand_base'
|
6
|
-
require 'chronicle/etl/cli/connectors'
|
7
|
-
require 'chronicle/etl/cli/jobs'
|
8
|
-
|
9
3
|
module Chronicle
|
10
4
|
module ETL
|
11
5
|
module CLI
|
12
6
|
# Main entrypoint for CLI app
|
13
|
-
class Main < Thor
|
14
|
-
class_option "verbose", type: :boolean, default: false
|
7
|
+
class Main < ::Thor
|
15
8
|
default_task "jobs"
|
16
9
|
|
17
10
|
desc 'connectors:COMMAND', 'Connectors available for ETL jobs', hide: true
|
@@ -22,15 +15,6 @@ module Chronicle
|
|
22
15
|
|
23
16
|
# Entrypoint for the CLI
|
24
17
|
def self.start(given_args = ARGV, config = {})
|
25
|
-
if given_args[0] == "--version"
|
26
|
-
puts "#{Chronicle::ETL::VERSION}"
|
27
|
-
exit
|
28
|
-
end
|
29
|
-
|
30
|
-
if given_args.none?
|
31
|
-
abort "No command entered or job specified. To see commands, run `chronicle-etl help`".red
|
32
|
-
end
|
33
|
-
|
34
18
|
# Takes a subcommand:command argument and splits it so Thor knows how to hand off to the subcommand class
|
35
19
|
if given_args.any? && given_args[0].include?(':')
|
36
20
|
commands = given_args.shift.split(':')
|
@@ -40,10 +24,20 @@ module Chronicle
|
|
40
24
|
super(given_args, config)
|
41
25
|
end
|
42
26
|
|
27
|
+
def self.exit_on_failure?
|
28
|
+
true
|
29
|
+
end
|
30
|
+
|
31
|
+
desc "version", "Show version"
|
32
|
+
map %w(--version -v) => :version
|
33
|
+
def version
|
34
|
+
shell.say "chronicle-etl #{Chronicle::ETL::VERSION}"
|
35
|
+
end
|
36
|
+
|
43
37
|
# Displays help options for chronicle-etl
|
44
38
|
def help(meth = nil, subcommand = false)
|
45
39
|
if meth && !respond_to?(meth)
|
46
|
-
klass, task = Thor::Util.find_class_and_task_by_namespace("#{meth}:#{meth}")
|
40
|
+
klass, task = ::Thor::Util.find_class_and_task_by_namespace("#{meth}:#{meth}")
|
47
41
|
klass.start(['-h', task].compact, shell: shell)
|
48
42
|
else
|
49
43
|
shell.say "ABOUT".bold
|
@@ -64,7 +58,7 @@ module Chronicle
|
|
64
58
|
|
65
59
|
list = []
|
66
60
|
|
67
|
-
Thor::Util.thor_classes_in(Chronicle::ETL::CLI).each do |thor_class|
|
61
|
+
::Thor::Util.thor_classes_in(Chronicle::ETL::CLI).each do |thor_class|
|
68
62
|
list += thor_class.printable_tasks(false)
|
69
63
|
end
|
70
64
|
list.sort! { |a, b| a[0] <=> b[0] }
|
@@ -2,11 +2,11 @@ module Chronicle
|
|
2
2
|
module ETL
|
3
3
|
module CLI
|
4
4
|
# Base class for CLI subcommands. Overrides Thor methods so we can use command:subcommand syntax
|
5
|
-
class SubcommandBase < Thor
|
5
|
+
class SubcommandBase < ::Thor
|
6
6
|
# Print usage instructions for a subcommand
|
7
7
|
def self.help(shell, subcommand = false)
|
8
8
|
list = printable_commands(true, subcommand)
|
9
|
-
Thor::Util.thor_classes_in(self).each do |klass|
|
9
|
+
::Thor::Util.thor_classes_in(self).each do |klass|
|
10
10
|
list += klass.printable_commands(false)
|
11
11
|
end
|
12
12
|
list.sort! { |a, b| a[0] <=> b[0] }
|
@@ -0,0 +1,158 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "ostruct"
require "time"
|
4
|
+
|
5
|
+
module Chronicle
  module ETL
    # A mixin that gives a class a
    # {Chronicle::ETL::Configurable::ClassMethods#setting} macro for declaring
    # settings and their properties (default, required, type).
    #
    # Supported setting types are the ones that have a matching private
    # +coerce_<type>+ method on the instance: +:string+, +:boolean+ and +:time+.
    #
    # @example Basic usage
    #   class Test < Chronicle::ETL::Extractor
    #     include Chronicle::ETL::Configurable
    #     setting :when, type: :time, required: true
    #   end
    #
    #   t = Test.new(when: '2022-02-24')
    #   t.config.when
    module Configurable
      # An individual setting for this Configurable
      Setting = Struct.new(:default, :required, :type)
      private_constant :Setting

      # Collection of user-supplied options for this Configurable
      class Config < OpenStruct
        # Config values that aren't nil, as a hash
        #
        # @return [Hash{Symbol => Object}]
        def compacted_h
          to_h.compact
        end
      end

      # @private
      def self.included(klass)
        klass.extend(ClassMethods)
        klass.include(InstanceMethods)
        # Prepended so that @config exists before the class's own #initialize runs
        klass.prepend(Initializer)
      end

      # Initializer method for classes that have Configurable mixed in
      module Initializer
        # Make sure this class has a default @config ready to use, then
        # continue with the class's own #initialize (same arguments).
        def initialize(*args)
          @config = initialize_default_config
          super
        end
      end

      # Instance methods for classes that have Configurable mixed in
      module InstanceMethods
        attr_reader :config

        # Take given options, apply them to this class's settings (coercing
        # values according to each setting's type), store them in @config and
        # validate that all required settings are present.
        #
        # The given hash is not modified; keys are symbolized on a copy.
        #
        # @param options [Hash] option names (String or Symbol) to values
        # @return [Hash{Symbol => Object}] the options with symbolized keys
        # @raise [Chronicle::ETL::ConfigurationError] if an option is not a
        #   known setting, has an unsupported type, or a required setting is
        #   missing
        def apply_options(options)
          options = options.transform_keys(&:to_sym)

          options.each do |name, value|
            setting = self.class.all_settings[name]
            raise(Chronicle::ETL::ConfigurationError, "Unrecognized setting: #{name}") unless setting

            @config[name] = coerced_value(setting, value)
          end
          validate_config
          options
        end

        # Names of all settings available to this object's class
        #
        # @return [Array<Symbol>]
        def settings
          self.class.all_settings.keys
        end

        private

        def initialize_default_config
          self.class.config_with_defaults
        end

        # Raise if any required setting has no (non-nil) value in @config
        def validate_config
          missing = (self.class.all_required_settings.keys - @config.compacted_h.keys)
          raise Chronicle::ETL::ConfigurationError, "Missing options: #{missing}" unless missing.empty?
        end

        # Convert +value+ using the private +coerce_<type>+ method that matches
        # the setting's type. Untyped settings are passed through unchanged.
        def coerced_value(setting, value)
          return value unless setting.type

          coercer = "coerce_#{setting.type}"
          unless respond_to?(coercer, true)
            raise Chronicle::ETL::ConfigurationError, "Unsupported setting type: #{setting.type}"
          end

          __send__(coercer, value)
        end

        def coerce_string(value)
          value.to_s
        end

        # Strings are true only when they read "true" (case-insensitive);
        # non-string values are returned as given.
        def coerce_boolean(value)
          if value.is_a?(String)
            value.downcase == "true"
          else
            value
          end
        end

        # Strings are parsed with Time.parse; other values are returned as given.
        def coerce_time(value)
          # TODO: handle durations like '3h'
          if value.is_a?(String)
            Time.parse(value)
          else
            value
          end
        end
      end

      # Class methods for classes that have Configurable mixed in
      module ClassMethods
        # Macro for creating a setting on a class {::Chronicle::ETL::Configurable}
        #
        # @param [Symbol] name Name of the setting
        # @param [Boolean] required whether setting is required
        # @param [Object] default Default value
        # @param [Symbol] type Type (:string, :boolean or :time)
        #
        # @example Basic usage
        #   setting :when, type: :time, required: true
        #
        # @see ::Chronicle::ETL::Configurable
        def setting(name, default: nil, required: false, type: nil)
          s = Setting.new(default, required, type)
          settings[name] = s
        end

        # Collect all settings defined on this class and its ancestors (that
        # have Configurable mixin included). Settings on this class override
        # same-named settings from ancestors.
        def all_settings
          if superclass.include?(Chronicle::ETL::Configurable)
            superclass.all_settings.merge(settings)
          else
            settings
          end
        end

        # Filters settings to those that are required.
        def all_required_settings
          all_settings.select { |_name, setting| setting.required }
        end

        # Settings declared directly on this class (not inherited ones)
        def settings
          @settings ||= {}
        end

        # Whether a setting with this name exists on this class or an ancestor
        def setting_exists?(name)
          all_settings.key?(name)
        end

        # Build a fresh Config pre-populated with every setting's default
        def config_with_defaults
          s = all_settings.transform_values(&:default)
          Config.new(s)
        end
      end
    end
  end
end
|
@@ -1,6 +1,8 @@
|
|
1
1
|
module Chronicle
|
2
2
|
module ETL
|
3
|
-
class Error < StandardError; end
|
3
|
+
class Error < StandardError; end
|
4
|
+
|
5
|
+
class ConfigurationError < Error; end
|
4
6
|
|
5
7
|
class RunnerTypeError < Error; end
|
6
8
|
|
@@ -16,6 +18,10 @@ module Chronicle
|
|
16
18
|
class ProviderNotAvailableError < ConnectorNotAvailableError; end
|
17
19
|
class ProviderConnectorNotAvailableError < ConnectorNotAvailableError; end
|
18
20
|
|
21
|
+
class ExtractionError < Error; end
|
22
|
+
|
23
|
+
class SerializationError < Error; end
|
24
|
+
|
19
25
|
class TransformationError < Error
|
20
26
|
attr_reader :transformation
|
21
27
|
|
@@ -2,46 +2,47 @@ require 'csv'
|
|
2
2
|
|
3
3
|
module Chronicle
|
4
4
|
module ETL
|
5
|
-
class
|
6
|
-
include Extractors::Helpers::
|
5
|
+
class CSVExtractor < Chronicle::ETL::Extractor
|
6
|
+
include Extractors::Helpers::InputReader
|
7
7
|
|
8
8
|
register_connector do |r|
|
9
|
-
r.description = '
|
9
|
+
r.description = 'CSV'
|
10
10
|
end
|
11
11
|
|
12
|
-
|
13
|
-
headers: true,
|
14
|
-
filename: $stdin
|
15
|
-
}.freeze
|
12
|
+
setting :headers, default: true
|
16
13
|
|
17
|
-
def
|
18
|
-
|
14
|
+
def prepare
|
15
|
+
@csvs = prepare_sources
|
19
16
|
end
|
20
17
|
|
21
18
|
def extract
|
22
|
-
|
23
|
-
|
24
|
-
|
19
|
+
@csvs.each do |csv|
|
20
|
+
csv.read.each do |row|
|
21
|
+
yield Chronicle::ETL::Extraction.new(data: row.to_h)
|
22
|
+
end
|
25
23
|
end
|
26
24
|
end
|
27
25
|
|
28
26
|
def results_count
|
29
|
-
|
27
|
+
@csvs.reduce(0) do |total_rows, csv|
|
28
|
+
row_count = csv.readlines.size
|
29
|
+
csv.rewind
|
30
|
+
total_rows + row_count
|
31
|
+
end
|
30
32
|
end
|
31
33
|
|
32
34
|
private
|
33
35
|
|
34
|
-
def
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
open_from_filesystem(filename: @options[:filename]) do |file|
|
43
|
-
return CSV.new(file, **csv_options)
|
36
|
+
def prepare_sources
|
37
|
+
@csvs = []
|
38
|
+
read_input do |csv_data|
|
39
|
+
csv_options = {
|
40
|
+
headers: @config.headers.is_a?(String) ? @config.headers.split(',') : @config.headers,
|
41
|
+
converters: :all
|
42
|
+
}
|
43
|
+
@csvs << CSV.new(csv_data, **csv_options)
|
44
44
|
end
|
45
|
+
@csvs
|
45
46
|
end
|
46
47
|
end
|
47
48
|
end
|
@@ -5,44 +5,48 @@ module Chronicle
|
|
5
5
|
# Abstract class representing an Extractor for an ETL job
|
6
6
|
class Extractor
|
7
7
|
extend Chronicle::ETL::Registry::SelfRegistering
|
8
|
+
include Chronicle::ETL::Configurable
|
9
|
+
|
10
|
+
setting :since, type: :time
|
11
|
+
setting :until, type: :time
|
12
|
+
setting :limit
|
13
|
+
setting :load_after_id
|
14
|
+
setting :input
|
8
15
|
|
9
16
|
# Construct a new instance of this extractor. Options are passed in from a Runner
|
10
|
-
# ==
|
17
|
+
# == Parameters:
|
11
18
|
# options::
|
12
19
|
# Options for configuring this Extractor
|
13
20
|
def initialize(options = {})
|
14
|
-
|
15
|
-
sanitize_options
|
16
|
-
handle_continuation
|
21
|
+
apply_options(options)
|
17
22
|
end
|
18
23
|
|
19
|
-
#
|
20
|
-
def
|
21
|
-
raise NotImplementedError
|
22
|
-
end
|
24
|
+
# Hook called before #extract. Useful for gathering data, initializing proxies, etc.
|
25
|
+
def prepare; end
|
23
26
|
|
24
27
|
# An optional method to calculate how many records there are to extract. Used primarily for
|
25
28
|
# building the progress bar
|
26
29
|
def results_count; end
|
27
30
|
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
@options[:load_since] = Time.parse(@options[:load_since]) if @options[:load_since] && @options[:load_since].is_a?(String)
|
32
|
-
@options[:load_until] = Time.parse(@options[:load_until]) if @options[:load_until] && @options[:load_until].is_a?(String)
|
31
|
+
# Entrypoint for this Extractor. Called by a Runner. Expects a series of records to be yielded
|
32
|
+
def extract
|
33
|
+
raise NotImplementedError
|
33
34
|
end
|
34
35
|
|
35
|
-
|
36
|
-
return unless @options[:continuation]
|
36
|
+
private
|
37
37
|
|
38
|
-
|
39
|
-
|
40
|
-
|
38
|
+
# TODO: reimplement this
|
39
|
+
# def handle_continuation
|
40
|
+
# return unless @config.continuation
|
41
|
+
|
42
|
+
# @config.since = @config.continuation.highest_timestamp if @config.continuation.highest_timestamp
|
43
|
+
# @config.load_after_id = @config.continuation.last_id if @config.continuation.last_id
|
44
|
+
# end
|
41
45
|
end
|
42
46
|
end
|
43
47
|
end
|
44
48
|
|
45
|
-
require_relative 'helpers/
|
49
|
+
require_relative 'helpers/input_reader'
|
46
50
|
require_relative 'csv_extractor'
|
47
51
|
require_relative 'file_extractor'
|
48
52
|
require_relative 'json_extractor'
|
@@ -2,32 +2,55 @@ require 'pathname'
|
|
2
2
|
|
3
3
|
module Chronicle
|
4
4
|
module ETL
|
5
|
+
# Return filenames that match a pattern in a directory
|
5
6
|
class FileExtractor < Chronicle::ETL::Extractor
|
6
|
-
include Extractors::Helpers::FilesystemReader
|
7
7
|
|
8
8
|
register_connector do |r|
|
9
9
|
r.description = 'file or directory of files'
|
10
10
|
end
|
11
11
|
|
12
|
+
setting :input, default: ['.']
|
13
|
+
setting :dir_glob_pattern, default: "**/*"
|
14
|
+
setting :larger_than
|
15
|
+
setting :smaller_than
|
16
|
+
|
17
|
+
def prepare
|
18
|
+
@pathnames = gather_files
|
19
|
+
end
|
20
|
+
|
12
21
|
def extract
|
13
|
-
|
14
|
-
yield Chronicle::ETL::Extraction.new(data:
|
22
|
+
@pathnames.each do |pathname|
|
23
|
+
yield Chronicle::ETL::Extraction.new(data: pathname.to_path)
|
15
24
|
end
|
16
25
|
end
|
17
26
|
|
18
27
|
def results_count
|
19
|
-
|
28
|
+
@pathnames.count
|
20
29
|
end
|
21
30
|
|
22
31
|
private
|
23
32
|
|
24
|
-
def
|
25
|
-
@
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
33
|
+
def gather_files
|
34
|
+
roots = [@config.input].flatten.map { |filename| Pathname.new(filename) }
|
35
|
+
raise(ExtractionError, "Input must exist") unless roots.all?(&:exist?)
|
36
|
+
|
37
|
+
directories, files = roots.partition(&:directory?)
|
38
|
+
|
39
|
+
directories.each do |directory|
|
40
|
+
files += Dir.glob(File.join(directory, @config.dir_glob_pattern)).map { |filename| Pathname.new(filename) }
|
41
|
+
end
|
42
|
+
|
43
|
+
files = files.uniq
|
44
|
+
|
45
|
+
files = files.keep_if { |f| (f.mtime > @config.since) } if @config.since
|
46
|
+
files = files.keep_if { |f| (f.mtime < @config.until) } if @config.until
|
47
|
+
|
48
|
+
# pass in file sizes in bytes
|
49
|
+
files = files.keep_if { |f| (f.size < @config.smaller_than) } if @config.smaller_than
|
50
|
+
files = files.keep_if { |f| (f.size > @config.larger_than) } if @config.larger_than
|
51
|
+
|
52
|
+
# TODO: incorporate sort argument
|
53
|
+
files.sort_by(&:mtime)
|
31
54
|
end
|
32
55
|
end
|
33
56
|
end
|
@@ -0,0 +1,76 @@
|
|
1
|
+
require 'pathname'
|
2
|
+
|
3
|
+
module Chronicle
  module ETL
    module Extractors
      module Helpers
        # Mixin for extractors that take their input either from the files
        # named in +@config.input+ or, when no files are given, from a piped
        # stdin. The including object is expected to provide +@config+ with
        # an +input+ reader.
        module InputReader
          # Return an array of input filenames; converts a single string
          # to an array if necessary. A nil input yields an empty array.
          #
          # @return [Array<String>]
          def filenames
            [@config.input].flatten.compact
          end

          # Filenames as an array of pathnames
          #
          # @return [Array<Pathname>]
          def pathnames
            filenames.map { |filename| Pathname.new(filename) }
          end

          # Whether we're reading from files
          def read_from_files?
            filenames.any?
          end

          # Whether we're reading input from stdin. Only true when no input
          # files were given and stdin is a pipe.
          def read_from_stdin?
            !read_from_files? && $stdin.stat.pipe?
          end

          # Read input sources and yield each one's full content.
          #
          # @yieldparam content [String] entire content of the source
          # @yieldparam source [String, IO] the file path, or $stdin
          # @raise [ExtractionError] if there are neither input files nor piped stdin
          def read_input
            if read_from_files?
              pathnames.each do |pathname|
                yield pathname.read, pathname.to_path
              end
            elsif read_from_stdin?
              yield $stdin.read, $stdin
            else
              raise ExtractionError, "No input files or stdin provided"
            end
          end

          # Read input sources line by line, yielding each line to the block.
          #
          # @raise [ExtractionError] if there are neither input files nor piped stdin
          def read_input_as_lines(&block)
            if read_from_files?
              lines_from_files(&block)
            elsif read_from_stdin?
              lines_from_stdin(&block)
            else
              raise ExtractionError, "No input files or stdin provided"
            end
          end

          private

          # Yield every line of every input file, in the order given
          def lines_from_files(&block)
            pathnames.each do |pathname|
              File.open(pathname) do |file|
                lines_from_io(file, &block)
              end
            end
          end

          def lines_from_stdin(&block)
            lines_from_io($stdin, &block)
          end

          def lines_from_io(io, &block)
            io.each_line(&block)
          end
        end
      end
    end
  end
end
|
@@ -1,43 +1,44 @@
|
|
1
1
|
module Chronicle
|
2
2
|
module ETL
|
3
|
-
class
|
4
|
-
include Extractors::Helpers::
|
3
|
+
class JSONExtractor < Chronicle::ETL::Extractor
|
4
|
+
include Extractors::Helpers::InputReader
|
5
5
|
|
6
6
|
register_connector do |r|
|
7
|
-
r.description = '
|
7
|
+
r.description = 'JSON'
|
8
8
|
end
|
9
9
|
|
10
|
-
|
11
|
-
filename: $stdin,
|
10
|
+
setting :jsonl, default: true, type: :boolean
|
12
11
|
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
super(DEFAULT_OPTIONS.merge(options))
|
12
|
+
def prepare
|
13
|
+
@jsons = []
|
14
|
+
load_input do |input|
|
15
|
+
@jsons << parse_data(input)
|
16
|
+
end
|
19
17
|
end
|
20
18
|
|
21
19
|
def extract
|
22
|
-
|
23
|
-
|
24
|
-
yield Chronicle::ETL::Extraction.new(data: parsed_data) if parsed_data
|
20
|
+
@jsons.each do |json|
|
21
|
+
yield Chronicle::ETL::Extraction.new(data: json)
|
25
22
|
end
|
26
23
|
end
|
27
24
|
|
28
25
|
def results_count
|
26
|
+
@jsons.count
|
29
27
|
end
|
30
28
|
|
31
29
|
private
|
32
30
|
|
33
31
|
def parse_data data
|
34
32
|
JSON.parse(data)
|
35
|
-
rescue JSON::ParserError
|
33
|
+
rescue JSON::ParserError
|
34
|
+
raise Chronicle::ETL::ExtractionError, "Could not parse JSON"
|
36
35
|
end
|
37
36
|
|
38
|
-
def load_input
|
39
|
-
|
40
|
-
|
37
|
+
def load_input(&block)
|
38
|
+
if @config.jsonl
|
39
|
+
read_input_as_lines(&block)
|
40
|
+
else
|
41
|
+
read_input(&block)
|
41
42
|
end
|
42
43
|
end
|
43
44
|
end
|
data/lib/chronicle/etl/job.rb
CHANGED
@@ -35,7 +35,7 @@ module Chronicle
|
|
35
35
|
|
36
36
|
def instantiate_transformer(extraction)
|
37
37
|
@transformer_klass = @job_definition.transformer_klass
|
38
|
-
@transformer_klass.new(@transformer_options
|
38
|
+
@transformer_klass.new(extraction, @transformer_options)
|
39
39
|
end
|
40
40
|
|
41
41
|
def instantiate_loader
|