chronicle-etl 0.3.1 → 0.4.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.github/workflows/ruby.yml +35 -0
- data/.rubocop.yml +31 -1
- data/Guardfile +7 -0
- data/README.md +157 -82
- data/Rakefile +4 -2
- data/chronicle-etl.gemspec +11 -3
- data/exe/chronicle-etl +1 -1
- data/lib/chronicle/etl/cli/connectors.rb +34 -5
- data/lib/chronicle/etl/cli/jobs.rb +90 -24
- data/lib/chronicle/etl/cli/main.rb +41 -19
- data/lib/chronicle/etl/cli/plugins.rb +62 -0
- data/lib/chronicle/etl/cli/subcommand_base.rb +2 -2
- data/lib/chronicle/etl/cli.rb +9 -0
- data/lib/chronicle/etl/config.rb +7 -4
- data/lib/chronicle/etl/configurable.rb +163 -0
- data/lib/chronicle/etl/exceptions.rb +29 -1
- data/lib/chronicle/etl/extractors/csv_extractor.rb +24 -23
- data/lib/chronicle/etl/extractors/extractor.rb +16 -15
- data/lib/chronicle/etl/extractors/file_extractor.rb +34 -11
- data/lib/chronicle/etl/extractors/helpers/input_reader.rb +76 -0
- data/lib/chronicle/etl/extractors/json_extractor.rb +19 -18
- data/lib/chronicle/etl/job.rb +8 -2
- data/lib/chronicle/etl/job_definition.rb +20 -5
- data/lib/chronicle/etl/loaders/csv_loader.rb +36 -9
- data/lib/chronicle/etl/loaders/helpers/encoding_helper.rb +18 -0
- data/lib/chronicle/etl/loaders/json_loader.rb +44 -0
- data/lib/chronicle/etl/loaders/loader.rb +28 -2
- data/lib/chronicle/etl/loaders/rest_loader.rb +5 -5
- data/lib/chronicle/etl/loaders/table_loader.rb +18 -37
- data/lib/chronicle/etl/logger.rb +6 -2
- data/lib/chronicle/etl/models/base.rb +3 -0
- data/lib/chronicle/etl/models/entity.rb +8 -2
- data/lib/chronicle/etl/models/raw.rb +26 -0
- data/lib/chronicle/etl/registry/connector_registration.rb +6 -0
- data/lib/chronicle/etl/registry/plugin_registry.rb +70 -0
- data/lib/chronicle/etl/registry/registry.rb +27 -14
- data/lib/chronicle/etl/runner.rb +35 -17
- data/lib/chronicle/etl/serializers/jsonapi_serializer.rb +6 -0
- data/lib/chronicle/etl/serializers/raw_serializer.rb +10 -0
- data/lib/chronicle/etl/serializers/serializer.rb +2 -1
- data/lib/chronicle/etl/transformers/image_file_transformer.rb +22 -28
- data/lib/chronicle/etl/transformers/null_transformer.rb +1 -1
- data/lib/chronicle/etl/transformers/transformer.rb +3 -2
- data/lib/chronicle/etl/version.rb +1 -1
- data/lib/chronicle/etl.rb +12 -4
- metadata +123 -18
- data/.ruby-version +0 -1
- data/lib/chronicle/etl/extractors/helpers/filesystem_reader.rb +0 -104
- data/lib/chronicle/etl/loaders/stdout_loader.rb +0 -14
- data/lib/chronicle/etl/models/generic.rb +0 -23
@@ -9,59 +9,40 @@ module Chronicle
|
|
9
9
|
r.description = 'an ASCII table'
|
10
10
|
end
|
11
11
|
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
truncate_values_at: nil,
|
17
|
-
table_renderer: :basic
|
18
|
-
}.freeze
|
19
|
-
|
20
|
-
def initialize(options={})
|
21
|
-
@options = options.reverse_merge(DEFAULT_OPTIONS)
|
22
|
-
@records = []
|
23
|
-
end
|
12
|
+
setting :truncate_values_at, default: 40
|
13
|
+
setting :table_renderer, default: :basic
|
14
|
+
setting :fields_exclude, default: ['lids', 'type']
|
15
|
+
setting :header_row, default: true
|
24
16
|
|
25
17
|
def load(record)
|
26
|
-
|
18
|
+
records << record.to_h_flattened
|
27
19
|
end
|
28
20
|
|
29
21
|
def finish
|
30
|
-
return if
|
22
|
+
return if records.empty?
|
31
23
|
|
32
|
-
headers = build_headers(
|
33
|
-
rows = build_rows(
|
24
|
+
headers = build_headers(records)
|
25
|
+
rows = build_rows(records, headers)
|
34
26
|
|
35
|
-
@table = TTY::Table.new(header: headers, rows: rows)
|
27
|
+
@table = TTY::Table.new(header: (headers if @config.header_row), rows: rows)
|
36
28
|
puts @table.render(
|
37
|
-
@
|
29
|
+
@config.table_renderer.to_sym,
|
38
30
|
padding: [0, 2, 0, 0]
|
39
31
|
)
|
40
32
|
end
|
41
33
|
|
42
|
-
|
43
|
-
|
44
|
-
def build_headers(records)
|
45
|
-
headers =
|
46
|
-
if @options[:fields_include].any?
|
47
|
-
Set[*@options[:fields_include]]
|
48
|
-
else
|
49
|
-
# use all the keys of the flattened record hash
|
50
|
-
Set[*records.map(&:keys).flatten.map(&:to_s).uniq]
|
51
|
-
end
|
52
|
-
|
53
|
-
headers = headers.delete_if { |header| header.end_with?(*@options[:fields_exclude]) } if @options[:fields_exclude].any?
|
54
|
-
headers = headers.first(@options[:fields_limit]) if @options[:fields_limit]
|
55
|
-
|
56
|
-
headers.to_a.map(&:to_sym)
|
34
|
+
def records
|
35
|
+
@records ||= []
|
57
36
|
end
|
58
37
|
|
38
|
+
private
|
39
|
+
|
59
40
|
def build_rows(records, headers)
|
60
41
|
records.map do |record|
|
61
|
-
values = record.values_at(*headers).map{|value| value.to_s }
|
62
|
-
|
63
|
-
if @
|
64
|
-
values = values.map{ |value| value.truncate(@
|
42
|
+
values = record.transform_keys(&:to_sym).values_at(*headers).map{|value| value.to_s }
|
43
|
+
values = values.map { |value| force_utf8(value) }
|
44
|
+
if @config.truncate_values_at
|
45
|
+
values = values.map{ |value| value.truncate(@config.truncate_values_at) }
|
65
46
|
end
|
66
47
|
|
67
48
|
values
|
data/lib/chronicle/etl/logger.rb
CHANGED
@@ -8,11 +8,11 @@ module Chronicle
|
|
8
8
|
WARN = 2
|
9
9
|
ERROR = 3
|
10
10
|
FATAL = 4
|
11
|
+
SILENT = 5
|
11
12
|
|
12
13
|
attr_accessor :log_level
|
13
14
|
|
14
15
|
@log_level = INFO
|
15
|
-
@destination = $stderr
|
16
16
|
|
17
17
|
def output message, level
|
18
18
|
return unless level >= @log_level
|
@@ -20,10 +20,14 @@ module Chronicle
|
|
20
20
|
if @progress_bar
|
21
21
|
@progress_bar.log(message)
|
22
22
|
else
|
23
|
-
|
23
|
+
$stderr.puts(message)
|
24
24
|
end
|
25
25
|
end
|
26
26
|
|
27
|
+
def fatal(message)
|
28
|
+
output(message, FATAL)
|
29
|
+
end
|
30
|
+
|
27
31
|
def error(message)
|
28
32
|
output(message, ERROR)
|
29
33
|
end
|
@@ -5,6 +5,9 @@ module Chronicle
|
|
5
5
|
module Models
|
6
6
|
# Represents a record that's been transformed by a Transformer and
|
7
7
|
# ready to be loaded. Loosely based on ActiveModel.
|
8
|
+
#
|
9
|
+
# @todo Experiment with just mixing in ActiveModel instead of this
|
10
|
+
# this reimplementation
|
8
11
|
class Base
|
9
12
|
ATTRIBUTES = [:provider, :provider_id, :lat, :lng, :metadata].freeze
|
10
13
|
ASSOCIATIONS = [].freeze
|
@@ -5,13 +5,19 @@ module Chronicle
|
|
5
5
|
module Models
|
6
6
|
class Entity < Chronicle::ETL::Models::Base
|
7
7
|
TYPE = 'entities'.freeze
|
8
|
-
ATTRIBUTES = [:title, :body, :represents, :slug, :myself, :metadata].freeze
|
8
|
+
ATTRIBUTES = [:title, :body, :provider_url, :represents, :slug, :myself, :metadata].freeze
|
9
|
+
|
10
|
+
# TODO: This desperately needs a validation system
|
9
11
|
ASSOCIATIONS = [
|
12
|
+
:involvements, # inverse of activity's `involved`
|
13
|
+
|
10
14
|
:attachments,
|
11
15
|
:abouts,
|
16
|
+
:aboutables, # inverse of above
|
12
17
|
:depicts,
|
13
18
|
:consumers,
|
14
|
-
:contains
|
19
|
+
:contains,
|
20
|
+
:containers # inverse of above
|
15
21
|
].freeze # TODO: add these to reflect Chronicle Schema
|
16
22
|
|
17
23
|
attr_accessor(*ATTRIBUTES, *ASSOCIATIONS)
|
@@ -0,0 +1,26 @@
|
|
1
|
+
require 'chronicle/etl/models/base'
|
2
|
+
|
3
|
+
module Chronicle
|
4
|
+
module ETL
|
5
|
+
module Models
|
6
|
+
# A record from an extraction with no processing or normalization applied
|
7
|
+
class Raw
|
8
|
+
TYPE = 'raw'
|
9
|
+
|
10
|
+
attr_accessor :raw_data
|
11
|
+
|
12
|
+
def initialize(raw_data)
|
13
|
+
@raw_data = raw_data
|
14
|
+
end
|
15
|
+
|
16
|
+
def to_h
|
17
|
+
@raw_data.to_h
|
18
|
+
end
|
19
|
+
|
20
|
+
def to_h_flattened
|
21
|
+
Chronicle::ETL::Utils::HashUtilities.flatten_hash(to_h)
|
22
|
+
end
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end
|
26
|
+
end
|
@@ -3,6 +3,7 @@ module Chronicle
|
|
3
3
|
module Registry
|
4
4
|
# Records details about a connector such as its provider and a description
|
5
5
|
class ConnectorRegistration
|
6
|
+
# FIXME: refactor custom accessor methods later in file
|
6
7
|
attr_accessor :identifier, :provider, :klass, :description
|
7
8
|
|
8
9
|
def initialize(klass)
|
@@ -43,6 +44,11 @@ module Chronicle
|
|
43
44
|
@provider || (built_in? ? 'chronicle' : '')
|
44
45
|
end
|
45
46
|
|
47
|
+
# TODO: allow overriding here. Maybe through self-registration process
|
48
|
+
def plugin
|
49
|
+
@provider
|
50
|
+
end
|
51
|
+
|
46
52
|
def descriptive_phrase
|
47
53
|
prefix = case phase
|
48
54
|
when :extractor
|
@@ -0,0 +1,70 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'rubygems/command'
|
3
|
+
require 'rubygems/commands/install_command'
|
4
|
+
require 'rubygems/uninstaller'
|
5
|
+
|
6
|
+
module Chronicle
|
7
|
+
module ETL
|
8
|
+
module Registry
|
9
|
+
# Responsible for managing plugins available to chronicle-etl
|
10
|
+
#
|
11
|
+
# @todo Better validation for whether a gem is actually a plugin
|
12
|
+
# @todo Add ways to load a plugin that don't require a gem on rubygems.org
|
13
|
+
module PluginRegistry
|
14
|
+
# Does this plugin exist?
|
15
|
+
def self.exists?(name)
|
16
|
+
# TODO: implement this. Could query rubygems.org or have a
|
17
|
+
# hardcoded approved list
|
18
|
+
true
|
19
|
+
end
|
20
|
+
|
21
|
+
# All versions of all plugins currently installed
|
22
|
+
def self.all_installed
|
23
|
+
# TODO: add check for chronicle-etl dependency
|
24
|
+
Gem::Specification.filter { |s| s.name.match(/^chronicle-/) && s.name != "chronicle-etl" }
|
25
|
+
end
|
26
|
+
|
27
|
+
# Latest version of each installed plugin
|
28
|
+
def self.all_installed_latest
|
29
|
+
all_installed.group_by(&:name)
|
30
|
+
.transform_values { |versions| versions.sort_by(&:version).reverse.first }
|
31
|
+
.values
|
32
|
+
end
|
33
|
+
|
34
|
+
# Activate a plugin with given name by `require`ing it
|
35
|
+
def self.activate(name)
|
36
|
+
# By default, activates the latest available version of a gem
|
37
|
+
# so don't have to run Kernel#gem separately
|
38
|
+
require "chronicle/#{name}"
|
39
|
+
rescue LoadError
|
40
|
+
raise Chronicle::ETL::PluginLoadError.new(name), "Plugin #{name} couldn't be loaded" if exists?(name)
|
41
|
+
|
42
|
+
raise Chronicle::ETL::PluginNotAvailableError.new(name), "Plugin #{name} doesn't exist"
|
43
|
+
end
|
44
|
+
|
45
|
+
# Install a plugin to local gems
|
46
|
+
def self.install(name)
|
47
|
+
gem_name = "chronicle-#{name}"
|
48
|
+
raise(Chronicle::ETL::PluginNotAvailableError.new(gem_name), "Plugin #{name} doesn't exist") unless exists?(gem_name)
|
49
|
+
|
50
|
+
Gem::DefaultUserInteraction.ui = Gem::SilentUI.new
|
51
|
+
Gem.install(gem_name)
|
52
|
+
rescue Gem::UnsatisfiableDependencyError
|
53
|
+
# TODO: we need to catch a lot more than this here
|
54
|
+
raise Chronicle::ETL::PluginNotAvailableError.new(name), "Plugin #{name} doesn't exist"
|
55
|
+
end
|
56
|
+
|
57
|
+
# Uninstall a plugin
|
58
|
+
def self.uninstall(name)
|
59
|
+
gem_name = "chronicle-#{name}"
|
60
|
+
Gem::DefaultUserInteraction.ui = Gem::SilentUI.new
|
61
|
+
uninstaller = Gem::Uninstaller.new(gem_name)
|
62
|
+
uninstaller.uninstall
|
63
|
+
rescue Gem::InstallError
|
64
|
+
# TODO: strengthen this exception handling
|
65
|
+
raise(Chronicle::ETL::PluginError.new(name), "Plugin #{name} wasn't uninstalled")
|
66
|
+
end
|
67
|
+
end
|
68
|
+
end
|
69
|
+
end
|
70
|
+
end
|
@@ -20,28 +20,40 @@ module Chronicle
|
|
20
20
|
end
|
21
21
|
end
|
22
22
|
|
23
|
-
def
|
24
|
-
|
25
|
-
Gem.install(gem_name)
|
23
|
+
def register connector
|
24
|
+
connectors << connector
|
26
25
|
end
|
27
26
|
|
28
|
-
def
|
27
|
+
def connectors
|
29
28
|
@connectors ||= []
|
30
|
-
@connectors << connector
|
31
29
|
end
|
32
30
|
|
33
31
|
def find_by_phase_and_identifier(phase, identifier)
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
32
|
+
# Simple case: built in connector
|
33
|
+
connector = connectors.find { |c| c.phase == phase && c.identifier == identifier }
|
34
|
+
return connector if connector
|
35
|
+
|
36
|
+
# if not available in built-in connectors, try to activate a
|
37
|
+
# relevant plugin and try again
|
38
|
+
if identifier.include?(":")
|
39
|
+
plugin, name = identifier.split(":")
|
40
|
+
else
|
41
|
+
# This case handles the case where the identifier is a
|
42
|
+
# shorthand (ie `imessage`) because there's only one default
|
43
|
+
# connector.
|
44
|
+
plugin = identifier
|
39
45
|
end
|
40
|
-
connector || raise(ConnectorNotAvailableError.new("Connector '#{identifier}' not found"))
|
41
|
-
end
|
42
46
|
|
43
|
-
|
44
|
-
|
47
|
+
PluginRegistry.activate(plugin)
|
48
|
+
|
49
|
+
candidates = connectors.select { |c| c.phase == phase && c.plugin == plugin }
|
50
|
+
# if no name given, just use first connector with right phase/plugin
|
51
|
+
# TODO: set up a property for connectors to specify that they're the
|
52
|
+
# default connector for the plugin
|
53
|
+
candidates = candidates.select { |c| c.identifier == name } if name
|
54
|
+
connector = candidates.first
|
55
|
+
|
56
|
+
connector || raise(ConnectorNotAvailableError, "Connector '#{identifier}' not found")
|
45
57
|
end
|
46
58
|
end
|
47
59
|
end
|
@@ -50,3 +62,4 @@ end
|
|
50
62
|
|
51
63
|
require_relative 'self_registering'
|
52
64
|
require_relative 'connector_registration'
|
65
|
+
require_relative 'plugin_registry'
|
data/lib/chronicle/etl/runner.rb
CHANGED
@@ -8,19 +8,41 @@ class Chronicle::ETL::Runner
|
|
8
8
|
end
|
9
9
|
|
10
10
|
def run!
|
11
|
-
|
12
|
-
|
11
|
+
validate_job
|
12
|
+
instantiate_connectors
|
13
|
+
prepare_job
|
14
|
+
prepare_ui
|
15
|
+
run_extraction
|
16
|
+
finish_job
|
17
|
+
end
|
18
|
+
|
19
|
+
private
|
20
|
+
|
21
|
+
def validate_job
|
22
|
+
@job.job_definition.validate!
|
23
|
+
end
|
13
24
|
|
25
|
+
def instantiate_connectors
|
26
|
+
@extractor = @job.instantiate_extractor
|
27
|
+
@loader = @job.instantiate_loader
|
28
|
+
end
|
29
|
+
|
30
|
+
def prepare_job
|
31
|
+
Chronicle::ETL::Logger.info(tty_log_job_start)
|
14
32
|
@job_logger.start
|
15
|
-
loader.start
|
33
|
+
@loader.start
|
34
|
+
@extractor.prepare
|
35
|
+
end
|
16
36
|
|
17
|
-
|
18
|
-
total = extractor.results_count
|
37
|
+
def prepare_ui
|
38
|
+
total = @extractor.results_count
|
19
39
|
@progress_bar = Chronicle::ETL::Utils::ProgressBar.new(title: 'Running job', total: total)
|
20
40
|
Chronicle::ETL::Logger.attach_to_progress_bar(@progress_bar)
|
41
|
+
end
|
21
42
|
|
22
|
-
|
23
|
-
|
43
|
+
# TODO: refactor this further
|
44
|
+
def run_extraction
|
45
|
+
@extractor.extract do |extraction|
|
24
46
|
unless extraction.is_a?(Chronicle::ETL::Extraction)
|
25
47
|
raise Chronicle::ETL::RunnerTypeError, "Extracted should be a Chronicle::ETL::Extraction"
|
26
48
|
end
|
@@ -28,14 +50,10 @@ class Chronicle::ETL::Runner
|
|
28
50
|
transformer = @job.instantiate_transformer(extraction)
|
29
51
|
record = transformer.transform
|
30
52
|
|
31
|
-
unless record.is_a?(Chronicle::ETL::Models::Base)
|
32
|
-
raise Chronicle::ETL::RunnerTypeError, "Transformed data should be a type of Chronicle::ETL::Models"
|
33
|
-
end
|
34
|
-
|
35
53
|
Chronicle::ETL::Logger.info(tty_log_transformation(transformer))
|
36
54
|
@job_logger.log_transformation(transformer)
|
37
55
|
|
38
|
-
loader.load(record) unless @job.dry_run?
|
56
|
+
@loader.load(record) unless @job.dry_run?
|
39
57
|
rescue Chronicle::ETL::TransformationError => e
|
40
58
|
Chronicle::ETL::Logger.error(tty_log_transformation_failure(e))
|
41
59
|
ensure
|
@@ -43,22 +61,22 @@ class Chronicle::ETL::Runner
|
|
43
61
|
end
|
44
62
|
|
45
63
|
@progress_bar.finish
|
46
|
-
loader.finish
|
64
|
+
@loader.finish
|
47
65
|
@job_logger.finish
|
48
66
|
rescue Interrupt
|
49
67
|
Chronicle::ETL::Logger.error("\n#{'Job interrupted'.red}")
|
50
68
|
@job_logger.error
|
51
69
|
rescue StandardError => e
|
52
70
|
raise e
|
53
|
-
|
71
|
+
end
|
72
|
+
|
73
|
+
def finish_job
|
54
74
|
@job_logger.save
|
55
|
-
@progress_bar
|
75
|
+
@progress_bar&.finish
|
56
76
|
Chronicle::ETL::Logger.detach_from_progress_bar
|
57
77
|
Chronicle::ETL::Logger.info(tty_log_completion)
|
58
78
|
end
|
59
79
|
|
60
|
-
private
|
61
|
-
|
62
80
|
def tty_log_job_start
|
63
81
|
output = "Beginning job "
|
64
82
|
output += "'#{@job.name}'".bold if @job.name
|
@@ -1,6 +1,12 @@
|
|
1
1
|
module Chronicle
|
2
2
|
module ETL
|
3
3
|
class JSONAPISerializer < Chronicle::ETL::Serializer
|
4
|
+
def initialize(*args)
|
5
|
+
super
|
6
|
+
|
7
|
+
raise(SerializationError, "Record must be a subclass of Chronicle::ETL::Model::Base") unless @record.is_a?(Chronicle::ETL::Models::Base)
|
8
|
+
end
|
9
|
+
|
4
10
|
def serializable_hash
|
5
11
|
@record
|
6
12
|
.identifier_hash
|
@@ -19,20 +19,14 @@ module Chronicle
|
|
19
19
|
r.description = 'an image file'
|
20
20
|
end
|
21
21
|
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
}.freeze
|
31
|
-
|
32
|
-
def initialize(*args)
|
33
|
-
super(*args)
|
34
|
-
@options = @options.reverse_merge(DEFAULT_OPTIONS)
|
35
|
-
end
|
22
|
+
setting :timestamp_strategy, default: 'file_mtime'
|
23
|
+
setting :id_strategy, default: 'file_hash'
|
24
|
+
setting :verb, default: 'photographed'
|
25
|
+
# EXIF tags often don't have timezones
|
26
|
+
setting :timezone_default, default: 'Eastern Time (US & Canada)'
|
27
|
+
setting :include_image_data, default: true
|
28
|
+
setting :actor
|
29
|
+
setting :involved
|
36
30
|
|
37
31
|
def transform
|
38
32
|
# FIXME: set @filename; use block for reading file when necessary
|
@@ -48,7 +42,7 @@ module Chronicle
|
|
48
42
|
|
49
43
|
def id
|
50
44
|
@id ||= begin
|
51
|
-
id = build_with_strategy(field: :id, strategy: @
|
45
|
+
id = build_with_strategy(field: :id, strategy: @config.id_strategy)
|
52
46
|
raise UntransformableRecordError.new("Could not build id", transformation: self) unless id
|
53
47
|
|
54
48
|
id
|
@@ -57,7 +51,7 @@ module Chronicle
|
|
57
51
|
|
58
52
|
def timestamp
|
59
53
|
@timestamp ||= begin
|
60
|
-
ts = build_with_strategy(field: :timestamp, strategy: @
|
54
|
+
ts = build_with_strategy(field: :timestamp, strategy: @config.timestamp_strategy)
|
61
55
|
raise UntransformableRecordError.new("Could not build timestamp", transformation: self) unless ts
|
62
56
|
|
63
57
|
ts
|
@@ -68,8 +62,8 @@ module Chronicle
|
|
68
62
|
|
69
63
|
def build_created(file)
|
70
64
|
record = ::Chronicle::ETL::Models::Activity.new
|
71
|
-
record.verb = @
|
72
|
-
record.provider = @
|
65
|
+
record.verb = @config.verb
|
66
|
+
record.provider = @config.provider
|
73
67
|
record.provider_id = id
|
74
68
|
record.end_at = timestamp
|
75
69
|
record.dedupe_on = [[:provider_id, :verb, :provider]]
|
@@ -84,24 +78,24 @@ module Chronicle
|
|
84
78
|
def build_actor
|
85
79
|
actor = ::Chronicle::ETL::Models::Entity.new
|
86
80
|
actor.represents = 'identity'
|
87
|
-
actor.provider = @
|
88
|
-
actor.slug = @
|
81
|
+
actor.provider = @config.actor[:provider]
|
82
|
+
actor.slug = @config.actor[:slug]
|
89
83
|
actor.dedupe_on = [[:provider, :slug, :represents]]
|
90
84
|
actor
|
91
85
|
end
|
92
86
|
|
93
87
|
def build_image
|
94
88
|
image = ::Chronicle::ETL::Models::Entity.new
|
95
|
-
image.represents = @
|
89
|
+
image.represents = @config.involved[:represents]
|
96
90
|
image.title = build_title
|
97
91
|
image.body = exif['Description']
|
98
|
-
image.provider = @
|
92
|
+
image.provider = @config.involved[:provider]
|
99
93
|
image.provider_id = id
|
100
94
|
image.assign_attributes(build_gps)
|
101
95
|
image.dedupe_on = [[:provider, :provider_id, :represents]]
|
102
96
|
|
103
|
-
if @
|
104
|
-
ocr_text = build_with_strategy(field: :ocr, strategy: @
|
97
|
+
if @config.ocr_strategy
|
98
|
+
ocr_text = build_with_strategy(field: :ocr, strategy: @config.ocr_strategy)
|
105
99
|
image.metadata[:ocr_text] = ocr_text if ocr_text
|
106
100
|
end
|
107
101
|
|
@@ -111,7 +105,7 @@ module Chronicle
|
|
111
105
|
image.depicts = build_people_depicted(names)
|
112
106
|
image.abouts = build_keywords(tags)
|
113
107
|
|
114
|
-
if @
|
108
|
+
if @config.include_image_data
|
115
109
|
attachment = ::Chronicle::ETL::Models::Attachment.new
|
116
110
|
attachment.data = build_image_data
|
117
111
|
image.attachments = [attachment]
|
@@ -124,7 +118,7 @@ module Chronicle
|
|
124
118
|
topics.map do |topic|
|
125
119
|
t = ::Chronicle::ETL::Models::Entity.new
|
126
120
|
t.represents = 'topic'
|
127
|
-
t.provider = @
|
121
|
+
t.provider = @config.involved[:provider]
|
128
122
|
t.title = topic
|
129
123
|
t.slug = topic.parameterize
|
130
124
|
t.dedupe_on = [[:provider, :represents, :slug]]
|
@@ -136,7 +130,7 @@ module Chronicle
|
|
136
130
|
names.map do |name|
|
137
131
|
identity = ::Chronicle::ETL::Models::Entity.new
|
138
132
|
identity.represents = 'identity'
|
139
|
-
identity.provider = @
|
133
|
+
identity.provider = @config.involved[:provider]
|
140
134
|
identity.slug = name.parameterize
|
141
135
|
identity.title = name
|
142
136
|
identity.dedupe_on = [[:provider, :represents, :slug]]
|
@@ -199,7 +193,7 @@ module Chronicle
|
|
199
193
|
elsif false
|
200
194
|
# TODO: support option of using GPS coordinates to determine timezone
|
201
195
|
else
|
202
|
-
zone = ActiveSupport::TimeZone.new(@
|
196
|
+
zone = ActiveSupport::TimeZone.new(@config.timezone_default)
|
203
197
|
timestamp = zone.parse(timestamp.asctime)
|
204
198
|
end
|
205
199
|
|
@@ -3,14 +3,15 @@ module Chronicle
|
|
3
3
|
# Abstract class representing an Transformer for an ETL job
|
4
4
|
class Transformer
|
5
5
|
extend Chronicle::ETL::Registry::SelfRegistering
|
6
|
+
include Chronicle::ETL::Configurable
|
6
7
|
|
7
8
|
# Construct a new instance of this transformer. Options are passed in from a Runner
|
8
9
|
# == Parameters:
|
9
10
|
# options::
|
10
11
|
# Options for configuring this Transformer
|
11
|
-
def initialize(options = {}
|
12
|
-
@options = options
|
12
|
+
def initialize(extraction, options = {})
|
13
13
|
@extraction = extraction
|
14
|
+
apply_options(options)
|
14
15
|
end
|
15
16
|
|
16
17
|
# @abstract Subclass is expected to implement #transform
|
data/lib/chronicle/etl.rb
CHANGED
@@ -1,24 +1,32 @@
|
|
1
1
|
require_relative 'etl/registry/registry'
|
2
2
|
require_relative 'etl/config'
|
3
|
+
require_relative 'etl/configurable'
|
3
4
|
require_relative 'etl/exceptions'
|
4
5
|
require_relative 'etl/extraction'
|
5
|
-
require_relative 'etl/extractors/extractor'
|
6
6
|
require_relative 'etl/job_definition'
|
7
7
|
require_relative 'etl/job_log'
|
8
8
|
require_relative 'etl/job_logger'
|
9
9
|
require_relative 'etl/job'
|
10
|
-
require_relative 'etl/loaders/loader'
|
11
10
|
require_relative 'etl/logger'
|
12
11
|
require_relative 'etl/models/activity'
|
13
12
|
require_relative 'etl/models/attachment'
|
14
13
|
require_relative 'etl/models/base'
|
14
|
+
require_relative 'etl/models/raw'
|
15
15
|
require_relative 'etl/models/entity'
|
16
|
-
require_relative 'etl/models/generic'
|
17
16
|
require_relative 'etl/runner'
|
18
17
|
require_relative 'etl/serializers/serializer'
|
19
|
-
require_relative 'etl/transformers/transformer'
|
20
18
|
require_relative 'etl/utils/binary_attachments'
|
21
19
|
require_relative 'etl/utils/hash_utilities'
|
22
20
|
require_relative 'etl/utils/text_recognition'
|
23
21
|
require_relative 'etl/utils/progress_bar'
|
24
22
|
require_relative 'etl/version'
|
23
|
+
|
24
|
+
require_relative 'etl/extractors/extractor'
|
25
|
+
require_relative 'etl/loaders/loader'
|
26
|
+
require_relative 'etl/transformers/transformer'
|
27
|
+
|
28
|
+
begin
|
29
|
+
require 'pry'
|
30
|
+
rescue LoadError
|
31
|
+
# Pry not available
|
32
|
+
end
|