chronicle-etl 0.3.1 → 0.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/ruby.yml +35 -0
- data/.rubocop.yml +31 -1
- data/Guardfile +7 -0
- data/README.md +157 -82
- data/Rakefile +4 -2
- data/chronicle-etl.gemspec +11 -3
- data/exe/chronicle-etl +1 -1
- data/lib/chronicle/etl/cli/connectors.rb +34 -5
- data/lib/chronicle/etl/cli/jobs.rb +90 -24
- data/lib/chronicle/etl/cli/main.rb +41 -19
- data/lib/chronicle/etl/cli/plugins.rb +62 -0
- data/lib/chronicle/etl/cli/subcommand_base.rb +2 -2
- data/lib/chronicle/etl/cli.rb +9 -0
- data/lib/chronicle/etl/config.rb +7 -4
- data/lib/chronicle/etl/configurable.rb +163 -0
- data/lib/chronicle/etl/exceptions.rb +29 -1
- data/lib/chronicle/etl/extractors/csv_extractor.rb +24 -23
- data/lib/chronicle/etl/extractors/extractor.rb +16 -15
- data/lib/chronicle/etl/extractors/file_extractor.rb +34 -11
- data/lib/chronicle/etl/extractors/helpers/input_reader.rb +76 -0
- data/lib/chronicle/etl/extractors/json_extractor.rb +19 -18
- data/lib/chronicle/etl/job.rb +8 -2
- data/lib/chronicle/etl/job_definition.rb +20 -5
- data/lib/chronicle/etl/loaders/csv_loader.rb +36 -9
- data/lib/chronicle/etl/loaders/helpers/encoding_helper.rb +18 -0
- data/lib/chronicle/etl/loaders/json_loader.rb +44 -0
- data/lib/chronicle/etl/loaders/loader.rb +28 -2
- data/lib/chronicle/etl/loaders/rest_loader.rb +5 -5
- data/lib/chronicle/etl/loaders/table_loader.rb +18 -37
- data/lib/chronicle/etl/logger.rb +6 -2
- data/lib/chronicle/etl/models/base.rb +3 -0
- data/lib/chronicle/etl/models/entity.rb +8 -2
- data/lib/chronicle/etl/models/raw.rb +26 -0
- data/lib/chronicle/etl/registry/connector_registration.rb +6 -0
- data/lib/chronicle/etl/registry/plugin_registry.rb +70 -0
- data/lib/chronicle/etl/registry/registry.rb +27 -14
- data/lib/chronicle/etl/runner.rb +35 -17
- data/lib/chronicle/etl/serializers/jsonapi_serializer.rb +6 -0
- data/lib/chronicle/etl/serializers/raw_serializer.rb +10 -0
- data/lib/chronicle/etl/serializers/serializer.rb +2 -1
- data/lib/chronicle/etl/transformers/image_file_transformer.rb +22 -28
- data/lib/chronicle/etl/transformers/null_transformer.rb +1 -1
- data/lib/chronicle/etl/transformers/transformer.rb +3 -2
- data/lib/chronicle/etl/version.rb +1 -1
- data/lib/chronicle/etl.rb +12 -4
- metadata +123 -18
- data/.ruby-version +0 -1
- data/lib/chronicle/etl/extractors/helpers/filesystem_reader.rb +0 -104
- data/lib/chronicle/etl/loaders/stdout_loader.rb +0 -14
- data/lib/chronicle/etl/models/generic.rb +0 -23
@@ -9,59 +9,40 @@ module Chronicle
|
|
9
9
|
r.description = 'an ASCII table'
|
10
10
|
end
|
11
11
|
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
truncate_values_at: nil,
|
17
|
-
table_renderer: :basic
|
18
|
-
}.freeze
|
19
|
-
|
20
|
-
def initialize(options={})
|
21
|
-
@options = options.reverse_merge(DEFAULT_OPTIONS)
|
22
|
-
@records = []
|
23
|
-
end
|
12
|
+
setting :truncate_values_at, default: 40
|
13
|
+
setting :table_renderer, default: :basic
|
14
|
+
setting :fields_exclude, default: ['lids', 'type']
|
15
|
+
setting :header_row, default: true
|
24
16
|
|
25
17
|
def load(record)
|
26
|
-
|
18
|
+
records << record.to_h_flattened
|
27
19
|
end
|
28
20
|
|
29
21
|
def finish
|
30
|
-
return if
|
22
|
+
return if records.empty?
|
31
23
|
|
32
|
-
headers = build_headers(
|
33
|
-
rows = build_rows(
|
24
|
+
headers = build_headers(records)
|
25
|
+
rows = build_rows(records, headers)
|
34
26
|
|
35
|
-
@table = TTY::Table.new(header: headers, rows: rows)
|
27
|
+
@table = TTY::Table.new(header: (headers if @config.header_row), rows: rows)
|
36
28
|
puts @table.render(
|
37
|
-
@
|
29
|
+
@config.table_renderer.to_sym,
|
38
30
|
padding: [0, 2, 0, 0]
|
39
31
|
)
|
40
32
|
end
|
41
33
|
|
42
|
-
|
43
|
-
|
44
|
-
def build_headers(records)
|
45
|
-
headers =
|
46
|
-
if @options[:fields_include].any?
|
47
|
-
Set[*@options[:fields_include]]
|
48
|
-
else
|
49
|
-
# use all the keys of the flattened record hash
|
50
|
-
Set[*records.map(&:keys).flatten.map(&:to_s).uniq]
|
51
|
-
end
|
52
|
-
|
53
|
-
headers = headers.delete_if { |header| header.end_with?(*@options[:fields_exclude]) } if @options[:fields_exclude].any?
|
54
|
-
headers = headers.first(@options[:fields_limit]) if @options[:fields_limit]
|
55
|
-
|
56
|
-
headers.to_a.map(&:to_sym)
|
34
|
+
def records
|
35
|
+
@records ||= []
|
57
36
|
end
|
58
37
|
|
38
|
+
private
|
39
|
+
|
59
40
|
def build_rows(records, headers)
|
60
41
|
records.map do |record|
|
61
|
-
values = record.values_at(*headers).map{|value| value.to_s }
|
62
|
-
|
63
|
-
if @
|
64
|
-
values = values.map{ |value| value.truncate(@
|
42
|
+
values = record.transform_keys(&:to_sym).values_at(*headers).map{|value| value.to_s }
|
43
|
+
values = values.map { |value| force_utf8(value) }
|
44
|
+
if @config.truncate_values_at
|
45
|
+
values = values.map{ |value| value.truncate(@config.truncate_values_at) }
|
65
46
|
end
|
66
47
|
|
67
48
|
values
|
data/lib/chronicle/etl/logger.rb
CHANGED
@@ -8,11 +8,11 @@ module Chronicle
|
|
8
8
|
WARN = 2
|
9
9
|
ERROR = 3
|
10
10
|
FATAL = 4
|
11
|
+
SILENT = 5
|
11
12
|
|
12
13
|
attr_accessor :log_level
|
13
14
|
|
14
15
|
@log_level = INFO
|
15
|
-
@destination = $stderr
|
16
16
|
|
17
17
|
def output message, level
|
18
18
|
return unless level >= @log_level
|
@@ -20,10 +20,14 @@ module Chronicle
|
|
20
20
|
if @progress_bar
|
21
21
|
@progress_bar.log(message)
|
22
22
|
else
|
23
|
-
|
23
|
+
$stderr.puts(message)
|
24
24
|
end
|
25
25
|
end
|
26
26
|
|
27
|
+
def fatal(message)
|
28
|
+
output(message, FATAL)
|
29
|
+
end
|
30
|
+
|
27
31
|
def error(message)
|
28
32
|
output(message, ERROR)
|
29
33
|
end
|
@@ -5,6 +5,9 @@ module Chronicle
|
|
5
5
|
module Models
|
6
6
|
# Represents a record that's been transformed by a Transformer and
|
7
7
|
# ready to be loaded. Loosely based on ActiveModel.
|
8
|
+
#
|
9
|
+
# @todo Experiment with just mixing in ActiveModel instead of this
|
10
|
+
# this reimplementation
|
8
11
|
class Base
|
9
12
|
ATTRIBUTES = [:provider, :provider_id, :lat, :lng, :metadata].freeze
|
10
13
|
ASSOCIATIONS = [].freeze
|
@@ -5,13 +5,19 @@ module Chronicle
|
|
5
5
|
module Models
|
6
6
|
class Entity < Chronicle::ETL::Models::Base
|
7
7
|
TYPE = 'entities'.freeze
|
8
|
-
ATTRIBUTES = [:title, :body, :represents, :slug, :myself, :metadata].freeze
|
8
|
+
ATTRIBUTES = [:title, :body, :provider_url, :represents, :slug, :myself, :metadata].freeze
|
9
|
+
|
10
|
+
# TODO: This desperately needs a validation system
|
9
11
|
ASSOCIATIONS = [
|
12
|
+
:involvements, # inverse of activity's `involved`
|
13
|
+
|
10
14
|
:attachments,
|
11
15
|
:abouts,
|
16
|
+
:aboutables, # inverse of above
|
12
17
|
:depicts,
|
13
18
|
:consumers,
|
14
|
-
:contains
|
19
|
+
:contains,
|
20
|
+
:containers # inverse of above
|
15
21
|
].freeze # TODO: add these to reflect Chronicle Schema
|
16
22
|
|
17
23
|
attr_accessor(*ATTRIBUTES, *ASSOCIATIONS)
|
@@ -0,0 +1,26 @@
|
|
1
|
+
require 'chronicle/etl/models/base'
|
2
|
+
|
3
|
+
module Chronicle
|
4
|
+
module ETL
|
5
|
+
module Models
|
6
|
+
# A record from an extraction with no processing or normalization applied
|
7
|
+
class Raw
|
8
|
+
TYPE = 'raw'
|
9
|
+
|
10
|
+
attr_accessor :raw_data
|
11
|
+
|
12
|
+
def initialize(raw_data)
|
13
|
+
@raw_data = raw_data
|
14
|
+
end
|
15
|
+
|
16
|
+
def to_h
|
17
|
+
@raw_data.to_h
|
18
|
+
end
|
19
|
+
|
20
|
+
def to_h_flattened
|
21
|
+
Chronicle::ETL::Utils::HashUtilities.flatten_hash(to_h)
|
22
|
+
end
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end
|
26
|
+
end
|
@@ -3,6 +3,7 @@ module Chronicle
|
|
3
3
|
module Registry
|
4
4
|
# Records details about a connector such as its provider and a description
|
5
5
|
class ConnectorRegistration
|
6
|
+
# FIXME: refactor custom accessor methods later in file
|
6
7
|
attr_accessor :identifier, :provider, :klass, :description
|
7
8
|
|
8
9
|
def initialize(klass)
|
@@ -43,6 +44,11 @@ module Chronicle
|
|
43
44
|
@provider || (built_in? ? 'chronicle' : '')
|
44
45
|
end
|
45
46
|
|
47
|
+
# TODO: allow overriding here. Maybe through self-registration process
|
48
|
+
def plugin
|
49
|
+
@provider
|
50
|
+
end
|
51
|
+
|
46
52
|
def descriptive_phrase
|
47
53
|
prefix = case phase
|
48
54
|
when :extractor
|
@@ -0,0 +1,70 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'rubygems/command'
|
3
|
+
require 'rubygems/commands/install_command'
|
4
|
+
require 'rubygems/uninstaller'
|
5
|
+
|
6
|
+
module Chronicle
|
7
|
+
module ETL
|
8
|
+
module Registry
|
9
|
+
# Responsible for managing plugins available to chronicle-etl
|
10
|
+
#
|
11
|
+
# @todo Better validation for whether a gem is actually a plugin
|
12
|
+
# @todo Add ways to load a plugin that don't require a gem on rubygems.org
|
13
|
+
module PluginRegistry
|
14
|
+
# Does this plugin exist?
|
15
|
+
def self.exists?(name)
|
16
|
+
# TODO: implement this. Could query rubygems.org or have a
|
17
|
+
# hardcoded approved list
|
18
|
+
true
|
19
|
+
end
|
20
|
+
|
21
|
+
# All versions of all plugins currently installed
|
22
|
+
def self.all_installed
|
23
|
+
# TODO: add check for chronicle-etl dependency
|
24
|
+
Gem::Specification.filter { |s| s.name.match(/^chronicle-/) && s.name != "chronicle-etl" }
|
25
|
+
end
|
26
|
+
|
27
|
+
# Latest version of each installed plugin
|
28
|
+
def self.all_installed_latest
|
29
|
+
all_installed.group_by(&:name)
|
30
|
+
.transform_values { |versions| versions.sort_by(&:version).reverse.first }
|
31
|
+
.values
|
32
|
+
end
|
33
|
+
|
34
|
+
# Activate a plugin with given name by `require`ing it
|
35
|
+
def self.activate(name)
|
36
|
+
# By default, activates the latest available version of a gem
|
37
|
+
# so don't have to run Kernel#gem separately
|
38
|
+
require "chronicle/#{name}"
|
39
|
+
rescue LoadError
|
40
|
+
raise Chronicle::ETL::PluginLoadError.new(name), "Plugin #{name} couldn't be loaded" if exists?(name)
|
41
|
+
|
42
|
+
raise Chronicle::ETL::PluginNotAvailableError.new(name), "Plugin #{name} doesn't exist"
|
43
|
+
end
|
44
|
+
|
45
|
+
# Install a plugin to local gems
|
46
|
+
def self.install(name)
|
47
|
+
gem_name = "chronicle-#{name}"
|
48
|
+
raise(Chronicle::ETL::PluginNotAvailableError.new(gem_name), "Plugin #{name} doesn't exist") unless exists?(gem_name)
|
49
|
+
|
50
|
+
Gem::DefaultUserInteraction.ui = Gem::SilentUI.new
|
51
|
+
Gem.install(gem_name)
|
52
|
+
rescue Gem::UnsatisfiableDependencyError
|
53
|
+
# TODO: we need to catch a lot more than this here
|
54
|
+
raise Chronicle::ETL::PluginNotAvailableError.new(name), "Plugin #{name} doesn't exist"
|
55
|
+
end
|
56
|
+
|
57
|
+
# Uninstall a plugin
|
58
|
+
def self.uninstall(name)
|
59
|
+
gem_name = "chronicle-#{name}"
|
60
|
+
Gem::DefaultUserInteraction.ui = Gem::SilentUI.new
|
61
|
+
uninstaller = Gem::Uninstaller.new(gem_name)
|
62
|
+
uninstaller.uninstall
|
63
|
+
rescue Gem::InstallError
|
64
|
+
# TODO: strengthen this exception handling
|
65
|
+
raise(Chronicle::ETL::PluginError.new(name), "Plugin #{name} wasn't uninstalled")
|
66
|
+
end
|
67
|
+
end
|
68
|
+
end
|
69
|
+
end
|
70
|
+
end
|
@@ -20,28 +20,40 @@ module Chronicle
|
|
20
20
|
end
|
21
21
|
end
|
22
22
|
|
23
|
-
def
|
24
|
-
|
25
|
-
Gem.install(gem_name)
|
23
|
+
def register connector
|
24
|
+
connectors << connector
|
26
25
|
end
|
27
26
|
|
28
|
-
def
|
27
|
+
def connectors
|
29
28
|
@connectors ||= []
|
30
|
-
@connectors << connector
|
31
29
|
end
|
32
30
|
|
33
31
|
def find_by_phase_and_identifier(phase, identifier)
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
32
|
+
# Simple case: built in connector
|
33
|
+
connector = connectors.find { |c| c.phase == phase && c.identifier == identifier }
|
34
|
+
return connector if connector
|
35
|
+
|
36
|
+
# if not available in built-in connectors, try to activate a
|
37
|
+
# relevant plugin and try again
|
38
|
+
if identifier.include?(":")
|
39
|
+
plugin, name = identifier.split(":")
|
40
|
+
else
|
41
|
+
# This case handles the case where the identifier is a
|
42
|
+
# shorthand (ie `imessage`) because there's only one default
|
43
|
+
# connector.
|
44
|
+
plugin = identifier
|
39
45
|
end
|
40
|
-
connector || raise(ConnectorNotAvailableError.new("Connector '#{identifier}' not found"))
|
41
|
-
end
|
42
46
|
|
43
|
-
|
44
|
-
|
47
|
+
PluginRegistry.activate(plugin)
|
48
|
+
|
49
|
+
candidates = connectors.select { |c| c.phase == phase && c.plugin == plugin }
|
50
|
+
# if no name given, just use first connector with right phase/plugin
|
51
|
+
# TODO: set up a property for connectors to specify that they're the
|
52
|
+
# default connector for the plugin
|
53
|
+
candidates = candidates.select { |c| c.identifier == name } if name
|
54
|
+
connector = candidates.first
|
55
|
+
|
56
|
+
connector || raise(ConnectorNotAvailableError, "Connector '#{identifier}' not found")
|
45
57
|
end
|
46
58
|
end
|
47
59
|
end
|
@@ -50,3 +62,4 @@ end
|
|
50
62
|
|
51
63
|
require_relative 'self_registering'
|
52
64
|
require_relative 'connector_registration'
|
65
|
+
require_relative 'plugin_registry'
|
data/lib/chronicle/etl/runner.rb
CHANGED
@@ -8,19 +8,41 @@ class Chronicle::ETL::Runner
|
|
8
8
|
end
|
9
9
|
|
10
10
|
def run!
|
11
|
-
|
12
|
-
|
11
|
+
validate_job
|
12
|
+
instantiate_connectors
|
13
|
+
prepare_job
|
14
|
+
prepare_ui
|
15
|
+
run_extraction
|
16
|
+
finish_job
|
17
|
+
end
|
18
|
+
|
19
|
+
private
|
20
|
+
|
21
|
+
def validate_job
|
22
|
+
@job.job_definition.validate!
|
23
|
+
end
|
13
24
|
|
25
|
+
def instantiate_connectors
|
26
|
+
@extractor = @job.instantiate_extractor
|
27
|
+
@loader = @job.instantiate_loader
|
28
|
+
end
|
29
|
+
|
30
|
+
def prepare_job
|
31
|
+
Chronicle::ETL::Logger.info(tty_log_job_start)
|
14
32
|
@job_logger.start
|
15
|
-
loader.start
|
33
|
+
@loader.start
|
34
|
+
@extractor.prepare
|
35
|
+
end
|
16
36
|
|
17
|
-
|
18
|
-
total = extractor.results_count
|
37
|
+
def prepare_ui
|
38
|
+
total = @extractor.results_count
|
19
39
|
@progress_bar = Chronicle::ETL::Utils::ProgressBar.new(title: 'Running job', total: total)
|
20
40
|
Chronicle::ETL::Logger.attach_to_progress_bar(@progress_bar)
|
41
|
+
end
|
21
42
|
|
22
|
-
|
23
|
-
|
43
|
+
# TODO: refactor this further
|
44
|
+
def run_extraction
|
45
|
+
@extractor.extract do |extraction|
|
24
46
|
unless extraction.is_a?(Chronicle::ETL::Extraction)
|
25
47
|
raise Chronicle::ETL::RunnerTypeError, "Extracted should be a Chronicle::ETL::Extraction"
|
26
48
|
end
|
@@ -28,14 +50,10 @@ class Chronicle::ETL::Runner
|
|
28
50
|
transformer = @job.instantiate_transformer(extraction)
|
29
51
|
record = transformer.transform
|
30
52
|
|
31
|
-
unless record.is_a?(Chronicle::ETL::Models::Base)
|
32
|
-
raise Chronicle::ETL::RunnerTypeError, "Transformed data should be a type of Chronicle::ETL::Models"
|
33
|
-
end
|
34
|
-
|
35
53
|
Chronicle::ETL::Logger.info(tty_log_transformation(transformer))
|
36
54
|
@job_logger.log_transformation(transformer)
|
37
55
|
|
38
|
-
loader.load(record) unless @job.dry_run?
|
56
|
+
@loader.load(record) unless @job.dry_run?
|
39
57
|
rescue Chronicle::ETL::TransformationError => e
|
40
58
|
Chronicle::ETL::Logger.error(tty_log_transformation_failure(e))
|
41
59
|
ensure
|
@@ -43,22 +61,22 @@ class Chronicle::ETL::Runner
|
|
43
61
|
end
|
44
62
|
|
45
63
|
@progress_bar.finish
|
46
|
-
loader.finish
|
64
|
+
@loader.finish
|
47
65
|
@job_logger.finish
|
48
66
|
rescue Interrupt
|
49
67
|
Chronicle::ETL::Logger.error("\n#{'Job interrupted'.red}")
|
50
68
|
@job_logger.error
|
51
69
|
rescue StandardError => e
|
52
70
|
raise e
|
53
|
-
|
71
|
+
end
|
72
|
+
|
73
|
+
def finish_job
|
54
74
|
@job_logger.save
|
55
|
-
@progress_bar
|
75
|
+
@progress_bar&.finish
|
56
76
|
Chronicle::ETL::Logger.detach_from_progress_bar
|
57
77
|
Chronicle::ETL::Logger.info(tty_log_completion)
|
58
78
|
end
|
59
79
|
|
60
|
-
private
|
61
|
-
|
62
80
|
def tty_log_job_start
|
63
81
|
output = "Beginning job "
|
64
82
|
output += "'#{@job.name}'".bold if @job.name
|
@@ -1,6 +1,12 @@
|
|
1
1
|
module Chronicle
|
2
2
|
module ETL
|
3
3
|
class JSONAPISerializer < Chronicle::ETL::Serializer
|
4
|
+
def initialize(*args)
|
5
|
+
super
|
6
|
+
|
7
|
+
raise(SerializationError, "Record must be a subclass of Chronicle::ETL::Model::Base") unless @record.is_a?(Chronicle::ETL::Models::Base)
|
8
|
+
end
|
9
|
+
|
4
10
|
def serializable_hash
|
5
11
|
@record
|
6
12
|
.identifier_hash
|
@@ -19,20 +19,14 @@ module Chronicle
|
|
19
19
|
r.description = 'an image file'
|
20
20
|
end
|
21
21
|
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
}.freeze
|
31
|
-
|
32
|
-
def initialize(*args)
|
33
|
-
super(*args)
|
34
|
-
@options = @options.reverse_merge(DEFAULT_OPTIONS)
|
35
|
-
end
|
22
|
+
setting :timestamp_strategy, default: 'file_mtime'
|
23
|
+
setting :id_strategy, default: 'file_hash'
|
24
|
+
setting :verb, default: 'photographed'
|
25
|
+
# EXIF tags often don't have timezones
|
26
|
+
setting :timezone_default, default: 'Eastern Time (US & Canada)'
|
27
|
+
setting :include_image_data, default: true
|
28
|
+
setting :actor
|
29
|
+
setting :involved
|
36
30
|
|
37
31
|
def transform
|
38
32
|
# FIXME: set @filename; use block for reading file when necessary
|
@@ -48,7 +42,7 @@ module Chronicle
|
|
48
42
|
|
49
43
|
def id
|
50
44
|
@id ||= begin
|
51
|
-
id = build_with_strategy(field: :id, strategy: @
|
45
|
+
id = build_with_strategy(field: :id, strategy: @config.id_strategy)
|
52
46
|
raise UntransformableRecordError.new("Could not build id", transformation: self) unless id
|
53
47
|
|
54
48
|
id
|
@@ -57,7 +51,7 @@ module Chronicle
|
|
57
51
|
|
58
52
|
def timestamp
|
59
53
|
@timestamp ||= begin
|
60
|
-
ts = build_with_strategy(field: :timestamp, strategy: @
|
54
|
+
ts = build_with_strategy(field: :timestamp, strategy: @config.timestamp_strategy)
|
61
55
|
raise UntransformableRecordError.new("Could not build timestamp", transformation: self) unless ts
|
62
56
|
|
63
57
|
ts
|
@@ -68,8 +62,8 @@ module Chronicle
|
|
68
62
|
|
69
63
|
def build_created(file)
|
70
64
|
record = ::Chronicle::ETL::Models::Activity.new
|
71
|
-
record.verb = @
|
72
|
-
record.provider = @
|
65
|
+
record.verb = @config.verb
|
66
|
+
record.provider = @config.provider
|
73
67
|
record.provider_id = id
|
74
68
|
record.end_at = timestamp
|
75
69
|
record.dedupe_on = [[:provider_id, :verb, :provider]]
|
@@ -84,24 +78,24 @@ module Chronicle
|
|
84
78
|
def build_actor
|
85
79
|
actor = ::Chronicle::ETL::Models::Entity.new
|
86
80
|
actor.represents = 'identity'
|
87
|
-
actor.provider = @
|
88
|
-
actor.slug = @
|
81
|
+
actor.provider = @config.actor[:provider]
|
82
|
+
actor.slug = @config.actor[:slug]
|
89
83
|
actor.dedupe_on = [[:provider, :slug, :represents]]
|
90
84
|
actor
|
91
85
|
end
|
92
86
|
|
93
87
|
def build_image
|
94
88
|
image = ::Chronicle::ETL::Models::Entity.new
|
95
|
-
image.represents = @
|
89
|
+
image.represents = @config.involved[:represents]
|
96
90
|
image.title = build_title
|
97
91
|
image.body = exif['Description']
|
98
|
-
image.provider = @
|
92
|
+
image.provider = @config.involved[:provider]
|
99
93
|
image.provider_id = id
|
100
94
|
image.assign_attributes(build_gps)
|
101
95
|
image.dedupe_on = [[:provider, :provider_id, :represents]]
|
102
96
|
|
103
|
-
if @
|
104
|
-
ocr_text = build_with_strategy(field: :ocr, strategy: @
|
97
|
+
if @config.ocr_strategy
|
98
|
+
ocr_text = build_with_strategy(field: :ocr, strategy: @config.ocr_strategy)
|
105
99
|
image.metadata[:ocr_text] = ocr_text if ocr_text
|
106
100
|
end
|
107
101
|
|
@@ -111,7 +105,7 @@ module Chronicle
|
|
111
105
|
image.depicts = build_people_depicted(names)
|
112
106
|
image.abouts = build_keywords(tags)
|
113
107
|
|
114
|
-
if @
|
108
|
+
if @config.include_image_data
|
115
109
|
attachment = ::Chronicle::ETL::Models::Attachment.new
|
116
110
|
attachment.data = build_image_data
|
117
111
|
image.attachments = [attachment]
|
@@ -124,7 +118,7 @@ module Chronicle
|
|
124
118
|
topics.map do |topic|
|
125
119
|
t = ::Chronicle::ETL::Models::Entity.new
|
126
120
|
t.represents = 'topic'
|
127
|
-
t.provider = @
|
121
|
+
t.provider = @config.involved[:provider]
|
128
122
|
t.title = topic
|
129
123
|
t.slug = topic.parameterize
|
130
124
|
t.dedupe_on = [[:provider, :represents, :slug]]
|
@@ -136,7 +130,7 @@ module Chronicle
|
|
136
130
|
names.map do |name|
|
137
131
|
identity = ::Chronicle::ETL::Models::Entity.new
|
138
132
|
identity.represents = 'identity'
|
139
|
-
identity.provider = @
|
133
|
+
identity.provider = @config.involved[:provider]
|
140
134
|
identity.slug = name.parameterize
|
141
135
|
identity.title = name
|
142
136
|
identity.dedupe_on = [[:provider, :represents, :slug]]
|
@@ -199,7 +193,7 @@ module Chronicle
|
|
199
193
|
elsif false
|
200
194
|
# TODO: support option of using GPS coordinates to determine timezone
|
201
195
|
else
|
202
|
-
zone = ActiveSupport::TimeZone.new(@
|
196
|
+
zone = ActiveSupport::TimeZone.new(@config.timezone_default)
|
203
197
|
timestamp = zone.parse(timestamp.asctime)
|
204
198
|
end
|
205
199
|
|
@@ -3,14 +3,15 @@ module Chronicle
|
|
3
3
|
# Abstract class representing an Transformer for an ETL job
|
4
4
|
class Transformer
|
5
5
|
extend Chronicle::ETL::Registry::SelfRegistering
|
6
|
+
include Chronicle::ETL::Configurable
|
6
7
|
|
7
8
|
# Construct a new instance of this transformer. Options are passed in from a Runner
|
8
9
|
# == Parameters:
|
9
10
|
# options::
|
10
11
|
# Options for configuring this Transformer
|
11
|
-
def initialize(options = {}
|
12
|
-
@options = options
|
12
|
+
def initialize(extraction, options = {})
|
13
13
|
@extraction = extraction
|
14
|
+
apply_options(options)
|
14
15
|
end
|
15
16
|
|
16
17
|
# @abstract Subclass is expected to implement #transform
|
data/lib/chronicle/etl.rb
CHANGED
@@ -1,24 +1,32 @@
|
|
1
1
|
require_relative 'etl/registry/registry'
|
2
2
|
require_relative 'etl/config'
|
3
|
+
require_relative 'etl/configurable'
|
3
4
|
require_relative 'etl/exceptions'
|
4
5
|
require_relative 'etl/extraction'
|
5
|
-
require_relative 'etl/extractors/extractor'
|
6
6
|
require_relative 'etl/job_definition'
|
7
7
|
require_relative 'etl/job_log'
|
8
8
|
require_relative 'etl/job_logger'
|
9
9
|
require_relative 'etl/job'
|
10
|
-
require_relative 'etl/loaders/loader'
|
11
10
|
require_relative 'etl/logger'
|
12
11
|
require_relative 'etl/models/activity'
|
13
12
|
require_relative 'etl/models/attachment'
|
14
13
|
require_relative 'etl/models/base'
|
14
|
+
require_relative 'etl/models/raw'
|
15
15
|
require_relative 'etl/models/entity'
|
16
|
-
require_relative 'etl/models/generic'
|
17
16
|
require_relative 'etl/runner'
|
18
17
|
require_relative 'etl/serializers/serializer'
|
19
|
-
require_relative 'etl/transformers/transformer'
|
20
18
|
require_relative 'etl/utils/binary_attachments'
|
21
19
|
require_relative 'etl/utils/hash_utilities'
|
22
20
|
require_relative 'etl/utils/text_recognition'
|
23
21
|
require_relative 'etl/utils/progress_bar'
|
24
22
|
require_relative 'etl/version'
|
23
|
+
|
24
|
+
require_relative 'etl/extractors/extractor'
|
25
|
+
require_relative 'etl/loaders/loader'
|
26
|
+
require_relative 'etl/transformers/transformer'
|
27
|
+
|
28
|
+
begin
|
29
|
+
require 'pry'
|
30
|
+
rescue LoadError
|
31
|
+
# Pry not available
|
32
|
+
end
|