chronicle-etl 0.2.4 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +3 -0
- data/.rubocop.yml +3 -0
- data/README.md +20 -13
- data/chronicle-etl.gemspec +11 -8
- data/lib/chronicle/etl/cli/connectors.rb +19 -7
- data/lib/chronicle/etl/cli/jobs.rb +24 -18
- data/lib/chronicle/etl/cli/main.rb +10 -2
- data/lib/chronicle/etl/config.rb +1 -1
- data/lib/chronicle/etl/exceptions.rb +12 -1
- data/lib/chronicle/etl/extraction.rb +12 -0
- data/lib/chronicle/etl/extractors/csv_extractor.rb +43 -36
- data/lib/chronicle/etl/extractors/extractor.rb +9 -1
- data/lib/chronicle/etl/extractors/file_extractor.rb +15 -33
- data/lib/chronicle/etl/extractors/helpers/filesystem_reader.rb +104 -0
- data/lib/chronicle/etl/extractors/json_extractor.rb +45 -0
- data/lib/chronicle/etl/extractors/stdin_extractor.rb +6 -1
- data/lib/chronicle/etl/job.rb +30 -29
- data/lib/chronicle/etl/job_definition.rb +45 -7
- data/lib/chronicle/etl/job_log.rb +10 -0
- data/lib/chronicle/etl/job_logger.rb +23 -20
- data/lib/chronicle/etl/loaders/csv_loader.rb +4 -0
- data/lib/chronicle/etl/loaders/loader.rb +1 -1
- data/lib/chronicle/etl/loaders/rest_loader.rb +5 -1
- data/lib/chronicle/etl/loaders/stdout_loader.rb +6 -1
- data/lib/chronicle/etl/loaders/table_loader.rb +57 -7
- data/lib/chronicle/etl/logger.rb +48 -0
- data/lib/chronicle/etl/models/attachment.rb +14 -0
- data/lib/chronicle/etl/models/base.rb +23 -7
- data/lib/chronicle/etl/models/entity.rb +9 -3
- data/lib/chronicle/etl/registry/connector_registration.rb +61 -0
- data/lib/chronicle/etl/registry/registry.rb +52 -0
- data/lib/chronicle/etl/registry/self_registering.rb +25 -0
- data/lib/chronicle/etl/runner.rb +57 -7
- data/lib/chronicle/etl/serializers/jsonapi_serializer.rb +25 -0
- data/lib/chronicle/etl/serializers/serializer.rb +27 -0
- data/lib/chronicle/etl/transformers/image_file_transformer.rb +253 -0
- data/lib/chronicle/etl/transformers/null_transformer.rb +10 -1
- data/lib/chronicle/etl/transformers/transformer.rb +39 -9
- data/lib/chronicle/etl/utils/binary_attachments.rb +21 -0
- data/lib/chronicle/etl/utils/progress_bar.rb +3 -1
- data/lib/chronicle/etl/utils/text_recognition.rb +15 -0
- data/lib/chronicle/etl/version.rb +1 -1
- data/lib/chronicle/etl.rb +7 -2
- metadata +96 -44
- data/Gemfile.lock +0 -91
- data/lib/chronicle/etl/catalog.rb +0 -108
- data/lib/chronicle/etl/utils/jsonapi.rb +0 -28
@@ -0,0 +1,52 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
|
3
|
+
module Chronicle
  module ETL
    # A module-level registry of the connector classes available for ETL jobs.
    # Connectors are added with .register and looked up by phase and identifier.
    module Registry
      PHASES = %i[extractor transformer loader].freeze

      class << self
        attr_accessor :connectors

        # Loads every connector that can be discovered outside of this gem.
        def load_all!
          load_connectors_from_gems
        end

        # Requires each installed gem whose name starts with "chronicle",
        # mapping a gem name such as "chronicle-foo" to the path "chronicle/foo".
        # Gems that cannot be required are skipped.
        def load_connectors_from_gems
          Gem::Specification.select { |spec| spec.name.start_with?('chronicle') }.each do |gem|
            require_str = gem.name.gsub('chronicle-', 'chronicle/')
            begin
              require require_str
            rescue LoadError, StandardError
              # LoadError is a ScriptError, so a `rescue` modifier (which only
              # traps StandardError) never caught it. StandardError stays in
              # the list so failures that used to be ignored still are.
              next
            end
          end
        end

        # Installs the gem named "chronicle-<name>" through RubyGems.
        #
        # @param name [String] connector name without the "chronicle-" prefix
        def install_connector name
          gem_name = "chronicle-#{name}"
          Gem.install(gem_name)
        end

        # Adds a connector registration to the registry.
        #
        # @param connector [Object] registration responding to #phase and #identifier
        # @return [Array] the list of registered connectors
        def register connector
          @connectors ||= []
          @connectors << connector
        end

        # Finds a connector for the given phase and identifier, loading
        # connectors from installed gems only when none is registered yet.
        #
        # @raise [ConnectorNotAvailableError] when no connector matches
        def find_by_phase_and_identifier(phase, identifier)
          connector = find_within_loaded_connectors(phase, identifier)
          unless connector
            # Only load external connectors (slow) if not found in built-in connectors
            load_all!
            connector = find_within_loaded_connectors(phase, identifier)
          end
          connector || raise(ConnectorNotAvailableError, "Connector '#{identifier}' not found")
        end

        # Searches the connectors registered so far; returns nil when there is
        # no match, including when nothing has been registered at all.
        def find_within_loaded_connectors(phase, identifier)
          (@connectors || []).find { |c| c.phase == phase && c.identifier == identifier }
        end
      end
    end
  end
end
|
50
|
+
|
51
|
+
require_relative 'self_registering'
|
52
|
+
require_relative 'connector_registration'
|
@@ -0,0 +1,25 @@
|
|
1
|
+
require 'forwardable'
|
2
|
+
|
3
|
+
module Chronicle
  module ETL
    module Registry
      # Mixin that lets a connector class announce itself to
      # Chronicle::ETL::Registry.
      module SelfRegistering
        extend Forwardable

        attr_accessor :connector_registration

        # Forward the descriptive fields to the stored registration.
        %i[description provider identifier].each do |field|
          def_delegator :@connector_registration, field
        end

        # Builds (once) a ConnectorRegistration for this connector, yields it
        # to the optional block for configuration, and registers it with the
        # Registry.
        def register_connector
          registration = (@connector_registration ||= ::Chronicle::ETL::Registry::ConnectorRegistration.new(self))
          yield registration if block_given?
          ::Chronicle::ETL::Registry.register(registration)
        end
      end
    end
  end
end
|
data/lib/chronicle/etl/runner.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
require 'colorize'
|
2
|
+
require 'chronic_duration'
|
2
3
|
|
3
4
|
class Chronicle::ETL::Runner
|
4
5
|
def initialize(job)
|
@@ -14,24 +15,73 @@ class Chronicle::ETL::Runner
|
|
14
15
|
loader.start
|
15
16
|
|
16
17
|
total = extractor.results_count
|
17
|
-
progress_bar = Chronicle::ETL::Utils::ProgressBar.new(title: 'Running job', total: total)
|
18
|
+
@progress_bar = Chronicle::ETL::Utils::ProgressBar.new(title: 'Running job', total: total)
|
19
|
+
Chronicle::ETL::Logger.attach_to_progress_bar(@progress_bar)
|
18
20
|
|
19
|
-
|
20
|
-
|
21
|
+
Chronicle::ETL::Logger.info(tty_log_job_start)
|
22
|
+
extractor.extract do |extraction|
|
23
|
+
unless extraction.is_a?(Chronicle::ETL::Extraction)
|
24
|
+
raise Chronicle::ETL::RunnerTypeError, "Extracted should be a Chronicle::ETL::Extraction"
|
25
|
+
end
|
26
|
+
|
27
|
+
transformer = @job.instantiate_transformer(extraction)
|
21
28
|
record = transformer.transform
|
22
29
|
|
23
30
|
unless record.is_a?(Chronicle::ETL::Models::Base)
|
24
|
-
raise Chronicle::ETL::
|
31
|
+
raise Chronicle::ETL::RunnerTypeError, "Transformed data should be a type of Chronicle::ETL::Models"
|
25
32
|
end
|
26
33
|
|
34
|
+
Chronicle::ETL::Logger.info(tty_log_transformation(transformer))
|
27
35
|
@job_logger.log_transformation(transformer)
|
28
|
-
|
29
|
-
|
36
|
+
|
37
|
+
loader.load(record) unless @job.dry_run?
|
38
|
+
rescue Chronicle::ETL::TransformationError => e
|
39
|
+
Chronicle::ETL::Logger.error(tty_log_transformation_failure(e))
|
40
|
+
ensure
|
41
|
+
@progress_bar.increment
|
30
42
|
end
|
31
43
|
|
32
|
-
progress_bar.finish
|
44
|
+
@progress_bar.finish
|
33
45
|
loader.finish
|
34
46
|
@job_logger.finish
|
47
|
+
rescue Interrupt
|
48
|
+
Chronicle::ETL::Logger.error("\n#{'Job interrupted'.red}")
|
49
|
+
@job_logger.error
|
50
|
+
rescue StandardError => e
|
51
|
+
raise e
|
52
|
+
ensure
|
35
53
|
@job_logger.save
|
54
|
+
@progress_bar.finish
|
55
|
+
Chronicle::ETL::Logger.detach_from_progress_bar
|
56
|
+
Chronicle::ETL::Logger.info(tty_log_completion)
|
57
|
+
end
|
58
|
+
|
59
|
+
private
|
60
|
+
|
61
|
+
# Builds the message logged when a job starts: "Beginning job " followed by
# the bolded, single-quoted job name when the job has one.
def tty_log_job_start
  return "Beginning job " unless @job.name

  "Beginning job " + "'#{@job.name}'".bold
end
|
66
|
+
|
67
|
+
# Builds the message logged after a successful transformation: a green
# check mark followed by the transformer's string form.
def tty_log_transformation(transformer)
  "#{" ✓".green} #{transformer}"
end
|
71
|
+
|
72
|
+
# Builds the message logged when a transformation fails: a red cross, the
# transformation attached to the exception, and the exception message.
def tty_log_transformation_failure(exception)
  "#{" ✖".red} Failed to build #{exception.transformation}. #{exception.message}"
end
|
76
|
+
|
77
|
+
# Builds the multi-line summary logged when a job ends: job name and
# duration when available, then status, number of processed records and,
# when known, the highest record timestamp.
def tty_log_completion
  status = @job_logger.success ? 'Success' : 'Failed'

  segments = ["\nCompleted job "]
  segments.push("'#{@job.name}'".bold) if @job.name
  segments.push(" in #{ChronicDuration.output(@job_logger.duration)}") if @job_logger.duration
  segments.push("\n Status:\t".light_black, status)
  segments.push("\n Completed:\t".light_black, @job_logger.job_log.num_records_processed.to_s)
  if @job_logger.job_log.highest_timestamp
    segments.push("\n Latest:\t".light_black, @job_logger.job_log.highest_timestamp.iso8601.to_s)
  end
  segments.join
end
|
37
87
|
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
module Chronicle
  module ETL
    # Serializes a record into a hash made of the record's identifier hash,
    # an :attributes entry, a :relationships entry and the record's meta hash.
    class JSONAPISerializer < Chronicle::ETL::Serializer
      # @return [Hash] the serialized record
      def serializable_hash
        document = @record.identifier_hash
        document = document.merge(attributes: @record.attributes)
        document = document.merge(relationships: build_associations)
        document.merge(@record.meta_hash)
      end

      # Serializes every association of the record with this serializer.
      # Array associations become arrays of serialized hashes; each result is
      # wrapped in a { data: ... } hash.
      def build_associations
        serialize_one = ->(record) { JSONAPISerializer.new(record).serializable_hash }

        @record.associations.transform_values do |value|
          { data: value.is_a?(Array) ? value.map(&serialize_one) : serialize_one.call(value) }
        end
      end
    end
  end
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
module Chronicle
  module ETL
    # Abstract base class for serializers that turn an ETL record into a hash.
    class Serializer
      class << self
        # Convenience wrapper: builds a serializer for the record and returns
        # its serialized hash.
        def serialize(record)
          new(record).serializable_hash
        end
      end

      # Construct a new instance of this serializer.
      # == Parameters:
      # record::
      #   The record to serialize
      # options::
      #   Options for configuring this Serializer
      def initialize(record, options = {})
        @record, @options = record, options
      end

      # Serialize a record as a hash. Subclasses are expected to override this.
      def serializable_hash
        raise NotImplementedError
      end
    end
  end
end
|
26
|
+
|
27
|
+
require_relative 'jsonapi_serializer'
|
@@ -0,0 +1,253 @@
|
|
1
|
+
require 'mini_exiftool'
|
2
|
+
require 'active_support'
|
3
|
+
require 'active_support/core_ext/object'
|
4
|
+
require 'active_support/core_ext/time'
|
5
|
+
require 'active_support/core_ext/hash/reverse_merge'
|
6
|
+
require 'active_support/core_ext/string/inflections'
|
7
|
+
|
8
|
+
module Chronicle
|
9
|
+
module ETL
|
10
|
+
# Transform a JPEG or other image file into a record.
|
11
|
+
# By default, file mtime and a hash of the file content is used to build
|
12
|
+
# the timestamp and ID respectively but other options are available (such
|
13
|
+
# as reading EXIF tags or extended attributes from the filesystem).
|
14
|
+
#
|
15
|
+
# TODO: This should be extracted into its own plugin
|
16
|
+
class ImageFileTransformer < Chronicle::ETL::Transformer
|
17
|
+
register_connector do |r|
|
18
|
+
r.identifier = 'image-file'
|
19
|
+
r.description = 'an image file'
|
20
|
+
end
|
21
|
+
|
22
|
+
DEFAULT_OPTIONS = {
|
23
|
+
timestamp_strategy: 'file_mtime',
|
24
|
+
id_strategy: 'file_hash',
|
25
|
+
verb: 'photographed',
|
26
|
+
|
27
|
+
# EXIF tags often don't have timezones
|
28
|
+
timezone_default: 'Eastern Time (US & Canada)',
|
29
|
+
include_image_data: true
|
30
|
+
}.freeze
|
31
|
+
|
32
|
+
# Passes all arguments to the parent constructor, then fills in any option
# the caller did not supply from DEFAULT_OPTIONS.
def initialize(*args)
  super
  supplied = @options
  @options = supplied.reverse_merge(DEFAULT_OPTIONS)
end
|
36
|
+
|
37
|
+
# Opens the file named by the extraction's data and builds a record from it.
#
# @return the record produced by #build_created
def transform
  # FIXME: set @filename; use block for reading file when necessary
  @file = File.open(@extraction.data)
  build_created(@file)
ensure
  # Close the handle even when building the record raises, so a failed
  # transformation does not leak a file descriptor. @file itself is kept
  # because #friendly_identifier reads its path.
  @file&.close
end
|
44
|
+
|
45
|
+
# Human-readable identifier for this transformation: the path of the file
# being transformed.
def friendly_identifier
  @file.to_path
end
|
48
|
+
|
49
|
+
# Memoized id of the record, built with the configured :id_strategy.
#
# Raises UntransformableRecordError when no strategy yields an id.
def id
  return @id if @id

  built = build_with_strategy(field: :id, strategy: @options[:id_strategy])
  raise UntransformableRecordError.new("Could not build id", transformation: self) unless built

  @id = built
end
|
57
|
+
|
58
|
+
# Memoized timestamp of the record, built with the configured
# :timestamp_strategy.
#
# Raises UntransformableRecordError when no strategy yields a timestamp.
def timestamp
  return @timestamp if @timestamp

  built = build_with_strategy(field: :timestamp, strategy: @options[:timestamp_strategy])
  raise UntransformableRecordError.new("Could not build timestamp", transformation: self) unless built

  @timestamp = built
end
|
66
|
+
|
67
|
+
private
|
68
|
+
|
69
|
+
# Builds the Activity record for the image: verb and provider come from the
# options, id and end time from the configured strategies, plus the image
# entity, the actor and any GPS attributes.
def build_created(file)
  ::Chronicle::ETL::Models::Activity.new.tap do |activity|
    activity.verb = @options[:verb]
    activity.provider = @options[:provider]
    activity.provider_id = id
    activity.end_at = timestamp
    activity.dedupe_on = [[:provider_id, :verb, :provider]]

    activity.involved = build_image
    activity.actor = build_actor

    activity.assign_attributes(build_gps)
  end
end
|
83
|
+
|
84
|
+
# Builds the identity Entity acting in the activity from the :actor options
# (its :provider and :slug).
def build_actor
  ::Chronicle::ETL::Models::Entity.new.tap do |identity|
    identity.represents = 'identity'
    identity.provider = @options[:actor][:provider]
    identity.slug = @options[:actor][:slug]
    identity.dedupe_on = [[:provider, :slug, :represents]]
  end
end
|
92
|
+
|
93
|
+
# Builds the Entity describing the image itself: title, EXIF description,
# GPS attributes, optional OCR text, depicted people, keyword topics and,
# when :include_image_data is set, the file content as an attachment.
def build_image
  image = ::Chronicle::ETL::Models::Entity.new
  involved = @options[:involved]

  image.represents = involved[:represents]
  image.title = build_title
  image.body = exif['Description']
  image.provider = involved[:provider]
  image.provider_id = id
  image.assign_attributes(build_gps)
  image.dedupe_on = [[:provider, :provider_id, :represents]]

  ocr_strategy = @options[:ocr_strategy]
  if ocr_strategy
    recognized = build_with_strategy(field: :ocr, strategy: ocr_strategy)
    image.metadata[:ocr_text] = recognized if recognized
  end

  # Names of tagged people are removed from the keyword list.
  people = extract_people_depicted
  keywords = extract_keywords(people)

  image.depicts = build_people_depicted(people)
  image.abouts = build_keywords(keywords)

  if @options[:include_image_data]
    image.attachments = [
      ::Chronicle::ETL::Models::Attachment.new.tap { |attachment| attachment.data = build_image_data }
    ]
  end

  image
end
|
122
|
+
|
123
|
+
# Turns each keyword into a 'topic' Entity whose slug is the parameterized
# keyword.
def build_keywords(topics)
  topics.map do |keyword|
    ::Chronicle::ETL::Models::Entity.new.tap do |entity|
      entity.represents = 'topic'
      entity.provider = @options[:involved][:provider]
      entity.title = keyword
      entity.slug = keyword.parameterize
      entity.dedupe_on = [[:provider, :represents, :slug]]
    end
  end
end
|
134
|
+
|
135
|
+
# Turns each person's name into an 'identity' Entity whose slug is the
# parameterized name.
def build_people_depicted(names)
  names.map do |person|
    ::Chronicle::ETL::Models::Entity.new.tap do |entity|
      entity.represents = 'identity'
      entity.provider = @options[:involved][:provider]
      entity.slug = person.parameterize
      entity.title = person
      entity.dedupe_on = [[:provider, :represents, :slug]]
    end
  end
end
|
146
|
+
|
147
|
+
# Returns the GPS attributes found in the EXIF data, or an empty hash when
# the image has no GPSLatitude tag.
def build_gps
  latitude = exif['GPSLatitude']
  return {} unless latitude

  { lat: latitude, lng: exif['GPSLongitude'], elevation: exif['GPSAltitude'] }
end
|
156
|
+
|
157
|
+
# Returns the image file's content as produced by
# BinaryAttachments.filename_to_base64 for the file's path.
def build_image_data
  image_path = @file.path
  ::Chronicle::ETL::Utils::BinaryAttachments.filename_to_base64(filename: image_path)
end
|
160
|
+
|
161
|
+
# Title of the image: the base name of the file being transformed.
def build_title
  File.basename(@file.path)
end
|
164
|
+
|
165
|
+
# Tries each named strategy in order by calling build_<field>_using_<strategy>
# and returns the first truthy result, or nil when none produces one.
# Strategies after the first successful one are not invoked.
#
# field::    name of the field being built (e.g. :id)
# strategy:: a strategy name or a (possibly nested) list of names; nils are ignored
def build_with_strategy(field:, strategy: [])
  [strategy]
    .flatten
    .compact
    .lazy
    .map { |name| send(:"build_#{field}_using_#{name}") }
    .find(&:itself)
end
|
174
|
+
|
175
|
+
# Builds an id from the SHA-256 hex digest of the file's content.
def build_id_using_file_hash
  # This file never requires digest itself, so load it here.
  require 'digest'

  # Digest::SHA256.file hashes the file from disk instead of first reading
  # the whole image into a String.
  Digest::SHA256.file(@file.path).hexdigest
end
|
178
|
+
|
179
|
+
# Builds an id from the file's "com.apple.metadata:kMDItemVersion" extended
# attribute.
def build_id_using_xattr_version
  version_attribute = "com.apple.metadata:kMDItemVersion"
  load_value_from_xattr_plist(version_attribute)
end
|
182
|
+
|
183
|
+
# Builds an id from the OriginalDocumentID tag, falling back to
# DerivedFromDocumentID.
def build_id_using_xmp_document_id
  original_id = exif['OriginalDocumentID']
  return original_id if original_id

  exif['DerivedFromDocumentID']
end
|
186
|
+
|
187
|
+
# Builds a timestamp from the modification time of the file on disk.
def build_timestamp_using_file_mtime
  File.mtime(@file.path)
end
|
190
|
+
|
191
|
+
# Builds a timestamp from the EXIF DateTimeOriginal tag.
#
# Returns nil when the tag is missing. When the OffsetTimeOriginal tag is
# present its offset is applied; otherwise the wall-clock time is
# interpreted in the zone named by the :timezone_default option.
def build_timestamp_using_exif_datetimeoriginal
  # EXIF tags don't have timezone information. This is a DateTime in UTC
  timestamp = exif['DateTimeOriginal'] || return

  # Offset tags are only available in newer EXIF tags. If it exists, we
  # use it instead of UTC
  offset = exif['OffsetTimeOriginal']
  return timestamp.change(offset: offset) if offset

  # TODO: support option of using GPS coordinates to determine timezone
  zone = ActiveSupport::TimeZone.new(@options[:timezone_default])
  zone.parse(timestamp.asctime)
end
|
208
|
+
|
209
|
+
# TODO: add documentation for how to set up `macocr`
|
210
|
+
# Runs the external `macocr` command on the file and returns the text it
# prints, or nil when it prints nothing or the command is not installed.
def build_ocr_using_macocr
  # The argv form bypasses the shell, so quotes or `$(...)` in a file name
  # cannot be interpreted as shell syntax. stderr is discarded.
  output = IO.popen(['macocr', @file.path], err: File::NULL, &:read)
  output.presence
rescue Errno::ENOENT
  # Without a shell a missing binary raises instead of yielding empty output.
  nil
end
|
213
|
+
|
214
|
+
# Memoized MiniExiftool reader for the file being transformed.
def exif
  return @exif if @exif

  reader_options = {
    numerical: true,

    # EXIF timestamps don't have timezone information. MiniExifTool uses Time
    # by default which parses timestamps in local time zone. Using DateTime
    # parses dates as UTC and then we can apply a timezone offset if the optional
    # EXIF timezone offset fields are available.
    # https://github.com/janfri/mini_exiftool/issues/39#issuecomment-832587649
    timestamps: DateTime
  }
  @exif = MiniExiftool.new(@file.path, **reader_options)
end
|
227
|
+
|
228
|
+
# Figure out which faces are tagged as regions and return a list of their names
|
229
|
+
# Returns the unique region names whose matching region type is 'Face',
# or an empty list when the image has no RegionName tag.
def extract_people_depicted
  region_names = exif['RegionName']
  return [] unless region_names

  names = [region_names].flatten
  types = [exif['RegionType']].flatten

  faces = []
  names.each_with_index do |name, index|
    faces << name if types[index] == 'Face'
  end
  faces.uniq
end
|
237
|
+
|
238
|
+
# Extract image keywords from EXIF/IPTC tag and subtract out those of which are
|
239
|
+
# tagged people (determined by looking at face regions)
|
240
|
+
# Returns the image's Keywords tag as a flat list with the given people's
# names removed.
def extract_keywords(people_names = [])
  raw_keywords = exif['Keywords'] || []
  all_keywords = [raw_keywords].flatten
  all_keywords - people_names
end
|
243
|
+
|
244
|
+
# Reads an extended attribute of the file through the `xattr`, `xxd` and
# `plutil` command-line tools and returns the text of the <string> elements
# of the resulting XML, or nil when nothing could be read.
def load_value_from_xattr_plist attribute
  require 'shellwords'

  # Escape both interpolated values so that quotes or `$(...)` in a file
  # name cannot be interpreted by the shell running the pipeline.
  command = "xattr -p #{Shellwords.escape(attribute)} #{Shellwords.escape(@file.path)} " \
            '| xxd -r -p | plutil -convert xml1 -o - -- - 2>/dev/null'
  xml = `#{command}`
  # Backticks always return a String, so test for emptiness rather than nil.
  return if xml.empty?

  require 'nokogiri'
  # Parse the captured XML (this used to reference an undefined local `r`).
  value = Nokogiri::XML.parse(xml).xpath("//string").text
  value.presence
end
|
251
|
+
end
|
252
|
+
end
|
253
|
+
end
|
@@ -1,9 +1,18 @@
|
|
1
1
|
module Chronicle
|
2
2
|
module ETL
|
3
3
|
class NullTransformer < Chronicle::ETL::Transformer
|
4
|
+
register_connector do |r|
|
5
|
+
r.identifier = 'null'
|
6
|
+
r.description = 'in no way'
|
7
|
+
end
|
8
|
+
|
4
9
|
def transform
|
5
|
-
Chronicle::ETL::Models::Generic.new(@data)
|
10
|
+
Chronicle::ETL::Models::Generic.new(@extraction.data)
|
6
11
|
end
|
12
|
+
|
13
|
+
def timestamp; end
|
14
|
+
|
15
|
+
def id; end
|
7
16
|
end
|
8
17
|
end
|
9
18
|
end
|
@@ -2,16 +2,15 @@ module Chronicle
|
|
2
2
|
module ETL
|
3
3
|
# Abstract class representing a Transformer for an ETL job
|
4
4
|
class Transformer
|
5
|
-
extend Chronicle::ETL::
|
5
|
+
extend Chronicle::ETL::Registry::SelfRegistering
|
6
6
|
|
7
7
|
# Construct a new instance of this transformer. Options are passed in from a Runner
|
8
|
-
# ==
|
8
|
+
# == Parameters:
|
9
9
|
# options::
|
10
10
|
# Options for configuring this Transformer
|
11
|
-
def initialize(options = {},
|
11
|
+
def initialize(options = {}, extraction)
|
12
12
|
@options = options
|
13
|
-
@
|
14
|
-
@record = Chronicle::ETL::Models::Activity.new
|
13
|
+
@extraction = extraction
|
15
14
|
end
|
16
15
|
|
17
16
|
# @abstract Subclass is expected to implement #transform
|
@@ -19,16 +18,47 @@ module Chronicle
|
|
19
18
|
# The main entrypoint for transforming a record. Called by a Runner on each extracted record
|
20
19
|
|
21
20
|
# The domain or provider-specific id of the record this transformer is working on.
|
22
|
-
#
|
23
|
-
#
|
24
|
-
|
21
|
+
# It is useful for:
|
22
|
+
# - de-duping records that might exist in the loader's destination
|
23
|
+
# - building a cursor so an extractor doesn't have to start from the beginning of a
|
24
|
+
# source
|
25
|
+
def id
|
26
|
+
raise NotImplementedError
|
27
|
+
end
|
25
28
|
|
26
29
|
# The domain or provider-specific timestamp of the record this transformer is working on.
|
27
30
|
# Used for building a cursor so an extractor doesn't have to start from the beginning of a
|
28
31
|
# data source.
|
29
|
-
def timestamp
|
32
|
+
def timestamp
|
33
|
+
raise NotImplementedError
|
34
|
+
end
|
35
|
+
|
36
|
+
# An optional, human-readable identifier for a transformation, intended for debugging or logging.
|
37
|
+
# By default, it is just the id.
|
38
|
+
def friendly_identifier
|
39
|
+
id
|
40
|
+
end
|
41
|
+
|
42
|
+
# String form used for logging: "[<timestamp>] <identifier>".
# The timestamp is ISO 8601, or "???" when it is nil or cannot be built.
# The identifier is #friendly_identifier, or the class name when that is nil
# or cannot be built.
def to_s
  stamp =
    begin
      timestamp&.iso8601 || "???"
    rescue TransformationError, NotImplementedError
      "???"
    end

  label =
    begin
      friendly_identifier || self.class.to_s
    rescue TransformationError, NotImplementedError
      self.class.to_s
    end

  "[#{stamp}] #{label}"
end
|
30
59
|
end
|
31
60
|
end
|
32
61
|
end
|
33
62
|
|
34
63
|
require_relative 'null_transformer'
|
64
|
+
require_relative 'image_file_transformer'
|
@@ -0,0 +1,21 @@
|
|
1
|
+
require 'marcel'
|
2
|
+
require 'base64'
|
3
|
+
|
4
|
+
module Chronicle
  module ETL
    module Utils
      # Utility methods for dealing with binary files
      module BinaryAttachments
        # Encodes a file as a "data:<mimetype>;base64,<payload>" string.
        #
        # @param filename [String] path of the file to encode
        # @param mimetype [String, nil] MIME type to declare; guessed when nil
        # @return [String] the data URI
        def self.filename_to_base64(filename:, mimetype: nil)
          mimetype ||= guess_mimetype(filename: filename)

          # binread avoids newline/encoding translation of the binary content.
          "data:#{mimetype};base64,#{Base64.strict_encode64(File.binread(filename))}"
        end

        # Guesses a file's MIME type with Marcel.
        #
        # @param filename [String] path of the file to inspect
        def self.guess_mimetype(filename:)
          # Marcel is documented as taking a Pathname or IO plus an optional
          # name hint, so pass the opened file rather than the path String.
          File.open(filename, 'rb') do |io|
            Marcel::MimeType.for(io, name: File.basename(filename))
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
require 'active_support/core_ext/object/blank'
|
2
|
+
|
3
|
+
module Chronicle
  module ETL
    module Utils
      # OCR for image files
      # TODO: add other strategies and document `macocr`
      module TextRecognition
        # Runs the external `macocr` command on an image.
        #
        # @param filename [String] path of the image to read
        # @return [String, nil] the text printed by `macocr`, or nil when it
        #   prints nothing or the command is not installed
        def self.recognize_in_image(filename:)
          # The argv form bypasses the shell, so quotes or `$(...)` in a file
          # name cannot be interpreted as shell syntax. stderr is discarded.
          output = IO.popen(['macocr', filename.to_s], err: File::NULL, &:read)
          output.presence
        rescue Errno::ENOENT
          # Without a shell a missing binary raises instead of yielding
          # empty output.
          nil
        end
      end
    end
  end
end
|
data/lib/chronicle/etl.rb
CHANGED
@@ -1,19 +1,24 @@
|
|
1
|
-
require_relative 'etl/
|
1
|
+
require_relative 'etl/registry/registry'
|
2
2
|
require_relative 'etl/config'
|
3
3
|
require_relative 'etl/exceptions'
|
4
|
+
require_relative 'etl/extraction'
|
4
5
|
require_relative 'etl/extractors/extractor'
|
5
6
|
require_relative 'etl/job_definition'
|
6
7
|
require_relative 'etl/job_log'
|
7
8
|
require_relative 'etl/job_logger'
|
8
9
|
require_relative 'etl/job'
|
9
10
|
require_relative 'etl/loaders/loader'
|
11
|
+
require_relative 'etl/logger'
|
10
12
|
require_relative 'etl/models/activity'
|
13
|
+
require_relative 'etl/models/attachment'
|
11
14
|
require_relative 'etl/models/base'
|
12
15
|
require_relative 'etl/models/entity'
|
13
16
|
require_relative 'etl/models/generic'
|
14
17
|
require_relative 'etl/runner'
|
18
|
+
require_relative 'etl/serializers/serializer'
|
15
19
|
require_relative 'etl/transformers/transformer'
|
20
|
+
require_relative 'etl/utils/binary_attachments'
|
16
21
|
require_relative 'etl/utils/hash_utilities'
|
17
|
-
require_relative 'etl/utils/
|
22
|
+
require_relative 'etl/utils/text_recognition'
|
18
23
|
require_relative 'etl/utils/progress_bar'
|
19
24
|
require_relative 'etl/version'
|