chronicle-etl 0.2.4 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +3 -0
- data/.rubocop.yml +3 -0
- data/README.md +20 -13
- data/chronicle-etl.gemspec +11 -8
- data/lib/chronicle/etl/cli/connectors.rb +19 -7
- data/lib/chronicle/etl/cli/jobs.rb +24 -18
- data/lib/chronicle/etl/cli/main.rb +10 -2
- data/lib/chronicle/etl/config.rb +1 -1
- data/lib/chronicle/etl/exceptions.rb +12 -1
- data/lib/chronicle/etl/extraction.rb +12 -0
- data/lib/chronicle/etl/extractors/csv_extractor.rb +43 -36
- data/lib/chronicle/etl/extractors/extractor.rb +9 -1
- data/lib/chronicle/etl/extractors/file_extractor.rb +15 -33
- data/lib/chronicle/etl/extractors/helpers/filesystem_reader.rb +104 -0
- data/lib/chronicle/etl/extractors/json_extractor.rb +45 -0
- data/lib/chronicle/etl/extractors/stdin_extractor.rb +6 -1
- data/lib/chronicle/etl/job.rb +30 -29
- data/lib/chronicle/etl/job_definition.rb +45 -7
- data/lib/chronicle/etl/job_log.rb +10 -0
- data/lib/chronicle/etl/job_logger.rb +23 -20
- data/lib/chronicle/etl/loaders/csv_loader.rb +4 -0
- data/lib/chronicle/etl/loaders/loader.rb +1 -1
- data/lib/chronicle/etl/loaders/rest_loader.rb +5 -1
- data/lib/chronicle/etl/loaders/stdout_loader.rb +6 -1
- data/lib/chronicle/etl/loaders/table_loader.rb +57 -7
- data/lib/chronicle/etl/logger.rb +48 -0
- data/lib/chronicle/etl/models/attachment.rb +14 -0
- data/lib/chronicle/etl/models/base.rb +23 -7
- data/lib/chronicle/etl/models/entity.rb +9 -3
- data/lib/chronicle/etl/registry/connector_registration.rb +61 -0
- data/lib/chronicle/etl/registry/registry.rb +52 -0
- data/lib/chronicle/etl/registry/self_registering.rb +25 -0
- data/lib/chronicle/etl/runner.rb +57 -7
- data/lib/chronicle/etl/serializers/jsonapi_serializer.rb +25 -0
- data/lib/chronicle/etl/serializers/serializer.rb +27 -0
- data/lib/chronicle/etl/transformers/image_file_transformer.rb +253 -0
- data/lib/chronicle/etl/transformers/null_transformer.rb +10 -1
- data/lib/chronicle/etl/transformers/transformer.rb +39 -9
- data/lib/chronicle/etl/utils/binary_attachments.rb +21 -0
- data/lib/chronicle/etl/utils/progress_bar.rb +3 -1
- data/lib/chronicle/etl/utils/text_recognition.rb +15 -0
- data/lib/chronicle/etl/version.rb +1 -1
- data/lib/chronicle/etl.rb +7 -2
- metadata +96 -44
- data/Gemfile.lock +0 -91
- data/lib/chronicle/etl/catalog.rb +0 -108
- data/lib/chronicle/etl/utils/jsonapi.rb +0 -28
@@ -0,0 +1,52 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
|
3
|
+
module Chronicle
|
4
|
+
module ETL
|
5
|
+
# A singleton module that acts as a registry of connector classes available for ETL jobs
|
6
|
+
module Registry
|
7
|
+
PHASES = [:extractor, :transformer, :loader]
|
8
|
+
|
9
|
+
class << self
|
10
|
+
attr_accessor :connectors
|
11
|
+
|
12
|
+
def load_all!
|
13
|
+
load_connectors_from_gems
|
14
|
+
end
|
15
|
+
|
16
|
+
def load_connectors_from_gems
|
17
|
+
Gem::Specification.filter{|s| s.name.match(/^chronicle/) }.each do |gem|
|
18
|
+
require_str = gem.name.gsub('chronicle-', 'chronicle/')
|
19
|
+
require require_str rescue LoadError
|
20
|
+
end
|
21
|
+
end
|
22
|
+
|
23
|
+
def install_connector name
|
24
|
+
gem_name = "chronicle-#{name}"
|
25
|
+
Gem.install(gem_name)
|
26
|
+
end
|
27
|
+
|
28
|
+
def register connector
|
29
|
+
@connectors ||= []
|
30
|
+
@connectors << connector
|
31
|
+
end
|
32
|
+
|
33
|
+
def find_by_phase_and_identifier(phase, identifier)
|
34
|
+
connector = find_within_loaded_connectors(phase, identifier)
|
35
|
+
unless connector
|
36
|
+
# Only load external connectors (slow) if not found in built-in connectors
|
37
|
+
load_all!
|
38
|
+
connector = find_within_loaded_connectors(phase, identifier)
|
39
|
+
end
|
40
|
+
connector || raise(ConnectorNotAvailableError.new("Connector '#{identifier}' not found"))
|
41
|
+
end
|
42
|
+
|
43
|
+
def find_within_loaded_connectors(phase, identifier)
|
44
|
+
@connectors.find { |c| c.phase == phase && c.identifier == identifier }
|
45
|
+
end
|
46
|
+
end
|
47
|
+
end
|
48
|
+
end
|
49
|
+
end
|
50
|
+
|
51
|
+
require_relative 'self_registering'
|
52
|
+
require_relative 'connector_registration'
|
@@ -0,0 +1,25 @@
|
|
1
|
+
require 'forwardable'
|
2
|
+
|
3
|
+
module Chronicle
|
4
|
+
module ETL
|
5
|
+
module Registry
|
6
|
+
# Gives a connector class the ability to let the Chronicle::ETL::Registry
|
7
|
+
# know about itself
|
8
|
+
module SelfRegistering
|
9
|
+
extend Forwardable
|
10
|
+
|
11
|
+
attr_accessor :connector_registration
|
12
|
+
|
13
|
+
def_delegators :@connector_registration, :description, :provider, :identifier
|
14
|
+
|
15
|
+
# Creates a ConnectorRegistration for this connector's details and registers it
|
16
|
+
# into the Registry
|
17
|
+
def register_connector
|
18
|
+
@connector_registration ||= ::Chronicle::ETL::Registry::ConnectorRegistration.new(self)
|
19
|
+
yield @connector_registration if block_given?
|
20
|
+
::Chronicle::ETL::Registry.register(@connector_registration)
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end
|
data/lib/chronicle/etl/runner.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
require 'colorize'
|
2
|
+
require 'chronic_duration'
|
2
3
|
|
3
4
|
class Chronicle::ETL::Runner
|
4
5
|
def initialize(job)
|
@@ -14,24 +15,73 @@ class Chronicle::ETL::Runner
|
|
14
15
|
loader.start
|
15
16
|
|
16
17
|
total = extractor.results_count
|
17
|
-
progress_bar = Chronicle::ETL::Utils::ProgressBar.new(title: 'Running job', total: total)
|
18
|
+
@progress_bar = Chronicle::ETL::Utils::ProgressBar.new(title: 'Running job', total: total)
|
19
|
+
Chronicle::ETL::Logger.attach_to_progress_bar(@progress_bar)
|
18
20
|
|
19
|
-
|
20
|
-
|
21
|
+
Chronicle::ETL::Logger.info(tty_log_job_start)
|
22
|
+
extractor.extract do |extraction|
|
23
|
+
unless extraction.is_a?(Chronicle::ETL::Extraction)
|
24
|
+
raise Chronicle::ETL::RunnerTypeError, "Extracted should be a Chronicle::ETL::Extraction"
|
25
|
+
end
|
26
|
+
|
27
|
+
transformer = @job.instantiate_transformer(extraction)
|
21
28
|
record = transformer.transform
|
22
29
|
|
23
30
|
unless record.is_a?(Chronicle::ETL::Models::Base)
|
24
|
-
raise Chronicle::ETL::
|
31
|
+
raise Chronicle::ETL::RunnerTypeError, "Transformed data should be a type of Chronicle::ETL::Models"
|
25
32
|
end
|
26
33
|
|
34
|
+
Chronicle::ETL::Logger.info(tty_log_transformation(transformer))
|
27
35
|
@job_logger.log_transformation(transformer)
|
28
|
-
|
29
|
-
|
36
|
+
|
37
|
+
loader.load(record) unless @job.dry_run?
|
38
|
+
rescue Chronicle::ETL::TransformationError => e
|
39
|
+
Chronicle::ETL::Logger.error(tty_log_transformation_failure(e))
|
40
|
+
ensure
|
41
|
+
@progress_bar.increment
|
30
42
|
end
|
31
43
|
|
32
|
-
progress_bar.finish
|
44
|
+
@progress_bar.finish
|
33
45
|
loader.finish
|
34
46
|
@job_logger.finish
|
47
|
+
rescue Interrupt
|
48
|
+
Chronicle::ETL::Logger.error("\n#{'Job interrupted'.red}")
|
49
|
+
@job_logger.error
|
50
|
+
rescue StandardError => e
|
51
|
+
raise e
|
52
|
+
ensure
|
35
53
|
@job_logger.save
|
54
|
+
@progress_bar.finish
|
55
|
+
Chronicle::ETL::Logger.detach_from_progress_bar
|
56
|
+
Chronicle::ETL::Logger.info(tty_log_completion)
|
57
|
+
end
|
58
|
+
|
59
|
+
private
|
60
|
+
|
61
|
+
def tty_log_job_start
|
62
|
+
output = "Beginning job "
|
63
|
+
output += "'#{@job.name}'".bold if @job.name
|
64
|
+
output
|
65
|
+
end
|
66
|
+
|
67
|
+
def tty_log_transformation transformer
|
68
|
+
output = " ✓".green
|
69
|
+
output += " #{transformer}"
|
70
|
+
end
|
71
|
+
|
72
|
+
def tty_log_transformation_failure exception
|
73
|
+
output = " ✖".red
|
74
|
+
output += " Failed to build #{exception.transformation}. #{exception.message}"
|
75
|
+
end
|
76
|
+
|
77
|
+
def tty_log_completion
|
78
|
+
status = @job_logger.success ? 'Success' : 'Failed'
|
79
|
+
output = "\nCompleted job "
|
80
|
+
output += "'#{@job.name}'".bold if @job.name
|
81
|
+
output += " in #{ChronicDuration.output(@job_logger.duration)}" if @job_logger.duration
|
82
|
+
output += "\n Status:\t".light_black + status
|
83
|
+
output += "\n Completed:\t".light_black + "#{@job_logger.job_log.num_records_processed}"
|
84
|
+
output += "\n Latest:\t".light_black + "#{@job_logger.job_log.highest_timestamp.iso8601}" if @job_logger.job_log.highest_timestamp
|
85
|
+
output
|
36
86
|
end
|
37
87
|
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
module Chronicle
|
2
|
+
module ETL
|
3
|
+
class JSONAPISerializer < Chronicle::ETL::Serializer
|
4
|
+
def serializable_hash
|
5
|
+
@record
|
6
|
+
.identifier_hash
|
7
|
+
.merge({ attributes: @record.attributes })
|
8
|
+
.merge({ relationships: build_associations })
|
9
|
+
.merge(@record.meta_hash)
|
10
|
+
end
|
11
|
+
|
12
|
+
def build_associations
|
13
|
+
@record.associations.transform_values do |value|
|
14
|
+
association_data =
|
15
|
+
if value.is_a?(Array)
|
16
|
+
value.map { |record| JSONAPISerializer.new(record).serializable_hash }
|
17
|
+
else
|
18
|
+
JSONAPISerializer.new(value).serializable_hash
|
19
|
+
end
|
20
|
+
{ data: association_data }
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
module Chronicle
|
2
|
+
module ETL
|
3
|
+
# Abstract class representing a Serializer for an ETL record
|
4
|
+
class Serializer
|
5
|
+
# Construct a new instance of this serializer.
|
6
|
+
# == Parameters:
|
7
|
+
# options::
|
8
|
+
# Options for configuring this Serializer
|
9
|
+
def initialize(record, options = {})
|
10
|
+
@record = record
|
11
|
+
@options = options
|
12
|
+
end
|
13
|
+
|
14
|
+
# Serialize a record as a hash
|
15
|
+
def serializable_hash
|
16
|
+
raise NotImplementedError
|
17
|
+
end
|
18
|
+
|
19
|
+
def self.serialize(record)
|
20
|
+
serializer = self.new(record)
|
21
|
+
serializer.serializable_hash
|
22
|
+
end
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
require_relative 'jsonapi_serializer'
|
@@ -0,0 +1,253 @@
|
|
1
|
+
require 'mini_exiftool'
|
2
|
+
require 'active_support'
|
3
|
+
require 'active_support/core_ext/object'
|
4
|
+
require 'active_support/core_ext/time'
|
5
|
+
require 'active_support/core_ext/hash/reverse_merge'
|
6
|
+
require 'active_support/core_ext/string/inflections'
|
7
|
+
|
8
|
+
module Chronicle
|
9
|
+
module ETL
|
10
|
+
# Transform a JPEG or other image file into a record.
|
11
|
+
# By default, file mtime and a hash of the file content are used to build
|
12
|
+
# the timestamp and ID respectively but other options are available (such
|
13
|
+
# as reading EXIF tags or extended attributes from the filesystem).
|
14
|
+
#
|
15
|
+
# TODO: This should be extracted into its own plugin
|
16
|
+
class ImageFileTransformer < Chronicle::ETL::Transformer
|
17
|
+
register_connector do |r|
|
18
|
+
r.identifier = 'image-file'
|
19
|
+
r.description = 'an image file'
|
20
|
+
end
|
21
|
+
|
22
|
+
DEFAULT_OPTIONS = {
|
23
|
+
timestamp_strategy: 'file_mtime',
|
24
|
+
id_strategy: 'file_hash',
|
25
|
+
verb: 'photographed',
|
26
|
+
|
27
|
+
# EXIF tags often don't have timezones
|
28
|
+
timezone_default: 'Eastern Time (US & Canada)',
|
29
|
+
include_image_data: true
|
30
|
+
}.freeze
|
31
|
+
|
32
|
+
def initialize(*args)
|
33
|
+
super(*args)
|
34
|
+
@options = @options.reverse_merge(DEFAULT_OPTIONS)
|
35
|
+
end
|
36
|
+
|
37
|
+
def transform
|
38
|
+
# FIXME: set @filename; use block for reading file when necessary
|
39
|
+
@file = File.open(@extraction.data)
|
40
|
+
record = build_created(@file)
|
41
|
+
@file.close
|
42
|
+
record
|
43
|
+
end
|
44
|
+
|
45
|
+
def friendly_identifier
|
46
|
+
@file.path
|
47
|
+
end
|
48
|
+
|
49
|
+
def id
|
50
|
+
@id ||= begin
|
51
|
+
id = build_with_strategy(field: :id, strategy: @options[:id_strategy])
|
52
|
+
raise UntransformableRecordError.new("Could not build id", transformation: self) unless id
|
53
|
+
|
54
|
+
id
|
55
|
+
end
|
56
|
+
end
|
57
|
+
|
58
|
+
def timestamp
|
59
|
+
@timestamp ||= begin
|
60
|
+
ts = build_with_strategy(field: :timestamp, strategy: @options[:timestamp_strategy])
|
61
|
+
raise UntransformableRecordError.new("Could not build timestamp", transformation: self) unless ts
|
62
|
+
|
63
|
+
ts
|
64
|
+
end
|
65
|
+
end
|
66
|
+
|
67
|
+
private
|
68
|
+
|
69
|
+
def build_created(file)
|
70
|
+
record = ::Chronicle::ETL::Models::Activity.new
|
71
|
+
record.verb = @options[:verb]
|
72
|
+
record.provider = @options[:provider]
|
73
|
+
record.provider_id = id
|
74
|
+
record.end_at = timestamp
|
75
|
+
record.dedupe_on = [[:provider_id, :verb, :provider]]
|
76
|
+
|
77
|
+
record.involved = build_image
|
78
|
+
record.actor = build_actor
|
79
|
+
|
80
|
+
record.assign_attributes(build_gps)
|
81
|
+
record
|
82
|
+
end
|
83
|
+
|
84
|
+
def build_actor
|
85
|
+
actor = ::Chronicle::ETL::Models::Entity.new
|
86
|
+
actor.represents = 'identity'
|
87
|
+
actor.provider = @options[:actor][:provider]
|
88
|
+
actor.slug = @options[:actor][:slug]
|
89
|
+
actor.dedupe_on = [[:provider, :slug, :represents]]
|
90
|
+
actor
|
91
|
+
end
|
92
|
+
|
93
|
+
def build_image
|
94
|
+
image = ::Chronicle::ETL::Models::Entity.new
|
95
|
+
image.represents = @options[:involved][:represents]
|
96
|
+
image.title = build_title
|
97
|
+
image.body = exif['Description']
|
98
|
+
image.provider = @options[:involved][:provider]
|
99
|
+
image.provider_id = id
|
100
|
+
image.assign_attributes(build_gps)
|
101
|
+
image.dedupe_on = [[:provider, :provider_id, :represents]]
|
102
|
+
|
103
|
+
if @options[:ocr_strategy]
|
104
|
+
ocr_text = build_with_strategy(field: :ocr, strategy: @options[:ocr_strategy])
|
105
|
+
image.metadata[:ocr_text] = ocr_text if ocr_text
|
106
|
+
end
|
107
|
+
|
108
|
+
names = extract_people_depicted
|
109
|
+
tags = extract_keywords(names)
|
110
|
+
|
111
|
+
image.depicts = build_people_depicted(names)
|
112
|
+
image.abouts = build_keywords(tags)
|
113
|
+
|
114
|
+
if @options[:include_image_data]
|
115
|
+
attachment = ::Chronicle::ETL::Models::Attachment.new
|
116
|
+
attachment.data = build_image_data
|
117
|
+
image.attachments = [attachment]
|
118
|
+
end
|
119
|
+
|
120
|
+
image
|
121
|
+
end
|
122
|
+
|
123
|
+
def build_keywords(topics)
|
124
|
+
topics.map do |topic|
|
125
|
+
t = ::Chronicle::ETL::Models::Entity.new
|
126
|
+
t.represents = 'topic'
|
127
|
+
t.provider = @options[:involved][:provider]
|
128
|
+
t.title = topic
|
129
|
+
t.slug = topic.parameterize
|
130
|
+
t.dedupe_on = [[:provider, :represents, :slug]]
|
131
|
+
t
|
132
|
+
end
|
133
|
+
end
|
134
|
+
|
135
|
+
def build_people_depicted(names)
|
136
|
+
names.map do |name|
|
137
|
+
identity = ::Chronicle::ETL::Models::Entity.new
|
138
|
+
identity.represents = 'identity'
|
139
|
+
identity.provider = @options[:involved][:provider]
|
140
|
+
identity.slug = name.parameterize
|
141
|
+
identity.title = name
|
142
|
+
identity.dedupe_on = [[:provider, :represents, :slug]]
|
143
|
+
identity
|
144
|
+
end
|
145
|
+
end
|
146
|
+
|
147
|
+
def build_gps
|
148
|
+
return {} unless exif['GPSLatitude']
|
149
|
+
|
150
|
+
{
|
151
|
+
lat: exif['GPSLatitude'],
|
152
|
+
lng: exif['GPSLongitude'],
|
153
|
+
elevation: exif['GPSAltitude']
|
154
|
+
}
|
155
|
+
end
|
156
|
+
|
157
|
+
def build_image_data
|
158
|
+
::Chronicle::ETL::Utils::BinaryAttachments.filename_to_base64(filename: @file.path)
|
159
|
+
end
|
160
|
+
|
161
|
+
def build_title
|
162
|
+
File.basename(@file)
|
163
|
+
end
|
164
|
+
|
165
|
+
def build_with_strategy(field:, strategy:[])
|
166
|
+
strategies = [strategy].flatten.compact
|
167
|
+
strategies.each do |s|
|
168
|
+
builder_method = "build_#{field}_using_#{s}"
|
169
|
+
result = send(builder_method.to_sym)
|
170
|
+
return result if result
|
171
|
+
end
|
172
|
+
return
|
173
|
+
end
|
174
|
+
|
175
|
+
def build_id_using_file_hash
|
176
|
+
Digest::SHA256.hexdigest(File.read(@file))
|
177
|
+
end
|
178
|
+
|
179
|
+
def build_id_using_xattr_version
|
180
|
+
load_value_from_xattr_plist("com.apple.metadata:kMDItemVersion")
|
181
|
+
end
|
182
|
+
|
183
|
+
def build_id_using_xmp_document_id
|
184
|
+
exif['OriginalDocumentID'] || exif['DerivedFromDocumentID']
|
185
|
+
end
|
186
|
+
|
187
|
+
def build_timestamp_using_file_mtime
|
188
|
+
File.mtime(@file)
|
189
|
+
end
|
190
|
+
|
191
|
+
def build_timestamp_using_exif_datetimeoriginal
|
192
|
+
# EXIF tags don't have timezone information. This is a DateTime in UTC
|
193
|
+
timestamp = exif['DateTimeOriginal'] || return
|
194
|
+
|
195
|
+
if exif['OffsetTimeOriginal']
|
196
|
+
# Offset tags are only available in newer EXIF tags. If it exists, we
|
197
|
+
# use it instead of UTC
|
198
|
+
timestamp = timestamp.change(offset: exif['OffsetTimeOriginal'])
|
199
|
+
elsif false
|
200
|
+
# TODO: support option of using GPS coordinates to determine timezone
|
201
|
+
else
|
202
|
+
zone = ActiveSupport::TimeZone.new(@options[:timezone_default])
|
203
|
+
timestamp = zone.parse(timestamp.asctime)
|
204
|
+
end
|
205
|
+
|
206
|
+
timestamp
|
207
|
+
end
|
208
|
+
|
209
|
+
# TODO: add documentation for how to set up `macocr`
|
210
|
+
def build_ocr_using_macocr
|
211
|
+
`macocr "#{@file.path}" 2>/dev/null`.presence
|
212
|
+
end
|
213
|
+
|
214
|
+
def exif
|
215
|
+
@exif ||= MiniExiftool.new(
|
216
|
+
@file.path,
|
217
|
+
numerical: true,
|
218
|
+
|
219
|
+
# EXIF timestamps don't have timezone information. MiniExifTool uses Time
|
220
|
+
# by default which parses timestamps in local time zone. Using DateTime
|
221
|
+
# parses dates as UTC and then we can apply a timezone offset if the optional
|
222
|
+
# EXIF timezone offset fields are available.
|
223
|
+
# https://github.com/janfri/mini_exiftool/issues/39#issuecomment-832587649
|
224
|
+
timestamps: DateTime
|
225
|
+
)
|
226
|
+
end
|
227
|
+
|
228
|
+
# Figure out which faces are tagged as regions and return a list of their names
|
229
|
+
def extract_people_depicted
|
230
|
+
return [] unless exif['RegionName']
|
231
|
+
|
232
|
+
names = [exif['RegionName']].flatten
|
233
|
+
types = [exif['RegionType']].flatten
|
234
|
+
|
235
|
+
names.zip(types).select{|x| x[1] == 'Face'}.map{|x| x[0]}.uniq
|
236
|
+
end
|
237
|
+
|
238
|
+
# Extract image keywords from EXIF/IPTC tags and subtract out those that are
|
239
|
+
# tagged people (determined by looking at face regions)
|
240
|
+
def extract_keywords(people_names = [])
|
241
|
+
[exif['Keywords'] || []].flatten - people_names
|
242
|
+
end
|
243
|
+
|
244
|
+
def load_value_from_xattr_plist attribute
|
245
|
+
require 'nokogiri'
|
246
|
+
xml = `xattr -p #{attribute} \"#{@file.path}\" | xxd -r -p | plutil -convert xml1 -o - -- - 2>/dev/null`
|
247
|
+
return unless xml
|
248
|
+
value = Nokogiri::XML.parse(r).xpath("//string").text
|
249
|
+
return value.presence
|
250
|
+
end
|
251
|
+
end
|
252
|
+
end
|
253
|
+
end
|
@@ -1,9 +1,18 @@
|
|
1
1
|
module Chronicle
|
2
2
|
module ETL
|
3
3
|
class NullTransformer < Chronicle::ETL::Transformer
|
4
|
+
register_connector do |r|
|
5
|
+
r.identifier = 'null'
|
6
|
+
r.description = 'in no way'
|
7
|
+
end
|
8
|
+
|
4
9
|
def transform
|
5
|
-
Chronicle::ETL::Models::Generic.new(@data)
|
10
|
+
Chronicle::ETL::Models::Generic.new(@extraction.data)
|
6
11
|
end
|
12
|
+
|
13
|
+
def timestamp; end
|
14
|
+
|
15
|
+
def id; end
|
7
16
|
end
|
8
17
|
end
|
9
18
|
end
|
@@ -2,16 +2,15 @@ module Chronicle
|
|
2
2
|
module ETL
|
3
3
|
# Abstract class representing a Transformer for an ETL job
|
4
4
|
class Transformer
|
5
|
-
extend Chronicle::ETL::
|
5
|
+
extend Chronicle::ETL::Registry::SelfRegistering
|
6
6
|
|
7
7
|
# Construct a new instance of this transformer. Options are passed in from a Runner
|
8
|
-
# ==
|
8
|
+
# == Parameters:
|
9
9
|
# options::
|
10
10
|
# Options for configuring this Transformer
|
11
|
-
def initialize(options = {},
|
11
|
+
def initialize(options = {}, extraction)
|
12
12
|
@options = options
|
13
|
-
@
|
14
|
-
@record = Chronicle::ETL::Models::Activity.new
|
13
|
+
@extraction = extraction
|
15
14
|
end
|
16
15
|
|
17
16
|
# @abstract Subclass is expected to implement #transform
|
@@ -19,16 +18,47 @@ module Chronicle
|
|
19
18
|
# The main entrypoint for transforming a record. Called by a Runner on each extracted record
|
20
19
|
|
21
20
|
# The domain or provider-specific id of the record this transformer is working on.
|
22
|
-
#
|
23
|
-
#
|
24
|
-
|
21
|
+
# It is useful for:
|
22
|
+
# - de-duping records that might exist in the loader's destination
|
23
|
+
# - building a cursor so an extractor doesn't have to start from the beginning of a
|
24
|
+
# source
|
25
|
+
def id
|
26
|
+
raise NotImplementedError
|
27
|
+
end
|
25
28
|
|
26
29
|
# The domain or provider-specific timestamp of the record this transformer is working on.
|
27
30
|
# Used for building a cursor so an extractor doesn't have to start from the beginning of a
|
28
31
|
# data source.
|
29
|
-
def timestamp
|
32
|
+
def timestamp
|
33
|
+
raise NotImplementedError
|
34
|
+
end
|
35
|
+
|
36
|
+
# An optional, human-readable identifier for a transformation, intended for debugging or logging.
|
37
|
+
# By default, it is just the id.
|
38
|
+
def friendly_identifier
|
39
|
+
id
|
40
|
+
end
|
41
|
+
|
42
|
+
def to_s
|
43
|
+
ts = begin
|
44
|
+
unknown = "???"
|
45
|
+
timestamp&.iso8601 || unknown
|
46
|
+
rescue TransformationError, NotImplementedError
|
47
|
+
unknown
|
48
|
+
end
|
49
|
+
|
50
|
+
identifier = begin
|
51
|
+
unknown = self.class.to_s
|
52
|
+
friendly_identifier || self.class.to_s
|
53
|
+
rescue TransformationError, NotImplementedError
|
54
|
+
unknown
|
55
|
+
end
|
56
|
+
|
57
|
+
"[#{ts}] #{identifier}"
|
58
|
+
end
|
30
59
|
end
|
31
60
|
end
|
32
61
|
end
|
33
62
|
|
34
63
|
require_relative 'null_transformer'
|
64
|
+
require_relative 'image_file_transformer'
|
@@ -0,0 +1,21 @@
|
|
1
|
+
require 'marcel'
|
2
|
+
require 'base64'
|
3
|
+
|
4
|
+
module Chronicle
|
5
|
+
module ETL
|
6
|
+
module Utils
|
7
|
+
# Utility methods for dealing with binary files
|
8
|
+
module BinaryAttachments
|
9
|
+
def self.filename_to_base64(filename:, mimetype: nil)
|
10
|
+
mimetype = mimetype || guess_mimetype(filename: filename)
|
11
|
+
|
12
|
+
"data:#{mimetype};base64,#{Base64.strict_encode64(File.read(filename))}"
|
13
|
+
end
|
14
|
+
|
15
|
+
def self.guess_mimetype(filename:)
|
16
|
+
Marcel::MimeType.for(filename)
|
17
|
+
end
|
18
|
+
end
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
require 'active_support/core_ext/object/blank'
|
2
|
+
|
3
|
+
module Chronicle
|
4
|
+
module ETL
|
5
|
+
module Utils
|
6
|
+
# OCR for image files
|
7
|
+
# TODO: add other strategies and document `macocr`
|
8
|
+
module TextRecognition
|
9
|
+
def self.recognize_in_image(filename:)
|
10
|
+
`macocr "#{filename}" 2>/dev/null`.presence
|
11
|
+
end
|
12
|
+
end
|
13
|
+
end
|
14
|
+
end
|
15
|
+
end
|
data/lib/chronicle/etl.rb
CHANGED
@@ -1,19 +1,24 @@
|
|
1
|
-
require_relative 'etl/
|
1
|
+
require_relative 'etl/registry/registry'
|
2
2
|
require_relative 'etl/config'
|
3
3
|
require_relative 'etl/exceptions'
|
4
|
+
require_relative 'etl/extraction'
|
4
5
|
require_relative 'etl/extractors/extractor'
|
5
6
|
require_relative 'etl/job_definition'
|
6
7
|
require_relative 'etl/job_log'
|
7
8
|
require_relative 'etl/job_logger'
|
8
9
|
require_relative 'etl/job'
|
9
10
|
require_relative 'etl/loaders/loader'
|
11
|
+
require_relative 'etl/logger'
|
10
12
|
require_relative 'etl/models/activity'
|
13
|
+
require_relative 'etl/models/attachment'
|
11
14
|
require_relative 'etl/models/base'
|
12
15
|
require_relative 'etl/models/entity'
|
13
16
|
require_relative 'etl/models/generic'
|
14
17
|
require_relative 'etl/runner'
|
18
|
+
require_relative 'etl/serializers/serializer'
|
15
19
|
require_relative 'etl/transformers/transformer'
|
20
|
+
require_relative 'etl/utils/binary_attachments'
|
16
21
|
require_relative 'etl/utils/hash_utilities'
|
17
|
-
require_relative 'etl/utils/
|
22
|
+
require_relative 'etl/utils/text_recognition'
|
18
23
|
require_relative 'etl/utils/progress_bar'
|
19
24
|
require_relative 'etl/version'
|