chronicle-etl 0.2.4 → 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.github/workflows/ruby.yml +35 -0
- data/.gitignore +3 -0
- data/.rubocop.yml +31 -1
- data/Guardfile +7 -0
- data/README.md +21 -14
- data/Rakefile +4 -2
- data/chronicle-etl.gemspec +18 -10
- data/exe/chronicle-etl +1 -1
- data/lib/chronicle/etl/cli/connectors.rb +53 -7
- data/lib/chronicle/etl/cli/jobs.rb +59 -24
- data/lib/chronicle/etl/cli/main.rb +18 -16
- data/lib/chronicle/etl/cli/subcommand_base.rb +2 -2
- data/lib/chronicle/etl/cli.rb +7 -0
- data/lib/chronicle/etl/config.rb +1 -1
- data/lib/chronicle/etl/configurable.rb +150 -0
- data/lib/chronicle/etl/exceptions.rb +14 -1
- data/lib/chronicle/etl/extraction.rb +12 -0
- data/lib/chronicle/etl/extractors/csv_extractor.rb +32 -31
- data/lib/chronicle/etl/extractors/extractor.rb +25 -13
- data/lib/chronicle/etl/extractors/file_extractor.rb +17 -32
- data/lib/chronicle/etl/extractors/helpers/filesystem_reader.rb +104 -0
- data/lib/chronicle/etl/extractors/json_extractor.rb +37 -0
- data/lib/chronicle/etl/extractors/stdin_extractor.rb +6 -1
- data/lib/chronicle/etl/job.rb +30 -29
- data/lib/chronicle/etl/job_definition.rb +45 -7
- data/lib/chronicle/etl/job_log.rb +10 -0
- data/lib/chronicle/etl/job_logger.rb +23 -20
- data/lib/chronicle/etl/loaders/csv_loader.rb +5 -1
- data/lib/chronicle/etl/loaders/loader.rb +5 -2
- data/lib/chronicle/etl/loaders/rest_loader.rb +9 -5
- data/lib/chronicle/etl/loaders/stdout_loader.rb +6 -1
- data/lib/chronicle/etl/loaders/table_loader.rb +51 -7
- data/lib/chronicle/etl/logger.rb +48 -0
- data/lib/chronicle/etl/models/attachment.rb +14 -0
- data/lib/chronicle/etl/models/base.rb +23 -7
- data/lib/chronicle/etl/models/entity.rb +9 -3
- data/lib/chronicle/etl/registry/connector_registration.rb +62 -0
- data/lib/chronicle/etl/registry/registry.rb +52 -0
- data/lib/chronicle/etl/registry/self_registering.rb +25 -0
- data/lib/chronicle/etl/runner.rb +58 -7
- data/lib/chronicle/etl/serializers/jsonapi_serializer.rb +25 -0
- data/lib/chronicle/etl/serializers/serializer.rb +27 -0
- data/lib/chronicle/etl/transformers/image_file_transformer.rb +247 -0
- data/lib/chronicle/etl/transformers/null_transformer.rb +10 -1
- data/lib/chronicle/etl/transformers/transformer.rb +41 -10
- data/lib/chronicle/etl/utils/binary_attachments.rb +21 -0
- data/lib/chronicle/etl/utils/progress_bar.rb +3 -1
- data/lib/chronicle/etl/utils/text_recognition.rb +15 -0
- data/lib/chronicle/etl/version.rb +1 -1
- data/lib/chronicle/etl.rb +8 -2
- metadata +146 -34
- data/.ruby-version +0 -1
- data/Gemfile.lock +0 -91
- data/lib/chronicle/etl/catalog.rb +0 -108
- data/lib/chronicle/etl/utils/jsonapi.rb +0 -28
data/lib/chronicle/etl/runner.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
require 'colorize'
|
2
|
+
require 'chronic_duration'
|
2
3
|
|
3
4
|
class Chronicle::ETL::Runner
|
4
5
|
def initialize(job)
|
@@ -13,25 +14,75 @@ class Chronicle::ETL::Runner
|
|
13
14
|
@job_logger.start
|
14
15
|
loader.start
|
15
16
|
|
17
|
+
extractor.prepare
|
16
18
|
total = extractor.results_count
|
17
|
-
progress_bar = Chronicle::ETL::Utils::ProgressBar.new(title: 'Running job', total: total)
|
19
|
+
@progress_bar = Chronicle::ETL::Utils::ProgressBar.new(title: 'Running job', total: total)
|
20
|
+
Chronicle::ETL::Logger.attach_to_progress_bar(@progress_bar)
|
18
21
|
|
19
|
-
|
20
|
-
|
22
|
+
Chronicle::ETL::Logger.info(tty_log_job_start)
|
23
|
+
extractor.extract do |extraction|
|
24
|
+
unless extraction.is_a?(Chronicle::ETL::Extraction)
|
25
|
+
raise Chronicle::ETL::RunnerTypeError, "Extracted should be a Chronicle::ETL::Extraction"
|
26
|
+
end
|
27
|
+
|
28
|
+
transformer = @job.instantiate_transformer(extraction)
|
21
29
|
record = transformer.transform
|
22
30
|
|
23
31
|
unless record.is_a?(Chronicle::ETL::Models::Base)
|
24
|
-
raise Chronicle::ETL::
|
32
|
+
raise Chronicle::ETL::RunnerTypeError, "Transformed data should be a type of Chronicle::ETL::Models"
|
25
33
|
end
|
26
34
|
|
35
|
+
Chronicle::ETL::Logger.info(tty_log_transformation(transformer))
|
27
36
|
@job_logger.log_transformation(transformer)
|
28
|
-
|
29
|
-
|
37
|
+
|
38
|
+
loader.load(record) unless @job.dry_run?
|
39
|
+
rescue Chronicle::ETL::TransformationError => e
|
40
|
+
Chronicle::ETL::Logger.error(tty_log_transformation_failure(e))
|
41
|
+
ensure
|
42
|
+
@progress_bar.increment
|
30
43
|
end
|
31
44
|
|
32
|
-
progress_bar.finish
|
45
|
+
@progress_bar.finish
|
33
46
|
loader.finish
|
34
47
|
@job_logger.finish
|
48
|
+
rescue Interrupt
|
49
|
+
Chronicle::ETL::Logger.error("\n#{'Job interrupted'.red}")
|
50
|
+
@job_logger.error
|
51
|
+
rescue StandardError => e
|
52
|
+
raise e
|
53
|
+
ensure
|
35
54
|
@job_logger.save
|
55
|
+
@progress_bar.finish
|
56
|
+
Chronicle::ETL::Logger.detach_from_progress_bar
|
57
|
+
Chronicle::ETL::Logger.info(tty_log_completion)
|
58
|
+
end
|
59
|
+
|
60
|
+
private
|
61
|
+
|
62
|
+
# Build the banner line announcing that the job is starting.
# The job's name, wrapped in single quotes and rendered bold, is appended
# only when the job has a name.
def tty_log_job_start
  banner = "Beginning job "
  return banner unless @job.name

  banner + "'#{@job.name}'".bold
end
|
67
|
+
|
68
|
+
# Format the success line for one transformation: a green check mark
# followed by the string form of +transformer+.
def tty_log_transformation(transformer)
  check_mark = " ✓".green
  "#{check_mark} #{transformer}"
end
|
72
|
+
|
73
|
+
# Format the failure line for one transformation: a red cross followed by
# the failed transformation (as reported by +exception.transformation+) and
# the exception's message.
def tty_log_transformation_failure(exception)
  cross = " ✖".red
  "#{cross} Failed to build #{exception.transformation}. #{exception.message}"
end
|
77
|
+
|
78
|
+
# Build the multi-line summary printed when a job finishes.
# The first line names the job (when it has a name) and its duration (when
# the job logger reports one). It is followed by a status line, the number
# of records processed and, when available, the highest timestamp seen.
def tty_log_completion
  outcome = @job_logger.success ? 'Success' : 'Failed'

  segments = ["\nCompleted job "]
  segments << "'#{@job.name}'".bold if @job.name
  segments << " in #{ChronicDuration.output(@job_logger.duration)}" if @job_logger.duration
  segments << ("\n Status:\t".light_black + outcome)
  segments << ("\n Completed:\t".light_black + "#{@job_logger.job_log.num_records_processed}")
  if @job_logger.job_log.highest_timestamp
    segments << ("\n Latest:\t".light_black + "#{@job_logger.job_log.highest_timestamp.iso8601}")
  end

  segments.join
end
|
37
88
|
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
module Chronicle
  module ETL
    # Serializer that renders a record as a nested hash: the record's
    # identifier hash merged with its attributes, its serialized
    # associations (under :relationships) and its meta hash.
    class JSONAPISerializer < Chronicle::ETL::Serializer
      # Serialize the wrapped record as a hash.
      # Later merges win on key collisions, in this order: identifier hash,
      # :attributes, :relationships, meta hash.
      def serializable_hash
        document = @record.identifier_hash
        document = document.merge({ attributes: @record.attributes })
        document = document.merge({ relationships: build_associations })
        document.merge(@record.meta_hash)
      end

      # Serialize every association of the record. Each association value is
      # either a single record or an Array of records; the serialized result
      # is wrapped as `{ data: ... }`.
      def build_associations
        serialize_one = ->(item) { JSONAPISerializer.new(item).serializable_hash }

        @record.associations.transform_values do |related|
          serialized = related.is_a?(Array) ? related.map(&serialize_one) : serialize_one.call(related)
          { data: serialized }
        end
      end
    end
  end
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
module Chronicle
  module ETL
    # Abstract base class for serializers that turn an ETL record into a hash.
    class Serializer
      # Convenience entry point: build a serializer for +record+ (with
      # default options) and return its serialized hash.
      def self.serialize(record)
        new(record).serializable_hash
      end

      # Construct a new instance of this serializer.
      # == Parameters:
      # record::
      #   The record to serialize
      # options::
      #   Options for configuring this Serializer
      def initialize(record, options = {})
        @options = options
        @record = record
      end

      # Serialize a record as a hash.
      # Subclasses must override this; the base implementation always raises
      # NotImplementedError.
      def serializable_hash
        raise NotImplementedError
      end
    end
  end
end
|
26
|
+
|
27
|
+
require_relative 'jsonapi_serializer'
|
@@ -0,0 +1,247 @@
|
|
1
|
+
require 'digest'
require 'shellwords'

require 'mini_exiftool'
require 'active_support'
require 'active_support/core_ext/object'
require 'active_support/core_ext/time'
require 'active_support/core_ext/hash/reverse_merge'
require 'active_support/core_ext/string/inflections'
|
7
|
+
|
8
|
+
module Chronicle
  module ETL
    # Transform a JPEG or other image file into a record.
    # By default, file mtime and a hash of the file content are used to build
    # the timestamp and ID respectively but other options are available (such
    # as reading EXIF tags or extended attributes from the filesystem).
    #
    # TODO: This should be extracted into its own plugin
    class ImageFileTransformer < Chronicle::ETL::Transformer
      register_connector do |r|
        r.identifier = 'image-file'
        r.description = 'an image file'
      end

      setting :timestamp_strategy, default: 'file_mtime'
      setting :id_strategy, default: 'file_hash'
      setting :verb, default: 'photographed'
      # EXIF tags often don't have timezones
      setting :timezone_default, default: 'Eastern Time (US & Canada)'
      setting :include_image_data, default: true
      setting :actor
      setting :involved
      # NOTE(review): `@config.provider` and `@config.ocr_strategy` are read
      # below but no `setting` for them is declared in this class — confirm
      # they are declared elsewhere or add them here.

      # Open the file named by the extraction's data and build an Activity
      # record from it.
      #
      # @return the record built by #build_created
      def transform
        # FIXME: set @filename; use block for reading file when necessary
        @file = File.open(@extraction.data)
        begin
          build_created(@file)
        ensure
          # Release the handle even when building the record raises. @file is
          # kept (closed) because #friendly_identifier still reads its path.
          @file.close
        end
      end

      # Path of the image file; used when this transformation is logged.
      def friendly_identifier
        @file.path
      end

      # Memoized id of the record, built with the configured id strategy.
      # Raises UntransformableRecordError when no strategy yields a value.
      def id
        @id ||= begin
          id = build_with_strategy(field: :id, strategy: @config.id_strategy)
          raise UntransformableRecordError.new("Could not build id", transformation: self) unless id

          id
        end
      end

      # Memoized timestamp of the record, built with the configured timestamp
      # strategy. Raises UntransformableRecordError when no strategy yields a
      # value.
      def timestamp
        @timestamp ||= begin
          ts = build_with_strategy(field: :timestamp, strategy: @config.timestamp_strategy)
          raise UntransformableRecordError.new("Could not build timestamp", transformation: self) unless ts

          ts
        end
      end

      private

      # Build the Activity record: verb/provider from config, id and end time
      # from the configured strategies, the image as the involved entity, the
      # configured actor, and GPS attributes when the EXIF data has them.
      def build_created(file)
        record = ::Chronicle::ETL::Models::Activity.new
        record.verb = @config.verb
        record.provider = @config.provider
        record.provider_id = id
        record.end_at = timestamp
        record.dedupe_on = [[:provider_id, :verb, :provider]]

        record.involved = build_image
        record.actor = build_actor

        record.assign_attributes(build_gps)
        record
      end

      # Build the actor entity from the `actor` setting (:provider and :slug).
      def build_actor
        actor = ::Chronicle::ETL::Models::Entity.new
        actor.represents = 'identity'
        actor.provider = @config.actor[:provider]
        actor.slug = @config.actor[:slug]
        actor.dedupe_on = [[:provider, :slug, :represents]]
        actor
      end

      # Build the entity describing the image itself, including optional OCR
      # text, depicted people, keywords and (when enabled) the image data as
      # an attachment.
      def build_image
        image = ::Chronicle::ETL::Models::Entity.new
        image.represents = @config.involved[:represents]
        image.title = build_title
        image.body = exif['Description']
        image.provider = @config.involved[:provider]
        image.provider_id = id
        image.assign_attributes(build_gps)
        image.dedupe_on = [[:provider, :provider_id, :represents]]

        if @config.ocr_strategy
          ocr_text = build_with_strategy(field: :ocr, strategy: @config.ocr_strategy)
          image.metadata[:ocr_text] = ocr_text if ocr_text
        end

        names = extract_people_depicted
        tags = extract_keywords(names)

        image.depicts = build_people_depicted(names)
        image.abouts = build_keywords(tags)

        if @config.include_image_data
          attachment = ::Chronicle::ETL::Models::Attachment.new
          attachment.data = build_image_data
          image.attachments = [attachment]
        end

        image
      end

      # Build one 'topic' entity per keyword.
      def build_keywords(topics)
        topics.map do |topic|
          t = ::Chronicle::ETL::Models::Entity.new
          t.represents = 'topic'
          t.provider = @config.involved[:provider]
          t.title = topic
          t.slug = topic.parameterize
          t.dedupe_on = [[:provider, :represents, :slug]]
          t
        end
      end

      # Build one 'identity' entity per depicted person's name.
      def build_people_depicted(names)
        names.map do |name|
          identity = ::Chronicle::ETL::Models::Entity.new
          identity.represents = 'identity'
          identity.provider = @config.involved[:provider]
          identity.slug = name.parameterize
          identity.title = name
          identity.dedupe_on = [[:provider, :represents, :slug]]
          identity
        end
      end

      # GPS attributes from EXIF, or an empty hash when there is no latitude.
      def build_gps
        return {} unless exif['GPSLatitude']

        {
          lat: exif['GPSLatitude'],
          lng: exif['GPSLongitude'],
          elevation: exif['GPSAltitude']
        }
      end

      # The image file encoded by BinaryAttachments.filename_to_base64.
      def build_image_data
        ::Chronicle::ETL::Utils::BinaryAttachments.filename_to_base64(filename: @file.path)
      end

      # The file's basename is used as the image title.
      def build_title
        File.basename(@file.path)
      end

      # Try each strategy in order by calling `build_<field>_using_<strategy>`
      # and return the first truthy result, or nil when none produces one.
      #
      # field::    the field being built (e.g. :id, :timestamp, :ocr)
      # strategy:: a strategy name or an array of names; nils are ignored
      def build_with_strategy(field:, strategy: [])
        [strategy].flatten.compact.each do |name|
          result = send("build_#{field}_using_#{name}")
          return result if result
        end
        nil
      end

      # SHA256 of the file content. Digest::SHA256.file reads the file in
      # binary mode and in chunks, so large images are not loaded whole and
      # no newline translation can alter the hash.
      def build_id_using_file_hash
        Digest::SHA256.file(@file.path).hexdigest
      end

      def build_id_using_xattr_version
        load_value_from_xattr_plist("com.apple.metadata:kMDItemVersion")
      end

      def build_id_using_xmp_document_id
        exif['OriginalDocumentID'] || exif['DerivedFromDocumentID']
      end

      def build_timestamp_using_file_mtime
        File.mtime(@file.path)
      end

      # Timestamp from the EXIF DateTimeOriginal tag, or nil when it is absent.
      def build_timestamp_using_exif_datetimeoriginal
        # EXIF tags don't have timezone information. This is a DateTime in UTC
        original = exif['DateTimeOriginal']
        return unless original

        # Offset tags are only available in newer EXIF tags. If it exists, we
        # use it instead of UTC
        offset = exif['OffsetTimeOriginal']
        return original.change(offset: offset) if offset

        # TODO: support option of using GPS coordinates to determine timezone
        zone = ActiveSupport::TimeZone.new(@config.timezone_default)
        zone.parse(original.asctime)
      end

      # TODO: add documentation for how to set up `macocr`
      # Delegates to the shared OCR helper instead of duplicating its command.
      def build_ocr_using_macocr
        ::Chronicle::ETL::Utils::TextRecognition.recognize_in_image(filename: @file.path)
      end

      # Memoized EXIF reader for the current file.
      def exif
        @exif ||= MiniExiftool.new(
          @file.path,
          numerical: true,

          # EXIF timestamps don't have timezone information. MiniExifTool uses Time
          # by default which parses timestamps in local time zone. Using DateTime
          # parses dates as UTC and then we can apply a timezone offset if the optional
          # EXIF timezone offset fields are available.
          # https://github.com/janfri/mini_exiftool/issues/39#issuecomment-832587649
          timestamps: DateTime
        )
      end

      # Figure out which faces are tagged as regions and return a list of their names
      def extract_people_depicted
        return [] unless exif['RegionName']

        names = [exif['RegionName']].flatten
        types = [exif['RegionType']].flatten

        names.zip(types).select { |_name, type| type == 'Face' }.map(&:first).uniq
      end

      # Extract image keywords from the EXIF/IPTC tag and subtract those that
      # are tagged people (determined by looking at face regions)
      def extract_keywords(people_names = [])
        [exif['Keywords'] || []].flatten - people_names
      end

      # Read an extended attribute stored as a plist and return the text of
      # its <string> elements, or nil when nothing could be read.
      def load_value_from_xattr_plist(attribute)
        require 'nokogiri'
        # Escape both interpolated values so quotes or shell metacharacters in
        # a file name cannot break (or inject into) the pipeline.
        command = "xattr -p #{Shellwords.escape(attribute)} #{Shellwords.escape(@file.path)} " \
                  '| xxd -r -p | plutil -convert xml1 -o - -- - 2>/dev/null'
        xml = `#{command}`
        # Backticks return an empty string, never nil, when the pipeline
        # produces no output.
        return if xml.blank?

        Nokogiri::XML.parse(xml).xpath("//string").text.presence
      end
    end
  end
end
|
@@ -1,9 +1,18 @@
|
|
1
1
|
module Chronicle
|
2
2
|
module ETL
|
3
3
|
class NullTransformer < Chronicle::ETL::Transformer
|
4
|
+
register_connector do |r|
|
5
|
+
r.identifier = 'null'
|
6
|
+
r.description = 'in no way'
|
7
|
+
end
|
8
|
+
|
4
9
|
def transform
|
5
|
-
Chronicle::ETL::Models::Generic.new(@data)
|
10
|
+
Chronicle::ETL::Models::Generic.new(@extraction.data)
|
6
11
|
end
|
12
|
+
|
13
|
+
def timestamp; end
|
14
|
+
|
15
|
+
def id; end
|
7
16
|
end
|
8
17
|
end
|
9
18
|
end
|
@@ -2,16 +2,16 @@ module Chronicle
|
|
2
2
|
module ETL
|
3
3
|
# Abstract class representing a Transformer for an ETL job
|
4
4
|
class Transformer
|
5
|
-
extend Chronicle::ETL::
|
5
|
+
extend Chronicle::ETL::Registry::SelfRegistering
|
6
|
+
include Chronicle::ETL::Configurable
|
6
7
|
|
7
8
|
# Construct a new instance of this transformer. Options are passed in from a Runner
|
8
|
-
# ==
|
9
|
+
# == Parameters:
|
9
10
|
# options::
|
10
11
|
# Options for configuring this Transformer
|
11
|
-
def initialize(options = {}
|
12
|
-
@
|
13
|
-
|
14
|
-
@record = Chronicle::ETL::Models::Activity.new
|
12
|
+
def initialize(extraction, options = {})
|
13
|
+
@extraction = extraction
|
14
|
+
apply_options(options)
|
15
15
|
end
|
16
16
|
|
17
17
|
# @abstract Subclass is expected to implement #transform
|
@@ -19,16 +19,47 @@ module Chronicle
|
|
19
19
|
# The main entrypoint for transforming a record. Called by a Runner on each extracted record
|
20
20
|
|
21
21
|
# The domain or provider-specific id of the record this transformer is working on.
|
22
|
-
#
|
23
|
-
#
|
24
|
-
|
22
|
+
# It is useful for:
|
23
|
+
# - de-duping records that might exist in the loader's destination
|
24
|
+
# - building a cursor so an extractor doesn't have to start from the beginning of a
|
25
|
+
# source
|
26
|
+
def id
|
27
|
+
raise NotImplementedError
|
28
|
+
end
|
25
29
|
|
26
30
|
# The domain or provider-specific timestamp of the record this transformer is working on.
|
27
31
|
# Used for building a cursor so an extractor doesn't have to start from the beginning of a
|
28
32
|
# data source.
|
29
|
-
def timestamp
|
33
|
+
def timestamp
|
34
|
+
raise NotImplementedError
|
35
|
+
end
|
36
|
+
|
37
|
+
# An optional, human-readable identifier for a transformation, intended for debugging or logging.
|
38
|
+
# By default, it is just the id.
|
39
|
+
def friendly_identifier
|
40
|
+
id
|
41
|
+
end
|
42
|
+
|
43
|
+
# Human-readable description of this transformation: "[<timestamp>] <identifier>".
# The timestamp is rendered as ISO 8601, or "???" when it is nil or cannot be
# built. The identifier is #friendly_identifier, or the class name when that
# is nil or cannot be built.
def to_s
  stamp =
    begin
      timestamp&.iso8601 || "???"
    rescue TransformationError, NotImplementedError
      "???"
    end

  class_name = self.class.to_s
  label =
    begin
      friendly_identifier || class_name
    rescue TransformationError, NotImplementedError
      class_name
    end

  "[#{stamp}] #{label}"
end
|
30
60
|
end
|
31
61
|
end
|
32
62
|
end
|
33
63
|
|
34
64
|
require_relative 'null_transformer'
|
65
|
+
require_relative 'image_file_transformer'
|
@@ -0,0 +1,21 @@
|
|
1
|
+
require 'marcel'
|
2
|
+
require 'base64'
|
3
|
+
|
4
|
+
module Chronicle
  module ETL
    module Utils
      # Utility methods for dealing with binary files
      module BinaryAttachments
        # Encode a file as a base64 `data:` URI.
        #
        # filename:: path of the file to encode
        # mimetype:: MIME type to put in the URI; guessed from the file when nil
        #
        # Returns a String of the form "data:<mimetype>;base64,<payload>".
        def self.filename_to_base64(filename:, mimetype: nil)
          mimetype ||= guess_mimetype(filename: filename)

          # binread: the payload is binary, so avoid text-mode newline/encoding handling
          "data:#{mimetype};base64,#{Base64.strict_encode64(File.binread(filename))}"
        end

        # Guess the MIME type of the file at +filename+.
        #
        # Marcel treats a plain String argument as the *content* to sniff, not
        # as a path, so the file is opened and passed as an IO; the basename is
        # supplied as a name hint for extension-based detection.
        def self.guess_mimetype(filename:)
          File.open(filename, 'rb') do |io|
            Marcel::MimeType.for(io, name: File.basename(filename))
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
require 'active_support/core_ext/object/blank'
|
2
|
+
|
3
|
+
module Chronicle
  module ETL
    module Utils
      # OCR for image files
      # TODO: add other strategies and document `macocr`
      module TextRecognition
        # Run the external `macocr` command on an image and return its output.
        #
        # filename:: path of the image to recognize text in
        #
        # Returns the command's standard output, or nil when the output is
        # blank or the command could not be started (e.g. `macocr` is not
        # installed). Standard error is discarded.
        def self.recognize_in_image(filename:)
          # Argument-array form bypasses the shell, so quotes, `$(...)` or
          # other metacharacters in a file name are passed through verbatim.
          IO.popen(['macocr', filename.to_s], err: File::NULL, &:read).presence
        rescue SystemCallError
          # Best effort: a missing or non-executable binary yields no text.
          nil
        end
      end
    end
  end
end
|
data/lib/chronicle/etl.rb
CHANGED
@@ -1,19 +1,25 @@
|
|
1
|
-
require_relative 'etl/
|
1
|
+
require_relative 'etl/registry/registry'
|
2
2
|
require_relative 'etl/config'
|
3
|
+
require_relative 'etl/configurable'
|
3
4
|
require_relative 'etl/exceptions'
|
5
|
+
require_relative 'etl/extraction'
|
4
6
|
require_relative 'etl/extractors/extractor'
|
5
7
|
require_relative 'etl/job_definition'
|
6
8
|
require_relative 'etl/job_log'
|
7
9
|
require_relative 'etl/job_logger'
|
8
10
|
require_relative 'etl/job'
|
9
11
|
require_relative 'etl/loaders/loader'
|
12
|
+
require_relative 'etl/logger'
|
10
13
|
require_relative 'etl/models/activity'
|
14
|
+
require_relative 'etl/models/attachment'
|
11
15
|
require_relative 'etl/models/base'
|
12
16
|
require_relative 'etl/models/entity'
|
13
17
|
require_relative 'etl/models/generic'
|
14
18
|
require_relative 'etl/runner'
|
19
|
+
require_relative 'etl/serializers/serializer'
|
15
20
|
require_relative 'etl/transformers/transformer'
|
21
|
+
require_relative 'etl/utils/binary_attachments'
|
16
22
|
require_relative 'etl/utils/hash_utilities'
|
17
|
-
require_relative 'etl/utils/
|
23
|
+
require_relative 'etl/utils/text_recognition'
|
18
24
|
require_relative 'etl/utils/progress_bar'
|
19
25
|
require_relative 'etl/version'
|