chronicle-etl 0.2.2 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +3 -0
- data/.rubocop.yml +3 -0
- data/README.md +22 -15
- data/chronicle-etl.gemspec +13 -7
- data/lib/chronicle/etl/cli/connectors.rb +19 -7
- data/lib/chronicle/etl/cli/jobs.rb +38 -26
- data/lib/chronicle/etl/cli/main.rb +10 -2
- data/lib/chronicle/etl/config.rb +24 -3
- data/lib/chronicle/etl/exceptions.rb +13 -0
- data/lib/chronicle/etl/extraction.rb +12 -0
- data/lib/chronicle/etl/extractors/csv_extractor.rb +43 -37
- data/lib/chronicle/etl/extractors/extractor.rb +25 -4
- data/lib/chronicle/etl/extractors/file_extractor.rb +15 -33
- data/lib/chronicle/etl/extractors/helpers/filesystem_reader.rb +104 -0
- data/lib/chronicle/etl/extractors/json_extractor.rb +45 -0
- data/lib/chronicle/etl/extractors/stdin_extractor.rb +6 -1
- data/lib/chronicle/etl/job.rb +72 -0
- data/lib/chronicle/etl/job_definition.rb +89 -0
- data/lib/chronicle/etl/job_log.rb +95 -0
- data/lib/chronicle/etl/job_logger.rb +81 -0
- data/lib/chronicle/etl/loaders/csv_loader.rb +6 -6
- data/lib/chronicle/etl/loaders/loader.rb +2 -2
- data/lib/chronicle/etl/loaders/rest_loader.rb +16 -9
- data/lib/chronicle/etl/loaders/stdout_loader.rb +8 -3
- data/lib/chronicle/etl/loaders/table_loader.rb +58 -7
- data/lib/chronicle/etl/logger.rb +48 -0
- data/lib/chronicle/etl/models/activity.rb +15 -0
- data/lib/chronicle/etl/models/attachment.rb +14 -0
- data/lib/chronicle/etl/models/base.rb +119 -0
- data/lib/chronicle/etl/models/entity.rb +21 -0
- data/lib/chronicle/etl/models/generic.rb +23 -0
- data/lib/chronicle/etl/registry/connector_registration.rb +61 -0
- data/lib/chronicle/etl/registry/registry.rb +52 -0
- data/lib/chronicle/etl/registry/self_registering.rb +25 -0
- data/lib/chronicle/etl/runner.rb +66 -24
- data/lib/chronicle/etl/serializers/jsonapi_serializer.rb +25 -0
- data/lib/chronicle/etl/serializers/serializer.rb +27 -0
- data/lib/chronicle/etl/transformers/image_file_transformer.rb +253 -0
- data/lib/chronicle/etl/transformers/null_transformer.rb +11 -3
- data/lib/chronicle/etl/transformers/transformer.rb +42 -13
- data/lib/chronicle/etl/utils/binary_attachments.rb +21 -0
- data/lib/chronicle/etl/utils/hash_utilities.rb +19 -0
- data/lib/chronicle/etl/utils/progress_bar.rb +3 -1
- data/lib/chronicle/etl/utils/text_recognition.rb +15 -0
- data/lib/chronicle/etl/version.rb +1 -1
- data/lib/chronicle/etl.rb +16 -1
- metadata +139 -36
- data/CHANGELOG.md +0 -23
- data/Gemfile.lock +0 -85
- data/lib/chronicle/etl/catalog.rb +0 -102
- data/lib/chronicle/etl/transformers/json_transformer.rb +0 -11
@@ -0,0 +1,253 @@
|
|
1
|
+
require 'mini_exiftool'
|
2
|
+
require 'active_support'
|
3
|
+
require 'active_support/core_ext/object'
|
4
|
+
require 'active_support/core_ext/time'
|
5
|
+
require 'active_support/core_ext/hash/reverse_merge'
|
6
|
+
require 'active_support/core_ext/string/inflections'
|
7
|
+
|
8
|
+
module Chronicle
|
9
|
+
module ETL
|
10
|
+
# Transform a JPEG or other image file into a record.
|
11
|
+
# By default, file mtime and a hash of the file content are used to build
|
12
|
+
# the timestamp and ID respectively but other options are available (such
|
13
|
+
# as reading EXIF tags or extended attributes from the filesystem).
|
14
|
+
#
|
15
|
+
# TODO: This should be extracted into its own plugin
|
16
|
+
class ImageFileTransformer < Chronicle::ETL::Transformer
  register_connector do |r|
    r.identifier = 'image-file'
    r.description = 'an image file'
  end

  # Values used for any option the caller does not supply (see #initialize).
  DEFAULT_OPTIONS = {
    timestamp_strategy: 'file_mtime',
    id_strategy: 'file_hash',
    verb: 'photographed',

    # EXIF tags often don't have timezones
    timezone_default: 'Eastern Time (US & Canada)',
    include_image_data: true
  }.freeze

  # Forwards all arguments to the parent transformer, then fills in every
  # option the caller left unset from DEFAULT_OPTIONS (caller values win).
  def initialize(*args)
    super(*args)
    @options = @options.reverse_merge(DEFAULT_OPTIONS)
  end

  # Opens the file named by the extraction's data and builds an activity
  # record describing it.
  #
  # @return [Chronicle::ETL::Models::Activity]
  def transform
    # FIXME: set @filename; use block for reading file when necessary
    @file = File.open(@extraction.data)
    build_created(@file)
  ensure
    # Release the handle even when a builder raises, so a failed record does
    # not leak a file descriptor.
    @file.close if @file && !@file.closed?
  end

  # Path of the image being transformed.
  #
  # @return [String, nil] nil when #transform has not opened a file yet
  def friendly_identifier
    @file&.path
  end

  # Memoized id built with the configured :id_strategy option.
  #
  # @raise [UntransformableRecordError] when no strategy produces a value
  def id
    @id ||= begin
      built = build_with_strategy(field: :id, strategy: @options[:id_strategy])
      raise UntransformableRecordError.new("Could not build id", transformation: self) unless built

      built
    end
  end

  # Memoized timestamp built with the configured :timestamp_strategy option.
  #
  # @raise [UntransformableRecordError] when no strategy produces a value
  def timestamp
    @timestamp ||= begin
      built = build_with_strategy(field: :timestamp, strategy: @options[:timestamp_strategy])
      raise UntransformableRecordError.new("Could not build timestamp", transformation: self) unless built

      built
    end
  end

  private

  # Assembles the top-level activity: verb/provider from options, id and
  # timestamp from the configured strategies, the image as the involved
  # entity, the actor, and GPS attributes when present.
  # The file argument is accepted but not read; builders use @file.
  def build_created(_file)
    record = ::Chronicle::ETL::Models::Activity.new
    record.verb = @options[:verb]
    record.provider = @options[:provider]
    record.provider_id = id
    record.end_at = timestamp
    record.dedupe_on = [[:provider_id, :verb, :provider]]

    record.involved = build_image
    record.actor = build_actor

    record.assign_attributes(build_gps)
    record
  end

  # Builds the identity entity for the actor from the :actor option
  # (its :provider and :slug keys).
  def build_actor
    actor = ::Chronicle::ETL::Models::Entity.new
    actor.represents = 'identity'
    actor.provider = @options[:actor][:provider]
    actor.slug = @options[:actor][:slug]
    actor.dedupe_on = [[:provider, :slug, :represents]]
    actor
  end

  # Builds the entity for the image itself, configured via the :involved
  # option. Adds OCR text when :ocr_strategy is set, depicted people and
  # keyword topics from EXIF, and the encoded file as an attachment when
  # :include_image_data is truthy.
  def build_image
    image = ::Chronicle::ETL::Models::Entity.new
    image.represents = @options[:involved][:represents]
    image.title = build_title
    image.body = exif['Description']
    image.provider = @options[:involved][:provider]
    image.provider_id = id
    image.assign_attributes(build_gps)
    image.dedupe_on = [[:provider, :provider_id, :represents]]

    if @options[:ocr_strategy]
      ocr_text = build_with_strategy(field: :ocr, strategy: @options[:ocr_strategy])
      image.metadata[:ocr_text] = ocr_text if ocr_text
    end

    names = extract_people_depicted
    tags = extract_keywords(names)

    image.depicts = build_people_depicted(names)
    image.abouts = build_keywords(tags)

    if @options[:include_image_data]
      attachment = ::Chronicle::ETL::Models::Attachment.new
      attachment.data = build_image_data
      image.attachments = [attachment]
    end

    image
  end

  # Turns each keyword into a 'topic' entity whose slug is the
  # parameterized keyword.
  def build_keywords(topics)
    topics.map do |topic|
      t = ::Chronicle::ETL::Models::Entity.new
      t.represents = 'topic'
      t.provider = @options[:involved][:provider]
      t.title = topic
      t.slug = topic.parameterize
      t.dedupe_on = [[:provider, :represents, :slug]]
      t
    end
  end

  # Turns each person name into an 'identity' entity whose slug is the
  # parameterized name.
  def build_people_depicted(names)
    names.map do |name|
      identity = ::Chronicle::ETL::Models::Entity.new
      identity.represents = 'identity'
      identity.provider = @options[:involved][:provider]
      identity.slug = name.parameterize
      identity.title = name
      identity.dedupe_on = [[:provider, :represents, :slug]]
      identity
    end
  end

  # GPS attributes read from EXIF, or an empty hash when the image has no
  # GPSLatitude tag.
  def build_gps
    return {} unless exif['GPSLatitude']

    {
      lat: exif['GPSLatitude'],
      lng: exif['GPSLongitude'],
      elevation: exif['GPSAltitude']
    }
  end

  # The image file encoded by Utils::BinaryAttachments.filename_to_base64.
  def build_image_data
    ::Chronicle::ETL::Utils::BinaryAttachments.filename_to_base64(filename: @file.path)
  end

  # The image's file name without its directory.
  def build_title
    File.basename(@file)
  end

  # Tries each strategy in order by calling build_<field>_using_<strategy>
  # and returns the first truthy result.
  #
  # @param field [Symbol] the field being built (:id, :timestamp, :ocr)
  # @param strategy [String, Array<String>, nil] one or more strategy names
  # @return [Object, nil] nil when every strategy comes back empty
  def build_with_strategy(field:, strategy: [])
    [strategy].flatten.compact.each do |name|
      result = send(:"build_#{field}_using_#{name}")
      return result if result
    end

    nil
  end

  # SHA256 hex digest of the file's bytes.
  def build_id_using_file_hash
    # Digest was referenced without being required here; load it on first use.
    require 'digest'

    # Digest#file hashes the file from disk instead of first loading the
    # whole image into a String.
    Digest::SHA256.file(@file.path).hexdigest
  end

  # Value of the com.apple.metadata:kMDItemVersion extended attribute.
  def build_id_using_xattr_version
    load_value_from_xattr_plist("com.apple.metadata:kMDItemVersion")
  end

  # XMP OriginalDocumentID, falling back to DerivedFromDocumentID.
  def build_id_using_xmp_document_id
    exif['OriginalDocumentID'] || exif['DerivedFromDocumentID']
  end

  # Modification time of the file on disk.
  def build_timestamp_using_file_mtime
    File.mtime(@file)
  end

  # Timestamp from the EXIF DateTimeOriginal tag, or nil when the tag is
  # absent. The EXIF offset tag is applied when available; otherwise the
  # wall-clock time is interpreted in the :timezone_default zone.
  def build_timestamp_using_exif_datetimeoriginal
    # EXIF tags don't have timezone information. This is a DateTime in UTC
    taken_at = exif['DateTimeOriginal']
    return unless taken_at

    if exif['OffsetTimeOriginal']
      # Offset tags are only available in newer EXIF tags. If it exists, we
      # use it instead of UTC
      taken_at.change(offset: exif['OffsetTimeOriginal'])
    else
      # TODO: support option of using GPS coordinates to determine timezone
      zone = ActiveSupport::TimeZone.new(@options[:timezone_default])
      zone.parse(taken_at.asctime)
    end
  end

  # Text recognized in the image by the `macocr` command, via the shared
  # Utils::TextRecognition helper so the command is built in one place.
  # TODO: add documentation for how to set up `macocr`
  def build_ocr_using_macocr
    ::Chronicle::ETL::Utils::TextRecognition.recognize_in_image(filename: @file.path)
  end

  # Memoized EXIF reader for the current file.
  def exif
    @exif ||= MiniExiftool.new(
      @file.path,
      numerical: true,

      # EXIF timestamps don't have timezone information. MiniExifTool uses Time
      # by default which parses timestamps in local time zone. Using DateTime
      # parses dates as UTC and then we can apply a timezone offset if the optional
      # EXIF timezone offset fields are available.
      # https://github.com/janfri/mini_exiftool/issues/39#issuecomment-832587649
      timestamps: DateTime
    )
  end

  # Figure out which faces are tagged as regions and return a list of their names
  def extract_people_depicted
    return [] unless exif['RegionName']

    names = [exif['RegionName']].flatten
    types = [exif['RegionType']].flatten

    names.zip(types).select { |_name, type| type == 'Face' }.map(&:first).uniq
  end

  # Extract image keywords from EXIF/IPTC tag and subtract out those which are
  # tagged people (determined by looking at face regions)
  def extract_keywords(people_names = [])
    # Keywords are normalized to strings because build_keywords calls
    # String#parameterize on each one.
    # NOTE(review): assumes the EXIF reader can return numeric-looking
    # keywords as numbers; confirm against mini_exiftool's value conversion.
    [exif['Keywords'] || []].flatten.map(&:to_s) - people_names
  end

  # Reads an extended attribute holding a hex-dumped property list and
  # returns the text of its <string> nodes.
  #
  # @param attribute [String] name of the extended attribute
  # @return [String, nil] nil when the pipeline yields nothing usable
  def load_value_from_xattr_plist(attribute)
    require 'nokogiri'
    require 'shellwords'

    # Escape both interpolated values so a path containing quotes or shell
    # metacharacters cannot alter the pipeline.
    command = "xattr -p #{Shellwords.escape(attribute)} #{Shellwords.escape(@file.path)} " \
              '| xxd -r -p | plutil -convert xml1 -o - -- - 2>/dev/null'
    xml = `#{command}`
    # Backticks always return a String, so test for empty output, not nil.
    return if xml.empty?

    Nokogiri::XML.parse(xml).xpath("//string").text.presence
  end
end
|
252
|
+
end
|
253
|
+
end
|
@@ -1,10 +1,18 @@
|
|
1
1
|
module Chronicle
|
2
2
|
module ETL
|
3
3
|
class NullTransformer < Chronicle::ETL::Transformer
|
4
|
+
register_connector do |r|
|
5
|
+
r.identifier = 'null'
|
6
|
+
r.description = 'in no way'
|
7
|
+
end
|
8
|
+
|
4
9
|
def transform
|
5
|
-
|
10
|
+
Chronicle::ETL::Models::Generic.new(@extraction.data)
|
6
11
|
end
|
7
|
-
end
|
8
12
|
|
13
|
+
def timestamp; end
|
14
|
+
|
15
|
+
def id; end
|
16
|
+
end
|
9
17
|
end
|
10
|
-
end
|
18
|
+
end
|
@@ -2,34 +2,63 @@ module Chronicle
|
|
2
2
|
module ETL
|
3
3
|
# Abstract class representing a Transformer for an ETL job
|
4
4
|
class Transformer
|
5
|
-
extend Chronicle::ETL::
|
5
|
+
extend Chronicle::ETL::Registry::SelfRegistering
|
6
6
|
|
7
7
|
# Construct a new instance of this transformer. Options are passed in from a Runner
|
8
|
-
# ==
|
8
|
+
# == Parameters:
|
9
9
|
# options::
|
10
10
|
# Options for configuring this Transformer
|
11
|
-
def initialize(options = {},
|
11
|
+
def initialize(options = {}, extraction)
|
12
12
|
@options = options
|
13
|
-
@
|
13
|
+
@extraction = extraction
|
14
14
|
end
|
15
15
|
|
16
|
-
#
|
17
|
-
|
18
|
-
|
19
|
-
end
|
16
|
+
# @abstract Subclass is expected to implement #transform
|
17
|
+
# @!method transform
|
18
|
+
# The main entrypoint for transforming a record. Called by a Runner on each extracted record
|
20
19
|
|
21
20
|
# The domain or provider-specific id of the record this transformer is working on.
|
22
|
-
#
|
23
|
-
#
|
24
|
-
|
21
|
+
# It is useful for:
|
22
|
+
# - de-duping records that might exist in the loader's destination
|
23
|
+
# - building a cursor so an extractor doesn't have to start from the beginning of a
|
24
|
+
# source
|
25
|
+
def id
|
26
|
+
raise NotImplementedError
|
27
|
+
end
|
25
28
|
|
26
29
|
# The domain or provider-specific timestamp of the record this transformer is working on.
|
27
30
|
# Used for building a cursor so an extractor doesn't have to start from the beginning of a
|
28
31
|
# data source.
|
29
|
-
def timestamp
|
32
|
+
def timestamp
|
33
|
+
raise NotImplementedError
|
34
|
+
end
|
35
|
+
|
36
|
+
# An optional, human-readable identifier for a transformation, intended for debugging or logging.
|
37
|
+
# By default, it is just the id.
|
38
|
+
def friendly_identifier
|
39
|
+
id
|
40
|
+
end
|
41
|
+
|
42
|
+
# Human-readable summary in the form "[<timestamp>] <identifier>".
# The timestamp is rendered as ISO 8601, or "???" when it is nil or cannot be
# built. The identifier is #friendly_identifier, or the class name when that
# is nil or cannot be built.
def to_s
  stamp =
    begin
      timestamp&.iso8601 || "???"
    rescue TransformationError, NotImplementedError
      "???"
    end

  class_name = self.class.to_s
  label =
    begin
      friendly_identifier || class_name
    rescue TransformationError, NotImplementedError
      class_name
    end

  "[#{stamp}] #{label}"
end
|
30
59
|
end
|
31
60
|
end
|
32
61
|
end
|
33
62
|
|
34
|
-
require_relative 'json_transformer'
|
35
63
|
require_relative 'null_transformer'
|
64
|
+
require_relative 'image_file_transformer'
|
@@ -0,0 +1,21 @@
|
|
1
|
+
require 'marcel'
|
2
|
+
require 'base64'
|
3
|
+
|
4
|
+
module Chronicle
  module ETL
    module Utils
      # Utility methods for dealing with binary files
      module BinaryAttachments
        # Encodes a file as a base64 data URI.
        #
        # @param filename [String] path of the file to encode
        # @param mimetype [String, nil] MIME type to embed; guessed from the
        #   file name when omitted
        # @return [String] "data:<mimetype>;base64,<payload>"
        def self.filename_to_base64(filename:, mimetype: nil)
          mimetype ||= guess_mimetype(filename: filename)

          # binread returns the raw bytes: no newline translation and no
          # text-mode encoding handling, which matters for image data.
          "data:#{mimetype};base64,#{Base64.strict_encode64(File.binread(filename))}"
        end

        # Guesses a MIME type for the given file name using Marcel.
        def self.guess_mimetype(filename:)
          Marcel::MimeType.for(filename)
        end
      end
    end
  end
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
module Chronicle
  module ETL
    module Utils
      # Helpers for reshaping hashes
      module HashUtilities
        # Collapses nested hashes into a single level. Keys of nested hashes
        # are joined to their parent key with "." and converted to symbols;
        # top-level keys with non-hash values are kept as they are.
        #
        # @param hash [Hash] possibly nested hash
        # @return [Hash] a new single-level hash
        def self.flatten_hash(hash)
          flat = {}

          hash.each do |key, value|
            if value.is_a?(Hash)
              flatten_hash(value).each do |sub_key, sub_value|
                flat[:"#{key}.#{sub_key}"] = sub_value
              end
            else
              flat[key] = value
            end
          end

          flat
        end
      end
    end
  end
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
require 'active_support/core_ext/object/blank'
|
2
|
+
|
3
|
+
module Chronicle
  module ETL
    module Utils
      # OCR for image files
      # TODO: add other strategies and document `macocr`
      module TextRecognition
        # Runs the `macocr` command on an image and returns what it prints.
        #
        # @param filename [String] path of the image to read
        # @return [String, nil] the command's output, or nil when the output
        #   is blank or the `macocr` executable cannot be found
        def self.recognize_in_image(filename:)
          # Pass the path as an argv element rather than interpolating it
          # into a shell string, so quotes or shell metacharacters in a file
          # name cannot run other commands. stderr is discarded as before.
          output = IO.popen(['macocr', filename.to_s], err: File::NULL, &:read)
          output.presence
        rescue Errno::ENOENT
          # Without a shell, a missing executable raises instead of producing
          # empty output; keep the earlier "no text" result.
          nil
        end
      end
    end
  end
end
|
data/lib/chronicle/etl.rb
CHANGED
@@ -1,9 +1,24 @@
|
|
1
|
-
require_relative 'etl/
|
1
|
+
require_relative 'etl/registry/registry'
|
2
2
|
require_relative 'etl/config'
|
3
3
|
require_relative 'etl/exceptions'
|
4
|
+
require_relative 'etl/extraction'
|
4
5
|
require_relative 'etl/extractors/extractor'
|
6
|
+
require_relative 'etl/job_definition'
|
7
|
+
require_relative 'etl/job_log'
|
8
|
+
require_relative 'etl/job_logger'
|
9
|
+
require_relative 'etl/job'
|
5
10
|
require_relative 'etl/loaders/loader'
|
11
|
+
require_relative 'etl/logger'
|
12
|
+
require_relative 'etl/models/activity'
|
13
|
+
require_relative 'etl/models/attachment'
|
14
|
+
require_relative 'etl/models/base'
|
15
|
+
require_relative 'etl/models/entity'
|
16
|
+
require_relative 'etl/models/generic'
|
6
17
|
require_relative 'etl/runner'
|
18
|
+
require_relative 'etl/serializers/serializer'
|
7
19
|
require_relative 'etl/transformers/transformer'
|
20
|
+
require_relative 'etl/utils/binary_attachments'
|
21
|
+
require_relative 'etl/utils/hash_utilities'
|
22
|
+
require_relative 'etl/utils/text_recognition'
|
8
23
|
require_relative 'etl/utils/progress_bar'
|
9
24
|
require_relative 'etl/version'
|