chronicle-etl 0.2.1 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +3 -0
- data/.rubocop.yml +3 -0
- data/README.md +22 -15
- data/chronicle-etl.gemspec +11 -5
- data/lib/chronicle/etl/cli/connectors.rb +19 -7
- data/lib/chronicle/etl/cli/jobs.rb +38 -27
- data/lib/chronicle/etl/cli/main.rb +10 -2
- data/lib/chronicle/etl/config.rb +24 -3
- data/lib/chronicle/etl/exceptions.rb +30 -0
- data/lib/chronicle/etl/extraction.rb +12 -0
- data/lib/chronicle/etl/extractors/csv_extractor.rb +43 -37
- data/lib/chronicle/etl/extractors/extractor.rb +19 -1
- data/lib/chronicle/etl/extractors/file_extractor.rb +15 -33
- data/lib/chronicle/etl/extractors/helpers/filesystem_reader.rb +104 -0
- data/lib/chronicle/etl/extractors/json_extractor.rb +45 -0
- data/lib/chronicle/etl/extractors/stdin_extractor.rb +6 -1
- data/lib/chronicle/etl/job.rb +72 -0
- data/lib/chronicle/etl/job_definition.rb +89 -0
- data/lib/chronicle/etl/job_log.rb +95 -0
- data/lib/chronicle/etl/job_logger.rb +81 -0
- data/lib/chronicle/etl/loaders/csv_loader.rb +6 -6
- data/lib/chronicle/etl/loaders/loader.rb +2 -2
- data/lib/chronicle/etl/loaders/rest_loader.rb +16 -9
- data/lib/chronicle/etl/loaders/stdout_loader.rb +8 -3
- data/lib/chronicle/etl/loaders/table_loader.rb +58 -7
- data/lib/chronicle/etl/logger.rb +48 -0
- data/lib/chronicle/etl/models/activity.rb +15 -0
- data/lib/chronicle/etl/models/attachment.rb +14 -0
- data/lib/chronicle/etl/models/base.rb +119 -0
- data/lib/chronicle/etl/models/entity.rb +21 -0
- data/lib/chronicle/etl/models/generic.rb +23 -0
- data/lib/chronicle/etl/registry/connector_registration.rb +61 -0
- data/lib/chronicle/etl/registry/registry.rb +52 -0
- data/lib/chronicle/etl/registry/self_registering.rb +25 -0
- data/lib/chronicle/etl/runner.rb +70 -42
- data/lib/chronicle/etl/serializers/jsonapi_serializer.rb +25 -0
- data/lib/chronicle/etl/serializers/serializer.rb +27 -0
- data/lib/chronicle/etl/transformers/image_file_transformer.rb +253 -0
- data/lib/chronicle/etl/transformers/null_transformer.rb +12 -4
- data/lib/chronicle/etl/transformers/transformer.rb +42 -12
- data/lib/chronicle/etl/utils/binary_attachments.rb +21 -0
- data/lib/chronicle/etl/utils/hash_utilities.rb +19 -0
- data/lib/chronicle/etl/utils/progress_bar.rb +3 -1
- data/lib/chronicle/etl/utils/text_recognition.rb +15 -0
- data/lib/chronicle/etl/version.rb +1 -1
- data/lib/chronicle/etl.rb +17 -1
- metadata +138 -35
- data/CHANGELOG.md +0 -23
- data/Gemfile.lock +0 -85
- data/lib/chronicle/etl/catalog.rb +0 -62
- data/lib/chronicle/etl/transformers/json_transformer.rb +0 -11
@@ -0,0 +1,253 @@
|
|
1
|
+
require 'mini_exiftool'
|
2
|
+
require 'active_support'
|
3
|
+
require 'active_support/core_ext/object'
|
4
|
+
require 'active_support/core_ext/time'
|
5
|
+
require 'active_support/core_ext/hash/reverse_merge'
|
6
|
+
require 'active_support/core_ext/string/inflections'
|
7
|
+
|
8
|
+
module Chronicle
|
9
|
+
module ETL
|
10
|
+
# Transform a JPEG or other image file into a record.
|
11
|
+
# By default, file mtime and a hash of the file content are used to build
|
12
|
+
# the timestamp and ID respectively but other options are available (such
|
13
|
+
# as reading EXIF tags or extended attributes from the filesystem).
|
14
|
+
#
|
15
|
+
# TODO: This should be extracted into its own plugin
|
16
|
+
class ImageFileTransformer < Chronicle::ETL::Transformer
|
17
|
+
register_connector do |r|
|
18
|
+
r.identifier = 'image-file'
|
19
|
+
r.description = 'an image file'
|
20
|
+
end
|
21
|
+
|
22
|
+
DEFAULT_OPTIONS = {
|
23
|
+
timestamp_strategy: 'file_mtime',
|
24
|
+
id_strategy: 'file_hash',
|
25
|
+
verb: 'photographed',
|
26
|
+
|
27
|
+
# EXIF tags often don't have timezones
|
28
|
+
timezone_default: 'Eastern Time (US & Canada)',
|
29
|
+
include_image_data: true
|
30
|
+
}.freeze
|
31
|
+
|
32
|
+
def initialize(*args)
|
33
|
+
super(*args)
|
34
|
+
@options = @options.reverse_merge(DEFAULT_OPTIONS)
|
35
|
+
end
|
36
|
+
|
37
|
+
# Build a record from the image file referenced by the current extraction.
#
# The open handle is stored in @file because the private builders and
# #friendly_identifier read it. It is closed before this method returns,
# including when a builder raises (e.g. when no id or timestamp can be
# built), so a failed record does not leak a file descriptor.
#
# @return [Chronicle::ETL::Models::Activity] the record built by #build_created
def transform
  # FIXME: set @filename; use block for reading file when necessary
  @file = File.open(@extraction.data)
  build_created(@file)
ensure
  # @file is still nil if File.open itself raised
  @file&.close
end
|
44
|
+
|
45
|
+
def friendly_identifier
|
46
|
+
@file.path
|
47
|
+
end
|
48
|
+
|
49
|
+
def id
|
50
|
+
@id ||= begin
|
51
|
+
id = build_with_strategy(field: :id, strategy: @options[:id_strategy])
|
52
|
+
raise UntransformableRecordError.new("Could not build id", transformation: self) unless id
|
53
|
+
|
54
|
+
id
|
55
|
+
end
|
56
|
+
end
|
57
|
+
|
58
|
+
def timestamp
|
59
|
+
@timestamp ||= begin
|
60
|
+
ts = build_with_strategy(field: :timestamp, strategy: @options[:timestamp_strategy])
|
61
|
+
raise UntransformableRecordError.new("Could not build timestamp", transformation: self) unless ts
|
62
|
+
|
63
|
+
ts
|
64
|
+
end
|
65
|
+
end
|
66
|
+
|
67
|
+
private
|
68
|
+
|
69
|
+
def build_created(file)
|
70
|
+
record = ::Chronicle::ETL::Models::Activity.new
|
71
|
+
record.verb = @options[:verb]
|
72
|
+
record.provider = @options[:provider]
|
73
|
+
record.provider_id = id
|
74
|
+
record.end_at = timestamp
|
75
|
+
record.dedupe_on = [[:provider_id, :verb, :provider]]
|
76
|
+
|
77
|
+
record.involved = build_image
|
78
|
+
record.actor = build_actor
|
79
|
+
|
80
|
+
record.assign_attributes(build_gps)
|
81
|
+
record
|
82
|
+
end
|
83
|
+
|
84
|
+
def build_actor
|
85
|
+
actor = ::Chronicle::ETL::Models::Entity.new
|
86
|
+
actor.represents = 'identity'
|
87
|
+
actor.provider = @options[:actor][:provider]
|
88
|
+
actor.slug = @options[:actor][:slug]
|
89
|
+
actor.dedupe_on = [[:provider, :slug, :represents]]
|
90
|
+
actor
|
91
|
+
end
|
92
|
+
|
93
|
+
def build_image
|
94
|
+
image = ::Chronicle::ETL::Models::Entity.new
|
95
|
+
image.represents = @options[:involved][:represents]
|
96
|
+
image.title = build_title
|
97
|
+
image.body = exif['Description']
|
98
|
+
image.provider = @options[:involved][:provider]
|
99
|
+
image.provider_id = id
|
100
|
+
image.assign_attributes(build_gps)
|
101
|
+
image.dedupe_on = [[:provider, :provider_id, :represents]]
|
102
|
+
|
103
|
+
if @options[:ocr_strategy]
|
104
|
+
ocr_text = build_with_strategy(field: :ocr, strategy: @options[:ocr_strategy])
|
105
|
+
image.metadata[:ocr_text] = ocr_text if ocr_text
|
106
|
+
end
|
107
|
+
|
108
|
+
names = extract_people_depicted
|
109
|
+
tags = extract_keywords(names)
|
110
|
+
|
111
|
+
image.depicts = build_people_depicted(names)
|
112
|
+
image.abouts = build_keywords(tags)
|
113
|
+
|
114
|
+
if @options[:include_image_data]
|
115
|
+
attachment = ::Chronicle::ETL::Models::Attachment.new
|
116
|
+
attachment.data = build_image_data
|
117
|
+
image.attachments = [attachment]
|
118
|
+
end
|
119
|
+
|
120
|
+
image
|
121
|
+
end
|
122
|
+
|
123
|
+
def build_keywords(topics)
|
124
|
+
topics.map do |topic|
|
125
|
+
t = ::Chronicle::ETL::Models::Entity.new
|
126
|
+
t.represents = 'topic'
|
127
|
+
t.provider = @options[:involved][:provider]
|
128
|
+
t.title = topic
|
129
|
+
t.slug = topic.parameterize
|
130
|
+
t.dedupe_on = [[:provider, :represents, :slug]]
|
131
|
+
t
|
132
|
+
end
|
133
|
+
end
|
134
|
+
|
135
|
+
def build_people_depicted(names)
|
136
|
+
names.map do |name|
|
137
|
+
identity = ::Chronicle::ETL::Models::Entity.new
|
138
|
+
identity.represents = 'identity'
|
139
|
+
identity.provider = @options[:involved][:provider]
|
140
|
+
identity.slug = name.parameterize
|
141
|
+
identity.title = name
|
142
|
+
identity.dedupe_on = [[:provider, :represents, :slug]]
|
143
|
+
identity
|
144
|
+
end
|
145
|
+
end
|
146
|
+
|
147
|
+
def build_gps
|
148
|
+
return {} unless exif['GPSLatitude']
|
149
|
+
|
150
|
+
{
|
151
|
+
lat: exif['GPSLatitude'],
|
152
|
+
lng: exif['GPSLongitude'],
|
153
|
+
elevation: exif['GPSAltitude']
|
154
|
+
}
|
155
|
+
end
|
156
|
+
|
157
|
+
def build_image_data
|
158
|
+
::Chronicle::ETL::Utils::BinaryAttachments.filename_to_base64(filename: @file.path)
|
159
|
+
end
|
160
|
+
|
161
|
+
def build_title
|
162
|
+
File.basename(@file)
|
163
|
+
end
|
164
|
+
|
165
|
+
def build_with_strategy(field:, strategy:[])
|
166
|
+
strategies = [strategy].flatten.compact
|
167
|
+
strategies.each do |s|
|
168
|
+
builder_method = "build_#{field}_using_#{s}"
|
169
|
+
result = send(builder_method.to_sym)
|
170
|
+
return result if result
|
171
|
+
end
|
172
|
+
return
|
173
|
+
end
|
174
|
+
|
175
|
+
# ID strategy `file_hash`: the SHA-256 hex digest of the file's contents.
#
# @return [String] lowercase hexadecimal SHA-256 digest of the file at @file.path
def build_id_using_file_hash
  # None of the requires at the top of this file load Digest, so load it here
  # instead of relying on another library having done so.
  require 'digest'

  # Digest::SHA256.file reads the file in chunks, so a large image is not
  # held in memory in one piece.
  Digest::SHA256.file(@file.path).hexdigest
end
|
178
|
+
|
179
|
+
def build_id_using_xattr_version
|
180
|
+
load_value_from_xattr_plist("com.apple.metadata:kMDItemVersion")
|
181
|
+
end
|
182
|
+
|
183
|
+
def build_id_using_xmp_document_id
|
184
|
+
exif['OriginalDocumentID'] || exif['DerivedFromDocumentID']
|
185
|
+
end
|
186
|
+
|
187
|
+
def build_timestamp_using_file_mtime
|
188
|
+
File.mtime(@file)
|
189
|
+
end
|
190
|
+
|
191
|
+
def build_timestamp_using_exif_datetimeoriginal
|
192
|
+
# EXIF tags don't have timezone information. This is a DateTime in UTC
|
193
|
+
timestamp = exif['DateTimeOriginal'] || return
|
194
|
+
|
195
|
+
if exif['OffsetTimeOriginal']
|
196
|
+
# Offset tags are only available in newer EXIF tags. If it exists, we
|
197
|
+
# use it instead of UTC
|
198
|
+
timestamp = timestamp.change(offset: exif['OffsetTimeOriginal'])
|
199
|
+
elsif false
|
200
|
+
# TODO: support option of using GPS coordinates to determine timezone
|
201
|
+
else
|
202
|
+
zone = ActiveSupport::TimeZone.new(@options[:timezone_default])
|
203
|
+
timestamp = zone.parse(timestamp.asctime)
|
204
|
+
end
|
205
|
+
|
206
|
+
timestamp
|
207
|
+
end
|
208
|
+
|
209
|
+
# TODO: add documentation for how to set up `macocr`
|
210
|
+
# OCR strategy `macocr`: text recognised in the image by the `macocr` tool.
#
# Delegates to Utils::TextRecognition so the command invocation lives in one
# place instead of being duplicated here.
#
# @return [String, nil] whatever the helper returns for @file.path
def build_ocr_using_macocr
  ::Chronicle::ETL::Utils::TextRecognition.recognize_in_image(filename: @file.path)
end
|
213
|
+
|
214
|
+
def exif
|
215
|
+
@exif ||= MiniExiftool.new(
|
216
|
+
@file.path,
|
217
|
+
numerical: true,
|
218
|
+
|
219
|
+
# EXIF timestamps don't have timezone information. MiniExifTool uses Time
|
220
|
+
# by default which parses timestamps in local time zone. Using DateTime
|
221
|
+
# parses dates as UTC and then we can apply a timezone offset if the optional
|
222
|
+
# EXIF timezone offset fields are available.
|
223
|
+
# https://github.com/janfri/mini_exiftool/issues/39#issuecomment-832587649
|
224
|
+
timestamps: DateTime
|
225
|
+
)
|
226
|
+
end
|
227
|
+
|
228
|
+
# Figure out which faces are tagged as regions and return a list of their names
|
229
|
+
def extract_people_depicted
|
230
|
+
return [] unless exif['RegionName']
|
231
|
+
|
232
|
+
names = [exif['RegionName']].flatten
|
233
|
+
types = [exif['RegionType']].flatten
|
234
|
+
|
235
|
+
names.zip(types).select{|x| x[1] == 'Face'}.map{|x| x[0]}.uniq
|
236
|
+
end
|
237
|
+
|
238
|
+
# Extract image keywords from the EXIF/IPTC tag and subtract out those that are
|
239
|
+
# tagged people (determined by looking at face regions)
|
240
|
+
def extract_keywords(people_names = [])
|
241
|
+
[exif['Keywords'] || []].flatten - people_names
|
242
|
+
end
|
243
|
+
|
244
|
+
# Read an extended attribute of @file that is stored as a property list and
# return the text of the <string> elements in it.
#
# The value is piped through `xattr -p`, `xxd -r -p` and
# `plutil -convert xml1`, and the resulting XML is parsed with Nokogiri.
#
# @param attribute [String] extended attribute name,
#   e.g. "com.apple.metadata:kMDItemVersion"
# @return [String, nil] concatenated text of all <string> nodes, or nil when
#   the pipeline produces no output or the text is blank
def load_value_from_xattr_plist(attribute)
  require 'shellwords'

  # Escape both interpolated values: a path containing quotes, `$` or
  # backticks would otherwise be interpreted by the shell.
  command = "xattr -p #{Shellwords.escape(attribute)} #{Shellwords.escape(@file.path)} " \
            '| xxd -r -p | plutil -convert xml1 -o - -- - 2>/dev/null'
  xml = `#{command}`

  # Backticks return an empty String (never nil) when the pipeline fails
  return if xml.strip.empty?

  # Loaded lazily so nokogiri is only needed when this strategy finds data
  require 'nokogiri'
  Nokogiri::XML.parse(xml).xpath('//string').text.presence
end
|
251
|
+
end
|
252
|
+
end
|
253
|
+
end
|
@@ -1,10 +1,18 @@
|
|
1
1
|
module Chronicle
|
2
2
|
module ETL
|
3
3
|
class NullTransformer < Chronicle::ETL::Transformer
|
4
|
-
|
5
|
-
|
4
|
+
register_connector do |r|
|
5
|
+
r.identifier = 'null'
|
6
|
+
r.description = 'in no way'
|
7
|
+
end
|
8
|
+
|
9
|
+
def transform
|
10
|
+
Chronicle::ETL::Models::Generic.new(@extraction.data)
|
6
11
|
end
|
7
|
-
end
|
8
12
|
|
13
|
+
def timestamp; end
|
14
|
+
|
15
|
+
def id; end
|
16
|
+
end
|
9
17
|
end
|
10
|
-
end
|
18
|
+
end
|
@@ -2,33 +2,63 @@ module Chronicle
|
|
2
2
|
module ETL
|
3
3
|
# Abstract class representing a Transformer for an ETL job
|
4
4
|
class Transformer
|
5
|
-
extend Chronicle::ETL::
|
5
|
+
extend Chronicle::ETL::Registry::SelfRegistering
|
6
6
|
|
7
7
|
# Construct a new instance of this transformer. Options are passed in from a Runner
|
8
|
-
# ==
|
8
|
+
# == Parameters:
|
9
9
|
# options::
|
10
10
|
# Options for configuring this Transformer
|
11
|
-
def initialize(options = {})
|
11
|
+
def initialize(options = {}, extraction)
|
12
12
|
@options = options
|
13
|
+
@extraction = extraction
|
13
14
|
end
|
14
15
|
|
15
|
-
#
|
16
|
-
|
17
|
-
|
18
|
-
end
|
16
|
+
# @abstract Subclass is expected to implement #transform
|
17
|
+
# @!method transform
|
18
|
+
# The main entrypoint for transforming a record. Called by a Runner on each extracted record
|
19
19
|
|
20
20
|
# The domain or provider-specific id of the record this transformer is working on.
|
21
|
-
#
|
22
|
-
#
|
23
|
-
|
21
|
+
# It is useful for:
|
22
|
+
# - de-duping records that might exist in the loader's destination
|
23
|
+
# - building a cursor so an extractor doesn't have to start from the beginning of a
|
24
|
+
# source
|
25
|
+
def id
|
26
|
+
raise NotImplementedError
|
27
|
+
end
|
24
28
|
|
25
29
|
# The domain or provider-specific timestamp of the record this transformer is working on.
|
26
30
|
# Used for building a cursor so an extractor doesn't have to start from the beginning of a
|
27
31
|
# data source.
|
28
|
-
def timestamp
|
32
|
+
def timestamp
|
33
|
+
raise NotImplementedError
|
34
|
+
end
|
35
|
+
|
36
|
+
# An optional, human-readable identifier for a transformation, intended for debugging or logging.
|
37
|
+
# By default, it is just the id.
|
38
|
+
def friendly_identifier
|
39
|
+
id
|
40
|
+
end
|
41
|
+
|
42
|
+
def to_s
|
43
|
+
ts = begin
|
44
|
+
unknown = "???"
|
45
|
+
timestamp&.iso8601 || unknown
|
46
|
+
rescue TransformationError, NotImplementedError
|
47
|
+
unknown
|
48
|
+
end
|
49
|
+
|
50
|
+
identifier = begin
|
51
|
+
unknown = self.class.to_s
|
52
|
+
friendly_identifier || self.class.to_s
|
53
|
+
rescue TransformationError, NotImplementedError
|
54
|
+
unknown
|
55
|
+
end
|
56
|
+
|
57
|
+
"[#{ts}] #{identifier}"
|
58
|
+
end
|
29
59
|
end
|
30
60
|
end
|
31
61
|
end
|
32
62
|
|
33
|
-
require_relative 'json_transformer'
|
34
63
|
require_relative 'null_transformer'
|
64
|
+
require_relative 'image_file_transformer'
|
@@ -0,0 +1,21 @@
|
|
1
|
+
require 'marcel'
|
2
|
+
require 'base64'
|
3
|
+
|
4
|
+
module Chronicle
  module ETL
    module Utils
      # Utility methods for dealing with binary files
      module BinaryAttachments
        # Encode a file as a base64 `data:` URI.
        #
        # @param filename [String] path of the file to encode
        # @param mimetype [String, nil] MIME type to put in the URI; guessed
        #   with .guess_mimetype when nil
        # @return [String] "data:<mimetype>;base64,<payload>"
        def self.filename_to_base64(filename:, mimetype: nil)
          mimetype ||= guess_mimetype(filename: filename)

          # binread returns the raw bytes, with no encoding or newline translation
          "data:#{mimetype};base64,#{Base64.strict_encode64(File.binread(filename))}"
        end

        # Guess the MIME type of a file from its content, falling back to its name.
        #
        # Marcel expects a Pathname or IO as its positional argument; a bare
        # String is sniffed as if it were the file's content, so the file is
        # opened here and its basename passed as a hint.
        #
        # @param filename [String] path of the file to inspect
        # @return [String] the MIME type reported by Marcel
        def self.guess_mimetype(filename:)
          File.open(filename, 'rb') do |io|
            Marcel::MimeType.for(io, name: File.basename(filename))
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
module Chronicle
  module ETL
    module Utils
      # Helpers for reshaping Hash structures
      module HashUtilities
        # Flatten a nested hash into a single level.
        #
        # Keys on the path to a nested value are joined with "." and converted
        # to a Symbol; keys whose value is not a Hash are kept as they are. A
        # key whose value is an empty Hash contributes no entries.
        #
        #   flatten_hash({ a: { b: 1 }, c: 2 }) # => { :"a.b" => 1, c: 2 }
        #
        # @param hash [Hash] possibly nested hash
        # @return [Hash] new single-level hash; the input is not modified
        def self.flatten_hash(hash)
          hash.each_with_object({}) do |(key, value), flat|
            if value.is_a?(Hash)
              # each, not map: only the writes into `flat` matter here
              flatten_hash(value).each do |nested_key, nested_value|
                flat["#{key}.#{nested_key}".to_sym] = nested_value
              end
            else
              flat[key] = value
            end
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
require 'active_support/core_ext/object/blank'
|
2
|
+
|
3
|
+
module Chronicle
  module ETL
    module Utils
      # OCR for image files
      # TODO: add other strategies and document `macocr`
      module TextRecognition
        # Run the `macocr` command-line tool on an image and return what it
        # prints to standard output.
        #
        # The command is started from an argument array, so no shell parses the
        # filename: quotes, `$`, backticks or spaces in a path cannot break or
        # alter the command. Standard error is discarded.
        #
        # @param filename [String] path of the image file
        # @return [String, nil] the tool's output, or nil when the output is
        #   blank or the command could not be started (e.g. macocr is not
        #   installed)
        def self.recognize_in_image(filename:)
          IO.popen(['macocr', filename.to_s], err: File::NULL, &:read).presence
        rescue SystemCallError
          # Best-effort: a missing or non-executable tool yields no text
          nil
        end
      end
    end
  end
end
|
data/lib/chronicle/etl.rb
CHANGED
@@ -1,8 +1,24 @@
|
|
1
|
-
require_relative 'etl/
|
1
|
+
require_relative 'etl/registry/registry'
|
2
2
|
require_relative 'etl/config'
|
3
|
+
require_relative 'etl/exceptions'
|
4
|
+
require_relative 'etl/extraction'
|
3
5
|
require_relative 'etl/extractors/extractor'
|
6
|
+
require_relative 'etl/job_definition'
|
7
|
+
require_relative 'etl/job_log'
|
8
|
+
require_relative 'etl/job_logger'
|
9
|
+
require_relative 'etl/job'
|
4
10
|
require_relative 'etl/loaders/loader'
|
11
|
+
require_relative 'etl/logger'
|
12
|
+
require_relative 'etl/models/activity'
|
13
|
+
require_relative 'etl/models/attachment'
|
14
|
+
require_relative 'etl/models/base'
|
15
|
+
require_relative 'etl/models/entity'
|
16
|
+
require_relative 'etl/models/generic'
|
5
17
|
require_relative 'etl/runner'
|
18
|
+
require_relative 'etl/serializers/serializer'
|
6
19
|
require_relative 'etl/transformers/transformer'
|
20
|
+
require_relative 'etl/utils/binary_attachments'
|
21
|
+
require_relative 'etl/utils/hash_utilities'
|
22
|
+
require_relative 'etl/utils/text_recognition'
|
7
23
|
require_relative 'etl/utils/progress_bar'
|
8
24
|
require_relative 'etl/version'
|