chronicle-etl 0.5.5 → 0.6.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.github/workflows/ruby.yml +15 -25
- data/.rubocop.yml +2 -44
- data/Gemfile +2 -2
- data/Guardfile +3 -3
- data/README.md +75 -68
- data/Rakefile +2 -2
- data/bin/console +4 -5
- data/chronicle-etl.gemspec +51 -49
- data/exe/chronicle-etl +1 -1
- data/lib/chronicle/etl/authorizer.rb +3 -4
- data/lib/chronicle/etl/cli/authorizations.rb +8 -6
- data/lib/chronicle/etl/cli/connectors.rb +7 -7
- data/lib/chronicle/etl/cli/jobs.rb +130 -53
- data/lib/chronicle/etl/cli/main.rb +29 -29
- data/lib/chronicle/etl/cli/plugins.rb +14 -15
- data/lib/chronicle/etl/cli/secrets.rb +14 -12
- data/lib/chronicle/etl/cli/subcommand_base.rb +5 -3
- data/lib/chronicle/etl/config.rb +18 -8
- data/lib/chronicle/etl/configurable.rb +20 -9
- data/lib/chronicle/etl/exceptions.rb +3 -3
- data/lib/chronicle/etl/extraction.rb +12 -2
- data/lib/chronicle/etl/extractors/csv_extractor.rb +9 -0
- data/lib/chronicle/etl/extractors/extractor.rb +15 -2
- data/lib/chronicle/etl/extractors/file_extractor.rb +5 -3
- data/lib/chronicle/etl/extractors/helpers/input_reader.rb +2 -2
- data/lib/chronicle/etl/extractors/json_extractor.rb +14 -4
- data/lib/chronicle/etl/extractors/stdin_extractor.rb +3 -0
- data/lib/chronicle/etl/job.rb +35 -17
- data/lib/chronicle/etl/job_definition.rb +38 -26
- data/lib/chronicle/etl/job_log.rb +14 -16
- data/lib/chronicle/etl/job_logger.rb +4 -4
- data/lib/chronicle/etl/loaders/csv_loader.rb +17 -4
- data/lib/chronicle/etl/loaders/helpers/stdout_helper.rb +4 -0
- data/lib/chronicle/etl/loaders/json_loader.rb +30 -10
- data/lib/chronicle/etl/loaders/loader.rb +0 -17
- data/lib/chronicle/etl/loaders/rest_loader.rb +7 -7
- data/lib/chronicle/etl/loaders/table_loader.rb +37 -12
- data/lib/chronicle/etl/logger.rb +2 -2
- data/lib/chronicle/etl/oauth_authorizer.rb +8 -8
- data/lib/chronicle/etl/record.rb +15 -0
- data/lib/chronicle/etl/registry/connector_registration.rb +15 -23
- data/lib/chronicle/etl/registry/connectors.rb +93 -36
- data/lib/chronicle/etl/registry/plugin_registration.rb +1 -1
- data/lib/chronicle/etl/registry/plugins.rb +27 -19
- data/lib/chronicle/etl/runner.rb +158 -128
- data/lib/chronicle/etl/secrets.rb +4 -4
- data/lib/chronicle/etl/transformers/buffer_transformer.rb +29 -0
- data/lib/chronicle/etl/transformers/chronicle_transformer.rb +32 -0
- data/lib/chronicle/etl/transformers/chronobase_transformer.rb +100 -0
- data/lib/chronicle/etl/transformers/fields_limit_transformer.rb +23 -0
- data/lib/chronicle/etl/transformers/filter_fields_transformer.rb +60 -0
- data/lib/chronicle/etl/transformers/filter_transformer.rb +30 -0
- data/lib/chronicle/etl/transformers/format_transformer.rb +32 -0
- data/lib/chronicle/etl/transformers/merge_meta_transformer.rb +19 -0
- data/lib/chronicle/etl/transformers/multiply_transformer.rb +21 -0
- data/lib/chronicle/etl/transformers/null_transformer.rb +5 -7
- data/lib/chronicle/etl/transformers/sampler_transformer.rb +21 -0
- data/lib/chronicle/etl/transformers/sort_transformer.rb +31 -0
- data/lib/chronicle/etl/transformers/transformer.rb +63 -41
- data/lib/chronicle/etl/utils/binary_attachments.rb +1 -1
- data/lib/chronicle/etl/utils/progress_bar.rb +2 -3
- data/lib/chronicle/etl/version.rb +1 -1
- data/lib/chronicle/etl.rb +6 -8
- metadata +49 -47
- data/lib/chronicle/etl/models/activity.rb +0 -15
- data/lib/chronicle/etl/models/attachment.rb +0 -14
- data/lib/chronicle/etl/models/base.rb +0 -122
- data/lib/chronicle/etl/models/entity.rb +0 -29
- data/lib/chronicle/etl/models/raw.rb +0 -26
- data/lib/chronicle/etl/serializers/jsonapi_serializer.rb +0 -31
- data/lib/chronicle/etl/serializers/raw_serializer.rb +0 -10
- data/lib/chronicle/etl/serializers/serializer.rb +0 -28
- data/lib/chronicle/etl/transformers/image_file_transformer.rb +0 -247
- data/lib/chronicle/etl/utils/hash_utilities.rb +0 -19
- data/lib/chronicle/etl/utils/text_recognition.rb +0 -15
@@ -1,247 +0,0 @@
|
|
1
|
-
require 'mini_exiftool'
|
2
|
-
require 'active_support'
|
3
|
-
require 'active_support/core_ext/object'
|
4
|
-
require 'active_support/core_ext/time'
|
5
|
-
require 'active_support/core_ext/hash/reverse_merge'
|
6
|
-
require 'active_support/core_ext/string/inflections'
|
7
|
-
|
8
|
-
require 'digest'
require 'shellwords'

module Chronicle
  module ETL
    # Transform a JPEG or other image file into a record.
    # By default, file mtime and a hash of the file content are used to build
    # the timestamp and ID respectively, but other options are available (such
    # as reading EXIF tags or extended attributes from the filesystem).
    #
    # TODO: This should be extracted into its own plugin
    class ImageFileTransformer < Chronicle::ETL::Transformer
      register_connector do |r|
        r.identifier = 'image-file'
        r.description = 'an image file'
      end

      setting :timestamp_strategy, default: 'file_mtime'
      setting :id_strategy, default: 'file_hash'
      setting :verb, default: 'photographed'
      # EXIF tags often don't have timezones
      setting :timezone_default, default: 'Eastern Time (US & Canada)'
      setting :include_image_data, default: true
      setting :actor
      setting :involved
      # NOTE(review): the code below also reads @config.ocr_strategy and
      # @config.provider, which are not declared as settings in this class —
      # confirm they are provided by the base class or the job configuration.

      # Open the file named by the extraction and build an activity record
      # from it.
      #
      # @return [Chronicle::ETL::Models::Activity] the built record
      def transform
        # FIXME: set @filename; use block for reading file when necessary
        @file = File.open(@extraction.data)
        begin
          build_created(@file)
        ensure
          # Close the handle even when building the record raises. @file stays
          # assigned because #friendly_identifier reads its path afterwards.
          @file.close
        end
      end

      # @return [String] path of the file being transformed
      def friendly_identifier
        @file.path
      end

      # Memoized record ID, built with the configured `id_strategy`.
      #
      # @raise [UntransformableRecordError] if no strategy yields a value
      def id
        @id ||= begin
          id = build_with_strategy(field: :id, strategy: @config.id_strategy)
          raise(UntransformableRecordError, 'Could not build id') unless id

          id
        end
      end

      # Memoized record timestamp, built with the configured
      # `timestamp_strategy`.
      #
      # @raise [UntransformableRecordError] if no strategy yields a value
      def timestamp
        @timestamp ||= begin
          ts = build_with_strategy(field: :timestamp, strategy: @config.timestamp_strategy)
          raise(UntransformableRecordError, 'Could not build timestamp') unless ts

          ts
        end
      end

      private

      # Assemble the activity record: verb/provider/id/timestamp, the image
      # entity it involves, the actor, and GPS attributes when present.
      def build_created(_file)
        record = ::Chronicle::ETL::Models::Activity.new
        record.verb = @config.verb
        record.provider = @config.provider
        record.provider_id = id
        record.end_at = timestamp
        record.dedupe_on = [%i[provider_id verb provider]]

        record.involved = build_image
        record.actor = build_actor

        record.assign_attributes(build_gps)
        record
      end

      # Build the identity entity for the actor from the `actor` setting.
      # NOTE(review): `actor` has no default, so this presumably fails with
      # NoMethodError when the setting is not configured — confirm.
      def build_actor
        actor = ::Chronicle::ETL::Models::Entity.new
        actor.represents = 'identity'
        actor.provider = @config.actor[:provider]
        actor.slug = @config.actor[:slug]
        actor.dedupe_on = [%i[provider slug represents]]
        actor
      end

      # Build the entity describing the image itself, including optional OCR
      # text, depicted people, keywords and the base64-encoded image data.
      def build_image
        image = ::Chronicle::ETL::Models::Entity.new
        image.represents = @config.involved[:represents]
        image.title = build_title
        image.body = exif['Description']
        image.provider = @config.involved[:provider]
        image.provider_id = id
        image.assign_attributes(build_gps)
        image.dedupe_on = [%i[provider provider_id represents]]

        if @config.ocr_strategy
          ocr_text = build_with_strategy(field: :ocr, strategy: @config.ocr_strategy)
          image.metadata[:ocr_text] = ocr_text if ocr_text
        end

        names = extract_people_depicted
        tags = extract_keywords(names)

        image.depicts = build_people_depicted(names)
        image.abouts = build_keywords(tags)

        if @config.include_image_data
          attachment = ::Chronicle::ETL::Models::Attachment.new
          attachment.data = build_image_data
          image.attachments = [attachment]
        end

        image
      end

      # Turn a list of keyword strings into topic entities.
      def build_keywords(topics)
        topics.map do |topic|
          t = ::Chronicle::ETL::Models::Entity.new
          t.represents = 'topic'
          t.provider = @config.involved[:provider]
          t.title = topic
          t.slug = topic.parameterize
          t.dedupe_on = [%i[provider represents slug]]
          t
        end
      end

      # Turn a list of person names into identity entities.
      def build_people_depicted(names)
        names.map do |name|
          identity = ::Chronicle::ETL::Models::Entity.new
          identity.represents = 'identity'
          identity.provider = @config.involved[:provider]
          identity.slug = name.parameterize
          identity.title = name
          identity.dedupe_on = [%i[provider represents slug]]
          identity
        end
      end

      # @return [Hash] lat/lng/elevation from EXIF, or an empty hash when the
      #   image has no GPSLatitude tag
      def build_gps
        return {} unless exif['GPSLatitude']

        {
          lat: exif['GPSLatitude'],
          lng: exif['GPSLongitude'],
          elevation: exif['GPSAltitude']
        }
      end

      def build_image_data
        ::Chronicle::ETL::Utils::BinaryAttachments.filename_to_base64(filename: @file.path)
      end

      # The image title is the file's base name.
      def build_title
        File.basename(@file.path)
      end

      # Try each strategy in order by calling `build_<field>_using_<strategy>`
      # and return the first truthy result.
      #
      # @param field [Symbol] field being built (:id, :timestamp, :ocr)
      # @param strategy [String, Array<String>, nil] one or more strategy names
      # @return [Object, nil] first truthy builder result, or nil if none
      def build_with_strategy(field:, strategy: [])
        [strategy].flatten.compact.each do |s|
          result = send(:"build_#{field}_using_#{s}")
          return result if result
        end

        nil
      end

      # SHA256 hex digest of the file content. Digest streams the file, so the
      # whole image is not loaded into memory just to hash it.
      def build_id_using_file_hash
        Digest::SHA256.file(@file.path).hexdigest
      end

      def build_id_using_xattr_version
        load_value_from_xattr_plist('com.apple.metadata:kMDItemVersion')
      end

      def build_id_using_xmp_document_id
        exif['OriginalDocumentID'] || exif['DerivedFromDocumentID']
      end

      def build_timestamp_using_file_mtime
        File.mtime(@file)
      end

      # Build a timestamp from the EXIF DateTimeOriginal tag.
      #
      # @return [Object, nil] the timestamp, or nil when the tag is absent
      def build_timestamp_using_exif_datetimeoriginal
        # EXIF tags don't have timezone information. This is a DateTime in UTC
        timestamp = exif['DateTimeOriginal']
        return unless timestamp

        offset = exif['OffsetTimeOriginal']
        if offset
          # Offset tags are only available in newer EXIF tags. If it exists, we
          # use it instead of UTC
          timestamp.change(offset: offset)
        else
          # TODO: support option of using GPS coordinates to determine timezone
          zone = ActiveSupport::TimeZone.new(@config.timezone_default)
          zone.parse(timestamp.asctime)
        end
      end

      # Run the `macocr` command on the file and return its output, or nil
      # when the output is blank.
      #
      # TODO: add documentation for how to set up `macocr`
      def build_ocr_using_macocr
        # The path is shell-escaped: file names may contain quotes or `$(...)`.
        `macocr #{Shellwords.escape(@file.path)} 2>/dev/null`.presence
      end

      def exif
        @exif ||= MiniExiftool.new(
          @file.path,
          numerical: true,

          # EXIF timestamps don't have timezone information. MiniExifTool uses Time
          # by default which parses timestamps in local time zone. Using DateTime
          # parses dates as UTC and then we can apply a timezone offset if the optional
          # EXIF timezone offset fields are available.
          # https://github.com/janfri/mini_exiftool/issues/39#issuecomment-832587649
          timestamps: DateTime
        )
      end

      # Figure out which faces are tagged as regions and return a list of their names
      def extract_people_depicted
        return [] unless exif['RegionName']

        names = [exif['RegionName']].flatten
        types = [exif['RegionType']].flatten

        names.zip(types).select { |_name, type| type == 'Face' }.map(&:first).uniq
      end

      # Extract image keywords from the EXIF/IPTC tag and subtract out those that
      # are tagged people (determined by looking at face regions)
      def extract_keywords(people_names = [])
        [exif['Keywords'] || []].flatten - people_names
      end

      # Read an extended attribute stored as a binary plist and return the text
      # of its `<string>` nodes, or nil when nothing usable comes back.
      #
      # @param attribute [String] extended attribute name
      # @return [String, nil]
      def load_value_from_xattr_plist(attribute)
        require 'nokogiri'
        # Arguments are shell-escaped; only the pipeline itself is shell syntax.
        xml = `xattr -p #{Shellwords.escape(attribute)} #{Shellwords.escape(@file.path)} | xxd -r -p | plutil -convert xml1 -o - -- - 2>/dev/null`
        # Backticks always return a String, so test for blank output, not nil.
        return if xml.blank?

        Nokogiri::XML.parse(xml).xpath('//string').text.presence
      end
    end
  end
end
|
@@ -1,19 +0,0 @@
|
|
1
|
-
module Chronicle
  module ETL
    module Utils
      # Helpers for working with nested hashes.
      module HashUtilities
        # Flatten a nested hash into a single level, joining nested keys with
        # a dot.
        #
        # Keys that were nested become Symbols (e.g. `:"a.b"`); top-level keys
        # whose value is not a Hash are kept as they are. A nested hash that
        # is empty contributes no keys to the result.
        #
        # @param hash [Hash] possibly nested hash
        # @return [Hash] new single-level hash
        def self.flatten_hash(hash)
          hash.each_with_object({}) do |(k, v), h|
            if v.is_a? Hash
              # `each`, not `map`: the loop only writes into `h`.
              flatten_hash(v).each do |h_k, h_v|
                h["#{k}.#{h_k}".to_sym] = h_v
              end
            else
              h[k] = v
            end
          end
        end
      end
    end
  end
end
|
@@ -1,15 +0,0 @@
|
|
1
|
-
require 'active_support/core_ext/object/blank'
|
2
|
-
|
3
|
-
require 'shellwords'

module Chronicle
  module ETL
    module Utils
      # OCR for image files
      # TODO: add other strategies and document `macocr`
      module TextRecognition
        # Run the `macocr` command on an image file and return what it prints.
        #
        # @param filename [String] path of the image file
        # @return [String, nil] command output, or nil when the output is blank
        def self.recognize_in_image(filename:)
          # The filename is shell-escaped so that quotes, `;` or `$(...)` in a
          # path cannot be interpreted by the shell.
          `macocr #{Shellwords.escape(filename)} 2>/dev/null`.presence
        end
      end
    end
  end
end
|