chronicle-etl 0.5.5 → 0.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/ruby.yml +15 -25
- data/.rubocop.yml +2 -44
- data/Gemfile +2 -2
- data/Guardfile +3 -3
- data/README.md +75 -68
- data/Rakefile +2 -2
- data/bin/console +4 -5
- data/chronicle-etl.gemspec +51 -49
- data/exe/chronicle-etl +1 -1
- data/lib/chronicle/etl/authorizer.rb +3 -4
- data/lib/chronicle/etl/cli/authorizations.rb +8 -6
- data/lib/chronicle/etl/cli/connectors.rb +7 -7
- data/lib/chronicle/etl/cli/jobs.rb +130 -53
- data/lib/chronicle/etl/cli/main.rb +29 -29
- data/lib/chronicle/etl/cli/plugins.rb +14 -15
- data/lib/chronicle/etl/cli/secrets.rb +14 -12
- data/lib/chronicle/etl/cli/subcommand_base.rb +5 -3
- data/lib/chronicle/etl/config.rb +18 -8
- data/lib/chronicle/etl/configurable.rb +20 -9
- data/lib/chronicle/etl/exceptions.rb +3 -3
- data/lib/chronicle/etl/extraction.rb +12 -2
- data/lib/chronicle/etl/extractors/csv_extractor.rb +9 -0
- data/lib/chronicle/etl/extractors/extractor.rb +15 -2
- data/lib/chronicle/etl/extractors/file_extractor.rb +5 -3
- data/lib/chronicle/etl/extractors/helpers/input_reader.rb +2 -2
- data/lib/chronicle/etl/extractors/json_extractor.rb +14 -4
- data/lib/chronicle/etl/extractors/stdin_extractor.rb +3 -0
- data/lib/chronicle/etl/job.rb +35 -17
- data/lib/chronicle/etl/job_definition.rb +38 -26
- data/lib/chronicle/etl/job_log.rb +14 -16
- data/lib/chronicle/etl/job_logger.rb +4 -4
- data/lib/chronicle/etl/loaders/csv_loader.rb +17 -4
- data/lib/chronicle/etl/loaders/helpers/stdout_helper.rb +4 -0
- data/lib/chronicle/etl/loaders/json_loader.rb +30 -10
- data/lib/chronicle/etl/loaders/loader.rb +0 -17
- data/lib/chronicle/etl/loaders/rest_loader.rb +7 -7
- data/lib/chronicle/etl/loaders/table_loader.rb +37 -12
- data/lib/chronicle/etl/logger.rb +2 -2
- data/lib/chronicle/etl/oauth_authorizer.rb +8 -8
- data/lib/chronicle/etl/record.rb +15 -0
- data/lib/chronicle/etl/registry/connector_registration.rb +15 -23
- data/lib/chronicle/etl/registry/connectors.rb +93 -36
- data/lib/chronicle/etl/registry/plugin_registration.rb +1 -1
- data/lib/chronicle/etl/registry/plugins.rb +27 -19
- data/lib/chronicle/etl/runner.rb +158 -128
- data/lib/chronicle/etl/secrets.rb +4 -4
- data/lib/chronicle/etl/transformers/buffer_transformer.rb +29 -0
- data/lib/chronicle/etl/transformers/chronicle_transformer.rb +32 -0
- data/lib/chronicle/etl/transformers/chronobase_transformer.rb +100 -0
- data/lib/chronicle/etl/transformers/fields_limit_transformer.rb +23 -0
- data/lib/chronicle/etl/transformers/filter_fields_transformer.rb +60 -0
- data/lib/chronicle/etl/transformers/filter_transformer.rb +30 -0
- data/lib/chronicle/etl/transformers/format_transformer.rb +32 -0
- data/lib/chronicle/etl/transformers/merge_meta_transformer.rb +19 -0
- data/lib/chronicle/etl/transformers/multiply_transformer.rb +21 -0
- data/lib/chronicle/etl/transformers/null_transformer.rb +5 -7
- data/lib/chronicle/etl/transformers/sampler_transformer.rb +21 -0
- data/lib/chronicle/etl/transformers/sort_transformer.rb +31 -0
- data/lib/chronicle/etl/transformers/transformer.rb +63 -41
- data/lib/chronicle/etl/utils/binary_attachments.rb +1 -1
- data/lib/chronicle/etl/utils/progress_bar.rb +2 -3
- data/lib/chronicle/etl/version.rb +1 -1
- data/lib/chronicle/etl.rb +6 -8
- metadata +49 -47
- data/lib/chronicle/etl/models/activity.rb +0 -15
- data/lib/chronicle/etl/models/attachment.rb +0 -14
- data/lib/chronicle/etl/models/base.rb +0 -122
- data/lib/chronicle/etl/models/entity.rb +0 -29
- data/lib/chronicle/etl/models/raw.rb +0 -26
- data/lib/chronicle/etl/serializers/jsonapi_serializer.rb +0 -31
- data/lib/chronicle/etl/serializers/raw_serializer.rb +0 -10
- data/lib/chronicle/etl/serializers/serializer.rb +0 -28
- data/lib/chronicle/etl/transformers/image_file_transformer.rb +0 -247
- data/lib/chronicle/etl/utils/hash_utilities.rb +0 -19
- data/lib/chronicle/etl/utils/text_recognition.rb +0 -15
@@ -1,247 +0,0 @@
|
|
1
|
-
require 'mini_exiftool'
|
2
|
-
require 'active_support'
|
3
|
-
require 'active_support/core_ext/object'
|
4
|
-
require 'active_support/core_ext/time'
|
5
|
-
require 'active_support/core_ext/hash/reverse_merge'
|
6
|
-
require 'active_support/core_ext/string/inflections'
|
7
|
-
|
8
|
-
module Chronicle
|
9
|
-
module ETL
|
10
|
-
# Transform a JPEG or other image file into a record.
|
11
|
-
# By default, file mtime and a hash of the file content is used to build
|
12
|
-
# the timestamp and ID respectively but other options are available (such
|
13
|
-
# as reading EXIF tags or extended attributes from the filesystem).
|
14
|
-
#
|
15
|
-
# TODO: This should be extracted into its own plugin
|
16
|
-
class ImageFileTransformer < Chronicle::ETL::Transformer
|
17
|
-
register_connector do |r|
|
18
|
-
r.identifier = 'image-file'
|
19
|
-
r.description = 'an image file'
|
20
|
-
end
|
21
|
-
|
22
|
-
setting :timestamp_strategy, default: 'file_mtime'
|
23
|
-
setting :id_strategy, default: 'file_hash'
|
24
|
-
setting :verb, default: 'photographed'
|
25
|
-
# EXIF tags often don't have timezones
|
26
|
-
setting :timezone_default, default: 'Eastern Time (US & Canada)'
|
27
|
-
setting :include_image_data, default: true
|
28
|
-
setting :actor
|
29
|
-
setting :involved
|
30
|
-
|
31
|
-
def transform
|
32
|
-
# FIXME: set @filename; use block for reading file when necessary
|
33
|
-
@file = File.open(@extraction.data)
|
34
|
-
record = build_created(@file)
|
35
|
-
@file.close
|
36
|
-
record
|
37
|
-
end
|
38
|
-
|
39
|
-
def friendly_identifier
|
40
|
-
@file.path
|
41
|
-
end
|
42
|
-
|
43
|
-
def id
|
44
|
-
@id ||= begin
|
45
|
-
id = build_with_strategy(field: :id, strategy: @config.id_strategy)
|
46
|
-
raise(UntransformableRecordError, "Could not build id") unless id
|
47
|
-
|
48
|
-
id
|
49
|
-
end
|
50
|
-
end
|
51
|
-
|
52
|
-
def timestamp
|
53
|
-
@timestamp ||= begin
|
54
|
-
ts = build_with_strategy(field: :timestamp, strategy: @config.timestamp_strategy)
|
55
|
-
raise(UntransformableRecordError, "Could not build timestamp") unless ts
|
56
|
-
|
57
|
-
ts
|
58
|
-
end
|
59
|
-
end
|
60
|
-
|
61
|
-
private
|
62
|
-
|
63
|
-
def build_created(file)
|
64
|
-
record = ::Chronicle::ETL::Models::Activity.new
|
65
|
-
record.verb = @config.verb
|
66
|
-
record.provider = @config.provider
|
67
|
-
record.provider_id = id
|
68
|
-
record.end_at = timestamp
|
69
|
-
record.dedupe_on = [[:provider_id, :verb, :provider]]
|
70
|
-
|
71
|
-
record.involved = build_image
|
72
|
-
record.actor = build_actor
|
73
|
-
|
74
|
-
record.assign_attributes(build_gps)
|
75
|
-
record
|
76
|
-
end
|
77
|
-
|
78
|
-
def build_actor
|
79
|
-
actor = ::Chronicle::ETL::Models::Entity.new
|
80
|
-
actor.represents = 'identity'
|
81
|
-
actor.provider = @config.actor[:provider]
|
82
|
-
actor.slug = @config.actor[:slug]
|
83
|
-
actor.dedupe_on = [[:provider, :slug, :represents]]
|
84
|
-
actor
|
85
|
-
end
|
86
|
-
|
87
|
-
def build_image
|
88
|
-
image = ::Chronicle::ETL::Models::Entity.new
|
89
|
-
image.represents = @config.involved[:represents]
|
90
|
-
image.title = build_title
|
91
|
-
image.body = exif['Description']
|
92
|
-
image.provider = @config.involved[:provider]
|
93
|
-
image.provider_id = id
|
94
|
-
image.assign_attributes(build_gps)
|
95
|
-
image.dedupe_on = [[:provider, :provider_id, :represents]]
|
96
|
-
|
97
|
-
if @config.ocr_strategy
|
98
|
-
ocr_text = build_with_strategy(field: :ocr, strategy: @config.ocr_strategy)
|
99
|
-
image.metadata[:ocr_text] = ocr_text if ocr_text
|
100
|
-
end
|
101
|
-
|
102
|
-
names = extract_people_depicted
|
103
|
-
tags = extract_keywords(names)
|
104
|
-
|
105
|
-
image.depicts = build_people_depicted(names)
|
106
|
-
image.abouts = build_keywords(tags)
|
107
|
-
|
108
|
-
if @config.include_image_data
|
109
|
-
attachment = ::Chronicle::ETL::Models::Attachment.new
|
110
|
-
attachment.data = build_image_data
|
111
|
-
image.attachments = [attachment]
|
112
|
-
end
|
113
|
-
|
114
|
-
image
|
115
|
-
end
|
116
|
-
|
117
|
-
def build_keywords(topics)
|
118
|
-
topics.map do |topic|
|
119
|
-
t = ::Chronicle::ETL::Models::Entity.new
|
120
|
-
t.represents = 'topic'
|
121
|
-
t.provider = @config.involved[:provider]
|
122
|
-
t.title = topic
|
123
|
-
t.slug = topic.parameterize
|
124
|
-
t.dedupe_on = [[:provider, :represents, :slug]]
|
125
|
-
t
|
126
|
-
end
|
127
|
-
end
|
128
|
-
|
129
|
-
def build_people_depicted(names)
|
130
|
-
names.map do |name|
|
131
|
-
identity = ::Chronicle::ETL::Models::Entity.new
|
132
|
-
identity.represents = 'identity'
|
133
|
-
identity.provider = @config.involved[:provider]
|
134
|
-
identity.slug = name.parameterize
|
135
|
-
identity.title = name
|
136
|
-
identity.dedupe_on = [[:provider, :represents, :slug]]
|
137
|
-
identity
|
138
|
-
end
|
139
|
-
end
|
140
|
-
|
141
|
-
def build_gps
|
142
|
-
return {} unless exif['GPSLatitude']
|
143
|
-
|
144
|
-
{
|
145
|
-
lat: exif['GPSLatitude'],
|
146
|
-
lng: exif['GPSLongitude'],
|
147
|
-
elevation: exif['GPSAltitude']
|
148
|
-
}
|
149
|
-
end
|
150
|
-
|
151
|
-
def build_image_data
|
152
|
-
::Chronicle::ETL::Utils::BinaryAttachments.filename_to_base64(filename: @file.path)
|
153
|
-
end
|
154
|
-
|
155
|
-
def build_title
|
156
|
-
File.basename(@file)
|
157
|
-
end
|
158
|
-
|
159
|
-
def build_with_strategy(field:, strategy:[])
|
160
|
-
strategies = [strategy].flatten.compact
|
161
|
-
strategies.each do |s|
|
162
|
-
builder_method = "build_#{field}_using_#{s}"
|
163
|
-
result = send(builder_method.to_sym)
|
164
|
-
return result if result
|
165
|
-
end
|
166
|
-
return
|
167
|
-
end
|
168
|
-
|
169
|
-
def build_id_using_file_hash
|
170
|
-
Digest::SHA256.hexdigest(File.read(@file))
|
171
|
-
end
|
172
|
-
|
173
|
-
def build_id_using_xattr_version
|
174
|
-
load_value_from_xattr_plist("com.apple.metadata:kMDItemVersion")
|
175
|
-
end
|
176
|
-
|
177
|
-
def build_id_using_xmp_document_id
|
178
|
-
exif['OriginalDocumentID'] || exif['DerivedFromDocumentID']
|
179
|
-
end
|
180
|
-
|
181
|
-
def build_timestamp_using_file_mtime
|
182
|
-
File.mtime(@file)
|
183
|
-
end
|
184
|
-
|
185
|
-
def build_timestamp_using_exif_datetimeoriginal
|
186
|
-
# EXIF tags don't have timezone information. This is a DateTime in UTC
|
187
|
-
timestamp = exif['DateTimeOriginal'] || return
|
188
|
-
|
189
|
-
if exif['OffsetTimeOriginal']
|
190
|
-
# Offset tags are only available in newer EXIF tags. If it exists, we
|
191
|
-
# use it instead of UTC
|
192
|
-
timestamp = timestamp.change(offset: exif['OffsetTimeOriginal'])
|
193
|
-
elsif false
|
194
|
-
# TODO: support option of using GPS coordinates to determine timezone
|
195
|
-
else
|
196
|
-
zone = ActiveSupport::TimeZone.new(@config.timezone_default)
|
197
|
-
timestamp = zone.parse(timestamp.asctime)
|
198
|
-
end
|
199
|
-
|
200
|
-
timestamp
|
201
|
-
end
|
202
|
-
|
203
|
-
# TODO: add documentation for how to set up `macocr`
|
204
|
-
def build_ocr_using_macocr
|
205
|
-
`macocr "#{@file.path}" 2>/dev/null`.presence
|
206
|
-
end
|
207
|
-
|
208
|
-
def exif
|
209
|
-
@exif ||= MiniExiftool.new(
|
210
|
-
@file.path,
|
211
|
-
numerical: true,
|
212
|
-
|
213
|
-
# EXIF timestamps don't have timezone information. MiniExifTool uses Time
|
214
|
-
# by default which parses timestamps in local time zone. Using DateTime
|
215
|
-
# parses dates as UTC and then we can apply a timezone offset if the optional
|
216
|
-
# EXIF timezone offset fields are available.
|
217
|
-
# https://github.com/janfri/mini_exiftool/issues/39#issuecomment-832587649
|
218
|
-
timestamps: DateTime
|
219
|
-
)
|
220
|
-
end
|
221
|
-
|
222
|
-
# Figure out which faces are tagged as regions and return a list of their names
|
223
|
-
def extract_people_depicted
|
224
|
-
return [] unless exif['RegionName']
|
225
|
-
|
226
|
-
names = [exif['RegionName']].flatten
|
227
|
-
types = [exif['RegionType']].flatten
|
228
|
-
|
229
|
-
names.zip(types).select{|x| x[1] == 'Face'}.map{|x| x[0]}.uniq
|
230
|
-
end
|
231
|
-
|
232
|
-
# Extract image keywords from EXIF/IPTC tags and subtract out those that are
|
233
|
-
# tagged people (determined by looking at face regions)
|
234
|
-
def extract_keywords(people_names = [])
|
235
|
-
[exif['Keywords'] || []].flatten - people_names
|
236
|
-
end
|
237
|
-
|
238
|
-
def load_value_from_xattr_plist attribute
|
239
|
-
require 'nokogiri'
|
240
|
-
xml = `xattr -p #{attribute} \"#{@file.path}\" | xxd -r -p | plutil -convert xml1 -o - -- - 2>/dev/null`
|
241
|
-
return unless xml
|
242
|
-
value = Nokogiri::XML.parse(r).xpath("//string").text
|
243
|
-
return value.presence
|
244
|
-
end
|
245
|
-
end
|
246
|
-
end
|
247
|
-
end
|
@@ -1,19 +0,0 @@
|
|
1
|
-
module Chronicle
|
2
|
-
module ETL
|
3
|
-
module Utils
|
4
|
-
module HashUtilities
|
5
|
-
def self.flatten_hash(hash)
|
6
|
-
hash.each_with_object({}) do |(k, v), h|
|
7
|
-
if v.is_a? Hash
|
8
|
-
flatten_hash(v).map do |h_k, h_v|
|
9
|
-
h["#{k}.#{h_k}".to_sym] = h_v
|
10
|
-
end
|
11
|
-
else
|
12
|
-
h[k] = v
|
13
|
-
end
|
14
|
-
end
|
15
|
-
end
|
16
|
-
end
|
17
|
-
end
|
18
|
-
end
|
19
|
-
end
|
@@ -1,15 +0,0 @@
|
|
1
|
-
require 'active_support/core_ext/object/blank'
|
2
|
-
|
3
|
-
module Chronicle
|
4
|
-
module ETL
|
5
|
-
module Utils
|
6
|
-
# OCR for image files
|
7
|
-
# TODO: add other strategies and document `macocr`
|
8
|
-
module TextRecognition
|
9
|
-
def self.recognize_in_image(filename:)
|
10
|
-
`macocr "#{filename}" 2>/dev/null`.presence
|
11
|
-
end
|
12
|
-
end
|
13
|
-
end
|
14
|
-
end
|
15
|
-
end
|