chronicle-etl 0.5.5 → 0.6.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (76)
  1. checksums.yaml +4 -4
  2. data/.github/workflows/ruby.yml +15 -25
  3. data/.rubocop.yml +2 -44
  4. data/Gemfile +2 -2
  5. data/Guardfile +3 -3
  6. data/README.md +75 -68
  7. data/Rakefile +2 -2
  8. data/bin/console +4 -5
  9. data/chronicle-etl.gemspec +51 -49
  10. data/exe/chronicle-etl +1 -1
  11. data/lib/chronicle/etl/authorizer.rb +3 -4
  12. data/lib/chronicle/etl/cli/authorizations.rb +8 -6
  13. data/lib/chronicle/etl/cli/connectors.rb +7 -7
  14. data/lib/chronicle/etl/cli/jobs.rb +130 -53
  15. data/lib/chronicle/etl/cli/main.rb +29 -29
  16. data/lib/chronicle/etl/cli/plugins.rb +14 -15
  17. data/lib/chronicle/etl/cli/secrets.rb +14 -12
  18. data/lib/chronicle/etl/cli/subcommand_base.rb +5 -3
  19. data/lib/chronicle/etl/config.rb +18 -8
  20. data/lib/chronicle/etl/configurable.rb +20 -9
  21. data/lib/chronicle/etl/exceptions.rb +3 -3
  22. data/lib/chronicle/etl/extraction.rb +12 -2
  23. data/lib/chronicle/etl/extractors/csv_extractor.rb +9 -0
  24. data/lib/chronicle/etl/extractors/extractor.rb +15 -2
  25. data/lib/chronicle/etl/extractors/file_extractor.rb +5 -3
  26. data/lib/chronicle/etl/extractors/helpers/input_reader.rb +2 -2
  27. data/lib/chronicle/etl/extractors/json_extractor.rb +14 -4
  28. data/lib/chronicle/etl/extractors/stdin_extractor.rb +3 -0
  29. data/lib/chronicle/etl/job.rb +35 -17
  30. data/lib/chronicle/etl/job_definition.rb +38 -26
  31. data/lib/chronicle/etl/job_log.rb +14 -16
  32. data/lib/chronicle/etl/job_logger.rb +4 -4
  33. data/lib/chronicle/etl/loaders/csv_loader.rb +17 -4
  34. data/lib/chronicle/etl/loaders/helpers/stdout_helper.rb +4 -0
  35. data/lib/chronicle/etl/loaders/json_loader.rb +30 -10
  36. data/lib/chronicle/etl/loaders/loader.rb +0 -17
  37. data/lib/chronicle/etl/loaders/rest_loader.rb +7 -7
  38. data/lib/chronicle/etl/loaders/table_loader.rb +37 -12
  39. data/lib/chronicle/etl/logger.rb +2 -2
  40. data/lib/chronicle/etl/oauth_authorizer.rb +8 -8
  41. data/lib/chronicle/etl/record.rb +15 -0
  42. data/lib/chronicle/etl/registry/connector_registration.rb +15 -23
  43. data/lib/chronicle/etl/registry/connectors.rb +93 -36
  44. data/lib/chronicle/etl/registry/plugin_registration.rb +1 -1
  45. data/lib/chronicle/etl/registry/plugins.rb +27 -19
  46. data/lib/chronicle/etl/runner.rb +158 -128
  47. data/lib/chronicle/etl/secrets.rb +4 -4
  48. data/lib/chronicle/etl/transformers/buffer_transformer.rb +29 -0
  49. data/lib/chronicle/etl/transformers/chronicle_transformer.rb +32 -0
  50. data/lib/chronicle/etl/transformers/chronobase_transformer.rb +100 -0
  51. data/lib/chronicle/etl/transformers/fields_limit_transformer.rb +23 -0
  52. data/lib/chronicle/etl/transformers/filter_fields_transformer.rb +60 -0
  53. data/lib/chronicle/etl/transformers/filter_transformer.rb +30 -0
  54. data/lib/chronicle/etl/transformers/format_transformer.rb +32 -0
  55. data/lib/chronicle/etl/transformers/merge_meta_transformer.rb +19 -0
  56. data/lib/chronicle/etl/transformers/multiply_transformer.rb +21 -0
  57. data/lib/chronicle/etl/transformers/null_transformer.rb +5 -7
  58. data/lib/chronicle/etl/transformers/sampler_transformer.rb +21 -0
  59. data/lib/chronicle/etl/transformers/sort_transformer.rb +31 -0
  60. data/lib/chronicle/etl/transformers/transformer.rb +63 -41
  61. data/lib/chronicle/etl/utils/binary_attachments.rb +1 -1
  62. data/lib/chronicle/etl/utils/progress_bar.rb +2 -3
  63. data/lib/chronicle/etl/version.rb +1 -1
  64. data/lib/chronicle/etl.rb +6 -8
  65. metadata +49 -47
  66. data/lib/chronicle/etl/models/activity.rb +0 -15
  67. data/lib/chronicle/etl/models/attachment.rb +0 -14
  68. data/lib/chronicle/etl/models/base.rb +0 -122
  69. data/lib/chronicle/etl/models/entity.rb +0 -29
  70. data/lib/chronicle/etl/models/raw.rb +0 -26
  71. data/lib/chronicle/etl/serializers/jsonapi_serializer.rb +0 -31
  72. data/lib/chronicle/etl/serializers/raw_serializer.rb +0 -10
  73. data/lib/chronicle/etl/serializers/serializer.rb +0 -28
  74. data/lib/chronicle/etl/transformers/image_file_transformer.rb +0 -247
  75. data/lib/chronicle/etl/utils/hash_utilities.rb +0 -19
  76. data/lib/chronicle/etl/utils/text_recognition.rb +0 -15
@@ -1,247 +0,0 @@
1
- require 'mini_exiftool'
2
- require 'active_support'
3
- require 'active_support/core_ext/object'
4
- require 'active_support/core_ext/time'
5
- require 'active_support/core_ext/hash/reverse_merge'
6
- require 'active_support/core_ext/string/inflections'
7
-
8
- module Chronicle
9
- module ETL
10
- # Transform a JPEG or other image file into a record.
11
- # By default, file mtime and a hash of the file content are used to build
12
- # the timestamp and ID respectively but other options are available (such
13
- # as reading EXIF tags or extended attributes from the filesystem).
14
- #
15
- # TODO: This should be extracted into its own plugin
16
- class ImageFileTransformer < Chronicle::ETL::Transformer
17
- register_connector do |r|
18
- r.identifier = 'image-file'
19
- r.description = 'an image file'
20
- end
21
-
22
- setting :timestamp_strategy, default: 'file_mtime'
23
- setting :id_strategy, default: 'file_hash'
24
- setting :verb, default: 'photographed'
25
- # EXIF tags often don't have timezones
26
- setting :timezone_default, default: 'Eastern Time (US & Canada)'
27
- setting :include_image_data, default: true
28
- setting :actor
29
- setting :involved
30
-
31
- def transform
32
- # FIXME: set @filename; use block for reading file when necessary
33
- @file = File.open(@extraction.data)
34
- record = build_created(@file)
35
- @file.close
36
- record
37
- end
38
-
39
- def friendly_identifier
40
- @file.path
41
- end
42
-
43
- def id
44
- @id ||= begin
45
- id = build_with_strategy(field: :id, strategy: @config.id_strategy)
46
- raise(UntransformableRecordError, "Could not build id") unless id
47
-
48
- id
49
- end
50
- end
51
-
52
- def timestamp
53
- @timestamp ||= begin
54
- ts = build_with_strategy(field: :timestamp, strategy: @config.timestamp_strategy)
55
- raise(UntransformableRecordError, "Could not build timestamp") unless ts
56
-
57
- ts
58
- end
59
- end
60
-
61
- private
62
-
63
- def build_created(file)
64
- record = ::Chronicle::ETL::Models::Activity.new
65
- record.verb = @config.verb
66
- record.provider = @config.provider
67
- record.provider_id = id
68
- record.end_at = timestamp
69
- record.dedupe_on = [[:provider_id, :verb, :provider]]
70
-
71
- record.involved = build_image
72
- record.actor = build_actor
73
-
74
- record.assign_attributes(build_gps)
75
- record
76
- end
77
-
78
- def build_actor
79
- actor = ::Chronicle::ETL::Models::Entity.new
80
- actor.represents = 'identity'
81
- actor.provider = @config.actor[:provider]
82
- actor.slug = @config.actor[:slug]
83
- actor.dedupe_on = [[:provider, :slug, :represents]]
84
- actor
85
- end
86
-
87
- def build_image
88
- image = ::Chronicle::ETL::Models::Entity.new
89
- image.represents = @config.involved[:represents]
90
- image.title = build_title
91
- image.body = exif['Description']
92
- image.provider = @config.involved[:provider]
93
- image.provider_id = id
94
- image.assign_attributes(build_gps)
95
- image.dedupe_on = [[:provider, :provider_id, :represents]]
96
-
97
- if @config.ocr_strategy
98
- ocr_text = build_with_strategy(field: :ocr, strategy: @config.ocr_strategy)
99
- image.metadata[:ocr_text] = ocr_text if ocr_text
100
- end
101
-
102
- names = extract_people_depicted
103
- tags = extract_keywords(names)
104
-
105
- image.depicts = build_people_depicted(names)
106
- image.abouts = build_keywords(tags)
107
-
108
- if @config.include_image_data
109
- attachment = ::Chronicle::ETL::Models::Attachment.new
110
- attachment.data = build_image_data
111
- image.attachments = [attachment]
112
- end
113
-
114
- image
115
- end
116
-
117
- def build_keywords(topics)
118
- topics.map do |topic|
119
- t = ::Chronicle::ETL::Models::Entity.new
120
- t.represents = 'topic'
121
- t.provider = @config.involved[:provider]
122
- t.title = topic
123
- t.slug = topic.parameterize
124
- t.dedupe_on = [[:provider, :represents, :slug]]
125
- t
126
- end
127
- end
128
-
129
- def build_people_depicted(names)
130
- names.map do |name|
131
- identity = ::Chronicle::ETL::Models::Entity.new
132
- identity.represents = 'identity'
133
- identity.provider = @config.involved[:provider]
134
- identity.slug = name.parameterize
135
- identity.title = name
136
- identity.dedupe_on = [[:provider, :represents, :slug]]
137
- identity
138
- end
139
- end
140
-
141
- def build_gps
142
- return {} unless exif['GPSLatitude']
143
-
144
- {
145
- lat: exif['GPSLatitude'],
146
- lng: exif['GPSLongitude'],
147
- elevation: exif['GPSAltitude']
148
- }
149
- end
150
-
151
- def build_image_data
152
- ::Chronicle::ETL::Utils::BinaryAttachments.filename_to_base64(filename: @file.path)
153
- end
154
-
155
- def build_title
156
- File.basename(@file)
157
- end
158
-
159
- def build_with_strategy(field:, strategy:[])
160
- strategies = [strategy].flatten.compact
161
- strategies.each do |s|
162
- builder_method = "build_#{field}_using_#{s}"
163
- result = send(builder_method.to_sym)
164
- return result if result
165
- end
166
- return
167
- end
168
-
169
- def build_id_using_file_hash
170
- Digest::SHA256.hexdigest(File.read(@file))
171
- end
172
-
173
- def build_id_using_xattr_version
174
- load_value_from_xattr_plist("com.apple.metadata:kMDItemVersion")
175
- end
176
-
177
- def build_id_using_xmp_document_id
178
- exif['OriginalDocumentID'] || exif['DerivedFromDocumentID']
179
- end
180
-
181
- def build_timestamp_using_file_mtime
182
- File.mtime(@file)
183
- end
184
-
185
- def build_timestamp_using_exif_datetimeoriginal
186
- # EXIF tags don't have timezone information. This is a DateTime in UTC
187
- timestamp = exif['DateTimeOriginal'] || return
188
-
189
- if exif['OffsetTimeOriginal']
190
- # Offset tags are only available in newer EXIF tags. If it exists, we
191
- # use it instead of UTC
192
- timestamp = timestamp.change(offset: exif['OffsetTimeOriginal'])
193
- elsif false
194
- # TODO: support option of using GPS coordinates to determine timezone
195
- else
196
- zone = ActiveSupport::TimeZone.new(@config.timezone_default)
197
- timestamp = zone.parse(timestamp.asctime)
198
- end
199
-
200
- timestamp
201
- end
202
-
203
- # TODO: add documentation for how to set up `macocr`
204
- def build_ocr_using_macocr
205
- `macocr "#{@file.path}" 2>/dev/null`.presence
206
- end
207
-
208
- def exif
209
- @exif ||= MiniExiftool.new(
210
- @file.path,
211
- numerical: true,
212
-
213
- # EXIF timestamps don't have timezone information. MiniExifTool uses Time
214
- # by default which parses timestamps in local time zone. Using DateTime
215
- # parses dates as UTC and then we can apply a timezone offset if the optional
216
- # EXIF timezone offset fields are available.
217
- # https://github.com/janfri/mini_exiftool/issues/39#issuecomment-832587649
218
- timestamps: DateTime
219
- )
220
- end
221
-
222
- # Figure out which faces are tagged as regions and return a list of their names
223
- def extract_people_depicted
224
- return [] unless exif['RegionName']
225
-
226
- names = [exif['RegionName']].flatten
227
- types = [exif['RegionType']].flatten
228
-
229
- names.zip(types).select{|x| x[1] == 'Face'}.map{|x| x[0]}.uniq
230
- end
231
-
232
- # Extract image keywords from EXIF/IPTC tag and subtract out those of which are
233
- # tagged people (determined by looking at face regions)
234
- def extract_keywords(people_names = [])
235
- [exif['Keywords'] || []].flatten - people_names
236
- end
237
-
238
- def load_value_from_xattr_plist attribute
239
- require 'nokogiri'
240
- xml = `xattr -p #{attribute} \"#{@file.path}\" | xxd -r -p | plutil -convert xml1 -o - -- - 2>/dev/null`
241
- return unless xml
242
- value = Nokogiri::XML.parse(r).xpath("//string").text
243
- return value.presence
244
- end
245
- end
246
- end
247
- end
@@ -1,19 +0,0 @@
1
- module Chronicle
2
- module ETL
3
- module Utils
4
- module HashUtilities
5
- def self.flatten_hash(hash)
6
- hash.each_with_object({}) do |(k, v), h|
7
- if v.is_a? Hash
8
- flatten_hash(v).map do |h_k, h_v|
9
- h["#{k}.#{h_k}".to_sym] = h_v
10
- end
11
- else
12
- h[k] = v
13
- end
14
- end
15
- end
16
- end
17
- end
18
- end
19
- end
@@ -1,15 +0,0 @@
1
- require 'active_support/core_ext/object/blank'
2
-
3
- module Chronicle
4
- module ETL
5
- module Utils
6
- # OCR for image files
7
- # TODO: add other strategies and document `macocr`
8
- module TextRecognition
9
- def self.recognize_in_image(filename:)
10
- `macocr "#{filename}" 2>/dev/null`.presence
11
- end
12
- end
13
- end
14
- end
15
- end