chronicle-etl 0.5.5 → 0.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76)
  1. checksums.yaml +4 -4
  2. data/.github/workflows/ruby.yml +15 -25
  3. data/.rubocop.yml +2 -44
  4. data/Gemfile +2 -2
  5. data/Guardfile +3 -3
  6. data/README.md +75 -68
  7. data/Rakefile +2 -2
  8. data/bin/console +4 -5
  9. data/chronicle-etl.gemspec +51 -49
  10. data/exe/chronicle-etl +1 -1
  11. data/lib/chronicle/etl/authorizer.rb +3 -4
  12. data/lib/chronicle/etl/cli/authorizations.rb +8 -6
  13. data/lib/chronicle/etl/cli/connectors.rb +7 -7
  14. data/lib/chronicle/etl/cli/jobs.rb +130 -53
  15. data/lib/chronicle/etl/cli/main.rb +29 -29
  16. data/lib/chronicle/etl/cli/plugins.rb +14 -15
  17. data/lib/chronicle/etl/cli/secrets.rb +14 -12
  18. data/lib/chronicle/etl/cli/subcommand_base.rb +5 -3
  19. data/lib/chronicle/etl/config.rb +18 -8
  20. data/lib/chronicle/etl/configurable.rb +20 -9
  21. data/lib/chronicle/etl/exceptions.rb +3 -3
  22. data/lib/chronicle/etl/extraction.rb +12 -2
  23. data/lib/chronicle/etl/extractors/csv_extractor.rb +9 -0
  24. data/lib/chronicle/etl/extractors/extractor.rb +15 -2
  25. data/lib/chronicle/etl/extractors/file_extractor.rb +5 -3
  26. data/lib/chronicle/etl/extractors/helpers/input_reader.rb +2 -2
  27. data/lib/chronicle/etl/extractors/json_extractor.rb +14 -4
  28. data/lib/chronicle/etl/extractors/stdin_extractor.rb +3 -0
  29. data/lib/chronicle/etl/job.rb +35 -17
  30. data/lib/chronicle/etl/job_definition.rb +38 -26
  31. data/lib/chronicle/etl/job_log.rb +14 -16
  32. data/lib/chronicle/etl/job_logger.rb +4 -4
  33. data/lib/chronicle/etl/loaders/csv_loader.rb +17 -4
  34. data/lib/chronicle/etl/loaders/helpers/stdout_helper.rb +4 -0
  35. data/lib/chronicle/etl/loaders/json_loader.rb +30 -10
  36. data/lib/chronicle/etl/loaders/loader.rb +0 -17
  37. data/lib/chronicle/etl/loaders/rest_loader.rb +7 -7
  38. data/lib/chronicle/etl/loaders/table_loader.rb +37 -12
  39. data/lib/chronicle/etl/logger.rb +2 -2
  40. data/lib/chronicle/etl/oauth_authorizer.rb +8 -8
  41. data/lib/chronicle/etl/record.rb +15 -0
  42. data/lib/chronicle/etl/registry/connector_registration.rb +15 -23
  43. data/lib/chronicle/etl/registry/connectors.rb +93 -36
  44. data/lib/chronicle/etl/registry/plugin_registration.rb +1 -1
  45. data/lib/chronicle/etl/registry/plugins.rb +27 -19
  46. data/lib/chronicle/etl/runner.rb +158 -128
  47. data/lib/chronicle/etl/secrets.rb +4 -4
  48. data/lib/chronicle/etl/transformers/buffer_transformer.rb +29 -0
  49. data/lib/chronicle/etl/transformers/chronicle_transformer.rb +32 -0
  50. data/lib/chronicle/etl/transformers/chronobase_transformer.rb +100 -0
  51. data/lib/chronicle/etl/transformers/fields_limit_transformer.rb +23 -0
  52. data/lib/chronicle/etl/transformers/filter_fields_transformer.rb +60 -0
  53. data/lib/chronicle/etl/transformers/filter_transformer.rb +30 -0
  54. data/lib/chronicle/etl/transformers/format_transformer.rb +32 -0
  55. data/lib/chronicle/etl/transformers/merge_meta_transformer.rb +19 -0
  56. data/lib/chronicle/etl/transformers/multiply_transformer.rb +21 -0
  57. data/lib/chronicle/etl/transformers/null_transformer.rb +5 -7
  58. data/lib/chronicle/etl/transformers/sampler_transformer.rb +21 -0
  59. data/lib/chronicle/etl/transformers/sort_transformer.rb +31 -0
  60. data/lib/chronicle/etl/transformers/transformer.rb +63 -41
  61. data/lib/chronicle/etl/utils/binary_attachments.rb +1 -1
  62. data/lib/chronicle/etl/utils/progress_bar.rb +2 -3
  63. data/lib/chronicle/etl/version.rb +1 -1
  64. data/lib/chronicle/etl.rb +6 -8
  65. metadata +49 -47
  66. data/lib/chronicle/etl/models/activity.rb +0 -15
  67. data/lib/chronicle/etl/models/attachment.rb +0 -14
  68. data/lib/chronicle/etl/models/base.rb +0 -122
  69. data/lib/chronicle/etl/models/entity.rb +0 -29
  70. data/lib/chronicle/etl/models/raw.rb +0 -26
  71. data/lib/chronicle/etl/serializers/jsonapi_serializer.rb +0 -31
  72. data/lib/chronicle/etl/serializers/raw_serializer.rb +0 -10
  73. data/lib/chronicle/etl/serializers/serializer.rb +0 -28
  74. data/lib/chronicle/etl/transformers/image_file_transformer.rb +0 -247
  75. data/lib/chronicle/etl/utils/hash_utilities.rb +0 -19
  76. data/lib/chronicle/etl/utils/text_recognition.rb +0 -15
@@ -1,247 +0,0 @@
1
- require 'mini_exiftool'
2
- require 'active_support'
3
- require 'active_support/core_ext/object'
4
- require 'active_support/core_ext/time'
5
- require 'active_support/core_ext/hash/reverse_merge'
6
- require 'active_support/core_ext/string/inflections'
7
-
8
- module Chronicle
9
- module ETL
10
- # Transform a JPEG or other image file into a record.
11
- # By default, file mtime and a hash of the file content is used to build
12
- # the timestamp and ID respectively but other options are available (such
13
- # as reading EXIF tags or extended attributes from the filesystem).
14
- #
15
- # TODO: This should be extracted into its own plugin
16
- class ImageFileTransformer < Chronicle::ETL::Transformer
17
- register_connector do |r|
18
- r.identifier = 'image-file'
19
- r.description = 'an image file'
20
- end
21
-
22
- setting :timestamp_strategy, default: 'file_mtime'
23
- setting :id_strategy, default: 'file_hash'
24
- setting :verb, default: 'photographed'
25
- # EXIF tags often don't have timezones
26
- setting :timezone_default, default: 'Eastern Time (US & Canada)'
27
- setting :include_image_data, default: true
28
- setting :actor
29
- setting :involved
30
-
31
- def transform
32
- # FIXME: set @filename; use block for reading file when necessary
33
- @file = File.open(@extraction.data)
34
- record = build_created(@file)
35
- @file.close
36
- record
37
- end
38
-
39
- def friendly_identifier
40
- @file.path
41
- end
42
-
43
- def id
44
- @id ||= begin
45
- id = build_with_strategy(field: :id, strategy: @config.id_strategy)
46
- raise(UntransformableRecordError, "Could not build id") unless id
47
-
48
- id
49
- end
50
- end
51
-
52
- def timestamp
53
- @timestamp ||= begin
54
- ts = build_with_strategy(field: :timestamp, strategy: @config.timestamp_strategy)
55
- raise(UntransformableRecordError, "Could not build timestamp") unless ts
56
-
57
- ts
58
- end
59
- end
60
-
61
- private
62
-
63
- def build_created(file)
64
- record = ::Chronicle::ETL::Models::Activity.new
65
- record.verb = @config.verb
66
- record.provider = @config.provider
67
- record.provider_id = id
68
- record.end_at = timestamp
69
- record.dedupe_on = [[:provider_id, :verb, :provider]]
70
-
71
- record.involved = build_image
72
- record.actor = build_actor
73
-
74
- record.assign_attributes(build_gps)
75
- record
76
- end
77
-
78
- def build_actor
79
- actor = ::Chronicle::ETL::Models::Entity.new
80
- actor.represents = 'identity'
81
- actor.provider = @config.actor[:provider]
82
- actor.slug = @config.actor[:slug]
83
- actor.dedupe_on = [[:provider, :slug, :represents]]
84
- actor
85
- end
86
-
87
- def build_image
88
- image = ::Chronicle::ETL::Models::Entity.new
89
- image.represents = @config.involved[:represents]
90
- image.title = build_title
91
- image.body = exif['Description']
92
- image.provider = @config.involved[:provider]
93
- image.provider_id = id
94
- image.assign_attributes(build_gps)
95
- image.dedupe_on = [[:provider, :provider_id, :represents]]
96
-
97
- if @config.ocr_strategy
98
- ocr_text = build_with_strategy(field: :ocr, strategy: @config.ocr_strategy)
99
- image.metadata[:ocr_text] = ocr_text if ocr_text
100
- end
101
-
102
- names = extract_people_depicted
103
- tags = extract_keywords(names)
104
-
105
- image.depicts = build_people_depicted(names)
106
- image.abouts = build_keywords(tags)
107
-
108
- if @config.include_image_data
109
- attachment = ::Chronicle::ETL::Models::Attachment.new
110
- attachment.data = build_image_data
111
- image.attachments = [attachment]
112
- end
113
-
114
- image
115
- end
116
-
117
- def build_keywords(topics)
118
- topics.map do |topic|
119
- t = ::Chronicle::ETL::Models::Entity.new
120
- t.represents = 'topic'
121
- t.provider = @config.involved[:provider]
122
- t.title = topic
123
- t.slug = topic.parameterize
124
- t.dedupe_on = [[:provider, :represents, :slug]]
125
- t
126
- end
127
- end
128
-
129
- def build_people_depicted(names)
130
- names.map do |name|
131
- identity = ::Chronicle::ETL::Models::Entity.new
132
- identity.represents = 'identity'
133
- identity.provider = @config.involved[:provider]
134
- identity.slug = name.parameterize
135
- identity.title = name
136
- identity.dedupe_on = [[:provider, :represents, :slug]]
137
- identity
138
- end
139
- end
140
-
141
- def build_gps
142
- return {} unless exif['GPSLatitude']
143
-
144
- {
145
- lat: exif['GPSLatitude'],
146
- lng: exif['GPSLongitude'],
147
- elevation: exif['GPSAltitude']
148
- }
149
- end
150
-
151
- def build_image_data
152
- ::Chronicle::ETL::Utils::BinaryAttachments.filename_to_base64(filename: @file.path)
153
- end
154
-
155
- def build_title
156
- File.basename(@file)
157
- end
158
-
159
- def build_with_strategy(field:, strategy:[])
160
- strategies = [strategy].flatten.compact
161
- strategies.each do |s|
162
- builder_method = "build_#{field}_using_#{s}"
163
- result = send(builder_method.to_sym)
164
- return result if result
165
- end
166
- return
167
- end
168
-
169
- def build_id_using_file_hash
170
- Digest::SHA256.hexdigest(File.read(@file))
171
- end
172
-
173
- def build_id_using_xattr_version
174
- load_value_from_xattr_plist("com.apple.metadata:kMDItemVersion")
175
- end
176
-
177
- def build_id_using_xmp_document_id
178
- exif['OriginalDocumentID'] || exif['DerivedFromDocumentID']
179
- end
180
-
181
- def build_timestamp_using_file_mtime
182
- File.mtime(@file)
183
- end
184
-
185
- def build_timestamp_using_exif_datetimeoriginal
186
- # EXIF tags don't have timezone information. This is a DateTime in UTC
187
- timestamp = exif['DateTimeOriginal'] || return
188
-
189
- if exif['OffsetTimeOriginal']
190
- # Offset tags are only available in newer EXIF tags. If it exists, we
191
- # use it instead of UTC
192
- timestamp = timestamp.change(offset: exif['OffsetTimeOriginal'])
193
- elsif false
194
- # TODO: support option of using GPS coordinates to determine timezone
195
- else
196
- zone = ActiveSupport::TimeZone.new(@config.timezone_default)
197
- timestamp = zone.parse(timestamp.asctime)
198
- end
199
-
200
- timestamp
201
- end
202
-
203
- # TODO: add documentation for how to set up `macocr`
204
- def build_ocr_using_macocr
205
- `macocr "#{@file.path}" 2>/dev/null`.presence
206
- end
207
-
208
- def exif
209
- @exif ||= MiniExiftool.new(
210
- @file.path,
211
- numerical: true,
212
-
213
- # EXIF timestamps don't have timezone information. MiniExifTool uses Time
214
- # by default which parses timestamps in local time zone. Using DateTime
215
- # parses dates as UTC and then we can apply a timezone offset if the optional
216
- # EXIF timezone offset fields are available.
217
- # https://github.com/janfri/mini_exiftool/issues/39#issuecomment-832587649
218
- timestamps: DateTime
219
- )
220
- end
221
-
222
- # Figure out which faces are tagged as regions and return a list of their names
223
- def extract_people_depicted
224
- return [] unless exif['RegionName']
225
-
226
- names = [exif['RegionName']].flatten
227
- types = [exif['RegionType']].flatten
228
-
229
- names.zip(types).select{|x| x[1] == 'Face'}.map{|x| x[0]}.uniq
230
- end
231
-
232
- # Extract image keywords from EXIF/IPTC tag and subtract out those of which are
233
- # tagged people (determined by looking at face regions)
234
- def extract_keywords(people_names = [])
235
- [exif['Keywords'] || []].flatten - people_names
236
- end
237
-
238
- def load_value_from_xattr_plist attribute
239
- require 'nokogiri'
240
- xml = `xattr -p #{attribute} \"#{@file.path}\" | xxd -r -p | plutil -convert xml1 -o - -- - 2>/dev/null`
241
- return unless xml
242
- value = Nokogiri::XML.parse(r).xpath("//string").text
243
- return value.presence
244
- end
245
- end
246
- end
247
- end
@@ -1,19 +0,0 @@
1
- module Chronicle
2
- module ETL
3
- module Utils
4
- module HashUtilities
5
- def self.flatten_hash(hash)
6
- hash.each_with_object({}) do |(k, v), h|
7
- if v.is_a? Hash
8
- flatten_hash(v).map do |h_k, h_v|
9
- h["#{k}.#{h_k}".to_sym] = h_v
10
- end
11
- else
12
- h[k] = v
13
- end
14
- end
15
- end
16
- end
17
- end
18
- end
19
- end
@@ -1,15 +0,0 @@
1
- require 'active_support/core_ext/object/blank'
2
-
3
- module Chronicle
4
- module ETL
5
- module Utils
6
- # OCR for image files
7
- # TODO: add other strategies and document `macocr`
8
- module TextRecognition
9
- def self.recognize_in_image(filename:)
10
- `macocr "#{filename}" 2>/dev/null`.presence
11
- end
12
- end
13
- end
14
- end
15
- end