chronicle-etl 0.2.1 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (52) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +3 -0
  3. data/.rubocop.yml +3 -0
  4. data/README.md +22 -15
  5. data/chronicle-etl.gemspec +11 -5
  6. data/lib/chronicle/etl/cli/connectors.rb +19 -7
  7. data/lib/chronicle/etl/cli/jobs.rb +38 -27
  8. data/lib/chronicle/etl/cli/main.rb +10 -2
  9. data/lib/chronicle/etl/config.rb +24 -3
  10. data/lib/chronicle/etl/exceptions.rb +30 -0
  11. data/lib/chronicle/etl/extraction.rb +12 -0
  12. data/lib/chronicle/etl/extractors/csv_extractor.rb +43 -37
  13. data/lib/chronicle/etl/extractors/extractor.rb +19 -1
  14. data/lib/chronicle/etl/extractors/file_extractor.rb +15 -33
  15. data/lib/chronicle/etl/extractors/helpers/filesystem_reader.rb +104 -0
  16. data/lib/chronicle/etl/extractors/json_extractor.rb +45 -0
  17. data/lib/chronicle/etl/extractors/stdin_extractor.rb +6 -1
  18. data/lib/chronicle/etl/job.rb +72 -0
  19. data/lib/chronicle/etl/job_definition.rb +89 -0
  20. data/lib/chronicle/etl/job_log.rb +95 -0
  21. data/lib/chronicle/etl/job_logger.rb +81 -0
  22. data/lib/chronicle/etl/loaders/csv_loader.rb +6 -6
  23. data/lib/chronicle/etl/loaders/loader.rb +2 -2
  24. data/lib/chronicle/etl/loaders/rest_loader.rb +16 -9
  25. data/lib/chronicle/etl/loaders/stdout_loader.rb +8 -3
  26. data/lib/chronicle/etl/loaders/table_loader.rb +58 -7
  27. data/lib/chronicle/etl/logger.rb +48 -0
  28. data/lib/chronicle/etl/models/activity.rb +15 -0
  29. data/lib/chronicle/etl/models/attachment.rb +14 -0
  30. data/lib/chronicle/etl/models/base.rb +119 -0
  31. data/lib/chronicle/etl/models/entity.rb +21 -0
  32. data/lib/chronicle/etl/models/generic.rb +23 -0
  33. data/lib/chronicle/etl/registry/connector_registration.rb +61 -0
  34. data/lib/chronicle/etl/registry/registry.rb +52 -0
  35. data/lib/chronicle/etl/registry/self_registering.rb +25 -0
  36. data/lib/chronicle/etl/runner.rb +70 -42
  37. data/lib/chronicle/etl/serializers/jsonapi_serializer.rb +25 -0
  38. data/lib/chronicle/etl/serializers/serializer.rb +27 -0
  39. data/lib/chronicle/etl/transformers/image_file_transformer.rb +253 -0
  40. data/lib/chronicle/etl/transformers/null_transformer.rb +12 -4
  41. data/lib/chronicle/etl/transformers/transformer.rb +42 -12
  42. data/lib/chronicle/etl/utils/binary_attachments.rb +21 -0
  43. data/lib/chronicle/etl/utils/hash_utilities.rb +19 -0
  44. data/lib/chronicle/etl/utils/progress_bar.rb +3 -1
  45. data/lib/chronicle/etl/utils/text_recognition.rb +15 -0
  46. data/lib/chronicle/etl/version.rb +1 -1
  47. data/lib/chronicle/etl.rb +17 -1
  48. metadata +138 -35
  49. data/CHANGELOG.md +0 -23
  50. data/Gemfile.lock +0 -85
  51. data/lib/chronicle/etl/catalog.rb +0 -62
  52. data/lib/chronicle/etl/transformers/json_transformer.rb +0 -11
@@ -0,0 +1,253 @@
1
+ require 'mini_exiftool'
2
+ require 'active_support'
3
+ require 'active_support/core_ext/object'
4
+ require 'active_support/core_ext/time'
5
+ require 'active_support/core_ext/hash/reverse_merge'
6
+ require 'active_support/core_ext/string/inflections'
7
+
8
+ module Chronicle
9
+ module ETL
10
+ # Transform a JPEG or other image file into a record.
11
+ # By default, file mtime and a hash of the file content are used to build
12
+ # the timestamp and ID respectively but other options are available (such
13
+ # as reading EXIF tags or extended attributes from the filesystem).
14
+ #
15
+ # TODO: This should be extracted into its own plugin
16
+ class ImageFileTransformer < Chronicle::ETL::Transformer
17
+ register_connector do |r|
18
+ r.identifier = 'image-file'
19
+ r.description = 'an image file'
20
+ end
21
+
22
+ DEFAULT_OPTIONS = {
23
+ timestamp_strategy: 'file_mtime',
24
+ id_strategy: 'file_hash',
25
+ verb: 'photographed',
26
+
27
+ # EXIF tags often don't have timezones
28
+ timezone_default: 'Eastern Time (US & Canada)',
29
+ include_image_data: true
30
+ }.freeze
31
+
32
+ def initialize(*args)
33
+ super(*args)
34
+ @options = @options.reverse_merge(DEFAULT_OPTIONS)
35
+ end
36
+
37
# Open the file named by the extraction, build a record from it, and
# return that record.
#
# The handle is stored in @file because the builder helpers and
# #friendly_identifier read the path from it. It is closed in `ensure` so
# that an error raised while building the record does not leak the
# descriptor.
#
# FIXME: set @filename; use block for reading file when necessary
def transform
  @file = File.open(@extraction.data)
  build_created(@file)
ensure
  # @file is nil if File.open itself failed on a fresh instance
  @file&.close
end
44
+
45
+ def friendly_identifier
46
+ @file.path
47
+ end
48
+
49
+ def id
50
+ @id ||= begin
51
+ id = build_with_strategy(field: :id, strategy: @options[:id_strategy])
52
+ raise UntransformableRecordError.new("Could not build id", transformation: self) unless id
53
+
54
+ id
55
+ end
56
+ end
57
+
58
+ def timestamp
59
+ @timestamp ||= begin
60
+ ts = build_with_strategy(field: :timestamp, strategy: @options[:timestamp_strategy])
61
+ raise UntransformableRecordError.new("Could not build timestamp", transformation: self) unless ts
62
+
63
+ ts
64
+ end
65
+ end
66
+
67
+ private
68
+
69
+ def build_created(file)
70
+ record = ::Chronicle::ETL::Models::Activity.new
71
+ record.verb = @options[:verb]
72
+ record.provider = @options[:provider]
73
+ record.provider_id = id
74
+ record.end_at = timestamp
75
+ record.dedupe_on = [[:provider_id, :verb, :provider]]
76
+
77
+ record.involved = build_image
78
+ record.actor = build_actor
79
+
80
+ record.assign_attributes(build_gps)
81
+ record
82
+ end
83
+
84
+ def build_actor
85
+ actor = ::Chronicle::ETL::Models::Entity.new
86
+ actor.represents = 'identity'
87
+ actor.provider = @options[:actor][:provider]
88
+ actor.slug = @options[:actor][:slug]
89
+ actor.dedupe_on = [[:provider, :slug, :represents]]
90
+ actor
91
+ end
92
+
93
+ def build_image
94
+ image = ::Chronicle::ETL::Models::Entity.new
95
+ image.represents = @options[:involved][:represents]
96
+ image.title = build_title
97
+ image.body = exif['Description']
98
+ image.provider = @options[:involved][:provider]
99
+ image.provider_id = id
100
+ image.assign_attributes(build_gps)
101
+ image.dedupe_on = [[:provider, :provider_id, :represents]]
102
+
103
+ if @options[:ocr_strategy]
104
+ ocr_text = build_with_strategy(field: :ocr, strategy: @options[:ocr_strategy])
105
+ image.metadata[:ocr_text] = ocr_text if ocr_text
106
+ end
107
+
108
+ names = extract_people_depicted
109
+ tags = extract_keywords(names)
110
+
111
+ image.depicts = build_people_depicted(names)
112
+ image.abouts = build_keywords(tags)
113
+
114
+ if @options[:include_image_data]
115
+ attachment = ::Chronicle::ETL::Models::Attachment.new
116
+ attachment.data = build_image_data
117
+ image.attachments = [attachment]
118
+ end
119
+
120
+ image
121
+ end
122
+
123
+ def build_keywords(topics)
124
+ topics.map do |topic|
125
+ t = ::Chronicle::ETL::Models::Entity.new
126
+ t.represents = 'topic'
127
+ t.provider = @options[:involved][:provider]
128
+ t.title = topic
129
+ t.slug = topic.parameterize
130
+ t.dedupe_on = [[:provider, :represents, :slug]]
131
+ t
132
+ end
133
+ end
134
+
135
+ def build_people_depicted(names)
136
+ names.map do |name|
137
+ identity = ::Chronicle::ETL::Models::Entity.new
138
+ identity.represents = 'identity'
139
+ identity.provider = @options[:involved][:provider]
140
+ identity.slug = name.parameterize
141
+ identity.title = name
142
+ identity.dedupe_on = [[:provider, :represents, :slug]]
143
+ identity
144
+ end
145
+ end
146
+
147
+ def build_gps
148
+ return {} unless exif['GPSLatitude']
149
+
150
+ {
151
+ lat: exif['GPSLatitude'],
152
+ lng: exif['GPSLongitude'],
153
+ elevation: exif['GPSAltitude']
154
+ }
155
+ end
156
+
157
+ def build_image_data
158
+ ::Chronicle::ETL::Utils::BinaryAttachments.filename_to_base64(filename: @file.path)
159
+ end
160
+
161
+ def build_title
162
+ File.basename(@file)
163
+ end
164
+
165
+ def build_with_strategy(field:, strategy:[])
166
+ strategies = [strategy].flatten.compact
167
+ strategies.each do |s|
168
+ builder_method = "build_#{field}_using_#{s}"
169
+ result = send(builder_method.to_sym)
170
+ return result if result
171
+ end
172
+ return
173
+ end
174
+
175
# ID strategy `file_hash`: the hex SHA-256 digest of the file's bytes.
def build_id_using_file_hash
  # Required here because nothing else in this file loads Digest
  require 'digest'

  # Digest.file streams the file rather than reading it all into memory
  Digest::SHA256.file(@file.path).hexdigest
end
178
+
179
+ def build_id_using_xattr_version
180
+ load_value_from_xattr_plist("com.apple.metadata:kMDItemVersion")
181
+ end
182
+
183
+ def build_id_using_xmp_document_id
184
+ exif['OriginalDocumentID'] || exif['DerivedFromDocumentID']
185
+ end
186
+
187
+ def build_timestamp_using_file_mtime
188
+ File.mtime(@file)
189
+ end
190
+
191
# Timestamp strategy `exif_datetimeoriginal`: build the timestamp from the
# EXIF DateTimeOriginal tag.
#
# Returns nil when the tag is absent. When OffsetTimeOriginal is present the
# timestamp is re-expressed with that offset; otherwise it is parsed in the
# zone named by the :timezone_default option.
def build_timestamp_using_exif_datetimeoriginal
  # EXIF tags don't have timezone information. This is a DateTime in UTC
  timestamp = exif['DateTimeOriginal']
  return unless timestamp

  # Offset tags are only available in newer EXIF tags. If one exists, we
  # use it instead of UTC
  offset = exif['OffsetTimeOriginal']
  return timestamp.change(offset: offset) if offset

  # TODO: support option of using GPS coordinates to determine timezone

  zone = ActiveSupport::TimeZone.new(@options[:timezone_default])
  zone.parse(timestamp.asctime)
end
208
+
209
+ # TODO: add documentation for how to set up `macocr`
210
# OCR strategy `macocr`: hand the image path to the shared text
# recognition utility instead of duplicating its `macocr` invocation here.
def build_ocr_using_macocr
  ::Chronicle::ETL::Utils::TextRecognition.recognize_in_image(filename: @file.path)
end
213
+
214
+ def exif
215
+ @exif ||= MiniExiftool.new(
216
+ @file.path,
217
+ numerical: true,
218
+
219
+ # EXIF timestamps don't have timezone information. MiniExifTool uses Time
220
+ # by default which parses timestamps in local time zone. Using DateTime
221
+ # parses dates as UTC and then we can apply a timezone offset if the optional
222
+ # EXIF timezone offset fields are available.
223
+ # https://github.com/janfri/mini_exiftool/issues/39#issuecomment-832587649
224
+ timestamps: DateTime
225
+ )
226
+ end
227
+
228
+ # Figure out which faces are tagged as regions and return a list of their names
229
+ def extract_people_depicted
230
+ return [] unless exif['RegionName']
231
+
232
+ names = [exif['RegionName']].flatten
233
+ types = [exif['RegionType']].flatten
234
+
235
+ names.zip(types).select{|x| x[1] == 'Face'}.map{|x| x[0]}.uniq
236
+ end
237
+
238
+ # Extract image keywords from EXIF/IPTC tag and subtract out those which are
239
+ # tagged people (determined by looking at face regions)
240
+ def extract_keywords(people_names = [])
241
+ [exif['Keywords'] || []].flatten - people_names
242
+ end
243
+
244
# Read one extended attribute of the image file and return the text of the
# <string> elements of its plist value, or nil when there is none.
#
# The attribute is fetched with `xattr -p`, turned back into bytes with
# `xxd -r -p`, and converted to XML by `plutil`.
#
# @param attribute [String] name of the extended attribute to read
# @return [String, nil]
def load_value_from_xattr_plist(attribute)
  require 'nokogiri'
  require 'shellwords'

  # Escape both values so a path containing quotes or `$` cannot alter the command
  command = "xattr -p #{attribute.shellescape} #{@file.path.shellescape} " \
            '| xxd -r -p | plutil -convert xml1 -o - -- - 2>/dev/null'
  xml = `#{command}`

  # Backticks always return a String, so check for empty output rather than nil
  return if xml.strip.empty?

  Nokogiri::XML.parse(xml).xpath('//string').text.presence
end
251
+ end
252
+ end
253
+ end
@@ -1,10 +1,18 @@
1
1
  module Chronicle
2
2
  module ETL
3
3
  class NullTransformer < Chronicle::ETL::Transformer
4
- def transform data
5
- return data
4
+ register_connector do |r|
5
+ r.identifier = 'null'
6
+ r.description = 'in no way'
7
+ end
8
+
9
+ def transform
10
+ Chronicle::ETL::Models::Generic.new(@extraction.data)
6
11
  end
7
- end
8
12
 
13
+ def timestamp; end
14
+
15
+ def id; end
16
+ end
9
17
  end
10
- end
18
+ end
@@ -2,33 +2,63 @@ module Chronicle
2
2
  module ETL
3
3
  # Abstract class representing a Transformer for an ETL job
4
4
  class Transformer
5
- extend Chronicle::ETL::Catalog
5
+ extend Chronicle::ETL::Registry::SelfRegistering
6
6
 
7
7
  # Construct a new instance of this transformer. Options are passed in from a Runner
8
- # == Paramters:
8
+ # == Parameters:
9
9
  # options::
10
10
  # Options for configuring this Transformer
11
- def initialize(options = {})
11
+ def initialize(options = {}, extraction)
12
12
  @options = options
13
+ @extraction = extraction
13
14
  end
14
15
 
15
- # The main entrypoint for transforming a record. Called by a Runner on each extracted record
16
- def transform data
17
- raise NotImplementedError
18
- end
16
+ # @abstract Subclass is expected to implement #transform
17
+ # @!method transform
18
+ # The main entrypoint for transforming a record. Called by a Runner on each extracted record
19
19
 
20
20
  # The domain or provider-specific id of the record this transformer is working on.
21
- # Used for building a cursor so an extractor doesn't have to start from the beginning of a
22
- # data source from the beginning.
23
- def id; end
21
+ # It is useful for:
22
+ # - de-duping records that might exist in the loader's destination
23
+ # - building a cursor so an extractor doesn't have to start from the beginning of a
24
+ # data source
25
+ def id
26
+ raise NotImplementedError
27
+ end
24
28
 
25
29
  # The domain or provider-specific timestamp of the record this transformer is working on.
26
30
  # Used for building a cursor so an extractor doesn't have to start from the beginning of a
27
31
  # data source.
28
- def timestamp; end
32
+ def timestamp
33
+ raise NotImplementedError
34
+ end
35
+
36
+ # An optional, human-readable identifier for a transformation, intended for debugging or logging.
37
+ # By default, it is just the id.
38
+ def friendly_identifier
39
+ id
40
+ end
41
+
42
+ def to_s
43
+ ts = begin
44
+ unknown = "???"
45
+ timestamp&.iso8601 || unknown
46
+ rescue TransformationError, NotImplementedError
47
+ unknown
48
+ end
49
+
50
+ identifier = begin
51
+ unknown = self.class.to_s
52
+ friendly_identifier || self.class.to_s
53
+ rescue TransformationError, NotImplementedError
54
+ unknown
55
+ end
56
+
57
+ "[#{ts}] #{identifier}"
58
+ end
29
59
  end
30
60
  end
31
61
  end
32
62
 
33
- require_relative 'json_transformer'
34
63
  require_relative 'null_transformer'
64
+ require_relative 'image_file_transformer'
@@ -0,0 +1,21 @@
1
+ require 'marcel'
2
+ require 'base64'
3
+
4
module Chronicle
  module ETL
    module Utils
      # Utility methods for dealing with binary files
      module BinaryAttachments
        # Encode a file as a base64 `data:` URI.
        #
        # @param filename [String] path of the file to encode
        # @param mimetype [String, nil] MIME type to declare; guessed from the
        #   file when omitted
        # @return [String] "data:<mimetype>;base64,<payload>"
        def self.filename_to_base64(filename:, mimetype: nil)
          mimetype ||= guess_mimetype(filename: filename)

          # binread: the payload is binary, so avoid text-mode translation
          "data:#{mimetype};base64,#{Base64.strict_encode64(File.binread(filename))}"
        end

        # Guess the MIME type of the file at +filename+.
        #
        # Marcel only opens and sniffs a file when it is given a Pathname (or
        # IO); a plain String is treated as the content itself. The basename
        # is passed as a fallback hint.
        def self.guess_mimetype(filename:)
          require 'pathname'

          Marcel::MimeType.for(Pathname.new(filename), name: File.basename(filename))
        end
      end
    end
  end
end
@@ -0,0 +1,19 @@
1
module Chronicle
  module ETL
    module Utils
      # Helpers for reshaping hashes
      module HashUtilities
        # Collapse nested hashes into a single-level hash.
        #
        # Keys of nested values are joined to their parent key with "." and
        # converted to a Symbol; keys whose value is not a Hash are kept
        # unchanged. A nested hash that is empty contributes no entries.
        #
        # @param hash [Hash]
        # @return [Hash] a new hash; the input is not modified
        def self.flatten_hash(hash)
          hash.each_with_object({}) do |(key, value), flat|
            if value.is_a?(Hash)
              flatten_hash(value).each do |nested_key, nested_value|
                flat[:"#{key}.#{nested_key}"] = nested_value
              end
            else
              flat[key] = value
            end
          end
        end
      end
    end
  end
end
@@ -64,7 +64,9 @@ module Chronicle
64
64
  end
65
65
 
66
66
  def log(message)
67
- @pbar.log message.inspect
67
+ message.split("\n").each do |line|
68
+ @pbar.log message
69
+ end
68
70
  end
69
71
 
70
72
  def finish
@@ -0,0 +1,15 @@
1
+ require 'active_support/core_ext/object/blank'
2
+
3
module Chronicle
  module ETL
    module Utils
      # OCR for image files
      # TODO: add other strategies and document `macocr`
      module TextRecognition
        # Run the `macocr` command on an image and return what it prints.
        #
        # The command is started with an argument array, so the filename is
        # never interpreted by a shell. Standard error is discarded.
        #
        # @param filename [String] path of the image to read
        # @return [String, nil] the command's output, or nil when the output
        #   is blank or `macocr` is not installed
        def self.recognize_in_image(filename:)
          IO.popen(['macocr', filename.to_s], err: File::NULL, &:read).presence
        rescue Errno::ENOENT
          # macocr is not installed; the shell form also produced no text here
          nil
        end
      end
    end
  end
end
@@ -1,5 +1,5 @@
1
1
  module Chronicle
2
2
  module ETL
3
- VERSION = "0.2.1"
3
+ VERSION = "0.3.0"
4
4
  end
5
5
  end
data/lib/chronicle/etl.rb CHANGED
@@ -1,8 +1,24 @@
1
- require_relative 'etl/catalog'
1
+ require_relative 'etl/registry/registry'
2
2
  require_relative 'etl/config'
3
+ require_relative 'etl/exceptions'
4
+ require_relative 'etl/extraction'
3
5
  require_relative 'etl/extractors/extractor'
6
+ require_relative 'etl/job_definition'
7
+ require_relative 'etl/job_log'
8
+ require_relative 'etl/job_logger'
9
+ require_relative 'etl/job'
4
10
  require_relative 'etl/loaders/loader'
11
+ require_relative 'etl/logger'
12
+ require_relative 'etl/models/activity'
13
+ require_relative 'etl/models/attachment'
14
+ require_relative 'etl/models/base'
15
+ require_relative 'etl/models/entity'
16
+ require_relative 'etl/models/generic'
5
17
  require_relative 'etl/runner'
18
+ require_relative 'etl/serializers/serializer'
6
19
  require_relative 'etl/transformers/transformer'
20
+ require_relative 'etl/utils/binary_attachments'
21
+ require_relative 'etl/utils/hash_utilities'
22
+ require_relative 'etl/utils/text_recognition'
7
23
  require_relative 'etl/utils/progress_bar'
8
24
  require_relative 'etl/version'