chronicle-etl 0.2.4 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. checksums.yaml +4 -4
  2. data/.github/workflows/ruby.yml +35 -0
  3. data/.gitignore +3 -0
  4. data/.rubocop.yml +31 -1
  5. data/Guardfile +7 -0
  6. data/README.md +21 -14
  7. data/Rakefile +4 -2
  8. data/chronicle-etl.gemspec +18 -10
  9. data/exe/chronicle-etl +1 -1
  10. data/lib/chronicle/etl/cli/connectors.rb +53 -7
  11. data/lib/chronicle/etl/cli/jobs.rb +59 -24
  12. data/lib/chronicle/etl/cli/main.rb +18 -16
  13. data/lib/chronicle/etl/cli/subcommand_base.rb +2 -2
  14. data/lib/chronicle/etl/cli.rb +7 -0
  15. data/lib/chronicle/etl/config.rb +1 -1
  16. data/lib/chronicle/etl/configurable.rb +150 -0
  17. data/lib/chronicle/etl/exceptions.rb +14 -1
  18. data/lib/chronicle/etl/extraction.rb +12 -0
  19. data/lib/chronicle/etl/extractors/csv_extractor.rb +32 -31
  20. data/lib/chronicle/etl/extractors/extractor.rb +25 -13
  21. data/lib/chronicle/etl/extractors/file_extractor.rb +17 -32
  22. data/lib/chronicle/etl/extractors/helpers/filesystem_reader.rb +104 -0
  23. data/lib/chronicle/etl/extractors/json_extractor.rb +37 -0
  24. data/lib/chronicle/etl/extractors/stdin_extractor.rb +6 -1
  25. data/lib/chronicle/etl/job.rb +30 -29
  26. data/lib/chronicle/etl/job_definition.rb +45 -7
  27. data/lib/chronicle/etl/job_log.rb +10 -0
  28. data/lib/chronicle/etl/job_logger.rb +23 -20
  29. data/lib/chronicle/etl/loaders/csv_loader.rb +5 -1
  30. data/lib/chronicle/etl/loaders/loader.rb +5 -2
  31. data/lib/chronicle/etl/loaders/rest_loader.rb +9 -5
  32. data/lib/chronicle/etl/loaders/stdout_loader.rb +6 -1
  33. data/lib/chronicle/etl/loaders/table_loader.rb +51 -7
  34. data/lib/chronicle/etl/logger.rb +48 -0
  35. data/lib/chronicle/etl/models/attachment.rb +14 -0
  36. data/lib/chronicle/etl/models/base.rb +23 -7
  37. data/lib/chronicle/etl/models/entity.rb +9 -3
  38. data/lib/chronicle/etl/registry/connector_registration.rb +62 -0
  39. data/lib/chronicle/etl/registry/registry.rb +52 -0
  40. data/lib/chronicle/etl/registry/self_registering.rb +25 -0
  41. data/lib/chronicle/etl/runner.rb +58 -7
  42. data/lib/chronicle/etl/serializers/jsonapi_serializer.rb +25 -0
  43. data/lib/chronicle/etl/serializers/serializer.rb +27 -0
  44. data/lib/chronicle/etl/transformers/image_file_transformer.rb +247 -0
  45. data/lib/chronicle/etl/transformers/null_transformer.rb +10 -1
  46. data/lib/chronicle/etl/transformers/transformer.rb +41 -10
  47. data/lib/chronicle/etl/utils/binary_attachments.rb +21 -0
  48. data/lib/chronicle/etl/utils/progress_bar.rb +3 -1
  49. data/lib/chronicle/etl/utils/text_recognition.rb +15 -0
  50. data/lib/chronicle/etl/version.rb +1 -1
  51. data/lib/chronicle/etl.rb +8 -2
  52. metadata +146 -34
  53. data/.ruby-version +0 -1
  54. data/Gemfile.lock +0 -91
  55. data/lib/chronicle/etl/catalog.rb +0 -108
  56. data/lib/chronicle/etl/utils/jsonapi.rb +0 -28
@@ -1,4 +1,5 @@
1
1
  require 'colorize'
2
+ require 'chronic_duration'
2
3
 
3
4
  class Chronicle::ETL::Runner
4
5
  def initialize(job)
@@ -13,25 +14,75 @@ class Chronicle::ETL::Runner
13
14
  @job_logger.start
14
15
  loader.start
15
16
 
17
+ extractor.prepare
16
18
  total = extractor.results_count
17
- progress_bar = Chronicle::ETL::Utils::ProgressBar.new(title: 'Running job', total: total)
19
+ @progress_bar = Chronicle::ETL::Utils::ProgressBar.new(title: 'Running job', total: total)
20
+ Chronicle::ETL::Logger.attach_to_progress_bar(@progress_bar)
18
21
 
19
- extractor.extract do |data, metadata|
20
- transformer = @job.instantiate_transformer(data)
22
+ Chronicle::ETL::Logger.info(tty_log_job_start)
23
+ extractor.extract do |extraction|
24
+ unless extraction.is_a?(Chronicle::ETL::Extraction)
25
+ raise Chronicle::ETL::RunnerTypeError, "Extracted should be a Chronicle::ETL::Extraction"
26
+ end
27
+
28
+ transformer = @job.instantiate_transformer(extraction)
21
29
  record = transformer.transform
22
30
 
23
31
  unless record.is_a?(Chronicle::ETL::Models::Base)
24
- raise Chronicle::ETL::InvalidTransformedRecordError, "Transformed data is not a type of Chronicle::ETL::Models"
32
+ raise Chronicle::ETL::RunnerTypeError, "Transformed data should be a type of Chronicle::ETL::Models"
25
33
  end
26
34
 
35
+ Chronicle::ETL::Logger.info(tty_log_transformation(transformer))
27
36
  @job_logger.log_transformation(transformer)
28
- loader.load(record)
29
- progress_bar.increment
37
+
38
+ loader.load(record) unless @job.dry_run?
39
+ rescue Chronicle::ETL::TransformationError => e
40
+ Chronicle::ETL::Logger.error(tty_log_transformation_failure(e))
41
+ ensure
42
+ @progress_bar.increment
30
43
  end
31
44
 
32
- progress_bar.finish
45
+ @progress_bar.finish
33
46
  loader.finish
34
47
  @job_logger.finish
48
+ rescue Interrupt
49
+ Chronicle::ETL::Logger.error("\n#{'Job interrupted'.red}")
50
+ @job_logger.error
51
+ rescue StandardError => e
52
+ raise e
53
+ ensure
35
54
  @job_logger.save
55
+ @progress_bar.finish
56
+ Chronicle::ETL::Logger.detach_from_progress_bar
57
+ Chronicle::ETL::Logger.info(tty_log_completion)
58
+ end
59
+
60
+ private
61
+
62
+ def tty_log_job_start
63
+ output = "Beginning job "
64
+ output += "'#{@job.name}'".bold if @job.name
65
+ output
66
+ end
67
+
68
+ def tty_log_transformation transformer
69
+ output = " ✓".green
70
+ output += " #{transformer}"
71
+ end
72
+
73
+ def tty_log_transformation_failure exception
74
+ output = " ✖".red
75
+ output += " Failed to build #{exception.transformation}. #{exception.message}"
76
+ end
77
+
78
+ def tty_log_completion
79
+ status = @job_logger.success ? 'Success' : 'Failed'
80
+ output = "\nCompleted job "
81
+ output += "'#{@job.name}'".bold if @job.name
82
+ output += " in #{ChronicDuration.output(@job_logger.duration)}" if @job_logger.duration
83
+ output += "\n Status:\t".light_black + status
84
+ output += "\n Completed:\t".light_black + "#{@job_logger.job_log.num_records_processed}"
85
+ output += "\n Latest:\t".light_black + "#{@job_logger.job_log.highest_timestamp.iso8601}" if @job_logger.job_log.highest_timestamp
86
+ output
36
87
  end
37
88
  end
@@ -0,0 +1,25 @@
1
+ module Chronicle
2
+ module ETL
3
+ class JSONAPISerializer < Chronicle::ETL::Serializer
4
+ def serializable_hash
5
+ @record
6
+ .identifier_hash
7
+ .merge({ attributes: @record.attributes })
8
+ .merge({ relationships: build_associations })
9
+ .merge(@record.meta_hash)
10
+ end
11
+
12
+ def build_associations
13
+ @record.associations.transform_values do |value|
14
+ association_data =
15
+ if value.is_a?(Array)
16
+ value.map { |record| JSONAPISerializer.new(record).serializable_hash }
17
+ else
18
+ JSONAPISerializer.new(value).serializable_hash
19
+ end
20
+ { data: association_data }
21
+ end
22
+ end
23
+ end
24
+ end
25
+ end
@@ -0,0 +1,27 @@
1
+ module Chronicle
2
+ module ETL
3
+ # Abstract class representing a Serializer for an ETL record
4
+ class Serializer
5
+ # Construct a new instance of this serializer.
6
+ # == Parameters:
7
+ # options::
8
+ # Options for configuring this Serializer
9
+ def initialize(record, options = {})
10
+ @record = record
11
+ @options = options
12
+ end
13
+
14
+ # Serialize a record as a hash
15
+ def serializable_hash
16
+ raise NotImplementedError
17
+ end
18
+
19
+ def self.serialize(record)
20
+ serializer = self.new(record)
21
+ serializer.serializable_hash
22
+ end
23
+ end
24
+ end
25
+ end
26
+
27
+ require_relative 'jsonapi_serializer'
@@ -0,0 +1,247 @@
1
+ require 'mini_exiftool'
2
+ require 'active_support'
3
+ require 'active_support/core_ext/object'
4
+ require 'active_support/core_ext/time'
5
+ require 'active_support/core_ext/hash/reverse_merge'
6
+ require 'active_support/core_ext/string/inflections'
7
+
8
+ module Chronicle
9
+ module ETL
10
+ # Transform a JPEG or other image file into a record.
11
+ # By default, file mtime and a hash of the file content are used to build
12
+ # the timestamp and ID respectively but other options are available (such
13
+ # as reading EXIF tags or extended attributes from the filesystem).
14
+ #
15
+ # TODO: This should be extracted into its own plugin
16
+ class ImageFileTransformer < Chronicle::ETL::Transformer
17
+ register_connector do |r|
18
+ r.identifier = 'image-file'
19
+ r.description = 'an image file'
20
+ end
21
+
22
+ setting :timestamp_strategy, default: 'file_mtime'
23
+ setting :id_strategy, default: 'file_hash'
24
+ setting :verb, default: 'photographed'
25
+ # EXIF tags often don't have timezones
26
+ setting :timezone_default, default: 'Eastern Time (US & Canada)'
27
+ setting :include_image_data, default: true
28
+ setting :actor
29
+ setting :involved
30
+
31
+ def transform
32
+ # FIXME: set @filename; use block for reading file when necessary
33
+ @file = File.open(@extraction.data)
34
+ record = build_created(@file)
35
+ @file.close
36
+ record
37
+ end
38
+
39
+ def friendly_identifier
40
+ @file.path
41
+ end
42
+
43
+ def id
44
+ @id ||= begin
45
+ id = build_with_strategy(field: :id, strategy: @config.id_strategy)
46
+ raise UntransformableRecordError.new("Could not build id", transformation: self) unless id
47
+
48
+ id
49
+ end
50
+ end
51
+
52
+ def timestamp
53
+ @timestamp ||= begin
54
+ ts = build_with_strategy(field: :timestamp, strategy: @config.timestamp_strategy)
55
+ raise UntransformableRecordError.new("Could not build timestamp", transformation: self) unless ts
56
+
57
+ ts
58
+ end
59
+ end
60
+
61
+ private
62
+
63
+ def build_created(file)
64
+ record = ::Chronicle::ETL::Models::Activity.new
65
+ record.verb = @config.verb
66
+ record.provider = @config.provider
67
+ record.provider_id = id
68
+ record.end_at = timestamp
69
+ record.dedupe_on = [[:provider_id, :verb, :provider]]
70
+
71
+ record.involved = build_image
72
+ record.actor = build_actor
73
+
74
+ record.assign_attributes(build_gps)
75
+ record
76
+ end
77
+
78
+ def build_actor
79
+ actor = ::Chronicle::ETL::Models::Entity.new
80
+ actor.represents = 'identity'
81
+ actor.provider = @config.actor[:provider]
82
+ actor.slug = @config.actor[:slug]
83
+ actor.dedupe_on = [[:provider, :slug, :represents]]
84
+ actor
85
+ end
86
+
87
+ def build_image
88
+ image = ::Chronicle::ETL::Models::Entity.new
89
+ image.represents = @config.involved[:represents]
90
+ image.title = build_title
91
+ image.body = exif['Description']
92
+ image.provider = @config.involved[:provider]
93
+ image.provider_id = id
94
+ image.assign_attributes(build_gps)
95
+ image.dedupe_on = [[:provider, :provider_id, :represents]]
96
+
97
+ if @config.ocr_strategy
98
+ ocr_text = build_with_strategy(field: :ocr, strategy: @config.ocr_strategy)
99
+ image.metadata[:ocr_text] = ocr_text if ocr_text
100
+ end
101
+
102
+ names = extract_people_depicted
103
+ tags = extract_keywords(names)
104
+
105
+ image.depicts = build_people_depicted(names)
106
+ image.abouts = build_keywords(tags)
107
+
108
+ if @config.include_image_data
109
+ attachment = ::Chronicle::ETL::Models::Attachment.new
110
+ attachment.data = build_image_data
111
+ image.attachments = [attachment]
112
+ end
113
+
114
+ image
115
+ end
116
+
117
+ def build_keywords(topics)
118
+ topics.map do |topic|
119
+ t = ::Chronicle::ETL::Models::Entity.new
120
+ t.represents = 'topic'
121
+ t.provider = @config.involved[:provider]
122
+ t.title = topic
123
+ t.slug = topic.parameterize
124
+ t.dedupe_on = [[:provider, :represents, :slug]]
125
+ t
126
+ end
127
+ end
128
+
129
+ def build_people_depicted(names)
130
+ names.map do |name|
131
+ identity = ::Chronicle::ETL::Models::Entity.new
132
+ identity.represents = 'identity'
133
+ identity.provider = @config.involved[:provider]
134
+ identity.slug = name.parameterize
135
+ identity.title = name
136
+ identity.dedupe_on = [[:provider, :represents, :slug]]
137
+ identity
138
+ end
139
+ end
140
+
141
+ def build_gps
142
+ return {} unless exif['GPSLatitude']
143
+
144
+ {
145
+ lat: exif['GPSLatitude'],
146
+ lng: exif['GPSLongitude'],
147
+ elevation: exif['GPSAltitude']
148
+ }
149
+ end
150
+
151
+ def build_image_data
152
+ ::Chronicle::ETL::Utils::BinaryAttachments.filename_to_base64(filename: @file.path)
153
+ end
154
+
155
+ def build_title
156
+ File.basename(@file)
157
+ end
158
+
159
+ def build_with_strategy(field:, strategy:[])
160
+ strategies = [strategy].flatten.compact
161
+ strategies.each do |s|
162
+ builder_method = "build_#{field}_using_#{s}"
163
+ result = send(builder_method.to_sym)
164
+ return result if result
165
+ end
166
+ return
167
+ end
168
+
169
+ def build_id_using_file_hash
170
+ Digest::SHA256.hexdigest(File.read(@file))
171
+ end
172
+
173
+ def build_id_using_xattr_version
174
+ load_value_from_xattr_plist("com.apple.metadata:kMDItemVersion")
175
+ end
176
+
177
+ def build_id_using_xmp_document_id
178
+ exif['OriginalDocumentID'] || exif['DerivedFromDocumentID']
179
+ end
180
+
181
+ def build_timestamp_using_file_mtime
182
+ File.mtime(@file)
183
+ end
184
+
185
+ def build_timestamp_using_exif_datetimeoriginal
186
+ # EXIF tags don't have timezone information. This is a DateTime in UTC
187
+ timestamp = exif['DateTimeOriginal'] || return
188
+
189
+ if exif['OffsetTimeOriginal']
190
+ # Offset tags are only available in newer EXIF tags. If it exists, we
191
+ # use it instead of UTC
192
+ timestamp = timestamp.change(offset: exif['OffsetTimeOriginal'])
193
+ elsif false
194
+ # TODO: support option of using GPS coordinates to determine timezone
195
+ else
196
+ zone = ActiveSupport::TimeZone.new(@config.timezone_default)
197
+ timestamp = zone.parse(timestamp.asctime)
198
+ end
199
+
200
+ timestamp
201
+ end
202
+
203
+ # TODO: add documentation for how to set up `macocr`
204
+ def build_ocr_using_macocr
205
+ `macocr "#{@file.path}" 2>/dev/null`.presence
206
+ end
207
+
208
+ def exif
209
+ @exif ||= MiniExiftool.new(
210
+ @file.path,
211
+ numerical: true,
212
+
213
+ # EXIF timestamps don't have timezone information. MiniExifTool uses Time
214
+ # by default which parses timestamps in local time zone. Using DateTime
215
+ # parses dates as UTC and then we can apply a timezone offset if the optional
216
+ # EXIF timezone offset fields are available.
217
+ # https://github.com/janfri/mini_exiftool/issues/39#issuecomment-832587649
218
+ timestamps: DateTime
219
+ )
220
+ end
221
+
222
+ # Figure out which faces are tagged as regions and return a list of their names
223
+ def extract_people_depicted
224
+ return [] unless exif['RegionName']
225
+
226
+ names = [exif['RegionName']].flatten
227
+ types = [exif['RegionType']].flatten
228
+
229
+ names.zip(types).select{|x| x[1] == 'Face'}.map{|x| x[0]}.uniq
230
+ end
231
+
232
+ # Extract image keywords from EXIF/IPTC tag and subtract out those that are
233
+ # tagged people (determined by looking at face regions)
234
+ def extract_keywords(people_names = [])
235
+ [exif['Keywords'] || []].flatten - people_names
236
+ end
237
+
238
+ def load_value_from_xattr_plist attribute
239
+ require 'nokogiri'
240
+ xml = `xattr -p #{attribute} \"#{@file.path}\" | xxd -r -p | plutil -convert xml1 -o - -- - 2>/dev/null`
241
+ return unless xml
242
+ value = Nokogiri::XML.parse(r).xpath("//string").text
243
+ return value.presence
244
+ end
245
+ end
246
+ end
247
+ end
@@ -1,9 +1,18 @@
1
1
  module Chronicle
2
2
  module ETL
3
3
  class NullTransformer < Chronicle::ETL::Transformer
4
+ register_connector do |r|
5
+ r.identifier = 'null'
6
+ r.description = 'in no way'
7
+ end
8
+
4
9
  def transform
5
- Chronicle::ETL::Models::Generic.new(@data)
10
+ Chronicle::ETL::Models::Generic.new(@extraction.data)
6
11
  end
12
+
13
+ def timestamp; end
14
+
15
+ def id; end
7
16
  end
8
17
  end
9
18
  end
@@ -2,16 +2,16 @@ module Chronicle
2
2
  module ETL
3
3
  # Abstract class representing a Transformer for an ETL job
4
4
  class Transformer
5
- extend Chronicle::ETL::Catalog
5
+ extend Chronicle::ETL::Registry::SelfRegistering
6
+ include Chronicle::ETL::Configurable
6
7
 
7
8
  # Construct a new instance of this transformer. Options are passed in from a Runner
8
- # == Paramters:
9
+ # == Parameters:
9
10
  # options::
10
11
  # Options for configuring this Transformer
11
- def initialize(options = {}, data)
12
- @options = options
13
- @data = data
14
- @record = Chronicle::ETL::Models::Activity.new
12
+ def initialize(extraction, options = {})
13
+ @extraction = extraction
14
+ apply_options(options)
15
15
  end
16
16
 
17
17
  # @abstract Subclass is expected to implement #transform
@@ -19,16 +19,47 @@ module Chronicle
19
19
  # The main entrypoint for transforming a record. Called by a Runner on each extracted record
20
20
 
21
21
  # The domain or provider-specific id of the record this transformer is working on.
22
- # Used for building a cursor so an extractor doesn't have to start from the beginning of a
23
- # data source from the beginning.
24
- def id; end
22
+ # It is useful for:
23
+ # - de-duping records that might exist in the loader's destination
24
+ # - building a cursor so an extractor doesn't have to start from the beginning of a
25
+ # source
26
+ def id
27
+ raise NotImplementedError
28
+ end
25
29
 
26
30
  # The domain or provider-specific timestamp of the record this transformer is working on.
27
31
  # Used for building a cursor so an extractor doesn't have to start from the beginning of a
28
32
  # data source.
29
- def timestamp; end
33
+ def timestamp
34
+ raise NotImplementedError
35
+ end
36
+
37
+ # An optional, human-readable identifier for a transformation, intended for debugging or logging.
38
+ # By default, it is just the id.
39
+ def friendly_identifier
40
+ id
41
+ end
42
+
43
+ def to_s
44
+ ts = begin
45
+ unknown = "???"
46
+ timestamp&.iso8601 || unknown
47
+ rescue TransformationError, NotImplementedError
48
+ unknown
49
+ end
50
+
51
+ identifier = begin
52
+ unknown = self.class.to_s
53
+ friendly_identifier || self.class.to_s
54
+ rescue TransformationError, NotImplementedError
55
+ unknown
56
+ end
57
+
58
+ "[#{ts}] #{identifier}"
59
+ end
30
60
  end
31
61
  end
32
62
  end
33
63
 
34
64
  require_relative 'null_transformer'
65
+ require_relative 'image_file_transformer'
@@ -0,0 +1,21 @@
1
+ require 'marcel'
2
+ require 'base64'
3
+
4
+ module Chronicle
5
+ module ETL
6
+ module Utils
7
+ # Utility methods for dealing with binary files
8
+ module BinaryAttachments
9
+ def self.filename_to_base64(filename:, mimetype: nil)
10
+ mimetype = mimetype || guess_mimetype(filename: filename)
11
+
12
+ "data:#{mimetype};base64,#{Base64.strict_encode64(File.read(filename))}"
13
+ end
14
+
15
+ def self.guess_mimetype(filename:)
16
+ Marcel::MimeType.for(filename)
17
+ end
18
+ end
19
+ end
20
+ end
21
+ end
@@ -64,7 +64,9 @@ module Chronicle
64
64
  end
65
65
 
66
66
  def log(message)
67
- @pbar.log message
67
+ message.split("\n").each do |line|
68
+ @pbar.log message
69
+ end
68
70
  end
69
71
 
70
72
  def finish
@@ -0,0 +1,15 @@
1
+ require 'active_support/core_ext/object/blank'
2
+
3
+ module Chronicle
4
+ module ETL
5
+ module Utils
6
+ # OCR for image files
7
+ # TODO: add other strategies and document `macocr`
8
+ module TextRecognition
9
+ def self.recognize_in_image(filename:)
10
+ `macocr "#{filename}" 2>/dev/null`.presence
11
+ end
12
+ end
13
+ end
14
+ end
15
+ end
@@ -1,5 +1,5 @@
1
1
  module Chronicle
2
2
  module ETL
3
- VERSION = "0.2.4"
3
+ VERSION = "0.4.0"
4
4
  end
5
5
  end
data/lib/chronicle/etl.rb CHANGED
@@ -1,19 +1,25 @@
1
- require_relative 'etl/catalog'
1
+ require_relative 'etl/registry/registry'
2
2
  require_relative 'etl/config'
3
+ require_relative 'etl/configurable'
3
4
  require_relative 'etl/exceptions'
5
+ require_relative 'etl/extraction'
4
6
  require_relative 'etl/extractors/extractor'
5
7
  require_relative 'etl/job_definition'
6
8
  require_relative 'etl/job_log'
7
9
  require_relative 'etl/job_logger'
8
10
  require_relative 'etl/job'
9
11
  require_relative 'etl/loaders/loader'
12
+ require_relative 'etl/logger'
10
13
  require_relative 'etl/models/activity'
14
+ require_relative 'etl/models/attachment'
11
15
  require_relative 'etl/models/base'
12
16
  require_relative 'etl/models/entity'
13
17
  require_relative 'etl/models/generic'
14
18
  require_relative 'etl/runner'
19
+ require_relative 'etl/serializers/serializer'
15
20
  require_relative 'etl/transformers/transformer'
21
+ require_relative 'etl/utils/binary_attachments'
16
22
  require_relative 'etl/utils/hash_utilities'
17
- require_relative 'etl/utils/jsonapi'
23
+ require_relative 'etl/utils/text_recognition'
18
24
  require_relative 'etl/utils/progress_bar'
19
25
  require_relative 'etl/version'