chronicle-etl 0.2.4 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (56) hide show
  1. checksums.yaml +4 -4
  2. data/.github/workflows/ruby.yml +35 -0
  3. data/.gitignore +3 -0
  4. data/.rubocop.yml +31 -1
  5. data/Guardfile +7 -0
  6. data/README.md +21 -14
  7. data/Rakefile +4 -2
  8. data/chronicle-etl.gemspec +18 -10
  9. data/exe/chronicle-etl +1 -1
  10. data/lib/chronicle/etl/cli/connectors.rb +53 -7
  11. data/lib/chronicle/etl/cli/jobs.rb +59 -24
  12. data/lib/chronicle/etl/cli/main.rb +18 -16
  13. data/lib/chronicle/etl/cli/subcommand_base.rb +2 -2
  14. data/lib/chronicle/etl/cli.rb +7 -0
  15. data/lib/chronicle/etl/config.rb +1 -1
  16. data/lib/chronicle/etl/configurable.rb +150 -0
  17. data/lib/chronicle/etl/exceptions.rb +14 -1
  18. data/lib/chronicle/etl/extraction.rb +12 -0
  19. data/lib/chronicle/etl/extractors/csv_extractor.rb +32 -31
  20. data/lib/chronicle/etl/extractors/extractor.rb +25 -13
  21. data/lib/chronicle/etl/extractors/file_extractor.rb +17 -32
  22. data/lib/chronicle/etl/extractors/helpers/filesystem_reader.rb +104 -0
  23. data/lib/chronicle/etl/extractors/json_extractor.rb +37 -0
  24. data/lib/chronicle/etl/extractors/stdin_extractor.rb +6 -1
  25. data/lib/chronicle/etl/job.rb +30 -29
  26. data/lib/chronicle/etl/job_definition.rb +45 -7
  27. data/lib/chronicle/etl/job_log.rb +10 -0
  28. data/lib/chronicle/etl/job_logger.rb +23 -20
  29. data/lib/chronicle/etl/loaders/csv_loader.rb +5 -1
  30. data/lib/chronicle/etl/loaders/loader.rb +5 -2
  31. data/lib/chronicle/etl/loaders/rest_loader.rb +9 -5
  32. data/lib/chronicle/etl/loaders/stdout_loader.rb +6 -1
  33. data/lib/chronicle/etl/loaders/table_loader.rb +51 -7
  34. data/lib/chronicle/etl/logger.rb +48 -0
  35. data/lib/chronicle/etl/models/attachment.rb +14 -0
  36. data/lib/chronicle/etl/models/base.rb +23 -7
  37. data/lib/chronicle/etl/models/entity.rb +9 -3
  38. data/lib/chronicle/etl/registry/connector_registration.rb +62 -0
  39. data/lib/chronicle/etl/registry/registry.rb +52 -0
  40. data/lib/chronicle/etl/registry/self_registering.rb +25 -0
  41. data/lib/chronicle/etl/runner.rb +58 -7
  42. data/lib/chronicle/etl/serializers/jsonapi_serializer.rb +25 -0
  43. data/lib/chronicle/etl/serializers/serializer.rb +27 -0
  44. data/lib/chronicle/etl/transformers/image_file_transformer.rb +247 -0
  45. data/lib/chronicle/etl/transformers/null_transformer.rb +10 -1
  46. data/lib/chronicle/etl/transformers/transformer.rb +41 -10
  47. data/lib/chronicle/etl/utils/binary_attachments.rb +21 -0
  48. data/lib/chronicle/etl/utils/progress_bar.rb +3 -1
  49. data/lib/chronicle/etl/utils/text_recognition.rb +15 -0
  50. data/lib/chronicle/etl/version.rb +1 -1
  51. data/lib/chronicle/etl.rb +8 -2
  52. metadata +146 -34
  53. data/.ruby-version +0 -1
  54. data/Gemfile.lock +0 -91
  55. data/lib/chronicle/etl/catalog.rb +0 -108
  56. data/lib/chronicle/etl/utils/jsonapi.rb +0 -28
@@ -1,4 +1,5 @@
1
1
  require 'colorize'
2
+ require 'chronic_duration'
2
3
 
3
4
  class Chronicle::ETL::Runner
4
5
  def initialize(job)
@@ -13,25 +14,75 @@ class Chronicle::ETL::Runner
13
14
  @job_logger.start
14
15
  loader.start
15
16
 
17
+ extractor.prepare
16
18
  total = extractor.results_count
17
- progress_bar = Chronicle::ETL::Utils::ProgressBar.new(title: 'Running job', total: total)
19
+ @progress_bar = Chronicle::ETL::Utils::ProgressBar.new(title: 'Running job', total: total)
20
+ Chronicle::ETL::Logger.attach_to_progress_bar(@progress_bar)
18
21
 
19
- extractor.extract do |data, metadata|
20
- transformer = @job.instantiate_transformer(data)
22
+ Chronicle::ETL::Logger.info(tty_log_job_start)
23
+ extractor.extract do |extraction|
24
+ unless extraction.is_a?(Chronicle::ETL::Extraction)
25
+ raise Chronicle::ETL::RunnerTypeError, "Extracted should be a Chronicle::ETL::Extraction"
26
+ end
27
+
28
+ transformer = @job.instantiate_transformer(extraction)
21
29
  record = transformer.transform
22
30
 
23
31
  unless record.is_a?(Chronicle::ETL::Models::Base)
24
- raise Chronicle::ETL::InvalidTransformedRecordError, "Transformed data is not a type of Chronicle::ETL::Models"
32
+ raise Chronicle::ETL::RunnerTypeError, "Transformed data should be a type of Chronicle::ETL::Models"
25
33
  end
26
34
 
35
+ Chronicle::ETL::Logger.info(tty_log_transformation(transformer))
27
36
  @job_logger.log_transformation(transformer)
28
- loader.load(record)
29
- progress_bar.increment
37
+
38
+ loader.load(record) unless @job.dry_run?
39
+ rescue Chronicle::ETL::TransformationError => e
40
+ Chronicle::ETL::Logger.error(tty_log_transformation_failure(e))
41
+ ensure
42
+ @progress_bar.increment
30
43
  end
31
44
 
32
- progress_bar.finish
45
+ @progress_bar.finish
33
46
  loader.finish
34
47
  @job_logger.finish
48
+ rescue Interrupt
49
+ Chronicle::ETL::Logger.error("\n#{'Job interrupted'.red}")
50
+ @job_logger.error
51
+ rescue StandardError => e
52
+ raise e
53
+ ensure
35
54
  @job_logger.save
55
+ @progress_bar.finish
56
+ Chronicle::ETL::Logger.detach_from_progress_bar
57
+ Chronicle::ETL::Logger.info(tty_log_completion)
58
+ end
59
+
60
+ private
61
+
62
# Build the log line announcing that the job is starting. The quoted job
# name is appended in bold only when the job has a name.
def tty_log_job_start
  prefix = "Beginning job "
  return prefix unless @job.name

  prefix + "'#{@job.name}'".bold
end
67
+
68
# Build the log line for a successful transformation: a green check mark
# followed by the transformer's string representation.
def tty_log_transformation(transformer)
  check_mark = " ✓".green
  "#{check_mark} #{transformer}"
end
72
+
73
# Build the log line for a failed transformation: a red cross followed by
# the failed transformation and the exception's message.
def tty_log_transformation_failure(exception)
  cross_mark = " ✖".red
  "#{cross_mark} Failed to build #{exception.transformation}. #{exception.message}"
end
77
+
78
# Build the multi-line summary logged once the job is over: job name and
# duration (each only when present), status, number of records processed
# and, when available, the highest timestamp seen.
def tty_log_completion
  status = @job_logger.success ? 'Success' : 'Failed'

  segments = ["\nCompleted job "]
  segments << "'#{@job.name}'".bold if @job.name
  segments << " in #{ChronicDuration.output(@job_logger.duration)}" if @job_logger.duration
  segments << "\n Status:\t".light_black << status
  segments << "\n Completed:\t".light_black << @job_logger.job_log.num_records_processed.to_s

  latest = @job_logger.job_log.highest_timestamp
  segments << "\n Latest:\t".light_black << latest.iso8601.to_s if latest

  segments.join
end
37
88
  end
@@ -0,0 +1,25 @@
1
module Chronicle
  module ETL
    # Serializer that turns a record, and recursively its associated records,
    # into a nested hash with `:attributes` and `:relationships` keys.
    class JSONAPISerializer < Chronicle::ETL::Serializer
      # Build the hash for the wrapped record. Keys are merged in this order:
      # the record's identifier hash, `:attributes`, `:relationships`, then
      # the record's meta hash (so later hashes win on key collisions).
      def serializable_hash
        payload = @record.identifier_hash
        payload = payload.merge({ attributes: @record.attributes })
        payload = payload.merge({ relationships: build_associations })
        payload.merge(@record.meta_hash)
      end

      # Serialize every association of the record, wrapping each result in a
      # `{ data: ... }` hash keyed by the association name.
      def build_associations
        @record.associations.transform_values do |associated|
          { data: serialize_association(associated) }
        end
      end

      private

      # Serialize one association value, which is either a single record or
      # an Array of records.
      def serialize_association(associated)
        return JSONAPISerializer.new(associated).serializable_hash unless associated.is_a?(Array)

        associated.map { |record| JSONAPISerializer.new(record).serializable_hash }
      end
    end
  end
end
@@ -0,0 +1,27 @@
1
module Chronicle
  module ETL
    # Abstract class representing a Serializer for an ETL record
    class Serializer
      # Construct a new instance of this serializer.
      # == Parameters:
      # record::
      #   The record this serializer will serialize
      # options::
      #   Options for configuring this Serializer
      def initialize(record, options = {})
        @record = record
        @options = options
      end

      # Serialize a record as a hash.
      # Subclasses are expected to override this; the base implementation
      # always raises NotImplementedError.
      def serializable_hash
        raise NotImplementedError
      end

      # Convenience entry point: build a serializer for +record+ and return
      # its serialized hash. +options+ is forwarded to the constructor and
      # defaults to an empty hash, so existing one-argument callers behave
      # as before.
      def self.serialize(record, options = {})
        new(record, options).serializable_hash
      end
    end
  end
end
26
+
27
+ require_relative 'jsonapi_serializer'
@@ -0,0 +1,247 @@
1
require 'digest'
require 'shellwords'

require 'mini_exiftool'
require 'active_support'
require 'active_support/core_ext/object'
require 'active_support/core_ext/time'
require 'active_support/core_ext/hash/reverse_merge'
require 'active_support/core_ext/string/inflections'
7
+
8
module Chronicle
  module ETL
    # Transform a JPEG or other image file into a record.
    # By default, file mtime and a hash of the file content is used to build
    # the timestamp and ID respectively but other options are available (such
    # as reading EXIF tags or extended attributes from the filesystem).
    #
    # TODO: This should be extracted into its own plugin
    class ImageFileTransformer < Chronicle::ETL::Transformer
      register_connector do |r|
        r.identifier = 'image-file'
        r.description = 'an image file'
      end

      setting :timestamp_strategy, default: 'file_mtime'
      setting :id_strategy, default: 'file_hash'
      setting :verb, default: 'photographed'
      # EXIF tags often don't have timezones
      setting :timezone_default, default: 'Eastern Time (US & Canada)'
      setting :include_image_data, default: true
      setting :actor
      setting :involved
      # NOTE(review): `@config.provider` and `@config.ocr_strategy` are read
      # below but no matching `setting` is declared in this class -- confirm
      # they are provided elsewhere.

      # Open the file named by the extraction's data and build an Activity
      # record from it. The file handle is closed even when building the
      # record raises.
      def transform
        # FIXME: set @filename; use block for reading file when necessary
        @file = File.open(@extraction.data)
        build_created(@file)
      ensure
        # @file is kept (closed) because its path is still read afterwards.
        @file&.close
      end

      # Path of the image file; nil if #transform has not opened a file yet.
      def friendly_identifier
        @file&.path
      end

      # Id built with the configured id strategy (memoized).
      # Raises UntransformableRecordError when no strategy yields a value.
      def id
        @id ||= begin
          built_id = build_with_strategy(field: :id, strategy: @config.id_strategy)
          raise UntransformableRecordError.new("Could not build id", transformation: self) unless built_id

          built_id
        end
      end

      # Timestamp built with the configured timestamp strategy (memoized).
      # Raises UntransformableRecordError when no strategy yields a value.
      def timestamp
        @timestamp ||= begin
          ts = build_with_strategy(field: :timestamp, strategy: @config.timestamp_strategy)
          raise UntransformableRecordError.new("Could not build timestamp", transformation: self) unless ts

          ts
        end
      end

      private

      # Build the Activity record. The file argument is unused; helpers read
      # the file through @file.
      def build_created(_file)
        record = ::Chronicle::ETL::Models::Activity.new
        record.verb = @config.verb
        record.provider = @config.provider
        record.provider_id = id
        record.end_at = timestamp
        record.dedupe_on = [[:provider_id, :verb, :provider]]

        record.involved = build_image
        record.actor = build_actor

        record.assign_attributes(build_gps)
        record
      end

      # Build the actor entity from the `actor` setting.
      def build_actor
        actor = ::Chronicle::ETL::Models::Entity.new
        actor.represents = 'identity'
        actor.provider = @config.actor[:provider]
        actor.slug = @config.actor[:slug]
        actor.dedupe_on = [[:provider, :slug, :represents]]
        actor
      end

      # Build the entity for the image itself, including optional OCR text,
      # depicted people, keywords and (when `include_image_data` is set) the
      # image content as an attachment.
      def build_image
        image = ::Chronicle::ETL::Models::Entity.new
        image.represents = @config.involved[:represents]
        image.title = build_title
        image.body = exif['Description']
        image.provider = @config.involved[:provider]
        image.provider_id = id
        image.assign_attributes(build_gps)
        image.dedupe_on = [[:provider, :provider_id, :represents]]

        if @config.ocr_strategy
          ocr_text = build_with_strategy(field: :ocr, strategy: @config.ocr_strategy)
          image.metadata[:ocr_text] = ocr_text if ocr_text
        end

        names = extract_people_depicted
        tags = extract_keywords(names)

        image.depicts = build_people_depicted(names)
        image.abouts = build_keywords(tags)

        if @config.include_image_data
          attachment = ::Chronicle::ETL::Models::Attachment.new
          attachment.data = build_image_data
          image.attachments = [attachment]
        end

        image
      end

      # Build one 'topic' entity per keyword.
      def build_keywords(topics)
        topics.map do |topic|
          t = ::Chronicle::ETL::Models::Entity.new
          t.represents = 'topic'
          t.provider = @config.involved[:provider]
          t.title = topic
          t.slug = topic.parameterize
          t.dedupe_on = [[:provider, :represents, :slug]]
          t
        end
      end

      # Build one 'identity' entity per depicted person's name.
      def build_people_depicted(names)
        names.map do |name|
          identity = ::Chronicle::ETL::Models::Entity.new
          identity.represents = 'identity'
          identity.provider = @config.involved[:provider]
          identity.slug = name.parameterize
          identity.title = name
          identity.dedupe_on = [[:provider, :represents, :slug]]
          identity
        end
      end

      # GPS attributes read from EXIF; empty when no latitude is present.
      def build_gps
        return {} unless exif['GPSLatitude']

        {
          lat: exif['GPSLatitude'],
          lng: exif['GPSLongitude'],
          elevation: exif['GPSAltitude']
        }
      end

      # The image file's content as produced by the BinaryAttachments util.
      def build_image_data
        ::Chronicle::ETL::Utils::BinaryAttachments.filename_to_base64(filename: @file.path)
      end

      # The image's title: the file's basename.
      def build_title
        File.basename(@file.path)
      end

      # Try each strategy in order by calling the private method
      # `build_<field>_using_<strategy>` and return the first truthy result,
      # or nil when none produced one. `strategy` may be a single name or an
      # array of names.
      def build_with_strategy(field:, strategy: [])
        [strategy].flatten.compact.each do |name|
          result = send(:"build_#{field}_using_#{name}")
          return result if result
        end
        nil
      end

      # SHA256 hex digest of the file's bytes.
      def build_id_using_file_hash
        Digest::SHA256.hexdigest(File.binread(@file.path))
      end

      def build_id_using_xattr_version
        load_value_from_xattr_plist("com.apple.metadata:kMDItemVersion")
      end

      def build_id_using_xmp_document_id
        exif['OriginalDocumentID'] || exif['DerivedFromDocumentID']
      end

      def build_timestamp_using_file_mtime
        File.mtime(@file.path)
      end

      def build_timestamp_using_exif_datetimeoriginal
        # EXIF tags don't have timezone information. This is a DateTime in UTC
        timestamp = exif['DateTimeOriginal']
        return unless timestamp

        # Offset tags are only available in newer EXIF tags. If it exists, we
        # use it instead of UTC
        offset = exif['OffsetTimeOriginal']
        return timestamp.change(offset: offset) if offset

        # TODO: support option of using GPS coordinates to determine timezone
        # Otherwise, interpret the wall-clock time in the default timezone.
        ActiveSupport::TimeZone.new(@config.timezone_default).parse(timestamp.asctime)
      end

      # TODO: add documentation for how to set up `macocr`
      def build_ocr_using_macocr
        ::Chronicle::ETL::Utils::TextRecognition.recognize_in_image(filename: @file.path)
      end

      def exif
        @exif ||= MiniExiftool.new(
          @file.path,
          numerical: true,

          # EXIF timestamps don't have timezone information. MiniExifTool uses Time
          # by default which parses timestamps in local time zone. Using DateTime
          # parses dates as UTC and then we can apply a timezone offset if the optional
          # EXIF timezone offset fields are available.
          # https://github.com/janfri/mini_exiftool/issues/39#issuecomment-832587649
          timestamps: DateTime
        )
      end

      # Figure out which faces are tagged as regions and return a list of their names
      def extract_people_depicted
        return [] unless exif['RegionName']

        names = [exif['RegionName']].flatten
        types = [exif['RegionType']].flatten

        names.zip(types).select { |_name, type| type == 'Face' }.map(&:first).uniq
      end

      # Extract image keywords from EXIF/IPTC tag and subtract out those which are
      # tagged people (determined by looking at face regions)
      def extract_keywords(people_names = [])
        [exif['Keywords'] || []].flatten - people_names
      end

      # Read an extended attribute of the file with `xattr`, convert it to
      # XML with `xxd` and `plutil`, and return the text of its <string>
      # nodes, or nil when nothing could be read.
      def load_value_from_xattr_plist(attribute)
        require 'nokogiri'
        # Escape both arguments so unusual file names cannot alter the command.
        command = "xattr -p #{Shellwords.escape(attribute)} #{Shellwords.escape(@file.path)} " \
                  '| xxd -r -p | plutil -convert xml1 -o - -- - 2>/dev/null'
        xml = `#{command}`
        return if xml.blank?

        Nokogiri::XML.parse(xml).xpath("//string").text.presence
      end
    end
  end
end
@@ -1,9 +1,18 @@
1
1
  module Chronicle
2
2
  module ETL
3
3
  class NullTransformer < Chronicle::ETL::Transformer
4
+ register_connector do |r|
5
+ r.identifier = 'null'
6
+ r.description = 'in no way'
7
+ end
8
+
4
9
  def transform
5
- Chronicle::ETL::Models::Generic.new(@data)
10
+ Chronicle::ETL::Models::Generic.new(@extraction.data)
6
11
  end
12
+
13
+ def timestamp; end
14
+
15
+ def id; end
7
16
  end
8
17
  end
9
18
  end
@@ -2,16 +2,16 @@ module Chronicle
2
2
  module ETL
3
3
  # Abstract class representing a Transformer for an ETL job
4
4
  class Transformer
5
- extend Chronicle::ETL::Catalog
5
+ extend Chronicle::ETL::Registry::SelfRegistering
6
+ include Chronicle::ETL::Configurable
6
7
 
7
8
  # Construct a new instance of this transformer. Options are passed in from a Runner
8
- # == Paramters:
9
+ # == Parameters:
9
10
  # options::
10
11
  # Options for configuring this Transformer
11
- def initialize(options = {}, data)
12
- @options = options
13
- @data = data
14
- @record = Chronicle::ETL::Models::Activity.new
12
+ def initialize(extraction, options = {})
13
+ @extraction = extraction
14
+ apply_options(options)
15
15
  end
16
16
 
17
17
  # @abstract Subclass is expected to implement #transform
@@ -19,16 +19,47 @@ module Chronicle
19
19
  # The main entrypoint for transforming a record. Called by a Runner on each extracted record
20
20
 
21
21
  # The domain or provider-specific id of the record this transformer is working on.
22
- # Used for building a cursor so an extractor doesn't have to start from the beginning of a
23
- # data source from the beginning.
24
- def id; end
22
+ # It is useful for:
23
+ # - de-duping records that might exist in the loader's destination
24
+ # - building a cursor so an extractor doesn't have to start from the beginning of a
25
+ # source
26
+ def id
27
+ raise NotImplementedError
28
+ end
25
29
 
26
30
  # The domain or provider-specific timestamp of the record this transformer is working on.
27
31
  # Used for building a cursor so an extractor doesn't have to start from the beginning of a
28
32
  # data source.
29
- def timestamp; end
33
+ def timestamp
34
+ raise NotImplementedError
35
+ end
36
+
37
+ # An optional, human-readable identifier for a transformation, intended for debugging or logging.
38
+ # By default, it is just the id.
39
+ def friendly_identifier
40
+ id
41
+ end
42
+
43
+ def to_s
44
+ ts = begin
45
+ unknown = "???"
46
+ timestamp&.iso8601 || unknown
47
+ rescue TransformationError, NotImplementedError
48
+ unknown
49
+ end
50
+
51
+ identifier = begin
52
+ unknown = self.class.to_s
53
+ friendly_identifier || self.class.to_s
54
+ rescue TransformationError, NotImplementedError
55
+ unknown
56
+ end
57
+
58
+ "[#{ts}] #{identifier}"
59
+ end
30
60
  end
31
61
  end
32
62
  end
33
63
 
34
64
  require_relative 'null_transformer'
65
+ require_relative 'image_file_transformer'
@@ -0,0 +1,21 @@
1
+ require 'marcel'
2
+ require 'base64'
3
+
4
module Chronicle
  module ETL
    module Utils
      # Utility methods for dealing with binary files
      module BinaryAttachments
        # Read a file and return its content as a base64 data URI
        # ("data:<mimetype>;base64,<payload>").
        # == Parameters:
        # filename::
        #   Path of the file to encode
        # mimetype::
        #   MIME type to put in the URI; guessed from the file when nil
        def self.filename_to_base64(filename:, mimetype: nil)
          mimetype ||= guess_mimetype(filename: filename)

          # binread: the content is binary, so read raw bytes with no
          # newline or encoding conversion.
          encoded = Base64.strict_encode64(File.binread(filename))
          "data:#{mimetype};base64,#{encoded}"
        end

        # Guess the MIME type of the file at +filename+ with Marcel.
        def self.guess_mimetype(filename:)
          # NOTE(review): Marcel is understood to sniff IO/Pathname arguments
          # and to treat a String argument as content rather than as a path --
          # confirm against the Marcel README. The file is therefore opened
          # and its name passed as an additional hint.
          File.open(filename, 'rb') do |io|
            Marcel::MimeType.for(io, name: File.basename(filename))
          end
        end
      end
    end
  end
end
@@ -64,7 +64,9 @@ module Chronicle
64
64
  end
65
65
 
66
66
  def log(message)
67
- @pbar.log message
67
+ message.split("\n").each do |line|
68
+ @pbar.log message
69
+ end
68
70
  end
69
71
 
70
72
  def finish
@@ -0,0 +1,15 @@
1
+ require 'active_support/core_ext/object/blank'
2
+
3
module Chronicle
  module ETL
    module Utils
      # OCR for image files
      # TODO: add other strategies and document `macocr`
      module TextRecognition
        # Run the `macocr` command on an image file and return what it
        # printed, or nil when the output is blank or `macocr` is not
        # installed.
        # == Parameters:
        # filename::
        #   Path of the image file to run OCR on
        def self.recognize_in_image(filename:)
          # Argument-array form: no shell is involved, so a file name
          # containing quotes or `$(...)` cannot inject commands.
          IO.popen(['macocr', filename.to_s], err: File::NULL, &:read).presence
        rescue Errno::ENOENT
          # `macocr` executable not found
          nil
        end
      end
    end
  end
end
@@ -1,5 +1,5 @@
1
1
  module Chronicle
2
2
  module ETL
3
- VERSION = "0.2.4"
3
+ VERSION = "0.4.0"
4
4
  end
5
5
  end
data/lib/chronicle/etl.rb CHANGED
@@ -1,19 +1,25 @@
1
- require_relative 'etl/catalog'
1
+ require_relative 'etl/registry/registry'
2
2
  require_relative 'etl/config'
3
+ require_relative 'etl/configurable'
3
4
  require_relative 'etl/exceptions'
5
+ require_relative 'etl/extraction'
4
6
  require_relative 'etl/extractors/extractor'
5
7
  require_relative 'etl/job_definition'
6
8
  require_relative 'etl/job_log'
7
9
  require_relative 'etl/job_logger'
8
10
  require_relative 'etl/job'
9
11
  require_relative 'etl/loaders/loader'
12
+ require_relative 'etl/logger'
10
13
  require_relative 'etl/models/activity'
14
+ require_relative 'etl/models/attachment'
11
15
  require_relative 'etl/models/base'
12
16
  require_relative 'etl/models/entity'
13
17
  require_relative 'etl/models/generic'
14
18
  require_relative 'etl/runner'
19
+ require_relative 'etl/serializers/serializer'
15
20
  require_relative 'etl/transformers/transformer'
21
+ require_relative 'etl/utils/binary_attachments'
16
22
  require_relative 'etl/utils/hash_utilities'
17
- require_relative 'etl/utils/jsonapi'
23
+ require_relative 'etl/utils/text_recognition'
18
24
  require_relative 'etl/utils/progress_bar'
19
25
  require_relative 'etl/version'