chronicle-etl 0.2.4 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +3 -0
  3. data/.rubocop.yml +3 -0
  4. data/README.md +20 -13
  5. data/chronicle-etl.gemspec +11 -8
  6. data/lib/chronicle/etl/cli/connectors.rb +19 -7
  7. data/lib/chronicle/etl/cli/jobs.rb +24 -18
  8. data/lib/chronicle/etl/cli/main.rb +10 -2
  9. data/lib/chronicle/etl/config.rb +1 -1
  10. data/lib/chronicle/etl/exceptions.rb +12 -1
  11. data/lib/chronicle/etl/extraction.rb +12 -0
  12. data/lib/chronicle/etl/extractors/csv_extractor.rb +43 -36
  13. data/lib/chronicle/etl/extractors/extractor.rb +9 -1
  14. data/lib/chronicle/etl/extractors/file_extractor.rb +15 -33
  15. data/lib/chronicle/etl/extractors/helpers/filesystem_reader.rb +104 -0
  16. data/lib/chronicle/etl/extractors/json_extractor.rb +45 -0
  17. data/lib/chronicle/etl/extractors/stdin_extractor.rb +6 -1
  18. data/lib/chronicle/etl/job.rb +30 -29
  19. data/lib/chronicle/etl/job_definition.rb +45 -7
  20. data/lib/chronicle/etl/job_log.rb +10 -0
  21. data/lib/chronicle/etl/job_logger.rb +23 -20
  22. data/lib/chronicle/etl/loaders/csv_loader.rb +4 -0
  23. data/lib/chronicle/etl/loaders/loader.rb +1 -1
  24. data/lib/chronicle/etl/loaders/rest_loader.rb +5 -1
  25. data/lib/chronicle/etl/loaders/stdout_loader.rb +6 -1
  26. data/lib/chronicle/etl/loaders/table_loader.rb +57 -7
  27. data/lib/chronicle/etl/logger.rb +48 -0
  28. data/lib/chronicle/etl/models/attachment.rb +14 -0
  29. data/lib/chronicle/etl/models/base.rb +23 -7
  30. data/lib/chronicle/etl/models/entity.rb +9 -3
  31. data/lib/chronicle/etl/registry/connector_registration.rb +61 -0
  32. data/lib/chronicle/etl/registry/registry.rb +52 -0
  33. data/lib/chronicle/etl/registry/self_registering.rb +25 -0
  34. data/lib/chronicle/etl/runner.rb +57 -7
  35. data/lib/chronicle/etl/serializers/jsonapi_serializer.rb +25 -0
  36. data/lib/chronicle/etl/serializers/serializer.rb +27 -0
  37. data/lib/chronicle/etl/transformers/image_file_transformer.rb +253 -0
  38. data/lib/chronicle/etl/transformers/null_transformer.rb +10 -1
  39. data/lib/chronicle/etl/transformers/transformer.rb +39 -9
  40. data/lib/chronicle/etl/utils/binary_attachments.rb +21 -0
  41. data/lib/chronicle/etl/utils/progress_bar.rb +3 -1
  42. data/lib/chronicle/etl/utils/text_recognition.rb +15 -0
  43. data/lib/chronicle/etl/version.rb +1 -1
  44. data/lib/chronicle/etl.rb +7 -2
  45. metadata +96 -44
  46. data/Gemfile.lock +0 -91
  47. data/lib/chronicle/etl/catalog.rb +0 -108
  48. data/lib/chronicle/etl/utils/jsonapi.rb +0 -28
@@ -0,0 +1,52 @@
1
+ require 'rubygems'
2
+
3
+ module Chronicle
4
+ module ETL
5
+ # A singleton module that acts as a registry of connector classes available for ETL jobs
6
+ module Registry
7
+ PHASES = [:extractor, :transformer, :loader]
8
+
9
+ class << self
10
+ attr_accessor :connectors
11
+
12
+ def load_all!
13
+ load_connectors_from_gems
14
+ end
15
+
16
+ def load_connectors_from_gems
17
+ Gem::Specification.filter{|s| s.name.match(/^chronicle/) }.each do |gem|
18
+ require_str = gem.name.gsub('chronicle-', 'chronicle/')
19
+ require require_str rescue LoadError
20
+ end
21
+ end
22
+
23
+ def install_connector name
24
+ gem_name = "chronicle-#{name}"
25
+ Gem.install(gem_name)
26
+ end
27
+
28
+ def register connector
29
+ @connectors ||= []
30
+ @connectors << connector
31
+ end
32
+
33
+ def find_by_phase_and_identifier(phase, identifier)
34
+ connector = find_within_loaded_connectors(phase, identifier)
35
+ unless connector
36
+ # Only load external connectors (slow) if not found in built-in connectors
37
+ load_all!
38
+ connector = find_within_loaded_connectors(phase, identifier)
39
+ end
40
+ connector || raise(ConnectorNotAvailableError.new("Connector '#{identifier}' not found"))
41
+ end
42
+
43
+ def find_within_loaded_connectors(phase, identifier)
44
+ @connectors.find { |c| c.phase == phase && c.identifier == identifier }
45
+ end
46
+ end
47
+ end
48
+ end
49
+ end
50
+
51
+ require_relative 'self_registering'
52
+ require_relative 'connector_registration'
@@ -0,0 +1,25 @@
1
+ require 'forwardable'
2
+
3
+ module Chronicle
4
+ module ETL
5
+ module Registry
6
+ # Gives a connector class the ability to let the Chronicle::ETL::Registry
7
+ # know about itself
8
+ module SelfRegistering
9
+ extend Forwardable
10
+
11
+ attr_accessor :connector_registration
12
+
13
+ def_delegators :@connector_registration, :description, :provider, :identifier
14
+
15
+ # Creates a ConnectorRegistration for this connector's details and registers it
16
+ # into the Registry
17
+ def register_connector
18
+ @connector_registration ||= ::Chronicle::ETL::Registry::ConnectorRegistration.new(self)
19
+ yield @connector_registration if block_given?
20
+ ::Chronicle::ETL::Registry.register(@connector_registration)
21
+ end
22
+ end
23
+ end
24
+ end
25
+ end
@@ -1,4 +1,5 @@
1
1
  require 'colorize'
2
+ require 'chronic_duration'
2
3
 
3
4
  class Chronicle::ETL::Runner
4
5
  def initialize(job)
@@ -14,24 +15,73 @@ class Chronicle::ETL::Runner
14
15
  loader.start
15
16
 
16
17
  total = extractor.results_count
17
- progress_bar = Chronicle::ETL::Utils::ProgressBar.new(title: 'Running job', total: total)
18
+ @progress_bar = Chronicle::ETL::Utils::ProgressBar.new(title: 'Running job', total: total)
19
+ Chronicle::ETL::Logger.attach_to_progress_bar(@progress_bar)
18
20
 
19
- extractor.extract do |data, metadata|
20
- transformer = @job.instantiate_transformer(data)
21
+ Chronicle::ETL::Logger.info(tty_log_job_start)
22
+ extractor.extract do |extraction|
23
+ unless extraction.is_a?(Chronicle::ETL::Extraction)
24
+ raise Chronicle::ETL::RunnerTypeError, "Extracted should be a Chronicle::ETL::Extraction"
25
+ end
26
+
27
+ transformer = @job.instantiate_transformer(extraction)
21
28
  record = transformer.transform
22
29
 
23
30
  unless record.is_a?(Chronicle::ETL::Models::Base)
24
- raise Chronicle::ETL::InvalidTransformedRecordError, "Transformed data is not a type of Chronicle::ETL::Models"
31
+ raise Chronicle::ETL::RunnerTypeError, "Transformed data should be a type of Chronicle::ETL::Models"
25
32
  end
26
33
 
34
+ Chronicle::ETL::Logger.info(tty_log_transformation(transformer))
27
35
  @job_logger.log_transformation(transformer)
28
- loader.load(record)
29
- progress_bar.increment
36
+
37
+ loader.load(record) unless @job.dry_run?
38
+ rescue Chronicle::ETL::TransformationError => e
39
+ Chronicle::ETL::Logger.error(tty_log_transformation_failure(e))
40
+ ensure
41
+ @progress_bar.increment
30
42
  end
31
43
 
32
- progress_bar.finish
44
+ @progress_bar.finish
33
45
  loader.finish
34
46
  @job_logger.finish
47
+ rescue Interrupt
48
+ Chronicle::ETL::Logger.error("\n#{'Job interrupted'.red}")
49
+ @job_logger.error
50
+ rescue StandardError => e
51
+ raise e
52
+ ensure
35
53
  @job_logger.save
54
+ @progress_bar.finish
55
+ Chronicle::ETL::Logger.detach_from_progress_bar
56
+ Chronicle::ETL::Logger.info(tty_log_completion)
57
+ end
58
+
59
+ private
60
+
61
+ def tty_log_job_start
62
+ output = "Beginning job "
63
+ output += "'#{@job.name}'".bold if @job.name
64
+ output
65
+ end
66
+
67
+ def tty_log_transformation transformer
68
+ output = " ✓".green
69
+ output += " #{transformer}"
70
+ end
71
+
72
+ def tty_log_transformation_failure exception
73
+ output = " ✖".red
74
+ output += " Failed to build #{exception.transformation}. #{exception.message}"
75
+ end
76
+
77
+ def tty_log_completion
78
+ status = @job_logger.success ? 'Success' : 'Failed'
79
+ output = "\nCompleted job "
80
+ output += "'#{@job.name}'".bold if @job.name
81
+ output += " in #{ChronicDuration.output(@job_logger.duration)}" if @job_logger.duration
82
+ output += "\n Status:\t".light_black + status
83
+ output += "\n Completed:\t".light_black + "#{@job_logger.job_log.num_records_processed}"
84
+ output += "\n Latest:\t".light_black + "#{@job_logger.job_log.highest_timestamp.iso8601}" if @job_logger.job_log.highest_timestamp
85
+ output
36
86
  end
37
87
  end
@@ -0,0 +1,25 @@
1
+ module Chronicle
2
+ module ETL
3
+ class JSONAPISerializer < Chronicle::ETL::Serializer
4
+ def serializable_hash
5
+ @record
6
+ .identifier_hash
7
+ .merge({ attributes: @record.attributes })
8
+ .merge({ relationships: build_associations })
9
+ .merge(@record.meta_hash)
10
+ end
11
+
12
+ def build_associations
13
+ @record.associations.transform_values do |value|
14
+ association_data =
15
+ if value.is_a?(Array)
16
+ value.map { |record| JSONAPISerializer.new(record).serializable_hash }
17
+ else
18
+ JSONAPISerializer.new(value).serializable_hash
19
+ end
20
+ { data: association_data }
21
+ end
22
+ end
23
+ end
24
+ end
25
+ end
@@ -0,0 +1,27 @@
1
+ module Chronicle
2
+ module ETL
3
+ # Abstract class representing a Serializer for an ETL record
4
+ class Serializer
5
+ # Construct a new instance of this serializer.
6
+ # == Parameters:
7
+ # options::
8
+ # Options for configuring this Serializer
9
+ def initialize(record, options = {})
10
+ @record = record
11
+ @options = options
12
+ end
13
+
14
+ # Serialize a record as a hash
15
+ def serializable_hash
16
+ raise NotImplementedError
17
+ end
18
+
19
+ def self.serialize(record)
20
+ serializer = self.new(record)
21
+ serializer.serializable_hash
22
+ end
23
+ end
24
+ end
25
+ end
26
+
27
+ require_relative 'jsonapi_serializer'
@@ -0,0 +1,253 @@
1
+ require 'mini_exiftool'
2
+ require 'active_support'
3
+ require 'active_support/core_ext/object'
4
+ require 'active_support/core_ext/time'
5
+ require 'active_support/core_ext/hash/reverse_merge'
6
+ require 'active_support/core_ext/string/inflections'
7
+
8
+ module Chronicle
9
+ module ETL
10
+ # Transform a JPEG or other image file into a record.
11
+ # By default, file mtime and a hash of the file content are used to build
12
+ # the timestamp and ID respectively but other options are available (such
13
+ # as reading EXIF tags or extended attributes from the filesystem).
14
+ #
15
+ # TODO: This should be extracted into its own plugin
16
+ class ImageFileTransformer < Chronicle::ETL::Transformer
17
+ register_connector do |r|
18
+ r.identifier = 'image-file'
19
+ r.description = 'an image file'
20
+ end
21
+
22
+ DEFAULT_OPTIONS = {
23
+ timestamp_strategy: 'file_mtime',
24
+ id_strategy: 'file_hash',
25
+ verb: 'photographed',
26
+
27
+ # EXIF tags often don't have timezones
28
+ timezone_default: 'Eastern Time (US & Canada)',
29
+ include_image_data: true
30
+ }.freeze
31
+
32
+ def initialize(*args)
33
+ super(*args)
34
+ @options = @options.reverse_merge(DEFAULT_OPTIONS)
35
+ end
36
+
37
+ def transform
38
+ # FIXME: set @filename; use block for reading file when necessary
39
+ @file = File.open(@extraction.data)
40
+ record = build_created(@file)
41
+ @file.close
42
+ record
43
+ end
44
+
45
+ def friendly_identifier
46
+ @file.path
47
+ end
48
+
49
+ def id
50
+ @id ||= begin
51
+ id = build_with_strategy(field: :id, strategy: @options[:id_strategy])
52
+ raise UntransformableRecordError.new("Could not build id", transformation: self) unless id
53
+
54
+ id
55
+ end
56
+ end
57
+
58
+ def timestamp
59
+ @timestamp ||= begin
60
+ ts = build_with_strategy(field: :timestamp, strategy: @options[:timestamp_strategy])
61
+ raise UntransformableRecordError.new("Could not build timestamp", transformation: self) unless ts
62
+
63
+ ts
64
+ end
65
+ end
66
+
67
+ private
68
+
69
+ def build_created(file)
70
+ record = ::Chronicle::ETL::Models::Activity.new
71
+ record.verb = @options[:verb]
72
+ record.provider = @options[:provider]
73
+ record.provider_id = id
74
+ record.end_at = timestamp
75
+ record.dedupe_on = [[:provider_id, :verb, :provider]]
76
+
77
+ record.involved = build_image
78
+ record.actor = build_actor
79
+
80
+ record.assign_attributes(build_gps)
81
+ record
82
+ end
83
+
84
+ def build_actor
85
+ actor = ::Chronicle::ETL::Models::Entity.new
86
+ actor.represents = 'identity'
87
+ actor.provider = @options[:actor][:provider]
88
+ actor.slug = @options[:actor][:slug]
89
+ actor.dedupe_on = [[:provider, :slug, :represents]]
90
+ actor
91
+ end
92
+
93
+ def build_image
94
+ image = ::Chronicle::ETL::Models::Entity.new
95
+ image.represents = @options[:involved][:represents]
96
+ image.title = build_title
97
+ image.body = exif['Description']
98
+ image.provider = @options[:involved][:provider]
99
+ image.provider_id = id
100
+ image.assign_attributes(build_gps)
101
+ image.dedupe_on = [[:provider, :provider_id, :represents]]
102
+
103
+ if @options[:ocr_strategy]
104
+ ocr_text = build_with_strategy(field: :ocr, strategy: @options[:ocr_strategy])
105
+ image.metadata[:ocr_text] = ocr_text if ocr_text
106
+ end
107
+
108
+ names = extract_people_depicted
109
+ tags = extract_keywords(names)
110
+
111
+ image.depicts = build_people_depicted(names)
112
+ image.abouts = build_keywords(tags)
113
+
114
+ if @options[:include_image_data]
115
+ attachment = ::Chronicle::ETL::Models::Attachment.new
116
+ attachment.data = build_image_data
117
+ image.attachments = [attachment]
118
+ end
119
+
120
+ image
121
+ end
122
+
123
+ def build_keywords(topics)
124
+ topics.map do |topic|
125
+ t = ::Chronicle::ETL::Models::Entity.new
126
+ t.represents = 'topic'
127
+ t.provider = @options[:involved][:provider]
128
+ t.title = topic
129
+ t.slug = topic.parameterize
130
+ t.dedupe_on = [[:provider, :represents, :slug]]
131
+ t
132
+ end
133
+ end
134
+
135
+ def build_people_depicted(names)
136
+ names.map do |name|
137
+ identity = ::Chronicle::ETL::Models::Entity.new
138
+ identity.represents = 'identity'
139
+ identity.provider = @options[:involved][:provider]
140
+ identity.slug = name.parameterize
141
+ identity.title = name
142
+ identity.dedupe_on = [[:provider, :represents, :slug]]
143
+ identity
144
+ end
145
+ end
146
+
147
+ def build_gps
148
+ return {} unless exif['GPSLatitude']
149
+
150
+ {
151
+ lat: exif['GPSLatitude'],
152
+ lng: exif['GPSLongitude'],
153
+ elevation: exif['GPSAltitude']
154
+ }
155
+ end
156
+
157
+ def build_image_data
158
+ ::Chronicle::ETL::Utils::BinaryAttachments.filename_to_base64(filename: @file.path)
159
+ end
160
+
161
+ def build_title
162
+ File.basename(@file)
163
+ end
164
+
165
+ def build_with_strategy(field:, strategy:[])
166
+ strategies = [strategy].flatten.compact
167
+ strategies.each do |s|
168
+ builder_method = "build_#{field}_using_#{s}"
169
+ result = send(builder_method.to_sym)
170
+ return result if result
171
+ end
172
+ return
173
+ end
174
+
175
+ def build_id_using_file_hash
176
+ Digest::SHA256.hexdigest(File.read(@file))
177
+ end
178
+
179
+ def build_id_using_xattr_version
180
+ load_value_from_xattr_plist("com.apple.metadata:kMDItemVersion")
181
+ end
182
+
183
+ def build_id_using_xmp_document_id
184
+ exif['OriginalDocumentID'] || exif['DerivedFromDocumentID']
185
+ end
186
+
187
+ def build_timestamp_using_file_mtime
188
+ File.mtime(@file)
189
+ end
190
+
191
+ def build_timestamp_using_exif_datetimeoriginal
192
+ # EXIF tags don't have timezone information. This is a DateTime in UTC
193
+ timestamp = exif['DateTimeOriginal'] || return
194
+
195
+ if exif['OffsetTimeOriginal']
196
+ # Offset tags are only available in newer EXIF tags. If it exists, we
197
+ # use it instead of UTC
198
+ timestamp = timestamp.change(offset: exif['OffsetTimeOriginal'])
199
+ elsif false
200
+ # TODO: support option of using GPS coordinates to determine timezone
201
+ else
202
+ zone = ActiveSupport::TimeZone.new(@options[:timezone_default])
203
+ timestamp = zone.parse(timestamp.asctime)
204
+ end
205
+
206
+ timestamp
207
+ end
208
+
209
+ # TODO: add documentation for how to set up `macocr`
210
+ def build_ocr_using_macocr
211
+ `macocr "#{@file.path}" 2>/dev/null`.presence
212
+ end
213
+
214
+ def exif
215
+ @exif ||= MiniExiftool.new(
216
+ @file.path,
217
+ numerical: true,
218
+
219
+ # EXIF timestamps don't have timezone information. MiniExifTool uses Time
220
+ # by default which parses timestamps in local time zone. Using DateTime
221
+ # parses dates as UTC and then we can apply a timezone offset if the optional
222
+ # EXIF timezone offset fields are available.
223
+ # https://github.com/janfri/mini_exiftool/issues/39#issuecomment-832587649
224
+ timestamps: DateTime
225
+ )
226
+ end
227
+
228
+ # Figure out which faces are tagged as regions and return a list of their names
229
+ def extract_people_depicted
230
+ return [] unless exif['RegionName']
231
+
232
+ names = [exif['RegionName']].flatten
233
+ types = [exif['RegionType']].flatten
234
+
235
+ names.zip(types).select{|x| x[1] == 'Face'}.map{|x| x[0]}.uniq
236
+ end
237
+
238
+ # Extract image keywords from EXIF/IPTC tags and subtract out those which are
239
+ # tagged people (determined by looking at face regions)
240
+ def extract_keywords(people_names = [])
241
+ [exif['Keywords'] || []].flatten - people_names
242
+ end
243
+
244
+ def load_value_from_xattr_plist attribute
245
+ require 'nokogiri'
246
+ xml = `xattr -p #{attribute} \"#{@file.path}\" | xxd -r -p | plutil -convert xml1 -o - -- - 2>/dev/null`
247
+ return unless xml
248
+ value = Nokogiri::XML.parse(r).xpath("//string").text
249
+ return value.presence
250
+ end
251
+ end
252
+ end
253
+ end
@@ -1,9 +1,18 @@
1
1
  module Chronicle
2
2
  module ETL
3
3
  class NullTransformer < Chronicle::ETL::Transformer
4
+ register_connector do |r|
5
+ r.identifier = 'null'
6
+ r.description = 'in no way'
7
+ end
8
+
4
9
  def transform
5
- Chronicle::ETL::Models::Generic.new(@data)
10
+ Chronicle::ETL::Models::Generic.new(@extraction.data)
6
11
  end
12
+
13
+ def timestamp; end
14
+
15
+ def id; end
7
16
  end
8
17
  end
9
18
  end
@@ -2,16 +2,15 @@ module Chronicle
2
2
  module ETL
3
3
  # Abstract class representing a Transformer for an ETL job
4
4
  class Transformer
5
- extend Chronicle::ETL::Catalog
5
+ extend Chronicle::ETL::Registry::SelfRegistering
6
6
 
7
7
  # Construct a new instance of this transformer. Options are passed in from a Runner
8
- # == Paramters:
8
+ # == Parameters:
9
9
  # options::
10
10
  # Options for configuring this Transformer
11
- def initialize(options = {}, data)
11
+ def initialize(options = {}, extraction)
12
12
  @options = options
13
- @data = data
14
- @record = Chronicle::ETL::Models::Activity.new
13
+ @extraction = extraction
15
14
  end
16
15
 
17
16
  # @abstract Subclass is expected to implement #transform
@@ -19,16 +18,47 @@ module Chronicle
19
18
  # The main entrypoint for transforming a record. Called by a Runner on each extracted record
20
19
 
21
20
  # The domain or provider-specific id of the record this transformer is working on.
22
- # Used for building a cursor so an extractor doesn't have to start from the beginning of a
23
- # data source from the beginning.
24
- def id; end
21
+ # It is useful for:
22
+ # - de-duping records that might exist in the loader's destination
23
+ # - building a cursor so an extractor doesn't have to start from the beginning of a
24
+ # source
25
+ def id
26
+ raise NotImplementedError
27
+ end
25
28
 
26
29
  # The domain or provider-specific timestamp of the record this transformer is working on.
27
30
  # Used for building a cursor so an extractor doesn't have to start from the beginning of a
28
31
  # data source.
29
- def timestamp; end
32
+ def timestamp
33
+ raise NotImplementedError
34
+ end
35
+
36
+ # An optional, human-readable identifier for a transformation, intended for debugging or logging.
37
+ # By default, it is just the id.
38
+ def friendly_identifier
39
+ id
40
+ end
41
+
42
+ def to_s
43
+ ts = begin
44
+ unknown = "???"
45
+ timestamp&.iso8601 || unknown
46
+ rescue TransformationError, NotImplementedError
47
+ unknown
48
+ end
49
+
50
+ identifier = begin
51
+ unknown = self.class.to_s
52
+ friendly_identifier || self.class.to_s
53
+ rescue TransformationError, NotImplementedError
54
+ unknown
55
+ end
56
+
57
+ "[#{ts}] #{identifier}"
58
+ end
30
59
  end
31
60
  end
32
61
  end
33
62
 
34
63
  require_relative 'null_transformer'
64
+ require_relative 'image_file_transformer'
@@ -0,0 +1,21 @@
1
+ require 'marcel'
2
+ require 'base64'
3
+
4
+ module Chronicle
5
+ module ETL
6
+ module Utils
7
+ # Utility methods for dealing with binary files
8
+ module BinaryAttachments
9
+ def self.filename_to_base64(filename:, mimetype: nil)
10
+ mimetype = mimetype || guess_mimetype(filename: filename)
11
+
12
+ "data:#{mimetype};base64,#{Base64.strict_encode64(File.read(filename))}"
13
+ end
14
+
15
+ def self.guess_mimetype(filename:)
16
+ Marcel::MimeType.for(filename)
17
+ end
18
+ end
19
+ end
20
+ end
21
+ end
@@ -64,7 +64,9 @@ module Chronicle
64
64
  end
65
65
 
66
66
  def log(message)
67
- @pbar.log message
67
+ message.split("\n").each do |line|
68
+ @pbar.log message
69
+ end
68
70
  end
69
71
 
70
72
  def finish
@@ -0,0 +1,15 @@
1
+ require 'active_support/core_ext/object/blank'
2
+
3
+ module Chronicle
4
+ module ETL
5
+ module Utils
6
+ # OCR for image files
7
+ # TODO: add other strategies and document `macocr`
8
+ module TextRecognition
9
+ def self.recognize_in_image(filename:)
10
+ `macocr "#{filename}" 2>/dev/null`.presence
11
+ end
12
+ end
13
+ end
14
+ end
15
+ end
@@ -1,5 +1,5 @@
1
1
  module Chronicle
2
2
  module ETL
3
- VERSION = "0.2.4"
3
+ VERSION = "0.3.0"
4
4
  end
5
5
  end
data/lib/chronicle/etl.rb CHANGED
@@ -1,19 +1,24 @@
1
- require_relative 'etl/catalog'
1
+ require_relative 'etl/registry/registry'
2
2
  require_relative 'etl/config'
3
3
  require_relative 'etl/exceptions'
4
+ require_relative 'etl/extraction'
4
5
  require_relative 'etl/extractors/extractor'
5
6
  require_relative 'etl/job_definition'
6
7
  require_relative 'etl/job_log'
7
8
  require_relative 'etl/job_logger'
8
9
  require_relative 'etl/job'
9
10
  require_relative 'etl/loaders/loader'
11
+ require_relative 'etl/logger'
10
12
  require_relative 'etl/models/activity'
13
+ require_relative 'etl/models/attachment'
11
14
  require_relative 'etl/models/base'
12
15
  require_relative 'etl/models/entity'
13
16
  require_relative 'etl/models/generic'
14
17
  require_relative 'etl/runner'
18
+ require_relative 'etl/serializers/serializer'
15
19
  require_relative 'etl/transformers/transformer'
20
+ require_relative 'etl/utils/binary_attachments'
16
21
  require_relative 'etl/utils/hash_utilities'
17
- require_relative 'etl/utils/jsonapi'
22
+ require_relative 'etl/utils/text_recognition'
18
23
  require_relative 'etl/utils/progress_bar'
19
24
  require_relative 'etl/version'