chronicle-etl 0.2.4 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (48) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +3 -0
  3. data/.rubocop.yml +3 -0
  4. data/README.md +20 -13
  5. data/chronicle-etl.gemspec +11 -8
  6. data/lib/chronicle/etl/cli/connectors.rb +19 -7
  7. data/lib/chronicle/etl/cli/jobs.rb +24 -18
  8. data/lib/chronicle/etl/cli/main.rb +10 -2
  9. data/lib/chronicle/etl/config.rb +1 -1
  10. data/lib/chronicle/etl/exceptions.rb +12 -1
  11. data/lib/chronicle/etl/extraction.rb +12 -0
  12. data/lib/chronicle/etl/extractors/csv_extractor.rb +43 -36
  13. data/lib/chronicle/etl/extractors/extractor.rb +9 -1
  14. data/lib/chronicle/etl/extractors/file_extractor.rb +15 -33
  15. data/lib/chronicle/etl/extractors/helpers/filesystem_reader.rb +104 -0
  16. data/lib/chronicle/etl/extractors/json_extractor.rb +45 -0
  17. data/lib/chronicle/etl/extractors/stdin_extractor.rb +6 -1
  18. data/lib/chronicle/etl/job.rb +30 -29
  19. data/lib/chronicle/etl/job_definition.rb +45 -7
  20. data/lib/chronicle/etl/job_log.rb +10 -0
  21. data/lib/chronicle/etl/job_logger.rb +23 -20
  22. data/lib/chronicle/etl/loaders/csv_loader.rb +4 -0
  23. data/lib/chronicle/etl/loaders/loader.rb +1 -1
  24. data/lib/chronicle/etl/loaders/rest_loader.rb +5 -1
  25. data/lib/chronicle/etl/loaders/stdout_loader.rb +6 -1
  26. data/lib/chronicle/etl/loaders/table_loader.rb +57 -7
  27. data/lib/chronicle/etl/logger.rb +48 -0
  28. data/lib/chronicle/etl/models/attachment.rb +14 -0
  29. data/lib/chronicle/etl/models/base.rb +23 -7
  30. data/lib/chronicle/etl/models/entity.rb +9 -3
  31. data/lib/chronicle/etl/registry/connector_registration.rb +61 -0
  32. data/lib/chronicle/etl/registry/registry.rb +52 -0
  33. data/lib/chronicle/etl/registry/self_registering.rb +25 -0
  34. data/lib/chronicle/etl/runner.rb +57 -7
  35. data/lib/chronicle/etl/serializers/jsonapi_serializer.rb +25 -0
  36. data/lib/chronicle/etl/serializers/serializer.rb +27 -0
  37. data/lib/chronicle/etl/transformers/image_file_transformer.rb +253 -0
  38. data/lib/chronicle/etl/transformers/null_transformer.rb +10 -1
  39. data/lib/chronicle/etl/transformers/transformer.rb +39 -9
  40. data/lib/chronicle/etl/utils/binary_attachments.rb +21 -0
  41. data/lib/chronicle/etl/utils/progress_bar.rb +3 -1
  42. data/lib/chronicle/etl/utils/text_recognition.rb +15 -0
  43. data/lib/chronicle/etl/version.rb +1 -1
  44. data/lib/chronicle/etl.rb +7 -2
  45. metadata +96 -44
  46. data/Gemfile.lock +0 -91
  47. data/lib/chronicle/etl/catalog.rb +0 -108
  48. data/lib/chronicle/etl/utils/jsonapi.rb +0 -28
@@ -0,0 +1,52 @@
1
+ require 'rubygems'
2
+
3
+ module Chronicle
4
+ module ETL
5
+ # A singleton module that acts as a registry of connector classes available for ETL jobs
6
+ module Registry
7
+ PHASES = [:extractor, :transformer, :loader]
8
+
9
+ class << self
10
+ attr_accessor :connectors
11
+
12
+ def load_all!
13
+ load_connectors_from_gems
14
+ end
15
+
16
+ def load_connectors_from_gems
17
+ Gem::Specification.filter{|s| s.name.match(/^chronicle/) }.each do |gem|
18
+ require_str = gem.name.gsub('chronicle-', 'chronicle/')
19
+ require require_str rescue LoadError
20
+ end
21
+ end
22
+
23
+ def install_connector name
24
+ gem_name = "chronicle-#{name}"
25
+ Gem.install(gem_name)
26
+ end
27
+
28
+ def register connector
29
+ @connectors ||= []
30
+ @connectors << connector
31
+ end
32
+
33
+ def find_by_phase_and_identifier(phase, identifier)
34
+ connector = find_within_loaded_connectors(phase, identifier)
35
+ unless connector
36
+ # Only load external connectors (slow) if not found in built-in connectors
37
+ load_all!
38
+ connector = find_within_loaded_connectors(phase, identifier)
39
+ end
40
+ connector || raise(ConnectorNotAvailableError.new("Connector '#{identifier}' not found"))
41
+ end
42
+
43
+ def find_within_loaded_connectors(phase, identifier)
44
+ @connectors.find { |c| c.phase == phase && c.identifier == identifier }
45
+ end
46
+ end
47
+ end
48
+ end
49
+ end
50
+
51
+ require_relative 'self_registering'
52
+ require_relative 'connector_registration'
@@ -0,0 +1,25 @@
1
+ require 'forwardable'
2
+
3
+ module Chronicle
4
+ module ETL
5
+ module Registry
6
+ # Gives a connector class the ability to let the Chronicle::ETL::Registry
7
+ # know about itself
8
+ module SelfRegistering
9
+ extend Forwardable
10
+
11
+ attr_accessor :connector_registration
12
+
13
+ def_delegators :@connector_registration, :description, :provider, :identifier
14
+
15
+ # Creates a ConnectorRegistration for this connector's details and registers it
16
+ # into the Registry
17
+ def register_connector
18
+ @connector_registration ||= ::Chronicle::ETL::Registry::ConnectorRegistration.new(self)
19
+ yield @connector_registration if block_given?
20
+ ::Chronicle::ETL::Registry.register(@connector_registration)
21
+ end
22
+ end
23
+ end
24
+ end
25
+ end
@@ -1,4 +1,5 @@
1
1
  require 'colorize'
2
+ require 'chronic_duration'
2
3
 
3
4
  class Chronicle::ETL::Runner
4
5
  def initialize(job)
@@ -14,24 +15,73 @@ class Chronicle::ETL::Runner
14
15
  loader.start
15
16
 
16
17
  total = extractor.results_count
17
- progress_bar = Chronicle::ETL::Utils::ProgressBar.new(title: 'Running job', total: total)
18
+ @progress_bar = Chronicle::ETL::Utils::ProgressBar.new(title: 'Running job', total: total)
19
+ Chronicle::ETL::Logger.attach_to_progress_bar(@progress_bar)
18
20
 
19
- extractor.extract do |data, metadata|
20
- transformer = @job.instantiate_transformer(data)
21
+ Chronicle::ETL::Logger.info(tty_log_job_start)
22
+ extractor.extract do |extraction|
23
+ unless extraction.is_a?(Chronicle::ETL::Extraction)
24
+ raise Chronicle::ETL::RunnerTypeError, "Extracted should be a Chronicle::ETL::Extraction"
25
+ end
26
+
27
+ transformer = @job.instantiate_transformer(extraction)
21
28
  record = transformer.transform
22
29
 
23
30
  unless record.is_a?(Chronicle::ETL::Models::Base)
24
- raise Chronicle::ETL::InvalidTransformedRecordError, "Transformed data is not a type of Chronicle::ETL::Models"
31
+ raise Chronicle::ETL::RunnerTypeError, "Transformed data should be a type of Chronicle::ETL::Models"
25
32
  end
26
33
 
34
+ Chronicle::ETL::Logger.info(tty_log_transformation(transformer))
27
35
  @job_logger.log_transformation(transformer)
28
- loader.load(record)
29
- progress_bar.increment
36
+
37
+ loader.load(record) unless @job.dry_run?
38
+ rescue Chronicle::ETL::TransformationError => e
39
+ Chronicle::ETL::Logger.error(tty_log_transformation_failure(e))
40
+ ensure
41
+ @progress_bar.increment
30
42
  end
31
43
 
32
- progress_bar.finish
44
+ @progress_bar.finish
33
45
  loader.finish
34
46
  @job_logger.finish
47
+ rescue Interrupt
48
+ Chronicle::ETL::Logger.error("\n#{'Job interrupted'.red}")
49
+ @job_logger.error
50
+ rescue StandardError => e
51
+ raise e
52
+ ensure
35
53
  @job_logger.save
54
+ @progress_bar.finish
55
+ Chronicle::ETL::Logger.detach_from_progress_bar
56
+ Chronicle::ETL::Logger.info(tty_log_completion)
57
+ end
58
+
59
+ private
60
+
61
+ def tty_log_job_start
62
+ output = "Beginning job "
63
+ output += "'#{@job.name}'".bold if @job.name
64
+ output
65
+ end
66
+
67
+ def tty_log_transformation transformer
68
+ output = " ✓".green
69
+ output += " #{transformer}"
70
+ end
71
+
72
+ def tty_log_transformation_failure exception
73
+ output = " ✖".red
74
+ output += " Failed to build #{exception.transformation}. #{exception.message}"
75
+ end
76
+
77
+ def tty_log_completion
78
+ status = @job_logger.success ? 'Success' : 'Failed'
79
+ output = "\nCompleted job "
80
+ output += "'#{@job.name}'".bold if @job.name
81
+ output += " in #{ChronicDuration.output(@job_logger.duration)}" if @job_logger.duration
82
+ output += "\n Status:\t".light_black + status
83
+ output += "\n Completed:\t".light_black + "#{@job_logger.job_log.num_records_processed}"
84
+ output += "\n Latest:\t".light_black + "#{@job_logger.job_log.highest_timestamp.iso8601}" if @job_logger.job_log.highest_timestamp
85
+ output
36
86
  end
37
87
  end
@@ -0,0 +1,25 @@
1
+ module Chronicle
2
+ module ETL
3
+ class JSONAPISerializer < Chronicle::ETL::Serializer
4
+ def serializable_hash
5
+ @record
6
+ .identifier_hash
7
+ .merge({ attributes: @record.attributes })
8
+ .merge({ relationships: build_associations })
9
+ .merge(@record.meta_hash)
10
+ end
11
+
12
+ def build_associations
13
+ @record.associations.transform_values do |value|
14
+ association_data =
15
+ if value.is_a?(Array)
16
+ value.map { |record| JSONAPISerializer.new(record).serializable_hash }
17
+ else
18
+ JSONAPISerializer.new(value).serializable_hash
19
+ end
20
+ { data: association_data }
21
+ end
22
+ end
23
+ end
24
+ end
25
+ end
@@ -0,0 +1,27 @@
1
+ module Chronicle
2
+ module ETL
3
+ # Abstract class representing a Serializer for an ETL record
4
+ class Serializer
5
+ # Construct a new instance of this serializer.
6
+ # == Parameters:
7
+ # options::
8
+ # Options for configuring this Serializer
9
+ def initialize(record, options = {})
10
+ @record = record
11
+ @options = options
12
+ end
13
+
14
+ # Serialize a record as a hash
15
+ def serializable_hash
16
+ raise NotImplementedError
17
+ end
18
+
19
+ def self.serialize(record)
20
+ serializer = self.new(record)
21
+ serializer.serializable_hash
22
+ end
23
+ end
24
+ end
25
+ end
26
+
27
+ require_relative 'jsonapi_serializer'
@@ -0,0 +1,253 @@
1
+ require 'mini_exiftool'
2
+ require 'active_support'
3
+ require 'active_support/core_ext/object'
4
+ require 'active_support/core_ext/time'
5
+ require 'active_support/core_ext/hash/reverse_merge'
6
+ require 'active_support/core_ext/string/inflections'
7
+
8
+ module Chronicle
9
+ module ETL
10
+ # Transform a JPEG or other image file into a record.
11
+ # By default, file mtime and a hash of the file content are used to build
12
+ # the timestamp and ID respectively but other options are available (such
13
+ # as reading EXIF tags or extended attributes from the filesystem).
14
+ #
15
+ # TODO: This should be extracted into its own plugin
16
+ class ImageFileTransformer < Chronicle::ETL::Transformer
17
+ register_connector do |r|
18
+ r.identifier = 'image-file'
19
+ r.description = 'an image file'
20
+ end
21
+
22
+ DEFAULT_OPTIONS = {
23
+ timestamp_strategy: 'file_mtime',
24
+ id_strategy: 'file_hash',
25
+ verb: 'photographed',
26
+
27
+ # EXIF tags often don't have timezones
28
+ timezone_default: 'Eastern Time (US & Canada)',
29
+ include_image_data: true
30
+ }.freeze
31
+
32
+ def initialize(*args)
33
+ super(*args)
34
+ @options = @options.reverse_merge(DEFAULT_OPTIONS)
35
+ end
36
+
37
+ def transform
38
+ # FIXME: set @filename; use block for reading file when necessary
39
+ @file = File.open(@extraction.data)
40
+ record = build_created(@file)
41
+ @file.close
42
+ record
43
+ end
44
+
45
+ def friendly_identifier
46
+ @file.path
47
+ end
48
+
49
+ def id
50
+ @id ||= begin
51
+ id = build_with_strategy(field: :id, strategy: @options[:id_strategy])
52
+ raise UntransformableRecordError.new("Could not build id", transformation: self) unless id
53
+
54
+ id
55
+ end
56
+ end
57
+
58
+ def timestamp
59
+ @timestamp ||= begin
60
+ ts = build_with_strategy(field: :timestamp, strategy: @options[:timestamp_strategy])
61
+ raise UntransformableRecordError.new("Could not build timestamp", transformation: self) unless ts
62
+
63
+ ts
64
+ end
65
+ end
66
+
67
+ private
68
+
69
+ def build_created(file)
70
+ record = ::Chronicle::ETL::Models::Activity.new
71
+ record.verb = @options[:verb]
72
+ record.provider = @options[:provider]
73
+ record.provider_id = id
74
+ record.end_at = timestamp
75
+ record.dedupe_on = [[:provider_id, :verb, :provider]]
76
+
77
+ record.involved = build_image
78
+ record.actor = build_actor
79
+
80
+ record.assign_attributes(build_gps)
81
+ record
82
+ end
83
+
84
+ def build_actor
85
+ actor = ::Chronicle::ETL::Models::Entity.new
86
+ actor.represents = 'identity'
87
+ actor.provider = @options[:actor][:provider]
88
+ actor.slug = @options[:actor][:slug]
89
+ actor.dedupe_on = [[:provider, :slug, :represents]]
90
+ actor
91
+ end
92
+
93
+ def build_image
94
+ image = ::Chronicle::ETL::Models::Entity.new
95
+ image.represents = @options[:involved][:represents]
96
+ image.title = build_title
97
+ image.body = exif['Description']
98
+ image.provider = @options[:involved][:provider]
99
+ image.provider_id = id
100
+ image.assign_attributes(build_gps)
101
+ image.dedupe_on = [[:provider, :provider_id, :represents]]
102
+
103
+ if @options[:ocr_strategy]
104
+ ocr_text = build_with_strategy(field: :ocr, strategy: @options[:ocr_strategy])
105
+ image.metadata[:ocr_text] = ocr_text if ocr_text
106
+ end
107
+
108
+ names = extract_people_depicted
109
+ tags = extract_keywords(names)
110
+
111
+ image.depicts = build_people_depicted(names)
112
+ image.abouts = build_keywords(tags)
113
+
114
+ if @options[:include_image_data]
115
+ attachment = ::Chronicle::ETL::Models::Attachment.new
116
+ attachment.data = build_image_data
117
+ image.attachments = [attachment]
118
+ end
119
+
120
+ image
121
+ end
122
+
123
+ def build_keywords(topics)
124
+ topics.map do |topic|
125
+ t = ::Chronicle::ETL::Models::Entity.new
126
+ t.represents = 'topic'
127
+ t.provider = @options[:involved][:provider]
128
+ t.title = topic
129
+ t.slug = topic.parameterize
130
+ t.dedupe_on = [[:provider, :represents, :slug]]
131
+ t
132
+ end
133
+ end
134
+
135
+ def build_people_depicted(names)
136
+ names.map do |name|
137
+ identity = ::Chronicle::ETL::Models::Entity.new
138
+ identity.represents = 'identity'
139
+ identity.provider = @options[:involved][:provider]
140
+ identity.slug = name.parameterize
141
+ identity.title = name
142
+ identity.dedupe_on = [[:provider, :represents, :slug]]
143
+ identity
144
+ end
145
+ end
146
+
147
+ def build_gps
148
+ return {} unless exif['GPSLatitude']
149
+
150
+ {
151
+ lat: exif['GPSLatitude'],
152
+ lng: exif['GPSLongitude'],
153
+ elevation: exif['GPSAltitude']
154
+ }
155
+ end
156
+
157
+ def build_image_data
158
+ ::Chronicle::ETL::Utils::BinaryAttachments.filename_to_base64(filename: @file.path)
159
+ end
160
+
161
+ def build_title
162
+ File.basename(@file)
163
+ end
164
+
165
+ def build_with_strategy(field:, strategy:[])
166
+ strategies = [strategy].flatten.compact
167
+ strategies.each do |s|
168
+ builder_method = "build_#{field}_using_#{s}"
169
+ result = send(builder_method.to_sym)
170
+ return result if result
171
+ end
172
+ return
173
+ end
174
+
175
+ def build_id_using_file_hash
176
+ Digest::SHA256.hexdigest(File.read(@file))
177
+ end
178
+
179
+ def build_id_using_xattr_version
180
+ load_value_from_xattr_plist("com.apple.metadata:kMDItemVersion")
181
+ end
182
+
183
+ def build_id_using_xmp_document_id
184
+ exif['OriginalDocumentID'] || exif['DerivedFromDocumentID']
185
+ end
186
+
187
+ def build_timestamp_using_file_mtime
188
+ File.mtime(@file)
189
+ end
190
+
191
+ def build_timestamp_using_exif_datetimeoriginal
192
+ # EXIF tags don't have timezone information. This is a DateTime in UTC
193
+ timestamp = exif['DateTimeOriginal'] || return
194
+
195
+ if exif['OffsetTimeOriginal']
196
+ # Offset tags are only available in newer EXIF tags. If it exists, we
197
+ # use it instead of UTC
198
+ timestamp = timestamp.change(offset: exif['OffsetTimeOriginal'])
199
+ elsif false
200
+ # TODO: support option of using GPS coordinates to determine timezone
201
+ else
202
+ zone = ActiveSupport::TimeZone.new(@options[:timezone_default])
203
+ timestamp = zone.parse(timestamp.asctime)
204
+ end
205
+
206
+ timestamp
207
+ end
208
+
209
+ # TODO: add documentation for how to set up `macocr`
210
+ def build_ocr_using_macocr
211
+ `macocr "#{@file.path}" 2>/dev/null`.presence
212
+ end
213
+
214
+ def exif
215
+ @exif ||= MiniExiftool.new(
216
+ @file.path,
217
+ numerical: true,
218
+
219
+ # EXIF timestamps don't have timezone information. MiniExifTool uses Time
220
+ # by default which parses timestamps in local time zone. Using DateTime
221
+ # parses dates as UTC and then we can apply a timezone offset if the optional
222
+ # EXIF timezone offset fields are available.
223
+ # https://github.com/janfri/mini_exiftool/issues/39#issuecomment-832587649
224
+ timestamps: DateTime
225
+ )
226
+ end
227
+
228
+ # Figure out which faces are tagged as regions and return a list of their names
229
+ def extract_people_depicted
230
+ return [] unless exif['RegionName']
231
+
232
+ names = [exif['RegionName']].flatten
233
+ types = [exif['RegionType']].flatten
234
+
235
+ names.zip(types).select{|x| x[1] == 'Face'}.map{|x| x[0]}.uniq
236
+ end
237
+
238
+ # Extract image keywords from EXIF/IPTC tags and subtract out those that are
239
+ # tagged people (determined by looking at face regions)
240
+ def extract_keywords(people_names = [])
241
+ [exif['Keywords'] || []].flatten - people_names
242
+ end
243
+
244
+ def load_value_from_xattr_plist attribute
245
+ require 'nokogiri'
246
+ xml = `xattr -p #{attribute} \"#{@file.path}\" | xxd -r -p | plutil -convert xml1 -o - -- - 2>/dev/null`
247
+ return unless xml
248
+ value = Nokogiri::XML.parse(r).xpath("//string").text
249
+ return value.presence
250
+ end
251
+ end
252
+ end
253
+ end
@@ -1,9 +1,18 @@
1
1
  module Chronicle
2
2
  module ETL
3
3
  class NullTransformer < Chronicle::ETL::Transformer
4
+ register_connector do |r|
5
+ r.identifier = 'null'
6
+ r.description = 'in no way'
7
+ end
8
+
4
9
  def transform
5
- Chronicle::ETL::Models::Generic.new(@data)
10
+ Chronicle::ETL::Models::Generic.new(@extraction.data)
6
11
  end
12
+
13
+ def timestamp; end
14
+
15
+ def id; end
7
16
  end
8
17
  end
9
18
  end
@@ -2,16 +2,15 @@ module Chronicle
2
2
  module ETL
3
3
  # Abstract class representing a Transformer for an ETL job
4
4
  class Transformer
5
- extend Chronicle::ETL::Catalog
5
+ extend Chronicle::ETL::Registry::SelfRegistering
6
6
 
7
7
  # Construct a new instance of this transformer. Options are passed in from a Runner
8
- # == Paramters:
8
+ # == Parameters:
9
9
  # options::
10
10
  # Options for configuring this Transformer
11
- def initialize(options = {}, data)
11
+ def initialize(options = {}, extraction)
12
12
  @options = options
13
- @data = data
14
- @record = Chronicle::ETL::Models::Activity.new
13
+ @extraction = extraction
15
14
  end
16
15
 
17
16
  # @abstract Subclass is expected to implement #transform
@@ -19,16 +18,47 @@ module Chronicle
19
18
  # The main entrypoint for transforming a record. Called by a Runner on each extracted record
20
19
 
21
20
  # The domain or provider-specific id of the record this transformer is working on.
22
- # Used for building a cursor so an extractor doesn't have to start from the beginning of a
23
- # data source from the beginning.
24
- def id; end
21
+ # It is useful for:
22
+ # - de-duping records that might exist in the loader's destination
23
+ # - building a cursor so an extractor doesn't have to start from the beginning of
24
+ # a source
25
+ def id
26
+ raise NotImplementedError
27
+ end
25
28
 
26
29
  # The domain or provider-specific timestamp of the record this transformer is working on.
27
30
  # Used for building a cursor so an extractor doesn't have to start from the beginning of a
28
31
  # data source.
29
- def timestamp; end
32
+ def timestamp
33
+ raise NotImplementedError
34
+ end
35
+
36
+ # An optional, human-readable identifier for a transformation, intended for debugging or logging.
37
+ # By default, it is just the id.
38
+ def friendly_identifier
39
+ id
40
+ end
41
+
42
+ def to_s
43
+ ts = begin
44
+ unknown = "???"
45
+ timestamp&.iso8601 || unknown
46
+ rescue TransformationError, NotImplementedError
47
+ unknown
48
+ end
49
+
50
+ identifier = begin
51
+ unknown = self.class.to_s
52
+ friendly_identifier || self.class.to_s
53
+ rescue TransformationError, NotImplementedError
54
+ unknown
55
+ end
56
+
57
+ "[#{ts}] #{identifier}"
58
+ end
30
59
  end
31
60
  end
32
61
  end
33
62
 
34
63
  require_relative 'null_transformer'
64
+ require_relative 'image_file_transformer'
@@ -0,0 +1,21 @@
1
+ require 'marcel'
2
+ require 'base64'
3
+
4
+ module Chronicle
5
+ module ETL
6
+ module Utils
7
+ # Utility methods for dealing with binary files
8
+ module BinaryAttachments
9
+ def self.filename_to_base64(filename:, mimetype: nil)
10
+ mimetype = mimetype || guess_mimetype(filename: filename)
11
+
12
+ "data:#{mimetype};base64,#{Base64.strict_encode64(File.read(filename))}"
13
+ end
14
+
15
+ def self.guess_mimetype(filename:)
16
+ Marcel::MimeType.for(filename)
17
+ end
18
+ end
19
+ end
20
+ end
21
+ end
@@ -64,7 +64,9 @@ module Chronicle
64
64
  end
65
65
 
66
66
  def log(message)
67
- @pbar.log message
67
+ message.split("\n").each do |line|
68
+ @pbar.log message
69
+ end
68
70
  end
69
71
 
70
72
  def finish
@@ -0,0 +1,15 @@
1
+ require 'active_support/core_ext/object/blank'
2
+
3
+ module Chronicle
4
+ module ETL
5
+ module Utils
6
+ # OCR for image files
7
+ # TODO: add other strategies and document `macocr`
8
+ module TextRecognition
9
+ def self.recognize_in_image(filename:)
10
+ `macocr "#{filename}" 2>/dev/null`.presence
11
+ end
12
+ end
13
+ end
14
+ end
15
+ end
@@ -1,5 +1,5 @@
1
1
  module Chronicle
2
2
  module ETL
3
- VERSION = "0.2.4"
3
+ VERSION = "0.3.0"
4
4
  end
5
5
  end
data/lib/chronicle/etl.rb CHANGED
@@ -1,19 +1,24 @@
1
- require_relative 'etl/catalog'
1
+ require_relative 'etl/registry/registry'
2
2
  require_relative 'etl/config'
3
3
  require_relative 'etl/exceptions'
4
+ require_relative 'etl/extraction'
4
5
  require_relative 'etl/extractors/extractor'
5
6
  require_relative 'etl/job_definition'
6
7
  require_relative 'etl/job_log'
7
8
  require_relative 'etl/job_logger'
8
9
  require_relative 'etl/job'
9
10
  require_relative 'etl/loaders/loader'
11
+ require_relative 'etl/logger'
10
12
  require_relative 'etl/models/activity'
13
+ require_relative 'etl/models/attachment'
11
14
  require_relative 'etl/models/base'
12
15
  require_relative 'etl/models/entity'
13
16
  require_relative 'etl/models/generic'
14
17
  require_relative 'etl/runner'
18
+ require_relative 'etl/serializers/serializer'
15
19
  require_relative 'etl/transformers/transformer'
20
+ require_relative 'etl/utils/binary_attachments'
16
21
  require_relative 'etl/utils/hash_utilities'
17
- require_relative 'etl/utils/jsonapi'
22
+ require_relative 'etl/utils/text_recognition'
18
23
  require_relative 'etl/utils/progress_bar'
19
24
  require_relative 'etl/version'