chronicle-etl 0.5.4 → 0.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79)
  1. checksums.yaml +4 -4
  2. data/.github/workflows/ruby.yml +15 -25
  3. data/.rubocop.yml +2 -44
  4. data/Gemfile +2 -2
  5. data/Guardfile +3 -3
  6. data/README.md +98 -73
  7. data/Rakefile +2 -2
  8. data/bin/console +4 -5
  9. data/chronicle-etl.gemspec +50 -45
  10. data/exe/chronicle-etl +1 -1
  11. data/lib/chronicle/etl/authorizer.rb +3 -4
  12. data/lib/chronicle/etl/cli/authorizations.rb +10 -8
  13. data/lib/chronicle/etl/cli/connectors.rb +9 -9
  14. data/lib/chronicle/etl/cli/jobs.rb +130 -53
  15. data/lib/chronicle/etl/cli/main.rb +29 -29
  16. data/lib/chronicle/etl/cli/plugins.rb +29 -26
  17. data/lib/chronicle/etl/cli/secrets.rb +14 -12
  18. data/lib/chronicle/etl/cli/subcommand_base.rb +5 -3
  19. data/lib/chronicle/etl/config.rb +20 -7
  20. data/lib/chronicle/etl/configurable.rb +24 -9
  21. data/lib/chronicle/etl/exceptions.rb +3 -3
  22. data/lib/chronicle/etl/extraction.rb +12 -2
  23. data/lib/chronicle/etl/extractors/csv_extractor.rb +9 -0
  24. data/lib/chronicle/etl/extractors/extractor.rb +15 -2
  25. data/lib/chronicle/etl/extractors/file_extractor.rb +5 -3
  26. data/lib/chronicle/etl/extractors/helpers/input_reader.rb +2 -2
  27. data/lib/chronicle/etl/extractors/json_extractor.rb +14 -4
  28. data/lib/chronicle/etl/extractors/stdin_extractor.rb +3 -0
  29. data/lib/chronicle/etl/job.rb +35 -17
  30. data/lib/chronicle/etl/job_definition.rb +39 -27
  31. data/lib/chronicle/etl/job_log.rb +14 -16
  32. data/lib/chronicle/etl/job_logger.rb +4 -4
  33. data/lib/chronicle/etl/loaders/csv_loader.rb +17 -4
  34. data/lib/chronicle/etl/loaders/helpers/stdout_helper.rb +4 -0
  35. data/lib/chronicle/etl/loaders/json_loader.rb +30 -10
  36. data/lib/chronicle/etl/loaders/loader.rb +0 -17
  37. data/lib/chronicle/etl/loaders/rest_loader.rb +7 -7
  38. data/lib/chronicle/etl/loaders/table_loader.rb +37 -12
  39. data/lib/chronicle/etl/logger.rb +3 -3
  40. data/lib/chronicle/etl/oauth_authorizer.rb +8 -10
  41. data/lib/chronicle/etl/record.rb +15 -0
  42. data/lib/chronicle/etl/registry/connector_registration.rb +15 -23
  43. data/lib/chronicle/etl/registry/connectors.rb +117 -0
  44. data/lib/chronicle/etl/registry/plugin_registration.rb +19 -0
  45. data/lib/chronicle/etl/registry/plugins.rb +171 -0
  46. data/lib/chronicle/etl/registry/registry.rb +3 -52
  47. data/lib/chronicle/etl/registry/self_registering.rb +1 -1
  48. data/lib/chronicle/etl/runner.rb +158 -128
  49. data/lib/chronicle/etl/secrets.rb +5 -5
  50. data/lib/chronicle/etl/transformers/buffer_transformer.rb +29 -0
  51. data/lib/chronicle/etl/transformers/chronicle_transformer.rb +32 -0
  52. data/lib/chronicle/etl/transformers/chronobase_transformer.rb +100 -0
  53. data/lib/chronicle/etl/transformers/fields_limit_transformer.rb +23 -0
  54. data/lib/chronicle/etl/transformers/filter_fields_transformer.rb +60 -0
  55. data/lib/chronicle/etl/transformers/filter_transformer.rb +30 -0
  56. data/lib/chronicle/etl/transformers/format_transformer.rb +32 -0
  57. data/lib/chronicle/etl/transformers/merge_meta_transformer.rb +19 -0
  58. data/lib/chronicle/etl/transformers/multiply_transformer.rb +21 -0
  59. data/lib/chronicle/etl/transformers/null_transformer.rb +5 -7
  60. data/lib/chronicle/etl/transformers/sampler_transformer.rb +21 -0
  61. data/lib/chronicle/etl/transformers/sort_transformer.rb +31 -0
  62. data/lib/chronicle/etl/transformers/transformer.rb +63 -41
  63. data/lib/chronicle/etl/utils/binary_attachments.rb +1 -1
  64. data/lib/chronicle/etl/utils/progress_bar.rb +2 -3
  65. data/lib/chronicle/etl/version.rb +1 -1
  66. data/lib/chronicle/etl.rb +6 -8
  67. metadata +91 -45
  68. data/lib/chronicle/etl/models/activity.rb +0 -15
  69. data/lib/chronicle/etl/models/attachment.rb +0 -14
  70. data/lib/chronicle/etl/models/base.rb +0 -122
  71. data/lib/chronicle/etl/models/entity.rb +0 -29
  72. data/lib/chronicle/etl/models/raw.rb +0 -26
  73. data/lib/chronicle/etl/registry/plugin_registry.rb +0 -95
  74. data/lib/chronicle/etl/serializers/jsonapi_serializer.rb +0 -31
  75. data/lib/chronicle/etl/serializers/raw_serializer.rb +0 -10
  76. data/lib/chronicle/etl/serializers/serializer.rb +0 -28
  77. data/lib/chronicle/etl/transformers/image_file_transformer.rb +0 -247
  78. data/lib/chronicle/etl/utils/hash_utilities.rb +0 -19
  79. data/lib/chronicle/etl/utils/text_recognition.rb +0 -15
@@ -1,122 +0,0 @@
1
- require 'digest'
2
-
3
- module Chronicle
4
- module ETL
5
- module Models
6
- # Represents a record that's been transformed by a Transformer and
7
- # ready to be loaded. Loosely based on ActiveModel.
8
- #
9
- # @todo Experiment with just mixing in ActiveModel instead of this
10
- # reimplementation
11
- class Base
12
- ATTRIBUTES = [:provider, :provider_id, :provider_namespace, :lat, :lng, :metadata].freeze
13
- ASSOCIATIONS = [].freeze
14
-
15
- attr_accessor(:id, :dedupe_on, *ATTRIBUTES)
16
-
17
- def initialize(attributes = {})
18
- assign_attributes(attributes) if attributes
19
- @dedupe_on = []
20
- @metadata = {}
21
- end
22
-
23
- # A unique identifier for this model is formed from a type
24
- # and either an id or lids.
25
- def identifier_hash
26
- {
27
- type: self.class::TYPE,
28
- id: @id,
29
- lids: lids
30
- }.compact
31
- end
32
-
33
- # Array of local ids that uniquely identify this record
34
- def lids
35
- @dedupe_on.map do |fields|
36
- generate_lid(fields)
37
- end.compact.uniq
38
- end
39
-
40
- # For a given set of fields of this model, generate a
41
- # unique local id by hashing the field values
42
- def generate_lid fields
43
- raise ArgumentError.new("Must provide an array of symbolized fields") unless fields.is_a?(Array)
44
-
45
- values = fields.sort.map do |field|
46
- instance_variable = "@#{field.to_s}"
47
- self.instance_variable_get(instance_variable)
48
- end
49
-
50
- return if values.any? { |e| e.nil? }
51
-
52
- Digest::SHA256.hexdigest(values.join(","))
53
- end
54
-
55
- # Set of attribute names that this model has is Base's shared
56
- # attributes combined with the child class's
57
- def attribute_list
58
- (ATTRIBUTES + self.class::ATTRIBUTES).uniq
59
- end
60
-
61
- # All of this record's attributes
62
- def attributes
63
- attributes = {}
64
- attribute_list.each do |attribute|
65
- instance_variable = "@#{attribute.to_s}"
66
- attributes[attribute] = self.instance_variable_get(instance_variable)
67
- end
68
- attributes.compact
69
- end
70
-
71
- # All of this record's associations
72
- def associations
73
- association_list = ASSOCIATIONS + self.class::ASSOCIATIONS
74
- attributes = {}
75
- association_list.each do |attribute|
76
- instance_variable = "@#{attribute.to_s}"
77
- association = self.instance_variable_get(instance_variable)
78
- attributes[attribute] = association if association
79
- end
80
- attributes.compact
81
- end
82
-
83
- def associations_hash
84
- associations.map do |k, v|
85
- if v.is_a?(Array)
86
- [k, v.map(&:to_h)]
87
- else
88
- [k, v.to_h]
89
- end
90
- end.to_h
91
- end
92
-
93
- def meta_hash
94
- {
95
- meta: {
96
- dedupe_on: @dedupe_on.map{|d| d.map(&:to_s).join(",")}
97
- }
98
- }
99
- end
100
-
101
- # FIXME: move this to a Utils module
102
- def to_h_flattened
103
- Chronicle::ETL::Utils::HashUtilities.flatten_hash(to_h)
104
- end
105
-
106
- def to_h
107
- identifier_hash
108
- .merge(attributes)
109
- .merge(associations_hash)
110
- .merge(meta_hash)
111
- end
112
-
113
- def assign_attributes attributes
114
- attributes.each do |k, v|
115
- setter = :"#{k}="
116
- public_send(setter, v) if respond_to? setter
117
- end
118
- end
119
- end
120
- end
121
- end
122
- end
@@ -1,29 +0,0 @@
1
- require 'chronicle/etl/models/base'
2
-
3
- module Chronicle
4
- module ETL
5
- module Models
6
- class Entity < Chronicle::ETL::Models::Base
7
- TYPE = 'entities'.freeze
8
- ATTRIBUTES = [:title, :body, :provider_url, :represents, :slug, :myself, :metadata].freeze
9
-
10
- # TODO: This desperately needs a validation system
11
- ASSOCIATIONS = [
12
- :involvements, # inverse of activity's `involved`
13
- :analogous,
14
- :attachments,
15
- :abouts,
16
- :aboutables, # inverse of above
17
- :depicts,
18
- :consumers,
19
- :creators,
20
- :creations,
21
- :contains,
22
- :containers # inverse of above
23
- ].freeze # TODO: add these to reflect Chronicle Schema
24
-
25
- attr_accessor(*ATTRIBUTES, *ASSOCIATIONS)
26
- end
27
- end
28
- end
29
- end
@@ -1,26 +0,0 @@
1
- require 'chronicle/etl/models/base'
2
-
3
- module Chronicle
4
- module ETL
5
- module Models
6
- # A record from an extraction with no processing or normalization applied
7
- class Raw
8
- TYPE = 'raw'
9
-
10
- attr_accessor :raw_data
11
-
12
- def initialize(raw_data)
13
- @raw_data = raw_data
14
- end
15
-
16
- def to_h
17
- @raw_data.to_h
18
- end
19
-
20
- def to_h_flattened
21
- Chronicle::ETL::Utils::HashUtilities.flatten_hash(to_h)
22
- end
23
- end
24
- end
25
- end
26
- end
@@ -1,95 +0,0 @@
1
- require 'rubygems'
2
- require 'rubygems/command'
3
- require 'rubygems/commands/install_command'
4
- require 'rubygems/uninstaller'
5
-
6
- module Chronicle
7
- module ETL
8
- module Registry
9
- # Responsible for managing plugins available to chronicle-etl
10
- #
11
- # @todo Better validation for whether a gem is actually a plugin
12
- # @todo Add ways to load a plugin that don't require a gem on rubygems.org
13
- module PluginRegistry
14
- class << self
15
- # Start of a system for having non-gem plugins. Right now, we just
16
- # make the registry aware of the existence (by name) of a non-gem plugin
17
- def register_standalone(name)
18
- standalones << name
19
- end
20
-
21
- def standalones
22
- @standalones ||= []
23
- end
24
- end
25
-
26
- # Does this plugin exist?
27
- def self.exists?(name)
28
- # TODO: implement this. Could query rubygems.org or use a hardcoded
29
- # list somewhere
30
- true
31
- end
32
-
33
- # All versions of all plugins currently installed
34
- def self.all_installed
35
- # TODO: add check for chronicle-etl dependency
36
- Gem::Specification.filter { |s| s.name.match(/^chronicle-/) && s.name != "chronicle-etl" }
37
- end
38
-
39
- # Latest version of each installed plugin
40
- def self.all_installed_latest
41
- all_installed.group_by(&:name)
42
- .transform_values { |versions| versions.sort_by(&:version).reverse.first }
43
- .values
44
- end
45
-
46
- # Check whether a given plugin is installed
47
- def self.installed?(name)
48
- (standalones + all_installed.map { |gem| gem.name.gsub("chronicle-", "") }).include?(name)
49
- end
50
-
51
- # Activate a plugin with given name by `require`ing it
52
- def self.activate(name)
53
- # By default, activates the latest available version of a gem
54
- # so don't have to run Kernel#gem separately
55
- require "chronicle/#{name}"
56
- rescue Gem::ConflictError => e
57
- # TODO: figure out if there's more we can do here
58
- raise Chronicle::ETL::PluginConflictError.new(name), "Plugin '#{name}' couldn't be loaded. #{e.message}"
59
- rescue StandardError, LoadError => e
60
- # StandardError to catch random non-loading problems that might occur
61
- # when requiring the plugin (eg class macro invoked the wrong way)
62
- # TODO: decide if this should be separated
63
- raise Chronicle::ETL::PluginLoadError.new(name), "Plugin '#{name}' couldn't be loaded"
64
- end
65
-
66
- # Install a plugin to local gems
67
- def self.install(name)
68
- return if installed?(name)
69
-
70
- gem_name = "chronicle-#{name}"
71
- raise(Chronicle::ETL::PluginNotAvailableError.new(gem_name), "Plugin #{name} doesn't exist") unless exists?(gem_name)
72
-
73
- Gem::DefaultUserInteraction.ui = Gem::SilentUI.new
74
- Gem.install(gem_name)
75
-
76
- activate(name)
77
- rescue Gem::UnsatisfiableDependencyError
78
- # TODO: we need to catch a lot more than this here
79
- raise Chronicle::ETL::PluginNotAvailableError.new(name), "Plugin #{name} could not be installed."
80
- end
81
-
82
- # Uninstall a plugin
83
- def self.uninstall(name)
84
- gem_name = "chronicle-#{name}"
85
- Gem::DefaultUserInteraction.ui = Gem::SilentUI.new
86
- uninstaller = Gem::Uninstaller.new(gem_name)
87
- uninstaller.uninstall
88
- rescue Gem::InstallError
89
- # TODO: strengthen this exception handling
90
- raise(Chronicle::ETL::PluginError.new(name), "Plugin #{name} wasn't uninstalled")
91
- end
92
- end
93
- end
94
- end
95
- end
@@ -1,31 +0,0 @@
1
- module Chronicle
2
- module ETL
3
- class JSONAPISerializer < Chronicle::ETL::Serializer
4
- def initialize(*args)
5
- super
6
-
7
- raise(SerializationError, "Record must be a subclass of Chronicle::ETL::Model::Base") unless @record.is_a?(Chronicle::ETL::Models::Base)
8
- end
9
-
10
- def serializable_hash
11
- @record
12
- .identifier_hash
13
- .merge({ attributes: @record.attributes })
14
- .merge({ relationships: build_associations })
15
- .merge(@record.meta_hash)
16
- end
17
-
18
- def build_associations
19
- @record.associations.transform_values do |value|
20
- association_data =
21
- if value.is_a?(Array)
22
- value.map { |record| JSONAPISerializer.new(record).serializable_hash }
23
- else
24
- JSONAPISerializer.new(value).serializable_hash
25
- end
26
- { data: association_data }
27
- end
28
- end
29
- end
30
- end
31
- end
@@ -1,10 +0,0 @@
1
- module Chronicle
2
- module ETL
3
- # Take a Raw model and output `raw_data` as a hash
4
- class RawSerializer < Chronicle::ETL::Serializer
5
- def serializable_hash
6
- @record.to_h
7
- end
8
- end
9
- end
10
- end
@@ -1,28 +0,0 @@
1
- module Chronicle
2
- module ETL
3
- # Abstract class representing a Serializer for an ETL record
4
- class Serializer
5
- # Construct a new instance of this serializer.
6
- # == Parameters:
7
- # options::
8
- # Options for configuring this Serializer
9
- def initialize(record, options = {})
10
- @record = record
11
- @options = options
12
- end
13
-
14
- # Serialize a record as a hash
15
- def serializable_hash
16
- raise NotImplementedError
17
- end
18
-
19
- def self.serialize(record)
20
- serializer = self.new(record)
21
- serializer.serializable_hash
22
- end
23
- end
24
- end
25
- end
26
-
27
- require_relative 'jsonapi_serializer'
28
- require_relative 'raw_serializer'
@@ -1,247 +0,0 @@
1
- require 'mini_exiftool'
2
- require 'active_support'
3
- require 'active_support/core_ext/object'
4
- require 'active_support/core_ext/time'
5
- require 'active_support/core_ext/hash/reverse_merge'
6
- require 'active_support/core_ext/string/inflections'
7
-
8
- module Chronicle
9
- module ETL
10
- # Transform a JPEG or other image file into a record.
11
- # By default, file mtime and a hash of the file content is used to build
12
- # the timestamp and ID respectively but other options are available (such
13
- # as reading EXIF tags or extended attributes from the filesystem).
14
- #
15
- # TODO: This should be extracted into its own plugin
16
- class ImageFileTransformer < Chronicle::ETL::Transformer
17
- register_connector do |r|
18
- r.identifier = 'image-file'
19
- r.description = 'an image file'
20
- end
21
-
22
- setting :timestamp_strategy, default: 'file_mtime'
23
- setting :id_strategy, default: 'file_hash'
24
- setting :verb, default: 'photographed'
25
- # EXIF tags often don't have timezones
26
- setting :timezone_default, default: 'Eastern Time (US & Canada)'
27
- setting :include_image_data, default: true
28
- setting :actor
29
- setting :involved
30
-
31
- def transform
32
- # FIXME: set @filename; use block for reading file when necessary
33
- @file = File.open(@extraction.data)
34
- record = build_created(@file)
35
- @file.close
36
- record
37
- end
38
-
39
- def friendly_identifier
40
- @file.path
41
- end
42
-
43
- def id
44
- @id ||= begin
45
- id = build_with_strategy(field: :id, strategy: @config.id_strategy)
46
- raise(UntransformableRecordError, "Could not build id") unless id
47
-
48
- id
49
- end
50
- end
51
-
52
- def timestamp
53
- @timestamp ||= begin
54
- ts = build_with_strategy(field: :timestamp, strategy: @config.timestamp_strategy)
55
- raise(UntransformableRecordError, "Could not build timestamp") unless ts
56
-
57
- ts
58
- end
59
- end
60
-
61
- private
62
-
63
- def build_created(file)
64
- record = ::Chronicle::ETL::Models::Activity.new
65
- record.verb = @config.verb
66
- record.provider = @config.provider
67
- record.provider_id = id
68
- record.end_at = timestamp
69
- record.dedupe_on = [[:provider_id, :verb, :provider]]
70
-
71
- record.involved = build_image
72
- record.actor = build_actor
73
-
74
- record.assign_attributes(build_gps)
75
- record
76
- end
77
-
78
- def build_actor
79
- actor = ::Chronicle::ETL::Models::Entity.new
80
- actor.represents = 'identity'
81
- actor.provider = @config.actor[:provider]
82
- actor.slug = @config.actor[:slug]
83
- actor.dedupe_on = [[:provider, :slug, :represents]]
84
- actor
85
- end
86
-
87
- def build_image
88
- image = ::Chronicle::ETL::Models::Entity.new
89
- image.represents = @config.involved[:represents]
90
- image.title = build_title
91
- image.body = exif['Description']
92
- image.provider = @config.involved[:provider]
93
- image.provider_id = id
94
- image.assign_attributes(build_gps)
95
- image.dedupe_on = [[:provider, :provider_id, :represents]]
96
-
97
- if @config.ocr_strategy
98
- ocr_text = build_with_strategy(field: :ocr, strategy: @config.ocr_strategy)
99
- image.metadata[:ocr_text] = ocr_text if ocr_text
100
- end
101
-
102
- names = extract_people_depicted
103
- tags = extract_keywords(names)
104
-
105
- image.depicts = build_people_depicted(names)
106
- image.abouts = build_keywords(tags)
107
-
108
- if @config.include_image_data
109
- attachment = ::Chronicle::ETL::Models::Attachment.new
110
- attachment.data = build_image_data
111
- image.attachments = [attachment]
112
- end
113
-
114
- image
115
- end
116
-
117
- def build_keywords(topics)
118
- topics.map do |topic|
119
- t = ::Chronicle::ETL::Models::Entity.new
120
- t.represents = 'topic'
121
- t.provider = @config.involved[:provider]
122
- t.title = topic
123
- t.slug = topic.parameterize
124
- t.dedupe_on = [[:provider, :represents, :slug]]
125
- t
126
- end
127
- end
128
-
129
- def build_people_depicted(names)
130
- names.map do |name|
131
- identity = ::Chronicle::ETL::Models::Entity.new
132
- identity.represents = 'identity'
133
- identity.provider = @config.involved[:provider]
134
- identity.slug = name.parameterize
135
- identity.title = name
136
- identity.dedupe_on = [[:provider, :represents, :slug]]
137
- identity
138
- end
139
- end
140
-
141
- def build_gps
142
- return {} unless exif['GPSLatitude']
143
-
144
- {
145
- lat: exif['GPSLatitude'],
146
- lng: exif['GPSLongitude'],
147
- elevation: exif['GPSAltitude']
148
- }
149
- end
150
-
151
- def build_image_data
152
- ::Chronicle::ETL::Utils::BinaryAttachments.filename_to_base64(filename: @file.path)
153
- end
154
-
155
- def build_title
156
- File.basename(@file)
157
- end
158
-
159
- def build_with_strategy(field:, strategy:[])
160
- strategies = [strategy].flatten.compact
161
- strategies.each do |s|
162
- builder_method = "build_#{field}_using_#{s}"
163
- result = send(builder_method.to_sym)
164
- return result if result
165
- end
166
- return
167
- end
168
-
169
- def build_id_using_file_hash
170
- Digest::SHA256.hexdigest(File.read(@file))
171
- end
172
-
173
- def build_id_using_xattr_version
174
- load_value_from_xattr_plist("com.apple.metadata:kMDItemVersion")
175
- end
176
-
177
- def build_id_using_xmp_document_id
178
- exif['OriginalDocumentID'] || exif['DerivedFromDocumentID']
179
- end
180
-
181
- def build_timestamp_using_file_mtime
182
- File.mtime(@file)
183
- end
184
-
185
- def build_timestamp_using_exif_datetimeoriginal
186
- # EXIF tags don't have timezone information. This is a DateTime in UTC
187
- timestamp = exif['DateTimeOriginal'] || return
188
-
189
- if exif['OffsetTimeOriginal']
190
- # Offset tags are only available in newer EXIF tags. If it exists, we
191
- # use it instead of UTC
192
- timestamp = timestamp.change(offset: exif['OffsetTimeOriginal'])
193
- elsif false
194
- # TODO: support option of using GPS coordinates to determine timezone
195
- else
196
- zone = ActiveSupport::TimeZone.new(@config.timezone_default)
197
- timestamp = zone.parse(timestamp.asctime)
198
- end
199
-
200
- timestamp
201
- end
202
-
203
- # TODO: add documentation for how to set up `macocr`
204
- def build_ocr_using_macocr
205
- `macocr "#{@file.path}" 2>/dev/null`.presence
206
- end
207
-
208
- def exif
209
- @exif ||= MiniExiftool.new(
210
- @file.path,
211
- numerical: true,
212
-
213
- # EXIF timestamps don't have timezone information. MiniExifTool uses Time
214
- # by default which parses timestamps in local time zone. Using DateTime
215
- # parses dates as UTC and then we can apply a timezone offset if the optional
216
- # EXIF timezone offset fields are available.
217
- # https://github.com/janfri/mini_exiftool/issues/39#issuecomment-832587649
218
- timestamps: DateTime
219
- )
220
- end
221
-
222
- # Figure out which faces are tagged as regions and return a list of their names
223
- def extract_people_depicted
224
- return [] unless exif['RegionName']
225
-
226
- names = [exif['RegionName']].flatten
227
- types = [exif['RegionType']].flatten
228
-
229
- names.zip(types).select{|x| x[1] == 'Face'}.map{|x| x[0]}.uniq
230
- end
231
-
232
- # Extract image keywords from the EXIF/IPTC tag and subtract out those that are
233
- # tagged people (determined by looking at face regions)
234
- def extract_keywords(people_names = [])
235
- [exif['Keywords'] || []].flatten - people_names
236
- end
237
-
238
- def load_value_from_xattr_plist attribute
239
- require 'nokogiri'
240
- xml = `xattr -p #{attribute} \"#{@file.path}\" | xxd -r -p | plutil -convert xml1 -o - -- - 2>/dev/null`
241
- return unless xml
242
- value = Nokogiri::XML.parse(r).xpath("//string").text
243
- return value.presence
244
- end
245
- end
246
- end
247
- end
@@ -1,19 +0,0 @@
1
- module Chronicle
2
- module ETL
3
- module Utils
4
- module HashUtilities
5
- def self.flatten_hash(hash)
6
- hash.each_with_object({}) do |(k, v), h|
7
- if v.is_a? Hash
8
- flatten_hash(v).map do |h_k, h_v|
9
- h["#{k}.#{h_k}".to_sym] = h_v
10
- end
11
- else
12
- h[k] = v
13
- end
14
- end
15
- end
16
- end
17
- end
18
- end
19
- end
@@ -1,15 +0,0 @@
1
- require 'active_support/core_ext/object/blank'
2
-
3
- module Chronicle
4
- module ETL
5
- module Utils
6
- # OCR for image files
7
- # TODO: add other strategies and document `macocr`
8
- module TextRecognition
9
- def self.recognize_in_image(filename:)
10
- `macocr "#{filename}" 2>/dev/null`.presence
11
- end
12
- end
13
- end
14
- end
15
- end