chronicle-etl 0.5.4 → 0.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/ruby.yml +15 -25
- data/.rubocop.yml +2 -44
- data/Gemfile +2 -2
- data/Guardfile +3 -3
- data/README.md +98 -73
- data/Rakefile +2 -2
- data/bin/console +4 -5
- data/chronicle-etl.gemspec +50 -45
- data/exe/chronicle-etl +1 -1
- data/lib/chronicle/etl/authorizer.rb +3 -4
- data/lib/chronicle/etl/cli/authorizations.rb +10 -8
- data/lib/chronicle/etl/cli/connectors.rb +9 -9
- data/lib/chronicle/etl/cli/jobs.rb +130 -53
- data/lib/chronicle/etl/cli/main.rb +29 -29
- data/lib/chronicle/etl/cli/plugins.rb +29 -26
- data/lib/chronicle/etl/cli/secrets.rb +14 -12
- data/lib/chronicle/etl/cli/subcommand_base.rb +5 -3
- data/lib/chronicle/etl/config.rb +20 -7
- data/lib/chronicle/etl/configurable.rb +24 -9
- data/lib/chronicle/etl/exceptions.rb +3 -3
- data/lib/chronicle/etl/extraction.rb +12 -2
- data/lib/chronicle/etl/extractors/csv_extractor.rb +9 -0
- data/lib/chronicle/etl/extractors/extractor.rb +15 -2
- data/lib/chronicle/etl/extractors/file_extractor.rb +5 -3
- data/lib/chronicle/etl/extractors/helpers/input_reader.rb +2 -2
- data/lib/chronicle/etl/extractors/json_extractor.rb +14 -4
- data/lib/chronicle/etl/extractors/stdin_extractor.rb +3 -0
- data/lib/chronicle/etl/job.rb +35 -17
- data/lib/chronicle/etl/job_definition.rb +39 -27
- data/lib/chronicle/etl/job_log.rb +14 -16
- data/lib/chronicle/etl/job_logger.rb +4 -4
- data/lib/chronicle/etl/loaders/csv_loader.rb +17 -4
- data/lib/chronicle/etl/loaders/helpers/stdout_helper.rb +4 -0
- data/lib/chronicle/etl/loaders/json_loader.rb +30 -10
- data/lib/chronicle/etl/loaders/loader.rb +0 -17
- data/lib/chronicle/etl/loaders/rest_loader.rb +7 -7
- data/lib/chronicle/etl/loaders/table_loader.rb +37 -12
- data/lib/chronicle/etl/logger.rb +3 -3
- data/lib/chronicle/etl/oauth_authorizer.rb +8 -10
- data/lib/chronicle/etl/record.rb +15 -0
- data/lib/chronicle/etl/registry/connector_registration.rb +15 -23
- data/lib/chronicle/etl/registry/connectors.rb +117 -0
- data/lib/chronicle/etl/registry/plugin_registration.rb +19 -0
- data/lib/chronicle/etl/registry/plugins.rb +171 -0
- data/lib/chronicle/etl/registry/registry.rb +3 -52
- data/lib/chronicle/etl/registry/self_registering.rb +1 -1
- data/lib/chronicle/etl/runner.rb +158 -128
- data/lib/chronicle/etl/secrets.rb +5 -5
- data/lib/chronicle/etl/transformers/buffer_transformer.rb +29 -0
- data/lib/chronicle/etl/transformers/chronicle_transformer.rb +32 -0
- data/lib/chronicle/etl/transformers/chronobase_transformer.rb +100 -0
- data/lib/chronicle/etl/transformers/fields_limit_transformer.rb +23 -0
- data/lib/chronicle/etl/transformers/filter_fields_transformer.rb +60 -0
- data/lib/chronicle/etl/transformers/filter_transformer.rb +30 -0
- data/lib/chronicle/etl/transformers/format_transformer.rb +32 -0
- data/lib/chronicle/etl/transformers/merge_meta_transformer.rb +19 -0
- data/lib/chronicle/etl/transformers/multiply_transformer.rb +21 -0
- data/lib/chronicle/etl/transformers/null_transformer.rb +5 -7
- data/lib/chronicle/etl/transformers/sampler_transformer.rb +21 -0
- data/lib/chronicle/etl/transformers/sort_transformer.rb +31 -0
- data/lib/chronicle/etl/transformers/transformer.rb +63 -41
- data/lib/chronicle/etl/utils/binary_attachments.rb +1 -1
- data/lib/chronicle/etl/utils/progress_bar.rb +2 -3
- data/lib/chronicle/etl/version.rb +1 -1
- data/lib/chronicle/etl.rb +6 -8
- metadata +91 -45
- data/lib/chronicle/etl/models/activity.rb +0 -15
- data/lib/chronicle/etl/models/attachment.rb +0 -14
- data/lib/chronicle/etl/models/base.rb +0 -122
- data/lib/chronicle/etl/models/entity.rb +0 -29
- data/lib/chronicle/etl/models/raw.rb +0 -26
- data/lib/chronicle/etl/registry/plugin_registry.rb +0 -95
- data/lib/chronicle/etl/serializers/jsonapi_serializer.rb +0 -31
- data/lib/chronicle/etl/serializers/raw_serializer.rb +0 -10
- data/lib/chronicle/etl/serializers/serializer.rb +0 -28
- data/lib/chronicle/etl/transformers/image_file_transformer.rb +0 -247
- data/lib/chronicle/etl/utils/hash_utilities.rb +0 -19
- data/lib/chronicle/etl/utils/text_recognition.rb +0 -15
@@ -1,122 +0,0 @@
|
|
1
|
-
require 'digest'
|
2
|
-
|
3
|
-
module Chronicle
  module ETL
    module Models
      # Represents a record that's been transformed by a Transformer and is
      # ready to be loaded. Loosely based on ActiveModel.
      #
      # Concrete subclasses are expected to define their own `TYPE`,
      # `ATTRIBUTES` and `ASSOCIATIONS` constants; they are looked up through
      # `self.class::` below.
      #
      # @todo Experiment with just mixing in ActiveModel instead of this
      #   reimplementation
      class Base
        ATTRIBUTES = [:provider, :provider_id, :provider_namespace, :lat, :lng, :metadata].freeze
        ASSOCIATIONS = [].freeze

        attr_accessor(:id, :dedupe_on, *ATTRIBUTES)

        # @param attributes [Hash, nil] initial values; keys without a matching
        #   public setter are ignored
        def initialize(attributes = {})
          assign_attributes(attributes) if attributes

          # Defaults are applied only when the caller did not supply a value.
          # (Unconditionally resetting these after assignment used to discard
          # any `dedupe_on:` / `metadata:` passed to the constructor.)
          @dedupe_on ||= []
          @metadata ||= {}
        end

        # A unique identifier for this model is formed from a type
        # and either an id or lids.
        #
        # @return [Hash] `:type`, `:id` and `:lids`, with nil values removed
        def identifier_hash
          {
            type: self.class::TYPE,
            id: @id,
            lids: lids
          }.compact
        end

        # Array of local ids that uniquely identify this record
        #
        # @return [Array<String>] one digest per `dedupe_on` field set, skipping
        #   sets for which a lid could not be generated
        def lids
          @dedupe_on.map { |fields| generate_lid(fields) }.compact.uniq
        end

        # For a given set of fields of this model, generate a
        # unique local id by hashing the field values
        #
        # @param fields [Array<Symbol>] names of the fields to hash
        # @return [String, nil] SHA256 hex digest of the comma-joined values
        #   (taken in sorted field-name order), or nil if any value is nil
        # @raise [ArgumentError] if fields is not an Array
        def generate_lid(fields)
          raise ArgumentError, "Must provide an array of symbolized fields" unless fields.is_a?(Array)

          values = fields.sort.map { |field| instance_variable_get("@#{field}") }
          return if values.any?(&:nil?)

          Digest::SHA256.hexdigest(values.join(","))
        end

        # Set of attribute names that this model has is Base's shared
        # attributes combined with the child class's
        #
        # @return [Array<Symbol>]
        def attribute_list
          (ATTRIBUTES + self.class::ATTRIBUTES).uniq
        end

        # All of this record's attributes
        #
        # @return [Hash{Symbol => Object}] attribute values, nil values removed
        def attributes
          attribute_list
            .each_with_object({}) { |name, found| found[name] = instance_variable_get("@#{name}") }
            .compact
        end

        # All of this record's associations
        #
        # @return [Hash{Symbol => Object}] associations that are currently set
        #   (nil and false values are left out)
        def associations
          association_list = ASSOCIATIONS + self.class::ASSOCIATIONS
          association_list.each_with_object({}) do |name, found|
            association = instance_variable_get("@#{name}")
            found[name] = association if association
          end
        end

        # Associations with every associated record converted via `to_h`.
        #
        # @return [Hash{Symbol => Hash, Array<Hash>}]
        def associations_hash
          associations.transform_values do |value|
            value.is_a?(Array) ? value.map(&:to_h) : value.to_h
          end
        end

        # @return [Hash] `{ meta: { dedupe_on: [...] } }` where each field set
        #   is rendered as a comma-joined string
        def meta_hash
          {
            meta: {
              dedupe_on: @dedupe_on.map { |fields| fields.map(&:to_s).join(",") }
            }
          }
        end

        # FIXME: move this to a Utils module
        def to_h_flattened
          Chronicle::ETL::Utils::HashUtilities.flatten_hash(to_h)
        end

        # Hash representation: identifier, attributes, associations and meta,
        # merged in that order (later keys win on collision).
        #
        # @return [Hash]
        def to_h
          identifier_hash
            .merge(attributes)
            .merge(associations_hash)
            .merge(meta_hash)
        end

        # Assign every key of the hash that has a matching public setter.
        #
        # @param attributes [Hash] attribute name => value
        def assign_attributes(attributes)
          attributes.each do |key, value|
            setter = :"#{key}="
            public_send(setter, value) if respond_to?(setter)
          end
        end
      end
    end
  end
end
|
@@ -1,29 +0,0 @@
|
|
1
|
-
require 'chronicle/etl/models/base'
|
2
|
-
|
3
|
-
module Chronicle
  module ETL
    module Models
      # Model with type 'entities'. Declares its own attribute and association
      # names on top of those inherited from Base and exposes an accessor for
      # each of them.
      class Entity < Chronicle::ETL::Models::Base
        TYPE = 'entities'.freeze

        ATTRIBUTES = %i[
          title
          body
          provider_url
          represents
          slug
          myself
          metadata
        ].freeze

        # TODO: This desperately needs a validation system
        # TODO: add these to reflect Chronicle Schema
        #
        # Inverse pairs:
        #   involvements - inverse of activity's `involved`
        #   aboutables   - inverse of abouts
        #   containers   - inverse of contains
        ASSOCIATIONS = %i[
          involvements
          analogous
          attachments
          abouts
          aboutables
          depicts
          consumers
          creators
          creations
          contains
          containers
        ].freeze

        attr_accessor(*ATTRIBUTES, *ASSOCIATIONS)
      end
    end
  end
end
|
@@ -1,26 +0,0 @@
|
|
1
|
-
require 'chronicle/etl/models/base'
|
2
|
-
|
3
|
-
module Chronicle
  module ETL
    module Models
      # A record from an extraction with no processing or normalization applied
      class Raw
        TYPE = 'raw'

        attr_accessor :raw_data

        # @param raw_data [#to_h] the untouched extracted data
        def initialize(raw_data)
          @raw_data = raw_data
        end

        # @return [Hash] the raw data converted with `to_h`
        def to_h
          raw_data.to_h
        end

        # @return [Hash] `to_h` passed through HashUtilities.flatten_hash
        def to_h_flattened
          plain = to_h
          Chronicle::ETL::Utils::HashUtilities.flatten_hash(plain)
        end
      end
    end
  end
end
|
@@ -1,95 +0,0 @@
|
|
1
|
-
require 'rubygems'
|
2
|
-
require 'rubygems/command'
|
3
|
-
require 'rubygems/commands/install_command'
|
4
|
-
require 'rubygems/uninstaller'
|
5
|
-
|
6
|
-
module Chronicle
  module ETL
    module Registry
      # Responsible for managing plugins available to chronicle-etl
      #
      # @todo Better validation for whether a gem is actually a plugin
      # @todo Add ways to load a plugin that don't require a gem on rubygems.org
      module PluginRegistry
        # Gem-name prefix shared by every plugin gem.
        GEM_PREFIX = 'chronicle-'.freeze

        class << self
          # Start of a system for having non-gem plugins. Right now, we just
          # make the registry aware of the existence of the name of a non-gem
          # plugin.
          #
          # @param name [String] plugin name (without the gem prefix)
          def register_standalone(name)
            standalones << name
          end

          # @return [Array<String>] names registered via register_standalone
          def standalones
            @standalones ||= []
          end
        end

        # Does this plugin exist?
        #
        # @return [true] always, until a real check is implemented
        def self.exists?(name)
          # TODO: implement this. Could query rubygems.org or use a hardcoded
          # list somewhere
          true
        end

        # All versions of all plugins currently installed
        #
        # @return [Array<Gem::Specification>] every installed gem whose name
        #   starts with the plugin prefix, except chronicle-etl itself
        def self.all_installed
          # TODO: add check for chronicle-etl dependency
          Gem::Specification.select do |spec|
            spec.name.start_with?(GEM_PREFIX) && spec.name != "chronicle-etl"
          end
        end

        # Latest version of each installed plugin
        #
        # @return [Array<Gem::Specification>] one spec per plugin name
        def self.all_installed_latest
          all_installed
            .group_by(&:name)
            .values
            .map { |versions| versions.max_by(&:version) }
        end

        # Check whether a given plugin is installed
        #
        # @param name [String] plugin name (without the gem prefix)
        # @return [Boolean]
        def self.installed?(name)
          # Only the leading prefix is removed; a later "chronicle-" inside the
          # gem name is part of the plugin name and must be kept.
          gem_plugins = all_installed.map { |gem| gem.name.delete_prefix(GEM_PREFIX) }
          (standalones + gem_plugins).include?(name)
        end

        # Activate a plugin with given name by `require`ing it
        #
        # @param name [String] plugin name (without the gem prefix)
        # @raise [Chronicle::ETL::PluginConflictError] on a gem version conflict
        # @raise [Chronicle::ETL::PluginLoadError] on any other failure while
        #   requiring the plugin
        def self.activate(name)
          # By default, activates the latest available version of a gem
          # so don't have to run Kernel#gem separately
          require "chronicle/#{name}"
        rescue Gem::ConflictError => e
          # TODO: figure out if there's more we can do here
          raise Chronicle::ETL::PluginConflictError.new(name), "Plugin '#{name}' couldn't be loaded. #{e.message}"
        rescue StandardError, LoadError
          # StandardError to catch random non-loading problems that might occur
          # when requiring the plugin (eg class macro invoked the wrong way)
          # TODO: decide if this should be separated
          raise Chronicle::ETL::PluginLoadError.new(name), "Plugin '#{name}' couldn't be loaded"
        end

        # Install a plugin to local gems and activate it. Does nothing when the
        # plugin is already installed.
        #
        # @param name [String] plugin name (without the gem prefix)
        # @raise [Chronicle::ETL::PluginNotAvailableError] if the plugin does
        #   not exist or its dependencies cannot be satisfied
        def self.install(name)
          return if installed?(name)

          gem_name = "#{GEM_PREFIX}#{name}"
          unless exists?(gem_name)
            raise Chronicle::ETL::PluginNotAvailableError.new(name), "Plugin #{name} doesn't exist"
          end

          # NOTE: this replaces the process-wide RubyGems UI with a silent one.
          Gem::DefaultUserInteraction.ui = Gem::SilentUI.new
          Gem.install(gem_name)

          activate(name)
        rescue Gem::UnsatisfiableDependencyError
          # TODO: we need to catch a lot more than this here
          raise Chronicle::ETL::PluginNotAvailableError.new(name), "Plugin #{name} could not be installed."
        end

        # Uninstall a plugin
        #
        # @param name [String] plugin name (without the gem prefix)
        # @raise [Chronicle::ETL::PluginError] if RubyGems reports an install error
        def self.uninstall(name)
          gem_name = "#{GEM_PREFIX}#{name}"
          Gem::DefaultUserInteraction.ui = Gem::SilentUI.new
          Gem::Uninstaller.new(gem_name).uninstall
        rescue Gem::InstallError
          # TODO: strengthen this exception handling
          raise(Chronicle::ETL::PluginError.new(name), "Plugin #{name} wasn't uninstalled")
        end
      end
    end
  end
end
|
@@ -1,31 +0,0 @@
|
|
1
|
-
module Chronicle
  module ETL
    # Serializes a Chronicle::ETL::Models::Base record into a hash made of the
    # record's identifier, an `attributes` member, a `relationships` member
    # and the record's meta hash.
    class JSONAPISerializer < Chronicle::ETL::Serializer
      # @raise [SerializationError] if the record is not a Models::Base
      def initialize(*args)
        super

        return if @record.is_a?(Chronicle::ETL::Models::Base)

        # The message names the class that is actually checked above.
        raise SerializationError, "Record must be a subclass of Chronicle::ETL::Models::Base"
      end

      # @return [Hash] identifier hash merged with `:attributes`,
      #   `:relationships` and the record's meta hash, in that order
      def serializable_hash
        @record
          .identifier_hash
          .merge({ attributes: @record.attributes })
          .merge({ relationships: build_associations })
          .merge(@record.meta_hash)
      end

      # Serialize every association of the record with this same serializer.
      #
      # @return [Hash] association name => `{ data: <hash or array of hashes> }`
      def build_associations
        @record.associations.transform_values do |value|
          association_data =
            if value.is_a?(Array)
              value.map { |record| JSONAPISerializer.new(record).serializable_hash }
            else
              JSONAPISerializer.new(value).serializable_hash
            end
          { data: association_data }
        end
      end
    end
  end
end
|
@@ -1,28 +0,0 @@
|
|
1
|
-
module Chronicle
  module ETL
    # Abstract class representing a Serializer for an ETL record
    class Serializer
      class << self
        # Build a serializer for the record and return its hash form.
        #
        # @param record [Object] the record to serialize
        # @return [Hash]
        def serialize(record)
          new(record).serializable_hash
        end
      end

      # Construct a new instance of this serializer.
      #
      # @param record [Object] the record to serialize
      # @param options [Hash] options for configuring this serializer
      def initialize(record, options = {})
        @record = record
        @options = options
      end

      # Serialize a record as a hash. Subclasses must override this.
      #
      # @raise [NotImplementedError] always, in this abstract class
      def serializable_hash
        raise NotImplementedError
      end
    end
  end
end
|
26
|
-
|
27
|
-
require_relative 'jsonapi_serializer'
|
28
|
-
require_relative 'raw_serializer'
|
@@ -1,247 +0,0 @@
|
|
1
|
-
require 'mini_exiftool'
|
2
|
-
require 'active_support'
|
3
|
-
require 'active_support/core_ext/object'
|
4
|
-
require 'active_support/core_ext/time'
|
5
|
-
require 'active_support/core_ext/hash/reverse_merge'
|
6
|
-
require 'active_support/core_ext/string/inflections'
|
7
|
-
|
8
|
-
module Chronicle
  module ETL
    # Transform a JPEG or other image file into a record.
    # By default, file mtime and a hash of the file content is used to build
    # the timestamp and ID respectively but other options are available (such
    # as reading EXIF tags or extended attributes from the filesystem).
    #
    # TODO: This should be extracted into its own plugin
    class ImageFileTransformer < Chronicle::ETL::Transformer
      register_connector do |r|
        r.identifier = 'image-file'
        r.description = 'an image file'
      end

      setting :timestamp_strategy, default: 'file_mtime'
      setting :id_strategy, default: 'file_hash'
      setting :verb, default: 'photographed'
      # EXIF tags often don't have timezones
      setting :timezone_default, default: 'Eastern Time (US & Canada)'
      setting :include_image_data, default: true
      setting :actor
      setting :involved

      # Build an Activity record for the image file named by the extraction.
      #
      # @return [Chronicle::ETL::Models::Activity]
      def transform
        # FIXME: set @filename; use block for reading file when necessary
        @file = File.open(@extraction.data)
        begin
          build_created(@file)
        ensure
          # Close the handle even when building the record raises.
          @file.close
        end
      end

      # @return [String] path of the file being transformed
      def friendly_identifier
        @file.path
      end

      # Identifier for the image, built with the configured id strategy and
      # memoized.
      #
      # @raise [UntransformableRecordError] if no strategy produced an id
      def id
        @id ||= begin
          id = build_with_strategy(field: :id, strategy: @config.id_strategy)
          raise(UntransformableRecordError, "Could not build id") unless id

          id
        end
      end

      # Timestamp for the image, built with the configured timestamp strategy
      # and memoized.
      #
      # @raise [UntransformableRecordError] if no strategy produced a timestamp
      def timestamp
        @timestamp ||= begin
          ts = build_with_strategy(field: :timestamp, strategy: @config.timestamp_strategy)
          raise(UntransformableRecordError, "Could not build timestamp") unless ts

          ts
        end
      end

      private

      # Assemble the activity: verb, provider, id, end time, the image entity
      # as `involved`, the actor, and GPS attributes when present.
      def build_created(file)
        record = ::Chronicle::ETL::Models::Activity.new
        record.verb = @config.verb
        record.provider = @config.provider
        record.provider_id = id
        record.end_at = timestamp
        record.dedupe_on = [[:provider_id, :verb, :provider]]

        record.involved = build_image
        record.actor = build_actor

        record.assign_attributes(build_gps)
        record
      end

      # Entity for the configured actor (provider and slug come from config).
      def build_actor
        actor = ::Chronicle::ETL::Models::Entity.new
        actor.represents = 'identity'
        actor.provider = @config.actor[:provider]
        actor.slug = @config.actor[:slug]
        actor.dedupe_on = [[:provider, :slug, :represents]]
        actor
      end

      # Entity for the image itself, including depicted people, keywords,
      # optional OCR text and optional image data attachment.
      def build_image
        image = ::Chronicle::ETL::Models::Entity.new
        image.represents = @config.involved[:represents]
        image.title = build_title
        image.body = exif['Description']
        image.provider = @config.involved[:provider]
        image.provider_id = id
        image.assign_attributes(build_gps)
        image.dedupe_on = [[:provider, :provider_id, :represents]]

        # NOTE(review): `ocr_strategy` is read here but no matching `setting`
        # is declared in this class — confirm the config object tolerates that.
        if @config.ocr_strategy
          ocr_text = build_with_strategy(field: :ocr, strategy: @config.ocr_strategy)
          image.metadata[:ocr_text] = ocr_text if ocr_text
        end

        names = extract_people_depicted
        tags = extract_keywords(names)

        image.depicts = build_people_depicted(names)
        image.abouts = build_keywords(tags)

        if @config.include_image_data
          attachment = ::Chronicle::ETL::Models::Attachment.new
          attachment.data = build_image_data
          image.attachments = [attachment]
        end

        image
      end

      # One 'topic' entity per keyword.
      def build_keywords(topics)
        topics.map do |topic|
          t = ::Chronicle::ETL::Models::Entity.new
          t.represents = 'topic'
          t.provider = @config.involved[:provider]
          t.title = topic
          t.slug = topic.parameterize
          t.dedupe_on = [[:provider, :represents, :slug]]
          t
        end
      end

      # One 'identity' entity per depicted person name.
      def build_people_depicted(names)
        names.map do |name|
          identity = ::Chronicle::ETL::Models::Entity.new
          identity.represents = 'identity'
          identity.provider = @config.involved[:provider]
          identity.slug = name.parameterize
          identity.title = name
          identity.dedupe_on = [[:provider, :represents, :slug]]
          identity
        end
      end

      # @return [Hash] lat/lng/elevation read from EXIF GPS tags, or an empty
      #   hash when there is no GPSLatitude tag
      def build_gps
        return {} unless exif['GPSLatitude']

        {
          lat: exif['GPSLatitude'],
          lng: exif['GPSLongitude'],
          elevation: exif['GPSAltitude']
        }
      end

      def build_image_data
        ::Chronicle::ETL::Utils::BinaryAttachments.filename_to_base64(filename: @file.path)
      end

      def build_title
        File.basename(@file)
      end

      # Try each strategy in turn by calling `build_<field>_using_<strategy>`
      # and return the first truthy result.
      #
      # NOTE(review): the method name is built from configuration and
      # dispatched with `send`; an unknown strategy raises NoMethodError.
      #
      # @param field [Symbol] e.g. :id, :timestamp, :ocr
      # @param strategy [String, Array<String>, nil] one or more strategy names
      # @return [Object, nil] nil when no strategy produced a value
      def build_with_strategy(field:, strategy: [])
        strategies = [strategy].flatten.compact
        strategies.each do |s|
          result = send(:"build_#{field}_using_#{s}")
          return result if result
        end
        nil
      end

      # SHA256 of the file contents, streamed from disk rather than read into
      # memory in one piece.
      def build_id_using_file_hash
        Digest::SHA256.file(@file.path).hexdigest
      end

      def build_id_using_xattr_version
        load_value_from_xattr_plist("com.apple.metadata:kMDItemVersion")
      end

      def build_id_using_xmp_document_id
        exif['OriginalDocumentID'] || exif['DerivedFromDocumentID']
      end

      def build_timestamp_using_file_mtime
        File.mtime(@file)
      end

      # @return [Object, nil] the EXIF DateTimeOriginal with an offset applied,
      #   or nil when the tag is missing
      def build_timestamp_using_exif_datetimeoriginal
        # EXIF tags don't have timezone information. This is a DateTime in UTC
        timestamp = exif['DateTimeOriginal'] || return

        offset = exif['OffsetTimeOriginal']
        if offset
          # Offset tags are only available in newer EXIF tags. If it exists, we
          # use it instead of UTC
          timestamp.change(offset: offset)
        else
          # TODO: support option of using GPS coordinates to determine timezone
          zone = ActiveSupport::TimeZone.new(@config.timezone_default)
          zone.parse(timestamp.asctime)
        end
      end

      # TODO: add documentation for how to set up `macocr`
      def build_ocr_using_macocr
        require 'shellwords'
        # The path is escaped so quotes or `$(...)` in a filename cannot alter
        # the shell command.
        `macocr #{Shellwords.escape(@file.path)} 2>/dev/null`.presence
      end

      # Memoized EXIF reader for the current file.
      def exif
        @exif ||= MiniExiftool.new(
          @file.path,
          numerical: true,

          # EXIF timestamps don't have timezone information. MiniExifTool uses Time
          # by default which parses timestamps in local time zone. Using DateTime
          # parses dates as UTC and then we can apply a timezone offset if the optional
          # EXIF timezone offset fields are available.
          # https://github.com/janfri/mini_exiftool/issues/39#issuecomment-832587649
          timestamps: DateTime
        )
      end

      # Figure out which faces are tagged as regions and return a list of their names
      def extract_people_depicted
        return [] unless exif['RegionName']

        names = [exif['RegionName']].flatten
        types = [exif['RegionType']].flatten

        names.zip(types).select { |_name, type| type == 'Face' }.map(&:first).uniq
      end

      # Extract image keywords from EXIF/IPTC tag and subtract out those of which are
      # tagged people (determined by looking at face regions)
      def extract_keywords(people_names = [])
        [exif['Keywords'] || []].flatten - people_names
      end

      # Read an extended attribute stored as a plist and return the text of its
      # `<string>` elements.
      #
      # @param attribute [String] extended attribute name
      # @return [String, nil] nil when the command produced no output or the
      #   extracted text is blank
      def load_value_from_xattr_plist(attribute)
        require 'nokogiri'
        require 'shellwords'

        xml = `xattr -p #{Shellwords.escape(attribute)} #{Shellwords.escape(@file.path)} | xxd -r -p | plutil -convert xml1 -o - -- - 2>/dev/null`
        # Backticks always return a String, so test for emptiness, not nil.
        return if xml.empty?

        # Parse the command output (this previously referenced an undefined
        # local and raised NameError).
        Nokogiri::XML.parse(xml).xpath("//string").text.presence
      end
    end
  end
end
|
@@ -1,19 +0,0 @@
|
|
1
|
-
module Chronicle
  module ETL
    module Utils
      # Helpers for reshaping hashes.
      module HashUtilities
        # Collapse a nested hash into a single level, joining the keys of
        # nested hashes with a dot.
        #
        # Keys produced by joining are always Symbols (`:"a.b"`); keys whose
        # value is not a Hash are kept exactly as they were. A nested hash
        # that is empty contributes no keys at all.
        #
        # @example
        #   flatten_hash({ a: { b: 1 }, c: 2 }) #=> { :"a.b" => 1, :c => 2 }
        #
        # @param hash [Hash] possibly nested hash
        # @return [Hash] new single-level hash; the input is not modified
        def self.flatten_hash(hash)
          hash.each_with_object({}) do |(key, value), flat|
            if value.is_a?(Hash)
              # `each`, not `map`: we only want the side effect on `flat`.
              flatten_hash(value).each do |nested_key, nested_value|
                flat[:"#{key}.#{nested_key}"] = nested_value
              end
            else
              flat[key] = value
            end
          end
        end
      end
    end
  end
end
|
@@ -1,15 +0,0 @@
|
|
1
|
-
require 'active_support/core_ext/object/blank'
|
2
|
-
|
3
|
-
module Chronicle
  module ETL
    module Utils
      # OCR for image files
      # TODO: add other strategies and document `macocr`
      module TextRecognition
        # Run the `macocr` command on an image and return what it prints.
        #
        # @param filename [String] path of the image file
        # @return [String, nil] the command's standard output, or nil when the
        #   output is blank
        def self.recognize_in_image(filename:)
          require 'shellwords'

          # The filename is shell-escaped: interpolating it inside double
          # quotes let a name containing `"`, `;` or `$(...)` run arbitrary
          # commands.
          `macocr #{Shellwords.escape(filename)} 2>/dev/null`.presence
        end
      end
    end
  end
end
|