chronicle-etl 0.3.0 → 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/ruby.yml +35 -0
- data/.rubocop.yml +28 -1
- data/Guardfile +7 -0
- data/README.md +149 -85
- data/Rakefile +4 -2
- data/chronicle-etl.gemspec +10 -5
- data/exe/chronicle-etl +1 -1
- data/lib/chronicle/etl/cli/connectors.rb +34 -0
- data/lib/chronicle/etl/cli/jobs.rb +44 -12
- data/lib/chronicle/etl/cli/main.rb +13 -19
- data/lib/chronicle/etl/cli/subcommand_base.rb +2 -2
- data/lib/chronicle/etl/cli.rb +7 -0
- data/lib/chronicle/etl/configurable.rb +158 -0
- data/lib/chronicle/etl/exceptions.rb +7 -1
- data/lib/chronicle/etl/extractors/csv_extractor.rb +24 -23
- data/lib/chronicle/etl/extractors/extractor.rb +23 -19
- data/lib/chronicle/etl/extractors/file_extractor.rb +34 -11
- data/lib/chronicle/etl/extractors/helpers/input_reader.rb +76 -0
- data/lib/chronicle/etl/extractors/json_extractor.rb +19 -18
- data/lib/chronicle/etl/job.rb +1 -1
- data/lib/chronicle/etl/job_definition.rb +1 -1
- data/lib/chronicle/etl/loaders/csv_loader.rb +1 -1
- data/lib/chronicle/etl/loaders/json_loader.rb +44 -0
- data/lib/chronicle/etl/loaders/loader.rb +5 -2
- data/lib/chronicle/etl/loaders/rest_loader.rb +5 -5
- data/lib/chronicle/etl/loaders/table_loader.rb +21 -24
- data/lib/chronicle/etl/logger.rb +1 -0
- data/lib/chronicle/etl/models/base.rb +3 -0
- data/lib/chronicle/etl/models/entity.rb +8 -2
- data/lib/chronicle/etl/models/raw.rb +26 -0
- data/lib/chronicle/etl/registry/connector_registration.rb +1 -0
- data/lib/chronicle/etl/runner.rb +6 -4
- data/lib/chronicle/etl/serializers/jsonapi_serializer.rb +6 -0
- data/lib/chronicle/etl/serializers/raw_serializer.rb +10 -0
- data/lib/chronicle/etl/serializers/serializer.rb +2 -1
- data/lib/chronicle/etl/transformers/image_file_transformer.rb +22 -28
- data/lib/chronicle/etl/transformers/null_transformer.rb +1 -1
- data/lib/chronicle/etl/transformers/transformer.rb +3 -2
- data/lib/chronicle/etl/version.rb +1 -1
- data/lib/chronicle/etl.rb +12 -4
- metadata +80 -19
- data/.ruby-version +0 -1
- data/lib/chronicle/etl/extractors/helpers/filesystem_reader.rb +0 -104
- data/lib/chronicle/etl/loaders/stdout_loader.rb +0 -14
- data/lib/chronicle/etl/models/generic.rb +0 -23
@@ -0,0 +1,44 @@
|
|
1
|
+
module Chronicle
|
2
|
+
module ETL
|
3
|
+
class JSONLoader < Chronicle::ETL::Loader
|
4
|
+
register_connector do |r|
|
5
|
+
r.description = 'json'
|
6
|
+
end
|
7
|
+
|
8
|
+
setting :serializer
|
9
|
+
setting :output, default: $stdout
|
10
|
+
|
11
|
+
def start
|
12
|
+
if @config.output == $stdout
|
13
|
+
@output = @config.output
|
14
|
+
else
|
15
|
+
@output = File.open(@config.output, "w")
|
16
|
+
end
|
17
|
+
end
|
18
|
+
|
19
|
+
def load(record)
|
20
|
+
serialized = serializer.serialize(record)
|
21
|
+
|
22
|
+
# When dealing with raw data, we can get improperly encoded strings
|
23
|
+
# (eg from sqlite database columns). We force conversion to UTF-8
|
24
|
+
# before converting into JSON
|
25
|
+
encoded = serialized.transform_values do |value|
|
26
|
+
next value unless value.is_a?(String)
|
27
|
+
|
28
|
+
value.encode('UTF-8', invalid: :replace, undef: :replace, replace: '?')
|
29
|
+
end
|
30
|
+
@output.puts encoded.to_json
|
31
|
+
end
|
32
|
+
|
33
|
+
def finish
|
34
|
+
@output.close
|
35
|
+
end
|
36
|
+
|
37
|
+
private
|
38
|
+
|
39
|
+
def serializer
|
40
|
+
@config.serializer || Chronicle::ETL::RawSerializer
|
41
|
+
end
|
42
|
+
end
|
43
|
+
end
|
44
|
+
end
|
@@ -3,13 +3,16 @@ module Chronicle
|
|
3
3
|
# Abstract class representing a Loader for an ETL job
|
4
4
|
class Loader
|
5
5
|
extend Chronicle::ETL::Registry::SelfRegistering
|
6
|
+
include Chronicle::ETL::Configurable
|
7
|
+
|
8
|
+
setting :output
|
6
9
|
|
7
10
|
# Construct a new instance of this loader. Options are passed in from a Runner
|
8
11
|
# == Parameters:
|
9
12
|
# options::
|
10
13
|
# Options for configuring this Loader
|
11
14
|
def initialize(options = {})
|
12
|
-
|
15
|
+
apply_options(options)
|
13
16
|
end
|
14
17
|
|
15
18
|
# Called once before processing records
|
@@ -27,6 +30,6 @@ module Chronicle
|
|
27
30
|
end
|
28
31
|
|
29
32
|
require_relative 'csv_loader'
|
33
|
+
require_relative 'json_loader'
|
30
34
|
require_relative 'rest_loader'
|
31
|
-
require_relative 'stdout_loader'
|
32
35
|
require_relative 'table_loader'
|
@@ -9,19 +9,19 @@ module Chronicle
|
|
9
9
|
r.description = 'a REST endpoint'
|
10
10
|
end
|
11
11
|
|
12
|
-
|
13
|
-
|
14
|
-
|
12
|
+
setting :hostname, required: true
|
13
|
+
setting :endpoint, required: true
|
14
|
+
setting :access_token
|
15
15
|
|
16
16
|
def load(record)
|
17
17
|
payload = Chronicle::ETL::JSONAPISerializer.serialize(record)
|
18
18
|
# have the outer data key that json-api expects
|
19
19
|
payload = { data: payload } unless payload[:data]
|
20
20
|
|
21
|
-
uri = URI.parse("#{@
|
21
|
+
uri = URI.parse("#{@config.hostname}#{@config.endpoint}")
|
22
22
|
|
23
23
|
header = {
|
24
|
-
"Authorization" => "Bearer #{@
|
24
|
+
"Authorization" => "Bearer #{@config.access_token}",
|
25
25
|
"Content-Type": 'application/json'
|
26
26
|
}
|
27
27
|
use_ssl = uri.scheme == 'https'
|
@@ -9,59 +9,56 @@ module Chronicle
|
|
9
9
|
r.description = 'an ASCII table'
|
10
10
|
end
|
11
11
|
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
table_renderer: :basic
|
18
|
-
}.freeze
|
19
|
-
|
20
|
-
def initialize(options={})
|
21
|
-
@options = options.reverse_merge(DEFAULT_OPTIONS)
|
22
|
-
@records = []
|
23
|
-
end
|
12
|
+
setting :fields_limit, default: nil
|
13
|
+
setting :fields_exclude, default: ['lids', 'type']
|
14
|
+
setting :fields, default: []
|
15
|
+
setting :truncate_values_at, default: 40
|
16
|
+
setting :table_renderer, default: :basic
|
24
17
|
|
25
18
|
def load(record)
|
26
|
-
|
19
|
+
records << record.to_h_flattened
|
27
20
|
end
|
28
21
|
|
29
22
|
def finish
|
30
|
-
return if
|
23
|
+
return if records.empty?
|
31
24
|
|
32
|
-
headers = build_headers(
|
33
|
-
rows = build_rows(
|
25
|
+
headers = build_headers(records)
|
26
|
+
rows = build_rows(records, headers)
|
34
27
|
|
35
28
|
@table = TTY::Table.new(header: headers, rows: rows)
|
36
29
|
puts @table.render(
|
37
|
-
@
|
30
|
+
@config.table_renderer.to_sym,
|
38
31
|
padding: [0, 2, 0, 0]
|
39
32
|
)
|
40
33
|
end
|
41
34
|
|
35
|
+
def records
|
36
|
+
@records ||= []
|
37
|
+
end
|
38
|
+
|
42
39
|
private
|
43
40
|
|
44
41
|
def build_headers(records)
|
45
42
|
headers =
|
46
|
-
if @
|
47
|
-
Set[*@
|
43
|
+
if @config.fields.any?
|
44
|
+
Set[*@config.fields]
|
48
45
|
else
|
49
46
|
# use all the keys of the flattened record hash
|
50
47
|
Set[*records.map(&:keys).flatten.map(&:to_s).uniq]
|
51
48
|
end
|
52
49
|
|
53
|
-
headers = headers.delete_if { |header| header.end_with?(*@
|
54
|
-
headers = headers.first(@
|
50
|
+
headers = headers.delete_if { |header| header.end_with?(*@config.fields_exclude) } if @config.fields_exclude.any?
|
51
|
+
headers = headers.first(@config.fields_limit) if @config.fields_limit
|
55
52
|
|
56
53
|
headers.to_a.map(&:to_sym)
|
57
54
|
end
|
58
55
|
|
59
56
|
def build_rows(records, headers)
|
60
57
|
records.map do |record|
|
61
|
-
values = record.values_at(*headers).map{|value| value.to_s }
|
58
|
+
values = record.transform_keys(&:to_sym).values_at(*headers).map{|value| value.to_s }
|
62
59
|
|
63
|
-
if @
|
64
|
-
values = values.map{ |value| value.truncate(@
|
60
|
+
if @config.truncate_values_at
|
61
|
+
values = values.map{ |value| value.truncate(@config.truncate_values_at) }
|
65
62
|
end
|
66
63
|
|
67
64
|
values
|
data/lib/chronicle/etl/logger.rb
CHANGED
@@ -5,6 +5,9 @@ module Chronicle
|
|
5
5
|
module Models
|
6
6
|
# Represents a record that's been transformed by a Transformer and
|
7
7
|
# ready to be loaded. Loosely based on ActiveModel.
|
8
|
+
#
|
9
|
+
# @todo Experiment with just mixing in ActiveModel instead of this
|
10
|
+
# reimplementation
|
8
11
|
class Base
|
9
12
|
ATTRIBUTES = [:provider, :provider_id, :lat, :lng, :metadata].freeze
|
10
13
|
ASSOCIATIONS = [].freeze
|
@@ -5,13 +5,19 @@ module Chronicle
|
|
5
5
|
module Models
|
6
6
|
class Entity < Chronicle::ETL::Models::Base
|
7
7
|
TYPE = 'entities'.freeze
|
8
|
-
ATTRIBUTES = [:title, :body, :represents, :slug, :myself, :metadata].freeze
|
8
|
+
ATTRIBUTES = [:title, :body, :provider_url, :represents, :slug, :myself, :metadata].freeze
|
9
|
+
|
10
|
+
# TODO: This desperately needs a validation system
|
9
11
|
ASSOCIATIONS = [
|
12
|
+
:involvements, # inverse of activity's `involved`
|
13
|
+
|
10
14
|
:attachments,
|
11
15
|
:abouts,
|
16
|
+
:aboutables, # inverse of above
|
12
17
|
:depicts,
|
13
18
|
:consumers,
|
14
|
-
:contains
|
19
|
+
:contains,
|
20
|
+
:containers # inverse of above
|
15
21
|
].freeze # TODO: add these to reflect Chronicle Schema
|
16
22
|
|
17
23
|
attr_accessor(*ATTRIBUTES, *ASSOCIATIONS)
|
@@ -0,0 +1,26 @@
|
|
1
|
+
require 'chronicle/etl/models/base'
|
2
|
+
|
3
|
+
module Chronicle
|
4
|
+
module ETL
|
5
|
+
module Models
|
6
|
+
# A record from an extraction with no processing or normalization applied
|
7
|
+
class Raw
|
8
|
+
TYPE = 'raw'
|
9
|
+
|
10
|
+
attr_accessor :raw_data
|
11
|
+
|
12
|
+
def initialize(raw_data)
|
13
|
+
@raw_data = raw_data
|
14
|
+
end
|
15
|
+
|
16
|
+
def to_h
|
17
|
+
@raw_data.to_h
|
18
|
+
end
|
19
|
+
|
20
|
+
def to_h_flattened
|
21
|
+
Chronicle::ETL::Utils::HashUtilities.flatten_hash(to_h)
|
22
|
+
end
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end
|
26
|
+
end
|
@@ -3,6 +3,7 @@ module Chronicle
|
|
3
3
|
module Registry
|
4
4
|
# Records details about a connector such as its provider and a description
|
5
5
|
class ConnectorRegistration
|
6
|
+
# FIXME: refactor custom accessor methods later in file
|
6
7
|
attr_accessor :identifier, :provider, :klass, :description
|
7
8
|
|
8
9
|
def initialize(klass)
|
data/lib/chronicle/etl/runner.rb
CHANGED
@@ -14,6 +14,7 @@ class Chronicle::ETL::Runner
|
|
14
14
|
@job_logger.start
|
15
15
|
loader.start
|
16
16
|
|
17
|
+
extractor.prepare
|
17
18
|
total = extractor.results_count
|
18
19
|
@progress_bar = Chronicle::ETL::Utils::ProgressBar.new(title: 'Running job', total: total)
|
19
20
|
Chronicle::ETL::Logger.attach_to_progress_bar(@progress_bar)
|
@@ -27,9 +28,10 @@ class Chronicle::ETL::Runner
|
|
27
28
|
transformer = @job.instantiate_transformer(extraction)
|
28
29
|
record = transformer.transform
|
29
30
|
|
30
|
-
|
31
|
-
|
32
|
-
|
31
|
+
# TODO: rethink this
|
32
|
+
# unless record.is_a?(Chronicle::ETL::Models)
|
33
|
+
# raise Chronicle::ETL::RunnerTypeError, "Transformed data should be a type of Chronicle::ETL::Models"
|
34
|
+
# end
|
33
35
|
|
34
36
|
Chronicle::ETL::Logger.info(tty_log_transformation(transformer))
|
35
37
|
@job_logger.log_transformation(transformer)
|
@@ -51,7 +53,7 @@ class Chronicle::ETL::Runner
|
|
51
53
|
raise e
|
52
54
|
ensure
|
53
55
|
@job_logger.save
|
54
|
-
@progress_bar
|
56
|
+
@progress_bar&.finish
|
55
57
|
Chronicle::ETL::Logger.detach_from_progress_bar
|
56
58
|
Chronicle::ETL::Logger.info(tty_log_completion)
|
57
59
|
end
|
@@ -1,6 +1,12 @@
|
|
1
1
|
module Chronicle
|
2
2
|
module ETL
|
3
3
|
class JSONAPISerializer < Chronicle::ETL::Serializer
|
4
|
+
def initialize(*args)
|
5
|
+
super
|
6
|
+
|
7
|
+
raise(SerializationError, "Record must be a subclass of Chronicle::ETL::Model::Base") unless @record.is_a?(Chronicle::ETL::Models::Base)
|
8
|
+
end
|
9
|
+
|
4
10
|
def serializable_hash
|
5
11
|
@record
|
6
12
|
.identifier_hash
|
@@ -19,20 +19,14 @@ module Chronicle
|
|
19
19
|
r.description = 'an image file'
|
20
20
|
end
|
21
21
|
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
}.freeze
|
31
|
-
|
32
|
-
def initialize(*args)
|
33
|
-
super(*args)
|
34
|
-
@options = @options.reverse_merge(DEFAULT_OPTIONS)
|
35
|
-
end
|
22
|
+
setting :timestamp_strategy, default: 'file_mtime'
|
23
|
+
setting :id_strategy, default: 'file_hash'
|
24
|
+
setting :verb, default: 'photographed'
|
25
|
+
# EXIF tags often don't have timezones
|
26
|
+
setting :timezone_default, default: 'Eastern Time (US & Canada)'
|
27
|
+
setting :include_image_data, default: true
|
28
|
+
setting :actor
|
29
|
+
setting :involved
|
36
30
|
|
37
31
|
def transform
|
38
32
|
# FIXME: set @filename; use block for reading file when necessary
|
@@ -48,7 +42,7 @@ module Chronicle
|
|
48
42
|
|
49
43
|
def id
|
50
44
|
@id ||= begin
|
51
|
-
id = build_with_strategy(field: :id, strategy: @
|
45
|
+
id = build_with_strategy(field: :id, strategy: @config.id_strategy)
|
52
46
|
raise UntransformableRecordError.new("Could not build id", transformation: self) unless id
|
53
47
|
|
54
48
|
id
|
@@ -57,7 +51,7 @@ module Chronicle
|
|
57
51
|
|
58
52
|
def timestamp
|
59
53
|
@timestamp ||= begin
|
60
|
-
ts = build_with_strategy(field: :timestamp, strategy: @
|
54
|
+
ts = build_with_strategy(field: :timestamp, strategy: @config.timestamp_strategy)
|
61
55
|
raise UntransformableRecordError.new("Could not build timestamp", transformation: self) unless ts
|
62
56
|
|
63
57
|
ts
|
@@ -68,8 +62,8 @@ module Chronicle
|
|
68
62
|
|
69
63
|
def build_created(file)
|
70
64
|
record = ::Chronicle::ETL::Models::Activity.new
|
71
|
-
record.verb = @
|
72
|
-
record.provider = @
|
65
|
+
record.verb = @config.verb
|
66
|
+
record.provider = @config.provider
|
73
67
|
record.provider_id = id
|
74
68
|
record.end_at = timestamp
|
75
69
|
record.dedupe_on = [[:provider_id, :verb, :provider]]
|
@@ -84,24 +78,24 @@ module Chronicle
|
|
84
78
|
def build_actor
|
85
79
|
actor = ::Chronicle::ETL::Models::Entity.new
|
86
80
|
actor.represents = 'identity'
|
87
|
-
actor.provider = @
|
88
|
-
actor.slug = @
|
81
|
+
actor.provider = @config.actor[:provider]
|
82
|
+
actor.slug = @config.actor[:slug]
|
89
83
|
actor.dedupe_on = [[:provider, :slug, :represents]]
|
90
84
|
actor
|
91
85
|
end
|
92
86
|
|
93
87
|
def build_image
|
94
88
|
image = ::Chronicle::ETL::Models::Entity.new
|
95
|
-
image.represents = @
|
89
|
+
image.represents = @config.involved[:represents]
|
96
90
|
image.title = build_title
|
97
91
|
image.body = exif['Description']
|
98
|
-
image.provider = @
|
92
|
+
image.provider = @config.involved[:provider]
|
99
93
|
image.provider_id = id
|
100
94
|
image.assign_attributes(build_gps)
|
101
95
|
image.dedupe_on = [[:provider, :provider_id, :represents]]
|
102
96
|
|
103
|
-
if @
|
104
|
-
ocr_text = build_with_strategy(field: :ocr, strategy: @
|
97
|
+
if @config.ocr_strategy
|
98
|
+
ocr_text = build_with_strategy(field: :ocr, strategy: @config.ocr_strategy)
|
105
99
|
image.metadata[:ocr_text] = ocr_text if ocr_text
|
106
100
|
end
|
107
101
|
|
@@ -111,7 +105,7 @@ module Chronicle
|
|
111
105
|
image.depicts = build_people_depicted(names)
|
112
106
|
image.abouts = build_keywords(tags)
|
113
107
|
|
114
|
-
if @
|
108
|
+
if @config.include_image_data
|
115
109
|
attachment = ::Chronicle::ETL::Models::Attachment.new
|
116
110
|
attachment.data = build_image_data
|
117
111
|
image.attachments = [attachment]
|
@@ -124,7 +118,7 @@ module Chronicle
|
|
124
118
|
topics.map do |topic|
|
125
119
|
t = ::Chronicle::ETL::Models::Entity.new
|
126
120
|
t.represents = 'topic'
|
127
|
-
t.provider = @
|
121
|
+
t.provider = @config.involved[:provider]
|
128
122
|
t.title = topic
|
129
123
|
t.slug = topic.parameterize
|
130
124
|
t.dedupe_on = [[:provider, :represents, :slug]]
|
@@ -136,7 +130,7 @@ module Chronicle
|
|
136
130
|
names.map do |name|
|
137
131
|
identity = ::Chronicle::ETL::Models::Entity.new
|
138
132
|
identity.represents = 'identity'
|
139
|
-
identity.provider = @
|
133
|
+
identity.provider = @config.involved[:provider]
|
140
134
|
identity.slug = name.parameterize
|
141
135
|
identity.title = name
|
142
136
|
identity.dedupe_on = [[:provider, :represents, :slug]]
|
@@ -199,7 +193,7 @@ module Chronicle
|
|
199
193
|
elsif false
|
200
194
|
# TODO: support option of using GPS coordinates to determine timezone
|
201
195
|
else
|
202
|
-
zone = ActiveSupport::TimeZone.new(@
|
196
|
+
zone = ActiveSupport::TimeZone.new(@config.timezone_default)
|
203
197
|
timestamp = zone.parse(timestamp.asctime)
|
204
198
|
end
|
205
199
|
|
@@ -3,14 +3,15 @@ module Chronicle
|
|
3
3
|
# Abstract class representing a Transformer for an ETL job
|
4
4
|
class Transformer
|
5
5
|
extend Chronicle::ETL::Registry::SelfRegistering
|
6
|
+
include Chronicle::ETL::Configurable
|
6
7
|
|
7
8
|
# Construct a new instance of this transformer. Options are passed in from a Runner
|
8
9
|
# == Parameters:
|
9
10
|
# options::
|
10
11
|
# Options for configuring this Transformer
|
11
|
-
def initialize(options = {}
|
12
|
-
@options = options
|
12
|
+
def initialize(extraction, options = {})
|
13
13
|
@extraction = extraction
|
14
|
+
apply_options(options)
|
14
15
|
end
|
15
16
|
|
16
17
|
# @abstract Subclass is expected to implement #transform
|
data/lib/chronicle/etl.rb
CHANGED
@@ -1,24 +1,32 @@
|
|
1
1
|
require_relative 'etl/registry/registry'
|
2
2
|
require_relative 'etl/config'
|
3
|
+
require_relative 'etl/configurable'
|
3
4
|
require_relative 'etl/exceptions'
|
4
5
|
require_relative 'etl/extraction'
|
5
|
-
require_relative 'etl/extractors/extractor'
|
6
6
|
require_relative 'etl/job_definition'
|
7
7
|
require_relative 'etl/job_log'
|
8
8
|
require_relative 'etl/job_logger'
|
9
9
|
require_relative 'etl/job'
|
10
|
-
require_relative 'etl/loaders/loader'
|
11
10
|
require_relative 'etl/logger'
|
12
11
|
require_relative 'etl/models/activity'
|
13
12
|
require_relative 'etl/models/attachment'
|
14
13
|
require_relative 'etl/models/base'
|
14
|
+
require_relative 'etl/models/raw'
|
15
15
|
require_relative 'etl/models/entity'
|
16
|
-
require_relative 'etl/models/generic'
|
17
16
|
require_relative 'etl/runner'
|
18
17
|
require_relative 'etl/serializers/serializer'
|
19
|
-
require_relative 'etl/transformers/transformer'
|
20
18
|
require_relative 'etl/utils/binary_attachments'
|
21
19
|
require_relative 'etl/utils/hash_utilities'
|
22
20
|
require_relative 'etl/utils/text_recognition'
|
23
21
|
require_relative 'etl/utils/progress_bar'
|
24
22
|
require_relative 'etl/version'
|
23
|
+
|
24
|
+
require_relative 'etl/extractors/extractor'
|
25
|
+
require_relative 'etl/loaders/loader'
|
26
|
+
require_relative 'etl/transformers/transformer'
|
27
|
+
|
28
|
+
begin
|
29
|
+
require 'pry'
|
30
|
+
rescue LoadError
|
31
|
+
# Pry not available
|
32
|
+
end
|