chronicle-etl 0.3.0 → 0.4.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.github/workflows/ruby.yml +35 -0
- data/.rubocop.yml +28 -1
- data/Guardfile +7 -0
- data/README.md +149 -85
- data/Rakefile +4 -2
- data/chronicle-etl.gemspec +10 -5
- data/exe/chronicle-etl +1 -1
- data/lib/chronicle/etl/cli/connectors.rb +34 -0
- data/lib/chronicle/etl/cli/jobs.rb +44 -12
- data/lib/chronicle/etl/cli/main.rb +13 -19
- data/lib/chronicle/etl/cli/subcommand_base.rb +2 -2
- data/lib/chronicle/etl/cli.rb +7 -0
- data/lib/chronicle/etl/configurable.rb +158 -0
- data/lib/chronicle/etl/exceptions.rb +7 -1
- data/lib/chronicle/etl/extractors/csv_extractor.rb +24 -23
- data/lib/chronicle/etl/extractors/extractor.rb +23 -19
- data/lib/chronicle/etl/extractors/file_extractor.rb +34 -11
- data/lib/chronicle/etl/extractors/helpers/input_reader.rb +76 -0
- data/lib/chronicle/etl/extractors/json_extractor.rb +19 -18
- data/lib/chronicle/etl/job.rb +1 -1
- data/lib/chronicle/etl/job_definition.rb +1 -1
- data/lib/chronicle/etl/loaders/csv_loader.rb +1 -1
- data/lib/chronicle/etl/loaders/json_loader.rb +44 -0
- data/lib/chronicle/etl/loaders/loader.rb +5 -2
- data/lib/chronicle/etl/loaders/rest_loader.rb +5 -5
- data/lib/chronicle/etl/loaders/table_loader.rb +21 -24
- data/lib/chronicle/etl/logger.rb +1 -0
- data/lib/chronicle/etl/models/base.rb +3 -0
- data/lib/chronicle/etl/models/entity.rb +8 -2
- data/lib/chronicle/etl/models/raw.rb +26 -0
- data/lib/chronicle/etl/registry/connector_registration.rb +1 -0
- data/lib/chronicle/etl/runner.rb +6 -4
- data/lib/chronicle/etl/serializers/jsonapi_serializer.rb +6 -0
- data/lib/chronicle/etl/serializers/raw_serializer.rb +10 -0
- data/lib/chronicle/etl/serializers/serializer.rb +2 -1
- data/lib/chronicle/etl/transformers/image_file_transformer.rb +22 -28
- data/lib/chronicle/etl/transformers/null_transformer.rb +1 -1
- data/lib/chronicle/etl/transformers/transformer.rb +3 -2
- data/lib/chronicle/etl/version.rb +1 -1
- data/lib/chronicle/etl.rb +12 -4
- metadata +80 -19
- data/.ruby-version +0 -1
- data/lib/chronicle/etl/extractors/helpers/filesystem_reader.rb +0 -104
- data/lib/chronicle/etl/loaders/stdout_loader.rb +0 -14
- data/lib/chronicle/etl/models/generic.rb +0 -23
@@ -0,0 +1,44 @@
|
|
1
|
+
module Chronicle
  module ETL
    # Loader that serializes each record and writes it as JSON, one `puts`
    # call per record, to the configured output.
    #
    # Settings:
    # serializer:: object responding to `serialize(record)`; falls back to
    #              Chronicle::ETL::RawSerializer when not configured
    # output::     `$stdout` (default) or a value accepted by `File.open`
    class JSONLoader < Chronicle::ETL::Loader
      register_connector do |r|
        r.description = 'json'
      end

      setting :serializer
      setting :output, default: $stdout

      # Called once before processing records. Selects the output stream and
      # remembers whether this loader opened it (and so must close it).
      def start
        @owns_output = @config.output != $stdout
        @output = @owns_output ? File.open(@config.output, 'w') : @config.output
      end

      # Serialize a single record and write it to the output as JSON.
      #
      # == Parameters:
      # record::
      #   The record to serialize; passed to the configured serializer
      def load(record)
        serialized = serializer.serialize(record)

        # When dealing with raw data, we can get improperly encoded strings
        # (eg from sqlite database columns). We force conversion to UTF-8
        # before converting into JSON
        encoded = serialized.transform_values { |value| force_utf8(value) }
        @output.puts encoded.to_json
      end

      # Called once after all records are processed. Closes the output only
      # when this loader opened it: closing $stdout would break every later
      # write to standard output made by the rest of the process.
      def finish
        @output.close if @owns_output && @output
      end

      private

      # Serializer used for records: the configured one, or the raw
      # serializer by default.
      def serializer
        @config.serializer || Chronicle::ETL::RawSerializer
      end

      # Returns +value+ unchanged unless it is a String; strings are converted
      # to valid UTF-8 with '?' standing in for bad bytes.
      #
      # String#encode is a no-op when the string is already tagged as UTF-8,
      # even if it contains invalid bytes, so #scrub is applied afterwards to
      # cover that case as well.
      def force_utf8(value)
        return value unless value.is_a?(String)

        value
          .encode('UTF-8', invalid: :replace, undef: :replace, replace: '?')
          .scrub('?')
      end
    end
  end
end
|
@@ -3,13 +3,16 @@ module Chronicle
|
|
3
3
|
# Abstract class representing a Loader for an ETL job
|
4
4
|
class Loader
|
5
5
|
extend Chronicle::ETL::Registry::SelfRegistering
|
6
|
+
include Chronicle::ETL::Configurable
|
7
|
+
|
8
|
+
setting :output
|
6
9
|
|
7
10
|
# Construct a new instance of this loader. Options are passed in from a Runner
|
8
11
|
# == Parameters:
|
9
12
|
# options::
|
10
13
|
# Options for configuring this Loader
|
11
14
|
def initialize(options = {})
|
12
|
-
|
15
|
+
apply_options(options)
|
13
16
|
end
|
14
17
|
|
15
18
|
# Called once before processing records
|
@@ -27,6 +30,6 @@ module Chronicle
|
|
27
30
|
end
|
28
31
|
|
29
32
|
require_relative 'csv_loader'
|
33
|
+
require_relative 'json_loader'
|
30
34
|
require_relative 'rest_loader'
|
31
|
-
require_relative 'stdout_loader'
|
32
35
|
require_relative 'table_loader'
|
@@ -9,19 +9,19 @@ module Chronicle
|
|
9
9
|
r.description = 'a REST endpoint'
|
10
10
|
end
|
11
11
|
|
12
|
-
|
13
|
-
|
14
|
-
|
12
|
+
setting :hostname, required: true
|
13
|
+
setting :endpoint, required: true
|
14
|
+
setting :access_token
|
15
15
|
|
16
16
|
def load(record)
|
17
17
|
payload = Chronicle::ETL::JSONAPISerializer.serialize(record)
|
18
18
|
# have the outer data key that json-api expects
|
19
19
|
payload = { data: payload } unless payload[:data]
|
20
20
|
|
21
|
-
uri = URI.parse("#{@
|
21
|
+
uri = URI.parse("#{@config.hostname}#{@config.endpoint}")
|
22
22
|
|
23
23
|
header = {
|
24
|
-
"Authorization" => "Bearer #{@
|
24
|
+
"Authorization" => "Bearer #{@config.access_token}",
|
25
25
|
"Content-Type": 'application/json'
|
26
26
|
}
|
27
27
|
use_ssl = uri.scheme == 'https'
|
@@ -9,59 +9,56 @@ module Chronicle
|
|
9
9
|
r.description = 'an ASCII table'
|
10
10
|
end
|
11
11
|
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
table_renderer: :basic
|
18
|
-
}.freeze
|
19
|
-
|
20
|
-
def initialize(options={})
|
21
|
-
@options = options.reverse_merge(DEFAULT_OPTIONS)
|
22
|
-
@records = []
|
23
|
-
end
|
12
|
+
setting :fields_limit, default: nil
|
13
|
+
setting :fields_exclude, default: ['lids', 'type']
|
14
|
+
setting :fields, default: []
|
15
|
+
setting :truncate_values_at, default: 40
|
16
|
+
setting :table_renderer, default: :basic
|
24
17
|
|
25
18
|
def load(record)
|
26
|
-
|
19
|
+
records << record.to_h_flattened
|
27
20
|
end
|
28
21
|
|
29
22
|
def finish
|
30
|
-
return if
|
23
|
+
return if records.empty?
|
31
24
|
|
32
|
-
headers = build_headers(
|
33
|
-
rows = build_rows(
|
25
|
+
headers = build_headers(records)
|
26
|
+
rows = build_rows(records, headers)
|
34
27
|
|
35
28
|
@table = TTY::Table.new(header: headers, rows: rows)
|
36
29
|
puts @table.render(
|
37
|
-
@
|
30
|
+
@config.table_renderer.to_sym,
|
38
31
|
padding: [0, 2, 0, 0]
|
39
32
|
)
|
40
33
|
end
|
41
34
|
|
35
|
+
def records
|
36
|
+
@records ||= []
|
37
|
+
end
|
38
|
+
|
42
39
|
private
|
43
40
|
|
44
41
|
def build_headers(records)
|
45
42
|
headers =
|
46
|
-
if @
|
47
|
-
Set[*@
|
43
|
+
if @config.fields.any?
|
44
|
+
Set[*@config.fields]
|
48
45
|
else
|
49
46
|
# use all the keys of the flattened record hash
|
50
47
|
Set[*records.map(&:keys).flatten.map(&:to_s).uniq]
|
51
48
|
end
|
52
49
|
|
53
|
-
headers = headers.delete_if { |header| header.end_with?(*@
|
54
|
-
headers = headers.first(@
|
50
|
+
headers = headers.delete_if { |header| header.end_with?(*@config.fields_exclude) } if @config.fields_exclude.any?
|
51
|
+
headers = headers.first(@config.fields_limit) if @config.fields_limit
|
55
52
|
|
56
53
|
headers.to_a.map(&:to_sym)
|
57
54
|
end
|
58
55
|
|
59
56
|
def build_rows(records, headers)
|
60
57
|
records.map do |record|
|
61
|
-
values = record.values_at(*headers).map{|value| value.to_s }
|
58
|
+
values = record.transform_keys(&:to_sym).values_at(*headers).map{|value| value.to_s }
|
62
59
|
|
63
|
-
if @
|
64
|
-
values = values.map{ |value| value.truncate(@
|
60
|
+
if @config.truncate_values_at
|
61
|
+
values = values.map{ |value| value.truncate(@config.truncate_values_at) }
|
65
62
|
end
|
66
63
|
|
67
64
|
values
|
data/lib/chronicle/etl/logger.rb
CHANGED
@@ -5,6 +5,9 @@ module Chronicle
|
|
5
5
|
module Models
|
6
6
|
# Represents a record that's been transformed by a Transformer and
|
7
7
|
# ready to be loaded. Loosely based on ActiveModel.
|
8
|
+
#
|
9
|
+
# @todo Experiment with just mixing in ActiveModel instead of this
|
10
|
+
# reimplementation
|
8
11
|
class Base
|
9
12
|
ATTRIBUTES = [:provider, :provider_id, :lat, :lng, :metadata].freeze
|
10
13
|
ASSOCIATIONS = [].freeze
|
@@ -5,13 +5,19 @@ module Chronicle
|
|
5
5
|
module Models
|
6
6
|
class Entity < Chronicle::ETL::Models::Base
|
7
7
|
TYPE = 'entities'.freeze
|
8
|
-
ATTRIBUTES = [:title, :body, :represents, :slug, :myself, :metadata].freeze
|
8
|
+
ATTRIBUTES = [:title, :body, :provider_url, :represents, :slug, :myself, :metadata].freeze
|
9
|
+
|
10
|
+
# TODO: This desperately needs a validation system
|
9
11
|
ASSOCIATIONS = [
|
12
|
+
:involvements, # inverse of activity's `involved`
|
13
|
+
|
10
14
|
:attachments,
|
11
15
|
:abouts,
|
16
|
+
:aboutables, # inverse of above
|
12
17
|
:depicts,
|
13
18
|
:consumers,
|
14
|
-
:contains
|
19
|
+
:contains,
|
20
|
+
:containers # inverse of above
|
15
21
|
].freeze # TODO: add these to reflect Chronicle Schema
|
16
22
|
|
17
23
|
attr_accessor(*ATTRIBUTES, *ASSOCIATIONS)
|
@@ -0,0 +1,26 @@
|
|
1
|
+
require 'chronicle/etl/models/base'
|
2
|
+
|
3
|
+
module Chronicle
  module ETL
    module Models
      # A record from an extraction with no processing or normalization applied
      class Raw
        # Type identifier for this model. Frozen, like the TYPE constants of
        # the other models, so the shared string cannot be mutated in place.
        TYPE = 'raw'.freeze

        # The wrapped extraction data, exactly as it was passed in.
        attr_accessor :raw_data

        # == Parameters:
        # raw_data::
        #   The unprocessed data to wrap; must respond to `to_h` for the
        #   hash conversions below to work
        def initialize(raw_data)
          @raw_data = raw_data
        end

        # Hash representation of the wrapped data (delegates to its `to_h`).
        def to_h
          @raw_data.to_h
        end

        # Hash representation passed through HashUtilities.flatten_hash.
        def to_h_flattened
          Chronicle::ETL::Utils::HashUtilities.flatten_hash(to_h)
        end
      end
    end
  end
end
|
@@ -3,6 +3,7 @@ module Chronicle
|
|
3
3
|
module Registry
|
4
4
|
# Records details about a connector such as its provider and a description
|
5
5
|
class ConnectorRegistration
|
6
|
+
# FIXME: refactor custom accessor methods later in file
|
6
7
|
attr_accessor :identifier, :provider, :klass, :description
|
7
8
|
|
8
9
|
def initialize(klass)
|
data/lib/chronicle/etl/runner.rb
CHANGED
@@ -14,6 +14,7 @@ class Chronicle::ETL::Runner
|
|
14
14
|
@job_logger.start
|
15
15
|
loader.start
|
16
16
|
|
17
|
+
extractor.prepare
|
17
18
|
total = extractor.results_count
|
18
19
|
@progress_bar = Chronicle::ETL::Utils::ProgressBar.new(title: 'Running job', total: total)
|
19
20
|
Chronicle::ETL::Logger.attach_to_progress_bar(@progress_bar)
|
@@ -27,9 +28,10 @@ class Chronicle::ETL::Runner
|
|
27
28
|
transformer = @job.instantiate_transformer(extraction)
|
28
29
|
record = transformer.transform
|
29
30
|
|
30
|
-
|
31
|
-
|
32
|
-
|
31
|
+
# TODO: rethink this
|
32
|
+
# unless record.is_a?(Chronicle::ETL::Models)
|
33
|
+
# raise Chronicle::ETL::RunnerTypeError, "Transformed data should be a type of Chronicle::ETL::Models"
|
34
|
+
# end
|
33
35
|
|
34
36
|
Chronicle::ETL::Logger.info(tty_log_transformation(transformer))
|
35
37
|
@job_logger.log_transformation(transformer)
|
@@ -51,7 +53,7 @@ class Chronicle::ETL::Runner
|
|
51
53
|
raise e
|
52
54
|
ensure
|
53
55
|
@job_logger.save
|
54
|
-
@progress_bar
|
56
|
+
@progress_bar&.finish
|
55
57
|
Chronicle::ETL::Logger.detach_from_progress_bar
|
56
58
|
Chronicle::ETL::Logger.info(tty_log_completion)
|
57
59
|
end
|
@@ -1,6 +1,12 @@
|
|
1
1
|
module Chronicle
|
2
2
|
module ETL
|
3
3
|
class JSONAPISerializer < Chronicle::ETL::Serializer
|
4
|
+
# Builds the serializer and rejects records this serializer cannot handle.
# All arguments are forwarded unchanged to the superclass initializer.
#
# Raises SerializationError unless @record is a
# Chronicle::ETL::Models::Base (or an instance of one of its subclasses).
# NOTE(review): @record is presumably assigned by the superclass
# initializer — confirm against Serializer#initialize.
def initialize(*args)
  super

  return if @record.is_a?(Chronicle::ETL::Models::Base)

  raise SerializationError, "Record must be an instance of Chronicle::ETL::Models::Base"
end
|
9
|
+
|
4
10
|
def serializable_hash
|
5
11
|
@record
|
6
12
|
.identifier_hash
|
@@ -19,20 +19,14 @@ module Chronicle
|
|
19
19
|
r.description = 'an image file'
|
20
20
|
end
|
21
21
|
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
}.freeze
|
31
|
-
|
32
|
-
def initialize(*args)
|
33
|
-
super(*args)
|
34
|
-
@options = @options.reverse_merge(DEFAULT_OPTIONS)
|
35
|
-
end
|
22
|
+
setting :timestamp_strategy, default: 'file_mtime'
|
23
|
+
setting :id_strategy, default: 'file_hash'
|
24
|
+
setting :verb, default: 'photographed'
|
25
|
+
# EXIF tags often don't have timezones
|
26
|
+
setting :timezone_default, default: 'Eastern Time (US & Canada)'
|
27
|
+
setting :include_image_data, default: true
|
28
|
+
setting :actor
|
29
|
+
setting :involved
|
36
30
|
|
37
31
|
def transform
|
38
32
|
# FIXME: set @filename; use block for reading file when necessary
|
@@ -48,7 +42,7 @@ module Chronicle
|
|
48
42
|
|
49
43
|
def id
|
50
44
|
@id ||= begin
|
51
|
-
id = build_with_strategy(field: :id, strategy: @
|
45
|
+
id = build_with_strategy(field: :id, strategy: @config.id_strategy)
|
52
46
|
raise UntransformableRecordError.new("Could not build id", transformation: self) unless id
|
53
47
|
|
54
48
|
id
|
@@ -57,7 +51,7 @@ module Chronicle
|
|
57
51
|
|
58
52
|
def timestamp
|
59
53
|
@timestamp ||= begin
|
60
|
-
ts = build_with_strategy(field: :timestamp, strategy: @
|
54
|
+
ts = build_with_strategy(field: :timestamp, strategy: @config.timestamp_strategy)
|
61
55
|
raise UntransformableRecordError.new("Could not build timestamp", transformation: self) unless ts
|
62
56
|
|
63
57
|
ts
|
@@ -68,8 +62,8 @@ module Chronicle
|
|
68
62
|
|
69
63
|
def build_created(file)
|
70
64
|
record = ::Chronicle::ETL::Models::Activity.new
|
71
|
-
record.verb = @
|
72
|
-
record.provider = @
|
65
|
+
record.verb = @config.verb
|
66
|
+
record.provider = @config.provider
|
73
67
|
record.provider_id = id
|
74
68
|
record.end_at = timestamp
|
75
69
|
record.dedupe_on = [[:provider_id, :verb, :provider]]
|
@@ -84,24 +78,24 @@ module Chronicle
|
|
84
78
|
def build_actor
|
85
79
|
actor = ::Chronicle::ETL::Models::Entity.new
|
86
80
|
actor.represents = 'identity'
|
87
|
-
actor.provider = @
|
88
|
-
actor.slug = @
|
81
|
+
actor.provider = @config.actor[:provider]
|
82
|
+
actor.slug = @config.actor[:slug]
|
89
83
|
actor.dedupe_on = [[:provider, :slug, :represents]]
|
90
84
|
actor
|
91
85
|
end
|
92
86
|
|
93
87
|
def build_image
|
94
88
|
image = ::Chronicle::ETL::Models::Entity.new
|
95
|
-
image.represents = @
|
89
|
+
image.represents = @config.involved[:represents]
|
96
90
|
image.title = build_title
|
97
91
|
image.body = exif['Description']
|
98
|
-
image.provider = @
|
92
|
+
image.provider = @config.involved[:provider]
|
99
93
|
image.provider_id = id
|
100
94
|
image.assign_attributes(build_gps)
|
101
95
|
image.dedupe_on = [[:provider, :provider_id, :represents]]
|
102
96
|
|
103
|
-
if @
|
104
|
-
ocr_text = build_with_strategy(field: :ocr, strategy: @
|
97
|
+
if @config.ocr_strategy
|
98
|
+
ocr_text = build_with_strategy(field: :ocr, strategy: @config.ocr_strategy)
|
105
99
|
image.metadata[:ocr_text] = ocr_text if ocr_text
|
106
100
|
end
|
107
101
|
|
@@ -111,7 +105,7 @@ module Chronicle
|
|
111
105
|
image.depicts = build_people_depicted(names)
|
112
106
|
image.abouts = build_keywords(tags)
|
113
107
|
|
114
|
-
if @
|
108
|
+
if @config.include_image_data
|
115
109
|
attachment = ::Chronicle::ETL::Models::Attachment.new
|
116
110
|
attachment.data = build_image_data
|
117
111
|
image.attachments = [attachment]
|
@@ -124,7 +118,7 @@ module Chronicle
|
|
124
118
|
topics.map do |topic|
|
125
119
|
t = ::Chronicle::ETL::Models::Entity.new
|
126
120
|
t.represents = 'topic'
|
127
|
-
t.provider = @
|
121
|
+
t.provider = @config.involved[:provider]
|
128
122
|
t.title = topic
|
129
123
|
t.slug = topic.parameterize
|
130
124
|
t.dedupe_on = [[:provider, :represents, :slug]]
|
@@ -136,7 +130,7 @@ module Chronicle
|
|
136
130
|
names.map do |name|
|
137
131
|
identity = ::Chronicle::ETL::Models::Entity.new
|
138
132
|
identity.represents = 'identity'
|
139
|
-
identity.provider = @
|
133
|
+
identity.provider = @config.involved[:provider]
|
140
134
|
identity.slug = name.parameterize
|
141
135
|
identity.title = name
|
142
136
|
identity.dedupe_on = [[:provider, :represents, :slug]]
|
@@ -199,7 +193,7 @@ module Chronicle
|
|
199
193
|
elsif false
|
200
194
|
# TODO: support option of using GPS coordinates to determine timezone
|
201
195
|
else
|
202
|
-
zone = ActiveSupport::TimeZone.new(@
|
196
|
+
zone = ActiveSupport::TimeZone.new(@config.timezone_default)
|
203
197
|
timestamp = zone.parse(timestamp.asctime)
|
204
198
|
end
|
205
199
|
|
@@ -3,14 +3,15 @@ module Chronicle
|
|
3
3
|
# Abstract class representing a Transformer for an ETL job
|
4
4
|
class Transformer
|
5
5
|
extend Chronicle::ETL::Registry::SelfRegistering
|
6
|
+
include Chronicle::ETL::Configurable
|
6
7
|
|
7
8
|
# Construct a new instance of this transformer. Options are passed in from a Runner
|
8
9
|
# == Parameters:
|
9
10
|
# options::
|
10
11
|
# Options for configuring this Transformer
|
11
|
-
def initialize(options = {}
|
12
|
-
@options = options
|
12
|
+
def initialize(extraction, options = {})
|
13
13
|
@extraction = extraction
|
14
|
+
apply_options(options)
|
14
15
|
end
|
15
16
|
|
16
17
|
# @abstract Subclass is expected to implement #transform
|
data/lib/chronicle/etl.rb
CHANGED
@@ -1,24 +1,32 @@
|
|
1
1
|
require_relative 'etl/registry/registry'
|
2
2
|
require_relative 'etl/config'
|
3
|
+
require_relative 'etl/configurable'
|
3
4
|
require_relative 'etl/exceptions'
|
4
5
|
require_relative 'etl/extraction'
|
5
|
-
require_relative 'etl/extractors/extractor'
|
6
6
|
require_relative 'etl/job_definition'
|
7
7
|
require_relative 'etl/job_log'
|
8
8
|
require_relative 'etl/job_logger'
|
9
9
|
require_relative 'etl/job'
|
10
|
-
require_relative 'etl/loaders/loader'
|
11
10
|
require_relative 'etl/logger'
|
12
11
|
require_relative 'etl/models/activity'
|
13
12
|
require_relative 'etl/models/attachment'
|
14
13
|
require_relative 'etl/models/base'
|
14
|
+
require_relative 'etl/models/raw'
|
15
15
|
require_relative 'etl/models/entity'
|
16
|
-
require_relative 'etl/models/generic'
|
17
16
|
require_relative 'etl/runner'
|
18
17
|
require_relative 'etl/serializers/serializer'
|
19
|
-
require_relative 'etl/transformers/transformer'
|
20
18
|
require_relative 'etl/utils/binary_attachments'
|
21
19
|
require_relative 'etl/utils/hash_utilities'
|
22
20
|
require_relative 'etl/utils/text_recognition'
|
23
21
|
require_relative 'etl/utils/progress_bar'
|
24
22
|
require_relative 'etl/version'
|
23
|
+
|
24
|
+
require_relative 'etl/extractors/extractor'
|
25
|
+
require_relative 'etl/loaders/loader'
|
26
|
+
require_relative 'etl/transformers/transformer'
|
27
|
+
|
28
|
+
begin
|
29
|
+
require 'pry'
|
30
|
+
rescue LoadError
|
31
|
+
# Pry not available
|
32
|
+
end
|