chronicle-etl 0.3.0 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. checksums.yaml +4 -4
  2. data/.github/workflows/ruby.yml +35 -0
  3. data/.rubocop.yml +28 -1
  4. data/Guardfile +7 -0
  5. data/README.md +149 -85
  6. data/Rakefile +4 -2
  7. data/chronicle-etl.gemspec +10 -5
  8. data/exe/chronicle-etl +1 -1
  9. data/lib/chronicle/etl/cli/connectors.rb +34 -0
  10. data/lib/chronicle/etl/cli/jobs.rb +44 -12
  11. data/lib/chronicle/etl/cli/main.rb +13 -19
  12. data/lib/chronicle/etl/cli/subcommand_base.rb +2 -2
  13. data/lib/chronicle/etl/cli.rb +7 -0
  14. data/lib/chronicle/etl/configurable.rb +158 -0
  15. data/lib/chronicle/etl/exceptions.rb +7 -1
  16. data/lib/chronicle/etl/extractors/csv_extractor.rb +24 -23
  17. data/lib/chronicle/etl/extractors/extractor.rb +23 -19
  18. data/lib/chronicle/etl/extractors/file_extractor.rb +34 -11
  19. data/lib/chronicle/etl/extractors/helpers/input_reader.rb +76 -0
  20. data/lib/chronicle/etl/extractors/json_extractor.rb +19 -18
  21. data/lib/chronicle/etl/job.rb +1 -1
  22. data/lib/chronicle/etl/job_definition.rb +1 -1
  23. data/lib/chronicle/etl/loaders/csv_loader.rb +1 -1
  24. data/lib/chronicle/etl/loaders/json_loader.rb +44 -0
  25. data/lib/chronicle/etl/loaders/loader.rb +5 -2
  26. data/lib/chronicle/etl/loaders/rest_loader.rb +5 -5
  27. data/lib/chronicle/etl/loaders/table_loader.rb +21 -24
  28. data/lib/chronicle/etl/logger.rb +1 -0
  29. data/lib/chronicle/etl/models/base.rb +3 -0
  30. data/lib/chronicle/etl/models/entity.rb +8 -2
  31. data/lib/chronicle/etl/models/raw.rb +26 -0
  32. data/lib/chronicle/etl/registry/connector_registration.rb +1 -0
  33. data/lib/chronicle/etl/runner.rb +6 -4
  34. data/lib/chronicle/etl/serializers/jsonapi_serializer.rb +6 -0
  35. data/lib/chronicle/etl/serializers/raw_serializer.rb +10 -0
  36. data/lib/chronicle/etl/serializers/serializer.rb +2 -1
  37. data/lib/chronicle/etl/transformers/image_file_transformer.rb +22 -28
  38. data/lib/chronicle/etl/transformers/null_transformer.rb +1 -1
  39. data/lib/chronicle/etl/transformers/transformer.rb +3 -2
  40. data/lib/chronicle/etl/version.rb +1 -1
  41. data/lib/chronicle/etl.rb +12 -4
  42. metadata +80 -19
  43. data/.ruby-version +0 -1
  44. data/lib/chronicle/etl/extractors/helpers/filesystem_reader.rb +0 -104
  45. data/lib/chronicle/etl/loaders/stdout_loader.rb +0 -14
  46. data/lib/chronicle/etl/models/generic.rb +0 -23
@@ -0,0 +1,44 @@
1
+ module Chronicle
2
+ module ETL
3
+ class JSONLoader < Chronicle::ETL::Loader
4
+ register_connector do |r|
5
+ r.description = 'json'
6
+ end
7
+
8
+ setting :serializer
9
+ setting :output, default: $stdout
10
+
11
+ def start
12
+ if @config.output == $stdout
13
+ @output = @config.output
14
+ else
15
+ @output = File.open(@config.output, "w")
16
+ end
17
+ end
18
+
19
+ def load(record)
20
+ serialized = serializer.serialize(record)
21
+
22
+ # When dealing with raw data, we can get improperly encoded strings
23
+ # (e.g. from SQLite database columns). We force conversion to UTF-8
24
+ # before converting into JSON
25
+ encoded = serialized.transform_values do |value|
26
+ next value unless value.is_a?(String)
27
+
28
+ value.encode('UTF-8', invalid: :replace, undef: :replace, replace: '?')
29
+ end
30
+ @output.puts encoded.to_json
31
+ end
32
+
33
+ def finish
34
+ @output.close
35
+ end
36
+
37
+ private
38
+
39
+ def serializer
40
+ @config.serializer || Chronicle::ETL::RawSerializer
41
+ end
42
+ end
43
+ end
44
+ end
@@ -3,13 +3,16 @@ module Chronicle
3
3
  # Abstract class representing a Loader for an ETL job
4
4
  class Loader
5
5
  extend Chronicle::ETL::Registry::SelfRegistering
6
+ include Chronicle::ETL::Configurable
7
+
8
+ setting :output
6
9
 
7
10
  # Construct a new instance of this loader. Options are passed in from a Runner
8
11
  # == Parameters:
9
12
  # options::
10
13
  # Options for configuring this Loader
11
14
  def initialize(options = {})
12
- @options = options
15
+ apply_options(options)
13
16
  end
14
17
 
15
18
  # Called once before processing records
@@ -27,6 +30,6 @@ module Chronicle
27
30
  end
28
31
 
29
32
  require_relative 'csv_loader'
33
+ require_relative 'json_loader'
30
34
  require_relative 'rest_loader'
31
- require_relative 'stdout_loader'
32
35
  require_relative 'table_loader'
@@ -9,19 +9,19 @@ module Chronicle
9
9
  r.description = 'a REST endpoint'
10
10
  end
11
11
 
12
- def initialize( options={} )
13
- super(options)
14
- end
12
+ setting :hostname, required: true
13
+ setting :endpoint, required: true
14
+ setting :access_token
15
15
 
16
16
  def load(record)
17
17
  payload = Chronicle::ETL::JSONAPISerializer.serialize(record)
18
18
  # have the outer data key that json-api expects
19
19
  payload = { data: payload } unless payload[:data]
20
20
 
21
- uri = URI.parse("#{@options[:hostname]}#{@options[:endpoint]}")
21
+ uri = URI.parse("#{@config.hostname}#{@config.endpoint}")
22
22
 
23
23
  header = {
24
- "Authorization" => "Bearer #{@options[:access_token]}",
24
+ "Authorization" => "Bearer #{@config.access_token}",
25
25
  "Content-Type": 'application/json'
26
26
  }
27
27
  use_ssl = uri.scheme == 'https'
@@ -9,59 +9,56 @@ module Chronicle
9
9
  r.description = 'an ASCII table'
10
10
  end
11
11
 
12
- DEFAULT_OPTIONS = {
13
- fields_limit: nil,
14
- fields_exclude: ['lids', 'type'],
15
- fields_include: [],
16
- truncate_values_at: nil,
17
- table_renderer: :basic
18
- }.freeze
19
-
20
- def initialize(options={})
21
- @options = options.reverse_merge(DEFAULT_OPTIONS)
22
- @records = []
23
- end
12
+ setting :fields_limit, default: nil
13
+ setting :fields_exclude, default: ['lids', 'type']
14
+ setting :fields, default: []
15
+ setting :truncate_values_at, default: 40
16
+ setting :table_renderer, default: :basic
24
17
 
25
18
  def load(record)
26
- @records << record.to_h_flattened
19
+ records << record.to_h_flattened
27
20
  end
28
21
 
29
22
  def finish
30
- return if @records.empty?
23
+ return if records.empty?
31
24
 
32
- headers = build_headers(@records)
33
- rows = build_rows(@records, headers)
25
+ headers = build_headers(records)
26
+ rows = build_rows(records, headers)
34
27
 
35
28
  @table = TTY::Table.new(header: headers, rows: rows)
36
29
  puts @table.render(
37
- @options[:table_renderer].to_sym,
30
+ @config.table_renderer.to_sym,
38
31
  padding: [0, 2, 0, 0]
39
32
  )
40
33
  end
41
34
 
35
+ def records
36
+ @records ||= []
37
+ end
38
+
42
39
  private
43
40
 
44
41
  def build_headers(records)
45
42
  headers =
46
- if @options[:fields_include].any?
47
- Set[*@options[:fields_include]]
43
+ if @config.fields.any?
44
+ Set[*@config.fields]
48
45
  else
49
46
  # use all the keys of the flattened record hash
50
47
  Set[*records.map(&:keys).flatten.map(&:to_s).uniq]
51
48
  end
52
49
 
53
- headers = headers.delete_if { |header| header.end_with?(*@options[:fields_exclude]) } if @options[:fields_exclude].any?
54
- headers = headers.first(@options[:fields_limit]) if @options[:fields_limit]
50
+ headers = headers.delete_if { |header| header.end_with?(*@config.fields_exclude) } if @config.fields_exclude.any?
51
+ headers = headers.first(@config.fields_limit) if @config.fields_limit
55
52
 
56
53
  headers.to_a.map(&:to_sym)
57
54
  end
58
55
 
59
56
  def build_rows(records, headers)
60
57
  records.map do |record|
61
- values = record.values_at(*headers).map{|value| value.to_s }
58
+ values = record.transform_keys(&:to_sym).values_at(*headers).map{|value| value.to_s }
62
59
 
63
- if @options[:truncate_values_at]
64
- values = values.map{ |value| value.truncate(@options[:truncate_values_at]) }
60
+ if @config.truncate_values_at
61
+ values = values.map{ |value| value.truncate(@config.truncate_values_at) }
65
62
  end
66
63
 
67
64
  values
@@ -8,6 +8,7 @@ module Chronicle
8
8
  WARN = 2
9
9
  ERROR = 3
10
10
  FATAL = 4
11
+ SILENT = 5
11
12
 
12
13
  attr_accessor :log_level
13
14
 
@@ -5,6 +5,9 @@ module Chronicle
5
5
  module Models
6
6
  # Represents a record that's been transformed by a Transformer and
7
7
  # ready to be loaded. Loosely based on ActiveModel.
8
+ #
9
+ # @todo Experiment with just mixing in ActiveModel instead of
10
+ # this reimplementation
8
11
  class Base
9
12
  ATTRIBUTES = [:provider, :provider_id, :lat, :lng, :metadata].freeze
10
13
  ASSOCIATIONS = [].freeze
@@ -5,13 +5,19 @@ module Chronicle
5
5
  module Models
6
6
  class Entity < Chronicle::ETL::Models::Base
7
7
  TYPE = 'entities'.freeze
8
- ATTRIBUTES = [:title, :body, :represents, :slug, :myself, :metadata].freeze
8
+ ATTRIBUTES = [:title, :body, :provider_url, :represents, :slug, :myself, :metadata].freeze
9
+
10
+ # TODO: This desperately needs a validation system
9
11
  ASSOCIATIONS = [
12
+ :involvements, # inverse of activity's `involved`
13
+
10
14
  :attachments,
11
15
  :abouts,
16
+ :aboutables, # inverse of above
12
17
  :depicts,
13
18
  :consumers,
14
- :contains
19
+ :contains,
20
+ :containers # inverse of above
15
21
  ].freeze # TODO: add these to reflect Chronicle Schema
16
22
 
17
23
  attr_accessor(*ATTRIBUTES, *ASSOCIATIONS)
@@ -0,0 +1,26 @@
1
+ require 'chronicle/etl/models/base'
2
+
3
+ module Chronicle
4
+ module ETL
5
+ module Models
6
+ # A record from an extraction with no processing or normalization applied
7
+ class Raw
8
+ TYPE = 'raw'
9
+
10
+ attr_accessor :raw_data
11
+
12
+ def initialize(raw_data)
13
+ @raw_data = raw_data
14
+ end
15
+
16
+ def to_h
17
+ @raw_data.to_h
18
+ end
19
+
20
+ def to_h_flattened
21
+ Chronicle::ETL::Utils::HashUtilities.flatten_hash(to_h)
22
+ end
23
+ end
24
+ end
25
+ end
26
+ end
@@ -3,6 +3,7 @@ module Chronicle
3
3
  module Registry
4
4
  # Records details about a connector such as its provider and a description
5
5
  class ConnectorRegistration
6
+ # FIXME: refactor custom accessor methods later in file
6
7
  attr_accessor :identifier, :provider, :klass, :description
7
8
 
8
9
  def initialize(klass)
@@ -14,6 +14,7 @@ class Chronicle::ETL::Runner
14
14
  @job_logger.start
15
15
  loader.start
16
16
 
17
+ extractor.prepare
17
18
  total = extractor.results_count
18
19
  @progress_bar = Chronicle::ETL::Utils::ProgressBar.new(title: 'Running job', total: total)
19
20
  Chronicle::ETL::Logger.attach_to_progress_bar(@progress_bar)
@@ -27,9 +28,10 @@ class Chronicle::ETL::Runner
27
28
  transformer = @job.instantiate_transformer(extraction)
28
29
  record = transformer.transform
29
30
 
30
- unless record.is_a?(Chronicle::ETL::Models::Base)
31
- raise Chronicle::ETL::RunnerTypeError, "Transformed data should be a type of Chronicle::ETL::Models"
32
- end
31
+ # TODO: rethink this
32
+ # unless record.is_a?(Chronicle::ETL::Models)
33
+ # raise Chronicle::ETL::RunnerTypeError, "Transformed data should be a type of Chronicle::ETL::Models"
34
+ # end
33
35
 
34
36
  Chronicle::ETL::Logger.info(tty_log_transformation(transformer))
35
37
  @job_logger.log_transformation(transformer)
@@ -51,7 +53,7 @@ class Chronicle::ETL::Runner
51
53
  raise e
52
54
  ensure
53
55
  @job_logger.save
54
- @progress_bar.finish
56
+ @progress_bar&.finish
55
57
  Chronicle::ETL::Logger.detach_from_progress_bar
56
58
  Chronicle::ETL::Logger.info(tty_log_completion)
57
59
  end
@@ -1,6 +1,12 @@
1
1
  module Chronicle
2
2
  module ETL
3
3
  class JSONAPISerializer < Chronicle::ETL::Serializer
4
+ def initialize(*args)
5
+ super
6
+
7
+ raise(SerializationError, "Record must be a subclass of Chronicle::ETL::Model::Base") unless @record.is_a?(Chronicle::ETL::Models::Base)
8
+ end
9
+
4
10
  def serializable_hash
5
11
  @record
6
12
  .identifier_hash
@@ -0,0 +1,10 @@
1
+ module Chronicle
2
+ module ETL
3
+ # Take a Raw model and output `raw_data` as a hash
4
+ class RawSerializer < Chronicle::ETL::Serializer
5
+ def serializable_hash
6
+ @record.to_h
7
+ end
8
+ end
9
+ end
10
+ end
@@ -24,4 +24,5 @@ module Chronicle
24
24
  end
25
25
  end
26
26
 
27
- require_relative 'jsonapi_serializer'
27
+ require_relative 'jsonapi_serializer'
28
+ require_relative 'raw_serializer'
@@ -19,20 +19,14 @@ module Chronicle
19
19
  r.description = 'an image file'
20
20
  end
21
21
 
22
- DEFAULT_OPTIONS = {
23
- timestamp_strategy: 'file_mtime',
24
- id_strategy: 'file_hash',
25
- verb: 'photographed',
26
-
27
- # EXIF tags often don't have timezones
28
- timezone_default: 'Eastern Time (US & Canada)',
29
- include_image_data: true
30
- }.freeze
31
-
32
- def initialize(*args)
33
- super(*args)
34
- @options = @options.reverse_merge(DEFAULT_OPTIONS)
35
- end
22
+ setting :timestamp_strategy, default: 'file_mtime'
23
+ setting :id_strategy, default: 'file_hash'
24
+ setting :verb, default: 'photographed'
25
+ # EXIF tags often don't have timezones
26
+ setting :timezone_default, default: 'Eastern Time (US & Canada)'
27
+ setting :include_image_data, default: true
28
+ setting :actor
29
+ setting :involved
36
30
 
37
31
  def transform
38
32
  # FIXME: set @filename; use block for reading file when necessary
@@ -48,7 +42,7 @@ module Chronicle
48
42
 
49
43
  def id
50
44
  @id ||= begin
51
- id = build_with_strategy(field: :id, strategy: @options[:id_strategy])
45
+ id = build_with_strategy(field: :id, strategy: @config.id_strategy)
52
46
  raise UntransformableRecordError.new("Could not build id", transformation: self) unless id
53
47
 
54
48
  id
@@ -57,7 +51,7 @@ module Chronicle
57
51
 
58
52
  def timestamp
59
53
  @timestamp ||= begin
60
- ts = build_with_strategy(field: :timestamp, strategy: @options[:timestamp_strategy])
54
+ ts = build_with_strategy(field: :timestamp, strategy: @config.timestamp_strategy)
61
55
  raise UntransformableRecordError.new("Could not build timestamp", transformation: self) unless ts
62
56
 
63
57
  ts
@@ -68,8 +62,8 @@ module Chronicle
68
62
 
69
63
  def build_created(file)
70
64
  record = ::Chronicle::ETL::Models::Activity.new
71
- record.verb = @options[:verb]
72
- record.provider = @options[:provider]
65
+ record.verb = @config.verb
66
+ record.provider = @config.provider
73
67
  record.provider_id = id
74
68
  record.end_at = timestamp
75
69
  record.dedupe_on = [[:provider_id, :verb, :provider]]
@@ -84,24 +78,24 @@ module Chronicle
84
78
  def build_actor
85
79
  actor = ::Chronicle::ETL::Models::Entity.new
86
80
  actor.represents = 'identity'
87
- actor.provider = @options[:actor][:provider]
88
- actor.slug = @options[:actor][:slug]
81
+ actor.provider = @config.actor[:provider]
82
+ actor.slug = @config.actor[:slug]
89
83
  actor.dedupe_on = [[:provider, :slug, :represents]]
90
84
  actor
91
85
  end
92
86
 
93
87
  def build_image
94
88
  image = ::Chronicle::ETL::Models::Entity.new
95
- image.represents = @options[:involved][:represents]
89
+ image.represents = @config.involved[:represents]
96
90
  image.title = build_title
97
91
  image.body = exif['Description']
98
- image.provider = @options[:involved][:provider]
92
+ image.provider = @config.involved[:provider]
99
93
  image.provider_id = id
100
94
  image.assign_attributes(build_gps)
101
95
  image.dedupe_on = [[:provider, :provider_id, :represents]]
102
96
 
103
- if @options[:ocr_strategy]
104
- ocr_text = build_with_strategy(field: :ocr, strategy: @options[:ocr_strategy])
97
+ if @config.ocr_strategy
98
+ ocr_text = build_with_strategy(field: :ocr, strategy: @config.ocr_strategy)
105
99
  image.metadata[:ocr_text] = ocr_text if ocr_text
106
100
  end
107
101
 
@@ -111,7 +105,7 @@ module Chronicle
111
105
  image.depicts = build_people_depicted(names)
112
106
  image.abouts = build_keywords(tags)
113
107
 
114
- if @options[:include_image_data]
108
+ if @config.include_image_data
115
109
  attachment = ::Chronicle::ETL::Models::Attachment.new
116
110
  attachment.data = build_image_data
117
111
  image.attachments = [attachment]
@@ -124,7 +118,7 @@ module Chronicle
124
118
  topics.map do |topic|
125
119
  t = ::Chronicle::ETL::Models::Entity.new
126
120
  t.represents = 'topic'
127
- t.provider = @options[:involved][:provider]
121
+ t.provider = @config.involved[:provider]
128
122
  t.title = topic
129
123
  t.slug = topic.parameterize
130
124
  t.dedupe_on = [[:provider, :represents, :slug]]
@@ -136,7 +130,7 @@ module Chronicle
136
130
  names.map do |name|
137
131
  identity = ::Chronicle::ETL::Models::Entity.new
138
132
  identity.represents = 'identity'
139
- identity.provider = @options[:involved][:provider]
133
+ identity.provider = @config.involved[:provider]
140
134
  identity.slug = name.parameterize
141
135
  identity.title = name
142
136
  identity.dedupe_on = [[:provider, :represents, :slug]]
@@ -199,7 +193,7 @@ module Chronicle
199
193
  elsif false
200
194
  # TODO: support option of using GPS coordinates to determine timezone
201
195
  else
202
- zone = ActiveSupport::TimeZone.new(@options[:timezone_default])
196
+ zone = ActiveSupport::TimeZone.new(@config.timezone_default)
203
197
  timestamp = zone.parse(timestamp.asctime)
204
198
  end
205
199
 
@@ -7,7 +7,7 @@ module Chronicle
7
7
  end
8
8
 
9
9
  def transform
10
- Chronicle::ETL::Models::Generic.new(@extraction.data)
10
+ Chronicle::ETL::Models::Raw.new(@extraction.data)
11
11
  end
12
12
 
13
13
  def timestamp; end
@@ -3,14 +3,15 @@ module Chronicle
3
3
  # Abstract class representing an Transformer for an ETL job
4
4
  class Transformer
5
5
  extend Chronicle::ETL::Registry::SelfRegistering
6
+ include Chronicle::ETL::Configurable
6
7
 
7
8
  # Construct a new instance of this transformer. Options are passed in from a Runner
8
9
  # == Parameters:
9
10
  # options::
10
11
  # Options for configuring this Transformer
11
- def initialize(options = {}, extraction)
12
- @options = options
12
+ def initialize(extraction, options = {})
13
13
  @extraction = extraction
14
+ apply_options(options)
14
15
  end
15
16
 
16
17
  # @abstract Subclass is expected to implement #transform
@@ -1,5 +1,5 @@
1
1
  module Chronicle
2
2
  module ETL
3
- VERSION = "0.3.0"
3
+ VERSION = "0.4.1"
4
4
  end
5
5
  end
data/lib/chronicle/etl.rb CHANGED
@@ -1,24 +1,32 @@
1
1
  require_relative 'etl/registry/registry'
2
2
  require_relative 'etl/config'
3
+ require_relative 'etl/configurable'
3
4
  require_relative 'etl/exceptions'
4
5
  require_relative 'etl/extraction'
5
- require_relative 'etl/extractors/extractor'
6
6
  require_relative 'etl/job_definition'
7
7
  require_relative 'etl/job_log'
8
8
  require_relative 'etl/job_logger'
9
9
  require_relative 'etl/job'
10
- require_relative 'etl/loaders/loader'
11
10
  require_relative 'etl/logger'
12
11
  require_relative 'etl/models/activity'
13
12
  require_relative 'etl/models/attachment'
14
13
  require_relative 'etl/models/base'
14
+ require_relative 'etl/models/raw'
15
15
  require_relative 'etl/models/entity'
16
- require_relative 'etl/models/generic'
17
16
  require_relative 'etl/runner'
18
17
  require_relative 'etl/serializers/serializer'
19
- require_relative 'etl/transformers/transformer'
20
18
  require_relative 'etl/utils/binary_attachments'
21
19
  require_relative 'etl/utils/hash_utilities'
22
20
  require_relative 'etl/utils/text_recognition'
23
21
  require_relative 'etl/utils/progress_bar'
24
22
  require_relative 'etl/version'
23
+
24
+ require_relative 'etl/extractors/extractor'
25
+ require_relative 'etl/loaders/loader'
26
+ require_relative 'etl/transformers/transformer'
27
+
28
+ begin
29
+ require 'pry'
30
+ rescue LoadError
31
+ # Pry not available
32
+ end