chronicle-etl 0.3.0 → 0.4.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (46) hide show
  1. checksums.yaml +4 -4
  2. data/.github/workflows/ruby.yml +35 -0
  3. data/.rubocop.yml +28 -1
  4. data/Guardfile +7 -0
  5. data/README.md +149 -85
  6. data/Rakefile +4 -2
  7. data/chronicle-etl.gemspec +10 -5
  8. data/exe/chronicle-etl +1 -1
  9. data/lib/chronicle/etl/cli/connectors.rb +34 -0
  10. data/lib/chronicle/etl/cli/jobs.rb +44 -12
  11. data/lib/chronicle/etl/cli/main.rb +13 -19
  12. data/lib/chronicle/etl/cli/subcommand_base.rb +2 -2
  13. data/lib/chronicle/etl/cli.rb +7 -0
  14. data/lib/chronicle/etl/configurable.rb +158 -0
  15. data/lib/chronicle/etl/exceptions.rb +7 -1
  16. data/lib/chronicle/etl/extractors/csv_extractor.rb +24 -23
  17. data/lib/chronicle/etl/extractors/extractor.rb +23 -19
  18. data/lib/chronicle/etl/extractors/file_extractor.rb +34 -11
  19. data/lib/chronicle/etl/extractors/helpers/input_reader.rb +76 -0
  20. data/lib/chronicle/etl/extractors/json_extractor.rb +19 -18
  21. data/lib/chronicle/etl/job.rb +1 -1
  22. data/lib/chronicle/etl/job_definition.rb +1 -1
  23. data/lib/chronicle/etl/loaders/csv_loader.rb +1 -1
  24. data/lib/chronicle/etl/loaders/json_loader.rb +44 -0
  25. data/lib/chronicle/etl/loaders/loader.rb +5 -2
  26. data/lib/chronicle/etl/loaders/rest_loader.rb +5 -5
  27. data/lib/chronicle/etl/loaders/table_loader.rb +21 -24
  28. data/lib/chronicle/etl/logger.rb +1 -0
  29. data/lib/chronicle/etl/models/base.rb +3 -0
  30. data/lib/chronicle/etl/models/entity.rb +8 -2
  31. data/lib/chronicle/etl/models/raw.rb +26 -0
  32. data/lib/chronicle/etl/registry/connector_registration.rb +1 -0
  33. data/lib/chronicle/etl/runner.rb +6 -4
  34. data/lib/chronicle/etl/serializers/jsonapi_serializer.rb +6 -0
  35. data/lib/chronicle/etl/serializers/raw_serializer.rb +10 -0
  36. data/lib/chronicle/etl/serializers/serializer.rb +2 -1
  37. data/lib/chronicle/etl/transformers/image_file_transformer.rb +22 -28
  38. data/lib/chronicle/etl/transformers/null_transformer.rb +1 -1
  39. data/lib/chronicle/etl/transformers/transformer.rb +3 -2
  40. data/lib/chronicle/etl/version.rb +1 -1
  41. data/lib/chronicle/etl.rb +12 -4
  42. metadata +80 -19
  43. data/.ruby-version +0 -1
  44. data/lib/chronicle/etl/extractors/helpers/filesystem_reader.rb +0 -104
  45. data/lib/chronicle/etl/loaders/stdout_loader.rb +0 -14
  46. data/lib/chronicle/etl/models/generic.rb +0 -23
@@ -0,0 +1,44 @@
1
+ module Chronicle
2
+ module ETL
3
+ class JSONLoader < Chronicle::ETL::Loader
4
+ register_connector do |r|
5
+ r.description = 'json'
6
+ end
7
+
8
+ setting :serializer
9
+ setting :output, default: $stdout
10
+
11
+ def start
12
+ if @config.output == $stdout
13
+ @output = @config.output
14
+ else
15
+ @output = File.open(@config.output, "w")
16
+ end
17
+ end
18
+
19
+ def load(record)
20
+ serialized = serializer.serialize(record)
21
+
22
+ # When dealing with raw data, we can get improperly encoded strings
23
+ # (e.g. from SQLite database columns). We force conversion to UTF-8
24
+ # before converting into JSON
25
+ encoded = serialized.transform_values do |value|
26
+ next value unless value.is_a?(String)
27
+
28
+ value.encode('UTF-8', invalid: :replace, undef: :replace, replace: '?')
29
+ end
30
+ @output.puts encoded.to_json
31
+ end
32
+
33
+ def finish
34
+ @output.close
35
+ end
36
+
37
+ private
38
+
39
+ def serializer
40
+ @config.serializer || Chronicle::ETL::RawSerializer
41
+ end
42
+ end
43
+ end
44
+ end
@@ -3,13 +3,16 @@ module Chronicle
3
3
  # Abstract class representing a Loader for an ETL job
4
4
  class Loader
5
5
  extend Chronicle::ETL::Registry::SelfRegistering
6
+ include Chronicle::ETL::Configurable
7
+
8
+ setting :output
6
9
 
7
10
  # Construct a new instance of this loader. Options are passed in from a Runner
8
11
  # == Parameters:
9
12
  # options::
10
13
  # Options for configuring this Loader
11
14
  def initialize(options = {})
12
- @options = options
15
+ apply_options(options)
13
16
  end
14
17
 
15
18
  # Called once before processing records
@@ -27,6 +30,6 @@ module Chronicle
27
30
  end
28
31
 
29
32
  require_relative 'csv_loader'
33
+ require_relative 'json_loader'
30
34
  require_relative 'rest_loader'
31
- require_relative 'stdout_loader'
32
35
  require_relative 'table_loader'
@@ -9,19 +9,19 @@ module Chronicle
9
9
  r.description = 'a REST endpoint'
10
10
  end
11
11
 
12
- def initialize( options={} )
13
- super(options)
14
- end
12
+ setting :hostname, required: true
13
+ setting :endpoint, required: true
14
+ setting :access_token
15
15
 
16
16
  def load(record)
17
17
  payload = Chronicle::ETL::JSONAPISerializer.serialize(record)
18
18
  # have the outer data key that json-api expects
19
19
  payload = { data: payload } unless payload[:data]
20
20
 
21
- uri = URI.parse("#{@options[:hostname]}#{@options[:endpoint]}")
21
+ uri = URI.parse("#{@config.hostname}#{@config.endpoint}")
22
22
 
23
23
  header = {
24
- "Authorization" => "Bearer #{@options[:access_token]}",
24
+ "Authorization" => "Bearer #{@config.access_token}",
25
25
  "Content-Type": 'application/json'
26
26
  }
27
27
  use_ssl = uri.scheme == 'https'
@@ -9,59 +9,56 @@ module Chronicle
9
9
  r.description = 'an ASCII table'
10
10
  end
11
11
 
12
- DEFAULT_OPTIONS = {
13
- fields_limit: nil,
14
- fields_exclude: ['lids', 'type'],
15
- fields_include: [],
16
- truncate_values_at: nil,
17
- table_renderer: :basic
18
- }.freeze
19
-
20
- def initialize(options={})
21
- @options = options.reverse_merge(DEFAULT_OPTIONS)
22
- @records = []
23
- end
12
+ setting :fields_limit, default: nil
13
+ setting :fields_exclude, default: ['lids', 'type']
14
+ setting :fields, default: []
15
+ setting :truncate_values_at, default: 40
16
+ setting :table_renderer, default: :basic
24
17
 
25
18
  def load(record)
26
- @records << record.to_h_flattened
19
+ records << record.to_h_flattened
27
20
  end
28
21
 
29
22
  def finish
30
- return if @records.empty?
23
+ return if records.empty?
31
24
 
32
- headers = build_headers(@records)
33
- rows = build_rows(@records, headers)
25
+ headers = build_headers(records)
26
+ rows = build_rows(records, headers)
34
27
 
35
28
  @table = TTY::Table.new(header: headers, rows: rows)
36
29
  puts @table.render(
37
- @options[:table_renderer].to_sym,
30
+ @config.table_renderer.to_sym,
38
31
  padding: [0, 2, 0, 0]
39
32
  )
40
33
  end
41
34
 
35
+ def records
36
+ @records ||= []
37
+ end
38
+
42
39
  private
43
40
 
44
41
  def build_headers(records)
45
42
  headers =
46
- if @options[:fields_include].any?
47
- Set[*@options[:fields_include]]
43
+ if @config.fields.any?
44
+ Set[*@config.fields]
48
45
  else
49
46
  # use all the keys of the flattened record hash
50
47
  Set[*records.map(&:keys).flatten.map(&:to_s).uniq]
51
48
  end
52
49
 
53
- headers = headers.delete_if { |header| header.end_with?(*@options[:fields_exclude]) } if @options[:fields_exclude].any?
54
- headers = headers.first(@options[:fields_limit]) if @options[:fields_limit]
50
+ headers = headers.delete_if { |header| header.end_with?(*@config.fields_exclude) } if @config.fields_exclude.any?
51
+ headers = headers.first(@config.fields_limit) if @config.fields_limit
55
52
 
56
53
  headers.to_a.map(&:to_sym)
57
54
  end
58
55
 
59
56
  def build_rows(records, headers)
60
57
  records.map do |record|
61
- values = record.values_at(*headers).map{|value| value.to_s }
58
+ values = record.transform_keys(&:to_sym).values_at(*headers).map{|value| value.to_s }
62
59
 
63
- if @options[:truncate_values_at]
64
- values = values.map{ |value| value.truncate(@options[:truncate_values_at]) }
60
+ if @config.truncate_values_at
61
+ values = values.map{ |value| value.truncate(@config.truncate_values_at) }
65
62
  end
66
63
 
67
64
  values
@@ -8,6 +8,7 @@ module Chronicle
8
8
  WARN = 2
9
9
  ERROR = 3
10
10
  FATAL = 4
11
+ SILENT = 5
11
12
 
12
13
  attr_accessor :log_level
13
14
 
@@ -5,6 +5,9 @@ module Chronicle
5
5
  module Models
6
6
  # Represents a record that's been transformed by a Transformer and
7
7
  # ready to be loaded. Loosely based on ActiveModel.
8
+ #
9
+ # @todo Experiment with just mixing in ActiveModel instead of this
10
+ # reimplementation
8
11
  class Base
9
12
  ATTRIBUTES = [:provider, :provider_id, :lat, :lng, :metadata].freeze
10
13
  ASSOCIATIONS = [].freeze
@@ -5,13 +5,19 @@ module Chronicle
5
5
  module Models
6
6
  class Entity < Chronicle::ETL::Models::Base
7
7
  TYPE = 'entities'.freeze
8
- ATTRIBUTES = [:title, :body, :represents, :slug, :myself, :metadata].freeze
8
+ ATTRIBUTES = [:title, :body, :provider_url, :represents, :slug, :myself, :metadata].freeze
9
+
10
+ # TODO: This desperately needs a validation system
9
11
  ASSOCIATIONS = [
12
+ :involvements, # inverse of activity's `involved`
13
+
10
14
  :attachments,
11
15
  :abouts,
16
+ :aboutables, # inverse of above
12
17
  :depicts,
13
18
  :consumers,
14
- :contains
19
+ :contains,
20
+ :containers # inverse of above
15
21
  ].freeze # TODO: add these to reflect Chronicle Schema
16
22
 
17
23
  attr_accessor(*ATTRIBUTES, *ASSOCIATIONS)
@@ -0,0 +1,26 @@
1
+ require 'chronicle/etl/models/base'
2
+
3
+ module Chronicle
4
+ module ETL
5
+ module Models
6
+ # A record from an extraction with no processing or normalization applied
7
+ class Raw
8
+ TYPE = 'raw'
9
+
10
+ attr_accessor :raw_data
11
+
12
+ def initialize(raw_data)
13
+ @raw_data = raw_data
14
+ end
15
+
16
+ def to_h
17
+ @raw_data.to_h
18
+ end
19
+
20
+ def to_h_flattened
21
+ Chronicle::ETL::Utils::HashUtilities.flatten_hash(to_h)
22
+ end
23
+ end
24
+ end
25
+ end
26
+ end
@@ -3,6 +3,7 @@ module Chronicle
3
3
  module Registry
4
4
  # Records details about a connector such as its provider and a description
5
5
  class ConnectorRegistration
6
+ # FIXME: refactor custom accessor methods later in file
6
7
  attr_accessor :identifier, :provider, :klass, :description
7
8
 
8
9
  def initialize(klass)
@@ -14,6 +14,7 @@ class Chronicle::ETL::Runner
14
14
  @job_logger.start
15
15
  loader.start
16
16
 
17
+ extractor.prepare
17
18
  total = extractor.results_count
18
19
  @progress_bar = Chronicle::ETL::Utils::ProgressBar.new(title: 'Running job', total: total)
19
20
  Chronicle::ETL::Logger.attach_to_progress_bar(@progress_bar)
@@ -27,9 +28,10 @@ class Chronicle::ETL::Runner
27
28
  transformer = @job.instantiate_transformer(extraction)
28
29
  record = transformer.transform
29
30
 
30
- unless record.is_a?(Chronicle::ETL::Models::Base)
31
- raise Chronicle::ETL::RunnerTypeError, "Transformed data should be a type of Chronicle::ETL::Models"
32
- end
31
+ # TODO: rethink this
32
+ # unless record.is_a?(Chronicle::ETL::Models)
33
+ # raise Chronicle::ETL::RunnerTypeError, "Transformed data should be a type of Chronicle::ETL::Models"
34
+ # end
33
35
 
34
36
  Chronicle::ETL::Logger.info(tty_log_transformation(transformer))
35
37
  @job_logger.log_transformation(transformer)
@@ -51,7 +53,7 @@ class Chronicle::ETL::Runner
51
53
  raise e
52
54
  ensure
53
55
  @job_logger.save
54
- @progress_bar.finish
56
+ @progress_bar&.finish
55
57
  Chronicle::ETL::Logger.detach_from_progress_bar
56
58
  Chronicle::ETL::Logger.info(tty_log_completion)
57
59
  end
@@ -1,6 +1,12 @@
1
1
  module Chronicle
2
2
  module ETL
3
3
  class JSONAPISerializer < Chronicle::ETL::Serializer
4
+ def initialize(*args)
5
+ super
6
+
7
+ raise(SerializationError, "Record must be a subclass of Chronicle::ETL::Model::Base") unless @record.is_a?(Chronicle::ETL::Models::Base)
8
+ end
9
+
4
10
  def serializable_hash
5
11
  @record
6
12
  .identifier_hash
@@ -0,0 +1,10 @@
1
+ module Chronicle
2
+ module ETL
3
+ # Take a Raw model and output `raw_data` as a hash
4
+ class RawSerializer < Chronicle::ETL::Serializer
5
+ def serializable_hash
6
+ @record.to_h
7
+ end
8
+ end
9
+ end
10
+ end
@@ -24,4 +24,5 @@ module Chronicle
24
24
  end
25
25
  end
26
26
 
27
- require_relative 'jsonapi_serializer'
27
+ require_relative 'jsonapi_serializer'
28
+ require_relative 'raw_serializer'
@@ -19,20 +19,14 @@ module Chronicle
19
19
  r.description = 'an image file'
20
20
  end
21
21
 
22
- DEFAULT_OPTIONS = {
23
- timestamp_strategy: 'file_mtime',
24
- id_strategy: 'file_hash',
25
- verb: 'photographed',
26
-
27
- # EXIF tags often don't have timezones
28
- timezone_default: 'Eastern Time (US & Canada)',
29
- include_image_data: true
30
- }.freeze
31
-
32
- def initialize(*args)
33
- super(*args)
34
- @options = @options.reverse_merge(DEFAULT_OPTIONS)
35
- end
22
+ setting :timestamp_strategy, default: 'file_mtime'
23
+ setting :id_strategy, default: 'file_hash'
24
+ setting :verb, default: 'photographed'
25
+ # EXIF tags often don't have timezones
26
+ setting :timezone_default, default: 'Eastern Time (US & Canada)'
27
+ setting :include_image_data, default: true
28
+ setting :actor
29
+ setting :involved
36
30
 
37
31
  def transform
38
32
  # FIXME: set @filename; use block for reading file when necessary
@@ -48,7 +42,7 @@ module Chronicle
48
42
 
49
43
  def id
50
44
  @id ||= begin
51
- id = build_with_strategy(field: :id, strategy: @options[:id_strategy])
45
+ id = build_with_strategy(field: :id, strategy: @config.id_strategy)
52
46
  raise UntransformableRecordError.new("Could not build id", transformation: self) unless id
53
47
 
54
48
  id
@@ -57,7 +51,7 @@ module Chronicle
57
51
 
58
52
  def timestamp
59
53
  @timestamp ||= begin
60
- ts = build_with_strategy(field: :timestamp, strategy: @options[:timestamp_strategy])
54
+ ts = build_with_strategy(field: :timestamp, strategy: @config.timestamp_strategy)
61
55
  raise UntransformableRecordError.new("Could not build timestamp", transformation: self) unless ts
62
56
 
63
57
  ts
@@ -68,8 +62,8 @@ module Chronicle
68
62
 
69
63
  def build_created(file)
70
64
  record = ::Chronicle::ETL::Models::Activity.new
71
- record.verb = @options[:verb]
72
- record.provider = @options[:provider]
65
+ record.verb = @config.verb
66
+ record.provider = @config.provider
73
67
  record.provider_id = id
74
68
  record.end_at = timestamp
75
69
  record.dedupe_on = [[:provider_id, :verb, :provider]]
@@ -84,24 +78,24 @@ module Chronicle
84
78
  def build_actor
85
79
  actor = ::Chronicle::ETL::Models::Entity.new
86
80
  actor.represents = 'identity'
87
- actor.provider = @options[:actor][:provider]
88
- actor.slug = @options[:actor][:slug]
81
+ actor.provider = @config.actor[:provider]
82
+ actor.slug = @config.actor[:slug]
89
83
  actor.dedupe_on = [[:provider, :slug, :represents]]
90
84
  actor
91
85
  end
92
86
 
93
87
  def build_image
94
88
  image = ::Chronicle::ETL::Models::Entity.new
95
- image.represents = @options[:involved][:represents]
89
+ image.represents = @config.involved[:represents]
96
90
  image.title = build_title
97
91
  image.body = exif['Description']
98
- image.provider = @options[:involved][:provider]
92
+ image.provider = @config.involved[:provider]
99
93
  image.provider_id = id
100
94
  image.assign_attributes(build_gps)
101
95
  image.dedupe_on = [[:provider, :provider_id, :represents]]
102
96
 
103
- if @options[:ocr_strategy]
104
- ocr_text = build_with_strategy(field: :ocr, strategy: @options[:ocr_strategy])
97
+ if @config.ocr_strategy
98
+ ocr_text = build_with_strategy(field: :ocr, strategy: @config.ocr_strategy)
105
99
  image.metadata[:ocr_text] = ocr_text if ocr_text
106
100
  end
107
101
 
@@ -111,7 +105,7 @@ module Chronicle
111
105
  image.depicts = build_people_depicted(names)
112
106
  image.abouts = build_keywords(tags)
113
107
 
114
- if @options[:include_image_data]
108
+ if @config.include_image_data
115
109
  attachment = ::Chronicle::ETL::Models::Attachment.new
116
110
  attachment.data = build_image_data
117
111
  image.attachments = [attachment]
@@ -124,7 +118,7 @@ module Chronicle
124
118
  topics.map do |topic|
125
119
  t = ::Chronicle::ETL::Models::Entity.new
126
120
  t.represents = 'topic'
127
- t.provider = @options[:involved][:provider]
121
+ t.provider = @config.involved[:provider]
128
122
  t.title = topic
129
123
  t.slug = topic.parameterize
130
124
  t.dedupe_on = [[:provider, :represents, :slug]]
@@ -136,7 +130,7 @@ module Chronicle
136
130
  names.map do |name|
137
131
  identity = ::Chronicle::ETL::Models::Entity.new
138
132
  identity.represents = 'identity'
139
- identity.provider = @options[:involved][:provider]
133
+ identity.provider = @config.involved[:provider]
140
134
  identity.slug = name.parameterize
141
135
  identity.title = name
142
136
  identity.dedupe_on = [[:provider, :represents, :slug]]
@@ -199,7 +193,7 @@ module Chronicle
199
193
  elsif false
200
194
  # TODO: support option of using GPS coordinates to determine timezone
201
195
  else
202
- zone = ActiveSupport::TimeZone.new(@options[:timezone_default])
196
+ zone = ActiveSupport::TimeZone.new(@config.timezone_default)
203
197
  timestamp = zone.parse(timestamp.asctime)
204
198
  end
205
199
 
@@ -7,7 +7,7 @@ module Chronicle
7
7
  end
8
8
 
9
9
  def transform
10
- Chronicle::ETL::Models::Generic.new(@extraction.data)
10
+ Chronicle::ETL::Models::Raw.new(@extraction.data)
11
11
  end
12
12
 
13
13
  def timestamp; end
@@ -3,14 +3,15 @@ module Chronicle
3
3
  # Abstract class representing an Transformer for an ETL job
4
4
  class Transformer
5
5
  extend Chronicle::ETL::Registry::SelfRegistering
6
+ include Chronicle::ETL::Configurable
6
7
 
7
8
  # Construct a new instance of this transformer. Options are passed in from a Runner
8
9
  # == Parameters:
9
10
  # options::
10
11
  # Options for configuring this Transformer
11
- def initialize(options = {}, extraction)
12
- @options = options
12
+ def initialize(extraction, options = {})
13
13
  @extraction = extraction
14
+ apply_options(options)
14
15
  end
15
16
 
16
17
  # @abstract Subclass is expected to implement #transform
@@ -1,5 +1,5 @@
1
1
  module Chronicle
2
2
  module ETL
3
- VERSION = "0.3.0"
3
+ VERSION = "0.4.1"
4
4
  end
5
5
  end
data/lib/chronicle/etl.rb CHANGED
@@ -1,24 +1,32 @@
1
1
  require_relative 'etl/registry/registry'
2
2
  require_relative 'etl/config'
3
+ require_relative 'etl/configurable'
3
4
  require_relative 'etl/exceptions'
4
5
  require_relative 'etl/extraction'
5
- require_relative 'etl/extractors/extractor'
6
6
  require_relative 'etl/job_definition'
7
7
  require_relative 'etl/job_log'
8
8
  require_relative 'etl/job_logger'
9
9
  require_relative 'etl/job'
10
- require_relative 'etl/loaders/loader'
11
10
  require_relative 'etl/logger'
12
11
  require_relative 'etl/models/activity'
13
12
  require_relative 'etl/models/attachment'
14
13
  require_relative 'etl/models/base'
14
+ require_relative 'etl/models/raw'
15
15
  require_relative 'etl/models/entity'
16
- require_relative 'etl/models/generic'
17
16
  require_relative 'etl/runner'
18
17
  require_relative 'etl/serializers/serializer'
19
- require_relative 'etl/transformers/transformer'
20
18
  require_relative 'etl/utils/binary_attachments'
21
19
  require_relative 'etl/utils/hash_utilities'
22
20
  require_relative 'etl/utils/text_recognition'
23
21
  require_relative 'etl/utils/progress_bar'
24
22
  require_relative 'etl/version'
23
+
24
+ require_relative 'etl/extractors/extractor'
25
+ require_relative 'etl/loaders/loader'
26
+ require_relative 'etl/transformers/transformer'
27
+
28
+ begin
29
+ require 'pry'
30
+ rescue LoadError
31
+ # Pry not available
32
+ end