chronicle-etl 0.4.0 → 0.4.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. checksums.yaml +4 -4
  2. data/.github/workflows/ruby.yml +2 -2
  3. data/.rubocop.yml +3 -0
  4. data/README.md +156 -81
  5. data/chronicle-etl.gemspec +3 -0
  6. data/lib/chronicle/etl/cli/cli_base.rb +31 -0
  7. data/lib/chronicle/etl/cli/connectors.rb +4 -11
  8. data/lib/chronicle/etl/cli/jobs.rb +49 -22
  9. data/lib/chronicle/etl/cli/main.rb +32 -1
  10. data/lib/chronicle/etl/cli/plugins.rb +62 -0
  11. data/lib/chronicle/etl/cli/subcommand_base.rb +1 -1
  12. data/lib/chronicle/etl/cli.rb +3 -0
  13. data/lib/chronicle/etl/config.rb +7 -4
  14. data/lib/chronicle/etl/configurable.rb +15 -2
  15. data/lib/chronicle/etl/exceptions.rb +29 -2
  16. data/lib/chronicle/etl/extractors/csv_extractor.rb +24 -17
  17. data/lib/chronicle/etl/extractors/extractor.rb +5 -5
  18. data/lib/chronicle/etl/extractors/file_extractor.rb +33 -13
  19. data/lib/chronicle/etl/extractors/helpers/input_reader.rb +76 -0
  20. data/lib/chronicle/etl/extractors/json_extractor.rb +21 -12
  21. data/lib/chronicle/etl/job.rb +7 -1
  22. data/lib/chronicle/etl/job_definition.rb +32 -6
  23. data/lib/chronicle/etl/loaders/csv_loader.rb +35 -8
  24. data/lib/chronicle/etl/loaders/helpers/encoding_helper.rb +18 -0
  25. data/lib/chronicle/etl/loaders/json_loader.rb +44 -0
  26. data/lib/chronicle/etl/loaders/loader.rb +24 -1
  27. data/lib/chronicle/etl/loaders/table_loader.rb +13 -26
  28. data/lib/chronicle/etl/logger.rb +6 -2
  29. data/lib/chronicle/etl/models/base.rb +3 -0
  30. data/lib/chronicle/etl/models/entity.rb +8 -2
  31. data/lib/chronicle/etl/models/raw.rb +26 -0
  32. data/lib/chronicle/etl/registry/connector_registration.rb +5 -0
  33. data/lib/chronicle/etl/registry/plugin_registry.rb +75 -0
  34. data/lib/chronicle/etl/registry/registry.rb +27 -14
  35. data/lib/chronicle/etl/runner.rb +35 -17
  36. data/lib/chronicle/etl/serializers/jsonapi_serializer.rb +6 -0
  37. data/lib/chronicle/etl/serializers/raw_serializer.rb +10 -0
  38. data/lib/chronicle/etl/serializers/serializer.rb +2 -1
  39. data/lib/chronicle/etl/transformers/null_transformer.rb +1 -1
  40. data/lib/chronicle/etl/version.rb +1 -1
  41. data/lib/chronicle/etl.rb +11 -4
  42. metadata +53 -6
  43. data/lib/chronicle/etl/extractors/helpers/filesystem_reader.rb +0 -104
  44. data/lib/chronicle/etl/loaders/stdout_loader.rb +0 -14
  45. data/lib/chronicle/etl/models/generic.rb +0 -23
@@ -0,0 +1,44 @@
module Chronicle
  module ETL
    # Loader that writes every record as a single line of JSON.
    #
    # Output goes to $stdout unless the `output` setting holds something else,
    # in which case that value is passed to File.open as the path to write to.
    class JSONLoader < Chronicle::ETL::Loader
      register_connector do |r|
        r.description = 'json'
      end

      setting :serializer
      setting :output, default: $stdout

      # Pick the destination: reuse $stdout as-is, or open the configured path
      # for writing (truncating any existing file).
      def start
        @output = writing_to_stdout? ? @config.output : File.open(@config.output, "w")
      end

      # Serialize one record and append it to the output as a JSON line.
      #
      # @param record the record to pass to the serializer
      def load(record)
        serialized = serializer.serialize(record)

        # When dealing with raw data, we can get improperly encoded strings
        # (eg from sqlite database columns). We force conversion to UTF-8
        # before converting into JSON
        encoded = serialized.transform_values do |value|
          value.is_a?(String) ? force_utf8(value) : value
        end
        @output.puts encoded.to_json
      end

      # Close the output once all records are written.
      #
      # Only a file opened by #start is closed. $stdout belongs to the whole
      # process, and closing it would break any later write to it. Nothing is
      # done when #start never ran.
      def finish
        return if @output.nil? || writing_to_stdout?

        @output.close
      end

      private

      # True when the configured destination is the process's standard output.
      def writing_to_stdout?
        @config.output == $stdout
      end

      # Serializer from the `serializer` setting, defaulting to RawSerializer.
      def serializer
        @config.serializer || Chronicle::ETL::RawSerializer
      end
    end
  end
end
@@ -1,11 +1,17 @@
1
+ require_relative 'helpers/encoding_helper'
2
+
1
3
  module Chronicle
2
4
  module ETL
3
5
  # Abstract class representing a Loader for an ETL job
4
6
  class Loader
5
7
  extend Chronicle::ETL::Registry::SelfRegistering
6
8
  include Chronicle::ETL::Configurable
9
+ include Chronicle::ETL::Loaders::Helpers::EncodingHelper
7
10
 
8
11
  setting :output
12
+ setting :fields
13
+ setting :fields_limit, default: nil
14
+ setting :fields_exclude
9
15
 
10
16
  # Construct a new instance of this loader. Options are passed in from a Runner
11
17
  # == Parameters:
@@ -25,11 +31,28 @@ module Chronicle
25
31
 
26
32
  # Called once there are no more records to process
27
33
  def finish; end
34
+
35
+ private
36
+
37
+ def build_headers(records)
38
+ headers =
39
+ if @config.fields && @config.fields.any?
40
+ Set[*@config.fields]
41
+ else
42
+ # use all the keys of the flattened record hash
43
+ Set[*records.map(&:keys).flatten.map(&:to_s).uniq]
44
+ end
45
+
46
+ headers = headers.delete_if { |header| header.end_with?(*@config.fields_exclude) }
47
+ headers = headers.first(@config.fields_limit) if @config.fields_limit
48
+
49
+ headers.to_a.map(&:to_sym)
50
+ end
28
51
  end
29
52
  end
30
53
  end
31
54
 
32
55
  require_relative 'csv_loader'
56
+ require_relative 'json_loader'
33
57
  require_relative 'rest_loader'
34
- require_relative 'stdout_loader'
35
58
  require_relative 'table_loader'
@@ -9,51 +9,38 @@ module Chronicle
9
9
  r.description = 'an ASCII table'
10
10
  end
11
11
 
12
- setting :fields_limit, default: nil
13
- setting :fields_exclude, default: ['lids', 'type']
14
- setting :fields_include, default: []
15
12
  setting :truncate_values_at, default: 40
16
13
  setting :table_renderer, default: :basic
14
+ setting :fields_exclude, default: ['lids', 'type']
15
+ setting :header_row, default: true
17
16
 
18
17
  def load(record)
19
- @records ||= []
20
- @records << record.to_h_flattened
18
+ records << record.to_h_flattened
21
19
  end
22
20
 
23
21
  def finish
24
- return if @records.empty?
22
+ return if records.empty?
25
23
 
26
- headers = build_headers(@records)
27
- rows = build_rows(@records, headers)
24
+ headers = build_headers(records)
25
+ rows = build_rows(records, headers)
28
26
 
29
- @table = TTY::Table.new(header: headers, rows: rows)
27
+ @table = TTY::Table.new(header: (headers if @config.header_row), rows: rows)
30
28
  puts @table.render(
31
29
  @config.table_renderer.to_sym,
32
30
  padding: [0, 2, 0, 0]
33
31
  )
34
32
  end
35
33
 
36
- private
37
-
38
- def build_headers(records)
39
- headers =
40
- if @config.fields_include.any?
41
- Set[*@config.fields_include]
42
- else
43
- # use all the keys of the flattened record hash
44
- Set[*records.map(&:keys).flatten.map(&:to_s).uniq]
45
- end
46
-
47
- headers = headers.delete_if { |header| header.end_with?(*@config.fields_exclude) } if @config.fields_exclude.any?
48
- headers = headers.first(@config.fields_limit) if @config.fields_limit
49
-
50
- headers.to_a.map(&:to_sym)
34
+ def records
35
+ @records ||= []
51
36
  end
52
37
 
38
+ private
39
+
53
40
  def build_rows(records, headers)
54
41
  records.map do |record|
55
- values = record.values_at(*headers).map{|value| value.to_s }
56
-
42
+ values = record.transform_keys(&:to_sym).values_at(*headers).map{|value| value.to_s }
43
+ values = values.map { |value| force_utf8(value) }
57
44
  if @config.truncate_values_at
58
45
  values = values.map{ |value| value.truncate(@config.truncate_values_at) }
59
46
  end
@@ -8,11 +8,11 @@ module Chronicle
8
8
  WARN = 2
9
9
  ERROR = 3
10
10
  FATAL = 4
11
+ SILENT = 5
11
12
 
12
13
  attr_accessor :log_level
13
14
 
14
15
  @log_level = INFO
15
- @destination = $stderr
16
16
 
17
17
  def output message, level
18
18
  return unless level >= @log_level
@@ -20,10 +20,14 @@ module Chronicle
20
20
  if @progress_bar
21
21
  @progress_bar.log(message)
22
22
  else
23
- @destination.puts(message)
23
+ $stderr.puts(message)
24
24
  end
25
25
  end
26
26
 
27
+ def fatal(message)
28
+ output(message, FATAL)
29
+ end
30
+
27
31
  def error(message)
28
32
  output(message, ERROR)
29
33
  end
@@ -5,6 +5,9 @@ module Chronicle
5
5
  module Models
6
6
  # Represents a record that's been transformed by a Transformer and
7
7
  # ready to be loaded. Loosely based on ActiveModel.
8
+ #
9
+ # @todo Experiment with just mixing in ActiveModel instead of this
10
+ # reimplementation
8
11
  class Base
9
12
  ATTRIBUTES = [:provider, :provider_id, :lat, :lng, :metadata].freeze
10
13
  ASSOCIATIONS = [].freeze
@@ -5,13 +5,19 @@ module Chronicle
5
5
  module Models
6
6
  class Entity < Chronicle::ETL::Models::Base
7
7
  TYPE = 'entities'.freeze
8
- ATTRIBUTES = [:title, :body, :represents, :slug, :myself, :metadata].freeze
8
+ ATTRIBUTES = [:title, :body, :provider_url, :represents, :slug, :myself, :metadata].freeze
9
+
10
+ # TODO: This desperately needs a validation system
9
11
  ASSOCIATIONS = [
12
+ :involvements, # inverse of activity's `involved`
13
+
10
14
  :attachments,
11
15
  :abouts,
16
+ :aboutables, # inverse of above
12
17
  :depicts,
13
18
  :consumers,
14
- :contains
19
+ :contains,
20
+ :containers # inverse of above
15
21
  ].freeze # TODO: add these to reflect Chronicle Schema
16
22
 
17
23
  attr_accessor(*ATTRIBUTES, *ASSOCIATIONS)
@@ -0,0 +1,26 @@
1
+ require 'chronicle/etl/models/base'
2
+
module Chronicle
  module ETL
    module Models
      # A record from an extraction with no processing or normalization applied.
      # It simply wraps whatever data it was constructed with.
      class Raw
        # Frozen so the shared constant cannot be mutated in place.
        TYPE = 'raw'.freeze

        attr_accessor :raw_data

        # @param raw_data the extracted data to wrap; it must respond to #to_h
        #   for the hash conversions below to work
        def initialize(raw_data)
          @raw_data = raw_data
        end

        # @return [Hash] the wrapped data converted with its own #to_h
        def to_h
          @raw_data.to_h
        end

        # @return the result of HashUtilities.flatten_hash applied to #to_h
        def to_h_flattened
          Chronicle::ETL::Utils::HashUtilities.flatten_hash(to_h)
        end
      end
    end
  end
end
@@ -44,6 +44,11 @@ module Chronicle
44
44
  @provider || (built_in? ? 'chronicle' : '')
45
45
  end
46
46
 
47
+ # TODO: allow overriding here. Maybe through self-registration process
48
+ def plugin
49
+ @provider
50
+ end
51
+
47
52
  def descriptive_phrase
48
53
  prefix = case phase
49
54
  when :extractor
@@ -0,0 +1,75 @@
1
+ require 'rubygems'
2
+ require 'rubygems/command'
3
+ require 'rubygems/commands/install_command'
4
+ require 'rubygems/uninstaller'
5
+
module Chronicle
  module ETL
    module Registry
      # Responsible for managing plugins available to chronicle-etl
      #
      # Plugins are gems named `chronicle-<name>`; they are installed and
      # removed through the RubyGems API and loaded with `require`.
      #
      # @todo Better validation for whether a gem is actually a plugin
      # @todo Add ways to load a plugin that don't require a gem on rubygems.org
      module PluginRegistry
        # Does this plugin exist?
        #
        # @param name [String] currently ignored
        # @return [Boolean] always true until a real lookup is implemented
        def self.exists?(name)
          # TODO: implement this. Could query rubygems.org or have a
          # hardcoded approved list
          true
        end

        # All versions of all plugins currently installed
        #
        # @return [Array<Gem::Specification>] every installed gem whose name
        #   starts with `chronicle-`, except chronicle-etl itself
        def self.all_installed
          # TODO: add check for chronicle-etl dependency
          Gem::Specification.select do |spec|
            spec.name.start_with?('chronicle-') && spec.name != "chronicle-etl"
          end
        end

        # Latest version of each installed plugin
        #
        # @return [Array<Gem::Specification>] one spec per plugin name
        def self.all_installed_latest
          all_installed
            .group_by(&:name)
            .values
            .map { |versions| versions.max_by(&:version) }
        end

        # Activate a plugin with given name by `require`ing it
        #
        # @param name [String] plugin name without the `chronicle-` prefix
        # @raise [Chronicle::ETL::PluginConflictError] on a gem version conflict
        # @raise [Chronicle::ETL::PluginLoadError] when the plugin exists but
        #   cannot be required
        # @raise [Chronicle::ETL::PluginNotAvailableError] when the plugin does
        #   not exist
        def self.activate(name)
          # By default, activates the latest available version of a gem
          # so don't have to run Kernel#gem separately
          require "chronicle/#{name}"
        rescue Gem::ConflictError => e
          # TODO: figure out if there's more we can do here
          raise Chronicle::ETL::PluginConflictError.new(name), "Plugin '#{name}' couldn't be loaded. #{e.message}"
        rescue LoadError
          raise Chronicle::ETL::PluginLoadError.new(name), "Plugin '#{name}' couldn't be loaded" if exists?(name)

          raise Chronicle::ETL::PluginNotAvailableError.new(name), "Plugin #{name} doesn't exist"
        end

        # Install a plugin to local gems, then activate it
        #
        # @param name [String] plugin name without the `chronicle-` prefix
        # @raise [Chronicle::ETL::PluginNotAvailableError] when the plugin does
        #   not exist or its dependencies cannot be satisfied
        def self.install(name)
          gem_name = "chronicle-#{name}"
          raise(Chronicle::ETL::PluginNotAvailableError.new(gem_name), "Plugin #{name} doesn't exist") unless exists?(gem_name)

          # Replace the RubyGems UI with a silent one for the install
          Gem::DefaultUserInteraction.ui = Gem::SilentUI.new
          Gem.install(gem_name)

          activate(name)
        rescue Gem::UnsatisfiableDependencyError
          # TODO: we need to catch a lot more than this here
          raise Chronicle::ETL::PluginNotAvailableError.new(name), "Plugin #{name} could not be installed."
        end

        # Uninstall a plugin
        #
        # @param name [String] plugin name without the `chronicle-` prefix
        # @raise [Chronicle::ETL::PluginError] when RubyGems reports an
        #   install error while removing the gem
        def self.uninstall(name)
          gem_name = "chronicle-#{name}"
          Gem::DefaultUserInteraction.ui = Gem::SilentUI.new
          Gem::Uninstaller.new(gem_name).uninstall
        rescue Gem::InstallError
          # TODO: strengthen this exception handling
          raise(Chronicle::ETL::PluginError.new(name), "Plugin #{name} wasn't uninstalled")
        end
      end
    end
  end
end
@@ -20,28 +20,40 @@ module Chronicle
20
20
  end
21
21
  end
22
22
 
23
- def install_connector name
24
- gem_name = "chronicle-#{name}"
25
- Gem.install(gem_name)
23
+ def register connector
24
+ connectors << connector
26
25
  end
27
26
 
28
- def register connector
27
+ def connectors
29
28
  @connectors ||= []
30
- @connectors << connector
31
29
  end
32
30
 
33
31
  def find_by_phase_and_identifier(phase, identifier)
34
- connector = find_within_loaded_connectors(phase, identifier)
35
- unless connector
36
- # Only load external connectors (slow) if not found in built-in connectors
37
- load_all!
38
- connector = find_within_loaded_connectors(phase, identifier)
32
+ # Simple case: built in connector
33
+ connector = connectors.find { |c| c.phase == phase && c.identifier == identifier }
34
+ return connector if connector
35
+
36
+ # if not available in built-in connectors, try to activate a
37
+ # relevant plugin and try again
38
+ if identifier.include?(":")
39
+ plugin, name = identifier.split(":")
40
+ else
41
+ # This case handles the case where the identifier is a
42
+ # shorthand (ie `imessage`) because there's only one default
43
+ # connector.
44
+ plugin = identifier
39
45
  end
40
- connector || raise(ConnectorNotAvailableError.new("Connector '#{identifier}' not found"))
41
- end
42
46
 
43
- def find_within_loaded_connectors(phase, identifier)
44
- @connectors.find { |c| c.phase == phase && c.identifier == identifier }
47
+ PluginRegistry.activate(plugin)
48
+
49
+ candidates = connectors.select { |c| c.phase == phase && c.plugin == plugin }
50
+ # if no name given, just use first connector with right phase/plugin
51
+ # TODO: set up a property for connectors to specify that they're the
52
+ # default connector for the plugin
53
+ candidates = candidates.select { |c| c.identifier == name } if name
54
+ connector = candidates.first
55
+
56
+ connector || raise(ConnectorNotAvailableError, "Connector '#{identifier}' not found")
45
57
  end
46
58
  end
47
59
  end
@@ -50,3 +62,4 @@ end
50
62
 
51
63
  require_relative 'self_registering'
52
64
  require_relative 'connector_registration'
65
+ require_relative 'plugin_registry'
@@ -8,19 +8,41 @@ class Chronicle::ETL::Runner
8
8
  end
9
9
 
10
10
  def run!
11
- extractor = @job.instantiate_extractor
12
- loader = @job.instantiate_loader
11
+ validate_job
12
+ instantiate_connectors
13
+ prepare_job
14
+ prepare_ui
15
+ run_extraction
16
+ finish_job
17
+ end
18
+
19
+ private
20
+
21
+ def validate_job
22
+ @job.job_definition.validate!
23
+ end
13
24
 
25
+ def instantiate_connectors
26
+ @extractor = @job.instantiate_extractor
27
+ @loader = @job.instantiate_loader
28
+ end
29
+
30
+ def prepare_job
31
+ Chronicle::ETL::Logger.info(tty_log_job_start)
14
32
  @job_logger.start
15
- loader.start
33
+ @loader.start
34
+ @extractor.prepare
35
+ end
16
36
 
17
- extractor.prepare
18
- total = extractor.results_count
37
+ def prepare_ui
38
+ total = @extractor.results_count
19
39
  @progress_bar = Chronicle::ETL::Utils::ProgressBar.new(title: 'Running job', total: total)
20
40
  Chronicle::ETL::Logger.attach_to_progress_bar(@progress_bar)
41
+ end
21
42
 
22
- Chronicle::ETL::Logger.info(tty_log_job_start)
23
- extractor.extract do |extraction|
43
+ # TODO: refactor this further
44
+ def run_extraction
45
+ @extractor.extract do |extraction|
24
46
  unless extraction.is_a?(Chronicle::ETL::Extraction)
25
47
  raise Chronicle::ETL::RunnerTypeError, "Extracted should be a Chronicle::ETL::Extraction"
26
48
  end
@@ -28,14 +50,10 @@ class Chronicle::ETL::Runner
28
50
  transformer = @job.instantiate_transformer(extraction)
29
51
  record = transformer.transform
30
52
 
31
- unless record.is_a?(Chronicle::ETL::Models::Base)
32
- raise Chronicle::ETL::RunnerTypeError, "Transformed data should be a type of Chronicle::ETL::Models"
33
- end
34
-
35
53
  Chronicle::ETL::Logger.info(tty_log_transformation(transformer))
36
54
  @job_logger.log_transformation(transformer)
37
55
 
38
- loader.load(record) unless @job.dry_run?
56
+ @loader.load(record) unless @job.dry_run?
39
57
  rescue Chronicle::ETL::TransformationError => e
40
58
  Chronicle::ETL::Logger.error(tty_log_transformation_failure(e))
41
59
  ensure
@@ -43,22 +61,22 @@ class Chronicle::ETL::Runner
43
61
  end
44
62
 
45
63
  @progress_bar.finish
46
- loader.finish
64
+ @loader.finish
47
65
  @job_logger.finish
48
66
  rescue Interrupt
49
67
  Chronicle::ETL::Logger.error("\n#{'Job interrupted'.red}")
50
68
  @job_logger.error
51
69
  rescue StandardError => e
52
70
  raise e
53
- ensure
71
+ end
72
+
73
+ def finish_job
54
74
  @job_logger.save
55
- @progress_bar.finish
75
+ @progress_bar&.finish
56
76
  Chronicle::ETL::Logger.detach_from_progress_bar
57
77
  Chronicle::ETL::Logger.info(tty_log_completion)
58
78
  end
59
79
 
60
- private
61
-
62
80
  def tty_log_job_start
63
81
  output = "Beginning job "
64
82
  output += "'#{@job.name}'".bold if @job.name
@@ -1,6 +1,12 @@
1
1
  module Chronicle
2
2
  module ETL
3
3
  class JSONAPISerializer < Chronicle::ETL::Serializer
# Build the serializer, then reject records this serializer cannot handle.
#
# All arguments are forwarded unchanged to the parent initializer, which is
# presumably what assigns @record (its definition is not shown here).
#
# @raise [SerializationError] when @record is not a
#   Chronicle::ETL::Models::Base
def initialize(*args)
  super

  # The message names the same class the check uses (Models, not Model).
  raise(SerializationError, "Record must be a subclass of Chronicle::ETL::Models::Base") unless @record.is_a?(Chronicle::ETL::Models::Base)
end
9
+
4
10
  def serializable_hash
5
11
  @record
6
12
  .identifier_hash
@@ -0,0 +1,10 @@
module Chronicle
  module ETL
    # Serializer that applies no formatting of its own: the serialized form of
    # a record is just the hash the record produces for itself.
    class RawSerializer < Chronicle::ETL::Serializer
      # @return the result of calling #to_h on the wrapped record
      def serializable_hash
        record_as_hash = @record.to_h
        record_as_hash
      end
    end
  end
end
@@ -24,4 +24,5 @@ module Chronicle
24
24
  end
25
25
  end
26
26
 
27
- require_relative 'jsonapi_serializer'
27
+ require_relative 'jsonapi_serializer'
28
+ require_relative 'raw_serializer'
@@ -7,7 +7,7 @@ module Chronicle
7
7
  end
8
8
 
9
9
  def transform
10
- Chronicle::ETL::Models::Generic.new(@extraction.data)
10
+ Chronicle::ETL::Models::Raw.new(@extraction.data)
11
11
  end
12
12
 
13
13
  def timestamp; end
@@ -1,5 +1,5 @@
1
1
  module Chronicle
2
2
  module ETL
3
- VERSION = "0.4.0"
3
+ VERSION = "0.4.3"
4
4
  end
5
5
  end
data/lib/chronicle/etl.rb CHANGED
@@ -3,23 +3,30 @@ require_relative 'etl/config'
3
3
  require_relative 'etl/configurable'
4
4
  require_relative 'etl/exceptions'
5
5
  require_relative 'etl/extraction'
6
- require_relative 'etl/extractors/extractor'
7
6
  require_relative 'etl/job_definition'
8
7
  require_relative 'etl/job_log'
9
8
  require_relative 'etl/job_logger'
10
9
  require_relative 'etl/job'
11
- require_relative 'etl/loaders/loader'
12
10
  require_relative 'etl/logger'
13
11
  require_relative 'etl/models/activity'
14
12
  require_relative 'etl/models/attachment'
15
13
  require_relative 'etl/models/base'
14
+ require_relative 'etl/models/raw'
16
15
  require_relative 'etl/models/entity'
17
- require_relative 'etl/models/generic'
18
16
  require_relative 'etl/runner'
19
17
  require_relative 'etl/serializers/serializer'
20
- require_relative 'etl/transformers/transformer'
21
18
  require_relative 'etl/utils/binary_attachments'
22
19
  require_relative 'etl/utils/hash_utilities'
23
20
  require_relative 'etl/utils/text_recognition'
24
21
  require_relative 'etl/utils/progress_bar'
25
22
  require_relative 'etl/version'
23
+
24
+ require_relative 'etl/extractors/extractor'
25
+ require_relative 'etl/loaders/loader'
26
+ require_relative 'etl/transformers/transformer'
27
+
28
+ begin
29
+ require 'pry'
30
+ rescue LoadError
31
+ # Pry not available
32
+ end