chronicle-etl 0.4.0 → 0.4.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (45) hide show
  1. checksums.yaml +4 -4
  2. data/.github/workflows/ruby.yml +2 -2
  3. data/.rubocop.yml +3 -0
  4. data/README.md +156 -81
  5. data/chronicle-etl.gemspec +3 -0
  6. data/lib/chronicle/etl/cli/cli_base.rb +31 -0
  7. data/lib/chronicle/etl/cli/connectors.rb +4 -11
  8. data/lib/chronicle/etl/cli/jobs.rb +49 -22
  9. data/lib/chronicle/etl/cli/main.rb +32 -1
  10. data/lib/chronicle/etl/cli/plugins.rb +62 -0
  11. data/lib/chronicle/etl/cli/subcommand_base.rb +1 -1
  12. data/lib/chronicle/etl/cli.rb +3 -0
  13. data/lib/chronicle/etl/config.rb +7 -4
  14. data/lib/chronicle/etl/configurable.rb +15 -2
  15. data/lib/chronicle/etl/exceptions.rb +29 -2
  16. data/lib/chronicle/etl/extractors/csv_extractor.rb +24 -17
  17. data/lib/chronicle/etl/extractors/extractor.rb +5 -5
  18. data/lib/chronicle/etl/extractors/file_extractor.rb +33 -13
  19. data/lib/chronicle/etl/extractors/helpers/input_reader.rb +76 -0
  20. data/lib/chronicle/etl/extractors/json_extractor.rb +21 -12
  21. data/lib/chronicle/etl/job.rb +7 -1
  22. data/lib/chronicle/etl/job_definition.rb +32 -6
  23. data/lib/chronicle/etl/loaders/csv_loader.rb +35 -8
  24. data/lib/chronicle/etl/loaders/helpers/encoding_helper.rb +18 -0
  25. data/lib/chronicle/etl/loaders/json_loader.rb +44 -0
  26. data/lib/chronicle/etl/loaders/loader.rb +24 -1
  27. data/lib/chronicle/etl/loaders/table_loader.rb +13 -26
  28. data/lib/chronicle/etl/logger.rb +6 -2
  29. data/lib/chronicle/etl/models/base.rb +3 -0
  30. data/lib/chronicle/etl/models/entity.rb +8 -2
  31. data/lib/chronicle/etl/models/raw.rb +26 -0
  32. data/lib/chronicle/etl/registry/connector_registration.rb +5 -0
  33. data/lib/chronicle/etl/registry/plugin_registry.rb +75 -0
  34. data/lib/chronicle/etl/registry/registry.rb +27 -14
  35. data/lib/chronicle/etl/runner.rb +35 -17
  36. data/lib/chronicle/etl/serializers/jsonapi_serializer.rb +6 -0
  37. data/lib/chronicle/etl/serializers/raw_serializer.rb +10 -0
  38. data/lib/chronicle/etl/serializers/serializer.rb +2 -1
  39. data/lib/chronicle/etl/transformers/null_transformer.rb +1 -1
  40. data/lib/chronicle/etl/version.rb +1 -1
  41. data/lib/chronicle/etl.rb +11 -4
  42. metadata +53 -6
  43. data/lib/chronicle/etl/extractors/helpers/filesystem_reader.rb +0 -104
  44. data/lib/chronicle/etl/loaders/stdout_loader.rb +0 -14
  45. data/lib/chronicle/etl/models/generic.rb +0 -23
@@ -0,0 +1,44 @@
1
module Chronicle
  module ETL
    # Loader that writes every record as a single line of JSON
    # (newline-delimited JSON), either to $stdout or to a file.
    #
    # Settings:
    # * +serializer+ - serializer class used to turn a record into a hash;
    #   falls back to Chronicle::ETL::RawSerializer when unset
    # * +output+ - $stdout (default) or a path that is opened for writing
    class JSONLoader < Chronicle::ETL::Loader
      register_connector do |r|
        r.description = 'json'
      end

      setting :serializer
      setting :output, default: $stdout

      # Resolve the output stream. A file is only opened when the
      # configured output is something other than $stdout.
      def start
        @output = output_to_stdout? ? @config.output : File.open(@config.output, "w")
      end

      # Serialize +record+ and write it as one line of JSON.
      #
      # NOTE(review): assumes the serializer returns a Hash, since
      # +transform_values+ is called on the result.
      def load(record)
        serialized = serializer.serialize(record)

        # When dealing with raw data, we can get improperly encoded strings
        # (eg from sqlite database columns). We force conversion to UTF-8
        # before converting into JSON
        encoded = serialized.transform_values do |value|
          value.is_a?(String) ? force_utf8(value) : value
        end

        @output.puts encoded.to_json
      end

      # Release the output stream once all records are written.
      #
      # $stdout is shared with the rest of the process, so it is never
      # closed here; only a file handle opened by #start is. Nothing is
      # done if #start never ran.
      def finish
        return if @output.nil? || output_to_stdout?

        @output.close
      end

      private

      # Is this loader configured to write to the process's standard output?
      def output_to_stdout?
        @config.output == $stdout
      end

      # Serializer to use: the configured one, or RawSerializer by default.
      def serializer
        @config.serializer || Chronicle::ETL::RawSerializer
      end
    end
  end
end
@@ -1,11 +1,17 @@
1
+ require_relative 'helpers/encoding_helper'
2
+
1
3
  module Chronicle
2
4
  module ETL
3
5
  # Abstract class representing a Loader for an ETL job
4
6
  class Loader
5
7
  extend Chronicle::ETL::Registry::SelfRegistering
6
8
  include Chronicle::ETL::Configurable
9
+ include Chronicle::ETL::Loaders::Helpers::EncodingHelper
7
10
 
8
11
  setting :output
12
+ setting :fields
13
+ setting :fields_limit, default: nil
14
+ setting :fields_exclude
9
15
 
10
16
  # Construct a new instance of this loader. Options are passed in from a Runner
11
17
  # == Parameters:
@@ -25,11 +31,28 @@ module Chronicle
25
31
 
26
32
  # Called once there are no more records to process
27
33
  def finish; end
34
+
35
+ private
36
+
37
# Decide which columns to emit for a set of flattened record hashes.
#
# Uses the +fields+ setting when it is non-empty; otherwise collects every
# key that appears in +records+ (stringified, first-seen order). Headers
# ending with any +fields_exclude+ suffix are dropped, then the list is
# truncated to +fields_limit+ entries when that setting is present.
#
# @param records [Array<Hash>] flattened record hashes
# @return [Array<Symbol>] header names
def build_headers(records)
  requested = @config.fields
  headers =
    if requested&.any?
      Set.new(requested)
    else
      # use all the keys of the flattened record hash
      Set.new(records.flat_map(&:keys).map(&:to_s))
    end

  # fields_exclude may be unset (nil); Array() normalizes that to "no exclusions"
  excluded = Array(@config.fields_exclude)
  headers = headers.reject { |header| header.end_with?(*excluded) } unless excluded.empty?
  headers = headers.first(@config.fields_limit) if @config.fields_limit

  headers.map(&:to_sym)
end
28
51
  end
29
52
  end
30
53
  end
31
54
 
32
55
  require_relative 'csv_loader'
56
+ require_relative 'json_loader'
33
57
  require_relative 'rest_loader'
34
- require_relative 'stdout_loader'
35
58
  require_relative 'table_loader'
@@ -9,51 +9,38 @@ module Chronicle
9
9
  r.description = 'an ASCII table'
10
10
  end
11
11
 
12
- setting :fields_limit, default: nil
13
- setting :fields_exclude, default: ['lids', 'type']
14
- setting :fields_include, default: []
15
12
  setting :truncate_values_at, default: 40
16
13
  setting :table_renderer, default: :basic
14
+ setting :fields_exclude, default: ['lids', 'type']
15
+ setting :header_row, default: true
17
16
 
18
17
  def load(record)
19
- @records ||= []
20
- @records << record.to_h_flattened
18
+ records << record.to_h_flattened
21
19
  end
22
20
 
23
21
  def finish
24
- return if @records.empty?
22
+ return if records.empty?
25
23
 
26
- headers = build_headers(@records)
27
- rows = build_rows(@records, headers)
24
+ headers = build_headers(records)
25
+ rows = build_rows(records, headers)
28
26
 
29
- @table = TTY::Table.new(header: headers, rows: rows)
27
+ @table = TTY::Table.new(header: (headers if @config.header_row), rows: rows)
30
28
  puts @table.render(
31
29
  @config.table_renderer.to_sym,
32
30
  padding: [0, 2, 0, 0]
33
31
  )
34
32
  end
35
33
 
36
- private
37
-
38
- def build_headers(records)
39
- headers =
40
- if @config.fields_include.any?
41
- Set[*@config.fields_include]
42
- else
43
- # use all the keys of the flattened record hash
44
- Set[*records.map(&:keys).flatten.map(&:to_s).uniq]
45
- end
46
-
47
- headers = headers.delete_if { |header| header.end_with?(*@config.fields_exclude) } if @config.fields_exclude.any?
48
- headers = headers.first(@config.fields_limit) if @config.fields_limit
49
-
50
- headers.to_a.map(&:to_sym)
34
+ def records
35
+ @records ||= []
51
36
  end
52
37
 
38
+ private
39
+
53
40
  def build_rows(records, headers)
54
41
  records.map do |record|
55
- values = record.values_at(*headers).map{|value| value.to_s }
56
-
42
+ values = record.transform_keys(&:to_sym).values_at(*headers).map{|value| value.to_s }
43
+ values = values.map { |value| force_utf8(value) }
57
44
  if @config.truncate_values_at
58
45
  values = values.map{ |value| value.truncate(@config.truncate_values_at) }
59
46
  end
@@ -8,11 +8,11 @@ module Chronicle
8
8
  WARN = 2
9
9
  ERROR = 3
10
10
  FATAL = 4
11
+ SILENT = 5
11
12
 
12
13
  attr_accessor :log_level
13
14
 
14
15
  @log_level = INFO
15
- @destination = $stderr
16
16
 
17
17
  def output message, level
18
18
  return unless level >= @log_level
@@ -20,10 +20,14 @@ module Chronicle
20
20
  if @progress_bar
21
21
  @progress_bar.log(message)
22
22
  else
23
- @destination.puts(message)
23
+ $stderr.puts(message)
24
24
  end
25
25
  end
26
26
 
27
+ def fatal(message)
28
+ output(message, FATAL)
29
+ end
30
+
27
31
  def error(message)
28
32
  output(message, ERROR)
29
33
  end
@@ -5,6 +5,9 @@ module Chronicle
5
5
  module Models
6
6
  # Represents a record that's been transformed by a Transformer and
7
7
  # ready to be loaded. Loosely based on ActiveModel.
8
+ #
9
+ # @todo Experiment with just mixing in ActiveModel instead of this
10
+ # reimplementation
8
11
  class Base
9
12
  ATTRIBUTES = [:provider, :provider_id, :lat, :lng, :metadata].freeze
10
13
  ASSOCIATIONS = [].freeze
@@ -5,13 +5,19 @@ module Chronicle
5
5
  module Models
6
6
  class Entity < Chronicle::ETL::Models::Base
7
7
  TYPE = 'entities'.freeze
8
- ATTRIBUTES = [:title, :body, :represents, :slug, :myself, :metadata].freeze
8
+ ATTRIBUTES = [:title, :body, :provider_url, :represents, :slug, :myself, :metadata].freeze
9
+
10
+ # TODO: This desperately needs a validation system
9
11
  ASSOCIATIONS = [
12
+ :involvements, # inverse of activity's `involved`
13
+
10
14
  :attachments,
11
15
  :abouts,
16
+ :aboutables, # inverse of above
12
17
  :depicts,
13
18
  :consumers,
14
- :contains
19
+ :contains,
20
+ :containers # inverse of above
15
21
  ].freeze # TODO: add these to reflect Chronicle Schema
16
22
 
17
23
  attr_accessor(*ATTRIBUTES, *ASSOCIATIONS)
@@ -0,0 +1,26 @@
1
+ require 'chronicle/etl/models/base'
2
+
3
module Chronicle
  module ETL
    module Models
      # A record from an extraction with no processing or normalization applied
      class Raw
        # Type identifier for this model; frozen like the other models' TYPE
        TYPE = 'raw'.freeze

        attr_accessor :raw_data

        # @param raw_data [#to_h] the untouched data produced by an extractor
        def initialize(raw_data)
          @raw_data = raw_data
        end

        # @return [Hash] the raw data converted with +to_h+
        def to_h
          @raw_data.to_h
        end

        # @return [Hash] result of passing #to_h through
        #   Chronicle::ETL::Utils::HashUtilities.flatten_hash
        def to_h_flattened
          Chronicle::ETL::Utils::HashUtilities.flatten_hash(to_h)
        end
      end
    end
  end
end
@@ -44,6 +44,11 @@ module Chronicle
44
44
  @provider || (built_in? ? 'chronicle' : '')
45
45
  end
46
46
 
47
+ # TODO: allow overriding here. Maybe through self-registration process
48
+ def plugin
49
+ @provider
50
+ end
51
+
47
52
  def descriptive_phrase
48
53
  prefix = case phase
49
54
  when :extractor
@@ -0,0 +1,75 @@
1
+ require 'rubygems'
2
+ require 'rubygems/command'
3
+ require 'rubygems/commands/install_command'
4
+ require 'rubygems/uninstaller'
5
+
6
module Chronicle
  module ETL
    module Registry
      # Responsible for managing plugins available to chronicle-etl
      #
      # A plugin called +name+ is the gem +chronicle-<name>+ and is loaded by
      # requiring +chronicle/<name>+. Every public method takes the plugin
      # name (without the +chronicle-+ prefix).
      #
      # @todo Better validation for whether a gem is actually a plugin
      # @todo Add ways to load a plugin that don't require a gem on rubygems.org
      module PluginRegistry
        # Does this plugin exist?
        #
        # @param name [String] plugin name
        # @return [Boolean] currently always true
        def self.exists?(name)
          # TODO: implement this. Could query rubygems.org or have a
          # hardcoded approved list
          true
        end

        # All versions of all plugins currently installed
        #
        # @return [Array<Gem::Specification>]
        def self.all_installed
          # TODO: add check for chronicle-etl dependency
          Gem::Specification.select do |spec|
            spec.name.start_with?('chronicle-') && spec.name != 'chronicle-etl'
          end
        end

        # Latest version of each installed plugin
        #
        # @return [Array<Gem::Specification>] one specification per plugin gem
        def self.all_installed_latest
          all_installed
            .group_by(&:name)
            .map { |_gem_name, versions| versions.max_by(&:version) }
        end

        # Activate a plugin with given name by `require`ing it
        #
        # @param name [String] plugin name
        # @raise [Chronicle::ETL::PluginConflictError] on a gem version conflict
        # @raise [Chronicle::ETL::PluginLoadError] when the require fails for a plugin that exists
        # @raise [Chronicle::ETL::PluginNotAvailableError] when the plugin doesn't exist
        def self.activate(name)
          # By default, activates the latest available version of a gem
          # so don't have to run Kernel#gem separately
          require "chronicle/#{name}"
        rescue Gem::ConflictError => e
          # TODO: figure out if there's more we can do here
          raise Chronicle::ETL::PluginConflictError.new(name), "Plugin '#{name}' couldn't be loaded. #{e.message}"
        rescue LoadError
          raise Chronicle::ETL::PluginLoadError.new(name), "Plugin '#{name}' couldn't be loaded" if exists?(name)

          raise Chronicle::ETL::PluginNotAvailableError.new(name), "Plugin #{name} doesn't exist"
        end

        # Install a plugin to local gems, then activate it
        #
        # @param name [String] plugin name
        # @raise [Chronicle::ETL::PluginNotAvailableError] when the plugin
        #   doesn't exist or its dependencies can't be satisfied
        def self.install(name)
          # exists? and the error are given the plugin name, matching
          # .activate and the rescue branch below
          raise(Chronicle::ETL::PluginNotAvailableError.new(name), "Plugin #{name} doesn't exist") unless exists?(name)

          gem_name = "chronicle-#{name}"
          # Silence RubyGems' own output (this replaces the process-wide UI)
          Gem::DefaultUserInteraction.ui = Gem::SilentUI.new
          Gem.install(gem_name)

          activate(name)
        rescue Gem::UnsatisfiableDependencyError
          # TODO: we need to catch a lot more than this here
          raise Chronicle::ETL::PluginNotAvailableError.new(name), "Plugin #{name} could not be installed."
        end

        # Uninstall a plugin
        #
        # @param name [String] plugin name
        # @raise [Chronicle::ETL::PluginError] when RubyGems reports an install error
        def self.uninstall(name)
          gem_name = "chronicle-#{name}"
          Gem::DefaultUserInteraction.ui = Gem::SilentUI.new
          Gem::Uninstaller.new(gem_name).uninstall
        rescue Gem::InstallError
          # TODO: strengthen this exception handling
          raise(Chronicle::ETL::PluginError.new(name), "Plugin #{name} wasn't uninstalled")
        end
      end
    end
  end
end
@@ -20,28 +20,40 @@ module Chronicle
20
20
  end
21
21
  end
22
22
 
23
- def install_connector name
24
- gem_name = "chronicle-#{name}"
25
- Gem.install(gem_name)
23
+ def register connector
24
+ connectors << connector
26
25
  end
27
26
 
28
- def register connector
27
+ def connectors
29
28
  @connectors ||= []
30
- @connectors << connector
31
29
  end
32
30
 
33
31
  def find_by_phase_and_identifier(phase, identifier)
34
- connector = find_within_loaded_connectors(phase, identifier)
35
- unless connector
36
- # Only load external connectors (slow) if not found in built-in connectors
37
- load_all!
38
- connector = find_within_loaded_connectors(phase, identifier)
32
+ # Simple case: built in connector
33
+ connector = connectors.find { |c| c.phase == phase && c.identifier == identifier }
34
+ return connector if connector
35
+
36
+ # if not available in built-in connectors, try to activate a
37
+ # relevant plugin and try again
38
+ if identifier.include?(":")
39
+ plugin, name = identifier.split(":")
40
+ else
41
+ # This case handles the case where the identifier is a
42
+ # shorthand (ie `imessage`) because there's only one default
43
+ # connector.
44
+ plugin = identifier
39
45
  end
40
- connector || raise(ConnectorNotAvailableError.new("Connector '#{identifier}' not found"))
41
- end
42
46
 
43
- def find_within_loaded_connectors(phase, identifier)
44
- @connectors.find { |c| c.phase == phase && c.identifier == identifier }
47
+ PluginRegistry.activate(plugin)
48
+
49
+ candidates = connectors.select { |c| c.phase == phase && c.plugin == plugin }
50
+ # if no name given, just use first connector with right phase/plugin
51
+ # TODO: set up a property for connectors to specify that they're the
52
+ # default connector for the plugin
53
+ candidates = candidates.select { |c| c.identifier == name } if name
54
+ connector = candidates.first
55
+
56
+ connector || raise(ConnectorNotAvailableError, "Connector '#{identifier}' not found")
45
57
  end
46
58
  end
47
59
  end
@@ -50,3 +62,4 @@ end
50
62
 
51
63
  require_relative 'self_registering'
52
64
  require_relative 'connector_registration'
65
+ require_relative 'plugin_registry'
@@ -8,19 +8,41 @@ class Chronicle::ETL::Runner
8
8
  end
9
9
 
10
10
  def run!
11
- extractor = @job.instantiate_extractor
12
- loader = @job.instantiate_loader
11
+ validate_job
12
+ instantiate_connectors
13
+ prepare_job
14
+ prepare_ui
15
+ run_extraction
16
+ finish_job
17
+ end
18
+
19
+ private
20
+
21
+ def validate_job
22
+ @job.job_definition.validate!
23
+ end
13
24
 
25
+ def instantiate_connectors
26
+ @extractor = @job.instantiate_extractor
27
+ @loader = @job.instantiate_loader
28
+ end
29
+
30
+ def prepare_job
31
+ Chronicle::ETL::Logger.info(tty_log_job_start)
14
32
  @job_logger.start
15
- loader.start
33
+ @loader.start
34
+ @extractor.prepare
35
+ end
16
36
 
17
- extractor.prepare
18
- total = extractor.results_count
37
+ def prepare_ui
38
+ total = @extractor.results_count
19
39
  @progress_bar = Chronicle::ETL::Utils::ProgressBar.new(title: 'Running job', total: total)
20
40
  Chronicle::ETL::Logger.attach_to_progress_bar(@progress_bar)
41
+ end
21
42
 
22
- Chronicle::ETL::Logger.info(tty_log_job_start)
23
- extractor.extract do |extraction|
43
+ # TODO: refactor this further
44
+ def run_extraction
45
+ @extractor.extract do |extraction|
24
46
  unless extraction.is_a?(Chronicle::ETL::Extraction)
25
47
  raise Chronicle::ETL::RunnerTypeError, "Extracted should be a Chronicle::ETL::Extraction"
26
48
  end
@@ -28,14 +50,10 @@ class Chronicle::ETL::Runner
28
50
  transformer = @job.instantiate_transformer(extraction)
29
51
  record = transformer.transform
30
52
 
31
- unless record.is_a?(Chronicle::ETL::Models::Base)
32
- raise Chronicle::ETL::RunnerTypeError, "Transformed data should be a type of Chronicle::ETL::Models"
33
- end
34
-
35
53
  Chronicle::ETL::Logger.info(tty_log_transformation(transformer))
36
54
  @job_logger.log_transformation(transformer)
37
55
 
38
- loader.load(record) unless @job.dry_run?
56
+ @loader.load(record) unless @job.dry_run?
39
57
  rescue Chronicle::ETL::TransformationError => e
40
58
  Chronicle::ETL::Logger.error(tty_log_transformation_failure(e))
41
59
  ensure
@@ -43,22 +61,22 @@ class Chronicle::ETL::Runner
43
61
  end
44
62
 
45
63
  @progress_bar.finish
46
- loader.finish
64
+ @loader.finish
47
65
  @job_logger.finish
48
66
  rescue Interrupt
49
67
  Chronicle::ETL::Logger.error("\n#{'Job interrupted'.red}")
50
68
  @job_logger.error
51
69
  rescue StandardError => e
52
70
  raise e
53
- ensure
71
+ end
72
+
73
+ def finish_job
54
74
  @job_logger.save
55
- @progress_bar.finish
75
+ @progress_bar&.finish
56
76
  Chronicle::ETL::Logger.detach_from_progress_bar
57
77
  Chronicle::ETL::Logger.info(tty_log_completion)
58
78
  end
59
79
 
60
- private
61
-
62
80
  def tty_log_job_start
63
81
  output = "Beginning job "
64
82
  output += "'#{@job.name}'".bold if @job.name
@@ -1,6 +1,12 @@
1
1
  module Chronicle
2
2
  module ETL
3
3
  class JSONAPISerializer < Chronicle::ETL::Serializer
4
# Build the serializer, rejecting records this serializer can't handle.
#
# Arguments are forwarded unchanged to the parent initializer, which is
# expected to set @record.
#
# @raise [SerializationError] unless the record is a Chronicle::ETL::Models::Base
def initialize(*args)
  super

  return if @record.is_a?(Chronicle::ETL::Models::Base)

  # Message names the class that is actually checked (Models, not Model)
  raise SerializationError, "Record must be a subclass of Chronicle::ETL::Models::Base"
end
9
+
4
10
  def serializable_hash
5
11
  @record
6
12
  .identifier_hash
@@ -0,0 +1,10 @@
1
module Chronicle
  module ETL
    # Serializer for Raw models: the record's own hash form is emitted
    # as-is, with no further processing.
    class RawSerializer < Chronicle::ETL::Serializer
      # @return [Hash] the record converted with +to_h+
      def serializable_hash
        @record.to_h
      end
    end
  end
end
@@ -24,4 +24,5 @@ module Chronicle
24
24
  end
25
25
  end
26
26
 
27
- require_relative 'jsonapi_serializer'
27
+ require_relative 'jsonapi_serializer'
28
+ require_relative 'raw_serializer'
@@ -7,7 +7,7 @@ module Chronicle
7
7
  end
8
8
 
9
9
  def transform
10
- Chronicle::ETL::Models::Generic.new(@extraction.data)
10
+ Chronicle::ETL::Models::Raw.new(@extraction.data)
11
11
  end
12
12
 
13
13
  def timestamp; end
@@ -1,5 +1,5 @@
1
1
  module Chronicle
2
2
  module ETL
3
- VERSION = "0.4.0"
3
+ VERSION = "0.4.3"
4
4
  end
5
5
  end
data/lib/chronicle/etl.rb CHANGED
@@ -3,23 +3,30 @@ require_relative 'etl/config'
3
3
  require_relative 'etl/configurable'
4
4
  require_relative 'etl/exceptions'
5
5
  require_relative 'etl/extraction'
6
- require_relative 'etl/extractors/extractor'
7
6
  require_relative 'etl/job_definition'
8
7
  require_relative 'etl/job_log'
9
8
  require_relative 'etl/job_logger'
10
9
  require_relative 'etl/job'
11
- require_relative 'etl/loaders/loader'
12
10
  require_relative 'etl/logger'
13
11
  require_relative 'etl/models/activity'
14
12
  require_relative 'etl/models/attachment'
15
13
  require_relative 'etl/models/base'
14
+ require_relative 'etl/models/raw'
16
15
  require_relative 'etl/models/entity'
17
- require_relative 'etl/models/generic'
18
16
  require_relative 'etl/runner'
19
17
  require_relative 'etl/serializers/serializer'
20
- require_relative 'etl/transformers/transformer'
21
18
  require_relative 'etl/utils/binary_attachments'
22
19
  require_relative 'etl/utils/hash_utilities'
23
20
  require_relative 'etl/utils/text_recognition'
24
21
  require_relative 'etl/utils/progress_bar'
25
22
  require_relative 'etl/version'
23
+
24
+ require_relative 'etl/extractors/extractor'
25
+ require_relative 'etl/loaders/loader'
26
+ require_relative 'etl/transformers/transformer'
27
+
28
+ begin
29
+ require 'pry'
30
+ rescue LoadError
31
+ # Pry not available
32
+ end