chronicle-etl 0.5.4 → 0.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79)
  1. checksums.yaml +4 -4
  2. data/.github/workflows/ruby.yml +15 -25
  3. data/.rubocop.yml +2 -44
  4. data/Gemfile +2 -2
  5. data/Guardfile +3 -3
  6. data/README.md +98 -73
  7. data/Rakefile +2 -2
  8. data/bin/console +4 -5
  9. data/chronicle-etl.gemspec +50 -45
  10. data/exe/chronicle-etl +1 -1
  11. data/lib/chronicle/etl/authorizer.rb +3 -4
  12. data/lib/chronicle/etl/cli/authorizations.rb +10 -8
  13. data/lib/chronicle/etl/cli/connectors.rb +9 -9
  14. data/lib/chronicle/etl/cli/jobs.rb +130 -53
  15. data/lib/chronicle/etl/cli/main.rb +29 -29
  16. data/lib/chronicle/etl/cli/plugins.rb +29 -26
  17. data/lib/chronicle/etl/cli/secrets.rb +14 -12
  18. data/lib/chronicle/etl/cli/subcommand_base.rb +5 -3
  19. data/lib/chronicle/etl/config.rb +20 -7
  20. data/lib/chronicle/etl/configurable.rb +24 -9
  21. data/lib/chronicle/etl/exceptions.rb +3 -3
  22. data/lib/chronicle/etl/extraction.rb +12 -2
  23. data/lib/chronicle/etl/extractors/csv_extractor.rb +9 -0
  24. data/lib/chronicle/etl/extractors/extractor.rb +15 -2
  25. data/lib/chronicle/etl/extractors/file_extractor.rb +5 -3
  26. data/lib/chronicle/etl/extractors/helpers/input_reader.rb +2 -2
  27. data/lib/chronicle/etl/extractors/json_extractor.rb +14 -4
  28. data/lib/chronicle/etl/extractors/stdin_extractor.rb +3 -0
  29. data/lib/chronicle/etl/job.rb +35 -17
  30. data/lib/chronicle/etl/job_definition.rb +39 -27
  31. data/lib/chronicle/etl/job_log.rb +14 -16
  32. data/lib/chronicle/etl/job_logger.rb +4 -4
  33. data/lib/chronicle/etl/loaders/csv_loader.rb +17 -4
  34. data/lib/chronicle/etl/loaders/helpers/stdout_helper.rb +4 -0
  35. data/lib/chronicle/etl/loaders/json_loader.rb +30 -10
  36. data/lib/chronicle/etl/loaders/loader.rb +0 -17
  37. data/lib/chronicle/etl/loaders/rest_loader.rb +7 -7
  38. data/lib/chronicle/etl/loaders/table_loader.rb +37 -12
  39. data/lib/chronicle/etl/logger.rb +3 -3
  40. data/lib/chronicle/etl/oauth_authorizer.rb +8 -10
  41. data/lib/chronicle/etl/record.rb +15 -0
  42. data/lib/chronicle/etl/registry/connector_registration.rb +15 -23
  43. data/lib/chronicle/etl/registry/connectors.rb +117 -0
  44. data/lib/chronicle/etl/registry/plugin_registration.rb +19 -0
  45. data/lib/chronicle/etl/registry/plugins.rb +171 -0
  46. data/lib/chronicle/etl/registry/registry.rb +3 -52
  47. data/lib/chronicle/etl/registry/self_registering.rb +1 -1
  48. data/lib/chronicle/etl/runner.rb +158 -128
  49. data/lib/chronicle/etl/secrets.rb +5 -5
  50. data/lib/chronicle/etl/transformers/buffer_transformer.rb +29 -0
  51. data/lib/chronicle/etl/transformers/chronicle_transformer.rb +32 -0
  52. data/lib/chronicle/etl/transformers/chronobase_transformer.rb +100 -0
  53. data/lib/chronicle/etl/transformers/fields_limit_transformer.rb +23 -0
  54. data/lib/chronicle/etl/transformers/filter_fields_transformer.rb +60 -0
  55. data/lib/chronicle/etl/transformers/filter_transformer.rb +30 -0
  56. data/lib/chronicle/etl/transformers/format_transformer.rb +32 -0
  57. data/lib/chronicle/etl/transformers/merge_meta_transformer.rb +19 -0
  58. data/lib/chronicle/etl/transformers/multiply_transformer.rb +21 -0
  59. data/lib/chronicle/etl/transformers/null_transformer.rb +5 -7
  60. data/lib/chronicle/etl/transformers/sampler_transformer.rb +21 -0
  61. data/lib/chronicle/etl/transformers/sort_transformer.rb +31 -0
  62. data/lib/chronicle/etl/transformers/transformer.rb +63 -41
  63. data/lib/chronicle/etl/utils/binary_attachments.rb +1 -1
  64. data/lib/chronicle/etl/utils/progress_bar.rb +2 -3
  65. data/lib/chronicle/etl/version.rb +1 -1
  66. data/lib/chronicle/etl.rb +6 -8
  67. metadata +91 -45
  68. data/lib/chronicle/etl/models/activity.rb +0 -15
  69. data/lib/chronicle/etl/models/attachment.rb +0 -14
  70. data/lib/chronicle/etl/models/base.rb +0 -122
  71. data/lib/chronicle/etl/models/entity.rb +0 -29
  72. data/lib/chronicle/etl/models/raw.rb +0 -26
  73. data/lib/chronicle/etl/registry/plugin_registry.rb +0 -95
  74. data/lib/chronicle/etl/serializers/jsonapi_serializer.rb +0 -31
  75. data/lib/chronicle/etl/serializers/raw_serializer.rb +0 -10
  76. data/lib/chronicle/etl/serializers/serializer.rb +0 -28
  77. data/lib/chronicle/etl/transformers/image_file_transformer.rb +0 -247
  78. data/lib/chronicle/etl/utils/hash_utilities.rb +0 -19
  79. data/lib/chronicle/etl/utils/text_recognition.rb +0 -15
@@ -1,15 +1,18 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Chronicle
2
4
  module ETL
3
5
  module Registry
4
- # Records details about a connector such as its provider and a description
6
+ # Records details about a connector such as its source provider and a description
5
7
  class ConnectorRegistration
6
- # FIXME: refactor custom accessor methods later in file
7
- attr_accessor :identifier, :provider, :klass, :description
8
+ attr_accessor :klass, :identifier, :source, :strategy, :type, :description, :from_schema, :to_schema
8
9
 
10
+ # Create a new connector registration
9
11
  def initialize(klass)
10
12
  @klass = klass
11
13
  end
12
14
 
15
+ # The ETL phase of this connector
13
16
  def phase
14
17
  if klass.ancestors.include? Chronicle::ETL::Extractor
15
18
  :extractor
@@ -24,6 +27,7 @@ module Chronicle
24
27
  "#{phase}-#{identifier}"
25
28
  end
26
29
 
30
+ # Whether this connector is built-in to Chronicle
27
31
  def built_in?
28
32
  @klass.to_s.include? 'Chronicle::ETL'
29
33
  end
@@ -32,32 +36,20 @@ module Chronicle
32
36
  @klass.to_s
33
37
  end
34
38
 
35
- def identifier
36
- @identifier || @klass.to_s.split('::').last.gsub!(/(Extractor$|Loader$|Transformer$)/, '').downcase
37
- end
38
-
39
- def description
40
- @description || @klass.to_s.split('::').last
41
- end
42
-
43
- def provider
44
- @provider || (built_in? ? 'chronicle' : '')
45
- end
46
-
47
39
  # TODO: allow overriding here. Maybe through self-registration process
48
40
  def plugin
49
- @provider
41
+ @source
50
42
  end
51
43
 
52
44
  def descriptive_phrase
53
45
  prefix = case phase
54
- when :extractor
55
- "Extracts from"
56
- when :transformer
57
- "Transforms"
58
- when :loader
59
- "Loads to"
60
- end
46
+ when :extractor
47
+ 'Extracts from'
48
+ when :transformer
49
+ 'Transforms'
50
+ when :loader
51
+ 'Loads to'
52
+ end
61
53
 
62
54
  "#{prefix} #{description}"
63
55
  end
@@ -0,0 +1,117 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'rubygems'
4
+
5
+ module Chronicle
6
+ module ETL
7
+ module Registry
8
+ # A singleton class that acts as a registry of connector classes available for ETL jobs
9
+ module Connectors
10
+ PHASES = %i[extractor transformer loader].freeze
11
+ public_constant :PHASES
12
+
13
+ class << self
14
+ attr_accessor :connectors
15
+ end
16
+
17
+ def self.register(connector)
18
+ connectors << connector
19
+ end
20
+
21
+ def self.connectors
22
+ @connectors ||= []
23
+ end
24
+
25
+ def self.ancestor_for_phase(phase)
26
+ case phase
27
+ when :extractor
28
+ Chronicle::ETL::Extractor
29
+ when :transformer
30
+ Chronicle::ETL::Transformer
31
+ when :loader
32
+ Chronicle::ETL::Loader
33
+ end
34
+ end
35
+
36
+ def self.find_converter_for_source(source:, type: nil, strategy: nil, target: nil)
37
+ # FIXME: we're assuming extractor plugin has been loaded already
38
+ # This may not be the case if the schema converter is running
39
+ # off a json dump off extraction data.
40
+ # plugin = source_klass.connector_registration.source
41
+ # type = source_klass.connector_registration.type
42
+ # strategy = source_klass.connector_registration.strategy
43
+
44
+ connectors.find do |c|
45
+ c.phase == :transformer &&
46
+ c.source == source &&
47
+ (type.nil? || c.type == type) &&
48
+ (strategy.nil? || c.strategy == strategy || c.strategy.nil?) &&
49
+ (target.nil? || c.to_schema == target)
50
+ end
51
+ end
52
+
53
+ # Find connector from amongst those currently loaded
54
+ def self.find_by_phase_and_identifier_built_in(phase, identifier)
55
+ connectors.find { |c| c.phase == phase.to_sym && c.identifier == identifier.to_sym }
56
+ end
57
+
58
+ # Find connector and load relevant plugin to find it if necessary
59
+ def self.find_by_phase_and_identifier(phase, identifier)
60
+ connector = find_by_phase_and_identifier_built_in(phase, identifier)
61
+ return connector if connector
62
+
63
+ # determine if we need to try to load a local file. if it has a dot in the identifier, we treat it as a file
64
+ return find_by_phase_and_identifier_local(phase, identifier) if identifier.to_s.include?('.')
65
+
66
+ # Example identifier: lastfm:listens:api
67
+ plugin, type, strategy = identifier.split(':')
68
+ .map { |part| part.gsub('-', '_') }
69
+ .map(&:to_sym)
70
+
71
+ plugin_identifier = plugin.to_s.gsub('_', '-')
72
+
73
+ unless Chronicle::ETL::Registry::Plugins.installed?(plugin_identifier)
74
+ raise Chronicle::ETL::PluginNotInstalledError, plugin_identifier
75
+ end
76
+
77
+ Chronicle::ETL::Registry::Plugins.activate(plugin_identifier)
78
+
79
+ # find most specific connector that matches the identifier
80
+ connector = connectors.find do |c|
81
+ c.plugin == plugin && (type.nil? || c.type == type) && (strategy.nil? || c.strategy == strategy)
82
+ end
83
+
84
+ connector || raise(ConnectorNotAvailableError, "Connector '#{identifier}' not found")
85
+ end
86
+
87
+ # Load a plugin from local file system
88
+ def self.find_by_phase_and_identifier_local(phase, identifier)
89
+ script = File.read(identifier)
90
+ raise ConnectorNotAvailableError, "Connector '#{identifier}' not found" if script.nil?
91
+
92
+ # load the file by evaluating the contents
93
+ eval(script, TOPLEVEL_BINDING, __FILE__, __LINE__) # rubocop:disable Security/Eval
94
+
95
+ # read the file and look for all class definitions in the ruby script.
96
+ class_names = script.scan(/class (\w+)/).flatten
97
+
98
+ class_names.each do |class_name|
99
+ klass = Object.const_get(class_name)
100
+
101
+ next unless klass.ancestors.include?(ancestor_for_phase(phase))
102
+
103
+ registration = ::Chronicle::ETL::Registry::ConnectorRegistration.new(klass)
104
+
105
+ klass.connector_registration = registration
106
+ return registration
107
+ # return klass
108
+ rescue NameError
109
+ # ignore
110
+ end
111
+
112
+ raise ConnectorNotAvailableError, "Connector '#{identifier}' not found"
113
+ end
114
+ end
115
+ end
116
+ end
117
+ end
@@ -0,0 +1,19 @@
1
+ module Chronicle
2
+ module ETL
3
+ module Registry
4
+ class PluginRegistration
5
+ attr_accessor :name, :description, :gem, :version, :installed, :gemspec
6
+
7
+ def initialize(name = nil)
8
+ @installed = false
9
+ @name = name
10
+ yield self if block_given?
11
+ end
12
+
13
+ def installed?
14
+ @installed || false
15
+ end
16
+ end
17
+ end
18
+ end
19
+ end
@@ -0,0 +1,171 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'rubygems'
4
+ require 'rubygems/command'
5
+ require 'rubygems/commands/install_command'
6
+ require 'rubygems/uninstaller'
7
+ require 'gems'
8
+ require 'active_support/core_ext/hash/deep_merge'
9
+
10
+ module Chronicle
11
+ module ETL
12
+ module Registry
13
+ # Responsible for managing plugins available to chronicle-etl
14
+ #
15
+ # @todo Better validation for whether a gem is actually a plugin
16
+ # @todo Add ways to load a plugin that don't require a gem on rubygems.org
17
+ module Plugins
18
+ KNOWN_PLUGINS = %w[
19
+ apple-podcasts
20
+ email
21
+ foursquare
22
+ github
23
+ imessage
24
+ pinboard
25
+ safari
26
+ shell
27
+ spotify
28
+ zulip
29
+ ].freeze
30
+ public_constant :KNOWN_PLUGINS
31
+
32
+ # Start of a system for having non-gem plugins. Right now, we just
33
+ # make registry aware of existence of name of non-gem plugin
34
+ def self.register_standalone(name:)
35
+ plugin = Chronicle::ETL::Registry::PluginRegistration.new do |p|
36
+ p.name = name.to_sym
37
+ p.installed = true
38
+ end
39
+
40
+ installed_standalone << plugin
41
+ end
42
+
43
+ # Plugins either installed as gems or manually loaded/registered
44
+ def self.installed
45
+ installed_standalone + installed_as_gem
46
+ end
47
+
48
+ # Check whether a given plugin is installed
49
+ def self.installed?(name)
50
+ installed.map(&:name).include?(name.to_sym)
51
+ end
52
+
53
+ # List of plugins installed as standalone
54
+ def self.installed_standalone
55
+ @installed_standalone ||= []
56
+ end
57
+
58
+ # List of plugins installed as gems
59
+ def self.installed_as_gem
60
+ installed_gemspecs_latest.map do |gem|
61
+ Chronicle::ETL::Registry::PluginRegistration.new do |p|
62
+ p.name = gem.name.sub('chronicle-', '').to_sym
63
+ p.gem = gem.name
64
+ p.description = gem.description
65
+ p.version = gem.version.to_s
66
+ p.installed = true
67
+ end
68
+ end
69
+ end
70
+
71
+ # List of all plugins available to chronicle-etl
72
+ def self.available
73
+ available_as_gem
74
+ end
75
+
76
+ # List of plugins available through rubygems
77
+ # TODO: make this concurrent
78
+ def self.available_as_gem
79
+ KNOWN_PLUGINS.map do |name|
80
+ info = gem_info(name)
81
+ Chronicle::ETL::Registry::PluginRegistration.new do |p|
82
+ p.name = name
83
+ p.gem = info['name']
84
+ p.version = info['version']
85
+ p.description = info['info']
86
+ end
87
+ end
88
+ end
89
+
90
+ # Load info about a gem plugin from rubygems API
91
+ def self.gem_info(name)
92
+ gem_name = "chronicle-#{name}"
93
+ Gems.info(gem_name)
94
+ end
95
+
96
+ # Union of installed gems (latest version) + available gems
97
+ def self.all
98
+ (installed + available)
99
+ .group_by(&:name)
100
+ .transform_values { |plugin| plugin.find(&:installed) || plugin.first }
101
+ .values
102
+ end
103
+
104
+ # Does a plugin with a given name exist?
105
+ def self.exists?(name)
106
+ KNOWN_PLUGINS.include?(name)
107
+ end
108
+
109
+ # All versions of all plugins currently installed
110
+ def self.installed_gemspecs
111
+ # TODO: add check for chronicle-etl dependency
112
+ Gem::Specification.filter do |s|
113
+ s.name.match(/^chronicle-/) && s.name != 'chronicle-etl' && s.name != 'chronicle-core'
114
+ end
115
+ end
116
+
117
+ # Latest version of each installed plugin
118
+ def self.installed_gemspecs_latest
119
+ installed_gemspecs.group_by(&:name)
120
+ .transform_values { |versions| versions.sort_by(&:version).reverse.first }
121
+ .values
122
+ end
123
+
124
+ # Activate a plugin with given name by `require`ing it
125
+ def self.activate(name)
126
+ # By default, activates the latest available version of a gem
127
+ # so don't have to run Kernel#gem separately
128
+
129
+ plugin_require_name = name.to_s.gsub('-', '_')
130
+ require "chronicle/#{plugin_require_name}"
131
+ rescue Gem::ConflictError => e
132
+ # TODO: figure out if there's more we can do here
133
+ raise Chronicle::ETL::PluginConflictError.new(name),
134
+ "Plugin '#{plugin_require_name}' couldn't be loaded. #{e.message}"
135
+ rescue StandardError, LoadError
136
+ # StandardError to catch random non-loading problems that might occur
137
+ # when requiring the plugin (eg class macro invoked the wrong way)
138
+ # TODO: decide if this should be separated
139
+ raise Chronicle::ETL::PluginLoadError.new(name), "Plugin '#{plugin_require_name}' couldn't be loaded"
140
+ end
141
+
142
+ # Install a plugin to local gems
143
+ def self.install(name)
144
+ return if installed?(name)
145
+ raise(Chronicle::ETL::PluginNotAvailableError.new(name), "Plugin #{name} doesn't exist") unless exists?(name)
146
+
147
+ gem_name = "chronicle-#{name}"
148
+
149
+ Gem::DefaultUserInteraction.ui = Gem::SilentUI.new
150
+ Gem.install(gem_name)
151
+
152
+ activate(name)
153
+ rescue Gem::UnsatisfiableDependencyError
154
+ # TODO: we need to catch a lot more than this here
155
+ raise Chronicle::ETL::PluginNotAvailableError.new(name), "Plugin #{name} could not be installed."
156
+ end
157
+
158
+ # Uninstall a plugin
159
+ def self.uninstall(name)
160
+ gem_name = "chronicle-#{name}"
161
+ Gem::DefaultUserInteraction.ui = Gem::SilentUI.new
162
+ uninstaller = Gem::Uninstaller.new(gem_name)
163
+ uninstaller.uninstall
164
+ rescue Gem::InstallError
165
+ # TODO: strengthen this exception handling
166
+ raise(Chronicle::ETL::PluginError.new(name), "Plugin #{name} wasn't uninstalled")
167
+ end
168
+ end
169
+ end
170
+ end
171
+ end
@@ -1,61 +1,12 @@
1
- require 'rubygems'
2
-
3
1
  module Chronicle
4
2
  module ETL
5
- # A singleton class that acts as a registry of connector classes available for ETL jobs
6
3
  module Registry
7
- PHASES = [:extractor, :transformer, :loader]
8
-
9
- class << self
10
- attr_accessor :connectors
11
-
12
- def register(connector)
13
- connectors << connector
14
- end
15
-
16
- def connectors
17
- @connectors ||= []
18
- end
19
-
20
- # Find connector from amongst those currently loaded
21
- def find_by_phase_and_identifier_local(phase, identifier)
22
- connector = connectors.find { |c| c.phase == phase && c.identifier == identifier }
23
- end
24
-
25
- # Find connector and load relevant plugin to find it if necessary
26
- def find_by_phase_and_identifier(phase, identifier)
27
- connector = find_by_phase_and_identifier_local(phase, identifier)
28
- return connector if connector
29
-
30
- # if not available in built-in connectors, try to activate a
31
- # relevant plugin and try again
32
- if identifier.include?(":")
33
- plugin, name = identifier.split(":")
34
- else
35
- # This case handles the case where the identifier is a
36
- # shorthand (ie `imessage`) because there's only one default
37
- # connector.
38
- plugin = identifier
39
- end
40
-
41
- raise(Chronicle::ETL::PluginNotInstalledError.new(plugin)) unless PluginRegistry.installed?(plugin)
42
-
43
- PluginRegistry.activate(plugin)
44
-
45
- candidates = connectors.select { |c| c.phase == phase && c.plugin == plugin }
46
- # if no name given, just use first connector with right phase/plugin
47
- # TODO: set up a property for connectors to specify that they're the
48
- # default connector for the plugin
49
- candidates = candidates.select { |c| c.identifier == name } if name
50
- connector = candidates.first
51
-
52
- connector || raise(ConnectorNotAvailableError, "Connector '#{identifier}' not found")
53
- end
54
- end
55
4
  end
56
5
  end
57
6
  end
58
7
 
59
8
  require_relative 'self_registering'
60
9
  require_relative 'connector_registration'
61
- require_relative 'plugin_registry'
10
+ require_relative 'connectors'
11
+ require_relative 'plugin_registration'
12
+ require_relative 'plugins'
@@ -17,7 +17,7 @@ module Chronicle
17
17
  def register_connector
18
18
  @connector_registration ||= ::Chronicle::ETL::Registry::ConnectorRegistration.new(self)
19
19
  yield @connector_registration if block_given?
20
- ::Chronicle::ETL::Registry.register(@connector_registration)
20
+ ::Chronicle::ETL::Registry::Connectors.register(@connector_registration)
21
21
  end
22
22
  end
23
23
  end