chronicle-etl 0.3.1 → 0.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. checksums.yaml +4 -4
  2. data/.github/workflows/ruby.yml +35 -0
  3. data/.rubocop.yml +31 -1
  4. data/Guardfile +7 -0
  5. data/README.md +157 -82
  6. data/Rakefile +4 -2
  7. data/chronicle-etl.gemspec +11 -3
  8. data/exe/chronicle-etl +1 -1
  9. data/lib/chronicle/etl/cli/connectors.rb +34 -5
  10. data/lib/chronicle/etl/cli/jobs.rb +90 -24
  11. data/lib/chronicle/etl/cli/main.rb +41 -19
  12. data/lib/chronicle/etl/cli/plugins.rb +62 -0
  13. data/lib/chronicle/etl/cli/subcommand_base.rb +2 -2
  14. data/lib/chronicle/etl/cli.rb +9 -0
  15. data/lib/chronicle/etl/config.rb +7 -4
  16. data/lib/chronicle/etl/configurable.rb +163 -0
  17. data/lib/chronicle/etl/exceptions.rb +29 -1
  18. data/lib/chronicle/etl/extractors/csv_extractor.rb +24 -23
  19. data/lib/chronicle/etl/extractors/extractor.rb +16 -15
  20. data/lib/chronicle/etl/extractors/file_extractor.rb +34 -11
  21. data/lib/chronicle/etl/extractors/helpers/input_reader.rb +76 -0
  22. data/lib/chronicle/etl/extractors/json_extractor.rb +19 -18
  23. data/lib/chronicle/etl/job.rb +8 -2
  24. data/lib/chronicle/etl/job_definition.rb +20 -5
  25. data/lib/chronicle/etl/loaders/csv_loader.rb +36 -9
  26. data/lib/chronicle/etl/loaders/helpers/encoding_helper.rb +18 -0
  27. data/lib/chronicle/etl/loaders/json_loader.rb +44 -0
  28. data/lib/chronicle/etl/loaders/loader.rb +28 -2
  29. data/lib/chronicle/etl/loaders/rest_loader.rb +5 -5
  30. data/lib/chronicle/etl/loaders/table_loader.rb +18 -37
  31. data/lib/chronicle/etl/logger.rb +6 -2
  32. data/lib/chronicle/etl/models/base.rb +3 -0
  33. data/lib/chronicle/etl/models/entity.rb +8 -2
  34. data/lib/chronicle/etl/models/raw.rb +26 -0
  35. data/lib/chronicle/etl/registry/connector_registration.rb +6 -0
  36. data/lib/chronicle/etl/registry/plugin_registry.rb +70 -0
  37. data/lib/chronicle/etl/registry/registry.rb +27 -14
  38. data/lib/chronicle/etl/runner.rb +35 -17
  39. data/lib/chronicle/etl/serializers/jsonapi_serializer.rb +6 -0
  40. data/lib/chronicle/etl/serializers/raw_serializer.rb +10 -0
  41. data/lib/chronicle/etl/serializers/serializer.rb +2 -1
  42. data/lib/chronicle/etl/transformers/image_file_transformer.rb +22 -28
  43. data/lib/chronicle/etl/transformers/null_transformer.rb +1 -1
  44. data/lib/chronicle/etl/transformers/transformer.rb +3 -2
  45. data/lib/chronicle/etl/version.rb +1 -1
  46. data/lib/chronicle/etl.rb +12 -4
  47. metadata +123 -18
  48. data/.ruby-version +0 -1
  49. data/lib/chronicle/etl/extractors/helpers/filesystem_reader.rb +0 -104
  50. data/lib/chronicle/etl/loaders/stdout_loader.rb +0 -14
  51. data/lib/chronicle/etl/models/generic.rb +0 -23
@@ -2,46 +2,47 @@ require 'csv'
2
2
 
3
3
  module Chronicle
4
4
  module ETL
5
- class CsvExtractor < Chronicle::ETL::Extractor
6
- include Extractors::Helpers::FilesystemReader
5
+ class CSVExtractor < Chronicle::ETL::Extractor
6
+ include Extractors::Helpers::InputReader
7
7
 
8
8
  register_connector do |r|
9
- r.description = 'input as CSV'
9
+ r.description = 'CSV'
10
10
  end
11
11
 
12
- DEFAULT_OPTIONS = {
13
- headers: true,
14
- filename: $stdin
15
- }.freeze
12
+ setting :headers, default: true
16
13
 
17
- def initialize(options = {})
18
- super(DEFAULT_OPTIONS.merge(options))
14
+ def prepare
15
+ @csvs = prepare_sources
19
16
  end
20
17
 
21
18
  def extract
22
- csv = initialize_csv
23
- csv.each do |row|
24
- yield Chronicle::ETL::Extraction.new(data: row.to_h)
19
+ @csvs.each do |csv|
20
+ csv.read.each do |row|
21
+ yield Chronicle::ETL::Extraction.new(data: row.to_h)
22
+ end
25
23
  end
26
24
  end
27
25
 
28
26
  def results_count
29
- CSV.read(@options[:filename], headers: @options[:headers]).count unless stdin?(@options[:filename])
27
+ @csvs.reduce(0) do |total_rows, csv|
28
+ row_count = csv.readlines.size
29
+ csv.rewind
30
+ total_rows + row_count
31
+ end
30
32
  end
31
33
 
32
34
  private
33
35
 
34
- def initialize_csv
35
- headers = @options[:headers].is_a?(String) ? @options[:headers].split(',') : @options[:headers]
36
-
37
- csv_options = {
38
- headers: headers,
39
- converters: :all
40
- }
41
-
42
- open_from_filesystem(filename: @options[:filename]) do |file|
43
- return CSV.new(file, **csv_options)
36
+ def prepare_sources
37
+ @csvs = []
38
+ read_input do |csv_data|
39
+ csv_options = {
40
+ headers: @config.headers.is_a?(String) ? @config.headers.split(',') : @config.headers,
41
+ converters: :all
42
+ }
43
+ @csvs << CSV.new(csv_data, **csv_options)
44
44
  end
45
+ @csvs
45
46
  end
46
47
  end
47
48
  end
@@ -5,15 +5,20 @@ module Chronicle
5
5
  # Abstract class representing an Extractor for an ETL job
6
6
  class Extractor
7
7
  extend Chronicle::ETL::Registry::SelfRegistering
8
+ include Chronicle::ETL::Configurable
9
+
10
+ setting :since, type: :time
11
+ setting :until, type: :time
12
+ setting :limit, type: :numeric
13
+ setting :load_after_id
14
+ setting :input
8
15
 
9
16
  # Construct a new instance of this extractor. Options are passed in from a Runner
10
- # == Paramters:
17
+ # == Parameters:
11
18
  # options::
12
19
  # Options for configuring this Extractor
13
20
  def initialize(options = {})
14
- @options = options.transform_keys!(&:to_sym)
15
- sanitize_options
16
- handle_continuation
21
+ apply_options(options)
17
22
  end
18
23
 
19
24
  # Hook called before #extract. Useful for gathering data, initializing proxies, etc
@@ -30,22 +35,18 @@ module Chronicle
30
35
 
31
36
  private
32
37
 
33
- def sanitize_options
34
- @options[:load_since] = Time.parse(@options[:load_since]) if @options[:load_since] && @options[:load_since].is_a?(String)
35
- @options[:load_until] = Time.parse(@options[:load_until]) if @options[:load_until] && @options[:load_until].is_a?(String)
36
- end
37
-
38
- def handle_continuation
39
- return unless @options[:continuation]
38
+ # TODO: reimplement this
39
+ # def handle_continuation
40
+ # return unless @config.continuation
40
41
 
41
- @options[:load_since] = @options[:continuation].highest_timestamp if @options[:continuation].highest_timestamp
42
- @options[:load_after_id] = @options[:continuation].last_id if @options[:continuation].last_id
43
- end
42
+ # @config.since = @config.continuation.highest_timestamp if @config.continuation.highest_timestamp
43
+ # @config.load_after_id = @config.continuation.last_id if @config.continuation.last_id
44
+ # end
44
45
  end
45
46
  end
46
47
  end
47
48
 
48
- require_relative 'helpers/filesystem_reader'
49
+ require_relative 'helpers/input_reader'
49
50
  require_relative 'csv_extractor'
50
51
  require_relative 'file_extractor'
51
52
  require_relative 'json_extractor'
@@ -2,32 +2,55 @@ require 'pathname'
2
2
 
3
3
  module Chronicle
4
4
  module ETL
5
+ # Return filenames that match a pattern in a directory
5
6
  class FileExtractor < Chronicle::ETL::Extractor
6
- include Extractors::Helpers::FilesystemReader
7
7
 
8
8
  register_connector do |r|
9
9
  r.description = 'file or directory of files'
10
10
  end
11
11
 
12
+ setting :input, default: ['.']
13
+ setting :dir_glob_pattern, default: "**/*"
14
+ setting :larger_than
15
+ setting :smaller_than
16
+
17
+ def prepare
18
+ @pathnames = gather_files
19
+ end
20
+
12
21
  def extract
13
- filenames.each do |filename|
14
- yield Chronicle::ETL::Extraction.new(data: filename)
22
+ @pathnames.each do |pathname|
23
+ yield Chronicle::ETL::Extraction.new(data: pathname.to_path)
15
24
  end
16
25
  end
17
26
 
18
27
  def results_count
19
- filenames.count
28
+ @pathnames.count
20
29
  end
21
30
 
22
31
  private
23
32
 
24
- def filenames
25
- @filenames ||= filenames_in_directory(
26
- path: @options[:filename],
27
- dir_glob_pattern: @options[:dir_glob_pattern],
28
- load_since: @options[:load_since],
29
- load_until: @options[:load_until]
30
- )
33
+ def gather_files
34
+ roots = [@config.input].flatten.map { |filename| Pathname.new(filename) }
35
+ raise(ExtractionError, "Input must exist") unless roots.all?(&:exist?)
36
+
37
+ directories, files = roots.partition(&:directory?)
38
+
39
+ directories.each do |directory|
40
+ files += Dir.glob(File.join(directory, @config.dir_glob_pattern)).map { |filename| Pathname.new(filename) }
41
+ end
42
+
43
+ files = files.uniq
44
+
45
+ files = files.keep_if { |f| (f.mtime > @config.since) } if @config.since
46
+ files = files.keep_if { |f| (f.mtime < @config.until) } if @config.until
47
+
48
+ # pass in file sizes in bytes
49
+ files = files.keep_if { |f| (f.size < @config.smaller_than) } if @config.smaller_than
50
+ files = files.keep_if { |f| (f.size > @config.larger_than) } if @config.larger_than
51
+
52
+ # # TODO: incorporate sort argument
53
+ files.sort_by(&:mtime)
31
54
  end
32
55
  end
33
56
  end
@@ -0,0 +1,76 @@
1
+ require 'pathname'
2
+
3
+ module Chronicle
4
+ module ETL
5
+ module Extractors
6
+ module Helpers
7
+ module InputReader
8
+ # Return an array of input filenames; converts a single string
9
+ # to an array if necessary
10
+ def filenames
11
+ [@config.input].flatten.map
12
+ end
13
+
14
+ # Filenames as an array of pathnames
15
+ def pathnames
16
+ filenames.map { |filename| Pathname.new(filename) }
17
+ end
18
+
19
+ # Whether we're reading from files
20
+ def read_from_files?
21
+ filenames.any?
22
+ end
23
+
24
+ # Whether we're reading input from stdin
25
+ def read_from_stdin?
26
+ !read_from_files? && $stdin.stat.pipe?
27
+ end
28
+
29
+ # Read input sources and yield each content
30
+ def read_input
31
+ if read_from_files?
32
+ pathnames.each do |pathname|
33
+ File.open(pathname) do |file|
34
+ yield file.read, pathname.to_path
35
+ end
36
+ end
37
+ elsif read_from_stdin?
38
+ yield $stdin.read, $stdin
39
+ else
40
+ raise ExtractionError, "No input files or stdin provided"
41
+ end
42
+ end
43
+
44
+ # Read input sources line by line
45
+ def read_input_as_lines(&block)
46
+ if read_from_files?
47
+ lines_from_files(&block)
48
+ elsif read_from_stdin?
49
+ lines_from_stdin(&block)
50
+ else
51
+ raise ExtractionError, "No input files or stdin provided"
52
+ end
53
+ end
54
+
55
+ private
56
+
57
+ def lines_from_files(&block)
58
+ pathnames.each do |pathname|
59
+ File.open(pathname) do |file|
60
+ lines_from_io(file, &block)
61
+ end
62
+ end
63
+ end
64
+
65
+ def lines_from_stdin(&block)
66
+ lines_from_io($stdin, &block)
67
+ end
68
+
69
+ def lines_from_io(io, &block)
70
+ io.each_line(&block)
71
+ end
72
+ end
73
+ end
74
+ end
75
+ end
76
+ end
@@ -1,43 +1,44 @@
1
1
  module Chronicle
2
2
  module ETL
3
- class JsonExtractor < Chronicle::ETL::Extractor
4
- include Extractors::Helpers::FilesystemReader
3
+ class JSONExtractor < Chronicle::ETL::Extractor
4
+ include Extractors::Helpers::InputReader
5
5
 
6
6
  register_connector do |r|
7
- r.description = 'input as JSON'
7
+ r.description = 'JSON'
8
8
  end
9
9
 
10
- DEFAULT_OPTIONS = {
11
- filename: $stdin,
10
+ setting :jsonl, default: true, type: :boolean
12
11
 
13
- # We're expecting line-separated json objects
14
- jsonl: true
15
- }.freeze
16
-
17
- def initialize(options = {})
18
- super(DEFAULT_OPTIONS.merge(options))
12
+ def prepare
13
+ @jsons = []
14
+ load_input do |input|
15
+ @jsons << parse_data(input)
16
+ end
19
17
  end
20
18
 
21
19
  def extract
22
- load_input do |input|
23
- parsed_data = parse_data(input)
24
- yield Chronicle::ETL::Extraction.new(data: parsed_data) if parsed_data
20
+ @jsons.each do |json|
21
+ yield Chronicle::ETL::Extraction.new(data: json)
25
22
  end
26
23
  end
27
24
 
28
25
  def results_count
26
+ @jsons.count
29
27
  end
30
28
 
31
29
  private
32
30
 
33
31
  def parse_data data
34
32
  JSON.parse(data)
35
- rescue JSON::ParserError => e
33
+ rescue JSON::ParserError
34
+ raise Chronicle::ETL::ExtractionError, "Could not parse JSON"
36
35
  end
37
36
 
38
- def load_input
39
- read_from_filesystem(filename: @options[:filename]) do |data|
40
- yield data
37
+ def load_input(&block)
38
+ if @config.jsonl
39
+ read_input_as_lines(&block)
40
+ else
41
+ read_input(&block)
41
42
  end
42
43
  end
43
44
  end
@@ -1,6 +1,11 @@
1
1
  require 'forwardable'
2
+
2
3
  module Chronicle
3
4
  module ETL
5
+ # A runner job
6
+ #
7
+ # TODO: this can probably be merged with JobDefinition. Not clear
8
+ # where the boundaries are
4
9
  class Job
5
10
  extend Forwardable
6
11
 
@@ -12,7 +17,8 @@ module Chronicle
12
17
  :transformer_klass,
13
18
  :transformer_options,
14
19
  :loader_klass,
15
- :loader_options
20
+ :loader_options,
21
+ :job_definition
16
22
 
17
23
  # TODO: build a proper id system
18
24
  alias id name
@@ -35,7 +41,7 @@ module Chronicle
35
41
 
36
42
  def instantiate_transformer(extraction)
37
43
  @transformer_klass = @job_definition.transformer_klass
38
- @transformer_klass.new(@transformer_options, extraction)
44
+ @transformer_klass.new(extraction, @transformer_options)
39
45
  end
40
46
 
41
47
  def instantiate_loader
@@ -14,17 +14,36 @@ module Chronicle
14
14
  options: {}
15
15
  },
16
16
  loader: {
17
- name: 'stdout',
17
+ name: 'table',
18
18
  options: {}
19
19
  }
20
20
  }.freeze
21
21
 
22
+ attr_reader :errors
22
23
  attr_accessor :definition
23
24
 
24
25
  def initialize()
25
26
  @definition = SKELETON_DEFINITION
26
27
  end
27
28
 
29
+ def validate
30
+ @errors = []
31
+
32
+ Chronicle::ETL::Registry::PHASES.each do |phase|
33
+ __send__("#{phase}_klass".to_sym)
34
+ rescue Chronicle::ETL::PluginError => e
35
+ @errors << e
36
+ end
37
+
38
+ @errors.empty?
39
+ end
40
+
41
+ def validate!
42
+ raise(Chronicle::ETL::JobDefinitionError.new(self), "Job definition is invalid") unless validate
43
+
44
+ true
45
+ end
46
+
28
47
  # Add config hash to this definition
29
48
  def add_config(config = {})
30
49
  @definition = @definition.deep_merge(config)
@@ -80,10 +99,6 @@ module Chronicle
80
99
  end
81
100
  end
82
101
  end
83
-
84
- def validate
85
- return true # TODO
86
- end
87
102
  end
88
103
  end
89
104
  end
@@ -2,27 +2,54 @@ require 'csv'
2
2
 
3
3
  module Chronicle
4
4
  module ETL
5
- class CsvLoader < Chronicle::ETL::Loader
5
+ class CSVLoader < Chronicle::ETL::Loader
6
6
  register_connector do |r|
7
7
  r.description = 'CSV'
8
8
  end
9
9
 
10
- def initialize(options={})
11
- super(options)
12
- @rows = []
10
+ setting :output, default: $stdout
11
+ setting :headers, default: true
12
+ setting :header_row, default: true
13
+
14
+ def records
15
+ @records ||= []
13
16
  end
14
17
 
15
18
  def load(record)
16
- @rows << record.to_h_flattened.values
19
+ records << record.to_h_flattened
17
20
  end
18
21
 
19
22
  def finish
20
- z = $stdout
21
- CSV(z) do |csv|
22
- @rows.each do |row|
23
- csv << row
23
+ return unless records.any?
24
+
25
+ headers = build_headers(records)
26
+
27
+ csv_options = {}
28
+ if @config.headers
29
+ csv_options[:write_headers] = @config.header_row
30
+ csv_options[:headers] = headers
31
+ end
32
+
33
+ if @config.output.is_a?(IO)
34
+ # This might seem like a duplication of the default value ($stdout)
35
+ # but it's because rspec overwrites $stdout (in helper #capture) to
36
+ # capture output.
37
+ io = $stdout.dup
38
+ else
39
+ io = File.open(@config.output, "w+")
40
+ end
41
+
42
+ output = CSV.generate(**csv_options) do |csv|
43
+ records.each do |record|
44
+ csv << record
45
+ .transform_keys(&:to_sym)
46
+ .values_at(*headers)
47
+ .map { |value| force_utf8(value) }
24
48
  end
25
49
  end
50
+
51
+ io.write(output)
52
+ io.close
26
53
  end
27
54
  end
28
55
  end
@@ -0,0 +1,18 @@
1
+ require 'pathname'
2
+
3
+ module Chronicle
4
+ module ETL
5
+ module Loaders
6
+ module Helpers
7
+ module EncodingHelper
8
+ # Mostly useful for handling loading with binary data from a raw extraction
9
+ def force_utf8(value)
10
+ return value unless value.is_a?(String)
11
+
12
+ value.encode('UTF-8', invalid: :replace, undef: :replace, replace: '')
13
+ end
14
+ end
15
+ end
16
+ end
17
+ end
18
+ end
@@ -0,0 +1,44 @@
1
+ module Chronicle
2
+ module ETL
3
+ class JSONLoader < Chronicle::ETL::Loader
4
+ register_connector do |r|
5
+ r.description = 'json'
6
+ end
7
+
8
+ setting :serializer
9
+ setting :output, default: $stdout
10
+
11
+ def start
12
+ if @config.output == $stdout
13
+ @output = @config.output
14
+ else
15
+ @output = File.open(@config.output, "w")
16
+ end
17
+ end
18
+
19
+ def load(record)
20
+ serialized = serializer.serialize(record)
21
+
22
+ # When dealing with raw data, we can get improperly encoded strings
23
+ # (eg from sqlite database columns). We force conversion to UTF-8
24
+ # before converting into JSON
25
+ encoded = serialized.transform_values do |value|
26
+ next value unless value.is_a?(String)
27
+
28
+ force_utf8(value)
29
+ end
30
+ @output.puts encoded.to_json
31
+ end
32
+
33
+ def finish
34
+ @output.close
35
+ end
36
+
37
+ private
38
+
39
+ def serializer
40
+ @config.serializer || Chronicle::ETL::RawSerializer
41
+ end
42
+ end
43
+ end
44
+ end
@@ -1,15 +1,24 @@
1
+ require_relative 'helpers/encoding_helper'
2
+
1
3
  module Chronicle
2
4
  module ETL
3
5
  # Abstract class representing a Loader for an ETL job
4
6
  class Loader
5
7
  extend Chronicle::ETL::Registry::SelfRegistering
8
+ include Chronicle::ETL::Configurable
9
+ include Chronicle::ETL::Loaders::Helpers::EncodingHelper
10
+
11
+ setting :output
12
+ setting :fields
13
+ setting :fields_limit, default: nil
14
+ setting :fields_exclude
6
15
 
7
16
  # Construct a new instance of this loader. Options are passed in from a Runner
8
17
  # == Parameters:
9
18
  # options::
10
19
  # Options for configuring this Loader
11
20
  def initialize(options = {})
12
- @options = options
21
+ apply_options(options)
13
22
  end
14
23
 
15
24
  # Called once before processing records
@@ -22,11 +31,28 @@ module Chronicle
22
31
 
23
32
  # Called once there are no more records to process
24
33
  def finish; end
34
+
35
+ private
36
+
37
+ def build_headers(records)
38
+ headers =
39
+ if @config.fields && @config.fields.any?
40
+ Set[*@config.fields]
41
+ else
42
+ # use all the keys of the flattened record hash
43
+ Set[*records.map(&:keys).flatten.map(&:to_s).uniq]
44
+ end
45
+
46
+ headers = headers.delete_if { |header| header.end_with?(*@config.fields_exclude) }
47
+ headers = headers.first(@config.fields_limit) if @config.fields_limit
48
+
49
+ headers.to_a.map(&:to_sym)
50
+ end
25
51
  end
26
52
  end
27
53
  end
28
54
 
29
55
  require_relative 'csv_loader'
56
+ require_relative 'json_loader'
30
57
  require_relative 'rest_loader'
31
- require_relative 'stdout_loader'
32
58
  require_relative 'table_loader'
@@ -9,19 +9,19 @@ module Chronicle
9
9
  r.description = 'a REST endpoint'
10
10
  end
11
11
 
12
- def initialize( options={} )
13
- super(options)
14
- end
12
+ setting :hostname, required: true
13
+ setting :endpoint, required: true
14
+ setting :access_token
15
15
 
16
16
  def load(record)
17
17
  payload = Chronicle::ETL::JSONAPISerializer.serialize(record)
18
18
  # have the outer data key that json-api expects
19
19
  payload = { data: payload } unless payload[:data]
20
20
 
21
- uri = URI.parse("#{@options[:hostname]}#{@options[:endpoint]}")
21
+ uri = URI.parse("#{@config.hostname}#{@config.endpoint}")
22
22
 
23
23
  header = {
24
- "Authorization" => "Bearer #{@options[:access_token]}",
24
+ "Authorization" => "Bearer #{@config.access_token}",
25
25
  "Content-Type": 'application/json'
26
26
  }
27
27
  use_ssl = uri.scheme == 'https'