chronicle-etl 0.3.1 → 0.4.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (51) hide show
  1. checksums.yaml +4 -4
  2. data/.github/workflows/ruby.yml +35 -0
  3. data/.rubocop.yml +31 -1
  4. data/Guardfile +7 -0
  5. data/README.md +157 -82
  6. data/Rakefile +4 -2
  7. data/chronicle-etl.gemspec +11 -3
  8. data/exe/chronicle-etl +1 -1
  9. data/lib/chronicle/etl/cli/connectors.rb +34 -5
  10. data/lib/chronicle/etl/cli/jobs.rb +90 -24
  11. data/lib/chronicle/etl/cli/main.rb +41 -19
  12. data/lib/chronicle/etl/cli/plugins.rb +62 -0
  13. data/lib/chronicle/etl/cli/subcommand_base.rb +2 -2
  14. data/lib/chronicle/etl/cli.rb +9 -0
  15. data/lib/chronicle/etl/config.rb +7 -4
  16. data/lib/chronicle/etl/configurable.rb +163 -0
  17. data/lib/chronicle/etl/exceptions.rb +29 -1
  18. data/lib/chronicle/etl/extractors/csv_extractor.rb +24 -23
  19. data/lib/chronicle/etl/extractors/extractor.rb +16 -15
  20. data/lib/chronicle/etl/extractors/file_extractor.rb +34 -11
  21. data/lib/chronicle/etl/extractors/helpers/input_reader.rb +76 -0
  22. data/lib/chronicle/etl/extractors/json_extractor.rb +19 -18
  23. data/lib/chronicle/etl/job.rb +8 -2
  24. data/lib/chronicle/etl/job_definition.rb +20 -5
  25. data/lib/chronicle/etl/loaders/csv_loader.rb +36 -9
  26. data/lib/chronicle/etl/loaders/helpers/encoding_helper.rb +18 -0
  27. data/lib/chronicle/etl/loaders/json_loader.rb +44 -0
  28. data/lib/chronicle/etl/loaders/loader.rb +28 -2
  29. data/lib/chronicle/etl/loaders/rest_loader.rb +5 -5
  30. data/lib/chronicle/etl/loaders/table_loader.rb +18 -37
  31. data/lib/chronicle/etl/logger.rb +6 -2
  32. data/lib/chronicle/etl/models/base.rb +3 -0
  33. data/lib/chronicle/etl/models/entity.rb +8 -2
  34. data/lib/chronicle/etl/models/raw.rb +26 -0
  35. data/lib/chronicle/etl/registry/connector_registration.rb +6 -0
  36. data/lib/chronicle/etl/registry/plugin_registry.rb +70 -0
  37. data/lib/chronicle/etl/registry/registry.rb +27 -14
  38. data/lib/chronicle/etl/runner.rb +35 -17
  39. data/lib/chronicle/etl/serializers/jsonapi_serializer.rb +6 -0
  40. data/lib/chronicle/etl/serializers/raw_serializer.rb +10 -0
  41. data/lib/chronicle/etl/serializers/serializer.rb +2 -1
  42. data/lib/chronicle/etl/transformers/image_file_transformer.rb +22 -28
  43. data/lib/chronicle/etl/transformers/null_transformer.rb +1 -1
  44. data/lib/chronicle/etl/transformers/transformer.rb +3 -2
  45. data/lib/chronicle/etl/version.rb +1 -1
  46. data/lib/chronicle/etl.rb +12 -4
  47. metadata +123 -18
  48. data/.ruby-version +0 -1
  49. data/lib/chronicle/etl/extractors/helpers/filesystem_reader.rb +0 -104
  50. data/lib/chronicle/etl/loaders/stdout_loader.rb +0 -14
  51. data/lib/chronicle/etl/models/generic.rb +0 -23
@@ -2,46 +2,47 @@ require 'csv'
2
2
 
3
3
  module Chronicle
4
4
  module ETL
5
- class CsvExtractor < Chronicle::ETL::Extractor
6
- include Extractors::Helpers::FilesystemReader
5
+ class CSVExtractor < Chronicle::ETL::Extractor
6
+ include Extractors::Helpers::InputReader
7
7
 
8
8
  register_connector do |r|
9
- r.description = 'input as CSV'
9
+ r.description = 'CSV'
10
10
  end
11
11
 
12
- DEFAULT_OPTIONS = {
13
- headers: true,
14
- filename: $stdin
15
- }.freeze
12
+ setting :headers, default: true
16
13
 
17
- def initialize(options = {})
18
- super(DEFAULT_OPTIONS.merge(options))
14
+ def prepare
15
+ @csvs = prepare_sources
19
16
  end
20
17
 
21
18
  def extract
22
- csv = initialize_csv
23
- csv.each do |row|
24
- yield Chronicle::ETL::Extraction.new(data: row.to_h)
19
+ @csvs.each do |csv|
20
+ csv.read.each do |row|
21
+ yield Chronicle::ETL::Extraction.new(data: row.to_h)
22
+ end
25
23
  end
26
24
  end
27
25
 
28
26
  def results_count
29
- CSV.read(@options[:filename], headers: @options[:headers]).count unless stdin?(@options[:filename])
27
+ @csvs.reduce(0) do |total_rows, csv|
28
+ row_count = csv.readlines.size
29
+ csv.rewind
30
+ total_rows + row_count
31
+ end
30
32
  end
31
33
 
32
34
  private
33
35
 
34
- def initialize_csv
35
- headers = @options[:headers].is_a?(String) ? @options[:headers].split(',') : @options[:headers]
36
-
37
- csv_options = {
38
- headers: headers,
39
- converters: :all
40
- }
41
-
42
- open_from_filesystem(filename: @options[:filename]) do |file|
43
- return CSV.new(file, **csv_options)
36
+ def prepare_sources
37
+ @csvs = []
38
+ read_input do |csv_data|
39
+ csv_options = {
40
+ headers: @config.headers.is_a?(String) ? @config.headers.split(',') : @config.headers,
41
+ converters: :all
42
+ }
43
+ @csvs << CSV.new(csv_data, **csv_options)
44
44
  end
45
+ @csvs
45
46
  end
46
47
  end
47
48
  end
@@ -5,15 +5,20 @@ module Chronicle
5
5
  # Abstract class representing an Extractor for an ETL job
6
6
  class Extractor
7
7
  extend Chronicle::ETL::Registry::SelfRegistering
8
+ include Chronicle::ETL::Configurable
9
+
10
+ setting :since, type: :time
11
+ setting :until, type: :time
12
+ setting :limit, type: :numeric
13
+ setting :load_after_id
14
+ setting :input
8
15
 
9
16
  # Construct a new instance of this extractor. Options are passed in from a Runner
10
- # == Paramters:
17
+ # == Parameters:
11
18
  # options::
12
19
  # Options for configuring this Extractor
13
20
  def initialize(options = {})
14
- @options = options.transform_keys!(&:to_sym)
15
- sanitize_options
16
- handle_continuation
21
+ apply_options(options)
17
22
  end
18
23
 
19
24
  # Hook called before #extract. Useful for gathering data, initializing proxies, etc
@@ -30,22 +35,18 @@ module Chronicle
30
35
 
31
36
  private
32
37
 
33
- def sanitize_options
34
- @options[:load_since] = Time.parse(@options[:load_since]) if @options[:load_since] && @options[:load_since].is_a?(String)
35
- @options[:load_until] = Time.parse(@options[:load_until]) if @options[:load_until] && @options[:load_until].is_a?(String)
36
- end
37
-
38
- def handle_continuation
39
- return unless @options[:continuation]
38
+ # TODO: reimplement this
39
+ # def handle_continuation
40
+ # return unless @config.continuation
40
41
 
41
- @options[:load_since] = @options[:continuation].highest_timestamp if @options[:continuation].highest_timestamp
42
- @options[:load_after_id] = @options[:continuation].last_id if @options[:continuation].last_id
43
- end
42
+ # @config.since = @config.continuation.highest_timestamp if @config.continuation.highest_timestamp
43
+ # @config.load_after_id = @config.continuation.last_id if @config.continuation.last_id
44
+ # end
44
45
  end
45
46
  end
46
47
  end
47
48
 
48
- require_relative 'helpers/filesystem_reader'
49
+ require_relative 'helpers/input_reader'
49
50
  require_relative 'csv_extractor'
50
51
  require_relative 'file_extractor'
51
52
  require_relative 'json_extractor'
@@ -2,32 +2,55 @@ require 'pathname'
2
2
 
3
3
  module Chronicle
4
4
  module ETL
5
+ # Return filenames that match a pattern in a directory
5
6
  class FileExtractor < Chronicle::ETL::Extractor
6
- include Extractors::Helpers::FilesystemReader
7
7
 
8
8
  register_connector do |r|
9
9
  r.description = 'file or directory of files'
10
10
  end
11
11
 
12
+ setting :input, default: ['.']
13
+ setting :dir_glob_pattern, default: "**/*"
14
+ setting :larger_than
15
+ setting :smaller_than
16
+
17
+ def prepare
18
+ @pathnames = gather_files
19
+ end
20
+
12
21
  def extract
13
- filenames.each do |filename|
14
- yield Chronicle::ETL::Extraction.new(data: filename)
22
+ @pathnames.each do |pathname|
23
+ yield Chronicle::ETL::Extraction.new(data: pathname.to_path)
15
24
  end
16
25
  end
17
26
 
18
27
  def results_count
19
- filenames.count
28
+ @pathnames.count
20
29
  end
21
30
 
22
31
  private
23
32
 
24
- def filenames
25
- @filenames ||= filenames_in_directory(
26
- path: @options[:filename],
27
- dir_glob_pattern: @options[:dir_glob_pattern],
28
- load_since: @options[:load_since],
29
- load_until: @options[:load_until]
30
- )
33
+ def gather_files
34
+ roots = [@config.input].flatten.map { |filename| Pathname.new(filename) }
35
+ raise(ExtractionError, "Input must exist") unless roots.all?(&:exist?)
36
+
37
+ directories, files = roots.partition(&:directory?)
38
+
39
+ directories.each do |directory|
40
+ files += Dir.glob(File.join(directory, @config.dir_glob_pattern)).map { |filename| Pathname.new(filename) }
41
+ end
42
+
43
+ files = files.uniq
44
+
45
+ files = files.keep_if { |f| (f.mtime > @config.since) } if @config.since
46
+ files = files.keep_if { |f| (f.mtime < @config.until) } if @config.until
47
+
48
+ # pass in file sizes in bytes
49
+ files = files.keep_if { |f| (f.size < @config.smaller_than) } if @config.smaller_than
50
+ files = files.keep_if { |f| (f.size > @config.larger_than) } if @config.larger_than
51
+
52
+ # # TODO: incorporate sort argument
53
+ files.sort_by(&:mtime)
31
54
  end
32
55
  end
33
56
  end
@@ -0,0 +1,76 @@
1
+ require 'pathname'
2
+
3
+ module Chronicle
4
+ module ETL
5
+ module Extractors
6
+ module Helpers
7
+ module InputReader
8
+ # Return an array of input filenames; converts a single string
9
+ # to an array if necessary
10
+ def filenames
11
+ [@config.input].flatten.map
12
+ end
13
+
14
+ # Filenames as an array of pathnames
15
+ def pathnames
16
+ filenames.map { |filename| Pathname.new(filename) }
17
+ end
18
+
19
+ # Whether we're reading from files
20
+ def read_from_files?
21
+ filenames.any?
22
+ end
23
+
24
+ # Whether we're reading input from stdin
25
+ def read_from_stdin?
26
+ !read_from_files? && $stdin.stat.pipe?
27
+ end
28
+
29
+ # Read input sources and yield each content
30
+ def read_input
31
+ if read_from_files?
32
+ pathnames.each do |pathname|
33
+ File.open(pathname) do |file|
34
+ yield file.read, pathname.to_path
35
+ end
36
+ end
37
+ elsif read_from_stdin?
38
+ yield $stdin.read, $stdin
39
+ else
40
+ raise ExtractionError, "No input files or stdin provided"
41
+ end
42
+ end
43
+
44
+ # Read input sources line by line
45
+ def read_input_as_lines(&block)
46
+ if read_from_files?
47
+ lines_from_files(&block)
48
+ elsif read_from_stdin?
49
+ lines_from_stdin(&block)
50
+ else
51
+ raise ExtractionError, "No input files or stdin provided"
52
+ end
53
+ end
54
+
55
+ private
56
+
57
+ def lines_from_files(&block)
58
+ pathnames.each do |pathname|
59
+ File.open(pathname) do |file|
60
+ lines_from_io(file, &block)
61
+ end
62
+ end
63
+ end
64
+
65
+ def lines_from_stdin(&block)
66
+ lines_from_io($stdin, &block)
67
+ end
68
+
69
+ def lines_from_io(io, &block)
70
+ io.each_line(&block)
71
+ end
72
+ end
73
+ end
74
+ end
75
+ end
76
+ end
@@ -1,43 +1,44 @@
1
1
  module Chronicle
2
2
  module ETL
3
- class JsonExtractor < Chronicle::ETL::Extractor
4
- include Extractors::Helpers::FilesystemReader
3
+ class JSONExtractor < Chronicle::ETL::Extractor
4
+ include Extractors::Helpers::InputReader
5
5
 
6
6
  register_connector do |r|
7
- r.description = 'input as JSON'
7
+ r.description = 'JSON'
8
8
  end
9
9
 
10
- DEFAULT_OPTIONS = {
11
- filename: $stdin,
10
+ setting :jsonl, default: true, type: :boolean
12
11
 
13
- # We're expecting line-separated json objects
14
- jsonl: true
15
- }.freeze
16
-
17
- def initialize(options = {})
18
- super(DEFAULT_OPTIONS.merge(options))
12
+ def prepare
13
+ @jsons = []
14
+ load_input do |input|
15
+ @jsons << parse_data(input)
16
+ end
19
17
  end
20
18
 
21
19
  def extract
22
- load_input do |input|
23
- parsed_data = parse_data(input)
24
- yield Chronicle::ETL::Extraction.new(data: parsed_data) if parsed_data
20
+ @jsons.each do |json|
21
+ yield Chronicle::ETL::Extraction.new(data: json)
25
22
  end
26
23
  end
27
24
 
28
25
  def results_count
26
+ @jsons.count
29
27
  end
30
28
 
31
29
  private
32
30
 
33
31
  def parse_data data
34
32
  JSON.parse(data)
35
- rescue JSON::ParserError => e
33
+ rescue JSON::ParserError
34
+ raise Chronicle::ETL::ExtractionError, "Could not parse JSON"
36
35
  end
37
36
 
38
- def load_input
39
- read_from_filesystem(filename: @options[:filename]) do |data|
40
- yield data
37
+ def load_input(&block)
38
+ if @config.jsonl
39
+ read_input_as_lines(&block)
40
+ else
41
+ read_input(&block)
41
42
  end
42
43
  end
43
44
  end
@@ -1,6 +1,11 @@
1
1
  require 'forwardable'
2
+
2
3
  module Chronicle
3
4
  module ETL
5
+ # A runner job
6
+ #
7
+ # TODO: this can probably be merged with JobDefinition. Not clear
8
+ # where the boundaries are
4
9
  class Job
5
10
  extend Forwardable
6
11
 
@@ -12,7 +17,8 @@ module Chronicle
12
17
  :transformer_klass,
13
18
  :transformer_options,
14
19
  :loader_klass,
15
- :loader_options
20
+ :loader_options,
21
+ :job_definition
16
22
 
17
23
  # TODO: build a proper id system
18
24
  alias id name
@@ -35,7 +41,7 @@ module Chronicle
35
41
 
36
42
  def instantiate_transformer(extraction)
37
43
  @transformer_klass = @job_definition.transformer_klass
38
- @transformer_klass.new(@transformer_options, extraction)
44
+ @transformer_klass.new(extraction, @transformer_options)
39
45
  end
40
46
 
41
47
  def instantiate_loader
@@ -14,17 +14,36 @@ module Chronicle
14
14
  options: {}
15
15
  },
16
16
  loader: {
17
- name: 'stdout',
17
+ name: 'table',
18
18
  options: {}
19
19
  }
20
20
  }.freeze
21
21
 
22
+ attr_reader :errors
22
23
  attr_accessor :definition
23
24
 
24
25
  def initialize()
25
26
  @definition = SKELETON_DEFINITION
26
27
  end
27
28
 
29
+ def validate
30
+ @errors = []
31
+
32
+ Chronicle::ETL::Registry::PHASES.each do |phase|
33
+ __send__("#{phase}_klass".to_sym)
34
+ rescue Chronicle::ETL::PluginError => e
35
+ @errors << e
36
+ end
37
+
38
+ @errors.empty?
39
+ end
40
+
41
+ def validate!
42
+ raise(Chronicle::ETL::JobDefinitionError.new(self), "Job definition is invalid") unless validate
43
+
44
+ true
45
+ end
46
+
28
47
  # Add config hash to this definition
29
48
  def add_config(config = {})
30
49
  @definition = @definition.deep_merge(config)
@@ -80,10 +99,6 @@ module Chronicle
80
99
  end
81
100
  end
82
101
  end
83
-
84
- def validate
85
- return true # TODO
86
- end
87
102
  end
88
103
  end
89
104
  end
@@ -2,27 +2,54 @@ require 'csv'
2
2
 
3
3
  module Chronicle
4
4
  module ETL
5
- class CsvLoader < Chronicle::ETL::Loader
5
+ class CSVLoader < Chronicle::ETL::Loader
6
6
  register_connector do |r|
7
7
  r.description = 'CSV'
8
8
  end
9
9
 
10
- def initialize(options={})
11
- super(options)
12
- @rows = []
10
+ setting :output, default: $stdout
11
+ setting :headers, default: true
12
+ setting :header_row, default: true
13
+
14
+ def records
15
+ @records ||= []
13
16
  end
14
17
 
15
18
  def load(record)
16
- @rows << record.to_h_flattened.values
19
+ records << record.to_h_flattened
17
20
  end
18
21
 
19
22
  def finish
20
- z = $stdout
21
- CSV(z) do |csv|
22
- @rows.each do |row|
23
- csv << row
23
+ return unless records.any?
24
+
25
+ headers = build_headers(records)
26
+
27
+ csv_options = {}
28
+ if @config.headers
29
+ csv_options[:write_headers] = @config.header_row
30
+ csv_options[:headers] = headers
31
+ end
32
+
33
+ if @config.output.is_a?(IO)
34
+ # This might seem like a duplication of the default value ($stdout)
35
+ # but it's because rspec overwrites $stdout (in helper #capture) to
36
+ # capture output.
37
+ io = $stdout.dup
38
+ else
39
+ io = File.open(@config.output, "w+")
40
+ end
41
+
42
+ output = CSV.generate(**csv_options) do |csv|
43
+ records.each do |record|
44
+ csv << record
45
+ .transform_keys(&:to_sym)
46
+ .values_at(*headers)
47
+ .map { |value| force_utf8(value) }
24
48
  end
25
49
  end
50
+
51
+ io.write(output)
52
+ io.close
26
53
  end
27
54
  end
28
55
  end
@@ -0,0 +1,18 @@
1
+ require 'pathname'
2
+
3
+ module Chronicle
4
+ module ETL
5
+ module Loaders
6
+ module Helpers
7
+ module EncodingHelper
8
+ # Mostly useful for handling loading with binary data from a raw extraction
9
+ def force_utf8(value)
10
+ return value unless value.is_a?(String)
11
+
12
+ value.encode('UTF-8', invalid: :replace, undef: :replace, replace: '')
13
+ end
14
+ end
15
+ end
16
+ end
17
+ end
18
+ end
@@ -0,0 +1,44 @@
1
+ module Chronicle
2
+ module ETL
3
+ class JSONLoader < Chronicle::ETL::Loader
4
+ register_connector do |r|
5
+ r.description = 'json'
6
+ end
7
+
8
+ setting :serializer
9
+ setting :output, default: $stdout
10
+
11
+ def start
12
+ if @config.output == $stdout
13
+ @output = @config.output
14
+ else
15
+ @output = File.open(@config.output, "w")
16
+ end
17
+ end
18
+
19
+ def load(record)
20
+ serialized = serializer.serialize(record)
21
+
22
+ # When dealing with raw data, we can get improperly encoded strings
23
+ # (eg from sqlite database columns). We force conversion to UTF-8
24
+ # before converting into JSON
25
+ encoded = serialized.transform_values do |value|
26
+ next value unless value.is_a?(String)
27
+
28
+ force_utf8(value)
29
+ end
30
+ @output.puts encoded.to_json
31
+ end
32
+
33
+ def finish
34
+ @output.close
35
+ end
36
+
37
+ private
38
+
39
+ def serializer
40
+ @config.serializer || Chronicle::ETL::RawSerializer
41
+ end
42
+ end
43
+ end
44
+ end
@@ -1,15 +1,24 @@
1
+ require_relative 'helpers/encoding_helper'
2
+
1
3
  module Chronicle
2
4
  module ETL
3
5
  # Abstract class representing a Loader for an ETL job
4
6
  class Loader
5
7
  extend Chronicle::ETL::Registry::SelfRegistering
8
+ include Chronicle::ETL::Configurable
9
+ include Chronicle::ETL::Loaders::Helpers::EncodingHelper
10
+
11
+ setting :output
12
+ setting :fields
13
+ setting :fields_limit, default: nil
14
+ setting :fields_exclude
6
15
 
7
16
  # Construct a new instance of this loader. Options are passed in from a Runner
8
17
  # == Parameters:
9
18
  # options::
10
19
  # Options for configuring this Loader
11
20
  def initialize(options = {})
12
- @options = options
21
+ apply_options(options)
13
22
  end
14
23
 
15
24
  # Called once before processing records
@@ -22,11 +31,28 @@ module Chronicle
22
31
 
23
32
  # Called once there are no more records to process
24
33
  def finish; end
34
+
35
+ private
36
+
37
+ def build_headers(records)
38
+ headers =
39
+ if @config.fields && @config.fields.any?
40
+ Set[*@config.fields]
41
+ else
42
+ # use all the keys of the flattened record hash
43
+ Set[*records.map(&:keys).flatten.map(&:to_s).uniq]
44
+ end
45
+
46
+ headers = headers.delete_if { |header| header.end_with?(*@config.fields_exclude) }
47
+ headers = headers.first(@config.fields_limit) if @config.fields_limit
48
+
49
+ headers.to_a.map(&:to_sym)
50
+ end
25
51
  end
26
52
  end
27
53
  end
28
54
 
29
55
  require_relative 'csv_loader'
56
+ require_relative 'json_loader'
30
57
  require_relative 'rest_loader'
31
- require_relative 'stdout_loader'
32
58
  require_relative 'table_loader'
@@ -9,19 +9,19 @@ module Chronicle
9
9
  r.description = 'a REST endpoint'
10
10
  end
11
11
 
12
- def initialize( options={} )
13
- super(options)
14
- end
12
+ setting :hostname, required: true
13
+ setting :endpoint, required: true
14
+ setting :access_token
15
15
 
16
16
  def load(record)
17
17
  payload = Chronicle::ETL::JSONAPISerializer.serialize(record)
18
18
  # have the outer data key that json-api expects
19
19
  payload = { data: payload } unless payload[:data]
20
20
 
21
- uri = URI.parse("#{@options[:hostname]}#{@options[:endpoint]}")
21
+ uri = URI.parse("#{@config.hostname}#{@config.endpoint}")
22
22
 
23
23
  header = {
24
- "Authorization" => "Bearer #{@options[:access_token]}",
24
+ "Authorization" => "Bearer #{@config.access_token}",
25
25
  "Content-Type": 'application/json'
26
26
  }
27
27
  use_ssl = uri.scheme == 'https'