chronicle-etl 0.2.2 → 0.3.1

This diff shows the contents of publicly released package versions as they appear in their public registries; it is provided for informational purposes only.
Files changed (52)
  1. checksums.yaml +4 -4
  2. data/.gitignore +3 -0
  3. data/.rubocop.yml +3 -0
  4. data/README.md +22 -15
  5. data/chronicle-etl.gemspec +13 -7
  6. data/lib/chronicle/etl/cli/connectors.rb +19 -7
  7. data/lib/chronicle/etl/cli/jobs.rb +38 -26
  8. data/lib/chronicle/etl/cli/main.rb +10 -2
  9. data/lib/chronicle/etl/config.rb +24 -3
  10. data/lib/chronicle/etl/exceptions.rb +13 -0
  11. data/lib/chronicle/etl/extraction.rb +12 -0
  12. data/lib/chronicle/etl/extractors/csv_extractor.rb +43 -37
  13. data/lib/chronicle/etl/extractors/extractor.rb +25 -4
  14. data/lib/chronicle/etl/extractors/file_extractor.rb +15 -33
  15. data/lib/chronicle/etl/extractors/helpers/filesystem_reader.rb +104 -0
  16. data/lib/chronicle/etl/extractors/json_extractor.rb +45 -0
  17. data/lib/chronicle/etl/extractors/stdin_extractor.rb +6 -1
  18. data/lib/chronicle/etl/job.rb +72 -0
  19. data/lib/chronicle/etl/job_definition.rb +89 -0
  20. data/lib/chronicle/etl/job_log.rb +95 -0
  21. data/lib/chronicle/etl/job_logger.rb +81 -0
  22. data/lib/chronicle/etl/loaders/csv_loader.rb +6 -6
  23. data/lib/chronicle/etl/loaders/loader.rb +2 -2
  24. data/lib/chronicle/etl/loaders/rest_loader.rb +16 -9
  25. data/lib/chronicle/etl/loaders/stdout_loader.rb +8 -3
  26. data/lib/chronicle/etl/loaders/table_loader.rb +58 -7
  27. data/lib/chronicle/etl/logger.rb +48 -0
  28. data/lib/chronicle/etl/models/activity.rb +15 -0
  29. data/lib/chronicle/etl/models/attachment.rb +14 -0
  30. data/lib/chronicle/etl/models/base.rb +119 -0
  31. data/lib/chronicle/etl/models/entity.rb +21 -0
  32. data/lib/chronicle/etl/models/generic.rb +23 -0
  33. data/lib/chronicle/etl/registry/connector_registration.rb +61 -0
  34. data/lib/chronicle/etl/registry/registry.rb +52 -0
  35. data/lib/chronicle/etl/registry/self_registering.rb +25 -0
  36. data/lib/chronicle/etl/runner.rb +66 -24
  37. data/lib/chronicle/etl/serializers/jsonapi_serializer.rb +25 -0
  38. data/lib/chronicle/etl/serializers/serializer.rb +27 -0
  39. data/lib/chronicle/etl/transformers/image_file_transformer.rb +253 -0
  40. data/lib/chronicle/etl/transformers/null_transformer.rb +11 -3
  41. data/lib/chronicle/etl/transformers/transformer.rb +42 -13
  42. data/lib/chronicle/etl/utils/binary_attachments.rb +21 -0
  43. data/lib/chronicle/etl/utils/hash_utilities.rb +19 -0
  44. data/lib/chronicle/etl/utils/progress_bar.rb +3 -1
  45. data/lib/chronicle/etl/utils/text_recognition.rb +15 -0
  46. data/lib/chronicle/etl/version.rb +1 -1
  47. data/lib/chronicle/etl.rb +16 -1
  48. metadata +139 -36
  49. data/CHANGELOG.md +0 -23
  50. data/Gemfile.lock +0 -85
  51. data/lib/chronicle/etl/catalog.rb +0 -102
  52. data/lib/chronicle/etl/transformers/json_transformer.rb +0 -11
data/lib/chronicle/etl/extractors/file_extractor.rb
@@ -3,49 +3,31 @@ require 'pathname'
 module Chronicle
   module ETL
     class FileExtractor < Chronicle::ETL::Extractor
-      def extract
-        if file?
-          extract_file do |data, metadata|
-            yield(data, metadata)
-          end
-        elsif directory?
-          extract_from_directory do |data, metadata|
-            yield(data, metadata)
-          end
-        end
-      end
+      include Extractors::Helpers::FilesystemReader

-      def results_count
-        if file?
-          return 1
-        else
-          search_pattern = File.join(@options[:filename], '**/*.eml')
-          Dir.glob(search_pattern).count
-        end
+      register_connector do |r|
+        r.description = 'file or directory of files'
       end

-      private
-
-      def extract_from_directory
-        search_pattern = File.join(@options[:filename], '**/*.eml')
-        filenames = Dir.glob(search_pattern)
+      def extract
         filenames.each do |filename|
-          file = File.open(filename)
-          yield(file.read, {filename: file})
+          yield Chronicle::ETL::Extraction.new(data: filename)
         end
       end

-      def extract_file
-        file = File.open(@options[:filename])
-        yield(file.read, {filename: @options[:filename]})
+      def results_count
+        filenames.count
       end

-      def directory?
-        Pathname.new(@options[:filename]).directory?
-      end
+      private

-      def file?
-        Pathname.new(@options[:filename]).file?
+      def filenames
+        @filenames ||= filenames_in_directory(
+          path: @options[:filename],
+          dir_glob_pattern: @options[:dir_glob_pattern],
+          load_since: @options[:load_since],
+          load_until: @options[:load_until]
+        )
       end
     end
   end
data/lib/chronicle/etl/extractors/helpers/filesystem_reader.rb (new file)
@@ -0,0 +1,104 @@
+require 'pathname'
+
+module Chronicle
+  module ETL
+    module Extractors
+      module Helpers
+        module FilesystemReader
+
+          def filenames_in_directory(...)
+            filenames = gather_files(...)
+            if block_given?
+              filenames.each do |filename|
+                yield filename
+              end
+            else
+              filenames
+            end
+          end
+
+          def read_from_filesystem(filename:, yield_each_line: true, dir_glob_pattern: '**/*')
+            open_files(filename: filename, dir_glob_pattern: dir_glob_pattern) do |file|
+              if yield_each_line
+                file.each_line do |line|
+                  yield line
+                end
+              else
+                yield file.read
+              end
+            end
+          end
+
+          def open_from_filesystem(filename:, dir_glob_pattern: '**/*')
+            open_files(filename: filename, dir_glob_pattern: dir_glob_pattern) do |file|
+              yield file
+            end
+          end
+
+          def results_count
+            raise NotImplementedError
+            # if file?
+            #   return 1
+            # else
+            #   search_pattern = File.join(@options[:filename], '**/*')
+            #   Dir.glob(search_pattern).count
+            # end
+          end
+
+          private
+
+          def gather_files(path:, dir_glob_pattern: '**/*', load_since: nil, load_until: nil, smaller_than: nil, larger_than: nil, sort: :mtime)
+            search_pattern = File.join(path, '**', dir_glob_pattern)
+            files = Dir.glob(search_pattern)
+
+            files = files.keep_if {|f| (File.mtime(f) > load_since)} if load_since
+            files = files.keep_if {|f| (File.mtime(f) < load_until)} if load_until
+
+            # pass in file sizes in bytes
+            files = files.keep_if {|f| (File.size(f) < smaller_than)} if smaller_than
+            files = files.keep_if {|f| (File.size(f) > larger_than)} if larger_than
+
+            # TODO: incorporate sort argument
+            files.sort_by{ |f| File.mtime(f) }
+          end
+
+          def select_files_in_directory(path:, dir_glob_pattern: '**/*')
+            raise IOError.new("#{path} is not a directory.") unless directory?(path)
+
+            search_pattern = File.join(path, dir_glob_pattern)
+            Dir.glob(search_pattern).each do |filename|
+              yield(filename)
+            end
+          end
+
+          def open_files(filename:, dir_glob_pattern:)
+            if stdin?(filename)
+              yield $stdin
+            elsif directory?(filename)
+              search_pattern = File.join(filename, dir_glob_pattern)
+              filenames = Dir.glob(search_pattern)
+              filenames.each do |filename|
+                file = File.open(filename)
+                yield(file)
+              end
+            elsif file?(filename)
+              yield File.open(filename)
+            end
+          end
+
+          def stdin?(filename)
+            filename == $stdin
+          end
+
+          def directory?(filename)
+            Pathname.new(filename).directory?
+          end
+
+          def file?(filename)
+            Pathname.new(filename).file?
+          end
+        end
+      end
+    end
+  end
+end
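A minimal sketch (not part of the gem) of an extractor built on this new helper. The `NotesExtractor` class, the `./notes` path, and the `*.txt` glob are hypothetical; the helper methods, option keys, and `Extraction` class come from this diff.

```ruby
require 'chronicle/etl'

class NotesExtractor < Chronicle::ETL::Extractor
  include Chronicle::ETL::Extractors::Helpers::FilesystemReader

  def extract
    # Without a block, filenames_in_directory returns the matching filenames,
    # recursively globbed under path: and filtered by mtime when
    # load_since/load_until are given.
    filenames_in_directory(path: @options[:filename], dir_glob_pattern: '*.txt').each do |filename|
      yield Chronicle::ETL::Extraction.new(data: filename)
    end
  end
end

NotesExtractor.new(filename: './notes').extract { |extraction| p extraction }
```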
data/lib/chronicle/etl/extractors/json_extractor.rb (new file)
@@ -0,0 +1,45 @@
+module Chronicle
+  module ETL
+    class JsonExtractor < Chronicle::ETL::Extractor
+      include Extractors::Helpers::FilesystemReader
+
+      register_connector do |r|
+        r.description = 'input as JSON'
+      end
+
+      DEFAULT_OPTIONS = {
+        filename: $stdin,
+
+        # We're expecting line-separated json objects
+        jsonl: true
+      }.freeze
+
+      def initialize(options = {})
+        super(DEFAULT_OPTIONS.merge(options))
+      end
+
+      def extract
+        load_input do |input|
+          parsed_data = parse_data(input)
+          yield Chronicle::ETL::Extraction.new(data: parsed_data) if parsed_data
+        end
+      end
+
+      def results_count
+      end
+
+      private
+
+      def parse_data data
+        JSON.parse(data)
+      rescue JSON::ParserError => e
+      end
+
+      def load_input
+        read_from_filesystem(filename: @options[:filename]) do |data|
+          yield data
+        end
+      end
+    end
+  end
+end
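A rough sketch of driving the new JsonExtractor directly (outside the CLI); the `events.jsonl` path is hypothetical. Per the defaults above, input is read line by line (JSONL) from `$stdin` unless a `filename` option is given, and lines that fail to parse are skipped because `parse_data` swallows `JSON::ParserError`.

```ruby
require 'chronicle/etl'
require 'json'

extractor = Chronicle::ETL::JsonExtractor.new(filename: 'events.jsonl')
extractor.extract do |extraction|
  # Each yielded Chronicle::ETL::Extraction wraps one parsed JSON object
  p extraction
end
```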
data/lib/chronicle/etl/extractors/stdin_extractor.rb
@@ -1,9 +1,14 @@
 module Chronicle
   module ETL
     class StdinExtractor < Chronicle::ETL::Extractor
+      register_connector do |r|
+        r.description = 'stdin'
+      end
+
       def extract
         $stdin.read.each_line do |line|
-          yield line
+          data = { line: line.strip }
+          yield Chronicle::ETL::Extraction.new(data: data)
         end
       end
     end
data/lib/chronicle/etl/job.rb (new file)
@@ -0,0 +1,72 @@
+require 'forwardable'
+module Chronicle
+  module ETL
+    class Job
+      extend Forwardable
+
+      def_delegators :@job_definition, :dry_run?
+
+      attr_accessor :name,
+                    :extractor_klass,
+                    :extractor_options,
+                    :transformer_klass,
+                    :transformer_options,
+                    :loader_klass,
+                    :loader_options
+
+      # TODO: build a proper id system
+      alias id name
+
+      def initialize(job_definition)
+        @job_definition = job_definition
+        @name = @job_definition.definition[:name]
+        @extractor_options = @job_definition.extractor_options
+        @transformer_options = @job_definition.transformer_options
+        @loader_options = @job_definition.loader_options
+
+        set_continuation if use_continuation?
+        yield self if block_given?
+      end
+
+      def instantiate_extractor
+        @extractor_klass = @job_definition.extractor_klass
+        @extractor_klass.new(@extractor_options)
+      end
+
+      def instantiate_transformer(extraction)
+        @transformer_klass = @job_definition.transformer_klass
+        @transformer_klass.new(@transformer_options, extraction)
+      end
+
+      def instantiate_loader
+        @loader_klass = @job_definition.loader_klass
+        @loader_klass.new(@loader_options)
+      end
+
+      def save_log?
+        # TODO: this needs more nuance
+        return !id.nil?
+      end
+
+      def to_s
+        output = "Job"
+        output += " '#{name}'".bold if name
+        output += "\n"
+        output += " → Extracting from #{@job_definition.extractor_klass.description}\n"
+        output += " → Transforming #{@job_definition.transformer_klass.description}\n"
+        output += " → Loading to #{@job_definition.loader_klass.description}\n"
+      end
+
+      private
+
+      def set_continuation
+        continuation = Chronicle::ETL::JobLogger.load_latest(@id)
+        @extractor_options[:continuation] = continuation
+      end
+
+      def use_continuation?
+        @job_definition.incremental?
+      end
+    end
+  end
+end
data/lib/chronicle/etl/job_definition.rb (new file)
@@ -0,0 +1,89 @@
+require 'active_support/core_ext/hash/deep_merge'
+
+module Chronicle
+  module ETL
+    class JobDefinition
+      SKELETON_DEFINITION = {
+        incremental: false,
+        extractor: {
+          name: 'stdin',
+          options: {}
+        },
+        transformer: {
+          name: 'null',
+          options: {}
+        },
+        loader: {
+          name: 'stdout',
+          options: {}
+        }
+      }.freeze
+
+      attr_accessor :definition
+
+      def initialize()
+        @definition = SKELETON_DEFINITION
+      end
+
+      # Add config hash to this definition
+      def add_config(config = {})
+        @definition = @definition.deep_merge(config)
+        load_credentials
+        validate
+      end
+
+      # Is this job continuing from a previous run?
+      def incremental?
+        @definition[:incremental]
+      end
+
+      def dry_run?
+        @definition[:dry_run]
+      end
+
+      def extractor_klass
+        load_klass(:extractor, @definition[:extractor][:name])
+      end
+
+      def transformer_klass
+        load_klass(:transformer, @definition[:transformer][:name])
+      end
+
+      def loader_klass
+        load_klass(:loader, @definition[:loader][:name])
+      end
+
+      def extractor_options
+        @definition[:extractor][:options]
+      end
+
+      def transformer_options
+        @definition[:transformer][:options]
+      end
+
+      def loader_options
+        @definition[:loader][:options]
+      end
+
+      private
+
+      def load_klass(phase, identifier)
+        Chronicle::ETL::Registry.find_by_phase_and_identifier(phase, identifier).klass
+      end
+
+      def load_credentials
+        Chronicle::ETL::Registry::PHASES.each do |phase|
+          credentials_name = @definition[phase].dig(:options, :credentials)
+          if credentials_name
+            credentials = Chronicle::ETL::Config.load_credentials(credentials_name)
+            @definition[phase][:options].deep_merge(credentials)
+          end
+        end
+      end
+
+      def validate
+        return true # TODO
+      end
+    end
+  end
+end
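A sketch of how JobDefinition and the new Job class appear to fit together, based only on this diff (in practice the CLI and Runner drive this). The job name and the `events.jsonl` filename are placeholders, and `'json'`/`'stdout'` are assumed to be the registry identifiers for the new JsonExtractor and the StdoutLoader.

```ruby
require 'chronicle/etl'

definition = Chronicle::ETL::JobDefinition.new
definition.add_config(
  name: 'example-job',   # doubles as the job id used for job logs
  incremental: false,
  extractor: { name: 'json', options: { filename: 'events.jsonl' } },
  loader: { name: 'stdout', options: {} }
)

job = Chronicle::ETL::Job.new(definition)
extractor = job.instantiate_extractor   # resolved through the connector registry
loader = job.instantiate_loader
```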
data/lib/chronicle/etl/job_log.rb (new file)
@@ -0,0 +1,95 @@
+require 'forwardable'
+
+module Chronicle
+  module ETL
+    # A record of what happened in the running of a job. We're interested in
+    # tracking when it ran, if it was successful, and what the latest record
+    # we found is (to use as a cursor for the next time)
+    class JobLog
+      extend Forwardable
+
+      attr_accessor :job,
+                    :job_id,
+                    :last_id,
+                    :highest_timestamp,
+                    :num_records_processed,
+                    :started_at,
+                    :finished_at,
+                    :success
+
+      def_delegators :@job, :save_log?
+
+      # Create a new JobLog for a given Job
+      def initialize
+        @num_records_processed = 0
+        @success = false
+        yield self if block_given?
+      end
+
+      # Log the result of a single transformation in a job
+      # @param transformer [Chronicle::ETL::Tranformer] The transformer that ran
+      def log_transformation(transformer)
+        @last_id = transformer.id if transformer.id
+
+        # Save the highest timestamp that we've encountered so far
+        @highest_timestamp = [transformer.timestamp, @highest_timestamp].compact.max if transformer.timestamp
+
+        # TODO: a transformer might yield nil. We might also want certain transformers to explode
+        # records into multiple new ones. Therefore, this this variable will need more subtle behaviour
+        @num_records_processed += 1
+      end
+
+      # Indicate that a job has started
+      def start
+        @started_at = Time.now
+      end
+
+      # Indicate that a job has finished
+      def finish
+        @finished_at = Time.now
+        @success = true
+      end
+
+      def error
+        @finished_at = Time.now
+      end
+
+      def job= job
+        @job = job
+        @job_id = job.id
+      end
+
+      def duration
+        return unless @finished_at
+
+        @finished_at - @started_at
+      end
+
+      # Take a JobLog's instance variables and turn them into a hash representation
+      def serialize
+        {
+          job_id: @job_id,
+          last_id: @last_id,
+          highest_timestamp: @highest_timestamp,
+          num_records_processed: @num_records_processed,
+          started_at: @started_at,
+          finished_at: @finished_at,
+          success: @success
+        }
+      end
+
+      private
+
+      # Create a new JobLog and set its instance variables from a serialized hash
+      def self.build_from_serialized attrs
+        attrs.delete(:id)
+        new do |job_log|
+          attrs.each do |key, value|
+            setter = "#{key.to_s}=".to_sym
+            job_log.send(setter, value)
+          end
+        end
+      end
+    end
+  end
+end
data/lib/chronicle/etl/job_logger.rb (new file)
@@ -0,0 +1,81 @@
+require 'sequel'
+require 'forwardable'
+
+module Chronicle
+  module ETL
+    # Saves JobLogs to db and loads previous ones
+    class JobLogger
+      extend Forwardable
+
+      def_delegators :@job_log, :start, :finish, :error, :log_transformation, :duration, :success
+      attr_accessor :job_log
+
+      # For a given `job_id`, return the last successful log
+      def self.load_latest(job_id)
+        with_db_connection do |db|
+          attrs = db[:job_logs].reverse_order(:finished_at).where(success: true).first
+          JobLog.build_from_serialized(attrs) if attrs
+        end
+      end
+
+      def self.with_db_connection
+        initialize_db unless db_exists?
+        Sequel.connect("sqlite://#{db_filename}") do |db|
+          initialize_schema(db) unless schema_exists?(db)
+          yield db
+        end
+      end
+
+      def self.db_exists?
+        File.exists?(db_filename)
+      end
+
+      def self.schema_exists?(db)
+        return db.tables.include? :job_logs
+      end
+
+      def self.db_filename
+        data = Runcom::Data.new "chronicle/etl/job_log.db"
+        filename = data.all[0].to_s
+      end
+
+      def self.initialize_db
+        FileUtils.mkdir_p(File.dirname(db_filename))
+      end
+
+      def self.initialize_schema db
+        db.create_table :job_logs do
+          primary_key :id
+          String :job_id, null: false
+          String :last_id
+          Time :highest_timestamp
+          Integer :num_records_processed
+          boolean :success, default: false
+          Time :started_at
+          Time :finished_at
+        end
+      end
+
+      # Create a new JobLogger
+      def initialize(job)
+        @job_log = JobLog.new do |job_log|
+          job_log.job = job
+        end
+      end
+
+      # Save this JobLogger's JobLog to db
+      def save
+        return unless @job_log.save_log?
+
+        JobLogger.with_db_connection do |db|
+          dataset = db[:job_logs]
+          dataset.insert(@job_log.serialize)
+        end
+      end
+
+      def summarize
+        @job_log.inspect
+      end
+    end
+  end
+end
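A sketch of the JobLog/JobLogger lifecycle implied by this diff (roughly what the reworked Runner appears to do). `FakeJob` and `FakeTransformer` are stand-ins: the logger only needs `#id`/`#save_log?` from a job and `#id`/`#timestamp` from a transformer.

```ruby
require 'chronicle/etl'

FakeJob = Struct.new(:id) do
  def save_log?
    !id.nil?
  end
end
FakeTransformer = Struct.new(:id, :timestamp)

logger = Chronicle::ETL::JobLogger.new(FakeJob.new('example-job'))
logger.start
logger.log_transformation(FakeTransformer.new('record-1', Time.now))
logger.finish
logger.save   # inserts a row into the sqlite job_logs table via Sequel
```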
data/lib/chronicle/etl/loaders/csv_loader.rb
@@ -3,17 +3,17 @@ require 'csv'
 module Chronicle
   module ETL
     class CsvLoader < Chronicle::ETL::Loader
+      register_connector do |r|
+        r.description = 'CSV'
+      end
+
       def initialize(options={})
         super(options)
         @rows = []
       end

-      def load(result)
-        if (result.is_a? Hash)
-          @rows << result.values
-        else
-          @rows << result
-        end
+      def load(record)
+        @rows << record.to_h_flattened.values
       end

       def finish
data/lib/chronicle/etl/loaders/loader.rb
@@ -2,10 +2,10 @@ module Chronicle
   module ETL
     # Abstract class representing a Loader for an ETL job
     class Loader
-      extend Chronicle::ETL::Catalog
+      extend Chronicle::ETL::Registry::SelfRegistering

       # Construct a new instance of this loader. Options are passed in from a Runner
-      # == Paramters:
+      # == Parameters:
       # options::
       #   Options for configuring this Loader
       def initialize(options = {})
data/lib/chronicle/etl/loaders/rest_loader.rb
@@ -5,25 +5,32 @@ require 'json'
 module Chronicle
   module ETL
     class RestLoader < Chronicle::ETL::Loader
-      def initialize(options={})
+      register_connector do |r|
+        r.description = 'a REST endpoint'
+      end
+
+      def initialize( options={} )
         super(options)
       end

-      def load(result)
+      def load(record)
+        payload = Chronicle::ETL::JSONAPISerializer.serialize(record)
+        # have the outer data key that json-api expects
+        payload = { data: payload } unless payload[:data]
+
         uri = URI.parse("#{@options[:hostname]}#{@options[:endpoint]}")

         header = {
           "Authorization" => "Bearer #{@options[:access_token]}",
           "Content-Type": 'application/json'
         }
+        use_ssl = uri.scheme == 'https'

-        http = Net::HTTP.new(uri.host, uri.port)
-        request = Net::HTTP::Post.new(uri.request_uri, header)
-
-        obj = {data: result} unless result[:data]
-        request.body = obj.to_json
-
-        response = http.request(request)
+        Net::HTTP.start(uri.host, uri.port, use_ssl: use_ssl) do |http|
+          request = Net::HTTP::Post.new(uri.request_uri, header)
+          request.body = payload.to_json
+          http.request(request)
+        end
       end
     end
   end
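A sketch of configuring the reworked RestLoader, based on the option keys it reads (`hostname`, `endpoint`, `access_token`). The URL, endpoint, and token are placeholders; each record is now serialized to JSON:API and POSTed, over TLS when the hostname uses https.

```ruby
require 'chronicle/etl'

loader = Chronicle::ETL::RestLoader.new(
  hostname: 'https://api.example.com',
  endpoint: '/api/v1/activities',
  access_token: ENV['CHRONICLE_ACCESS_TOKEN']
)
# `record` would come from a transformer and must be serializable by
# Chronicle::ETL::JSONAPISerializer (e.g. one of the new model classes):
# loader.load(record)
```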
data/lib/chronicle/etl/loaders/stdout_loader.rb
@@ -1,9 +1,14 @@
 module Chronicle
   module ETL
     class StdoutLoader < Chronicle::ETL::Loader
-      def load(result)
-        puts result.inspect
+      register_connector do |r|
+        r.description = 'stdout'
+      end
+
+      def load(record)
+        serializer = Chronicle::ETL::JSONAPISerializer.new(record)
+        puts serializer.serializable_hash.to_json
       end
     end
   end
-end
+end