chronicle-etl 0.2.1 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (52) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +3 -0
  3. data/.rubocop.yml +3 -0
  4. data/README.md +22 -15
  5. data/chronicle-etl.gemspec +11 -5
  6. data/lib/chronicle/etl/cli/connectors.rb +19 -7
  7. data/lib/chronicle/etl/cli/jobs.rb +38 -27
  8. data/lib/chronicle/etl/cli/main.rb +10 -2
  9. data/lib/chronicle/etl/config.rb +24 -3
  10. data/lib/chronicle/etl/exceptions.rb +30 -0
  11. data/lib/chronicle/etl/extraction.rb +12 -0
  12. data/lib/chronicle/etl/extractors/csv_extractor.rb +43 -37
  13. data/lib/chronicle/etl/extractors/extractor.rb +19 -1
  14. data/lib/chronicle/etl/extractors/file_extractor.rb +15 -33
  15. data/lib/chronicle/etl/extractors/helpers/filesystem_reader.rb +104 -0
  16. data/lib/chronicle/etl/extractors/json_extractor.rb +45 -0
  17. data/lib/chronicle/etl/extractors/stdin_extractor.rb +6 -1
  18. data/lib/chronicle/etl/job.rb +72 -0
  19. data/lib/chronicle/etl/job_definition.rb +89 -0
  20. data/lib/chronicle/etl/job_log.rb +95 -0
  21. data/lib/chronicle/etl/job_logger.rb +81 -0
  22. data/lib/chronicle/etl/loaders/csv_loader.rb +6 -6
  23. data/lib/chronicle/etl/loaders/loader.rb +2 -2
  24. data/lib/chronicle/etl/loaders/rest_loader.rb +16 -9
  25. data/lib/chronicle/etl/loaders/stdout_loader.rb +8 -3
  26. data/lib/chronicle/etl/loaders/table_loader.rb +58 -7
  27. data/lib/chronicle/etl/logger.rb +48 -0
  28. data/lib/chronicle/etl/models/activity.rb +15 -0
  29. data/lib/chronicle/etl/models/attachment.rb +14 -0
  30. data/lib/chronicle/etl/models/base.rb +119 -0
  31. data/lib/chronicle/etl/models/entity.rb +21 -0
  32. data/lib/chronicle/etl/models/generic.rb +23 -0
  33. data/lib/chronicle/etl/registry/connector_registration.rb +61 -0
  34. data/lib/chronicle/etl/registry/registry.rb +52 -0
  35. data/lib/chronicle/etl/registry/self_registering.rb +25 -0
  36. data/lib/chronicle/etl/runner.rb +70 -42
  37. data/lib/chronicle/etl/serializers/jsonapi_serializer.rb +25 -0
  38. data/lib/chronicle/etl/serializers/serializer.rb +27 -0
  39. data/lib/chronicle/etl/transformers/image_file_transformer.rb +253 -0
  40. data/lib/chronicle/etl/transformers/null_transformer.rb +12 -4
  41. data/lib/chronicle/etl/transformers/transformer.rb +42 -12
  42. data/lib/chronicle/etl/utils/binary_attachments.rb +21 -0
  43. data/lib/chronicle/etl/utils/hash_utilities.rb +19 -0
  44. data/lib/chronicle/etl/utils/progress_bar.rb +3 -1
  45. data/lib/chronicle/etl/utils/text_recognition.rb +15 -0
  46. data/lib/chronicle/etl/version.rb +1 -1
  47. data/lib/chronicle/etl.rb +17 -1
  48. metadata +138 -35
  49. data/CHANGELOG.md +0 -23
  50. data/Gemfile.lock +0 -85
  51. data/lib/chronicle/etl/catalog.rb +0 -62
  52. data/lib/chronicle/etl/transformers/json_transformer.rb +0 -11
require 'pathname'

module Chronicle
  module ETL
    module Extractors
      module Helpers
        # Shared helpers for extractors that read records off the local
        # filesystem (or stdin): globbing directories, filtering files by
        # mtime/size, and yielding file handles or their contents.
        module FilesystemReader
          # Yield (or return, when no block is given) the filenames under a
          # path that match the given criteria. Accepts the same keyword
          # arguments as #gather_files.
          def filenames_in_directory(...)
            filenames = gather_files(...)
            if block_given?
              filenames.each do |filename|
                yield filename
              end
            else
              filenames
            end
          end

          # Read data from a file, a directory of files, or stdin.
          #
          # @param filename [String, IO] a path (file or directory) or $stdin
          # @param yield_each_line [Boolean] when true, yield line by line;
          #   otherwise yield each file's entire contents at once
          # @param dir_glob_pattern [String] glob used when filename is a directory
          def read_from_filesystem(filename:, yield_each_line: true, dir_glob_pattern: '**/*')
            open_files(filename: filename, dir_glob_pattern: dir_glob_pattern) do |file|
              if yield_each_line
                file.each_line do |line|
                  yield line
                end
              else
                yield file.read
              end
            end
          end

          # Yield open file handles for a path (see #open_files).
          def open_from_filesystem(filename:, dir_glob_pattern: '**/*')
            open_files(filename: filename, dir_glob_pattern: dir_glob_pattern) do |file|
              yield file
            end
          end

          def results_count
            raise NotImplementedError
          end

          private

          # Collect files under `path`, optionally filtered by modification
          # time (load_since/load_until) and size in bytes
          # (smaller_than/larger_than), sorted ascending by the attribute
          # named by `sort` (:mtime, the default, or :size).
          def gather_files(path:, dir_glob_pattern: '**/*', load_since: nil, load_until: nil, smaller_than: nil, larger_than: nil, sort: :mtime)
            search_pattern = File.join(path, '**', dir_glob_pattern)
            files = Dir.glob(search_pattern)

            files = files.keep_if { |f| File.mtime(f) > load_since } if load_since
            files = files.keep_if { |f| File.mtime(f) < load_until } if load_until

            # pass in file sizes in bytes
            files = files.keep_if { |f| File.size(f) < smaller_than } if smaller_than
            files = files.keep_if { |f| File.size(f) > larger_than } if larger_than

            # FIX: the `sort` argument was previously accepted but ignored
            # (TODO in the original). Unknown values fall back to :mtime,
            # preserving the old default behaviour.
            sort_attribute = sort == :size ? :size : :mtime
            files.sort_by { |f| File.public_send(sort_attribute, f) }
          end

          def select_files_in_directory(path:, dir_glob_pattern: '**/*')
            raise IOError.new("#{path} is not a directory.") unless directory?(path)

            search_pattern = File.join(path, dir_glob_pattern)
            Dir.glob(search_pattern).each do |filename|
              yield(filename)
            end
          end

          # Yield open IO handles for stdin, every file in a directory, or a
          # single file, depending on what `filename` refers to.
          def open_files(filename:, dir_glob_pattern:)
            if stdin?(filename)
              yield $stdin
            elsif directory?(filename)
              search_pattern = File.join(filename, dir_glob_pattern)
              # FIX: the block variable previously shadowed the `filename`
              # method argument.
              Dir.glob(search_pattern).each do |matched_filename|
                yield File.open(matched_filename)
              end
            elsif file?(filename)
              yield File.open(filename)
            end
          end

          def stdin?(filename)
            filename == $stdin
          end

          def directory?(filename)
            Pathname.new(filename).directory?
          end

          def file?(filename)
            Pathname.new(filename).file?
          end
        end
      end
    end
  end
end
module Chronicle
  module ETL
    # Extractor that reads JSON from a file (or stdin) and yields one
    # Extraction per successfully parsed document.
    class JsonExtractor < Chronicle::ETL::Extractor
      include Extractors::Helpers::FilesystemReader

      register_connector do |r|
        r.description = 'input as JSON'
      end

      DEFAULT_OPTIONS = {
        filename: $stdin,

        # We're expecting line-separated json objects
        jsonl: true
      }.freeze

      def initialize(options = {})
        super(DEFAULT_OPTIONS.merge(options))
      end

      # Parse each unit of input (a line in jsonl mode, otherwise the whole
      # document) and yield an Extraction for everything that parses.
      def extract
        load_input do |input|
          parsed_data = parse_data(input)
          yield Chronicle::ETL::Extraction.new(data: parsed_data) if parsed_data
        end
      end

      # Unknown until the input stream is consumed.
      def results_count
      end

      private

      # Returns the parsed document, or nil when the input isn't valid JSON
      # (deliberately best-effort: malformed lines are skipped by #extract).
      def parse_data(data)
        JSON.parse(data)
      rescue JSON::ParserError
        nil
      end

      def load_input(&block)
        # FIX: honour the :jsonl option. Previously the input was always
        # read line-by-line regardless of this setting, so `jsonl: false`
        # (one JSON document spanning multiple lines) could never parse.
        read_from_filesystem(
          filename: @options[:filename],
          yield_each_line: @options[:jsonl],
          &block
        )
      end
    end
  end
end
module Chronicle
  module ETL
    # Extractor that turns each line of standard input into an Extraction.
    class StdinExtractor < Chronicle::ETL::Extractor
      register_connector do |r|
        r.description = 'stdin'
      end

      # Read all of stdin and yield one Extraction per stripped line.
      def extract
        $stdin.read.each_line do |raw_line|
          yield Chronicle::ETL::Extraction.new(data: { line: raw_line.strip })
        end
      end
    end
  end
end
require 'forwardable'
module Chronicle
  module ETL
    # Runtime representation of an ETL job: holds the extractor/transformer/
    # loader classes and options resolved from a JobDefinition and knows how
    # to instantiate each phase.
    class Job
      extend Forwardable

      def_delegators :@job_definition, :dry_run?

      attr_accessor :name,
        :extractor_klass,
        :extractor_options,
        :transformer_klass,
        :transformer_options,
        :loader_klass,
        :loader_options

      # TODO: build a proper id system
      alias id name

      def initialize(job_definition)
        @job_definition = job_definition
        @name = @job_definition.definition[:name]
        @extractor_options = @job_definition.extractor_options
        @transformer_options = @job_definition.transformer_options
        @loader_options = @job_definition.loader_options

        set_continuation if use_continuation?
        yield self if block_given?
      end

      def instantiate_extractor
        @extractor_klass = @job_definition.extractor_klass
        @extractor_klass.new(@extractor_options)
      end

      def instantiate_transformer(extraction)
        @transformer_klass = @job_definition.transformer_klass
        @transformer_klass.new(@transformer_options, extraction)
      end

      def instantiate_loader
        @loader_klass = @job_definition.loader_klass
        @loader_klass.new(@loader_options)
      end

      def save_log?
        # TODO: this needs more nuance
        !id.nil?
      end

      def to_s
        output = "Job"
        output += " '#{name}'".bold if name
        output += "\n"
        output += "  → Extracting from #{@job_definition.extractor_klass.description}\n"
        output += "  → Transforming #{@job_definition.transformer_klass.description}\n"
        output += "  → Loading to #{@job_definition.loader_klass.description}\n"
      end

      private

      # Seed the extractor with a cursor from the last successful run.
      def set_continuation
        # FIX: this previously read `@id`, which is never assigned (`id` is
        # an alias of the #name reader, not an ivar), so the continuation
        # lookup always received nil.
        continuation = Chronicle::ETL::JobLogger.load_latest(id)
        @extractor_options[:continuation] = continuation
      end

      def use_continuation?
        @job_definition.incremental?
      end
    end
  end
end
require 'active_support/core_ext/hash/deep_merge'

module Chronicle
  module ETL
    # Declarative description of an ETL job (which connectors to use and
    # with what options), built up by merging config hashes onto a skeleton.
    class JobDefinition
      SKELETON_DEFINITION = {
        incremental: false,
        extractor: {
          name: 'stdin',
          options: {}
        },
        transformer: {
          name: 'null',
          options: {}
        },
        loader: {
          name: 'stdout',
          options: {}
        }
      }.freeze

      attr_accessor :definition

      def initialize
        @definition = SKELETON_DEFINITION
      end

      # Add config hash to this definition
      def add_config(config = {})
        @definition = @definition.deep_merge(config)
        load_credentials
        validate
      end

      # Is this job continuing from a previous run?
      def incremental?
        @definition[:incremental]
      end

      def dry_run?
        @definition[:dry_run]
      end

      def extractor_klass
        load_klass(:extractor, @definition[:extractor][:name])
      end

      def transformer_klass
        load_klass(:transformer, @definition[:transformer][:name])
      end

      def loader_klass
        load_klass(:loader, @definition[:loader][:name])
      end

      def extractor_options
        @definition[:extractor][:options]
      end

      def transformer_options
        @definition[:transformer][:options]
      end

      def loader_options
        @definition[:loader][:options]
      end

      private

      # Resolve a connector identifier to its class via the registry.
      def load_klass(phase, identifier)
        Chronicle::ETL::Registry.find_by_phase_and_identifier(phase, identifier).klass
      end

      # Merge saved credentials into each phase's options when the phase
      # names a credentials set.
      def load_credentials
        Chronicle::ETL::Registry::PHASES.each do |phase|
          credentials_name = @definition[phase].dig(:options, :credentials)
          next unless credentials_name

          credentials = Chronicle::ETL::Config.load_credentials(credentials_name)
          # FIX: deep_merge is non-destructive; the merged result was
          # previously discarded, so credentials never reached the options.
          @definition[phase][:options] = @definition[phase][:options].deep_merge(credentials)
        end
      end

      def validate
        return true # TODO
      end
    end
  end
end
require 'forwardable'

module Chronicle
  module ETL
    # A record of what happened in the running of a job. We're interested in
    # tracking when it ran, if it was successful, and what the latest record
    # we found is (to use as a cursor for the next time)
    class JobLog
      extend Forwardable

      attr_accessor :job,
        :job_id,
        :last_id,
        :highest_timestamp,
        :num_records_processed,
        :started_at,
        :finished_at,
        :success

      def_delegators :@job, :save_log?

      # Create a new JobLog and set its instance variables from a serialized hash
      # FIX: this was previously defined after the `private` marker, which has
      # no effect on singleton methods and wrongly suggested it was internal —
      # it is public API (used by JobLogger.load_latest).
      def self.build_from_serialized(attrs)
        attrs.delete(:id)
        new do |job_log|
          attrs.each do |key, value|
            job_log.send("#{key}=", value)
          end
        end
      end

      # Create a new JobLog for a given Job
      def initialize
        @num_records_processed = 0
        @success = false
        yield self if block_given?
      end

      # Log the result of a single transformation in a job
      # @param transformer [Chronicle::ETL::Transformer] The transformer that ran
      def log_transformation(transformer)
        @last_id = transformer.id if transformer.id

        # Save the highest timestamp that we've encountered so far
        @highest_timestamp = [transformer.timestamp, @highest_timestamp].compact.max if transformer.timestamp

        # TODO: a transformer might yield nil. We might also want certain transformers to explode
        # records into multiple new ones. Therefore, this variable will need more subtle behaviour
        @num_records_processed += 1
      end

      # Indicate that a job has started
      def start
        @started_at = Time.now
      end

      # Indicate that a job has finished successfully
      def finish
        @finished_at = Time.now
        @success = true
      end

      # Indicate that a job ended with an error (@success stays false)
      def error
        @finished_at = Time.now
      end

      def job=(job)
        @job = job
        @job_id = job.id
      end

      # Elapsed run time in seconds, or nil if the job hasn't finished
      def duration
        return unless @finished_at

        @finished_at - @started_at
      end

      # Take a JobLog's instance variables and turn them into a hash representation
      def serialize
        {
          job_id: @job_id,
          last_id: @last_id,
          highest_timestamp: @highest_timestamp,
          num_records_processed: @num_records_processed,
          started_at: @started_at,
          finished_at: @finished_at,
          success: @success
        }
      end
    end
  end
end
require 'sequel'
require 'forwardable'

module Chronicle
  module ETL
    # Saves JobLogs to db and loads previous ones
    class JobLogger
      extend Forwardable

      def_delegators :@job_log, :start, :finish, :error, :log_transformation, :duration, :success
      attr_accessor :job_log

      # For a given `job_id`, return the last successful log
      def self.load_latest(job_id)
        with_db_connection do |db|
          # FIX: the query previously ignored `job_id` entirely and would
          # return the latest successful log of *any* job.
          attrs = db[:job_logs]
            .where(job_id: job_id, success: true)
            .reverse_order(:finished_at)
            .first
          JobLog.build_from_serialized(attrs) if attrs
        end
      end

      # Open (creating/migrating if needed) the sqlite log db and yield it.
      def self.with_db_connection
        initialize_db unless db_exists?
        Sequel.connect("sqlite://#{db_filename}") do |db|
          initialize_schema(db) unless schema_exists?(db)
          yield db
        end
      end

      def self.db_exists?
        # File.exists? is deprecated; use File.exist?
        File.exist?(db_filename)
      end

      def self.schema_exists?(db)
        db.tables.include?(:job_logs)
      end

      # XDG-style data path for the log database (via Runcom).
      def self.db_filename
        data = Runcom::Data.new "chronicle/etl/job_log.db"
        data.all[0].to_s
      end

      def self.initialize_db
        FileUtils.mkdir_p(File.dirname(db_filename))
      end

      def self.initialize_schema(db)
        db.create_table :job_logs do
          primary_key :id
          String :job_id, null: false
          String :last_id
          Time :highest_timestamp
          Integer :num_records_processed
          boolean :success, default: false
          Time :started_at
          Time :finished_at
        end
      end

      # Create a new JobLogger wrapping a fresh JobLog for `job`
      def initialize(job)
        @job_log = JobLog.new do |job_log|
          job_log.job = job
        end
      end

      # Save this JobLogger's JobLog to db
      def save
        return unless @job_log.save_log?

        JobLogger.with_db_connection do |db|
          db[:job_logs].insert(@job_log.serialize)
        end
      end

      def summarize
        @job_log.inspect
      end
    end
  end
end
@@ -3,17 +3,17 @@ require 'csv'
3
3
  module Chronicle
4
4
  module ETL
5
5
  class CsvLoader < Chronicle::ETL::Loader
6
+ register_connector do |r|
7
+ r.description = 'CSV'
8
+ end
9
+
6
10
  def initialize(options={})
7
11
  super(options)
8
12
  @rows = []
9
13
  end
10
14
 
11
- def load(result)
12
- if (result.is_a? Hash)
13
- @rows << result.values
14
- else
15
- @rows << result
16
- end
15
+ def load(record)
16
+ @rows << record.to_h_flattened.values
17
17
  end
18
18
 
19
19
  def finish
@@ -2,10 +2,10 @@ module Chronicle
2
2
  module ETL
3
3
  # Abstract class representing a Loader for an ETL job
4
4
  class Loader
5
- extend Chronicle::ETL::Catalog
5
+ extend Chronicle::ETL::Registry::SelfRegistering
6
6
 
7
7
  # Construct a new instance of this loader. Options are passed in from a Runner
8
- # == Paramters:
8
+ # == Parameters:
9
9
  # options::
10
10
  # Options for configuring this Loader
11
11
  def initialize(options = {})
module Chronicle
  module ETL
    # Loader that POSTs each record, serialized as JSON:API, to a REST
    # endpoint configured via :hostname, :endpoint, and :access_token.
    class RestLoader < Chronicle::ETL::Loader
      register_connector do |r|
        r.description = 'a REST endpoint'
      end

      def initialize(options = {})
        super(options)
      end

      def load(record)
        payload = Chronicle::ETL::JSONAPISerializer.serialize(record)
        # have the outer data key that json-api expects
        payload = { data: payload } unless payload[:data]

        uri = URI.parse("#{@options[:hostname]}#{@options[:endpoint]}")

        # FIX: use string keys consistently — the hash previously mixed a
        # string key ("Authorization") with a symbol key (Content-Type:),
        # and Net::HTTP expects header names as strings.
        header = {
          'Authorization' => "Bearer #{@options[:access_token]}",
          'Content-Type' => 'application/json'
        }
        use_ssl = uri.scheme == 'https'

        Net::HTTP.start(uri.host, uri.port, use_ssl: use_ssl) do |http|
          request = Net::HTTP::Post.new(uri.request_uri, header)
          request.body = payload.to_json
          http.request(request)
        end
      end
    end
  end
end
module Chronicle
  module ETL
    # Loader that writes each record to stdout as a JSON:API document.
    class StdoutLoader < Chronicle::ETL::Loader
      register_connector do |r|
        r.description = 'stdout'
      end

      # Serialize the record and print it as a single JSON line.
      def load(record)
        puts Chronicle::ETL::JSONAPISerializer.new(record).serializable_hash.to_json
      end
    end
  end
end
require 'tty/table'
require 'active_support/core_ext/string/filters'
require 'active_support/core_ext/hash/reverse_merge'

module Chronicle
  module ETL
    # Loader that buffers flattened records and renders them as a terminal
    # table when the job finishes.
    class TableLoader < Chronicle::ETL::Loader
      register_connector do |r|
        r.description = 'an ASCII table'
      end

      DEFAULT_OPTIONS = {
        fields_limit: nil,
        fields_exclude: ['lids', 'type'],
        fields_include: [],
        truncate_values_at: nil,
        table_renderer: :basic
      }.freeze

      def initialize(options = {})
        # FIX: call the Loader constructor so base-class setup isn't
        # silently skipped (the pre-refactor version did call super).
        super(options)
        @options = options.reverse_merge(DEFAULT_OPTIONS)
        @records = []
      end

      def load(record)
        @records << record.to_h_flattened
      end

      # Render all buffered records; no-op when nothing was loaded.
      def finish
        return if @records.empty?

        headers = build_headers(@records)
        rows = build_rows(@records, headers)

        @table = TTY::Table.new(header: headers, rows: rows)
        puts @table.render(
          @options[:table_renderer].to_sym,
          padding: [0, 2, 0, 0]
        )
      end

      private

      # Decide which columns to show: the explicit include list if given,
      # otherwise every key seen across the flattened records, minus any
      # whose names end with an excluded suffix, capped at fields_limit.
      def build_headers(records)
        headers =
          if @options[:fields_include].any?
            Set[*@options[:fields_include]]
          else
            # use all the keys of the flattened record hash
            Set[*records.map(&:keys).flatten.map(&:to_s).uniq]
          end

        headers = headers.delete_if { |header| header.end_with?(*@options[:fields_exclude]) } if @options[:fields_exclude].any?
        headers = headers.first(@options[:fields_limit]) if @options[:fields_limit]

        headers.to_a.map(&:to_sym)
      end

      # Build one stringified row per record, optionally truncating values.
      def build_rows(records, headers)
        records.map do |record|
          values = record.values_at(*headers).map(&:to_s)

          if @options[:truncate_values_at]
            values = values.map { |value| value.truncate(@options[:truncate_values_at]) }
          end

          values
        end
      end
    end
  end
end