remi 0.2.27 → 0.2.28

This diff shows the changes between two publicly released versions of the package, as published to its public registry. It is provided for informational purposes only.
Files changed (49)
  1. checksums.yaml +4 -4
  2. data/Gemfile +3 -0
  3. data/Gemfile.lock +34 -5
  4. data/features/metadata.feature +17 -0
  5. data/features/step_definitions/remi_step.rb +6 -6
  6. data/features/transforms/date_diff.feature +1 -0
  7. data/jobs/aggregate_job.rb +0 -1
  8. data/jobs/all_jobs_shared.rb +0 -2
  9. data/jobs/copy_source_job.rb +0 -1
  10. data/jobs/csv_file_target_job.rb +0 -1
  11. data/jobs/metadata_job.rb +60 -0
  12. data/jobs/parameters_job.rb +1 -1
  13. data/jobs/sample_job.rb +19 -20
  14. data/jobs/sftp_file_target_job.rb +0 -1
  15. data/jobs/transforms/date_diff_job.rb +1 -1
  16. data/jobs/transforms/nvl_job.rb +1 -1
  17. data/jobs/transforms/parse_date_job.rb +7 -4
  18. data/jobs/transforms/prefix_job.rb +1 -1
  19. data/jobs/transforms/truncate_job.rb +1 -1
  20. data/lib/remi.rb +10 -15
  21. data/lib/remi/cucumber/business_rules.rb +23 -23
  22. data/lib/remi/cucumber/data_source.rb +2 -1
  23. data/lib/remi/data_frame.rb +36 -0
  24. data/lib/remi/data_frame/daru.rb +67 -0
  25. data/lib/remi/data_subject.rb +71 -10
  26. data/lib/remi/data_subject/csv_file.rb +151 -0
  27. data/lib/remi/data_subject/data_frame.rb +53 -0
  28. data/lib/remi/data_subject/postgres.rb +136 -0
  29. data/lib/remi/data_subject/salesforce.rb +136 -0
  30. data/lib/remi/data_subject/sftp_file.rb +66 -0
  31. data/lib/remi/fields.rb +8 -0
  32. data/lib/remi/source_to_target_map.rb +56 -32
  33. data/lib/remi/transform.rb +426 -83
  34. data/lib/remi/version.rb +1 -1
  35. data/remi.gemspec +2 -1
  36. data/spec/metadata_spec.rb +62 -0
  37. metadata +15 -28
  38. data/lib/remi/data_source.rb +0 -13
  39. data/lib/remi/data_source/csv_file.rb +0 -101
  40. data/lib/remi/data_source/data_frame.rb +0 -16
  41. data/lib/remi/data_source/postgres.rb +0 -58
  42. data/lib/remi/data_source/salesforce.rb +0 -87
  43. data/lib/remi/data_target.rb +0 -15
  44. data/lib/remi/data_target/csv_file.rb +0 -42
  45. data/lib/remi/data_target/data_frame.rb +0 -14
  46. data/lib/remi/data_target/postgres.rb +0 -74
  47. data/lib/remi/data_target/salesforce.rb +0 -54
  48. data/lib/remi/data_target/sftp_file.rb +0 -54
  49. data/lib/remi/refinements/daru.rb +0 -85
data/lib/remi/data_subject/data_frame.rb
@@ -0,0 +1,53 @@
+module Remi
+
+  class DataSource::DataFrame < Remi::DataSubject
+    include Remi::DataSubject::DataSource
+
+    def initialize(*args, **kargs, &block)
+      super
+      init_df(*args, **kargs, &block)
+    end
+
+    # Public: Called to extract data from the source.
+    #
+    # Returns data in a format that can be used to create a dataframe.
+    def extract!
+      @extract = []
+    end
+
+    # Public: Converts extracted data to a dataframe
+    #
+    # Returns a Remi::DataFrame
+    def to_dataframe
+      DataFrame.create(@remi_df_type, extract, order: @fields.keys)
+    end
+
+    private
+
+    def init_df(*args, **kargs, &block)
+    end
+  end
+
+
+  class DataTarget::DataFrame < Remi::DataSubject
+    include Remi::DataSubject::DataTarget
+
+    def initialize(*args, **kargs, &block)
+      super
+      init_df(*args, **kargs, &block)
+    end
+
+    # Public: Performs the load operation, regardless of whether it has
+    # already executed.
+    #
+    # Returns true if the load operation was successful
+    def load!
+      true
+    end
+
+    private
+
+    def init_df(*args, **kargs, &block)
+    end
+  end
+end
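
A minimal sketch of the extract/convert lifecycle these two classes define. The fields: keyword and the Remi::Fields wrapper are assumptions inferred from the initializer and to_dataframe above; extract! and to_dataframe are the methods shown in the diff.

    require 'remi'

    # Hypothetical wiring; DataSubject's constructor options are not shown in this diff.
    source = Remi::DataSource::DataFrame.new(fields: Remi::Fields.new(id: {}, name: {}))
    source.extract!            # the base implementation stubs an empty extract ([])
    df = source.to_dataframe   # a Remi::DataFrame with vectors ordered by @fields.keys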
data/lib/remi/data_subject/postgres.rb
@@ -0,0 +1,136 @@
+module Remi
+  module DataSubject::Postgres
+    def connection
+      @connection ||= PG.connect(
+        host: @credentials[:host] || 'localhost',
+        port: @credentials[:port] || 5432,
+        dbname: @credentials[:dbname],
+        user: @credentials[:user] || `whoami`.chomp,
+        password: @credentials[:password],
+        sslmode: @credentials[:sslmode] || 'allow'
+      )
+    end
+  end
+
+
+  class DataSource::Postgres < Remi::DataSubject
+    include Remi::DataSubject::DataSource
+    include Remi::DataSubject::Postgres
+
+
+    def initialize(*args, **kargs, &block)
+      super
+      init_postgres(*args, **kargs, &block)
+    end
+
+    # Public: Called to extract data from the source.
+    #
+    # Returns data in a format that can be used to create a dataframe.
+    def extract!
+      @logger.info "Executing query #{@query}"
+      @extract = connection.exec @query
+    end
+
+    # Public: Converts extracted data to a dataframe.
+    # Currently only supports Daru DataFrames.
+    #
+    # Returns a Remi::DataFrame
+    def to_dataframe
+      # Performance for larger sets could be improved by using bulk query (via COPY)
+      @logger.info "Converting query to a dataframe"
+
+      hash_array = {}
+      extract.each do |row|
+        row.each do |field, value|
+          (hash_array[field_symbolizer.call(field)] ||= []) << value
+        end
+      end
+
+      # After converting to DF, clear the PG results to save memory.
+      extract.clear
+
+      Remi::DataFrame.create(@remi_df_type, hash_array, order: hash_array.keys)
+    end
+
+
+    private
+
+    def init_postgres(*args, credentials:, query:, **kargs, &block)
+      @credentials = credentials
+      @query = query
+    end
+  end
+
+
+
+  # VERY PRELIMINARY IMPLEMENTATION - ONLY LOADS TO TEMP TABLES
+  # IT IS THEN UP TO THE USER TO DO ELT TO LOAD THE FINAL TABLE
+  class DataTarget::Postgres < Remi::DataSubject
+    include Remi::DataSubject::DataTarget
+    include Remi::DataSubject::Postgres
+
+    def initialize(*args, **kargs, &block)
+      super
+      init_postgres(*args, **kargs, &block)
+    end
+
+    # Public: Performs the load operation, regardless of whether it has
+    # already executed.
+    #
+    # Returns true if the load operation was successful
+    def load!
+      @logger.info "Performing postgres load to table #{@table_name}"
+      create_target_table
+      load_target_table
+
+      true
+    end
+
+
+    private
+
+    def init_postgres(*args, credentials:, table_name:, **kargs, &block)
+      @credentials = credentials
+      @table_name = table_name
+    end
+
+    def fields_with_type_ddl
+      @fields.map { |k,v| "#{k} #{v[:type]}" }.join(', ')
+    end
+
+    def create_target_table
+      create_table_sql = <<-EOT
+        CREATE TEMPORARY TABLE #{@table_name} (
+          #{fields_with_type_ddl}
+        )
+      EOT
+
+      @logger.info create_table_sql
+      connection.exec create_table_sql
+    end
+
+    def load_target_table
+      connection.copy_data "COPY #{@table_name} (#{@fields.keys.join(', ')}) FROM STDIN" do
+        df.each(:row) do |row|
+          row_str = @fields.keys.map do |field|
+            field = row[field]
+            case
+            when field.respond_to?(:strftime)
+              field.strftime('%Y-%m-%d %H:%M:%S')
+            when field.respond_to?(:map)
+              field.to_json.gsub("\t", '\t')
+            when field.blank? && !field.nil?
+              ''
+            when field.nil?
+              '\N'
+            else
+              field.to_s.gsub(/[\t\n\r]/, "\t" => '\t', "\n" => '\n', "\r" => '\r')
+            end
+          end.join("\t")
+
+          connection.put_copy_data row_str + "\n"
+        end
+      end
+    end
+  end
+end
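
A minimal sketch of driving the new Postgres source, using only the credentials: and query: keywords visible in init_postgres above; the credential keys mirror the PG.connect call, and the query, database, and user names are placeholders.

    source = Remi::DataSource::Postgres.new(
      credentials: { host: 'localhost', dbname: 'my_db', user: 'etl_user' },
      query: 'SELECT id, email FROM contacts'
    )
    source.extract!                 # runs the query through PG and caches the result
    contacts = source.to_dataframe  # symbolizes field names, then frees the PG result

Note that the target class deliberately loads only to a TEMPORARY table via COPY; promoting those rows into a final table is left to downstream ELT SQL, as its header comment warns.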
data/lib/remi/data_subject/salesforce.rb
@@ -0,0 +1,136 @@
+require 'restforce'
+require 'salesforce_bulk_api'
+require 'remi/sf_bulk_helper'
+
+module Remi
+  module DataSubject::Salesforce
+    def field_symbolizer
+      Remi::FieldSymbolizers[:salesforce]
+    end
+
+    def restforce_client
+      @restforce_client ||= begin
+        client = Restforce.new(@credentials)
+
+        # Run a dummy query to initiate a connection. Workaround for Bulk API problem
+        # https://github.com/yatish27/salesforce_bulk_api/issues/33
+        client.query('SELECT Id FROM Contact LIMIT 1')
+        client
+      end
+    end
+  end
+
+
+  class DataSource::Salesforce < Remi::DataSubject
+    include Remi::DataSubject::DataSource
+    include Remi::DataSubject::Salesforce
+
+    def initialize(*args, **kargs, &block)
+      super
+      init_salesforce(*args, **kargs, &block)
+    end
+
+    # Public: Called to extract data from the source.
+    #
+    # Returns data in a format that can be used to create a dataframe.
+    def extract!
+      @extract = sf_bulk.query(@sfo, @query, 10000)
+
+      check_for_errors(@extract)
+      @extract
+    end
+
+    def sf_bulk
+      @sf_bulk ||= SalesforceBulkApi::Api.new(restforce_client).tap { |o| o.connection.set_status_throttle(5) }
+    end
+
+    # Public: Converts extracted data to a dataframe.
+    # Currently only supports Daru DataFrames.
+    #
+    # Returns a Remi::DataFrame
+    def to_dataframe
+      @logger.info "Converting salesforce query results to a dataframe"
+
+      hash_array = {}
+      extract['batches'].each do |batch|
+        next unless batch['response']
+
+        batch['response'].each do |record|
+          record.each do |field, value|
+            next if ['xsi:type','type'].include? field
+            (hash_array[field.to_sym] ||= []) << case value.first
+              when Hash
+                value.first["xsi:nil"] == "true" ? nil : value.first
+              else
+                value.first
+            end
+          end
+        end
+
+        # delete raw result at end of processing to free memory
+        batch['response'] = nil
+      end
+
+      Remi::DataFrame.create(@remi_df_type, hash_array, order: hash_array.keys)
+    end
+
+
+    private
+
+    def init_salesforce(*args, object:, query:, credentials:, api: :bulk, **kargs, &block)
+      @sfo = object
+      @query = query
+      @credentials = credentials
+      @api = api
+    end
+
+    def check_for_errors(sf_result)
+      sf_result['batches'].each do |batch|
+        raise "Error with batch #{batch['id']} - #{batch['state']}: #{batch['stateMessage']}" unless batch['state'].first == 'Completed'
+      end
+    end
+  end
+
+
+  class DataTarget::Salesforce < Remi::DataSubject
+    include Remi::DataSubject::DataTarget
+    include Remi::DataSubject::Salesforce
+
+    def initialize(*args, **kargs, &block)
+      super
+      init_salesforce(*args, **kargs, &block)
+    end
+
+    # Public: Performs the load operation, regardless of whether it has
+    # already executed.
+    #
+    # Returns true if the load operation was successful
+    def load!
+      @logger.info "Performing Salesforce #{@operation} on object #{@sfo}"
+
+      if @operation == :update
+        Remi::SfBulkHelper::SfBulkUpdate.update(restforce_client, @sfo, df_as_array_of_hashes, logger: @logger)
+      elsif @operation == :create
+        Remi::SfBulkHelper::SfBulkCreate.create(restforce_client, @sfo, df_as_array_of_hashes, logger: @logger)
+      elsif @operation == :upsert
+        Remi::SfBulkHelper::SfBulkUpsert.upsert(restforce_client, @sfo, df_as_array_of_hashes, external_id: @external_id, logger: @logger)
+      else
+        raise ArgumentError, "Unknown operation: #{@operation}"
+      end
+
+      true
+    end
+
+    private
+
+    def init_salesforce(*args, object:, operation:, credentials:, external_id: 'Id', api: :bulk, **kargs, &block)
+      @sfo = object
+      @operation = operation
+      @external_id = external_id
+      @credentials = credentials
+      @api = api
+    end
+  end
+
+
+end
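
A minimal sketch of the Salesforce source, using the keywords required by init_salesforce above. The credentials hash is whatever Restforce.new accepts; the values here are placeholders.

    source = Remi::DataSource::Salesforce.new(
      object: 'Contact',
      query: 'SELECT Id, Email FROM Contact',
      credentials: { username: 'user@example.com', password: 'secret' }
    )
    source.extract!          # bulk query in 10,000-record batches; raises unless every batch completes
    df = source.to_dataframe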
data/lib/remi/data_subject/sftp_file.rb
@@ -0,0 +1,66 @@
+module Remi
+
+  class DataTarget::SftpFile < Remi::DataSubject
+    include Remi::DataSubject::DataTarget
+
+    def initialize(*args, **kargs, &block)
+      super
+      init_sftp_file(*args, **kargs, &block)
+    end
+
+    attr_reader :local_path
+    attr_reader :remote_path
+
+    # Public: Performs the load operation, regardless of whether it has
+    # already executed.
+    #
+    # Returns true if the load operation was successful
+    def load!
+      @logger.info "Uploading #{@local_path} to #{@credentials[:username]}@#{@credentials[:host]}: #{@remote_path}"
+      connection do |sftp|
+        retry_upload { sftp.upload! @local_path, @remote_path }
+      end
+
+      true
+    end
+
+
+    private
+
+    def init_sftp_file(*args, credentials:, local_path:, remote_path: File.basename(local_path), **kargs, &block)
+      @credentials = credentials
+      @local_path = local_path
+      @remote_path = remote_path
+      init_df
+    end
+
+    def init_df
+      parameter_df = Daru::DataFrame.new(
+        local_path: Array(@local_path),
+        remote_path: Array(@remote_path)
+      )
+      self.df = parameter_df
+    end
+
+    def connection(&block)
+      result = nil
+      Net::SFTP.start(@credentials[:host], @credentials[:username], password: @credentials[:password], port: @credentials[:port] || '22') do |sftp|
+        result = yield sftp
+      end
+      result
+    end
+
+    def retry_upload(ntry=2, &block)
+      1.upto(ntry).each do |itry|
+        begin
+          block.call
+        rescue RuntimeError => err
+          raise err unless itry < ntry
+          @logger.error "Upload failed with error: #{err.message}"
+          @logger.error "Retry attempt #{itry}/#{ntry-1}"
+          sleep(1)
+        end
+      end
+    end
+  end
+end
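
A minimal sketch of the SFTP target, using the keywords from init_sftp_file above; the host and paths are placeholders, and remote_path defaults to the basename of local_path.

    target = Remi::DataTarget::SftpFile.new(
      credentials: { host: 'sftp.example.com', username: 'etl', password: 'secret' },
      local_path: 'extracts/contacts.csv'
    )
    target.load!   # uploads via Net::SFTP, retrying once on RuntimeError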
data/lib/remi/fields.rb
@@ -0,0 +1,8 @@
+module Remi
+  class Fields < SimpleDelegator
+    def initialize(fields=Hash.new({}))
+      @fields = Hash.new({}).merge fields
+      super(@fields)
+    end
+  end
+end
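
The Hash.new({}) default is the point of this small delegator: looking up an undeclared field returns an empty hash instead of nil, so chained metadata lookups degrade quietly. A quick sketch:

    fields = Remi::Fields.new(id: { type: 'integer' })
    fields[:id][:type]        #=> "integer"
    fields[:missing]          #=> {}  (the default)
    fields[:missing][:type]   #=> nil, rather than a NoMethodError on nil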
data/lib/remi/source_to_target_map.rb
@@ -1,50 +1,61 @@
 module Remi
   class SourceToTargetMap
-    def initialize(source_df, target_df=nil)
+    def initialize(source_df, target_df=nil, source_metadata: Remi::Fields.new, target_metadata: Remi::Fields.new)
       @source_df = source_df
-      @target_df = target_df || @source_df
+      @source_metadata = source_metadata
+
+      if target_df
+        @target_df = target_df
+        @target_metadata = target_metadata
+      else
+        @target_df = @source_df
+        @target_metadata = @source_metadata
+      end
 
       reset_map
     end
 
-    def self.apply(source_df, target_df=nil, &block)
-      target_df ||= source_df
-      Docile.dsl_eval(SourceToTargetMap.new(source_df, target_df), &block)
+    def self.apply(source_df, target_df=nil, source_metadata: Remi::Fields.new, target_metadata: Remi::Fields.new, &block)
+      sttm = SourceToTargetMap.new(source_df, target_df, source_metadata: source_metadata, target_metadata: target_metadata)
+      Docile.dsl_eval(sttm, &block)
     end
 
-    def source(*source_fields)
-      @source_fields = Array(source_fields)
+    def source(*source_vectors)
+      @source_vectors = Array(source_vectors)
       self
     end
 
     def transform(*transforms)
       @transforms += Array(transforms)
+      @transform_procs += Array(transforms).map { |t| t.to_proc }
       self
     end
 
-    def target(*target_fields)
-      @target_fields = Array(target_fields)
+    def target(*target_vectors)
+      @target_vectors = Array(target_vectors)
       self
     end
 
     def reset_map
-      @source_fields = []
-      @target_fields = []
+      @source_vectors = []
+      @target_vectors = []
       @transforms = []
+      @transform_procs = []
     end
 
     def map(*args)
+      inject_transform_with_metadata
+
      case
-      when @source_fields.include?(nil)
+      when @source_vectors.include?(nil)
        do_map_generic
-      when @source_fields.size == 1 && @transforms.size == 0
+      when @source_vectors.size == 1 && @transforms.size == 0
        do_map_direct_copy
-      when @source_fields.size == 1 && @target_fields.size == 1
-        do_map_single_source_and_target_field
+      when @source_vectors.size == 1 && @target_vectors.size == 1
+        do_map_single_source_and_target_vector
      else
        do_map_generic
      end
-
      reset_map
     end
 
@@ -52,39 +63,52 @@ module Remi
 
     private
 
+    def inject_transform_with_metadata
+      @transforms.each do |tform|
+        if tform.respond_to? :source_metadata
+          meta = @source_vectors.map { |v| @source_metadata[v] || {} }
+          tform.source_metadata = meta.size > 1 ? meta : meta.first
+        end
+        if tform.respond_to? :target_metadata
+          meta = @target_vectors.map { |v| @target_metadata[v] || {} }
+          tform.target_metadata = meta.size > 1 ? meta : meta.first
+        end
+      end
+    end
+
     def do_map_direct_copy
-      @target_fields.each do |target_field|
-        @target_df[target_field] = @source_df[@source_fields.first].dup
+      @target_vectors.each do |target_vector|
+        @target_df[target_vector] = @source_df[@source_vectors.first].dup
       end
     end
 
-    def do_map_single_source_and_target_field
-      @target_df[@target_fields.first] = @source_df[@source_fields.first].recode do |field_value|
-        @transforms.reduce(field_value) { |value, tform| tform.call(*(value || [nil])) }
+    def do_map_single_source_and_target_vector
+      @target_df[@target_vectors.first] = @source_df[@source_vectors.first].recode do |vector_value|
+        @transform_procs.reduce(vector_value) { |value, tform| tform.call(*(value || [nil])) }
       end
     end
 
     def do_map_generic
-      work_vector = if @source_fields.size == 1 && @source_fields.first != nil
-                      @source_df[@source_fields.first].dup
-                    elsif @source_fields.size > 1
+      work_vector = if @source_vectors.size == 1 && @source_vectors.first != nil
+                      @source_df[@source_vectors.first].dup
+                    elsif @source_vectors.size > 1
                       # It's faster to zip together several vectors and recode those than it is to
                       # recode a dataframe row by row!
-                      Daru::Vector.new(@source_df[@source_fields.first].zip(*@source_fields[1..-1].map { |name| @source_df[name] }), index: @source_df.index)
+                      Daru::Vector.new(@source_df[@source_vectors.first].zip(*@source_vectors[1..-1].map { |name| @source_df[name] }), index: @source_df.index)
                     else
                       Daru::Vector.new([], index: @source_df.index)
                     end
 
-      work_vector.recode! do |field_value|
-        @transforms.reduce(field_value) { |value, tform| tform.call(*(value || [nil])) }
+      work_vector.recode! do |vector_value|
+        @transform_procs.reduce(vector_value) { |value, tform| tform.call(*(value || [nil])) }
       end
 
-      @target_fields.each_with_index do |target_field, field_idx|
-        @target_df[target_field] = work_vector.recode do |field_value|
-          if field_value.is_a?(Array) then
-            field_value[field_idx]
+      @target_vectors.each_with_index do |target_vector, vector_idx|
+        @target_df[target_vector] = work_vector.recode do |vector_value|
+          if vector_value.is_a?(Array) then
+            vector_value[vector_idx]
           else
-            field_value
+            vector_value
           end
         end
       end
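
A minimal sketch of the revised DSL, assuming a Daru source dataframe. The lambda transform is a placeholder: transform accepts anything responding to to_proc, and transforms that expose source_metadata=/target_metadata= have metadata injected before the map runs.

    df = Daru::DataFrame.new(name: ['ada', 'grace'])

    Remi::SourceToTargetMap.apply(df, source_metadata: Remi::Fields.new(name: { type: 'string' })) do
      map source(:name).transform(->(v) { v.to_s.upcase }).target(:name_upcase)
    end

    df[:name_upcase].to_a  #=> ["ADA", "GRACE"]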