remi 0.2.27 → 0.2.28
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +3 -0
- data/Gemfile.lock +34 -5
- data/features/metadata.feature +17 -0
- data/features/step_definitions/remi_step.rb +6 -6
- data/features/transforms/date_diff.feature +1 -0
- data/jobs/aggregate_job.rb +0 -1
- data/jobs/all_jobs_shared.rb +0 -2
- data/jobs/copy_source_job.rb +0 -1
- data/jobs/csv_file_target_job.rb +0 -1
- data/jobs/metadata_job.rb +60 -0
- data/jobs/parameters_job.rb +1 -1
- data/jobs/sample_job.rb +19 -20
- data/jobs/sftp_file_target_job.rb +0 -1
- data/jobs/transforms/date_diff_job.rb +1 -1
- data/jobs/transforms/nvl_job.rb +1 -1
- data/jobs/transforms/parse_date_job.rb +7 -4
- data/jobs/transforms/prefix_job.rb +1 -1
- data/jobs/transforms/truncate_job.rb +1 -1
- data/lib/remi.rb +10 -15
- data/lib/remi/cucumber/business_rules.rb +23 -23
- data/lib/remi/cucumber/data_source.rb +2 -1
- data/lib/remi/data_frame.rb +36 -0
- data/lib/remi/data_frame/daru.rb +67 -0
- data/lib/remi/data_subject.rb +71 -10
- data/lib/remi/data_subject/csv_file.rb +151 -0
- data/lib/remi/data_subject/data_frame.rb +53 -0
- data/lib/remi/data_subject/postgres.rb +136 -0
- data/lib/remi/data_subject/salesforce.rb +136 -0
- data/lib/remi/data_subject/sftp_file.rb +66 -0
- data/lib/remi/fields.rb +8 -0
- data/lib/remi/source_to_target_map.rb +56 -32
- data/lib/remi/transform.rb +426 -83
- data/lib/remi/version.rb +1 -1
- data/remi.gemspec +2 -1
- data/spec/metadata_spec.rb +62 -0
- metadata +15 -28
- data/lib/remi/data_source.rb +0 -13
- data/lib/remi/data_source/csv_file.rb +0 -101
- data/lib/remi/data_source/data_frame.rb +0 -16
- data/lib/remi/data_source/postgres.rb +0 -58
- data/lib/remi/data_source/salesforce.rb +0 -87
- data/lib/remi/data_target.rb +0 -15
- data/lib/remi/data_target/csv_file.rb +0 -42
- data/lib/remi/data_target/data_frame.rb +0 -14
- data/lib/remi/data_target/postgres.rb +0 -74
- data/lib/remi/data_target/salesforce.rb +0 -54
- data/lib/remi/data_target/sftp_file.rb +0 -54
- data/lib/remi/refinements/daru.rb +0 -85
@@ -0,0 +1,53 @@
|
|
1
|
+
module Remi
|
2
|
+
|
3
|
+
class DataSource::DataFrame < Remi::DataSubject
  include Remi::DataSubject::DataSource

  # Public: Builds the data source.  All arguments are handed to the
  # superclass initializer first and then to the (currently empty)
  # init_df hook.
  def initialize(*args, **kargs, &block)
    super
    init_df(*args, **kargs, &block)
  end

  # Public: Called to extract data from the source.  This source has no
  # external backing store, so the extracted data is always an empty array.
  #
  # Returns data in a format that can be used to create a dataframe.
  def extract!
    @extract = []
  end

  # Public: Converts extracted data to a dataframe whose vectors are ordered
  # by the keys of @fields.
  #
  # Returns a Remi::DataFrame
  def to_dataframe
    extracted = extract
    # NOTE(review): the bare DataFrame constant presumably resolves to
    # Remi::DataFrame via the enclosing Remi module - confirm.
    DataFrame.create(@remi_df_type, extracted, order: @fields.keys)
  end

  private

  # Internal: Initialization hook.  Accepts (and ignores) every argument
  # given to the constructor.
  def init_df(*_args, **_kargs, &_block); end
end
|
30
|
+
|
31
|
+
|
32
|
+
class DataTarget::DataFrame < Remi::DataSubject
  include Remi::DataSubject::DataTarget

  # Public: Builds the data target.  All arguments are handed to the
  # superclass initializer first and then to the (currently empty)
  # init_df hook.
  def initialize(*args, **kargs, &block)
    super
    init_df(*args, **kargs, &block)
  end

  # Public: Performs the load operation, regardless of whether it has
  # already executed.  Nothing is written anywhere by this target; the
  # method only reports success.
  #
  # Returns true if the load operation was successful
  def load!
    true
  end

  private

  # Internal: Initialization hook.  Accepts (and ignores) every argument
  # given to the constructor.
  def init_df(*_args, **_kargs, &_block); end
end
|
53
|
+
end
|
@@ -0,0 +1,136 @@
|
|
1
|
+
module Remi
|
2
|
+
module DataSubject::Postgres
  # Public: Lazily opens the PG connection described by @credentials and
  # reuses it on every later call.
  #
  # Missing credentials fall back to: host 'localhost', port 5432, the
  # output of the `whoami` shell command as user, and sslmode 'allow'.
  # dbname and password are passed through as given (possibly nil).
  #
  # Returns the object returned by PG.connect.
  def connection
    return @connection if @connection

    creds = @credentials
    @connection = PG.connect(
      host: creds[:host] || 'localhost',
      port: creds[:port] || 5432,
      dbname: creds[:dbname],
      # The shell is only invoked when no user was configured.
      user: creds[:user] || `whoami`.chomp,
      password: creds[:password],
      sslmode: creds[:sslmode] || 'allow'
    )
  end
end
|
14
|
+
|
15
|
+
|
16
|
+
class DataSource::Postgres < Remi::DataSubject
  include Remi::DataSubject::DataSource
  include Remi::DataSubject::Postgres

  # Public: Builds the data source.  All arguments go to the superclass
  # initializer and then to init_postgres, which requires the credentials:
  # and query: keywords.
  def initialize(*args, **kargs, &block)
    super
    init_postgres(*args, **kargs, &block)
  end

  # Public: Called to extract data from the source.  Logs the query and
  # executes it on the shared connection.
  #
  # Returns data in a format that can be used to create a dataframe.
  def extract!
    @logger.info "Executing query #{@query}"
    @extract = connection.exec(@query)
  end

  # Public: Converts extracted data to a dataframe.
  # Currently only supports Daru DataFrames.
  #
  # Returns a Remi::DataFrame
  def to_dataframe
    # Performance for larger sets could be improved by using bulk query (via COPY)
    @logger.info "Converting query to a dataframe"

    # Pivot the row-oriented result into { column_name => [values, ...] },
    # keeping columns in the order they are first seen.
    columns = {}
    extract.each do |result_row|
      result_row.each do |column, cell|
        column_key = field_symbolizer.call(column)
        columns[column_key] ||= []
        columns[column_key] << cell
      end
    end

    # After converting to DF, clear the PG results to save memory.
    extract.clear

    Remi::DataFrame.create(@remi_df_type, columns, order: columns.keys)
  end

  private

  # Internal: Stores the Postgres-specific settings.
  #
  # credentials - Hash-like object read by #connection.
  # query       - The SQL text executed by #extract!.
  #
  # Any other argument is accepted and ignored.
  def init_postgres(*_args, credentials:, query:, **_kargs, &_block)
    @credentials = credentials
    @query = query
  end
end
|
63
|
+
|
64
|
+
|
65
|
+
|
66
|
+
# VERY PRELIMINARY IMPLEMENTATION - ONLY LOADS TO TEMP TABLES
# IT IS THEN UP TO THE USER TO DO ELT TO LOAD THE FINAL TABLE
class DataTarget::Postgres < Remi::DataSubject
  include Remi::DataSubject::DataTarget
  include Remi::DataSubject::Postgres

  # Internal: Characters that must be backslash-escaped inside a column
  # value sent in COPY text format, mapped to their escaped form.  The
  # backslash itself has to be escaped, otherwise Postgres interprets
  # sequences such as \n or \" found in the data.
  COPY_TEXT_ESCAPES = {
    '\\' => '\\\\',
    "\t" => '\t',
    "\n" => '\n',
    "\r" => '\r'
  }.freeze

  # Internal: Matches every character listed in COPY_TEXT_ESCAPES.
  COPY_TEXT_ESCAPE_PATTERN = /[\\\t\n\r]/

  # Public: Builds the data target.  All arguments go to the superclass
  # initializer and then to init_postgres, which requires the credentials:
  # and table_name: keywords.
  def initialize(*args, **kargs, &block)
    super
    init_postgres(*args, **kargs, &block)
  end

  # Public: Performs the load operation, regardless of whether it has
  # already executed.  Creates the temporary target table and then streams
  # the dataframe into it with COPY.
  #
  # Returns true if the load operation was successful
  def load!
    @logger.info "Performing postgres load to table #{@table_name}"
    create_target_table
    load_target_table

    true
  end

  private

  # Internal: Stores the Postgres-specific settings.
  #
  # credentials - Hash-like object read by #connection.
  # table_name  - Name of the temporary table to create and load.
  #
  # Any other argument is accepted and ignored.
  def init_postgres(*args, credentials:, table_name:, **kargs, &block)
    @credentials = credentials
    @table_name = table_name
  end

  # Internal: Builds the column list of the CREATE TABLE statement from
  # @fields, using each field's :type entry as the column type.
  #
  # Returns a String such as "id integer, name text".
  def fields_with_type_ddl
    @fields.map { |name, meta| "#{name} #{meta[:type]}" }.join(', ')
  end

  # Internal: Creates the temporary table that receives the data.  The
  # statement is logged before it is executed.
  def create_target_table
    create_table_sql = <<-EOT
      CREATE TEMPORARY TABLE #{@table_name} (
        #{fields_with_type_ddl}
      )
    EOT

    @logger.info create_table_sql
    connection.exec create_table_sql
  end

  # Internal: Streams every dataframe row into the target table through
  # COPY ... FROM STDIN, one tab-delimited text line per row.  Only the
  # columns named in @fields are sent, in @fields order.
  def load_target_table
    column_names = @fields.keys

    connection.copy_data "COPY #{@table_name} (#{column_names.join(', ')}) FROM STDIN" do
      df.each(:row) do |row|
        row_str = column_names.map { |name| copy_text_value(row[name]) }.join("\t")
        connection.put_copy_data row_str + "\n"
      end
    end
  end

  # Internal: Renders a single value as a COPY text-format field.
  #
  # value - The cell value taken from the dataframe row.
  #
  # Returns a String:
  #   * objects responding to strftime -> 'YYYY-MM-DD HH:MM:SS'
  #   * objects responding to map      -> their JSON form, escaped
  #   * nil                            -> \N (the COPY NULL marker)
  #   * other blank values             -> empty string
  #   * anything else                  -> to_s, escaped
  def copy_text_value(value)
    case
    when value.respond_to?(:strftime)
      value.strftime('%Y-%m-%d %H:%M:%S')
    when value.respond_to?(:map)
      # JSON text routinely contains backslashes (\" and \n escapes); they
      # must be doubled so they reach the table unchanged.
      escape_copy_text(value.to_json)
    when value.nil?
      '\N'
    when value.blank?
      ''
    else
      escape_copy_text(value.to_s)
    end
  end

  # Internal: Escapes backslash, tab, newline and carriage return so that
  # the text survives the COPY text format unchanged.
  #
  # text - The String to escape.
  #
  # Returns a new escaped String.
  def escape_copy_text(text)
    text.gsub(COPY_TEXT_ESCAPE_PATTERN, COPY_TEXT_ESCAPES)
  end
end
|
136
|
+
end
|
@@ -0,0 +1,136 @@
|
|
1
|
+
require 'restforce'
|
2
|
+
require 'salesforce_bulk_api'
|
3
|
+
require 'remi/sf_bulk_helper'
|
4
|
+
|
5
|
+
module Remi
|
6
|
+
module DataSubject::Salesforce
  # Public: The field symbolizer registered under :salesforce in
  # Remi::FieldSymbolizers.
  def field_symbolizer
    Remi::FieldSymbolizers[:salesforce]
  end

  # Public: Lazily builds a Restforce client from @credentials and reuses
  # it on later calls.  The client is only memoized once the warm-up query
  # below has succeeded.
  #
  # Returns the Restforce client.
  def restforce_client
    @restforce_client ||= Restforce.new(@credentials).tap do |new_client|
      # Run a dummy query to initiate a connection. Workaround for Bulk API problem
      # https://github.com/yatish27/salesforce_bulk_api/issues/33
      new_client.query('SELECT Id FROM Contact LIMIT 1')
    end
  end
end
|
22
|
+
|
23
|
+
|
24
|
+
class DataSource::Salesforce < Remi::DataSubject
  include Remi::DataSubject::DataSource
  include Remi::DataSubject::Salesforce

  # Public: Builds the data source.  All arguments go to the superclass
  # initializer and then to init_salesforce, which requires the object:,
  # query: and credentials: keywords.
  def initialize(*args, **kargs, &block)
    super
    init_salesforce(*args, **kargs, &block)
  end

  # Public: Called to extract data from the source.  Runs the query through
  # the bulk API and raises if any batch did not complete.
  #
  # Returns data in a format that can be used to create a dataframe.
  def extract!
    # NOTE(review): the third argument is presumably the bulk batch size -
    # confirm against the salesforce_bulk_api documentation.
    @extract = sf_bulk.query(@sfo, @query, 10_000)
    check_for_errors(@extract)

    @extract
  end

  # Public: Lazily builds the bulk API wrapper around the Restforce client
  # and sets its connection status throttle to 5.
  #
  # Returns the SalesforceBulkApi::Api instance.
  def sf_bulk
    @sf_bulk ||= begin
      bulk_api = SalesforceBulkApi::Api.new(restforce_client)
      bulk_api.connection.set_status_throttle(5)
      bulk_api
    end
  end

  # Public: Converts extracted data to a dataframe.
  # Currently only supports Daru DataFrames.
  #
  # Returns a Remi::DataFrame
  def to_dataframe
    @logger.info "Converting salesforce query results to a dataframe"

    ignored_fields = %w[xsi:type type]
    columns = {}

    extract['batches'].each do |batch|
      records = batch['response']
      next unless records

      records.each do |record|
        record.each do |field, value|
          next if ignored_fields.include?(field)

          # Only the first element of each value is kept; a Hash flagged
          # with xsi:nil == "true" stands for a null.
          cell = value.first
          cell = nil if cell.is_a?(Hash) && cell["xsi:nil"] == "true"

          (columns[field.to_sym] ||= []) << cell
        end
      end

      # delete raw result at end of processing to free memory
      batch['response'] = nil
    end

    Remi::DataFrame.create(@remi_df_type, columns, order: columns.keys)
  end

  private

  # Internal: Stores the Salesforce-specific settings.
  #
  # object      - The Salesforce object passed to the bulk query.
  # query       - The query text passed to the bulk query.
  # credentials - Options handed to Restforce.new.
  # api         - Stored in @api (default: :bulk); not read in this class.
  #
  # Any other argument is accepted and ignored.
  def init_salesforce(*_args, object:, query:, credentials:, api: :bulk, **_kargs, &_block)
    @sfo = object
    @query = query
    @credentials = credentials
    @api = api
  end

  # Internal: Verifies that every batch of a bulk result completed.
  #
  # sf_result - The Hash returned by the bulk query.
  #
  # Raises RuntimeError for the first batch whose state is not 'Completed'.
  def check_for_errors(sf_result)
    sf_result['batches'].each do |batch|
      next if batch['state'].first == 'Completed'

      raise "Error with batch #{batch['id']} - #{batch['state']}: #{batch['stateMessage']}"
    end
  end
end
|
93
|
+
|
94
|
+
|
95
|
+
class DataTarget::Salesforce < Remi::DataSubject
  include Remi::DataSubject::DataTarget
  include Remi::DataSubject::Salesforce

  # Public: Builds the data target.  All arguments go to the superclass
  # initializer and then to init_salesforce, which requires the object:,
  # operation: and credentials: keywords.
  def initialize(*args, **kargs, &block)
    super
    init_salesforce(*args, **kargs, &block)
  end

  # Public: Performs the load operation, regardless of whether it has
  # already executed.  Dispatches to the bulk helper that matches the
  # configured operation (:update, :create or :upsert).
  #
  # Returns true if the load operation was successful
  # Raises ArgumentError if the operation is not one of the three above.
  def load!
    @logger.info "Performing Salesforce #{@operation} on object #{@sfo}"

    case @operation
    when :update
      Remi::SfBulkHelper::SfBulkUpdate.update(restforce_client, @sfo, df_as_array_of_hashes, logger: @logger)
    when :create
      Remi::SfBulkHelper::SfBulkCreate.create(restforce_client, @sfo, df_as_array_of_hashes, logger: @logger)
    when :upsert
      Remi::SfBulkHelper::SfBulkUpsert.upsert(restforce_client, @sfo, df_as_array_of_hashes, external_id: @external_id, logger: @logger)
    else
      raise ArgumentError, "Unknown operation: #{@operation}"
    end

    true
  end

  private

  # Internal: Stores the Salesforce-specific settings.
  #
  # object      - The Salesforce object to load.
  # operation   - One of :update, :create or :upsert.
  # credentials - Options handed to Restforce.new.
  # external_id - Passed to the upsert helper (default: 'Id').
  # api         - Stored in @api (default: :bulk); not read in this class.
  #
  # Any other argument is accepted and ignored.
  def init_salesforce(*_args, object:, operation:, credentials:, external_id: 'Id', api: :bulk, **_kargs, &_block)
    @sfo = object
    @operation = operation
    @external_id = external_id
    @credentials = credentials
    @api = api
  end
end
|
134
|
+
|
135
|
+
|
136
|
+
end
|
@@ -0,0 +1,66 @@
|
|
1
|
+
module Remi
|
2
|
+
|
3
|
+
class DataTarget::SftpFile < Remi::DataSubject
  include Remi::DataSubject::DataTarget

  # Public: Builds the data target.  All arguments go to the superclass
  # initializer and then to init_sftp_file, which requires the credentials:
  # and local_path: keywords.
  def initialize(*args, **kargs, &block)
    super
    init_sftp_file(*args, **kargs, &block)
  end

  # Public: Path of the local file that gets uploaded.
  attr_reader :local_path

  # Public: Destination path on the SFTP server.
  attr_reader :remote_path

  # Public: Performs the load operation, regardless of whether it has
  # already executed.  Uploads the local file to the remote path, retrying
  # once if the upload raises a RuntimeError.
  #
  # Returns true if the load operation was successful
  def load!
    @logger.info "Uploading #{@local_path} to #{@credentials[:username]}@#{@credentials[:host]}: #{@remote_path}"
    connection do |sftp|
      retry_upload { sftp.upload! @local_path, @remote_path }
    end

    true
  end

  private

  # Internal: Stores the SFTP settings and builds the parameter dataframe.
  #
  # credentials - Hash-like object providing :host, :username, :password
  #               and optionally :port.
  # local_path  - Path of the file to upload.
  # remote_path - Destination path (default: the basename of local_path).
  #
  # Any other argument is accepted and ignored.
  def init_sftp_file(*args, credentials:, local_path:, remote_path: File.basename(local_path), **kargs, &block)
    @credentials = credentials
    @local_path = local_path
    @remote_path = remote_path
    init_df
  end

  # Internal: Sets this subject's dataframe to a Daru dataframe holding the
  # local and remote paths.
  def init_df
    parameter_df = Daru::DataFrame.new(
      local_path: Array(@local_path),
      remote_path: Array(@remote_path)
    )
    self.df = parameter_df
  end

  # Internal: Opens an SFTP session (port '22' unless configured) and
  # yields it to the block.
  #
  # Returns the value of the block.
  def connection(&block)
    result = nil
    Net::SFTP.start(@credentials[:host], @credentials[:username], password: @credentials[:password], port: @credentials[:port] || '22') do |sftp|
      result = yield sftp
    end
    result
  end

  # Internal: Calls the block, retrying when it raises a RuntimeError.
  #
  # ntry - Maximum number of attempts (default: 2).
  #
  # Stops at the first successful attempt so the upload is not repeated.
  # Each failure before the last attempt is logged and followed by a one
  # second pause; the error of the last attempt is re-raised.
  #
  # Returns the value of the block for the successful attempt.
  def retry_upload(ntry=2, &block)
    1.upto(ntry).each do |itry|
      begin
        # Leave as soon as one attempt succeeds.
        return block.call
      rescue RuntimeError => err
        raise err unless itry < ntry
        @logger.error "Upload failed with error: #{err.message}"
        @logger.error "Retry attempt #{itry}/#{ntry-1}"
        sleep(1)
      end
    end
  end
end
|
66
|
+
end
|
data/lib/remi/fields.rb
ADDED
@@ -1,50 +1,61 @@
|
|
1
1
|
module Remi
|
2
2
|
class SourceToTargetMap
|
3
|
-
def initialize(source_df, target_df=nil)
|
3
|
+
def initialize(source_df, target_df=nil, source_metadata: Remi::Fields.new, target_metadata: Remi::Fields.new)
|
4
4
|
@source_df = source_df
|
5
|
-
@
|
5
|
+
@source_metadata = source_metadata
|
6
|
+
|
7
|
+
if target_df
|
8
|
+
@target_df = target_df
|
9
|
+
@target_metadata = target_metadata
|
10
|
+
else
|
11
|
+
@target_df = @source_df
|
12
|
+
@target_metadata = @source_metadata
|
13
|
+
end
|
6
14
|
|
7
15
|
reset_map
|
8
16
|
end
|
9
17
|
|
10
|
-
def self.apply(source_df, target_df=nil, &block)
|
11
|
-
target_df
|
12
|
-
Docile.dsl_eval(
|
18
|
+
def self.apply(source_df, target_df=nil, source_metadata: Remi::Fields.new, target_metadata: Remi::Fields.new, &block)
|
19
|
+
sttm = SourceToTargetMap.new(source_df, target_df, source_metadata: source_metadata, target_metadata: target_metadata)
|
20
|
+
Docile.dsl_eval(sttm, &block)
|
13
21
|
end
|
14
22
|
|
15
|
-
def source(*
|
16
|
-
@
|
23
|
+
def source(*source_vectors)
|
24
|
+
@source_vectors = Array(source_vectors)
|
17
25
|
self
|
18
26
|
end
|
19
27
|
|
20
28
|
# Public: Appends transforms to the current mapping.  Each transform is
# recorded as given and, additionally, in its to_proc form.
#
# transforms - One or more objects responding to to_proc.
#
# Returns self so that calls can be chained.
def transform(*transforms)
  added = Array(transforms)
  @transforms += added
  @transform_procs += added.map(&:to_proc)
  self
end
|
24
33
|
|
25
|
-
def target(*
|
26
|
-
@
|
34
|
+
def target(*target_vectors)
|
35
|
+
@target_vectors = Array(target_vectors)
|
27
36
|
self
|
28
37
|
end
|
29
38
|
|
30
39
|
def reset_map
|
31
|
-
@
|
32
|
-
@
|
40
|
+
@source_vectors = []
|
41
|
+
@target_vectors = []
|
33
42
|
@transforms = []
|
43
|
+
@transform_procs = []
|
34
44
|
end
|
35
45
|
|
36
46
|
def map(*args)
|
47
|
+
inject_transform_with_metadata
|
48
|
+
|
37
49
|
case
|
38
|
-
when @
|
50
|
+
when @source_vectors.include?(nil)
|
39
51
|
do_map_generic
|
40
|
-
when @
|
52
|
+
when @source_vectors.size == 1 && @transforms.size == 0
|
41
53
|
do_map_direct_copy
|
42
|
-
when @
|
43
|
-
|
54
|
+
when @source_vectors.size == 1 && @target_vectors.size == 1
|
55
|
+
do_map_single_source_and_target_vector
|
44
56
|
else
|
45
57
|
do_map_generic
|
46
58
|
end
|
47
|
-
|
48
59
|
reset_map
|
49
60
|
end
|
50
61
|
|
@@ -52,39 +63,52 @@ module Remi
|
|
52
63
|
|
53
64
|
private
|
54
65
|
|
66
|
+
# Internal: Hands field metadata to every registered transform that
# exposes a source_metadata and/or target_metadata attribute.
#
# For each side the metadata of the currently selected vectors is looked
# up (an empty Hash stands in for unknown vectors).  A transform receives
# an Array when more than one vector is selected, otherwise the single
# entry (nil when no vector is selected).
def inject_transform_with_metadata
  # Built per call so that every transform receives its own objects.
  metadata_for = ->(vectors, fields) do
    entries = vectors.map { |vector| fields[vector] || {} }
    entries.size > 1 ? entries : entries.first
  end

  @transforms.each do |transform|
    if transform.respond_to?(:source_metadata)
      transform.source_metadata = metadata_for.call(@source_vectors, @source_metadata)
    end

    if transform.respond_to?(:target_metadata)
      transform.target_metadata = metadata_for.call(@target_vectors, @target_metadata)
    end
  end
end
|
78
|
+
|
55
79
|
def do_map_direct_copy
|
56
|
-
@
|
57
|
-
@target_df[
|
80
|
+
@target_vectors.each do |target_vector|
|
81
|
+
@target_df[target_vector] = @source_df[@source_vectors.first].dup
|
58
82
|
end
|
59
83
|
end
|
60
84
|
|
61
|
-
def
|
62
|
-
@target_df[@
|
63
|
-
@
|
85
|
+
# Internal: Maps one source vector onto one target vector.  Every value
# of the source vector is piped through the transform procs in order and
# the recoded vector is stored in the target dataframe.
#
# Returns the recoded vector.
def do_map_single_source_and_target_vector
  source_vector = @source_df[@source_vectors.first]

  recoded = source_vector.recode do |vector_value|
    @transform_procs.inject(vector_value) do |memo, tform|
      # nil/false is passed as a single nil argument; an Array result of
      # the previous transform is splatted into the next one.
      call_args = memo || [nil]
      tform.call(*call_args)
    end
  end

  @target_df[@target_vectors.first] = recoded
end
|
66
90
|
|
67
91
|
def do_map_generic
|
68
|
-
work_vector = if @
|
69
|
-
@source_df[@
|
70
|
-
elsif @
|
92
|
+
work_vector = if @source_vectors.size == 1 && @source_vectors.first != nil
|
93
|
+
@source_df[@source_vectors.first].dup
|
94
|
+
elsif @source_vectors.size > 1
|
71
95
|
# It's faster to zip together several vectors and recode those than it is to
|
72
96
|
# recode a dataframe row by row!
|
73
|
-
Daru::Vector.new(@source_df[@
|
97
|
+
Daru::Vector.new(@source_df[@source_vectors.first].zip(*@source_vectors[1..-1].map { |name| @source_df[name] }), index: @source_df.index)
|
74
98
|
else
|
75
99
|
Daru::Vector.new([], index: @source_df.index)
|
76
100
|
end
|
77
101
|
|
78
|
-
work_vector.recode! do |
|
79
|
-
@
|
102
|
+
work_vector.recode! do |vector_value|
|
103
|
+
@transform_procs.reduce(vector_value) { |value, tform| tform.call(*(value || [nil])) }
|
80
104
|
end
|
81
105
|
|
82
|
-
@
|
83
|
-
@target_df[
|
84
|
-
if
|
85
|
-
|
106
|
+
@target_vectors.each_with_index do |target_vector, vector_idx|
|
107
|
+
@target_df[target_vector] = work_vector.recode do |vector_value|
|
108
|
+
if vector_value.is_a?(Array) then
|
109
|
+
vector_value[vector_idx]
|
86
110
|
else
|
87
|
-
|
111
|
+
vector_value
|
88
112
|
end
|
89
113
|
end
|
90
114
|
end
|