remi 0.3.2 → 0.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 0de4e8f2de3129e2e4b93c3d22dc5f718a05b56a
4
- data.tar.gz: d963548c553f1918b33bd391038bc3481ce4a5d8
3
+ metadata.gz: 051d6add4664343ee59a6c722a2abadc15ea4377
4
+ data.tar.gz: f7f438b794a08948617b767dfca83e58533300ad
5
5
  SHA512:
6
- metadata.gz: d01e67e38c2a76784e65a22536d2d9cba7c9f56dc3686e8d0d23ea1e5176cb8495cd06977ba85357d336cbf7fd91641f79795d68df69542ce9e94b39bc85c6ec
7
- data.tar.gz: 07cec77fc7c40299207081f5ea7390cdd3cf863ac49295e9de166e4736ff95ae5d57e642de2a10ea74d1d812aeeff2ed4ffdb6a93c232658efc7552332a9f1e9
6
+ metadata.gz: d0e46a405da1e48dc0b82afe9c350b26f0b436fe99aa1cdf9500d035ec5a01612257c679338063920d370ead1e09c68ec35f7de9d732338417874a396ed8634c
7
+ data.tar.gz: d39aaad0be382a3f70359c03eeaa1b13ef9a594967cd029014da40ca2882cffe53a81a534cdf4c8221399fb10636e5c1304ac591e6a0ef589ecd1d0fab1b07f2
data/Gemfile CHANGED
@@ -7,4 +7,4 @@ gem 'daru', '0.1.4.1', git: 'git@github.com:inside-track/daru.git', branch: '0.1
7
7
  gem 'restforce', '~> 2.1'
8
8
  gem 'salesforce_bulk_api', git: 'git@github.com:inside-track/salesforce_bulk_api.git', branch: 'master'
9
9
  gem 'soapforce', '~> 0.5'
10
- gem 'aws-sdk', '~> 2.3'
10
+ gem 'aws-sdk', '~> 2.10'
@@ -18,7 +18,7 @@ GIT
18
18
  PATH
19
19
  remote: .
20
20
  specs:
21
- remi (0.3.2)
21
+ remi (0.3.3)
22
22
  activesupport (~> 4.2)
23
23
  bond (~> 0.5)
24
24
  cucumber (~> 2.1)
@@ -43,12 +43,14 @@ GEM
43
43
  akami (1.3.1)
44
44
  gyoku (>= 0.4.0)
45
45
  nokogiri
46
- aws-sdk (2.3.5)
47
- aws-sdk-resources (= 2.3.5)
48
- aws-sdk-core (2.3.5)
46
+ aws-sdk (2.10.3)
47
+ aws-sdk-resources (= 2.10.3)
48
+ aws-sdk-core (2.10.3)
49
+ aws-sigv4 (~> 1.0)
49
50
  jmespath (~> 1.0)
50
- aws-sdk-resources (2.3.5)
51
- aws-sdk-core (= 2.3.5)
51
+ aws-sdk-resources (2.10.3)
52
+ aws-sdk-core (= 2.10.3)
53
+ aws-sigv4 (1.0.0)
52
54
  backports (3.6.8)
53
55
  bond (0.5.1)
54
56
  builder (3.2.2)
@@ -104,10 +106,8 @@ GEM
104
106
  mimemagic (~> 0.3)
105
107
  multi_json (~> 1.11)
106
108
  rbczmq (~> 1.7)
107
- jmespath (1.2.4)
108
- json_pure (>= 1.8.1)
109
+ jmespath (1.3.1)
109
110
  json (1.8.3)
110
- json_pure (1.8.3)
111
111
  jwt (1.5.6)
112
112
  little-plugger (1.1.4)
113
113
  logging (2.1.0)
@@ -187,7 +187,7 @@ PLATFORMS
187
187
  ruby
188
188
 
189
189
  DEPENDENCIES
190
- aws-sdk (~> 2.3)
190
+ aws-sdk (~> 2.10)
191
191
  daru (= 0.1.4.1)!
192
192
  github-markup (~> 1.4)
193
193
  google-api-client (~> 0.9)
@@ -200,4 +200,4 @@ DEPENDENCIES
200
200
  yard (~> 0.9)
201
201
 
202
202
  BUNDLED WITH
203
- 1.14.3
203
+ 1.15.1
@@ -0,0 +1,10 @@
1
+ Feature: Tests targets that are S3 Files.
2
+
3
+ Background:
4
+ Given the job is 'S3 File Target'
5
+ And the job target 'Some File'
6
+
7
+ Scenario: Defining the remote path.
8
+ Given the target 'Some File'
9
+ Then the file is uploaded to the S3 bucket "the-big-one"
10
+ And the file is uploaded to the remote path "some_file_*Today: %Y%m%d*.csv"
@@ -69,6 +69,14 @@ Then /^the file is uploaded to the remote path "([^"]+)"$/ do |remote_path|
69
69
  expect(@brt.target.data_subject.loaders.map(&:remote_path)).to include expected_path
70
70
  end
71
71
 
72
+ Then /^the file is uploaded to the S3 bucket "([^"]+)"$/ do |bucket_name|
73
+ expected_bucket_name = Remi::Testing::BusinessRules::ParseFormula.parse(bucket_name)
74
+ bucket_names = @brt.target.data_subject.loaders.map do |loader|
75
+ loader.bucket_name if loader.respond_to? :bucket_name
76
+ end
77
+ expect(bucket_names).to include expected_bucket_name
78
+ end
79
+
72
80
  ## CSV Options
73
81
 
74
82
  Given /^the (source|target) file is delimited with a (\w+)$/ do |st, delimiter|
@@ -124,6 +132,16 @@ Given /^the (source|target) file contains all of the following headers in this o
124
132
  expect(@brt.send(st.to_sym).data_subject.df.vectors.to_a).to eq @brt.send(st.to_sym).fields.field_names
125
133
  end
126
134
 
135
+ Given /^the (source|target) file contains all of the following headers in no particular order:$/ do |st, table|
136
+ table.rows.each do |row|
137
+ field = row.first
138
+ step "the #{st} field '#{field}'"
139
+ end
140
+
141
+ @brt.run_transforms if st == 'target'
142
+ expect(@brt.send(st.to_sym).data_subject.df.vectors.to_a).to match_array @brt.send(st.to_sym).fields.field_names
143
+ end
144
+
127
145
  ### Source
128
146
 
129
147
  Given /^the source '([[:alnum:]\s\-_]+)'$/ do |arg|
@@ -260,6 +278,7 @@ Then /^the target field '([^']+)' has the label '([^']+)'$/ do |target_field, la
260
278
  data_field = @brt.targets.fields.next
261
279
  expect(data_field.metadata[:label]).to eq label
262
280
  expect(data_field.name).to eq target_field
281
+
263
282
  end
264
283
 
265
284
  Then /^the target field '([^']+)' is copied from the source field$/ do |target_field|
@@ -780,3 +799,10 @@ Then /^the target '([[:alnum:]\s\-_]+)' has (\d+) record(?:s|) where '([[:alnum:
780
799
  @brt.run_transforms
781
800
  expect(@brt.targets[target_name].where_between(field_name, low_value, high_value).size).to eq nrecords.to_i
782
801
  end
802
+
803
+ Then /^the target field '([^']+)' (?:has|is set to) the multiline value$/ do |target_field, value|
804
+ step "the target field '#{target_field}'"
805
+ @brt.run_transforms
806
+ target_name, target_field_name = @brt.targets.parse_full_field(target_field)
807
+ expect(@brt.targets[target_name].fields[target_field_name].value).to eq Remi::Testing::BusinessRules::ParseFormula.parse(value)
808
+ end
@@ -0,0 +1,23 @@
1
+ require_relative 'all_jobs_shared'
2
+ require 'aws-sdk'
3
+
4
+ class S3FileTargetJob < Remi::Job
5
+ target :some_file do
6
+ encoder Remi::Encoder::CsvFile.new
7
+ loader Remi::Loader::S3File.new(
8
+ credentials: {
9
+ aws_access_key_id: 'blort',
10
+ aws_secret_access_key: 'blerg',
11
+ region: 'us-west-2'
12
+ },
13
+ kms_opt: {
14
+ ciphertext: 'blergity'
15
+ },
16
+ bucket: 'the-big-one',
17
+ remote_path: "some_file_#{DateTime.current.strftime('%Y%m%d')}.csv"
18
+ )
19
+ end
20
+
21
+ transform :main do
22
+ end
23
+ end
@@ -93,7 +93,11 @@ module Remi
93
93
  sttm = SourceToTargetMap.new(df, source_metadata: fields)
94
94
  fields.keys.each do |field|
95
95
  next unless (types.size == 0 || types.include?(fields[field][:type])) && df.vectors.include?(field)
96
- sttm.source(field).target(field).transform(Remi::Transform::EnforceType.new).execute
96
+ begin
97
+ sttm.source(field).target(field).transform(Remi::Transform::EnforceType.new).execute
98
+ rescue StandardError => err
99
+ raise ArgumentError, "Field '#{field}': #{err.message}"
100
+ end
97
101
  end
98
102
 
99
103
  self
@@ -8,10 +8,10 @@ module Remi
8
8
  end
9
9
 
10
10
 
11
- # The None Parser just returns what it is given.
11
+ # The None Parser returns the data it is given unchanged, or an empty dataframe if it's not given any data
12
12
  class Parser::None < Parser
13
- def parse(data)
14
- data
13
+ def parse(data=nil)
14
+ data || Remi::DataFrame::Daru.new([], order: fields.keys)
15
15
  end
16
16
  end
17
17
 
@@ -1,13 +1,59 @@
1
1
  module Remi
2
2
 
3
+ module DataSubject::S3File
4
+ attr_accessor :region
5
+ attr_accessor :aws_credentials
6
+
7
+ def init_aws_credentials(credentials)
8
+ @aws_credentials = Aws::Credentials.new(
9
+ credentials.fetch(:aws_access_key_id, ENV['AWS_ACCESS_KEY_ID']),
10
+ credentials.fetch(:aws_secret_access_key, ENV['AWS_SECRET_ACCESS_KEY'])
11
+ )
12
+ end
13
+
14
+ def s3
15
+ @s3 ||= Aws::S3::Resource.new(
16
+ credentials: aws_credentials,
17
+ region: region
18
+ )
19
+ end
20
+
21
+ def encrypt_args
22
+ @kms_args || {}
23
+ end
24
+
25
+ def init_kms(opt)
26
+ return nil unless opt
27
+
28
+ kms = Aws::KMS::Client.new(
29
+ region: @region,
30
+ credentials: @aws_credentials
31
+ )
32
+
33
+ ciphertext = opt.fetch(:ciphertext)
34
+ algorithm = opt.fetch(:algorithm, 'AES256')
35
+ key = kms.decrypt(ciphertext_blob: Base64.decode64(ciphertext)).plaintext
36
+
37
+ @kms_args = {
38
+ sse_customer_algorithm: algorithm,
39
+ sse_customer_key: key
40
+ }
41
+ end
42
+ end
43
+
3
44
  # S3 File extractor
4
45
  # Used to extract files from Amazon S3
5
46
  #
6
- # @example
47
+ # @example Standard use
7
48
  #
8
49
  # class MyJob < Remi::Job
9
50
  # source :some_file do
10
51
  # extractor Remi::Extractor::S3File.new(
52
+ # credentials: {
53
+ # aws_access_key_id: ENV['AWS_ACCESS_KEY_ID'],
54
+ # aws_secret_access_key: ENV['AWS_SECRET_ACCESS_KEY'],
55
+ # region: 'us-west-2'
56
+ # },
11
57
  # bucket: 'my-awesome-bucket',
12
58
  # remote_path: 'some_file-',
13
59
  # most_recent_only: true
@@ -28,9 +74,40 @@ module Remi
28
74
  # # 0 1 Albert
29
75
  # # 1 2 Betsy
30
76
  # # 2 3 Camu
77
+ #
78
+ # @example Using AWS KMS
79
+ # To use AWS KMS, supply a :ciphertext and optional :algorithm (default is AES256).
80
+ # The encrypted key stored in the ciphertext must be the same as that used when the file was written.
81
+ #
82
+ # class MyJob < Remi::Job
83
+ # source :some_file do
84
+ # extractor Remi::Extractor::S3File.new(
85
+ # credentials: {
86
+ # aws_access_key_id: ENV['AWS_ACCESS_KEY_ID'],
87
+ # aws_secret_access_key: ENV['AWS_SECRET_ACCESS_KEY'],
88
+ # region: 'us-west-2'
89
+ # },
90
+ # bucket: 'my-awesome-bucket',
91
+ # remote_path: 'some_file-',
92
+ # most_recent_only: true,
93
+ # kms_opt: {
94
+ # ciphertext: '<base64-encoded ciphertext>'
95
+ # }
96
+ # )
97
+ # parser Remi::Parser::CsvFile.new(
98
+ # csv_options: {
99
+ # headers: true,
100
+ # col_sep: '|'
101
+ # }
102
+ # )
103
+ # end
104
+ # end
31
105
  class Extractor::S3File < Extractor::FileSystem
106
+ include Remi::DataSubject::S3File
32
107
 
33
- # @param bucket_name [String] S3 bucket containing the files
108
+ # @param bucket [String] Name of S3 bucket containing the files
109
+ # @param kms_opt [Hash] Hash containing AWS KMS options
110
+ # @param credentials [Hash] Hash containing AWS credentials (:aws_access_key_id and :aws_secret_access_key default to the ENV variables of the same upper-cased names; :region defaults to 'us-west-2')
34
111
  def initialize(*args, **kargs, &block)
35
112
  super
36
113
  init_s3_file(*args, **kargs, &block)
@@ -39,10 +116,12 @@ module Remi
39
116
  # Called to extract files from the source filesystem.
40
117
  # @return [Array<String>] An array of paths to a local copy of the files extracted
41
118
  def extract
119
+ init_kms(@kms_opt)
120
+
42
121
  entries.map do |entry|
43
122
  local_file = File.join(@local_path, entry.name)
44
123
  logger.info "Downloading #{entry.pathname} from S3 to #{local_file}"
45
- File.open(local_file, 'wb') { |file| entry.raw.get(response_target: file) }
124
+ File.open(local_file, 'wb') { |file| entry.raw.get({ response_target: file }.merge(encrypt_args)) }
46
125
  local_file
47
126
  end
48
127
  end
@@ -55,7 +134,7 @@ module Remi
55
134
  # @return [Array<Extractor::FileSystemEntry>] List of objects in the bucket/prefix
56
135
  def all_entries!
57
136
  # S3 does not track anything like a create time, so use last modified for both
58
- bucket.objects(prefix: @remote_path.to_s).map do |entry|
137
+ s3.bucket(@bucket_name).objects(prefix: @remote_path.to_s).map do |entry|
59
138
  Extractor::FileSystemEntry.new(
60
139
  pathname: entry.key,
61
140
  create_time: entry.last_modified,
@@ -65,20 +144,128 @@ module Remi
65
144
  end
66
145
  end
67
146
 
68
- # @return [Aws::S3::Client] The S3 client used
69
- def s3_client
70
- @s3_client ||= Aws::S3::Client.new
71
- end
72
-
73
147
  private
74
148
 
75
- def init_s3_file(*args, bucket:, **kargs)
149
+ def init_s3_file(*args, credentials: {}, bucket:, kms_opt: nil, **kargs)
150
+ @region = credentials.fetch(:region, 'us-west-2')
151
+ @kms_opt = kms_opt
152
+ init_aws_credentials(credentials)
153
+
76
154
  @bucket_name = bucket
77
155
  end
156
+ end
157
+
158
+
159
+
160
+ # S3 File loader
161
+ # Used to post files to Amazon S3
162
+ #
163
+ # @example Standard use
164
+ #
165
+ # class MyJob < Remi::Job
166
+ # target :some_file do
167
+ # encoder Remi::Encoder::CsvFile.new
168
+ # loader Remi::Loader::S3File.new(
169
+ # credentials: {
170
+ # aws_access_key_id: ENV['AWS_ACCESS_KEY_ID'],
171
+ # aws_secret_access_key: ENV['AWS_SECRET_ACCESS_KEY'],
172
+ # region: 'us-west-2'
173
+ # },
174
+ # bucket: 'itk-de-archive',
175
+ # remote_path: 'awesome.csv'
176
+ # )
177
+ # end
178
+ # end
179
+ #
180
+ # job = MyJob.new
181
+ # job.some_file.df = Daru::DataFrame.new(
182
+ # {
183
+ # numbers: [1,2,3],
184
+ # words: ['one', 'two', 'three']
185
+ # }
186
+ # )
187
+ # job.some_file.load
188
+ #
189
+ # @example Using AWS KMS
190
+ # To use AWS KMS, supply a :ciphertext and optional :algorithm (default is AES256).
191
+ # The encrypted key stored in the ciphertext must be the same as that used for reading the file.
192
+ #
193
+ # class MyJob < Remi::Job
194
+ # target :some_file do
195
+ # encoder Remi::Encoder::CsvFile.new
196
+ # loader Remi::Loader::S3File.new(
197
+ # credentials: {
198
+ # aws_access_key_id: ENV['AWS_ACCESS_KEY_ID'],
199
+ # aws_secret_access_key: ENV['AWS_SECRET_ACCESS_KEY'],
200
+ # region: 'us-west-2'
201
+ # },
202
+ # bucket: 'itk-de-archive',
203
+ # remote_path: 'awesome.csv',
204
+ # kms_opt: {
205
+ # ciphertext: '<base64-encoded ciphertext>'
206
+ # }
207
+ # )
208
+ # end
209
+ # end
210
+ #
211
+ # @example Generating a ciphertext
212
+ # A ciphertext can be generated using the AWS SDK
213
+ #
214
+ # require 'aws-sdk'
215
+ # require 'base64'
216
+ #
217
+ # aws_credentials = Aws::Credentials.new(
218
+ # ENV['AWS_ACCESS_KEY_ID'],
219
+ # ENV['AWS_SECRET_ACCESS_KEY']
220
+ # )
221
+ #
222
+ # kms = Aws::KMS::Client.new(
223
+ # region: 'us-west-2',
224
+ # credentials: aws_credentials
225
+ # )
226
+ #
227
+ # # See AWS docs for creating keys: http://docs.aws.amazon.com/kms/latest/developerguide/create-keys.html
228
+ # data_key = kms.generate_data_key(
229
+ # key_id: 'alias/alias-of-kms-key',
230
+ # key_spec: 'AES_256'
231
+ # )
232
+ #
233
+ # ciphertext = Base64.strict_encode64(data_key.ciphertext_blob)
234
+ # #=> "AQIDAHjmmRVcBAdMHsA9VUoJKgbW8niK2qL1qPcQ2OWEUlh5XAFw0vfl+QIgawB8cbAZ2OqXAAAAfjB8BgkqhkiG9w0BBwagbzBtAgEAMGgGCSqGSIb3DQEHATAeBglghkgBZQMEAS4wEQQMIUIFFh++2w4d9al7AgEQgDvSRXQCOPLSMOjRS/lM5uxuyRV47qInlKKBIezIaYzXuFu1sRU+L46HqRyS0XqR4flFJ/fc8yEj3pU1UA=="
235
+ class Loader::S3File < Loader
236
+ include Remi::DataSubject::S3File
237
+
238
+ # @param bucket [String] Name of S3 bucket the file is uploaded to
239
+ # @param kms_opt [Hash] Hash containing AWS KMS options
240
+ # @param credentials [Hash] Hash containing AWS credentials (:aws_access_key_id and :aws_secret_access_key default to the ENV variables of the same upper-cased names; :region defaults to 'us-west-2')
241
+ def initialize(*args, **kargs, &block)
242
+ super
243
+ init_s3_loader(*args, **kargs, &block)
244
+ end
245
+
246
+ attr_reader :remote_path
247
+ attr_reader :bucket_name
78
248
 
79
- def bucket
80
- @bucket ||= Aws::S3::Bucket.new(@bucket_name, client: s3_client)
249
+ # Copies data to S3
250
+ # @param data [Object] The path to the file in the temporary work location
251
+ # @return [true] On success
252
+ def load(data)
253
+ init_kms(@kms_opt)
254
+
255
+ @logger.info "Writing file #{data} to S3 #{@bucket_name} as #{@remote_path}"
256
+ s3.bucket(@bucket_name).object(@remote_path).upload_file(data, encrypt_args)
257
+ true
81
258
  end
82
259
 
260
+ private
261
+
262
+ def init_s3_loader(*args, credentials:{}, bucket:, remote_path:, kms_opt: nil, **kargs, &block)
263
+ @region = credentials.fetch(:region, 'us-west-2')
264
+ @kms_opt = kms_opt
265
+ init_aws_credentials(credentials)
266
+
267
+ @bucket_name = bucket
268
+ @remote_path = remote_path
269
+ end
83
270
  end
84
271
  end
@@ -185,11 +185,11 @@ module Remi
185
185
  if @operation == :update
186
186
  Remi::SfBulkHelper::SfBulkUpdate.update(restforce_client, @sfo, data, batch_size: @batch_size, logger: logger)
187
187
  elsif @operation == :create
188
- Remi::SfBulkHelper::SfBulkCreate.create(restforce_client, @sfo, data, batch_size: @batch_size, logger: logger)
188
+ Remi::SfBulkHelper::SfBulkCreate.create(restforce_client, @sfo, data, batch_size: @batch_size, max_attempts: 1, logger: logger)
189
189
  elsif @operation == :upsert
190
190
  Remi::SfBulkHelper::SfBulkUpsert.upsert(restforce_client, @sfo, data, batch_size: @batch_size, external_id: @external_id, logger: logger)
191
191
  elsif @operation == :delete
192
- Remi::SfBulkHelper::SfBulkDelete.upsert(restforce_client, @sfo, data, batch_size: @batch_size, logger: logger)
192
+ Remi::SfBulkHelper::SfBulkDelete.delete(restforce_client, @sfo, data, batch_size: @batch_size, logger: logger)
193
193
  else
194
194
  raise ArgumentError, "Unknown operation: #{@operation}"
195
195
  end
@@ -79,7 +79,9 @@ module Remi
79
79
  end
80
80
 
81
81
  merge_id = Array(row.delete(@merge_id_field))
82
- soapforce_client.merge(@sfo, row, merge_id)
82
+ merge_row = row.select { |_, v| !v.blank? }
83
+ logger.info "Merging Id #{merge_id} into #{merge_row}"
84
+ soapforce_client.merge!(@sfo, merge_row, merge_id)
83
85
  end
84
86
  else
85
87
  raise ArgumentError, "Unknown soap operation: #{@operation}"
@@ -1,4 +1,44 @@
1
1
  module Remi
2
+ module DataSubject::SftpFile
3
+
4
+ attr_reader :sftp_session
5
+
6
+ def sftp_retry(&block)
7
+ tries ||= @retries
8
+
9
+ block.call
10
+ rescue StandardError => err
11
+ if (tries -= 1) > 0
12
+ logger.error "Error: #{err.message}"
13
+ logger.error "Will retry #{tries} more times"
14
+ sleep(1)
15
+ retry
16
+ else
17
+ raise err
18
+ end
19
+ end
20
+
21
+ def begin_connection
22
+ sftp_retry do
23
+ Timeout.timeout(@timeout) do
24
+ @ssh_session = Net::SSH.start(@host, @username, password: @password, port: @port, number_of_password_prompts: 0)
25
+ @sftp_session = Net::SFTP::Session.new(@ssh_session)
26
+ @sftp_session.connect!
27
+ end
28
+ end
29
+ end
30
+
31
+ def end_connection
32
+ @sftp_session.close_channel unless @sftp_session.nil?
33
+ @ssh_session.close unless @ssh_session.nil?
34
+
35
+ Timeout.timeout(@timeout) do
36
+ sleep 1 until (@sftp_session.nil? || @sftp_session.closed?) && (@ssh_session.nil? || @ssh_session.closed?)
37
+ end
38
+ end
39
+ end
40
+
41
+
2
42
 
3
43
  # Sftp File extractor
4
44
  # Used to extract files from an SFTP server
@@ -35,13 +75,15 @@ module Remi
35
75
  # # 1 2 Betsy
36
76
  # # 2 3 Camu
37
77
  class Extractor::SftpFile < Extractor::FileSystem
38
- N_RETRY = 3
78
+ include DataSubject::SftpFile
39
79
 
40
80
  # @param credentials [Hash] Options hash containing login credentials
41
81
  # @param credentials [String] :host SFTP host (e.g., coolserver.com)
42
82
  # @param credentials [String] :username SFTP username
43
83
  # @param credentials [String] :password SFTP password
44
84
  # @param credentials [String] :port SFTP port (default: 22)
85
+ # @param retries [Integer] Number of times a connection or operation will be retried (default: 3)
86
+ # @param timeout [Integer] Number of seconds to wait for establishing/closing a connection (default: 30)
45
87
  def initialize(*args, **kargs, &block)
46
88
  super
47
89
  init_sftp_extractor(*args, **kargs)
@@ -55,15 +97,16 @@ module Remi
55
97
  # Called to extract files from the source filesystem.
56
98
  # @return [Array<String>] An array of paths to a local copy of the files extracted
57
99
  def extract
58
- connection do |sftp|
59
- entries.map do |entry|
60
- local_file = File.join(@local_path, entry.name)
61
- logger.info "Downloading #{entry.name} to #{local_file}"
62
- retry_download { sftp.download!(File.join(@remote_path, entry.name), local_file) }
63
- local_file
100
+ begin_connection
64
101
 
65
- end
102
+ entries.map do |entry|
103
+ local_file = File.join(@local_path, entry.name)
104
+ logger.info "Downloading #{entry.name} to #{local_file}"
105
+ sftp_retry { sftp_session.download!(File.join(@remote_path, entry.name), local_file) }
106
+ local_file
66
107
  end
108
+ ensure
109
+ end_connection
67
110
  end
68
111
 
69
112
  # @return [Array<Extractor::FileSystemEntry>] (Memoized) list of objects in the bucket/prefix
@@ -73,8 +116,7 @@ module Remi
73
116
 
74
117
  # @return [Array<Extractor::FileSystemEntry>] (Memoized) list of objects in the bucket/prefix
75
118
  def all_entries!
76
- sftp_entries = connection { |sftp| sftp.dir.entries(@remote_path) }
77
- sftp_entries.map do |entry|
119
+ sftp_session.dir.entries(@remote_path).map do |entry|
78
120
  # Early versions of the protocol don't support create time, fake it with modified time?
79
121
  FileSystemEntry.new(
80
122
  pathname: File.join(@remote_path, entry.name),
@@ -87,33 +129,13 @@ module Remi
87
129
 
88
130
  private
89
131
 
90
- def init_sftp_extractor(*args, credentials:, **kargs)
132
+ def init_sftp_extractor(*args, credentials:, retries: 3, timeout: 30, **kargs)
91
133
  @host = credentials.fetch(:host)
92
134
  @username = credentials.fetch(:username)
93
- @password = credentials.fetch(:password)
135
+ @password = credentials.fetch(:password, nil)
94
136
  @port = credentials.fetch(:port, '22')
95
- end
96
-
97
- def connection(&block)
98
- result = nil
99
- Net::SFTP.start(@host, @username, password: @password, port: @port) do |sftp|
100
- result = yield sftp
101
- end
102
- result
103
- end
104
-
105
- def retry_download(&block)
106
- 1.upto(N_RETRY).each do |itry|
107
- begin
108
- block.call
109
- break
110
- rescue RuntimeError => err
111
- raise err unless itry < N_RETRY
112
- logger.error "Download failed with error: #{err.message}"
113
- logger.error "Retry attempt #{itry}/#{N_RETRY-1}"
114
- sleep(1)
115
- end
116
- end
137
+ @retries = retries
138
+ @timeout = timeout
117
139
  end
118
140
  end
119
141
 
@@ -143,8 +165,16 @@ module Remi
143
165
  # job.my_target.df = my_df
144
166
  # job.my_target.load
145
167
  class Loader::SftpFile < Loader
168
+ include DataSubject::SftpFile
146
169
 
170
+ # @param credentials [Hash] Options hash containing login credentials
171
+ # @param credentials [String] :host SFTP host (e.g., coolserver.com)
172
+ # @param credentials [String] :username SFTP username
173
+ # @param credentials [String] :password SFTP password (optional; defaults to nil)
174
+ # @param credentials [String] :port SFTP port (default: 22)
147
175
  # @param remote_path [String, Pathname] Full path to the file to be created on the target filesystem
176
+ # @param retries [Integer] Number of times a connection or operation will be retried (default: 3)
177
+ # @param timeout [Integer] Number of seconds to wait for establishing/closing a connection (default: 30)
148
178
  def initialize(*args, **kargs, &block)
149
179
  super
150
180
  init_sftp_loader(*args, **kargs, &block)
@@ -156,42 +186,27 @@ module Remi
156
186
  # @param data [Object] The path to the file in the temporary work location
157
187
  # @return [true] On success
158
188
  def load(data)
159
- logger.info "Uploading #{data} to #{@credentials[:username]}@#{@credentials[:host]}: #{@remote_path}"
160
- connection do |sftp|
161
- retry_upload { sftp.upload! data, @remote_path }
162
- end
189
+ begin_connection
190
+
191
+ logger.info "Uploading #{data} to #{@username}@#{@host}: #{@remote_path}"
192
+ sftp_retry { sftp_session.upload! data, @remote_path }
163
193
 
164
194
  true
195
+ ensure
196
+ end_connection
165
197
  end
166
198
 
167
199
 
168
200
  private
169
201
 
170
- def init_sftp_loader(*args, credentials:, remote_path:, **kargs, &block)
171
- @credentials = credentials
202
+ def init_sftp_loader(*args, credentials:, remote_path:, retries: 3, timeout: 30, **kargs, &block)
203
+ @host = credentials.fetch(:host)
204
+ @username = credentials.fetch(:username)
205
+ @password = credentials.fetch(:password, nil)
206
+ @port = credentials.fetch(:port, '22')
172
207
  @remote_path = remote_path
173
- end
174
-
175
- def connection(&block)
176
- result = nil
177
- Net::SFTP.start(@credentials[:host], @credentials[:username], password: @credentials[:password], port: @credentials[:port] || '22') do |sftp|
178
- result = yield sftp
179
- end
180
- result
181
- end
182
-
183
- def retry_upload(ntry=2, &block)
184
- 1.upto(ntry).each do |itry|
185
- begin
186
- block.call
187
- break
188
- rescue RuntimeError => err
189
- raise err unless itry < ntry
190
- logger.error "Upload failed with error: #{err.message}"
191
- logger.error "Retry attempt #{itry}/#{ntry-1}"
192
- sleep(1)
193
- end
194
- end
208
+ @retries = retries
209
+ @timeout = timeout
195
210
  end
196
211
  end
197
212
  end
@@ -15,6 +15,7 @@ module Remi
15
15
  # end
16
16
  # tform.execute
17
17
  class Transform
18
+ class IncompatibleTargetIndexError < StandardError; end
18
19
 
19
20
  FieldMap = Struct.new(:from_subject, :to_subject, :field_from_to)
20
21
 
@@ -152,6 +153,19 @@ module Remi
152
153
  sub_trans_ds = field_map.from_subject
153
154
  fields_to_map = field_map.field_from_to.keys
154
155
 
156
+ job_idx = job_ds.df.index.to_a
157
+ sub_idx = sub_trans_ds.df.index.to_a
158
+ diff = ((job_idx | sub_idx) - (job_idx & sub_idx))
159
+ if job_idx.size > 0 && diff.size > 0 then
160
+ msg = <<-EOT
161
+ Incompatible target index!
162
+ Sub transform target #{sub_trans_ds.name} index is #{sub_trans_ds.df.index.inspect}
163
+ Job transform target #{job_ds.name} index is #{job_ds.df.index.inspect}
164
+ EOT
165
+ raise IncompatibleTargetIndexError.new msg
166
+ end
167
+
168
+
155
169
  fields_to_map.each do |sub_trans_field|
156
170
  job_field = field_map.field_from_to[sub_trans_field]
157
171
  job_ds.fields[job_field].merge! sub_trans_ds.fields[sub_trans_field]
@@ -128,10 +128,13 @@ module Remi
128
128
 
129
129
  # Private: Converts the transformed data into vectors in the target dataframe.
130
130
  def map_to_target_df
131
+ index = @target_df.index.size > 0 ? @target_df.index : @source_df.index
132
+
131
133
  result_hash_of_arrays.each do |vector, values|
132
- @target_df[vector] = Daru::Vector.new(values, index: @source_df.index)
134
+ @target_df[vector] = Daru::Vector.new(values, index: index)
133
135
  end
134
136
 
137
+ @target_df.index = index
135
138
  @target_df
136
139
  end
137
140
 
@@ -550,6 +550,9 @@ module Remi
550
550
  raise ArgumentError, "Unknown type enforcement: #{type}"
551
551
  end
552
552
  end
553
+
554
+ rescue StandardError => err
555
+ raise ArgumentError, "Unable to convert value '#{value}' to type '#{type}': #{err.message}"
553
556
  end
554
557
  end
555
558
 
@@ -1,3 +1,3 @@
1
1
  module Remi
2
- VERSION = '0.3.2'
2
+ VERSION = '0.3.3'
3
3
  end
@@ -14,7 +14,11 @@ describe Parser::None do
14
14
  let(:parser) { Parser::None.new }
15
15
 
16
16
  context '#parse' do
17
- it 'returns what it is given' do
17
+ it 'returns an empty dataframe when given no data' do
18
+ expect(parser.parse.to_a).to eq Remi::DataFrame::Daru.new([]).to_a
19
+ end
20
+
21
+ it 'returns an what it was given' do
18
22
  expect(parser.parse('some data')).to eq 'some data'
19
23
  end
20
24
  end
@@ -9,14 +9,23 @@ describe Extractor::S3File do
9
9
  }
10
10
 
11
11
  prefix = "the-best-prefix"
12
- @s3_file = Extractor::S3File.new(bucket: 'the-best-bucket', remote_path: "#{prefix}")
13
- @s3_file.s3_client.stub_responses(:list_objects, {
12
+ credentials = {
13
+ aws_access_key_id: 'BLAH',
14
+ aws_secret_access_key: 'DEBLAH'
15
+ }
16
+
17
+ @s3_file = Extractor::S3File.new(
18
+ bucket: 'the-best-bucket',
19
+ credentials: credentials,
20
+ remote_path: "#{prefix}"
21
+ )
22
+
23
+ @s3_file.s3.client.stub_responses(:list_objects, {
14
24
  contents: [
15
25
  { key: "#{prefix}/file1.csv" },
16
26
  { key: "#{prefix}/file2.csv" }
17
27
  ]
18
28
  })
19
-
20
29
  end
21
30
 
22
31
  it 'returns all entries' do
@@ -45,7 +45,7 @@ describe Loader::SalesforceSoap do
45
45
  { Id: '1234', Custom__c: 'something', Merge_Id: '5678' }
46
46
  ]
47
47
 
48
- expect(soapforce_client).to receive(:merge) do
48
+ expect(soapforce_client).to receive(:merge!) do
49
49
  [
50
50
  :Contact,
51
51
  {
@@ -65,7 +65,25 @@ describe Loader::SalesforceSoap do
65
65
  { Id: '2', Custom__c: 'something', Merge_Id: '20' }
66
66
  ]
67
67
 
68
- expect(soapforce_client).to receive(:merge).twice
68
+ expect(soapforce_client).to receive(:merge!).twice
69
+ loader.load(data)
70
+ end
71
+
72
+ it 'excludes blank data fields from the merge command' do
73
+ data = [
74
+ { Id: '1234', Custom__c: '', Merge_Id: '5678' }
75
+ ]
76
+
77
+ expect(soapforce_client).to receive(:merge!) do
78
+ [
79
+ :Contact,
80
+ {
81
+ Id: '1234'
82
+ },
83
+ ['5678']
84
+ ]
85
+ end
86
+
69
87
  loader.load(data)
70
88
  end
71
89
 
@@ -76,5 +94,4 @@ describe Loader::SalesforceSoap do
76
94
 
77
95
  expect { loader.load(data) }.to raise_error KeyError
78
96
  end
79
-
80
97
  end
@@ -10,21 +10,25 @@ describe Extractor::SftpFile do
10
10
  }
11
11
  }
12
12
 
13
- let(:sftp_file) {
13
+ def generate_extractor
14
14
  Extractor::SftpFile.new(
15
15
  credentials: credentials,
16
16
  remote_path: remote_path
17
17
  )
18
- }
18
+ end
19
+
20
+ let(:extractor) { generate_extractor }
19
21
 
20
22
  let(:remote_filenames) { ['file1.csv', 'file2.csv'] }
21
- let(:sftp_session) { instance_double('Net:SFTP::Session') }
22
23
 
23
24
  before do
24
- sftp_dir = instance_double('Net::SFTP::Operations::Dir')
25
+ allow(extractor).to receive(:begin_connection)
25
26
 
26
- allow(Net::SFTP).to receive(:start).and_yield sftp_session
27
- allow(sftp_session).to receive(:dir).and_return sftp_dir
27
+ sftp_session = double('sftp_session')
28
+ allow(extractor).to receive(:sftp_session).and_return(sftp_session)
29
+
30
+ sftp_dir = instance_double('Net::SFTP::Operations::Dir')
31
+ allow(sftp_session).to receive(:dir).and_return(sftp_dir)
28
32
 
29
33
  allow(sftp_dir).to receive(:entries).and_return(remote_filenames.map { |fname|
30
34
  Net::SFTP::Protocol::V04::Name.new(
@@ -36,65 +40,76 @@ describe Extractor::SftpFile do
36
40
 
37
41
  context '.new' do
38
42
  it 'creates an instance with valid parameters' do
39
- sftp_file
43
+ extractor
40
44
  end
41
45
 
42
46
  it 'requires a hostname' do
43
47
  credentials.delete(:host)
44
- expect { sftp_file }.to raise_error KeyError
48
+ expect { generate_extractor }.to raise_error KeyError
45
49
  end
46
50
 
47
51
  it 'requires a username' do
48
52
  credentials.delete(:username)
49
- expect { sftp_file }.to raise_error KeyError
53
+ expect { generate_extractor }.to raise_error KeyError
50
54
  end
51
55
 
52
- it 'requires a password' do
56
+ it 'does not require a password' do # If empty, it will use private keys
53
57
  credentials.delete(:password)
54
- expect { sftp_file }.to raise_error KeyError
58
+ expect { generate_extractor }.not_to raise_error
55
59
  end
56
60
 
57
61
  it 'defaults to using port 22' do
58
- expect(sftp_file.port).to eq '22'
62
+ expect(extractor.port).to eq '22'
59
63
  end
60
64
 
61
65
  it 'allows the port to be defined in the credentials' do
62
66
  credentials[:port] = '1234'
63
- expect(sftp_file.port).to eq '1234'
67
+ expect(generate_extractor.port).to eq '1234'
64
68
  end
65
69
  end
66
70
 
67
71
  context '#all_entires' do
68
72
  it 'returns all entries' do
69
- expect(sftp_file.all_entries.map(&:name)).to eq remote_filenames
73
+ expect(extractor.all_entries.map(&:name)).to eq remote_filenames
70
74
  end
71
75
  end
72
76
 
73
77
  context '#extract' do
74
78
  it 'downloads files from the ftp' do
75
- expect(sftp_session).to receive(:download!).exactly(remote_filenames.size).times
76
- sftp_file.extract
79
+ expect(extractor.sftp_session).to receive(:download!).exactly(remote_filenames.size).times
80
+ extractor.extract
77
81
  end
78
82
 
79
83
  it 'creates local files with the right names' do
80
- allow(sftp_session).to receive(:download!)
81
- expect(sftp_file.extract.map { |f| Pathname.new(f).basename.to_s }).to eq remote_filenames
84
+ allow(extractor.sftp_session).to receive(:download!)
85
+ expect(extractor.extract.map { |f| Pathname.new(f).basename.to_s }).to eq remote_filenames
82
86
  end
83
87
  end
84
88
  end
85
89
 
86
90
 
87
91
  describe Loader::SftpFile do
88
- let(:loader) { Loader::SftpFile.new(credentials: {}, remote_path: 'some_path') }
92
+
93
+ let(:credentials) {
94
+ {
95
+ host: 'host',
96
+ username: 'username',
97
+ password: 'password'
98
+ }
99
+ }
100
+
101
+ let(:loader) { Loader::SftpFile.new(credentials: credentials, remote_path: 'some_path') }
89
102
  let(:data) { double('some_data') }
90
- let(:sftp_session) { instance_double('Net:SFTP::Session') }
91
103
 
92
104
  before do
93
- allow(Net::SFTP).to receive(:start).and_yield sftp_session
105
+ allow(loader).to receive(:begin_connection)
106
+
107
+ sftp_session = double('sftp_session')
108
+ allow(loader).to receive(:sftp_session).and_return(sftp_session)
94
109
  end
95
110
 
96
111
  it 'loads a csv to a target sftp filesystem' do
97
- expect(sftp_session).to receive(:upload!).with(data, 'some_path')
112
+ expect(loader.sftp_session).to receive(:upload!).with(data, 'some_path')
98
113
  loader.load data
99
114
  end
100
115
  end
@@ -253,5 +253,89 @@ describe Job do
253
253
  my_transform.execute
254
254
  end
255
255
  end
256
+
257
+ describe '#import - edge cases' do
258
+ before do
259
+ class MyJob
260
+ source :job_source do
261
+ fields({ :id => {}, :name => {} })
262
+ end
263
+ target :job_target do
264
+ fields({ :id => {}, :name => {}, :funny_name => {} })
265
+ end
266
+ end
267
+
268
+ job.job_source.df = Remi::DataFrame::Daru.new({
269
+ id: [1, 2, 3],
270
+ name: ['one', 'two', 'three']
271
+ })
272
+ end
273
+
274
+ it 'correctly maps back to a source if the sub transform sorts the data' do
275
+ sub_transform = Job::Transform.new('arbitrary') do
276
+ source :st_source, [:id, :name]
277
+ target :st_target, [:funny_name]
278
+
279
+ st_source.df.sort!([:id], ascending: [false])
280
+
281
+ Remi::SourceToTargetMap.apply(st_source.df, st_target.df) do
282
+ map source(:name) .target(:funny_name)
283
+ .transform(->(v) { "funny-#{v}" })
284
+ end
285
+ end
286
+
287
+ my_transform = Job::Transform.new(job) do
288
+ import sub_transform do
289
+ map_source_fields :job_source, :st_source, {
290
+ :id => :id,
291
+ :name => :name
292
+ }
293
+ map_target_fields :st_target, :job_source, {
294
+ :funny_name => :funny_name
295
+ }
296
+ end
297
+
298
+ job.job_target.df = job.job_source.df.dup
299
+ end
300
+
301
+ my_transform.execute
302
+ expect(job.job_target.df[:funny_name].to_a).to eq(
303
+ job.job_target.df[:name].to_a.map { |v| "funny-#{v}" }
304
+ )
305
+ end
306
+
307
+ it 'raises an error if the subtransform fucks with index', wip: true do
308
+ sub_transform = Job::Transform.new('arbitrary') do
309
+ source :st_source, [:id, :name]
310
+ target :st_target, [:funny_name]
311
+
312
+ duplicated_df = Daru::DataFrame.new({ id: Array(st_source.df[:id][0]) * 3 })
313
+
314
+ st_source.df = st_source.df.join(duplicated_df, on: [:id], how: :left)
315
+
316
+ Remi::SourceToTargetMap.apply(st_source.df, st_target.df) do
317
+ map source(:name) .target(:funny_name)
318
+ .transform(->(v) { "funny-#{v}" })
319
+ end
320
+ end
321
+
322
+ my_transform = Job::Transform.new(job) do
323
+ import sub_transform do
324
+ map_source_fields :job_source, :st_source, {
325
+ :id => :id,
326
+ :name => :name
327
+ }
328
+ map_target_fields :st_target, :job_source, {
329
+ :funny_name => :funny_name
330
+ }
331
+ end
332
+
333
+ job.job_target.df = job.job_source.df.dup
334
+ end
335
+
336
+ expect { my_transform.execute }.to raise_error Job::Transform::IncompatibleTargetIndexError
337
+ end
338
+ end
339
+
256
340
  end
257
341
  end
@@ -298,4 +298,34 @@ describe SourceToTargetMap do
298
298
  expect(sttm).to be_a(Remi::DataFrame::Daru)
299
299
  end
300
300
  end
301
+
302
+ describe 'source and target dataframes differ', wip: true do
303
+ it 'does not fail when the dataframe has been filtered' do
304
+ some_df = Daru::DataFrame.new(
305
+ {
306
+ :id => [1,2,3,4,5],
307
+ :something => ['x','','x','','x'],
308
+ :name => ['one', 'two', 'three', 'four', 'five']
309
+ }
310
+ )
311
+
312
+ filtered_df = some_df.where(some_df[:something].eq('x'))
313
+ target_df = Remi::DataFrame::Daru.new([])
314
+
315
+ Remi::SourceToTargetMap.apply(filtered_df, target_df) do
316
+ map source(:id) .target(:id)
317
+ map source(:name) .target(:name)
318
+ end
319
+
320
+ result = target_df[:id, :name].to_h.each_with_object({}) { |(k,v), h| h[k] = v.to_a }
321
+ expect(result).to eq({
322
+ :id => [1, 3, 5],
323
+ :name => ['one', 'three', 'five']
324
+ })
325
+ end
326
+
327
+
328
+ end
329
+
330
+
301
331
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: remi
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.2
4
+ version: 0.3.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Sterling Paramore
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-01-25 00:00:00.000000000 Z
11
+ date: 2017-06-28 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bond
@@ -231,6 +231,7 @@ files:
231
231
  - features/json.feature
232
232
  - features/metadata.feature
233
233
  - features/parameters.feature
234
+ - features/s3_file_target_job.feature
234
235
  - features/sample_job.feature
235
236
  - features/sftp_file_target_job.feature
236
237
  - features/step_definitions/remi_step.rb
@@ -254,6 +255,7 @@ files:
254
255
  - jobs/json_job.rb
255
256
  - jobs/metadata_job.rb
256
257
  - jobs/parameters_job.rb
258
+ - jobs/s3_file_target_job.rb
257
259
  - jobs/sample_job.rb
258
260
  - jobs/sftp_file_target_job.rb
259
261
  - jobs/sub_job_example_job.rb
@@ -372,6 +374,7 @@ test_files:
372
374
  - features/json.feature
373
375
  - features/metadata.feature
374
376
  - features/parameters.feature
377
+ - features/s3_file_target_job.feature
375
378
  - features/sample_job.feature
376
379
  - features/sftp_file_target_job.feature
377
380
  - features/step_definitions/remi_step.rb