remi 0.3.2 → 0.3.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 0de4e8f2de3129e2e4b93c3d22dc5f718a05b56a
4
- data.tar.gz: d963548c553f1918b33bd391038bc3481ce4a5d8
3
+ metadata.gz: 051d6add4664343ee59a6c722a2abadc15ea4377
4
+ data.tar.gz: f7f438b794a08948617b767dfca83e58533300ad
5
5
  SHA512:
6
- metadata.gz: d01e67e38c2a76784e65a22536d2d9cba7c9f56dc3686e8d0d23ea1e5176cb8495cd06977ba85357d336cbf7fd91641f79795d68df69542ce9e94b39bc85c6ec
7
- data.tar.gz: 07cec77fc7c40299207081f5ea7390cdd3cf863ac49295e9de166e4736ff95ae5d57e642de2a10ea74d1d812aeeff2ed4ffdb6a93c232658efc7552332a9f1e9
6
+ metadata.gz: d0e46a405da1e48dc0b82afe9c350b26f0b436fe99aa1cdf9500d035ec5a01612257c679338063920d370ead1e09c68ec35f7de9d732338417874a396ed8634c
7
+ data.tar.gz: d39aaad0be382a3f70359c03eeaa1b13ef9a594967cd029014da40ca2882cffe53a81a534cdf4c8221399fb10636e5c1304ac591e6a0ef589ecd1d0fab1b07f2
data/Gemfile CHANGED
@@ -7,4 +7,4 @@ gem 'daru', '0.1.4.1', git: 'git@github.com:inside-track/daru.git', branch: '0.1
7
7
  gem 'restforce', '~> 2.1'
8
8
  gem 'salesforce_bulk_api', git: 'git@github.com:inside-track/salesforce_bulk_api.git', branch: 'master'
9
9
  gem 'soapforce', '~> 0.5'
10
- gem 'aws-sdk', '~> 2.3'
10
+ gem 'aws-sdk', '~> 2.10'
@@ -18,7 +18,7 @@ GIT
18
18
  PATH
19
19
  remote: .
20
20
  specs:
21
- remi (0.3.2)
21
+ remi (0.3.3)
22
22
  activesupport (~> 4.2)
23
23
  bond (~> 0.5)
24
24
  cucumber (~> 2.1)
@@ -43,12 +43,14 @@ GEM
43
43
  akami (1.3.1)
44
44
  gyoku (>= 0.4.0)
45
45
  nokogiri
46
- aws-sdk (2.3.5)
47
- aws-sdk-resources (= 2.3.5)
48
- aws-sdk-core (2.3.5)
46
+ aws-sdk (2.10.3)
47
+ aws-sdk-resources (= 2.10.3)
48
+ aws-sdk-core (2.10.3)
49
+ aws-sigv4 (~> 1.0)
49
50
  jmespath (~> 1.0)
50
- aws-sdk-resources (2.3.5)
51
- aws-sdk-core (= 2.3.5)
51
+ aws-sdk-resources (2.10.3)
52
+ aws-sdk-core (= 2.10.3)
53
+ aws-sigv4 (1.0.0)
52
54
  backports (3.6.8)
53
55
  bond (0.5.1)
54
56
  builder (3.2.2)
@@ -104,10 +106,8 @@ GEM
104
106
  mimemagic (~> 0.3)
105
107
  multi_json (~> 1.11)
106
108
  rbczmq (~> 1.7)
107
- jmespath (1.2.4)
108
- json_pure (>= 1.8.1)
109
+ jmespath (1.3.1)
109
110
  json (1.8.3)
110
- json_pure (1.8.3)
111
111
  jwt (1.5.6)
112
112
  little-plugger (1.1.4)
113
113
  logging (2.1.0)
@@ -187,7 +187,7 @@ PLATFORMS
187
187
  ruby
188
188
 
189
189
  DEPENDENCIES
190
- aws-sdk (~> 2.3)
190
+ aws-sdk (~> 2.10)
191
191
  daru (= 0.1.4.1)!
192
192
  github-markup (~> 1.4)
193
193
  google-api-client (~> 0.9)
@@ -200,4 +200,4 @@ DEPENDENCIES
200
200
  yard (~> 0.9)
201
201
 
202
202
  BUNDLED WITH
203
- 1.14.3
203
+ 1.15.1
@@ -0,0 +1,10 @@
1
+ Feature: Tests targets that are S3 Files.
2
+
3
+ Background:
4
+ Given the job is 'S3 File Target'
5
+ And the job target 'Some File'
6
+
7
+ Scenario: Defining the remote path.
8
+ Given the target 'Some File'
9
+ Then the file is uploaded to the S3 bucket "the-big-one"
10
+ And the file is uploaded to the remote path "some_file_*Today: %Y%m%d*.csv"
@@ -69,6 +69,14 @@ Then /^the file is uploaded to the remote path "([^"]+)"$/ do |remote_path|
69
69
  expect(@brt.target.data_subject.loaders.map(&:remote_path)).to include expected_path
70
70
  end
71
71
 
72
+ Then /^the file is uploaded to the S3 bucket "([^"]+)"$/ do |bucket_name|
73
+ expected_bucket_name = Remi::Testing::BusinessRules::ParseFormula.parse(bucket_name)
74
+ bucket_names = @brt.target.data_subject.loaders.map do |loader|
75
+ loader.bucket_name if loader.respond_to? :bucket_name
76
+ end
77
+ expect(bucket_names).to include expected_bucket_name
78
+ end
79
+
72
80
  ## CSV Options
73
81
 
74
82
  Given /^the (source|target) file is delimited with a (\w+)$/ do |st, delimiter|
@@ -124,6 +132,16 @@ Given /^the (source|target) file contains all of the following headers in this o
124
132
  expect(@brt.send(st.to_sym).data_subject.df.vectors.to_a).to eq @brt.send(st.to_sym).fields.field_names
125
133
  end
126
134
 
135
+ Given /^the (source|target) file contains all of the following headers in no particular order:$/ do |st, table|
136
+ table.rows.each do |row|
137
+ field = row.first
138
+ step "the #{st} field '#{field}'"
139
+ end
140
+
141
+ @brt.run_transforms if st == 'target'
142
+ expect(@brt.send(st.to_sym).data_subject.df.vectors.to_a).to match_array @brt.send(st.to_sym).fields.field_names
143
+ end
144
+
127
145
  ### Source
128
146
 
129
147
  Given /^the source '([[:alnum:]\s\-_]+)'$/ do |arg|
@@ -260,6 +278,7 @@ Then /^the target field '([^']+)' has the label '([^']+)'$/ do |target_field, la
260
278
  data_field = @brt.targets.fields.next
261
279
  expect(data_field.metadata[:label]).to eq label
262
280
  expect(data_field.name).to eq target_field
281
+
263
282
  end
264
283
 
265
284
  Then /^the target field '([^']+)' is copied from the source field$/ do |target_field|
@@ -780,3 +799,10 @@ Then /^the target '([[:alnum:]\s\-_]+)' has (\d+) record(?:s|) where '([[:alnum:
780
799
  @brt.run_transforms
781
800
  expect(@brt.targets[target_name].where_between(field_name, low_value, high_value).size).to eq nrecords.to_i
782
801
  end
802
+
803
+ Then /^the target field '([^']+)' (?:has|is set to) the multiline value$/ do |target_field, value|
804
+ step "the target field '#{target_field}'"
805
+ @brt.run_transforms
806
+ target_name, target_field_name = @brt.targets.parse_full_field(target_field)
807
+ expect(@brt.targets[target_name].fields[target_field_name].value).to eq Remi::Testing::BusinessRules::ParseFormula.parse(value)
808
+ end
@@ -0,0 +1,23 @@
1
+ require_relative 'all_jobs_shared'
2
+ require 'aws-sdk'
3
+
4
+ class S3FileTargetJob < Remi::Job
5
+ target :some_file do
6
+ encoder Remi::Encoder::CsvFile.new
7
+ loader Remi::Loader::S3File.new(
8
+ credentials: {
9
+ aws_access_key_id: 'blort',
10
+ aws_secret_access_key: 'blerg',
11
+ region: 'us-west-2'
12
+ },
13
+ kms_opt: {
14
+ ciphertext: 'blergity'
15
+ },
16
+ bucket: 'the-big-one',
17
+ remote_path: "some_file_#{DateTime.current.strftime('%Y%m%d')}.csv"
18
+ )
19
+ end
20
+
21
+ transform :main do
22
+ end
23
+ end
@@ -93,7 +93,11 @@ module Remi
93
93
  sttm = SourceToTargetMap.new(df, source_metadata: fields)
94
94
  fields.keys.each do |field|
95
95
  next unless (types.size == 0 || types.include?(fields[field][:type])) && df.vectors.include?(field)
96
- sttm.source(field).target(field).transform(Remi::Transform::EnforceType.new).execute
96
+ begin
97
+ sttm.source(field).target(field).transform(Remi::Transform::EnforceType.new).execute
98
+ rescue StandardError => err
99
+ raise ArgumentError, "Field '#{field}': #{err.message}"
100
+ end
97
101
  end
98
102
 
99
103
  self
@@ -8,10 +8,10 @@ module Remi
8
8
  end
9
9
 
10
10
 
11
- # The None Parser just returns what it is given.
11
+ # The None Parser returns the data it is given, or an empty dataframe if it's not given any data
12
12
  class Parser::None < Parser
13
- def parse(data)
14
- data
13
+ def parse(data=nil)
14
+ data || Remi::DataFrame::Daru.new([], order: fields.keys)
15
15
  end
16
16
  end
17
17
 
@@ -1,13 +1,59 @@
1
1
  module Remi
2
2
 
3
+ module DataSubject::S3File
4
+ attr_accessor :region
5
+ attr_accessor :aws_credentials
6
+
7
+ def init_aws_credentials(credentials)
8
+ @aws_credentials = Aws::Credentials.new(
9
+ credentials.fetch(:aws_access_key_id, ENV['AWS_ACCESS_KEY_ID']),
10
+ credentials.fetch(:aws_secret_access_key, ENV['AWS_SECRET_ACCESS_KEY'])
11
+ )
12
+ end
13
+
14
+ def s3
15
+ @s3 ||= Aws::S3::Resource.new(
16
+ credentials: aws_credentials,
17
+ region: region
18
+ )
19
+ end
20
+
21
+ def encrypt_args
22
+ @kms_args || {}
23
+ end
24
+
25
+ def init_kms(opt)
26
+ return nil unless opt
27
+
28
+ kms = Aws::KMS::Client.new(
29
+ region: @region,
30
+ credentials: @aws_credentials
31
+ )
32
+
33
+ ciphertext = opt.fetch(:ciphertext)
34
+ algorithm = opt.fetch(:algorithm, 'AES256')
35
+ key = kms.decrypt(ciphertext_blob: Base64.decode64(ciphertext)).plaintext
36
+
37
+ @kms_args = {
38
+ sse_customer_algorithm: algorithm,
39
+ sse_customer_key: key
40
+ }
41
+ end
42
+ end
43
+
3
44
  # S3 File extractor
4
45
  # Used to extract files from Amazon S3
5
46
  #
6
- # @example
47
+ # @example Standard use
7
48
  #
8
49
  # class MyJob < Remi::Job
9
50
  # source :some_file do
10
51
  # extractor Remi::Extractor::S3File.new(
52
+ # credentials: {
53
+ # aws_access_key_id: ENV['AWS_ACCESS_KEY_ID'],
54
+ # aws_secret_access_key: ENV['AWS_SECRET_ACCESS_KEY'],
55
+ # region: 'us-west-2'
56
+ # },
11
57
  # bucket: 'my-awesome-bucket',
12
58
  # remote_path: 'some_file-',
13
59
  # most_recent_only: true
@@ -28,9 +74,40 @@ module Remi
28
74
  # # 0 1 Albert
29
75
  # # 1 2 Betsy
30
76
  # # 2 3 Camu
77
+ #
78
+ # @example Using AWS KMS
79
+ # To use AWS KMS, supply a :ciphertext and optional :algorithm (default is AES256).
80
+ # The encrypted key stored in the ciphertext must be the same as that used when the file was written.
81
+ #
82
+ # class MyJob < Remi::Job
83
+ # source :some_file do
84
+ # extractor Remi::Extractor::S3File.new(
85
+ # credentials: {
86
+ # aws_access_key_id: ENV['AWS_ACCESS_KEY_ID'],
87
+ # aws_secret_access_key: ENV['AWS_SECRET_ACCESS_KEY'],
88
+ # region: 'us-west-2'
89
+ # },
90
+ # bucket: 'my-awesome-bucket',
91
+ # remote_path: 'some_file-',
92
+ # most_recent_only: true,
93
+ # kms_opt: {
94
+ # ciphertext: '<base64-encoded ciphertext>'
95
+ # }
96
+ # )
97
+ # parser Remi::Parser::CsvFile.new(
98
+ # csv_options: {
99
+ # headers: true,
100
+ # col_sep: '|'
101
+ # }
102
+ # )
103
+ # end
104
+ # end
31
105
  class Extractor::S3File < Extractor::FileSystem
106
+ include Remi::DataSubject::S3File
32
107
 
33
- # @param bucket_name [String] S3 bucket containing the files
108
+ # @param bucket [String] Name of S3 bucket containing the files
109
+ # @param kms_opt [Hash] Hash containing AWS KMS options
110
+ # @param credentials [Hash] Hash containing AWS credentials (:aws_access_key_id and :aws_secret_access_key default to ENV['AWS_ACCESS_KEY_ID'] and ENV['AWS_SECRET_ACCESS_KEY']; :region defaults to 'us-west-2')
34
111
  def initialize(*args, **kargs, &block)
35
112
  super
36
113
  init_s3_file(*args, **kargs, &block)
@@ -39,10 +116,12 @@ module Remi
39
116
  # Called to extract files from the source filesystem.
40
117
  # @return [Array<String>] An array of paths to a local copy of the files extracted
41
118
  def extract
119
+ init_kms(@kms_opt)
120
+
42
121
  entries.map do |entry|
43
122
  local_file = File.join(@local_path, entry.name)
44
123
  logger.info "Downloading #{entry.pathname} from S3 to #{local_file}"
45
- File.open(local_file, 'wb') { |file| entry.raw.get(response_target: file) }
124
+ File.open(local_file, 'wb') { |file| entry.raw.get({ response_target: file }.merge(encrypt_args)) }
46
125
  local_file
47
126
  end
48
127
  end
@@ -55,7 +134,7 @@ module Remi
55
134
  # @return [Array<Extractor::FileSystemEntry>] List of objects in the bucket/prefix
56
135
  def all_entries!
57
136
  # S3 does not track anything like a create time, so use last modified for both
58
- bucket.objects(prefix: @remote_path.to_s).map do |entry|
137
+ s3.bucket(@bucket_name).objects(prefix: @remote_path.to_s).map do |entry|
59
138
  Extractor::FileSystemEntry.new(
60
139
  pathname: entry.key,
61
140
  create_time: entry.last_modified,
@@ -65,20 +144,128 @@ module Remi
65
144
  end
66
145
  end
67
146
 
68
- # @return [Aws::S3::Client] The S3 client used
69
- def s3_client
70
- @s3_client ||= Aws::S3::Client.new
71
- end
72
-
73
147
  private
74
148
 
75
- def init_s3_file(*args, bucket:, **kargs)
149
+ def init_s3_file(*args, credentials: {}, bucket:, kms_opt: nil, **kargs)
150
+ @region = credentials.fetch(:region, 'us-west-2')
151
+ @kms_opt = kms_opt
152
+ init_aws_credentials(credentials)
153
+
76
154
  @bucket_name = bucket
77
155
  end
156
+ end
157
+
158
+
159
+
160
+ # S3 File loader
161
+ # Used to post files to Amazon S3
162
+ #
163
+ # @example Standard use
164
+ #
165
+ # class MyJob < Remi::Job
166
+ # target :some_file do
167
+ # encoder Remi::Encoder::CsvFile.new
168
+ # loader Remi::Loader::S3File.new(
169
+ # credentials: {
170
+ # aws_access_key_id: ENV['AWS_ACCESS_KEY_ID'],
171
+ # aws_secret_access_key: ENV['AWS_SECRET_ACCESS_KEY'],
172
+ # region: 'us-west-2'
173
+ # },
174
+ # bucket: 'itk-de-archive',
175
+ # remote_path: 'awesome.csv'
176
+ # )
177
+ # end
178
+ # end
179
+ #
180
+ # job = MyJob.new
181
+ # job.some_file.df = Daru::DataFrame.new(
182
+ # {
183
+ # numbers: [1,2,3],
184
+ # words: ['one', 'two', 'three']
185
+ # }
186
+ # )
187
+ # job.some_file.load
188
+ #
189
+ # @example Using AWS KMS
190
+ # To use AWS KMS, supply a :ciphertext and optional :algorithm (default is AES256).
191
+ # The encrypted key stored in the ciphertext must be the same as that used for reading the file.
192
+ #
193
+ # class MyJob < Remi::Job
194
+ # target :some_file do
195
+ # encoder Remi::Encoder::CsvFile.new
196
+ # loader Remi::Loader::S3File.new(
197
+ # credentials: {
198
+ # aws_access_key_id: ENV['AWS_ACCESS_KEY_ID'],
199
+ # aws_secret_access_key: ENV['AWS_SECRET_ACCESS_KEY'],
200
+ # region: 'us-west-2'
201
+ # },
202
+ # bucket: 'itk-de-archive',
203
+ # remote_path: 'awesome.csv',
204
+ # kms_opt: {
205
+ # ciphertext: '<base64-encoded ciphertext>'
206
+ # }
207
+ # )
208
+ # end
209
+ # end
210
+ #
211
+ # @example Generating a ciphertext
212
+ # A ciphertext can be generated using the AWS SDK
213
+ #
214
+ # require 'aws-sdk'
215
+ # require 'base64'
216
+ #
217
+ # aws_credentials = Aws::Credentials.new(
218
+ # ENV['AWS_ACCESS_KEY_ID'],
219
+ # ENV['AWS_SECRET_ACCESS_KEY']
220
+ # )
221
+ #
222
+ # kms = Aws::KMS::Client.new(
223
+ # region: 'us-west-2',
224
+ # credentials: aws_credentials
225
+ # )
226
+ #
227
+ # # See AWS docs for creating keys: http://docs.aws.amazon.com/kms/latest/developerguide/create-keys.html
228
+ # data_key = kms.generate_data_key(
229
+ # key_id: 'alias/alias-of-kms-key',
230
+ # key_spec: 'AES_256'
231
+ # )
232
+ #
233
+ # ciphertext = Base64.strict_encode64(data_key.ciphertext_blob)
234
+ # #=> "AQIDAHjmmRVcBAdMHsA9VUoJKgbW8niK2qL1qPcQ2OWEUlh5XAFw0vfl+QIgawB8cbAZ2OqXAAAAfjB8BgkqhkiG9w0BBwagbzBtAgEAMGgGCSqGSIb3DQEHATAeBglghkgBZQMEAS4wEQQMIUIFFh++2w4d9al7AgEQgDvSRXQCOPLSMOjRS/lM5uxuyRV47qInlKKBIezIaYzXuFu1sRU+L46HqRyS0XqR4flFJ/fc8yEj3pU1UA=="
235
+ class Loader::S3File < Loader
236
+ include Remi::DataSubject::S3File
237
+
238
+ # @param bucket [String] Name of S3 bucket containing the files
239
+ # @param kms_opt [Hash] Hash containing AWS KMS options
240
+ # @param credentials [Hash] Hash containing AWS credentials (:aws_access_key_id and :aws_secret_access_key default to ENV['AWS_ACCESS_KEY_ID'] and ENV['AWS_SECRET_ACCESS_KEY']; :region defaults to 'us-west-2')
241
+ def initialize(*args, **kargs, &block)
242
+ super
243
+ init_s3_loader(*args, **kargs, &block)
244
+ end
245
+
246
+ attr_reader :remote_path
247
+ attr_reader :bucket_name
78
248
 
79
- def bucket
80
- @bucket ||= Aws::S3::Bucket.new(@bucket_name, client: s3_client)
249
+ # Copies data to S3
250
+ # @param data [Object] The path to the file in the temporary work location
251
+ # @return [true] On success
252
+ def load(data)
253
+ init_kms(@kms_opt)
254
+
255
+ @logger.info "Writing file #{data} to S3 #{@bucket_name} as #{@remote_path}"
256
+ s3.bucket(@bucket_name).object(@remote_path).upload_file(data, encrypt_args)
257
+ true
81
258
  end
82
259
 
260
+ private
261
+
262
+ def init_s3_loader(*args, credentials:{}, bucket:, remote_path:, kms_opt: nil, **kargs, &block)
263
+ @region = credentials.fetch(:region, 'us-west-2')
264
+ @kms_opt = kms_opt
265
+ init_aws_credentials(credentials)
266
+
267
+ @bucket_name = bucket
268
+ @remote_path = remote_path
269
+ end
83
270
  end
84
271
  end
@@ -185,11 +185,11 @@ module Remi
185
185
  if @operation == :update
186
186
  Remi::SfBulkHelper::SfBulkUpdate.update(restforce_client, @sfo, data, batch_size: @batch_size, logger: logger)
187
187
  elsif @operation == :create
188
- Remi::SfBulkHelper::SfBulkCreate.create(restforce_client, @sfo, data, batch_size: @batch_size, logger: logger)
188
+ Remi::SfBulkHelper::SfBulkCreate.create(restforce_client, @sfo, data, batch_size: @batch_size, max_attempts: 1, logger: logger)
189
189
  elsif @operation == :upsert
190
190
  Remi::SfBulkHelper::SfBulkUpsert.upsert(restforce_client, @sfo, data, batch_size: @batch_size, external_id: @external_id, logger: logger)
191
191
  elsif @operation == :delete
192
- Remi::SfBulkHelper::SfBulkDelete.upsert(restforce_client, @sfo, data, batch_size: @batch_size, logger: logger)
192
+ Remi::SfBulkHelper::SfBulkDelete.delete(restforce_client, @sfo, data, batch_size: @batch_size, logger: logger)
193
193
  else
194
194
  raise ArgumentError, "Unknown operation: #{@operation}"
195
195
  end
@@ -79,7 +79,9 @@ module Remi
79
79
  end
80
80
 
81
81
  merge_id = Array(row.delete(@merge_id_field))
82
- soapforce_client.merge(@sfo, row, merge_id)
82
+ merge_row = row.select { |_, v| !v.blank? }
83
+ logger.info "Merging Id #{merge_id} into #{merge_row}"
84
+ soapforce_client.merge!(@sfo, merge_row, merge_id)
83
85
  end
84
86
  else
85
87
  raise ArgumentError, "Unknown soap operation: #{@operation}"
@@ -1,4 +1,44 @@
1
1
  module Remi
2
+ module DataSubject::SftpFile
3
+
4
+ attr_reader :sftp_session
5
+
6
+ def sftp_retry(&block)
7
+ tries ||= @retries
8
+
9
+ block.call
10
+ rescue StandardError => err
11
+ if (tries -= 1) > 0
12
+ logger.error "Error: #{err.message}"
13
+ logger.error "Will retry #{tries} more times"
14
+ sleep(1)
15
+ retry
16
+ else
17
+ raise err
18
+ end
19
+ end
20
+
21
+ def begin_connection
22
+ sftp_retry do
23
+ Timeout.timeout(@timeout) do
24
+ @ssh_session = Net::SSH.start(@host, @username, password: @password, port: @port, number_of_password_prompts: 0)
25
+ @sftp_session = Net::SFTP::Session.new(@ssh_session)
26
+ @sftp_session.connect!
27
+ end
28
+ end
29
+ end
30
+
31
+ def end_connection
32
+ @sftp_session.close_channel unless @sftp_session.nil?
33
+ @ssh_session.close unless @ssh_session.nil?
34
+
35
+ Timeout.timeout(@timeout) do
36
+ sleep 1 until (@sftp_session.nil? || @sftp_session.closed?) && (@ssh_session.nil? || @ssh_session.closed?)
37
+ end
38
+ end
39
+ end
40
+
41
+
2
42
 
3
43
  # Sftp File extractor
4
44
  # Used to extract files from an SFTP server
@@ -35,13 +75,15 @@ module Remi
35
75
  # # 1 2 Betsy
36
76
  # # 2 3 Camu
37
77
  class Extractor::SftpFile < Extractor::FileSystem
38
- N_RETRY = 3
78
+ include DataSubject::SftpFile
39
79
 
40
80
  # @param credentials [Hash] Options hash containing login credentials
41
81
  # @param credentials [String] :host SFTP host (e.g., coolserver.com)
42
82
  # @param credentials [String] :username SFTP username
43
83
  # @param credentials [String] :password SFTP password
44
84
  # @param credentials [String] :port SFTP port (default: 22)
85
+ # @param retries [Integer] Number of times a connection or operation will be attempted before the error is raised (default: 3)
86
+ # @param timeout [Integer] Number of seconds to wait for establishing/closing a connection (default: 30)
45
87
  def initialize(*args, **kargs, &block)
46
88
  super
47
89
  init_sftp_extractor(*args, **kargs)
@@ -55,15 +97,16 @@ module Remi
55
97
  # Called to extract files from the source filesystem.
56
98
  # @return [Array<String>] An array of paths to a local copy of the files extracted
57
99
  def extract
58
- connection do |sftp|
59
- entries.map do |entry|
60
- local_file = File.join(@local_path, entry.name)
61
- logger.info "Downloading #{entry.name} to #{local_file}"
62
- retry_download { sftp.download!(File.join(@remote_path, entry.name), local_file) }
63
- local_file
100
+ begin_connection
64
101
 
65
- end
102
+ entries.map do |entry|
103
+ local_file = File.join(@local_path, entry.name)
104
+ logger.info "Downloading #{entry.name} to #{local_file}"
105
+ sftp_retry { sftp_session.download!(File.join(@remote_path, entry.name), local_file) }
106
+ local_file
66
107
  end
108
+ ensure
109
+ end_connection
67
110
  end
68
111
 
69
112
  # @return [Array<Extractor::FileSystemEntry>] (Memoized) list of objects in the bucket/prefix
@@ -73,8 +116,7 @@ module Remi
73
116
 
74
117
  # @return [Array<Extractor::FileSystemEntry>] (Memoized) list of objects in the bucket/prefix
75
118
  def all_entries!
76
- sftp_entries = connection { |sftp| sftp.dir.entries(@remote_path) }
77
- sftp_entries.map do |entry|
119
+ sftp_session.dir.entries(@remote_path).map do |entry|
78
120
  # Early versions of the protocol don't support create time, fake it with modified time?
79
121
  FileSystemEntry.new(
80
122
  pathname: File.join(@remote_path, entry.name),
@@ -87,33 +129,13 @@ module Remi
87
129
 
88
130
  private
89
131
 
90
- def init_sftp_extractor(*args, credentials:, **kargs)
132
+ def init_sftp_extractor(*args, credentials:, retries: 3, timeout: 30, **kargs)
91
133
  @host = credentials.fetch(:host)
92
134
  @username = credentials.fetch(:username)
93
- @password = credentials.fetch(:password)
135
+ @password = credentials.fetch(:password, nil)
94
136
  @port = credentials.fetch(:port, '22')
95
- end
96
-
97
- def connection(&block)
98
- result = nil
99
- Net::SFTP.start(@host, @username, password: @password, port: @port) do |sftp|
100
- result = yield sftp
101
- end
102
- result
103
- end
104
-
105
- def retry_download(&block)
106
- 1.upto(N_RETRY).each do |itry|
107
- begin
108
- block.call
109
- break
110
- rescue RuntimeError => err
111
- raise err unless itry < N_RETRY
112
- logger.error "Download failed with error: #{err.message}"
113
- logger.error "Retry attempt #{itry}/#{N_RETRY-1}"
114
- sleep(1)
115
- end
116
- end
137
+ @retries = retries
138
+ @timeout = timeout
117
139
  end
118
140
  end
119
141
 
@@ -143,8 +165,16 @@ module Remi
143
165
  # job.my_target.df = my_df
144
166
  # job.my_target.load
145
167
  class Loader::SftpFile < Loader
168
+ include DataSubject::SftpFile
146
169
 
170
+ # @param credentials [Hash] Options hash containing login credentials
171
+ # @param credentials [String] :host SFTP host (e.g., coolserver.com)
172
+ # @param credentials [String] :username SFTP username
173
+ # @param credentials [String] :password SFTP password
174
+ # @param credentials [String] :port SFTP port (default: 22)
147
175
  # @param remote_path [String, Pathname] Full path to the file to be created on the target filesystem
176
+ # @param retries [Integer] Number of times a connection or operation will be attempted before the error is raised (default: 3)
177
+ # @param timeout [Integer] Number of seconds to wait for establishing/closing a connection (default: 30)
148
178
  def initialize(*args, **kargs, &block)
149
179
  super
150
180
  init_sftp_loader(*args, **kargs, &block)
@@ -156,42 +186,27 @@ module Remi
156
186
  # @param data [Object] The path to the file in the temporary work location
157
187
  # @return [true] On success
158
188
  def load(data)
159
- logger.info "Uploading #{data} to #{@credentials[:username]}@#{@credentials[:host]}: #{@remote_path}"
160
- connection do |sftp|
161
- retry_upload { sftp.upload! data, @remote_path }
162
- end
189
+ begin_connection
190
+
191
+ logger.info "Uploading #{data} to #{@username}@#{@host}: #{@remote_path}"
192
+ sftp_retry { sftp_session.upload! data, @remote_path }
163
193
 
164
194
  true
195
+ ensure
196
+ end_connection
165
197
  end
166
198
 
167
199
 
168
200
  private
169
201
 
170
- def init_sftp_loader(*args, credentials:, remote_path:, **kargs, &block)
171
- @credentials = credentials
202
+ def init_sftp_loader(*args, credentials:, remote_path:, retries: 3, timeout: 30, **kargs, &block)
203
+ @host = credentials.fetch(:host)
204
+ @username = credentials.fetch(:username)
205
+ @password = credentials.fetch(:password, nil)
206
+ @port = credentials.fetch(:port, '22')
172
207
  @remote_path = remote_path
173
- end
174
-
175
- def connection(&block)
176
- result = nil
177
- Net::SFTP.start(@credentials[:host], @credentials[:username], password: @credentials[:password], port: @credentials[:port] || '22') do |sftp|
178
- result = yield sftp
179
- end
180
- result
181
- end
182
-
183
- def retry_upload(ntry=2, &block)
184
- 1.upto(ntry).each do |itry|
185
- begin
186
- block.call
187
- break
188
- rescue RuntimeError => err
189
- raise err unless itry < ntry
190
- logger.error "Upload failed with error: #{err.message}"
191
- logger.error "Retry attempt #{itry}/#{ntry-1}"
192
- sleep(1)
193
- end
194
- end
208
+ @retries = retries
209
+ @timeout = timeout
195
210
  end
196
211
  end
197
212
  end
@@ -15,6 +15,7 @@ module Remi
15
15
  # end
16
16
  # tform.execute
17
17
  class Transform
18
+ class IncompatibleTargetIndexError < StandardError; end
18
19
 
19
20
  FieldMap = Struct.new(:from_subject, :to_subject, :field_from_to)
20
21
 
@@ -152,6 +153,19 @@ module Remi
152
153
  sub_trans_ds = field_map.from_subject
153
154
  fields_to_map = field_map.field_from_to.keys
154
155
 
156
+ job_idx = job_ds.df.index.to_a
157
+ sub_idx = sub_trans_ds.df.index.to_a
158
+ diff = ((job_idx | sub_idx) - (job_idx & sub_idx))
159
+ if job_idx.size > 0 && diff.size > 0 then
160
+ msg = <<-EOT
161
+ Incompatible target index!
162
+ Sub transform target #{sub_trans_ds.name} index is #{sub_trans_ds.df.index.inspect}
163
+ Job transform target #{job_ds.name} index is #{job_ds.df.index.inspect}
164
+ EOT
165
+ raise IncompatibleTargetIndexError.new msg
166
+ end
167
+
168
+
155
169
  fields_to_map.each do |sub_trans_field|
156
170
  job_field = field_map.field_from_to[sub_trans_field]
157
171
  job_ds.fields[job_field].merge! sub_trans_ds.fields[sub_trans_field]
@@ -128,10 +128,13 @@ module Remi
128
128
 
129
129
  # Private: Converts the transformed data into vectors in the target dataframe.
130
130
  def map_to_target_df
131
+ index = @target_df.index.size > 0 ? @target_df.index : @source_df.index
132
+
131
133
  result_hash_of_arrays.each do |vector, values|
132
- @target_df[vector] = Daru::Vector.new(values, index: @source_df.index)
134
+ @target_df[vector] = Daru::Vector.new(values, index: index)
133
135
  end
134
136
 
137
+ @target_df.index = index
135
138
  @target_df
136
139
  end
137
140
 
@@ -550,6 +550,9 @@ module Remi
550
550
  raise ArgumentError, "Unknown type enforcement: #{type}"
551
551
  end
552
552
  end
553
+
554
+ rescue StandardError => err
555
+ raise ArgumentError, "Unable to convert value '#{value}' to type '#{type}': #{err.message}"
553
556
  end
554
557
  end
555
558
 
@@ -1,3 +1,3 @@
1
1
  module Remi
2
- VERSION = '0.3.2'
2
+ VERSION = '0.3.3'
3
3
  end
@@ -14,7 +14,11 @@ describe Parser::None do
14
14
  let(:parser) { Parser::None.new }
15
15
 
16
16
  context '#parse' do
17
- it 'returns what it is given' do
17
+ it 'returns an empty dataframe when given no data' do
18
+ expect(parser.parse.to_a).to eq Remi::DataFrame::Daru.new([]).to_a
19
+ end
20
+
21
+ it 'returns an what it was given' do
18
22
  expect(parser.parse('some data')).to eq 'some data'
19
23
  end
20
24
  end
@@ -9,14 +9,23 @@ describe Extractor::S3File do
9
9
  }
10
10
 
11
11
  prefix = "the-best-prefix"
12
- @s3_file = Extractor::S3File.new(bucket: 'the-best-bucket', remote_path: "#{prefix}")
13
- @s3_file.s3_client.stub_responses(:list_objects, {
12
+ credentials = {
13
+ aws_access_key_id: 'BLAH',
14
+ aws_secret_access_key: 'DEBLAH'
15
+ }
16
+
17
+ @s3_file = Extractor::S3File.new(
18
+ bucket: 'the-best-bucket',
19
+ credentials: credentials,
20
+ remote_path: "#{prefix}"
21
+ )
22
+
23
+ @s3_file.s3.client.stub_responses(:list_objects, {
14
24
  contents: [
15
25
  { key: "#{prefix}/file1.csv" },
16
26
  { key: "#{prefix}/file2.csv" }
17
27
  ]
18
28
  })
19
-
20
29
  end
21
30
 
22
31
  it 'returns all entries' do
@@ -45,7 +45,7 @@ describe Loader::SalesforceSoap do
45
45
  { Id: '1234', Custom__c: 'something', Merge_Id: '5678' }
46
46
  ]
47
47
 
48
- expect(soapforce_client).to receive(:merge) do
48
+ expect(soapforce_client).to receive(:merge!) do
49
49
  [
50
50
  :Contact,
51
51
  {
@@ -65,7 +65,25 @@ describe Loader::SalesforceSoap do
65
65
  { Id: '2', Custom__c: 'something', Merge_Id: '20' }
66
66
  ]
67
67
 
68
- expect(soapforce_client).to receive(:merge).twice
68
+ expect(soapforce_client).to receive(:merge!).twice
69
+ loader.load(data)
70
+ end
71
+
72
+ it 'excludes blank data fields from the merge command' do
73
+ data = [
74
+ { Id: '1234', Custom__c: '', Merge_Id: '5678' }
75
+ ]
76
+
77
+ expect(soapforce_client).to receive(:merge!) do
78
+ [
79
+ :Contact,
80
+ {
81
+ Id: '1234'
82
+ },
83
+ ['5678']
84
+ ]
85
+ end
86
+
69
87
  loader.load(data)
70
88
  end
71
89
 
@@ -76,5 +94,4 @@ describe Loader::SalesforceSoap do
76
94
 
77
95
  expect { loader.load(data) }.to raise_error KeyError
78
96
  end
79
-
80
97
  end
@@ -10,21 +10,25 @@ describe Extractor::SftpFile do
10
10
  }
11
11
  }
12
12
 
13
- let(:sftp_file) {
13
+ def generate_extractor
14
14
  Extractor::SftpFile.new(
15
15
  credentials: credentials,
16
16
  remote_path: remote_path
17
17
  )
18
- }
18
+ end
19
+
20
+ let(:extractor) { generate_extractor }
19
21
 
20
22
  let(:remote_filenames) { ['file1.csv', 'file2.csv'] }
21
- let(:sftp_session) { instance_double('Net:SFTP::Session') }
22
23
 
23
24
  before do
24
- sftp_dir = instance_double('Net::SFTP::Operations::Dir')
25
+ allow(extractor).to receive(:begin_connection)
25
26
 
26
- allow(Net::SFTP).to receive(:start).and_yield sftp_session
27
- allow(sftp_session).to receive(:dir).and_return sftp_dir
27
+ sftp_session = double('sftp_session')
28
+ allow(extractor).to receive(:sftp_session).and_return(sftp_session)
29
+
30
+ sftp_dir = instance_double('Net::SFTP::Operations::Dir')
31
+ allow(sftp_session).to receive(:dir).and_return(sftp_dir)
28
32
 
29
33
  allow(sftp_dir).to receive(:entries).and_return(remote_filenames.map { |fname|
30
34
  Net::SFTP::Protocol::V04::Name.new(
@@ -36,65 +40,76 @@ describe Extractor::SftpFile do
36
40
 
37
41
  context '.new' do
38
42
  it 'creates an instance with valid parameters' do
39
- sftp_file
43
+ extractor
40
44
  end
41
45
 
42
46
  it 'requires a hostname' do
43
47
  credentials.delete(:host)
44
- expect { sftp_file }.to raise_error KeyError
48
+ expect { generate_extractor }.to raise_error KeyError
45
49
  end
46
50
 
47
51
  it 'requires a username' do
48
52
  credentials.delete(:username)
49
- expect { sftp_file }.to raise_error KeyError
53
+ expect { generate_extractor }.to raise_error KeyError
50
54
  end
51
55
 
52
- it 'requires a password' do
56
+ it 'does not require a password' do # If empty, it will use private keys
53
57
  credentials.delete(:password)
54
- expect { sftp_file }.to raise_error KeyError
58
+ expect { generate_extractor }.not_to raise_error
55
59
  end
56
60
 
57
61
  it 'defaults to using port 22' do
58
- expect(sftp_file.port).to eq '22'
62
+ expect(extractor.port).to eq '22'
59
63
  end
60
64
 
61
65
  it 'allows the port to be defined in the credentials' do
62
66
  credentials[:port] = '1234'
63
- expect(sftp_file.port).to eq '1234'
67
+ expect(generate_extractor.port).to eq '1234'
64
68
  end
65
69
  end
66
70
 
67
71
  context '#all_entires' do
68
72
  it 'returns all entries' do
69
- expect(sftp_file.all_entries.map(&:name)).to eq remote_filenames
73
+ expect(extractor.all_entries.map(&:name)).to eq remote_filenames
70
74
  end
71
75
  end
72
76
 
73
77
  context '#extract' do
74
78
  it 'downloads files from the ftp' do
75
- expect(sftp_session).to receive(:download!).exactly(remote_filenames.size).times
76
- sftp_file.extract
79
+ expect(extractor.sftp_session).to receive(:download!).exactly(remote_filenames.size).times
80
+ extractor.extract
77
81
  end
78
82
 
79
83
  it 'creates local files with the right names' do
80
- allow(sftp_session).to receive(:download!)
81
- expect(sftp_file.extract.map { |f| Pathname.new(f).basename.to_s }).to eq remote_filenames
84
+ allow(extractor.sftp_session).to receive(:download!)
85
+ expect(extractor.extract.map { |f| Pathname.new(f).basename.to_s }).to eq remote_filenames
82
86
  end
83
87
  end
84
88
  end
85
89
 
86
90
 
87
91
  describe Loader::SftpFile do
88
- let(:loader) { Loader::SftpFile.new(credentials: {}, remote_path: 'some_path') }
92
+
93
+ let(:credentials) {
94
+ {
95
+ host: 'host',
96
+ username: 'username',
97
+ password: 'password'
98
+ }
99
+ }
100
+
101
+ let(:loader) { Loader::SftpFile.new(credentials: credentials, remote_path: 'some_path') }
89
102
  let(:data) { double('some_data') }
90
- let(:sftp_session) { instance_double('Net:SFTP::Session') }
91
103
 
92
104
  before do
93
- allow(Net::SFTP).to receive(:start).and_yield sftp_session
105
+ allow(loader).to receive(:begin_connection)
106
+
107
+ sftp_session = double('sftp_session')
108
+ allow(loader).to receive(:sftp_session).and_return(sftp_session)
94
109
  end
95
110
 
96
111
  it 'loads a csv to a target sftp filesystem' do
97
- expect(sftp_session).to receive(:upload!).with(data, 'some_path')
112
+ expect(loader.sftp_session).to receive(:upload!).with(data, 'some_path')
98
113
  loader.load data
99
114
  end
100
115
  end
@@ -253,5 +253,89 @@ describe Job do
253
253
  my_transform.execute
254
254
  end
255
255
  end
256
+
257
+ describe '#import - edge cases' do
258
+ before do
259
+ class MyJob
260
+ source :job_source do
261
+ fields({ :id => {}, :name => {} })
262
+ end
263
+ target :job_target do
264
+ fields({ :id => {}, :name => {}, :funny_name => {} })
265
+ end
266
+ end
267
+
268
+ job.job_source.df = Remi::DataFrame::Daru.new({
269
+ id: [1, 2, 3],
270
+ name: ['one', 'two', 'three']
271
+ })
272
+ end
273
+
274
+ it 'correctly maps back to a source if the sub transform sorts the data' do
275
+ sub_transform = Job::Transform.new('arbitrary') do
276
+ source :st_source, [:id, :name]
277
+ target :st_target, [:funny_name]
278
+
279
+ st_source.df.sort!([:id], ascending: [false])
280
+
281
+ Remi::SourceToTargetMap.apply(st_source.df, st_target.df) do
282
+ map source(:name) .target(:funny_name)
283
+ .transform(->(v) { "funny-#{v}" })
284
+ end
285
+ end
286
+
287
+ my_transform = Job::Transform.new(job) do
288
+ import sub_transform do
289
+ map_source_fields :job_source, :st_source, {
290
+ :id => :id,
291
+ :name => :name
292
+ }
293
+ map_target_fields :st_target, :job_source, {
294
+ :funny_name => :funny_name
295
+ }
296
+ end
297
+
298
+ job.job_target.df = job.job_source.df.dup
299
+ end
300
+
301
+ my_transform.execute
302
+ expect(job.job_target.df[:funny_name].to_a).to eq(
303
+ job.job_target.df[:name].to_a.map { |v| "funny-#{v}" }
304
+ )
305
+ end
306
+
307
+ it 'raises an error if the subtransform fucks with index', wip: true do
308
+ sub_transform = Job::Transform.new('arbitrary') do
309
+ source :st_source, [:id, :name]
310
+ target :st_target, [:funny_name]
311
+
312
+ duplicated_df = Daru::DataFrame.new({ id: Array(st_source.df[:id][0]) * 3 })
313
+
314
+ st_source.df = st_source.df.join(duplicated_df, on: [:id], how: :left)
315
+
316
+ Remi::SourceToTargetMap.apply(st_source.df, st_target.df) do
317
+ map source(:name) .target(:funny_name)
318
+ .transform(->(v) { "funny-#{v}" })
319
+ end
320
+ end
321
+
322
+ my_transform = Job::Transform.new(job) do
323
+ import sub_transform do
324
+ map_source_fields :job_source, :st_source, {
325
+ :id => :id,
326
+ :name => :name
327
+ }
328
+ map_target_fields :st_target, :job_source, {
329
+ :funny_name => :funny_name
330
+ }
331
+ end
332
+
333
+ job.job_target.df = job.job_source.df.dup
334
+ end
335
+
336
+ expect { my_transform.execute }.to raise_error Job::Transform::IncompatibleTargetIndexError
337
+ end
338
+ end
339
+
256
340
  end
257
341
  end
@@ -298,4 +298,34 @@ describe SourceToTargetMap do
298
298
  expect(sttm).to be_a(Remi::DataFrame::Daru)
299
299
  end
300
300
  end
301
+
302
+ describe 'source and target dataframes differ', wip: true do
303
+ it 'does not fail when the dataframe has been filtered' do
304
+ some_df = Daru::DataFrame.new(
305
+ {
306
+ :id => [1,2,3,4,5],
307
+ :something => ['x','','x','','x'],
308
+ :name => ['one', 'two', 'three', 'four', 'five']
309
+ }
310
+ )
311
+
312
+ filtered_df = some_df.where(some_df[:something].eq('x'))
313
+ target_df = Remi::DataFrame::Daru.new([])
314
+
315
+ Remi::SourceToTargetMap.apply(filtered_df, target_df) do
316
+ map source(:id) .target(:id)
317
+ map source(:name) .target(:name)
318
+ end
319
+
320
+ result = target_df[:id, :name].to_h.each_with_object({}) { |(k,v), h| h[k] = v.to_a }
321
+ expect(result).to eq({
322
+ :id => [1, 3, 5],
323
+ :name => ['one', 'three', 'five']
324
+ })
325
+ end
326
+
327
+
328
+ end
329
+
330
+
301
331
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: remi
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.2
4
+ version: 0.3.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Sterling Paramore
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-01-25 00:00:00.000000000 Z
11
+ date: 2017-06-28 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bond
@@ -231,6 +231,7 @@ files:
231
231
  - features/json.feature
232
232
  - features/metadata.feature
233
233
  - features/parameters.feature
234
+ - features/s3_file_target_job.feature
234
235
  - features/sample_job.feature
235
236
  - features/sftp_file_target_job.feature
236
237
  - features/step_definitions/remi_step.rb
@@ -254,6 +255,7 @@ files:
254
255
  - jobs/json_job.rb
255
256
  - jobs/metadata_job.rb
256
257
  - jobs/parameters_job.rb
258
+ - jobs/s3_file_target_job.rb
257
259
  - jobs/sample_job.rb
258
260
  - jobs/sftp_file_target_job.rb
259
261
  - jobs/sub_job_example_job.rb
@@ -372,6 +374,7 @@ test_files:
372
374
  - features/json.feature
373
375
  - features/metadata.feature
374
376
  - features/parameters.feature
377
+ - features/s3_file_target_job.feature
375
378
  - features/sample_job.feature
376
379
  - features/sftp_file_target_job.feature
377
380
  - features/step_definitions/remi_step.rb