remi 0.2.31 → 0.2.32

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 7c7c81a972d6fc604a4761b7fff19a26ec187065
4
- data.tar.gz: f07165f263dee57adb1802182cf72551d3326ab4
3
+ metadata.gz: 465e12569ac09f43f5e9f20fcf40ac4ee9f2e14c
4
+ data.tar.gz: 2c81e6feda132e5c3bc3aea5259f907a56bf0f6f
5
5
  SHA512:
6
- metadata.gz: d4ea4cde266110c374b75ef527f0ec71a674bba865c3da46107687fddb4871b3fda5aa601c44cd0fad5857038d8dff1c4dce297cf1d710ca335e2afafaa882dd
7
- data.tar.gz: 7a25d3e85b4ae727998ff196b79395db40bc0d584c04c061069c4d0a457bd79530f7a66b4ce3480df739ebfcef6dda7ef011c5e8171f76bac2326ee392e1d01a
6
+ metadata.gz: 8632e44d75be366bc3323567f597c73b5efb5e88a8fd07f94e1fa69e09a9a9c5b0f021f9f4ac13d7ae94030957f559136dfd09ad6f2626eb92743108f25cf307
7
+ data.tar.gz: 8657740cdb339d3bd311ec87d3aab8bbb63e005bd8bf07e5cac565bb7276417940e5885014dc95b8c5e3b6d69412b467cdeb0acff69ab981590cf884faa8948f
data/Gemfile CHANGED
@@ -5,3 +5,4 @@ gemspec
5
5
  gem 'daru', '0.1.2', git: 'git@github.com:inside-track/daru.git', branch: 'itk-master'
6
6
  gem 'restforce', '~> 2.1'
7
7
  gem 'salesforce_bulk_api', git: 'git@github.com:inside-track/salesforce_bulk_api.git', branch: 'master'
8
+ gem 'aws-sdk', '~> 2.3'
data/Gemfile.lock CHANGED
@@ -19,7 +19,7 @@ GIT
19
19
  PATH
20
20
  remote: .
21
21
  specs:
22
- remi (0.2.31)
22
+ remi (0.2.32)
23
23
  activesupport (~> 4.2)
24
24
  bond (~> 0.5)
25
25
  cucumber (~> 2.1)
@@ -40,6 +40,12 @@ GEM
40
40
  minitest (~> 5.1)
41
41
  thread_safe (~> 0.3, >= 0.3.4)
42
42
  tzinfo (~> 1.1)
43
+ aws-sdk (2.3.5)
44
+ aws-sdk-resources (= 2.3.5)
45
+ aws-sdk-core (2.3.5)
46
+ jmespath (~> 1.0)
47
+ aws-sdk-resources (2.3.5)
48
+ aws-sdk-core (= 2.3.5)
43
49
  bond (0.5.1)
44
50
  builder (3.2.2)
45
51
  clbustos-rtf (0.4.2)
@@ -70,7 +76,10 @@ GEM
70
76
  mimemagic (~> 0.3)
71
77
  multi_json (~> 1.11)
72
78
  rbczmq (~> 1.7)
79
+ jmespath (1.2.4)
80
+ json_pure (>= 1.8.1)
73
81
  json (1.8.3)
82
+ json_pure (1.8.3)
74
83
  mimemagic (0.3.1)
75
84
  minitest (5.8.4)
76
85
  multi_json (1.11.2)
@@ -128,6 +137,7 @@ PLATFORMS
128
137
  ruby
129
138
 
130
139
  DEPENDENCIES
140
+ aws-sdk (~> 2.3)
131
141
  daru (= 0.1.2)!
132
142
  iruby (= 0.2.7)
133
143
  remi!
@@ -334,6 +334,7 @@ end
334
334
  ### Transforms
335
335
 
336
336
  Then /^the target field '([^']+)' is a concatenation of the source fields '(.+)', delimited by "([^"]*)"$/ do |target_field, source_field_list, delimiter|
337
+ delimiter = delimiter.gsub(/(\\n|\\t)/, '\n' => "\n", '\t' => "\t")
337
338
  source_fields = "'#{source_field_list}'".gsub(' and ', ', ').split(',').map do |field_with_quotes|
338
339
  full_field_name = field_with_quotes.match(/'(.+)'/)[1]
339
340
 
@@ -71,7 +71,7 @@ module Remi
71
71
 
72
72
  def extractor=(arg)
73
73
  case arg
74
- when Extractor::SftpFile, Extractor::LocalFile
74
+ when Extractor::SftpFile, Extractor::LocalFile, Extractor::S3File
75
75
  @extractor = arg
76
76
  when String
77
77
  @extractor = Extractor::LocalFile.new(path: arg)
@@ -110,11 +110,11 @@ module Remi
110
110
 
111
111
  df_as_array_of_hashes = df.to_a[0] # This probably wouldn't work with a non-Daru df
112
112
  if @operation == :update
113
- Remi::SfBulkHelper::SfBulkUpdate.update(restforce_client, @sfo, df_as_array_of_hashes, logger: @logger)
113
+ Remi::SfBulkHelper::SfBulkUpdate.update(restforce_client, @sfo, df_as_array_of_hashes, batch_size: @batch_size, logger: @logger)
114
114
  elsif @operation == :create
115
- Remi::SfBulkHelper::SfBulkCreate.create(restforce_client, @sfo, df_as_array_of_hashes, logger: @logger)
115
+ Remi::SfBulkHelper::SfBulkCreate.create(restforce_client, @sfo, df_as_array_of_hashes, batch_size: @batch_size, logger: @logger)
116
116
  elsif @operation == :upsert
117
- Remi::SfBulkHelper::SfBulkUpsert.upsert(restforce_client, @sfo, df_as_array_of_hashes, external_id: @external_id, logger: @logger)
117
+ Remi::SfBulkHelper::SfBulkUpsert.upsert(restforce_client, @sfo, df_as_array_of_hashes, batch_size: @batch_size, external_id: @external_id, logger: @logger)
118
118
  else
119
119
  raise ArgumentError, "Unknown operation: #{@operation}"
120
120
  end
@@ -124,9 +124,10 @@ module Remi
124
124
 
125
125
  private
126
126
 
127
- def init_salesforce(*args, object:, operation:, credentials:, external_id: 'Id', api: :bulk, **kargs, &block)
127
+ def init_salesforce(*args, object:, operation:, credentials:, batch_size: 5000, external_id: 'Id', api: :bulk, **kargs, &block)
128
128
  @sfo = object
129
129
  @operation = operation
130
+ @batch_size = batch_size
130
131
  @external_id = external_id
131
132
  @credentials = credentials
132
133
  @api = api
@@ -0,0 +1,87 @@
1
+ module Remi
2
+ module Extractor
3
+
4
+ class FileSystemEntry
5
+ def initialize(pathname:, create_time:, modified_time:, raw: nil)
6
+ @pathname = Pathname.new(pathname)
7
+ @create_time = create_time
8
+ @modified_time = modified_time
9
+ @raw = raw
10
+ end
11
+
12
+ attr_reader :pathname, :create_time, :modified_time, :raw
13
+
14
+ def name
15
+ @pathname.basename.to_s
16
+ end
17
+ end
18
+
19
+
20
+ class FileSystem
21
+
22
+ class FileNotFoundError < StandardError; end
23
+
24
+ def initialize(*args, remote_path:, pattern: /.*/, local_path: Settings.work_dir, most_recent_only: false, group_by: nil, most_recent_by: :create_time, logger: Remi::Settings.logger, **kargs, &block)
25
+ @remote_path = Pathname.new(remote_path)
26
+ @pattern = pattern
27
+ @local_path = Pathname.new(local_path)
28
+ @most_recent_only = most_recent_only
29
+ @group_by = group_by
30
+ @most_recent_by = most_recent_by
31
+ @logger = logger
32
+ end
33
+
34
+ attr_reader :logger
35
+
36
+ # Public: Called to extract files from the source filesystem.
37
+ #
38
+ # Returns an array containing the paths to all files extracted.
39
+ def extract
40
+ raise NoMethodError, "#{__method__} not defined for#{self.class.name}"
41
+ end
42
+
43
+ # Public: Returns an array of all FileSystemEntry instances that are in the remote_path.
44
+ # NOTE: all_entries is responsible for matching the path using @remote_path
45
+ def all_entries
46
+ raise NoMethodError, "#{__method__} not defined for#{self.class.name}"
47
+ end
48
+
49
+ # Public: Returns just the entries that are to be extracted.
50
+ def entries
51
+ if @group_by
52
+ most_recent_matching_entry_in_group
53
+ elsif @most_recent_only
54
+ Array(most_recent_matching_entry)
55
+ else
56
+ matching_entries
57
+ end
58
+ end
59
+
60
+ def matching_entries
61
+ all_entries.select { |e| @pattern.match e.name }
62
+ end
63
+
64
+ def most_recent_matching_entry
65
+ matching_entries.sort_by { |e| e.send(@most_recent_by) }.reverse.first
66
+ end
67
+
68
+ def most_recent_matching_entry_in_group
69
+ entries_with_group = matching_entries.map do |entry|
70
+ match = entry.name.match(@group_by)
71
+ next unless match
72
+
73
+ group = match.to_a[1..-1]
74
+ { group: group, entry: entry }
75
+ end.compact
76
+ sorted_entries_with_group = entries_with_group.sort_by { |e| [e[:group], e[:entry].send(@most_recent_by)] }.reverse
77
+
78
+ last_group = nil
79
+ sorted_entries_with_group.map do |entry|
80
+ next unless entry[:group] != last_group
81
+ last_group = entry[:group]
82
+ entry[:entry]
83
+ end.compact
84
+ end
85
+ end
86
+ end
87
+ end
@@ -0,0 +1,57 @@
1
+ module Remi
2
+ module Extractor
3
+
4
+ class S3File < FileSystem
5
+
6
+ def initialize(*args, **kargs, &block)
7
+ super
8
+ init_s3_file(*args, **kargs, &block)
9
+ end
10
+
11
+ # Public: Called to extract files from the source filesystem.
12
+ #
13
+ # Returns an array containing the paths to all files extracted.
14
+ def extract
15
+ entries.map do |entry|
16
+ local_file = File.join(@local_path, entry.name)
17
+ @logger.info "Downloading #{entry.pathname} from S3 to #{local_file}"
18
+ File.open(local_file, 'wb') { |file| entry.raw.get(response_target: file) }
19
+ local_file
20
+ end
21
+ end
22
+
23
+ # Public: Returns an array of all FileSystemEntry instances that are in the remote_path.
24
+ def all_entries
25
+ @all_entries ||= all_entries!
26
+ end
27
+
28
+ def all_entries!
29
+ # S3 does not track anything like a create time, so use last modified for both
30
+ bucket.objects(prefix: @remote_path.to_s).map do |entry|
31
+ FileSystemEntry.new(
32
+ pathname: entry.key,
33
+ create_time: entry.last_modified,
34
+ modified_time: entry.last_modified,
35
+ raw: entry
36
+ )
37
+ end
38
+ end
39
+
40
+ def s3_client
41
+ @s3_client ||= Aws::S3::Client.new
42
+ end
43
+
44
+ private
45
+
46
+ def init_s3_file(*args, bucket:, **kargs)
47
+ @bucket_name = bucket
48
+ end
49
+
50
+ def bucket
51
+ @bucket ||= Aws::S3::Bucket.new(@bucket_name, client: s3_client)
52
+ end
53
+
54
+ end
55
+
56
+ end
57
+ end
@@ -0,0 +1,78 @@
1
+ module Remi
2
+ module Extractor
3
+
4
+ class SftpFileNew < FileSystem
5
+
6
+ N_RETRY = 3
7
+
8
+ def initialize(*args, **kargs)
9
+ super
10
+ init_sftp_file(*args, **kargs)
11
+ end
12
+
13
+ # Public: Called to extract files from the source filesystem.
14
+ #
15
+ # Returns an array containing the paths to all files extracted.
16
+ def extract
17
+ connection do |sftp|
18
+ entries.map do |entry|
19
+ local_file = File.join(@local_path, entry.name)
20
+ @logger.info "Downloading #{entry.name} to #{local_file}"
21
+ retry_download { sftp.download!(File.join(@remote_path, entry.name), local_file) }
22
+ local_file
23
+ end
24
+ end
25
+ end
26
+
27
+ # Public: Returns an array of all FileSystemEntry instances that are in the remote_path.
28
+ def all_entries
29
+ @all_entries ||= all_entries!
30
+ end
31
+
32
+ def all_entries!
33
+ sftp_entries = connection { |sftp| sftp.dir.entries(@remote_path.dirname) }
34
+ sftp_entries.map do |entry|
35
+ # Early versions of the protocol don't support create time, fake it with modified time?
36
+ FileSystemEntry.new(
37
+ name: File.join(@remote_path.dirname, entry.name),
38
+ create_time: entry.respond_to?(:createtime) ? entry.createtime : entry.mtime,
39
+ modified_time: entry.mtime
40
+ )
41
+ end
42
+ end
43
+
44
+
45
+ private
46
+
47
+ def init_sftp_file(*args, credentials:, **kargs)
48
+ @host = credentials.fetch(:host)
49
+ @username = credentials.fetch(:username)
50
+ @password = credentials.fetch(:password)
51
+ @port = credentials.fetch(:port, '22')
52
+ end
53
+
54
+ def connection(&block)
55
+ result = nil
56
+ Net::SFTP.start(@host, @username, password: @password, port: @port) do |sftp|
57
+ result = yield sftp
58
+ end
59
+ result
60
+ end
61
+
62
+ def retry_download(&block)
63
+ 1.upto(N_RETRY).each do |itry|
64
+ begin
65
+ block.call
66
+ break
67
+ rescue RuntimeError => err
68
+ raise err unless itry < ntry
69
+ @logger.error "Download failed with error: #{err.message}"
70
+ @logger.error "Retry attempt #{itry}/#{ntry-1}"
71
+ sleep(1)
72
+ end
73
+ end
74
+ end
75
+ end
76
+
77
+ end
78
+ end
data/lib/remi/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Remi
2
- VERSION = '0.2.31'
2
+ VERSION = '0.2.32'
3
3
  end
data/lib/remi.rb CHANGED
@@ -42,7 +42,10 @@ require 'remi/field_symbolizers'
42
42
 
43
43
  require 'remi/refinements/symbolizer'
44
44
 
45
- require 'remi/extractor/sftp_file'
45
+ require 'remi/extractor/sftp_file' # deprecated
46
+ require 'remi/extractor/file_system'
47
+ require 'remi/extractor/s3_file'
48
+
46
49
 
47
50
  require 'remi/fields'
48
51
  require 'remi/data_frame'
@@ -0,0 +1,138 @@
1
+ require 'remi_spec'
2
+
3
+ describe Extractor::FileSystem do
4
+ before do
5
+ now = Time.new
6
+
7
+ example_files = [
8
+ { pathname: "pdir/ApplicantsA-9.csv", create_time: now - 10.minutes },
9
+ { pathname: "pdir/ApplicantsA-3.csv", create_time: now - 5.minutes },
10
+ { pathname: "pdir/ApplicantsA-5.csv", create_time: now - 1.minutes },
11
+ { pathname: "pdir/ApplicantsB-7.csv", create_time: now - 10.minutes },
12
+ { pathname: "pdir/ApplicantsB-6.csv", create_time: now - 5.minutes },
13
+ { pathname: "pdir/ApplicantsB-2.csv", create_time: now - 1.minutes },
14
+ { pathname: "pdir/ApplicantsB-2.txt", create_time: now - 0.minutes },
15
+ { pathname: "pdir/Apples.csv", createtime: now - 1.minutes },
16
+ { pathname: "otherdir/ApplicantsA-11.csv", createtime: now - 1.minutes },
17
+ ]
18
+
19
+ remote_path = 'pdir'
20
+ allow_any_instance_of(Extractor::FileSystem).to receive(:all_entries) do
21
+ example_files.map do |entry|
22
+ Extractor::FileSystemEntry.new(
23
+ pathname: entry[:pathname],
24
+ create_time: entry[:create_time],
25
+ modified_time: entry[:create_time]
26
+ ) if Pathname.new(entry[:pathname]).dirname.to_s == remote_path
27
+ end.compact
28
+ end
29
+
30
+ @params = { remote_path: remote_path }
31
+ end
32
+
33
+ let(:file_system) { Extractor::FileSystem.new(**@params) }
34
+
35
+
36
+
37
+ context 'extracting all files matching a pattern' do
38
+ before do
39
+ @params.merge!({
40
+ pattern: /ApplicantsA-\d+\.csv/
41
+ })
42
+ end
43
+
44
+ it 'does not extract non-matching files' do
45
+ expect(file_system.entries.map(&:name)).not_to include "Apples.csv"
46
+ end
47
+
48
+ it 'does not extract files not in the target directory' do
49
+ expect(file_system.entries.map(&:name)).not_to include "ApplicantsA-11.csv"
50
+ end
51
+
52
+ it 'extracts all matching files' do
53
+ expect(file_system.entries.map(&:name)).to match_array([
54
+ "ApplicantsA-9.csv",
55
+ "ApplicantsA-3.csv",
56
+ "ApplicantsA-5.csv"
57
+ ])
58
+ end
59
+ end
60
+
61
+
62
+ context 'extracting only the most recent matching a pattern' do
63
+ before do
64
+ @params.merge!({
65
+ pattern: /ApplicantsA-\d+\.csv/,
66
+ most_recent_only: true
67
+ })
68
+ end
69
+
70
+ it 'extracts only the most recent matching file' do
71
+ expect(file_system.entries.map(&:name)).to match_array([
72
+ "ApplicantsA-5.csv"
73
+ ])
74
+ end
75
+
76
+ context 'using filename instead of createtime' do
77
+ before do
78
+ @params.merge!({
79
+ most_recent_by: :name
80
+ })
81
+ end
82
+
83
+ it 'extracts only the most recent matching file' do
84
+ expect(file_system.entries.map(&:name)).to match_array([
85
+ "ApplicantsA-9.csv"
86
+ ])
87
+ end
88
+ end
89
+ end
90
+
91
+
92
+ context 'extracting files matching a pattern with a by group' do
93
+ before do
94
+ @params.merge!({
95
+ pattern: /^Applicants(A|B)-\d+\.csv/,
96
+ group_by: /^Applicants(A|B)/
97
+ })
98
+ end
99
+
100
+ it 'extracts the most recent file that matches a particular regex' do
101
+ expect(file_system.entries.map(&:name)).to match_array([
102
+ "ApplicantsA-5.csv",
103
+ "ApplicantsB-2.csv"
104
+ ])
105
+ end
106
+
107
+ context 'with a minimally selective pre-filter' do
108
+ before do
109
+ @params.merge!({
110
+ pattern: /^Applicants/
111
+ })
112
+ end
113
+
114
+ it 'extracts the most recent file that matches a particular regex' do
115
+ expect(file_system.entries.map(&:name)).to match_array([
116
+ "ApplicantsA-5.csv",
117
+ "ApplicantsB-2.txt"
118
+ ])
119
+ end
120
+ end
121
+
122
+ context 'using filename instead of createtime' do
123
+ before do
124
+ @params.merge!({
125
+ most_recent_by: :name
126
+ })
127
+ end
128
+
129
+ it 'extracts only the most recent matching file' do
130
+ expect(file_system.entries.map(&:name)).to match_array([
131
+ "ApplicantsA-9.csv",
132
+ "ApplicantsB-7.csv"
133
+ ])
134
+ end
135
+ end
136
+
137
+ end
138
+ end
@@ -0,0 +1,25 @@
1
+ require 'remi_spec'
2
+ require 'aws-sdk'
3
+
4
+ describe Extractor::S3File do
5
+
6
+ before do
7
+ Aws.config[:s3] = {
8
+ stub_responses: true
9
+ }
10
+
11
+ prefix = "the-best-prefix"
12
+ @s3_file = Extractor::S3File.new(bucket: 'the-best-bucket', remote_path: "#{prefix}")
13
+ @s3_file.s3_client.stub_responses(:list_objects, {
14
+ contents: [
15
+ { key: "#{prefix}/file1.csv" },
16
+ { key: "#{prefix}/file2.csv" }
17
+ ]
18
+ })
19
+
20
+ end
21
+
22
+ it 'returns all entries' do
23
+ expect(@s3_file.all_entries.map(&:name)).to eq ['file1.csv', 'file2.csv']
24
+ end
25
+ end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: remi
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.31
4
+ version: 0.2.32
5
5
  platform: ruby
6
6
  authors:
7
7
  - Sterling Paramore
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-05-10 00:00:00.000000000 Z
11
+ date: 2016-05-23 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bond
@@ -231,7 +231,10 @@ files:
231
231
  - lib/remi/data_subject/postgres.rb
232
232
  - lib/remi/data_subject/salesforce.rb
233
233
  - lib/remi/data_subject/sftp_file.rb
234
+ - lib/remi/extractor/file_system.rb
235
+ - lib/remi/extractor/s3_file.rb
234
236
  - lib/remi/extractor/sftp_file.rb
237
+ - lib/remi/extractor/sftp_file_new.rb
235
238
  - lib/remi/field_symbolizers.rb
236
239
  - lib/remi/fields.rb
237
240
  - lib/remi/job.rb
@@ -242,6 +245,8 @@ files:
242
245
  - lib/remi/transform.rb
243
246
  - lib/remi/version.rb
244
247
  - remi.gemspec
248
+ - spec/extractor/file_system_spec.rb
249
+ - spec/extractor/s3_file_spec.rb
245
250
  - spec/extractor/sftp_file_spec.rb
246
251
  - spec/metadata_spec.rb
247
252
  - spec/remi_spec.rb
@@ -291,6 +296,8 @@ test_files:
291
296
  - features/transforms/prefix.feature
292
297
  - features/transforms/truncate.feature
293
298
  - features/transforms/truthy.feature
299
+ - spec/extractor/file_system_spec.rb
300
+ - spec/extractor/s3_file_spec.rb
294
301
  - spec/extractor/sftp_file_spec.rb
295
302
  - spec/metadata_spec.rb
296
303
  - spec/remi_spec.rb