remi 0.2.31 → 0.2.32

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 7c7c81a972d6fc604a4761b7fff19a26ec187065
4
- data.tar.gz: f07165f263dee57adb1802182cf72551d3326ab4
3
+ metadata.gz: 465e12569ac09f43f5e9f20fcf40ac4ee9f2e14c
4
+ data.tar.gz: 2c81e6feda132e5c3bc3aea5259f907a56bf0f6f
5
5
  SHA512:
6
- metadata.gz: d4ea4cde266110c374b75ef527f0ec71a674bba865c3da46107687fddb4871b3fda5aa601c44cd0fad5857038d8dff1c4dce297cf1d710ca335e2afafaa882dd
7
- data.tar.gz: 7a25d3e85b4ae727998ff196b79395db40bc0d584c04c061069c4d0a457bd79530f7a66b4ce3480df739ebfcef6dda7ef011c5e8171f76bac2326ee392e1d01a
6
+ metadata.gz: 8632e44d75be366bc3323567f597c73b5efb5e88a8fd07f94e1fa69e09a9a9c5b0f021f9f4ac13d7ae94030957f559136dfd09ad6f2626eb92743108f25cf307
7
+ data.tar.gz: 8657740cdb339d3bd311ec87d3aab8bbb63e005bd8bf07e5cac565bb7276417940e5885014dc95b8c5e3b6d69412b467cdeb0acff69ab981590cf884faa8948f
data/Gemfile CHANGED
@@ -5,3 +5,4 @@ gemspec
5
5
  gem 'daru', '0.1.2', git: 'git@github.com:inside-track/daru.git', branch: 'itk-master'
6
6
  gem 'restforce', '~> 2.1'
7
7
  gem 'salesforce_bulk_api', git: 'git@github.com:inside-track/salesforce_bulk_api.git', branch: 'master'
8
+ gem 'aws-sdk', '~> 2.3'
data/Gemfile.lock CHANGED
@@ -19,7 +19,7 @@ GIT
19
19
  PATH
20
20
  remote: .
21
21
  specs:
22
- remi (0.2.31)
22
+ remi (0.2.32)
23
23
  activesupport (~> 4.2)
24
24
  bond (~> 0.5)
25
25
  cucumber (~> 2.1)
@@ -40,6 +40,12 @@ GEM
40
40
  minitest (~> 5.1)
41
41
  thread_safe (~> 0.3, >= 0.3.4)
42
42
  tzinfo (~> 1.1)
43
+ aws-sdk (2.3.5)
44
+ aws-sdk-resources (= 2.3.5)
45
+ aws-sdk-core (2.3.5)
46
+ jmespath (~> 1.0)
47
+ aws-sdk-resources (2.3.5)
48
+ aws-sdk-core (= 2.3.5)
43
49
  bond (0.5.1)
44
50
  builder (3.2.2)
45
51
  clbustos-rtf (0.4.2)
@@ -70,7 +76,10 @@ GEM
70
76
  mimemagic (~> 0.3)
71
77
  multi_json (~> 1.11)
72
78
  rbczmq (~> 1.7)
79
+ jmespath (1.2.4)
80
+ json_pure (>= 1.8.1)
73
81
  json (1.8.3)
82
+ json_pure (1.8.3)
74
83
  mimemagic (0.3.1)
75
84
  minitest (5.8.4)
76
85
  multi_json (1.11.2)
@@ -128,6 +137,7 @@ PLATFORMS
128
137
  ruby
129
138
 
130
139
  DEPENDENCIES
140
+ aws-sdk (~> 2.3)
131
141
  daru (= 0.1.2)!
132
142
  iruby (= 0.2.7)
133
143
  remi!
@@ -334,6 +334,7 @@ end
334
334
  ### Transforms
335
335
 
336
336
  Then /^the target field '([^']+)' is a concatenation of the source fields '(.+)', delimited by "([^"]*)"$/ do |target_field, source_field_list, delimiter|
337
# Feature files spell newline/tab delimiters as the two-character escape
# sequences \n and \t; translate each one into the real control character.
delimiter = delimiter.gsub(/(\\n|\\t)/, '\n' => "\n", '\t' => "\t")
337
338
  source_fields = "'#{source_field_list}'".gsub(' and ', ', ').split(',').map do |field_with_quotes|
338
339
  full_field_name = field_with_quotes.match(/'(.+)'/)[1]
339
340
 
@@ -71,7 +71,7 @@ module Remi
71
71
 
72
72
  def extractor=(arg)
73
73
  case arg
74
- when Extractor::SftpFile, Extractor::LocalFile
74
+ when Extractor::SftpFile, Extractor::LocalFile, Extractor::S3File
75
75
  @extractor = arg
76
76
  when String
77
77
  @extractor = Extractor::LocalFile.new(path: arg)
@@ -110,11 +110,11 @@ module Remi
110
110
 
111
111
  df_as_array_of_hashes = df.to_a[0] # This probably wouldn't work with a non-Daru df
112
112
  if @operation == :update
113
- Remi::SfBulkHelper::SfBulkUpdate.update(restforce_client, @sfo, df_as_array_of_hashes, logger: @logger)
113
+ Remi::SfBulkHelper::SfBulkUpdate.update(restforce_client, @sfo, df_as_array_of_hashes, batch_size: @batch_size, logger: @logger)
114
114
  elsif @operation == :create
115
- Remi::SfBulkHelper::SfBulkCreate.create(restforce_client, @sfo, df_as_array_of_hashes, logger: @logger)
115
+ Remi::SfBulkHelper::SfBulkCreate.create(restforce_client, @sfo, df_as_array_of_hashes, batch_size: @batch_size, logger: @logger)
116
116
  elsif @operation == :upsert
117
- Remi::SfBulkHelper::SfBulkUpsert.upsert(restforce_client, @sfo, df_as_array_of_hashes, external_id: @external_id, logger: @logger)
117
+ Remi::SfBulkHelper::SfBulkUpsert.upsert(restforce_client, @sfo, df_as_array_of_hashes, batch_size: @batch_size, external_id: @external_id, logger: @logger)
118
118
  else
119
119
  raise ArgumentError, "Unknown operation: #{@operation}"
120
120
  end
@@ -124,9 +124,10 @@ module Remi
124
124
 
125
125
  private
126
126
 
127
- def init_salesforce(*args, object:, operation:, credentials:, external_id: 'Id', api: :bulk, **kargs, &block)
127
+ def init_salesforce(*args, object:, operation:, credentials:, batch_size: 5000, external_id: 'Id', api: :bulk, **kargs, &block)
128
128
  @sfo = object
129
129
  @operation = operation
130
+ @batch_size = batch_size
130
131
  @external_id = external_id
131
132
  @credentials = credentials
132
133
  @api = api
@@ -0,0 +1,87 @@
1
+ module Remi
2
+ module Extractor
3
+
4
# Value object describing one file found on a (possibly remote) file system.
#
# pathname      - String or Pathname locating the file; stored as a Pathname.
# create_time   - Creation timestamp supplied by the caller.
# modified_time - Last-modified timestamp supplied by the caller.
# raw           - Optional backend-specific object for the entry (default: nil).
class FileSystemEntry
  attr_reader :pathname, :create_time, :modified_time, :raw

  def initialize(pathname:, create_time:, modified_time:, raw: nil)
    @raw = raw
    @modified_time = modified_time
    @create_time = create_time
    @pathname = Pathname.new(pathname)
  end

  # Public: Returns the final path component (the file name) as a String.
  def name
    pathname.basename.to_s
  end
end
18
+
19
+
20
+ class FileSystem
21
+
22
+ class FileNotFoundError < StandardError; end
23
+
24
+ def initialize(*args, remote_path:, pattern: /.*/, local_path: Settings.work_dir, most_recent_only: false, group_by: nil, most_recent_by: :create_time, logger: Remi::Settings.logger, **kargs, &block)
25
+ @remote_path = Pathname.new(remote_path)
26
+ @pattern = pattern
27
+ @local_path = Pathname.new(local_path)
28
+ @most_recent_only = most_recent_only
29
+ @group_by = group_by
30
+ @most_recent_by = most_recent_by
31
+ @logger = logger
32
+ end
33
+
34
+ attr_reader :logger
35
+
36
+ # Public: Called to extract files from the source filesystem.
37
+ #
38
+ # Returns an array with containing the paths to all files extracted.
39
+ def extract
40
+ raise NoMethodError, "#{__method__} not defined for#{self.class.name}"
41
+ end
42
+
43
+ # Public: Returns an array of all FileSystemEntry instances that are in the remote_path.
44
+ # NOTE: all_entries is responsible for matching the path using @remote_path
45
+ def all_entries
46
+ raise NoMethodError, "#{__method__} not defined for#{self.class.name}"
47
+ end
48
+
49
+ # Public: Returns just the entries that are to be extracted.
50
+ def entries
51
+ if @group_by
52
+ most_recent_matching_entry_in_group
53
+ elsif @most_recent_only
54
+ Array(most_recent_matching_entry)
55
+ else
56
+ matching_entries
57
+ end
58
+ end
59
+
60
+ def matching_entries
61
+ all_entries.select { |e| @pattern.match e.name }
62
+ end
63
+
64
+ def most_recent_matching_entry
65
+ matching_entries.sort_by { |e| e.send(@most_recent_by) }.reverse.first
66
+ end
67
+
68
+ def most_recent_matching_entry_in_group
69
+ entries_with_group = matching_entries.map do |entry|
70
+ match = entry.name.match(@group_by)
71
+ next unless match
72
+
73
+ group = match.to_a[1..-1]
74
+ { group: group, entry: entry }
75
+ end.compact
76
+ sorted_entries_with_group = entries_with_group.sort_by { |e| [e[:group], e[:entry].send(@most_recent_by)] }.reverse
77
+
78
+ last_group = nil
79
+ sorted_entries_with_group.map do |entry|
80
+ next unless entry[:group] != last_group
81
+ last_group = entry[:group]
82
+ entry[:entry]
83
+ end.compact
84
+ end
85
+ end
86
+ end
87
+ end
@@ -0,0 +1,57 @@
1
+ module Remi
2
+ module Extractor
3
+
4
# Extractor that downloads objects from an S3 bucket into the local path.
#
# Accepts every FileSystem option plus:
# bucket - Name of the S3 bucket to read from (required).
class S3File < FileSystem

  def initialize(*args, **kargs, &block)
    super
    init_s3_file(*args, **kargs, &block)
  end

  # Public: Called to extract files from the source filesystem.
  #
  # Returns an array containing the local paths of all files extracted.
  def extract
    entries.map do |s3_entry|
      destination = File.join(@local_path, s3_entry.name)
      @logger.info "Downloading #{s3_entry.pathname} from S3 to #{destination}"
      File.open(destination, 'wb') do |io|
        # raw is the object yielded by the bucket listing; stream it into the file
        s3_entry.raw.get(response_target: io)
      end
      destination
    end
  end

  # Public: Returns an array of all FileSystemEntry instances that are in the remote_path.
  # The listing is fetched once and memoized; use all_entries! to refresh.
  def all_entries
    @all_entries ||= all_entries!
  end

  # Public: Lists the bucket (using remote_path as the key prefix) and wraps
  # each listed object in a FileSystemEntry, bypassing the memoized value.
  def all_entries!
    listing = bucket.objects(prefix: @remote_path.to_s)
    listing.map do |s3_object|
      # S3 does not track anything like a create time, so use last modified for both
      FileSystemEntry.new(
        pathname: s3_object.key,
        create_time: s3_object.last_modified,
        modified_time: s3_object.last_modified,
        raw: s3_object
      )
    end
  end

  # Public: Returns the memoized S3 client, constructed with no explicit options.
  def s3_client
    @s3_client ||= Aws::S3::Client.new
  end

  private

  # Internal: Captures the S3-specific options; everything else is ignored here.
  def init_s3_file(*args, bucket:, **kargs)
    @bucket_name = bucket
  end

  # Internal: Returns the memoized bucket resource bound to s3_client.
  def bucket
    @bucket ||= Aws::S3::Bucket.new(@bucket_name, client: s3_client)
  end

end
55
+
56
+ end
57
+ end
@@ -0,0 +1,78 @@
1
+ module Remi
2
+ module Extractor
3
+
4
# Extractor that downloads files from a directory on an SFTP server.
#
# Accepts every FileSystem option plus:
# credentials - Hash with :host, :username and :password (all required) and
#               an optional :port (default: '22').
class SftpFileNew < FileSystem

  # Maximum number of download attempts per file.
  N_RETRY = 3

  def initialize(*args, **kargs)
    super
    init_sftp_file(*args, **kargs)
  end

  # Public: Called to extract files from the source filesystem.
  #
  # Returns an array containing the local paths of all files extracted.
  def extract
    connection do |sftp|
      entries.map do |entry|
        local_file = File.join(@local_path, entry.name)
        @logger.info "Downloading #{entry.name} to #{local_file}"
        retry_download { sftp.download!(File.join(@remote_path, entry.name), local_file) }
        local_file
      end
    end
  end

  # Public: Returns an array of all FileSystemEntry instances that are in the remote_path.
  # The listing is fetched once and memoized; use all_entries! to refresh.
  def all_entries
    @all_entries ||= all_entries!
  end

  # Public: Lists remote_path on the server and wraps each directory entry in
  # a FileSystemEntry, bypassing the memoized value.
  def all_entries!
    # List the same directory that #extract downloads from.
    sftp_entries = connection { |sftp| sftp.dir.entries(@remote_path.to_s) }
    sftp_entries.map do |entry|
      attributes = entry.attributes
      # Early versions of the protocol don't support create time, fake it with modified time?
      FileSystemEntry.new(
        pathname: File.join(@remote_path, entry.name),
        create_time: attributes.respond_to?(:createtime) ? attributes.createtime : attributes.mtime,
        modified_time: attributes.mtime
      )
    end
  end


  private

  # Internal: Pulls the connection settings out of the credentials hash.
  # Raises KeyError when :host, :username or :password is missing.
  def init_sftp_file(*args, credentials:, **kargs)
    @host     = credentials.fetch(:host)
    @username = credentials.fetch(:username)
    @password = credentials.fetch(:password)
    @port     = credentials.fetch(:port, '22')
  end

  # Internal: Opens an SFTP session, yields it, and returns the block's value.
  # A new session is started on every call.
  def connection(&block)
    result = nil
    Net::SFTP.start(@host, @username, password: @password, port: @port) do |sftp|
      result = yield sftp
    end
    result
  end

  # Internal: Runs the block, retrying after a one second pause when it
  # raises RuntimeError. The error from the final (N_RETRY-th) attempt is
  # re-raised to the caller.
  def retry_download(&block)
    1.upto(N_RETRY).each do |itry|
      begin
        block.call
        break
      rescue RuntimeError => err
        raise err unless itry < N_RETRY
        @logger.error "Download failed with error: #{err.message}"
        @logger.error "Retry attempt #{itry}/#{N_RETRY - 1}"
        sleep(1)
      end
    end
  end
end
76
+
77
+ end
78
+ end
data/lib/remi/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Remi
2
- VERSION = '0.2.31'
2
+ VERSION = '0.2.32'
3
3
  end
data/lib/remi.rb CHANGED
@@ -42,7 +42,10 @@ require 'remi/field_symbolizers'
42
42
 
43
43
  require 'remi/refinements/symbolizer'
44
44
 
45
- require 'remi/extractor/sftp_file'
45
+ require 'remi/extractor/sftp_file' # deprecated
46
+ require 'remi/extractor/file_system'
47
+ require 'remi/extractor/s3_file'
48
+
46
49
 
47
50
  require 'remi/fields'
48
51
  require 'remi/data_frame'
@@ -0,0 +1,138 @@
1
+ require 'remi_spec'
2
+
3
# Exercises the entry-selection logic of Extractor::FileSystem (pattern
# filtering, most-recent selection and per-group selection) against a
# stubbed #all_entries.
describe Extractor::FileSystem do
  before do
    now = Time.new

    # Every fixture row must use the :create_time key, because that is the key
    # read when the FileSystemEntry instances are built below.
    example_files = [
      { pathname: "pdir/ApplicantsA-9.csv", create_time: now - 10.minutes },
      { pathname: "pdir/ApplicantsA-3.csv", create_time: now - 5.minutes },
      { pathname: "pdir/ApplicantsA-5.csv", create_time: now - 1.minutes },
      { pathname: "pdir/ApplicantsB-7.csv", create_time: now - 10.minutes },
      { pathname: "pdir/ApplicantsB-6.csv", create_time: now - 5.minutes },
      { pathname: "pdir/ApplicantsB-2.csv", create_time: now - 1.minutes },
      { pathname: "pdir/ApplicantsB-2.txt", create_time: now - 0.minutes },
      { pathname: "pdir/Apples.csv", create_time: now - 1.minutes },
      { pathname: "otherdir/ApplicantsA-11.csv", create_time: now - 1.minutes },
    ]

    remote_path = 'pdir'
    # Stand in for a concrete file system: only files whose directory equals
    # remote_path are reported, mirroring what a real #all_entries must do.
    allow_any_instance_of(Extractor::FileSystem).to receive(:all_entries) do
      example_files.map do |entry|
        Extractor::FileSystemEntry.new(
          pathname: entry[:pathname],
          create_time: entry[:create_time],
          modified_time: entry[:create_time]
        ) if Pathname.new(entry[:pathname]).dirname.to_s == remote_path
      end.compact
    end

    @params = { remote_path: remote_path }
  end

  # Built lazily so nested `before` hooks can extend @params first.
  let(:file_system) { Extractor::FileSystem.new(**@params) }



  context 'extracting all files matching a pattern' do
    before do
      @params.merge!({
        pattern: /ApplicantsA-\d+\.csv/
      })
    end

    it 'does not extract non-matching files' do
      expect(file_system.entries.map(&:name)).not_to include "Apples.csv"
    end

    it 'does not extract files not in the target directory' do
      expect(file_system.entries.map(&:name)).not_to include "ApplicantsA-11.csv"
    end

    it 'extracts all matching files' do
      expect(file_system.entries.map(&:name)).to match_array([
        "ApplicantsA-9.csv",
        "ApplicantsA-3.csv",
        "ApplicantsA-5.csv"
      ])
    end
  end


  context 'extracting only the most recent matching a pattern' do
    before do
      @params.merge!({
        pattern: /ApplicantsA-\d+\.csv/,
        most_recent_only: true
      })
    end

    it 'extracts only the most recent matching file' do
      expect(file_system.entries.map(&:name)).to match_array([
        "ApplicantsA-5.csv"
      ])
    end

    context 'using filename instead of createtime' do
      before do
        @params.merge!({
          most_recent_by: :name
        })
      end

      it 'extracts only the most recent matching file' do
        expect(file_system.entries.map(&:name)).to match_array([
          "ApplicantsA-9.csv"
        ])
      end
    end
  end


  context 'extracting files matching a pattern with a by group' do
    before do
      @params.merge!({
        pattern: /^Applicants(A|B)-\d+\.csv/,
        group_by: /^Applicants(A|B)/
      })
    end

    it 'extracts the most recent file that matches a particular regex' do
      expect(file_system.entries.map(&:name)).to match_array([
        "ApplicantsA-5.csv",
        "ApplicantsB-2.csv"
      ])
    end

    context 'with a minimally selective pre-filter' do
      before do
        @params.merge!({
          pattern: /^Applicants/
        })
      end

      # The looser pattern lets the newer .txt file into group B.
      it 'extracts the most recent file that matches a particular regex' do
        expect(file_system.entries.map(&:name)).to match_array([
          "ApplicantsA-5.csv",
          "ApplicantsB-2.txt"
        ])
      end
    end

    context 'using filename instead of createtime' do
      before do
        @params.merge!({
          most_recent_by: :name
        })
      end

      it 'extracts only the most recent matching file' do
        expect(file_system.entries.map(&:name)).to match_array([
          "ApplicantsA-9.csv",
          "ApplicantsB-7.csv"
        ])
      end
    end

  end
end
@@ -0,0 +1,25 @@
1
+ require 'remi_spec'
2
+ require 'aws-sdk'
3
+
4
# Checks that Extractor::S3File turns a (stubbed) bucket listing into entries.
describe Extractor::S3File do

  before do
    # Make every S3 client return stubbed responses instead of calling AWS.
    Aws.config[:s3] = { stub_responses: true }

    key_prefix = "the-best-prefix"
    stubbed_listing = {
      contents: [
        { key: "#{key_prefix}/file1.csv" },
        { key: "#{key_prefix}/file2.csv" }
      ]
    }

    @s3_file = Extractor::S3File.new(bucket: 'the-best-bucket', remote_path: key_prefix)
    @s3_file.s3_client.stub_responses(:list_objects, stubbed_listing)
  end

  it 'returns all entries' do
    entry_names = @s3_file.all_entries.map(&:name)
    expect(entry_names).to eq ['file1.csv', 'file2.csv']
  end
end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: remi
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.31
4
+ version: 0.2.32
5
5
  platform: ruby
6
6
  authors:
7
7
  - Sterling Paramore
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-05-10 00:00:00.000000000 Z
11
+ date: 2016-05-23 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bond
@@ -231,7 +231,10 @@ files:
231
231
  - lib/remi/data_subject/postgres.rb
232
232
  - lib/remi/data_subject/salesforce.rb
233
233
  - lib/remi/data_subject/sftp_file.rb
234
+ - lib/remi/extractor/file_system.rb
235
+ - lib/remi/extractor/s3_file.rb
234
236
  - lib/remi/extractor/sftp_file.rb
237
+ - lib/remi/extractor/sftp_file_new.rb
235
238
  - lib/remi/field_symbolizers.rb
236
239
  - lib/remi/fields.rb
237
240
  - lib/remi/job.rb
@@ -242,6 +245,8 @@ files:
242
245
  - lib/remi/transform.rb
243
246
  - lib/remi/version.rb
244
247
  - remi.gemspec
248
+ - spec/extractor/file_system_spec.rb
249
+ - spec/extractor/s3_file_spec.rb
245
250
  - spec/extractor/sftp_file_spec.rb
246
251
  - spec/metadata_spec.rb
247
252
  - spec/remi_spec.rb
@@ -291,6 +296,8 @@ test_files:
291
296
  - features/transforms/prefix.feature
292
297
  - features/transforms/truncate.feature
293
298
  - features/transforms/truthy.feature
299
+ - spec/extractor/file_system_spec.rb
300
+ - spec/extractor/s3_file_spec.rb
294
301
  - spec/extractor/sftp_file_spec.rb
295
302
  - spec/metadata_spec.rb
296
303
  - spec/remi_spec.rb