remi 0.2.35 → 0.2.36
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/features/sample_job.feature +0 -4
- data/features/step_definitions/remi_step.rb +21 -19
- data/features/support/env.rb +9 -0
- data/jobs/sample_job.rb +2 -2
- data/lib/remi.rb +2 -1
- data/lib/remi/cucumber/business_rules.rb +8 -85
- data/lib/remi/data_subject/csv_file.rb +3 -10
- data/lib/remi/extractor/file_system.rb +6 -1
- data/lib/remi/extractor/local_file.rb +43 -0
- data/lib/remi/extractor/sftp_file.rb +41 -94
- data/lib/remi/fields.rb +21 -0
- data/lib/remi/settings.rb +8 -0
- data/lib/remi/transform.rb +2 -2
- data/lib/remi/version.rb +1 -1
- data/spec/data_subject/csv_file_spec.rb +6 -5
- data/spec/extractor/local_file_spec.rb +31 -0
- data/spec/extractor/sftp_file_spec.rb +56 -97
- data/spec/fields_spec.rb +97 -0
- data/spec/transform_spec.rb +53 -0
- metadata +9 -3
- data/lib/remi/extractor/sftp_file_new.rb +0 -78
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0b43c0f454a9c3df185534347b76d5d7ac0a37c4
|
4
|
+
data.tar.gz: 024381d3a8b2da98b1de66fddd14c0929a304d3d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: bc680e8048b300f645c013a3f261506b6c4e1efdef0f240f37eebb0d393f2cc948de06d53d1d3912122011ba8d2755ac8e851c6399af79c13d68b467389a1118
|
7
|
+
data.tar.gz: cb5f72b9db98cdf7b79fcaee6a1597e9259b805779b93655ba0738b5e8a14240a0e57cc6078b92cdc2c414f21eb3af034eb78a185e974f3c28707667ea80f792
|
data/Gemfile.lock
CHANGED
data/features/sample_job.feature
CHANGED
@@ -33,10 +33,6 @@ Feature: This is a sample feature file.
|
|
33
33
|
And files with names matching the pattern /^SampleFile_(\d+)\.txt/
|
34
34
|
Then the file with the latest date stamp will be downloaded for processing
|
35
35
|
|
36
|
-
Given files with names that do not match the pattern /^SampleFile_(\d+)\.txt/
|
37
|
-
Then no files will be downloaded for processing
|
38
|
-
|
39
|
-
|
40
36
|
Scenario: In order to be parsed and properly processed, the file must conform
|
41
37
|
to expectations about its structure and content.
|
42
38
|
|
@@ -44,34 +44,23 @@ end
|
|
44
44
|
### Source file processing
|
45
45
|
|
46
46
|
Given /^files with names matching the pattern \/(.*)\/$/ do |pattern|
|
47
|
-
@brt.
|
47
|
+
expect(@brt.source.data_subject.extractor.pattern).to eq Regexp.new(pattern)
|
48
48
|
end
|
49
49
|
|
50
|
-
Given /^
|
51
|
-
@brt.
|
52
|
-
end
|
53
|
-
|
54
|
-
Given /^files delivered within the last (\d+) hours$/ do |hours|
|
55
|
-
@brt.filestore.delivered_since(Time.now - hours.to_i * 3600)
|
56
|
-
end
|
57
|
-
|
58
|
-
Given /^files were delivered more than (\d+) hours ago$/ do |hours|
|
59
|
-
@brt.filestore.delivered_before(Time.now - hours.to_i * 3600)
|
50
|
+
Given /^download groups defined by the pattern \/(.*)\/$/ do |pattern|
|
51
|
+
expect(@brt.source.data_subject.extractor.group_by).to eq Regexp.new(pattern)
|
60
52
|
end
|
61
53
|
|
62
54
|
Then /^the file with the latest date stamp will be downloaded for processing$/ do
|
63
|
-
@brt.
|
64
|
-
@brt.source.mock_extractor(@brt.filestore)
|
65
|
-
expect(@brt.source.extract).to match_array Array(@brt.filestore.latest)
|
55
|
+
expect(@brt.source.data_subject.extractor.most_recent_by).to eq :create_time
|
66
56
|
end
|
67
57
|
|
68
|
-
Then /^files will be downloaded for processing$/ do
|
58
|
+
Then /^all files matching the pattern will be downloaded for processing$/ do
|
59
|
+
expect(@brt.source.data_subject.extractor.most_recent_only).to eq false
|
69
60
|
end
|
70
61
|
|
71
|
-
Then /^
|
72
|
-
@brt.
|
73
|
-
@brt.source.mock_extractor(@brt.filestore)
|
74
|
-
expect { @brt.source.extract }.to raise_error Remi::Extractor::SftpFile::FileNotFoundError
|
62
|
+
Then /^the file that comes last in an alphanumeric sort by group will be downloaded for processing$/ do
|
63
|
+
expect(@brt.source.data_subject.extractor.most_recent_by).to eq :name
|
75
64
|
end
|
76
65
|
|
77
66
|
Then /^the file is uploaded to the remote path "([^"]+)"$/ do |remote_path|
|
@@ -150,6 +139,19 @@ Given /^the source field (?:has|is set to) the value "([^"]*)"$/ do |value|
|
|
150
139
|
end
|
151
140
|
end
|
152
141
|
|
142
|
+
Given /^the source field '([^']+)' (?:has|is set to) the json value$/ do |source_field, value|
|
143
|
+
step "the source field '#{source_field}'"
|
144
|
+
|
145
|
+
source_name, source_field_name = @brt.sources.parse_full_field(source_field)
|
146
|
+
@brt.sources[source_name].fields[source_field_name].value = JSON.parse(value)
|
147
|
+
end
|
148
|
+
|
149
|
+
Given /^the source field (?:has|is set to) the json value$/ do |value|
|
150
|
+
@brt.sources.fields.each do |field|
|
151
|
+
step "the source field '#{field.full_name}' is set to the json value \"#{value}\""
|
152
|
+
end
|
153
|
+
end
|
154
|
+
|
153
155
|
When /^the source field '([^']+)' (?:has an empty value|is blank)$/ do |source_field|
|
154
156
|
step "the source field '#{source_field}'"
|
155
157
|
|
data/features/support/env.rb
CHANGED
@@ -6,11 +6,20 @@ require 'remi'
|
|
6
6
|
require 'remi/cucumber'
|
7
7
|
|
8
8
|
Remi::Settings.log_level = Logger::ERROR
|
9
|
+
Remi::Settings.jobs_dir = File.join(__dir__, '../../jobs')
|
9
10
|
|
10
11
|
Before do
|
11
12
|
# Restart the random number generator prior to each scenario to
|
12
13
|
# ensure we have reproducibility of random output
|
13
14
|
Kernel.srand(35983958269835333)
|
15
|
+
|
16
|
+
# Monkey patch faker gem so that dummy random dates and ranges are generated consistently
|
17
|
+
class Faker::Base
|
18
|
+
def self.rand_in_range(from, to)
|
19
|
+
from, to = to, from if to < from
|
20
|
+
Random.rand(from..to)
|
21
|
+
end
|
22
|
+
end
|
14
23
|
end
|
15
24
|
|
16
25
|
After do
|
data/jobs/sample_job.rb
CHANGED
@@ -27,8 +27,8 @@ class SampleJob
|
|
27
27
|
define_source :sample_file, Remi::DataSource::CsvFile,
|
28
28
|
extractor: Remi::Extractor::SftpFile.new(
|
29
29
|
credentials: params[:sftp],
|
30
|
-
|
31
|
-
|
30
|
+
remote_path: '/',
|
31
|
+
pattern: /^SampleFile_(\d+)\.txt/,
|
32
32
|
most_recent_only: true
|
33
33
|
),
|
34
34
|
csv_options: {
|
data/lib/remi.rb
CHANGED
@@ -42,8 +42,9 @@ require 'remi/field_symbolizers'
|
|
42
42
|
|
43
43
|
require 'remi/refinements/symbolizer'
|
44
44
|
|
45
|
-
require 'remi/extractor/sftp_file' # deprecated
|
46
45
|
require 'remi/extractor/file_system'
|
46
|
+
require 'remi/extractor/local_file'
|
47
|
+
require 'remi/extractor/sftp_file'
|
47
48
|
require 'remi/extractor/s3_file'
|
48
49
|
|
49
50
|
|
@@ -113,6 +113,7 @@ module Remi::BusinessRules
|
|
113
113
|
|
114
114
|
def initialize(job_name)
|
115
115
|
job_class_name = "#{job_name.gsub(/\s/,'')}Job"
|
116
|
+
require_job_file(job_class_name)
|
116
117
|
@job = Object.const_get(job_class_name).new
|
117
118
|
|
118
119
|
@job_sources = DataSubjectCollection.new
|
@@ -121,8 +122,6 @@ module Remi::BusinessRules
|
|
121
122
|
@sources = DataSubjectCollection.new
|
122
123
|
@targets = DataSubjectCollection.new
|
123
124
|
@examples = DataExampleCollection.new
|
124
|
-
|
125
|
-
@filestore = Filestore.new
|
126
125
|
end
|
127
126
|
|
128
127
|
attr_reader :job
|
@@ -131,7 +130,13 @@ module Remi::BusinessRules
|
|
131
130
|
attr_reader :sources
|
132
131
|
attr_reader :targets
|
133
132
|
attr_reader :examples
|
134
|
-
|
133
|
+
|
134
|
+
def require_job_file(job_class_name)
|
135
|
+
job_file = Dir["#{Remi::Settings.jobs_dir}/**/*_job.rb"].map do |fname|
|
136
|
+
fname if File.basename(fname) == "#{job_class_name.underscore}.rb"
|
137
|
+
end.compact.pop
|
138
|
+
require job_file
|
139
|
+
end
|
135
140
|
|
136
141
|
def add_job_source(name)
|
137
142
|
raise "Unknown source #{name} for job" unless @job.methods.include? name.symbolize
|
@@ -396,17 +401,6 @@ module Remi::BusinessRules
|
|
396
401
|
@data_subject.df[vector_name].recode! { |v| i += 1 }
|
397
402
|
end
|
398
403
|
|
399
|
-
def mock_extractor(filestore)
|
400
|
-
extractor = class << @data_subject.extractor; self; end
|
401
|
-
|
402
|
-
extractor.send(:define_method, :all_entries, ->() { filestore.sftp_entries })
|
403
|
-
extractor.send(:define_method, :download, ->(to_download) { to_download.map { |e| e.name } })
|
404
|
-
end
|
405
|
-
|
406
|
-
def extract
|
407
|
-
@data_subject.extractor.extract
|
408
|
-
end
|
409
|
-
|
410
404
|
def csv_options
|
411
405
|
@data_subject.csv_options
|
412
406
|
end
|
@@ -558,75 +552,4 @@ module Remi::BusinessRules
|
|
558
552
|
end
|
559
553
|
end
|
560
554
|
end
|
561
|
-
|
562
|
-
|
563
|
-
class Filestore
|
564
|
-
def initialize
|
565
|
-
@files = []
|
566
|
-
@delivered = {}
|
567
|
-
end
|
568
|
-
|
569
|
-
attr_reader :sftp_entries
|
570
|
-
|
571
|
-
def pattern(pattern)
|
572
|
-
@pattern = pattern
|
573
|
-
end
|
574
|
-
|
575
|
-
def anti_pattern(pattern)
|
576
|
-
@pattern = /^ThisBetterNeverMatchAnythingOrIWillShootYou\d{8}Times$/
|
577
|
-
end
|
578
|
-
|
579
|
-
def delivered_since(date_time)
|
580
|
-
@delivered = { :since => date_time }
|
581
|
-
end
|
582
|
-
|
583
|
-
def delivered_before(date_time)
|
584
|
-
@delivered = { :before => date_time }
|
585
|
-
end
|
586
|
-
|
587
|
-
def latest
|
588
|
-
@files.max_by { |f| f[:attributes][:createdtime] }[:name]
|
589
|
-
end
|
590
|
-
|
591
|
-
def generate
|
592
|
-
psuedorand = Random.new(4985674985672348954987589429)
|
593
|
-
|
594
|
-
generate_files_with_pattern
|
595
|
-
@files.map! do |file|
|
596
|
-
date_method = @delivered.keys.first
|
597
|
-
if date_method == :since
|
598
|
-
file[:attributes][:createdtime] = @delivered[:since] + 10 + psuedorand.rand * 100
|
599
|
-
elsif date_method == :before
|
600
|
-
file[:attributes][:createdtime] = @delivered[:since] - 10 - psuedorand.rand * 100
|
601
|
-
else
|
602
|
-
file[:attributes][:createdtime] = Time.now - 10 - psuedorand.rand * 100
|
603
|
-
end
|
604
|
-
file
|
605
|
-
end
|
606
|
-
end
|
607
|
-
|
608
|
-
def sftp_entries
|
609
|
-
@files.map do |file|
|
610
|
-
Net::SFTP::Protocol::V04::Name.new(
|
611
|
-
file[:name],
|
612
|
-
Net::SFTP::Protocol::V04::Attributes.new(createtime: file[:attributes][:createdtime])
|
613
|
-
)
|
614
|
-
end
|
615
|
-
end
|
616
|
-
|
617
|
-
private
|
618
|
-
|
619
|
-
def generate_files_with_pattern
|
620
|
-
filenames = 1.upto(5).map { |f| @pattern.random_example }.uniq
|
621
|
-
|
622
|
-
@files = filenames.map do |fname|
|
623
|
-
{
|
624
|
-
name: fname,
|
625
|
-
attributes: {
|
626
|
-
createdtime: nil
|
627
|
-
}
|
628
|
-
}
|
629
|
-
end
|
630
|
-
end
|
631
|
-
end
|
632
555
|
end
|
@@ -53,6 +53,8 @@ module Remi
|
|
53
53
|
# Assumes that each file has exactly the same structure
|
54
54
|
result_df = nil
|
55
55
|
extract.each_with_index do |filename, idx|
|
56
|
+
filename = filename.to_s
|
57
|
+
|
56
58
|
@logger.info "Converting #{filename} to a dataframe"
|
57
59
|
processed_filename = preprocess(filename)
|
58
60
|
csv_df = Daru::DataFrame.from_csv processed_filename, @csv_options
|
@@ -71,16 +73,7 @@ module Remi
|
|
71
73
|
|
72
74
|
|
73
75
|
def extractor=(arg)
|
74
|
-
|
75
|
-
when Extractor::SftpFile, Extractor::LocalFile, Extractor::S3File
|
76
|
-
@extractor = arg
|
77
|
-
when String
|
78
|
-
@extractor = Extractor::LocalFile.new(path: arg)
|
79
|
-
when Regexp
|
80
|
-
raise "Adding regex matching to local files would be easy, not done yet"
|
81
|
-
else
|
82
|
-
raise "Unknown extractor of type #{arg.class}: #{arg}"
|
83
|
-
end
|
76
|
+
@extractor = arg.respond_to?(:extract) ? arg : Extractor::LocalFile.new(remote_path: arg.to_s)
|
84
77
|
end
|
85
78
|
|
86
79
|
# Only going to support single file for now
|
@@ -18,7 +18,6 @@ module Remi
|
|
18
18
|
|
19
19
|
|
20
20
|
class FileSystem
|
21
|
-
|
22
21
|
class FileNotFoundError < StandardError; end
|
23
22
|
|
24
23
|
def initialize(*args, remote_path:, pattern: /.*/, local_path: Settings.work_dir, most_recent_only: false, group_by: nil, most_recent_by: :create_time, logger: Remi::Settings.logger, **kargs, &block)
|
@@ -31,6 +30,12 @@ module Remi
|
|
31
30
|
@logger = logger
|
32
31
|
end
|
33
32
|
|
33
|
+
attr_reader :remote_path
|
34
|
+
attr_reader :pattern
|
35
|
+
attr_reader :local_path
|
36
|
+
attr_reader :most_recent_only
|
37
|
+
attr_reader :group_by
|
38
|
+
attr_reader :most_recent_by
|
34
39
|
attr_reader :logger
|
35
40
|
|
36
41
|
# Public: Called to extract files from the source filesystem.
|
@@ -0,0 +1,43 @@
|
|
1
|
+
module Remi
|
2
|
+
module Extractor
|
3
|
+
|
4
|
+
class LocalFile < FileSystem
|
5
|
+
def initialize(*args, **kargs)
|
6
|
+
super
|
7
|
+
init_local_file(*args, **kargs)
|
8
|
+
end
|
9
|
+
|
10
|
+
# Public: Called to extract files from the source filesystem.
|
11
|
+
#
|
12
|
+
# Returns an array containing the paths to all files extracted.
|
13
|
+
def extract
|
14
|
+
entries.map(&:pathname)
|
15
|
+
end
|
16
|
+
|
17
|
+
# Public: Returns an array of all FileSystemEntry instances that are in the remote_path.
|
18
|
+
def all_entries
|
19
|
+
@all_entries ||= all_entries!
|
20
|
+
end
|
21
|
+
|
22
|
+
def all_entries!
|
23
|
+
dir = @remote_path.directory? ? @remote_path + '*' : @remote_path
|
24
|
+
Dir[dir].map do |entry|
|
25
|
+
path = Pathname.new(entry)
|
26
|
+
if path.file?
|
27
|
+
FileSystemEntry.new(
|
28
|
+
pathname: path.realpath.to_s,
|
29
|
+
create_time: path.ctime,
|
30
|
+
modified_time: path.mtime
|
31
|
+
)
|
32
|
+
end
|
33
|
+
end.compact
|
34
|
+
end
|
35
|
+
|
36
|
+
private
|
37
|
+
|
38
|
+
def init_local_file(*args, **kargs)
|
39
|
+
end
|
40
|
+
|
41
|
+
end
|
42
|
+
end
|
43
|
+
end
|
@@ -1,125 +1,71 @@
|
|
1
1
|
module Remi
|
2
2
|
module Extractor
|
3
3
|
|
4
|
-
class
|
5
|
-
def initialize(path:, folder: nil)
|
6
|
-
@path = path
|
7
|
-
@folder = folder
|
8
|
-
end
|
9
|
-
|
10
|
-
def extract
|
11
|
-
if @folder
|
12
|
-
Dir.entries(@folder).map do |entry|
|
13
|
-
next unless entry.match(@path)
|
14
|
-
File.join(@folder,entry)
|
15
|
-
end.compact
|
16
|
-
else
|
17
|
-
@path
|
18
|
-
end
|
19
|
-
end
|
20
|
-
end
|
21
|
-
|
22
|
-
class SftpFile
|
23
|
-
|
24
|
-
class FileNotFoundError < StandardError; end
|
4
|
+
class SftpFile < FileSystem
|
25
5
|
|
26
|
-
|
27
|
-
def <=> (target)
|
28
|
-
-(self.value <=> target.value)
|
29
|
-
end
|
30
|
-
end
|
6
|
+
N_RETRY = 3
|
31
7
|
|
32
|
-
def initialize(
|
33
|
-
|
34
|
-
|
35
|
-
@remote_folder = remote_folder
|
36
|
-
@local_folder = local_folder
|
37
|
-
@port = port || (credentials && credentials[:port]) || '22'
|
38
|
-
@most_recent_only = most_recent_only
|
39
|
-
@group_by = group_by
|
40
|
-
@most_recent_by = most_recent_by
|
41
|
-
@logger = logger
|
8
|
+
def initialize(*args, **kargs)
|
9
|
+
super
|
10
|
+
init_sftp_file(*args, **kargs)
|
42
11
|
end
|
43
12
|
|
44
|
-
attr_reader :
|
13
|
+
attr_reader :host
|
14
|
+
attr_reader :username
|
15
|
+
attr_reader :password
|
16
|
+
attr_reader :port
|
45
17
|
|
18
|
+
# Public: Called to extract files from the source filesystem.
|
19
|
+
#
|
20
|
+
# Returns an array containing the paths to all files extracted.
|
46
21
|
def extract
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
elsif @most_recent_only
|
55
|
-
Array(most_recent_entry(matching_entries))
|
56
|
-
else
|
57
|
-
matching_entries
|
22
|
+
connection do |sftp|
|
23
|
+
entries.map do |entry|
|
24
|
+
local_file = File.join(@local_path, entry.name)
|
25
|
+
@logger.info "Downloading #{entry.name} to #{local_file}"
|
26
|
+
retry_download { sftp.download!(File.join(@remote_path, entry.name), local_file) }
|
27
|
+
local_file
|
28
|
+
end
|
58
29
|
end
|
59
30
|
end
|
60
31
|
|
61
|
-
|
62
|
-
|
32
|
+
# Public: Returns an array of all FileSystemEntry instances that are in the remote_path.
|
33
|
+
def all_entries
|
34
|
+
@all_entries ||= all_entries!
|
63
35
|
end
|
64
36
|
|
65
|
-
def
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
if @most_recent_by == :filename
|
75
|
-
entry.name
|
76
|
-
else
|
77
|
-
entry.attributes.send(@most_recent_by)
|
37
|
+
def all_entries!
|
38
|
+
sftp_entries = connection { |sftp| sftp.dir.entries(@remote_path) }
|
39
|
+
sftp_entries.map do |entry|
|
40
|
+
# Early versions of the protocol don't support create time, fake it with modified time?
|
41
|
+
FileSystemEntry.new(
|
42
|
+
pathname: File.join(@remote_path, entry.name),
|
43
|
+
create_time: entry.attributes.respond_to?(:createtime) ? entry.attributes.createtime : entry.attributes.mtime,
|
44
|
+
modified_time: entry.attributes.mtime
|
45
|
+
)
|
78
46
|
end
|
79
47
|
end
|
80
48
|
|
81
|
-
def most_recent_in_group(match_group = @group_by)
|
82
|
-
entries_with_group = matching_entries.map do |entry|
|
83
|
-
match = entry.name.match(match_group)
|
84
|
-
next unless match
|
85
49
|
|
86
|
-
|
87
|
-
{ group: group, entry: entry }
|
88
|
-
end.compact
|
89
|
-
entries_with_group.sort_by! { |e| [e[:group], SortDesc.new(sort_files_by(e[:entry]))] }
|
90
|
-
|
91
|
-
last_group = nil
|
92
|
-
entries_with_group.map do |entry|
|
93
|
-
next unless entry[:group] != last_group
|
94
|
-
last_group = entry[:group]
|
95
|
-
entry[:entry]
|
96
|
-
end.compact
|
97
|
-
end
|
50
|
+
private
|
98
51
|
|
99
|
-
def
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
retry_download(ntry) { sftp.download!(File.join(remote_folder, entry.name), local_file) }
|
105
|
-
local_file
|
106
|
-
end
|
107
|
-
end
|
52
|
+
def init_sftp_file(*args, credentials:, **kargs)
|
53
|
+
@host = credentials.fetch(:host)
|
54
|
+
@username = credentials.fetch(:username)
|
55
|
+
@password = credentials.fetch(:password)
|
56
|
+
@port = credentials.fetch(:port, '22')
|
108
57
|
end
|
109
58
|
|
110
|
-
|
111
|
-
private
|
112
|
-
|
113
59
|
def connection(&block)
|
114
60
|
result = nil
|
115
|
-
Net::SFTP.start(@
|
61
|
+
Net::SFTP.start(@host, @username, password: @password, port: @port) do |sftp|
|
116
62
|
result = yield sftp
|
117
63
|
end
|
118
64
|
result
|
119
65
|
end
|
120
66
|
|
121
|
-
def retry_download(
|
122
|
-
1.upto(
|
67
|
+
def retry_download(&block)
|
68
|
+
1.upto(N_RETRY).each do |itry|
|
123
69
|
begin
|
124
70
|
block.call
|
125
71
|
break
|
@@ -132,5 +78,6 @@ module Remi
|
|
132
78
|
end
|
133
79
|
end
|
134
80
|
end
|
81
|
+
|
135
82
|
end
|
136
83
|
end
|
data/lib/remi/fields.rb
CHANGED
@@ -4,5 +4,26 @@ module Remi
|
|
4
4
|
@fields = Hash.new({}).merge fields
|
5
5
|
super(@fields)
|
6
6
|
end
|
7
|
+
|
8
|
+
|
9
|
+
def dup
|
10
|
+
Fields.new(@fields.dup)
|
11
|
+
end
|
12
|
+
|
13
|
+
def merge(other_fields, prefix: nil)
|
14
|
+
dup.merge!(other_fields, prefix: prefix)
|
15
|
+
end
|
16
|
+
|
17
|
+
def merge!(other_fields, prefix: nil)
|
18
|
+
@fields.merge!(other_fields) do |key, this_val, other_val|
|
19
|
+
if prefix
|
20
|
+
@fields["#{prefix}#{key}".to_sym] = other_val
|
21
|
+
this_val
|
22
|
+
else
|
23
|
+
this_val.merge other_val
|
24
|
+
end
|
25
|
+
end
|
26
|
+
end
|
27
|
+
|
7
28
|
end
|
8
29
|
end
|
data/lib/remi/settings.rb
CHANGED
data/lib/remi/transform.rb
CHANGED
@@ -18,7 +18,7 @@ module Remi
|
|
18
18
|
attr_accessor :target_metadata
|
19
19
|
|
20
20
|
# Public: Set to true if the transform expects multiple arguments (default: false)
|
21
|
-
attr_reader :
|
21
|
+
attr_reader :multi_args
|
22
22
|
|
23
23
|
# Public: Defines the operation of this transform class.
|
24
24
|
#
|
@@ -36,7 +36,7 @@ module Remi
|
|
36
36
|
#
|
37
37
|
# Returns the transformed value.
|
38
38
|
def call(*values)
|
39
|
-
if @
|
39
|
+
if @multi_args
|
40
40
|
to_proc.call(*values)
|
41
41
|
else
|
42
42
|
to_proc.call(Array(values).first)
|
data/lib/remi/version.rb
CHANGED
@@ -22,7 +22,8 @@ describe DataSource::CsvFile do
|
|
22
22
|
filename_field: :from_file
|
23
23
|
)
|
24
24
|
|
25
|
-
|
25
|
+
expected_files = [Pathname.new('spec/fixtures/basic.csv').realpath.to_s] * 2
|
26
|
+
expect(csv.df[:from_file].to_a).to eq expected_files
|
26
27
|
end
|
27
28
|
|
28
29
|
it "preprocesses records when required" do
|
@@ -56,12 +57,11 @@ describe DataSource::CsvFile do
|
|
56
57
|
expect(csv.df.to_a).to eq expected_df.to_a
|
57
58
|
end
|
58
59
|
|
59
|
-
|
60
|
-
it "combines multiple csv files into a single dataframe", skip: 'TODO' do
|
60
|
+
it "combines multiple csv files into a single dataframe" do
|
61
61
|
csv = Remi::DataSource::CsvFile.new(
|
62
62
|
extractor: Remi::Extractor::LocalFile.new(
|
63
63
|
remote_path: 'spec/fixtures',
|
64
|
-
pattern:
|
64
|
+
pattern: /basic(|2)\.csv/
|
65
65
|
)
|
66
66
|
)
|
67
67
|
|
@@ -69,9 +69,10 @@ describe DataSource::CsvFile do
|
|
69
69
|
{
|
70
70
|
column_a: ['value 1A', 'value 2A', 'value 1A', 'value 2A'],
|
71
71
|
column_b: ['value 1B', 'value 2B', nil, nil],
|
72
|
-
|
72
|
+
column_c: [nil, nil, 'value 1C', 'value 2C']
|
73
73
|
}
|
74
74
|
)
|
75
|
+
|
75
76
|
expect(csv.df.to_a).to eq expected_df.to_a
|
76
77
|
end
|
77
78
|
|
@@ -0,0 +1,31 @@
|
|
1
|
+
require 'remi_spec'
|
2
|
+
|
3
|
+
describe Extractor::LocalFile do
|
4
|
+
let(:remote_path) { "#{Pathname.new(__FILE__).dirname}" }
|
5
|
+
|
6
|
+
let(:local_file) {
|
7
|
+
Extractor::LocalFile.new(
|
8
|
+
remote_path: remote_path
|
9
|
+
)
|
10
|
+
}
|
11
|
+
|
12
|
+
let(:remote_filenames) { Dir[remote_path + '/*'].map { |f| Pathname.new(f).basename.to_s } }
|
13
|
+
|
14
|
+
context '.new' do
|
15
|
+
it 'creates an instance with valid parameters' do
|
16
|
+
local_file
|
17
|
+
end
|
18
|
+
end
|
19
|
+
|
20
|
+
context '#all_entires' do
|
21
|
+
it 'returns all entries' do
|
22
|
+
expect(local_file.all_entries.map(&:name)).to eq remote_filenames
|
23
|
+
end
|
24
|
+
end
|
25
|
+
|
26
|
+
context '#extract' do
|
27
|
+
it 'references local files with the right names' do
|
28
|
+
expect(local_file.extract.map { |f| Pathname.new(f).basename.to_s }).to eq remote_filenames
|
29
|
+
end
|
30
|
+
end
|
31
|
+
end
|
@@ -1,125 +1,84 @@
|
|
1
1
|
require 'remi_spec'
|
2
2
|
|
3
3
|
describe Extractor::SftpFile do
|
4
|
-
|
5
|
-
|
4
|
+
let(:remote_path) { '' }
|
5
|
+
let(:credentials) {
|
6
|
+
{
|
7
|
+
host: 'host',
|
8
|
+
username: 'username',
|
9
|
+
password: 'password'
|
10
|
+
}
|
11
|
+
}
|
12
|
+
|
13
|
+
let(:sftp_file) {
|
14
|
+
Extractor::SftpFile.new(
|
15
|
+
credentials: credentials,
|
16
|
+
remote_path: remote_path
|
17
|
+
)
|
18
|
+
}
|
19
|
+
|
20
|
+
let(:remote_filenames) { ['file1.csv', 'file2.csv'] }
|
21
|
+
let(:sftp_session) { instance_double('Net:SFTP::Session') }
|
6
22
|
|
7
|
-
|
8
|
-
|
9
|
-
{ name: "ApplicantsA-3.csv", createtime: now - 5.minutes },
|
10
|
-
{ name: "ApplicantsA-5.csv", createtime: now - 1.minutes },
|
11
|
-
{ name: "ApplicantsB-7.csv", createtime: now - 10.minutes },
|
12
|
-
{ name: "ApplicantsB-6.csv", createtime: now - 5.minutes },
|
13
|
-
{ name: "ApplicantsB-2.csv", createtime: now - 1.minutes },
|
14
|
-
{ name: "ApplicantsB-2.txt", createtime: now - 0.minutes },
|
15
|
-
{ name: "Apples.csv", createtime: now - 1.minutes },
|
16
|
-
]
|
23
|
+
before do
|
24
|
+
sftp_dir = instance_double('Net::SFTP::Operations::Dir')
|
17
25
|
|
18
|
-
|
19
|
-
|
20
|
-
Net::SFTP::Protocol::V04::Name.new(
|
21
|
-
file[:name],
|
22
|
-
Net::SFTP::Protocol::V04::Attributes.new(createtime: file[:createtime])
|
23
|
-
)
|
24
|
-
end
|
25
|
-
end
|
26
|
+
allow(Net::SFTP).to receive(:start).and_yield sftp_session
|
27
|
+
allow(sftp_session).to receive(:dir).and_return sftp_dir
|
26
28
|
|
27
|
-
|
29
|
+
allow(sftp_dir).to receive(:entries).and_return(remote_filenames.map { |fname|
|
30
|
+
Net::SFTP::Protocol::V04::Name.new(
|
31
|
+
fname,
|
32
|
+
Net::SFTP::Protocol::V04::Attributes.new(createtime: Time.new.to_i, mtime: Time.new.to_i)
|
33
|
+
)
|
34
|
+
})
|
28
35
|
end
|
29
36
|
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
context 'extracting all files matching a pattern' do
|
36
|
-
before do
|
37
|
-
@params[:remote_file] = /ApplicantsA-\d+\.csv/
|
37
|
+
context '.new' do
|
38
|
+
it 'creates an instance with valid parameters' do
|
39
|
+
sftp_file
|
38
40
|
end
|
39
41
|
|
40
|
-
it '
|
41
|
-
|
42
|
+
it 'requires a hostname' do
|
43
|
+
credentials.delete(:host)
|
44
|
+
expect { sftp_file }.to raise_error KeyError
|
42
45
|
end
|
43
46
|
|
44
|
-
it '
|
45
|
-
|
46
|
-
|
47
|
-
"ApplicantsA-3.csv",
|
48
|
-
"ApplicantsA-5.csv"
|
49
|
-
])
|
47
|
+
it 'requires a username' do
|
48
|
+
credentials.delete(:username)
|
49
|
+
expect { sftp_file }.to raise_error KeyError
|
50
50
|
end
|
51
|
-
end
|
52
|
-
|
53
51
|
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
remote_file: /ApplicantsA-\d+\.csv/,
|
58
|
-
most_recent_only: true
|
59
|
-
})
|
52
|
+
it 'requires a password' do
|
53
|
+
credentials.delete(:password)
|
54
|
+
expect { sftp_file }.to raise_error KeyError
|
60
55
|
end
|
61
56
|
|
62
|
-
it '
|
63
|
-
expect(
|
64
|
-
"ApplicantsA-5.csv"
|
65
|
-
])
|
57
|
+
it 'defaults to using port 22' do
|
58
|
+
expect(sftp_file.port).to eq '22'
|
66
59
|
end
|
67
60
|
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
end
|
72
|
-
|
73
|
-
it 'extracts only the most recent matching file' do
|
74
|
-
expect(sftpfile.to_download.map(&:name)).to match_array([
|
75
|
-
"ApplicantsA-9.csv"
|
76
|
-
])
|
77
|
-
end
|
61
|
+
it 'allows the port to be defined in the credentials' do
|
62
|
+
credentials[:port] = '1234'
|
63
|
+
expect(sftp_file.port).to eq '1234'
|
78
64
|
end
|
79
65
|
end
|
80
66
|
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
@params.merge!({
|
85
|
-
credentials: nil,
|
86
|
-
remote_file: /^Applicants(A|B)-\d+\.csv/,
|
87
|
-
group_by: /^Applicants(A|B)/
|
88
|
-
})
|
89
|
-
end
|
90
|
-
|
91
|
-
it 'extracts the most recent file that matches a particular regex' do
|
92
|
-
expect(sftpfile.to_download.map(&:name)).to match_array([
|
93
|
-
"ApplicantsA-5.csv",
|
94
|
-
"ApplicantsB-2.csv"
|
95
|
-
])
|
67
|
+
context '#all_entires' do
|
68
|
+
it 'returns all entries' do
|
69
|
+
expect(sftp_file.all_entries.map(&:name)).to eq remote_filenames
|
96
70
|
end
|
71
|
+
end
|
97
72
|
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
it 'extracts the most recent file that matches a particular regex' do
|
104
|
-
expect(sftpfile.to_download.map(&:name)).to match_array([
|
105
|
-
"ApplicantsA-5.csv",
|
106
|
-
"ApplicantsB-2.txt"
|
107
|
-
])
|
108
|
-
end
|
73
|
+
context '#extract' do
|
74
|
+
it 'downloads files from the ftp' do
|
75
|
+
expect(sftp_session).to receive(:download!).exactly(remote_filenames.size).times
|
76
|
+
sftp_file.extract
|
109
77
|
end
|
110
78
|
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
end
|
115
|
-
|
116
|
-
it 'extracts only the most recent matching file' do
|
117
|
-
expect(sftpfile.to_download.map(&:name)).to match_array([
|
118
|
-
"ApplicantsA-9.csv",
|
119
|
-
"ApplicantsB-7.csv"
|
120
|
-
])
|
121
|
-
end
|
79
|
+
it 'creates local files with the right names' do
|
80
|
+
allow(sftp_session).to receive(:download!)
|
81
|
+
expect(sftp_file.extract.map { |f| Pathname.new(f).basename.to_s }).to eq remote_filenames
|
122
82
|
end
|
123
|
-
|
124
83
|
end
|
125
84
|
end
|
data/spec/fields_spec.rb
ADDED
@@ -0,0 +1,97 @@
|
|
1
|
+
require_relative 'remi_spec'
|
2
|
+
|
3
|
+
describe Fields do
|
4
|
+
|
5
|
+
let :base_fields do
|
6
|
+
Fields.new(
|
7
|
+
{
|
8
|
+
col1: { from: :base, base: true },
|
9
|
+
col2: { from: :base, base: true }
|
10
|
+
}
|
11
|
+
)
|
12
|
+
end
|
13
|
+
|
14
|
+
let :fields2 do
|
15
|
+
|
16
|
+
end
|
17
|
+
|
18
|
+
context "merging field sets" do
|
19
|
+
|
20
|
+
context "when there is no overlap" do
|
21
|
+
it "unions field sets" do
|
22
|
+
other_fields = Fields.new(
|
23
|
+
{
|
24
|
+
col3: {},
|
25
|
+
col4: {}
|
26
|
+
}
|
27
|
+
)
|
28
|
+
|
29
|
+
merged_fields = base_fields.merge other_fields
|
30
|
+
|
31
|
+
expect(merged_fields.keys).to eq [:col1, :col2, :col3, :col4]
|
32
|
+
end
|
33
|
+
end
|
34
|
+
|
35
|
+
context "when there is overlap" do
|
36
|
+
let :other_fields do
|
37
|
+
Fields.new(
|
38
|
+
{
|
39
|
+
col2: { from: :other, other: true },
|
40
|
+
col3: { from: :other, other: true }
|
41
|
+
}
|
42
|
+
)
|
43
|
+
end
|
44
|
+
|
45
|
+
it "unions field sets when there is overlap" do
|
46
|
+
merged_fields = base_fields.merge other_fields
|
47
|
+
expect(merged_fields.keys).to eq [:col1, :col2, :col3]
|
48
|
+
end
|
49
|
+
|
50
|
+
it "merges overlapping metadata" do
|
51
|
+
merged_fields = base_fields.merge other_fields
|
52
|
+
|
53
|
+
expect(merged_fields).to eq(
|
54
|
+
{
|
55
|
+
col1: { from: :base, base: true },
|
56
|
+
col2: { from: :other, base: true, other: true },
|
57
|
+
col3: { from: :other, other: true }
|
58
|
+
}
|
59
|
+
)
|
60
|
+
end
|
61
|
+
|
62
|
+
it "does not affect the original field sets" do
|
63
|
+
merged_fields = base_fields.merge other_fields
|
64
|
+
|
65
|
+
expect(base_fields).to eq(
|
66
|
+
{
|
67
|
+
col1: { from: :base, base: true },
|
68
|
+
col2: { from: :base, base: true }
|
69
|
+
}
|
70
|
+
)
|
71
|
+
|
72
|
+
expect(other_fields).to eq(
|
73
|
+
{
|
74
|
+
col2: { from: :other, other: true },
|
75
|
+
col3: { from: :other, other: true }
|
76
|
+
}
|
77
|
+
)
|
78
|
+
end
|
79
|
+
|
80
|
+
context "with a prefix" do
|
81
|
+
it "creates new fields for names that conflict" do
|
82
|
+
merged_fields = base_fields.merge other_fields, prefix: :other_
|
83
|
+
|
84
|
+
expect(merged_fields).to eq(
|
85
|
+
{
|
86
|
+
col1: { from: :base, base: true },
|
87
|
+
col2: { from: :base, base: true },
|
88
|
+
other_col2: { from: :other, other: true },
|
89
|
+
col3: { from: :other, other: true }
|
90
|
+
}
|
91
|
+
)
|
92
|
+
end
|
93
|
+
end
|
94
|
+
|
95
|
+
end
|
96
|
+
end
|
97
|
+
end
|
@@ -0,0 +1,53 @@
|
|
1
|
+
require_relative 'remi_spec'
|
2
|
+
|
3
|
+
describe Transform do
|
4
|
+
|
5
|
+
context 'a transform with a single argument' do
|
6
|
+
before do
|
7
|
+
class SingleArgument < Transform
|
8
|
+
def initialize(*args, **kargs, &block)
|
9
|
+
super
|
10
|
+
end
|
11
|
+
|
12
|
+
def transform(value)
|
13
|
+
value
|
14
|
+
end
|
15
|
+
end
|
16
|
+
end
|
17
|
+
|
18
|
+
let(:transform) { SingleArgument.new }
|
19
|
+
|
20
|
+
it 'can be converted into a proc and called' do
|
21
|
+
expect(transform.to_proc.call(5)).to eq 5
|
22
|
+
end
|
23
|
+
|
24
|
+
it 'can be called directly' do
|
25
|
+
expect(transform.call(5)).to eq 5
|
26
|
+
end
|
27
|
+
end
|
28
|
+
|
29
|
+
context 'a transform that accepts multiple arguments' do
|
30
|
+
before do
|
31
|
+
class MultipleArgument < Transform
|
32
|
+
def initialize(*args, **kargs, &block)
|
33
|
+
super
|
34
|
+
@multi_args = true
|
35
|
+
end
|
36
|
+
|
37
|
+
def transform(*values)
|
38
|
+
Array(values)
|
39
|
+
end
|
40
|
+
end
|
41
|
+
end
|
42
|
+
|
43
|
+
let(:transform) { MultipleArgument.new }
|
44
|
+
|
45
|
+
it 'can be converted into a proc and called' do
|
46
|
+
expect(transform.to_proc.call(1, 2)).to eq [1, 2]
|
47
|
+
end
|
48
|
+
|
49
|
+
it 'can be called directly' do
|
50
|
+
expect(transform.call(1, 2)).to eq [1, 2]
|
51
|
+
end
|
52
|
+
end
|
53
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: remi
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.
|
4
|
+
version: 0.2.36
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Sterling Paramore
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-06-
|
11
|
+
date: 2016-06-17 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bond
|
@@ -232,9 +232,9 @@ files:
|
|
232
232
|
- lib/remi/data_subject/salesforce.rb
|
233
233
|
- lib/remi/data_subject/sftp_file.rb
|
234
234
|
- lib/remi/extractor/file_system.rb
|
235
|
+
- lib/remi/extractor/local_file.rb
|
235
236
|
- lib/remi/extractor/s3_file.rb
|
236
237
|
- lib/remi/extractor/sftp_file.rb
|
237
|
-
- lib/remi/extractor/sftp_file_new.rb
|
238
238
|
- lib/remi/field_symbolizers.rb
|
239
239
|
- lib/remi/fields.rb
|
240
240
|
- lib/remi/job.rb
|
@@ -248,13 +248,16 @@ files:
|
|
248
248
|
- spec/data_subject/csv_file_spec.rb
|
249
249
|
- spec/data_subject/data_frame.rb
|
250
250
|
- spec/extractor/file_system_spec.rb
|
251
|
+
- spec/extractor/local_file_spec.rb
|
251
252
|
- spec/extractor/s3_file_spec.rb
|
252
253
|
- spec/extractor/sftp_file_spec.rb
|
254
|
+
- spec/fields_spec.rb
|
253
255
|
- spec/fixtures/basic.csv
|
254
256
|
- spec/fixtures/basic2.csv
|
255
257
|
- spec/fixtures/unsupported_escape.csv
|
256
258
|
- spec/metadata_spec.rb
|
257
259
|
- spec/remi_spec.rb
|
260
|
+
- spec/transform_spec.rb
|
258
261
|
- workbooks/sample_workbook.ipynb
|
259
262
|
- workbooks/workbook_helper.rb
|
260
263
|
homepage: https://github.com/inside-track/remi
|
@@ -304,10 +307,13 @@ test_files:
|
|
304
307
|
- spec/data_subject/csv_file_spec.rb
|
305
308
|
- spec/data_subject/data_frame.rb
|
306
309
|
- spec/extractor/file_system_spec.rb
|
310
|
+
- spec/extractor/local_file_spec.rb
|
307
311
|
- spec/extractor/s3_file_spec.rb
|
308
312
|
- spec/extractor/sftp_file_spec.rb
|
313
|
+
- spec/fields_spec.rb
|
309
314
|
- spec/fixtures/basic.csv
|
310
315
|
- spec/fixtures/basic2.csv
|
311
316
|
- spec/fixtures/unsupported_escape.csv
|
312
317
|
- spec/metadata_spec.rb
|
313
318
|
- spec/remi_spec.rb
|
319
|
+
- spec/transform_spec.rb
|
@@ -1,78 +0,0 @@
|
|
1
|
-
module Remi
|
2
|
-
module Extractor
|
3
|
-
|
4
|
-
class SftpFileNew < FileSystem
|
5
|
-
|
6
|
-
N_RETRY = 3
|
7
|
-
|
8
|
-
def initialize(*args, **kargs)
|
9
|
-
super
|
10
|
-
init_sftp_file(*args, **kargs)
|
11
|
-
end
|
12
|
-
|
13
|
-
# Public: Called to extract files from the source filesystem.
|
14
|
-
#
|
15
|
-
# Returns an array with containing the paths to all files extracted.
|
16
|
-
def extract
|
17
|
-
connection do |sftp|
|
18
|
-
entries.map do |entry|
|
19
|
-
local_file = File.join(@local_path, entry.name)
|
20
|
-
@logger.info "Downloading #{entry.name} to #{local_file}"
|
21
|
-
retry_download { sftp.download!(File.join(@remote_path, entry.name), local_file) }
|
22
|
-
local_file
|
23
|
-
end
|
24
|
-
end
|
25
|
-
end
|
26
|
-
|
27
|
-
# Public: Returns an array of all FileSystemEntry instances that are in the remote_path.
|
28
|
-
def all_entries
|
29
|
-
@all_entries ||= all_entries!
|
30
|
-
end
|
31
|
-
|
32
|
-
def all_entries!
|
33
|
-
sftp_entries = connection { |sftp| sftp.dir.entries(@remote_path.dirname) }
|
34
|
-
sftp_entries.map do |entry|
|
35
|
-
# Early versions of the protocol don't support create time, fake it with modified time?
|
36
|
-
FileSystemEntry.new(
|
37
|
-
name: File.join(@remote_path.dirname, entry.name),
|
38
|
-
create_time: entry.respond_to?(:createtime) ? entry.createtime : entry.mtime,
|
39
|
-
modified_time: entry.mtime
|
40
|
-
)
|
41
|
-
end
|
42
|
-
end
|
43
|
-
|
44
|
-
|
45
|
-
private
|
46
|
-
|
47
|
-
def init_sftp_file(*args, credentials:, **kargs)
|
48
|
-
@host = credentials.fetch(:host)
|
49
|
-
@username = credentials.fetch(:username)
|
50
|
-
@password = credentials.fetch(:password)
|
51
|
-
@port = credentials.fetch(:port, '22')
|
52
|
-
end
|
53
|
-
|
54
|
-
def connection(&block)
|
55
|
-
result = nil
|
56
|
-
Net::SFTP.start(@host, @username, password: @password, port: @port) do |sftp|
|
57
|
-
result = yield sftp
|
58
|
-
end
|
59
|
-
result
|
60
|
-
end
|
61
|
-
|
62
|
-
def retry_download(&block)
|
63
|
-
1.upto(N_RETRY).each do |itry|
|
64
|
-
begin
|
65
|
-
block.call
|
66
|
-
break
|
67
|
-
rescue RuntimeError => err
|
68
|
-
raise err unless itry < ntry
|
69
|
-
@logger.error "Download failed with error: #{err.message}"
|
70
|
-
@logger.error "Retry attempt #{itry}/#{ntry-1}"
|
71
|
-
sleep(1)
|
72
|
-
end
|
73
|
-
end
|
74
|
-
end
|
75
|
-
end
|
76
|
-
|
77
|
-
end
|
78
|
-
end
|