remi 0.2.35 → 0.2.36

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 6ac6de18abdeb2269c391a732e283e54c995344f
4
- data.tar.gz: 51f64d79e4b176bb4f2dbd15a4407fe0823aa13f
3
+ metadata.gz: 0b43c0f454a9c3df185534347b76d5d7ac0a37c4
4
+ data.tar.gz: 024381d3a8b2da98b1de66fddd14c0929a304d3d
5
5
  SHA512:
6
- metadata.gz: 9978c92842fc114224f06c72aacaca3175f9b6467b2189fcce5d8f60f26335da324f4f5daf4ae770f33b266ba5f1757f032dd0de7c3ea25f926cdd8f64b20c50
7
- data.tar.gz: 8390273e8305709141f4823d837cfcbf7b0fcf891ad2f9cdecdb418bdee5be11cc25da8e3daa9afa5ad14ec42e0a813b23ba2d220133dbce1e58ea82c551f99e
6
+ metadata.gz: bc680e8048b300f645c013a3f261506b6c4e1efdef0f240f37eebb0d393f2cc948de06d53d1d3912122011ba8d2755ac8e851c6399af79c13d68b467389a1118
7
+ data.tar.gz: cb5f72b9db98cdf7b79fcaee6a1597e9259b805779b93655ba0738b5e8a14240a0e57cc6078b92cdc2c414f21eb3af034eb78a185e974f3c28707667ea80f792
data/Gemfile.lock CHANGED
@@ -19,7 +19,7 @@ GIT
19
19
  PATH
20
20
  remote: .
21
21
  specs:
22
- remi (0.2.35)
22
+ remi (0.2.36)
23
23
  activesupport (~> 4.2)
24
24
  bond (~> 0.5)
25
25
  cucumber (~> 2.1)
@@ -33,10 +33,6 @@ Feature: This is a sample feature file.
33
33
  And files with names matching the pattern /^SampleFile_(\d+)\.txt/
34
34
  Then the file with the latest date stamp will be downloaded for processing
35
35
 
36
- Given files with names that do not match the pattern /^SampleFile_(\d+)\.txt/
37
- Then no files will be downloaded for processing
38
-
39
-
40
36
  Scenario: In order to be parsed and properly processed, the file must conform
41
37
  to expectations about its structure and content.
42
38
 
@@ -44,34 +44,23 @@ end
44
44
  ### Source file processing
45
45
 
46
46
  Given /^files with names matching the pattern \/(.*)\/$/ do |pattern|
47
- @brt.filestore.pattern(Regexp.new(pattern))
47
+ expect(@brt.source.data_subject.extractor.pattern).to eq Regexp.new(pattern)
48
48
  end
49
49
 
50
- Given /^files with names that do not match the pattern \/(.*)\/$/ do |pattern|
51
- @brt.filestore.anti_pattern(Regexp.new(pattern))
52
- end
53
-
54
- Given /^files delivered within the last (\d+) hours$/ do |hours|
55
- @brt.filestore.delivered_since(Time.now - hours.to_i * 3600)
56
- end
57
-
58
- Given /^files were delivered more than (\d+) hours ago$/ do |hours|
59
- @brt.filestore.delivered_before(Time.now - hours.to_i * 3600)
50
+ Given /^download groups defined by the pattern \/(.*)\/$/ do |pattern|
51
+ expect(@brt.source.data_subject.extractor.group_by).to eq Regexp.new(pattern)
60
52
  end
61
53
 
62
54
  Then /^the file with the latest date stamp will be downloaded for processing$/ do
63
- @brt.filestore.generate
64
- @brt.source.mock_extractor(@brt.filestore)
65
- expect(@brt.source.extract).to match_array Array(@brt.filestore.latest)
55
+ expect(@brt.source.data_subject.extractor.most_recent_by).to eq :create_time
66
56
  end
67
57
 
68
- Then /^files will be downloaded for processing$/ do
58
+ Then /^all files matching the pattern will be downloaded for processing$/ do
59
+ expect(@brt.source.data_subject.extractor.most_recent_only).to eq false
69
60
  end
70
61
 
71
- Then /^no files will be downloaded for processing$/ do
72
- @brt.filestore.generate
73
- @brt.source.mock_extractor(@brt.filestore)
74
- expect { @brt.source.extract }.to raise_error Remi::Extractor::SftpFile::FileNotFoundError
62
+ Then /^the file that comes last in an alphanumeric sort by group will be downloaded for processing$/ do
63
+ expect(@brt.source.data_subject.extractor.most_recent_by).to eq :name
75
64
  end
76
65
 
77
66
  Then /^the file is uploaded to the remote path "([^"]+)"$/ do |remote_path|
@@ -150,6 +139,19 @@ Given /^the source field (?:has|is set to) the value "([^"]*)"$/ do |value|
150
139
  end
151
140
  end
152
141
 
142
+ Given /^the source field '([^']+)' (?:has|is set to) the json value$/ do |source_field, value|
143
+ step "the source field '#{source_field}'"
144
+
145
+ source_name, source_field_name = @brt.sources.parse_full_field(source_field)
146
+ @brt.sources[source_name].fields[source_field_name].value = JSON.parse(value)
147
+ end
148
+
149
+ Given /^the source field (?:has|is set to) the json value$/ do |value|
150
+ @brt.sources.fields.each do |field|
151
+ step "the source field '#{field.full_name}' is set to the json value \"#{value}\""
152
+ end
153
+ end
154
+
153
155
  When /^the source field '([^']+)' (?:has an empty value|is blank)$/ do |source_field|
154
156
  step "the source field '#{source_field}'"
155
157
 
@@ -6,11 +6,20 @@ require 'remi'
6
6
  require 'remi/cucumber'
7
7
 
8
8
  Remi::Settings.log_level = Logger::ERROR
9
+ Remi::Settings.jobs_dir = File.join(__dir__, '../../jobs')
9
10
 
10
11
  Before do
11
12
  # Restart the random number generator prior to each scenario to
12
13
  # ensure we have reproducibility of random output
13
14
  Kernel.srand(35983958269835333)
15
+
16
+ # Monkey patch faker gem so that dummy random dates and ranges are generated consistently
17
+ class Faker::Base
18
+ def self.rand_in_range(from, to)
19
+ from, to = to, from if to < from
20
+ Random.rand(from..to)
21
+ end
22
+ end
14
23
  end
15
24
 
16
25
  After do
data/jobs/sample_job.rb CHANGED
@@ -27,8 +27,8 @@ class SampleJob
27
27
  define_source :sample_file, Remi::DataSource::CsvFile,
28
28
  extractor: Remi::Extractor::SftpFile.new(
29
29
  credentials: params[:sftp],
30
- remote_file: /^SampleFile_(\d+)\.txt/,
31
- remote_folder: '/',
30
+ remote_path: '/',
31
+ pattern: /^SampleFile_(\d+)\.txt/,
32
32
  most_recent_only: true
33
33
  ),
34
34
  csv_options: {
data/lib/remi.rb CHANGED
@@ -42,8 +42,9 @@ require 'remi/field_symbolizers'
42
42
 
43
43
  require 'remi/refinements/symbolizer'
44
44
 
45
- require 'remi/extractor/sftp_file' # deprecated
46
45
  require 'remi/extractor/file_system'
46
+ require 'remi/extractor/local_file'
47
+ require 'remi/extractor/sftp_file'
47
48
  require 'remi/extractor/s3_file'
48
49
 
49
50
 
@@ -113,6 +113,7 @@ module Remi::BusinessRules
113
113
 
114
114
  def initialize(job_name)
115
115
  job_class_name = "#{job_name.gsub(/\s/,'')}Job"
116
+ require_job_file(job_class_name)
116
117
  @job = Object.const_get(job_class_name).new
117
118
 
118
119
  @job_sources = DataSubjectCollection.new
@@ -121,8 +122,6 @@ module Remi::BusinessRules
121
122
  @sources = DataSubjectCollection.new
122
123
  @targets = DataSubjectCollection.new
123
124
  @examples = DataExampleCollection.new
124
-
125
- @filestore = Filestore.new
126
125
  end
127
126
 
128
127
  attr_reader :job
@@ -131,7 +130,13 @@ module Remi::BusinessRules
131
130
  attr_reader :sources
132
131
  attr_reader :targets
133
132
  attr_reader :examples
134
- attr_reader :filestore
133
+
134
+ def require_job_file(job_class_name)
135
+ job_file = Dir["#{Remi::Settings.jobs_dir}/**/*_job.rb"].map do |fname|
136
+ fname if File.basename(fname) == "#{job_class_name.underscore}.rb"
137
+ end.compact.pop
138
+ require job_file
139
+ end
135
140
 
136
141
  def add_job_source(name)
137
142
  raise "Unknown source #{name} for job" unless @job.methods.include? name.symbolize
@@ -396,17 +401,6 @@ module Remi::BusinessRules
396
401
  @data_subject.df[vector_name].recode! { |v| i += 1 }
397
402
  end
398
403
 
399
- def mock_extractor(filestore)
400
- extractor = class << @data_subject.extractor; self; end
401
-
402
- extractor.send(:define_method, :all_entries, ->() { filestore.sftp_entries })
403
- extractor.send(:define_method, :download, ->(to_download) { to_download.map { |e| e.name } })
404
- end
405
-
406
- def extract
407
- @data_subject.extractor.extract
408
- end
409
-
410
404
  def csv_options
411
405
  @data_subject.csv_options
412
406
  end
@@ -558,75 +552,4 @@ module Remi::BusinessRules
558
552
  end
559
553
  end
560
554
  end
561
-
562
-
563
- class Filestore
564
- def initialize
565
- @files = []
566
- @delivered = {}
567
- end
568
-
569
- attr_reader :sftp_entries
570
-
571
- def pattern(pattern)
572
- @pattern = pattern
573
- end
574
-
575
- def anti_pattern(pattern)
576
- @pattern = /^ThisBetterNeverMatchAnythingOrIWillShootYou\d{8}Times$/
577
- end
578
-
579
- def delivered_since(date_time)
580
- @delivered = { :since => date_time }
581
- end
582
-
583
- def delivered_before(date_time)
584
- @delivered = { :before => date_time }
585
- end
586
-
587
- def latest
588
- @files.max_by { |f| f[:attributes][:createdtime] }[:name]
589
- end
590
-
591
- def generate
592
- psuedorand = Random.new(4985674985672348954987589429)
593
-
594
- generate_files_with_pattern
595
- @files.map! do |file|
596
- date_method = @delivered.keys.first
597
- if date_method == :since
598
- file[:attributes][:createdtime] = @delivered[:since] + 10 + psuedorand.rand * 100
599
- elsif date_method == :before
600
- file[:attributes][:createdtime] = @delivered[:since] - 10 - psuedorand.rand * 100
601
- else
602
- file[:attributes][:createdtime] = Time.now - 10 - psuedorand.rand * 100
603
- end
604
- file
605
- end
606
- end
607
-
608
- def sftp_entries
609
- @files.map do |file|
610
- Net::SFTP::Protocol::V04::Name.new(
611
- file[:name],
612
- Net::SFTP::Protocol::V04::Attributes.new(createtime: file[:attributes][:createdtime])
613
- )
614
- end
615
- end
616
-
617
- private
618
-
619
- def generate_files_with_pattern
620
- filenames = 1.upto(5).map { |f| @pattern.random_example }.uniq
621
-
622
- @files = filenames.map do |fname|
623
- {
624
- name: fname,
625
- attributes: {
626
- createdtime: nil
627
- }
628
- }
629
- end
630
- end
631
- end
632
555
  end
@@ -53,6 +53,8 @@ module Remi
53
53
  # Assumes that each file has exactly the same structure
54
54
  result_df = nil
55
55
  extract.each_with_index do |filename, idx|
56
+ filename = filename.to_s
57
+
56
58
  @logger.info "Converting #{filename} to a dataframe"
57
59
  processed_filename = preprocess(filename)
58
60
  csv_df = Daru::DataFrame.from_csv processed_filename, @csv_options
@@ -71,16 +73,7 @@ module Remi
71
73
 
72
74
 
73
75
  def extractor=(arg)
74
- case arg
75
- when Extractor::SftpFile, Extractor::LocalFile, Extractor::S3File
76
- @extractor = arg
77
- when String
78
- @extractor = Extractor::LocalFile.new(path: arg)
79
- when Regexp
80
- raise "Adding regex matching to local files would be easy, not done yet"
81
- else
82
- raise "Unknown extractor of type #{arg.class}: #{arg}"
83
- end
76
+ @extractor = arg.respond_to?(:extract) ? arg : Extractor::LocalFile.new(remote_path: arg.to_s)
84
77
  end
85
78
 
86
79
  # Only going to support single file for now
@@ -18,7 +18,6 @@ module Remi
18
18
 
19
19
 
20
20
  class FileSystem
21
-
22
21
  class FileNotFoundError < StandardError; end
23
22
 
24
23
  def initialize(*args, remote_path:, pattern: /.*/, local_path: Settings.work_dir, most_recent_only: false, group_by: nil, most_recent_by: :create_time, logger: Remi::Settings.logger, **kargs, &block)
@@ -31,6 +30,12 @@ module Remi
31
30
  @logger = logger
32
31
  end
33
32
 
33
+ attr_reader :remote_path
34
+ attr_reader :pattern
35
+ attr_reader :local_path
36
+ attr_reader :most_recent_only
37
+ attr_reader :group_by
38
+ attr_reader :most_recent_by
34
39
  attr_reader :logger
35
40
 
36
41
  # Public: Called to extract files from the source filesystem.
@@ -0,0 +1,43 @@
1
+ module Remi
2
+ module Extractor
3
+
4
+ class LocalFile < FileSystem
5
+ def initialize(*args, **kargs)
6
+ super
7
+ init_local_file(*args, **kargs)
8
+ end
9
+
10
+ # Public: Called to extract files from the source filesystem.
11
+ #
12
+ # Returns an array containing the paths to all files extracted.
13
+ def extract
14
+ entries.map(&:pathname)
15
+ end
16
+
17
+ # Public: Returns an array of all FileSystemEntry instances that are in the remote_path.
18
+ def all_entries
19
+ @all_entries ||= all_entries!
20
+ end
21
+
22
+ def all_entries!
23
+ dir = @remote_path.directory? ? @remote_path + '*' : @remote_path
24
+ Dir[dir].map do |entry|
25
+ path = Pathname.new(entry)
26
+ if path.file?
27
+ FileSystemEntry.new(
28
+ pathname: path.realpath.to_s,
29
+ create_time: path.ctime,
30
+ modified_time: path.mtime
31
+ )
32
+ end
33
+ end.compact
34
+ end
35
+
36
+ private
37
+
38
+ def init_local_file(*args, **kargs)
39
+ end
40
+
41
+ end
42
+ end
43
+ end
@@ -1,125 +1,71 @@
1
1
  module Remi
2
2
  module Extractor
3
3
 
4
- class LocalFile
5
- def initialize(path:, folder: nil)
6
- @path = path
7
- @folder = folder
8
- end
9
-
10
- def extract
11
- if @folder
12
- Dir.entries(@folder).map do |entry|
13
- next unless entry.match(@path)
14
- File.join(@folder,entry)
15
- end.compact
16
- else
17
- @path
18
- end
19
- end
20
- end
21
-
22
- class SftpFile
23
-
24
- class FileNotFoundError < StandardError; end
4
+ class SftpFile < FileSystem
25
5
 
26
- SortDesc = Struct.new(:value) do
27
- def <=> (target)
28
- -(self.value <=> target.value)
29
- end
30
- end
6
+ N_RETRY = 3
31
7
 
32
- def initialize(credentials:, remote_file:, remote_folder: '', local_folder: Settings.work_dir, port: nil, most_recent_only: false, group_by: nil, most_recent_by: :createtime, logger: Remi::Settings.logger)
33
- @credentials = credentials
34
- @remote_file = remote_file
35
- @remote_folder = remote_folder
36
- @local_folder = local_folder
37
- @port = port || (credentials && credentials[:port]) || '22'
38
- @most_recent_only = most_recent_only
39
- @group_by = group_by
40
- @most_recent_by = most_recent_by
41
- @logger = logger
8
+ def initialize(*args, **kargs)
9
+ super
10
+ init_sftp_file(*args, **kargs)
42
11
  end
43
12
 
44
- attr_reader :logger
13
+ attr_reader :host
14
+ attr_reader :username
15
+ attr_reader :password
16
+ attr_reader :port
45
17
 
18
+ # Public: Called to extract files from the source filesystem.
19
+ #
20
+ # Returns an array containing the paths to all files extracted.
46
21
  def extract
47
- raise FileNotFoundError, "File not found: #{@remote_file}" if to_download.size == 0
48
- download(to_download)
49
- end
50
-
51
- def to_download
52
- if @group_by
53
- most_recent_in_group
54
- elsif @most_recent_only
55
- Array(most_recent_entry(matching_entries))
56
- else
57
- matching_entries
22
+ connection do |sftp|
23
+ entries.map do |entry|
24
+ local_file = File.join(@local_path, entry.name)
25
+ @logger.info "Downloading #{entry.name} to #{local_file}"
26
+ retry_download { sftp.download!(File.join(@remote_path, entry.name), local_file) }
27
+ local_file
28
+ end
58
29
  end
59
30
  end
60
31
 
61
- def all_entries(remote_folder = @remote_folder)
62
- @all_entries ||= connection { |sftp| sftp.dir.entries(File.join("/", remote_folder)) }
32
+ # Public: Returns an array of all FileSystemEntry instances that are in the remote_path.
33
+ def all_entries
34
+ @all_entries ||= all_entries!
63
35
  end
64
36
 
65
- def matching_entries(match_name = @remote_file)
66
- all_entries.select { |e| match_name.match e.name }
67
- end
68
-
69
- def most_recent_entry(entries = matching_entries)
70
- entries.sort_by { |e| sort_files_by(e) }.reverse!.first
71
- end
72
-
73
- def sort_files_by(entry)
74
- if @most_recent_by == :filename
75
- entry.name
76
- else
77
- entry.attributes.send(@most_recent_by)
37
+ def all_entries!
38
+ sftp_entries = connection { |sftp| sftp.dir.entries(@remote_path) }
39
+ sftp_entries.map do |entry|
40
+ # Early versions of the protocol don't support create time, fake it with modified time?
41
+ FileSystemEntry.new(
42
+ pathname: File.join(@remote_path, entry.name),
43
+ create_time: entry.attributes.respond_to?(:createtime) ? entry.attributes.createtime : entry.attributes.mtime,
44
+ modified_time: entry.attributes.mtime
45
+ )
78
46
  end
79
47
  end
80
48
 
81
- def most_recent_in_group(match_group = @group_by)
82
- entries_with_group = matching_entries.map do |entry|
83
- match = entry.name.match(match_group)
84
- next unless match
85
49
 
86
- group = match.to_a[1..-1]
87
- { group: group, entry: entry }
88
- end.compact
89
- entries_with_group.sort_by! { |e| [e[:group], SortDesc.new(sort_files_by(e[:entry]))] }
90
-
91
- last_group = nil
92
- entries_with_group.map do |entry|
93
- next unless entry[:group] != last_group
94
- last_group = entry[:group]
95
- entry[:entry]
96
- end.compact
97
- end
50
+ private
98
51
 
99
- def download(entries_to_download, remote_folder: @remote_folder, local_folder: @local_folder, ntry: 3)
100
- connection do |sftp|
101
- entries_to_download.map do |entry|
102
- local_file = File.join(local_folder, entry.name)
103
- @logger.info "Downloading #{entry.name} to #{local_file}"
104
- retry_download(ntry) { sftp.download!(File.join(remote_folder, entry.name), local_file) }
105
- local_file
106
- end
107
- end
52
+ def init_sftp_file(*args, credentials:, **kargs)
53
+ @host = credentials.fetch(:host)
54
+ @username = credentials.fetch(:username)
55
+ @password = credentials.fetch(:password)
56
+ @port = credentials.fetch(:port, '22')
108
57
  end
109
58
 
110
-
111
- private
112
-
113
59
  def connection(&block)
114
60
  result = nil
115
- Net::SFTP.start(@credentials[:host], @credentials[:username], password: @credentials[:password], port: @port) do |sftp|
61
+ Net::SFTP.start(@host, @username, password: @password, port: @port) do |sftp|
116
62
  result = yield sftp
117
63
  end
118
64
  result
119
65
  end
120
66
 
121
- def retry_download(ntry=2, &block)
122
- 1.upto(ntry).each do |itry|
67
+ def retry_download(&block)
68
+ 1.upto(N_RETRY).each do |itry|
123
69
  begin
124
70
  block.call
125
71
  break
@@ -132,5 +78,6 @@ module Remi
132
78
  end
133
79
  end
134
80
  end
81
+
135
82
  end
136
83
  end
data/lib/remi/fields.rb CHANGED
@@ -4,5 +4,26 @@ module Remi
4
4
  @fields = Hash.new({}).merge fields
5
5
  super(@fields)
6
6
  end
7
+
8
+
9
+ def dup
10
+ Fields.new(@fields.dup)
11
+ end
12
+
13
+ def merge(other_fields, prefix: nil)
14
+ dup.merge!(other_fields, prefix: prefix)
15
+ end
16
+
17
+ def merge!(other_fields, prefix: nil)
18
+ @fields.merge!(other_fields) do |key, this_val, other_val|
19
+ if prefix
20
+ @fields["#{prefix}#{key}".to_sym] = other_val
21
+ this_val
22
+ else
23
+ this_val.merge other_val
24
+ end
25
+ end
26
+ end
27
+
7
28
  end
8
29
  end
data/lib/remi/settings.rb CHANGED
@@ -10,6 +10,14 @@ module Remi
10
10
  @work_dir = arg
11
11
  end
12
12
 
13
+ def jobs_dir
14
+ @jobs_dir ||= Pathname.new('jobs').realpath
15
+ end
16
+
17
+ def jobs_dir=(arg)
18
+ @jobs_dir = Pathname.new(arg).realpath
19
+ end
20
+
13
21
  def log_level
14
22
  @log_level ||= Logger::INFO
15
23
  end
@@ -18,7 +18,7 @@ module Remi
18
18
  attr_accessor :target_metadata
19
19
 
20
20
  # Public: Set to true if the transform expects multiple arguments (default: false)
21
- attr_reader :multi_arg
21
+ attr_reader :multi_args
22
22
 
23
23
  # Public: Defines the operation of this transform class.
24
24
  #
@@ -36,7 +36,7 @@ module Remi
36
36
  #
37
37
  # Returns the transformed value.
38
38
  def call(*values)
39
- if @multi_arg
39
+ if @multi_args
40
40
  to_proc.call(*values)
41
41
  else
42
42
  to_proc.call(Array(values).first)
data/lib/remi/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Remi
2
- VERSION = '0.2.35'
2
+ VERSION = '0.2.36'
3
3
  end
@@ -22,7 +22,8 @@ describe DataSource::CsvFile do
22
22
  filename_field: :from_file
23
23
  )
24
24
 
25
- expect(csv.df[:from_file].to_a).to eq ['spec/fixtures/basic.csv'] * 2
25
+ expected_files = [Pathname.new('spec/fixtures/basic.csv').realpath.to_s] * 2
26
+ expect(csv.df[:from_file].to_a).to eq expected_files
26
27
  end
27
28
 
28
29
  it "preprocesses records when required" do
@@ -56,12 +57,11 @@ describe DataSource::CsvFile do
56
57
  expect(csv.df.to_a).to eq expected_df.to_a
57
58
  end
58
59
 
59
- # Do this when I retire the old LocalFile
60
- it "combines multiple csv files into a single dataframe", skip: 'TODO' do
60
+ it "combines multiple csv files into a single dataframe" do
61
61
  csv = Remi::DataSource::CsvFile.new(
62
62
  extractor: Remi::Extractor::LocalFile.new(
63
63
  remote_path: 'spec/fixtures',
64
- pattern: 'basic(|2)\.csv'
64
+ pattern: /basic(|2)\.csv/
65
65
  )
66
66
  )
67
67
 
@@ -69,9 +69,10 @@ describe DataSource::CsvFile do
69
69
  {
70
70
  column_a: ['value 1A', 'value 2A', 'value 1A', 'value 2A'],
71
71
  column_b: ['value 1B', 'value 2B', nil, nil],
72
- columb_c: [nil, nil, 'value 1C', 'value 2C']
72
+ column_c: [nil, nil, 'value 1C', 'value 2C']
73
73
  }
74
74
  )
75
+
75
76
  expect(csv.df.to_a).to eq expected_df.to_a
76
77
  end
77
78
 
@@ -0,0 +1,31 @@
1
+ require 'remi_spec'
2
+
3
+ describe Extractor::LocalFile do
4
+ let(:remote_path) { "#{Pathname.new(__FILE__).dirname}" }
5
+
6
+ let(:local_file) {
7
+ Extractor::LocalFile.new(
8
+ remote_path: remote_path
9
+ )
10
+ }
11
+
12
+ let(:remote_filenames) { Dir[remote_path + '/*'].map { |f| Pathname.new(f).basename.to_s } }
13
+
14
+ context '.new' do
15
+ it 'creates an instance with valid parameters' do
16
+ local_file
17
+ end
18
+ end
19
+
20
+ context '#all_entires' do
21
+ it 'returns all entries' do
22
+ expect(local_file.all_entries.map(&:name)).to eq remote_filenames
23
+ end
24
+ end
25
+
26
+ context '#extract' do
27
+ it 'references local files with the right names' do
28
+ expect(local_file.extract.map { |f| Pathname.new(f).basename.to_s }).to eq remote_filenames
29
+ end
30
+ end
31
+ end
@@ -1,125 +1,84 @@
1
1
  require 'remi_spec'
2
2
 
3
3
  describe Extractor::SftpFile do
4
- before do
5
- now = Time.new
4
+ let(:remote_path) { '' }
5
+ let(:credentials) {
6
+ {
7
+ host: 'host',
8
+ username: 'username',
9
+ password: 'password'
10
+ }
11
+ }
12
+
13
+ let(:sftp_file) {
14
+ Extractor::SftpFile.new(
15
+ credentials: credentials,
16
+ remote_path: remote_path
17
+ )
18
+ }
19
+
20
+ let(:remote_filenames) { ['file1.csv', 'file2.csv'] }
21
+ let(:sftp_session) { instance_double('Net:SFTP::Session') }
6
22
 
7
- example_files = [
8
- { name: "ApplicantsA-9.csv", createtime: now - 10.minutes },
9
- { name: "ApplicantsA-3.csv", createtime: now - 5.minutes },
10
- { name: "ApplicantsA-5.csv", createtime: now - 1.minutes },
11
- { name: "ApplicantsB-7.csv", createtime: now - 10.minutes },
12
- { name: "ApplicantsB-6.csv", createtime: now - 5.minutes },
13
- { name: "ApplicantsB-2.csv", createtime: now - 1.minutes },
14
- { name: "ApplicantsB-2.txt", createtime: now - 0.minutes },
15
- { name: "Apples.csv", createtime: now - 1.minutes },
16
- ]
23
+ before do
24
+ sftp_dir = instance_double('Net::SFTP::Operations::Dir')
17
25
 
18
- allow_any_instance_of(Extractor::SftpFile).to receive(:all_entries) do
19
- example_files.map do |file|
20
- Net::SFTP::Protocol::V04::Name.new(
21
- file[:name],
22
- Net::SFTP::Protocol::V04::Attributes.new(createtime: file[:createtime])
23
- )
24
- end
25
- end
26
+ allow(Net::SFTP).to receive(:start).and_yield sftp_session
27
+ allow(sftp_session).to receive(:dir).and_return sftp_dir
26
28
 
27
- @params = { credentials: nil }
29
+ allow(sftp_dir).to receive(:entries).and_return(remote_filenames.map { |fname|
30
+ Net::SFTP::Protocol::V04::Name.new(
31
+ fname,
32
+ Net::SFTP::Protocol::V04::Attributes.new(createtime: Time.new.to_i, mtime: Time.new.to_i)
33
+ )
34
+ })
28
35
  end
29
36
 
30
- let(:sftpfile) { Extractor::SftpFile.new(**@params) }
31
-
32
-
33
-
34
-
35
- context 'extracting all files matching a pattern' do
36
- before do
37
- @params[:remote_file] = /ApplicantsA-\d+\.csv/
37
+ context '.new' do
38
+ it 'creates an instance with valid parameters' do
39
+ sftp_file
38
40
  end
39
41
 
40
- it 'does not extract non-matching files' do
41
- expect(sftpfile.to_download.map(&:name)).not_to include "Apples.csv"
42
+ it 'requires a hostname' do
43
+ credentials.delete(:host)
44
+ expect { sftp_file }.to raise_error KeyError
42
45
  end
43
46
 
44
- it 'extracts all matching files' do
45
- expect(sftpfile.to_download.map(&:name)).to match_array([
46
- "ApplicantsA-9.csv",
47
- "ApplicantsA-3.csv",
48
- "ApplicantsA-5.csv"
49
- ])
47
+ it 'requires a username' do
48
+ credentials.delete(:username)
49
+ expect { sftp_file }.to raise_error KeyError
50
50
  end
51
- end
52
-
53
51
 
54
- context 'extracting only the most recent matching a pattern' do
55
- before do
56
- @params.merge!({
57
- remote_file: /ApplicantsA-\d+\.csv/,
58
- most_recent_only: true
59
- })
52
+ it 'requires a password' do
53
+ credentials.delete(:password)
54
+ expect { sftp_file }.to raise_error KeyError
60
55
  end
61
56
 
62
- it 'extracts only the most recent matching file' do
63
- expect(sftpfile.to_download.map(&:name)).to match_array([
64
- "ApplicantsA-5.csv"
65
- ])
57
+ it 'defaults to using port 22' do
58
+ expect(sftp_file.port).to eq '22'
66
59
  end
67
60
 
68
- context 'using filename instead of createtime' do
69
- before do
70
- @params[:most_recent_by] = :filename
71
- end
72
-
73
- it 'extracts only the most recent matching file' do
74
- expect(sftpfile.to_download.map(&:name)).to match_array([
75
- "ApplicantsA-9.csv"
76
- ])
77
- end
61
+ it 'allows the port to be defined in the credentials' do
62
+ credentials[:port] = '1234'
63
+ expect(sftp_file.port).to eq '1234'
78
64
  end
79
65
  end
80
66
 
81
-
82
- context 'extracting files matching a pattern with a by group' do
83
- before do
84
- @params.merge!({
85
- credentials: nil,
86
- remote_file: /^Applicants(A|B)-\d+\.csv/,
87
- group_by: /^Applicants(A|B)/
88
- })
89
- end
90
-
91
- it 'extracts the most recent file that matches a particular regex' do
92
- expect(sftpfile.to_download.map(&:name)).to match_array([
93
- "ApplicantsA-5.csv",
94
- "ApplicantsB-2.csv"
95
- ])
67
+ context '#all_entires' do
68
+ it 'returns all entries' do
69
+ expect(sftp_file.all_entries.map(&:name)).to eq remote_filenames
96
70
  end
71
+ end
97
72
 
98
- context 'with a minimally selective pre-filter' do
99
- before do
100
- @params[:remote_file] = /^Applicants/
101
- end
102
-
103
- it 'extracts the most recent file that matches a particular regex' do
104
- expect(sftpfile.to_download.map(&:name)).to match_array([
105
- "ApplicantsA-5.csv",
106
- "ApplicantsB-2.txt"
107
- ])
108
- end
73
+ context '#extract' do
74
+ it 'downloads files from the ftp' do
75
+ expect(sftp_session).to receive(:download!).exactly(remote_filenames.size).times
76
+ sftp_file.extract
109
77
  end
110
78
 
111
- context 'using filename instead of createtime' do
112
- before do
113
- @params[:most_recent_by] = :filename
114
- end
115
-
116
- it 'extracts only the most recent matching file' do
117
- expect(sftpfile.to_download.map(&:name)).to match_array([
118
- "ApplicantsA-9.csv",
119
- "ApplicantsB-7.csv"
120
- ])
121
- end
79
+ it 'creates local files with the right names' do
80
+ allow(sftp_session).to receive(:download!)
81
+ expect(sftp_file.extract.map { |f| Pathname.new(f).basename.to_s }).to eq remote_filenames
122
82
  end
123
-
124
83
  end
125
84
  end
@@ -0,0 +1,97 @@
1
+ require_relative 'remi_spec'
2
+
3
+ describe Fields do
4
+
5
+ let :base_fields do
6
+ Fields.new(
7
+ {
8
+ col1: { from: :base, base: true },
9
+ col2: { from: :base, base: true }
10
+ }
11
+ )
12
+ end
13
+
14
+ let :fields2 do
15
+
16
+ end
17
+
18
+ context "merging field sets" do
19
+
20
+ context "when there is no overlap" do
21
+ it "unions field sets" do
22
+ other_fields = Fields.new(
23
+ {
24
+ col3: {},
25
+ col4: {}
26
+ }
27
+ )
28
+
29
+ merged_fields = base_fields.merge other_fields
30
+
31
+ expect(merged_fields.keys).to eq [:col1, :col2, :col3, :col4]
32
+ end
33
+ end
34
+
35
+ context "when there is overlap" do
36
+ let :other_fields do
37
+ Fields.new(
38
+ {
39
+ col2: { from: :other, other: true },
40
+ col3: { from: :other, other: true }
41
+ }
42
+ )
43
+ end
44
+
45
+ it "unions field sets when there is overlap" do
46
+ merged_fields = base_fields.merge other_fields
47
+ expect(merged_fields.keys).to eq [:col1, :col2, :col3]
48
+ end
49
+
50
+ it "merges overlapping metadata" do
51
+ merged_fields = base_fields.merge other_fields
52
+
53
+ expect(merged_fields).to eq(
54
+ {
55
+ col1: { from: :base, base: true },
56
+ col2: { from: :other, base: true, other: true },
57
+ col3: { from: :other, other: true }
58
+ }
59
+ )
60
+ end
61
+
62
+ it "does not affect the original field sets" do
63
+ merged_fields = base_fields.merge other_fields
64
+
65
+ expect(base_fields).to eq(
66
+ {
67
+ col1: { from: :base, base: true },
68
+ col2: { from: :base, base: true }
69
+ }
70
+ )
71
+
72
+ expect(other_fields).to eq(
73
+ {
74
+ col2: { from: :other, other: true },
75
+ col3: { from: :other, other: true }
76
+ }
77
+ )
78
+ end
79
+
80
+ context "with a prefix" do
81
+ it "creates new fields for names that conflict" do
82
+ merged_fields = base_fields.merge other_fields, prefix: :other_
83
+
84
+ expect(merged_fields).to eq(
85
+ {
86
+ col1: { from: :base, base: true },
87
+ col2: { from: :base, base: true },
88
+ other_col2: { from: :other, other: true },
89
+ col3: { from: :other, other: true }
90
+ }
91
+ )
92
+ end
93
+ end
94
+
95
+ end
96
+ end
97
+ end
@@ -0,0 +1,53 @@
1
+ require_relative 'remi_spec'
2
+
3
+ describe Transform do
4
+
5
+ context 'a transform with a single argument' do
6
+ before do
7
+ class SingleArgument < Transform
8
+ def initialize(*args, **kargs, &block)
9
+ super
10
+ end
11
+
12
+ def transform(value)
13
+ value
14
+ end
15
+ end
16
+ end
17
+
18
+ let(:transform) { SingleArgument.new }
19
+
20
+ it 'can be converted into a proc and called' do
21
+ expect(transform.to_proc.call(5)).to eq 5
22
+ end
23
+
24
+ it 'can be called directly' do
25
+ expect(transform.call(5)).to eq 5
26
+ end
27
+ end
28
+
29
+ context 'a transform that accepts multiple arguments' do
30
+ before do
31
+ class MultipleArgument < Transform
32
+ def initialize(*args, **kargs, &block)
33
+ super
34
+ @multi_args = true
35
+ end
36
+
37
+ def transform(*values)
38
+ Array(values)
39
+ end
40
+ end
41
+ end
42
+
43
+ let(:transform) { MultipleArgument.new }
44
+
45
+ it 'can be converted into a proc and called' do
46
+ expect(transform.to_proc.call(1, 2)).to eq [1, 2]
47
+ end
48
+
49
+ it 'can be called directly' do
50
+ expect(transform.call(1, 2)).to eq [1, 2]
51
+ end
52
+ end
53
+ end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: remi
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.35
4
+ version: 0.2.36
5
5
  platform: ruby
6
6
  authors:
7
7
  - Sterling Paramore
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-06-08 00:00:00.000000000 Z
11
+ date: 2016-06-17 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bond
@@ -232,9 +232,9 @@ files:
232
232
  - lib/remi/data_subject/salesforce.rb
233
233
  - lib/remi/data_subject/sftp_file.rb
234
234
  - lib/remi/extractor/file_system.rb
235
+ - lib/remi/extractor/local_file.rb
235
236
  - lib/remi/extractor/s3_file.rb
236
237
  - lib/remi/extractor/sftp_file.rb
237
- - lib/remi/extractor/sftp_file_new.rb
238
238
  - lib/remi/field_symbolizers.rb
239
239
  - lib/remi/fields.rb
240
240
  - lib/remi/job.rb
@@ -248,13 +248,16 @@ files:
248
248
  - spec/data_subject/csv_file_spec.rb
249
249
  - spec/data_subject/data_frame.rb
250
250
  - spec/extractor/file_system_spec.rb
251
+ - spec/extractor/local_file_spec.rb
251
252
  - spec/extractor/s3_file_spec.rb
252
253
  - spec/extractor/sftp_file_spec.rb
254
+ - spec/fields_spec.rb
253
255
  - spec/fixtures/basic.csv
254
256
  - spec/fixtures/basic2.csv
255
257
  - spec/fixtures/unsupported_escape.csv
256
258
  - spec/metadata_spec.rb
257
259
  - spec/remi_spec.rb
260
+ - spec/transform_spec.rb
258
261
  - workbooks/sample_workbook.ipynb
259
262
  - workbooks/workbook_helper.rb
260
263
  homepage: https://github.com/inside-track/remi
@@ -304,10 +307,13 @@ test_files:
304
307
  - spec/data_subject/csv_file_spec.rb
305
308
  - spec/data_subject/data_frame.rb
306
309
  - spec/extractor/file_system_spec.rb
310
+ - spec/extractor/local_file_spec.rb
307
311
  - spec/extractor/s3_file_spec.rb
308
312
  - spec/extractor/sftp_file_spec.rb
313
+ - spec/fields_spec.rb
309
314
  - spec/fixtures/basic.csv
310
315
  - spec/fixtures/basic2.csv
311
316
  - spec/fixtures/unsupported_escape.csv
312
317
  - spec/metadata_spec.rb
313
318
  - spec/remi_spec.rb
319
+ - spec/transform_spec.rb
@@ -1,78 +0,0 @@
1
- module Remi
2
- module Extractor
3
-
4
- class SftpFileNew < FileSystem
5
-
6
- N_RETRY = 3
7
-
8
- def initialize(*args, **kargs)
9
- super
10
- init_sftp_file(*args, **kargs)
11
- end
12
-
13
- # Public: Called to extract files from the source filesystem.
14
- #
15
- # Returns an array containing the paths to all files extracted.
16
- def extract
17
- connection do |sftp|
18
- entries.map do |entry|
19
- local_file = File.join(@local_path, entry.name)
20
- @logger.info "Downloading #{entry.name} to #{local_file}"
21
- retry_download { sftp.download!(File.join(@remote_path, entry.name), local_file) }
22
- local_file
23
- end
24
- end
25
- end
26
-
27
- # Public: Returns an array of all FileSystemEntry instances that are in the remote_path.
28
- def all_entries
29
- @all_entries ||= all_entries!
30
- end
31
-
32
- def all_entries!
33
- sftp_entries = connection { |sftp| sftp.dir.entries(@remote_path.dirname) }
34
- sftp_entries.map do |entry|
35
- # Early versions of the protocol don't support create time, fake it with modified time?
36
- FileSystemEntry.new(
37
- name: File.join(@remote_path.dirname, entry.name),
38
- create_time: entry.respond_to?(:createtime) ? entry.createtime : entry.mtime,
39
- modified_time: entry.mtime
40
- )
41
- end
42
- end
43
-
44
-
45
- private
46
-
47
- def init_sftp_file(*args, credentials:, **kargs)
48
- @host = credentials.fetch(:host)
49
- @username = credentials.fetch(:username)
50
- @password = credentials.fetch(:password)
51
- @port = credentials.fetch(:port, '22')
52
- end
53
-
54
- def connection(&block)
55
- result = nil
56
- Net::SFTP.start(@host, @username, password: @password, port: @port) do |sftp|
57
- result = yield sftp
58
- end
59
- result
60
- end
61
-
62
- def retry_download(&block)
63
- 1.upto(N_RETRY).each do |itry|
64
- begin
65
- block.call
66
- break
67
- rescue RuntimeError => err
68
- raise err unless itry < ntry
69
- @logger.error "Download failed with error: #{err.message}"
70
- @logger.error "Retry attempt #{itry}/#{ntry-1}"
71
- sleep(1)
72
- end
73
- end
74
- end
75
- end
76
-
77
- end
78
- end