remi 0.2.35 → 0.2.36

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 6ac6de18abdeb2269c391a732e283e54c995344f
4
- data.tar.gz: 51f64d79e4b176bb4f2dbd15a4407fe0823aa13f
3
+ metadata.gz: 0b43c0f454a9c3df185534347b76d5d7ac0a37c4
4
+ data.tar.gz: 024381d3a8b2da98b1de66fddd14c0929a304d3d
5
5
  SHA512:
6
- metadata.gz: 9978c92842fc114224f06c72aacaca3175f9b6467b2189fcce5d8f60f26335da324f4f5daf4ae770f33b266ba5f1757f032dd0de7c3ea25f926cdd8f64b20c50
7
- data.tar.gz: 8390273e8305709141f4823d837cfcbf7b0fcf891ad2f9cdecdb418bdee5be11cc25da8e3daa9afa5ad14ec42e0a813b23ba2d220133dbce1e58ea82c551f99e
6
+ metadata.gz: bc680e8048b300f645c013a3f261506b6c4e1efdef0f240f37eebb0d393f2cc948de06d53d1d3912122011ba8d2755ac8e851c6399af79c13d68b467389a1118
7
+ data.tar.gz: cb5f72b9db98cdf7b79fcaee6a1597e9259b805779b93655ba0738b5e8a14240a0e57cc6078b92cdc2c414f21eb3af034eb78a185e974f3c28707667ea80f792
data/Gemfile.lock CHANGED
@@ -19,7 +19,7 @@ GIT
19
19
  PATH
20
20
  remote: .
21
21
  specs:
22
- remi (0.2.35)
22
+ remi (0.2.36)
23
23
  activesupport (~> 4.2)
24
24
  bond (~> 0.5)
25
25
  cucumber (~> 2.1)
@@ -33,10 +33,6 @@ Feature: This is a sample feature file.
33
33
  And files with names matching the pattern /^SampleFile_(\d+)\.txt/
34
34
  Then the file with the latest date stamp will be downloaded for processing
35
35
 
36
- Given files with names that do not match the pattern /^SampleFile_(\d+)\.txt/
37
- Then no files will be downloaded for processing
38
-
39
-
40
36
  Scenario: In order to be parsed and properly processed, the file must conform
41
37
  to expectations about its structure and content.
42
38
 
@@ -44,34 +44,23 @@ end
44
44
  ### Source file processing
45
45
 
46
46
  Given /^files with names matching the pattern \/(.*)\/$/ do |pattern|
47
- @brt.filestore.pattern(Regexp.new(pattern))
47
+ expect(@brt.source.data_subject.extractor.pattern).to eq Regexp.new(pattern)
48
48
  end
49
49
 
50
- Given /^files with names that do not match the pattern \/(.*)\/$/ do |pattern|
51
- @brt.filestore.anti_pattern(Regexp.new(pattern))
52
- end
53
-
54
- Given /^files delivered within the last (\d+) hours$/ do |hours|
55
- @brt.filestore.delivered_since(Time.now - hours.to_i * 3600)
56
- end
57
-
58
- Given /^files were delivered more than (\d+) hours ago$/ do |hours|
59
- @brt.filestore.delivered_before(Time.now - hours.to_i * 3600)
50
+ Given /^download groups defined by the pattern \/(.*)\/$/ do |pattern|
51
+ expect(@brt.source.data_subject.extractor.group_by).to eq Regexp.new(pattern)
60
52
  end
61
53
 
62
54
  Then /^the file with the latest date stamp will be downloaded for processing$/ do
63
- @brt.filestore.generate
64
- @brt.source.mock_extractor(@brt.filestore)
65
- expect(@brt.source.extract).to match_array Array(@brt.filestore.latest)
55
+ expect(@brt.source.data_subject.extractor.most_recent_by).to eq :create_time
66
56
  end
67
57
 
68
- Then /^files will be downloaded for processing$/ do
58
+ Then /^all files matching the pattern will be downloaded for processing$/ do
59
+ expect(@brt.source.data_subject.extractor.most_recent_only).to eq false
69
60
  end
70
61
 
71
- Then /^no files will be downloaded for processing$/ do
72
- @brt.filestore.generate
73
- @brt.source.mock_extractor(@brt.filestore)
74
- expect { @brt.source.extract }.to raise_error Remi::Extractor::SftpFile::FileNotFoundError
62
+ Then /^the file that comes last in an alphanumeric sort by group will be downloaded for processing$/ do
63
+ expect(@brt.source.data_subject.extractor.most_recent_by).to eq :name
75
64
  end
76
65
 
77
66
  Then /^the file is uploaded to the remote path "([^"]+)"$/ do |remote_path|
@@ -150,6 +139,19 @@ Given /^the source field (?:has|is set to) the value "([^"]*)"$/ do |value|
150
139
  end
151
140
  end
152
141
 
142
+ Given /^the source field '([^']+)' (?:has|is set to) the json value$/ do |source_field, value|
143
+ step "the source field '#{source_field}'"
144
+
145
+ source_name, source_field_name = @brt.sources.parse_full_field(source_field)
146
+ @brt.sources[source_name].fields[source_field_name].value = JSON.parse(value)
147
+ end
148
+
149
+ Given /^the source field (?:has|is set to) the json value$/ do |value|
150
+ @brt.sources.fields.each do |field|
151
+ step "the source field '#{field.full_name}' is set to the json value \"#{value}\""
152
+ end
153
+ end
154
+
153
155
  When /^the source field '([^']+)' (?:has an empty value|is blank)$/ do |source_field|
154
156
  step "the source field '#{source_field}'"
155
157
 
@@ -6,11 +6,20 @@ require 'remi'
6
6
  require 'remi/cucumber'
7
7
 
8
8
  Remi::Settings.log_level = Logger::ERROR
9
+ Remi::Settings.jobs_dir = File.join(__dir__, '../../jobs')
9
10
 
10
11
  Before do
11
12
  # Restart the random number generator prior to each scenario to
12
13
  # ensure we have reproducibility of random output
13
14
  Kernel.srand(35983958269835333)
15
+
16
+ # Monkey patch faker gem so that dummy random dates and ranges are generated consistently
17
+ class Faker::Base
18
+ def self.rand_in_range(from, to)
19
+ from, to = to, from if to < from
20
+ Random.rand(from..to)
21
+ end
22
+ end
14
23
  end
15
24
 
16
25
  After do
data/jobs/sample_job.rb CHANGED
@@ -27,8 +27,8 @@ class SampleJob
27
27
  define_source :sample_file, Remi::DataSource::CsvFile,
28
28
  extractor: Remi::Extractor::SftpFile.new(
29
29
  credentials: params[:sftp],
30
- remote_file: /^SampleFile_(\d+)\.txt/,
31
- remote_folder: '/',
30
+ remote_path: '/',
31
+ pattern: /^SampleFile_(\d+)\.txt/,
32
32
  most_recent_only: true
33
33
  ),
34
34
  csv_options: {
data/lib/remi.rb CHANGED
@@ -42,8 +42,9 @@ require 'remi/field_symbolizers'
42
42
 
43
43
  require 'remi/refinements/symbolizer'
44
44
 
45
- require 'remi/extractor/sftp_file' # deprecated
46
45
  require 'remi/extractor/file_system'
46
+ require 'remi/extractor/local_file'
47
+ require 'remi/extractor/sftp_file'
47
48
  require 'remi/extractor/s3_file'
48
49
 
49
50
 
@@ -113,6 +113,7 @@ module Remi::BusinessRules
113
113
 
114
114
  def initialize(job_name)
115
115
  job_class_name = "#{job_name.gsub(/\s/,'')}Job"
116
+ require_job_file(job_class_name)
116
117
  @job = Object.const_get(job_class_name).new
117
118
 
118
119
  @job_sources = DataSubjectCollection.new
@@ -121,8 +122,6 @@ module Remi::BusinessRules
121
122
  @sources = DataSubjectCollection.new
122
123
  @targets = DataSubjectCollection.new
123
124
  @examples = DataExampleCollection.new
124
-
125
- @filestore = Filestore.new
126
125
  end
127
126
 
128
127
  attr_reader :job
@@ -131,7 +130,13 @@ module Remi::BusinessRules
131
130
  attr_reader :sources
132
131
  attr_reader :targets
133
132
  attr_reader :examples
134
- attr_reader :filestore
133
+
134
+ def require_job_file(job_class_name)
135
+ job_file = Dir["#{Remi::Settings.jobs_dir}/**/*_job.rb"].map do |fname|
136
+ fname if File.basename(fname) == "#{job_class_name.underscore}.rb"
137
+ end.compact.pop
138
+ require job_file
139
+ end
135
140
 
136
141
  def add_job_source(name)
137
142
  raise "Unknown source #{name} for job" unless @job.methods.include? name.symbolize
@@ -396,17 +401,6 @@ module Remi::BusinessRules
396
401
  @data_subject.df[vector_name].recode! { |v| i += 1 }
397
402
  end
398
403
 
399
- def mock_extractor(filestore)
400
- extractor = class << @data_subject.extractor; self; end
401
-
402
- extractor.send(:define_method, :all_entries, ->() { filestore.sftp_entries })
403
- extractor.send(:define_method, :download, ->(to_download) { to_download.map { |e| e.name } })
404
- end
405
-
406
- def extract
407
- @data_subject.extractor.extract
408
- end
409
-
410
404
  def csv_options
411
405
  @data_subject.csv_options
412
406
  end
@@ -558,75 +552,4 @@ module Remi::BusinessRules
558
552
  end
559
553
  end
560
554
  end
561
-
562
-
563
- class Filestore
564
- def initialize
565
- @files = []
566
- @delivered = {}
567
- end
568
-
569
- attr_reader :sftp_entries
570
-
571
- def pattern(pattern)
572
- @pattern = pattern
573
- end
574
-
575
- def anti_pattern(pattern)
576
- @pattern = /^ThisBetterNeverMatchAnythingOrIWillShootYou\d{8}Times$/
577
- end
578
-
579
- def delivered_since(date_time)
580
- @delivered = { :since => date_time }
581
- end
582
-
583
- def delivered_before(date_time)
584
- @delivered = { :before => date_time }
585
- end
586
-
587
- def latest
588
- @files.max_by { |f| f[:attributes][:createdtime] }[:name]
589
- end
590
-
591
- def generate
592
- psuedorand = Random.new(4985674985672348954987589429)
593
-
594
- generate_files_with_pattern
595
- @files.map! do |file|
596
- date_method = @delivered.keys.first
597
- if date_method == :since
598
- file[:attributes][:createdtime] = @delivered[:since] + 10 + psuedorand.rand * 100
599
- elsif date_method == :before
600
- file[:attributes][:createdtime] = @delivered[:since] - 10 - psuedorand.rand * 100
601
- else
602
- file[:attributes][:createdtime] = Time.now - 10 - psuedorand.rand * 100
603
- end
604
- file
605
- end
606
- end
607
-
608
- def sftp_entries
609
- @files.map do |file|
610
- Net::SFTP::Protocol::V04::Name.new(
611
- file[:name],
612
- Net::SFTP::Protocol::V04::Attributes.new(createtime: file[:attributes][:createdtime])
613
- )
614
- end
615
- end
616
-
617
- private
618
-
619
- def generate_files_with_pattern
620
- filenames = 1.upto(5).map { |f| @pattern.random_example }.uniq
621
-
622
- @files = filenames.map do |fname|
623
- {
624
- name: fname,
625
- attributes: {
626
- createdtime: nil
627
- }
628
- }
629
- end
630
- end
631
- end
632
555
  end
@@ -53,6 +53,8 @@ module Remi
53
53
  # Assumes that each file has exactly the same structure
54
54
  result_df = nil
55
55
  extract.each_with_index do |filename, idx|
56
+ filename = filename.to_s
57
+
56
58
  @logger.info "Converting #{filename} to a dataframe"
57
59
  processed_filename = preprocess(filename)
58
60
  csv_df = Daru::DataFrame.from_csv processed_filename, @csv_options
@@ -71,16 +73,7 @@ module Remi
71
73
 
72
74
 
73
75
  def extractor=(arg)
74
- case arg
75
- when Extractor::SftpFile, Extractor::LocalFile, Extractor::S3File
76
- @extractor = arg
77
- when String
78
- @extractor = Extractor::LocalFile.new(path: arg)
79
- when Regexp
80
- raise "Adding regex matching to local files would be easy, not done yet"
81
- else
82
- raise "Unknown extractor of type #{arg.class}: #{arg}"
83
- end
76
+ @extractor = arg.respond_to?(:extract) ? arg : Extractor::LocalFile.new(remote_path: arg.to_s)
84
77
  end
85
78
 
86
79
  # Only going to support single file for now
@@ -18,7 +18,6 @@ module Remi
18
18
 
19
19
 
20
20
  class FileSystem
21
-
22
21
  class FileNotFoundError < StandardError; end
23
22
 
24
23
  def initialize(*args, remote_path:, pattern: /.*/, local_path: Settings.work_dir, most_recent_only: false, group_by: nil, most_recent_by: :create_time, logger: Remi::Settings.logger, **kargs, &block)
@@ -31,6 +30,12 @@ module Remi
31
30
  @logger = logger
32
31
  end
33
32
 
33
+ attr_reader :remote_path
34
+ attr_reader :pattern
35
+ attr_reader :local_path
36
+ attr_reader :most_recent_only
37
+ attr_reader :group_by
38
+ attr_reader :most_recent_by
34
39
  attr_reader :logger
35
40
 
36
41
  # Public: Called to extract files from the source filesystem.
@@ -0,0 +1,43 @@
1
+ module Remi
2
+ module Extractor
3
+
4
+ class LocalFile < FileSystem
5
+ def initialize(*args, **kargs)
6
+ super
7
+ init_local_file(*args, **kargs)
8
+ end
9
+
10
+ # Public: Called to extract files from the source filesystem.
11
+ #
12
+ # Returns an array with containing the paths to all files extracted.
13
+ def extract
14
+ entries.map(&:pathname)
15
+ end
16
+
17
+ # Public: Returns an array of all FileSystemEntry instances that are in the remote_path.
18
+ def all_entries
19
+ @all_entries ||= all_entries!
20
+ end
21
+
22
+ def all_entries!
23
+ dir = @remote_path.directory? ? @remote_path + '*' : @remote_path
24
+ Dir[dir].map do |entry|
25
+ path = Pathname.new(entry)
26
+ if path.file?
27
+ FileSystemEntry.new(
28
+ pathname: path.realpath.to_s,
29
+ create_time: path.ctime,
30
+ modified_time: path.mtime
31
+ )
32
+ end
33
+ end.compact
34
+ end
35
+
36
+ private
37
+
38
+ def init_local_file(*args, **kargs)
39
+ end
40
+
41
+ end
42
+ end
43
+ end
@@ -1,125 +1,71 @@
1
1
  module Remi
2
2
  module Extractor
3
3
 
4
- class LocalFile
5
- def initialize(path:, folder: nil)
6
- @path = path
7
- @folder = folder
8
- end
9
-
10
- def extract
11
- if @folder
12
- Dir.entries(@folder).map do |entry|
13
- next unless entry.match(@path)
14
- File.join(@folder,entry)
15
- end.compact
16
- else
17
- @path
18
- end
19
- end
20
- end
21
-
22
- class SftpFile
23
-
24
- class FileNotFoundError < StandardError; end
4
+ class SftpFile < FileSystem
25
5
 
26
- SortDesc = Struct.new(:value) do
27
- def <=> (target)
28
- -(self.value <=> target.value)
29
- end
30
- end
6
+ N_RETRY = 3
31
7
 
32
- def initialize(credentials:, remote_file:, remote_folder: '', local_folder: Settings.work_dir, port: nil, most_recent_only: false, group_by: nil, most_recent_by: :createtime, logger: Remi::Settings.logger)
33
- @credentials = credentials
34
- @remote_file = remote_file
35
- @remote_folder = remote_folder
36
- @local_folder = local_folder
37
- @port = port || (credentials && credentials[:port]) || '22'
38
- @most_recent_only = most_recent_only
39
- @group_by = group_by
40
- @most_recent_by = most_recent_by
41
- @logger = logger
8
+ def initialize(*args, **kargs)
9
+ super
10
+ init_sftp_file(*args, **kargs)
42
11
  end
43
12
 
44
- attr_reader :logger
13
+ attr_reader :host
14
+ attr_reader :username
15
+ attr_reader :password
16
+ attr_reader :port
45
17
 
18
+ # Public: Called to extract files from the source filesystem.
19
+ #
20
+ # Returns an array with containing the paths to all files extracted.
46
21
  def extract
47
- raise FileNotFoundError, "File not found: #{@remote_file}" if to_download.size == 0
48
- download(to_download)
49
- end
50
-
51
- def to_download
52
- if @group_by
53
- most_recent_in_group
54
- elsif @most_recent_only
55
- Array(most_recent_entry(matching_entries))
56
- else
57
- matching_entries
22
+ connection do |sftp|
23
+ entries.map do |entry|
24
+ local_file = File.join(@local_path, entry.name)
25
+ @logger.info "Downloading #{entry.name} to #{local_file}"
26
+ retry_download { sftp.download!(File.join(@remote_path, entry.name), local_file) }
27
+ local_file
28
+ end
58
29
  end
59
30
  end
60
31
 
61
- def all_entries(remote_folder = @remote_folder)
62
- @all_entries ||= connection { |sftp| sftp.dir.entries(File.join("/", remote_folder)) }
32
+ # Public: Returns an array of all FileSystemEntry instances that are in the remote_path.
33
+ def all_entries
34
+ @all_entries ||= all_entries!
63
35
  end
64
36
 
65
- def matching_entries(match_name = @remote_file)
66
- all_entries.select { |e| match_name.match e.name }
67
- end
68
-
69
- def most_recent_entry(entries = matching_entries)
70
- entries.sort_by { |e| sort_files_by(e) }.reverse!.first
71
- end
72
-
73
- def sort_files_by(entry)
74
- if @most_recent_by == :filename
75
- entry.name
76
- else
77
- entry.attributes.send(@most_recent_by)
37
+ def all_entries!
38
+ sftp_entries = connection { |sftp| sftp.dir.entries(@remote_path) }
39
+ sftp_entries.map do |entry|
40
+ # Early versions of the protocol don't support create time, fake it with modified time?
41
+ FileSystemEntry.new(
42
+ pathname: File.join(@remote_path, entry.name),
43
+ create_time: entry.attributes.respond_to?(:createtime) ? entry.attributes.createtime : entry.attributes.mtime,
44
+ modified_time: entry.attributes.mtime
45
+ )
78
46
  end
79
47
  end
80
48
 
81
- def most_recent_in_group(match_group = @group_by)
82
- entries_with_group = matching_entries.map do |entry|
83
- match = entry.name.match(match_group)
84
- next unless match
85
49
 
86
- group = match.to_a[1..-1]
87
- { group: group, entry: entry }
88
- end.compact
89
- entries_with_group.sort_by! { |e| [e[:group], SortDesc.new(sort_files_by(e[:entry]))] }
90
-
91
- last_group = nil
92
- entries_with_group.map do |entry|
93
- next unless entry[:group] != last_group
94
- last_group = entry[:group]
95
- entry[:entry]
96
- end.compact
97
- end
50
+ private
98
51
 
99
- def download(entries_to_download, remote_folder: @remote_folder, local_folder: @local_folder, ntry: 3)
100
- connection do |sftp|
101
- entries_to_download.map do |entry|
102
- local_file = File.join(local_folder, entry.name)
103
- @logger.info "Downloading #{entry.name} to #{local_file}"
104
- retry_download(ntry) { sftp.download!(File.join(remote_folder, entry.name), local_file) }
105
- local_file
106
- end
107
- end
52
+ def init_sftp_file(*args, credentials:, **kargs)
53
+ @host = credentials.fetch(:host)
54
+ @username = credentials.fetch(:username)
55
+ @password = credentials.fetch(:password)
56
+ @port = credentials.fetch(:port, '22')
108
57
  end
109
58
 
110
-
111
- private
112
-
113
59
  def connection(&block)
114
60
  result = nil
115
- Net::SFTP.start(@credentials[:host], @credentials[:username], password: @credentials[:password], port: @port) do |sftp|
61
+ Net::SFTP.start(@host, @username, password: @password, port: @port) do |sftp|
116
62
  result = yield sftp
117
63
  end
118
64
  result
119
65
  end
120
66
 
121
- def retry_download(ntry=2, &block)
122
- 1.upto(ntry).each do |itry|
67
+ def retry_download(&block)
68
+ 1.upto(N_RETRY).each do |itry|
123
69
  begin
124
70
  block.call
125
71
  break
@@ -132,5 +78,6 @@ module Remi
132
78
  end
133
79
  end
134
80
  end
81
+
135
82
  end
136
83
  end
data/lib/remi/fields.rb CHANGED
@@ -4,5 +4,26 @@ module Remi
4
4
  @fields = Hash.new({}).merge fields
5
5
  super(@fields)
6
6
  end
7
+
8
+
9
+ def dup
10
+ Fields.new(@fields.dup)
11
+ end
12
+
13
+ def merge(other_fields, prefix: nil)
14
+ dup.merge!(other_fields, prefix: prefix)
15
+ end
16
+
17
+ def merge!(other_fields, prefix: nil)
18
+ @fields.merge!(other_fields) do |key, this_val, other_val|
19
+ if prefix
20
+ @fields["#{prefix}#{key}".to_sym] = other_val
21
+ this_val
22
+ else
23
+ this_val.merge other_val
24
+ end
25
+ end
26
+ end
27
+
7
28
  end
8
29
  end
data/lib/remi/settings.rb CHANGED
@@ -10,6 +10,14 @@ module Remi
10
10
  @work_dir = arg
11
11
  end
12
12
 
13
+ def jobs_dir
14
+ @jobs_dir ||= Pathname.new('jobs').realpath
15
+ end
16
+
17
+ def jobs_dir=(arg)
18
+ @jobs_dir = Pathname.new(arg).realpath
19
+ end
20
+
13
21
  def log_level
14
22
  @log_level ||= Logger::INFO
15
23
  end
@@ -18,7 +18,7 @@ module Remi
18
18
  attr_accessor :target_metadata
19
19
 
20
20
  # Public: Set to true if the transform expects multiple arguments (default: false)
21
- attr_reader :multi_arg
21
+ attr_reader :multi_args
22
22
 
23
23
  # Public: Defines the operation of this transform class.
24
24
  #
@@ -36,7 +36,7 @@ module Remi
36
36
  #
37
37
  # Returns the transformed value.
38
38
  def call(*values)
39
- if @multi_arg
39
+ if @multi_args
40
40
  to_proc.call(*values)
41
41
  else
42
42
  to_proc.call(Array(values).first)
data/lib/remi/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Remi
2
- VERSION = '0.2.35'
2
+ VERSION = '0.2.36'
3
3
  end
@@ -22,7 +22,8 @@ describe DataSource::CsvFile do
22
22
  filename_field: :from_file
23
23
  )
24
24
 
25
- expect(csv.df[:from_file].to_a).to eq ['spec/fixtures/basic.csv'] * 2
25
+ expected_files = [Pathname.new('spec/fixtures/basic.csv').realpath.to_s] * 2
26
+ expect(csv.df[:from_file].to_a).to eq expected_files
26
27
  end
27
28
 
28
29
  it "preprocesses records when required" do
@@ -56,12 +57,11 @@ describe DataSource::CsvFile do
56
57
  expect(csv.df.to_a).to eq expected_df.to_a
57
58
  end
58
59
 
59
- # Do this when I retire the old LocalFile
60
- it "combines multiple csv files into a single dataframe", skip: 'TODO' do
60
+ it "combines multiple csv files into a single dataframe" do
61
61
  csv = Remi::DataSource::CsvFile.new(
62
62
  extractor: Remi::Extractor::LocalFile.new(
63
63
  remote_path: 'spec/fixtures',
64
- pattern: 'basic(|2)\.csv'
64
+ pattern: /basic(|2)\.csv/
65
65
  )
66
66
  )
67
67
 
@@ -69,9 +69,10 @@ describe DataSource::CsvFile do
69
69
  {
70
70
  column_a: ['value 1A', 'value 2A', 'value 1A', 'value 2A'],
71
71
  column_b: ['value 1B', 'value 2B', nil, nil],
72
- columb_c: [nil, nil, 'value 1C', 'value 2C']
72
+ column_c: [nil, nil, 'value 1C', 'value 2C']
73
73
  }
74
74
  )
75
+
75
76
  expect(csv.df.to_a).to eq expected_df.to_a
76
77
  end
77
78
 
@@ -0,0 +1,31 @@
1
+ require 'remi_spec'
2
+
3
+ describe Extractor::LocalFile do
4
+ let(:remote_path) { "#{Pathname.new(__FILE__).dirname}" }
5
+
6
+ let(:local_file) {
7
+ Extractor::LocalFile.new(
8
+ remote_path: remote_path
9
+ )
10
+ }
11
+
12
+ let(:remote_filenames) { Dir[remote_path + '/*'].map { |f| Pathname.new(f).basename.to_s } }
13
+
14
+ context '.new' do
15
+ it 'creates an instance with valid parameters' do
16
+ local_file
17
+ end
18
+ end
19
+
20
+ context '#all_entires' do
21
+ it 'returns all entries' do
22
+ expect(local_file.all_entries.map(&:name)).to eq remote_filenames
23
+ end
24
+ end
25
+
26
+ context '#extract' do
27
+ it 'references local files with the right names' do
28
+ expect(local_file.extract.map { |f| Pathname.new(f).basename.to_s }).to eq remote_filenames
29
+ end
30
+ end
31
+ end
@@ -1,125 +1,84 @@
1
1
  require 'remi_spec'
2
2
 
3
3
  describe Extractor::SftpFile do
4
- before do
5
- now = Time.new
4
+ let(:remote_path) { '' }
5
+ let(:credentials) {
6
+ {
7
+ host: 'host',
8
+ username: 'username',
9
+ password: 'password'
10
+ }
11
+ }
12
+
13
+ let(:sftp_file) {
14
+ Extractor::SftpFile.new(
15
+ credentials: credentials,
16
+ remote_path: remote_path
17
+ )
18
+ }
19
+
20
+ let(:remote_filenames) { ['file1.csv', 'file2.csv'] }
21
+ let(:sftp_session) { instance_double('Net:SFTP::Session') }
6
22
 
7
- example_files = [
8
- { name: "ApplicantsA-9.csv", createtime: now - 10.minutes },
9
- { name: "ApplicantsA-3.csv", createtime: now - 5.minutes },
10
- { name: "ApplicantsA-5.csv", createtime: now - 1.minutes },
11
- { name: "ApplicantsB-7.csv", createtime: now - 10.minutes },
12
- { name: "ApplicantsB-6.csv", createtime: now - 5.minutes },
13
- { name: "ApplicantsB-2.csv", createtime: now - 1.minutes },
14
- { name: "ApplicantsB-2.txt", createtime: now - 0.minutes },
15
- { name: "Apples.csv", createtime: now - 1.minutes },
16
- ]
23
+ before do
24
+ sftp_dir = instance_double('Net::SFTP::Operations::Dir')
17
25
 
18
- allow_any_instance_of(Extractor::SftpFile).to receive(:all_entries) do
19
- example_files.map do |file|
20
- Net::SFTP::Protocol::V04::Name.new(
21
- file[:name],
22
- Net::SFTP::Protocol::V04::Attributes.new(createtime: file[:createtime])
23
- )
24
- end
25
- end
26
+ allow(Net::SFTP).to receive(:start).and_yield sftp_session
27
+ allow(sftp_session).to receive(:dir).and_return sftp_dir
26
28
 
27
- @params = { credentials: nil }
29
+ allow(sftp_dir).to receive(:entries).and_return(remote_filenames.map { |fname|
30
+ Net::SFTP::Protocol::V04::Name.new(
31
+ fname,
32
+ Net::SFTP::Protocol::V04::Attributes.new(createtime: Time.new.to_i, mtime: Time.new.to_i)
33
+ )
34
+ })
28
35
  end
29
36
 
30
- let(:sftpfile) { Extractor::SftpFile.new(**@params) }
31
-
32
-
33
-
34
-
35
- context 'extracting all files matching a pattern' do
36
- before do
37
- @params[:remote_file] = /ApplicantsA-\d+\.csv/
37
+ context '.new' do
38
+ it 'creates an instance with valid parameters' do
39
+ sftp_file
38
40
  end
39
41
 
40
- it 'does not extract non-matching files' do
41
- expect(sftpfile.to_download.map(&:name)).not_to include "Apples.csv"
42
+ it 'requires a hostname' do
43
+ credentials.delete(:host)
44
+ expect { sftp_file }.to raise_error KeyError
42
45
  end
43
46
 
44
- it 'extracts all matching files' do
45
- expect(sftpfile.to_download.map(&:name)).to match_array([
46
- "ApplicantsA-9.csv",
47
- "ApplicantsA-3.csv",
48
- "ApplicantsA-5.csv"
49
- ])
47
+ it 'requires a username' do
48
+ credentials.delete(:username)
49
+ expect { sftp_file }.to raise_error KeyError
50
50
  end
51
- end
52
-
53
51
 
54
- context 'extracting only the most recent matching a pattern' do
55
- before do
56
- @params.merge!({
57
- remote_file: /ApplicantsA-\d+\.csv/,
58
- most_recent_only: true
59
- })
52
+ it 'requires a password' do
53
+ credentials.delete(:password)
54
+ expect { sftp_file }.to raise_error KeyError
60
55
  end
61
56
 
62
- it 'extracts only the most recent matching file' do
63
- expect(sftpfile.to_download.map(&:name)).to match_array([
64
- "ApplicantsA-5.csv"
65
- ])
57
+ it 'defaults to using port 22' do
58
+ expect(sftp_file.port).to eq '22'
66
59
  end
67
60
 
68
- context 'using filename instead of createtime' do
69
- before do
70
- @params[:most_recent_by] = :filename
71
- end
72
-
73
- it 'extracts only the most recent matching file' do
74
- expect(sftpfile.to_download.map(&:name)).to match_array([
75
- "ApplicantsA-9.csv"
76
- ])
77
- end
61
+ it 'allows the port to be defined in the credentials' do
62
+ credentials[:port] = '1234'
63
+ expect(sftp_file.port).to eq '1234'
78
64
  end
79
65
  end
80
66
 
81
-
82
- context 'extracting files matching a pattern with a by group' do
83
- before do
84
- @params.merge!({
85
- credentials: nil,
86
- remote_file: /^Applicants(A|B)-\d+\.csv/,
87
- group_by: /^Applicants(A|B)/
88
- })
89
- end
90
-
91
- it 'extracts the most recent file that matches a particular regex' do
92
- expect(sftpfile.to_download.map(&:name)).to match_array([
93
- "ApplicantsA-5.csv",
94
- "ApplicantsB-2.csv"
95
- ])
67
+ context '#all_entires' do
68
+ it 'returns all entries' do
69
+ expect(sftp_file.all_entries.map(&:name)).to eq remote_filenames
96
70
  end
71
+ end
97
72
 
98
- context 'with a minimally selective pre-filter' do
99
- before do
100
- @params[:remote_file] = /^Applicants/
101
- end
102
-
103
- it 'extracts the most recent file that matches a particular regex' do
104
- expect(sftpfile.to_download.map(&:name)).to match_array([
105
- "ApplicantsA-5.csv",
106
- "ApplicantsB-2.txt"
107
- ])
108
- end
73
+ context '#extract' do
74
+ it 'downloads files from the ftp' do
75
+ expect(sftp_session).to receive(:download!).exactly(remote_filenames.size).times
76
+ sftp_file.extract
109
77
  end
110
78
 
111
- context 'using filename instead of createtime' do
112
- before do
113
- @params[:most_recent_by] = :filename
114
- end
115
-
116
- it 'extracts only the most recent matching file' do
117
- expect(sftpfile.to_download.map(&:name)).to match_array([
118
- "ApplicantsA-9.csv",
119
- "ApplicantsB-7.csv"
120
- ])
121
- end
79
+ it 'creates local files with the right names' do
80
+ allow(sftp_session).to receive(:download!)
81
+ expect(sftp_file.extract.map { |f| Pathname.new(f).basename.to_s }).to eq remote_filenames
122
82
  end
123
-
124
83
  end
125
84
  end
@@ -0,0 +1,97 @@
1
+ require_relative 'remi_spec'
2
+
3
+ describe Fields do
4
+
5
+ let :base_fields do
6
+ Fields.new(
7
+ {
8
+ col1: { from: :base, base: true },
9
+ col2: { from: :base, base: true }
10
+ }
11
+ )
12
+ end
13
+
14
+ let :fields2 do
15
+
16
+ end
17
+
18
+ context "merging field sets" do
19
+
20
+ context "when there is no overlap" do
21
+ it "unions field sets" do
22
+ other_fields = Fields.new(
23
+ {
24
+ col3: {},
25
+ col4: {}
26
+ }
27
+ )
28
+
29
+ merged_fields = base_fields.merge other_fields
30
+
31
+ expect(merged_fields.keys).to eq [:col1, :col2, :col3, :col4]
32
+ end
33
+ end
34
+
35
+ context "when there is overlap" do
36
+ let :other_fields do
37
+ Fields.new(
38
+ {
39
+ col2: { from: :other, other: true },
40
+ col3: { from: :other, other: true }
41
+ }
42
+ )
43
+ end
44
+
45
+ it "unions field sets when there is overlap" do
46
+ merged_fields = base_fields.merge other_fields
47
+ expect(merged_fields.keys).to eq [:col1, :col2, :col3]
48
+ end
49
+
50
+ it "merges overlapping metadata" do
51
+ merged_fields = base_fields.merge other_fields
52
+
53
+ expect(merged_fields).to eq(
54
+ {
55
+ col1: { from: :base, base: true },
56
+ col2: { from: :other, base: true, other: true },
57
+ col3: { from: :other, other: true }
58
+ }
59
+ )
60
+ end
61
+
62
+ it "does not affect the original field sets" do
63
+ merged_fields = base_fields.merge other_fields
64
+
65
+ expect(base_fields).to eq(
66
+ {
67
+ col1: { from: :base, base: true },
68
+ col2: { from: :base, base: true }
69
+ }
70
+ )
71
+
72
+ expect(other_fields).to eq(
73
+ {
74
+ col2: { from: :other, other: true },
75
+ col3: { from: :other, other: true }
76
+ }
77
+ )
78
+ end
79
+
80
+ context "with a prefix" do
81
+ it "creates new fields for names that conflict" do
82
+ merged_fields = base_fields.merge other_fields, prefix: :other_
83
+
84
+ expect(merged_fields).to eq(
85
+ {
86
+ col1: { from: :base, base: true },
87
+ col2: { from: :base, base: true },
88
+ other_col2: { from: :other, other: true },
89
+ col3: { from: :other, other: true }
90
+ }
91
+ )
92
+ end
93
+ end
94
+
95
+ end
96
+ end
97
+ end
@@ -0,0 +1,53 @@
1
+ require_relative 'remi_spec'
2
+
3
+ describe Transform do
4
+
5
+ context 'a transform with a single argument' do
6
+ before do
7
+ class SingleArgument < Transform
8
+ def initialize(*args, **kargs, &block)
9
+ super
10
+ end
11
+
12
+ def transform(value)
13
+ value
14
+ end
15
+ end
16
+ end
17
+
18
+ let(:transform) { SingleArgument.new }
19
+
20
+ it 'can be converted into a proc and called' do
21
+ expect(transform.to_proc.call(5)).to eq 5
22
+ end
23
+
24
+ it 'can be called directly' do
25
+ expect(transform.call(5)).to eq 5
26
+ end
27
+ end
28
+
29
+ context 'a transform that accepts multiple arguments' do
30
+ before do
31
+ class MultipleArgument < Transform
32
+ def initialize(*args, **kargs, &block)
33
+ super
34
+ @multi_args = true
35
+ end
36
+
37
+ def transform(*values)
38
+ Array(values)
39
+ end
40
+ end
41
+ end
42
+
43
+ let(:transform) { MultipleArgument.new }
44
+
45
+ it 'can be converted into a proc and called' do
46
+ expect(transform.to_proc.call(1, 2)).to eq [1, 2]
47
+ end
48
+
49
+ it 'can be called directly' do
50
+ expect(transform.call(1, 2)).to eq [1, 2]
51
+ end
52
+ end
53
+ end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: remi
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.35
4
+ version: 0.2.36
5
5
  platform: ruby
6
6
  authors:
7
7
  - Sterling Paramore
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-06-08 00:00:00.000000000 Z
11
+ date: 2016-06-17 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bond
@@ -232,9 +232,9 @@ files:
232
232
  - lib/remi/data_subject/salesforce.rb
233
233
  - lib/remi/data_subject/sftp_file.rb
234
234
  - lib/remi/extractor/file_system.rb
235
+ - lib/remi/extractor/local_file.rb
235
236
  - lib/remi/extractor/s3_file.rb
236
237
  - lib/remi/extractor/sftp_file.rb
237
- - lib/remi/extractor/sftp_file_new.rb
238
238
  - lib/remi/field_symbolizers.rb
239
239
  - lib/remi/fields.rb
240
240
  - lib/remi/job.rb
@@ -248,13 +248,16 @@ files:
248
248
  - spec/data_subject/csv_file_spec.rb
249
249
  - spec/data_subject/data_frame.rb
250
250
  - spec/extractor/file_system_spec.rb
251
+ - spec/extractor/local_file_spec.rb
251
252
  - spec/extractor/s3_file_spec.rb
252
253
  - spec/extractor/sftp_file_spec.rb
254
+ - spec/fields_spec.rb
253
255
  - spec/fixtures/basic.csv
254
256
  - spec/fixtures/basic2.csv
255
257
  - spec/fixtures/unsupported_escape.csv
256
258
  - spec/metadata_spec.rb
257
259
  - spec/remi_spec.rb
260
+ - spec/transform_spec.rb
258
261
  - workbooks/sample_workbook.ipynb
259
262
  - workbooks/workbook_helper.rb
260
263
  homepage: https://github.com/inside-track/remi
@@ -304,10 +307,13 @@ test_files:
304
307
  - spec/data_subject/csv_file_spec.rb
305
308
  - spec/data_subject/data_frame.rb
306
309
  - spec/extractor/file_system_spec.rb
310
+ - spec/extractor/local_file_spec.rb
307
311
  - spec/extractor/s3_file_spec.rb
308
312
  - spec/extractor/sftp_file_spec.rb
313
+ - spec/fields_spec.rb
309
314
  - spec/fixtures/basic.csv
310
315
  - spec/fixtures/basic2.csv
311
316
  - spec/fixtures/unsupported_escape.csv
312
317
  - spec/metadata_spec.rb
313
318
  - spec/remi_spec.rb
319
+ - spec/transform_spec.rb
@@ -1,78 +0,0 @@
1
- module Remi
2
- module Extractor
3
-
4
- class SftpFileNew < FileSystem
5
-
6
- N_RETRY = 3
7
-
8
- def initialize(*args, **kargs)
9
- super
10
- init_sftp_file(*args, **kargs)
11
- end
12
-
13
- # Public: Called to extract files from the source filesystem.
14
- #
15
- # Returns an array with containing the paths to all files extracted.
16
- def extract
17
- connection do |sftp|
18
- entries.map do |entry|
19
- local_file = File.join(@local_path, entry.name)
20
- @logger.info "Downloading #{entry.name} to #{local_file}"
21
- retry_download { sftp.download!(File.join(@remote_path, entry.name), local_file) }
22
- local_file
23
- end
24
- end
25
- end
26
-
27
- # Public: Returns an array of all FileSystemEntry instances that are in the remote_path.
28
- def all_entries
29
- @all_entries ||= all_entries!
30
- end
31
-
32
- def all_entries!
33
- sftp_entries = connection { |sftp| sftp.dir.entries(@remote_path.dirname) }
34
- sftp_entries.map do |entry|
35
- # Early versions of the protocol don't support create time, fake it with modified time?
36
- FileSystemEntry.new(
37
- name: File.join(@remote_path.dirname, entry.name),
38
- create_time: entry.respond_to?(:createtime) ? entry.createtime : entry.mtime,
39
- modified_time: entry.mtime
40
- )
41
- end
42
- end
43
-
44
-
45
- private
46
-
47
- def init_sftp_file(*args, credentials:, **kargs)
48
- @host = credentials.fetch(:host)
49
- @username = credentials.fetch(:username)
50
- @password = credentials.fetch(:password)
51
- @port = credentials.fetch(:port, '22')
52
- end
53
-
54
- def connection(&block)
55
- result = nil
56
- Net::SFTP.start(@host, @username, password: @password, port: @port) do |sftp|
57
- result = yield sftp
58
- end
59
- result
60
- end
61
-
62
- def retry_download(&block)
63
- 1.upto(N_RETRY).each do |itry|
64
- begin
65
- block.call
66
- break
67
- rescue RuntimeError => err
68
- raise err unless itry < ntry
69
- @logger.error "Download failed with error: #{err.message}"
70
- @logger.error "Retry attempt #{itry}/#{ntry-1}"
71
- sleep(1)
72
- end
73
- end
74
- end
75
- end
76
-
77
- end
78
- end