remi 0.2.19 → 0.2.20

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 9b1844e0c62caecbd5d416c5b2ea035f38dae670
4
- data.tar.gz: 5246aa674a9f203aef16ff6d775625cc0441fe40
3
+ metadata.gz: ea1259debd08b1c32c279209ab9a87eb5b0607ac
4
+ data.tar.gz: 624f19339fae31fe5951dc31001f913a5f834c1c
5
5
  SHA512:
6
- metadata.gz: 774b90bd45ff5474a8e402928b5b40979c0a1e9dc6b2d08dced42dee25c567c5806aeb2b2d68a0f98efb1388ef5f3b2be6d47bcc80da1b7b3842152dfafce653
7
- data.tar.gz: 3f2ff9109ea794f6f2006ca66d7f9cc55ba574e826391d2a57c15172fcc1a25047c28709d270266e2f9abf03c38ce031fd693f70d07964208b34a5ad60eaf2dd
6
+ metadata.gz: 08513f0eba7dca6fc95c90f288ca1581d592cda4d5c2f32073c4ec85ffa53b2521c72aaceabdd9a4482171caf1935c4714ca85d785b2a10a22f65d5325ee8eb4
7
+ data.tar.gz: a74eecaf4bfce033db38fddf4daef177320f447c01f15cabab39cb494db485943177e3385b3dc1a2d3b8f6e428289e410659f8d1aa9e907e464c6ac779d64ce4
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- remi (0.2.19)
4
+ remi (0.2.20)
5
5
  activesupport (~> 4.2)
6
6
  bond (~> 0.5)
7
7
  cucumber (~> 2.1)
@@ -12,3 +12,8 @@ Feature: Test the truncate transformer.
12
12
 
13
13
  And the source field 'My Field' is set to the value "something"
14
14
  Then the target field 'Truncated Field' is set to the value "somet"
15
+
16
+ And the job parameter 'truncate_len' is "7"
17
+
18
+ And the source field 'My Field' is set to the value "something"
19
+ Then the target field 'Truncated Field' is set to the value "somethi"
@@ -3,6 +3,8 @@ module Remi
3
3
  class CsvFile
4
4
  include DataSource
5
5
 
6
+ using Remi::Refinements::Daru
7
+
6
8
  def self.default_csv_options
7
9
  CSV::DEFAULT_OPTIONS.merge({
8
10
  headers: true,
@@ -14,10 +16,11 @@ module Remi
14
16
  end
15
17
 
16
18
 
17
- def initialize(fields: {}, extractor:, csv_options: {}, logger: Remi::Settings.logger)
19
+ def initialize(fields: {}, extractor:, csv_options: {}, filename_field: nil, logger: Remi::Settings.logger)
18
20
  @fields = fields
19
21
  self.extractor = extractor
20
22
  @csv_options = self.class.default_csv_options.merge(csv_options)
23
+ @filename_field = filename_field
21
24
  @logger = logger
22
25
  end
23
26
 
@@ -30,7 +33,11 @@ module Remi
30
33
  end
31
34
 
32
35
  def extract
33
- Array(@extractor.extract).tap { |x| raise "Multiple files not supported" if x.size > 1 }
36
+ @extracted = Array(@extractor.extract)
37
+ end
38
+
39
+ def extracted
40
+ @extracted || extract
34
41
  end
35
42
 
36
43
  def extractor=(arg)
@@ -38,7 +45,7 @@ module Remi
38
45
  when Extractor::SftpFile, Extractor::LocalFile
39
46
  @extractor = arg
40
47
  when String
41
- @extractor = Extractor::LocalFile.new(arg)
48
+ @extractor = Extractor::LocalFile.new(path: arg)
42
49
  when Regexp
43
50
  raise "Adding regex matching to local files would be easy, not done yet"
44
51
  else
@@ -48,7 +55,8 @@ module Remi
48
55
 
49
56
  # Only going to support single file for now
50
57
  def source_filename
51
- @source_filename ||= extract.first
58
+ raise "Multiple source files detected" if extracted.size > 1
59
+ @source_filename ||= extracted.first
52
60
  end
53
61
 
54
62
  def first_line
@@ -67,8 +75,21 @@ module Remi
67
75
  end
68
76
 
69
77
  def to_dataframe
70
- @logger.info "Converting #{source_filename} to a dataframe"
71
- Daru::DataFrame.from_csv source_filename, @csv_options
78
+ # Assumes that each file has exactly the same structure
79
+ result_df = nil
80
+ extracted.each_with_index do |filename, idx|
81
+ @logger.info "Converting #{filename} to a dataframe"
82
+ csv_df = Daru::DataFrame.from_csv filename, @csv_options
83
+
84
+ csv_df[@filename_field] = Daru::Vector.new([filename] * csv_df.size, index: csv_df.index) if @filename_field
85
+ if idx == 0
86
+ result_df = csv_df
87
+ else
88
+ result_df = result_df.concat csv_df
89
+ end
90
+ end
91
+
92
+ result_df
72
93
  end
73
94
 
74
95
  def df
@@ -2,12 +2,20 @@ module Remi
2
2
  module Extractor
3
3
 
4
4
  class LocalFile
5
- def initialize(path)
5
+ def initialize(path:, folder: nil)
6
6
  @path = path
7
+ @folder = folder
7
8
  end
8
9
 
9
10
  def extract
10
- @path
11
+ if @folder
12
+ Dir.entries(@folder).map do |entry|
13
+ next unless entry.match(@path)
14
+ File.join(@folder,entry)
15
+ end.compact
16
+ else
17
+ @path
18
+ end
11
19
  end
12
20
  end
13
21
 
@@ -15,24 +23,41 @@ module Remi
15
23
 
16
24
  class FileNotFoundError < StandardError; end
17
25
 
18
- def initialize(credentials:, remote_file:, remote_folder: '', local_folder: Settings.work_dir, port: '22', most_recent_only: false, logger: Remi::Settings.logger)
26
+ SortDesc = Struct.new(:value) do
27
+ def <=> (target)
28
+ -(self.value <=> target.value)
29
+ end
30
+ end
31
+
32
+ def initialize(credentials:, remote_file:, remote_folder: '', local_folder: Settings.work_dir, port: nil, most_recent_only: false, group_by: nil, most_recent_by: :createtime, logger: Remi::Settings.logger)
19
33
  @credentials = credentials
20
34
  @remote_file = remote_file
21
35
  @remote_folder = remote_folder
22
36
  @local_folder = local_folder
23
- @port = port
37
+ @port = port || (credentials && credentials[:port]) || '22'
24
38
  @most_recent_only = most_recent_only
39
+ @group_by = group_by
40
+ @most_recent_by = most_recent_by
25
41
  @logger = logger
26
42
  end
27
43
 
28
44
  attr_reader :logger
29
45
 
30
46
  def extract
31
- to_download = @most_recent_only ? Array(most_recent_entry(matching_entries)) : matching_entries
32
47
  raise FileNotFoundError, "File not found: #{@remote_file}" if to_download.size == 0
33
48
  download(to_download)
34
49
  end
35
50
 
51
+ def to_download
52
+ if @group_by
53
+ most_recent_in_group
54
+ elsif @most_recent_only
55
+ Array(most_recent_entry(matching_entries))
56
+ else
57
+ matching_entries
58
+ end
59
+ end
60
+
36
61
  def all_entries(remote_folder = @remote_folder)
37
62
  @all_entries ||= connection { |sftp| sftp.dir.entries(File.join("/", remote_folder)) }
38
63
  end
@@ -42,15 +67,41 @@ module Remi
42
67
  end
43
68
 
44
69
  def most_recent_entry(entries = matching_entries)
45
- entries.sort_by { |e| e.attributes.createtime }.reverse!.first
70
+ entries.sort_by { |e| sort_files_by(e) }.reverse!.first
71
+ end
72
+
73
+ def sort_files_by(entry)
74
+ if @most_recent_by == :filename
75
+ entry.name
76
+ else
77
+ entry.attributes.send(@most_recent_by)
78
+ end
79
+ end
80
+
81
+ def most_recent_in_group(match_group = @group_by)
82
+ entries_with_group = matching_entries.map do |entry|
83
+ match = entry.name.match(match_group)
84
+ next unless match
85
+
86
+ group = match.to_a[1..-1]
87
+ { group: group, entry: entry }
88
+ end.compact
89
+ entries_with_group.sort_by! { |e| [e[:group], SortDesc.new(sort_files_by(e[:entry]))] }
90
+
91
+ last_group = nil
92
+ entries_with_group.map do |entry|
93
+ next unless entry[:group] != last_group
94
+ last_group = entry[:group]
95
+ entry[:entry]
96
+ end.compact
46
97
  end
47
98
 
48
- def download(to_download = matching_entries, local_folder: @local_folder, ntry: 3)
99
+ def download(entries_to_download, remote_folder: @remote_folder, local_folder: @local_folder, ntry: 3)
49
100
  connection do |sftp|
50
- to_download.map do |entry|
101
+ entries_to_download.map do |entry|
51
102
  local_file = File.join(local_folder, entry.name)
52
103
  @logger.info "Downloading #{entry.name} to #{local_file}"
53
- retry_download(ntry) { sftp.download!(entry.name, local_file) }
104
+ retry_download(ntry) { sftp.download!(File.join(remote_folder, entry.name), local_file) }
54
105
  local_file
55
106
  end
56
107
  end
@@ -71,6 +122,7 @@ module Remi
71
122
  1.upto(ntry).each do |itry|
72
123
  begin
73
124
  block.call
125
+ break
74
126
  rescue RuntimeError => err
75
127
  raise err unless itry < ntry
76
128
  @logger.error "Download failed with error: #{err.message}"
@@ -13,15 +13,23 @@ module Remi
13
13
  dupdf
14
14
  end
15
15
 
16
- # Public: Fixes a bug where the dataframe on the left side of the
17
- # concatenation is accidentally modified.
16
+ # Public: Allows for combining dataframes with different columns
18
17
  def concat other_df
19
- vectors = []
20
- @vectors.each do |v|
21
- vectors << self[v].dup.to_a.concat(other_df[v].to_a)
18
+ vectors = @vectors.to_a
19
+ data = []
20
+
21
+ vectors.each do |v|
22
+ other_vec = other_df.vectors.include?(v) ? other_df[v].to_a : [nil] * other_df.size
23
+ data << self[v].dup.to_a.concat(other_vec)
24
+ end
25
+
26
+ other_df.vectors.each do |v|
27
+ next if vectors.include?(v)
28
+ vectors << v
29
+ data << ([nil] * self.size).concat(other_df[v].to_a)
22
30
  end
23
31
 
24
- ::Daru::DataFrame.new(vectors, order: @vectors)
32
+ ::Daru::DataFrame.new(data, order: vectors)
25
33
  end
26
34
 
27
35
  # Public: Saves a Dataframe to a file.
@@ -40,7 +40,7 @@ module Remi
40
40
 
41
41
  def truncate(len)
42
42
  memoize_as_lambda(__method__, len) do |(mlen), larg|
43
- larg.slice(0,len)
43
+ larg.slice(0,mlen)
44
44
  end
45
45
  end
46
46
 
data/lib/remi/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Remi
2
- VERSION = '0.2.19'
2
+ VERSION = '0.2.20'
3
3
  end
@@ -0,0 +1,125 @@
1
+ require 'remi_spec'
2
+
3
+ describe Extractor::SftpFile do
4
+ before do
5
+ now = Time.new
6
+
7
+ example_files = [
8
+ { name: "ApplicantsA-9.csv", createtime: now - 10.minutes },
9
+ { name: "ApplicantsA-3.csv", createtime: now - 5.minutes },
10
+ { name: "ApplicantsA-5.csv", createtime: now - 1.minutes },
11
+ { name: "ApplicantsB-7.csv", createtime: now - 10.minutes },
12
+ { name: "ApplicantsB-6.csv", createtime: now - 5.minutes },
13
+ { name: "ApplicantsB-2.csv", createtime: now - 1.minutes },
14
+ { name: "ApplicantsB-2.txt", createtime: now - 0.minutes },
15
+ { name: "Apples.csv", createtime: now - 1.minutes },
16
+ ]
17
+
18
+ allow_any_instance_of(Extractor::SftpFile).to receive(:all_entries) do
19
+ example_files.map do |file|
20
+ Net::SFTP::Protocol::V04::Name.new(
21
+ file[:name],
22
+ Net::SFTP::Protocol::V04::Attributes.new(createtime: file[:createtime])
23
+ )
24
+ end
25
+ end
26
+
27
+ @params = { credentials: nil }
28
+ end
29
+
30
+ let(:sftpfile) { Extractor::SftpFile.new(**@params) }
31
+
32
+
33
+
34
+
35
+ context 'extracting all files matching a pattern' do
36
+ before do
37
+ @params[:remote_file] = /ApplicantsA-\d+\.csv/
38
+ end
39
+
40
+ it 'does not extract non-matching files' do
41
+ expect(sftpfile.to_download.map(&:name)).not_to include "Apples.csv"
42
+ end
43
+
44
+ it 'extracts all matching files' do
45
+ expect(sftpfile.to_download.map(&:name)).to match_array([
46
+ "ApplicantsA-9.csv",
47
+ "ApplicantsA-3.csv",
48
+ "ApplicantsA-5.csv"
49
+ ])
50
+ end
51
+ end
52
+
53
+
54
+ context 'extracting only the most recent matching a pattern' do
55
+ before do
56
+ @params.merge!({
57
+ remote_file: /ApplicantsA-\d+\.csv/,
58
+ most_recent_only: true
59
+ })
60
+ end
61
+
62
+ it 'extracts only the most recent matching file' do
63
+ expect(sftpfile.to_download.map(&:name)).to match_array([
64
+ "ApplicantsA-5.csv"
65
+ ])
66
+ end
67
+
68
+ context 'using filename instead of createtime' do
69
+ before do
70
+ @params[:most_recent_by] = :filename
71
+ end
72
+
73
+ it 'extracts only the most recent matching file' do
74
+ expect(sftpfile.to_download.map(&:name)).to match_array([
75
+ "ApplicantsA-9.csv"
76
+ ])
77
+ end
78
+ end
79
+ end
80
+
81
+
82
+ context 'extracting files matching a pattern with a by group' do
83
+ before do
84
+ @params.merge!({
85
+ credentials: nil,
86
+ remote_file: /^Applicants(A|B)-\d+\.csv/,
87
+ group_by: /^Applicants(A|B)/
88
+ })
89
+ end
90
+
91
+ it 'extracts the most recent file that matches a particular regex' do
92
+ expect(sftpfile.to_download.map(&:name)).to match_array([
93
+ "ApplicantsA-5.csv",
94
+ "ApplicantsB-2.csv"
95
+ ])
96
+ end
97
+
98
+ context 'with a minimally selective pre-filter' do
99
+ before do
100
+ @params[:remote_file] = /^Applicants/
101
+ end
102
+
103
+ it 'extracts the most recent file that matches a particular regex' do
104
+ expect(sftpfile.to_download.map(&:name)).to match_array([
105
+ "ApplicantsA-5.csv",
106
+ "ApplicantsB-2.txt"
107
+ ])
108
+ end
109
+ end
110
+
111
+ context 'using filename instead of createtime' do
112
+ before do
113
+ @params[:most_recent_by] = :filename
114
+ end
115
+
116
+ it 'extracts only the most recent matching file' do
117
+ expect(sftpfile.to_download.map(&:name)).to match_array([
118
+ "ApplicantsA-9.csv",
119
+ "ApplicantsB-7.csv"
120
+ ])
121
+ end
122
+ end
123
+
124
+ end
125
+ end
data/spec/remi_spec.rb ADDED
@@ -0,0 +1,8 @@
1
+ File.expand_path(File.join(File.dirname(__FILE__),'../lib')).tap {|pwd| $LOAD_PATH.unshift(pwd) unless $LOAD_PATH.include?(pwd)}
2
+
3
+ require 'rubygems'
4
+ require 'bundler/setup'
5
+
6
+ require 'remi'
7
+
8
+ include Remi
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: remi
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.19
4
+ version: 0.2.20
5
5
  platform: ruby
6
6
  authors:
7
7
  - Sterling Paramore
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-03-04 00:00:00.000000000 Z
11
+ date: 2016-03-10 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: daru
@@ -279,6 +279,8 @@ files:
279
279
  - lib/remi/transform.rb
280
280
  - lib/remi/version.rb
281
281
  - remi.gemspec
282
+ - spec/extractor/sftp_file_spec.rb
283
+ - spec/remi_spec.rb
282
284
  - workbooks/sample_workbook.ipynb
283
285
  - workbooks/workbook_helper.rb
284
286
  homepage: https://github.com/inside-track/remi
@@ -321,3 +323,5 @@ test_files:
321
323
  - features/transforms/parse_date.feature
322
324
  - features/transforms/prefix.feature
323
325
  - features/transforms/truncate.feature
326
+ - spec/extractor/sftp_file_spec.rb
327
+ - spec/remi_spec.rb