remi 0.2.19 → 0.2.20
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/features/transforms/truncate.feature +5 -0
- data/lib/remi/data_source/csv_file.rb +27 -6
- data/lib/remi/extractor/sftp_file.rb +61 -9
- data/lib/remi/refinements/daru.rb +14 -6
- data/lib/remi/transform.rb +1 -1
- data/lib/remi/version.rb +1 -1
- data/spec/extractor/sftp_file_spec.rb +125 -0
- data/spec/remi_spec.rb +8 -0
- metadata +6 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ea1259debd08b1c32c279209ab9a87eb5b0607ac
|
4
|
+
data.tar.gz: 624f19339fae31fe5951dc31001f913a5f834c1c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 08513f0eba7dca6fc95c90f288ca1581d592cda4d5c2f32073c4ec85ffa53b2521c72aaceabdd9a4482171caf1935c4714ca85d785b2a10a22f65d5325ee8eb4
|
7
|
+
data.tar.gz: a74eecaf4bfce033db38fddf4daef177320f447c01f15cabab39cb494db485943177e3385b3dc1a2d3b8f6e428289e410659f8d1aa9e907e464c6ac779d64ce4
|
data/Gemfile.lock
CHANGED
data/features/transforms/truncate.feature
CHANGED
@@ -12,3 +12,8 @@ Feature: Test the truncate transformer.
|
|
12
12
|
|
13
13
|
And the source field 'My Field' is set to the value "something"
|
14
14
|
Then the target field 'Truncated Field' is set to the value "somet"
|
15
|
+
|
16
|
+
And the job parameter 'truncate_len' is "7"
|
17
|
+
|
18
|
+
And the source field 'My Field' is set to the value "something"
|
19
|
+
Then the target field 'Truncated Field' is set to the value "somethi"
|
data/lib/remi/data_source/csv_file.rb
CHANGED
@@ -3,6 +3,8 @@ module Remi
|
|
3
3
|
class CsvFile
|
4
4
|
include DataSource
|
5
5
|
|
6
|
+
using Remi::Refinements::Daru
|
7
|
+
|
6
8
|
def self.default_csv_options
|
7
9
|
CSV::DEFAULT_OPTIONS.merge({
|
8
10
|
headers: true,
|
@@ -14,10 +16,11 @@ module Remi
|
|
14
16
|
end
|
15
17
|
|
16
18
|
|
17
|
-
def initialize(fields: {}, extractor:, csv_options: {}, logger: Remi::Settings.logger)
|
19
|
+
def initialize(fields: {}, extractor:, csv_options: {}, filename_field: nil, logger: Remi::Settings.logger)
|
18
20
|
@fields = fields
|
19
21
|
self.extractor = extractor
|
20
22
|
@csv_options = self.class.default_csv_options.merge(csv_options)
|
23
|
+
@filename_field = filename_field
|
21
24
|
@logger = logger
|
22
25
|
end
|
23
26
|
|
@@ -30,7 +33,11 @@ module Remi
|
|
30
33
|
end
|
31
34
|
|
32
35
|
def extract
|
33
|
-
Array(@extractor.extract)
|
36
|
+
@extracted = Array(@extractor.extract)
|
37
|
+
end
|
38
|
+
|
39
|
+
def extracted
|
40
|
+
@extracted || extract
|
34
41
|
end
|
35
42
|
|
36
43
|
def extractor=(arg)
|
@@ -38,7 +45,7 @@ module Remi
|
|
38
45
|
when Extractor::SftpFile, Extractor::LocalFile
|
39
46
|
@extractor = arg
|
40
47
|
when String
|
41
|
-
@extractor = Extractor::LocalFile.new(arg)
|
48
|
+
@extractor = Extractor::LocalFile.new(path: arg)
|
42
49
|
when Regexp
|
43
50
|
raise "Adding regex matching to local files would be easy, not done yet"
|
44
51
|
else
|
@@ -48,7 +55,8 @@ module Remi
|
|
48
55
|
|
49
56
|
# Only going to support single file for now
|
50
57
|
def source_filename
|
51
|
-
|
58
|
+
raise "Multiple source files detected" if extracted.size > 1
|
59
|
+
@source_filename ||= extracted.first
|
52
60
|
end
|
53
61
|
|
54
62
|
def first_line
|
@@ -67,8 +75,21 @@ module Remi
|
|
67
75
|
end
|
68
76
|
|
69
77
|
def to_dataframe
|
70
|
-
|
71
|
-
|
78
|
+
# Assumes that each file has exactly the same structure
|
79
|
+
result_df = nil
|
80
|
+
extracted.each_with_index do |filename, idx|
|
81
|
+
@logger.info "Converting #{filename} to a dataframe"
|
82
|
+
csv_df = Daru::DataFrame.from_csv filename, @csv_options
|
83
|
+
|
84
|
+
csv_df[@filename_field] = Daru::Vector.new([filename] * csv_df.size, index: csv_df.index) if @filename_field
|
85
|
+
if idx == 0
|
86
|
+
result_df = csv_df
|
87
|
+
else
|
88
|
+
result_df = result_df.concat csv_df
|
89
|
+
end
|
90
|
+
end
|
91
|
+
|
92
|
+
result_df
|
72
93
|
end
|
73
94
|
|
74
95
|
def df
|
data/lib/remi/extractor/sftp_file.rb
CHANGED
@@ -2,12 +2,20 @@ module Remi
|
|
2
2
|
module Extractor
|
3
3
|
|
4
4
|
class LocalFile
|
5
|
-
def initialize(path)
|
5
|
+
def initialize(path:, folder: nil)
|
6
6
|
@path = path
|
7
|
+
@folder = folder
|
7
8
|
end
|
8
9
|
|
9
10
|
def extract
|
10
|
-
@path
|
11
|
+
if @folder
|
12
|
+
Dir.entries(@folder).map do |entry|
|
13
|
+
next unless entry.match(@path)
|
14
|
+
File.join(@folder,entry)
|
15
|
+
end.compact
|
16
|
+
else
|
17
|
+
@path
|
18
|
+
end
|
11
19
|
end
|
12
20
|
end
|
13
21
|
|
@@ -15,24 +23,41 @@ module Remi
|
|
15
23
|
|
16
24
|
class FileNotFoundError < StandardError; end
|
17
25
|
|
18
|
-
|
26
|
+
SortDesc = Struct.new(:value) do
|
27
|
+
def <=> (target)
|
28
|
+
-(self.value <=> target.value)
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
32
|
+
def initialize(credentials:, remote_file:, remote_folder: '', local_folder: Settings.work_dir, port: nil, most_recent_only: false, group_by: nil, most_recent_by: :createtime, logger: Remi::Settings.logger)
|
19
33
|
@credentials = credentials
|
20
34
|
@remote_file = remote_file
|
21
35
|
@remote_folder = remote_folder
|
22
36
|
@local_folder = local_folder
|
23
|
-
@port = port
|
37
|
+
@port = port || (credentials && credentials[:port]) || '22'
|
24
38
|
@most_recent_only = most_recent_only
|
39
|
+
@group_by = group_by
|
40
|
+
@most_recent_by = most_recent_by
|
25
41
|
@logger = logger
|
26
42
|
end
|
27
43
|
|
28
44
|
attr_reader :logger
|
29
45
|
|
30
46
|
def extract
|
31
|
-
to_download = @most_recent_only ? Array(most_recent_entry(matching_entries)) : matching_entries
|
32
47
|
raise FileNotFoundError, "File not found: #{@remote_file}" if to_download.size == 0
|
33
48
|
download(to_download)
|
34
49
|
end
|
35
50
|
|
51
|
+
def to_download
|
52
|
+
if @group_by
|
53
|
+
most_recent_in_group
|
54
|
+
elsif @most_recent_only
|
55
|
+
Array(most_recent_entry(matching_entries))
|
56
|
+
else
|
57
|
+
matching_entries
|
58
|
+
end
|
59
|
+
end
|
60
|
+
|
36
61
|
def all_entries(remote_folder = @remote_folder)
|
37
62
|
@all_entries ||= connection { |sftp| sftp.dir.entries(File.join("/", remote_folder)) }
|
38
63
|
end
|
@@ -42,15 +67,41 @@ module Remi
|
|
42
67
|
end
|
43
68
|
|
44
69
|
def most_recent_entry(entries = matching_entries)
|
45
|
-
entries.sort_by { |e| e
|
70
|
+
entries.sort_by { |e| sort_files_by(e) }.reverse!.first
|
71
|
+
end
|
72
|
+
|
73
|
+
def sort_files_by(entry)
|
74
|
+
if @most_recent_by == :filename
|
75
|
+
entry.name
|
76
|
+
else
|
77
|
+
entry.attributes.send(@most_recent_by)
|
78
|
+
end
|
79
|
+
end
|
80
|
+
|
81
|
+
def most_recent_in_group(match_group = @group_by)
|
82
|
+
entries_with_group = matching_entries.map do |entry|
|
83
|
+
match = entry.name.match(match_group)
|
84
|
+
next unless match
|
85
|
+
|
86
|
+
group = match.to_a[1..-1]
|
87
|
+
{ group: group, entry: entry }
|
88
|
+
end.compact
|
89
|
+
entries_with_group.sort_by! { |e| [e[:group], SortDesc.new(sort_files_by(e[:entry]))] }
|
90
|
+
|
91
|
+
last_group = nil
|
92
|
+
entries_with_group.map do |entry|
|
93
|
+
next unless entry[:group] != last_group
|
94
|
+
last_group = entry[:group]
|
95
|
+
entry[:entry]
|
96
|
+
end.compact
|
46
97
|
end
|
47
98
|
|
48
|
-
def download(
|
99
|
+
def download(entries_to_download, remote_folder: @remote_folder, local_folder: @local_folder, ntry: 3)
|
49
100
|
connection do |sftp|
|
50
|
-
|
101
|
+
entries_to_download.map do |entry|
|
51
102
|
local_file = File.join(local_folder, entry.name)
|
52
103
|
@logger.info "Downloading #{entry.name} to #{local_file}"
|
53
|
-
retry_download(ntry) { sftp.download!(entry.name, local_file) }
|
104
|
+
retry_download(ntry) { sftp.download!(File.join(remote_folder, entry.name), local_file) }
|
54
105
|
local_file
|
55
106
|
end
|
56
107
|
end
|
@@ -71,6 +122,7 @@ module Remi
|
|
71
122
|
1.upto(ntry).each do |itry|
|
72
123
|
begin
|
73
124
|
block.call
|
125
|
+
break
|
74
126
|
rescue RuntimeError => err
|
75
127
|
raise err unless itry < ntry
|
76
128
|
@logger.error "Download failed with error: #{err.message}"
|
data/lib/remi/refinements/daru.rb
CHANGED
@@ -13,15 +13,23 @@ module Remi
|
|
13
13
|
dupdf
|
14
14
|
end
|
15
15
|
|
16
|
-
# Public:
|
17
|
-
# concatenation is accidentally modified.
|
16
|
+
# Public: Allows for combining dataframes with different columns
|
18
17
|
def concat other_df
|
19
|
-
vectors =
|
20
|
-
|
21
|
-
|
18
|
+
vectors = @vectors.to_a
|
19
|
+
data = []
|
20
|
+
|
21
|
+
vectors.each do |v|
|
22
|
+
other_vec = other_df.vectors.include?(v) ? other_df[v].to_a : [nil] * other_df.size
|
23
|
+
data << self[v].dup.to_a.concat(other_vec)
|
24
|
+
end
|
25
|
+
|
26
|
+
other_df.vectors.each do |v|
|
27
|
+
next if vectors.include?(v)
|
28
|
+
vectors << v
|
29
|
+
data << ([nil] * self.size).concat(other_df[v].to_a)
|
22
30
|
end
|
23
31
|
|
24
|
-
::Daru::DataFrame.new(
|
32
|
+
::Daru::DataFrame.new(data, order: vectors)
|
25
33
|
end
|
26
34
|
|
27
35
|
# Public: Saves a Dataframe to a file.
|
data/lib/remi/transform.rb
CHANGED
data/lib/remi/version.rb
CHANGED
data/spec/extractor/sftp_file_spec.rb
ADDED
@@ -0,0 +1,125 @@
|
|
1
|
+
require 'remi_spec'
|
2
|
+
|
3
|
+
describe Extractor::SftpFile do
|
4
|
+
before do
|
5
|
+
now = Time.new
|
6
|
+
|
7
|
+
example_files = [
|
8
|
+
{ name: "ApplicantsA-9.csv", createtime: now - 10.minutes },
|
9
|
+
{ name: "ApplicantsA-3.csv", createtime: now - 5.minutes },
|
10
|
+
{ name: "ApplicantsA-5.csv", createtime: now - 1.minutes },
|
11
|
+
{ name: "ApplicantsB-7.csv", createtime: now - 10.minutes },
|
12
|
+
{ name: "ApplicantsB-6.csv", createtime: now - 5.minutes },
|
13
|
+
{ name: "ApplicantsB-2.csv", createtime: now - 1.minutes },
|
14
|
+
{ name: "ApplicantsB-2.txt", createtime: now - 0.minutes },
|
15
|
+
{ name: "Apples.csv", createtime: now - 1.minutes },
|
16
|
+
]
|
17
|
+
|
18
|
+
allow_any_instance_of(Extractor::SftpFile).to receive(:all_entries) do
|
19
|
+
example_files.map do |file|
|
20
|
+
Net::SFTP::Protocol::V04::Name.new(
|
21
|
+
file[:name],
|
22
|
+
Net::SFTP::Protocol::V04::Attributes.new(createtime: file[:createtime])
|
23
|
+
)
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
@params = { credentials: nil }
|
28
|
+
end
|
29
|
+
|
30
|
+
let(:sftpfile) { Extractor::SftpFile.new(**@params) }
|
31
|
+
|
32
|
+
|
33
|
+
|
34
|
+
|
35
|
+
context 'extracting all files matching a pattern' do
|
36
|
+
before do
|
37
|
+
@params[:remote_file] = /ApplicantsA-\d+\.csv/
|
38
|
+
end
|
39
|
+
|
40
|
+
it 'does not extract non-matching files' do
|
41
|
+
expect(sftpfile.to_download.map(&:name)).not_to include "Apples.csv"
|
42
|
+
end
|
43
|
+
|
44
|
+
it 'extracts all matching files' do
|
45
|
+
expect(sftpfile.to_download.map(&:name)).to match_array([
|
46
|
+
"ApplicantsA-9.csv",
|
47
|
+
"ApplicantsA-3.csv",
|
48
|
+
"ApplicantsA-5.csv"
|
49
|
+
])
|
50
|
+
end
|
51
|
+
end
|
52
|
+
|
53
|
+
|
54
|
+
context 'extracting only the most recent matching a pattern' do
|
55
|
+
before do
|
56
|
+
@params.merge!({
|
57
|
+
remote_file: /ApplicantsA-\d+\.csv/,
|
58
|
+
most_recent_only: true
|
59
|
+
})
|
60
|
+
end
|
61
|
+
|
62
|
+
it 'extracts only the most recent matching file' do
|
63
|
+
expect(sftpfile.to_download.map(&:name)).to match_array([
|
64
|
+
"ApplicantsA-5.csv"
|
65
|
+
])
|
66
|
+
end
|
67
|
+
|
68
|
+
context 'using filename instead of createtime' do
|
69
|
+
before do
|
70
|
+
@params[:most_recent_by] = :filename
|
71
|
+
end
|
72
|
+
|
73
|
+
it 'extracts only the most recent matching file' do
|
74
|
+
expect(sftpfile.to_download.map(&:name)).to match_array([
|
75
|
+
"ApplicantsA-9.csv"
|
76
|
+
])
|
77
|
+
end
|
78
|
+
end
|
79
|
+
end
|
80
|
+
|
81
|
+
|
82
|
+
context 'extracting files matching a pattern with a by group' do
|
83
|
+
before do
|
84
|
+
@params.merge!({
|
85
|
+
credentials: nil,
|
86
|
+
remote_file: /^Applicants(A|B)-\d+\.csv/,
|
87
|
+
group_by: /^Applicants(A|B)/
|
88
|
+
})
|
89
|
+
end
|
90
|
+
|
91
|
+
it 'extracts the most recent file that matches a particular regex' do
|
92
|
+
expect(sftpfile.to_download.map(&:name)).to match_array([
|
93
|
+
"ApplicantsA-5.csv",
|
94
|
+
"ApplicantsB-2.csv"
|
95
|
+
])
|
96
|
+
end
|
97
|
+
|
98
|
+
context 'with a minimally selective pre-filter' do
|
99
|
+
before do
|
100
|
+
@params[:remote_file] = /^Applicants/
|
101
|
+
end
|
102
|
+
|
103
|
+
it 'extracts the most recent file that matches a particular regex' do
|
104
|
+
expect(sftpfile.to_download.map(&:name)).to match_array([
|
105
|
+
"ApplicantsA-5.csv",
|
106
|
+
"ApplicantsB-2.txt"
|
107
|
+
])
|
108
|
+
end
|
109
|
+
end
|
110
|
+
|
111
|
+
context 'using filename instead of createtime' do
|
112
|
+
before do
|
113
|
+
@params[:most_recent_by] = :filename
|
114
|
+
end
|
115
|
+
|
116
|
+
it 'extracts only the most recent matching file' do
|
117
|
+
expect(sftpfile.to_download.map(&:name)).to match_array([
|
118
|
+
"ApplicantsA-9.csv",
|
119
|
+
"ApplicantsB-7.csv"
|
120
|
+
])
|
121
|
+
end
|
122
|
+
end
|
123
|
+
|
124
|
+
end
|
125
|
+
end
|
data/spec/remi_spec.rb
ADDED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: remi
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.19
|
4
|
+
version: 0.2.20
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Sterling Paramore
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-03-
|
11
|
+
date: 2016-03-10 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: daru
|
@@ -279,6 +279,8 @@ files:
|
|
279
279
|
- lib/remi/transform.rb
|
280
280
|
- lib/remi/version.rb
|
281
281
|
- remi.gemspec
|
282
|
+
- spec/extractor/sftp_file_spec.rb
|
283
|
+
- spec/remi_spec.rb
|
282
284
|
- workbooks/sample_workbook.ipynb
|
283
285
|
- workbooks/workbook_helper.rb
|
284
286
|
homepage: https://github.com/inside-track/remi
|
@@ -321,3 +323,5 @@ test_files:
|
|
321
323
|
- features/transforms/parse_date.feature
|
322
324
|
- features/transforms/prefix.feature
|
323
325
|
- features/transforms/truncate.feature
|
326
|
+
- spec/extractor/sftp_file_spec.rb
|
327
|
+
- spec/remi_spec.rb
|