remi 0.2.19 → 0.2.20
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/features/transforms/truncate.feature +5 -0
- data/lib/remi/data_source/csv_file.rb +27 -6
- data/lib/remi/extractor/sftp_file.rb +61 -9
- data/lib/remi/refinements/daru.rb +14 -6
- data/lib/remi/transform.rb +1 -1
- data/lib/remi/version.rb +1 -1
- data/spec/extractor/sftp_file_spec.rb +125 -0
- data/spec/remi_spec.rb +8 -0
- metadata +6 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ea1259debd08b1c32c279209ab9a87eb5b0607ac
|
4
|
+
data.tar.gz: 624f19339fae31fe5951dc31001f913a5f834c1c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 08513f0eba7dca6fc95c90f288ca1581d592cda4d5c2f32073c4ec85ffa53b2521c72aaceabdd9a4482171caf1935c4714ca85d785b2a10a22f65d5325ee8eb4
|
7
|
+
data.tar.gz: a74eecaf4bfce033db38fddf4daef177320f447c01f15cabab39cb494db485943177e3385b3dc1a2d3b8f6e428289e410659f8d1aa9e907e464c6ac779d64ce4
|
data/Gemfile.lock
CHANGED
data/features/transforms/truncate.feature
CHANGED
@@ -12,3 +12,8 @@ Feature: Test the truncate transformer.
|
|
12
12
|
|
13
13
|
And the source field 'My Field' is set to the value "something"
|
14
14
|
Then the target field 'Truncated Field' is set to the value "somet"
|
15
|
+
|
16
|
+
And the job parameter 'truncate_len' is "7"
|
17
|
+
|
18
|
+
And the source field 'My Field' is set to the value "something"
|
19
|
+
Then the target field 'Truncated Field' is set to the value "somethi"
|
data/lib/remi/data_source/csv_file.rb
CHANGED
@@ -3,6 +3,8 @@ module Remi
|
|
3
3
|
class CsvFile
|
4
4
|
include DataSource
|
5
5
|
|
6
|
+
using Remi::Refinements::Daru
|
7
|
+
|
6
8
|
def self.default_csv_options
|
7
9
|
CSV::DEFAULT_OPTIONS.merge({
|
8
10
|
headers: true,
|
@@ -14,10 +16,11 @@ module Remi
|
|
14
16
|
end
|
15
17
|
|
16
18
|
|
17
|
-
def initialize(fields: {}, extractor:, csv_options: {}, logger: Remi::Settings.logger)
|
19
|
+
def initialize(fields: {}, extractor:, csv_options: {}, filename_field: nil, logger: Remi::Settings.logger)
|
18
20
|
@fields = fields
|
19
21
|
self.extractor = extractor
|
20
22
|
@csv_options = self.class.default_csv_options.merge(csv_options)
|
23
|
+
@filename_field = filename_field
|
21
24
|
@logger = logger
|
22
25
|
end
|
23
26
|
|
@@ -30,7 +33,11 @@ module Remi
|
|
30
33
|
end
|
31
34
|
|
32
35
|
def extract
|
33
|
-
Array(@extractor.extract)
|
36
|
+
@extracted = Array(@extractor.extract)
|
37
|
+
end
|
38
|
+
|
39
|
+
def extracted
|
40
|
+
@extracted || extract
|
34
41
|
end
|
35
42
|
|
36
43
|
def extractor=(arg)
|
@@ -38,7 +45,7 @@ module Remi
|
|
38
45
|
when Extractor::SftpFile, Extractor::LocalFile
|
39
46
|
@extractor = arg
|
40
47
|
when String
|
41
|
-
@extractor = Extractor::LocalFile.new(arg)
|
48
|
+
@extractor = Extractor::LocalFile.new(path: arg)
|
42
49
|
when Regexp
|
43
50
|
raise "Adding regex matching to local files would be easy, not done yet"
|
44
51
|
else
|
@@ -48,7 +55,8 @@ module Remi
|
|
48
55
|
|
49
56
|
# Only going to support single file for now
|
50
57
|
def source_filename
|
51
|
-
|
58
|
+
raise "Multiple source files detected" if extracted.size > 1
|
59
|
+
@source_filename ||= extracted.first
|
52
60
|
end
|
53
61
|
|
54
62
|
def first_line
|
@@ -67,8 +75,21 @@ module Remi
|
|
67
75
|
end
|
68
76
|
|
69
77
|
def to_dataframe
|
70
|
-
|
71
|
-
|
78
|
+
# Assumes that each file has exactly the same structure
|
79
|
+
result_df = nil
|
80
|
+
extracted.each_with_index do |filename, idx|
|
81
|
+
@logger.info "Converting #{filename} to a dataframe"
|
82
|
+
csv_df = Daru::DataFrame.from_csv filename, @csv_options
|
83
|
+
|
84
|
+
csv_df[@filename_field] = Daru::Vector.new([filename] * csv_df.size, index: csv_df.index) if @filename_field
|
85
|
+
if idx == 0
|
86
|
+
result_df = csv_df
|
87
|
+
else
|
88
|
+
result_df = result_df.concat csv_df
|
89
|
+
end
|
90
|
+
end
|
91
|
+
|
92
|
+
result_df
|
72
93
|
end
|
73
94
|
|
74
95
|
def df
|
data/lib/remi/extractor/sftp_file.rb
CHANGED
@@ -2,12 +2,20 @@ module Remi
|
|
2
2
|
module Extractor
|
3
3
|
|
4
4
|
class LocalFile
|
5
|
-
def initialize(path)
|
5
|
+
def initialize(path:, folder: nil)
|
6
6
|
@path = path
|
7
|
+
@folder = folder
|
7
8
|
end
|
8
9
|
|
9
10
|
def extract
|
10
|
-
@
|
11
|
+
if @folder
|
12
|
+
Dir.entries(@folder).map do |entry|
|
13
|
+
next unless entry.match(@path)
|
14
|
+
File.join(@folder,entry)
|
15
|
+
end.compact
|
16
|
+
else
|
17
|
+
@path
|
18
|
+
end
|
11
19
|
end
|
12
20
|
end
|
13
21
|
|
@@ -15,24 +23,41 @@ module Remi
|
|
15
23
|
|
16
24
|
class FileNotFoundError < StandardError; end
|
17
25
|
|
18
|
-
|
26
|
+
SortDesc = Struct.new(:value) do
|
27
|
+
def <=> (target)
|
28
|
+
-(self.value <=> target.value)
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
32
|
+
def initialize(credentials:, remote_file:, remote_folder: '', local_folder: Settings.work_dir, port: nil, most_recent_only: false, group_by: nil, most_recent_by: :createtime, logger: Remi::Settings.logger)
|
19
33
|
@credentials = credentials
|
20
34
|
@remote_file = remote_file
|
21
35
|
@remote_folder = remote_folder
|
22
36
|
@local_folder = local_folder
|
23
|
-
@port = port
|
37
|
+
@port = port || (credentials && credentials[:port]) || '22'
|
24
38
|
@most_recent_only = most_recent_only
|
39
|
+
@group_by = group_by
|
40
|
+
@most_recent_by = most_recent_by
|
25
41
|
@logger = logger
|
26
42
|
end
|
27
43
|
|
28
44
|
attr_reader :logger
|
29
45
|
|
30
46
|
def extract
|
31
|
-
to_download = @most_recent_only ? Array(most_recent_entry(matching_entries)) : matching_entries
|
32
47
|
raise FileNotFoundError, "File not found: #{@remote_file}" if to_download.size == 0
|
33
48
|
download(to_download)
|
34
49
|
end
|
35
50
|
|
51
|
+
def to_download
|
52
|
+
if @group_by
|
53
|
+
most_recent_in_group
|
54
|
+
elsif @most_recent_only
|
55
|
+
Array(most_recent_entry(matching_entries))
|
56
|
+
else
|
57
|
+
matching_entries
|
58
|
+
end
|
59
|
+
end
|
60
|
+
|
36
61
|
def all_entries(remote_folder = @remote_folder)
|
37
62
|
@all_entries ||= connection { |sftp| sftp.dir.entries(File.join("/", remote_folder)) }
|
38
63
|
end
|
@@ -42,15 +67,41 @@ module Remi
|
|
42
67
|
end
|
43
68
|
|
44
69
|
def most_recent_entry(entries = matching_entries)
|
45
|
-
entries.sort_by { |e| e
|
70
|
+
entries.sort_by { |e| sort_files_by(e) }.reverse!.first
|
71
|
+
end
|
72
|
+
|
73
|
+
def sort_files_by(entry)
|
74
|
+
if @most_recent_by == :filename
|
75
|
+
entry.name
|
76
|
+
else
|
77
|
+
entry.attributes.send(@most_recent_by)
|
78
|
+
end
|
79
|
+
end
|
80
|
+
|
81
|
+
def most_recent_in_group(match_group = @group_by)
|
82
|
+
entries_with_group = matching_entries.map do |entry|
|
83
|
+
match = entry.name.match(match_group)
|
84
|
+
next unless match
|
85
|
+
|
86
|
+
group = match.to_a[1..-1]
|
87
|
+
{ group: group, entry: entry }
|
88
|
+
end.compact
|
89
|
+
entries_with_group.sort_by! { |e| [e[:group], SortDesc.new(sort_files_by(e[:entry]))] }
|
90
|
+
|
91
|
+
last_group = nil
|
92
|
+
entries_with_group.map do |entry|
|
93
|
+
next unless entry[:group] != last_group
|
94
|
+
last_group = entry[:group]
|
95
|
+
entry[:entry]
|
96
|
+
end.compact
|
46
97
|
end
|
47
98
|
|
48
|
-
def download(
|
99
|
+
def download(entries_to_download, remote_folder: @remote_folder, local_folder: @local_folder, ntry: 3)
|
49
100
|
connection do |sftp|
|
50
|
-
|
101
|
+
entries_to_download.map do |entry|
|
51
102
|
local_file = File.join(local_folder, entry.name)
|
52
103
|
@logger.info "Downloading #{entry.name} to #{local_file}"
|
53
|
-
retry_download(ntry) { sftp.download!(entry.name, local_file) }
|
104
|
+
retry_download(ntry) { sftp.download!(File.join(remote_folder, entry.name), local_file) }
|
54
105
|
local_file
|
55
106
|
end
|
56
107
|
end
|
@@ -71,6 +122,7 @@ module Remi
|
|
71
122
|
1.upto(ntry).each do |itry|
|
72
123
|
begin
|
73
124
|
block.call
|
125
|
+
break
|
74
126
|
rescue RuntimeError => err
|
75
127
|
raise err unless itry < ntry
|
76
128
|
@logger.error "Download failed with error: #{err.message}"
|
data/lib/remi/refinements/daru.rb
CHANGED
@@ -13,15 +13,23 @@ module Remi
|
|
13
13
|
dupdf
|
14
14
|
end
|
15
15
|
|
16
|
-
# Public:
|
17
|
-
# concatenation is accidentally modified.
|
16
|
+
# Public: Allows for combining dataframes with different columns
|
18
17
|
def concat other_df
|
19
|
-
vectors =
|
20
|
-
|
21
|
-
|
18
|
+
vectors = @vectors.to_a
|
19
|
+
data = []
|
20
|
+
|
21
|
+
vectors.each do |v|
|
22
|
+
other_vec = other_df.vectors.include?(v) ? other_df[v].to_a : [nil] * other_df.size
|
23
|
+
data << self[v].dup.to_a.concat(other_vec)
|
24
|
+
end
|
25
|
+
|
26
|
+
other_df.vectors.each do |v|
|
27
|
+
next if vectors.include?(v)
|
28
|
+
vectors << v
|
29
|
+
data << ([nil] * self.size).concat(other_df[v].to_a)
|
22
30
|
end
|
23
31
|
|
24
|
-
::Daru::DataFrame.new(
|
32
|
+
::Daru::DataFrame.new(data, order: vectors)
|
25
33
|
end
|
26
34
|
|
27
35
|
# Public: Saves a Dataframe to a file.
|
data/lib/remi/transform.rb
CHANGED
data/lib/remi/version.rb
CHANGED
data/spec/extractor/sftp_file_spec.rb
ADDED
@@ -0,0 +1,125 @@
|
|
1
|
+
require 'remi_spec'
|
2
|
+
|
3
|
+
describe Extractor::SftpFile do
|
4
|
+
before do
|
5
|
+
now = Time.new
|
6
|
+
|
7
|
+
example_files = [
|
8
|
+
{ name: "ApplicantsA-9.csv", createtime: now - 10.minutes },
|
9
|
+
{ name: "ApplicantsA-3.csv", createtime: now - 5.minutes },
|
10
|
+
{ name: "ApplicantsA-5.csv", createtime: now - 1.minutes },
|
11
|
+
{ name: "ApplicantsB-7.csv", createtime: now - 10.minutes },
|
12
|
+
{ name: "ApplicantsB-6.csv", createtime: now - 5.minutes },
|
13
|
+
{ name: "ApplicantsB-2.csv", createtime: now - 1.minutes },
|
14
|
+
{ name: "ApplicantsB-2.txt", createtime: now - 0.minutes },
|
15
|
+
{ name: "Apples.csv", createtime: now - 1.minutes },
|
16
|
+
]
|
17
|
+
|
18
|
+
allow_any_instance_of(Extractor::SftpFile).to receive(:all_entries) do
|
19
|
+
example_files.map do |file|
|
20
|
+
Net::SFTP::Protocol::V04::Name.new(
|
21
|
+
file[:name],
|
22
|
+
Net::SFTP::Protocol::V04::Attributes.new(createtime: file[:createtime])
|
23
|
+
)
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
@params = { credentials: nil }
|
28
|
+
end
|
29
|
+
|
30
|
+
let(:sftpfile) { Extractor::SftpFile.new(**@params) }
|
31
|
+
|
32
|
+
|
33
|
+
|
34
|
+
|
35
|
+
context 'extracting all files matching a pattern' do
|
36
|
+
before do
|
37
|
+
@params[:remote_file] = /ApplicantsA-\d+\.csv/
|
38
|
+
end
|
39
|
+
|
40
|
+
it 'does not extract non-matching files' do
|
41
|
+
expect(sftpfile.to_download.map(&:name)).not_to include "Apples.csv"
|
42
|
+
end
|
43
|
+
|
44
|
+
it 'extracts all matching files' do
|
45
|
+
expect(sftpfile.to_download.map(&:name)).to match_array([
|
46
|
+
"ApplicantsA-9.csv",
|
47
|
+
"ApplicantsA-3.csv",
|
48
|
+
"ApplicantsA-5.csv"
|
49
|
+
])
|
50
|
+
end
|
51
|
+
end
|
52
|
+
|
53
|
+
|
54
|
+
context 'extracting only the most recent matching a pattern' do
|
55
|
+
before do
|
56
|
+
@params.merge!({
|
57
|
+
remote_file: /ApplicantsA-\d+\.csv/,
|
58
|
+
most_recent_only: true
|
59
|
+
})
|
60
|
+
end
|
61
|
+
|
62
|
+
it 'extracts only the most recent matching file' do
|
63
|
+
expect(sftpfile.to_download.map(&:name)).to match_array([
|
64
|
+
"ApplicantsA-5.csv"
|
65
|
+
])
|
66
|
+
end
|
67
|
+
|
68
|
+
context 'using filename instead of createtime' do
|
69
|
+
before do
|
70
|
+
@params[:most_recent_by] = :filename
|
71
|
+
end
|
72
|
+
|
73
|
+
it 'extracts only the most recent matching file' do
|
74
|
+
expect(sftpfile.to_download.map(&:name)).to match_array([
|
75
|
+
"ApplicantsA-9.csv"
|
76
|
+
])
|
77
|
+
end
|
78
|
+
end
|
79
|
+
end
|
80
|
+
|
81
|
+
|
82
|
+
context 'extracting files matching a pattern with a by group' do
|
83
|
+
before do
|
84
|
+
@params.merge!({
|
85
|
+
credentials: nil,
|
86
|
+
remote_file: /^Applicants(A|B)-\d+\.csv/,
|
87
|
+
group_by: /^Applicants(A|B)/
|
88
|
+
})
|
89
|
+
end
|
90
|
+
|
91
|
+
it 'extracts the most recent file that matches a particular regex' do
|
92
|
+
expect(sftpfile.to_download.map(&:name)).to match_array([
|
93
|
+
"ApplicantsA-5.csv",
|
94
|
+
"ApplicantsB-2.csv"
|
95
|
+
])
|
96
|
+
end
|
97
|
+
|
98
|
+
context 'with a minimally selective pre-filter' do
|
99
|
+
before do
|
100
|
+
@params[:remote_file] = /^Applicants/
|
101
|
+
end
|
102
|
+
|
103
|
+
it 'extracts the most recent file that matches a particular regex' do
|
104
|
+
expect(sftpfile.to_download.map(&:name)).to match_array([
|
105
|
+
"ApplicantsA-5.csv",
|
106
|
+
"ApplicantsB-2.txt"
|
107
|
+
])
|
108
|
+
end
|
109
|
+
end
|
110
|
+
|
111
|
+
context 'using filename instead of createtime' do
|
112
|
+
before do
|
113
|
+
@params[:most_recent_by] = :filename
|
114
|
+
end
|
115
|
+
|
116
|
+
it 'extracts only the most recent matching file' do
|
117
|
+
expect(sftpfile.to_download.map(&:name)).to match_array([
|
118
|
+
"ApplicantsA-9.csv",
|
119
|
+
"ApplicantsB-7.csv"
|
120
|
+
])
|
121
|
+
end
|
122
|
+
end
|
123
|
+
|
124
|
+
end
|
125
|
+
end
|
data/spec/remi_spec.rb
ADDED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: remi
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.19
|
4
|
+
version: 0.2.20
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Sterling Paramore
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-03-
|
11
|
+
date: 2016-03-10 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: daru
|
@@ -279,6 +279,8 @@ files:
|
|
279
279
|
- lib/remi/transform.rb
|
280
280
|
- lib/remi/version.rb
|
281
281
|
- remi.gemspec
|
282
|
+
- spec/extractor/sftp_file_spec.rb
|
283
|
+
- spec/remi_spec.rb
|
282
284
|
- workbooks/sample_workbook.ipynb
|
283
285
|
- workbooks/workbook_helper.rb
|
284
286
|
homepage: https://github.com/inside-track/remi
|
@@ -321,3 +323,5 @@ test_files:
|
|
321
323
|
- features/transforms/parse_date.feature
|
322
324
|
- features/transforms/prefix.feature
|
323
325
|
- features/transforms/truncate.feature
|
326
|
+
- spec/extractor/sftp_file_spec.rb
|
327
|
+
- spec/remi_spec.rb
|