remi 0.2.33 → 0.2.34
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/features/step_definitions/remi_step.rb +7 -0
- data/lib/remi/data_subject/csv_file.rb +22 -2
- data/lib/remi/version.rb +1 -1
- data/spec/data_subject/csv_file_spec.rb +78 -0
- data/spec/fixtures/basic.csv +3 -0
- data/spec/fixtures/basic2.csv +3 -0
- data/spec/fixtures/unsupported_escape.csv +3 -0
- metadata +10 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 49fe8ceb344f11f09d99cc4c93cfb03a23e4ba48
|
4
|
+
data.tar.gz: 320de69f750fc9695209f23413a8f6761de8bf3f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7253a752da9a2dfeed5b27128bef939d042e0fc67af42a6bb4720cf9e37f8826d703ad4aee1a676e52f803093f37914dbacba7bd5844fab678373c11febf9064
|
7
|
+
data.tar.gz: bd5440f12c67f16f4190d56147130d619aee83c76f8fe93da156653ae7d7d1ffa93372767aceca06db2869bcb91d6731191cb4f7e0f77b223e971b651145dbcb
|
data/Gemfile.lock
CHANGED
(hunk not captured in this extract)
data/features/step_definitions/remi_step.rb
CHANGED
@@ -656,6 +656,13 @@ Then /^the target has (\d+) record(?:s|)$/ do |nrecords|
|
|
656
656
|
step "the target '#{target_name}' has #{nrecords} records"
|
657
657
|
end
|
658
658
|
|
659
|
+
Then /^the targets have (\d+) record(?:s|)$/ do |nrecords|
|
660
|
+
@brt.run_transforms
|
661
|
+
|
662
|
+
obs_nrecords = @brt.targets.keys.reduce(0) { |sum, target_name| sum += @brt.targets[target_name].size }
|
663
|
+
expect(obs_nrecords).to eq nrecords.to_i
|
664
|
+
end
|
665
|
+
|
659
666
|
Then /^the target '([[:alnum:]\s\-_]+)' has (\d+) record(?:s|)$/ do |target_name, nrecords|
|
660
667
|
@brt.run_transforms
|
661
668
|
expect(@brt.targets[target_name].size).to eq nrecords.to_i
|
data/lib/remi/data_subject/csv_file.rb
CHANGED
@@ -54,7 +54,8 @@ module Remi
|
|
54
54
|
result_df = nil
|
55
55
|
extract.each_with_index do |filename, idx|
|
56
56
|
@logger.info "Converting #{filename} to a dataframe"
|
57
|
-
|
57
|
+
processed_filename = preprocess(filename)
|
58
|
+
csv_df = Daru::DataFrame.from_csv processed_filename, @csv_options
|
58
59
|
|
59
60
|
csv_df[@filename_field] = Daru::Vector.new([filename] * csv_df.size, index: csv_df.index) if @filename_field
|
60
61
|
if idx == 0
|
@@ -104,12 +105,31 @@ module Remi
|
|
104
105
|
end
|
105
106
|
|
106
107
|
|
108
|
+
|
107
109
|
private
|
108
110
|
|
109
|
-
def init_csv_file(*args, extractor:, csv_options: {}, filename_field: nil, **kargs, &block)
|
111
|
+
def preprocess(filename)
|
112
|
+
return filename unless @preprocessor
|
113
|
+
@logger.info "Preprocessing #{filename}"
|
114
|
+
tmp_filename = File.join(Remi::Settings.work_dir, "#{Pathname.new(filename).basename}-#{SecureRandom.uuid}")
|
115
|
+
|
116
|
+
dirname = Pathname.new(tmp_filename).dirname
|
117
|
+
FileUtils.mkdir_p(dirname) unless File.directory? dirname
|
118
|
+
|
119
|
+
File.open(tmp_filename, 'w') do |outfile|
|
120
|
+
File.foreach(filename) do |in_line|
|
121
|
+
outfile.write @preprocessor.call(in_line)
|
122
|
+
end
|
123
|
+
end
|
124
|
+
|
125
|
+
tmp_filename
|
126
|
+
end
|
127
|
+
|
128
|
+
def init_csv_file(*args, extractor:, csv_options: {}, filename_field: nil, preprocessor: nil, **kargs, &block)
|
110
129
|
self.extractor = extractor
|
111
130
|
@csv_options = self.class.default_csv_options.merge(csv_options)
|
112
131
|
@filename_field = filename_field
|
132
|
+
@preprocessor = preprocessor
|
113
133
|
end
|
114
134
|
end
|
115
135
|
|
data/lib/remi/version.rb
CHANGED
(hunk not captured in this extract)
data/spec/data_subject/csv_file_spec.rb
ADDED
@@ -0,0 +1,78 @@
|
|
1
|
+
require_relative '../remi_spec'
|
2
|
+
|
3
|
+
describe DataSource::CsvFile do
|
4
|
+
|
5
|
+
it "converts a CSV into a dataframe" do
|
6
|
+
csv = Remi::DataSource::CsvFile.new(
|
7
|
+
extractor: 'spec/fixtures/basic.csv'
|
8
|
+
)
|
9
|
+
|
10
|
+
expected_df = Remi::DataFrame::Daru.new(
|
11
|
+
{
|
12
|
+
column_a: ['value 1A', 'value 2A'],
|
13
|
+
column_b: ['value 1B', 'value 2B']
|
14
|
+
}
|
15
|
+
)
|
16
|
+
expect(csv.df.to_a).to eq expected_df.to_a
|
17
|
+
end
|
18
|
+
|
19
|
+
it "adds filename when requested" do
|
20
|
+
csv = Remi::DataSource::CsvFile.new(
|
21
|
+
extractor: 'spec/fixtures/basic.csv',
|
22
|
+
filename_field: :from_file
|
23
|
+
)
|
24
|
+
|
25
|
+
expect(csv.df[:from_file].to_a).to eq ['spec/fixtures/basic.csv'] * 2
|
26
|
+
end
|
27
|
+
|
28
|
+
it "preprocesses records when required" do
|
29
|
+
csv = Remi::DataSource::CsvFile.new(
|
30
|
+
extractor: 'spec/fixtures/unsupported_escape.csv',
|
31
|
+
preprocessor: ->(line) { line.gsub(/\\"/,'""') }
|
32
|
+
)
|
33
|
+
|
34
|
+
expected_df = Remi::DataFrame::Daru.new(
|
35
|
+
{
|
36
|
+
column_a: ['value 1A', 'value 2A'],
|
37
|
+
column_b: ['value "1B"', 'value "2B"']
|
38
|
+
}
|
39
|
+
)
|
40
|
+
expect(csv.df.to_a).to eq expected_df.to_a
|
41
|
+
end
|
42
|
+
|
43
|
+
it "accepts standard Ruby CSV options" do
|
44
|
+
csv = Remi::DataSource::CsvFile.new(
|
45
|
+
extractor: 'spec/fixtures/basic.csv',
|
46
|
+
preprocessor: ->(line) { line.gsub(/,/,'|') },
|
47
|
+
csv_options: { col_sep: '|' }
|
48
|
+
)
|
49
|
+
|
50
|
+
expected_df = Remi::DataFrame::Daru.new(
|
51
|
+
{
|
52
|
+
column_a: ['value 1A', 'value 2A'],
|
53
|
+
column_b: ['value 1B', 'value 2B']
|
54
|
+
}
|
55
|
+
)
|
56
|
+
expect(csv.df.to_a).to eq expected_df.to_a
|
57
|
+
end
|
58
|
+
|
59
|
+
# Do this when I retire the old LocalFile
|
60
|
+
it "combines multiple csv files into a single dataframe", skip: 'TODO' do
|
61
|
+
csv = Remi::DataSource::CsvFile.new(
|
62
|
+
extractor: Remi::Extractor::LocalFile.new(
|
63
|
+
remote_path: 'spec/fixtures',
|
64
|
+
pattern: 'basic(|2)\.csv'
|
65
|
+
)
|
66
|
+
)
|
67
|
+
|
68
|
+
expected_df = Remi::DataFrame::Daru.new(
|
69
|
+
{
|
70
|
+
column_a: ['value 1A', 'value 2A', 'value 1A', 'value 2A'],
|
71
|
+
column_b: ['value 1B', 'value 2B', nil, nil],
|
72
|
+
columb_c: [nil, nil, 'value 1C', 'value 2C']
|
73
|
+
}
|
74
|
+
)
|
75
|
+
expect(csv.df.to_a).to eq expected_df.to_a
|
76
|
+
end
|
77
|
+
|
78
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: remi
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.33
|
4
|
+
version: 0.2.34
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Sterling Paramore
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-
|
11
|
+
date: 2016-06-01 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bond
|
@@ -245,9 +245,13 @@ files:
|
|
245
245
|
- lib/remi/transform.rb
|
246
246
|
- lib/remi/version.rb
|
247
247
|
- remi.gemspec
|
248
|
+
- spec/data_subject/csv_file_spec.rb
|
248
249
|
- spec/extractor/file_system_spec.rb
|
249
250
|
- spec/extractor/s3_file_spec.rb
|
250
251
|
- spec/extractor/sftp_file_spec.rb
|
252
|
+
- spec/fixtures/basic.csv
|
253
|
+
- spec/fixtures/basic2.csv
|
254
|
+
- spec/fixtures/unsupported_escape.csv
|
251
255
|
- spec/metadata_spec.rb
|
252
256
|
- spec/remi_spec.rb
|
253
257
|
- workbooks/sample_workbook.ipynb
|
@@ -296,8 +300,12 @@ test_files:
|
|
296
300
|
- features/transforms/prefix.feature
|
297
301
|
- features/transforms/truncate.feature
|
298
302
|
- features/transforms/truthy.feature
|
303
|
+
- spec/data_subject/csv_file_spec.rb
|
299
304
|
- spec/extractor/file_system_spec.rb
|
300
305
|
- spec/extractor/s3_file_spec.rb
|
301
306
|
- spec/extractor/sftp_file_spec.rb
|
307
|
+
- spec/fixtures/basic.csv
|
308
|
+
- spec/fixtures/basic2.csv
|
309
|
+
- spec/fixtures/unsupported_escape.csv
|
302
310
|
- spec/metadata_spec.rb
|
303
311
|
- spec/remi_spec.rb
|