remi 0.2.33 → 0.2.34
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/features/step_definitions/remi_step.rb +7 -0
- data/lib/remi/data_subject/csv_file.rb +22 -2
- data/lib/remi/version.rb +1 -1
- data/spec/data_subject/csv_file_spec.rb +78 -0
- data/spec/fixtures/basic.csv +3 -0
- data/spec/fixtures/basic2.csv +3 -0
- data/spec/fixtures/unsupported_escape.csv +3 -0
- metadata +10 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA1:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 49fe8ceb344f11f09d99cc4c93cfb03a23e4ba48
|
|
4
|
+
data.tar.gz: 320de69f750fc9695209f23413a8f6761de8bf3f
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 7253a752da9a2dfeed5b27128bef939d042e0fc67af42a6bb4720cf9e37f8826d703ad4aee1a676e52f803093f37914dbacba7bd5844fab678373c11febf9064
|
|
7
|
+
data.tar.gz: bd5440f12c67f16f4190d56147130d619aee83c76f8fe93da156653ae7d7d1ffa93372767aceca06db2869bcb91d6731191cb4f7e0f77b223e971b651145dbcb
|
data/Gemfile.lock
CHANGED
data/features/step_definitions/remi_step.rb
CHANGED
|
@@ -656,6 +656,13 @@ Then /^the target has (\d+) record(?:s|)$/ do |nrecords|
|
|
|
656
656
|
step "the target '#{target_name}' has #{nrecords} records"
|
|
657
657
|
end
|
|
658
658
|
|
|
659
|
+
Then /^the targets have (\d+) record(?:s|)$/ do |nrecords|
|
|
660
|
+
@brt.run_transforms
|
|
661
|
+
|
|
662
|
+
obs_nrecords = @brt.targets.keys.reduce(0) { |sum, target_name| sum += @brt.targets[target_name].size }
|
|
663
|
+
expect(obs_nrecords).to eq nrecords.to_i
|
|
664
|
+
end
|
|
665
|
+
|
|
659
666
|
Then /^the target '([[:alnum:]\s\-_]+)' has (\d+) record(?:s|)$/ do |target_name, nrecords|
|
|
660
667
|
@brt.run_transforms
|
|
661
668
|
expect(@brt.targets[target_name].size).to eq nrecords.to_i
|
data/lib/remi/data_subject/csv_file.rb
CHANGED
|
@@ -54,7 +54,8 @@ module Remi
|
|
|
54
54
|
result_df = nil
|
|
55
55
|
extract.each_with_index do |filename, idx|
|
|
56
56
|
@logger.info "Converting #{filename} to a dataframe"
|
|
57
|
-
|
|
57
|
+
processed_filename = preprocess(filename)
|
|
58
|
+
csv_df = Daru::DataFrame.from_csv processed_filename, @csv_options
|
|
58
59
|
|
|
59
60
|
csv_df[@filename_field] = Daru::Vector.new([filename] * csv_df.size, index: csv_df.index) if @filename_field
|
|
60
61
|
if idx == 0
|
|
@@ -104,12 +105,31 @@ module Remi
|
|
|
104
105
|
end
|
|
105
106
|
|
|
106
107
|
|
|
108
|
+
|
|
107
109
|
private
|
|
108
110
|
|
|
109
|
-
def
|
|
111
|
+
def preprocess(filename)
|
|
112
|
+
return filename unless @preprocessor
|
|
113
|
+
@logger.info "Preprocessing #{filename}"
|
|
114
|
+
tmp_filename = File.join(Remi::Settings.work_dir, "#{Pathname.new(filename).basename}-#{SecureRandom.uuid}")
|
|
115
|
+
|
|
116
|
+
dirname = Pathname.new(tmp_filename).dirname
|
|
117
|
+
FileUtils.mkdir_p(dirname) unless File.directory? dirname
|
|
118
|
+
|
|
119
|
+
File.open(tmp_filename, 'w') do |outfile|
|
|
120
|
+
File.foreach(filename) do |in_line|
|
|
121
|
+
outfile.write @preprocessor.call(in_line)
|
|
122
|
+
end
|
|
123
|
+
end
|
|
124
|
+
|
|
125
|
+
tmp_filename
|
|
126
|
+
end
|
|
127
|
+
|
|
128
|
+
def init_csv_file(*args, extractor:, csv_options: {}, filename_field: nil, preprocessor: nil, **kargs, &block)
|
|
110
129
|
self.extractor = extractor
|
|
111
130
|
@csv_options = self.class.default_csv_options.merge(csv_options)
|
|
112
131
|
@filename_field = filename_field
|
|
132
|
+
@preprocessor = preprocessor
|
|
113
133
|
end
|
|
114
134
|
end
|
|
115
135
|
|
data/lib/remi/version.rb
CHANGED
data/spec/data_subject/csv_file_spec.rb
ADDED
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
require_relative '../remi_spec'
|
|
2
|
+
|
|
3
|
+
describe DataSource::CsvFile do
|
|
4
|
+
|
|
5
|
+
it "converts a CSV into a dataframe" do
|
|
6
|
+
csv = Remi::DataSource::CsvFile.new(
|
|
7
|
+
extractor: 'spec/fixtures/basic.csv'
|
|
8
|
+
)
|
|
9
|
+
|
|
10
|
+
expected_df = Remi::DataFrame::Daru.new(
|
|
11
|
+
{
|
|
12
|
+
column_a: ['value 1A', 'value 2A'],
|
|
13
|
+
column_b: ['value 1B', 'value 2B']
|
|
14
|
+
}
|
|
15
|
+
)
|
|
16
|
+
expect(csv.df.to_a).to eq expected_df.to_a
|
|
17
|
+
end
|
|
18
|
+
|
|
19
|
+
it "adds filename when requested" do
|
|
20
|
+
csv = Remi::DataSource::CsvFile.new(
|
|
21
|
+
extractor: 'spec/fixtures/basic.csv',
|
|
22
|
+
filename_field: :from_file
|
|
23
|
+
)
|
|
24
|
+
|
|
25
|
+
expect(csv.df[:from_file].to_a).to eq ['spec/fixtures/basic.csv'] * 2
|
|
26
|
+
end
|
|
27
|
+
|
|
28
|
+
it "preprocesses records when required" do
|
|
29
|
+
csv = Remi::DataSource::CsvFile.new(
|
|
30
|
+
extractor: 'spec/fixtures/unsupported_escape.csv',
|
|
31
|
+
preprocessor: ->(line) { line.gsub(/\\"/,'""') }
|
|
32
|
+
)
|
|
33
|
+
|
|
34
|
+
expected_df = Remi::DataFrame::Daru.new(
|
|
35
|
+
{
|
|
36
|
+
column_a: ['value 1A', 'value 2A'],
|
|
37
|
+
column_b: ['value "1B"', 'value "2B"']
|
|
38
|
+
}
|
|
39
|
+
)
|
|
40
|
+
expect(csv.df.to_a).to eq expected_df.to_a
|
|
41
|
+
end
|
|
42
|
+
|
|
43
|
+
it "accepts standard Ruby CSV options" do
|
|
44
|
+
csv = Remi::DataSource::CsvFile.new(
|
|
45
|
+
extractor: 'spec/fixtures/basic.csv',
|
|
46
|
+
preprocessor: ->(line) { line.gsub(/,/,'|') },
|
|
47
|
+
csv_options: { col_sep: '|' }
|
|
48
|
+
)
|
|
49
|
+
|
|
50
|
+
expected_df = Remi::DataFrame::Daru.new(
|
|
51
|
+
{
|
|
52
|
+
column_a: ['value 1A', 'value 2A'],
|
|
53
|
+
column_b: ['value 1B', 'value 2B']
|
|
54
|
+
}
|
|
55
|
+
)
|
|
56
|
+
expect(csv.df.to_a).to eq expected_df.to_a
|
|
57
|
+
end
|
|
58
|
+
|
|
59
|
+
# Do this when I retire the old LocalFile
|
|
60
|
+
it "combines multiple csv files into a single dataframe", skip: 'TODO' do
|
|
61
|
+
csv = Remi::DataSource::CsvFile.new(
|
|
62
|
+
extractor: Remi::Extractor::LocalFile.new(
|
|
63
|
+
remote_path: 'spec/fixtures',
|
|
64
|
+
pattern: 'basic(|2)\.csv'
|
|
65
|
+
)
|
|
66
|
+
)
|
|
67
|
+
|
|
68
|
+
expected_df = Remi::DataFrame::Daru.new(
|
|
69
|
+
{
|
|
70
|
+
column_a: ['value 1A', 'value 2A', 'value 1A', 'value 2A'],
|
|
71
|
+
column_b: ['value 1B', 'value 2B', nil, nil],
|
|
72
|
+
columb_c: [nil, nil, 'value 1C', 'value 2C']
|
|
73
|
+
}
|
|
74
|
+
)
|
|
75
|
+
expect(csv.df.to_a).to eq expected_df.to_a
|
|
76
|
+
end
|
|
77
|
+
|
|
78
|
+
end
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: remi
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.2.33
|
|
4
|
+
version: 0.2.34
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Sterling Paramore
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2016-
|
|
11
|
+
date: 2016-06-01 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: bond
|
|
@@ -245,9 +245,13 @@ files:
|
|
|
245
245
|
- lib/remi/transform.rb
|
|
246
246
|
- lib/remi/version.rb
|
|
247
247
|
- remi.gemspec
|
|
248
|
+
- spec/data_subject/csv_file_spec.rb
|
|
248
249
|
- spec/extractor/file_system_spec.rb
|
|
249
250
|
- spec/extractor/s3_file_spec.rb
|
|
250
251
|
- spec/extractor/sftp_file_spec.rb
|
|
252
|
+
- spec/fixtures/basic.csv
|
|
253
|
+
- spec/fixtures/basic2.csv
|
|
254
|
+
- spec/fixtures/unsupported_escape.csv
|
|
251
255
|
- spec/metadata_spec.rb
|
|
252
256
|
- spec/remi_spec.rb
|
|
253
257
|
- workbooks/sample_workbook.ipynb
|
|
@@ -296,8 +300,12 @@ test_files:
|
|
|
296
300
|
- features/transforms/prefix.feature
|
|
297
301
|
- features/transforms/truncate.feature
|
|
298
302
|
- features/transforms/truthy.feature
|
|
303
|
+
- spec/data_subject/csv_file_spec.rb
|
|
299
304
|
- spec/extractor/file_system_spec.rb
|
|
300
305
|
- spec/extractor/s3_file_spec.rb
|
|
301
306
|
- spec/extractor/sftp_file_spec.rb
|
|
307
|
+
- spec/fixtures/basic.csv
|
|
308
|
+
- spec/fixtures/basic2.csv
|
|
309
|
+
- spec/fixtures/unsupported_escape.csv
|
|
302
310
|
- spec/metadata_spec.rb
|
|
303
311
|
- spec/remi_spec.rb
|