remi 0.2.33 → 0.2.34

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: a91453040868eaabb19fb50db4aafa0f0da4a643
4
- data.tar.gz: 8530db1145f89b1abcb8450fe8bcb4eb5325409d
3
+ metadata.gz: 49fe8ceb344f11f09d99cc4c93cfb03a23e4ba48
4
+ data.tar.gz: 320de69f750fc9695209f23413a8f6761de8bf3f
5
5
  SHA512:
6
- metadata.gz: 866bd205d40ea549fc8b11fd06d599a54a12bae6cdc3ddbb746c377c5bd17617ddf4eb48b0e88e2869df529eeafe26043b8fba7bbea8641ecc81eb2455b8480f
7
- data.tar.gz: fc7906b2ffd875e42519a065a21212237cf48c0ece1a0ad393f58b83a02ea6f40811290e4fc21068a0322d9aa2828d8e0e5ea5bec61cb835ed9602eba0c87a6f
6
+ metadata.gz: 7253a752da9a2dfeed5b27128bef939d042e0fc67af42a6bb4720cf9e37f8826d703ad4aee1a676e52f803093f37914dbacba7bd5844fab678373c11febf9064
7
+ data.tar.gz: bd5440f12c67f16f4190d56147130d619aee83c76f8fe93da156653ae7d7d1ffa93372767aceca06db2869bcb91d6731191cb4f7e0f77b223e971b651145dbcb
data/Gemfile.lock CHANGED
@@ -19,7 +19,7 @@ GIT
19
19
  PATH
20
20
  remote: .
21
21
  specs:
22
- remi (0.2.33)
22
+ remi (0.2.34)
23
23
  activesupport (~> 4.2)
24
24
  bond (~> 0.5)
25
25
  cucumber (~> 2.1)
@@ -656,6 +656,13 @@ Then /^the target has (\d+) record(?:s|)$/ do |nrecords|
656
656
  step "the target '#{target_name}' has #{nrecords} records"
657
657
  end
658
658
 
659
+ Then /^the targets have (\d+) record(?:s|)$/ do |nrecords|
660
+ @brt.run_transforms
661
+
662
+ obs_nrecords = @brt.targets.keys.reduce(0) { |sum, target_name| sum += @brt.targets[target_name].size }
663
+ expect(obs_nrecords).to eq nrecords.to_i
664
+ end
665
+
659
666
  Then /^the target '([[:alnum:]\s\-_]+)' has (\d+) record(?:s|)$/ do |target_name, nrecords|
660
667
  @brt.run_transforms
661
668
  expect(@brt.targets[target_name].size).to eq nrecords.to_i
@@ -54,7 +54,8 @@ module Remi
54
54
  result_df = nil
55
55
  extract.each_with_index do |filename, idx|
56
56
  @logger.info "Converting #{filename} to a dataframe"
57
- csv_df = Daru::DataFrame.from_csv filename, @csv_options
57
+ processed_filename = preprocess(filename)
58
+ csv_df = Daru::DataFrame.from_csv processed_filename, @csv_options
58
59
 
59
60
  csv_df[@filename_field] = Daru::Vector.new([filename] * csv_df.size, index: csv_df.index) if @filename_field
60
61
  if idx == 0
@@ -104,12 +105,31 @@ module Remi
104
105
  end
105
106
 
106
107
 
108
+
107
109
  private
108
110
 
109
- def init_csv_file(*args, extractor:, csv_options: {}, filename_field: nil, **kargs, &block)
111
+ def preprocess(filename)
112
+ return filename unless @preprocessor
113
+ @logger.info "Preprocessing #{filename}"
114
+ tmp_filename = File.join(Remi::Settings.work_dir, "#{Pathname.new(filename).basename}-#{SecureRandom.uuid}")
115
+
116
+ dirname = Pathname.new(tmp_filename).dirname
117
+ FileUtils.mkdir_p(dirname) unless File.directory? dirname
118
+
119
+ File.open(tmp_filename, 'w') do |outfile|
120
+ File.foreach(filename) do |in_line|
121
+ outfile.write @preprocessor.call(in_line)
122
+ end
123
+ end
124
+
125
+ tmp_filename
126
+ end
127
+
128
+ def init_csv_file(*args, extractor:, csv_options: {}, filename_field: nil, preprocessor: nil, **kargs, &block)
110
129
  self.extractor = extractor
111
130
  @csv_options = self.class.default_csv_options.merge(csv_options)
112
131
  @filename_field = filename_field
132
+ @preprocessor = preprocessor
113
133
  end
114
134
  end
115
135
 
data/lib/remi/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Remi
2
- VERSION = '0.2.33'
2
+ VERSION = '0.2.34'
3
3
  end
@@ -0,0 +1,78 @@
1
+ require_relative '../remi_spec'
2
+
3
+ describe DataSource::CsvFile do
4
+
5
+ it "converts a CSV into a dataframe" do
6
+ csv = Remi::DataSource::CsvFile.new(
7
+ extractor: 'spec/fixtures/basic.csv'
8
+ )
9
+
10
+ expected_df = Remi::DataFrame::Daru.new(
11
+ {
12
+ column_a: ['value 1A', 'value 2A'],
13
+ column_b: ['value 1B', 'value 2B']
14
+ }
15
+ )
16
+ expect(csv.df.to_a).to eq expected_df.to_a
17
+ end
18
+
19
+ it "adds filename when requested" do
20
+ csv = Remi::DataSource::CsvFile.new(
21
+ extractor: 'spec/fixtures/basic.csv',
22
+ filename_field: :from_file
23
+ )
24
+
25
+ expect(csv.df[:from_file].to_a).to eq ['spec/fixtures/basic.csv'] * 2
26
+ end
27
+
28
+ it "preprocesses records when required" do
29
+ csv = Remi::DataSource::CsvFile.new(
30
+ extractor: 'spec/fixtures/unsupported_escape.csv',
31
+ preprocessor: ->(line) { line.gsub(/\\"/,'""') }
32
+ )
33
+
34
+ expected_df = Remi::DataFrame::Daru.new(
35
+ {
36
+ column_a: ['value 1A', 'value 2A'],
37
+ column_b: ['value "1B"', 'value "2B"']
38
+ }
39
+ )
40
+ expect(csv.df.to_a).to eq expected_df.to_a
41
+ end
42
+
43
+ it "accepts standard Ruby CSV options" do
44
+ csv = Remi::DataSource::CsvFile.new(
45
+ extractor: 'spec/fixtures/basic.csv',
46
+ preprocessor: ->(line) { line.gsub(/,/,'|') },
47
+ csv_options: { col_sep: '|' }
48
+ )
49
+
50
+ expected_df = Remi::DataFrame::Daru.new(
51
+ {
52
+ column_a: ['value 1A', 'value 2A'],
53
+ column_b: ['value 1B', 'value 2B']
54
+ }
55
+ )
56
+ expect(csv.df.to_a).to eq expected_df.to_a
57
+ end
58
+
59
+ # Do this when I retire the old LocalFile
60
+ it "combines multiple csv files into a single dataframe", skip: 'TODO' do
61
+ csv = Remi::DataSource::CsvFile.new(
62
+ extractor: Remi::Extractor::LocalFile.new(
63
+ remote_path: 'spec/fixtures',
64
+ pattern: 'basic(|2)\.csv'
65
+ )
66
+ )
67
+
68
+ expected_df = Remi::DataFrame::Daru.new(
69
+ {
70
+ column_a: ['value 1A', 'value 2A', 'value 1A', 'value 2A'],
71
+ column_b: ['value 1B', 'value 2B', nil, nil],
72
+ columb_c: [nil, nil, 'value 1C', 'value 2C']
73
+ }
74
+ )
75
+ expect(csv.df.to_a).to eq expected_df.to_a
76
+ end
77
+
78
+ end
@@ -0,0 +1,3 @@
1
+ column A,column B
2
+ value 1A,value 1B
3
+ value 2A,value 2B
@@ -0,0 +1,3 @@
1
+ column A,column C
2
+ value 1A,value 1C
3
+ value 2A,value 2C
@@ -0,0 +1,3 @@
1
+ column A,column B
2
+ value 1A,"value \"1B\""
3
+ value 2A,"value \"2B\""
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: remi
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.33
4
+ version: 0.2.34
5
5
  platform: ruby
6
6
  authors:
7
7
  - Sterling Paramore
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-05-23 00:00:00.000000000 Z
11
+ date: 2016-06-01 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bond
@@ -245,9 +245,13 @@ files:
245
245
  - lib/remi/transform.rb
246
246
  - lib/remi/version.rb
247
247
  - remi.gemspec
248
+ - spec/data_subject/csv_file_spec.rb
248
249
  - spec/extractor/file_system_spec.rb
249
250
  - spec/extractor/s3_file_spec.rb
250
251
  - spec/extractor/sftp_file_spec.rb
252
+ - spec/fixtures/basic.csv
253
+ - spec/fixtures/basic2.csv
254
+ - spec/fixtures/unsupported_escape.csv
251
255
  - spec/metadata_spec.rb
252
256
  - spec/remi_spec.rb
253
257
  - workbooks/sample_workbook.ipynb
@@ -296,8 +300,12 @@ test_files:
296
300
  - features/transforms/prefix.feature
297
301
  - features/transforms/truncate.feature
298
302
  - features/transforms/truthy.feature
303
+ - spec/data_subject/csv_file_spec.rb
299
304
  - spec/extractor/file_system_spec.rb
300
305
  - spec/extractor/s3_file_spec.rb
301
306
  - spec/extractor/sftp_file_spec.rb
307
+ - spec/fixtures/basic.csv
308
+ - spec/fixtures/basic2.csv
309
+ - spec/fixtures/unsupported_escape.csv
302
310
  - spec/metadata_spec.rb
303
311
  - spec/remi_spec.rb