remi 0.2.33 → 0.2.34

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: a91453040868eaabb19fb50db4aafa0f0da4a643
4
- data.tar.gz: 8530db1145f89b1abcb8450fe8bcb4eb5325409d
3
+ metadata.gz: 49fe8ceb344f11f09d99cc4c93cfb03a23e4ba48
4
+ data.tar.gz: 320de69f750fc9695209f23413a8f6761de8bf3f
5
5
  SHA512:
6
- metadata.gz: 866bd205d40ea549fc8b11fd06d599a54a12bae6cdc3ddbb746c377c5bd17617ddf4eb48b0e88e2869df529eeafe26043b8fba7bbea8641ecc81eb2455b8480f
7
- data.tar.gz: fc7906b2ffd875e42519a065a21212237cf48c0ece1a0ad393f58b83a02ea6f40811290e4fc21068a0322d9aa2828d8e0e5ea5bec61cb835ed9602eba0c87a6f
6
+ metadata.gz: 7253a752da9a2dfeed5b27128bef939d042e0fc67af42a6bb4720cf9e37f8826d703ad4aee1a676e52f803093f37914dbacba7bd5844fab678373c11febf9064
7
+ data.tar.gz: bd5440f12c67f16f4190d56147130d619aee83c76f8fe93da156653ae7d7d1ffa93372767aceca06db2869bcb91d6731191cb4f7e0f77b223e971b651145dbcb
data/Gemfile.lock CHANGED
@@ -19,7 +19,7 @@ GIT
19
19
  PATH
20
20
  remote: .
21
21
  specs:
22
- remi (0.2.33)
22
+ remi (0.2.34)
23
23
  activesupport (~> 4.2)
24
24
  bond (~> 0.5)
25
25
  cucumber (~> 2.1)
@@ -656,6 +656,13 @@ Then /^the target has (\d+) record(?:s|)$/ do |nrecords|
656
656
  step "the target '#{target_name}' has #{nrecords} records"
657
657
  end
658
658
 
659
+ Then /^the targets have (\d+) record(?:s|)$/ do |nrecords|
660
+ @brt.run_transforms
661
+
662
+ obs_nrecords = @brt.targets.keys.reduce(0) { |sum, target_name| sum += @brt.targets[target_name].size }
663
+ expect(obs_nrecords).to eq nrecords.to_i
664
+ end
665
+
659
666
  Then /^the target '([[:alnum:]\s\-_]+)' has (\d+) record(?:s|)$/ do |target_name, nrecords|
660
667
  @brt.run_transforms
661
668
  expect(@brt.targets[target_name].size).to eq nrecords.to_i
@@ -54,7 +54,8 @@ module Remi
54
54
  result_df = nil
55
55
  extract.each_with_index do |filename, idx|
56
56
  @logger.info "Converting #{filename} to a dataframe"
57
- csv_df = Daru::DataFrame.from_csv filename, @csv_options
57
+ processed_filename = preprocess(filename)
58
+ csv_df = Daru::DataFrame.from_csv processed_filename, @csv_options
58
59
 
59
60
  csv_df[@filename_field] = Daru::Vector.new([filename] * csv_df.size, index: csv_df.index) if @filename_field
60
61
  if idx == 0
@@ -104,12 +105,31 @@ module Remi
104
105
  end
105
106
 
106
107
 
108
+
107
109
  private
108
110
 
109
- def init_csv_file(*args, extractor:, csv_options: {}, filename_field: nil, **kargs, &block)
111
+ def preprocess(filename)
112
+ return filename unless @preprocessor
113
+ @logger.info "Preprocessing #{filename}"
114
+ tmp_filename = File.join(Remi::Settings.work_dir, "#{Pathname.new(filename).basename}-#{SecureRandom.uuid}")
115
+
116
+ dirname = Pathname.new(tmp_filename).dirname
117
+ FileUtils.mkdir_p(dirname) unless File.directory? dirname
118
+
119
+ File.open(tmp_filename, 'w') do |outfile|
120
+ File.foreach(filename) do |in_line|
121
+ outfile.write @preprocessor.call(in_line)
122
+ end
123
+ end
124
+
125
+ tmp_filename
126
+ end
127
+
128
+ def init_csv_file(*args, extractor:, csv_options: {}, filename_field: nil, preprocessor: nil, **kargs, &block)
110
129
  self.extractor = extractor
111
130
  @csv_options = self.class.default_csv_options.merge(csv_options)
112
131
  @filename_field = filename_field
132
+ @preprocessor = preprocessor
113
133
  end
114
134
  end
115
135
 
data/lib/remi/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Remi
2
- VERSION = '0.2.33'
2
+ VERSION = '0.2.34'
3
3
  end
@@ -0,0 +1,78 @@
1
+ require_relative '../remi_spec'
2
+
3
+ describe DataSource::CsvFile do
4
+
5
+ it "converts a CSV into a dataframe" do
6
+ csv = Remi::DataSource::CsvFile.new(
7
+ extractor: 'spec/fixtures/basic.csv'
8
+ )
9
+
10
+ expected_df = Remi::DataFrame::Daru.new(
11
+ {
12
+ column_a: ['value 1A', 'value 2A'],
13
+ column_b: ['value 1B', 'value 2B']
14
+ }
15
+ )
16
+ expect(csv.df.to_a).to eq expected_df.to_a
17
+ end
18
+
19
+ it "adds filename when requested" do
20
+ csv = Remi::DataSource::CsvFile.new(
21
+ extractor: 'spec/fixtures/basic.csv',
22
+ filename_field: :from_file
23
+ )
24
+
25
+ expect(csv.df[:from_file].to_a).to eq ['spec/fixtures/basic.csv'] * 2
26
+ end
27
+
28
+ it "preprocesses records when required" do
29
+ csv = Remi::DataSource::CsvFile.new(
30
+ extractor: 'spec/fixtures/unsupported_escape.csv',
31
+ preprocessor: ->(line) { line.gsub(/\\"/,'""') }
32
+ )
33
+
34
+ expected_df = Remi::DataFrame::Daru.new(
35
+ {
36
+ column_a: ['value 1A', 'value 2A'],
37
+ column_b: ['value "1B"', 'value "2B"']
38
+ }
39
+ )
40
+ expect(csv.df.to_a).to eq expected_df.to_a
41
+ end
42
+
43
+ it "accepts standard Ruby CSV options" do
44
+ csv = Remi::DataSource::CsvFile.new(
45
+ extractor: 'spec/fixtures/basic.csv',
46
+ preprocessor: ->(line) { line.gsub(/,/,'|') },
47
+ csv_options: { col_sep: '|' }
48
+ )
49
+
50
+ expected_df = Remi::DataFrame::Daru.new(
51
+ {
52
+ column_a: ['value 1A', 'value 2A'],
53
+ column_b: ['value 1B', 'value 2B']
54
+ }
55
+ )
56
+ expect(csv.df.to_a).to eq expected_df.to_a
57
+ end
58
+
59
+ # Do this when I retire the old LocalFile
60
+ it "combines multiple csv files into a single dataframe", skip: 'TODO' do
61
+ csv = Remi::DataSource::CsvFile.new(
62
+ extractor: Remi::Extractor::LocalFile.new(
63
+ remote_path: 'spec/fixtures',
64
+ pattern: 'basic(|2)\.csv'
65
+ )
66
+ )
67
+
68
+ expected_df = Remi::DataFrame::Daru.new(
69
+ {
70
+ column_a: ['value 1A', 'value 2A', 'value 1A', 'value 2A'],
71
+ column_b: ['value 1B', 'value 2B', nil, nil],
72
+ columb_c: [nil, nil, 'value 1C', 'value 2C']
73
+ }
74
+ )
75
+ expect(csv.df.to_a).to eq expected_df.to_a
76
+ end
77
+
78
+ end
@@ -0,0 +1,3 @@
1
+ column A,column B
2
+ value 1A,value 1B
3
+ value 2A,value 2B
@@ -0,0 +1,3 @@
1
+ column A,column C
2
+ value 1A,value 1C
3
+ value 2A,value 2C
@@ -0,0 +1,3 @@
1
+ column A,column B
2
+ value 1A,"value \"1B\""
3
+ value 2A,"value \"2B\""
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: remi
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.33
4
+ version: 0.2.34
5
5
  platform: ruby
6
6
  authors:
7
7
  - Sterling Paramore
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-05-23 00:00:00.000000000 Z
11
+ date: 2016-06-01 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bond
@@ -245,9 +245,13 @@ files:
245
245
  - lib/remi/transform.rb
246
246
  - lib/remi/version.rb
247
247
  - remi.gemspec
248
+ - spec/data_subject/csv_file_spec.rb
248
249
  - spec/extractor/file_system_spec.rb
249
250
  - spec/extractor/s3_file_spec.rb
250
251
  - spec/extractor/sftp_file_spec.rb
252
+ - spec/fixtures/basic.csv
253
+ - spec/fixtures/basic2.csv
254
+ - spec/fixtures/unsupported_escape.csv
251
255
  - spec/metadata_spec.rb
252
256
  - spec/remi_spec.rb
253
257
  - workbooks/sample_workbook.ipynb
@@ -296,8 +300,12 @@ test_files:
296
300
  - features/transforms/prefix.feature
297
301
  - features/transforms/truncate.feature
298
302
  - features/transforms/truthy.feature
303
+ - spec/data_subject/csv_file_spec.rb
299
304
  - spec/extractor/file_system_spec.rb
300
305
  - spec/extractor/s3_file_spec.rb
301
306
  - spec/extractor/sftp_file_spec.rb
307
+ - spec/fixtures/basic.csv
308
+ - spec/fixtures/basic2.csv
309
+ - spec/fixtures/unsupported_escape.csv
302
310
  - spec/metadata_spec.rb
303
311
  - spec/remi_spec.rb