smarter_csv 1.0.18 → 1.0.19
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +48 -34
- data/lib/smarter_csv/smarter_csv.rb +8 -6
- data/lib/smarter_csv/version.rb +1 -1
- data/spec/smarter_csv/keep_headers_spec.rb +24 -0
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9022f349dd8ee2590c73b198fb83114a8c96932d
|
4
|
+
data.tar.gz: 9c1a769c72e08e2e78d15ad444bf3f9642cd33e5
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e3ccf944663244bc4b336d9980c26f1fda874d48586a131f3c761b6885a2753ac443c80a559046e2c6670f90ba192155e10aceb0e84798add22c9a20d78653a1
|
7
|
+
data.tar.gz: 69b3abf03488df9b79b796dd7efbc3612bd273fb1e4f6f156b238213ef377e22ff1852ae17fb7384722dfbba456d9ab36313e5d2c43d5a599a696d008194cd29
|
data/README.md
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
# SmarterCSV [](http://travis-ci.org/tilo/smarter_csv)
|
2
2
|
|
3
|
-
`smarter_csv` is a Ruby Gem for smarter importing of CSV Files as Array(s) of Hashes, suitable for direct processing with Mongoid or ActiveRecord,
|
3
|
+
`smarter_csv` is a Ruby Gem for smarter importing of CSV Files as Array(s) of Hashes, suitable for direct processing with Mongoid or ActiveRecord,
|
4
4
|
and parallel processing with Resque or Sidekiq.
|
5
5
|
|
6
6
|
One `smarter_csv` user wrote:
|
@@ -32,6 +32,8 @@ The two main choices you have in terms of how to call `SmarterCSV.process` are:
|
|
32
32
|
* calling `process` with or without a block
|
33
33
|
* passing a `:chunk_size` to the `process` method, and processing the CSV-file in chunks, rather than in one piece.
|
34
34
|
|
35
|
+
Tip: If you are uncertain about what line endings a CSV-file uses, try specifying `:row_sep => :auto` as part of the options. Check out Example 5 for unusual `:row_sep` and `:col_sep`.
|
36
|
+
|
35
37
|
#### Example 1a: How SmarterCSV processes CSV-files as array of hashes:
|
36
38
|
Please note how each hash contains only the keys for columns with non-null values.
|
37
39
|
|
@@ -40,15 +42,15 @@ Please note how each hash contains only the keys for columns with non-null value
|
|
40
42
|
Dan,McAllister,2,,,
|
41
43
|
Lucy,Laweless,,5,,
|
42
44
|
Miles,O'Brian,,,,21
|
43
|
-
Nancy,Homes,2,,1,
|
45
|
+
Nancy,Homes,2,,1,
|
44
46
|
$ irb
|
45
47
|
> require 'smarter_csv'
|
46
|
-
=> true
|
48
|
+
=> true
|
47
49
|
> pets_by_owner = SmarterCSV.process('/tmp/pets.csv')
|
48
50
|
=> [ {:first_name=>"Dan", :last_name=>"McAllister", :dogs=>"2"},
|
49
|
-
{:first_name=>"Lucy", :last_name=>"Laweless", :cats=>"5"},
|
50
|
-
{:first_name=>"Miles", :last_name=>"O'Brian", :fish=>"21"},
|
51
|
-
{:first_name=>"Nancy", :last_name=>"Homes", :dogs=>"2", :birds=>"1"}
|
51
|
+
{:first_name=>"Lucy", :last_name=>"Laweless", :cats=>"5"},
|
52
|
+
{:first_name=>"Miles", :last_name=>"O'Brian", :fish=>"21"},
|
53
|
+
{:first_name=>"Nancy", :last_name=>"Homes", :dogs=>"2", :birds=>"1"}
|
52
54
|
]
|
53
55
|
|
54
56
|
|
@@ -57,7 +59,7 @@ Please note how the returned array contains two sub-arrays containing the chunks
|
|
57
59
|
In case the number of rows is not cleanly divisible by `:chunk_size`, the last chunk contains fewer hashes.
|
58
60
|
|
59
61
|
> pets_by_owner = SmarterCSV.process('/tmp/pets.csv', {:chunk_size => 2, :key_mapping => {:first_name => :first, :last_name => :last}})
|
60
|
-
=> [ [ {:first=>"Dan", :last=>"McAllister", :dogs=>"2"}, {:first=>"Lucy", :last=>"Laweless", :cats=>"5"} ],
|
62
|
+
=> [ [ {:first=>"Dan", :last=>"McAllister", :dogs=>"2"}, {:first=>"Lucy", :last=>"Laweless", :cats=>"5"} ],
|
61
63
|
[ {:first=>"Miles", :last=>"O'Brian", :fish=>"21"}, {:first=>"Nancy", :last=>"Homes", :dogs=>"2", :birds=>"1"} ]
|
62
64
|
]
|
63
65
|
|
@@ -75,7 +77,7 @@ and how the `process` method returns the number of chunks when called with a blo
|
|
75
77
|
|
76
78
|
[{:dogs=>"2", :full_name=>"Dan McAllister"}, {:cats=>"5", :full_name=>"Lucy Laweless"}]
|
77
79
|
[{:fish=>"21", :full_name=>"Miles O'Brian"}, {:dogs=>"2", :birds=>"1", :full_name=>"Nancy Homes"}]
|
78
|
-
=> 2
|
80
|
+
=> 2
|
79
81
|
|
80
82
|
#### Example 2: Reading a CSV-File in one Chunk, returning one Array of Hashes:
|
81
83
|
|
@@ -88,20 +90,21 @@ and how the `process` method returns the number of chunks when called with a blo
|
|
88
90
|
|
89
91
|
# without using chunks:
|
90
92
|
filename = '/tmp/some.csv'
|
91
|
-
|
93
|
+
options = {:key_mapping => {:unwanted_row => nil, :old_row_name => :new_name}}
|
94
|
+
n = SmarterCSV.process(filename, options) do |array|
|
92
95
|
# we're passing a block in, to process each resulting hash / row (the block takes array of hashes)
|
93
96
|
# when chunking is not enabled, there is only one hash in each array
|
94
97
|
MyModel.create( array.first )
|
95
98
|
end
|
96
99
|
|
97
|
-
=> returns number of chunks / rows we processed
|
98
|
-
|
100
|
+
=> returns number of chunks / rows we processed
|
99
101
|
|
100
102
|
#### Example 4: Populate a MongoDB Database in Chunks of 100 records with SmarterCSV:
|
101
103
|
|
102
104
|
# using chunks:
|
103
105
|
filename = '/tmp/some.csv'
|
104
|
-
|
106
|
+
options = {:chunk_size => 100, :key_mapping => {:unwanted_row => nil, :old_row_name => :new_name}}
|
107
|
+
n = SmarterCSV.process(filename, options) do |chunk|
|
105
108
|
# we're passing a block in, to process each resulting hash / row (block takes array of hashes)
|
106
109
|
# when chunking is enabled, there are up to :chunk_size hashes in each chunk
|
107
110
|
MyModel.collection.insert( chunk ) # insert up to 100 records at a time
|
@@ -112,9 +115,12 @@ and how the `process` method returns the number of chunks when called with a blo
|
|
112
115
|
|
113
116
|
#### Example 5: Reading a CSV-like File, and Processing it with Resque:
|
114
117
|
|
115
|
-
filename = '/tmp/strange_db_dump' # a file with CRTL-A as col_separator, and with CTRL-B\n as record_separator (hello iTunes)
|
116
|
-
|
117
|
-
|
118
|
+
filename = '/tmp/strange_db_dump' # a file with CTRL-A as col_separator, and with CTRL-B\n as record_separator (hello iTunes!)
|
119
|
+
options = {
|
120
|
+
:col_sep => "\cA", :row_sep => "\cB\n", :comment_regexp => /^#/,
|
121
|
+
:chunk_size => 100 , :key_mapping => {:export_date => nil, :name => :genre}
|
122
|
+
}
|
123
|
+
n = SmarterCSV.process(filename, options) do |chunk|
|
118
124
|
Resque.enque( ResqueWorkerClass, chunk ) # pass chunks of CSV-data to Resque workers for parallel processing
|
119
125
|
end
|
120
126
|
=> returns number of chunks
|
@@ -139,18 +145,14 @@ The options and the block are optional.
|
|
139
145
|
| :quote_char | '"' | quotation character |
|
140
146
|
| :comment_regexp | /^#/ | regular expression which matches comment lines (see NOTE about the CSV header) |
|
141
147
|
| :chunk_size | nil | if set, determines the desired chunk-size (defaults to nil, no chunk processing) |
|
148
|
+
---------------------------------------------------------------------------------------------------------------------------------
|
142
149
|
| :key_mapping | nil | a hash which maps headers from the CSV file to keys in the result hash |
|
143
150
|
| :remove_unmapped_keys | false | when using :key_mapping option, should non-mapped keys / columns be removed? |
|
144
151
|
| :downcase_header | true | downcase all column headers |
|
145
152
|
| :strings_as_keys | false | use strings instead of symbols as the keys in the result hashes |
|
146
153
|
| :strip_whitespace | true | remove whitespace before/after values and headers |
|
147
|
-
| :
|
148
|
-
|
|
149
|
-
| :remove_values_matching | nil | removes key/value pairs if value matches given regular expressions. e.g.: |
|
150
|
-
| | | /^\$0\.0+$/ to match $0.00 , or /^#VALUE!$/ to match errors in Excel spreadsheets |
|
151
|
-
| :convert_values_to_numeric | true | converts strings containing Integers or Floats to the appropriate class |
|
152
|
-
| | | also accepts either {:except => [:key1,:key2]} or {:only => :key3} |
|
153
|
-
| :remove_empty_hashes | true | remove / ignore any hashes which don't have any key/value pairs |
|
154
|
+
| :keep_original_headers | false | keep the original headers from the CSV-file as-is. |
|
155
|
+
| | | Disables other flags manipulating the header fields. |
|
154
156
|
| :user_provided_headers | nil | *careful with that axe!* |
|
155
157
|
| | | user provided Array of header strings or symbols, to define |
|
156
158
|
| | | what headers should be used, overriding any in-file headers. |
|
@@ -159,6 +161,14 @@ The options and the block are optional.
|
|
159
161
|
| :headers_in_file | true | Whether or not the file contains headers as the first line. |
|
160
162
|
| | | Important if the file does not contain headers, |
|
161
163
|
| | | otherwise you would lose the first line of data. |
|
164
|
+
---------------------------------------------------------------------------------------------------------------------------------
|
165
|
+
| :remove_empty_values | true | remove values which have nil or empty strings as values |
|
166
|
+
| :remove_zero_values | true | remove values which have a numeric value equal to zero / 0 |
|
167
|
+
| :remove_values_matching | nil | removes key/value pairs if value matches given regular expressions. e.g.: |
|
168
|
+
| | | /^\$0\.0+$/ to match $0.00 , or /^#VALUE!$/ to match errors in Excel spreadsheets |
|
169
|
+
| :convert_values_to_numeric | true | converts strings containing Integers or Floats to the appropriate class |
|
170
|
+
| | | also accepts either {:except => [:key1,:key2]} or {:only => :key3} |
|
171
|
+
| :remove_empty_hashes | true | remove / ignore any hashes which don't have any key/value pairs |
|
162
172
|
| :file_encoding | utf-8 | Set the file encoding eg.: 'windows-1252' or 'iso-8859-1' |
|
163
173
|
| :force_simple_split | false | force simple splitting on :col_sep character for non-standard CSV-files. |
|
164
174
|
| | | e.g. when :quote_char is not properly escaped |
|
@@ -225,18 +235,21 @@ Or install it yourself as:
|
|
225
235
|
|
226
236
|
## Changes
|
227
237
|
|
238
|
+
#### 1.0.19 (2014-10-29)
|
239
|
+
* added option :keep_original_headers to keep CSV-headers as-is (thanks to Benjamin Thouret)
|
240
|
+
|
228
241
|
#### 1.0.18 (2014-10-27)
|
229
242
|
* added support for multi-line fields / csv fields containing CR (thanks to Chris Hilton) (issue #31)
|
230
|
-
|
243
|
+
|
231
244
|
#### 1.0.17 (2014-01-13)
|
232
245
|
* added option to set :row_sep to :auto , for automatic detection of the row-separator (issue #22)
|
233
246
|
|
234
247
|
#### 1.0.16 (2014-01-13)
|
235
248
|
* :convert_values_to_numeric option can now be qualified with :except or :only (thanks to Hugo Lepetit)
|
236
249
|
* removed deprecated `process_csv` method
|
237
|
-
|
250
|
+
|
238
251
|
#### 1.0.15 (2013-12-07)
|
239
|
-
* new option:
|
252
|
+
* new option:
|
240
253
|
* :remove_unmapped_keys to completely ignore columns which were not mapped with :key_mapping (thanks to Dave Sanders)
|
241
254
|
|
242
255
|
#### 1.0.14 (2013-11-01)
|
@@ -281,12 +294,12 @@ Or install it yourself as:
|
|
281
294
|
|
282
295
|
#### 1.0.4 (2012-08-17)
|
283
296
|
|
284
|
-
* renamed the following options:
|
297
|
+
* renamed the following options:
|
285
298
|
* :strip_whitepace_from_values => :strip_whitespace - removes leading/trailing whitespace from headers and values
|
286
299
|
|
287
300
|
#### 1.0.3 (2012-08-16)
|
288
301
|
|
289
|
-
* added the following options:
|
302
|
+
* added the following options:
|
290
303
|
* :strip_whitepace_from_values - removes leading/trailing whitespace from values
|
291
304
|
|
292
305
|
#### 1.0.2 (2012-08-02)
|
@@ -297,7 +310,7 @@ Or install it yourself as:
|
|
297
310
|
|
298
311
|
#### 1.0.1 (2012-07-30)
|
299
312
|
|
300
|
-
* added the following options:
|
313
|
+
* added the following options:
|
301
314
|
* :downcase_header
|
302
315
|
* :strings_as_keys
|
303
316
|
* :remove_zero_values
|
@@ -307,7 +320,7 @@ Or install it yourself as:
|
|
307
320
|
|
308
321
|
* renamed the following options:
|
309
322
|
* :remove_empty_fields => :remove_empty_values
|
310
|
-
|
323
|
+
|
311
324
|
|
312
325
|
#### 1.0.0 (2012-07-29)
|
313
326
|
|
@@ -323,15 +336,16 @@ Please [open an Issue on GitHub](https://github.com/tilo/smarter_csv/issues) if
|
|
323
336
|
|
324
337
|
## Special Thanks
|
325
338
|
|
326
|
-
Many thanks to people who have filed issues and sent comments.
|
339
|
+
Many thanks to people who have filed issues and sent comments.
|
327
340
|
And a special thanks to those who contributed pull requests:
|
328
341
|
|
342
|
+
* [Benjamin Thouret](https://github.com/benichu)
|
329
343
|
* [Chris Hilton](https://github.com/chrismhilton)
|
330
344
|
* [Sean Duckett](http://github.com/sduckett)
|
331
|
-
* [Alex Ong](http://github.com/khaong)
|
332
|
-
* [Martin Nilsson](http://github.com/MrTin)
|
333
|
-
* [Eustáquio Rangel](http://github.com/taq)
|
334
|
-
* [Pavel](http://github.com/paxa)
|
345
|
+
* [Alex Ong](http://github.com/khaong)
|
346
|
+
* [Martin Nilsson](http://github.com/MrTin)
|
347
|
+
* [Eustáquio Rangel](http://github.com/taq)
|
348
|
+
* [Pavel](http://github.com/paxa)
|
335
349
|
* [Félix Bellanger](https://github.com/Keeguon)
|
336
350
|
* [Graham Wetzler](https://github.com/grahamwetzler)
|
337
351
|
* [Marcos G. Zimmermann](https://github.com/marcosgz)
|
data/lib/smarter_csv/smarter_csv.rb
CHANGED
@@ -9,7 +9,7 @@ module SmarterCSV
|
|
9
9
|
:remove_empty_values => true, :remove_zero_values => false , :remove_values_matching => nil , :remove_empty_hashes => true , :strip_whitespace => true,
|
10
10
|
:convert_values_to_numeric => true, :strip_chars_from_headers => nil , :user_provided_headers => nil , :headers_in_file => true,
|
11
11
|
:comment_regexp => /^#/, :chunk_size => nil , :key_mapping_hash => nil , :downcase_header => true, :strings_as_keys => false, :file_encoding => 'utf-8',
|
12
|
-
:remove_unmapped_keys => false,
|
12
|
+
:remove_unmapped_keys => false, :keep_original_headers => false,
|
13
13
|
}
|
14
14
|
options = default_options.merge(options)
|
15
15
|
csv_options = options.select{|k,v| [:col_sep, :row_sep, :quote_char].include?(k)} # options.slice(:col_sep, :row_sep, :quote_char)
|
@@ -39,8 +39,10 @@ module SmarterCSV
|
|
39
39
|
end
|
40
40
|
file_headerA.map!{|x| x.gsub(%r/options[:quote_char]/,'') }
|
41
41
|
file_headerA.map!{|x| x.strip} if options[:strip_whitespace]
|
42
|
-
|
43
|
-
|
42
|
+
unless options[:keep_original_headers]
|
43
|
+
file_headerA.map!{|x| x.gsub(/\s+/,'_')}
|
44
|
+
file_headerA.map!{|x| x.downcase } if options[:downcase_header]
|
45
|
+
end
|
44
46
|
|
45
47
|
# puts "HeaderA: #{file_headerA.join(' , ')}" if options[:verbose]
|
46
48
|
|
@@ -59,7 +61,7 @@ module SmarterCSV
|
|
59
61
|
else
|
60
62
|
headerA = file_headerA
|
61
63
|
end
|
62
|
-
headerA.map!{|x| x.to_sym } unless options[:strings_as_keys]
|
64
|
+
headerA.map!{|x| x.to_sym } unless options[:strings_as_keys] || options[:keep_original_headers]
|
63
65
|
|
64
66
|
unless options[:user_provided_headers] # wouldn't make sense to re-map user provided headers
|
65
67
|
key_mappingH = options[:key_mapping]
|
@@ -90,12 +92,12 @@ module SmarterCSV
|
|
90
92
|
|
91
93
|
# cater for the quoted csv data containing the row separator carriage return character
|
92
94
|
# in which case the row data will be split across multiple lines (see the sample content in spec/fixtures/carriage_returns_rn.csv)
|
93
|
-
# by detecting the existence of an uneven number of quote characters
|
95
|
+
# by detecting the existence of an uneven number of quote characters
|
94
96
|
while line.count(options[:quote_char])%2 == 1
|
95
97
|
print "line contains uneven number of quote chars so including content of next line" if options[:verbose]
|
96
98
|
line += f.readline
|
97
99
|
end
|
98
|
-
|
100
|
+
|
99
101
|
line.chomp! # will use $/ which is set to options[:col_sep]
|
100
102
|
|
101
103
|
if (line =~ %r{#{options[:quote_char]}}) and (! options[:force_simple_split])
|
data/lib/smarter_csv/version.rb
CHANGED
data/spec/smarter_csv/keep_headers_spec.rb
ADDED
@@ -0,0 +1,24 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
fixture_path = 'spec/fixtures'
|
4
|
+
|
5
|
+
describe 'be_able_to' do
|
6
|
+
it 'not_downcase_headers' do
|
7
|
+
options = {:keep_original_headers => true}
|
8
|
+
data = SmarterCSV.process("#{fixture_path}/basic.csv", options)
|
9
|
+
data.size.should == 5
|
10
|
+
# all the keys should be string
|
11
|
+
data.each{|item| item.keys.each{|x| x.class.should be == String}}
|
12
|
+
|
13
|
+
data.each do |item|
|
14
|
+
item.keys.each do |key|
|
15
|
+
['First Name','Last Name','Dogs','Cats','Birds','Fish'].should include( key )
|
16
|
+
end
|
17
|
+
end
|
18
|
+
|
19
|
+
data.each do |h|
|
20
|
+
h.size.should <= 6
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
24
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: smarter_csv
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.18
|
4
|
+
version: 1.0.19
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- |
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2014-10-
|
12
|
+
date: 2014-10-29 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rspec
|
@@ -70,6 +70,7 @@ files:
|
|
70
70
|
- spec/smarter_csv/chunked_reading_spec.rb
|
71
71
|
- spec/smarter_csv/column_separator_spec.rb
|
72
72
|
- spec/smarter_csv/convert_values_to_numeric_spec.rb
|
73
|
+
- spec/smarter_csv/keep_headers_spec.rb
|
73
74
|
- spec/smarter_csv/key_mapping_spec.rb
|
74
75
|
- spec/smarter_csv/line_ending_spec.rb
|
75
76
|
- spec/smarter_csv/load_basic_spec.rb
|
@@ -137,6 +138,7 @@ test_files:
|
|
137
138
|
- spec/smarter_csv/chunked_reading_spec.rb
|
138
139
|
- spec/smarter_csv/column_separator_spec.rb
|
139
140
|
- spec/smarter_csv/convert_values_to_numeric_spec.rb
|
141
|
+
- spec/smarter_csv/keep_headers_spec.rb
|
140
142
|
- spec/smarter_csv/key_mapping_spec.rb
|
141
143
|
- spec/smarter_csv/line_ending_spec.rb
|
142
144
|
- spec/smarter_csv/load_basic_spec.rb
|