smarter_csv 1.1.4 → 1.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +12 -22
- data/Gemfile +3 -2
- data/README.md +19 -9
- data/lib/smarter_csv/smarter_csv.rb +11 -4
- data/lib/smarter_csv/version.rb +1 -1
- metadata +9 -7
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 042aadb2bc5426a07a64f09e781bccbd728e8052
|
4
|
+
data.tar.gz: ba48c2e303591d4027e05d1208c225381d362857
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 58cb92edabb46bdcb48598d4b4b02b5f0f09cc63378e818ac672daf8d722b5fbf1b246df5db262dff306e87943a2bb2bebbb753944adc5449b19cd5a1475c00b
|
7
|
+
data.tar.gz: 31fe30f2b2027274a5252c55b234b120327be6ce652f7ad71232bd8a920e33d30cbae42577fff398d0a61574dc17b3016cd3fa1d520eec3dd4636569cc62860e
|
data/.travis.yml
CHANGED
@@ -1,29 +1,19 @@
|
|
1
1
|
language: ruby
|
2
2
|
bundler_args: --without development
|
3
|
-
|
4
|
-
-
|
5
|
-
-
|
6
|
-
|
7
|
-
- 2.0.0
|
8
|
-
- 2.1.3
|
9
|
-
- 2.2.2
|
10
|
-
- jruby
|
11
|
-
- ruby-head
|
12
|
-
- jruby-head
|
13
|
-
- ree
|
14
|
-
- rbx
|
15
|
-
# jdk:
|
16
|
-
# - oraclejdk7
|
17
|
-
# - openjdk7
|
18
|
-
env: JRUBY_OPTS="--server -Xcompile.invokedynamic=false -J-XX:+TieredCompilation -J-XX:TieredStopAtLevel=1 -J-noverify -J-Xms512m -J-Xmx1024m"
|
3
|
+
before_install:
|
4
|
+
- gem install bundler
|
5
|
+
- gem update --system
|
6
|
+
|
19
7
|
matrix:
|
20
|
-
|
21
|
-
-
|
22
|
-
- rvm:
|
8
|
+
include:
|
9
|
+
- rvm: 2.2.8
|
10
|
+
- rvm: 2.3.5
|
11
|
+
- rvm: 2.4.2
|
12
|
+
- rvm: jruby-9.1.13.0
|
13
|
+
env:
|
14
|
+
- JRUBY_OPTS="--server -Xcompile.invokedynamic=false -J-XX:+TieredCompilation -J-XX:TieredStopAtLevel=1 -J-noverify -J-Xms512m -J-Xmx1024m"
|
23
15
|
- rvm: ruby-head
|
24
|
-
|
25
|
-
- rvm: 1.8.7
|
26
|
-
- rvm: jruby-18mode
|
16
|
+
|
27
17
|
branches:
|
28
18
|
only:
|
29
19
|
- master
|
data/Gemfile
CHANGED
data/README.md
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
# SmarterCSV
|
2
2
|
|
3
|
-
[](http://travis-ci.org/tilo/smarter_csv) [](http://badge.fury.io/rb/smarter_csv)
|
4
4
|
|
5
5
|
`smarter_csv` is a Ruby Gem for smarter importing of CSV Files as Array(s) of Hashes, suitable for direct processing with Mongoid or ActiveRecord,
|
6
6
|
and parallel processing with Resque or Sidekiq.
|
@@ -35,7 +35,10 @@ The two main choices you have in terms of how to call `SmarterCSV.process` are:
|
|
35
35
|
* passing a `:chunk_size` to the `process` method, and processing the CSV-file in chunks, rather than in one piece.
|
36
36
|
|
37
37
|
Tip: If you are uncertain about what line endings a CSV-file uses, try specifying `:row_sep => :auto` as part of the options.
|
38
|
-
But this could be slow
|
38
|
+
But this could be slow if the whole CSV file had to be analyzed first (prior to 1.1.5, the whole file was analyzed).
|
39
|
+
To speed things up, you can set the option `:auto_row_sep_chars` to only analyze the first N characters of the file (default is 500; nil or 0 will check the whole file).
|
40
|
+
You can also set the `:row_sep` manually! Check out Example 5 for unusual `:row_sep` and `:col_sep`.
|
41
|
+
|
39
42
|
|
40
43
|
#### Example 1a: How SmarterCSV processes CSV-files as array of hashes:
|
41
44
|
Please note how each hash contains only the keys for columns with non-null values.
|
@@ -166,7 +169,7 @@ NOTE: If you use `key_mappings` and `value_converters`, make sure that the value
|
|
166
169
|
=> Float
|
167
170
|
|
168
171
|
## Parallel Processing
|
169
|
-
[Jack](https://github.com/xjlin0) wrote an interesting article about [Speeding up CSV parsing with parallel processing](http://xjlin0.github.io/tech/2015/05/25/faster-parsing-csv-with-parallel-processing
|
172
|
+
[Jack](https://github.com/xjlin0) wrote an interesting article about [Speeding up CSV parsing with parallel processing](http://xjlin0.github.io/tech/2015/05/25/faster-parsing-csv-with-parallel-processing)
|
170
173
|
|
171
174
|
## Documentation
|
172
175
|
|
@@ -184,6 +187,7 @@ The options and the block are optional.
|
|
184
187
|
| :col_sep | ',' | column separator |
|
185
188
|
| :row_sep | $/ ,"\n" | row separator or record separator , defaults to system's $/ , which defaults to "\n" |
|
186
189
|
| | | This can also be set to :auto, but will process the whole CSV file first (slow!) |
|
190
|
+
| :auto_row_sep_chars | 500 | How many characters to analyze when using `:row_sep => :auto`. nil or 0 means whole file. |
|
187
191
|
| :quote_char | '"' | quotation character |
|
188
192
|
| :comment_regexp | /^#/ | regular expression which matches comment lines (see NOTE about the CSV header) |
|
189
193
|
| :chunk_size | nil | if set, determines the desired chunk-size (defaults to nil, no chunk processing) |
|
@@ -216,7 +220,7 @@ The options and the block are optional.
|
|
216
220
|
| | | also accepts either {:except => [:key1,:key2]} or {:only => :key3} |
|
217
221
|
| :remove_empty_hashes | true | remove / ignore any hashes which don't have any key/value pairs |
|
218
222
|
| :file_encoding | utf-8 | Set the file encoding eg.: 'windows-1252' or 'iso-8859-1' |
|
219
|
-
| :force_simple_split | false | force
|
223
|
+
| :force_simple_split | false | force simple splitting on :col_sep character for non-standard CSV-files. |
|
220
224
|
| | | e.g. when :quote_char is not properly escaped |
|
221
225
|
| :verbose | false | print out line number while processing (to track down problems in input files) |
|
222
226
|
|
@@ -261,10 +265,6 @@ The options and the block are optional.
|
|
261
265
|
* some CSV files use un-escaped quotation characters inside fields. This can cause the import to break. To get around this, use the `:force_simple_split => true` option in combination with `:strip_chars_from_headers => /[\-"]/` . This will also significantly speed up the import.
|
262
266
|
If you would force a different :quote_char instead (setting it to a non-used character), then the import would be up to 5-times slower than using `:force_simple_split`.
|
263
267
|
|
264
|
-
#### Known Issues:
|
265
|
-
* if you are using 1.8.7 versions of Ruby, JRuby, or Ruby Enterprise Edition, `smarter_csv` will have problems with double-quoted fields, because of a bug in an underlying library.
|
266
|
-
|
267
|
-
|
268
268
|
## See also:
|
269
269
|
|
270
270
|
http://www.unixgods.org/~tilo/Ruby/process_csv_as_hashes.html
|
@@ -293,8 +293,14 @@ Planned in the next releases:
|
|
293
293
|
|
294
294
|
## Changes
|
295
295
|
|
296
|
+
#### 1.1.5 (2017-11-05)
|
297
|
+
* fix issue with invalid byte sequences in header (issue #103, thanks to Dave Myron)
|
298
|
+
* fix issue with invalid byte sequences in multi-line data (thanks to Ivan Ushakov)
|
299
|
+
* analyze only 500 characters by default when `:row_sep => :auto` is used.
|
300
|
+
added option `auto_row_sep_chars` to change the default if necessary. (thanks to Matthieu Paret)
|
301
|
+
|
296
302
|
#### 1.1.4 (2017-01-16)
|
297
|
-
* fixing UTF-8 related bug which was introduced in 1.1.2 (
|
303
|
+
* fixing UTF-8 related bug which was introduced in 1.1.2 (thanks to Tirdad C.)
|
298
304
|
|
299
305
|
#### 1.1.3 (2016-12-30)
|
300
306
|
* added warning when options indicate UTF-8 processing, but input filehandle is not opened with r:UTF-8 option
|
@@ -449,6 +455,10 @@ And a special thanks to those who contributed pull requests:
|
|
449
455
|
* [Michael](https://github.com/polycarpou)
|
450
456
|
* [Kevin Coleman](https://github.com/KevinColemanInc)
|
451
457
|
* [Tirdad C.](https://github.com/tridadc)
|
458
|
+
* [Dave Myron](https://github.com/contentfree)
|
459
|
+
* [Ivan Ushakov](https://github.com/IvanUshakov)
|
460
|
+
* [Matthieu Paret](https://github.com/mtparet)
|
461
|
+
* [Rohit Amarnath](https://github.com/ramarnat)
|
452
462
|
|
453
463
|
|
454
464
|
## Contributing
|
@@ -9,7 +9,8 @@ module SmarterCSV
|
|
9
9
|
:remove_empty_values => true, :remove_zero_values => false , :remove_values_matching => nil , :remove_empty_hashes => true , :strip_whitespace => true,
|
10
10
|
:convert_values_to_numeric => true, :strip_chars_from_headers => nil , :user_provided_headers => nil , :headers_in_file => true,
|
11
11
|
:comment_regexp => /^#/, :chunk_size => nil , :key_mapping_hash => nil , :downcase_header => true, :strings_as_keys => false, :file_encoding => 'utf-8',
|
12
|
-
:remove_unmapped_keys => false, :keep_original_headers => false, :value_converters => nil, :skip_lines => nil, :force_utf8 => false, :invalid_byte_sequence => ''
|
12
|
+
:remove_unmapped_keys => false, :keep_original_headers => false, :value_converters => nil, :skip_lines => nil, :force_utf8 => false, :invalid_byte_sequence => '',
|
13
|
+
:auto_row_sep_chars => 500
|
13
14
|
}
|
14
15
|
options = default_options.merge(options)
|
15
16
|
options[:invalid_byte_sequence] = '' if options[:invalid_byte_sequence].nil?
|
@@ -27,7 +28,7 @@ module SmarterCSV
|
|
27
28
|
end
|
28
29
|
|
29
30
|
if options[:row_sep] == :auto
|
30
|
-
options[:row_sep] =
|
31
|
+
options[:row_sep] = line_ending = SmarterCSV.guess_line_ending( f, options )
|
31
32
|
f.rewind
|
32
33
|
end
|
33
34
|
$/ = options[:row_sep]
|
@@ -39,8 +40,9 @@ module SmarterCSV
|
|
39
40
|
if options[:headers_in_file] # extract the header line
|
40
41
|
# process the header line in the CSV file..
|
41
42
|
# the first line of a CSV file contains the header .. it might be commented out, so we need to read it anyhow
|
42
|
-
header = f.readline
|
43
|
+
header = f.readline
|
43
44
|
header = header.force_encoding('utf-8').encode('utf-8', invalid: :replace, undef: :replace, replace: options[:invalid_byte_sequence]) if options[:force_utf8] || options[:file_encoding] !~ /utf-8/i
|
45
|
+
header = header.sub(options[:comment_regexp],'').chomp(options[:row_sep])
|
44
46
|
|
45
47
|
file_line_count += 1
|
46
48
|
csv_line_count += 1
|
@@ -118,7 +120,9 @@ module SmarterCSV
|
|
118
120
|
# by detecting the existence of an uneven number of quote characters
|
119
121
|
multiline = line.count(options[:quote_char])%2 == 1
|
120
122
|
while line.count(options[:quote_char])%2 == 1
|
121
|
-
|
123
|
+
next_line = f.readline
|
124
|
+
next_line = next_line.force_encoding('utf-8').encode('utf-8', invalid: :replace, undef: :replace, replace: options[:invalid_byte_sequence]) if options[:force_utf8] || options[:file_encoding] !~ /utf-8/i
|
125
|
+
line += next_line
|
122
126
|
file_line_count += 1
|
123
127
|
end
|
124
128
|
print "\nline contains uneven number of quote chars so including content through file line %d\n" % file_line_count if options[:verbose] && multiline
|
@@ -251,6 +255,7 @@ module SmarterCSV
|
|
251
255
|
# count how many of the pre-defined line-endings we find
|
252
256
|
# ignoring those contained within quote characters
|
253
257
|
last_char = nil
|
258
|
+
lines = 0
|
254
259
|
filehandle.each_char do |c|
|
255
260
|
quoted_char = !quoted_char if c == options[:quote_char]
|
256
261
|
next if quoted_char
|
@@ -265,6 +270,8 @@ module SmarterCSV
|
|
265
270
|
counts["\n"] += 1
|
266
271
|
end
|
267
272
|
last_char = c
|
273
|
+
lines += 1
|
274
|
+
break if options[:auto_row_sep_chars] && options[:auto_row_sep_chars] > 0 && lines >= options[:auto_row_sep_chars]
|
268
275
|
end
|
269
276
|
counts["\r"] += 1 if last_char == "\r"
|
270
277
|
# find the key/value pair with the largest counter:
|
data/lib/smarter_csv/version.rb
CHANGED
metadata
CHANGED
@@ -1,15 +1,16 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: smarter_csv
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.1.
|
4
|
+
version: 1.1.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
|
-
-
|
8
|
-
|
7
|
+
- 'Tilo Sloboda
|
8
|
+
|
9
|
+
'
|
9
10
|
autorequire:
|
10
11
|
bindir: bin
|
11
12
|
cert_chain: []
|
12
|
-
date: 2017-
|
13
|
+
date: 2017-11-06 00:00:00.000000000 Z
|
13
14
|
dependencies:
|
14
15
|
- !ruby/object:Gem::Dependency
|
15
16
|
name: rspec
|
@@ -29,8 +30,9 @@ description: Ruby Gem for smarter importing of CSV Files as Array(s) of Hashes,
|
|
29
30
|
optional features for processing large files in parallel, embedded comments, unusual
|
30
31
|
field- and record-separators, flexible mapping of CSV-headers to Hash-keys
|
31
32
|
email:
|
32
|
-
-
|
33
|
-
|
33
|
+
- 'tilo.sloboda@gmail.com
|
34
|
+
|
35
|
+
'
|
34
36
|
executables: []
|
35
37
|
extensions: []
|
36
38
|
extra_rdoc_files: []
|
@@ -123,7 +125,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
123
125
|
requirements:
|
124
126
|
- csv
|
125
127
|
rubyforge_project:
|
126
|
-
rubygems_version: 2.
|
128
|
+
rubygems_version: 2.6.13
|
127
129
|
signing_key:
|
128
130
|
specification_version: 4
|
129
131
|
summary: Ruby Gem for smarter importing of CSV Files (and CSV-like files), with lots
|