smarter_csv 1.1.4 → 1.1.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.travis.yml +12 -22
- data/Gemfile +3 -2
- data/README.md +19 -9
- data/lib/smarter_csv/smarter_csv.rb +11 -4
- data/lib/smarter_csv/version.rb +1 -1
- metadata +9 -7
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 042aadb2bc5426a07a64f09e781bccbd728e8052
|
4
|
+
data.tar.gz: ba48c2e303591d4027e05d1208c225381d362857
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 58cb92edabb46bdcb48598d4b4b02b5f0f09cc63378e818ac672daf8d722b5fbf1b246df5db262dff306e87943a2bb2bebbb753944adc5449b19cd5a1475c00b
|
7
|
+
data.tar.gz: 31fe30f2b2027274a5252c55b234b120327be6ce652f7ad71232bd8a920e33d30cbae42577fff398d0a61574dc17b3016cd3fa1d520eec3dd4636569cc62860e
|
data/.travis.yml
CHANGED
@@ -1,29 +1,19 @@
|
|
1
1
|
language: ruby
|
2
2
|
bundler_args: --without development
|
3
|
-
|
4
|
-
-
|
5
|
-
-
|
6
|
-
|
7
|
-
- 2.0.0
|
8
|
-
- 2.1.3
|
9
|
-
- 2.2.2
|
10
|
-
- jruby
|
11
|
-
- ruby-head
|
12
|
-
- jruby-head
|
13
|
-
- ree
|
14
|
-
- rbx
|
15
|
-
# jdk:
|
16
|
-
# - oraclejdk7
|
17
|
-
# - openjdk7
|
18
|
-
env: JRUBY_OPTS="--server -Xcompile.invokedynamic=false -J-XX:+TieredCompilation -J-XX:TieredStopAtLevel=1 -J-noverify -J-Xms512m -J-Xmx1024m"
|
3
|
+
before_install:
|
4
|
+
- gem install bundler
|
5
|
+
- gem update --system
|
6
|
+
|
19
7
|
matrix:
|
20
|
-
|
21
|
-
-
|
22
|
-
- rvm:
|
8
|
+
include:
|
9
|
+
- rvm: 2.2.8
|
10
|
+
- rvm: 2.3.5
|
11
|
+
- rvm: 2.4.2
|
12
|
+
- rvm: jruby-9.1.13.0
|
13
|
+
env:
|
14
|
+
- JRUBY_OPTS="--server -Xcompile.invokedynamic=false -J-XX:+TieredCompilation -J-XX:TieredStopAtLevel=1 -J-noverify -J-Xms512m -J-Xmx1024m"
|
23
15
|
- rvm: ruby-head
|
24
|
-
|
25
|
-
- rvm: 1.8.7
|
26
|
-
- rvm: jruby-18mode
|
16
|
+
|
27
17
|
branches:
|
28
18
|
only:
|
29
19
|
- master
|
data/Gemfile
CHANGED
data/README.md
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
# SmarterCSV
|
2
2
|
|
3
|
-
[![Build Status](https://secure.travis-ci.org/tilo/smarter_csv.
|
3
|
+
[![Build Status](https://secure.travis-ci.org/tilo/smarter_csv.svg?branch=master)](http://travis-ci.org/tilo/smarter_csv) [![Gem Version](https://badge.fury.io/rb/smarter_csv.svg)](http://badge.fury.io/rb/smarter_csv)
|
4
4
|
|
5
5
|
`smarter_csv` is a Ruby Gem for smarter importing of CSV Files as Array(s) of Hashes, suitable for direct processing with Mongoid or ActiveRecord,
|
6
6
|
and parallel processing with Resque or Sidekiq.
|
@@ -35,7 +35,10 @@ The two main choices you have in terms of how to call `SmarterCSV.process` are:
|
|
35
35
|
* passing a `:chunk_size` to the `process` method, and processing the CSV-file in chunks, rather than in one piece.
|
36
36
|
|
37
37
|
Tip: If you are uncertain about what line endings a CSV-file uses, try specifying `:row_sep => :auto` as part of the options.
|
38
|
-
But this could be slow
|
38
|
+
But this could be slow if the whole CSV file were analyzed first (prior to 1.1.5 the whole file was always analyzed).
|
39
|
+
To speed things up, you can set the option `:auto_row_sep_chars` to only analyze the first N characters of the file (default is 500; nil or 0 will check the whole file).
|
40
|
+
You can also set the `:row_sep` manually! Check out Example 5 for unusual `:row_sep` and `:col_sep`.
|
41
|
+
|
39
42
|
|
40
43
|
#### Example 1a: How SmarterCSV processes CSV-files as array of hashes:
|
41
44
|
Please note how each hash contains only the keys for columns with non-null values.
|
@@ -166,7 +169,7 @@ NOTE: If you use `key_mappings` and `value_converters`, make sure that the value
|
|
166
169
|
=> Float
|
167
170
|
|
168
171
|
## Parallel Processing
|
169
|
-
[Jack](https://github.com/xjlin0) wrote an interesting article about [Speeding up CSV parsing with parallel processing](http://xjlin0.github.io/tech/2015/05/25/faster-parsing-csv-with-parallel-processing
|
172
|
+
[Jack](https://github.com/xjlin0) wrote an interesting article about [Speeding up CSV parsing with parallel processing](http://xjlin0.github.io/tech/2015/05/25/faster-parsing-csv-with-parallel-processing)
|
170
173
|
|
171
174
|
## Documentation
|
172
175
|
|
@@ -184,6 +187,7 @@ The options and the block are optional.
|
|
184
187
|
| :col_sep | ',' | column separator |
|
185
188
|
| :row_sep | $/ ,"\n" | row separator or record separator , defaults to system's $/ , which defaults to "\n" |
|
186
189
|
| | | This can also be set to :auto, which analyzes the beginning of the CSV file (see :auto_row_sep_chars; analyzing the whole file is slow!) |
|
190
|
+
| :auto_row_sep_chars | 500 | How many characters to analyze when using `:row_sep => :auto`. nil or 0 means whole file. |
|
187
191
|
| :quote_char | '"' | quotation character |
|
188
192
|
| :comment_regexp | /^#/ | regular expression which matches comment lines (see NOTE about the CSV header) |
|
189
193
|
| :chunk_size | nil | if set, determines the desired chunk-size (defaults to nil, no chunk processing) |
|
@@ -216,7 +220,7 @@ The options and the block are optional.
|
|
216
220
|
| | | also accepts either {:except => [:key1,:key2]} or {:only => :key3} |
|
217
221
|
| :remove_empty_hashes | true | remove / ignore any hashes which don't have any key/value pairs |
|
218
222
|
| :file_encoding | utf-8 | Set the file encoding eg.: 'windows-1252' or 'iso-8859-1' |
|
219
|
-
| :force_simple_split | false | force
|
223
|
+
| :force_simple_split | false | force simple splitting on :col_sep character for non-standard CSV-files. |
|
220
224
|
| | | e.g. when :quote_char is not properly escaped |
|
221
225
|
| :verbose | false | print out line number while processing (to track down problems in input files) |
|
222
226
|
|
@@ -261,10 +265,6 @@ The options and the block are optional.
|
|
261
265
|
* some CSV files use un-escaped quotation characters inside fields. This can cause the import to break. To get around this, use the `:force_simple_split => true` option in combination with `:strip_chars_from_headers => /[\-"]/` . This will also significantly speed up the import.
|
262
266
|
If you would force a different :quote_char instead (setting it to a non-used character), then the import would be up to 5-times slower than using `:force_simple_split`.
|
263
267
|
|
264
|
-
#### Known Issues:
|
265
|
-
* if you are using 1.8.7 versions of Ruby, JRuby, or Ruby Enterprise Edition, `smarter_csv` will have problems with double-quoted fields, because of a bug in an underlying library.
|
266
|
-
|
267
|
-
|
268
268
|
## See also:
|
269
269
|
|
270
270
|
http://www.unixgods.org/~tilo/Ruby/process_csv_as_hashes.html
|
@@ -293,8 +293,14 @@ Planned in the next releases:
|
|
293
293
|
|
294
294
|
## Changes
|
295
295
|
|
296
|
+
#### 1.1.5 (2017-11-05)
|
297
|
+
* fix issue with invalid byte sequences in header (issue #103, thanks to Dave Myron)
|
298
|
+
* fix issue with invalid byte sequences in multi-line data (thanks to Ivan Ushakov)
|
299
|
+
* analyze only 500 characters by default when `:row_sep => :auto` is used.
|
300
|
+
added option `:auto_row_sep_chars` to change the default if necessary. (thanks to Matthieu Paret)
|
301
|
+
|
296
302
|
#### 1.1.4 (2017-01-16)
|
297
|
-
* fixing UTF-8 related bug which was introduced in 1.1.2 (
|
303
|
+
* fixing UTF-8 related bug which was introduced in 1.1.2 (thanks to Tirdad C.)
|
298
304
|
|
299
305
|
#### 1.1.3 (2016-12-30)
|
300
306
|
* added warning when options indicate UTF-8 processing, but input filehandle is not opened with r:UTF-8 option
|
@@ -449,6 +455,10 @@ And a special thanks to those who contributed pull requests:
|
|
449
455
|
* [Michael](https://github.com/polycarpou)
|
450
456
|
* [Kevin Coleman](https://github.com/KevinColemanInc)
|
451
457
|
* [Tirdad C.](https://github.com/tridadc)
|
458
|
+
* [Dave Myron](https://github.com/contentfree)
|
459
|
+
* [Ivan Ushakov](https://github.com/IvanUshakov)
|
460
|
+
* [Matthieu Paret](https://github.com/mtparet)
|
461
|
+
* [Rohit Amarnath](https://github.com/ramarnat)
|
452
462
|
|
453
463
|
|
454
464
|
## Contributing
|
@@ -9,7 +9,8 @@ module SmarterCSV
|
|
9
9
|
:remove_empty_values => true, :remove_zero_values => false , :remove_values_matching => nil , :remove_empty_hashes => true , :strip_whitespace => true,
|
10
10
|
:convert_values_to_numeric => true, :strip_chars_from_headers => nil , :user_provided_headers => nil , :headers_in_file => true,
|
11
11
|
:comment_regexp => /^#/, :chunk_size => nil , :key_mapping_hash => nil , :downcase_header => true, :strings_as_keys => false, :file_encoding => 'utf-8',
|
12
|
-
:remove_unmapped_keys => false, :keep_original_headers => false, :value_converters => nil, :skip_lines => nil, :force_utf8 => false, :invalid_byte_sequence => ''
|
12
|
+
:remove_unmapped_keys => false, :keep_original_headers => false, :value_converters => nil, :skip_lines => nil, :force_utf8 => false, :invalid_byte_sequence => '',
|
13
|
+
:auto_row_sep_chars => 500
|
13
14
|
}
|
14
15
|
options = default_options.merge(options)
|
15
16
|
options[:invalid_byte_sequence] = '' if options[:invalid_byte_sequence].nil?
|
@@ -27,7 +28,7 @@ module SmarterCSV
|
|
27
28
|
end
|
28
29
|
|
29
30
|
if options[:row_sep] == :auto
|
30
|
-
options[:row_sep] =
|
31
|
+
options[:row_sep] = line_ending = SmarterCSV.guess_line_ending( f, options )
|
31
32
|
f.rewind
|
32
33
|
end
|
33
34
|
$/ = options[:row_sep]
|
@@ -39,8 +40,9 @@ module SmarterCSV
|
|
39
40
|
if options[:headers_in_file] # extract the header line
|
40
41
|
# process the header line in the CSV file..
|
41
42
|
# the first line of a CSV file contains the header .. it might be commented out, so we need to read it anyhow
|
42
|
-
header = f.readline
|
43
|
+
header = f.readline
|
43
44
|
header = header.force_encoding('utf-8').encode('utf-8', invalid: :replace, undef: :replace, replace: options[:invalid_byte_sequence]) if options[:force_utf8] || options[:file_encoding] !~ /utf-8/i
|
45
|
+
header = header.sub(options[:comment_regexp],'').chomp(options[:row_sep])
|
44
46
|
|
45
47
|
file_line_count += 1
|
46
48
|
csv_line_count += 1
|
@@ -118,7 +120,9 @@ module SmarterCSV
|
|
118
120
|
# by detecting the existence of an uneven number of quote characters
|
119
121
|
multiline = line.count(options[:quote_char])%2 == 1
|
120
122
|
while line.count(options[:quote_char])%2 == 1
|
121
|
-
|
123
|
+
next_line = f.readline
|
124
|
+
next_line = next_line.force_encoding('utf-8').encode('utf-8', invalid: :replace, undef: :replace, replace: options[:invalid_byte_sequence]) if options[:force_utf8] || options[:file_encoding] !~ /utf-8/i
|
125
|
+
line += next_line
|
122
126
|
file_line_count += 1
|
123
127
|
end
|
124
128
|
print "\nline contains uneven number of quote chars so including content through file line %d\n" % file_line_count if options[:verbose] && multiline
|
@@ -251,6 +255,7 @@ module SmarterCSV
|
|
251
255
|
# count how many of the pre-defined line-endings we find
|
252
256
|
# ignoring those contained within quote characters
|
253
257
|
last_char = nil
|
258
|
+
lines = 0
|
254
259
|
filehandle.each_char do |c|
|
255
260
|
quoted_char = !quoted_char if c == options[:quote_char]
|
256
261
|
next if quoted_char
|
@@ -265,6 +270,8 @@ module SmarterCSV
|
|
265
270
|
counts["\n"] += 1
|
266
271
|
end
|
267
272
|
last_char = c
|
273
|
+
lines += 1
|
274
|
+
break if options[:auto_row_sep_chars] && options[:auto_row_sep_chars] > 0 && lines >= options[:auto_row_sep_chars]
|
268
275
|
end
|
269
276
|
counts["\r"] += 1 if last_char == "\r"
|
270
277
|
# find the key/value pair with the largest counter:
|
data/lib/smarter_csv/version.rb
CHANGED
metadata
CHANGED
@@ -1,15 +1,16 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: smarter_csv
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.1.
|
4
|
+
version: 1.1.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
|
-
-
|
8
|
-
|
7
|
+
- 'Tilo Sloboda
|
8
|
+
|
9
|
+
'
|
9
10
|
autorequire:
|
10
11
|
bindir: bin
|
11
12
|
cert_chain: []
|
12
|
-
date: 2017-
|
13
|
+
date: 2017-11-06 00:00:00.000000000 Z
|
13
14
|
dependencies:
|
14
15
|
- !ruby/object:Gem::Dependency
|
15
16
|
name: rspec
|
@@ -29,8 +30,9 @@ description: Ruby Gem for smarter importing of CSV Files as Array(s) of Hashes,
|
|
29
30
|
optional features for processing large files in parallel, embedded comments, unusual
|
30
31
|
field- and record-separators, flexible mapping of CSV-headers to Hash-keys
|
31
32
|
email:
|
32
|
-
-
|
33
|
-
|
33
|
+
- 'tilo.sloboda@gmail.com
|
34
|
+
|
35
|
+
'
|
34
36
|
executables: []
|
35
37
|
extensions: []
|
36
38
|
extra_rdoc_files: []
|
@@ -123,7 +125,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
123
125
|
requirements:
|
124
126
|
- csv
|
125
127
|
rubyforge_project:
|
126
|
-
rubygems_version: 2.
|
128
|
+
rubygems_version: 2.6.13
|
127
129
|
signing_key:
|
128
130
|
specification_version: 4
|
129
131
|
summary: Ruby Gem for smarter importing of CSV Files (and CSV-like files), with lots
|