smarter_csv 1.1.5 → 1.12.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.rspec +1 -2
- data/.rubocop.yml +154 -0
- data/CHANGELOG.md +364 -0
- data/CONTRIBUTORS.md +56 -0
- data/Gemfile +7 -2
- data/LICENSE.txt +21 -0
- data/README.md +44 -441
- data/Rakefile +39 -19
- data/TO_DO_v2.md +14 -0
- data/docs/_introduction.md +56 -0
- data/docs/basic_api.md +157 -0
- data/docs/batch_processing.md +68 -0
- data/docs/data_transformations.md +50 -0
- data/docs/examples.md +75 -0
- data/docs/header_transformations.md +113 -0
- data/docs/header_validations.md +36 -0
- data/docs/options.md +98 -0
- data/docs/row_col_sep.md +104 -0
- data/docs/value_converters.md +68 -0
- data/ext/smarter_csv/extconf.rb +14 -0
- data/ext/smarter_csv/smarter_csv.c +97 -0
- data/lib/smarter_csv/auto_detection.rb +78 -0
- data/lib/smarter_csv/errors.rb +16 -0
- data/lib/smarter_csv/file_io.rb +50 -0
- data/lib/smarter_csv/hash_transformations.rb +91 -0
- data/lib/smarter_csv/header_transformations.rb +63 -0
- data/lib/smarter_csv/header_validations.rb +34 -0
- data/lib/smarter_csv/headers.rb +68 -0
- data/lib/smarter_csv/options.rb +95 -0
- data/lib/smarter_csv/parser.rb +90 -0
- data/lib/smarter_csv/reader.rb +243 -0
- data/lib/smarter_csv/version.rb +3 -1
- data/lib/smarter_csv/writer.rb +116 -0
- data/lib/smarter_csv.rb +91 -3
- data/smarter_csv.gemspec +43 -20
- metadata +122 -137
- data/.gitignore +0 -8
- data/.travis.yml +0 -19
- data/lib/extensions/hash.rb +0 -7
- data/lib/smarter_csv/smarter_csv.rb +0 -281
- data/spec/fixtures/basic.csv +0 -8
- data/spec/fixtures/binary.csv +0 -1
- data/spec/fixtures/carriage_returns_n.csv +0 -18
- data/spec/fixtures/carriage_returns_quoted.csv +0 -3
- data/spec/fixtures/carriage_returns_r.csv +0 -1
- data/spec/fixtures/carriage_returns_rn.csv +0 -18
- data/spec/fixtures/chunk_cornercase.csv +0 -10
- data/spec/fixtures/empty.csv +0 -5
- data/spec/fixtures/line_endings_n.csv +0 -4
- data/spec/fixtures/line_endings_r.csv +0 -1
- data/spec/fixtures/line_endings_rn.csv +0 -4
- data/spec/fixtures/lots_of_columns.csv +0 -2
- data/spec/fixtures/malformed.csv +0 -3
- data/spec/fixtures/malformed_header.csv +0 -3
- data/spec/fixtures/money.csv +0 -3
- data/spec/fixtures/no_header.csv +0 -7
- data/spec/fixtures/numeric.csv +0 -5
- data/spec/fixtures/pets.csv +0 -5
- data/spec/fixtures/quoted.csv +0 -5
- data/spec/fixtures/separator.csv +0 -4
- data/spec/fixtures/skip_lines.csv +0 -8
- data/spec/fixtures/valid_unicode.csv +0 -5
- data/spec/fixtures/with_dashes.csv +0 -8
- data/spec/fixtures/with_dates.csv +0 -4
- data/spec/smarter_csv/binary_file2_spec.rb +0 -24
- data/spec/smarter_csv/binary_file_spec.rb +0 -22
- data/spec/smarter_csv/carriage_return_spec.rb +0 -170
- data/spec/smarter_csv/chunked_reading_spec.rb +0 -14
- data/spec/smarter_csv/close_file_spec.rb +0 -15
- data/spec/smarter_csv/column_separator_spec.rb +0 -11
- data/spec/smarter_csv/convert_values_to_numeric_spec.rb +0 -48
- data/spec/smarter_csv/extenstions_spec.rb +0 -17
- data/spec/smarter_csv/header_transformation_spec.rb +0 -21
- data/spec/smarter_csv/keep_headers_spec.rb +0 -24
- data/spec/smarter_csv/key_mapping_spec.rb +0 -25
- data/spec/smarter_csv/line_ending_spec.rb +0 -43
- data/spec/smarter_csv/load_basic_spec.rb +0 -20
- data/spec/smarter_csv/malformed_spec.rb +0 -21
- data/spec/smarter_csv/no_header_spec.rb +0 -24
- data/spec/smarter_csv/not_downcase_header_spec.rb +0 -24
- data/spec/smarter_csv/quoted_spec.rb +0 -23
- data/spec/smarter_csv/remove_empty_values_spec.rb +0 -13
- data/spec/smarter_csv/remove_keys_from_hashes_spec.rb +0 -25
- data/spec/smarter_csv/remove_not_mapped_keys_spec.rb +0 -35
- data/spec/smarter_csv/remove_values_matching_spec.rb +0 -26
- data/spec/smarter_csv/remove_zero_values_spec.rb +0 -25
- data/spec/smarter_csv/skip_lines_spec.rb +0 -29
- data/spec/smarter_csv/strings_as_keys_spec.rb +0 -24
- data/spec/smarter_csv/strip_chars_from_headers_spec.rb +0 -24
- data/spec/smarter_csv/valid_unicode_spec.rb +0 -94
- data/spec/smarter_csv/value_converters_spec.rb +0 -52
- data/spec/spec/spec_helper.rb +0 -17
- data/spec/spec.opts +0 -2
- data/spec/spec_helper.rb +0 -21
data/docs/row_col_sep.md
ADDED
@@ -0,0 +1,104 @@
|
|
1
|
+
|
2
|
+
### Contents
|
3
|
+
|
4
|
+
* [Introduction](./_introduction.md)
|
5
|
+
* [The Basic API](./basic_api.md)
|
6
|
+
* [Batch Processing](./batch_processing.md)
|
7
|
+
* [Configuration Options](./options.md)
|
8
|
+
* [**Row and Column Separators**](./row_col_sep.md)
|
9
|
+
* [Header Transformations](./header_transformations.md)
|
10
|
+
* [Header Validations](./header_validations.md)
|
11
|
+
* [Data Transformations](./data_transformations.md)
|
12
|
+
* [Value Converters](./value_converters.md)
|
13
|
+
|
14
|
+
--------------
|
15
|
+
|
16
|
+
# Row and Column Separators
|
17
|
+
|
18
|
+
## Automatic Detection
|
19
|
+
|
20
|
+
Convenient defaults allow automatic detection of the column and row separators: `row_sep: :auto`, `col_sep: :auto`. This makes it easier to process any CSV files without having to examine the line endings or column separators, e.g. when users upload CSV files to your service and you have no control over the incoming files.
|
21
|
+
|
22
|
+
You can change the setting `:auto_row_sep_chars` to only analyze the first N characters of the file (default is 500 characters; `nil` or `0` will check the whole file). Of course you can also set the `:row_sep` manually.
|
23
|
+
|
24
|
+
|
25
|
+
## Column Separator `col_sep`
|
26
|
+
|
27
|
+
The automatic detection of column separators considers: `,`, `\t`, `;`, `:`, `|`.
|
28
|
+
|
29
|
+
Some CSV files may contain an unusual column separator, which could even be a control character.
|
30
|
+
|
31
|
+
## Row Separator `row_sep`
|
32
|
+
|
33
|
+
The automatic detection of row separators considers: `\n`, `\r\n`, `\r`.
|
34
|
+
|
35
|
+
Some CSV files may contain an unusual row separator, which could even be a control character.
|
36
|
+
|
37
|
+
|
38
|
+
## Custom / Non-Standard CSV Formats
|
39
|
+
|
40
|
+
Besides custom values for `col_sep`, `row_sep`, some other customizations of CSV files are:
|
41
|
+
* the presence of a number of leading lines before the header or data section starts.
|
42
|
+
* the presence of comment lines, e.g. lines starting with `#`
|
43
|
+
|
44
|
+
To explore these special cases, please use the following examples.
|
45
|
+
|
46
|
+
### Example 1: reading an iTunes DB dump
|
47
|
+
|
48
|
+
This data format uses CTRL-A as the column separator, and CTRL-B as the record separator. It also has comment lines that start with a `#` character. This also maps the header `name` to `genre`, and ignores the column `export_date`.
|
49
|
+
|
50
|
+
```ruby
|
51
|
+
filename = '/tmp/itunes_db_dump'
|
52
|
+
options = {
|
53
|
+
:col_sep => "\cA", :row_sep => "\cB\n", :comment_regexp => /^#/,
|
54
|
+
:chunk_size => 100 , :key_mapping => {export_date: nil, name: :genre},
|
55
|
+
}
|
56
|
+
n = SmarterCSV.process(filename, options) do |chunk|
|
57
|
+
SidekiqWorkerClass.process_async(chunk) # pass an array of hashes to Sidekiq workers for parallel processing
|
58
|
+
end
|
59
|
+
=> returns number of chunks
|
60
|
+
```
|
61
|
+
|
62
|
+
### Example 2: Reading a CSV-File with custom col_sep, row_sep
|
63
|
+
In this example we have an unusual CSV file with `|` as the row separator, and `#` as the column separator.
|
64
|
+
This unusual format needs explicit options `col_sep` and `row_sep`.
|
65
|
+
|
66
|
+
```ruby
|
67
|
+
filename = '/tmp/input_file.txt'
|
68
|
+
recordsA = SmarterCSV.process(filename, {col_sep: "#", row_sep: "|"})
|
69
|
+
|
70
|
+
=> returns an array of hashes
|
71
|
+
```
|
72
|
+
|
73
|
+
### Example 3: Skipping leading lines with `skip_lines`
|
74
|
+
In this example, we use `skip_lines: 3` to skip and ignore the first 3 lines in the input
|
75
|
+
|
76
|
+
|
77
|
+
```ruby
|
78
|
+
filename = '/tmp/input_file.txt'
|
79
|
+
recordsA = SmarterCSV.process(filename, {skip_lines: 3})
|
80
|
+
|
81
|
+
=> returns an array of hashes
|
82
|
+
```
|
83
|
+
|
84
|
+
|
85
|
+
### Example 4: Ignoring comment lines with `comment_regexp`
|
86
|
+
|
87
|
+
In this example, we use `comment_regexp` to filter out and ignore any lines starting with `#`
|
88
|
+
|
89
|
+
|
90
|
+
```ruby
|
91
|
+
# Consider a file with CTRL-A as col_separator, and with CTRL-B\n as record_separator (hello iTunes!)
|
92
|
+
filename = '/tmp/strange_db_dump'
|
93
|
+
options = {
|
94
|
+
:col_sep => "\cA", :row_sep => "\cB\n", :comment_regexp => /^#/,
|
95
|
+
:chunk_size => 100 , :key_mapping => {:export_date => nil, :name => :genre},
|
96
|
+
}
|
97
|
+
n = SmarterCSV.process(filename, options) do |chunk|
|
98
|
+
SidekiqWorkerClass.process_async(chunk) # pass an array of hashes to Sidekiq workers for parallel processing
|
99
|
+
end
|
100
|
+
=> returns number of chunks
|
101
|
+
```
|
102
|
+
|
103
|
+
----------------
|
104
|
+
PREVIOUS: [Configuration Options](./options.md) | NEXT: [Header Transformations](./header_transformations.md)
|
@@ -0,0 +1,68 @@
|
|
1
|
+
|
2
|
+
### Contents
|
3
|
+
|
4
|
+
* [Introduction](./_introduction.md)
|
5
|
+
* [The Basic API](./basic_api.md)
|
6
|
+
* [Batch Processing](./batch_processing.md)
|
7
|
+
* [Configuration Options](./options.md)
|
8
|
+
* [Row and Column Separators](./row_col_sep.md)
|
9
|
+
* [Header Transformations](./header_transformations.md)
|
10
|
+
* [Header Validations](./header_validations.md)
|
11
|
+
* [Data Transformations](./data_transformations.md)
|
12
|
+
* [**Value Converters**](./value_converters.md)
|
13
|
+
|
14
|
+
--------------
|
15
|
+
|
16
|
+
# Using Value Converters
|
17
|
+
|
18
|
+
Value Converters allow you to do custom transformations on specific columns, to help you massage the data so it fits the expectations of your down-stream process, such as creating a DB record.
|
19
|
+
|
20
|
+
If you use `key_mapping` and `value_converters`, make sure that the value converters reference the keys based on the final mapped name, not the original name in the CSV file.
|
21
|
+
|
22
|
+
```ruby
|
23
|
+
$ cat spec/fixtures/with_dates.csv
|
24
|
+
first,last,date,price
|
25
|
+
Ben,Miller,10/30/1998,$44.50
|
26
|
+
Tom,Turner,2/1/2011,$15.99
|
27
|
+
Ken,Smith,01/09/2013,$199.99
|
28
|
+
|
29
|
+
$ irb
|
30
|
+
> require 'smarter_csv'
|
31
|
+
> require 'date'
|
32
|
+
|
33
|
+
# define a custom converter class, which implements self.convert(value)
|
34
|
+
class DateConverter
|
35
|
+
def self.convert(value)
|
36
|
+
Date.strptime( value, '%m/%d/%Y') # parses custom date format into Date instance
|
37
|
+
end
|
38
|
+
end
|
39
|
+
|
40
|
+
class DollarConverter
|
41
|
+
def self.convert(value)
|
42
|
+
value.sub('$','').to_f # strips the dollar sign and creates a Float value
|
43
|
+
end
|
44
|
+
end
|
45
|
+
|
46
|
+
require 'money'
|
47
|
+
class MoneyConverter
|
48
|
+
def self.convert(value)
|
49
|
+
# depending on locale you might want to also remove the indicator for thousands, e.g. comma
|
50
|
+
Money.from_amount(value.gsub(/[\s\$]/,'').to_f) # creates a Money instance (based on cents)
|
51
|
+
end
|
52
|
+
end
|
53
|
+
|
54
|
+
options = {:value_converters => {:date => DateConverter, :price => DollarConverter}}
|
55
|
+
data = SmarterCSV.process("spec/fixtures/with_dates.csv", options)
|
56
|
+
first_record = data.first
|
57
|
+
first_record[:date]
|
58
|
+
=> #<Date: 1998-10-30 ((2451117j,0s,0n),+0s,2299161j)>
|
59
|
+
first_record[:date].class
|
60
|
+
=> Date
|
61
|
+
first_record[:price]
|
62
|
+
=> 44.50
|
63
|
+
first_record[:price].class
|
64
|
+
=> Float
|
65
|
+
```
|
66
|
+
|
67
|
+
--------------------
|
68
|
+
PREVIOUS: [Data Transformations](./data_transformations.md) | UP: [README](../README.md)
|
@@ -0,0 +1,14 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'mkmf'
|
4
|
+
require "rbconfig"
|
5
|
+
|
6
|
+
# Ruby's recorded CFLAGS may contain the pair "-g -O3"; when it does, swap it
# for "-O3 $(cflags)" before the Makefile is generated.
configured_cflags = RbConfig::MAKEFILE_CONFIG["CFLAGS"]

if configured_cflags.include?("-g -O3")
  fixed_CFLAGS = configured_cflags.sub("-g -O3", "-O3 $(cflags)")
  puts("Fix CFLAGS: #{configured_cflags} -> #{fixed_CFLAGS}")
  RbConfig::MAKEFILE_CONFIG["CFLAGS"] = fixed_CFLAGS
end

# Optimisation level handed to mkmf.
CONFIG["optflags"] = "-O3"

# Writes the Makefile for the smarter_csv/smarter_csv extension.
create_makefile('smarter_csv/smarter_csv')
|
@@ -0,0 +1,97 @@
|
|
1
|
+
#include "ruby.h"
|
2
|
+
#include "ruby/encoding.h"
|
3
|
+
#include <stdio.h>
|
4
|
+
#include <stdbool.h>
|
5
|
+
|
6
|
+
#ifndef bool
|
7
|
+
#define bool int
|
8
|
+
#define false ((bool)0)
|
9
|
+
#define true ((bool)1)
|
10
|
+
#endif
|
11
|
+
|
12
|
+
/*
|
13
|
+
max_size: pass nil if no limit is specified
|
14
|
+
*/
|
15
|
+
static VALUE rb_parse_csv_line(VALUE self, VALUE line, VALUE col_sep, VALUE quote_char, VALUE max_size) {
|
16
|
+
if (RB_TYPE_P(line, T_NIL) == 1) {
|
17
|
+
return rb_ary_new();
|
18
|
+
}
|
19
|
+
|
20
|
+
if (RB_TYPE_P(line, T_STRING) != 1) {
|
21
|
+
rb_raise(rb_eTypeError, "ERROR in SmarterCSV.parse_line: line has to be a string or nil");
|
22
|
+
}
|
23
|
+
|
24
|
+
rb_encoding *encoding = rb_enc_get(line); /* get the encoding from the input line */
|
25
|
+
char *startP = RSTRING_PTR(line); /* may not be null terminated */
|
26
|
+
long line_len = RSTRING_LEN(line);
|
27
|
+
char *endP = startP + line_len ; /* points behind the string */
|
28
|
+
char *p = startP;
|
29
|
+
|
30
|
+
char *col_sepP = RSTRING_PTR(col_sep);
|
31
|
+
long col_sep_len = RSTRING_LEN(col_sep);
|
32
|
+
|
33
|
+
char *quoteP = RSTRING_PTR(quote_char);
|
34
|
+
long quote_count = 0;
|
35
|
+
|
36
|
+
bool col_sep_found = true;
|
37
|
+
|
38
|
+
VALUE elements = rb_ary_new();
|
39
|
+
VALUE field;
|
40
|
+
long i;
|
41
|
+
|
42
|
+
char prev_char = '\0'; // Store the previous character for comparison against an escape character
|
43
|
+
long backslash_count = 0; // to count consecutive backslash characters
|
44
|
+
|
45
|
+
while (p < endP) {
|
46
|
+
/* does the remaining string start with col_sep ? */
|
47
|
+
col_sep_found = true;
|
48
|
+
for(i=0; (i < col_sep_len) && (p+i < endP) ; i++) {
|
49
|
+
col_sep_found = col_sep_found && (*(p+i) == *(col_sepP+i));
|
50
|
+
}
|
51
|
+
/* if col_sep was found and we have even quotes */
|
52
|
+
if (col_sep_found && (quote_count % 2 == 0)) {
|
53
|
+
/* if max_size != nil && lements.size >= header_size */
|
54
|
+
if ((max_size != Qnil) && RARRAY_LEN(elements) >= NUM2INT(max_size)) {
|
55
|
+
break;
|
56
|
+
} else {
|
57
|
+
/* push that field with original encoding onto the results */
|
58
|
+
field = rb_enc_str_new(startP, p - startP, encoding);
|
59
|
+
rb_ary_push(elements, field);
|
60
|
+
|
61
|
+
p += col_sep_len;
|
62
|
+
startP = p;
|
63
|
+
}
|
64
|
+
} else {
|
65
|
+
if (*p == '\\') {
|
66
|
+
backslash_count++;
|
67
|
+
} else {
|
68
|
+
if (*p == *quoteP && (backslash_count % 2 == 0)) {
|
69
|
+
quote_count++;
|
70
|
+
}
|
71
|
+
backslash_count = 0; // no more consecutive backslash characters
|
72
|
+
}
|
73
|
+
p++;
|
74
|
+
}
|
75
|
+
|
76
|
+
prev_char = *(p - 1); // Update the previous character
|
77
|
+
} /* while */
|
78
|
+
|
79
|
+
/* check if the last part of the line needs to be processed */
|
80
|
+
if ((max_size == Qnil) || RARRAY_LEN(elements) < NUM2INT(max_size)) {
|
81
|
+
/* copy the remaining line as a field with original encoding onto the results */
|
82
|
+
field = rb_enc_str_new(startP, endP - startP, encoding);
|
83
|
+
rb_ary_push(elements, field);
|
84
|
+
}
|
85
|
+
|
86
|
+
return elements;
|
87
|
+
}
|
88
|
+
|
89
|
+
VALUE SmarterCSV = Qnil;
|
90
|
+
VALUE Parser = Qnil;
|
91
|
+
|
92
|
+
void Init_smarter_csv(void) {
|
93
|
+
SmarterCSV = rb_define_module("SmarterCSV");
|
94
|
+
Parser = rb_define_module_under(SmarterCSV, "Parser");
|
95
|
+
|
96
|
+
rb_define_module_function(Parser, "parse_csv_line_c", rb_parse_csv_line, 4);
|
97
|
+
}
|
@@ -0,0 +1,78 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module SmarterCSV
  # Heuristics that guess the column and row separators of a CSV input.
  # Relies on `skip_lines`, `readline_with_counts` and `rewind` being
  # provided by the including object.
  module AutoDetection
    protected

    # Guesses the column separator by counting candidate delimiters outside of
    # quoted sections: in the header line when options[:headers_in_file] is
    # set, otherwise in up to the first five lines.
    #
    # The filehandle is rewound before returning. Returns the candidate with
    # the highest count, ',' when the last line read is a single word, and
    # nil when no line could be read.
    #
    # Raises SmarterCSV::NoColSepDetected if no candidate occurs at all.
    def guess_column_separator(filehandle, options)
      skip_lines(filehandle, options)

      delimiters = [',', "\t", ';', ':', '|']
      candidates = Hash.new(0)
      quoted_section = nil
      line = nil

      lines_to_sample = options[:headers_in_file] ? 1 : 5
      lines_to_sample.times do
        line = readline_with_counts(filehandle, options)

        # built on first use, after a line was read successfully
        quoted_section ||= begin
          escaped_quote = Regexp.escape(options[:quote_char])
          /#{escaped_quote}[^#{escaped_quote}]*#{escaped_quote}/
        end

        # Count only non-quoted occurrences of each delimiter
        non_quoted_text = line.split(quoted_section).join
        delimiters.each do |delimiter|
          candidates[delimiter] += non_quoted_text.scan(delimiter).count
        end
      rescue EOFError # short files
        break
      end
      rewind(filehandle)

      highest = candidates.values.max
      if highest == 0
        # a line consisting of one single word is treated as a one-column file
        return ',' if line.chomp(options[:row_sep]) =~ /^\w+$/

        raise SmarterCSV::NoColSepDetected
      end

      candidates.key(highest)
    end

    # Guesses the row separator by counting "\n", "\r" and "\r\n" outside of
    # quoted sections. Scanning stops once options[:auto_row_sep_chars]
    # characters were counted; when that option is nil or 0 the whole input is
    # scanned. The filehandle is rewound before returning.
    #
    # Returns the most frequent of the three line endings.
    def guess_line_ending(filehandle, options)
      tally = {"\n" => 0, "\r" => 0, "\r\n" => 0}
      inside_quotes = false
      previous = nil
      counted = 0
      limit = options[:auto_row_sep_chars]

      filehandle.each_char do |char|
        inside_quotes = !inside_quotes if char == options[:quote_char]
        next if inside_quotes

        if previous == "\r"
          # a "\r" is only classified once the following character is known
          tally[char == "\n" ? "\r\n" : "\r"] += 1
        elsif char == "\n"
          tally["\n"] += 1
        end

        previous = char
        counted += 1
        break if limit && limit > 0 && counted >= limit
      end
      rewind(filehandle)

      # a trailing "\r" has no following character to classify it
      tally["\r"] += 1 if previous == "\r"

      # find the most frequent key/value pair and return its key
      tally.max_by { |_, occurrences| occurrences }.first
    end
  end
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module SmarterCSV
  # Root of the exception hierarchy; new code should rescue this instead.
  class Error < StandardError; end

  # Reader:

  # Older base class, kept for backwards compatibility.
  class SmarterCSVException < Error; end

  class HeaderSizeMismatch < SmarterCSVException; end

  class IncorrectOption < SmarterCSVException; end

  class ValidationError < SmarterCSVException; end

  class DuplicateHeaders < SmarterCSVException; end

  # previously known as MissingHeaders
  class MissingKeys < SmarterCSVException; end

  class NoColSepDetected < SmarterCSVException; end

  class KeyMappingError < SmarterCSVException; end

  # Writer:

  class InvalidInputData < SmarterCSVException; end
end
|
@@ -0,0 +1,50 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module SmarterCSV
  # Line-oriented reading helpers which keep track of how many lines were read.
  module FileIO
    protected

    # Reads the next line, terminated by options[:row_sep], from filehandle and
    # increments both line counters. A byte order mark is removed from the
    # first line. EOFError from `readline` is not rescued here.
    def readline_with_counts(filehandle, options)
      line = filehandle.readline(options[:row_sep])
      @file_line_count += 1
      @csv_line_count += 1
      line = remove_bom(line) if @csv_line_count == 1
      line
    end

    # Reads and discards options[:skip_lines] lines (none when the option is nil).
    def skip_lines(filehandle, options)
      options[:skip_lines].to_i.times do
        readline_with_counts(filehandle, options)
      end
    end

    # Resets both line counters and rewinds the filehandle.
    def rewind(filehandle)
      @file_line_count = 0
      @csv_line_count = 0
      filehandle.rewind
    end

    private

    # Byte order marks, written as lower-case hex bytes without zero padding.
    UTF_32_BOM = %w[0 0 fe ff].freeze
    UTF_32LE_BOM = %w[ff fe 0 0].freeze
    UTF_8_BOM = %w[ef bb bf].freeze
    UTF_16_BOM = %w[fe ff].freeze
    UTF_16LE_BOM = %w[ff fe].freeze

    # Returns str without a leading UTF-8, UTF-16 or UTF-32 byte order mark.
    # A string which does not begin with one of these is returned unchanged.
    def remove_bom(str)
      # A BOM is at most four bytes long, so only the head of the line is
      # converted to hex rather than the complete line.
      str_as_hex = str.byteslice(0, 4).bytes.map { |x| x.to_s(16) }
      # if string does not start with one of the bytes, there is no BOM
      return str unless %w[ef fe ff 0].include?(str_as_hex[0])

      # the 4-byte marks are tested first: UTF-32LE starts with the UTF-16LE mark
      return str.byteslice(4..-1) if [UTF_32_BOM, UTF_32LE_BOM].include?(str_as_hex[0..3])
      return str.byteslice(3..-1) if str_as_hex[0..2] == UTF_8_BOM
      return str.byteslice(2..-1) if [UTF_16_BOM, UTF_16LE_BOM].include?(str_as_hex[0..1])

      # :nocov:
      puts "SmarterCSV found unhandled BOM! #{str.chars[0..7].inspect}"
      str
      # :nocov:
    end
  end
end
|
@@ -0,0 +1,91 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module SmarterCSV
  # Clean-up and conversion of the key/value pairs of one CSV row.
  # `has_rails` and `blank?(value)` are expected from the including object;
  # they are only called when options[:remove_empty_values] is true.
  module HashTransformations
    # Returns a new Hash built from `hash`, leaving `hash` itself untouched.
    #
    # A pair is dropped when
    # * its key is nil, '' or :"" (unmapped keys, or keys purposely mapped to
    #   nil or an empty key),
    # * options[:remove_empty_values] is true and the value is blank,
    # * options[:remove_zero_values] is set and the value is a String made up
    #   of zeros only ("0", "000", "0.00"),
    # * options[:remove_values_matching] is set and matches the value.
    #
    # Remaining values are converted to Integer / Float when
    # options[:convert_values_to_numeric] is set (honouring its :only /
    # :except form) and are finally passed to `convert` of the matching entry
    # in options[:value_converters], if there is one.
    def hash_transformations(hash, options)
      remove_empty_values = options[:remove_empty_values] == true
      remove_zero_values = options[:remove_zero_values]
      remove_values_matching = options[:remove_values_matching]
      convert_to_numeric = options[:convert_values_to_numeric]
      value_converters = options[:value_converters]

      hash.each_with_object({}) do |(k, v), new_hash|
        next if k.nil? || k == '' || k == :""
        next if remove_empty_values && (has_rails ? v.blank? : blank?(v))
        # \A and \z anchor the whole value. ^ and $ anchor single lines, which
        # made a multi-line field containing a line of zeros count as zero.
        next if remove_zero_values && v.is_a?(String) && v =~ /\A(0+|0+\.0+)\z/ # values are Strings
        next if remove_values_matching && v =~ remove_values_matching

        # deal with the :only / :except options to :convert_values_to_numeric
        if convert_to_numeric && !limit_execution_for_only_or_except(options, :convert_values_to_numeric, k)
          # whole-value anchors again: "12\nfoo" is text and must stay as it is
          case v
          when /\A[+-]?\d+\.\d+\z/ then v = v.to_f
          when /\A[+-]?\d+\z/ then v = v.to_i
          end
        end

        converter = value_converters[k] if value_converters
        v = converter.convert(v) if converter

        new_hash[k] = v
      end
    end

    protected

    # acts as a road-block to limit processing when iterating over all k/v pairs of a CSV-hash:
    #
    # Returns true when `key` has to be skipped for the option `option_name`:
    # the option is a Hash and either lists the key under :except, or has no
    # :except entry but an :only entry which does not list the key.
    # Returns false in every other case.
    def limit_execution_for_only_or_except(options, option_name, key)
      setting = options[option_name]
      if setting.is_a?(Hash)
        if setting.key?(:except)
          return true if Array(setting[:except]).include?(key)
        elsif setting.key?(:only)
          return true unless Array(setting[:only]).include?(key)
        end
      end
      false
    end
  end
end
|
@@ -0,0 +1,63 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module SmarterCSV
  # Turns the raw header strings of a CSV file into the keys of the result hashes.
  module HeaderTransformations
    # transform the headers that were in the file:
    #
    # 1. every occurrence of options[:quote_char] is removed,
    # 2. headers are stripped when options[:strip_whitespace] is set,
    # 3. unless options[:keep_original_headers] is set, runs of whitespace or
    #    dashes become '_' and headers are downcased when
    #    options[:downcase_header] is set,
    # 4. duplicates are numbered when options[:duplicate_header_suffix] is set,
    # 5. headers are symbolized unless options[:strings_as_keys] or
    #    options[:keep_original_headers] is set,
    # 6. options[:key_mapping] is applied when present.
    #
    # Steps 1-3 modify header_array in place. Returns the transformed headers.
    def header_transformations(header_array, options)
      # The quote character is matched literally. Interpolating it into a
      # Regexp broke for characters like '*' and removed everything for '.'.
      quote = options[:quote_char].to_s

      header_array.map! do |header|
        header = header.gsub(quote, '')
        header = header.strip if options[:strip_whitespace]
        unless options[:keep_original_headers]
          header = header.gsub(/\s+|-+/, '_')
          header = header.downcase if options[:downcase_header]
        end
        header
      end

      # detect duplicate headers and disambiguate
      header_array = disambiguate_headers(header_array, options) if options[:duplicate_header_suffix]
      # symbolize headers
      header_array = header_array.map(&:to_sym) unless options[:strings_as_keys] || options[:keep_original_headers]
      # doesn't make sense to re-map when we have user_provided_headers
      header_array = remap_headers(header_array, options) if options[:key_mapping]

      header_array
    end

    # Returns a new array in which the second and every later occurrence of a
    # header has options[:duplicate_header_suffix] and its running count
    # appended, e.g. "a", "a", "a" with suffix "_" become "a", "a_2", "a_3".
    def disambiguate_headers(headers, options)
      counts = Hash.new(0)
      headers.map do |header|
        counts[header] += 1
        counts[header] > 1 ? "#{header}#{options[:duplicate_header_suffix]}#{counts[header]}" : header
      end
    end

    # do some key mapping on the keys in the file header
    # if you want to completely delete a key, then map it to nil or to ''
    #
    # Modifies and returns `headers`. Headers without a mapping are kept, or
    # set to nil when options[:remove_unmapped_keys] is set.
    #
    # Raises SmarterCSV::IncorrectOption unless options[:key_mapping] is a
    # non-empty Hash. Raises SmarterCSV::KeyMappingError when a mapped key is
    # not among the headers, unless options[:silence_missing_keys] is true or
    # is an Array which lists that key.
    def remap_headers(headers, options)
      key_mapping = options[:key_mapping]
      # The type is checked first, so that a non-Hash value always ends up in
      # IncorrectOption and not in a NoMethodError for #empty?.
      unless key_mapping.is_a?(Hash) && !key_mapping.empty?
        raise(SmarterCSV::IncorrectOption, "ERROR: incorrect format for key_mapping! Expecting hash with from -> to mappings")
      end

      # if silence_missing_keys are not set, raise error if missing header
      missing_keys = key_mapping.keys - headers
      # if the user passes a list of specific mapped keys that are optional
      missing_keys -= options[:silence_missing_keys] if options[:silence_missing_keys].is_a?(Array)

      unless missing_keys.empty? || options[:silence_missing_keys] == true
        raise SmarterCSV::KeyMappingError, "ERROR: can not map headers: #{missing_keys.join(', ')}"
      end

      headers.map! do |header|
        if key_mapping.key?(header)
          key_mapping[header]
        elsif options[:remove_unmapped_keys]
          nil
        else
          header
        end
      end
      headers
    end
  end
end
|
@@ -0,0 +1,34 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'set'

module SmarterCSV
  # Checks which run on the final (transformed) headers.
  module HeaderValidations
    # Runs the duplicate check followed by the required-keys check.
    def header_validations(headers, options)
      check_duplicate_headers(headers, options)
      check_required_headers(headers, options)
    end

    # Raises SmarterCSV::DuplicateHeaders when a non-nil header occurs more
    # than once; the message lists each such header with its count.
    def check_duplicate_headers(headers, _options)
      occurrences = headers.compact.each_with_object(Hash.new(0)) { |header, seen| seen[header] += 1 }
      duplicates = occurrences.select { |_, count| count > 1 }
      return if duplicates.empty?

      raise(SmarterCSV::DuplicateHeaders, "Duplicate Headers in CSV: #{duplicates.inspect}")
    end

    # Raises SmarterCSV::MissingKeys when options[:required_keys] is an Array
    # and at least one of its entries is not among the headers.
    # Does nothing for any other value of the option.
    def check_required_headers(headers, options)
      required = options[:required_keys]
      return unless required.is_a?(Array)

      present = Set.new(headers)
      missing_keys = required.reject { |key| present.include?(key) }
      return if missing_keys.empty?

      raise SmarterCSV::MissingKeys, "ERROR: missing attributes: #{missing_keys.join(',')}. Check `reader.headers` for original headers."
    end
  end
end
|
@@ -0,0 +1,68 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module SmarterCSV
  # Determines the headers to use for a CSV input. Relies on
  # `readline_with_counts`, `parse`, `header_transformations` and
  # `enforce_utf8_encoding` being provided by the including object.
  module Headers
    # Returns [headers, number_of_headers].
    #
    # With options[:headers_in_file] the header line is read from filehandle
    # (it is kept unmodified in @raw_header), cleaned up, parsed and
    # transformed. options[:user_provided_headers], when given, take
    # precedence over the headers found in the file.
    #
    # Raises SmarterCSV::IncorrectOption when there are neither headers in the
    # file nor user-provided headers, or when the user-provided headers are
    # not a non-empty Array. Raises SmarterCSV::HeaderSizeMismatch when the
    # number of user-provided headers differs from the number of headers in
    # the file.
    def process_headers(filehandle, options)
      @raw_header = nil # header as it appears in the file
      @headers = nil # the processed headers
      file_header_array = nil
      file_header_size = nil
      user_header_array = options[:user_provided_headers]

      # if headers_in_file, get the headers -> We get the number of columns, even when user provided headers
      if options[:headers_in_file]
        # the first line of a CSV file contains the header .. it might be commented out, so we need to read it anyhow
        @raw_header = readline_with_counts(filehandle, options)
        header_line = preprocess_header_line(@raw_header, options)

        file_header_array, file_header_size = parse(header_line, options)
        file_header_array = header_transformations(file_header_array, options)
      elsif !user_header_array
        raise SmarterCSV::IncorrectOption, "ERROR: If :headers_in_file is set to false, you have to provide :user_provided_headers"
      end

      header_array = file_header_array
      if user_header_array
        unless user_header_array.is_a?(Array) && !user_header_array.empty?
          raise(SmarterCSV::IncorrectOption, "ERROR: incorrect format for user_provided_headers! Expecting array with headers.")
        end

        # user_provided_headers: their count should match the headers_in_file if any
        if !file_header_size.nil? && user_header_array.size != file_header_size
          raise SmarterCSV::HeaderSizeMismatch, "ERROR: :user_provided_headers defines #{user_header_array.size} headers != CSV-file has #{file_header_size} headers"
        end

        # use user-provided headers
        header_array = user_header_array
      end

      [header_array, header_array.size]
    end

    private

    # Prepares the raw header line for parsing: passes it through
    # `enforce_utf8_encoding`, removes the comment marker, chomps
    # options[:row_sep] and deletes whatever matches
    # options[:strip_chars_from_headers].
    def preprocess_header_line(header_line, options)
      cleaned = enforce_utf8_encoding(header_line, options)
      cleaned = remove_comments_from_header(cleaned, options)
      cleaned = cleaned.chomp(options[:row_sep])

      strip_pattern = options[:strip_chars_from_headers]
      cleaned.gsub!(strip_pattern, '') if strip_pattern
      cleaned
    end

    # Removes the first match of options[:comment_regexp] from the header;
    # returns the header unchanged when that option is not set.
    def remove_comments_from_header(header, options)
      comment_regexp = options[:comment_regexp]
      comment_regexp ? header.sub(comment_regexp, '') : header
    end
  end
end
|