smarter_csv 1.1.5 → 1.12.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (95) hide show
  1. checksums.yaml +5 -5
  2. data/.rspec +1 -2
  3. data/.rubocop.yml +154 -0
  4. data/CHANGELOG.md +364 -0
  5. data/CONTRIBUTORS.md +56 -0
  6. data/Gemfile +7 -2
  7. data/LICENSE.txt +21 -0
  8. data/README.md +44 -441
  9. data/Rakefile +39 -19
  10. data/TO_DO_v2.md +14 -0
  11. data/docs/_introduction.md +56 -0
  12. data/docs/basic_api.md +157 -0
  13. data/docs/batch_processing.md +68 -0
  14. data/docs/data_transformations.md +50 -0
  15. data/docs/examples.md +75 -0
  16. data/docs/header_transformations.md +113 -0
  17. data/docs/header_validations.md +36 -0
  18. data/docs/options.md +98 -0
  19. data/docs/row_col_sep.md +104 -0
  20. data/docs/value_converters.md +68 -0
  21. data/ext/smarter_csv/extconf.rb +14 -0
  22. data/ext/smarter_csv/smarter_csv.c +97 -0
  23. data/lib/smarter_csv/auto_detection.rb +78 -0
  24. data/lib/smarter_csv/errors.rb +16 -0
  25. data/lib/smarter_csv/file_io.rb +50 -0
  26. data/lib/smarter_csv/hash_transformations.rb +91 -0
  27. data/lib/smarter_csv/header_transformations.rb +63 -0
  28. data/lib/smarter_csv/header_validations.rb +34 -0
  29. data/lib/smarter_csv/headers.rb +68 -0
  30. data/lib/smarter_csv/options.rb +95 -0
  31. data/lib/smarter_csv/parser.rb +90 -0
  32. data/lib/smarter_csv/reader.rb +243 -0
  33. data/lib/smarter_csv/version.rb +3 -1
  34. data/lib/smarter_csv/writer.rb +116 -0
  35. data/lib/smarter_csv.rb +91 -3
  36. data/smarter_csv.gemspec +43 -20
  37. metadata +122 -137
  38. data/.gitignore +0 -8
  39. data/.travis.yml +0 -19
  40. data/lib/extensions/hash.rb +0 -7
  41. data/lib/smarter_csv/smarter_csv.rb +0 -281
  42. data/spec/fixtures/basic.csv +0 -8
  43. data/spec/fixtures/binary.csv +0 -1
  44. data/spec/fixtures/carriage_returns_n.csv +0 -18
  45. data/spec/fixtures/carriage_returns_quoted.csv +0 -3
  46. data/spec/fixtures/carriage_returns_r.csv +0 -1
  47. data/spec/fixtures/carriage_returns_rn.csv +0 -18
  48. data/spec/fixtures/chunk_cornercase.csv +0 -10
  49. data/spec/fixtures/empty.csv +0 -5
  50. data/spec/fixtures/line_endings_n.csv +0 -4
  51. data/spec/fixtures/line_endings_r.csv +0 -1
  52. data/spec/fixtures/line_endings_rn.csv +0 -4
  53. data/spec/fixtures/lots_of_columns.csv +0 -2
  54. data/spec/fixtures/malformed.csv +0 -3
  55. data/spec/fixtures/malformed_header.csv +0 -3
  56. data/spec/fixtures/money.csv +0 -3
  57. data/spec/fixtures/no_header.csv +0 -7
  58. data/spec/fixtures/numeric.csv +0 -5
  59. data/spec/fixtures/pets.csv +0 -5
  60. data/spec/fixtures/quoted.csv +0 -5
  61. data/spec/fixtures/separator.csv +0 -4
  62. data/spec/fixtures/skip_lines.csv +0 -8
  63. data/spec/fixtures/valid_unicode.csv +0 -5
  64. data/spec/fixtures/with_dashes.csv +0 -8
  65. data/spec/fixtures/with_dates.csv +0 -4
  66. data/spec/smarter_csv/binary_file2_spec.rb +0 -24
  67. data/spec/smarter_csv/binary_file_spec.rb +0 -22
  68. data/spec/smarter_csv/carriage_return_spec.rb +0 -170
  69. data/spec/smarter_csv/chunked_reading_spec.rb +0 -14
  70. data/spec/smarter_csv/close_file_spec.rb +0 -15
  71. data/spec/smarter_csv/column_separator_spec.rb +0 -11
  72. data/spec/smarter_csv/convert_values_to_numeric_spec.rb +0 -48
  73. data/spec/smarter_csv/extenstions_spec.rb +0 -17
  74. data/spec/smarter_csv/header_transformation_spec.rb +0 -21
  75. data/spec/smarter_csv/keep_headers_spec.rb +0 -24
  76. data/spec/smarter_csv/key_mapping_spec.rb +0 -25
  77. data/spec/smarter_csv/line_ending_spec.rb +0 -43
  78. data/spec/smarter_csv/load_basic_spec.rb +0 -20
  79. data/spec/smarter_csv/malformed_spec.rb +0 -21
  80. data/spec/smarter_csv/no_header_spec.rb +0 -24
  81. data/spec/smarter_csv/not_downcase_header_spec.rb +0 -24
  82. data/spec/smarter_csv/quoted_spec.rb +0 -23
  83. data/spec/smarter_csv/remove_empty_values_spec.rb +0 -13
  84. data/spec/smarter_csv/remove_keys_from_hashes_spec.rb +0 -25
  85. data/spec/smarter_csv/remove_not_mapped_keys_spec.rb +0 -35
  86. data/spec/smarter_csv/remove_values_matching_spec.rb +0 -26
  87. data/spec/smarter_csv/remove_zero_values_spec.rb +0 -25
  88. data/spec/smarter_csv/skip_lines_spec.rb +0 -29
  89. data/spec/smarter_csv/strings_as_keys_spec.rb +0 -24
  90. data/spec/smarter_csv/strip_chars_from_headers_spec.rb +0 -24
  91. data/spec/smarter_csv/valid_unicode_spec.rb +0 -94
  92. data/spec/smarter_csv/value_converters_spec.rb +0 -52
  93. data/spec/spec/spec_helper.rb +0 -17
  94. data/spec/spec.opts +0 -2
  95. data/spec/spec_helper.rb +0 -21
@@ -0,0 +1,104 @@
1
+
2
+ ### Contents
3
+
4
+ * [Introduction](./_introduction.md)
5
+ * [The Basic API](./basic_api.md)
6
+ * [Batch Processing](././batch_processing.md)
7
+ * [Configuration Options](./options.md)
8
+ * [**Row and Column Separators**](./row_col_sep.md)
9
+ * [Header Transformations](./header_transformations.md)
10
+ * [Header Validations](./header_validations.md)
11
+ * [Data Transformations](./data_transformations.md)
12
+ * [Value Converters](./value_converters.md)
13
+
14
+ --------------
15
+
16
+ # Row and Column Separators
17
+
18
+ ## Automatic Detection
19
+
20
+ Convenient defaults allow automatic detection of the column and row separators: `row_sep: :auto`, `col_sep: :auto`. This makes it easier to process any CSV files without having to examine the line endings or column separators, e.g. when users upload CSV files to your service and you have no control over the incoming files.
21
+
22
+ You can change the setting `:auto_row_sep_chars` to only analyze the first N characters of the file (default is 500 characters; `nil` or `0` will check the whole file). Of course you can also set the `:row_sep` manually.
23
+
24
+
25
+ ## Column Separator `col_sep`
26
+
27
+ The automatic detection of column separators considers: `,`, `\t`, `;`, `:`, `|`.
28
+
29
+ Some CSV files may contain an unusual column separator, which could even be a control character.
30
+
31
+ ## Row Separator `row_sep`
32
+
33
+ The automatic detection of row separators considers: `\n`, `\r\n`, `\r`.
34
+
35
+ Some CSV files may contain an unusual row separator, which could even be a control character.
36
+
37
+
38
+ ## Custom / Non-Standard CSV Formats
39
+
40
+ Besides custom values for `col_sep`, `row_sep`, some other customizations of CSV files are:
41
+ * the presence of a number of leading lines before the header or data section starts.
42
+ * the presence of comment lines, e.g. lines starting with `#`
43
+
44
+ To explore these special cases, please use the following examples.
45
+
46
+ ### Example 1: reading an iTunes DB dump
47
+
48
+ This data format uses CTRL-A as the column separator, and CTRL-B as the record separator. It also has comment lines that start with a `#` character. This also maps the header `name` to `genre`, and ignores the column `export_date`.
49
+
50
+ ```ruby
51
+ filename = '/tmp/itunes_db_dump'
52
+ options = {
53
+ :col_sep => "\cA", :row_sep => "\cB\n", :comment_regexp => /^#/,
54
+ :chunk_size => 100 , :key_mapping => {export_date: nil, name: :genre},
55
+ }
56
+ n = SmarterCSV.process(filename, options) do |chunk|
57
+ SidekiqWorkerClass.process_async(chunk) # pass an array of hashes to Sidekiq workers for parallel processing
58
+ end
59
+ => returns number of chunks
60
+ ```
61
+
62
+ ### Example 2: Reading a CSV-File with custom col_sep, row_sep
63
+ In this example we have an unusual CSV file with `|` as the row separator, and `#` as the column separator.
64
+ This unusual format needs explicit options `col_sep` and `row_sep`.
65
+
66
+ ```ruby
67
+ filename = '/tmp/input_file.txt'
68
+ recordsA = SmarterCSV.process(filename, {col_sep: "#", row_sep: "|"})
69
+
70
+ => returns an array of hashes
71
+ ```
72
+
73
+ ### Example 3: Skipping leading lines with `skip_lines`
74
+ In this example, we use `skip_lines: 3` to skip and ignore the first 3 lines in the input.
75
+
76
+
77
+ ```ruby
78
+ filename = '/tmp/input_file.txt'
79
+ recordsA = SmarterCSV.process(filename, {skip_lines: 3})
80
+
81
+ => returns an array of hashes
82
+ ```
83
+
84
+
85
+ ### Example 4: reading an iTunes DB dump
86
+
87
+ In this example, we use `comment_regexp` to filter out and ignore any lines starting with `#`
88
+
89
+
90
+ ```ruby
91
+ # Consider a file with CTRL-A as col_separator, and with CTRL-B\n as record_separator (hello iTunes!)
92
+ filename = '/tmp/strange_db_dump'
93
+ options = {
94
+ :col_sep => "\cA", :row_sep => "\cB\n", :comment_regexp => /^#/,
95
+ :chunk_size => 100 , :key_mapping => {:export_date => nil, :name => :genre},
96
+ }
97
+ n = SmarterCSV.process(filename, options) do |chunk|
98
+ SidekiqWorkerClass.process_async(chunk) # pass an array of hashes to Sidekiq workers for parallel processing
99
+ end
100
+ => returns number of chunks
101
+ ```
102
+
103
+ ----------------
104
+ PREVIOUS: [Configuration Options](./options.md) | NEXT: [Header Transformations](./header_transformations.md)
@@ -0,0 +1,68 @@
1
+
2
+ ### Contents
3
+
4
+ * [Introduction](./_introduction.md)
5
+ * [The Basic API](./basic_api.md)
6
+ * [Batch Processing](././batch_processing.md)
7
+ * [Configuration Options](./options.md)
8
+ * [Row and Column Separators](./row_col_sep.md)
9
+ * [Header Transformations](./header_transformations.md)
10
+ * [Header Validations](./header_validations.md)
11
+ * [Data Transformations](./data_transformations.md)
12
+ * [**Value Converters**](./value_converters.md)
13
+
14
+ --------------
15
+
16
+ # Using Value Converters
17
+
18
+ Value Converters allow you to do custom transformations on the values of specific columns, to help you massage the data so it fits the expectations of your down-stream process, such as creating a DB record.
19
+
20
+ If you use `key_mapping` and `value_converters`, make sure that the value converters reference the keys based on the final mapped name, not the original name in the CSV file.
21
+
22
+ ```ruby
23
+ $ cat spec/fixtures/with_dates.csv
24
+ first,last,date,price
25
+ Ben,Miller,10/30/1998,$44.50
26
+ Tom,Turner,2/1/2011,$15.99
27
+ Ken,Smith,01/09/2013,$199.99
28
+
29
+ $ irb
30
+ > require 'smarter_csv'
31
+ > require 'date'
32
+
33
+ # define a custom converter class, which implements self.convert(value)
34
+ class DateConverter
35
+ def self.convert(value)
36
+ Date.strptime( value, '%m/%d/%Y') # parses custom date format into Date instance
37
+ end
38
+ end
39
+
40
+ class DollarConverter
41
+ def self.convert(value)
42
+ value.sub('$','').to_f # strips the dollar sign and creates a Float value
43
+ end
44
+ end
45
+
46
+ require 'money'
47
+ class MoneyConverter
48
+ def self.convert(value)
49
+ # depending on locale you might want to also remove the indicator for thousands, e.g. comma
50
+ Money.from_amount(value.gsub(/[\s\$]/,'').to_f) # creates a Money instance (based on cents)
51
+ end
52
+ end
53
+
54
+ options = {:value_converters => {:date => DateConverter, :price => DollarConverter}}
55
+ data = SmarterCSV.process("spec/fixtures/with_dates.csv", options)
56
+ first_record = data.first
57
+ first_record[:date]
58
+ => #<Date: 1998-10-30 ((2451117j,0s,0n),+0s,2299161j)>
59
+ first_record[:date].class
60
+ => Date
61
+ first_record[:price]
62
+ => 44.50
63
+ first_record[:price].class
64
+ => Float
65
+ ```
66
+
67
+ --------------------
68
+ PREVIOUS: [Data Transformations](./data_transformations.md) | UP: [README](../README.md)
@@ -0,0 +1,14 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'mkmf'
4
+ require "rbconfig"
5
+
6
+ if RbConfig::MAKEFILE_CONFIG["CFLAGS"].include?("-g -O3")
7
+ fixed_CFLAGS = RbConfig::MAKEFILE_CONFIG["CFLAGS"].sub("-g -O3", "-O3 $(cflags)")
8
+ puts("Fix CFLAGS: #{RbConfig::MAKEFILE_CONFIG["CFLAGS"]} -> #{fixed_CFLAGS}")
9
+ RbConfig::MAKEFILE_CONFIG["CFLAGS"] = fixed_CFLAGS
10
+ end
11
+
12
+ CONFIG["optflags"] = "-O3"
13
+
14
+ create_makefile('smarter_csv/smarter_csv')
@@ -0,0 +1,97 @@
1
+ #include "ruby.h"
2
+ #include "ruby/encoding.h"
3
+ #include <stdio.h>
4
+ #include <stdbool.h>
5
+
6
+ #ifndef bool
7
+ #define bool int
8
+ #define false ((bool)0)
9
+ #define true ((bool)1)
10
+ #endif
11
+
12
+ /*
13
+ max_size: pass nil if no limit is specified
14
+ */
15
+ static VALUE rb_parse_csv_line(VALUE self, VALUE line, VALUE col_sep, VALUE quote_char, VALUE max_size) {
16
+ if (RB_TYPE_P(line, T_NIL) == 1) {
17
+ return rb_ary_new();
18
+ }
19
+
20
+ if (RB_TYPE_P(line, T_STRING) != 1) {
21
+ rb_raise(rb_eTypeError, "ERROR in SmarterCSV.parse_line: line has to be a string or nil");
22
+ }
23
+
24
+ rb_encoding *encoding = rb_enc_get(line); /* get the encoding from the input line */
25
+ char *startP = RSTRING_PTR(line); /* may not be null terminated */
26
+ long line_len = RSTRING_LEN(line);
27
+ char *endP = startP + line_len ; /* points behind the string */
28
+ char *p = startP;
29
+
30
+ char *col_sepP = RSTRING_PTR(col_sep);
31
+ long col_sep_len = RSTRING_LEN(col_sep);
32
+
33
+ char *quoteP = RSTRING_PTR(quote_char);
34
+ long quote_count = 0;
35
+
36
+ bool col_sep_found = true;
37
+
38
+ VALUE elements = rb_ary_new();
39
+ VALUE field;
40
+ long i;
41
+
42
+ char prev_char = '\0'; // Store the previous character for comparison against an escape character
43
+ long backslash_count = 0; // to count consecutive backslash characters
44
+
45
+ while (p < endP) {
46
+ /* does the remaining string start with col_sep ? */
47
+ col_sep_found = true;
48
+ for(i=0; (i < col_sep_len) && (p+i < endP) ; i++) {
49
+ col_sep_found = col_sep_found && (*(p+i) == *(col_sepP+i));
50
+ }
51
+ /* if col_sep was found and we have even quotes */
52
+ if (col_sep_found && (quote_count % 2 == 0)) {
53
+ /* if max_size != nil && elements.size >= max_size */
54
+ if ((max_size != Qnil) && RARRAY_LEN(elements) >= NUM2INT(max_size)) {
55
+ break;
56
+ } else {
57
+ /* push that field with original encoding onto the results */
58
+ field = rb_enc_str_new(startP, p - startP, encoding);
59
+ rb_ary_push(elements, field);
60
+
61
+ p += col_sep_len;
62
+ startP = p;
63
+ }
64
+ } else {
65
+ if (*p == '\\') {
66
+ backslash_count++;
67
+ } else {
68
+ if (*p == *quoteP && (backslash_count % 2 == 0)) {
69
+ quote_count++;
70
+ }
71
+ backslash_count = 0; // no more consecutive backslash characters
72
+ }
73
+ p++;
74
+ }
75
+
76
+ prev_char = *(p - 1); // Update the previous character
77
+ } /* while */
78
+
79
+ /* check if the last part of the line needs to be processed */
80
+ if ((max_size == Qnil) || RARRAY_LEN(elements) < NUM2INT(max_size)) {
81
+ /* copy the remaining line as a field with original encoding onto the results */
82
+ field = rb_enc_str_new(startP, endP - startP, encoding);
83
+ rb_ary_push(elements, field);
84
+ }
85
+
86
+ return elements;
87
+ }
88
+
89
+ VALUE SmarterCSV = Qnil;
90
+ VALUE Parser = Qnil;
91
+
92
+ void Init_smarter_csv(void) {
93
+ SmarterCSV = rb_define_module("SmarterCSV");
94
+ Parser = rb_define_module_under(SmarterCSV, "Parser");
95
+
96
+ rb_define_module_function(Parser, "parse_csv_line_c", rb_parse_csv_line, 4);
97
+ }
@@ -0,0 +1,78 @@
1
+ # frozen_string_literal: true
2
+
3
+ module SmarterCSV
4
+ module AutoDetection
5
+ protected
6
+
7
+ # If file has headers, then guesses column separator from headers.
8
+ # Otherwise guesses column separator from contents.
9
+ # Raises exception if none is found.
10
+ def guess_column_separator(filehandle, options)
11
+ skip_lines(filehandle, options)
12
+
13
+ delimiters = [',', "\t", ';', ':', '|']
14
+
15
+ line = nil
16
+ has_header = options[:headers_in_file]
17
+ candidates = Hash.new(0)
18
+ count = has_header ? 1 : 5
19
+ count.times do
20
+ line = readline_with_counts(filehandle, options)
21
+ delimiters.each do |d|
22
+ escaped_quote = Regexp.escape(options[:quote_char])
23
+
24
+ # Count only non-quoted occurrences of the delimiter
25
+ non_quoted_text = line.split(/#{escaped_quote}[^#{escaped_quote}]*#{escaped_quote}/).join
26
+
27
+ candidates[d] += non_quoted_text.scan(d).count
28
+ end
29
+ rescue EOFError # short files
30
+ break
31
+ end
32
+ rewind(filehandle)
33
+
34
+ if candidates.values.max == 0
35
+ # if the header only contains word characters (i.e. a single column), fall back to ','
36
+ return ',' if line.chomp(options[:row_sep]) =~ /^\w+$/
37
+
38
+ raise SmarterCSV::NoColSepDetected
39
+ end
40
+
41
+ candidates.key(candidates.values.max)
42
+ end
43
+
44
+ # limitation: this currently reads the whole file in before making a decision
45
+ def guess_line_ending(filehandle, options)
46
+ counts = {"\n" => 0, "\r" => 0, "\r\n" => 0}
47
+ quoted_char = false
48
+
49
+ # count how many of the pre-defined line-endings we find
50
+ # ignoring those contained within quote characters
51
+ last_char = nil
52
+ lines = 0
53
+ filehandle.each_char do |c|
54
+ quoted_char = !quoted_char if c == options[:quote_char]
55
+ next if quoted_char
56
+
57
+ if last_char == "\r"
58
+ if c == "\n"
59
+ counts["\r\n"] += 1
60
+ else
61
+ counts["\r"] += 1 # \r are counted after they appeared
62
+ end
63
+ elsif c == "\n"
64
+ counts["\n"] += 1
65
+ end
66
+ last_char = c
67
+ lines += 1
68
+ break if options[:auto_row_sep_chars] && options[:auto_row_sep_chars] > 0 && lines >= options[:auto_row_sep_chars]
69
+ end
70
+ rewind(filehandle)
71
+
72
+ counts["\r"] += 1 if last_char == "\r"
73
+ # find the most frequent key/value pair:
74
+ most_frequent_key, _count = counts.max_by{|_, v| v}
75
+ most_frequent_key
76
+ end
77
+ end
78
+ end
@@ -0,0 +1,16 @@
1
+ # frozen_string_literal: true
2
+
3
+ module SmarterCSV
4
+ class Error < StandardError; end # new code should rescue this instead
5
+ # Reader:
6
+ class SmarterCSVException < Error; end # for backwards compatibility
7
+ class HeaderSizeMismatch < SmarterCSVException; end
8
+ class IncorrectOption < SmarterCSVException; end
9
+ class ValidationError < SmarterCSVException; end
10
+ class DuplicateHeaders < SmarterCSVException; end
11
+ class MissingKeys < SmarterCSVException; end # previously known as MissingHeaders
12
+ class NoColSepDetected < SmarterCSVException; end
13
+ class KeyMappingError < SmarterCSVException; end
14
+ # Writer:
15
+ class InvalidInputData < SmarterCSVException; end
16
+ end
@@ -0,0 +1,50 @@
1
+ # frozen_string_literal: true
2
+
3
+ module SmarterCSV
4
+ module FileIO
5
+ protected
6
+
7
+ def readline_with_counts(filehandle, options)
8
+ line = filehandle.readline(options[:row_sep])
9
+ @file_line_count += 1
10
+ @csv_line_count += 1
11
+ line = remove_bom(line) if @csv_line_count == 1
12
+ line
13
+ end
14
+
15
+ def skip_lines(filehandle, options)
16
+ options[:skip_lines].to_i.times do
17
+ readline_with_counts(filehandle, options)
18
+ end
19
+ end
20
+
21
+ def rewind(filehandle)
22
+ @file_line_count = 0
23
+ @csv_line_count = 0
24
+ filehandle.rewind
25
+ end
26
+
27
+ private
28
+
29
+ UTF_32_BOM = %w[0 0 fe ff].freeze
30
+ UTF_32LE_BOM = %w[ff fe 0 0].freeze
31
+ UTF_8_BOM = %w[ef bb bf].freeze
32
+ UTF_16_BOM = %w[fe ff].freeze
33
+ UTF_16LE_BOM = %w[ff fe].freeze
34
+
35
+ def remove_bom(str)
36
+ str_as_hex = str.bytes.map{|x| x.to_s(16)}
37
+ # if string does not start with one of the bytes, there is no BOM
38
+ return str unless %w[ef fe ff 0].include?(str_as_hex[0])
39
+
40
+ return str.byteslice(4..-1) if [UTF_32_BOM, UTF_32LE_BOM].include?(str_as_hex[0..3])
41
+ return str.byteslice(3..-1) if str_as_hex[0..2] == UTF_8_BOM
42
+ return str.byteslice(2..-1) if [UTF_16_BOM, UTF_16LE_BOM].include?(str_as_hex[0..1])
43
+
44
+ # :nocov:
45
+ puts "SmarterCSV found unhandled BOM! #{str.chars[0..7].inspect}"
46
+ str
47
+ # :nocov:
48
+ end
49
+ end
50
+ end
@@ -0,0 +1,91 @@
1
+ # frozen_string_literal: true
2
+
3
+ module SmarterCSV
4
+ module HashTransformations
5
+ def hash_transformations(hash, options)
6
+ # there may be unmapped keys, or keys purposely mapped to nil or an empty key..
7
+ # make sure we delete any key/value pairs from the hash, which the user wanted to delete:
8
+ remove_empty_values = options[:remove_empty_values] == true
9
+ remove_zero_values = options[:remove_zero_values]
10
+ remove_values_matching = options[:remove_values_matching]
11
+ convert_to_numeric = options[:convert_values_to_numeric]
12
+ value_converters = options[:value_converters]
13
+
14
+ hash.each_with_object({}) do |(k, v), new_hash|
15
+ next if k.nil? || k == '' || k == :""
16
+ next if remove_empty_values && (has_rails ? v.blank? : blank?(v))
17
+ next if remove_zero_values && v.is_a?(String) && v =~ /^(0+|0+\.0+)$/ # values are Strings
18
+ next if remove_values_matching && v =~ remove_values_matching
19
+
20
+ # deal with the :only / :except options to :convert_values_to_numeric
21
+ if convert_to_numeric && !limit_execution_for_only_or_except(options, :convert_values_to_numeric, k)
22
+ if v =~ /^[+-]?\d+\.\d+$/
23
+ v = v.to_f
24
+ elsif v =~ /^[+-]?\d+$/
25
+ v = v.to_i
26
+ end
27
+ end
28
+
29
+ converter = value_converters[k] if value_converters
30
+ v = converter.convert(v) if converter
31
+
32
+ new_hash[k] = v
33
+ end
34
+ end
35
+
36
+ # def hash_transformations(hash, options)
37
+ # # there may be unmapped keys, or keys purposedly mapped to nil or an empty key..
38
+ # # make sure we delete any key/value pairs from the hash, which the user wanted to delete:
39
+ # hash.delete(nil)
40
+ # hash.delete('')
41
+ # hash.delete(:"")
42
+
43
+ # if options[:remove_empty_values] == true
44
+ # hash.delete_if{|_k, v| has_rails ? v.blank? : blank?(v)}
45
+ # end
46
+
47
+ # hash.delete_if{|_k, v| !v.nil? && v =~ /^(0+|0+\.0+)$/} if options[:remove_zero_values] # values are Strings
48
+ # hash.delete_if{|_k, v| v =~ options[:remove_values_matching]} if options[:remove_values_matching]
49
+
50
+ # if options[:convert_values_to_numeric]
51
+ # hash.each do |k, v|
52
+ # # deal with the :only / :except options to :convert_values_to_numeric
53
+ # next if limit_execution_for_only_or_except(options, :convert_values_to_numeric, k)
54
+
55
+ # # convert if it's a numeric value:
56
+ # case v
57
+ # when /^[+-]?\d+\.\d+$/
58
+ # hash[k] = v.to_f
59
+ # when /^[+-]?\d+$/
60
+ # hash[k] = v.to_i
61
+ # end
62
+ # end
63
+ # end
64
+
65
+ # if options[:value_converters]
66
+ # hash.each do |k, v|
67
+ # converter = options[:value_converters][k]
68
+ # next unless converter
69
+
70
+ # hash[k] = converter.convert(v)
71
+ # end
72
+ # end
73
+
74
+ # hash
75
+ # end
76
+
77
+ protected
78
+
79
+ # acts as a road-block to limit processing when iterating over all k/v pairs of a CSV-hash:
80
+ def limit_execution_for_only_or_except(options, option_name, key)
81
+ if options[option_name].is_a?(Hash)
82
+ if options[option_name].has_key?(:except)
83
+ return true if Array(options[option_name][:except]).include?(key)
84
+ elsif options[option_name].has_key?(:only)
85
+ return true unless Array(options[option_name][:only]).include?(key)
86
+ end
87
+ end
88
+ false
89
+ end
90
+ end
91
+ end
@@ -0,0 +1,63 @@
1
+ # frozen_string_literal: true
2
+
3
+ module SmarterCSV
4
+ module HeaderTransformations
5
+ # transform the headers that were in the file:
6
+ def header_transformations(header_array, options)
7
+ header_array.map!{|x| x.gsub(%r/#{options[:quote_char]}/, '')}
8
+ header_array.map!{|x| x.strip} if options[:strip_whitespace]
9
+
10
+ unless options[:keep_original_headers]
11
+ header_array.map!{|x| x.gsub(/\s+|-+/, '_')}
12
+ header_array.map!{|x| x.downcase} if options[:downcase_header]
13
+ end
14
+
15
+ # detect duplicate headers and disambiguate
16
+ header_array = disambiguate_headers(header_array, options) if options[:duplicate_header_suffix]
17
+ # symbolize headers
18
+ header_array = header_array.map{|x| x.to_sym } unless options[:strings_as_keys] || options[:keep_original_headers]
19
+ # doesn't make sense to re-map when we have user_provided_headers
20
+ header_array = remap_headers(header_array, options) if options[:key_mapping]
21
+
22
+ header_array
23
+ end
24
+
25
+ def disambiguate_headers(headers, options)
26
+ counts = Hash.new(0)
27
+ headers.map do |header|
28
+ counts[header] += 1
29
+ counts[header] > 1 ? "#{header}#{options[:duplicate_header_suffix]}#{counts[header]}" : header
30
+ end
31
+ end
32
+
33
+ # do some key mapping on the keys in the file header
34
+ # if you want to completely delete a key, then map it to nil or to ''
35
+ def remap_headers(headers, options)
36
+ key_mapping = options[:key_mapping]
37
+ if key_mapping.empty? || !key_mapping.is_a?(Hash) || key_mapping.keys.empty?
38
+ raise(SmarterCSV::IncorrectOption, "ERROR: incorrect format for key_mapping! Expecting hash with from -> to mappings")
39
+ end
40
+
41
+ key_mapping = options[:key_mapping]
42
+ # if silence_missing_keys are not set, raise error if missing header
43
+ missing_keys = key_mapping.keys - headers
44
+ # if the user passes a list of specific mapped keys that are optional
45
+ missing_keys -= options[:silence_missing_keys] if options[:silence_missing_keys].is_a?(Array)
46
+
47
+ unless missing_keys.empty? || options[:silence_missing_keys] == true
48
+ raise SmarterCSV::KeyMappingError, "ERROR: can not map headers: #{missing_keys.join(', ')}"
49
+ end
50
+
51
+ headers.map! do |header|
52
+ if key_mapping.has_key?(header)
53
+ key_mapping[header].nil? ? nil : key_mapping[header]
54
+ elsif options[:remove_unmapped_keys]
55
+ nil
56
+ else
57
+ header
58
+ end
59
+ end
60
+ headers
61
+ end
62
+ end
63
+ end
@@ -0,0 +1,34 @@
1
+ # frozen_string_literal: true
2
+
3
+ module SmarterCSV
4
+ module HeaderValidations
5
+ def header_validations(headers, options)
6
+ check_duplicate_headers(headers, options)
7
+ check_required_headers(headers, options)
8
+ end
9
+
10
+ def check_duplicate_headers(headers, _options)
11
+ header_counts = Hash.new(0)
12
+ headers.each { |header| header_counts[header] += 1 unless header.nil? }
13
+
14
+ duplicates = header_counts.select { |_, count| count > 1 }
15
+
16
+ unless duplicates.empty?
17
+ raise(SmarterCSV::DuplicateHeaders, "Duplicate Headers in CSV: #{duplicates.inspect}")
18
+ end
19
+ end
20
+
21
+ require 'set'
22
+
23
+ def check_required_headers(headers, options)
24
+ if options[:required_keys] && options[:required_keys].is_a?(Array)
25
+ headers_set = headers.to_set
26
+ missing_keys = options[:required_keys].select { |k| !headers_set.include?(k) }
27
+
28
+ unless missing_keys.empty?
29
+ raise SmarterCSV::MissingKeys, "ERROR: missing attributes: #{missing_keys.join(',')}. Check `reader.headers` for original headers."
30
+ end
31
+ end
32
+ end
33
+ end
34
+ end
@@ -0,0 +1,68 @@
1
+ # frozen_string_literal: true
2
+
3
+ module SmarterCSV
4
+ module Headers
5
+ def process_headers(filehandle, options)
6
+ @raw_header = nil # header as it appears in the file
7
+ @headers = nil # the processed headers
8
+ header_array = []
9
+ file_header_size = nil
10
+
11
+ # if headers_in_file, get the headers -> We get the number of columns, even when user provided headers
12
+ if options[:headers_in_file] # extract the header line
13
+ # process the header line in the CSV file..
14
+ # the first line of a CSV file contains the header .. it might be commented out, so we need to read it anyhow
15
+ header_line = @raw_header = readline_with_counts(filehandle, options)
16
+ header_line = preprocess_header_line(header_line, options)
17
+
18
+ file_header_array, file_header_size = parse(header_line, options)
19
+
20
+ file_header_array = header_transformations(file_header_array, options)
21
+
22
+ else
23
+ unless options[:user_provided_headers]
24
+ raise SmarterCSV::IncorrectOption, "ERROR: If :headers_in_file is set to false, you have to provide :user_provided_headers"
25
+ end
26
+ end
27
+
28
+ if options[:user_provided_headers]
29
+ unless options[:user_provided_headers].is_a?(Array) && !options[:user_provided_headers].empty?
30
+ raise(SmarterCSV::IncorrectOption, "ERROR: incorrect format for user_provided_headers! Expecting array with headers.")
31
+ end
32
+
33
+ # use user-provided headers
34
+ user_header_array = options[:user_provided_headers]
35
+ # user_provided_headers: their count should match the headers_in_file if any
36
+ if defined?(file_header_size) && !file_header_size.nil?
37
+ if user_header_array.size != file_header_size
38
+ raise SmarterCSV::HeaderSizeMismatch, "ERROR: :user_provided_headers defines #{user_header_array.size} headers != CSV-file has #{file_header_size} headers"
39
+ else
40
+ # we could print out the mapping of file_header_array to header_array here
41
+ end
42
+ end
43
+
44
+ header_array = user_header_array
45
+ else
46
+ header_array = file_header_array
47
+ end
48
+
49
+ [header_array, header_array.size]
50
+ end
51
+
52
+ private
53
+
54
+ def preprocess_header_line(header_line, options)
55
+ header_line = enforce_utf8_encoding(header_line, options)
56
+ header_line = remove_comments_from_header(header_line, options)
57
+ header_line = header_line.chomp(options[:row_sep])
58
+ header_line.gsub!(options[:strip_chars_from_headers], '') if options[:strip_chars_from_headers]
59
+ header_line
60
+ end
61
+
62
+ def remove_comments_from_header(header, options)
63
+ return header unless options[:comment_regexp]
64
+
65
+ header.sub(options[:comment_regexp], '')
66
+ end
67
+ end
68
+ end