smarter_csv 1.14.2 → 1.14.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,193 @@
1
+ #include "ruby.h"
2
+ #include "ruby/encoding.h"
3
+ #include <stdio.h>
4
+ #include <stdbool.h>
5
+ #include <string.h>
6
+
7
+ #ifndef bool
8
+ #define bool int
9
+ #define false ((bool)0)
10
+ #define true ((bool)1)
11
+ #endif
12
+
13
+ VALUE SmarterCSV = Qnil; // set in Init_smarter_csv from the top-level SmarterCSV constant
14
+ VALUE eMalformedCSVError = Qnil; // set in Init_smarter_csv from SmarterCSV::MalformedCSV
15
+ VALUE Parser = Qnil; // set in Init_smarter_csv from SmarterCSV::Parser
16
+ VALUE Qempty_string = Qnil; // shared empty string returned for blank fast-path fields; NOTE(review): Init_smarter_csv never freezes it, so it is mutable although shared
17
+
18
+ static VALUE rb_parse_csv_line(VALUE self, VALUE line, VALUE col_sep, VALUE quote_char, VALUE max_size, VALUE has_quotes_val) {
19
+ if (RB_TYPE_P(line, T_NIL) == 1) {
20
+ return rb_ary_new();
21
+ }
22
+
23
+ if (RB_TYPE_P(line, T_STRING) != 1) {
24
+ rb_raise(rb_eTypeError, "ERROR in SmarterCSV.parse_line: line has to be a string or nil");
25
+ }
26
+
27
+ rb_encoding *encoding = rb_enc_get(line); /* get the encoding from the input line */
28
+ char *startP = RSTRING_PTR(line); /* may not be null terminated */
29
+ long line_len = RSTRING_LEN(line);
30
+ char *endP = startP + line_len; /* points behind the string */
31
+ char *p = startP;
32
+
33
+ char *col_sepP = RSTRING_PTR(col_sep);
34
+ long col_sep_len = RSTRING_LEN(col_sep);
35
+
36
+ char *quoteP = RSTRING_PTR(quote_char);
37
+ char quote_char_val = quoteP[0];
38
+ size_t quote_len = strlen(quoteP);
39
+
40
+ bool col_sep_found = true;
41
+
42
+ VALUE elements = rb_ary_new();
43
+ VALUE field;
44
+ long i;
45
+
46
+ /* Variables for escaped quote handling */
47
+ long backslash_count = 0;
48
+ bool in_quotes = false;
49
+
50
+ /* Optimization 1: maintain count instead of calling RARRAY_LEN repeatedly */
51
+ long element_count = 0;
52
+
53
+ /* Optimization 2: cache max_size value if not nil */
54
+ int max_fields = -1;
55
+ if (max_size != Qnil) {
56
+ max_fields = NUM2INT(max_size);
57
+ if (max_fields < 0) {
58
+ return rb_ary_new(); // Return empty array early
59
+ }
60
+ }
61
+
62
+ bool has_quotes = RTEST(has_quotes_val);
63
+
64
+ /* Fast-path for clean CSV lines: no quotes, 1-character separator */
65
+ if (__builtin_expect(!has_quotes && col_sep_len == 1, 1)) {
66
+ char sep = *col_sepP;
67
+ char *sep_pos = NULL;
68
+
69
+ while ((sep_pos = memchr(p, sep, endP - p))) {
70
+ if ((max_fields >= 0) && (element_count >= max_fields)) {
71
+ break;
72
+ }
73
+
74
+ /* check if only spaces */
75
+ bool only_spaces = true;
76
+ for (char *s = startP; s < sep_pos; s++) {
77
+ if (*s != ' ') {
78
+ only_spaces = false;
79
+ break;
80
+ }
81
+ }
82
+
83
+ field = only_spaces ? Qempty_string : rb_enc_str_new(startP, sep_pos - startP, encoding);
84
+ rb_ary_push(elements, field);
85
+ element_count++;
86
+
87
+ p = sep_pos + 1;
88
+ startP = p;
89
+ }
90
+
91
+ /* check if the last part of the line needs to be processed */
92
+ if ((max_fields < 0) || (element_count < max_fields)) {
93
+ bool only_spaces = true;
94
+ for (char *s = startP; s < endP; s++) {
95
+ if (*s != ' ') {
96
+ only_spaces = false;
97
+ break;
98
+ }
99
+ }
100
+ field = only_spaces ? Qempty_string : rb_enc_str_new(startP, endP - startP, encoding);
101
+ rb_ary_push(elements, field);
102
+ }
103
+
104
+ return elements;
105
+ }
106
+
107
+ /* default path for quoted/multi-separator lines */
108
+ while (p < endP) {
109
+ /* does the remaining string start with col_sep ? */
110
+ col_sep_found = true;
111
+ for(i=0; (i < col_sep_len) && (p+i < endP); i++) {
112
+ col_sep_found = col_sep_found && (*(p+i) == *(col_sepP+i));
113
+ }
114
+ /* if col_sep was found and we're not inside quotes */
115
+ if (col_sep_found && !in_quotes) {
116
+ if ((max_fields >= 0) && (element_count >= max_fields)) {
117
+ break;
118
+ } else {
119
+ long field_len = p - startP;
120
+ char *raw_field = startP;
121
+
122
+ /* quote cleanup inline */
123
+ if (field_len >= 2 && raw_field[0] == quote_char_val && raw_field[field_len - 1] == quote_char_val) {
124
+ raw_field++;
125
+ field_len -= 2;
126
+ }
127
+
128
+ field = rb_enc_str_new(raw_field, field_len, encoding);
129
+ if (strstr(RSTRING_PTR(field), quoteP)) {
130
+ VALUE doubled = rb_str_new_cstr(quoteP);
131
+ rb_str_replace(field, rb_funcall(field, rb_intern("gsub"), 2, rb_str_cat(doubled, quoteP, quote_len), rb_str_new(quoteP, quote_len)));
132
+ }
133
+
134
+ rb_ary_push(elements, field);
135
+ element_count++;
136
+
137
+ p += col_sep_len;
138
+ startP = p;
139
+ backslash_count = 0; // Reset backslash count at the start of a new field
140
+ }
141
+ } else {
142
+ if (*p == '\\') {
143
+ backslash_count++;
144
+ } else {
145
+ if (*p == quote_char_val) {
146
+ if (backslash_count % 2 == 0) {
147
+ /* Even number of backslashes means quote is not escaped */
148
+ in_quotes = !in_quotes;
149
+ }
150
+ /* Else, quote is escaped; do nothing */
151
+ }
152
+ backslash_count = 0; // Reset after any character other than backslash
153
+ }
154
+ p++;
155
+ }
156
+ } /* while */
157
+
158
+ /* Check for unclosed quotes at the end of the line */
159
+ if (in_quotes) {
160
+ rb_raise(eMalformedCSVError, "Unclosed quoted field detected in line: %s", StringValueCStr(line));
161
+ }
162
+
163
+ /* check if the last part of the line needs to be processed */
164
+ if ((max_fields < 0) || (element_count < max_fields)) {
165
+ long field_len = endP - startP;
166
+ char *raw_field = startP;
167
+
168
+ if (field_len >= 2 && raw_field[0] == quote_char_val && raw_field[field_len - 1] == quote_char_val) {
169
+ raw_field++;
170
+ field_len -= 2;
171
+ }
172
+
173
+ field = rb_enc_str_new(raw_field, field_len, encoding);
174
+ if (strstr(RSTRING_PTR(field), quoteP)) {
175
+ VALUE doubled = rb_str_new_cstr(quoteP);
176
+ rb_str_replace(field, rb_funcall(field, rb_intern("gsub"), 2, rb_str_cat(doubled, quoteP, quote_len), rb_str_new(quoteP, quote_len)));
177
+ }
178
+
179
+ rb_ary_push(elements, field);
180
+ }
181
+
182
+ return elements;
183
+ }
184
+
185
+ void Init_smarter_csv(void) {
186
+ // these modules and the error class are already defined in Ruby code, make them accessible:
187
+ SmarterCSV = rb_const_get(rb_cObject, rb_intern("SmarterCSV"));
188
+ Parser = rb_const_get(SmarterCSV, rb_intern("Parser"));
189
+ eMalformedCSVError = rb_const_get(SmarterCSV, rb_intern("MalformedCSV"));
190
+ Qempty_string = rb_str_new_literal("");
191
+ rb_gc_register_address(&Qempty_string);
192
+ rb_define_module_function(Parser, "parse_csv_line_c", rb_parse_csv_line, 5);
193
+ }
@@ -0,0 +1,196 @@
1
+ #include "ruby.h"
2
+ #include "ruby/encoding.h"
3
+ #include <stdio.h>
4
+ #include <stdbool.h>
5
+ #include <string.h>
6
+
7
+ #ifndef bool
8
+ #define bool int
9
+ #define false ((bool)0)
10
+ #define true ((bool)1)
11
+ #endif
12
+
13
+ VALUE SmarterCSV = Qnil; // set in Init_smarter_csv from the top-level SmarterCSV constant
14
+ VALUE eMalformedCSVError = Qnil; // set in Init_smarter_csv from SmarterCSV::MalformedCSV
15
+ VALUE Parser = Qnil; // set in Init_smarter_csv from SmarterCSV::Parser
16
+ VALUE Qempty_string = Qnil; // shared empty string returned for blank/empty fields; NOTE(review): Init_smarter_csv never freezes it, so it is mutable although shared
17
+
18
/*
 * One field located inside the input line. `start` points into the line's
 * own buffer (no copy is made). Member order matters: the parser fills this
 * struct with positional compound literals.
 */
typedef struct ParsedField {
    const char *start;   /* first byte of the field, after any stripped opening quote */
    long length;         /* number of bytes in the field */
    bool needs_unescape; /* true when the field contains a doubled quote character */
} ParsedField;
23
+
24
/*
 * If the field [*start, *start + *len) is enclosed in quote_char, strips the
 * enclosing pair in place (advances *start by one, shrinks *len by two) and
 * returns true; otherwise leaves *start and *len untouched and returns false.
 *
 * *found_double is written on every path: true only when the field was
 * quoted and its stripped content contains two adjacent quote_char bytes.
 * Callers therefore never read a stale or uninitialized flag.
 */
static bool detect_and_strip_quotes(const char **start, long *len, char quote_char, bool *found_double)
{
    const char *s = *start;
    long n = *len;

    *found_double = false;

    if (n < 2 || s[0] != quote_char || s[n - 1] != quote_char) {
        return false;
    }

    s += 1;
    n -= 2;
    *start = s;
    *len = n;

    /* i + 1 < n keeps the look-ahead inside the stripped content. */
    for (long i = 0; i + 1 < n; i++) {
        if (s[i] == quote_char && s[i + 1] == quote_char) {
            *found_double = true;
            break;
        }
    }
    return true;
}
39
+
40
+ static VALUE unescape_quotes(const char *start, long len, char quote_char, rb_encoding *enc) {
41
+ char *buf = ALLOC_N(char, len);
42
+ long j = 0;
43
+
44
+ for (long i = 0; i < len; i++) {
45
+ if (start[i] == quote_char && i + 1 < len && start[i + 1] == quote_char) {
46
+ buf[j++] = quote_char;
47
+ i++;
48
+ } else {
49
+ buf[j++] = start[i];
50
+ }
51
+ }
52
+
53
+ VALUE str = rb_enc_str_new(buf, j, enc);
54
+ xfree(buf);
55
+ return str;
56
+ }
57
+
58
+ static VALUE rb_parse_csv_line(VALUE self, VALUE line, VALUE col_sep, VALUE quote_char, VALUE max_size, VALUE has_quotes_val) {
59
+ if (RB_TYPE_P(line, T_NIL)) return rb_ary_new();
60
+ if (!RB_TYPE_P(line, T_STRING)) rb_raise(rb_eTypeError, "ERROR in SmarterCSV.parse_line: line has to be a string or nil");
61
+
62
+ rb_encoding *encoding = rb_enc_get(line);
63
+ char *startP = RSTRING_PTR(line);
64
+ long line_len = RSTRING_LEN(line);
65
+ char *endP = startP + line_len;
66
+ char *p = startP;
67
+
68
+ char *col_sepP = RSTRING_PTR(col_sep);
69
+ long col_sep_len = RSTRING_LEN(col_sep);
70
+
71
+ char *quoteP = RSTRING_PTR(quote_char);
72
+ char quote_char_val = quoteP[0];
73
+ size_t quote_len = strlen(quoteP);
74
+
75
+ VALUE elements = rb_ary_new();
76
+
77
+ long backslash_count = 0;
78
+ bool in_quotes = false;
79
+ long element_count = 0;
80
+
81
+ int max_fields = -1;
82
+ if (max_size != Qnil) {
83
+ max_fields = NUM2INT(max_size);
84
+ if (max_fields < 0) return rb_ary_new();
85
+ }
86
+
87
+ bool has_quotes = RTEST(has_quotes_val);
88
+
89
+ if (__builtin_expect(!has_quotes && col_sep_len == 1, 1)) {
90
+ char sep = *col_sepP;
91
+ char *sep_pos = NULL;
92
+
93
+ while ((sep_pos = memchr(p, sep, endP - p))) {
94
+ if ((max_fields >= 0) && (element_count >= max_fields)) break;
95
+
96
+ bool only_spaces = true;
97
+ for (char *s = startP; s < sep_pos; s++) {
98
+ if (*s != ' ') { only_spaces = false; break; }
99
+ }
100
+
101
+ VALUE field = only_spaces ? Qempty_string : rb_enc_str_new(startP, sep_pos - startP, encoding);
102
+ rb_ary_push(elements, field);
103
+ element_count++;
104
+
105
+ p = sep_pos + 1;
106
+ startP = p;
107
+ }
108
+
109
+ if ((max_fields < 0) || (element_count < max_fields)) {
110
+ bool only_spaces = true;
111
+ for (char *s = startP; s < endP; s++) {
112
+ if (*s != ' ') { only_spaces = false; break; }
113
+ }
114
+ VALUE field = only_spaces ? Qempty_string : rb_enc_str_new(startP, endP - startP, encoding);
115
+ rb_ary_push(elements, field);
116
+ }
117
+
118
+ return elements;
119
+ }
120
+
121
+ ParsedField *fields = ALLOC_N(ParsedField, 128);
122
+ long field_count = 0;
123
+ char *field_start = startP;
124
+
125
+ while (p < endP) {
126
+ bool col_sep_found = (p + col_sep_len <= endP) && (memcmp(p, col_sepP, col_sep_len) == 0);
127
+
128
+ if (col_sep_found && !in_quotes) {
129
+ if ((max_fields >= 0) && (field_count >= max_fields)) break;
130
+
131
+ long field_len = p - field_start;
132
+ const char *actual_start = field_start;
133
+ long actual_len = field_len;
134
+ bool needs_unescape = false;
135
+ bool quoted = detect_and_strip_quotes(&actual_start, &actual_len, quote_char_val, &needs_unescape);
136
+ if (!quoted) {
137
+ for (long i = 0; i < actual_len - 1; i++) {
138
+ if (actual_start[i] == quote_char_val && actual_start[i + 1] == quote_char_val) {
139
+ needs_unescape = true;
140
+ break;
141
+ }
142
+ }
143
+ }
144
+
145
+ fields[field_count++] = (ParsedField){ actual_start, actual_len, needs_unescape };
146
+ p += col_sep_len;
147
+ field_start = p;
148
+ backslash_count = 0;
149
+ } else {
150
+ if (*p == '\\') {
151
+ backslash_count++;
152
+ } else {
153
+ if (*p == quote_char_val && (backslash_count % 2 == 0)) {
154
+ in_quotes = !in_quotes;
155
+ }
156
+ backslash_count = 0;
157
+ }
158
+ p++;
159
+ }
160
+ }
161
+
162
+ if (in_quotes) {
163
+ xfree(fields);
164
+ rb_raise(eMalformedCSVError, "Unclosed quoted field detected in line: %s", StringValueCStr(line));
165
+ }
166
+
167
+ if ((max_fields < 0) || (field_count < max_fields)) {
168
+ long field_len = endP - field_start;
169
+ const char *actual_start = field_start;
170
+ long actual_len = field_len;
171
+ bool needs_unescape = false;
172
+ detect_and_strip_quotes(&actual_start, &actual_len, quote_char_val, &needs_unescape);
173
+
174
+ fields[field_count++] = (ParsedField){ actual_start, actual_len, needs_unescape };
175
+ }
176
+
177
+ for (long i = 0; i < field_count; i++) {
178
+ ParsedField f = fields[i];
179
+ VALUE field = f.length == 0 ? Qempty_string :
180
+ (f.needs_unescape ? unescape_quotes(f.start, f.length, quote_char_val, encoding)
181
+ : rb_enc_str_new(f.start, f.length, encoding));
182
+ rb_ary_push(elements, field);
183
+ }
184
+
185
+ xfree(fields);
186
+ return elements;
187
+ }
188
+
189
+ void Init_smarter_csv(void) {
190
+ SmarterCSV = rb_const_get(rb_cObject, rb_intern("SmarterCSV"));
191
+ Parser = rb_const_get(SmarterCSV, rb_intern("Parser"));
192
+ eMalformedCSVError = rb_const_get(SmarterCSV, rb_intern("MalformedCSV"));
193
+ Qempty_string = rb_str_new_literal("");
194
+ rb_gc_register_address(&Qempty_string);
195
+ rb_define_module_function(Parser, "parse_csv_line_c", rb_parse_csv_line, 5);
196
+ }
@@ -2,6 +2,8 @@
2
2
 
3
3
  module SmarterCSV
4
4
  module Parser
5
+ EMPTY_STRING = ''.freeze
6
+
5
7
  protected
6
8
 
7
9
  ###
@@ -11,17 +13,16 @@ module SmarterCSV
11
13
  ###
12
14
  def parse(line, options, header_size = nil)
13
15
  # puts "SmarterCSV.parse OPTIONS: #{options[:acceleration]}" if options[:verbose]
16
+ has_quotes = line.include?(options[:quote_char])
14
17
 
15
18
  if options[:acceleration] && has_acceleration
16
19
  # :nocov:
17
- has_quotes = line =~ /#{options[:quote_char]}/
18
- elements = parse_csv_line_c(line, options[:col_sep], options[:quote_char], header_size)
19
- elements.map!{|x| cleanup_quotes(x, options[:quote_char])} if has_quotes
20
+ elements = parse_csv_line_c(line, options[:col_sep], options[:quote_char], header_size, has_quotes, options[:strip_whitespace])
20
21
  [elements, elements.size]
21
22
  # :nocov:
22
23
  else
23
24
  # puts "WARNING: SmarterCSV is using un-accelerated parsing of lines. Check options[:acceleration]"
24
- parse_csv_line_ruby(line, options, header_size)
25
+ parse_csv_line_ruby(line, options, header_size, has_quotes)
25
26
  end
26
27
  end
27
28
 
@@ -46,7 +47,7 @@ module SmarterCSV
46
47
  #
47
48
  # Our convention is that empty fields are returned as empty strings, not as nil.
48
49
 
49
- def parse_csv_line_ruby(line, options, header_size = nil)
50
+ def parse_csv_line_ruby(line, options, header_size = nil, has_quotes = false)
50
51
  return [[], 0] if line.nil?
51
52
 
52
53
  line_size = line.size
@@ -98,11 +99,13 @@ module SmarterCSV
98
99
  elements << cleanup_quotes(line[start..-1], quote)
99
100
  end
100
101
 
102
+ elements.map!(&:strip) if options[:strip_whitespace]
101
103
  [elements, elements.size]
102
104
  end
103
105
 
104
106
  def cleanup_quotes(field, quote)
105
- return field if field.nil?
107
+ return nil if field.nil?
108
+ return EMPTY_STRING if field.empty?
106
109
 
107
110
  # Remove surrounding quotes if present
108
111
  if field.start_with?(quote) && field.end_with?(quote)
@@ -110,9 +113,13 @@ module SmarterCSV
110
113
  end
111
114
 
112
115
  # Replace double quotes with a single quote
113
- field.gsub!((quote * 2).to_s, quote)
116
+ field.gsub!(doubled_quote(quote), quote)
114
117
 
115
118
  field
116
119
  end
120
+
121
# Returns the frozen string made of +quote+ repeated twice.
#
# Results are memoized per quote character: a single cached value would be
# handed back unchanged to a later call that passes a different quote_char.
def doubled_quote(quote)
  @doubled_quotes ||= {}
  @doubled_quotes[quote] ||= (quote * 2).to_s.freeze
end
117
124
  end
118
125
  end
@@ -128,6 +128,7 @@ module SmarterCSV
128
128
  line.chomp!(options[:row_sep])
129
129
 
130
130
  # --- SPLIT LINE & DATA TRANSFORMATIONS ------------------------------------------------------------
131
+ # we are now stripping whitespace inside the parse() methods
131
132
  dataA, data_size = parse(line, options) # we parse the extra columns
132
133
 
133
134
  if options[:strict]
@@ -141,8 +142,6 @@ module SmarterCSV
141
142
  end
142
143
  end
143
144
 
144
- dataA.map!{|x| x.strip} if options[:strip_whitespace]
145
-
146
145
  # if all values are blank, then ignore this line
147
146
  next if options[:remove_empty_hashes] && (dataA.empty? || blank?(dataA))
148
147
 
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module SmarterCSV
4
- VERSION = "1.14.2"
4
+ VERSION = "1.14.3"
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: smarter_csv
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.14.2
4
+ version: 1.14.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Tilo Sloboda
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2025-04-10 00:00:00.000000000 Z
11
+ date: 2025-05-05 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: awesome_print
@@ -127,8 +127,20 @@ files:
127
127
  - docs/options.md
128
128
  - docs/row_col_sep.md
129
129
  - docs/value_converters.md
130
+ - ext/smarter_csv/Makefile
130
131
  - ext/smarter_csv/extconf.rb
131
132
  - ext/smarter_csv/smarter_csv.c
133
+ - ext/smarter_csv/smarter_csv.c.works
134
+ - ext/smarter_csv/smarter_csv.c.works10
135
+ - ext/smarter_csv/smarter_csv.c.works11
136
+ - ext/smarter_csv/smarter_csv.c.works14
137
+ - ext/smarter_csv/smarter_csv.c.works15
138
+ - ext/smarter_csv/smarter_csv.c.works2
139
+ - ext/smarter_csv/smarter_csv.c.works4
140
+ - ext/smarter_csv/smarter_csv.c.works5
141
+ - ext/smarter_csv/smarter_csv.c.works7
142
+ - ext/smarter_csv/smarter_csv.c.works8
143
+ - ext/smarter_csv/smarter_csv.c.works9
132
144
  - lib/smarter_csv.rb
133
145
  - lib/smarter_csv/auto_detection.rb
134
146
  - lib/smarter_csv/errors.rb