vfcsv 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,476 @@
1
+ use magnus::{function, prelude::*, Error, RArray, RHash, RString, Ruby};
2
+
3
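// SIMD-accelerated CSV parser
//
// Two-stage architecture inspired by simdjson:
// 1. Stage 1: Find structural characters (commas, quotes, newlines) using SIMD
// 2. Stage 2: Extract fields based on structural indices
//
// (Plain `//` comments on purpose: an outer `///` doc comment here would be
// attached to the next item, `CsvConfig`, instead of describing the file.)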
8
+
9
/// Delimiter settings consulted by every parser implementation in this file.
///
/// Both settings are single bytes, so only one-byte separators and quote
/// characters can be expressed.
struct CsvConfig {
    /// Byte that separates two fields within a row.
    col_sep: u8,
    /// Byte that opens and closes a quoted field.
    quote_char: u8,
}

impl Default for CsvConfig {
    /// Comma-separated fields quoted with double quotes.
    fn default() -> Self {
        CsvConfig {
            quote_char: b'"',
            col_sep: b',',
        }
    }
}
23
+
24
+ /// Main CSV parser - dispatches to SIMD or portable implementation
25
+ fn parse(csv: RString, col_sep: RString, quote_char: RString) -> Result<RArray, Error> {
26
+ let ruby = Ruby::get().unwrap();
27
+
28
+ let input = unsafe { csv.as_slice() };
29
+
30
+ // Get config from Ruby strings
31
+ let col_sep_bytes = unsafe { col_sep.as_slice() };
32
+ let quote_bytes = unsafe { quote_char.as_slice() };
33
+
34
+ let config = CsvConfig {
35
+ col_sep: col_sep_bytes.first().copied().unwrap_or(b','),
36
+ quote_char: quote_bytes.first().copied().unwrap_or(b'"'),
37
+ };
38
+
39
+ // Parse CSV
40
+ let rows = parse_csv(input, &config);
41
+
42
+ // Convert to Ruby arrays
43
+ let result = ruby.ary_new_capa(rows.len());
44
+ for row in rows {
45
+ let rb_row = ruby.ary_new_capa(row.len());
46
+ for field in row {
47
+ rb_row.push(ruby.str_new(&field))?;
48
+ }
49
+ result.push(rb_row)?;
50
+ }
51
+
52
+ Ok(result)
53
+ }
54
+
55
+ /// Core CSV parsing logic
56
+ #[inline]
57
+ fn parse_csv(input: &[u8], config: &CsvConfig) -> Vec<Vec<String>> {
58
+ if input.is_empty() {
59
+ return Vec::new();
60
+ }
61
+
62
+ // Use SIMD-accelerated parsing on supported platforms
63
+ #[cfg(target_arch = "aarch64")]
64
+ {
65
+ parse_csv_neon(input, config)
66
+ }
67
+
68
+ #[cfg(target_arch = "x86_64")]
69
+ {
70
+ if is_x86_feature_detected!("avx2") {
71
+ parse_csv_avx2(input, config)
72
+ } else {
73
+ parse_csv_portable(input, config)
74
+ }
75
+ }
76
+
77
+ #[cfg(not(any(target_arch = "aarch64", target_arch = "x86_64")))]
78
+ {
79
+ parse_csv_portable(input, config)
80
+ }
81
+ }
82
+
83
/// NEON-accelerated CSV parsing for ARM64 (Apple Silicon)
///
/// Walks `input` once. While at least 16 bytes remain, a NEON comparison is
/// used to jump straight to the next byte that can change parser state:
/// * inside a quoted field - the next `quote_char`;
/// * outside quotes - the next separator, `\n`, `\r` or `quote_char`.
///
/// The byte found that way (or, within the final 15 bytes, simply the next
/// byte) is then fed through a single scalar state machine that follows the
/// same branch order as `parse_csv_portable`.
///
/// Returns one `Vec<String>` per row. Rows end at `\n`, `\r\n` or a bare `\r`;
/// a doubled quote inside a quoted field is an escaped quote.
///
/// Note: for the degenerate configuration `col_sep == quote_char` the byte is
/// treated as a separator unless it sits at the start of a field, which is
/// what the portable parser does as well.
#[cfg(target_arch = "aarch64")]
fn parse_csv_neon(input: &[u8], config: &CsvConfig) -> Vec<Vec<String>> {
    use std::arch::aarch64::*;

    // Bytes examined per NEON load.
    const CHUNK: usize = 16;
    // Initial capacity reserved for the field list of each row.
    const ROW_CAPACITY: usize = 16;

    let col_sep = config.col_sep;
    let quote_char = config.quote_char;
    let len = input.len();

    let mut rows: Vec<Vec<String>> = Vec::with_capacity(len / 50); // Estimate rows
    let mut current_row: Vec<String> = Vec::with_capacity(ROW_CAPACITY);
    let mut field_start: usize = 0;
    let mut in_quotes = false;
    let mut i = 0;

    // The comparison operands never change, so build them once up front.
    // SAFETY: these intrinsics only broadcast a byte value into a register;
    // no memory is accessed.
    let (sep_vec, quote_vec, lf_vec, cr_vec) = unsafe {
        (
            vdupq_n_u8(col_sep),
            vdupq_n_u8(quote_char),
            vdupq_n_u8(b'\n'),
            vdupq_n_u8(b'\r'),
        )
    };

    while i < len {
        // Fast path: skip over bytes that cannot affect the parser state.
        if i + CHUNK <= len {
            // SAFETY: `i + CHUNK <= len`, so the 16-byte load reads
            // `input[i..i + 16]` and stays inside the slice. The remaining
            // intrinsics and `neon_movemask` operate on register values.
            let mask = unsafe {
                let chunk = vld1q_u8(input.as_ptr().add(i));
                let hits = if in_quotes {
                    // Inside quotes only a quote character is interesting.
                    vceqq_u8(chunk, quote_vec)
                } else {
                    vorrq_u8(
                        vorrq_u8(vceqq_u8(chunk, sep_vec), vceqq_u8(chunk, lf_vec)),
                        vorrq_u8(vceqq_u8(chunk, cr_vec), vceqq_u8(chunk, quote_vec)),
                    )
                };
                neon_movemask(hits)
            };

            if mask == 0 {
                i += CHUNK;
                continue;
            }
            // Lowest set bit = first interesting byte within the chunk.
            i += mask.trailing_zeros() as usize;
        }

        let byte = input[i];

        if in_quotes {
            if byte != quote_char {
                i += 1;
            } else if i + 1 < len && input[i + 1] == quote_char {
                // Escaped quote ("") - stay inside the quoted field.
                i += 2;
            } else {
                // End of quoted field.
                in_quotes = false;
                i += 1;
            }
            continue;
        }

        if byte == quote_char && i == field_start {
            // Start of quoted field: the field content begins after the quote.
            in_quotes = true;
            field_start = i + 1;
        } else if byte == col_sep {
            // End of field.
            current_row.push(extract_field(input, field_start, i, quote_char));
            field_start = i + 1;
        } else if byte == b'\n' {
            // End of row; do not include a directly preceding '\r' in the field.
            let end_pos = if i > 0 && input[i - 1] == b'\r' { i - 1 } else { i };
            current_row.push(extract_field(input, field_start, end_pos, quote_char));
            rows.push(std::mem::replace(
                &mut current_row,
                Vec::with_capacity(ROW_CAPACITY),
            ));
            field_start = i + 1;
        } else if byte == b'\r' {
            // End of row for both "\r\n" and a bare '\r'.
            current_row.push(extract_field(input, field_start, i, quote_char));
            rows.push(std::mem::replace(
                &mut current_row,
                Vec::with_capacity(ROW_CAPACITY),
            ));
            if i + 1 < len && input[i + 1] == b'\n' {
                // Swallow the '\n' of a "\r\n" pair.
                i += 1;
            }
            field_start = i + 1;
        }
        // Any other byte (including a quote in the middle of an unquoted
        // field) is ordinary field content.
        i += 1;
    }

    // Handle last field if any
    // Note: field_start == len means there's an empty trailing field (after trailing comma)
    if field_start <= len && (!current_row.is_empty() || field_start < len) {
        current_row.push(extract_field(input, field_start, len, quote_char));
    }
    if !current_row.is_empty() {
        rows.push(current_row);
    }

    rows
}
279
+
280
/// Collapse a NEON byte-wise comparison result into a 16-bit mask.
///
/// For the all-ones / all-zeros lanes produced by a comparison such as
/// `vceqq_u8`, bit `k` of the result is set exactly when byte lane `k` of `v`
/// is all-ones.
///
/// # Safety
/// The body consists of `std::arch::aarch64` intrinsics; the only memory it
/// reads is the fixed 16-byte weight table defined inside the function.
#[cfg(target_arch = "aarch64")]
#[inline]
unsafe fn neon_movemask(v: std::arch::aarch64::uint8x16_t) -> u16 {
    use std::arch::aarch64::*;

    // Lane k of each 8-lane half carries the weight 1 << (k % 8), so the bytes
    // of one half add up to a value with one distinct bit per matching lane.
    static LANE_WEIGHTS: [u8; 16] = [1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128];

    let weighted = vandq_u8(v, vld1q_u8(LANE_WEIGHTS.as_ptr()));

    // Three widening pairwise additions fold each half into one 64-bit lane.
    let folded = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(weighted)));

    let low_bits = vgetq_lane_u64(folded, 0) as u8;
    let high_bits = vgetq_lane_u64(folded, 1) as u8;

    u16::from(low_bits) | (u16::from(high_bits) << 8)
}
304
+
305
/// AVX2-accelerated CSV parsing for x86_64
///
/// Currently a placeholder: no AVX2 intrinsics are used here yet. The call is
/// forwarded unchanged to `parse_csv_portable`, so the result is identical to
/// the portable parser's.
#[cfg(target_arch = "x86_64")]
fn parse_csv_avx2(input: &[u8], config: &CsvConfig) -> Vec<Vec<String>> {
    // For now, fall back to portable - AVX2 can be added later
    parse_csv_portable(input, config)
}
311
+
312
+ /// Portable (non-SIMD) CSV parsing - used on x86_64 without AVX2 or other architectures
313
+ #[allow(dead_code)]
314
+ fn parse_csv_portable(input: &[u8], config: &CsvConfig) -> Vec<Vec<String>> {
315
+ let mut rows: Vec<Vec<String>> = Vec::with_capacity(input.len() / 50);
316
+ let mut current_row: Vec<String> = Vec::with_capacity(16);
317
+ let mut field_start: usize = 0;
318
+ let mut in_quotes = false;
319
+ let mut i = 0;
320
+
321
+ let col_sep = config.col_sep;
322
+ let quote_char = config.quote_char;
323
+ let len = input.len();
324
+
325
+ while i < len {
326
+ let byte = input[i];
327
+
328
+ if in_quotes {
329
+ if byte == quote_char {
330
+ if i + 1 < len && input[i + 1] == quote_char {
331
+ // Escaped quote ""
332
+ i += 2;
333
+ } else {
334
+ // End of quoted field
335
+ in_quotes = false;
336
+ i += 1;
337
+ }
338
+ } else {
339
+ i += 1;
340
+ }
341
+ } else {
342
+ if byte == quote_char && i == field_start {
343
+ // Start of quoted field
344
+ in_quotes = true;
345
+ field_start = i + 1;
346
+ i += 1;
347
+ } else if byte == col_sep {
348
+ // Field separator
349
+ let field = extract_field(input, field_start, i, quote_char);
350
+ current_row.push(field);
351
+ field_start = i + 1;
352
+ i += 1;
353
+ } else if byte == b'\n' {
354
+ // End of row
355
+ let end_pos = if i > 0 && input[i - 1] == b'\r' { i - 1 } else { i };
356
+ let field = extract_field(input, field_start, end_pos, quote_char);
357
+ current_row.push(field);
358
+ if !current_row.is_empty() {
359
+ rows.push(std::mem::take(&mut current_row));
360
+ current_row = Vec::with_capacity(16);
361
+ }
362
+ field_start = i + 1;
363
+ i += 1;
364
+ } else if byte == b'\r' {
365
+ // Carriage return - handle \r\n or bare \r
366
+ let field = extract_field(input, field_start, i, quote_char);
367
+ current_row.push(field);
368
+ if !current_row.is_empty() {
369
+ rows.push(std::mem::take(&mut current_row));
370
+ current_row = Vec::with_capacity(16);
371
+ }
372
+ if i + 1 < len && input[i + 1] == b'\n' {
373
+ field_start = i + 2;
374
+ i += 2;
375
+ } else {
376
+ field_start = i + 1;
377
+ i += 1;
378
+ }
379
+ } else {
380
+ i += 1;
381
+ }
382
+ }
383
+ }
384
+
385
+ // Handle last field
386
+ // Note: field_start == len means there's an empty trailing field (after trailing comma)
387
+ if field_start <= len && (!current_row.is_empty() || field_start < len) {
388
+ let end = if len > 0 && input[len - 1] == b'\r' { len - 1 } else { len };
389
+ let field = extract_field(input, field_start, end, quote_char);
390
+ current_row.push(field);
391
+ }
392
+ if !current_row.is_empty() {
393
+ rows.push(current_row);
394
+ }
395
+
396
+ rows
397
+ }
398
+
399
/// Extract a field from the input, handling quoted fields and escaped quotes
///
/// `start..end` is the byte range of the field inside `input`. The field is
/// considered quoted when the byte just before `start` is `quote_char`, i.e.
/// callers pass a `start` that points past the opening quote. For a quoted
/// field:
/// * one trailing `quote_char` (the closing quote) is removed, and
/// * every doubled `quote_char` is collapsed into a single one.
///
/// Unquoted fields are returned verbatim. Bytes that are not valid UTF-8 are
/// replaced with U+FFFD. An empty or inverted range (`start >= end`) yields an
/// empty string.
///
/// # Panics
/// Panics if `start < end` and `end > input.len()`.
#[inline]
fn extract_field(input: &[u8], start: usize, end: usize, quote_char: u8) -> String {
    if start >= end {
        return String::new();
    }

    let raw = &input[start..end];

    // Check if field was quoted (look at character before start)
    let was_quoted = start > 0 && input[start - 1] == quote_char;
    if !was_quoted {
        return String::from_utf8_lossy(raw).into_owned();
    }

    // Drop the closing quote, if the range includes one.
    let body = raw.strip_suffix(&[quote_char]).unwrap_or(raw);

    // Common case: nothing to unescape, convert straight from the input.
    // (`contains` on a byte slice is a plain byte search, same as before.)
    if !body.contains(&quote_char) {
        return String::from_utf8_lossy(body).into_owned();
    }

    // Collapse each doubled quote into one; a lone quote is kept as-is.
    let mut unescaped = Vec::with_capacity(body.len());
    let mut i = 0;
    while i < body.len() {
        let byte = body[i];
        unescaped.push(byte);
        let doubled = byte == quote_char && body.get(i + 1) == Some(&quote_char);
        i += if doubled { 2 } else { 1 };
    }

    // Reuse the buffer when it is valid UTF-8; only fall back to the lossy
    // (copying) conversion for invalid input.
    String::from_utf8(unescaped)
        .unwrap_or_else(|err| String::from_utf8_lossy(err.as_bytes()).into_owned())
}
437
+
438
+ /// Get SIMD capability information
439
+ fn simd_info() -> Result<RHash, Error> {
440
+ let ruby = Ruby::get().unwrap();
441
+ let info = ruby.hash_new();
442
+
443
+ #[cfg(target_arch = "x86_64")]
444
+ {
445
+ info.aset(
446
+ ruby.to_symbol("avx2"),
447
+ std::arch::is_x86_feature_detected!("avx2"),
448
+ )?;
449
+ info.aset(
450
+ ruby.to_symbol("sse42"),
451
+ std::arch::is_x86_feature_detected!("sse4.2"),
452
+ )?;
453
+ info.aset(ruby.to_symbol("arch"), ruby.str_new("x86_64"))?;
454
+ }
455
+
456
+ #[cfg(target_arch = "aarch64")]
457
+ {
458
+ info.aset(ruby.to_symbol("neon"), true)?;
459
+ info.aset(ruby.to_symbol("arch"), ruby.str_new("aarch64"))?;
460
+ }
461
+
462
+ info.aset(ruby.to_symbol("backend"), ruby.str_new("vfcsv-simd"))?;
463
+
464
+ Ok(info)
465
+ }
466
+
467
+ #[magnus::init]
468
+ fn init(ruby: &Ruby) -> Result<(), Error> {
469
+ let class = ruby.define_class("VFCSV", ruby.class_object())?;
470
+ let rust_ext = class.define_module("RustExt")?;
471
+
472
+ rust_ext.define_singleton_method("parse", function!(parse, 3))?;
473
+ rust_ext.define_singleton_method("simd_info", function!(simd_info, 0))?;
474
+
475
+ Ok(())
476
+ }