RubyGems - vfcsv - Versions diffs - 1.0.0 - Mend

vfcsv 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20) hide show

data/ext/vfcsv_rust/src/lib.rs ADDED Viewed

@@ -0,0 +1,476 @@
+use magnus::{function, prelude::*, Error, RArray, RHash, RString, Ruby};
// SIMD-accelerated CSV parser.
//
// Two-stage architecture inspired by simdjson:
// 1. Stage 1: find structural characters (commas, quotes, newlines) using SIMD.
// 2. Stage 2: extract fields based on the structural indices.

/// Delimiter configuration for one parse call.
///
/// Both delimiters are single bytes, so only single-byte separators and quote
/// characters can be expressed.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct CsvConfig {
    /// Byte that separates two fields of the same row.
    col_sep: u8,
    /// Byte that opens and closes a quoted field; written twice inside a
    /// quoted field it stands for one literal quote byte.
    quote_char: u8,
}

impl Default for CsvConfig {
    /// Returns the conventional dialect: `,` as separator and `"` as quote.
    fn default() -> Self {
        Self {
            col_sep: b',',
            quote_char: b'"',
        }
    }
}
+/// Main CSV parser - dispatches to SIMD or portable implementation
+fn parse(csv: RString, col_sep: RString, quote_char: RString) -> Result<RArray, Error> {
+    let ruby = Ruby::get().unwrap();
+    let input = unsafe { csv.as_slice() };
+    // Get config from Ruby strings
+    let col_sep_bytes = unsafe { col_sep.as_slice() };
+    let quote_bytes = unsafe { quote_char.as_slice() };
+    let config = CsvConfig {
+        col_sep: col_sep_bytes.first().copied().unwrap_or(b','),
+        quote_char: quote_bytes.first().copied().unwrap_or(b'"'),
+    };
+    // Parse CSV
+    let rows = parse_csv(input, &config);
+    // Convert to Ruby arrays
+    let result = ruby.ary_new_capa(rows.len());
+    for row in rows {
+        let rb_row = ruby.ary_new_capa(row.len());
+        for field in row {
+            rb_row.push(ruby.str_new(&field))?;
+        }
+        result.push(rb_row)?;
+    }
+    Ok(result)
+}
+/// Core CSV parsing logic
+#[inline]
+fn parse_csv(input: &[u8], config: &CsvConfig) -> Vec<Vec<String>> {
+    if input.is_empty() {
+        return Vec::new();
+    }
+    // Use SIMD-accelerated parsing on supported platforms
+    #[cfg(target_arch = "aarch64")]
+    {
+        parse_csv_neon(input, config)
+    }
+    #[cfg(target_arch = "x86_64")]
+    {
+        if is_x86_feature_detected!("avx2") {
+            parse_csv_avx2(input, config)
+        } else {
+            parse_csv_portable(input, config)
+        }
+    }
+    #[cfg(not(any(target_arch = "aarch64", target_arch = "x86_64")))]
+    {
+        parse_csv_portable(input, config)
+    }
+}
/// NEON-accelerated CSV parsing for ARM64 (Apple Silicon).
///
/// NEON is used only to skip ahead: while at least 16 bytes remain, one
/// vector compare finds the next byte the state machine cares about (only the
/// quote byte inside a quoted field; separator, `\n`, `\r` or quote
/// otherwise). That byte is then handled by the same scalar step that also
/// processes the final, shorter-than-16-byte tail, so a byte is interpreted
/// identically wherever it sits in the input.
///
/// Row/field rules implemented by the scalar step:
/// * a quote byte opens a quoted field only at the very start of a field;
/// * inside quotes a doubled quote is skipped as an escape, a single one
///   closes the quoted section;
/// * `\n`, `\r` and `\r\n` each terminate a row (an empty line produces a row
///   holding one empty field);
/// * at end of input a last field is emitted if the row already has fields or
///   unread bytes remain, which keeps the empty field after a trailing
///   separator and emits nothing after a trailing newline.
///
/// Returns the parsed rows; field text is produced by `extract_field`.
#[cfg(target_arch = "aarch64")]
fn parse_csv_neon(input: &[u8], config: &CsvConfig) -> Vec<Vec<String>> {
    use std::arch::aarch64::*;

    // Number of bytes examined by one NEON load.
    const CHUNK: usize = 16;

    let col_sep = config.col_sep;
    let quote_char = config.quote_char;
    let len = input.len();

    let mut rows: Vec<Vec<String>> = Vec::with_capacity(len / 50); // rough estimate of the row count
    let mut current_row: Vec<String> = Vec::with_capacity(16);
    let mut field_start: usize = 0;
    let mut in_quotes = false;
    let mut i: usize = 0;

    while i < len {
        if i + CHUNK <= len {
            // SAFETY: `i + CHUNK <= len`, so the 16-byte load stays inside
            // `input`. The intrinsics need NEON, which this file treats as
            // always available on aarch64.
            let mask = unsafe {
                let chunk = vld1q_u8(input.as_ptr().add(i));
                let quote_hits = vceqq_u8(chunk, vdupq_n_u8(quote_char));
                let hits = if in_quotes {
                    // Inside quotes only a quote byte can change the state.
                    quote_hits
                } else {
                    vorrq_u8(
                        vorrq_u8(
                            vceqq_u8(chunk, vdupq_n_u8(col_sep)),
                            vceqq_u8(chunk, vdupq_n_u8(b'\n')),
                        ),
                        vorrq_u8(vceqq_u8(chunk, vdupq_n_u8(b'\r')), quote_hits),
                    )
                };
                neon_movemask(hits)
            };
            if mask == 0 {
                // Nothing of interest in this chunk.
                i += CHUNK;
                continue;
            }
            // Bit k of the mask corresponds to byte `i + k`; jump to the first hit.
            i += mask.trailing_zeros() as usize;
        }

        // Scalar state-machine step for the byte at `i`.
        let byte = input[i];
        if in_quotes {
            if byte != quote_char {
                i += 1;
            } else if i + 1 < len && input[i + 1] == quote_char {
                // Escaped quote (""): skip both bytes, stay inside the field.
                i += 2;
            } else {
                // Closing quote.
                in_quotes = false;
                i += 1;
            }
        } else if byte == quote_char && i == field_start {
            // Opening quote: the field content starts after it.
            in_quotes = true;
            field_start = i + 1;
            i += 1;
        } else if byte == col_sep {
            current_row.push(extract_field(input, field_start, i, quote_char));
            field_start = i + 1;
            i += 1;
        } else if byte == b'\n' {
            // Exclude a directly preceding `\r` from the field.
            let end_pos = if i > 0 && input[i - 1] == b'\r' { i - 1 } else { i };
            current_row.push(extract_field(input, field_start, end_pos, quote_char));
            rows.push(std::mem::replace(&mut current_row, Vec::with_capacity(16)));
            field_start = i + 1;
            i += 1;
        } else if byte == b'\r' {
            current_row.push(extract_field(input, field_start, i, quote_char));
            rows.push(std::mem::replace(&mut current_row, Vec::with_capacity(16)));
            // Treat `\r\n` as one terminator, a bare `\r` as a terminator too.
            let skip = if i + 1 < len && input[i + 1] == b'\n' { 2 } else { 1 };
            field_start = i + skip;
            i += skip;
        } else {
            i += 1;
        }
    }

    // Last field: `field_start == len` with a non-empty row means an empty
    // trailing field (input ended right after a separator).
    if field_start <= len && (!current_row.is_empty() || field_start < len) {
        current_row.push(extract_field(input, field_start, len, quote_char));
    }
    if !current_row.is_empty() {
        rows.push(current_row);
    }
    rows
}
/// Collapses a NEON byte-compare result into a 16-bit mask.
///
/// Each input lane is expected to be all-ones or all-zeros (as produced by
/// `vceqq_u8`); bit `k` of the result is set when lane `k` is non-zero in the
/// corresponding weight bit.
///
/// # Safety
/// Uses NEON intrinsics, so it must only run on a CPU with NEON support.
#[cfg(target_arch = "aarch64")]
#[inline]
unsafe fn neon_movemask(v: std::arch::aarch64::uint8x16_t) -> u16 {
    use std::arch::aarch64::*;
    // Lane k keeps only bit (k % 8); the pattern repeats for the upper half.
    let lane_weights: [u8; 16] = [1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128];
    let weighted = vandq_u8(v, vld1q_u8(lane_weights.as_ptr()));
    // Three widening pairwise additions sum each group of eight lanes into one
    // 64-bit lane; the weights are distinct bits, so each sum fits in a byte.
    let halves = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(weighted)));
    let lower = vgetq_lane_u64(halves, 0) as u8;
    let upper = vgetq_lane_u64(halves, 1) as u8;
    ((upper as u16) << 8) | (lower as u16)
}
+/// AVX2-accelerated CSV parsing for x86_64
+#[cfg(target_arch = "x86_64")]
+fn parse_csv_avx2(input: &[u8], config: &CsvConfig) -> Vec<Vec<String>> {
+    // For now, fall back to portable - AVX2 can be added later
+    parse_csv_portable(input, config)
+}
+/// Portable (non-SIMD) CSV parsing - used on x86_64 without AVX2 or other architectures
+#[allow(dead_code)]
+fn parse_csv_portable(input: &[u8], config: &CsvConfig) -> Vec<Vec<String>> {
+    let mut rows: Vec<Vec<String>> = Vec::with_capacity(input.len() / 50);
+    let mut current_row: Vec<String> = Vec::with_capacity(16);
+    let mut field_start: usize = 0;
+    let mut in_quotes = false;
+    let mut i = 0;
+    let col_sep = config.col_sep;
+    let quote_char = config.quote_char;
+    let len = input.len();
+    while i < len {
+        let byte = input[i];
+        if in_quotes {
+            if byte == quote_char {
+                if i + 1 < len && input[i + 1] == quote_char {
+                    // Escaped quote ""
+                    i += 2;
+                } else {
+                    // End of quoted field
+                    in_quotes = false;
+                    i += 1;
+                }
+            } else {
+                i += 1;
+            }
+        } else {
+            if byte == quote_char && i == field_start {
+                // Start of quoted field
+                in_quotes = true;
+                field_start = i + 1;
+                i += 1;
+            } else if byte == col_sep {
+                // Field separator
+                let field = extract_field(input, field_start, i, quote_char);
+                current_row.push(field);
+                field_start = i + 1;
+                i += 1;
+            } else if byte == b'\n' {
+                // End of row
+                let end_pos = if i > 0 && input[i - 1] == b'\r' { i - 1 } else { i };
+                let field = extract_field(input, field_start, end_pos, quote_char);
+                current_row.push(field);
+                if !current_row.is_empty() {
+                    rows.push(std::mem::take(&mut current_row));
+                    current_row = Vec::with_capacity(16);
+                }
+                field_start = i + 1;
+                i += 1;
+            } else if byte == b'\r' {
+                // Carriage return - handle \r\n or bare \r
+                let field = extract_field(input, field_start, i, quote_char);
+                current_row.push(field);
+                if !current_row.is_empty() {
+                    rows.push(std::mem::take(&mut current_row));
+                    current_row = Vec::with_capacity(16);
+                }
+                if i + 1 < len && input[i + 1] == b'\n' {
+                    field_start = i + 2;
+                    i += 2;
+                } else {
+                    field_start = i + 1;
+                    i += 1;
+                }
+            } else {
+                i += 1;
+            }
+        }
+    }
+    // Handle last field
+    // Note: field_start == len means there's an empty trailing field (after trailing comma)
+    if field_start <= len && (!current_row.is_empty() || field_start < len) {
+        let end = if len > 0 && input[len - 1] == b'\r' { len - 1 } else { len };
+        let field = extract_field(input, field_start, end, quote_char);
+        current_row.push(field);
+    }
+    if !current_row.is_empty() {
+        rows.push(current_row);
+    }
+    rows
+}
/// Extracts the field stored in `input[start..end]` as an owned `String`.
///
/// * `start >= end` yields an empty string.
/// * The field counts as quoted when the byte just before `start` is
///   `quote_char` (the parsers place `start` right after the opening quote).
/// * For a quoted field one trailing `quote_char` (the closing quote) is
///   dropped and every doubled `quote_char` is collapsed into a single one.
/// * Unquoted fields are returned unchanged.
///
/// Bytes that are not valid UTF-8 are replaced with U+FFFD.
///
/// # Panics
/// Panics if `start < end` and `end > input.len()`.
#[inline]
fn extract_field(input: &[u8], start: usize, end: usize, quote_char: u8) -> String {
    if start >= end {
        return String::new();
    }
    let raw = &input[start..end];

    // Quoted fields are recognised by the opening quote sitting right before them.
    let was_quoted = start > 0 && input[start - 1] == quote_char;
    if !was_quoted {
        return String::from_utf8_lossy(raw).into_owned();
    }

    // Drop the closing quote if the range still includes it.
    let body = raw.strip_suffix(&[quote_char]).unwrap_or(raw);
    if !body.contains(&quote_char) {
        // Nothing to unescape.
        return String::from_utf8_lossy(body).into_owned();
    }

    // Collapse each doubled quote into one; a lone quote byte is kept as is.
    let mut unescaped = Vec::with_capacity(body.len());
    let mut i = 0;
    while i < body.len() {
        let byte = body[i];
        unescaped.push(byte);
        let doubled = byte == quote_char && body.get(i + 1) == Some(&quote_char);
        i += if doubled { 2 } else { 1 };
    }

    // Reuse the buffer when it is already valid UTF-8; only fall back to the
    // lossy (copying) conversion for invalid input.
    String::from_utf8(unescaped)
        .unwrap_or_else(|err| String::from_utf8_lossy(err.as_bytes()).into_owned())
}
+/// Get SIMD capability information
+fn simd_info() -> Result<RHash, Error> {
+    let ruby = Ruby::get().unwrap();
+    let info = ruby.hash_new();
+    #[cfg(target_arch = "x86_64")]
+    {
+        info.aset(
+            ruby.to_symbol("avx2"),
+            std::arch::is_x86_feature_detected!("avx2"),
+        )?;
+        info.aset(
+            ruby.to_symbol("sse42"),
+            std::arch::is_x86_feature_detected!("sse4.2"),
+        )?;
+        info.aset(ruby.to_symbol("arch"), ruby.str_new("x86_64"))?;
+    }
+    #[cfg(target_arch = "aarch64")]
+    {
+        info.aset(ruby.to_symbol("neon"), true)?;
+        info.aset(ruby.to_symbol("arch"), ruby.str_new("aarch64"))?;
+    }
+    info.aset(ruby.to_symbol("backend"), ruby.str_new("vfcsv-simd"))?;
+    Ok(info)
+}
+#[magnus::init]
+fn init(ruby: &Ruby) -> Result<(), Error> {
+    let class = ruby.define_class("VFCSV", ruby.class_object())?;
+    let rust_ext = class.define_module("RustExt")?;
+    rust_ext.define_singleton_method("parse", function!(parse, 3))?;
+    rust_ext.define_singleton_method("simd_info", function!(simd_info, 0))?;
+    Ok(())
+}