vfcsv 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/vfcsv.rb ADDED
@@ -0,0 +1,568 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "vfcsv/version"
4
+ require "date"
5
+
6
+ # VFCSV - Very Fast CSV Parser
7
+ #
8
+ # Drop-in replacement for Ruby's CSV library with SIMD acceleration.
9
+ # Provides 2-20x faster parsing while maintaining full API compatibility.
10
+ #
11
+ # @example Basic usage (drop-in replacement)
12
+ # # Instead of: require 'csv'
13
+ # require 'vfcsv'
14
+ #
15
+ # # Use VFCSV exactly like CSV
16
+ # VFCSV.parse("a,b,c\n1,2,3")
17
+ # VFCSV.read("data.csv", headers: true)
18
+ # VFCSV.foreach("data.csv") { |row| puts row }
19
+ #
20
+ class VFCSV
21
+ class MalformedCSVError < StandardError; end
22
+
23
+ # Built-in converters matching Ruby's CSV
24
# Built-in field converters, keyed by name.
#
# Every entry is a lambda taking one raw field value. It returns the
# converted object, or the value unchanged when the conversion raises
# ArgumentError or TypeError.
Converters = {
  # Base-10 integer via Kernel#Integer.
  integer: ->(raw) do
    Integer(raw, 10)
  rescue ArgumentError, TypeError
    raw
  end,

  # Floating point number via Kernel#Float.
  float: ->(raw) do
    Float(raw)
  rescue ArgumentError, TypeError
    raw
  end,

  # Integer first, then Float, otherwise the value itself.
  numeric: ->(raw) do
    Integer(raw, 10)
  rescue ArgumentError, TypeError
    begin
      Float(raw)
    rescue ArgumentError, TypeError
      raw
    end
  end,

  # Calendar date via Date.parse.
  date: ->(raw) do
    Date.parse(raw)
  rescue ArgumentError, TypeError
    raw
  end,

  # Date with time via DateTime.parse.
  date_time: ->(raw) do
    DateTime.parse(raw)
  rescue ArgumentError, TypeError
    raw
  end,

  # Integer, then Float, then DateTime; the value itself when all three fail.
  all: ->(raw) do
    Integer(raw, 10)
  rescue ArgumentError, TypeError
    begin
      Float(raw)
    rescue ArgumentError, TypeError
      begin
        DateTime.parse(raw)
      rescue ArgumentError, TypeError
        raw
      end
    end
  end
}.freeze
82
+
83
+ # Built-in header converters matching Ruby's CSV
84
# Built-in header converters, keyed by name. Each lambda takes one header
# String and returns the converted header.
HeaderConverters = {
  # Lower-cases the header.
  downcase: ->(header) { header.downcase },

  # Turns a header into a Symbol the way Ruby's CSV does: downcase, drop
  # characters that are neither whitespace nor word characters, strip
  # leading/trailing whitespace, then replace the remaining whitespace runs
  # with "_". Non-word characters are removed BEFORE whitespace is replaced
  # so that "a - b" becomes :a_b and " Name " becomes :name.
  symbol: ->(header) {
    header.encode(Encoding::UTF_8)
          .downcase
          .gsub(/[^\s\w]+/, "")
          .strip
          .gsub(/\s+/, "_")
          .to_sym
  },

  # Converts the header to a Symbol without altering its text.
  symbol_raw: ->(header) { header.encode(Encoding::UTF_8).to_sym }
}.freeze
94
+
95
+ # Default options matching Ruby's CSV
96
# Option defaults merged underneath the caller-supplied options by the
# parsing and generating entry points.
DEFAULT_OPTIONS = {
  # Delimiters
  col_sep: ",",
  row_sep: :auto,
  quote_char: '"',

  # Field handling
  field_size_limit: nil,
  converters: nil,
  unconverted_fields: nil,

  # Header handling
  headers: false,
  return_headers: false,
  header_converters: nil,

  # Row filtering
  skip_blanks: false,
  skip_lines: nil,

  # Quoting / leniency
  force_quotes: false,
  liberal_parsing: false,
  quote_empty: true,

  # Replacement values
  nil_value: nil,
  empty_value: ""
}.freeze
114
+
115
+ class << self
116
+ # Parse a CSV string into an array of arrays (or Table if headers: true)
117
+ #
118
+ # @param str [String] CSV data to parse
119
+ # @param options [Hash] Parsing options
120
+ # @option options [String] :col_sep Column separator (default: ",")
121
+ # @option options [String] :quote_char Quote character (default: '"')
122
+ # @option options [Boolean] :headers Treat first row as headers (default: false)
123
+ # @option options [Symbol, Array, Proc] :converters Value converters
124
+ # @option options [Symbol, Array, Proc] :header_converters Header converters
125
+ # @option options [Boolean] :skip_blanks Skip blank rows (default: false)
126
+ # @option options [Regexp] :skip_lines Skip lines matching pattern
127
+ # @return [Array<Array<String>>] or [Table] if headers: true
128
+ #
129
+ # @example Parse simple CSV
130
+ # VFCSV.parse("a,b,c\n1,2,3")
131
+ # #=> [["a", "b", "c"], ["1", "2", "3"]]
132
+ #
133
+ # @example Parse with headers
134
+ # VFCSV.parse("a,b,c\n1,2,3", headers: true)
135
+ # #=> #<VFCSV::Table>
136
+ #
137
def parse(str, **options, &block)
  opts = DEFAULT_OPTIONS.merge(options)
  rows = rust_ext.parse(str.to_s, opts[:col_sep].to_s, opts[:quote_char].to_s)

  # Drop :skip_lines matches FIRST, while row index i still lines up with
  # physical line i of the input. Doing this after :skip_blanks (as before)
  # shifted the indices and removed the wrong rows.
  # NOTE(review): a quoted field containing a newline spans several physical
  # lines, so the index mapping is still approximate for such input.
  if (pattern = opts[:skip_lines])
    source_lines = str.to_s.lines
    rows = rows.reject.with_index do |_row, index|
      source_lines[index]&.match?(pattern)
    end
  end

  # Empty fields become :nil_value; a row consisting of one empty field is a
  # blank line and becomes an empty array.
  nil_value = opts[:nil_value]
  rows = rows.map do |row|
    if row.size == 1 && row[0].empty?
      []
    else
      row.map { |field| field.empty? ? nil_value : field }
    end
  end

  if opts[:skip_blanks]
    rows = rows.reject { |row| row.empty? || row.all?(&:nil?) }
  end

  result =
    if opts[:headers] && !rows.empty?
      # First remaining row supplies the headers; the rest become Row objects.
      header_row = apply_header_converters(rows.shift, opts[:header_converters])
      table_rows = rows.map do |row|
        Row.new(header_row, apply_converters(row, opts[:converters]))
      end
      Table.new(table_rows, headers: header_row)
    elsif opts[:converters]
      rows.map { |row| apply_converters(row, opts[:converters]) }
    else
      rows
    end

  # With a block, yield each row and return nil; otherwise return the result.
  return result unless block

  result.each(&block)
  nil
end
202
+
203
+ # Parse a single CSV line
204
+ #
205
+ # @param line [String] Single CSV line
206
+ # @param options [Hash] Parsing options
207
+ # @return [Array<String>] Fields from the line
208
+ #
209
+ # @example
210
+ # VFCSV.parse_line("a,b,c")
211
+ # #=> ["a", "b", "c"]
212
+ #
213
def parse_line(line, **options)
  opts = DEFAULT_OPTIONS.merge(options)
  parsed = rust_ext.parse(line.to_s, opts[:col_sep].to_s, opts[:quote_char].to_s)
  row = parsed.first || []

  # Normalize exactly like parse does, so both entry points agree: a lone
  # empty field means a blank line, and empty fields become :nil_value.
  if row.size == 1 && row[0].empty?
    row = []
  else
    nil_value = opts[:nil_value]
    row = row.map { |field| field.empty? ? nil_value : field }
  end

  row = apply_converters(row, opts[:converters]) if opts[:converters]
  row
end
224
+
225
+ # Read a CSV file
226
+ #
227
+ # @param path [String] Path to CSV file
228
+ # @param options [Hash] Parsing options (same as parse)
229
+ # @return [Array<Array<String>>] or [Table] if headers: true
230
+ #
231
+ # @example
232
+ # VFCSV.read("data.csv")
233
+ # VFCSV.read("data.csv", headers: true)
234
+ #
235
def read(path, **options)
  # Load the whole file, then hand the text to parse.
  contents = File.read(path)
  parse(contents, **options)
end
238
+
239
+ # Alias for read
240
# Same as read: parses the file at +path+ with the given options and
# returns whatever read returns.
def readlines(path, **options)
  rows = read(path, **options)
  rows
end
243
+
244
+ # Iterate over a CSV file row by row
245
+ #
246
+ # @param path [String] Path to CSV file
247
+ # @param mode [String] File open mode (ignored, for compatibility)
248
+ # @param options [Hash] Parsing options
249
+ # @yield [Array<String>] or [Row] Each row
250
+ # @return [Enumerator] if no block given
251
+ #
252
+ # @example
253
+ # VFCSV.foreach("data.csv") { |row| puts row.inspect }
254
+ # VFCSV.foreach("data.csv", headers: true) { |row| puts row["name"] }
255
+ #
256
def foreach(path, mode = "r", **options, &block)
  if block
    # The file is read in full and parsed; parse yields each row to the block.
    parse(File.read(path), **options, &block)
  else
    # No block: hand back an Enumerator that re-invokes foreach lazily.
    enum_for(:foreach, path, mode, **options)
  end
end
261
+
262
+ # Generate CSV string from data
263
+ #
264
+ # @param str [String, nil] Optional string to append to
265
+ # @param options [Hash] Generation options
266
+ # @yield [VFCSV] CSV generator
267
+ # @return [String] Generated CSV
268
+ #
269
+ # @example
270
+ # VFCSV.generate do |csv|
271
+ # csv << ["a", "b", "c"]
272
+ # csv << [1, 2, 3]
273
+ # end
274
+ # #=> "a,b,c\n1,2,3\n"
275
+ #
276
def generate(str = nil, **options)
  # Seed the generator with the given prefix (or an empty string), let the
  # caller append rows through the block, then return the accumulated text.
  Generator
    .new(str || "", DEFAULT_OPTIONS.merge(options))
    .tap { |builder| yield builder if block_given? }
    .to_s
end
282
+
283
+ # Generate a single CSV line
284
+ #
285
+ # @param row [Array] Fields to generate
286
+ # @param options [Hash] Generation options
287
+ # @return [String] CSV line
288
+ #
289
+ # @example
290
+ # VFCSV.generate_line(["a", "b", "c"])
291
+ # #=> "a,b,c\n"
292
+ #
293
def generate_line(row, **options)
  # Caller options win over the defaults; Generator does the formatting.
  Generator.generate_line(row, **DEFAULT_OPTIONS.merge(options))
end
297
+
298
+ # Generate multiple CSV lines
299
+ #
300
+ # @param rows [Array<Array>] Rows to generate
301
+ # @param options [Hash] Generation options
302
+ # @return [String] CSV string
303
+ #
304
def generate_lines(rows, **options)
  # Append every row to one generator and return the combined text.
  generate(**options) { |builder| rows.each { |fields| builder << fields } }
end
309
+
310
+ # Read CSV as a table with headers
311
+ #
312
+ # @param path [String] Path to CSV file
313
+ # @param options [Hash] Parsing options
314
+ # @return [Table] Table object
315
+ #
316
def table(path, **options)
  # headers: true is only a default; an explicit :headers in options wins.
  merged = { headers: true }.merge(options)
  read(path, **merged)
end
319
+
320
+ # Open a CSV file for reading or writing
321
+ #
322
+ # @param path [String] Path to CSV file
323
+ # @param mode [String] File open mode ("r", "w", "a", etc.)
324
+ # @param options [Hash] CSV options
325
+ # @yield [VFCSV] CSV instance
326
+ # @return [Object] Result of block, or VFCSV instance
327
+ #
328
def open(path, mode = "r", **options, &block)
  writing = mode.include?("w") || mode.include?("a")

  # Read modes: stream rows to the block, or return the parsed file.
  unless writing
    return block ? foreach(path, mode, **options, &block) : read(path, **options)
  end

  # Write/append modes: without a block the caller owns (and must close)
  # the writer; with a block it is closed even when the block raises.
  writer = Writer.new(path, mode, options)
  return writer unless block

  begin
    yield writer
  ensure
    writer.close
  end
end
350
+
351
+ # Get or create a CSV instance (for compatibility)
352
+ #
353
+ # @param data [String, IO] CSV data source
354
+ # @param options [Hash] CSV options
355
+ # @return [VFCSV::Instance]
356
+ #
357
def instance(data = nil, **options)
  # A fresh wrapper is built on every call; nothing is cached here.
  wrapper = Instance.new(data, **options)
  wrapper
end
360
+
361
+ # Filter CSV input to output (compatibility method)
362
# Parses +input+, optionally maps every row through the given block, and
# writes the regenerated CSV to +output+.
#
# @param input [#read, #to_s] source; read via #read when available
# @param output [#write, Object] destination; skipped unless it responds to #write
# @param options [Hash] passed to both parse and generate
# @yield [row] each parsed row; the block's return value replaces the row,
#   and a nil/false return drops it
# @return [String] the generated CSV text
def filter(input = $stdin, output = $stdout, **options)
  source = input.respond_to?(:read) ? input.read : input.to_s
  parsed = parse(source, **options)

  emitted = block_given? ? parsed.map { |row| yield row } : parsed

  rendered = generate(**options) do |csv|
    emitted.each do |row|
      next unless row

      csv << row
    end
  end

  output.write(rendered) if output.respond_to?(:write)
  rendered
end
383
+
384
+ # Get SIMD information
385
# Returns whatever the native extension reports from its simd_info call.
def simd_info
  extension = rust_ext
  extension.simd_info
end
388
+
389
+ private
390
+
391
# Lazily loads the native extension on first use and memoizes the RustExt
# module that the parsing entry points call into.
def rust_ext
  return @rust_ext if @rust_ext

  require_relative "vfcsv/vfcsv_rust"
  @rust_ext = RustExt
end
397
+
398
# Runs the field converters over every value of +row+.
#
# @param row [Array] field values of one row
# @param converters [Symbol, Proc, Array, nil] converter specification
# @return [Array] the row itself when there is nothing to apply, otherwise
#   a new array of converted values
def apply_converters(row, converters)
  return row if converters.nil?

  converter_procs = normalize_converters(converters, Converters)
  return row if converter_procs.empty?

  row.map do |value|
    converter_procs.each do |converter|
      value = converter.call(value)
      # Stop as soon as a converter produced a non-String, as Ruby's CSV
      # does. Without this, [:integer, :float] turned "12" into 12.0 because
      # the Integer was fed on to the float converter.
      break unless value.is_a?(String)
    end
    value
  end
end
410
+
411
# Runs the header converters over every header of the header row.
#
# @param headers [Array<String, nil>] raw header row
# @param converters [Symbol, Proc, Array, nil] converter specification
# @return [Array] the headers themselves when there is nothing to apply,
#   otherwise a new array of converted headers
def apply_header_converters(headers, converters)
  return headers if converters.nil?

  converter_procs = normalize_converters(converters, HeaderConverters)
  return headers if converter_procs.empty?

  headers.map do |header|
    # An empty header cell may arrive here as nil; the built-in converters
    # call String methods and would raise NoMethodError, so leave it as is.
    next header if header.nil?

    converter_procs.reduce(header) { |current, converter| converter.call(current) }
  end
end
423
+
424
# Flattens a converter specification into an array of callables.
#
# @param converters [Symbol, Proc, Array, nil, Object] specification; a
#   Symbol is looked up in +builtin_hash+, Arrays are expanded recursively
# @param builtin_hash [Hash{Symbol => Proc}] named converters
# @return [Array<Proc>] resolved converters; unknown names and unsupported
#   values contribute nothing
def normalize_converters(converters, builtin_hash)
  return [converters] if converters.is_a?(Proc)

  if converters.is_a?(Symbol)
    found = builtin_hash[converters]
    return found.nil? ? [] : [found]
  end

  return [] unless converters.is_a?(Array)

  converters.flat_map { |entry| normalize_converters(entry, builtin_hash) }
end
438
+ end
439
+
440
+ # Generator for building CSV strings
441
# Accumulates CSV text row by row in an in-memory String.
class Generator
  # @param str [String] initial content; it is duplicated, never mutated
  # @param options [Hash] :col_sep, :quote_char, :row_sep, :force_quotes,
  #   :quote_empty (missing or nil entries fall back to the defaults below)
  def initialize(str, options)
    @output = str.dup
    @col_sep = options[:col_sep] || ","
    @quote_char = options[:quote_char] || '"'
    @row_sep = options[:row_sep] == :auto ? "\n" : (options[:row_sep] || "\n")
    @force_quotes = options[:force_quotes] || false
    @quote_empty = options.fetch(:quote_empty, true)
  end

  # Appends one row.
  #
  # @param row [#map] fields of the row
  # @return [Generator] self, so calls can be chained
  def <<(row)
    @output << self.class.generate_line(row,
      col_sep: @col_sep,
      quote_char: @quote_char,
      row_sep: @row_sep,
      force_quotes: @force_quotes,
      quote_empty: @quote_empty
    )
    self
  end
  alias_method :add_row, :<<
  alias_method :puts, :<<

  # @return [String] everything generated so far
  def to_s
    @output
  end

  # Formats one row as a CSV line, including the row separator.
  #
  # @param row [#map] fields; each is stringified with #to_s
  # @param options [Hash] same keys as #initialize
  # @return [String] the CSV line
  def self.generate_line(row, **options)
    col_sep = options[:col_sep] || ","
    quote_char = options[:quote_char] || '"'
    row_sep = options[:row_sep]
    row_sep = "\n" if row_sep.nil? || row_sep == :auto
    force_quotes = options[:force_quotes] || false
    quote_empty = options.fetch(:quote_empty, true)

    fields = row.map do |field|
      # As in Ruby's CSV, nil is written as an empty UNQUOTED field so that
      # it stays distinguishable from the empty string (written as "").
      # With force_quotes every field is quoted, nil included.
      next "" if field.nil? && !force_quotes

      field_str = field.to_s
      if force_quotes || needs_quoting?(field_str, col_sep, quote_char) || (quote_empty && field_str.empty?)
        quote_field(field_str, quote_char)
      else
        field_str
      end
    end

    "#{fields.join(col_sep)}#{row_sep}"
  end

  # True when the field contains the column separator, the quote character,
  # or a line break and therefore has to be quoted.
  def self.needs_quoting?(str, col_sep, quote_char)
    str.include?(col_sep) || str.include?(quote_char) || str.include?("\n") || str.include?("\r")
  end

  # Wraps the field in quote characters, doubling any embedded ones.
  def self.quote_field(str, quote_char)
    escaped = str.gsub(quote_char, quote_char + quote_char)
    "#{quote_char}#{escaped}#{quote_char}"
  end
end
497
+
498
+ # Writer for streaming CSV to files
499
# Writes CSV rows straight to a file opened in the given mode.
class Writer
  # @param path [String] file to open
  # @param mode [String] mode string handed to File.open
  # @param options [Hash] generation options, merged over the defaults
  def initialize(path, mode, options)
    # Resolve the options before opening the file so that a bad options
    # argument cannot leave an open file handle behind.
    @options = VFCSV::DEFAULT_OPTIONS.merge(options)
    @col_sep = @options[:col_sep] || ","
    @quote_char = @options[:quote_char] || '"'
    # Previously dropped, so a custom :row_sep never reached the output.
    # nil / :auto are resolved to "\n" by Generator.generate_line.
    @row_sep = @options[:row_sep]
    @force_quotes = @options[:force_quotes] || false
    @quote_empty = @options.fetch(:quote_empty, true)
    @file = File.open(path, mode)
  end

  # Formats one row and writes it to the file.
  #
  # @param row [#map] fields of the row
  # @return [Writer] self, so calls can be chained
  def <<(row)
    @file.write(Generator.generate_line(row,
      col_sep: @col_sep,
      quote_char: @quote_char,
      row_sep: @row_sep,
      force_quotes: @force_quotes,
      quote_empty: @quote_empty
    ))
    self
  end
  alias_method :add_row, :<<
  alias_method :puts, :<<

  # Closes the underlying file.
  def close
    @file.close
  end
end
525
+
526
+ # Instance wrapper for stateful CSV operations
527
# Wraps a data source plus options and parses it on first access.
class Instance
  include Enumerable

  # @param data [#read, #to_s, nil] CSV source; consumed via #read when
  #   available, otherwise converted with #to_s
  # @param options [Hash] parsing options, merged over the defaults
  def initialize(data = nil, **options)
    @data = data
    @options = VFCSV::DEFAULT_OPTIONS.merge(options)
    @rows = nil
  end

  # Yields each parsed row; returns an Enumerator when called without a block.
  def each(&block)
    return enum_for(:each) unless block

    parsed_rows.each(&block)
  end

  # @return [Object] the complete parse result
  def read
    parsed_rows
  end

  # @return [Object, nil] the result's headers when the parse result exposes
  #   a #headers method, nil otherwise
  def headers
    rows = parsed_rows
    rows.headers if rows.respond_to?(:headers)
  end

  private

  # Parses the source once and memoizes the result.
  def parsed_rows
    @rows ||= VFCSV.parse(source_text, **@options)
  end

  # Text of the data source.
  def source_text
    @data.respond_to?(:read) ? @data.read : @data.to_s
  end
end
560
+
561
+ # Rust extension module - methods defined in Rust
562
# Namespace placeholder; the compiled extension attaches its methods here.
module RustExt; end
564
+ end
565
+
566
+ # Load Row and Table classes
567
+ require_relative "vfcsv/row"
568
+ require_relative "vfcsv/table"
data/vfcsv.gemspec ADDED
@@ -0,0 +1,43 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "lib/vfcsv/version"
4
+
5
# Gem specification for vfcsv: package metadata, the file list taken from
# git, the native extension entry point, and dependencies.
Gem::Specification.new do |spec|
  spec.name = "vfcsv"
  spec.version = VFCSV::VERSION
  spec.authors = ["Chris Hasinski"]
  spec.email = ["krzysztof.hasinski@gmail.com"]

  spec.summary = "VFCSV - Drop-in replacement for Ruby's CSV with SIMD acceleration"
  spec.description = "SIMD-accelerated CSV parser - drop-in replacement for Ruby's CSV library. " \
                     "Uses NEON on ARM64 and AVX2 on x86_64 for 2-6x faster parsing. " \
                     "Full API compatibility with CSV::Row, CSV::Table, converters, and all options."
  spec.homepage = "https://github.com/khasinski/vfcsv"
  spec.license = "MIT"
  spec.required_ruby_version = ">= 3.0.0"

  spec.metadata["homepage_uri"] = spec.homepage
  spec.metadata["source_code_uri"] = spec.homepage
  spec.metadata["changelog_uri"] = "#{spec.homepage}/blob/main/CHANGELOG.md"
  spec.metadata["rubygems_mfa_required"] = "true"

  # Package every git-tracked file except this gemspec and anything under
  # the development-only directories listed below.
  spec.files = Dir.chdir(__dir__) do
    `git ls-files -z`.split("\x0").reject do |f|
      (File.expand_path(f) == __FILE__) ||
        f.start_with?(*%w[bin/ test/ spec/ features/ .git .github])
    end
  end

  spec.require_paths = ["lib"]
  spec.extensions = ["ext/vfcsv_rust/extconf.rb"]

  # Build dependency. It is declared as a runtime dependency because the
  # extension is compiled at install time; listing it again as a
  # development dependency was redundant.
  spec.add_dependency "rb_sys", "~> 0.9"

  # Development dependencies
  spec.add_development_dependency "rake", "~> 13.0"
  spec.add_development_dependency "rake-compiler", "~> 1.2"
  spec.add_development_dependency "minitest", "~> 5.0"
  spec.add_development_dependency "benchmark-ips", "~> 2.0"
end