@rip-lang/csv 1.0.2 → 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +31 -0
- package/csv.rip +30 -39
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -235,6 +235,37 @@ CSV.writer(opts) # create reusable Writer instance
|
|
|
235
235
|
CSV.formatRow(row, opts) # format single row -> string
|
|
236
236
|
```
|
|
237
237
|
|
|
238
|
+
## Performance
|
|
239
|
+
|
|
240
|
+
The parser delivers roughly **270-415 MB/s** throughput on real-world
|
|
241
|
+
CSV files, scaling linearly from kilobytes to gigabytes:
|
|
242
|
+
|
|
243
|
+
| File | Size | Rows | Fields/row | Time | Throughput |
|
|
244
|
+
|------|------|------|-----------|------|-----------|
|
|
245
|
+
| Medical records | 10.5 MB | 43,962 | 44 | 39ms | 269 MB/s |
|
|
246
|
+
| Japanese postal codes | 10.9 MB | 124,565 | 15 | 26ms | 414 MB/s |
|
|
247
|
+
| Geodata | 24.8 MB | 662,061 | 6 | 65ms | 382 MB/s |
|
|
248
|
+
| Lab results (large) | 137.3 MB | 493,962 | 44 | 466ms | 294 MB/s |
|
|
249
|
+
| Lab results (XL) | 315.8 MB | 997,195 | 44 | 1.1s | 287 MB/s |
|
|
250
|
+
| Lab results (1GB+) | 1.2 GB | 3,497,822 | 44 | 4.1s | 298 MB/s |
|
|
251
|
+
|
|
252
|
+
Quote-free files hit the fast path (~420 MB/s). Files with quoted fields
|
|
253
|
+
use the full path (~300 MB/s). The `each` callback mode is slightly faster
|
|
254
|
+
than array mode since it skips array allocation.
|
|
255
|
+
|
|
256
|
+
For context, popular JS CSV parsers typically achieve 30-120 MB/s (Papa Parse,
|
|
257
|
+
csv-parse, d3-dsv). This library is comfortably in the top tier of the JS
|
|
258
|
+
ecosystem.
|
|
259
|
+
|
|
260
|
+
## Roadmap
|
|
261
|
+
|
|
262
|
+
- **Streaming file reader** — chunked parsing for files that don't fit in
|
|
263
|
+
memory, splitting at safe quote boundaries
|
|
264
|
+
- **`transform` callback** — per-cell value transformation during parsing
|
|
265
|
+
- **`dynamicTyping`** — auto-convert `"42"` to `42`, `"true"` to `true`
|
|
266
|
+
- **Column selection** — parse only specific columns by index or name
|
|
267
|
+
- **Error/warning collection** — report recovered issues in relax mode
|
|
268
|
+
|
|
238
269
|
## License
|
|
239
270
|
|
|
240
271
|
MIT
|
package/csv.rip
CHANGED
|
@@ -66,10 +66,10 @@ def probe(str, opts = {})
|
|
|
66
66
|
if n > bestCount
|
|
67
67
|
best = d
|
|
68
68
|
bestCount = n
|
|
69
|
-
sep ?= best
|
|
69
|
+
sep ?= best ?? ','
|
|
70
70
|
|
|
71
71
|
# detect quoting
|
|
72
|
-
quote = opts.quote
|
|
72
|
+
quote = opts.quote ?? '"'
|
|
73
73
|
hasQuotes = sample.indexOf(quote) >= 0
|
|
74
74
|
|
|
75
75
|
# detect escape style: backslash vs doubled quote
|
|
@@ -83,17 +83,17 @@ def probe(str, opts = {})
|
|
|
83
83
|
# merge with user options (user wins)
|
|
84
84
|
{
|
|
85
85
|
str
|
|
86
|
-
sep: opts.sep
|
|
86
|
+
sep: opts.sep ?? sep
|
|
87
87
|
quote: quote
|
|
88
88
|
escape: escape
|
|
89
|
-
row: opts.row
|
|
89
|
+
row: opts.row ?? row
|
|
90
90
|
hasQuotes: hasQuotes
|
|
91
|
-
excel: opts.excel
|
|
92
|
-
relax: opts.relax
|
|
93
|
-
strip: opts.strip
|
|
94
|
-
headers: opts.headers
|
|
95
|
-
comments: opts.comments
|
|
96
|
-
skipBlanks: opts.skipBlanks
|
|
91
|
+
excel: opts.excel ?? false
|
|
92
|
+
relax: opts.relax ?? false
|
|
93
|
+
strip: opts.strip ?? false
|
|
94
|
+
headers: opts.headers ?? false
|
|
95
|
+
comments: opts.comments ?? null
|
|
96
|
+
skipBlanks: opts.skipBlanks ?? true
|
|
97
97
|
each: opts.each ?? null
|
|
98
98
|
}
|
|
99
99
|
|
|
@@ -117,7 +117,7 @@ def makeEmitter(cfg)
|
|
|
117
117
|
if ctx.keys
|
|
118
118
|
obj = {}
|
|
119
119
|
for key, i in ctx.keys
|
|
120
|
-
obj[key] = row[i]
|
|
120
|
+
obj[key] = row[i] ?? ''
|
|
121
121
|
if each
|
|
122
122
|
ctx.count++
|
|
123
123
|
return each(obj, ctx.count - 1) isnt false
|
|
@@ -202,7 +202,7 @@ def readFast(str, cfg)
|
|
|
202
202
|
|
|
203
203
|
def readFull(str, cfg)
|
|
204
204
|
{sep, quote, escape, excel, relax} = cfg
|
|
205
|
-
{comments, skipBlanks} = cfg
|
|
205
|
+
{row: rowDelim, comments, skipBlanks} = cfg
|
|
206
206
|
{emit, result} = makeEmitter(cfg)
|
|
207
207
|
|
|
208
208
|
sepCode = sep.charCodeAt(0)
|
|
@@ -210,6 +210,8 @@ def readFull(str, cfg)
|
|
|
210
210
|
sepLen = sep.length
|
|
211
211
|
escSame = escape is quote
|
|
212
212
|
escCode = escape.charCodeAt(0)
|
|
213
|
+
nlChar = rowDelim[0] # '\n' or '\r' (for \r\n)
|
|
214
|
+
nlCode = nlChar.charCodeAt(0)
|
|
213
215
|
len = str.length
|
|
214
216
|
pos = 0
|
|
215
217
|
|
|
@@ -221,14 +223,12 @@ def readFull(str, cfg)
|
|
|
221
223
|
|
|
222
224
|
# skip empty lines at line start
|
|
223
225
|
if atLineStart
|
|
224
|
-
if skipBlanks and
|
|
226
|
+
if skipBlanks and c is nlCode
|
|
225
227
|
pos += crlfLen(str, pos)
|
|
226
228
|
continue
|
|
227
229
|
if comments and str[pos] is comments
|
|
228
|
-
nl = str.indexOf(
|
|
229
|
-
|
|
230
|
-
nl = str.indexOf('\r', pos)
|
|
231
|
-
pos = nl is -1 ? len : nl + 1
|
|
230
|
+
nl = str.indexOf(nlChar, pos)
|
|
231
|
+
pos = nl is -1 ? len : nl + crlfLen(str, nl)
|
|
232
232
|
continue
|
|
233
233
|
atLineStart = false
|
|
234
234
|
|
|
@@ -271,7 +271,7 @@ def readFull(str, cfg)
|
|
|
271
271
|
break if pos >= len # end of string
|
|
272
272
|
|
|
273
273
|
c2 = str.charCodeAt(pos)
|
|
274
|
-
break if c2 is sepCode or c2 is
|
|
274
|
+
break if c2 is sepCode or c2 is nlCode # valid end-of-field
|
|
275
275
|
|
|
276
276
|
# unexpected character after closing quote
|
|
277
277
|
unless relax
|
|
@@ -288,14 +288,14 @@ def readFull(str, cfg)
|
|
|
288
288
|
c2 = str.charCodeAt(pos)
|
|
289
289
|
if c2 is sepCode
|
|
290
290
|
pos += sepLen
|
|
291
|
-
else if c2 is
|
|
291
|
+
else if c2 is nlCode
|
|
292
292
|
pos += crlfLen(str, pos)
|
|
293
293
|
break unless emit(row)
|
|
294
294
|
row = []
|
|
295
295
|
atLineStart = true
|
|
296
296
|
|
|
297
297
|
# === newline (end of row) ===
|
|
298
|
-
else if c is
|
|
298
|
+
else if c is nlCode
|
|
299
299
|
pos += crlfLen(str, pos)
|
|
300
300
|
break unless emit(row)
|
|
301
301
|
row = []
|
|
@@ -309,17 +309,8 @@ def readFull(str, cfg)
|
|
|
309
309
|
# === unquoted field ===
|
|
310
310
|
else
|
|
311
311
|
# indexOf ratchet: find nearest sep or newline
|
|
312
|
-
s
|
|
313
|
-
|
|
314
|
-
r = str.indexOf('\r', pos)
|
|
315
|
-
|
|
316
|
-
# nearest newline (\r or \n)
|
|
317
|
-
if r >= 0 and n >= 0
|
|
318
|
-
nl = Math.min(r, n)
|
|
319
|
-
else if r >= 0
|
|
320
|
-
nl = r
|
|
321
|
-
else
|
|
322
|
-
nl = n
|
|
312
|
+
s = str.indexOf(sep, pos)
|
|
313
|
+
nl = str.indexOf(nlChar, pos)
|
|
323
314
|
|
|
324
315
|
# take the nearer boundary
|
|
325
316
|
if s >= 0 and (nl is -1 or s < nl)
|
|
@@ -346,13 +337,13 @@ def readFull(str, cfg)
|
|
|
346
337
|
|
|
347
338
|
class Writer
|
|
348
339
|
constructor: (opts = {}) ->
|
|
349
|
-
@sep = opts.sep
|
|
350
|
-
@quote = opts.quote
|
|
351
|
-
@escape = opts.escape
|
|
352
|
-
@mode = opts.mode
|
|
353
|
-
@excel = opts.excel
|
|
354
|
-
@drop = opts.drop
|
|
355
|
-
@rowsep = opts.rowsep
|
|
340
|
+
@sep = opts.sep ?? ','
|
|
341
|
+
@quote = opts.quote ?? '"'
|
|
342
|
+
@escape = opts.escape ?? @quote
|
|
343
|
+
@mode = opts.mode ?? 'compact'
|
|
344
|
+
@excel = opts.excel ?? false
|
|
345
|
+
@drop = opts.drop ?? false
|
|
346
|
+
@rowsep = opts.rowsep ?? '\n'
|
|
356
347
|
|
|
357
348
|
# pre-compute escaped quote
|
|
358
349
|
@esc = @escape + @quote
|
|
@@ -367,7 +358,7 @@ class Writer
|
|
|
367
358
|
|
|
368
359
|
# format a single row as a CSV line (no trailing row separator)
|
|
369
360
|
row: (data) ->
|
|
370
|
-
cells = (String(v
|
|
361
|
+
cells = (String(v ?? '') for v in data)
|
|
371
362
|
|
|
372
363
|
# drop trailing empty columns
|
|
373
364
|
if @drop
|