@jupyterlab/csvviewer 4.0.0-alpha.2 → 4.0.0-alpha.21

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/parse.ts ADDED
@@ -0,0 +1,577 @@
1
+ // Copyright (c) Jupyter Development Team.
2
+ // Distributed under the terms of the Modified BSD License.
3
+
4
+ /*
5
+ Possible options to add to the parser:
6
+
7
+ - Optional offsets array to modify, so we don't need to create a new offsets list (we would need to be careful not to overwrite things if a row needs to be truncated.)
8
+ - Comment character at the start of the line
9
+ - Skip empty whitespace lines
10
+ - Skip rows with empty columns
11
+ - Logging an error for too many or too few fields on a line
12
+ - Ignore whitespace around delimiters
13
+ - Add an exported function in this file for getting a field from the returned offsets array (including stripping field or row delimiters and parsing quoted data). Right now this logic is in the DSVModel. Likely we want to keep the logic there for speed, but having it here as well will make the parser more self-contained and usable by others.
14
+ - Sanity check on field size, with an error if the field exceeds the size
15
+ - Tests against https://github.com/maxogden/csv-spectrum
16
+ - Benchmark against https://www.npmjs.com/package/csv-parser and https://www.npmjs.com/package/csv-string and fast-csv.
17
+
18
+ */
19
+
/**
 * Call signature shared by the delimiter-separated data parsers in this file.
 *
 * @param options - The parser options.
 * @returns The row count, column count, and offsets found in the data.
 *
 * #### Notes
 * The parsers follow [RFC 4180](https://tools.ietf.org/html/rfc4180).
 */
export type IParser = (options: IParser.IOptions) => IParser.IResults;
30
+
export namespace IParser {
  /**
   * Options accepted by a parser.
   */
  export interface IOptions {
    /**
     * The text to parse.
     */
    data: string;

    /**
     * Whether the returned offsets array should include column offsets.
     *
     * #### Notes
     * When false, the offsets array holds only the offset of each row. When
     * true, it holds one offset per column of every row (nrows*ncols entries
     * in total). A row that does not have exactly ncols columns is adjusted:
     * missing columns are added as empty columns, and surplus columns are
     * merged into the last column.
     */
    columnOffsets: boolean;

    /**
     * The field delimiter. Defaults to ','.
     */
    delimiter?: string;

    /**
     * The row delimiter. Defaults to '\r\n'.
     */
    rowDelimiter?: string;

    /**
     * The character used to quote fields. Defaults to the double quote (").
     *
     * #### Notes
     * Following [RFC 4180](https://tools.ietf.org/html/rfc4180), a quote
     * inside a quoted field is escaped by doubling it (for example, "a""b"
     * is the field a"b).
     */
    quote?: string;

    /**
     * The index in the data string at which parsing starts. Defaults to 0.
     * It should point at the first character of a row and must be less than
     * data.length.
     */
    startIndex?: number;

    /**
     * The maximum number of rows to parse.
     *
     * When omitted, parsing continues to the end of the data.
     */
    maxRows?: number;

    /**
     * The number of columns to report for each row.
     *
     * #### Notes
     * When omitted, ncols is taken from the number of columns found in the
     * first row.
     */
    ncols?: number;
  }

  /**
   * The value returned by a parser.
   */
  export interface IResults {
    /**
     * How many rows were parsed.
     */
    nrows: number;

    /**
     * How many columns were parsed, or 0 when only row offsets are returned.
     */
    ncols: number;

    /**
     * Offsets into the data string for the rows or for the individual items.
     *
     * #### Notes
     * When the parser was called with columnOffsets false, this array has
     * nrows entries and `offsets[r]` is the index of the first character of
     * row r.
     *
     * When the parser was called with columnOffsets true, this array has
     * `nrows*ncols` entries and `offsets[r*ncols + c]` is the index of the
     * first character of the item in row r, column c.
     */
    offsets: number[];
  }
}
126
+
/**
 * States of the character-level state machine used by `parseDSV`.
 */
enum STATE {
  /** Inside a quoted field, searching for the next quote character. */
  QUOTED_FIELD,
  /** A quote was just seen inside a quoted field (field end or escaped quote). */
  QUOTED_FIELD_QUOTE,
  /** Inside an unquoted field, searching for the next field or row delimiter. */
  UNQUOTED_FIELD,
  /** At the first character of a field that does not start a row. */
  NEW_FIELD,
  /** At the first character of a row. */
  NEW_ROW
}

/**
 * Row delimiters understood by `parseDSV`.
 */
enum ROW_DELIMITER {
  CR,
  CRLF,
  LF
}

/**
 * Check a carriage return or line feed found outside quoted text against the
 * configured row delimiter.
 *
 * @param data - The string being parsed.
 * @param i - The index of the CR or LF character in `data`.
 * @param char - The character code at `i`; 13 is handled as CR, anything else
 *   as LF (callers only pass 13 or 10).
 * @param rowDelimiterCode - The configured row delimiter.
 * @param nrows - The number of rows completed so far (error message only).
 * @param col - The current column counter (error message only).
 * @returns How many characters after index `i` also belong to the row
 *   delimiter: 1 for a '\r\n' pair, otherwise 0.
 * @throws A string describing the problem when the character is not part of
 *   the configured row delimiter.
 */
function rowDelimiterTail(
  data: string,
  i: number,
  char: number,
  rowDelimiterCode: ROW_DELIMITER,
  nrows: number,
  col: number
): number {
  if (char === 13) {
    if (rowDelimiterCode === ROW_DELIMITER.CR) {
      return 0;
    }
    if (
      rowDelimiterCode === ROW_DELIMITER.CRLF &&
      data.charCodeAt(i + 1) === 10
    ) {
      // The expected \r\n: the caller must also skip over the \n.
      return 1;
    }
    throw `string index ${i} (in row ${nrows}, column ${col}): carriage return found, but not as part of a row delimiter C ${data.charCodeAt(
      i + 1
    )}`;
  }

  if (rowDelimiterCode === ROW_DELIMITER.LF) {
    return 0;
  }
  throw `string index ${i} (in row ${nrows}, column ${col}): line feed found, but row delimiter starts with a carriage return`;
}

/**
 * Parse delimiter-separated data.
 *
 * @param options - The parser options. Only the first character of
 *   `delimiter` and of `quote` is compared against the data when classifying
 *   characters. A `rowDelimiter` other than '\r\n' or '\r' is handled as '\n'.
 * @returns An object giving the offsets for the rows or columns parsed.
 * @throws A string (not an `Error`) when a carriage return or line feed does
 *   not match the row delimiter, when a quoted field is not closed, or when a
 *   closing quote is followed by an unexpected character.
 *
 * #### Notes
 * This implementation is based on [RFC 4180](https://tools.ietf.org/html/rfc4180).
 */
export function parseDSV(options: IParser.IOptions): IParser.IResults {
  const {
    data,
    columnOffsets,
    delimiter = ',',
    startIndex = 0,
    maxRows = 0xffffffff,
    rowDelimiter = '\r\n',
    quote = '"'
  } = options;

  // ncols will be set automatically if it is undefined.
  let ncols = options.ncols;

  // The number of rows we've already parsed.
  let nrows = 0;

  // The row or column offsets we return.
  const offsets: number[] = [];

  // Character codes are compared instead of strings in the hot loop.
  const CH_DELIMITER = delimiter.charCodeAt(0);
  const CH_QUOTE = quote.charCodeAt(0);
  const CH_LF = 10; // \n
  const CH_CR = 13; // \r
  const endIndex = data.length;
  const {
    QUOTED_FIELD,
    QUOTED_FIELD_QUOTE,
    UNQUOTED_FIELD,
    NEW_FIELD,
    NEW_ROW
  } = STATE;

  // Anything that is not '\r\n' or '\r' is treated as a line feed delimiter.
  const rowDelimiterCode =
    rowDelimiter === '\r\n'
      ? ROW_DELIMITER.CRLF
      : rowDelimiter === '\r'
      ? ROW_DELIMITER.CR
      : ROW_DELIMITER.LF;
  const rowDelimiterLength = rowDelimiterCode === ROW_DELIMITER.CRLF ? 2 : 1;

  // Always start off at the beginning of a row.
  let state: STATE = NEW_ROW;

  // Set up the starting index.
  let i = startIndex;

  // We initialize to 0 just in case we are asked to parse past the end of the
  // string. In that case, we want the number of columns to be 0.
  let col = 0;

  while (i < endIndex) {
    // The row offset is recorded at the top of the loop (not when the row
    // delimiter is seen) so that a trailing row delimiter at the end of the
    // data does not start an extra, empty row.
    if (state === NEW_ROW) {
      offsets.push(i);
      col = 1;
    }

    let char = data.charCodeAt(i);

    // Handle the character at i and update the state so that it describes
    // index i + 1 (i is incremented after this switch). Some branches move i
    // forward themselves to skip over uninteresting characters.
    switch (state) {
      // At the beginning of a row or field, we can have a quote, a field
      // delimiter, a row delimiter, or ordinary data.
      case NEW_ROW:
      case NEW_FIELD:
        switch (char) {
          case CH_QUOTE:
            state = QUOTED_FIELD;
            break;

          case CH_DELIMITER:
            state = NEW_FIELD;
            break;

          case CH_CR:
          case CH_LF:
            i += rowDelimiterTail(data, i, char, rowDelimiterCode, nrows, col);
            state = NEW_ROW;
            break;

          default:
            state = UNQUOTED_FIELD;
            break;
        }
        break;

      // We are in a quoted field: jump to the next quote, which either ends
      // the field or starts an escaped quote.
      case QUOTED_FIELD: {
        const nextQuote = data.indexOf(quote, i);
        if (nextQuote < 0) {
          // Report the index where the search started; the search result
          // itself is -1 and carries no position information.
          throw `string index ${i} (in row ${nrows}, column ${col}): mismatched quote`;
        }
        i = nextQuote;
        state = QUOTED_FIELD_QUOTE;
        break;
      }

      // We just saw a quote in a quoted field. This is either the end of the
      // field or the first half of an escaped (doubled) quote.
      case QUOTED_FIELD_QUOTE:
        switch (char) {
          // A second quote: escaped quote, so we are still in the field.
          case CH_QUOTE:
            state = QUOTED_FIELD;
            break;

          case CH_DELIMITER:
            state = NEW_FIELD;
            break;

          case CH_CR:
          case CH_LF:
            i += rowDelimiterTail(data, i, char, rowDelimiterCode, nrows, col);
            state = NEW_ROW;
            break;

          default:
            throw `string index ${i} (in row ${nrows}, column ${col}): quote in escaped field not followed by quote, delimiter, or row delimiter`;
        }
        break;

      // We are in an unquoted field, so the only thing we look for is the
      // next field delimiter or the possible start of a row delimiter.
      case UNQUOTED_FIELD:
        while (i < endIndex) {
          char = data.charCodeAt(i);
          if (char === CH_DELIMITER || char === CH_LF || char === CH_CR) {
            break;
          }
          i++;
        }

        switch (char) {
          case CH_DELIMITER:
            state = NEW_FIELD;
            break;

          case CH_CR:
          case CH_LF:
            i += rowDelimiterTail(data, i, char, rowDelimiterCode, nrows, col);
            state = NEW_ROW;
            break;

          // No delimiter was found before the end of the data. Go back to the
          // loop condition without touching i or the state.
          default:
            continue;
        }
        break;

      // We should never reach this point since every state is handled above.
      default:
        throw `string index ${i} (in row ${nrows}, column ${col}): state not recognized`;
    }

    // Increment i to the next character index.
    i++;

    // Update return values based on the new state.
    switch (state) {
      case NEW_ROW:
        nrows++;

        // If ncols is undefined, set it to the number of columns in this row
        // (which must be the first row).
        if (ncols === undefined) {
          if (nrows !== 1) {
            throw new Error('Error parsing default number of columns');
          }
          ncols = col;
        }

        // Pad or truncate the column offsets of the row we just finished.
        if (columnOffsets === true) {
          if (col < ncols) {
            // Too few columns: add offsets that point at the start of the
            // row delimiter we just consumed.
            for (; col < ncols; col++) {
              offsets.push(i - rowDelimiterLength);
            }
          } else if (col > ncols) {
            // Too many columns: drop the surplus offsets.
            offsets.length = offsets.length - (col - ncols);
          }
        }

        // Shortcut return if nrows reaches the maximum rows we are to parse.
        if (nrows === maxRows) {
          return { nrows, ncols: columnOffsets ? ncols : 0, offsets };
        }
        break;

      case NEW_FIELD:
        // i is now the first character of the next field.
        if (columnOffsets === true) {
          offsets.push(i);
        }
        col++;
        break;

      default:
        break;
    }
  }

  // If the data did not end with a row delimiter, the last row has not been
  // counted yet: count it and pad or truncate its column offsets.
  if (state !== NEW_ROW) {
    nrows++;
    if (columnOffsets === true) {
      // If ncols is *still* undefined, then we only parsed one row and didn't
      // have a newline, so set it to the number of columns we found.
      if (ncols === undefined) {
        ncols = col;
      }

      if (col < ncols) {
        // NOTE(review): the padding offset is i - (rowDelimiterLength - 1),
        // i.e. the end of the data for one-character row delimiters and one
        // before the end for '\r\n' - confirm this is what consumers expect.
        for (; col < ncols; col++) {
          offsets.push(i - (rowDelimiterLength - 1));
        }
      } else if (col > ncols) {
        // Too many columns: drop the surplus offsets.
        offsets.length = offsets.length - (col - ncols);
      }
    }
  }

  return { nrows, ncols: columnOffsets ? (ncols ?? 0) : 0, offsets };
}
475
+
/**
 * Parse delimiter-separated data where no delimiter is quoted.
 *
 * @param options - The parser options. The `quote` option is not read.
 * @returns An object giving the offsets for the rows or columns parsed.
 * @throws Error if `rowDelimiter` is the empty string, or if `columnOffsets`
 *   is true and `delimiter` is the empty string. An empty search string
 *   matches at every position, so the row or column scan could never advance.
 *
 * #### Notes
 * This function is an optimized parser for cases where there are no field or
 * row delimiters in quotes. Note that the data can have quotes, but they are
 * not interpreted in any special way. This implementation is based on [RFC
 * 4180](https://tools.ietf.org/html/rfc4180), but disregards quotes.
 */
export function parseDSVNoQuotes(options: IParser.IOptions): IParser.IResults {
  // Set option defaults.
  const {
    data,
    columnOffsets,
    delimiter = ',',
    rowDelimiter = '\r\n',
    startIndex = 0,
    maxRows = 0xffffffff
  } = options;

  // Reject delimiters that would keep the scans below from making progress.
  const rowDelimiterLength = rowDelimiter.length;
  if (rowDelimiterLength === 0) {
    throw new Error('parseDSVNoQuotes: rowDelimiter must not be empty');
  }
  if (columnOffsets === true && delimiter.length === 0) {
    throw new Error(
      'parseDSVNoQuotes: delimiter must not be empty when columnOffsets is true'
    );
  }

  // ncols will be set automatically if it is undefined.
  let ncols = options.ncols;

  // Set up our return variables.
  const offsets: number[] = [];
  let nrows = 0;

  const len = data.length;

  // Index of the first character of the row being processed.
  let currRow = startIndex;

  // Index of the row delimiter that ends the current row, or -1 once the last
  // row has been reached. Any value other than -1 lets the loop start.
  let nextRow = startIndex;

  // Loop through rows until we run out of data or we've reached maxRows.
  while (nextRow !== -1 && nrows < maxRows && currRow < len) {
    // Store the offset for the beginning of the row and count the row.
    offsets.push(currRow);
    nrows++;

    // The row ends at the next row delimiter, or at the end of the data when
    // there is none.
    nextRow = data.indexOf(rowDelimiter, currRow);
    const rowEnd = nextRow === -1 ? len : nextRow;

    if (columnOffsets === true) {
      // Slice the row out so that indexOf stops at the end of the row instead
      // of scanning the rest of the data.
      const rowString = data.slice(currRow, rowEnd);
      let col = 1;
      let colIndex = rowString.indexOf(delimiter);

      // Record the start of each field after a delimiter. When ncols is known
      // we stop after ncols columns, which merges any extra fields into the
      // last column; otherwise we collect every field in the row.
      while (colIndex !== -1 && (ncols === undefined || col < ncols)) {
        offsets.push(currRow + colIndex + 1);
        col++;
        colIndex = rowString.indexOf(delimiter, colIndex + 1);
      }

      if (ncols === undefined) {
        // The first parsed row determines the number of columns.
        ncols = col;
      }

      // Pad short rows with offsets that point at the end of the row.
      while (col < ncols) {
        offsets.push(rowEnd);
        col++;
      }
    }

    // Skip past the row delimiter at the end of the row.
    currRow = rowEnd + rowDelimiterLength;
  }

  return { nrows, ncols: columnOffsets ? (ncols ?? 0) : 0, offsets };
}