jtcsv 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,613 @@
1
+ /**
2
+ * Stream CSV to JSON Converter - Node.js Module
3
+ *
4
+ * A streaming implementation for converting CSV data to JSON format
5
+ * with memory-efficient processing for large files.
6
+ *
7
+ * @module stream-csv-to-json
8
+ */
9
+
10
+ const {
11
+ ValidationError,
12
+ SecurityError,
13
+ FileSystemError,
14
+ ParsingError,
15
+ LimitError,
16
+ ConfigurationError,
17
+ safeExecute
18
+ } = require('./errors');
19
+
20
+ const { Transform, Writable } = require('stream');
21
+ const { pipeline } = require('stream/promises');
22
+
23
/**
 * Creates a transform stream that converts CSV text chunks to JSON objects.
 *
 * The writable side accepts raw CSV text (Buffer or string); the readable side
 * emits one plain object per data row. Quoted fields, escaped quotes ("") and
 * newlines inside quoted fields are handled, including across chunk boundaries.
 *
 * @param {Object} options - Configuration options
 * @param {string} [options.delimiter=';'] - CSV delimiter (single character)
 * @param {boolean} [options.hasHeaders=true] - Whether CSV has a headers row
 * @param {Object} [options.renameMap={}] - Map for renaming column headers { oldHeader: newHeader }
 * @param {boolean} [options.trim=true] - Trim whitespace from values
 * @param {boolean} [options.parseNumbers=false] - Parse numeric values
 * @param {boolean} [options.parseBooleans=false] - Parse boolean values
 * @param {number} [options.maxRows=Infinity] - Maximum number of data rows to process
 * @param {Function} [options.transform] - Custom transform for each row; must return an object
 * @param {Object} [options.schema] - JSON schema for validation and formatting
 * @returns {Transform} Transform stream
 * @throws {ConfigurationError} When an option has an invalid type or value
 *
 * @example
 * const { createCsvToJsonStream } = require('./stream-csv-to-json');
 *
 * const transformStream = createCsvToJsonStream({
 *   delimiter: ',',
 *   parseNumbers: true,
 *   parseBooleans: true
 * });
 *
 * // Pipe CSV text to JSON objects
 * csvReadableStream.pipe(transformStream).pipe(jsonWritableStream);
 */
function createCsvToJsonStream(options = {}) {
  return safeExecute(() => {
    const opts = options && typeof options === 'object' ? options : {};

    const {
      delimiter = ';',
      hasHeaders = true,
      renameMap = {},
      trim = true,
      parseNumbers = false,
      parseBooleans = false,
      maxRows = Infinity,
      transform = null,
      schema = null
    } = opts;

    // --- Option validation -------------------------------------------------
    if (delimiter && typeof delimiter !== 'string') {
      throw new ConfigurationError('Delimiter must be a string');
    }

    if (delimiter && delimiter.length !== 1) {
      throw new ConfigurationError('Delimiter must be a single character');
    }

    if (renameMap && typeof renameMap !== 'object') {
      throw new ConfigurationError('renameMap must be an object');
    }

    if (maxRows !== Infinity && (typeof maxRows !== 'number' || maxRows <= 0)) {
      throw new ConfigurationError('maxRows must be a positive number or Infinity');
    }

    if (transform && typeof transform !== 'function') {
      throw new ConfigurationError('transform must be a function');
    }

    if (schema && typeof schema !== 'object') {
      throw new ConfigurationError('schema must be an object');
    }

    // Normalize so an explicit `renameMap: null` cannot crash header lookup.
    const headerRenames = renameMap && typeof renameMap === 'object' ? renameMap : {};

    // Decode Buffers incrementally so a multi-byte UTF-8 character split
    // across chunk boundaries is never corrupted (plain toString() would
    // emit replacement characters at the split point).
    const { StringDecoder } = require('string_decoder');
    const decoder = new StringDecoder('utf8');

    // --- Per-stream parser state ------------------------------------------
    let buffer = '';           // text carried over between chunks
    let headers = null;        // resolved column names
    let headersProcessed = false;
    let rowCount = 0;          // data rows produced so far
    let lineNumber = 0;        // physical line counter, used in error reports
    let insideQuotes = false;  // quote state across chunk boundaries
    let schemaValidators = null;

    // Initialize schema validators if a schema is provided
    if (schema) {
      schemaValidators = createSchemaValidators(schema);
    }

    /**
     * Parses a CSV line with proper quote handling ("" escapes a quote).
     *
     * @private
     * @param {string} line - CSV line to parse
     * @returns {string[]} Array of field values
     * @throws {ParsingError} When the line ends inside an open quote
     */
    const parseCsvLine = (line) => {
      const fields = [];
      let currentField = '';
      let insideQuotesLocal = false;
      let i = 0;

      while (i < line.length) {
        const char = line[i];

        if (char === '"') {
          if (insideQuotesLocal) {
            // Check for escaped quote ("")
            if (i + 1 < line.length && line[i + 1] === '"') {
              currentField += '"';
              i++; // Skip next quote
            } else {
              insideQuotesLocal = false;
            }
          } else {
            insideQuotesLocal = true;
          }
          i++;
          continue;
        }

        if (!insideQuotesLocal && char === delimiter) {
          fields.push(currentField);
          currentField = '';
          i++;
          continue;
        }

        currentField += char;
        i++;
      }

      // Add last field
      fields.push(currentField);

      // Check for unclosed quotes
      if (insideQuotesLocal) {
        throw new ParsingError('Unclosed quotes in CSV', lineNumber);
      }

      return fields;
    };

    /**
     * Parses a raw CSV value based on the trim/number/boolean options.
     *
     * @private
     * @param {string} value - Raw CSV value
     * @returns {*} Parsed value (string, number, boolean, or null for empty)
     */
    const parseCsvValue = (value) => {
      let result = value;

      if (trim) {
        result = result.trim();
      }

      // Remove Excel formula protection (leading apostrophe)
      if (result.startsWith("'")) {
        result = result.substring(1);
      }

      // Parse numbers (strict decimal form only, so "1e3" or "0x1" stay strings)
      if (parseNumbers && /^-?\d+(\.\d+)?$/.test(result)) {
        const num = parseFloat(result);
        if (!isNaN(num)) {
          return num;
        }
      }

      // Parse booleans (case-insensitive true/false)
      if (parseBooleans) {
        const lowerValue = result.toLowerCase();
        if (lowerValue === 'true') {
          return true;
        }
        if (lowerValue === 'false') {
          return false;
        }
      }

      // Parse empty strings as null
      if (result === '') {
        return null;
      }

      return result;
    };

    /**
     * Formats a value through the schema's format hook, when one exists.
     *
     * @private
     * @param {*} value - The value to format
     * @param {string} key - The key/field name
     * @returns {*} Formatted value (unchanged when no formatter applies)
     */
    const formatValue = (value, key) => {
      if (!schemaValidators || !schemaValidators[key]) {
        return value;
      }

      const validator = schemaValidators[key];

      if (validator.format) {
        return validator.format(value);
      }

      return value;
    };

    /**
     * Validates a value against the schema, when a validator exists.
     *
     * @private
     * @param {*} value - The value to validate
     * @param {string} key - The key/field name
     * @returns {boolean} True if valid (or no validator applies)
     */
    const validateValue = (value, key) => {
      if (!schemaValidators || !schemaValidators[key]) {
        return true;
      }

      const validator = schemaValidators[key];

      if (validator.validate) {
        return validator.validate(value);
      }

      return true;
    };

    /**
     * Processes one complete CSV line into a row object.
     *
     * @private
     * @param {string} line - Complete CSV line (no trailing newline)
     * @returns {Object|null} Row object, or null for headers/blank lines
     */
    const processLine = (line) => {
      lineNumber++;

      // Skip empty lines
      if (line.trim() === '') {
        return null;
      }

      try {
        const fields = parseCsvLine(line);

        // First non-empty line is the header row when hasHeaders is set
        if (hasHeaders && !headersProcessed) {
          headers = fields.map((field) => {
            const trimmed = trim ? field.trim() : field;
            // Own-property check: a header named e.g. "toString" must not
            // pick up inherited members from Object.prototype.
            return Object.prototype.hasOwnProperty.call(headerRenames, trimmed)
              ? headerRenames[trimmed]
              : trimmed;
          });
          headersProcessed = true;
          return null;
        }

        // Generate positional headers (column1, column2, ...) if none exist
        if (!headers) {
          headers = fields.map((_, index) => `column${index + 1}`);
        }

        // Enforce the row limit before doing any more work
        if (rowCount >= maxRows) {
          throw new LimitError(
            `CSV size exceeds maximum limit of ${maxRows} rows`,
            maxRows,
            rowCount
          );
        }

        // Build the row object; extra fields beyond the header count are dropped
        const row = {};
        const fieldCount = Math.min(fields.length, headers.length);

        for (let j = 0; j < fieldCount; j++) {
          let value = parseCsvValue(fields[j]);
          const key = headers[j];

          // Format, then validate, against the schema
          value = formatValue(value, key);

          if (!validateValue(value, key)) {
            throw new ValidationError(`Invalid value for field '${key}': ${value}`);
          }

          row[key] = value;
        }

        // Apply the custom transform if provided
        let result = row;
        if (transform) {
          try {
            result = transform(row);
          } catch (error) {
            throw new ValidationError(`Transform function error: ${error.message}`);
          }
          // Checked outside the try/catch so this error is reported as-is
          // instead of being re-wrapped as a "Transform function error".
          if (!result || typeof result !== 'object') {
            throw new ValidationError('Transform function must return an object');
          }
        }

        rowCount++;
        return result;
      } catch (error) {
        // Tag parse errors with the physical line they occurred on
        if (error instanceof ParsingError) {
          error.lineNumber = lineNumber;
        }
        throw error;
      }
    };

    return new Transform({
      objectMode: true,
      writableObjectMode: false,
      readableObjectMode: true,

      transform(chunk, encoding, callback) {
        try {
          // Strings pass through untouched; Buffers are decoded multibyte-safely
          buffer += typeof chunk === 'string' ? chunk : decoder.write(chunk);

          // Split the buffer on newlines that are not inside quotes
          const lines = [];
          let start = 0;

          for (let i = 0; i < buffer.length; i++) {
            const char = buffer[i];

            if (char === '"') {
              insideQuotes = !insideQuotes;
            }

            if (!insideQuotes && char === '\n') {
              const line = buffer.substring(start, i).replace(/\r$/, '');
              lines.push(line);
              start = i + 1;
            }
          }

          // Keep the trailing incomplete line for the next chunk
          buffer = buffer.substring(start);

          for (const line of lines) {
            const result = processLine(line);
            if (result !== null) {
              this.push(result);
            }
          }

          callback();
        } catch (error) {
          callback(error);
        }
      },

      flush(callback) {
        try {
          // Flush any partially decoded bytes, then the final unterminated line
          buffer += decoder.end();
          if (buffer.trim() !== '') {
            const result = processLine(buffer.replace(/\r$/, ''));
            if (result !== null) {
              this.push(result);
            }
          }

          callback();
        } catch (error) {
          callback(error);
        }
      }
    });
  }, 'STREAM_CREATION_ERROR', { function: 'createCsvToJsonStream' });
}
403
+
404
/**
 * Pipes a readable stream of CSV text through the CSV-to-JSON transform
 * into the given writable stream, resolving when the pipeline completes.
 *
 * @param {Readable} inputStream - Readable stream of CSV text
 * @param {Writable} outputStream - Writable stream receiving JSON objects
 * @param {Object} options - Configuration options (same as createCsvToJsonStream)
 * @returns {Promise<void>} Resolves on completion, rejects on any stream error
 *
 * @example
 * const { streamCsvToJson } = require('./stream-csv-to-json');
 *
 * await streamCsvToJson(csvStream, jsonStream, {
 *   delimiter: ',',
 *   parseNumbers: true,
 *   schema: {
 *     properties: {
 *       id: { type: 'integer' },
 *       name: { type: 'string', minLength: 1 }
 *     }
 *   }
 * });
 */
async function streamCsvToJson(inputStream, outputStream, options = {}) {
  return safeExecute(
    async () => {
      // pipeline() wires error propagation and cleanup across all three stages.
      await pipeline(inputStream, createCsvToJsonStream(options), outputStream);
    },
    'STREAM_PROCESSING_ERROR',
    { function: 'streamCsvToJson' }
  );
}
437
+
438
/**
 * Opens a CSV file and returns a readable stream of JSON objects.
 *
 * @param {string} filePath - Path to CSV file (must end in .csv)
 * @param {Object} options - Configuration options (same as createCsvToJsonStream)
 * @returns {Promise<Readable>} Readable stream of JSON row objects
 * @throws {ValidationError} For an empty path or non-.csv extension
 * @throws {SecurityError} When the path contains a '..' traversal segment
 * @throws {FileSystemError} When the file is missing or unreadable
 *
 * @example
 * const { createCsvFileToJsonStream } = require('./stream-csv-to-json');
 *
 * const jsonStream = await createCsvFileToJsonStream('./large-data.csv', {
 *   delimiter: ',',
 *   parseNumbers: true
 * });
 *
 * jsonStream.pipe(process.stdout);
 */
async function createCsvFileToJsonStream(filePath, options = {}) {
  return safeExecute(async () => {
    const fs = require('fs');
    const path = require('path');

    // Validate file path
    if (typeof filePath !== 'string' || filePath.trim() === '') {
      throw new ValidationError('File path must be a non-empty string');
    }

    if (!filePath.toLowerCase().endsWith('.csv')) {
      throw new ValidationError('File must have .csv extension');
    }

    // Prevent directory traversal: reject any path whose segments contain
    // a literal '..'. Checking whole segments (rather than substring
    // matching) avoids false positives on legitimate filenames that merely
    // contain consecutive dots, e.g. "report..v2.csv".
    const normalizedPath = path.normalize(filePath);
    const hasTraversalSegment = normalizedPath
      .split(/[\\/]/)
      .some((segment) => segment === '..');
    if (hasTraversalSegment) {
      throw new SecurityError('Directory traversal detected in file path');
    }

    const safePath = path.resolve(normalizedPath);

    // Check that the file exists and is readable before opening a stream
    try {
      await fs.promises.access(safePath, fs.constants.R_OK);
    } catch (error) {
      throw new FileSystemError(`File not found or not readable: ${safePath}`, error);
    }

    // Create the read stream and pipe it through the CSV transform
    const readStream = fs.createReadStream(safePath, 'utf8');
    const transformStream = createCsvToJsonStream(options);

    return readStream.pipe(transformStream);
  }, 'FILE_STREAM_ERROR', { function: 'createCsvFileToJsonStream' });
}
495
+
496
/**
 * Creates per-field validators from a (subset of a) JSON schema.
 *
 * Supported features: type checks (string/number/integer/boolean), required
 * fields via `schema.required`, `minimum`/`maximum`, `minLength`/`maxLength`,
 * `pattern`, and ISO-8601 output formatting for fields declared as
 * `{ type: 'string', format: 'date-time' }`.
 *
 * @private
 * @param {Object} schema - JSON schema with a `properties` map and optional `required` array
 * @returns {Object} Map of field name -> { type, required, validate, [format] }
 */
function createSchemaValidators(schema) {
  const validators = {};

  if (!schema.properties) {
    return validators;
  }

  // Defensive: a missing or non-array `required` means "nothing is required",
  // and guarantees `required` below is always a real boolean.
  const requiredKeys = Array.isArray(schema.required) ? schema.required : [];

  for (const [key, definition] of Object.entries(schema.properties)) {
    const validator = {
      type: definition.type,
      required: requiredKeys.includes(key)
    };

    // Compile the pattern once per field instead of on every validate() call.
    const patternRegExp = definition.pattern ? new RegExp(definition.pattern) : null;

    // Add a format function for date-time fields: normalize Dates and
    // parseable date strings to ISO-8601; pass anything else through.
    if (definition.type === 'string' && definition.format === 'date-time') {
      validator.format = (value) => {
        if (value instanceof Date) {
          return value.toISOString();
        }
        if (typeof value === 'string') {
          const date = new Date(value);
          if (!isNaN(date.getTime())) {
            return date.toISOString();
          }
        }
        return value;
      };
    }

    // Add the validation function
    validator.validate = (value) => {
      // null/undefined is acceptable only for non-required fields
      if (value === null || value === undefined) {
        return !validator.required;
      }

      // Type validation
      if (definition.type === 'string' && typeof value !== 'string') {
        return false;
      }
      if (definition.type === 'number' && typeof value !== 'number') {
        return false;
      }
      if (definition.type === 'integer' && (!Number.isInteger(value) || typeof value !== 'number')) {
        return false;
      }
      if (definition.type === 'boolean' && typeof value !== 'boolean') {
        return false;
      }

      // Range, length, and pattern constraints
      if (definition.minimum !== undefined && value < definition.minimum) {
        return false;
      }
      if (definition.maximum !== undefined && value > definition.maximum) {
        return false;
      }
      if (definition.minLength !== undefined && value.length < definition.minLength) {
        return false;
      }
      if (definition.maxLength !== undefined && value.length > definition.maxLength) {
        return false;
      }
      if (patternRegExp && !patternRegExp.test(value)) {
        return false;
      }

      return true;
    };

    validators[key] = validator;
  }

  return validators;
}
578
+
579
/**
 * Creates a writable object-mode stream that collects JSON objects into an array.
 *
 * The collected array is available at any time via `stream.getData()`; for
 * backward compatibility it is also assigned to `stream._collectedData` once
 * the stream finishes.
 *
 * @returns {Writable} Writable stream that collects data
 */
function createJsonCollectorStream() {
  const collectedData = [];

  const collector = new Writable({
    objectMode: true,

    write(chunk, encoding, callback) {
      collectedData.push(chunk);
      callback();
    },

    final(callback) {
      // Legacy access point: populated only after the 'finish' event.
      this._collectedData = collectedData;
      callback();
    }
  });

  // Live accessor: returns the array collected so far (same array instance
  // the stream appends to, so it reflects writes as they happen).
  collector.getData = () => collectedData;

  return collector;
}
601
+
602
// Public API of the module.
module.exports = {
  createCsvToJsonStream,
  streamCsvToJson,
  createCsvFileToJsonStream,
  createJsonCollectorStream,
  createSchemaValidators
};

// For ES6 module compatibility: expose the main factory as `default` so
// `import csvToJson from '...'` resolves to createCsvToJsonStream.
// NOTE(review): `module.exports` is used unconditionally above, so this
// guard looks redundant in plain CommonJS — presumably kept for bundler
// environments; confirm before removing.
if (typeof module !== 'undefined' && module.exports) {
  module.exports.default = createCsvToJsonStream;
}