@loaders.gl/csv 4.2.0-alpha.3 → 4.2.0-alpha.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,209 +1,258 @@
1
+ // loaders.gl
2
+ // SPDX-License-Identifier: MIT
3
+ // Copyright (c) vis.gl contributors
1
4
  import { AsyncQueue, TableBatchBuilder, convertToArrayRow, convertToObjectRow } from '@loaders.gl/schema';
2
5
  import Papa from "./papaparse/papaparse.js";
3
6
  import AsyncIteratorStreamer from "./papaparse/async-iterator-streamer.js";
4
- const VERSION = typeof "4.2.0-alpha.3" !== 'undefined' ? "4.2.0-alpha.3" : 'latest';
7
+ // __VERSION__ is injected by babel-plugin-version-inline
8
+ // @ts-ignore TS2304: Cannot find name '__VERSION__'.
9
+ const VERSION = typeof "4.2.0-alpha.4" !== 'undefined' ? "4.2.0-alpha.4" : 'latest';
5
10
  const DEFAULT_CSV_SHAPE = 'object-row-table';
6
11
  export const CSVLoader = {
7
- id: 'csv',
8
- module: 'csv',
9
- name: 'CSV',
10
- version: VERSION,
11
- extensions: ['csv', 'tsv', 'dsv'],
12
- mimeTypes: ['text/csv', 'text/tab-separated-values', 'text/dsv'],
13
- category: 'table',
14
- parse: async (arrayBuffer, options) => parseCSV(new TextDecoder().decode(arrayBuffer), options),
15
- parseText: (text, options) => parseCSV(text, options),
16
- parseInBatches: parseCSVInBatches,
17
- options: {
18
- csv: {
19
- shape: DEFAULT_CSV_SHAPE,
20
- optimizeMemoryUsage: false,
21
- header: 'auto',
22
- columnPrefix: 'column',
23
- quoteChar: '"',
24
- escapeChar: '"',
25
- dynamicTyping: true,
26
- comments: false,
27
- skipEmptyLines: true,
28
- delimitersToGuess: [',', '\t', '|', ';']
12
+ id: 'csv',
13
+ module: 'csv',
14
+ name: 'CSV',
15
+ version: VERSION,
16
+ extensions: ['csv', 'tsv', 'dsv'],
17
+ mimeTypes: ['text/csv', 'text/tab-separated-values', 'text/dsv'],
18
+ category: 'table',
19
+ parse: async (arrayBuffer, options) => parseCSV(new TextDecoder().decode(arrayBuffer), options),
20
+ parseText: (text, options) => parseCSV(text, options),
21
+ parseInBatches: parseCSVInBatches,
22
+ // @ts-ignore
23
+ // testText: null,
24
+ options: {
25
+ csv: {
26
+ shape: DEFAULT_CSV_SHAPE, // 'object-row-table'
27
+ optimizeMemoryUsage: false,
28
+ // CSV options
29
+ header: 'auto',
30
+ columnPrefix: 'column',
31
+ // delimiter: auto
32
+ // newline: auto
33
+ quoteChar: '"',
34
+ escapeChar: '"',
35
+ dynamicTyping: true,
36
+ comments: false,
37
+ skipEmptyLines: true,
38
+ // transform: null?
39
+ delimitersToGuess: [',', '\t', '|', ';']
40
+ // fastMode: auto
41
+ }
29
42
  }
30
- }
31
43
  };
32
44
  async function parseCSV(csvText, options) {
33
- const csvOptions = {
34
- ...CSVLoader.options.csv,
35
- ...(options === null || options === void 0 ? void 0 : options.csv)
36
- };
37
- const firstRow = readFirstRow(csvText);
38
- const header = csvOptions.header === 'auto' ? isHeaderRow(firstRow) : Boolean(csvOptions.header);
39
- const parseWithHeader = header;
40
- const papaparseConfig = {
41
- ...csvOptions,
42
- header: parseWithHeader,
43
- download: false,
44
- transformHeader: parseWithHeader ? duplicateColumnTransformer() : undefined,
45
- error: e => {
46
- throw new Error(e);
45
+ // Apps can call the parse method directly, so we apply default options here
46
+ const csvOptions = { ...CSVLoader.options.csv, ...options?.csv };
47
+ const firstRow = readFirstRow(csvText);
48
+ const header = csvOptions.header === 'auto' ? isHeaderRow(firstRow) : Boolean(csvOptions.header);
49
+ const parseWithHeader = header;
50
+ const papaparseConfig = {
51
+ // dynamicTyping: true,
52
+ ...csvOptions,
53
+ header: parseWithHeader,
54
+ download: false, // We handle loading, no need for papaparse to do it for us
55
+ transformHeader: parseWithHeader ? duplicateColumnTransformer() : undefined,
56
+ error: (e) => {
57
+ throw new Error(e);
58
+ }
59
+ };
60
+ const result = Papa.parse(csvText, papaparseConfig);
61
+ const rows = result.data;
62
+ const headerRow = result.meta.fields || generateHeader(csvOptions.columnPrefix, firstRow.length);
63
+ const shape = csvOptions.shape || DEFAULT_CSV_SHAPE;
64
+ switch (shape) {
65
+ case 'object-row-table':
66
+ return {
67
+ shape: 'object-row-table',
68
+ data: rows.map((row) => (Array.isArray(row) ? convertToObjectRow(row, headerRow) : row))
69
+ };
70
+ case 'array-row-table':
71
+ return {
72
+ shape: 'array-row-table',
73
+ data: rows.map((row) => (Array.isArray(row) ? row : convertToArrayRow(row, headerRow)))
74
+ };
75
+ default:
76
+ throw new Error(shape);
47
77
  }
48
- };
49
- const result = Papa.parse(csvText, papaparseConfig);
50
- const rows = result.data;
51
- const headerRow = result.meta.fields || generateHeader(csvOptions.columnPrefix, firstRow.length);
52
- const shape = csvOptions.shape || DEFAULT_CSV_SHAPE;
53
- switch (shape) {
54
- case 'object-row-table':
55
- return {
56
- shape: 'object-row-table',
57
- data: rows.map(row => Array.isArray(row) ? convertToObjectRow(row, headerRow) : row)
58
- };
59
- case 'array-row-table':
60
- return {
61
- shape: 'array-row-table',
62
- data: rows.map(row => Array.isArray(row) ? row : convertToArrayRow(row, headerRow))
63
- };
64
- default:
65
- throw new Error(shape);
66
- }
67
78
  }
79
+ // TODO - support batch size 0 = no batching/single batch?
68
80
  function parseCSVInBatches(asyncIterator, options) {
69
- var _options;
70
- options = {
71
- ...options
72
- };
73
- if (options.batchSize === 'auto') {
74
- options.batchSize = 4000;
75
- }
76
- const csvOptions = {
77
- ...CSVLoader.options.csv,
78
- ...((_options = options) === null || _options === void 0 ? void 0 : _options.csv)
79
- };
80
- const asyncQueue = new AsyncQueue();
81
- let isFirstRow = true;
82
- let headerRow = null;
83
- let tableBatchBuilder = null;
84
- let schema = null;
85
- const config = {
86
- ...csvOptions,
87
- header: false,
88
- download: false,
89
- chunkSize: 1024 * 1024 * 5,
90
- skipEmptyLines: false,
91
- step(results) {
92
- let row = results.data;
93
- if (csvOptions.skipEmptyLines) {
94
- const collapsedRow = row.flat().join('').trim();
95
- if (collapsedRow === '') {
96
- return;
97
- }
98
- }
99
- const bytesUsed = results.meta.cursor;
100
- if (isFirstRow && !headerRow) {
101
- const header = csvOptions.header === 'auto' ? isHeaderRow(row) : Boolean(csvOptions.header);
102
- if (header) {
103
- headerRow = row.map(duplicateColumnTransformer());
104
- return;
105
- }
106
- }
107
- if (isFirstRow) {
108
- isFirstRow = false;
109
- if (!headerRow) {
110
- headerRow = generateHeader(csvOptions.columnPrefix, row.length);
111
- }
112
- schema = deduceSchema(row, headerRow);
113
- }
114
- if (csvOptions.optimizeMemoryUsage) {
115
- row = JSON.parse(JSON.stringify(row));
116
- }
117
- const shape = csvOptions.shape || DEFAULT_CSV_SHAPE;
118
- tableBatchBuilder = tableBatchBuilder || new TableBatchBuilder(schema, {
119
- shape,
120
- ...options
121
- });
122
- try {
123
- tableBatchBuilder.addRow(row);
124
- const batch = tableBatchBuilder && tableBatchBuilder.getFullBatch({
125
- bytesUsed
126
- });
127
- if (batch) {
128
- asyncQueue.enqueue(batch);
129
- }
130
- } catch (error) {
131
- asyncQueue.enqueue(error);
132
- }
133
- },
134
- complete(results) {
135
- try {
136
- const bytesUsed = results.meta.cursor;
137
- const batch = tableBatchBuilder && tableBatchBuilder.getFinalBatch({
138
- bytesUsed
139
- });
140
- if (batch) {
141
- asyncQueue.enqueue(batch);
142
- }
143
- } catch (error) {
144
- asyncQueue.enqueue(error);
145
- }
146
- asyncQueue.close();
81
+ // Papaparse does not support standard batch size handling
82
+ // TODO - investigate papaparse chunks mode
83
+ options = { ...options };
84
+ if (options.batchSize === 'auto') {
85
+ options.batchSize = 4000;
147
86
  }
148
- };
149
- Papa.parse(asyncIterator, config, AsyncIteratorStreamer);
150
- return asyncQueue;
87
+ // Apps can call the parse method directly, so we apply default options here
88
+ const csvOptions = { ...CSVLoader.options.csv, ...options?.csv };
89
+ const asyncQueue = new AsyncQueue();
90
+ let isFirstRow = true;
91
+ let headerRow = null;
92
+ let tableBatchBuilder = null;
93
+ let schema = null;
94
+ const config = {
95
+ // dynamicTyping: true, // Convert numbers and boolean values in rows from strings,
96
+ ...csvOptions,
97
+ header: false, // Unfortunately, header detection is not automatic and does not infer shapes
98
+ download: false, // We handle loading, no need for papaparse to do it for us
99
+ // chunkSize is set to 5MB explicitly (same as Papaparse default) due to a bug where the
100
+ // streaming parser gets stuck if skipEmptyLines and a step callback are both supplied.
101
+ // See https://github.com/mholt/PapaParse/issues/465
102
+ chunkSize: 1024 * 1024 * 5,
103
+ // skipEmptyLines is set to a boolean value if supplied. Greedy is set to true
104
+ // skipEmptyLines is handled manually given two bugs where the streaming parser gets stuck if
105
+ // both of the skipEmptyLines and step callback options are provided:
106
+ // - true doesn't work unless chunkSize is set: https://github.com/mholt/PapaParse/issues/465
107
+ // - greedy doesn't work: https://github.com/mholt/PapaParse/issues/825
108
+ skipEmptyLines: false,
109
+ // step is called on every row
110
+ // eslint-disable-next-line complexity, max-statements
111
+ step(results) {
112
+ let row = results.data;
113
+ if (csvOptions.skipEmptyLines) {
114
+ // Manually reject lines that are empty
115
+ const collapsedRow = row.flat().join('').trim();
116
+ if (collapsedRow === '') {
117
+ return;
118
+ }
119
+ }
120
+ const bytesUsed = results.meta.cursor;
121
+ // Check if we need to save a header row
122
+ if (isFirstRow && !headerRow) {
123
+ // Auto detects or can be forced with csvOptions.header
124
+ const header = csvOptions.header === 'auto' ? isHeaderRow(row) : Boolean(csvOptions.header);
125
+ if (header) {
126
+ headerRow = row.map(duplicateColumnTransformer());
127
+ return;
128
+ }
129
+ }
130
+ // If first data row, we can deduce the schema
131
+ if (isFirstRow) {
132
+ isFirstRow = false;
133
+ if (!headerRow) {
134
+ headerRow = generateHeader(csvOptions.columnPrefix, row.length);
135
+ }
136
+ schema = deduceSchema(row, headerRow);
137
+ }
138
+ if (csvOptions.optimizeMemoryUsage) {
139
+ // A workaround to allocate new strings and don't retain pointers to original strings.
140
+ // https://bugs.chromium.org/p/v8/issues/detail?id=2869
141
+ row = JSON.parse(JSON.stringify(row));
142
+ }
143
+ const shape = csvOptions.shape || DEFAULT_CSV_SHAPE;
144
+ // Add the row
145
+ tableBatchBuilder =
146
+ tableBatchBuilder ||
147
+ new TableBatchBuilder(
148
+ // @ts-expect-error TODO this is not a proper schema
149
+ schema, {
150
+ shape,
151
+ ...options
152
+ });
153
+ try {
154
+ tableBatchBuilder.addRow(row);
155
+ // If a batch has been completed, emit it
156
+ const batch = tableBatchBuilder && tableBatchBuilder.getFullBatch({ bytesUsed });
157
+ if (batch) {
158
+ asyncQueue.enqueue(batch);
159
+ }
160
+ }
161
+ catch (error) {
162
+ asyncQueue.enqueue(error);
163
+ }
164
+ },
165
+ // complete is called when all rows have been read
166
+ complete(results) {
167
+ try {
168
+ const bytesUsed = results.meta.cursor;
169
+ // Ensure any final (partial) batch gets emitted
170
+ const batch = tableBatchBuilder && tableBatchBuilder.getFinalBatch({ bytesUsed });
171
+ if (batch) {
172
+ asyncQueue.enqueue(batch);
173
+ }
174
+ }
175
+ catch (error) {
176
+ asyncQueue.enqueue(error);
177
+ }
178
+ asyncQueue.close();
179
+ }
180
+ };
181
+ Papa.parse(asyncIterator, config, AsyncIteratorStreamer);
182
+ // TODO - Does it matter if we return asyncIterable or asyncIterator
183
+ // return asyncQueue[Symbol.asyncIterator]();
184
+ return asyncQueue;
151
185
  }
186
+ /**
187
+ * Checks if a certain row is a header row
188
+ * @param row the row to check
189
+ * @returns true if the row looks like a header
190
+ */
152
191
  function isHeaderRow(row) {
153
- return row && row.every(value => typeof value === 'string');
192
+ return row && row.every((value) => typeof value === 'string');
154
193
  }
194
+ /**
195
+ * Reads, parses, and returns the first row of a CSV text
196
+ * @param csvText the csv text to parse
197
+ * @returns the first row
198
+ */
155
199
  function readFirstRow(csvText) {
156
- const result = Papa.parse(csvText, {
157
- download: false,
158
- dynamicTyping: true,
159
- preview: 1
160
- });
161
- return result.data[0];
200
+ const result = Papa.parse(csvText, {
201
+ download: false,
202
+ dynamicTyping: true,
203
+ preview: 1
204
+ });
205
+ return result.data[0];
162
206
  }
207
+ /**
208
+ * Creates a transformer that renames duplicate columns. This is needed as Papaparse doesn't handle
209
+ * duplicate header columns and would use the latest occurrence by default.
210
+ * See the header option in https://www.papaparse.com/docs#config
211
+ * @returns a transform function that returns sanitized names for duplicate fields
212
+ */
163
213
  function duplicateColumnTransformer() {
164
- const observedColumns = new Set();
165
- return col => {
166
- let colName = col;
167
- let counter = 1;
168
- while (observedColumns.has(colName)) {
169
- colName = `${col}.${counter}`;
170
- counter++;
171
- }
172
- observedColumns.add(colName);
173
- return colName;
174
- };
214
+ const observedColumns = new Set();
215
+ return (col) => {
216
+ let colName = col;
217
+ let counter = 1;
218
+ while (observedColumns.has(colName)) {
219
+ colName = `${col}.${counter}`;
220
+ counter++;
221
+ }
222
+ observedColumns.add(colName);
223
+ return colName;
224
+ };
175
225
  }
176
- function generateHeader(columnPrefix) {
177
- let count = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : 0;
178
- const headers = [];
179
- for (let i = 0; i < count; i++) {
180
- headers.push(`${columnPrefix}${i + 1}`);
181
- }
182
- return headers;
226
+ /**
227
+ * Generates the header of a CSV given a prefix and a column count
228
+ * @param columnPrefix the columnPrefix to use
229
+ * @param count the count of column names to generate
230
+ * @returns an array of column names
231
+ */
232
+ function generateHeader(columnPrefix, count = 0) {
233
+ const headers = [];
234
+ for (let i = 0; i < count; i++) {
235
+ headers.push(`${columnPrefix}${i + 1}`);
236
+ }
237
+ return headers;
183
238
  }
184
239
  function deduceSchema(row, headerRow) {
185
- const schema = headerRow ? {} : [];
186
- for (let i = 0; i < row.length; i++) {
187
- const columnName = headerRow && headerRow[i] || i;
188
- const value = row[i];
189
- switch (typeof value) {
190
- case 'number':
191
- case 'boolean':
192
- schema[columnName] = {
193
- name: String(columnName),
194
- index: i,
195
- type: Float32Array
196
- };
197
- break;
198
- case 'string':
199
- default:
200
- schema[columnName] = {
201
- name: String(columnName),
202
- index: i,
203
- type: Array
204
- };
240
+ const schema = headerRow ? {} : [];
241
+ for (let i = 0; i < row.length; i++) {
242
+ const columnName = (headerRow && headerRow[i]) || i;
243
+ const value = row[i];
244
+ switch (typeof value) {
245
+ case 'number':
246
+ case 'boolean':
247
+ // TODO - booleans could be handled differently...
248
+ schema[columnName] = { name: String(columnName), index: i, type: Float32Array };
249
+ break;
250
+ case 'string':
251
+ default:
252
+ schema[columnName] = { name: String(columnName), index: i, type: Array };
253
+ // We currently only handle numeric rows
254
+ // TODO we could offer a function to map strings to numbers?
255
+ }
205
256
  }
206
- }
207
- return schema;
257
+ return schema;
208
258
  }
209
- //# sourceMappingURL=csv-loader.js.map
@@ -1,18 +1,20 @@
1
+ // loaders.gl
2
+ // SPDX-License-Identifier: MIT
3
+ // Copyright (c) vis.gl contributors
1
4
  import { encodeTableAsCSV } from "./lib/encoders/encode-csv.js";
2
5
  export const CSVWriter = {
3
- id: 'csv',
4
- version: 'latest',
5
- module: 'csv',
6
- name: 'CSV',
7
- extensions: ['csv'],
8
- mimeTypes: ['text/csv'],
9
- options: {
10
- csv: {
11
- useDisplayNames: false
12
- }
13
- },
14
- text: true,
15
- encode: async (table, options) => new TextEncoder().encode(encodeTableAsCSV(table, options)).buffer,
16
- encodeTextSync: (table, options) => encodeTableAsCSV(table, options)
6
+ id: 'csv',
7
+ version: 'latest',
8
+ module: 'csv',
9
+ name: 'CSV',
10
+ extensions: ['csv'],
11
+ mimeTypes: ['text/csv'],
12
+ options: {
13
+ csv: {
14
+ useDisplayNames: false
15
+ }
16
+ },
17
+ text: true,
18
+ encode: async (table, options) => new TextEncoder().encode(encodeTableAsCSV(table, options)).buffer,
19
+ encodeTextSync: (table, options) => encodeTableAsCSV(table, options)
17
20
  };
18
- //# sourceMappingURL=csv-writer.js.map