@loaders.gl/parquet 3.4.6 → 4.0.0-alpha.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/dist.min.js +27 -34
- package/dist/dist.min.js.map +3 -3
- package/dist/es5/index.js +6 -6
- package/dist/es5/index.js.map +1 -1
- package/dist/es5/lib/arrow/convert-row-group-to-columns.js.map +1 -1
- package/dist/es5/lib/arrow/convert-schema-from-parquet.js +58 -42
- package/dist/es5/lib/arrow/convert-schema-from-parquet.js.map +1 -1
- package/dist/es5/lib/arrow/convert-schema-to-parquet.js +33 -31
- package/dist/es5/lib/arrow/convert-schema-to-parquet.js.map +1 -1
- package/dist/es5/lib/geo/decode-geo-metadata.js +12 -8
- package/dist/es5/lib/geo/decode-geo-metadata.js.map +1 -1
- package/dist/es5/lib/parsers/parse-parquet-to-columns.js +11 -7
- package/dist/es5/lib/parsers/parse-parquet-to-columns.js.map +1 -1
- package/dist/es5/lib/parsers/parse-parquet-to-rows.js +51 -29
- package/dist/es5/lib/parsers/parse-parquet-to-rows.js.map +1 -1
- package/dist/es5/lib/wasm/parse-parquet-wasm.js +6 -6
- package/dist/es5/lib/wasm/parse-parquet-wasm.js.map +1 -1
- package/dist/es5/parquet-loader.js +16 -4
- package/dist/es5/parquet-loader.js.map +1 -1
- package/dist/es5/parquet-wasm-loader.js +1 -1
- package/dist/es5/parquet-wasm-loader.js.map +1 -1
- package/dist/es5/parquet-wasm-writer.js +1 -1
- package/dist/es5/parquet-wasm-writer.js.map +1 -1
- package/dist/es5/parquet-writer.js +1 -1
- package/dist/es5/parquet-writer.js.map +1 -1
- package/dist/es5/parquetjs/encoder/parquet-encoder.js.map +1 -1
- package/dist/es5/parquetjs/parser/decoders.js.map +1 -1
- package/dist/es5/parquetjs/parser/parquet-reader.js +1 -1
- package/dist/es5/parquetjs/parser/parquet-reader.js.map +1 -1
- package/dist/es5/parquetjs/schema/declare.js +4 -4
- package/dist/es5/parquetjs/schema/declare.js.map +1 -1
- package/dist/es5/parquetjs/schema/schema.js +7 -7
- package/dist/es5/parquetjs/schema/schema.js.map +1 -1
- package/dist/es5/parquetjs/schema/shred.js +117 -22
- package/dist/es5/parquetjs/schema/shred.js.map +1 -1
- package/dist/esm/index.js +5 -5
- package/dist/esm/index.js.map +1 -1
- package/dist/esm/lib/arrow/convert-row-group-to-columns.js.map +1 -1
- package/dist/esm/lib/arrow/convert-schema-from-parquet.js +57 -41
- package/dist/esm/lib/arrow/convert-schema-from-parquet.js.map +1 -1
- package/dist/esm/lib/arrow/convert-schema-to-parquet.js +33 -31
- package/dist/esm/lib/arrow/convert-schema-to-parquet.js.map +1 -1
- package/dist/esm/lib/geo/decode-geo-metadata.js +12 -8
- package/dist/esm/lib/geo/decode-geo-metadata.js.map +1 -1
- package/dist/esm/lib/parsers/parse-parquet-to-columns.js +12 -8
- package/dist/esm/lib/parsers/parse-parquet-to-columns.js.map +1 -1
- package/dist/esm/lib/parsers/parse-parquet-to-rows.js +14 -3
- package/dist/esm/lib/parsers/parse-parquet-to-rows.js.map +1 -1
- package/dist/esm/lib/wasm/parse-parquet-wasm.js +3 -3
- package/dist/esm/lib/wasm/parse-parquet-wasm.js.map +1 -1
- package/dist/esm/parquet-loader.js +14 -2
- package/dist/esm/parquet-loader.js.map +1 -1
- package/dist/esm/parquet-wasm-loader.js +1 -1
- package/dist/esm/parquet-wasm-loader.js.map +1 -1
- package/dist/esm/parquet-wasm-writer.js +1 -1
- package/dist/esm/parquet-wasm-writer.js.map +1 -1
- package/dist/esm/parquet-writer.js +1 -1
- package/dist/esm/parquet-writer.js.map +1 -1
- package/dist/esm/parquetjs/encoder/parquet-encoder.js.map +1 -1
- package/dist/esm/parquetjs/parser/decoders.js.map +1 -1
- package/dist/esm/parquetjs/parser/parquet-reader.js +2 -2
- package/dist/esm/parquetjs/parser/parquet-reader.js.map +1 -1
- package/dist/esm/parquetjs/schema/declare.js +1 -1
- package/dist/esm/parquetjs/schema/declare.js.map +1 -1
- package/dist/esm/parquetjs/schema/schema.js +6 -6
- package/dist/esm/parquetjs/schema/schema.js.map +1 -1
- package/dist/esm/parquetjs/schema/shred.js +108 -21
- package/dist/esm/parquetjs/schema/shred.js.map +1 -1
- package/dist/index.d.ts +8 -49
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +8 -6
- package/dist/lib/arrow/convert-row-group-to-columns.d.ts +2 -2
- package/dist/lib/arrow/convert-row-group-to-columns.d.ts.map +1 -1
- package/dist/lib/arrow/convert-schema-from-parquet.d.ts +4 -4
- package/dist/lib/arrow/convert-schema-from-parquet.d.ts.map +1 -1
- package/dist/lib/arrow/convert-schema-from-parquet.js +48 -44
- package/dist/lib/arrow/convert-schema-to-parquet.d.ts +1 -1
- package/dist/lib/arrow/convert-schema-to-parquet.d.ts.map +1 -1
- package/dist/lib/arrow/convert-schema-to-parquet.js +30 -31
- package/dist/lib/geo/decode-geo-metadata.js +12 -8
- package/dist/lib/parsers/parse-parquet-to-columns.d.ts +2 -2
- package/dist/lib/parsers/parse-parquet-to-columns.d.ts.map +1 -1
- package/dist/lib/parsers/parse-parquet-to-columns.js +13 -7
- package/dist/lib/parsers/parse-parquet-to-rows.d.ts +3 -2
- package/dist/lib/parsers/parse-parquet-to-rows.d.ts.map +1 -1
- package/dist/lib/parsers/parse-parquet-to-rows.js +16 -19
- package/dist/lib/wasm/parse-parquet-wasm.d.ts +3 -3
- package/dist/lib/wasm/parse-parquet-wasm.d.ts.map +1 -1
- package/dist/lib/wasm/parse-parquet-wasm.js +3 -3
- package/dist/parquet-loader.d.ts +3 -14
- package/dist/parquet-loader.d.ts.map +1 -1
- package/dist/parquet-loader.js +14 -2
- package/dist/parquet-worker.js +31 -38
- package/dist/parquet-worker.js.map +3 -3
- package/dist/parquet-writer.d.ts +2 -1
- package/dist/parquet-writer.d.ts.map +1 -1
- package/dist/parquet-writer.js +1 -0
- package/dist/parquetjs/encoder/parquet-encoder.d.ts +4 -4
- package/dist/parquetjs/encoder/parquet-encoder.d.ts.map +1 -1
- package/dist/parquetjs/parser/decoders.d.ts +2 -2
- package/dist/parquetjs/parser/decoders.d.ts.map +1 -1
- package/dist/parquetjs/parser/parquet-reader.d.ts +6 -6
- package/dist/parquetjs/parser/parquet-reader.d.ts.map +1 -1
- package/dist/parquetjs/parser/parquet-reader.js +1 -1
- package/dist/parquetjs/schema/declare.d.ts +6 -5
- package/dist/parquetjs/schema/declare.d.ts.map +1 -1
- package/dist/parquetjs/schema/declare.js +3 -3
- package/dist/parquetjs/schema/schema.d.ts +4 -4
- package/dist/parquetjs/schema/schema.d.ts.map +1 -1
- package/dist/parquetjs/schema/schema.js +5 -5
- package/dist/parquetjs/schema/shred.d.ts +17 -111
- package/dist/parquetjs/schema/shred.d.ts.map +1 -1
- package/dist/parquetjs/schema/shred.js +127 -119
- package/package.json +8 -8
- package/src/index.ts +32 -9
- package/src/lib/arrow/convert-row-group-to-columns.ts +2 -2
- package/src/lib/arrow/convert-schema-from-parquet.ts +56 -66
- package/src/lib/arrow/convert-schema-to-parquet.ts +32 -44
- package/src/lib/geo/decode-geo-metadata.ts +17 -8
- package/src/lib/parsers/parse-parquet-to-columns.ts +22 -11
- package/src/lib/parsers/parse-parquet-to-rows.ts +28 -23
- package/src/lib/wasm/parse-parquet-wasm.ts +7 -7
- package/src/parquet-loader.ts +25 -2
- package/src/parquet-writer.ts +4 -1
- package/src/parquetjs/encoder/parquet-encoder.ts +11 -10
- package/src/parquetjs/parser/decoders.ts +3 -3
- package/src/parquetjs/parser/parquet-reader.ts +7 -7
- package/src/parquetjs/schema/declare.ts +6 -5
- package/src/parquetjs/schema/schema.ts +8 -8
- package/src/parquetjs/schema/shred.ts +142 -103
@@ -24,9 +24,9 @@ var __importStar = (this && this.__importStar) || function (mod) {
     return result;
 };
 Object.defineProperty(exports, "__esModule", { value: true });
-exports.
+exports.materializeColumns = exports.materializeRows = exports.shredRecord = exports.shredBuffer = exports.ParquetRowGroup = void 0;
 const declare_1 = require("./declare");
-Object.defineProperty(exports, "
+Object.defineProperty(exports, "ParquetRowGroup", { enumerable: true, get: function () { return declare_1.ParquetRowGroup; } });
 const Types = __importStar(require("./types"));
 function shredBuffer(schema) {
     const columnData = {};
@@ -46,14 +46,14 @@ exports.shredBuffer = shredBuffer;
  * 'Shred' a record into a list of <value, repetition_level, definition_level>
  * tuples per column using the Google Dremel Algorithm..
  *
- * The
- * will be returned. You may re-use the
- * to append to an existing
+ * The rowGroup argument must point to an object into which the shredded record
+ * will be returned. You may re-use the rowGroup for repeated calls to this function
+ * to append to an existing rowGroup, as long as the schema is unchanged.
  *
- * The format in which the shredded records will be stored in the
+ * The format in which the shredded records will be stored in the rowGroup is as
  * follows:
  *
- *
+ *   rowGroup = {
  *     columnData: [
  *       'my_col': {
  *         dlevels: [d1, d2, .. dN],
@@ -64,22 +64,22 @@ exports.shredBuffer = shredBuffer;
  *     rowCount: X,
  *   }
  */
-function shredRecord(schema, record,
+function shredRecord(schema, record, rowGroup) {
     /* shred the record, this may raise an exception */
     const data = shredBuffer(schema).columnData;
     shredRecordFields(schema.fields, record, data, 0, 0);
-    /* if no error during shredding, add the shredded record to the
-    if (
-
-
+    /* if no error during shredding, add the shredded record to the rowGroup */
+    if (rowGroup.rowCount === 0) {
+        rowGroup.rowCount = 1;
+        rowGroup.columnData = data;
         return;
     }
-
+    rowGroup.rowCount += 1;
     for (const field of schema.fieldList) {
-        Array.prototype.push.apply(
-        Array.prototype.push.apply(
-        Array.prototype.push.apply(
-
+        Array.prototype.push.apply(rowGroup.columnData[field.key].rlevels, data[field.key].rlevels);
+        Array.prototype.push.apply(rowGroup.columnData[field.key].dlevels, data[field.key].dlevels);
+        Array.prototype.push.apply(rowGroup.columnData[field.key].values, data[field.key].values);
+        rowGroup.columnData[field.key].count += data[field.key].count;
     }
 }
 exports.shredRecord = shredRecord;
@@ -139,10 +139,10 @@ function shredRecordFields(fields, record, data, rLevel, dLevel) {
  * tuples back to nested records (objects/arrays) using the Google Dremel
  * Algorithm..
  *
- * The
+ * The rowGroup argument must point to an object with the following structure (i.e.
  * the same structure that is returned by shredRecords):
  *
- *
+ *   rowGroup = {
  *     columnData: [
  *       'my_col': {
  *         dlevels: [d1, d2, .. dN],
@@ -153,22 +153,24 @@ function shredRecordFields(fields, record, data, rLevel, dLevel) {
  *     rowCount: X,
  *   }
  */
-function
-    const
-
-
+function materializeRows(schema, rowGroup) {
+    const rows = [];
+    // rows = new Array(rowGroup.rowCount).fill({})'
+    for (let i = 0; i < rowGroup.rowCount; i++) {
+        rows.push({});
     }
-    for (const key in
-        const columnData =
+    for (const key in rowGroup.columnData) {
+        const columnData = rowGroup.columnData[key];
         if (columnData.count) {
-
+            materializeColumnAsRows(schema, columnData, key, rows);
         }
     }
-    return
+    return rows;
 }
-exports.
+exports.materializeRows = materializeRows;
+/** Populate record fields for one column */
 // eslint-disable-next-line max-statements, complexity
-function
+function materializeColumnAsRows(schema, columnData, key, rows) {
     const field = schema.findField(key);
     const branch = schema.findFieldBranch(key);
     // tslint:disable-next-line:prefer-array-literal
@@ -180,7 +182,7 @@ function materializeColumn(schema, columnData, key, records) {
         rLevels[rLevel]++;
         rLevels.fill(0, rLevel + 1);
         let rIndex = 0;
-        let record =
+        let record = rows[rLevels[rIndex++] - 1];
         // Internal nodes - Build a nested row object
         for (const step of branch) {
             if (step === field || dLevel < step.dLevelMax) {
@@ -235,10 +237,10 @@ function materializeColumn(schema, columnData, key, records) {
  * tuples back to nested records (objects/arrays) using the Google Dremel
  * Algorithm..
  *
- * The
+ * The rowGroup argument must point to an object with the following structure (i.e.
  * the same structure that is returned by shredRecords):
  *
- *
+ *   rowGroup = {
  *     columnData: [
  *       'my_col': {
  *         dlevels: [d1, d2, .. dN],
@@ -248,100 +250,106 @@ function materializeColumn(schema, columnData, key, records) {
  *     ],
  *     rowCount: X,
  *   }
-
-
-
-
-
-
-
+ */
+function materializeColumns(schema, rowGroup) {
+    const columns = {};
+    for (const key in rowGroup.columnData) {
+        const columnData = rowGroup.columnData[key];
+        if (columnData.count) {
+            materializeColumnAsColumnarArray(schema, columnData, rowGroup.rowCount, key, columns);
+        }
     }
-
-    return columns;
+    return columns;
 }
-
+exports.materializeColumns = materializeColumns;
 // eslint-disable-next-line max-statements, complexity
-function
-
-
-    key: string,
-    columns: Record<string, unknown>
-) {
-    if (columnData.count <= 0) {
-        return;
-    }
-
-    const record = columns;
-
-    const field = schema.findField(key);
-    const branch = schema.findFieldBranch(key);
-
-    // tslint:disable-next-line:prefer-array-literal
-    const rLevels: number[] = new Array(field.rLevelMax + 1).fill(0);
-    let vIndex = 0;
-
-    let i = 0;
-    const dLevel = columnData.dlevels[i];
-    const rLevel = columnData.rlevels[i];
-    rLevels[rLevel]++;
-    rLevels.fill(0, rLevel + 1);
-
-    let rIndex = 0;
-    let record = records[rLevels[rIndex++] - 1];
-
-    // Internal nodes
-    for (const step of branch) {
-        if (step === field || dLevel < step.dLevelMax) {
-            break;
+function materializeColumnAsColumnarArray(schema, columnData, rowCount, key, columns) {
+    if (columnData.count <= 0) {
+        return;
     }
-
-
-
-
-
-
+    const field = schema.findField(key);
+    const branch = schema.findFieldBranch(key);
+    const columnName = branch[0].name;
+    let column;
+    const { values } = columnData;
+    if (values.length === rowCount && branch[0].primitiveType) {
+        // if (branch[0].repetitionType === `REQUIRED`) {
+        //   switch (branch[0].primitiveType) {
+        //     case 'INT32': return values instanceof Int32Array ? values : new Int32Array(values);
+        //   }
+        // }
+        column = values;
+    }
+    if (column) {
+        columns[columnName] = column;
+        return;
+    }
+    column = new Array(rowCount);
+    for (let i = 0; i < rowCount; i++) {
+        column[i] = {};
+    }
+    columns[columnName] = column;
+    // tslint:disable-next-line:prefer-array-literal
+    const rLevels = new Array(field.rLevelMax + 1).fill(0);
+    let vIndex = 0;
+    for (let i = 0; i < columnData.count; i++) {
+        const dLevel = columnData.dlevels[i];
+        const rLevel = columnData.rlevels[i];
+        rLevels[rLevel]++;
+        rLevels.fill(0, rLevel + 1);
+        let rIndex = 0;
+        let record = column[rLevels[rIndex++] - 1];
+        // Internal nodes - Build a nested row object
+        for (const step of branch) {
+            if (step === field || dLevel < step.dLevelMax) {
+                break;
+            }
+            switch (step.repetitionType) {
+                case 'REPEATED':
+                    if (!(step.name in record)) {
+                        // eslint-disable max-depth
+                        record[step.name] = [];
+                    }
+                    const ix = rLevels[rIndex++];
+                    while (record[step.name].length <= ix) {
+                        // eslint-disable max-depth
+                        record[step.name].push({});
+                    }
+                    record = record[step.name][ix];
+                    break;
+                default:
+                    record[step.name] = record[step.name] || {};
+                    record = record[step.name];
+            }
         }
-
-
-
-
+        // Leaf node - Add the value
+        if (dLevel === field.dLevelMax) {
+            const value = Types.fromPrimitive(
+            // @ts-ignore
+            field.originalType || field.primitiveType, columnData.values[vIndex], field);
+            vIndex++;
+            switch (field.repetitionType) {
+                case 'REPEATED':
+                    if (!(field.name in record)) {
+                        // eslint-disable max-depth
+                        record[field.name] = [];
+                    }
+                    const ix = rLevels[rIndex];
+                    while (record[field.name].length <= ix) {
+                        // eslint-disable max-depth
+                        record[field.name].push(null);
+                    }
+                    record[field.name][ix] = value;
+                    break;
+                default:
+                    record[field.name] = value;
+            }
         }
-        record = record[step.name][ix];
-        break;
-
-        default:
-        record[step.name] = record[step.name] || {};
-        record = record[step.name];
     }
-
-
-
-
-    const value = Types.fromPrimitive(
-    // @ts-ignore
-    field.originalType || field.primitiveType,
-    columnData.values[vIndex],
-    field
-    );
-    vIndex++;
-
-    switch (field.repetitionType) {
-    case 'REPEATED':
-        if (!(field.name in record)) {
-            // eslint-disable max-depth
-            record[field.name] = [];
-        }
-        const ix = rLevels[rIndex];
-        while (record[field.name].length <= ix) {
-            // eslint-disable max-depth
-            record[field.name].push(null);
+    // Remove one level of nesting
+    for (let i = 0; i < rowCount; ++i) {
+        if (columnName in column[i]) {
+            column[i] = column[i][columnName];
         }
-        record[field.name][ix] = value;
-        break;
-
-        default:
-        record[field.name] = value;
     }
-    }
 }
-*/
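
The doc comments above describe the row-group buffer that shredRecord fills and that the new materializeRows / materializeColumns functions read back. The following is a minimal TypeScript sketch of that shape; only the property names (rowCount, columnData, dlevels, rlevels, values, count) come from the diff, while the interface names are invented for illustration and are not exported by the package:

```typescript
// Hypothetical types mirroring the rowGroup structure documented in the diff above.
interface ShreddedColumnData {
  dlevels: number[]; // definition level for each shredded tuple
  rlevels: number[]; // repetition level for each shredded tuple
  values: unknown[]; // leaf values in record order
  count: number;     // number of <value, rlevel, dlevel> tuples
}

interface RowGroupSketch {
  rowCount: number;
  columnData: Record<string, ShreddedColumnData>;
}
```

materializeRows turns such a buffer back into an array of row objects, while the new materializeColumns keeps each column as a flat array, reusing columnData.values directly when a column is flat and fully defined.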
package/package.json
CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@loaders.gl/parquet",
-  "version": "
+  "version": "4.0.0-alpha.10",
   "description": "Framework-independent loader for Apache Parquet files",
   "license": "MIT",
   "publishConfig": {
@@ -41,10 +41,10 @@
     "./src/lib/wasm/load-wasm/load-wasm-node.ts": "./src/lib/wasm/load-wasm/load-wasm-browser.ts"
   },
   "dependencies": {
-    "@loaders.gl/bson": "
-    "@loaders.gl/compression": "
-    "@loaders.gl/loader-utils": "
-    "@loaders.gl/schema": "
+    "@loaders.gl/bson": "4.0.0-alpha.10",
+    "@loaders.gl/compression": "4.0.0-alpha.10",
+    "@loaders.gl/loader-utils": "4.0.0-alpha.10",
+    "@loaders.gl/schema": "4.0.0-alpha.10",
     "async-mutex": "^0.2.2",
     "brotli": "^1.3.2",
     "int53": "^0.2.4",
@@ -58,14 +58,14 @@
     "zstd-codec": "^0.1"
   },
   "peerDependencies": {
-    "apache-arrow": "^
+    "apache-arrow": "^9.0.0"
   },
   "devDependencies": {
     "@types/node": "^10.14.15",
     "@types/node-int64": "^0.4.29",
     "@types/thrift": "^0.10.8",
     "@types/varint": "^5.0.0",
-    "apache-arrow": "^
+    "apache-arrow": "^9.0.0"
   },
-  "gitHead": "
+  "gitHead": "7efdbe09e02098aad6d985e4d6465d08806e19a9"
 }
package/src/index.ts
CHANGED
@@ -1,33 +1,56 @@
+// loaders.gl, MIT license
+
 import type {LoaderWithParser} from '@loaders.gl/loader-utils';
+import type {
+  ObjectRowTable,
+  ObjectRowTableBatch,
+  ColumnarTable,
+  ColumnarTableBatch
+} from '@loaders.gl/schema';
+import type {Table as ArrowTable} from 'apache-arrow';
 
 // ParquetLoader
 
-import {
-
+import {
+  ParquetLoader as ParquetWorkerLoader,
+  ParquetLoader as ParquetColumnarWorkerLoader,
+  ParquetLoaderOptions
+} from './parquet-loader';
 import {parseParquet, parseParquetFileInBatches} from './lib/parsers/parse-parquet-to-rows';
 import {
   parseParquetInColumns,
   parseParquetFileInColumnarBatches
 } from './lib/parsers/parse-parquet-to-columns';
-
+
+import {parseParquetWasm, ParquetWasmLoaderOptions} from './lib/wasm/parse-parquet-wasm';
+import {ParquetWasmLoader as ParquetWasmWorkerLoader} from './parquet-wasm-loader';
 
 export {ParquetWorkerLoader, ParquetWasmWorkerLoader};
 
 /** ParquetJS table loader */
-export const ParquetLoader
+export const ParquetLoader: LoaderWithParser<
+  ObjectRowTable,
+  ObjectRowTableBatch,
+  ParquetLoaderOptions
+> = {
   ...ParquetWorkerLoader,
   parse: parseParquet,
   parseFileInBatches: parseParquetFileInBatches
 };
 
 /** ParquetJS table loader */
-
-
+// @ts-expect-error
+export const ParquetColumnarLoader: LoaderWithParser<
+  ColumnarTable,
+  ColumnarTableBatch,
+  ParquetLoaderOptions
+> = {
+  ...ParquetColumnarWorkerLoader,
   parse: parseParquetInColumns,
   parseFileInBatches: parseParquetFileInColumnarBatches
 };
 
-export const ParquetWasmLoader = {
+export const ParquetWasmLoader: LoaderWithParser<ArrowTable, never, ParquetWasmLoaderOptions> = {
   ...ParquetWasmWorkerLoader,
   parse: parseParquetWasm
 };
@@ -46,8 +69,8 @@ export {ParquetReader} from './parquetjs/parser/parquet-reader';
 export {ParquetEncoder} from './parquetjs/encoder/parquet-encoder';
 
 export {
-
-
+  convertParquetSchema,
+  convertParquetSchema as convertParquetToArrowSchema
 } from './lib/arrow/convert-schema-from-parquet';
 
 // TESTS
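
With the loader objects now declared as LoaderWithParser, the intent is that a load() call can carry a typed table result instead of any. A hedged usage sketch follows; it assumes @loaders.gl/core is installed alongside this package and that a local data.parquet file exists:

```typescript
import {load} from '@loaders.gl/core';
import {ParquetLoader, ParquetColumnarLoader} from '@loaders.gl/parquet';

async function loadTables() {
  // Row-oriented parse; with the typed loader the result is meant to be an ObjectRowTable.
  const rowTable = await load('data.parquet', ParquetLoader, {worker: false});

  // Columnar parse; the result is meant to be a ColumnarTable.
  const columnarTable = await load('data.parquet', ParquetColumnarLoader, {worker: false});

  return {rowTable, columnarTable};
}
```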
package/src/lib/arrow/convert-row-group-to-columns.ts
CHANGED
@@ -1,11 +1,11 @@
 // loaders.gl, MIT license
 
 import {Schema} from '@loaders.gl/schema';
-import {
+import {ParquetRowGroup} from '@loaders.gl/parquet/parquetjs/schema/declare';
 
 export function convertParquetRowGroupToColumns(
   schema: Schema,
-  rowGroup:
+  rowGroup: ParquetRowGroup
 ): Record<string, any[]> {
   const columns: Record<string, any[]> = {};
   for (const [columnName, data] of Object.entries(rowGroup.columnData)) {
package/src/lib/arrow/convert-schema-from-parquet.ts
CHANGED
@@ -1,68 +1,57 @@
 // loaders.gl, MIT license
 
+import {Schema, Field, DataType} from '@loaders.gl/schema';
+
 import type {ParquetSchema} from '../../parquetjs/schema/schema';
 import type {FieldDefinition, ParquetField, ParquetType} from '../../parquetjs/schema/declare';
-import {FileMetaData} from '
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-  TIME_MICROS: Int64,
-  TIMESTAMP_MILLIS: Int64,
-  TIMESTAMP_MICROS: Int64,
-  UINT_8: Int32,
-  UINT_16: Uint16,
-  UINT_32: Uint32,
-  UINT_64: Uint64,
-  INT_8: Int8,
-  INT_16: Int16,
-  INT_32: Int32,
-  INT_64: Int64,
-  JSON: Binary,
-  BSON: Binary,
-  // TODO check interval type
-  INTERVAL: Binary,
-  DECIMAL_INT32: Float32,
-  DECIMAL_INT64: Float64,
-  DECIMAL_BYTE_ARRAY: Float64,
-  DECIMAL_FIXED_LEN_BYTE_ARRAY: Float64
+import {FileMetaData} from '../../parquetjs/parquet-thrift';
+
+export const PARQUET_TYPE_MAPPING: {[type in ParquetType]: DataType} = {
+  BOOLEAN: 'bool',
+  INT32: 'int32',
+  INT64: 'float64',
+  INT96: 'float64',
+  FLOAT: 'float32',
+  DOUBLE: 'float64',
+  BYTE_ARRAY: 'binary',
+  FIXED_LEN_BYTE_ARRAY: 'binary',
+  UTF8: 'utf8',
+  DATE: 'int32',
+  TIME_MILLIS: 'int64',
+  TIME_MICROS: 'int64',
+  TIMESTAMP_MILLIS: 'int64',
+  TIMESTAMP_MICROS: 'int64',
+  UINT_8: 'int32',
+  UINT_16: 'uint16',
+  UINT_32: 'uint32',
+  UINT_64: 'uint64',
+  INT_8: 'int8',
+  INT_16: 'int16',
+  INT_32: 'int32',
+  INT_64: 'int64',
+  JSON: 'binary',
+  BSON: 'binary',
+  // TODO check interal type
+  INTERVAL: 'binary',
+  DECIMAL_INT32: 'float32',
+  DECIMAL_INT64: 'float64',
+  DECIMAL_BYTE_ARRAY: 'float64',
+  DECIMAL_FIXED_LEN_BYTE_ARRAY: 'float64'
 };
 
-export function
+export function convertParquetSchema(
   parquetSchema: ParquetSchema,
-  parquetMetadata
+  parquetMetadata: FileMetaData | null
 ): Schema {
   const fields = getFields(parquetSchema.schema);
   const metadata = parquetMetadata && getSchemaMetadata(parquetMetadata);
-
+
+  const schema: Schema = {
+    fields,
+    metadata: metadata || {}
+  };
+
+  return schema;
 }
 
 function getFields(schema: FieldDefinition): Field[] {
@@ -72,13 +61,12 @@ function getFields(schema: FieldDefinition): Field[] {
     const field = schema[name];
 
     if (field.fields) {
-      const
-
-      fields.push(nestedField);
+      const children = getFields(field.fields);
+      fields.push({name, type: {type: 'struct', children}, nullable: field.optional});
     } else {
-      const
+      const type = PARQUET_TYPE_MAPPING[field.type];
       const metadata = getFieldMetadata(field);
-      const arrowField =
+      const arrowField = {name, type, nullable: field.optional, metadata};
       fields.push(arrowField);
     }
   }
@@ -86,27 +74,29 @@ function getFields(schema: FieldDefinition): Field[] {
   return fields;
 }
 
-function getFieldMetadata(field: ParquetField):
-
+function getFieldMetadata(field: ParquetField): Record<string, string> | undefined {
+  let metadata: Record<string, string> | undefined;
 
   for (const key in field) {
     if (key !== 'name') {
       let value = field[key] || '';
       value = typeof field[key] !== 'string' ? JSON.stringify(field[key]) : field[key];
-      metadata
+      metadata = metadata || {};
+      metadata[key] = value;
     }
   }
 
   return metadata;
 }
 
-function getSchemaMetadata(parquetMetadata: FileMetaData):
-
+function getSchemaMetadata(parquetMetadata: FileMetaData): Record<string, string> | undefined {
+  let metadata: Record<string, string> | undefined;
 
   const keyValueList = parquetMetadata.key_value_metadata || [];
   for (const {key, value} of keyValueList) {
     if (typeof value === 'string') {
-      metadata
+      metadata = metadata || {};
+      metadata[key] = value;
     }
   }
 
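
For reference, the new string-based PARQUET_TYPE_MAPPING means each converted field records its loaders.gl DataType name directly. The sketch below is illustrative only: the input column ('id', optional INT32) is invented, and only the mapped type strings and the {fields, metadata} shape come from the diff above:

```typescript
// Hypothetical example of the Field / Schema shape assembled by convertParquetSchema.
// An optional INT32 Parquet column named 'id' maps via PARQUET_TYPE_MAPPING to:
const idField = {name: 'id', type: 'int32', nullable: true, metadata: {}};

// convertParquetSchema then wraps the converted fields together with any
// file-level key/value metadata found in the Parquet footer:
const schemaSketch = {fields: [idField], metadata: {}};
```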