@loaders.gl/parquet 3.4.6 → 4.0.0-alpha.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (130)
  1. package/dist/dist.min.js +27 -34
  2. package/dist/dist.min.js.map +3 -3
  3. package/dist/es5/index.js +6 -6
  4. package/dist/es5/index.js.map +1 -1
  5. package/dist/es5/lib/arrow/convert-row-group-to-columns.js.map +1 -1
  6. package/dist/es5/lib/arrow/convert-schema-from-parquet.js +58 -42
  7. package/dist/es5/lib/arrow/convert-schema-from-parquet.js.map +1 -1
  8. package/dist/es5/lib/arrow/convert-schema-to-parquet.js +33 -31
  9. package/dist/es5/lib/arrow/convert-schema-to-parquet.js.map +1 -1
  10. package/dist/es5/lib/geo/decode-geo-metadata.js +12 -8
  11. package/dist/es5/lib/geo/decode-geo-metadata.js.map +1 -1
  12. package/dist/es5/lib/parsers/parse-parquet-to-columns.js +11 -7
  13. package/dist/es5/lib/parsers/parse-parquet-to-columns.js.map +1 -1
  14. package/dist/es5/lib/parsers/parse-parquet-to-rows.js +51 -29
  15. package/dist/es5/lib/parsers/parse-parquet-to-rows.js.map +1 -1
  16. package/dist/es5/lib/wasm/parse-parquet-wasm.js +6 -6
  17. package/dist/es5/lib/wasm/parse-parquet-wasm.js.map +1 -1
  18. package/dist/es5/parquet-loader.js +16 -4
  19. package/dist/es5/parquet-loader.js.map +1 -1
  20. package/dist/es5/parquet-wasm-loader.js +1 -1
  21. package/dist/es5/parquet-wasm-loader.js.map +1 -1
  22. package/dist/es5/parquet-wasm-writer.js +1 -1
  23. package/dist/es5/parquet-wasm-writer.js.map +1 -1
  24. package/dist/es5/parquet-writer.js +1 -1
  25. package/dist/es5/parquet-writer.js.map +1 -1
  26. package/dist/es5/parquetjs/encoder/parquet-encoder.js.map +1 -1
  27. package/dist/es5/parquetjs/parser/decoders.js.map +1 -1
  28. package/dist/es5/parquetjs/parser/parquet-reader.js +1 -1
  29. package/dist/es5/parquetjs/parser/parquet-reader.js.map +1 -1
  30. package/dist/es5/parquetjs/schema/declare.js +4 -4
  31. package/dist/es5/parquetjs/schema/declare.js.map +1 -1
  32. package/dist/es5/parquetjs/schema/schema.js +7 -7
  33. package/dist/es5/parquetjs/schema/schema.js.map +1 -1
  34. package/dist/es5/parquetjs/schema/shred.js +117 -22
  35. package/dist/es5/parquetjs/schema/shred.js.map +1 -1
  36. package/dist/esm/index.js +5 -5
  37. package/dist/esm/index.js.map +1 -1
  38. package/dist/esm/lib/arrow/convert-row-group-to-columns.js.map +1 -1
  39. package/dist/esm/lib/arrow/convert-schema-from-parquet.js +57 -41
  40. package/dist/esm/lib/arrow/convert-schema-from-parquet.js.map +1 -1
  41. package/dist/esm/lib/arrow/convert-schema-to-parquet.js +33 -31
  42. package/dist/esm/lib/arrow/convert-schema-to-parquet.js.map +1 -1
  43. package/dist/esm/lib/geo/decode-geo-metadata.js +12 -8
  44. package/dist/esm/lib/geo/decode-geo-metadata.js.map +1 -1
  45. package/dist/esm/lib/parsers/parse-parquet-to-columns.js +12 -8
  46. package/dist/esm/lib/parsers/parse-parquet-to-columns.js.map +1 -1
  47. package/dist/esm/lib/parsers/parse-parquet-to-rows.js +14 -3
  48. package/dist/esm/lib/parsers/parse-parquet-to-rows.js.map +1 -1
  49. package/dist/esm/lib/wasm/parse-parquet-wasm.js +3 -3
  50. package/dist/esm/lib/wasm/parse-parquet-wasm.js.map +1 -1
  51. package/dist/esm/parquet-loader.js +14 -2
  52. package/dist/esm/parquet-loader.js.map +1 -1
  53. package/dist/esm/parquet-wasm-loader.js +1 -1
  54. package/dist/esm/parquet-wasm-loader.js.map +1 -1
  55. package/dist/esm/parquet-wasm-writer.js +1 -1
  56. package/dist/esm/parquet-wasm-writer.js.map +1 -1
  57. package/dist/esm/parquet-writer.js +1 -1
  58. package/dist/esm/parquet-writer.js.map +1 -1
  59. package/dist/esm/parquetjs/encoder/parquet-encoder.js.map +1 -1
  60. package/dist/esm/parquetjs/parser/decoders.js.map +1 -1
  61. package/dist/esm/parquetjs/parser/parquet-reader.js +2 -2
  62. package/dist/esm/parquetjs/parser/parquet-reader.js.map +1 -1
  63. package/dist/esm/parquetjs/schema/declare.js +1 -1
  64. package/dist/esm/parquetjs/schema/declare.js.map +1 -1
  65. package/dist/esm/parquetjs/schema/schema.js +6 -6
  66. package/dist/esm/parquetjs/schema/schema.js.map +1 -1
  67. package/dist/esm/parquetjs/schema/shred.js +108 -21
  68. package/dist/esm/parquetjs/schema/shred.js.map +1 -1
  69. package/dist/index.d.ts +8 -49
  70. package/dist/index.d.ts.map +1 -1
  71. package/dist/index.js +8 -6
  72. package/dist/lib/arrow/convert-row-group-to-columns.d.ts +2 -2
  73. package/dist/lib/arrow/convert-row-group-to-columns.d.ts.map +1 -1
  74. package/dist/lib/arrow/convert-schema-from-parquet.d.ts +4 -4
  75. package/dist/lib/arrow/convert-schema-from-parquet.d.ts.map +1 -1
  76. package/dist/lib/arrow/convert-schema-from-parquet.js +48 -44
  77. package/dist/lib/arrow/convert-schema-to-parquet.d.ts +1 -1
  78. package/dist/lib/arrow/convert-schema-to-parquet.d.ts.map +1 -1
  79. package/dist/lib/arrow/convert-schema-to-parquet.js +30 -31
  80. package/dist/lib/geo/decode-geo-metadata.js +12 -8
  81. package/dist/lib/parsers/parse-parquet-to-columns.d.ts +2 -2
  82. package/dist/lib/parsers/parse-parquet-to-columns.d.ts.map +1 -1
  83. package/dist/lib/parsers/parse-parquet-to-columns.js +13 -7
  84. package/dist/lib/parsers/parse-parquet-to-rows.d.ts +3 -2
  85. package/dist/lib/parsers/parse-parquet-to-rows.d.ts.map +1 -1
  86. package/dist/lib/parsers/parse-parquet-to-rows.js +16 -19
  87. package/dist/lib/wasm/parse-parquet-wasm.d.ts +3 -3
  88. package/dist/lib/wasm/parse-parquet-wasm.d.ts.map +1 -1
  89. package/dist/lib/wasm/parse-parquet-wasm.js +3 -3
  90. package/dist/parquet-loader.d.ts +3 -14
  91. package/dist/parquet-loader.d.ts.map +1 -1
  92. package/dist/parquet-loader.js +14 -2
  93. package/dist/parquet-worker.js +31 -38
  94. package/dist/parquet-worker.js.map +3 -3
  95. package/dist/parquet-writer.d.ts +2 -1
  96. package/dist/parquet-writer.d.ts.map +1 -1
  97. package/dist/parquet-writer.js +1 -0
  98. package/dist/parquetjs/encoder/parquet-encoder.d.ts +4 -4
  99. package/dist/parquetjs/encoder/parquet-encoder.d.ts.map +1 -1
  100. package/dist/parquetjs/parser/decoders.d.ts +2 -2
  101. package/dist/parquetjs/parser/decoders.d.ts.map +1 -1
  102. package/dist/parquetjs/parser/parquet-reader.d.ts +6 -6
  103. package/dist/parquetjs/parser/parquet-reader.d.ts.map +1 -1
  104. package/dist/parquetjs/parser/parquet-reader.js +1 -1
  105. package/dist/parquetjs/schema/declare.d.ts +6 -5
  106. package/dist/parquetjs/schema/declare.d.ts.map +1 -1
  107. package/dist/parquetjs/schema/declare.js +3 -3
  108. package/dist/parquetjs/schema/schema.d.ts +4 -4
  109. package/dist/parquetjs/schema/schema.d.ts.map +1 -1
  110. package/dist/parquetjs/schema/schema.js +5 -5
  111. package/dist/parquetjs/schema/shred.d.ts +17 -111
  112. package/dist/parquetjs/schema/shred.d.ts.map +1 -1
  113. package/dist/parquetjs/schema/shred.js +127 -119
  114. package/package.json +8 -8
  115. package/src/index.ts +32 -9
  116. package/src/lib/arrow/convert-row-group-to-columns.ts +2 -2
  117. package/src/lib/arrow/convert-schema-from-parquet.ts +56 -66
  118. package/src/lib/arrow/convert-schema-to-parquet.ts +32 -44
  119. package/src/lib/geo/decode-geo-metadata.ts +17 -8
  120. package/src/lib/parsers/parse-parquet-to-columns.ts +22 -11
  121. package/src/lib/parsers/parse-parquet-to-rows.ts +28 -23
  122. package/src/lib/wasm/parse-parquet-wasm.ts +7 -7
  123. package/src/parquet-loader.ts +25 -2
  124. package/src/parquet-writer.ts +4 -1
  125. package/src/parquetjs/encoder/parquet-encoder.ts +11 -10
  126. package/src/parquetjs/parser/decoders.ts +3 -3
  127. package/src/parquetjs/parser/parquet-reader.ts +7 -7
  128. package/src/parquetjs/schema/declare.ts +6 -5
  129. package/src/parquetjs/schema/schema.ts +8 -8
  130. package/src/parquetjs/schema/shred.ts +142 -103
@@ -24,9 +24,9 @@ var __importStar = (this && this.__importStar) || function (mod) {
     return result;
 };
 Object.defineProperty(exports, "__esModule", { value: true });
-exports.materializeRecords = exports.shredRecord = exports.shredBuffer = exports.ParquetBuffer = void 0;
+exports.materializeColumns = exports.materializeRows = exports.shredRecord = exports.shredBuffer = exports.ParquetRowGroup = void 0;
 const declare_1 = require("./declare");
-Object.defineProperty(exports, "ParquetBuffer", { enumerable: true, get: function () { return declare_1.ParquetBuffer; } });
+Object.defineProperty(exports, "ParquetRowGroup", { enumerable: true, get: function () { return declare_1.ParquetRowGroup; } });
 const Types = __importStar(require("./types"));
 function shredBuffer(schema) {
   const columnData = {};
@@ -46,14 +46,14 @@ exports.shredBuffer = shredBuffer;
  * 'Shred' a record into a list of <value, repetition_level, definition_level>
  * tuples per column using the Google Dremel Algorithm..
  *
- * The buffer argument must point to an object into which the shredded record
- * will be returned. You may re-use the buffer for repeated calls to this function
- * to append to an existing buffer, as long as the schema is unchanged.
+ * The rowGroup argument must point to an object into which the shredded record
+ * will be returned. You may re-use the rowGroup for repeated calls to this function
+ * to append to an existing rowGroup, as long as the schema is unchanged.
  *
- * The format in which the shredded records will be stored in the buffer is as
+ * The format in which the shredded records will be stored in the rowGroup is as
  * follows:
  *
- *   buffer = {
+ *   rowGroup = {
  *     columnData: [
  *       'my_col': {
  *          dlevels: [d1, d2, .. dN],
@@ -64,22 +64,22 @@ exports.shredBuffer = shredBuffer;
  *     rowCount: X,
  *   }
  */
-function shredRecord(schema, record, buffer) {
+function shredRecord(schema, record, rowGroup) {
   /* shred the record, this may raise an exception */
   const data = shredBuffer(schema).columnData;
   shredRecordFields(schema.fields, record, data, 0, 0);
-  /* if no error during shredding, add the shredded record to the buffer */
-  if (buffer.rowCount === 0) {
-    buffer.rowCount = 1;
-    buffer.columnData = data;
+  /* if no error during shredding, add the shredded record to the rowGroup */
+  if (rowGroup.rowCount === 0) {
+    rowGroup.rowCount = 1;
+    rowGroup.columnData = data;
     return;
   }
-  buffer.rowCount += 1;
+  rowGroup.rowCount += 1;
   for (const field of schema.fieldList) {
-    Array.prototype.push.apply(buffer.columnData[field.key].rlevels, data[field.key].rlevels);
-    Array.prototype.push.apply(buffer.columnData[field.key].dlevels, data[field.key].dlevels);
-    Array.prototype.push.apply(buffer.columnData[field.key].values, data[field.key].values);
-    buffer.columnData[field.key].count += data[field.key].count;
+    Array.prototype.push.apply(rowGroup.columnData[field.key].rlevels, data[field.key].rlevels);
+    Array.prototype.push.apply(rowGroup.columnData[field.key].dlevels, data[field.key].dlevels);
+    Array.prototype.push.apply(rowGroup.columnData[field.key].values, data[field.key].values);
+    rowGroup.columnData[field.key].count += data[field.key].count;
   }
 }
 exports.shredRecord = shredRecord;
@@ -139,10 +139,10 @@ function shredRecordFields(fields, record, data, rLevel, dLevel) {
  * tuples back to nested records (objects/arrays) using the Google Dremel
  * Algorithm..
  *
- * The buffer argument must point to an object with the following structure (i.e.
+ * The rowGroup argument must point to an object with the following structure (i.e.
  * the same structure that is returned by shredRecords):
  *
- *   buffer = {
+ *   rowGroup = {
  *     columnData: [
  *       'my_col': {
  *          dlevels: [d1, d2, .. dN],
@@ -153,22 +153,24 @@ function shredRecordFields(fields, record, data, rLevel, dLevel) {
  *     rowCount: X,
  *   }
  */
-function materializeRecords(schema, buffer) {
-  const records = [];
-  for (let i = 0; i < buffer.rowCount; i++) {
-    records.push({});
+function materializeRows(schema, rowGroup) {
+  const rows = [];
+  // rows = new Array(rowGroup.rowCount).fill({})'
+  for (let i = 0; i < rowGroup.rowCount; i++) {
+    rows.push({});
   }
-  for (const key in buffer.columnData) {
-    const columnData = buffer.columnData[key];
+  for (const key in rowGroup.columnData) {
+    const columnData = rowGroup.columnData[key];
     if (columnData.count) {
-      materializeColumn(schema, columnData, key, records);
+      materializeColumnAsRows(schema, columnData, key, rows);
     }
   }
-  return records;
+  return rows;
 }
-exports.materializeRecords = materializeRecords;
+exports.materializeRows = materializeRows;
+/** Populate record fields for one column */
 // eslint-disable-next-line max-statements, complexity
-function materializeColumn(schema, columnData, key, records) {
+function materializeColumnAsRows(schema, columnData, key, rows) {
   const field = schema.findField(key);
   const branch = schema.findFieldBranch(key);
   // tslint:disable-next-line:prefer-array-literal
@@ -180,7 +182,7 @@ function materializeColumn(schema, columnData, key, records) {
     rLevels[rLevel]++;
     rLevels.fill(0, rLevel + 1);
     let rIndex = 0;
-    let record = records[rLevels[rIndex++] - 1];
+    let record = rows[rLevels[rIndex++] - 1];
     // Internal nodes - Build a nested row object
     for (const step of branch) {
       if (step === field || dLevel < step.dLevelMax) {
@@ -235,10 +237,10 @@ function materializeColumn(schema, columnData, key, records) {
  * tuples back to nested records (objects/arrays) using the Google Dremel
  * Algorithm..
  *
- * The buffer argument must point to an object with the following structure (i.e.
+ * The rowGroup argument must point to an object with the following structure (i.e.
  * the same structure that is returned by shredRecords):
  *
- *   buffer = {
+ *   rowGroup = {
  *     columnData: [
  *       'my_col': {
  *          dlevels: [d1, d2, .. dN],
@@ -248,100 +250,106 @@ function materializeColumn(schema, columnData, key, records) {
  *     ],
  *     rowCount: X,
  *   }
- *
- export function extractColumns(schema: ParquetSchema, buffer: ParquetBuffer): Record<string, unknown> {
-   const columns: ParquetRecord = {};
-   for (const key in buffer.columnData) {
-     const columnData = buffer.columnData[key];
-     if (columnData.count) {
-       extractColumn(schema, columnData, key, columns);
-     }
-   }
-   return columns;
- }
-
- // eslint-disable-next-line max-statements, complexity
- function extractColumn(
-   schema: ParquetSchema,
-   columnData: ParquetData,
-   key: string,
-   columns: Record<string, unknown>
- ) {
-   if (columnData.count <= 0) {
-     return;
-   }
-
-   const record = columns;
-
-   const field = schema.findField(key);
-   const branch = schema.findFieldBranch(key);
-
-   // tslint:disable-next-line:prefer-array-literal
-   const rLevels: number[] = new Array(field.rLevelMax + 1).fill(0);
-   let vIndex = 0;
-
-   let i = 0;
-   const dLevel = columnData.dlevels[i];
-   const rLevel = columnData.rlevels[i];
-   rLevels[rLevel]++;
-   rLevels.fill(0, rLevel + 1);
-
-   let rIndex = 0;
-   let record = records[rLevels[rIndex++] - 1];
-
-   // Internal nodes
-   for (const step of branch) {
-     if (step === field || dLevel < step.dLevelMax) {
-       break;
-     }
-
-     switch (step.repetitionType) {
-       case 'REPEATED':
-         if (!(step.name in record)) {
-           // eslint-disable max-depth
-           record[step.name] = [];
-         }
-         const ix = rLevels[rIndex++];
-         while (record[step.name].length <= ix) {
-           // eslint-disable max-depth
-           record[step.name].push({});
-         }
-         record = record[step.name][ix];
-         break;
-
-       default:
-         record[step.name] = record[step.name] || {};
-         record = record[step.name];
-     }
-   }
-
-   // Leaf node
-   if (dLevel === field.dLevelMax) {
-     const value = Types.fromPrimitive(
-       // @ts-ignore
-       field.originalType || field.primitiveType,
-       columnData.values[vIndex],
-       field
-     );
-     vIndex++;
-
-     switch (field.repetitionType) {
-       case 'REPEATED':
-         if (!(field.name in record)) {
-           // eslint-disable max-depth
-           record[field.name] = [];
-         }
-         const ix = rLevels[rIndex];
-         while (record[field.name].length <= ix) {
-           // eslint-disable max-depth
-           record[field.name].push(null);
-         }
-         record[field.name][ix] = value;
-         break;
-
-       default:
-         record[field.name] = value;
-     }
-   }
- }
- */
+ */
+function materializeColumns(schema, rowGroup) {
+  const columns = {};
+  for (const key in rowGroup.columnData) {
+    const columnData = rowGroup.columnData[key];
+    if (columnData.count) {
+      materializeColumnAsColumnarArray(schema, columnData, rowGroup.rowCount, key, columns);
+    }
+  }
+  return columns;
+}
+exports.materializeColumns = materializeColumns;
+// eslint-disable-next-line max-statements, complexity
+function materializeColumnAsColumnarArray(schema, columnData, rowCount, key, columns) {
+  if (columnData.count <= 0) {
+    return;
+  }
+  const field = schema.findField(key);
+  const branch = schema.findFieldBranch(key);
+  const columnName = branch[0].name;
+  let column;
+  const { values } = columnData;
+  if (values.length === rowCount && branch[0].primitiveType) {
+    // if (branch[0].repetitionType === `REQUIRED`) {
+    //   switch (branch[0].primitiveType) {
+    //     case 'INT32': return values instanceof Int32Array ? values : new Int32Array(values);
+    //   }
+    // }
+    column = values;
+  }
+  if (column) {
+    columns[columnName] = column;
+    return;
+  }
+  column = new Array(rowCount);
+  for (let i = 0; i < rowCount; i++) {
+    column[i] = {};
+  }
+  columns[columnName] = column;
+  // tslint:disable-next-line:prefer-array-literal
+  const rLevels = new Array(field.rLevelMax + 1).fill(0);
+  let vIndex = 0;
+  for (let i = 0; i < columnData.count; i++) {
+    const dLevel = columnData.dlevels[i];
+    const rLevel = columnData.rlevels[i];
+    rLevels[rLevel]++;
+    rLevels.fill(0, rLevel + 1);
+    let rIndex = 0;
+    let record = column[rLevels[rIndex++] - 1];
+    // Internal nodes - Build a nested row object
+    for (const step of branch) {
+      if (step === field || dLevel < step.dLevelMax) {
+        break;
+      }
+      switch (step.repetitionType) {
+        case 'REPEATED':
+          if (!(step.name in record)) {
+            // eslint-disable max-depth
+            record[step.name] = [];
+          }
+          const ix = rLevels[rIndex++];
+          while (record[step.name].length <= ix) {
+            // eslint-disable max-depth
+            record[step.name].push({});
+          }
+          record = record[step.name][ix];
+          break;
+        default:
+          record[step.name] = record[step.name] || {};
+          record = record[step.name];
+      }
+    }
+    // Leaf node - Add the value
+    if (dLevel === field.dLevelMax) {
+      const value = Types.fromPrimitive(
+      // @ts-ignore
+      field.originalType || field.primitiveType, columnData.values[vIndex], field);
+      vIndex++;
+      switch (field.repetitionType) {
+        case 'REPEATED':
+          if (!(field.name in record)) {
+            // eslint-disable max-depth
+            record[field.name] = [];
+          }
+          const ix = rLevels[rIndex];
+          while (record[field.name].length <= ix) {
+            // eslint-disable max-depth
+            record[field.name].push(null);
+          }
+          record[field.name][ix] = value;
+          break;
+        default:
+          record[field.name] = value;
+      }
+    }
+  }
+  // Remove one level of nesting
+  for (let i = 0; i < rowCount; ++i) {
+    if (columnName in column[i]) {
+      column[i] = column[i][columnName];
+    }
+  }
+}
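Taken together, the shred.js changes rename the Dremel shredding "buffer" to a row group (ParquetRowGroup) and split materialization into a row-oriented path (materializeRows) and a new column-oriented path (materializeColumns). A minimal TypeScript sketch of how the renamed functions fit together, assuming the parquetjs-style schema definition and the shredBuffer() initialization carry over unchanged from 3.4.6; the deep import path and the sample columns are illustrative, not taken from this diff:

import {ParquetSchema} from '@loaders.gl/parquet';
// Illustrative deep import; the shred module's public path is not shown in this diff
import {
  shredBuffer,
  shredRecord,
  materializeRows,
  materializeColumns
} from '@loaders.gl/parquet/parquetjs/schema/shred';

// parquetjs-style schema definition (assumed unchanged from 3.x)
const schema = new ParquetSchema({
  id: {type: 'INT32'},
  score: {type: 'DOUBLE'}
});

// The object formerly typed as ParquetBuffer, now ParquetRowGroup
const rowGroup = shredBuffer(schema);
shredRecord(schema, {id: 1, score: 0.5}, rowGroup);
shredRecord(schema, {id: 2, score: 0.75}, rowGroup);

// Row-oriented output: one plain object per shredded record
const rows = materializeRows(schema, rowGroup);

// Column-oriented output: a map from column name to an array of values
const columns = materializeColumns(schema, rowGroup);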
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@loaders.gl/parquet",
-  "version": "3.4.6",
+  "version": "4.0.0-alpha.10",
   "description": "Framework-independent loader for Apache Parquet files",
   "license": "MIT",
   "publishConfig": {
@@ -41,10 +41,10 @@
     "./src/lib/wasm/load-wasm/load-wasm-node.ts": "./src/lib/wasm/load-wasm/load-wasm-browser.ts"
   },
   "dependencies": {
-    "@loaders.gl/bson": "3.4.6",
-    "@loaders.gl/compression": "3.4.6",
-    "@loaders.gl/loader-utils": "3.4.6",
-    "@loaders.gl/schema": "3.4.6",
+    "@loaders.gl/bson": "4.0.0-alpha.10",
+    "@loaders.gl/compression": "4.0.0-alpha.10",
+    "@loaders.gl/loader-utils": "4.0.0-alpha.10",
+    "@loaders.gl/schema": "4.0.0-alpha.10",
     "async-mutex": "^0.2.2",
     "brotli": "^1.3.2",
     "int53": "^0.2.4",
@@ -58,14 +58,14 @@
     "zstd-codec": "^0.1"
   },
   "peerDependencies": {
-    "apache-arrow": "^4.0.0"
+    "apache-arrow": "^9.0.0"
   },
   "devDependencies": {
     "@types/node": "^10.14.15",
     "@types/node-int64": "^0.4.29",
     "@types/thrift": "^0.10.8",
     "@types/varint": "^5.0.0",
-    "apache-arrow": "^4.0.0"
+    "apache-arrow": "^9.0.0"
   },
-  "gitHead": "f878cbf97013ab99866390ef58e6ca26717af6cb"
+  "gitHead": "7efdbe09e02098aad6d985e4d6465d08806e19a9"
 }
package/src/index.ts CHANGED
@@ -1,33 +1,56 @@
+// loaders.gl, MIT license
+
 import type {LoaderWithParser} from '@loaders.gl/loader-utils';
+import type {
+  ObjectRowTable,
+  ObjectRowTableBatch,
+  ColumnarTable,
+  ColumnarTableBatch
+} from '@loaders.gl/schema';
+import type {Table as ArrowTable} from 'apache-arrow';
 
 // ParquetLoader
 
-import {ParquetWasmLoader as ParquetWasmWorkerLoader} from './parquet-wasm-loader';
-import {ParquetLoader as ParquetWorkerLoader} from './parquet-loader';
+import {
+  ParquetLoader as ParquetWorkerLoader,
+  ParquetLoader as ParquetColumnarWorkerLoader,
+  ParquetLoaderOptions
+} from './parquet-loader';
 import {parseParquet, parseParquetFileInBatches} from './lib/parsers/parse-parquet-to-rows';
 import {
   parseParquetInColumns,
   parseParquetFileInColumnarBatches
 } from './lib/parsers/parse-parquet-to-columns';
-import {parseParquet as parseParquetWasm} from './lib/wasm/parse-parquet-wasm';
+
+import {parseParquetWasm, ParquetWasmLoaderOptions} from './lib/wasm/parse-parquet-wasm';
+import {ParquetWasmLoader as ParquetWasmWorkerLoader} from './parquet-wasm-loader';
 
 export {ParquetWorkerLoader, ParquetWasmWorkerLoader};
 
 /** ParquetJS table loader */
-export const ParquetLoader = {
+export const ParquetLoader: LoaderWithParser<
+  ObjectRowTable,
+  ObjectRowTableBatch,
+  ParquetLoaderOptions
+> = {
   ...ParquetWorkerLoader,
   parse: parseParquet,
   parseFileInBatches: parseParquetFileInBatches
 };
 
 /** ParquetJS table loader */
-export const ParquetColumnarLoader = {
-  ...ParquetWorkerLoader,
+// @ts-expect-error
+export const ParquetColumnarLoader: LoaderWithParser<
+  ColumnarTable,
+  ColumnarTableBatch,
+  ParquetLoaderOptions
+> = {
+  ...ParquetColumnarWorkerLoader,
   parse: parseParquetInColumns,
   parseFileInBatches: parseParquetFileInColumnarBatches
 };
 
-export const ParquetWasmLoader = {
+export const ParquetWasmLoader: LoaderWithParser<ArrowTable, never, ParquetWasmLoaderOptions> = {
   ...ParquetWasmWorkerLoader,
   parse: parseParquetWasm
 };
@@ -46,8 +69,8 @@ export {ParquetReader} from './parquetjs/parser/parquet-reader';
 export {ParquetEncoder} from './parquetjs/encoder/parquet-encoder';
 
 export {
-  convertSchemaFromParquet,
-  convertSchemaFromParquet as convertParquetToArrowSchema
+  convertParquetSchema,
+  convertParquetSchema as convertParquetToArrowSchema
 } from './lib/arrow/convert-schema-from-parquet';
 
 // TESTS
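The practical effect of the explicit LoaderWithParser typings above is that each exported loader now advertises the table type it parses to, so results are no longer untyped. A hedged usage sketch with the standard @loaders.gl/core helpers; parse and fetchFile are existing core APIs, while the URL and the result shapes described in the comments are illustrative:

import {parse, fetchFile} from '@loaders.gl/core';
import {ParquetLoader, ParquetColumnarLoader} from '@loaders.gl/parquet';

async function loadTables(url: string) {
  // ObjectRowTable: the parsed data arrives as an array of row objects
  const rowTable = await parse(fetchFile(url), ParquetLoader);
  // ColumnarTable: the parsed data arrives as one array per column
  const columnarTable = await parse(fetchFile(url), ParquetColumnarLoader);
  return {rowTable, columnarTable};
}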
@@ -1,11 +1,11 @@
 // loaders.gl, MIT license
 
 import {Schema} from '@loaders.gl/schema';
-import {ParquetBuffer} from '@loaders.gl/parquet/parquetjs/schema/declare';
+import {ParquetRowGroup} from '@loaders.gl/parquet/parquetjs/schema/declare';
 
 export function convertParquetRowGroupToColumns(
   schema: Schema,
-  rowGroup: ParquetBuffer
+  rowGroup: ParquetRowGroup
 ): Record<string, any[]> {
   const columns: Record<string, any[]> = {};
   for (const [columnName, data] of Object.entries(rowGroup.columnData)) {
@@ -1,68 +1,57 @@
 // loaders.gl, MIT license
 
+import {Schema, Field, DataType} from '@loaders.gl/schema';
+
 import type {ParquetSchema} from '../../parquetjs/schema/schema';
 import type {FieldDefinition, ParquetField, ParquetType} from '../../parquetjs/schema/declare';
-import {FileMetaData} from '@loaders.gl/parquet/parquetjs/parquet-thrift';
-
-import {
-  Schema,
-  Struct,
-  Field,
-  DataType,
-  Bool,
-  Float64,
-  Int32,
-  Float32,
-  Binary,
-  Utf8,
-  Int64,
-  Uint16,
-  Uint32,
-  Uint64,
-  Int8,
-  Int16
-} from '@loaders.gl/schema';
-
-export const PARQUET_TYPE_MAPPING: {[type in ParquetType]: typeof DataType} = {
-  BOOLEAN: Bool,
-  INT32: Int32,
-  INT64: Float64,
-  INT96: Float64,
-  FLOAT: Float32,
-  DOUBLE: Float64,
-  BYTE_ARRAY: Binary,
-  FIXED_LEN_BYTE_ARRAY: Binary,
-  UTF8: Utf8,
-  DATE: Int32,
-  TIME_MILLIS: Int64,
-  TIME_MICROS: Int64,
-  TIMESTAMP_MILLIS: Int64,
-  TIMESTAMP_MICROS: Int64,
-  UINT_8: Int32,
-  UINT_16: Uint16,
-  UINT_32: Uint32,
-  UINT_64: Uint64,
-  INT_8: Int8,
-  INT_16: Int16,
-  INT_32: Int32,
-  INT_64: Int64,
-  JSON: Binary,
-  BSON: Binary,
-  // TODO check interval type
-  INTERVAL: Binary,
-  DECIMAL_INT32: Float32,
-  DECIMAL_INT64: Float64,
-  DECIMAL_BYTE_ARRAY: Float64,
-  DECIMAL_FIXED_LEN_BYTE_ARRAY: Float64
+import {FileMetaData} from '../../parquetjs/parquet-thrift';
+
+export const PARQUET_TYPE_MAPPING: {[type in ParquetType]: DataType} = {
+  BOOLEAN: 'bool',
+  INT32: 'int32',
+  INT64: 'float64',
+  INT96: 'float64',
+  FLOAT: 'float32',
+  DOUBLE: 'float64',
+  BYTE_ARRAY: 'binary',
+  FIXED_LEN_BYTE_ARRAY: 'binary',
+  UTF8: 'utf8',
+  DATE: 'int32',
+  TIME_MILLIS: 'int64',
+  TIME_MICROS: 'int64',
+  TIMESTAMP_MILLIS: 'int64',
+  TIMESTAMP_MICROS: 'int64',
+  UINT_8: 'int32',
+  UINT_16: 'uint16',
+  UINT_32: 'uint32',
+  UINT_64: 'uint64',
+  INT_8: 'int8',
+  INT_16: 'int16',
+  INT_32: 'int32',
+  INT_64: 'int64',
+  JSON: 'binary',
+  BSON: 'binary',
+  // TODO check interal type
+  INTERVAL: 'binary',
+  DECIMAL_INT32: 'float32',
+  DECIMAL_INT64: 'float64',
+  DECIMAL_BYTE_ARRAY: 'float64',
+  DECIMAL_FIXED_LEN_BYTE_ARRAY: 'float64'
 };
 
-export function convertSchemaFromParquet(
+export function convertParquetSchema(
   parquetSchema: ParquetSchema,
-  parquetMetadata?: FileMetaData
+  parquetMetadata: FileMetaData | null
 ): Schema {
   const fields = getFields(parquetSchema.schema);
   const metadata = parquetMetadata && getSchemaMetadata(parquetMetadata);
-  return new Schema(fields, metadata);
+
+  const schema: Schema = {
+    fields,
+    metadata: metadata || {}
+  };
+
+  return schema;
 }
 
 function getFields(schema: FieldDefinition): Field[] {
@@ -72,13 +61,12 @@ function getFields(schema: FieldDefinition): Field[] {
     const field = schema[name];
 
     if (field.fields) {
-      const childFields = getFields(field.fields);
-      const nestedField = new Field(name, new Struct(childFields), field.optional);
-      fields.push(nestedField);
+      const children = getFields(field.fields);
+      fields.push({name, type: {type: 'struct', children}, nullable: field.optional});
     } else {
-      const FieldType = PARQUET_TYPE_MAPPING[field.type];
+      const type = PARQUET_TYPE_MAPPING[field.type];
       const metadata = getFieldMetadata(field);
-      const arrowField = new Field(name, new FieldType(), field.optional, metadata);
+      const arrowField = {name, type, nullable: field.optional, metadata};
       fields.push(arrowField);
     }
   }
@@ -86,27 +74,29 @@ function getFields(schema: FieldDefinition): Field[] {
   return fields;
 }
 
-function getFieldMetadata(field: ParquetField): Map<string, string> {
-  const metadata = new Map();
+function getFieldMetadata(field: ParquetField): Record<string, string> | undefined {
+  let metadata: Record<string, string> | undefined;
 
   for (const key in field) {
     if (key !== 'name') {
       let value = field[key] || '';
       value = typeof field[key] !== 'string' ? JSON.stringify(field[key]) : field[key];
-      metadata.set(key, value);
+      metadata = metadata || {};
+      metadata[key] = value;
     }
   }
 
   return metadata;
 }
 
-function getSchemaMetadata(parquetMetadata: FileMetaData): Map<string, string> {
-  const metadata = new Map();
+function getSchemaMetadata(parquetMetadata: FileMetaData): Record<string, string> | undefined {
+  let metadata: Record<string, string> | undefined;
 
   const keyValueList = parquetMetadata.key_value_metadata || [];
   for (const {key, value} of keyValueList) {
     if (typeof value === 'string') {
-      metadata.set(key, value);
+      metadata = metadata || {};
+      metadata[key] = value;
     }
   }
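Net effect of this file's changes: convertParquetSchema (formerly convertSchemaFromParquet) now returns a plain, serializable schema object whose field types are the string names from PARQUET_TYPE_MAPPING, instead of instantiating apache-arrow Schema/Field/DataType classes. A sketch of the returned shape for a hypothetical file with a required INT32 id column and an optional UTF8 name column; the concrete field list and metadata of course depend on the input file:

import type {Schema} from '@loaders.gl/schema';

const converted: Schema = {
  fields: [
    {name: 'id', type: 'int32', nullable: false},
    {name: 'name', type: 'utf8', nullable: true}
  ],
  // parquet key_value_metadata entries are copied here when present
  metadata: {}
};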