@loaders.gl/parquet 4.2.1 → 4.3.0-alpha.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. package/dist/index.cjs +112 -24
  2. package/dist/index.cjs.map +4 -4
  3. package/dist/lib/constants.d.ts +1 -1
  4. package/dist/lib/constants.d.ts.map +1 -1
  5. package/dist/lib/constants.js +2 -2
  6. package/dist/lib/encoders/encode-parquet-wasm.d.ts.map +1 -0
  7. package/dist/lib/{wasm → encoders}/encode-parquet-wasm.js +1 -1
  8. package/dist/lib/parsers/parse-parquet-wasm.d.ts +10 -0
  9. package/dist/lib/parsers/parse-parquet-wasm.d.ts.map +1 -0
  10. package/dist/lib/parsers/parse-parquet-wasm.js +51 -0
  11. package/dist/lib/parsers/parse-parquet.d.ts.map +1 -1
  12. package/dist/lib/parsers/parse-parquet.js +10 -0
  13. package/dist/lib/utils/load-wasm.d.ts +3 -0
  14. package/dist/lib/utils/load-wasm.d.ts.map +1 -0
  15. package/dist/lib/utils/make-stream-iterator.d.ts +11 -0
  16. package/dist/lib/utils/make-stream-iterator.d.ts.map +1 -0
  17. package/dist/lib/utils/make-stream-iterator.js +67 -0
  18. package/dist/parquet-loader.js +1 -1
  19. package/dist/parquet-wasm-loader.d.ts +17 -5
  20. package/dist/parquet-wasm-loader.d.ts.map +1 -1
  21. package/dist/parquet-wasm-loader.js +19 -4
  22. package/dist/parquet-wasm-writer.js +1 -1
  23. package/dist/parquet-writer.js +1 -1
  24. package/dist/parquet_wasm_bg.wasm +0 -0
  25. package/dist/parquetjs/codecs/rle.d.ts.map +1 -1
  26. package/dist/parquetjs/codecs/rle.js +1 -0
  27. package/package.json +13 -12
  28. package/src/lib/constants.ts +2 -1
  29. package/src/lib/{wasm → encoders}/encode-parquet-wasm.ts +1 -1
  30. package/src/lib/parsers/parse-parquet-wasm.ts +72 -0
  31. package/src/lib/parsers/parse-parquet.ts +10 -0
  32. package/src/lib/utils/make-stream-iterator.ts +87 -0
  33. package/src/parquet-wasm-loader.ts +36 -9
  34. package/src/parquet-wasm-writer.ts +1 -1
  35. package/src/parquetjs/codecs/rle.ts +3 -1
  36. package/dist/arrow1_bg.wasm +0 -0
  37. package/dist/lib/wasm/encode-parquet-wasm.d.ts.map +0 -1
  38. package/dist/lib/wasm/load-wasm.d.ts +0 -3
  39. package/dist/lib/wasm/load-wasm.d.ts.map +0 -1
  40. package/dist/lib/wasm/parse-parquet-wasm.d.ts +0 -4
  41. package/dist/lib/wasm/parse-parquet-wasm.d.ts.map +0 -1
  42. package/dist/lib/wasm/parse-parquet-wasm.js +0 -24
  43. package/src/lib/wasm/parse-parquet-wasm.ts +0 -33
  44. package/src/lib/wip/convert-schema-deep.java.disabled +0 -910
  45. package/src/lib/wip/convert-schema-deep.rs.disabled +0 -976
  46. /package/dist/lib/{wasm → encoders}/encode-parquet-wasm.d.ts +0 -0
  47. /package/dist/lib/{wasm → utils}/load-wasm.js +0 -0
  48. /package/src/lib/{wasm → utils}/load-wasm.ts +0 -0
package/dist/index.cjs CHANGED
@@ -1745,6 +1745,9 @@ var Buffer3 = installBufferPolyfill();
1745
1745
  // dist/parquet-loader.js
1746
1746
  var import_loader_utils2 = require("@loaders.gl/loader-utils");
1747
1747
 
1748
+ // dist/lib/parsers/parse-parquet.js
1749
+ var import_log = __toESM(require("@probe.gl/log"), 1);
1750
+
1748
1751
  // dist/parquetjs/codecs/plain.js
1749
1752
  var import_int53 = __toESM(require("int53"), 1);
1750
1753
  function encodeValues(type, values, opts) {
@@ -6320,8 +6323,8 @@ var FileMetaData = class {
6320
6323
  };
6321
6324
 
6322
6325
  // dist/lib/constants.js
6323
- var VERSION = true ? "4.2.0" : "latest";
6324
- var PARQUET_WASM_URL = "https://unpkg.com/parquet-wasm@0.6.0-beta.1/esm/arrow1_bg.wasm";
6326
+ var VERSION = true ? "4.3.0-alpha.1" : "latest";
6327
+ var PARQUET_WASM_URL = "https://unpkg.com/parquet-wasm@0.6.1/esm/parquet_wasm_bg.wasm";
6325
6328
  var PARQUET_MAGIC = "PAR1";
6326
6329
  var PARQUET_MAGIC_ENCRYPTED = "PARE";
6327
6330
  var PARQUET_RDLVL_TYPE = "INT32";
@@ -6961,9 +6964,18 @@ async function parseParquetFile(file, options) {
6961
6964
  const rows = [];
6962
6965
  const rowBatches = reader.rowBatchIterator(options == null ? void 0 : options.parquet);
6963
6966
  for await (const rowBatch of rowBatches) {
6967
+ let limitHasReached = false;
6964
6968
  for (const row of rowBatch) {
6969
+ if ((options == null ? void 0 : options.limit) && rows.length >= (options == null ? void 0 : options.limit)) {
6970
+ limitHasReached = true;
6971
+ break;
6972
+ }
6965
6973
  rows.push(row);
6966
6974
  }
6975
+ if (limitHasReached) {
6976
+ import_log.default.warn(`Rows number limit has been reached. Only first ${options == null ? void 0 : options.limit} are loaded`)();
6977
+ break;
6978
+ }
6967
6979
  }
6968
6980
  const objectRowTable = {
6969
6981
  shape: "object-row-table",
@@ -7101,7 +7113,7 @@ function convertRowGroupToTableBatch(rowGroup, parquetSchema, schema) {
7101
7113
  }
7102
7114
 
7103
7115
  // dist/parquet-loader.js
7104
- var VERSION2 = true ? "4.2.0" : "latest";
7116
+ var VERSION2 = true ? "4.3.0-alpha.1" : "latest";
7105
7117
  var ParquetWorkerLoader = {
7106
7118
  dataType: null,
7107
7119
  batchType: null,
@@ -7190,7 +7202,7 @@ var ParquetColumnarLoader = {
7190
7202
  };
7191
7203
 
7192
7204
  // dist/parquet-writer.js
7193
- var VERSION3 = true ? "4.2.0" : "latest";
7205
+ var VERSION3 = true ? "4.3.0-alpha.1" : "latest";
7194
7206
  var ParquetWriter = {
7195
7207
  name: "Apache Parquet",
7196
7208
  id: "parquet",
@@ -7207,10 +7219,14 @@ function encodeSync(data, options) {
7207
7219
  return new ArrayBuffer(0);
7208
7220
  }
7209
7221
 
7210
- // dist/lib/wasm/parse-parquet-wasm.js
7222
+ // dist/parquet-wasm-loader.js
7223
+ var import_loader_utils4 = require("@loaders.gl/loader-utils");
7224
+
7225
+ // dist/lib/parsers/parse-parquet-wasm.js
7211
7226
  var import_arrow = require("@loaders.gl/arrow");
7227
+ var arrow = __toESM(require("apache-arrow"), 1);
7212
7228
 
7213
- // dist/lib/wasm/load-wasm.js
7229
+ // dist/lib/utils/load-wasm.js
7214
7230
  var import_parquet_wasm = __toESM(require("parquet-wasm"), 1);
7215
7231
  var parquetWasm = __toESM(require("parquet-wasm"), 1);
7216
7232
  var initializePromise;
@@ -7225,23 +7241,75 @@ async function loadWasm(wasmUrl = PARQUET_WASM_URL) {
7225
7241
  return parquetWasm;
7226
7242
  }
7227
7243
 
7228
- // dist/lib/wasm/parse-parquet-wasm.js
7229
- var arrow = __toESM(require("apache-arrow"), 1);
7230
- async function parseParquetWasm(arrayBuffer, options) {
7231
- var _a;
7232
- const arr = new Uint8Array(arrayBuffer);
7233
- const wasmUrl = (_a = options == null ? void 0 : options.parquet) == null ? void 0 : _a.wasmUrl;
7234
- const wasm = await loadWasm(wasmUrl);
7235
- const wasmTable = wasm.readParquet(arr);
7244
+ // dist/lib/utils/make-stream-iterator.js
7245
+ var import_loader_utils3 = require("@loaders.gl/loader-utils");
7246
+ function makeStreamIterator(stream, options) {
7247
+ return import_loader_utils3.isBrowser ? makeBrowserStreamIterator(stream, options) : makeNodeStreamIterator(stream, options);
7248
+ }
7249
+ async function* makeBrowserStreamIterator(stream, options) {
7250
+ const reader = stream.getReader();
7251
+ let nextBatchPromise;
7236
7252
  try {
7237
- const ipcStream = wasmTable.intoIPCStream();
7238
- const arrowTable = arrow.tableFromIPC(ipcStream);
7239
- return {
7253
+ while (true) {
7254
+ const currentBatchPromise = nextBatchPromise || reader.read();
7255
+ if (options == null ? void 0 : options._streamReadAhead) {
7256
+ nextBatchPromise = reader.read();
7257
+ }
7258
+ const { done, value } = await currentBatchPromise;
7259
+ if (done) {
7260
+ return;
7261
+ }
7262
+ if (value) {
7263
+ yield value;
7264
+ }
7265
+ }
7266
+ } catch (error) {
7267
+ reader.releaseLock();
7268
+ }
7269
+ }
7270
+ async function* makeNodeStreamIterator(stream, options) {
7271
+ yield* stream;
7272
+ }
7273
+
7274
+ // dist/lib/parsers/parse-parquet-wasm.js
7275
+ async function parseParquetFileWasm(file, options) {
7276
+ const wasmUrl = options == null ? void 0 : options.wasmUrl;
7277
+ const wasm = await loadWasm(wasmUrl);
7278
+ let parquetFile;
7279
+ if (file.handle instanceof Blob) {
7280
+ parquetFile = await wasm.ParquetFile.fromFile(file.handle);
7281
+ } else {
7282
+ parquetFile = await wasm.ParquetFile.fromUrl(file.url);
7283
+ }
7284
+ const wasmTable = await parquetFile.read(options);
7285
+ const ipcStream = wasmTable.intoIPCStream();
7286
+ const arrowTable = arrow.tableFromIPC(ipcStream);
7287
+ return {
7288
+ shape: "arrow-table",
7289
+ schema: (0, import_arrow.serializeArrowSchema)(arrowTable.schema),
7290
+ data: arrowTable
7291
+ };
7292
+ }
7293
+ async function* parseParquetFileInBatchesWasm(file, options) {
7294
+ const wasmUrl = options == null ? void 0 : options.wasmUrl;
7295
+ const wasm = await loadWasm(wasmUrl);
7296
+ let parquetFile;
7297
+ if (file.handle instanceof Blob) {
7298
+ parquetFile = await wasm.ParquetFile.fromFile(file.handle);
7299
+ } else {
7300
+ parquetFile = await wasm.ParquetFile.fromUrl(file.url);
7301
+ }
7302
+ const stream = await parquetFile.stream(options);
7303
+ let schema;
7304
+ for await (const table of makeStreamIterator(stream)) {
7305
+ schema ||= (0, import_arrow.serializeArrowSchema)(table.schema);
7306
+ yield {
7307
+ batchType: "data",
7240
7308
  shape: "arrow-table",
7241
- schema: (0, import_arrow.serializeArrowSchema)(arrowTable.schema),
7242
- data: arrowTable
7309
+ schema,
7310
+ data: table.batches[0],
7311
+ length: table.numRows
7243
7312
  };
7244
- } finally {
7245
7313
  }
7246
7314
  }
7247
7315
 
@@ -7261,7 +7329,19 @@ var ParquetWasmWorkerLoader = {
7261
7329
  tests: ["PAR1", "PARE"],
7262
7330
  options: {
7263
7331
  parquet: {
7264
- type: "arrow-table",
7332
+ shape: "arrow-table",
7333
+ limit: void 0,
7334
+ // Provide a limit to the number of rows to be read.
7335
+ offset: 0,
7336
+ // Provide an offset to skip over the given number of rows.
7337
+ batchSize: void 0,
7338
+ // The number of rows in each batch. If not provided, the upstream parquet default is 1024.
7339
+ columns: void 0,
7340
+ // The column names from the file to read.
7341
+ rowGroups: void 0,
7342
+ // Only read data from the provided row group indexes.
7343
+ concurrency: void 0,
7344
+ // The number of concurrent requests to make
7265
7345
  wasmUrl: PARQUET_WASM_URL
7266
7346
  }
7267
7347
  }
@@ -7269,12 +7349,20 @@ var ParquetWasmWorkerLoader = {
7269
7349
  var ParquetWasmLoader = {
7270
7350
  ...ParquetWasmWorkerLoader,
7271
7351
  parse(arrayBuffer, options) {
7272
- options = { parquet: { ...ParquetWasmLoader.options.parquet, ...options == null ? void 0 : options.parquet }, ...options };
7273
- return parseParquetWasm(arrayBuffer, options);
7352
+ const wasmOptions = { ...ParquetWasmLoader.options.parquet, ...options == null ? void 0 : options.parquet };
7353
+ return parseParquetFileWasm(new import_loader_utils4.BlobFile(arrayBuffer), wasmOptions);
7354
+ },
7355
+ parseFile(file, options) {
7356
+ const wasmOptions = { ...ParquetWasmLoader.options.parquet, ...options == null ? void 0 : options.parquet };
7357
+ return parseParquetFileWasm(file, wasmOptions);
7358
+ },
7359
+ parseFileInBatches(file, options) {
7360
+ const wasmOptions = { ...ParquetWasmLoader.options.parquet, ...options == null ? void 0 : options.parquet };
7361
+ return parseParquetFileInBatchesWasm(file, wasmOptions);
7274
7362
  }
7275
7363
  };
7276
7364
 
7277
- // dist/lib/wasm/encode-parquet-wasm.js
7365
+ // dist/lib/encoders/encode-parquet-wasm.js
7278
7366
  var arrow2 = __toESM(require("apache-arrow"), 1);
7279
7367
  async function encode(table, options) {
7280
7368
  var _a;