@loaders.gl/parquet 4.2.1 → 4.3.0-alpha.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. package/dist/index.cjs +112 -24
  2. package/dist/index.cjs.map +4 -4
  3. package/dist/lib/constants.d.ts +1 -1
  4. package/dist/lib/constants.d.ts.map +1 -1
  5. package/dist/lib/constants.js +2 -2
  6. package/dist/lib/encoders/encode-parquet-wasm.d.ts.map +1 -0
  7. package/dist/lib/{wasm → encoders}/encode-parquet-wasm.js +1 -1
  8. package/dist/lib/parsers/parse-parquet-wasm.d.ts +10 -0
  9. package/dist/lib/parsers/parse-parquet-wasm.d.ts.map +1 -0
  10. package/dist/lib/parsers/parse-parquet-wasm.js +51 -0
  11. package/dist/lib/parsers/parse-parquet.d.ts.map +1 -1
  12. package/dist/lib/parsers/parse-parquet.js +10 -0
  13. package/dist/lib/utils/load-wasm.d.ts +3 -0
  14. package/dist/lib/utils/load-wasm.d.ts.map +1 -0
  15. package/dist/lib/utils/make-stream-iterator.d.ts +11 -0
  16. package/dist/lib/utils/make-stream-iterator.d.ts.map +1 -0
  17. package/dist/lib/utils/make-stream-iterator.js +67 -0
  18. package/dist/parquet-loader.js +1 -1
  19. package/dist/parquet-wasm-loader.d.ts +17 -5
  20. package/dist/parquet-wasm-loader.d.ts.map +1 -1
  21. package/dist/parquet-wasm-loader.js +19 -4
  22. package/dist/parquet-wasm-writer.js +1 -1
  23. package/dist/parquet-writer.js +1 -1
  24. package/dist/parquet_wasm_bg.wasm +0 -0
  25. package/dist/parquetjs/codecs/rle.d.ts.map +1 -1
  26. package/dist/parquetjs/codecs/rle.js +1 -0
  27. package/package.json +13 -12
  28. package/src/lib/constants.ts +2 -1
  29. package/src/lib/{wasm → encoders}/encode-parquet-wasm.ts +1 -1
  30. package/src/lib/parsers/parse-parquet-wasm.ts +72 -0
  31. package/src/lib/parsers/parse-parquet.ts +10 -0
  32. package/src/lib/utils/make-stream-iterator.ts +87 -0
  33. package/src/parquet-wasm-loader.ts +36 -9
  34. package/src/parquet-wasm-writer.ts +1 -1
  35. package/src/parquetjs/codecs/rle.ts +3 -1
  36. package/dist/arrow1_bg.wasm +0 -0
  37. package/dist/lib/wasm/encode-parquet-wasm.d.ts.map +0 -1
  38. package/dist/lib/wasm/load-wasm.d.ts +0 -3
  39. package/dist/lib/wasm/load-wasm.d.ts.map +0 -1
  40. package/dist/lib/wasm/parse-parquet-wasm.d.ts +0 -4
  41. package/dist/lib/wasm/parse-parquet-wasm.d.ts.map +0 -1
  42. package/dist/lib/wasm/parse-parquet-wasm.js +0 -24
  43. package/src/lib/wasm/parse-parquet-wasm.ts +0 -33
  44. package/src/lib/wip/convert-schema-deep.java.disabled +0 -910
  45. package/src/lib/wip/convert-schema-deep.rs.disabled +0 -976
  46. /package/dist/lib/{wasm → encoders}/encode-parquet-wasm.d.ts +0 -0
  47. /package/dist/lib/{wasm → utils}/load-wasm.js +0 -0
  48. /package/src/lib/{wasm → utils}/load-wasm.ts +0 -0
@@ -1,5 +1,5 @@
1
1
  export declare const VERSION: any;
2
- export declare const PARQUET_WASM_URL = "https://unpkg.com/parquet-wasm@0.6.0-beta.1/esm/arrow1_bg.wasm";
2
+ export declare const PARQUET_WASM_URL = "https://unpkg.com/parquet-wasm@0.6.1/esm/parquet_wasm_bg.wasm";
3
3
  /**
4
4
  * Parquet File Magic String
5
5
  */
@@ -1 +1 @@
1
- {"version":3,"file":"constants.d.ts","sourceRoot":"","sources":["../../src/lib/constants.ts"],"names":[],"mappings":"AAMA,eAAO,MAAM,OAAO,KAA8D,CAAC;AACnF,eAAO,MAAM,gBAAgB,mEAAmE,CAAC;AAEjG;;GAEG;AACH,eAAO,MAAM,aAAa,SAAS,CAAC;AACpC,eAAO,MAAM,uBAAuB,SAAS,CAAC;AAE9C;;GAEG;AACH,eAAO,MAAM,eAAe,IAAI,CAAC;AAEjC;;GAEG;AACH,eAAO,MAAM,kBAAkB,UAAU,CAAC;AAC1C,eAAO,MAAM,sBAAsB,QAAQ,CAAC"}
1
+ {"version":3,"file":"constants.d.ts","sourceRoot":"","sources":["../../src/lib/constants.ts"],"names":[],"mappings":"AAMA,eAAO,MAAM,OAAO,KAA8D,CAAC;AAEnF,eAAO,MAAM,gBAAgB,kEAAkE,CAAC;AAEhG;;GAEG;AACH,eAAO,MAAM,aAAa,SAAS,CAAC;AACpC,eAAO,MAAM,uBAAuB,SAAS,CAAC;AAE9C;;GAEG;AACH,eAAO,MAAM,eAAe,IAAI,CAAC;AAEjC;;GAEG;AACH,eAAO,MAAM,kBAAkB,UAAU,CAAC;AAC1C,eAAO,MAAM,sBAAsB,QAAQ,CAAC"}
@@ -3,8 +3,8 @@
3
3
  // Copyright (c) vis.gl contributors
4
4
  // __VERSION__ is injected by babel-plugin-version-inline
5
5
  // @ts-ignore TS2304: Cannot find name '__VERSION__'.
6
- export const VERSION = typeof "4.2.0" !== 'undefined' ? "4.2.0" : 'latest';
7
- export const PARQUET_WASM_URL = 'https://unpkg.com/parquet-wasm@0.6.0-beta.1/esm/arrow1_bg.wasm';
6
+ export const VERSION = typeof "4.3.0-alpha.1" !== 'undefined' ? "4.3.0-alpha.1" : 'latest';
7
+ export const PARQUET_WASM_URL = 'https://unpkg.com/parquet-wasm@0.6.1/esm/parquet_wasm_bg.wasm';
8
8
  /**
9
9
  * Parquet File Magic String
10
10
  */
@@ -0,0 +1 @@
1
+ {"version":3,"file":"encode-parquet-wasm.d.ts","sourceRoot":"","sources":["../../../src/lib/encoders/encode-parquet-wasm.ts"],"names":[],"mappings":"AAIA,OAAO,KAAK,EAAC,UAAU,EAAC,MAAM,mBAAmB,CAAC;AAKlD,OAAO,KAAK,EAAC,oBAAoB,EAAC,qCAAkC;AAEpE;;GAEG;AACH,wBAAsB,MAAM,CAC1B,KAAK,EAAE,UAAU,EACjB,OAAO,EAAE,oBAAoB,GAC5B,OAAO,CAAC,WAAW,CAAC,CAsBtB"}
@@ -2,7 +2,7 @@
2
2
  // SPDX-License-Identifier: MIT
3
3
  // Copyright (c) vis.gl contributors
4
4
  import * as arrow from 'apache-arrow';
5
- import { loadWasm } from "./load-wasm.js";
5
+ import { loadWasm } from "../utils/load-wasm.js";
6
6
  /**
7
7
  * Encode Arrow arrow.Table to Parquet buffer
8
8
  */
@@ -0,0 +1,10 @@
1
+ import type { ReadableFile } from '@loaders.gl/loader-utils';
2
+ import type { ArrowTable, ArrowTableBatch } from '@loaders.gl/schema';
3
+ import type * as parquetWasm from 'parquet-wasm';
4
+ export declare function parseParquetFileWasm(file: ReadableFile, options?: parquetWasm.ReaderOptions & {
5
+ wasmUrl: string;
6
+ }): Promise<ArrowTable>;
7
+ export declare function parseParquetFileInBatchesWasm(file: ReadableFile, options: parquetWasm.ReaderOptions & {
8
+ wasmUrl: string;
9
+ }): AsyncIterable<ArrowTableBatch>;
10
+ //# sourceMappingURL=parse-parquet-wasm.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"parse-parquet-wasm.d.ts","sourceRoot":"","sources":["../../../src/lib/parsers/parse-parquet-wasm.ts"],"names":[],"mappings":"AAKA,OAAO,KAAK,EAAC,YAAY,EAAC,MAAM,0BAA0B,CAAC;AAC3D,OAAO,KAAK,EAAC,UAAU,EAAE,eAAe,EAAS,MAAM,oBAAoB,CAAC;AAG5E,OAAO,KAAK,KAAK,WAAW,MAAM,cAAc,CAAC;AAMjD,wBAAsB,oBAAoB,CACxC,IAAI,EAAE,YAAY,EAClB,OAAO,CAAC,EAAE,WAAW,CAAC,aAAa,GAAG;IAAC,OAAO,EAAE,MAAM,CAAA;CAAC,GACtD,OAAO,CAAC,UAAU,CAAC,CAsBrB;AAED,wBAAuB,6BAA6B,CAClD,IAAI,EAAE,YAAY,EAClB,OAAO,EAAE,WAAW,CAAC,aAAa,GAAG;IAAC,OAAO,EAAE,MAAM,CAAA;CAAC,GACrD,aAAa,CAAC,eAAe,CAAC,CA0BhC"}
@@ -0,0 +1,51 @@
1
+ // loaders.gl
2
+ // SPDX-License-Identifier: MIT
3
+ // Copyright (c) vis.gl contributors
4
+ import { serializeArrowSchema } from '@loaders.gl/arrow';
5
+ import * as arrow from 'apache-arrow';
6
+ import { loadWasm } from "../utils/load-wasm.js";
7
+ import { makeStreamIterator } from "../utils/make-stream-iterator.js";
8
+ export async function parseParquetFileWasm(file, options) {
9
+ const wasmUrl = options?.wasmUrl;
10
+ const wasm = await loadWasm(wasmUrl);
11
+ let parquetFile;
12
+ if (file.handle instanceof Blob) {
13
+ // TODO - let's assume fromFile() works on Blobs and not just on File...
14
+ parquetFile = await wasm.ParquetFile.fromFile(file.handle);
15
+ }
16
+ else {
17
+ parquetFile = await wasm.ParquetFile.fromUrl(file.url);
18
+ }
19
+ const wasmTable = await parquetFile.read(options);
20
+ const ipcStream = wasmTable.intoIPCStream();
21
+ const arrowTable = arrow.tableFromIPC(ipcStream);
22
+ return {
23
+ shape: 'arrow-table',
24
+ schema: serializeArrowSchema(arrowTable.schema),
25
+ data: arrowTable
26
+ };
27
+ }
28
+ export async function* parseParquetFileInBatchesWasm(file, options) {
29
+ const wasmUrl = options?.wasmUrl;
30
+ const wasm = await loadWasm(wasmUrl);
31
+ let parquetFile;
32
+ if (file.handle instanceof Blob) {
33
+ // Works on Blobs: https://kylebarron.dev/parquet-wasm/classes/esm_parquet_wasm.ParquetFile.html#fromFile
34
+ parquetFile = await wasm.ParquetFile.fromFile(file.handle);
35
+ }
36
+ else {
37
+ parquetFile = await wasm.ParquetFile.fromUrl(file.url);
38
+ }
39
+ const stream = await parquetFile.stream(options);
40
+ let schema;
41
+ for await (const table of makeStreamIterator(stream)) {
42
+ schema ||= serializeArrowSchema(table.schema);
43
+ yield {
44
+ batchType: 'data',
45
+ shape: 'arrow-table',
46
+ schema,
47
+ data: table.batches[0],
48
+ length: table.numRows
49
+ };
50
+ }
51
+ }
@@ -1 +1 @@
1
- {"version":3,"file":"parse-parquet.d.ts","sourceRoot":"","sources":["../../../src/lib/parsers/parse-parquet.ts"],"names":[],"mappings":"AAIA,OAAO,KAAK,EAAC,YAAY,EAAC,MAAM,0BAA0B,CAAC;AAC3D,OAAO,KAAK,EAAC,cAAc,EAAE,mBAAmB,EAAC,MAAM,oBAAoB,CAAC;AAE5E,OAAO,KAAK,EAAC,oBAAoB,EAAC,gCAA6B;AAO/D;;;;;GAKG;AACH,wBAAsB,gBAAgB,CACpC,IAAI,EAAE,YAAY,EAClB,OAAO,CAAC,EAAE,oBAAoB,GAC7B,OAAO,CAAC,cAAc,CAAC,CA2BzB;AAED;;;;GAIG;AACH,wBAAuB,yBAAyB,CAC9C,IAAI,EAAE,YAAY,EAClB,OAAO,CAAC,EAAE,oBAAoB,GAC7B,aAAa,CAAC,mBAAmB,CAAC,CA0BpC"}
1
+ {"version":3,"file":"parse-parquet.d.ts","sourceRoot":"","sources":["../../../src/lib/parsers/parse-parquet.ts"],"names":[],"mappings":"AAKA,OAAO,KAAK,EAAC,YAAY,EAAC,MAAM,0BAA0B,CAAC;AAC3D,OAAO,KAAK,EAAC,cAAc,EAAE,mBAAmB,EAAC,MAAM,oBAAoB,CAAC;AAE5E,OAAO,KAAK,EAAC,oBAAoB,EAAC,gCAA6B;AAO/D;;;;;GAKG;AACH,wBAAsB,gBAAgB,CACpC,IAAI,EAAE,YAAY,EAClB,OAAO,CAAC,EAAE,oBAAoB,GAC7B,OAAO,CAAC,cAAc,CAAC,CAoCzB;AAED;;;;GAIG;AACH,wBAAuB,yBAAyB,CAC9C,IAAI,EAAE,YAAY,EAClB,OAAO,CAAC,EAAE,oBAAoB,GAC7B,aAAa,CAAC,mBAAmB,CAAC,CA0BpC"}
@@ -1,6 +1,7 @@
1
1
  // loaders.gl
2
2
  // SPDX-License-Identifier: MIT
3
3
  // Copyright (c) vis.gl contributors
4
+ import { default as log } from '@probe.gl/log';
4
5
  import { ParquetReader } from "../../parquetjs/parser/parquet-reader.js";
5
6
  import { getSchemaFromParquetReader } from "./get-parquet-schema.js";
6
7
  import { installBufferPolyfill } from "../../polyfills/buffer/index.js";
@@ -21,10 +22,19 @@ export async function parseParquetFile(file, options) {
21
22
  const rows = [];
22
23
  const rowBatches = reader.rowBatchIterator(options?.parquet);
23
24
  for await (const rowBatch of rowBatches) {
25
+ let limitHasReached = false;
24
26
  // we have only one input batch so return
25
27
  for (const row of rowBatch) {
28
+ if (options?.limit && rows.length >= options?.limit) {
29
+ limitHasReached = true;
30
+ break;
31
+ }
26
32
  rows.push(row);
27
33
  }
34
+ if (limitHasReached) {
35
+ log.warn(`Rows number limit has been reached. Only first ${options?.limit} are loaded`)();
36
+ break;
37
+ }
28
38
  }
29
39
  const objectRowTable = {
30
40
  shape: 'object-row-table',
@@ -0,0 +1,3 @@
1
+ import * as parquetWasm from 'parquet-wasm';
2
+ export declare function loadWasm(wasmUrl?: string): Promise<typeof parquetWasm>;
3
+ //# sourceMappingURL=load-wasm.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"load-wasm.d.ts","sourceRoot":"","sources":["../../../src/lib/utils/load-wasm.ts"],"names":[],"mappings":"AAMA,OAAO,KAAK,WAAW,MAAM,cAAc,CAAC;AAK5C,wBAAsB,QAAQ,CAAC,OAAO,GAAE,MAAyB,+BAUhE"}
@@ -0,0 +1,11 @@
1
+ /// <reference types="node" />
2
+ import type { Readable } from 'stream';
3
+ export type StreamIteratorOptions = {
4
+ _streamReadAhead?: boolean;
5
+ };
6
+ /**
7
+ * Returns an async iterable that reads from a stream (works in both Node.js and browsers)
8
+ * @param stream stream to iterate over
9
+ */
10
+ export declare function makeStreamIterator<T>(stream: ReadableStream<T> | Readable, options?: StreamIteratorOptions): AsyncIterable<T>;
11
+ //# sourceMappingURL=make-stream-iterator.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"make-stream-iterator.d.ts","sourceRoot":"","sources":["../../../src/lib/utils/make-stream-iterator.ts"],"names":[],"mappings":";AAIA,OAAO,KAAK,EAAC,QAAQ,EAAC,MAAM,QAAQ,CAAC;AAGrC,MAAM,MAAM,qBAAqB,GAAG;IAClC,gBAAgB,CAAC,EAAE,OAAO,CAAC;CAC5B,CAAC;AAEF;;;GAGG;AACH,wBAAgB,kBAAkB,CAAC,CAAC,EAClC,MAAM,EAAE,cAAc,CAAC,CAAC,CAAC,GAAG,QAAQ,EACpC,OAAO,CAAC,EAAE,qBAAqB,GAC9B,aAAa,CAAC,CAAC,CAAC,CAIlB"}
@@ -0,0 +1,67 @@
1
+ // loaders.gl
2
+ // SPDX-License-Identifier: MIT
3
+ // Copyright (c) vis.gl contributors
4
+ import { isBrowser } from '@loaders.gl/loader-utils';
5
+ /**
6
+ * Returns an async iterable that reads from a stream (works in both Node.js and browsers)
7
+ * @param stream stream to iterate over
8
+ */
9
+ export function makeStreamIterator(stream, options) {
10
+ return isBrowser
11
+ ? makeBrowserStreamIterator(stream, options)
12
+ : makeNodeStreamIterator(stream, options);
13
+ }
14
+ /**
15
+ * Returns an async iterable that reads from a DOM (browser) stream
16
+ * @param stream stream to iterate from
17
+ * @see https://jakearchibald.com/2017/async-iterators-and-generators/#making-streams-iterate
18
+ */
19
+ async function* makeBrowserStreamIterator(stream, options) {
20
+ // WhatWG: stream is supposed to have a `getIterator` method
21
+ // if (typeof stream.getIterator === 'function') {
22
+ // return stream.getIterator();
23
+ // }
24
+ // if (typeof stream[Symbol.asyncIterator] === 'function') {
25
+ // return makeToArrayBufferIterator(stream);
26
+ // }
27
+ // In the browser, we first need to get a lock on the stream
28
+ const reader = stream.getReader();
29
+ let nextBatchPromise;
30
+ try {
31
+ // eslint-disable-next-line no-constant-condition
32
+ while (true) {
33
+ const currentBatchPromise = nextBatchPromise || reader.read();
34
+ // Issue a read for an additional batch, while we await the next batch
35
+ // Idea is to make fetching happen in parallel with processing / parsing
36
+ if (options?._streamReadAhead) {
37
+ nextBatchPromise = reader.read();
38
+ }
39
+ // Read from the stream
40
+ // value is a Uint8Array
41
+ const { done, value } = await currentBatchPromise;
42
+ // Exit if we're done
43
+ if (done) {
44
+ return;
45
+ }
46
+ // Else yield the chunk
47
+ if (value) {
48
+ yield value;
49
+ }
50
+ }
51
+ }
52
+ catch (error) {
53
+ // TODO - examples makes it look like this should always be called,
54
+ // but that generates exceptions so only call it if we do not reach the end
55
+ reader.releaseLock();
56
+ }
57
+ }
58
+ /**
59
+ * Returns an async iterable that reads from a Node.js stream
60
+ * @param stream stream to iterate from
61
+ * @note Requires Node.js >= 10
62
+ */
63
+ async function* makeNodeStreamIterator(stream, options) {
64
+ // Hacky test for node version to ensure we don't call bad polyfills
65
+ // NODE 10+: stream is an asyncIterator
66
+ yield* stream;
67
+ }
@@ -8,7 +8,7 @@ import { parseParquetFileInColumns, parseParquetFileInColumnarBatches } from "./
8
8
  export { Buffer } from "./polyfills/buffer/install-buffer-polyfill.js";
9
9
  // __VERSION__ is injected by babel-plugin-version-inline
10
10
  // @ts-ignore TS2304: Cannot find name '__VERSION__'.
11
- const VERSION = typeof "4.2.0" !== 'undefined' ? "4.2.0" : 'latest';
11
+ const VERSION = typeof "4.3.0-alpha.1" !== 'undefined' ? "4.3.0-alpha.1" : 'latest';
12
12
  /**
13
13
  * ParquetJS table loader
14
14
  */
@@ -1,16 +1,22 @@
1
+ import type { ArrowTable, ArrowTableBatch } from '@loaders.gl/schema';
1
2
  import type { LoaderOptions } from '@loaders.gl/loader-utils';
2
- import type { ArrowTable } from '@loaders.gl/arrow';
3
3
  /** Parquet WASM loader options */
4
4
  export type ParquetWasmLoaderOptions = LoaderOptions & {
5
5
  parquet?: {
6
- type?: 'arrow-table';
6
+ shape?: 'arrow-table';
7
+ limit?: number;
8
+ offset?: number;
9
+ batchSize?: number;
10
+ columns?: string[];
11
+ rowGroups?: number[];
12
+ concurrency?: number;
7
13
  wasmUrl?: string;
8
14
  };
9
15
  };
10
16
  /** Parquet WASM table loader */
11
17
  export declare const ParquetWasmWorkerLoader: {
12
18
  readonly dataType: ArrowTable;
13
- readonly batchType: never;
19
+ readonly batchType: ArrowTableBatch;
14
20
  readonly name: "Apache Parquet";
15
21
  readonly id: "parquet-wasm";
16
22
  readonly module: "parquet";
@@ -23,8 +29,14 @@ export declare const ParquetWasmWorkerLoader: {
23
29
  readonly tests: ["PAR1", "PARE"];
24
30
  readonly options: {
25
31
  readonly parquet: {
26
- readonly type: "arrow-table";
27
- readonly wasmUrl: "https://unpkg.com/parquet-wasm@0.6.0-beta.1/esm/arrow1_bg.wasm";
32
+ readonly shape: "arrow-table";
33
+ readonly limit: undefined;
34
+ readonly offset: 0;
35
+ readonly batchSize: undefined;
36
+ readonly columns: undefined;
37
+ readonly rowGroups: undefined;
38
+ readonly concurrency: undefined;
39
+ readonly wasmUrl: "https://unpkg.com/parquet-wasm@0.6.1/esm/parquet_wasm_bg.wasm";
28
40
  };
29
41
  };
30
42
  };
@@ -1 +1 @@
1
- {"version":3,"file":"parquet-wasm-loader.d.ts","sourceRoot":"","sources":["../src/parquet-wasm-loader.ts"],"names":[],"mappings":"AAIA,OAAO,KAAK,EAA2B,aAAa,EAAC,MAAM,0BAA0B,CAAC;AACtF,OAAO,KAAK,EAAC,UAAU,EAAC,MAAM,mBAAmB,CAAC;AAKlD,kCAAkC;AAClC,MAAM,MAAM,wBAAwB,GAAG,aAAa,GAAG;IACrD,OAAO,CAAC,EAAE;QACR,IAAI,CAAC,EAAE,aAAa,CAAC;QACrB,OAAO,CAAC,EAAE,MAAM,CAAC;KAClB,CAAC;CACH,CAAC;AAEF,gCAAgC;AAChC,eAAO,MAAM,uBAAuB;;;;;;;;;;;;;;;;;;;CAoBoC,CAAC;AAEzE,gCAAgC;AAChC,eAAO,MAAM,iBAAiB,KAMoD,CAAC"}
1
+ {"version":3,"file":"parquet-wasm-loader.d.ts","sourceRoot":"","sources":["../src/parquet-wasm-loader.ts"],"names":[],"mappings":"AAIA,OAAO,KAAK,EAAC,UAAU,EAAE,eAAe,EAAC,MAAM,oBAAoB,CAAC;AACpE,OAAO,KAAK,EAA2B,aAAa,EAAC,MAAM,0BAA0B,CAAC;AAStF,kCAAkC;AAClC,MAAM,MAAM,wBAAwB,GAAG,aAAa,GAAG;IACrD,OAAO,CAAC,EAAE;QACR,KAAK,CAAC,EAAE,aAAa,CAAC;QACtB,KAAK,CAAC,EAAE,MAAM,CAAC;QACf,MAAM,CAAC,EAAE,MAAM,CAAC;QAChB,SAAS,CAAC,EAAE,MAAM,CAAC;QACnB,OAAO,CAAC,EAAE,MAAM,EAAE,CAAC;QACnB,SAAS,CAAC,EAAE,MAAM,EAAE,CAAC;QACrB,WAAW,CAAC,EAAE,MAAM,CAAC;QACrB,OAAO,CAAC,EAAE,MAAM,CAAC;KAClB,CAAC;CACH,CAAC;AAEF,gCAAgC;AAChC,eAAO,MAAM,uBAAuB;;;;;;;;;;;;;;;;;;;;;;;;;CA0B8C,CAAC;AAEnF,gCAAgC;AAChC,eAAO,MAAM,iBAAiB,KAiB8D,CAAC"}
@@ -1,7 +1,8 @@
1
1
  // loaders.gl
2
2
  // SPDX-License-Identifier: MIT
3
3
  // Copyright (c) vis.gl contributors
4
- import { parseParquetWasm } from "./lib/wasm/parse-parquet-wasm.js";
4
+ import { BlobFile } from '@loaders.gl/loader-utils';
5
+ import { parseParquetFileWasm, parseParquetFileInBatchesWasm } from "./lib/parsers/parse-parquet-wasm.js";
5
6
  import { VERSION, PARQUET_WASM_URL } from "./lib/constants.js";
6
7
  /** Parquet WASM table loader */
7
8
  export const ParquetWasmWorkerLoader = {
@@ -19,7 +20,13 @@ export const ParquetWasmWorkerLoader = {
19
20
  tests: ['PAR1', 'PARE'],
20
21
  options: {
21
22
  parquet: {
22
- type: 'arrow-table',
23
+ shape: 'arrow-table',
24
+ limit: undefined, // Provide a limit to the number of rows to be read.
25
+ offset: 0, // Provide an offset to skip over the given number of rows.
26
+ batchSize: undefined, // The number of rows in each batch. If not provided, the upstream parquet default is 1024.
27
+ columns: undefined, // The column names from the file to read.
28
+ rowGroups: undefined, // Only read data from the provided row group indexes.
29
+ concurrency: undefined, // The number of concurrent requests to make
23
30
  wasmUrl: PARQUET_WASM_URL
24
31
  }
25
32
  }
@@ -28,7 +35,15 @@ export const ParquetWasmWorkerLoader = {
28
35
  export const ParquetWasmLoader = {
29
36
  ...ParquetWasmWorkerLoader,
30
37
  parse(arrayBuffer, options) {
31
- options = { parquet: { ...ParquetWasmLoader.options.parquet, ...options?.parquet }, ...options };
32
- return parseParquetWasm(arrayBuffer, options);
38
+ const wasmOptions = { ...ParquetWasmLoader.options.parquet, ...options?.parquet };
39
+ return parseParquetFileWasm(new BlobFile(arrayBuffer), wasmOptions);
40
+ },
41
+ parseFile(file, options) {
42
+ const wasmOptions = { ...ParquetWasmLoader.options.parquet, ...options?.parquet };
43
+ return parseParquetFileWasm(file, wasmOptions);
44
+ },
45
+ parseFileInBatches(file, options) {
46
+ const wasmOptions = { ...ParquetWasmLoader.options.parquet, ...options?.parquet };
47
+ return parseParquetFileInBatchesWasm(file, wasmOptions);
33
48
  }
34
49
  };
@@ -1,7 +1,7 @@
1
1
  // loaders.gl
2
2
  // SPDX-License-Identifier: MIT
3
3
  // Copyright (c) vis.gl contributors
4
- import { encode } from "./lib/wasm/encode-parquet-wasm.js";
4
+ import { encode } from "./lib/encoders/encode-parquet-wasm.js";
5
5
  import { VERSION, PARQUET_WASM_URL } from "./lib/constants.js";
6
6
  /** Parquet WASM writer */
7
7
  export const ParquetWasmWriter = {
@@ -3,7 +3,7 @@
3
3
  // Copyright (c) vis.gl contributors
4
4
  // __VERSION__ is injected by babel-plugin-version-inline
5
5
  // @ts-ignore TS2304: Cannot find name '__VERSION__'.
6
- const VERSION = typeof "4.2.0" !== 'undefined' ? "4.2.0" : 'latest';
6
+ const VERSION = typeof "4.3.0-alpha.1" !== 'undefined' ? "4.3.0-alpha.1" : 'latest';
7
7
  export const ParquetWriter = {
8
8
  name: 'Apache Parquet',
9
9
  id: 'parquet',
Binary file
@@ -1 +1 @@
1
- {"version":3,"file":"rle.d.ts","sourceRoot":"","sources":["../../../src/parquetjs/codecs/rle.ts"],"names":[],"mappings":";AAEA,OAAO,KAAK,EAAC,aAAa,EAAC,6BAA0B;AACrD,OAAO,KAAK,EAAC,YAAY,EAAE,mBAAmB,EAAC,qBAAkB;AAIjE,wBAAgB,YAAY,CAC1B,IAAI,EAAE,aAAa,EACnB,MAAM,EAAE,GAAG,EAAE,EACb,IAAI,EAAE,mBAAmB,GACxB,MAAM,CA4DR;AAED,wBAAgB,YAAY,CAC1B,IAAI,EAAE,aAAa,EACnB,MAAM,EAAE,YAAY,EACpB,KAAK,EAAE,MAAM,EACb,IAAI,EAAE,mBAAmB,GACxB,MAAM,EAAE,CAgCV"}
1
+ {"version":3,"file":"rle.d.ts","sourceRoot":"","sources":["../../../src/parquetjs/codecs/rle.ts"],"names":[],"mappings":";AAEA,OAAO,KAAK,EAAC,aAAa,EAAC,6BAA0B;AACrD,OAAO,KAAK,EAAC,YAAY,EAAE,mBAAmB,EAAC,qBAAkB;AAIjE,wBAAgB,YAAY,CAC1B,IAAI,EAAE,aAAa,EACnB,MAAM,EAAE,GAAG,EAAE,EACb,IAAI,EAAE,mBAAmB,GACxB,MAAM,CA4DR;AAED,wBAAgB,YAAY,CAC1B,IAAI,EAAE,aAAa,EACnB,MAAM,EAAE,YAAY,EACpB,KAAK,EAAE,MAAM,EACb,IAAI,EAAE,mBAAmB,GACxB,MAAM,EAAE,CAkCV"}
@@ -76,6 +76,7 @@ export function decodeValues(type, cursor, count, opts) {
76
76
  const count = header >> 1;
77
77
  decodedValues = decodeRunRepeated(cursor, count, opts);
78
78
  }
79
+ // strange failure in docusaurus / webpack if we don't cast the type here
79
80
  for (const value of decodedValues) {
80
81
  values.push(value);
81
82
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@loaders.gl/parquet",
3
- "version": "4.2.1",
3
+ "version": "4.3.0-alpha.2",
4
4
  "description": "Framework-independent loader for Apache Parquet files",
5
5
  "license": "MIT",
6
6
  "type": "module",
@@ -43,7 +43,7 @@
43
43
  "build-bundle": "ocular-bundle ./bundle.ts --output=dist/dist.min.js",
44
44
  "build-bundle-dev": "ocular-bundle ./bundle.ts --env=dev --output=dist/dist.dev.js",
45
45
  "build-worker": "esbuild src/workers/parquet-worker.ts --outfile=dist/parquet-worker.js --bundle --minify --sourcemap --external:{util,fs,path,crypto,events} --define:__VERSION__=\\\"$npm_package_version\\\"",
46
- "copy-wasm": "cp ../../node_modules/parquet-wasm/esm2/arrow1_bg.wasm dist/arrow1_bg.wasm"
46
+ "copy-wasm": "cp ../../node_modules/parquet-wasm/esm/parquet_wasm_bg.wasm dist/parquet_wasm_bg.wasm"
47
47
  },
48
48
  "browser": {
49
49
  "./dist/polyfills/buffer/buffer-polyfill.node.js": "./dist/polyfills/buffer/buffer-polyfill.browser.js",
@@ -60,13 +60,14 @@
60
60
  "base64-js and ieee754 are used by buffer polyfill"
61
61
  ],
62
62
  "dependencies": {
63
- "@loaders.gl/arrow": "4.2.1",
64
- "@loaders.gl/bson": "4.2.1",
65
- "@loaders.gl/compression": "4.2.1",
66
- "@loaders.gl/gis": "4.2.1",
67
- "@loaders.gl/loader-utils": "4.2.1",
68
- "@loaders.gl/schema": "4.2.1",
69
- "@loaders.gl/wkt": "4.2.1",
63
+ "@loaders.gl/arrow": "4.3.0-alpha.2",
64
+ "@loaders.gl/bson": "4.3.0-alpha.2",
65
+ "@loaders.gl/compression": "4.3.0-alpha.2",
66
+ "@loaders.gl/gis": "4.3.0-alpha.2",
67
+ "@loaders.gl/loader-utils": "4.3.0-alpha.2",
68
+ "@loaders.gl/schema": "4.3.0-alpha.2",
69
+ "@loaders.gl/wkt": "4.3.0-alpha.2",
70
+ "@probe.gl/log": "^4.0.9",
70
71
  "async-mutex": "^0.2.2",
71
72
  "base64-js": "^1.3.1",
72
73
  "brotli": "^1.3.2",
@@ -75,11 +76,11 @@
75
76
  "lz4js": "^0.2.0",
76
77
  "node-int64": "^0.4.0",
77
78
  "object-stream": "0.0.1",
78
- "parquet-wasm": "^0.6.0-beta.1",
79
+ "parquet-wasm": "^0.6.1",
79
80
  "snappyjs": "^0.6.0",
80
81
  "thrift": "^0.19.0",
81
82
  "util": "^0.12.5",
82
- "varint": "^5.0.0",
83
+ "varint": "^6.0.0",
83
84
  "zstd-codec": "^0.1"
84
85
  },
85
86
  "devDependencies": {
@@ -93,5 +94,5 @@
93
94
  "@loaders.gl/core": "^4.0.0",
94
95
  "apache-arrow": ">= 15.0.0"
95
96
  },
96
- "gitHead": "b2ea799ff9f233ab8af0a6c3696e28ddf6e3c5a3"
97
+ "gitHead": "77a3cb538ab7a1fbf74245f25590210451689f5c"
97
98
  }
@@ -5,7 +5,8 @@
5
5
  // __VERSION__ is injected by babel-plugin-version-inline
6
6
  // @ts-ignore TS2304: Cannot find name '__VERSION__'.
7
7
  export const VERSION = typeof __VERSION__ !== 'undefined' ? __VERSION__ : 'latest';
8
- export const PARQUET_WASM_URL = 'https://unpkg.com/parquet-wasm@0.6.0-beta.1/esm/arrow1_bg.wasm';
8
+
9
+ export const PARQUET_WASM_URL = 'https://unpkg.com/parquet-wasm@0.6.1/esm/parquet_wasm_bg.wasm';
9
10
 
10
11
  /**
11
12
  * Parquet File Magic String
@@ -5,7 +5,7 @@
5
5
  import type {ArrowTable} from '@loaders.gl/arrow';
6
6
 
7
7
  import * as arrow from 'apache-arrow';
8
- import {loadWasm} from './load-wasm';
8
+ import {loadWasm} from '../utils/load-wasm';
9
9
 
10
10
  import type {ParquetWriterOptions} from '../../parquet-wasm-writer';
11
11
 
@@ -0,0 +1,72 @@
1
+ // loaders.gl
2
+ // SPDX-License-Identifier: MIT
3
+ // Copyright (c) vis.gl contributors
4
+
5
+ // eslint-disable
6
+ import type {ReadableFile} from '@loaders.gl/loader-utils';
7
+ import type {ArrowTable, ArrowTableBatch, Schema} from '@loaders.gl/schema';
8
+ import {serializeArrowSchema} from '@loaders.gl/arrow';
9
+
10
+ import type * as parquetWasm from 'parquet-wasm';
11
+ import * as arrow from 'apache-arrow';
12
+
13
+ import {loadWasm} from '../utils/load-wasm';
14
+ import {makeStreamIterator} from '../utils/make-stream-iterator';
15
+
16
+ export async function parseParquetFileWasm(
17
+ file: ReadableFile,
18
+ options?: parquetWasm.ReaderOptions & {wasmUrl: string}
19
+ ): Promise<ArrowTable> {
20
+ const wasmUrl = options?.wasmUrl;
21
+
22
+ const wasm = await loadWasm(wasmUrl);
23
+
24
+ let parquetFile: parquetWasm.ParquetFile;
25
+ if (file.handle instanceof Blob) {
26
+ // TODO - let's assume fromFile() works on Blobs and not just on File...
27
+ parquetFile = await wasm.ParquetFile.fromFile(file.handle as File);
28
+ } else {
29
+ parquetFile = await wasm.ParquetFile.fromUrl(file.url);
30
+ }
31
+
32
+ const wasmTable = await parquetFile.read(options);
33
+ const ipcStream = wasmTable.intoIPCStream();
34
+ const arrowTable = arrow.tableFromIPC(ipcStream);
35
+
36
+ return {
37
+ shape: 'arrow-table',
38
+ schema: serializeArrowSchema(arrowTable.schema),
39
+ data: arrowTable
40
+ };
41
+ }
42
+
43
+ export async function* parseParquetFileInBatchesWasm(
44
+ file: ReadableFile,
45
+ options: parquetWasm.ReaderOptions & {wasmUrl: string}
46
+ ): AsyncIterable<ArrowTableBatch> {
47
+ const wasmUrl = options?.wasmUrl;
48
+
49
+ const wasm = await loadWasm(wasmUrl);
50
+
51
+ let parquetFile: parquetWasm.ParquetFile;
52
+ if (file.handle instanceof Blob) {
53
+ // Works on Blobs: https://kylebarron.dev/parquet-wasm/classes/esm_parquet_wasm.ParquetFile.html#fromFile
54
+ parquetFile = await wasm.ParquetFile.fromFile(file.handle as File);
55
+ } else {
56
+ parquetFile = await wasm.ParquetFile.fromUrl(file.url);
57
+ }
58
+
59
+ const stream: ReadableStream<arrow.Table> = await parquetFile.stream(options);
60
+
61
+ let schema: Schema;
62
+ for await (const table of makeStreamIterator(stream)) {
63
+ schema ||= serializeArrowSchema(table.schema);
64
+ yield {
65
+ batchType: 'data',
66
+ shape: 'arrow-table',
67
+ schema,
68
+ data: table.batches[0],
69
+ length: table.numRows
70
+ };
71
+ }
72
+ }
@@ -2,6 +2,7 @@
2
2
  // SPDX-License-Identifier: MIT
3
3
  // Copyright (c) vis.gl contributors
4
4
 
5
+ import {default as log} from '@probe.gl/log';
5
6
  import type {ReadableFile} from '@loaders.gl/loader-utils';
6
7
  import type {ObjectRowTable, ObjectRowTableBatch} from '@loaders.gl/schema';
7
8
 
@@ -35,10 +36,19 @@ export async function parseParquetFile(
35
36
 
36
37
  const rowBatches = reader.rowBatchIterator(options?.parquet);
37
38
  for await (const rowBatch of rowBatches) {
39
+ let limitHasReached = false;
38
40
  // we have only one input batch so return
39
41
  for (const row of rowBatch) {
42
+ if (options?.limit && rows.length >= options?.limit) {
43
+ limitHasReached = true;
44
+ break;
45
+ }
40
46
  rows.push(row);
41
47
  }
48
+ if (limitHasReached) {
49
+ log.warn(`Rows number limit has been reached. Only first ${options?.limit} are loaded`)();
50
+ break;
51
+ }
42
52
  }
43
53
  const objectRowTable: ObjectRowTable = {
44
54
  shape: 'object-row-table',
@@ -0,0 +1,87 @@
1
+ // loaders.gl
2
+ // SPDX-License-Identifier: MIT
3
+ // Copyright (c) vis.gl contributors
4
+
5
+ import type {Readable} from 'stream';
6
+ import {isBrowser} from '@loaders.gl/loader-utils';
7
+
8
+ export type StreamIteratorOptions = {
9
+ _streamReadAhead?: boolean;
10
+ };
11
+
12
+ /**
13
+ * Returns an async iterable that reads from a stream (works in both Node.js and browsers)
14
+ * @param stream stream to iterate over
15
+ */
16
+ export function makeStreamIterator<T>(
17
+ stream: ReadableStream<T> | Readable,
18
+ options?: StreamIteratorOptions
19
+ ): AsyncIterable<T> {
20
+ return isBrowser
21
+ ? makeBrowserStreamIterator(stream as ReadableStream<T>, options)
22
+ : makeNodeStreamIterator(stream as Readable, options);
23
+ }
24
+
25
+ /**
26
+ * Returns an async iterable that reads from a DOM (browser) stream
27
+ * @param stream stream to iterate from
28
+ * @see https://jakearchibald.com/2017/async-iterators-and-generators/#making-streams-iterate
29
+ */
30
+ async function* makeBrowserStreamIterator<T>(
31
+ stream: ReadableStream<T>,
32
+ options?: StreamIteratorOptions
33
+ ): AsyncIterable<T> {
34
+ // WhatWG: stream is supposed to have a `getIterator` method
35
+ // if (typeof stream.getIterator === 'function') {
36
+ // return stream.getIterator();
37
+ // }
38
+ // if (typeof stream[Symbol.asyncIterator] === 'function') {
39
+ // return makeToArrayBufferIterator(stream);
40
+ // }
41
+
42
+ // In the browser, we first need to get a lock on the stream
43
+ const reader = stream.getReader();
44
+
45
+ let nextBatchPromise: Promise<{done?: boolean; value?: T}> | undefined;
46
+
47
+ try {
48
+ // eslint-disable-next-line no-constant-condition
49
+ while (true) {
50
+ const currentBatchPromise = nextBatchPromise || reader.read();
51
+ // Issue a read for an additional batch, while we await the next batch
52
+ // Idea is to make fetching happen in parallel with processing / parsing
53
+ if (options?._streamReadAhead) {
54
+ nextBatchPromise = reader.read();
55
+ }
56
+ // Read from the stream
57
+ // value is a Uint8Array
58
+ const {done, value} = await currentBatchPromise;
59
+ // Exit if we're done
60
+ if (done) {
61
+ return;
62
+ }
63
+ // Else yield the chunk
64
+ if (value) {
65
+ yield value;
66
+ }
67
+ }
68
+ } catch (error) {
69
+ // TODO - examples makes it look like this should always be called,
70
+ // but that generates exceptions so only call it if we do not reach the end
71
+ reader.releaseLock();
72
+ }
73
+ }
74
+
75
+ /**
76
+ * Returns an async iterable that reads from a Node.js stream
77
+ * @param stream stream to iterate from
78
+ * @note Requires Node.js >= 10
79
+ */
80
+ async function* makeNodeStreamIterator<T>(
81
+ stream: Readable,
82
+ options?: StreamIteratorOptions
83
+ ): AsyncIterable<T> {
84
+ // Hacky test for node version to ensure we don't call bad polyfills
85
+ // NODE 10+: stream is an asyncIterator
86
+ yield* stream;
87
+ }