@loaders.gl/parquet 4.3.0-alpha.1 → 4.3.0-alpha.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +100 -24
- package/dist/index.cjs.map +4 -4
- package/dist/lib/constants.d.ts +1 -1
- package/dist/lib/constants.d.ts.map +1 -1
- package/dist/lib/constants.js +2 -2
- package/dist/lib/encoders/encode-parquet-wasm.d.ts.map +1 -0
- package/dist/lib/{wasm → encoders}/encode-parquet-wasm.js +1 -1
- package/dist/lib/parsers/parse-parquet-wasm.d.ts +10 -0
- package/dist/lib/parsers/parse-parquet-wasm.d.ts.map +1 -0
- package/dist/lib/parsers/parse-parquet-wasm.js +51 -0
- package/dist/lib/utils/load-wasm.d.ts +3 -0
- package/dist/lib/utils/load-wasm.d.ts.map +1 -0
- package/dist/lib/utils/make-stream-iterator.d.ts +11 -0
- package/dist/lib/utils/make-stream-iterator.d.ts.map +1 -0
- package/dist/lib/utils/make-stream-iterator.js +67 -0
- package/dist/parquet-loader.js +1 -1
- package/dist/parquet-wasm-loader.d.ts +17 -5
- package/dist/parquet-wasm-loader.d.ts.map +1 -1
- package/dist/parquet-wasm-loader.js +19 -4
- package/dist/parquet-wasm-writer.js +1 -1
- package/dist/parquet-writer.js +1 -1
- package/dist/parquet_wasm_bg.wasm +0 -0
- package/dist/parquetjs/codecs/rle.d.ts.map +1 -1
- package/dist/parquetjs/codecs/rle.js +1 -0
- package/package.json +12 -12
- package/src/lib/constants.ts +2 -1
- package/src/lib/{wasm → encoders}/encode-parquet-wasm.ts +1 -1
- package/src/lib/parsers/parse-parquet-wasm.ts +72 -0
- package/src/lib/utils/make-stream-iterator.ts +87 -0
- package/src/parquet-wasm-loader.ts +36 -9
- package/src/parquet-wasm-writer.ts +1 -1
- package/src/parquetjs/codecs/rle.ts +3 -1
- package/dist/arrow1_bg.wasm +0 -0
- package/dist/lib/wasm/encode-parquet-wasm.d.ts.map +0 -1
- package/dist/lib/wasm/load-wasm.d.ts +0 -3
- package/dist/lib/wasm/load-wasm.d.ts.map +0 -1
- package/dist/lib/wasm/parse-parquet-wasm.d.ts +0 -4
- package/dist/lib/wasm/parse-parquet-wasm.d.ts.map +0 -1
- package/dist/lib/wasm/parse-parquet-wasm.js +0 -24
- package/src/lib/wasm/parse-parquet-wasm.ts +0 -33
- package/src/lib/wip/convert-schema-deep.java.disabled +0 -910
- package/src/lib/wip/convert-schema-deep.rs.disabled +0 -976
- /package/dist/lib/{wasm → encoders}/encode-parquet-wasm.d.ts +0 -0
- /package/dist/lib/{wasm → utils}/load-wasm.js +0 -0
- /package/src/lib/{wasm → utils}/load-wasm.ts +0 -0
package/dist/lib/constants.d.ts
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
export declare const VERSION: any;
|
|
2
|
-
export declare const PARQUET_WASM_URL = "https://unpkg.com/parquet-wasm@0.6.
|
|
2
|
+
export declare const PARQUET_WASM_URL = "https://unpkg.com/parquet-wasm@0.6.1/esm/parquet_wasm_bg.wasm";
|
|
3
3
|
/**
|
|
4
4
|
* Parquet File Magic String
|
|
5
5
|
*/
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"constants.d.ts","sourceRoot":"","sources":["../../src/lib/constants.ts"],"names":[],"mappings":"AAMA,eAAO,MAAM,OAAO,KAA8D,CAAC;
|
|
1
|
+
{"version":3,"file":"constants.d.ts","sourceRoot":"","sources":["../../src/lib/constants.ts"],"names":[],"mappings":"AAMA,eAAO,MAAM,OAAO,KAA8D,CAAC;AAEnF,eAAO,MAAM,gBAAgB,kEAAkE,CAAC;AAEhG;;GAEG;AACH,eAAO,MAAM,aAAa,SAAS,CAAC;AACpC,eAAO,MAAM,uBAAuB,SAAS,CAAC;AAE9C;;GAEG;AACH,eAAO,MAAM,eAAe,IAAI,CAAC;AAEjC;;GAEG;AACH,eAAO,MAAM,kBAAkB,UAAU,CAAC;AAC1C,eAAO,MAAM,sBAAsB,QAAQ,CAAC"}
|
package/dist/lib/constants.js
CHANGED
|
@@ -3,8 +3,8 @@
|
|
|
3
3
|
// Copyright (c) vis.gl contributors
|
|
4
4
|
// __VERSION__ is injected by babel-plugin-version-inline
|
|
5
5
|
// @ts-ignore TS2304: Cannot find name '__VERSION__'.
|
|
6
|
-
export const VERSION = typeof "4.
|
|
7
|
-
export const PARQUET_WASM_URL = 'https://unpkg.com/parquet-wasm@0.6.
|
|
6
|
+
export const VERSION = typeof "4.3.0-alpha.1" !== 'undefined' ? "4.3.0-alpha.1" : 'latest';
|
|
7
|
+
export const PARQUET_WASM_URL = 'https://unpkg.com/parquet-wasm@0.6.1/esm/parquet_wasm_bg.wasm';
|
|
8
8
|
/**
|
|
9
9
|
* Parquet File Magic String
|
|
10
10
|
*/
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"encode-parquet-wasm.d.ts","sourceRoot":"","sources":["../../../src/lib/encoders/encode-parquet-wasm.ts"],"names":[],"mappings":"AAIA,OAAO,KAAK,EAAC,UAAU,EAAC,MAAM,mBAAmB,CAAC;AAKlD,OAAO,KAAK,EAAC,oBAAoB,EAAC,qCAAkC;AAEpE;;GAEG;AACH,wBAAsB,MAAM,CAC1B,KAAK,EAAE,UAAU,EACjB,OAAO,EAAE,oBAAoB,GAC5B,OAAO,CAAC,WAAW,CAAC,CAsBtB"}
|
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
// SPDX-License-Identifier: MIT
|
|
3
3
|
// Copyright (c) vis.gl contributors
|
|
4
4
|
import * as arrow from 'apache-arrow';
|
|
5
|
-
import { loadWasm } from "
|
|
5
|
+
import { loadWasm } from "../utils/load-wasm.js";
|
|
6
6
|
/**
|
|
7
7
|
* Encode Arrow arrow.Table to Parquet buffer
|
|
8
8
|
*/
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
import type { ReadableFile } from '@loaders.gl/loader-utils';
|
|
2
|
+
import type { ArrowTable, ArrowTableBatch } from '@loaders.gl/schema';
|
|
3
|
+
import type * as parquetWasm from 'parquet-wasm';
|
|
4
|
+
export declare function parseParquetFileWasm(file: ReadableFile, options?: parquetWasm.ReaderOptions & {
|
|
5
|
+
wasmUrl: string;
|
|
6
|
+
}): Promise<ArrowTable>;
|
|
7
|
+
export declare function parseParquetFileInBatchesWasm(file: ReadableFile, options: parquetWasm.ReaderOptions & {
|
|
8
|
+
wasmUrl: string;
|
|
9
|
+
}): AsyncIterable<ArrowTableBatch>;
|
|
10
|
+
//# sourceMappingURL=parse-parquet-wasm.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"parse-parquet-wasm.d.ts","sourceRoot":"","sources":["../../../src/lib/parsers/parse-parquet-wasm.ts"],"names":[],"mappings":"AAKA,OAAO,KAAK,EAAC,YAAY,EAAC,MAAM,0BAA0B,CAAC;AAC3D,OAAO,KAAK,EAAC,UAAU,EAAE,eAAe,EAAS,MAAM,oBAAoB,CAAC;AAG5E,OAAO,KAAK,KAAK,WAAW,MAAM,cAAc,CAAC;AAMjD,wBAAsB,oBAAoB,CACxC,IAAI,EAAE,YAAY,EAClB,OAAO,CAAC,EAAE,WAAW,CAAC,aAAa,GAAG;IAAC,OAAO,EAAE,MAAM,CAAA;CAAC,GACtD,OAAO,CAAC,UAAU,CAAC,CAsBrB;AAED,wBAAuB,6BAA6B,CAClD,IAAI,EAAE,YAAY,EAClB,OAAO,EAAE,WAAW,CAAC,aAAa,GAAG;IAAC,OAAO,EAAE,MAAM,CAAA;CAAC,GACrD,aAAa,CAAC,eAAe,CAAC,CA0BhC"}
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
// loaders.gl
|
|
2
|
+
// SPDX-License-Identifier: MIT
|
|
3
|
+
// Copyright (c) vis.gl contributors
|
|
4
|
+
import { serializeArrowSchema } from '@loaders.gl/arrow';
|
|
5
|
+
import * as arrow from 'apache-arrow';
|
|
6
|
+
import { loadWasm } from "../utils/load-wasm.js";
|
|
7
|
+
import { makeStreamIterator } from "../utils/make-stream-iterator.js";
|
|
8
|
+
/**
 * Reads a whole Parquet file through parquet-wasm and returns it as an 'arrow-table' shaped table.
 * @param file file to read; a Blob `handle` is opened with `ParquetFile.fromFile`,
 *   anything else is opened with `ParquetFile.fromUrl(file.url)`
 * @param options reader options handed to `parquetFile.read()`; `wasmUrl` is handed to `loadWasm`
 * @returns object with `shape`, the serialized Arrow `schema` and the decoded Arrow table as `data`
 */
export async function parseParquetFileWasm(file, options) {
  const wasm = await loadWasm(options?.wasmUrl);

  // TODO - let's assume fromFile() works on Blobs and not just on File...
  const parquetFile =
    file.handle instanceof Blob
      ? await wasm.ParquetFile.fromFile(file.handle)
      : await wasm.ParquetFile.fromUrl(file.url);

  // The WASM table is serialized to an Arrow IPC stream and re-read with Arrow JS
  const wasmTable = await parquetFile.read(options);
  const data = arrow.tableFromIPC(wasmTable.intoIPCStream());

  return { shape: 'arrow-table', schema: serializeArrowSchema(data.schema), data };
}
|
|
28
|
+
/**
 * Streams a Parquet file through parquet-wasm, yielding 'arrow-table' shaped data batches.
 * @param file file to read; a Blob `handle` is opened with `ParquetFile.fromFile`,
 *   anything else is opened with `ParquetFile.fromUrl(file.url)`
 * @param options reader options handed to `parquetFile.stream()`; `wasmUrl` is handed to `loadWasm`
 * @returns async generator of batches; every batch carries the schema serialized from the first chunk
 */
export async function* parseParquetFileInBatchesWasm(file, options) {
  const wasmUrl = options?.wasmUrl;
  const wasm = await loadWasm(wasmUrl);

  let parquetFile;
  if (file.handle instanceof Blob) {
    // Works on Blobs: https://kylebarron.dev/parquet-wasm/classes/esm_parquet_wasm.ParquetFile.html#fromFile
    parquetFile = await wasm.ParquetFile.fromFile(file.handle);
  } else {
    parquetFile = await wasm.ParquetFile.fromUrl(file.url);
  }

  const stream = await parquetFile.stream(options);

  let schema;
  for await (const table of makeStreamIterator(stream)) {
    // The schema is serialized once and shared by all batches
    schema ||= serializeArrowSchema(table.schema);
    // NOTE(review): assumes each streamed chunk exposes `schema` and `batches` like an Arrow JS Table - confirm.
    // Emit every record batch of the chunk, not just the first one, and report
    // the row count of the batch actually emitted so `length` always matches `data`.
    for (const recordBatch of table.batches) {
      yield {
        batchType: 'data',
        shape: 'arrow-table',
        schema,
        data: recordBatch,
        length: recordBatch.numRows
      };
    }
  }
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"load-wasm.d.ts","sourceRoot":"","sources":["../../../src/lib/utils/load-wasm.ts"],"names":[],"mappings":"AAMA,OAAO,KAAK,WAAW,MAAM,cAAc,CAAC;AAK5C,wBAAsB,QAAQ,CAAC,OAAO,GAAE,MAAyB,+BAUhE"}
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
/// <reference types="node" />
|
|
2
|
+
import type { Readable } from 'stream';
|
|
3
|
+
export type StreamIteratorOptions = {
|
|
4
|
+
_streamReadAhead?: boolean;
|
|
5
|
+
};
|
|
6
|
+
/**
|
|
7
|
+
* Returns an async iterable that reads from a stream (works in both Node.js and browsers)
|
|
8
|
+
* @param stream stream to iterate over
|
|
9
|
+
*/
|
|
10
|
+
export declare function makeStreamIterator<T>(stream: ReadableStream<T> | Readable, options?: StreamIteratorOptions): AsyncIterable<T>;
|
|
11
|
+
//# sourceMappingURL=make-stream-iterator.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"make-stream-iterator.d.ts","sourceRoot":"","sources":["../../../src/lib/utils/make-stream-iterator.ts"],"names":[],"mappings":";AAIA,OAAO,KAAK,EAAC,QAAQ,EAAC,MAAM,QAAQ,CAAC;AAGrC,MAAM,MAAM,qBAAqB,GAAG;IAClC,gBAAgB,CAAC,EAAE,OAAO,CAAC;CAC5B,CAAC;AAEF;;;GAGG;AACH,wBAAgB,kBAAkB,CAAC,CAAC,EAClC,MAAM,EAAE,cAAc,CAAC,CAAC,CAAC,GAAG,QAAQ,EACpC,OAAO,CAAC,EAAE,qBAAqB,GAC9B,aAAa,CAAC,CAAC,CAAC,CAIlB"}
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
// loaders.gl
|
|
2
|
+
// SPDX-License-Identifier: MIT
|
|
3
|
+
// Copyright (c) vis.gl contributors
|
|
4
|
+
import { isBrowser } from '@loaders.gl/loader-utils';
|
|
5
|
+
/**
 * Wraps a stream in an async iterable, choosing the browser or the Node.js
 * implementation from the `isBrowser` flag.
 * @param stream stream to iterate over
 * @param options forwarded unchanged to the selected implementation
 * @returns async iterable over the chunks of the stream
 */
export function makeStreamIterator(stream, options) {
  if (isBrowser) {
    return makeBrowserStreamIterator(stream, options);
  }
  return makeNodeStreamIterator(stream, options);
}
|
|
14
|
+
/**
 * Returns an async iterable that reads from a DOM (browser) stream
 * @param stream stream to iterate from
 * @param options when `_streamReadAhead` is truthy, the next read is issued before the current one is awaited
 * @returns async generator yielding every truthy chunk read from the stream
 * @throws rethrows any error raised while reading, after releasing the reader lock
 * @see https://jakearchibald.com/2017/async-iterators-and-generators/#making-streams-iterate
 */
async function* makeBrowserStreamIterator(stream, options) {
  // In the browser, we first need to get a lock on the stream
  const reader = stream.getReader();
  let nextBatchPromise;
  try {
    // eslint-disable-next-line no-constant-condition
    while (true) {
      const currentBatchPromise = nextBatchPromise || reader.read();
      // Issue a read for an additional batch, while we await the next batch
      // Idea is to make fetching happen in parallel with processing / parsing
      if (options?._streamReadAhead) {
        nextBatchPromise = reader.read();
      }
      const { done, value } = await currentBatchPromise;
      // Exit if we're done
      if (done) {
        return;
      }
      // Else yield the chunk
      if (value) {
        yield value;
      }
    }
  } catch (error) {
    // A read-ahead request may still be in flight; mark its rejection as handled
    nextBatchPromise?.catch(() => {});
    // The lock is only released on failure (releasing after a normal end was reported to throw)
    try {
      reader.releaseLock();
    } catch {
      // Releasing can itself throw while a read is pending; the read error below matters more
    }
    // Propagate the failure instead of silently ending the iteration with truncated data
    throw error;
  }
}
|
|
58
|
+
/**
 * Returns an async iterable that reads from a Node.js stream
 * @param stream stream to iterate from; must itself be async-iterable
 * @param options accepted for signature parity with the browser variant; not used here
 * @returns async generator re-yielding every chunk produced by the stream
 * @note Requires Node.js >= 10
 */
async function* makeNodeStreamIterator(stream, options) {
  // NODE 10+: the stream is an async iterable, so it can be consumed directly
  for await (const chunk of stream) {
    yield chunk;
  }
}
|
package/dist/parquet-loader.js
CHANGED
|
@@ -8,7 +8,7 @@ import { parseParquetFileInColumns, parseParquetFileInColumnarBatches } from "./
|
|
|
8
8
|
export { Buffer } from "./polyfills/buffer/install-buffer-polyfill.js";
|
|
9
9
|
// __VERSION__ is injected by babel-plugin-version-inline
|
|
10
10
|
// @ts-ignore TS2304: Cannot find name '__VERSION__'.
|
|
11
|
-
const VERSION = typeof "4.
|
|
11
|
+
const VERSION = typeof "4.3.0-alpha.1" !== 'undefined' ? "4.3.0-alpha.1" : 'latest';
|
|
12
12
|
/**
|
|
13
13
|
* ParquetJS table loader
|
|
14
14
|
*/
|
|
@@ -1,16 +1,22 @@
|
|
|
1
|
+
import type { ArrowTable, ArrowTableBatch } from '@loaders.gl/schema';
|
|
1
2
|
import type { LoaderOptions } from '@loaders.gl/loader-utils';
|
|
2
|
-
import type { ArrowTable } from '@loaders.gl/arrow';
|
|
3
3
|
/** Parquet WASM loader options */
|
|
4
4
|
export type ParquetWasmLoaderOptions = LoaderOptions & {
|
|
5
5
|
parquet?: {
|
|
6
|
-
|
|
6
|
+
shape?: 'arrow-table';
|
|
7
|
+
limit?: number;
|
|
8
|
+
offset?: number;
|
|
9
|
+
batchSize?: number;
|
|
10
|
+
columns?: string[];
|
|
11
|
+
rowGroups?: number[];
|
|
12
|
+
concurrency?: number;
|
|
7
13
|
wasmUrl?: string;
|
|
8
14
|
};
|
|
9
15
|
};
|
|
10
16
|
/** Parquet WASM table loader */
|
|
11
17
|
export declare const ParquetWasmWorkerLoader: {
|
|
12
18
|
readonly dataType: ArrowTable;
|
|
13
|
-
readonly batchType:
|
|
19
|
+
readonly batchType: ArrowTableBatch;
|
|
14
20
|
readonly name: "Apache Parquet";
|
|
15
21
|
readonly id: "parquet-wasm";
|
|
16
22
|
readonly module: "parquet";
|
|
@@ -23,8 +29,14 @@ export declare const ParquetWasmWorkerLoader: {
|
|
|
23
29
|
readonly tests: ["PAR1", "PARE"];
|
|
24
30
|
readonly options: {
|
|
25
31
|
readonly parquet: {
|
|
26
|
-
readonly
|
|
27
|
-
readonly
|
|
32
|
+
readonly shape: "arrow-table";
|
|
33
|
+
readonly limit: undefined;
|
|
34
|
+
readonly offset: 0;
|
|
35
|
+
readonly batchSize: undefined;
|
|
36
|
+
readonly columns: undefined;
|
|
37
|
+
readonly rowGroups: undefined;
|
|
38
|
+
readonly concurrency: undefined;
|
|
39
|
+
readonly wasmUrl: "https://unpkg.com/parquet-wasm@0.6.1/esm/parquet_wasm_bg.wasm";
|
|
28
40
|
};
|
|
29
41
|
};
|
|
30
42
|
};
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"parquet-wasm-loader.d.ts","sourceRoot":"","sources":["../src/parquet-wasm-loader.ts"],"names":[],"mappings":"AAIA,OAAO,KAAK,
|
|
1
|
+
{"version":3,"file":"parquet-wasm-loader.d.ts","sourceRoot":"","sources":["../src/parquet-wasm-loader.ts"],"names":[],"mappings":"AAIA,OAAO,KAAK,EAAC,UAAU,EAAE,eAAe,EAAC,MAAM,oBAAoB,CAAC;AACpE,OAAO,KAAK,EAA2B,aAAa,EAAC,MAAM,0BAA0B,CAAC;AAStF,kCAAkC;AAClC,MAAM,MAAM,wBAAwB,GAAG,aAAa,GAAG;IACrD,OAAO,CAAC,EAAE;QACR,KAAK,CAAC,EAAE,aAAa,CAAC;QACtB,KAAK,CAAC,EAAE,MAAM,CAAC;QACf,MAAM,CAAC,EAAE,MAAM,CAAC;QAChB,SAAS,CAAC,EAAE,MAAM,CAAC;QACnB,OAAO,CAAC,EAAE,MAAM,EAAE,CAAC;QACnB,SAAS,CAAC,EAAE,MAAM,EAAE,CAAC;QACrB,WAAW,CAAC,EAAE,MAAM,CAAC;QACrB,OAAO,CAAC,EAAE,MAAM,CAAC;KAClB,CAAC;CACH,CAAC;AAEF,gCAAgC;AAChC,eAAO,MAAM,uBAAuB;;;;;;;;;;;;;;;;;;;;;;;;;CA0B8C,CAAC;AAEnF,gCAAgC;AAChC,eAAO,MAAM,iBAAiB,KAiB8D,CAAC"}
|
|
@@ -1,7 +1,8 @@
|
|
|
1
1
|
// loaders.gl
|
|
2
2
|
// SPDX-License-Identifier: MIT
|
|
3
3
|
// Copyright (c) vis.gl contributors
|
|
4
|
-
import {
|
|
4
|
+
import { BlobFile } from '@loaders.gl/loader-utils';
|
|
5
|
+
import { parseParquetFileWasm, parseParquetFileInBatchesWasm } from "./lib/parsers/parse-parquet-wasm.js";
|
|
5
6
|
import { VERSION, PARQUET_WASM_URL } from "./lib/constants.js";
|
|
6
7
|
/** Parquet WASM table loader */
|
|
7
8
|
export const ParquetWasmWorkerLoader = {
|
|
@@ -19,7 +20,13 @@ export const ParquetWasmWorkerLoader = {
|
|
|
19
20
|
tests: ['PAR1', 'PARE'],
|
|
20
21
|
options: {
|
|
21
22
|
parquet: {
|
|
22
|
-
|
|
23
|
+
shape: 'arrow-table',
|
|
24
|
+
limit: undefined, // Provide a limit to the number of rows to be read.
|
|
25
|
+
offset: 0, // Provide an offset to skip over the given number of rows.
|
|
26
|
+
batchSize: undefined, // The number of rows in each batch. If not provided, the upstream parquet default is 1024.
|
|
27
|
+
columns: undefined, // The column names from the file to read.
|
|
28
|
+
rowGroups: undefined, // Only read data from the provided row group indexes.
|
|
29
|
+
concurrency: undefined, // The number of concurrent requests to make
|
|
23
30
|
wasmUrl: PARQUET_WASM_URL
|
|
24
31
|
}
|
|
25
32
|
}
|
|
@@ -28,7 +35,15 @@ export const ParquetWasmWorkerLoader = {
|
|
|
28
35
|
export const ParquetWasmLoader = {
|
|
29
36
|
...ParquetWasmWorkerLoader,
|
|
30
37
|
parse(arrayBuffer, options) {
|
|
31
|
-
|
|
32
|
-
return
|
|
38
|
+
const wasmOptions = { ...ParquetWasmLoader.options.parquet, ...options?.parquet };
|
|
39
|
+
return parseParquetFileWasm(new BlobFile(arrayBuffer), wasmOptions);
|
|
40
|
+
},
|
|
41
|
+
parseFile(file, options) {
|
|
42
|
+
const wasmOptions = { ...ParquetWasmLoader.options.parquet, ...options?.parquet };
|
|
43
|
+
return parseParquetFileWasm(file, wasmOptions);
|
|
44
|
+
},
|
|
45
|
+
parseFileInBatches(file, options) {
|
|
46
|
+
const wasmOptions = { ...ParquetWasmLoader.options.parquet, ...options?.parquet };
|
|
47
|
+
return parseParquetFileInBatchesWasm(file, wasmOptions);
|
|
33
48
|
}
|
|
34
49
|
};
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
// loaders.gl
|
|
2
2
|
// SPDX-License-Identifier: MIT
|
|
3
3
|
// Copyright (c) vis.gl contributors
|
|
4
|
-
import { encode } from "./lib/
|
|
4
|
+
import { encode } from "./lib/encoders/encode-parquet-wasm.js";
|
|
5
5
|
import { VERSION, PARQUET_WASM_URL } from "./lib/constants.js";
|
|
6
6
|
/** Parquet WASM writer */
|
|
7
7
|
export const ParquetWasmWriter = {
|
package/dist/parquet-writer.js
CHANGED
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
// Copyright (c) vis.gl contributors
|
|
4
4
|
// __VERSION__ is injected by babel-plugin-version-inline
|
|
5
5
|
// @ts-ignore TS2304: Cannot find name '__VERSION__'.
|
|
6
|
-
const VERSION = typeof "4.
|
|
6
|
+
const VERSION = typeof "4.3.0-alpha.1" !== 'undefined' ? "4.3.0-alpha.1" : 'latest';
|
|
7
7
|
export const ParquetWriter = {
|
|
8
8
|
name: 'Apache Parquet',
|
|
9
9
|
id: 'parquet',
|
|
Binary file
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"rle.d.ts","sourceRoot":"","sources":["../../../src/parquetjs/codecs/rle.ts"],"names":[],"mappings":";AAEA,OAAO,KAAK,EAAC,aAAa,EAAC,6BAA0B;AACrD,OAAO,KAAK,EAAC,YAAY,EAAE,mBAAmB,EAAC,qBAAkB;AAIjE,wBAAgB,YAAY,CAC1B,IAAI,EAAE,aAAa,EACnB,MAAM,EAAE,GAAG,EAAE,EACb,IAAI,EAAE,mBAAmB,GACxB,MAAM,CA4DR;AAED,wBAAgB,YAAY,CAC1B,IAAI,EAAE,aAAa,EACnB,MAAM,EAAE,YAAY,EACpB,KAAK,EAAE,MAAM,EACb,IAAI,EAAE,mBAAmB,GACxB,MAAM,EAAE,
|
|
1
|
+
{"version":3,"file":"rle.d.ts","sourceRoot":"","sources":["../../../src/parquetjs/codecs/rle.ts"],"names":[],"mappings":";AAEA,OAAO,KAAK,EAAC,aAAa,EAAC,6BAA0B;AACrD,OAAO,KAAK,EAAC,YAAY,EAAE,mBAAmB,EAAC,qBAAkB;AAIjE,wBAAgB,YAAY,CAC1B,IAAI,EAAE,aAAa,EACnB,MAAM,EAAE,GAAG,EAAE,EACb,IAAI,EAAE,mBAAmB,GACxB,MAAM,CA4DR;AAED,wBAAgB,YAAY,CAC1B,IAAI,EAAE,aAAa,EACnB,MAAM,EAAE,YAAY,EACpB,KAAK,EAAE,MAAM,EACb,IAAI,EAAE,mBAAmB,GACxB,MAAM,EAAE,CAkCV"}
|
|
@@ -76,6 +76,7 @@ export function decodeValues(type, cursor, count, opts) {
|
|
|
76
76
|
const count = header >> 1;
|
|
77
77
|
decodedValues = decodeRunRepeated(cursor, count, opts);
|
|
78
78
|
}
|
|
79
|
+
// strange failure in docusaurus / webpack if we don't cast the type here
|
|
79
80
|
for (const value of decodedValues) {
|
|
80
81
|
values.push(value);
|
|
81
82
|
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@loaders.gl/parquet",
|
|
3
|
-
"version": "4.3.0-alpha.
|
|
3
|
+
"version": "4.3.0-alpha.2",
|
|
4
4
|
"description": "Framework-independent loader for Apache Parquet files",
|
|
5
5
|
"license": "MIT",
|
|
6
6
|
"type": "module",
|
|
@@ -43,7 +43,7 @@
|
|
|
43
43
|
"build-bundle": "ocular-bundle ./bundle.ts --output=dist/dist.min.js",
|
|
44
44
|
"build-bundle-dev": "ocular-bundle ./bundle.ts --env=dev --output=dist/dist.dev.js",
|
|
45
45
|
"build-worker": "esbuild src/workers/parquet-worker.ts --outfile=dist/parquet-worker.js --bundle --minify --sourcemap --external:{util,fs,path,crypto,events} --define:__VERSION__=\\\"$npm_package_version\\\"",
|
|
46
|
-
"copy-wasm": "cp ../../node_modules/parquet-wasm/
|
|
46
|
+
"copy-wasm": "cp ../../node_modules/parquet-wasm/esm/parquet_wasm_bg.wasm dist/parquet_wasm_bg.wasm"
|
|
47
47
|
},
|
|
48
48
|
"browser": {
|
|
49
49
|
"./dist/polyfills/buffer/buffer-polyfill.node.js": "./dist/polyfills/buffer/buffer-polyfill.browser.js",
|
|
@@ -60,13 +60,13 @@
|
|
|
60
60
|
"base64-js and ieee754 are used by buffer polyfill"
|
|
61
61
|
],
|
|
62
62
|
"dependencies": {
|
|
63
|
-
"@loaders.gl/arrow": "4.3.0-alpha.
|
|
64
|
-
"@loaders.gl/bson": "4.3.0-alpha.
|
|
65
|
-
"@loaders.gl/compression": "4.3.0-alpha.
|
|
66
|
-
"@loaders.gl/gis": "4.3.0-alpha.
|
|
67
|
-
"@loaders.gl/loader-utils": "4.3.0-alpha.
|
|
68
|
-
"@loaders.gl/schema": "4.3.0-alpha.
|
|
69
|
-
"@loaders.gl/wkt": "4.3.0-alpha.
|
|
63
|
+
"@loaders.gl/arrow": "4.3.0-alpha.2",
|
|
64
|
+
"@loaders.gl/bson": "4.3.0-alpha.2",
|
|
65
|
+
"@loaders.gl/compression": "4.3.0-alpha.2",
|
|
66
|
+
"@loaders.gl/gis": "4.3.0-alpha.2",
|
|
67
|
+
"@loaders.gl/loader-utils": "4.3.0-alpha.2",
|
|
68
|
+
"@loaders.gl/schema": "4.3.0-alpha.2",
|
|
69
|
+
"@loaders.gl/wkt": "4.3.0-alpha.2",
|
|
70
70
|
"@probe.gl/log": "^4.0.9",
|
|
71
71
|
"async-mutex": "^0.2.2",
|
|
72
72
|
"base64-js": "^1.3.1",
|
|
@@ -76,11 +76,11 @@
|
|
|
76
76
|
"lz4js": "^0.2.0",
|
|
77
77
|
"node-int64": "^0.4.0",
|
|
78
78
|
"object-stream": "0.0.1",
|
|
79
|
-
"parquet-wasm": "^0.6.
|
|
79
|
+
"parquet-wasm": "^0.6.1",
|
|
80
80
|
"snappyjs": "^0.6.0",
|
|
81
81
|
"thrift": "^0.19.0",
|
|
82
82
|
"util": "^0.12.5",
|
|
83
|
-
"varint": "^
|
|
83
|
+
"varint": "^6.0.0",
|
|
84
84
|
"zstd-codec": "^0.1"
|
|
85
85
|
},
|
|
86
86
|
"devDependencies": {
|
|
@@ -94,5 +94,5 @@
|
|
|
94
94
|
"@loaders.gl/core": "^4.0.0",
|
|
95
95
|
"apache-arrow": ">= 15.0.0"
|
|
96
96
|
},
|
|
97
|
-
"gitHead": "
|
|
97
|
+
"gitHead": "77a3cb538ab7a1fbf74245f25590210451689f5c"
|
|
98
98
|
}
|
package/src/lib/constants.ts
CHANGED
|
@@ -5,7 +5,8 @@
|
|
|
5
5
|
// __VERSION__ is injected by babel-plugin-version-inline
|
|
6
6
|
// @ts-ignore TS2304: Cannot find name '__VERSION__'.
|
|
7
7
|
export const VERSION = typeof __VERSION__ !== 'undefined' ? __VERSION__ : 'latest';
|
|
8
|
-
|
|
8
|
+
|
|
9
|
+
export const PARQUET_WASM_URL = 'https://unpkg.com/parquet-wasm@0.6.1/esm/parquet_wasm_bg.wasm';
|
|
9
10
|
|
|
10
11
|
/**
|
|
11
12
|
* Parquet File Magic String
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
// loaders.gl
|
|
2
|
+
// SPDX-License-Identifier: MIT
|
|
3
|
+
// Copyright (c) vis.gl contributors
|
|
4
|
+
|
|
5
|
+
// eslint-disable
|
|
6
|
+
import type {ReadableFile} from '@loaders.gl/loader-utils';
|
|
7
|
+
import type {ArrowTable, ArrowTableBatch, Schema} from '@loaders.gl/schema';
|
|
8
|
+
import {serializeArrowSchema} from '@loaders.gl/arrow';
|
|
9
|
+
|
|
10
|
+
import type * as parquetWasm from 'parquet-wasm';
|
|
11
|
+
import * as arrow from 'apache-arrow';
|
|
12
|
+
|
|
13
|
+
import {loadWasm} from '../utils/load-wasm';
|
|
14
|
+
import {makeStreamIterator} from '../utils/make-stream-iterator';
|
|
15
|
+
|
|
16
|
+
/**
 * Reads a whole Parquet file through parquet-wasm and returns it as an 'arrow-table' shaped table.
 * @param file file to read; a Blob `handle` is opened with `ParquetFile.fromFile`,
 *   anything else is opened with `ParquetFile.fromUrl(file.url)`
 * @param options reader options handed to `parquetFile.read()`; `wasmUrl` is handed to `loadWasm`
 * @returns object with `shape`, the serialized Arrow `schema` and the decoded Arrow table as `data`
 */
export async function parseParquetFileWasm(
  file: ReadableFile,
  options?: parquetWasm.ReaderOptions & {wasmUrl: string}
): Promise<ArrowTable> {
  const wasm = await loadWasm(options?.wasmUrl);

  // TODO - let's assume fromFile() works on Blobs and not just on File...
  const parquetFile: parquetWasm.ParquetFile =
    file.handle instanceof Blob
      ? await wasm.ParquetFile.fromFile(file.handle as File)
      : await wasm.ParquetFile.fromUrl(file.url);

  // The WASM table is serialized to an Arrow IPC stream and re-read with Arrow JS
  const wasmTable = await parquetFile.read(options);
  const data = arrow.tableFromIPC(wasmTable.intoIPCStream());

  return {shape: 'arrow-table', schema: serializeArrowSchema(data.schema), data};
}
|
|
42
|
+
|
|
43
|
+
/**
 * Streams a Parquet file through parquet-wasm, yielding 'arrow-table' shaped data batches.
 * @param file file to read; a Blob `handle` is opened with `ParquetFile.fromFile`,
 *   anything else is opened with `ParquetFile.fromUrl(file.url)`
 * @param options reader options handed to `parquetFile.stream()`; `wasmUrl` is handed to `loadWasm`
 * @returns async generator of batches; every batch carries the schema serialized from the first chunk
 */
export async function* parseParquetFileInBatchesWasm(
  file: ReadableFile,
  options: parquetWasm.ReaderOptions & {wasmUrl: string}
): AsyncIterable<ArrowTableBatch> {
  const wasmUrl = options?.wasmUrl;

  const wasm = await loadWasm(wasmUrl);

  let parquetFile: parquetWasm.ParquetFile;
  if (file.handle instanceof Blob) {
    // Works on Blobs: https://kylebarron.dev/parquet-wasm/classes/esm_parquet_wasm.ParquetFile.html#fromFile
    parquetFile = await wasm.ParquetFile.fromFile(file.handle as File);
  } else {
    parquetFile = await wasm.ParquetFile.fromUrl(file.url);
  }

  const stream: ReadableStream<arrow.Table> = await parquetFile.stream(options);

  let schema: Schema | undefined;
  for await (const table of makeStreamIterator(stream)) {
    // The schema is serialized once and shared by all batches
    schema ||= serializeArrowSchema(table.schema);
    // NOTE(review): assumes each streamed chunk exposes `schema` and `batches` like an Arrow JS Table - confirm.
    // Emit every record batch of the chunk, not just the first one, and report
    // the row count of the batch actually emitted so `length` always matches `data`.
    for (const recordBatch of table.batches) {
      yield {
        batchType: 'data',
        shape: 'arrow-table',
        schema,
        data: recordBatch,
        length: recordBatch.numRows
      };
    }
  }
}
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
// loaders.gl
|
|
2
|
+
// SPDX-License-Identifier: MIT
|
|
3
|
+
// Copyright (c) vis.gl contributors
|
|
4
|
+
|
|
5
|
+
import type {Readable} from 'stream';
|
|
6
|
+
import {isBrowser} from '@loaders.gl/loader-utils';
|
|
7
|
+
|
|
8
|
+
/** Options controlling how a stream is iterated */
export type StreamIteratorOptions = {
  /** When truthy, the browser iterator issues the next read before the current one has been awaited */
  _streamReadAhead?: boolean;
};

/**
 * Wraps a stream in an async iterable, choosing the browser or the Node.js
 * implementation from the `isBrowser` flag.
 * @param stream stream to iterate over
 * @param options forwarded unchanged to the selected implementation
 * @returns async iterable over the chunks of the stream
 */
export function makeStreamIterator<T>(
  stream: ReadableStream<T> | Readable,
  options?: StreamIteratorOptions
): AsyncIterable<T> {
  if (isBrowser) {
    return makeBrowserStreamIterator(stream as ReadableStream<T>, options);
  }
  return makeNodeStreamIterator(stream as Readable, options);
}
|
|
24
|
+
|
|
25
|
+
/**
 * Returns an async iterable that reads from a DOM (browser) stream
 * @param stream stream to iterate from
 * @param options when `_streamReadAhead` is truthy, the next read is issued before the current one is awaited
 * @returns async generator yielding every truthy chunk read from the stream
 * @throws rethrows any error raised while reading, after releasing the reader lock
 * @see https://jakearchibald.com/2017/async-iterators-and-generators/#making-streams-iterate
 */
async function* makeBrowserStreamIterator<T>(
  stream: ReadableStream<T>,
  options?: StreamIteratorOptions
): AsyncIterable<T> {
  // In the browser, we first need to get a lock on the stream
  const reader = stream.getReader();

  let nextBatchPromise: Promise<{done?: boolean; value?: T}> | undefined;

  try {
    // eslint-disable-next-line no-constant-condition
    while (true) {
      const currentBatchPromise = nextBatchPromise || reader.read();
      // Issue a read for an additional batch, while we await the next batch
      // Idea is to make fetching happen in parallel with processing / parsing
      if (options?._streamReadAhead) {
        nextBatchPromise = reader.read();
      }
      const {done, value} = await currentBatchPromise;
      // Exit if we're done
      if (done) {
        return;
      }
      // Else yield the chunk
      if (value) {
        yield value;
      }
    }
  } catch (error) {
    // A read-ahead request may still be in flight; mark its rejection as handled
    nextBatchPromise?.catch(() => {});
    // The lock is only released on failure (releasing after a normal end was reported to throw)
    try {
      reader.releaseLock();
    } catch {
      // Releasing can itself throw while a read is pending; the read error below matters more
    }
    // Propagate the failure instead of silently ending the iteration with truncated data
    throw error;
  }
}
|
|
74
|
+
|
|
75
|
+
/**
 * Returns an async iterable that reads from a Node.js stream
 * @param stream stream to iterate from; must itself be async-iterable
 * @param options accepted for signature parity with the browser variant; not used here
 * @returns async generator re-yielding every chunk produced by the stream
 * @note Requires Node.js >= 10
 */
async function* makeNodeStreamIterator<T>(
  stream: Readable,
  options?: StreamIteratorOptions
): AsyncIterable<T> {
  // NODE 10+: the stream is an async iterable, so it can be consumed directly
  for await (const chunk of stream) {
    yield chunk;
  }
}
|