@loaders.gl/parquet 3.0.12 → 3.1.0-alpha.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/dist.min.js +7 -18
- package/dist/dist.min.js.map +1 -1
- package/dist/es5/bundle.js +2 -4
- package/dist/es5/bundle.js.map +1 -1
- package/dist/es5/constants.js +17 -0
- package/dist/es5/constants.js.map +1 -0
- package/dist/es5/index.js +53 -21
- package/dist/es5/index.js.map +1 -1
- package/dist/es5/lib/convert-schema.js +82 -0
- package/dist/es5/lib/convert-schema.js.map +1 -0
- package/dist/es5/lib/parse-parquet.js +173 -0
- package/dist/es5/lib/parse-parquet.js.map +1 -0
- package/dist/es5/lib/read-array-buffer.js +53 -0
- package/dist/es5/lib/read-array-buffer.js.map +1 -0
- package/dist/es5/parquet-loader.js +6 -79
- package/dist/es5/parquet-loader.js.map +1 -1
- package/dist/es5/parquet-writer.js +1 -1
- package/dist/es5/parquet-writer.js.map +1 -1
- package/dist/es5/parquetjs/codecs/dictionary.js +30 -0
- package/dist/es5/parquetjs/codecs/dictionary.js.map +1 -0
- package/dist/es5/parquetjs/codecs/index.js +10 -0
- package/dist/es5/parquetjs/codecs/index.js.map +1 -1
- package/dist/es5/parquetjs/codecs/rle.js +2 -2
- package/dist/es5/parquetjs/codecs/rle.js.map +1 -1
- package/dist/es5/parquetjs/compression.js +138 -104
- package/dist/es5/parquetjs/compression.js.map +1 -1
- package/dist/es5/parquetjs/{writer.js → encoder/writer.js} +397 -228
- package/dist/es5/parquetjs/encoder/writer.js.map +1 -0
- package/dist/es5/parquetjs/parquet-thrift/CompressionCodec.js +1 -0
- package/dist/es5/parquetjs/parquet-thrift/CompressionCodec.js.map +1 -1
- package/dist/es5/parquetjs/parser/decoders.js +495 -0
- package/dist/es5/parquetjs/parser/decoders.js.map +1 -0
- package/dist/es5/parquetjs/parser/parquet-cursor.js +215 -0
- package/dist/es5/parquetjs/parser/parquet-cursor.js.map +1 -0
- package/dist/es5/parquetjs/parser/parquet-envelope-reader.js +452 -0
- package/dist/es5/parquetjs/parser/parquet-envelope-reader.js.map +1 -0
- package/dist/es5/parquetjs/parser/parquet-reader.js +413 -0
- package/dist/es5/parquetjs/parser/parquet-reader.js.map +1 -0
- package/dist/es5/parquetjs/schema/declare.js.map +1 -1
- package/dist/es5/parquetjs/schema/schema.js +2 -0
- package/dist/es5/parquetjs/schema/schema.js.map +1 -1
- package/dist/es5/parquetjs/schema/shred.js +2 -1
- package/dist/es5/parquetjs/schema/shred.js.map +1 -1
- package/dist/es5/parquetjs/schema/types.js +79 -4
- package/dist/es5/parquetjs/schema/types.js.map +1 -1
- package/dist/es5/parquetjs/utils/buffer-utils.js +21 -0
- package/dist/es5/parquetjs/utils/buffer-utils.js.map +1 -0
- package/dist/es5/parquetjs/utils/file-utils.js +108 -0
- package/dist/es5/parquetjs/utils/file-utils.js.map +1 -0
- package/dist/es5/parquetjs/{util.js → utils/read-utils.js} +13 -113
- package/dist/es5/parquetjs/utils/read-utils.js.map +1 -0
- package/dist/esm/bundle.js +2 -4
- package/dist/esm/bundle.js.map +1 -1
- package/dist/esm/constants.js +6 -0
- package/dist/esm/constants.js.map +1 -0
- package/dist/esm/index.js +14 -4
- package/dist/esm/index.js.map +1 -1
- package/dist/esm/lib/convert-schema.js +71 -0
- package/dist/esm/lib/convert-schema.js.map +1 -0
- package/dist/esm/lib/parse-parquet.js +28 -0
- package/dist/esm/lib/parse-parquet.js.map +1 -0
- package/dist/esm/lib/read-array-buffer.js +9 -0
- package/dist/esm/lib/read-array-buffer.js.map +1 -0
- package/dist/esm/parquet-loader.js +4 -24
- package/dist/esm/parquet-loader.js.map +1 -1
- package/dist/esm/parquet-writer.js +1 -1
- package/dist/esm/parquet-writer.js.map +1 -1
- package/dist/esm/parquetjs/codecs/dictionary.js +12 -0
- package/dist/esm/parquetjs/codecs/dictionary.js.map +1 -0
- package/dist/esm/parquetjs/codecs/index.js +9 -0
- package/dist/esm/parquetjs/codecs/index.js.map +1 -1
- package/dist/esm/parquetjs/codecs/rle.js +2 -2
- package/dist/esm/parquetjs/codecs/rle.js.map +1 -1
- package/dist/esm/parquetjs/compression.js +54 -105
- package/dist/esm/parquetjs/compression.js.map +1 -1
- package/dist/esm/parquetjs/{writer.js → encoder/writer.js} +32 -35
- package/dist/esm/parquetjs/encoder/writer.js.map +1 -0
- package/dist/esm/parquetjs/parquet-thrift/CompressionCodec.js +1 -0
- package/dist/esm/parquetjs/parquet-thrift/CompressionCodec.js.map +1 -1
- package/dist/esm/parquetjs/parser/decoders.js +300 -0
- package/dist/esm/parquetjs/parser/decoders.js.map +1 -0
- package/dist/esm/parquetjs/parser/parquet-cursor.js +90 -0
- package/dist/esm/parquetjs/parser/parquet-cursor.js.map +1 -0
- package/dist/esm/parquetjs/parser/parquet-envelope-reader.js +164 -0
- package/dist/esm/parquetjs/parser/parquet-envelope-reader.js.map +1 -0
- package/dist/esm/parquetjs/parser/parquet-reader.js +133 -0
- package/dist/esm/parquetjs/parser/parquet-reader.js.map +1 -0
- package/dist/esm/parquetjs/schema/declare.js.map +1 -1
- package/dist/esm/parquetjs/schema/schema.js +2 -0
- package/dist/esm/parquetjs/schema/schema.js.map +1 -1
- package/dist/esm/parquetjs/schema/shred.js +2 -1
- package/dist/esm/parquetjs/schema/shred.js.map +1 -1
- package/dist/esm/parquetjs/schema/types.js +78 -4
- package/dist/esm/parquetjs/schema/types.js.map +1 -1
- package/dist/esm/parquetjs/utils/buffer-utils.js +12 -0
- package/dist/esm/parquetjs/utils/buffer-utils.js.map +1 -0
- package/dist/esm/parquetjs/utils/file-utils.js +79 -0
- package/dist/esm/parquetjs/utils/file-utils.js.map +1 -0
- package/dist/esm/parquetjs/{util.js → utils/read-utils.js} +11 -89
- package/dist/esm/parquetjs/utils/read-utils.js.map +1 -0
- package/dist/parquet-worker.js +7 -18
- package/dist/parquet-worker.js.map +1 -1
- package/package.json +10 -10
- package/src/bundle.ts +2 -3
- package/src/constants.ts +17 -0
- package/src/index.ts +30 -4
- package/src/lib/convert-schema.ts +95 -0
- package/src/lib/parse-parquet.ts +27 -0
- package/{dist/es5/libs → src/lib}/read-array-buffer.ts +0 -0
- package/src/parquet-loader.ts +4 -24
- package/src/parquetjs/codecs/dictionary.ts +11 -0
- package/src/parquetjs/codecs/index.ts +13 -0
- package/src/parquetjs/codecs/rle.ts +4 -2
- package/src/parquetjs/compression.ts +89 -50
- package/src/parquetjs/{writer.ts → encoder/writer.ts} +46 -45
- package/src/parquetjs/parquet-thrift/CompressionCodec.ts +2 -1
- package/src/parquetjs/parser/decoders.ts +448 -0
- package/src/parquetjs/parser/parquet-cursor.ts +94 -0
- package/src/parquetjs/parser/parquet-envelope-reader.ts +210 -0
- package/src/parquetjs/parser/parquet-reader.ts +179 -0
- package/src/parquetjs/schema/declare.ts +48 -2
- package/src/parquetjs/schema/schema.ts +2 -0
- package/src/parquetjs/schema/shred.ts +3 -1
- package/src/parquetjs/schema/types.ts +82 -5
- package/src/parquetjs/utils/buffer-utils.ts +18 -0
- package/src/parquetjs/utils/file-utils.ts +96 -0
- package/src/parquetjs/{util.ts → utils/read-utils.ts} +13 -110
- package/dist/dist.es5.min.js +0 -51
- package/dist/dist.es5.min.js.map +0 -1
- package/dist/es5/parquetjs/compression.ts.disabled +0 -105
- package/dist/es5/parquetjs/reader.js +0 -1078
- package/dist/es5/parquetjs/reader.js.map +0 -1
- package/dist/es5/parquetjs/util.js.map +0 -1
- package/dist/es5/parquetjs/writer.js.map +0 -1
- package/dist/esm/libs/read-array-buffer.ts +0 -31
- package/dist/esm/parquetjs/compression.ts.disabled +0 -105
- package/dist/esm/parquetjs/reader.js +0 -524
- package/dist/esm/parquetjs/reader.js.map +0 -1
- package/dist/esm/parquetjs/util.js.map +0 -1
- package/dist/esm/parquetjs/writer.js.map +0 -1
- package/src/libs/read-array-buffer.ts +0 -31
- package/src/parquetjs/compression.ts.disabled +0 -105
- package/src/parquetjs/reader.ts +0 -707
@@ -0,0 +1,27 @@
+// import type {LoaderWithParser, Loader, LoaderOptions} from '@loaders.gl/loader-utils';
+import type {ParquetLoaderOptions} from '../parquet-loader';
+
+import {ParquetReader} from '../parquetjs/parser/parquet-reader';
+
+export async function parseParquet(arrayBuffer: ArrayBuffer, options?: ParquetLoaderOptions) {
+  const blob = new Blob([arrayBuffer]);
+  for await (const batch of parseParquetFileInBatches(blob, options)) {
+    return batch;
+  }
+  return null;
+}
+
+export async function* parseParquetFileInBatches(blob: Blob, options?: ParquetLoaderOptions) {
+  const reader = await ParquetReader.openBlob(blob);
+  const rows: any[][] = [];
+  try {
+    const cursor = reader.getCursor();
+    let record: any[] | null;
+    while ((record = await cursor.next())) {
+      rows.push(record);
+    }
+  } finally {
+    await reader.close();
+  }
+  yield rows;
+}
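The new loader path funnels everything through these two functions: `parseParquet` wraps the incoming `ArrayBuffer` in a `Blob` and returns the single batch yielded by `parseParquetFileInBatches`. A minimal usage sketch; the file read and the package-root re-export of `parseParquet` are assumptions, not shown in this diff:

```ts
import {readFile} from 'fs/promises';
// Assumed re-export from the package root; inside the package this lives in './lib/parse-parquet'
import {parseParquet} from '@loaders.gl/parquet';

async function readAllRows(path: string): Promise<any[][] | null> {
  const buffer = await readFile(path); // Node Buffer
  // parseParquet takes a plain ArrayBuffer, so copy the Buffer into a standalone one
  const arrayBuffer = new Uint8Array(buffer).buffer;
  return await parseParquet(arrayBuffer);
}
```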
package/{dist/es5/libs → src/lib}/read-array-buffer.ts
File without changes
package/src/parquet-loader.ts
CHANGED
@@ -1,5 +1,4 @@
-import type {
-import {ParquetReader} from './parquetjs/reader';
+import type {Loader, LoaderOptions} from '@loaders.gl/loader-utils';

 // __VERSION__ is injected by babel-plugin-version-inline
 // @ts-ignore TS2304: Cannot find name '__VERSION__'.
@@ -20,7 +19,7 @@ const DEFAULT_PARQUET_LOADER_OPTIONS: ParquetLoaderOptions = {
 };

 /** ParquetJS table loader */
-export const ParquetWorkerLoader: Loader = {
+export const ParquetLoader = {
   name: 'Apache Parquet',
   id: 'parquet',
   module: 'parquet',
@@ -30,27 +29,8 @@ export const ParquetWorkerLoader: Loader = {
   extensions: ['parquet'],
   mimeTypes: ['application/octet-stream'],
   binary: true,
-
+  tests: ['PAR1', 'PARE'],
   options: DEFAULT_PARQUET_LOADER_OPTIONS
 };

-
-export const ParquetLoader: LoaderWithParser = {
-  ...ParquetWorkerLoader,
-  parse
-};
-
-async function parse(arrayBuffer: ArrayBuffer, options?: ParquetLoaderOptions) {
-  const reader = await ParquetReader.openArrayBuffer(arrayBuffer);
-  const rows: any[][] = [];
-  try {
-    const cursor = reader.getCursor();
-    let record: any[] | null;
-    while ((record = await cursor.next())) {
-      rows.push(record);
-    }
-  } finally {
-    await reader.close();
-  }
-  return rows;
-}
+export const _typecheckParquetLoader: Loader = ParquetLoader;
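With the parse function stripped out of this module, `ParquetLoader` here is now the plain loader descriptor; the parser side lives in `lib/parse-parquet.ts` and is presumably wired back onto the loader in `index.ts` (that hunk is not shown here). A hedged usage sketch through loaders.gl core, with a placeholder URL:

```ts
import {load} from '@loaders.gl/core';
// Assumes index.ts exports a ParquetLoader with the parse function attached
import {ParquetLoader} from '@loaders.gl/parquet';

async function loadParquetTable(url: string) {
  // 'url' might be e.g. 'data.parquet'; resolves to the rows produced by parseParquet
  return await load(url, ParquetLoader);
}
```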
package/src/parquetjs/codecs/dictionary.ts
ADDED

@@ -0,0 +1,11 @@
+import {decodeValues as decodeRleValues} from './rle';
+
+export function decodeValues(type, cursor, count, opts) {
+  opts.bitWidth = cursor.buffer.slice(cursor.offset, cursor.offset + 1).readInt8(0);
+  cursor.offset += 1;
+  return decodeRleValues(type, cursor, count, {...opts, disableEnvelope: true});
+}
+
+export function encodeValues(type, cursor, count, opts) {
+  throw new Error('Encode dictionary functionality is not supported');
+}
package/src/parquetjs/codecs/index.ts
CHANGED

@@ -3,6 +3,7 @@ import type {ParquetCodec} from '../schema/declare';
 import type {ParquetCodecKit} from './declare';
 import * as PLAIN from './plain';
 import * as RLE from './rle';
+import * as DICTIONARY from './dictionary';

 export * from './declare';

@@ -14,5 +15,17 @@ export const PARQUET_CODECS: Record<ParquetCodec, ParquetCodecKit> = {
   RLE: {
     encodeValues: RLE.encodeValues,
     decodeValues: RLE.decodeValues
+  },
+  // Using the PLAIN_DICTIONARY enum value is deprecated in the Parquet 2.0 specification.
+  PLAIN_DICTIONARY: {
+    // @ts-ignore
+    encodeValues: DICTIONARY.encodeValues,
+    decodeValues: DICTIONARY.decodeValues
+  },
+  // Prefer using RLE_DICTIONARY in a data page and PLAIN in a dictionary page for Parquet 2.0+ files.
+  RLE_DICTIONARY: {
+    // @ts-ignore
+    encodeValues: DICTIONARY.encodeValues,
+    decodeValues: DICTIONARY.decodeValues
   }
 };
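Both dictionary entries in the registry resolve to the same codec, which reads the one-byte bit width and then defers to the RLE decoder with `disableEnvelope: true`. For orientation, this is roughly how a page decoder would dispatch through the registry; the helper below and its loose typing are illustrative, not code from the package:

```ts
import {PARQUET_CODECS} from './index'; // the codecs module shown above

// Illustrative dispatch: look up the codec registered for a page's encoding and decode `count` values
function decodePageValues(encoding: string, type: any, cursor: any, count: number, opts: any) {
  const codec = (PARQUET_CODECS as any)[encoding];
  if (!codec) {
    throw new Error(`parquet: unsupported encoding: ${encoding}`);
  }
  // PLAIN_DICTIONARY and RLE_DICTIONARY both land on the dictionary codec added in this release
  return codec.decodeValues(type, cursor, count, opts);
}
```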
package/src/parquetjs/codecs/rle.ts
CHANGED

@@ -138,7 +138,8 @@ function decodeRunRepeated(

   let value = 0;
   for (let i = 0; i < Math.ceil(bitWidth / 8); i++) {
-
+    // eslint-disable-next-line
+    value << 8; // TODO - this looks wrong
     value += cursor.buffer[cursor.offset];
     cursor.offset += 1;
   }
@@ -173,7 +174,8 @@ function encodeRunRepeated(value: number, count: number, opts: ParquetCodecOptio

   for (let i = 0; i < buf.length; i++) {
     buf.writeUInt8(value & 0xff, i);
-
+    // eslint-disable-next-line
+    value >> 8; // TODO - this looks wrong
   }

   return Buffer.concat([Buffer.from(varint.encode(count << 1)), buf]);
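The two statements flagged with `// TODO - this looks wrong` are indeed no-ops: a bare `value << 8` or `value >> 8` computes a result and throws it away, so these loops only behave correctly when the value fits in a single byte. A sketch of what a consistent multi-byte round trip could look like, low byte first to match the order the encode loop already writes bytes in; this is an illustration, not the package's fix:

```ts
// Decode a repeated run value of `byteCount` bytes, low byte first, assigning the result back
function decodeRepeatedValue(buffer: Buffer, offset: number, byteCount: number): number {
  let value = 0;
  for (let i = 0; i < byteCount; i++) {
    value += buffer[offset + i] * 2 ** (8 * i);
  }
  return value;
}

// Encode the same way: emit the current low byte, then shift the remaining bytes down
function encodeRepeatedValue(value: number, byteCount: number): Buffer {
  const buf = Buffer.alloc(byteCount);
  for (let i = 0; i < byteCount; i++) {
    buf.writeUInt8(value & 0xff, i);
    value = Math.floor(value / 256);
  }
  return buf;
}
```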
package/src/parquetjs/compression.ts
CHANGED

@@ -1,57 +1,107 @@
 // Forked from https://github.com/kbajalc/parquets under MIT license (Copyright (c) 2017 ironSource Ltd.)
 /* eslint-disable camelcase */
+// Forked from https://github.com/ironSource/parquetjs under MIT license
+
+import {
+  Compression,
+  NoCompression,
+  GZipCompression,
+  SnappyCompression,
+  BrotliCompression,
+  LZOCompression,
+  LZ4Compression,
+  ZstdCompression
+} from '@loaders.gl/compression';
+
 import {ParquetCompression} from './schema/declare';
-import 
-
-
+import {toArrayBuffer, toBuffer} from './utils/buffer-utils';
+
+// TODO switch to worker compression to avoid bundling...
+
+// import brotli from 'brotli'; - brotli has problems with decompress in browsers
+import brotliDecompress from 'brotli/decompress';
+import lz4js from 'lz4js';
+import lzo from 'lzo';
+import {ZstdCodec} from 'zstd-codec';
+
+// Inject large dependencies through Compression constructor options
+const modules = {
+  // brotli has problems with decompress in browsers
+  brotli: {
+    decompress: brotliDecompress,
+    compress: () => {
+      throw new Error('brotli compress');
+    }
+  },
+  lz4js,
+  lzo,
+  'zstd-codec': ZstdCodec
+};

-
-
-
+// See https://github.com/apache/parquet-format/blob/master/Compression.md
+export const PARQUET_COMPRESSION_METHODS: Record<ParquetCompression, Compression> = {
+  UNCOMPRESSED: new NoCompression(),
+  GZIP: new GZipCompression(),
+  SNAPPY: new SnappyCompression(),
+  BROTLI: new BrotliCompression({modules}),
+  // TODO: Understand difference between LZ4 and LZ4_RAW
+  LZ4: new LZ4Compression({modules}),
+  LZ4_RAW: new LZ4Compression({modules}),
+  LZO: new LZOCompression({modules}),
+  ZSTD: new ZstdCompression({modules})
+};

-
-
-
+/**
+ * Register compressions that have big external libraries
+ * @param options.modules External library dependencies
+ */
+export async function preloadCompressions(options?: {modules: {[key: string]: any}}) {
+  const compressions = Object.values(PARQUET_COMPRESSION_METHODS);
+  return await Promise.all(compressions.map((compression) => compression.preload()));
 }

-
-
-
-
-
-
-
-    inflate: inflate_gzip
-  },
-  SNAPPY: {
-    deflate: deflate_snappy,
-    inflate: inflate_snappy
-  },
-  LZO: {
-    deflate: deflate_lzo,
-    inflate: inflate_lzo
-  },
-  BROTLI: {
-    deflate: deflate_brotli,
-    inflate: inflate_brotli
-  },
-  LZ4: {
-    deflate: deflate_lz4,
-    inflate: inflate_lz4
+/**
+ * Deflate a value using compression method `method`
+ */
+export async function deflate(method: ParquetCompression, value: Buffer): Promise<Buffer> {
+  const compression = PARQUET_COMPRESSION_METHODS[method];
+  if (!compression) {
+    throw new Error(`parquet: invalid compression method: ${method}`);
   }
-
+  const inputArrayBuffer = toArrayBuffer(value);
+  const compressedArrayBuffer = await compression.compress(inputArrayBuffer);
+  return toBuffer(compressedArrayBuffer);
+}

 /**
- *
+ * Inflate a value using compression method `method`
+ */
+export async function decompress(
+  method: ParquetCompression,
+  value: Buffer,
+  size: number
+): Promise<Buffer> {
+  const compression = PARQUET_COMPRESSION_METHODS[method];
+  if (!compression) {
+    throw new Error(`parquet: invalid compression method: ${method}`);
+  }
+  const inputArrayBuffer = toArrayBuffer(value);
+  const compressedArrayBuffer = await compression.decompress(inputArrayBuffer, size);
+  return toBuffer(compressedArrayBuffer);
+}
+
+/*
+ * Inflate a value using compression method `method`
  */
-export function 
+export function inflate(method: ParquetCompression, value: Buffer, size: number): Buffer {
   if (!(method in PARQUET_COMPRESSION_METHODS)) {
     throw new Error(`invalid compression method: ${method}`);
   }
-
-  return PARQUET_COMPRESSION_METHODS[method].
+  // @ts-ignore
+  return PARQUET_COMPRESSION_METHODS[method].inflate(value, size);
 }

+/*
 function deflate_identity(value: Buffer): Buffer {
   return value;
 }
@@ -92,18 +142,6 @@ function deflate_lz4(value: Buffer): Buffer {
     throw err;
   }
 }
-
-/**
- * Inflate a value using compression method `method`
- */
-export function inflate(method: ParquetCompression, value: Buffer, size: number): Buffer {
-  if (!(method in PARQUET_COMPRESSION_METHODS)) {
-    throw new Error(`invalid compression method: ${method}`);
-  }
-
-  return PARQUET_COMPRESSION_METHODS[method].inflate(value, size);
-}
-
 function inflate_identity(value: Buffer): Buffer {
   return value;
 }
@@ -142,3 +180,4 @@ function inflate_brotli(value: Buffer): Buffer {
   }
   return Buffer.from(brotli.decompress(value));
 }
+*/
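The rewritten module swaps the old synchronous inflate/deflate table for `Compression` instances from `@loaders.gl/compression`, so both directions become async and the codecs backed by large libraries want a `preload()` pass first, which is what `preloadCompressions` runs. A usage sketch from inside the package; the buffer contents are placeholders:

```ts
import {preloadCompressions, deflate, decompress} from './compression'; // i.e. parquetjs/compression.ts above

async function snappyRoundTrip(data: Buffer): Promise<Buffer> {
  // Loads the heavyweight codec payloads (brotli, zstd, ...) registered via `modules`
  await preloadCompressions();
  const compressed = await deflate('SNAPPY', data);
  // decompress needs the uncompressed size, which Parquet keeps in the page header
  return await decompress('SNAPPY', compressed, data.length);
}
```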
package/src/parquetjs/{writer.ts → encoder/writer.ts}
CHANGED

@@ -1,17 +1,17 @@
 // Forked from https://github.com/kbajalc/parquets under MIT license (Copyright (c) 2017 ironSource Ltd.)
 /* eslint-disable camelcase */
 import {Transform, Writable} from 'stream';
-import {ParquetCodecOptions, PARQUET_CODECS} from '
-import * as Compression from '
+import {ParquetCodecOptions, PARQUET_CODECS} from '../codecs';
+import * as Compression from '../compression';
 import {
   ParquetBuffer,
   ParquetCodec,
   ParquetData,
   ParquetField,
   PrimitiveType
-} from '
-import {ParquetSchema} from '
-import * as Shred from '
+} from '../schema/declare';
+import {ParquetSchema} from '../schema/schema';
+import * as Shred from '../schema/shred';
 import {
   ColumnChunk,
   ColumnMetaData,
@@ -28,8 +28,9 @@ import {
   RowGroup,
   SchemaElement,
   Type
-} from '
-import 
+} from '../parquet-thrift';
+import {osopen, oswrite, osclose} from '../utils/file-utils';
+import {getBitWidth, serializeThrift} from '../utils/read-utils';
 import Int64 from 'node-int64';

 /**
@@ -85,7 +86,7 @@ export class ParquetWriter<T> {
     path: string,
     opts?: ParquetWriterOptions
   ): Promise<ParquetWriter<T>> {
-    const outputStream = await 
+    const outputStream = await osopen(path, opts);
     return ParquetWriter.openStream(schema, outputStream, opts);
   }

@@ -131,13 +132,16 @@ export class ParquetWriter<T> {
     this.closed = false;
     this.userMetadata = {};

+    // eslint-disable-next-line @typescript-eslint/no-floating-promises
+    this.writeHeader();
+  }
+
+  async writeHeader(): Promise<void> {
     // TODO - better not mess with promises in the constructor
     try {
-
-      envelopeWriter.writeHeader();
+      await this.envelopeWriter.writeHeader();
     } catch (err) {
-
-      envelopeWriter.close();
+      await this.envelopeWriter.close();
       throw err;
     }
   }
@@ -226,8 +230,8 @@ export class ParquetEnvelopeWriter {
     outputStream: Writable,
     opts: ParquetWriterOptions
   ): Promise<ParquetEnvelopeWriter> {
-    const writeFn = 
-    const closeFn = 
+    const writeFn = oswrite.bind(undefined, outputStream);
+    const closeFn = osclose.bind(undefined, outputStream);
     return new ParquetEnvelopeWriter(schema, writeFn, closeFn, 0, opts);
   }

@@ -274,7 +278,7 @@ export class ParquetEnvelopeWriter {
   * shredRecord method
   */
  async writeRowGroup(records: ParquetBuffer): Promise<void> {
-    const rgroup = encodeRowGroup(this.schema, records, {
+    const rgroup = await encodeRowGroup(this.schema, records, {
       baseOffset: this.offset,
       pageSize: this.pageSize,
       useDataPageV2: this.useDataPageV2
@@ -282,7 +286,7 @@ export class ParquetEnvelopeWriter {

     this.rowCount += records.rowCount;
     this.rowGroups.push(rgroup.metadata);
-    return this.writeSection(rgroup.body);
+    return await this.writeSection(rgroup.body);
   }

   /**
@@ -363,19 +367,19 @@ function encodeValues(
 /**
  * Encode a parquet data page
  */
-function encodeDataPage(
+async function encodeDataPage(
   column: ParquetField,
   data: ParquetData
-): {
+): Promise<{
   header: PageHeader;
   headerSize: number;
   page: Buffer;
-} {
+}> {
   /* encode repetition and definition levels */
   let rLevelsBuf = Buffer.alloc(0);
   if (column.rLevelMax > 0) {
     rLevelsBuf = encodeValues(PARQUET_RDLVL_TYPE, PARQUET_RDLVL_ENCODING, data.rlevels, {
-      bitWidth: 
+      bitWidth: getBitWidth(column.rLevelMax)
       // disableEnvelope: false
     });
   }
@@ -383,7 +387,7 @@ function encodeDataPage(
   let dLevelsBuf = Buffer.alloc(0);
   if (column.dLevelMax > 0) {
     dLevelsBuf = encodeValues(PARQUET_RDLVL_TYPE, PARQUET_RDLVL_ENCODING, data.dlevels, {
-      bitWidth: 
+      bitWidth: getBitWidth(column.dLevelMax)
       // disableEnvelope: false
     });
   }
@@ -397,7 +401,7 @@ function encodeDataPage(
   const dataBuf = Buffer.concat([rLevelsBuf, dLevelsBuf, valuesBuf]);

   // compression = column.compression === 'UNCOMPRESSED' ? (compression || 'UNCOMPRESSED') : column.compression;
-  const compressedBuf = Compression.deflate(column.compression!, dataBuf);
+  const compressedBuf = await Compression.deflate(column.compression!, dataBuf);

   /* build page header */
   const header = new PageHeader({
@@ -413,7 +417,7 @@ function encodeDataPage(
   });

   /* concat page header, repetition and definition levels and values */
-  const headerBuf = 
+  const headerBuf = serializeThrift(header);
   const page = Buffer.concat([headerBuf, compressedBuf]);

   return {header, headerSize: headerBuf.length, page};
@@ -422,15 +426,15 @@ function encodeDataPage(
 /**
  * Encode a parquet data page (v2)
  */
-function encodeDataPageV2(
+async function encodeDataPageV2(
   column: ParquetField,
   data: ParquetData,
   rowCount: number
-): {
+): Promise<{
   header: PageHeader;
   headerSize: number;
   page: Buffer;
-} {
+}> {
   /* encode values */
   const valuesBuf = encodeValues(column.primitiveType!, column.encoding!, data.values, {
     typeLength: column.typeLength,
@@ -438,13 +442,13 @@ function encodeDataPageV2(
   });

   // compression = column.compression === 'UNCOMPRESSED' ? (compression || 'UNCOMPRESSED') : column.compression;
-  const compressedBuf = Compression.deflate(column.compression!, valuesBuf);
+  const compressedBuf = await Compression.deflate(column.compression!, valuesBuf);

   /* encode repetition and definition levels */
   let rLevelsBuf = Buffer.alloc(0);
   if (column.rLevelMax > 0) {
     rLevelsBuf = encodeValues(PARQUET_RDLVL_TYPE, PARQUET_RDLVL_ENCODING, data.rlevels, {
-      bitWidth: 
+      bitWidth: getBitWidth(column.rLevelMax),
       disableEnvelope: true
     });
   }
@@ -452,7 +456,7 @@ function encodeDataPageV2(
   let dLevelsBuf = Buffer.alloc(0);
   if (column.dLevelMax > 0) {
     dLevelsBuf = encodeValues(PARQUET_RDLVL_TYPE, PARQUET_RDLVL_ENCODING, data.dlevels, {
-      bitWidth: 
+      bitWidth: getBitWidth(column.dLevelMax),
       disableEnvelope: true
     });
   }
@@ -474,7 +478,7 @@ function encodeDataPageV2(
   });

   /* concat page header, repetition and definition levels and values */
-  const headerBuf = 
+  const headerBuf = serializeThrift(header);
   const page = Buffer.concat([headerBuf, rLevelsBuf, dLevelsBuf, compressedBuf]);
   return {header, headerSize: headerBuf.length, page};
 }
@@ -482,16 +486,16 @@ function encodeDataPageV2(
 /**
  * Encode an array of values into a parquet column chunk
  */
-function encodeColumnChunk(
+async function encodeColumnChunk(
   column: ParquetField,
   buffer: ParquetBuffer,
   offset: number,
   opts: ParquetWriterOptions
-): {
+): Promise<{
   body: Buffer;
   metadata: ColumnMetaData;
   metadataOffset: number;
-} {
+}> {
   const data = buffer.columnData[column.path.join()];
   const baseOffset = (opts.baseOffset || 0) + offset;
   /* encode data page(s) */
@@ -502,12 +506,9 @@ function encodeColumnChunk(
   // tslint:disable-next-line:variable-name
   let total_compressed_size = 0;
   {
-
-
-
-    } else {
-      result = encodeDataPage(column, data);
-    }
+    const result = opts.useDataPageV2
+      ? await encodeDataPageV2(column, data, buffer.rowCount)
+      : await encodeDataPage(column, data);
     // pages.push(result.page);
     pageBuf = result.page;
     total_uncompressed_size += result.header.uncompressed_page_size + result.headerSize;
@@ -535,21 +536,21 @@ function encodeColumnChunk(

   /* concat metadata header and data pages */
   const metadataOffset = baseOffset + pageBuf.length;
-  const body = Buffer.concat([pageBuf, 
+  const body = Buffer.concat([pageBuf, serializeThrift(metadata)]);
   return {body, metadata, metadataOffset};
 }

 /**
  * Encode a list of column values into a parquet row group
  */
-function encodeRowGroup(
+async function encodeRowGroup(
   schema: ParquetSchema,
   data: ParquetBuffer,
   opts: ParquetWriterOptions
-): {
+): Promise<{
   body: Buffer;
   metadata: RowGroup;
-} {
+}> {
   const metadata = new RowGroup({
     num_rows: data.rowCount,
     columns: [],
@@ -562,7 +563,7 @@ function encodeRowGroup(
       continue; // eslint-disable-line no-continue
     }

-    const cchunkData = encodeColumnChunk(field, data, body.length, opts);
+    const cchunkData = await encodeColumnChunk(field, data, body.length, opts);

     const cchunk = new ColumnChunk({
       file_offset: cchunkData.metadataOffset,
@@ -634,7 +635,7 @@ function encodeFooter(
     metadata.schema.push(schemaElem);
   }

-  const metadataEncoded = 
+  const metadataEncoded = serializeThrift(metadata);
   const footerEncoded = Buffer.alloc(metadataEncoded.length + 8);
   metadataEncoded.copy(footerEncoded);
   footerEncoded.writeUInt32LE(metadataEncoded.length, metadataEncoded.length);