@loaders.gl/parquet 3.0.12 → 3.1.0-alpha.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/dist.min.js +7 -18
- package/dist/dist.min.js.map +1 -1
- package/dist/es5/bundle.js +2 -4
- package/dist/es5/bundle.js.map +1 -1
- package/dist/es5/constants.js +17 -0
- package/dist/es5/constants.js.map +1 -0
- package/dist/es5/index.js +53 -21
- package/dist/es5/index.js.map +1 -1
- package/dist/es5/lib/convert-schema.js +82 -0
- package/dist/es5/lib/convert-schema.js.map +1 -0
- package/dist/es5/lib/parse-parquet.js +173 -0
- package/dist/es5/lib/parse-parquet.js.map +1 -0
- package/dist/es5/lib/read-array-buffer.js +53 -0
- package/dist/es5/lib/read-array-buffer.js.map +1 -0
- package/dist/es5/parquet-loader.js +6 -79
- package/dist/es5/parquet-loader.js.map +1 -1
- package/dist/es5/parquet-writer.js +1 -1
- package/dist/es5/parquet-writer.js.map +1 -1
- package/dist/es5/parquetjs/codecs/dictionary.js +30 -0
- package/dist/es5/parquetjs/codecs/dictionary.js.map +1 -0
- package/dist/es5/parquetjs/codecs/index.js +10 -0
- package/dist/es5/parquetjs/codecs/index.js.map +1 -1
- package/dist/es5/parquetjs/codecs/rle.js +2 -2
- package/dist/es5/parquetjs/codecs/rle.js.map +1 -1
- package/dist/es5/parquetjs/compression.js +138 -104
- package/dist/es5/parquetjs/compression.js.map +1 -1
- package/dist/es5/parquetjs/{writer.js → encoder/writer.js} +397 -228
- package/dist/es5/parquetjs/encoder/writer.js.map +1 -0
- package/dist/es5/parquetjs/parquet-thrift/CompressionCodec.js +1 -0
- package/dist/es5/parquetjs/parquet-thrift/CompressionCodec.js.map +1 -1
- package/dist/es5/parquetjs/parser/decoders.js +495 -0
- package/dist/es5/parquetjs/parser/decoders.js.map +1 -0
- package/dist/es5/parquetjs/parser/parquet-cursor.js +215 -0
- package/dist/es5/parquetjs/parser/parquet-cursor.js.map +1 -0
- package/dist/es5/parquetjs/parser/parquet-envelope-reader.js +452 -0
- package/dist/es5/parquetjs/parser/parquet-envelope-reader.js.map +1 -0
- package/dist/es5/parquetjs/parser/parquet-reader.js +413 -0
- package/dist/es5/parquetjs/parser/parquet-reader.js.map +1 -0
- package/dist/es5/parquetjs/schema/declare.js.map +1 -1
- package/dist/es5/parquetjs/schema/schema.js +2 -0
- package/dist/es5/parquetjs/schema/schema.js.map +1 -1
- package/dist/es5/parquetjs/schema/shred.js +2 -1
- package/dist/es5/parquetjs/schema/shred.js.map +1 -1
- package/dist/es5/parquetjs/schema/types.js +79 -4
- package/dist/es5/parquetjs/schema/types.js.map +1 -1
- package/dist/es5/parquetjs/utils/buffer-utils.js +21 -0
- package/dist/es5/parquetjs/utils/buffer-utils.js.map +1 -0
- package/dist/es5/parquetjs/utils/file-utils.js +108 -0
- package/dist/es5/parquetjs/utils/file-utils.js.map +1 -0
- package/dist/es5/parquetjs/{util.js → utils/read-utils.js} +13 -113
- package/dist/es5/parquetjs/utils/read-utils.js.map +1 -0
- package/dist/esm/bundle.js +2 -4
- package/dist/esm/bundle.js.map +1 -1
- package/dist/esm/constants.js +6 -0
- package/dist/esm/constants.js.map +1 -0
- package/dist/esm/index.js +14 -4
- package/dist/esm/index.js.map +1 -1
- package/dist/esm/lib/convert-schema.js +71 -0
- package/dist/esm/lib/convert-schema.js.map +1 -0
- package/dist/esm/lib/parse-parquet.js +28 -0
- package/dist/esm/lib/parse-parquet.js.map +1 -0
- package/dist/esm/lib/read-array-buffer.js +9 -0
- package/dist/esm/lib/read-array-buffer.js.map +1 -0
- package/dist/esm/parquet-loader.js +4 -24
- package/dist/esm/parquet-loader.js.map +1 -1
- package/dist/esm/parquet-writer.js +1 -1
- package/dist/esm/parquet-writer.js.map +1 -1
- package/dist/esm/parquetjs/codecs/dictionary.js +12 -0
- package/dist/esm/parquetjs/codecs/dictionary.js.map +1 -0
- package/dist/esm/parquetjs/codecs/index.js +9 -0
- package/dist/esm/parquetjs/codecs/index.js.map +1 -1
- package/dist/esm/parquetjs/codecs/rle.js +2 -2
- package/dist/esm/parquetjs/codecs/rle.js.map +1 -1
- package/dist/esm/parquetjs/compression.js +54 -105
- package/dist/esm/parquetjs/compression.js.map +1 -1
- package/dist/esm/parquetjs/{writer.js → encoder/writer.js} +32 -35
- package/dist/esm/parquetjs/encoder/writer.js.map +1 -0
- package/dist/esm/parquetjs/parquet-thrift/CompressionCodec.js +1 -0
- package/dist/esm/parquetjs/parquet-thrift/CompressionCodec.js.map +1 -1
- package/dist/esm/parquetjs/parser/decoders.js +300 -0
- package/dist/esm/parquetjs/parser/decoders.js.map +1 -0
- package/dist/esm/parquetjs/parser/parquet-cursor.js +90 -0
- package/dist/esm/parquetjs/parser/parquet-cursor.js.map +1 -0
- package/dist/esm/parquetjs/parser/parquet-envelope-reader.js +164 -0
- package/dist/esm/parquetjs/parser/parquet-envelope-reader.js.map +1 -0
- package/dist/esm/parquetjs/parser/parquet-reader.js +133 -0
- package/dist/esm/parquetjs/parser/parquet-reader.js.map +1 -0
- package/dist/esm/parquetjs/schema/declare.js.map +1 -1
- package/dist/esm/parquetjs/schema/schema.js +2 -0
- package/dist/esm/parquetjs/schema/schema.js.map +1 -1
- package/dist/esm/parquetjs/schema/shred.js +2 -1
- package/dist/esm/parquetjs/schema/shred.js.map +1 -1
- package/dist/esm/parquetjs/schema/types.js +78 -4
- package/dist/esm/parquetjs/schema/types.js.map +1 -1
- package/dist/esm/parquetjs/utils/buffer-utils.js +12 -0
- package/dist/esm/parquetjs/utils/buffer-utils.js.map +1 -0
- package/dist/esm/parquetjs/utils/file-utils.js +79 -0
- package/dist/esm/parquetjs/utils/file-utils.js.map +1 -0
- package/dist/esm/parquetjs/{util.js → utils/read-utils.js} +11 -89
- package/dist/esm/parquetjs/utils/read-utils.js.map +1 -0
- package/dist/parquet-worker.js +7 -18
- package/dist/parquet-worker.js.map +1 -1
- package/package.json +10 -10
- package/src/bundle.ts +2 -3
- package/src/constants.ts +17 -0
- package/src/index.ts +30 -4
- package/src/lib/convert-schema.ts +95 -0
- package/src/lib/parse-parquet.ts +27 -0
- package/{dist/es5/libs → src/lib}/read-array-buffer.ts +0 -0
- package/src/parquet-loader.ts +4 -24
- package/src/parquetjs/codecs/dictionary.ts +11 -0
- package/src/parquetjs/codecs/index.ts +13 -0
- package/src/parquetjs/codecs/rle.ts +4 -2
- package/src/parquetjs/compression.ts +89 -50
- package/src/parquetjs/{writer.ts → encoder/writer.ts} +46 -45
- package/src/parquetjs/parquet-thrift/CompressionCodec.ts +2 -1
- package/src/parquetjs/parser/decoders.ts +448 -0
- package/src/parquetjs/parser/parquet-cursor.ts +94 -0
- package/src/parquetjs/parser/parquet-envelope-reader.ts +210 -0
- package/src/parquetjs/parser/parquet-reader.ts +179 -0
- package/src/parquetjs/schema/declare.ts +48 -2
- package/src/parquetjs/schema/schema.ts +2 -0
- package/src/parquetjs/schema/shred.ts +3 -1
- package/src/parquetjs/schema/types.ts +82 -5
- package/src/parquetjs/utils/buffer-utils.ts +18 -0
- package/src/parquetjs/utils/file-utils.ts +96 -0
- package/src/parquetjs/{util.ts → utils/read-utils.ts} +13 -110
- package/dist/dist.es5.min.js +0 -51
- package/dist/dist.es5.min.js.map +0 -1
- package/dist/es5/parquetjs/compression.ts.disabled +0 -105
- package/dist/es5/parquetjs/reader.js +0 -1078
- package/dist/es5/parquetjs/reader.js.map +0 -1
- package/dist/es5/parquetjs/util.js.map +0 -1
- package/dist/es5/parquetjs/writer.js.map +0 -1
- package/dist/esm/libs/read-array-buffer.ts +0 -31
- package/dist/esm/parquetjs/compression.ts.disabled +0 -105
- package/dist/esm/parquetjs/reader.js +0 -524
- package/dist/esm/parquetjs/reader.js.map +0 -1
- package/dist/esm/parquetjs/util.js.map +0 -1
- package/dist/esm/parquetjs/writer.js.map +0 -1
- package/src/libs/read-array-buffer.ts +0 -31
- package/src/parquetjs/compression.ts.disabled +0 -105
- package/src/parquetjs/reader.ts +0 -707
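Reviewer note: the headline changes are a new promise-based parser (src/parquetjs/parser/*, replacing the deleted reader.ts) and async codecs routed through @loaders.gl/compression. A minimal consumption sketch, assuming the standard loaders.gl pattern (option names illustrative, not confirmed by this diff):

    import {load} from '@loaders.gl/core';
    import {ParquetLoader} from '@loaders.gl/parquet';

    // ParquetLoader now parses via the new parser/ modules; heavy codecs
    // (brotli, lz4, zstd) are pulled in through @loaders.gl/compression.
    const data = await load('example.parquet', ParquetLoader, {worker: false});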
package/dist/esm/parquetjs/compression.js.map
@@ -1 +1 @@
-{"version":3,"sources":["../../../src/parquetjs/compression.ts"],"names":["
+
{"version":3,"sources":["../../../src/parquetjs/compression.ts"],"names":["NoCompression","GZipCompression","SnappyCompression","BrotliCompression","LZOCompression","LZ4Compression","ZstdCompression","toArrayBuffer","toBuffer","brotliDecompress","lz4js","lzo","ZstdCodec","modules","brotli","decompress","compress","Error","PARQUET_COMPRESSION_METHODS","UNCOMPRESSED","GZIP","SNAPPY","BROTLI","LZ4","LZ4_RAW","LZO","ZSTD","preloadCompressions","options","compressions","Object","values","Promise","all","map","compression","preload","deflate","method","value","inputArrayBuffer","compressedArrayBuffer","size","inflate"],"mappings":"AAIA,SAEEA,aAFF,EAGEC,eAHF,EAIEC,iBAJF,EAKEC,iBALF,EAMEC,cANF,EAOEC,cAPF,EAQEC,eARF,QASO,yBATP;AAYA,SAAQC,aAAR,EAAuBC,QAAvB,QAAsC,sBAAtC;AAKA,OAAOC,gBAAP,MAA6B,mBAA7B;AACA,OAAOC,KAAP,MAAkB,OAAlB;AACA,OAAOC,GAAP,MAAgB,KAAhB;AACA,SAAQC,SAAR,QAAwB,YAAxB;AAGA,MAAMC,OAAO,GAAG;AAEdC,EAAAA,MAAM,EAAE;AACNC,IAAAA,UAAU,EAAEN,gBADN;AAENO,IAAAA,QAAQ,EAAE,MAAM;AACd,YAAM,IAAIC,KAAJ,CAAU,iBAAV,CAAN;AACD;AAJK,GAFM;AAQdP,EAAAA,KARc;AASdC,EAAAA,GATc;AAUd,gBAAcC;AAVA,CAAhB;AAcA,OAAO,MAAMM,2BAAoE,GAAG;AAClFC,EAAAA,YAAY,EAAE,IAAInB,aAAJ,EADoE;AAElFoB,EAAAA,IAAI,EAAE,IAAInB,eAAJ,EAF4E;AAGlFoB,EAAAA,MAAM,EAAE,IAAInB,iBAAJ,EAH0E;AAIlFoB,EAAAA,MAAM,EAAE,IAAInB,iBAAJ,CAAsB;AAACU,IAAAA;AAAD,GAAtB,CAJ0E;AAMlFU,EAAAA,GAAG,EAAE,IAAIlB,cAAJ,CAAmB;AAACQ,IAAAA;AAAD,GAAnB,CAN6E;AAOlFW,EAAAA,OAAO,EAAE,IAAInB,cAAJ,CAAmB;AAACQ,IAAAA;AAAD,GAAnB,CAPyE;AAQlFY,EAAAA,GAAG,EAAE,IAAIrB,cAAJ,CAAmB;AAACS,IAAAA;AAAD,GAAnB,CAR6E;AASlFa,EAAAA,IAAI,EAAE,IAAIpB,eAAJ,CAAoB;AAACO,IAAAA;AAAD,GAApB;AAT4E,CAA7E;AAgBP,OAAO,eAAec,mBAAf,CAAmCC,OAAnC,EAA8E;AACnF,QAAMC,YAAY,GAAGC,MAAM,CAACC,MAAP,CAAcb,2BAAd,CAArB;AACA,SAAO,MAAMc,OAAO,CAACC,GAAR,CAAYJ,YAAY,CAACK,GAAb,CAAkBC,WAAD,IAAiBA,WAAW,CAACC,OAAZ,EAAlC,CAAZ,CAAb;AACD;AAKD,OAAO,eAAeC,OAAf,CAAuBC,MAAvB,EAAmDC,KAAnD,EAAmF;AACxF,QAAMJ,WAAW,GAAGjB,2BAA2B,CAACoB,MAAD,CAA/C;;AACA,MAAI,CAACH,WAAL,EAAkB;AAChB,UAAM,IAAIlB,KAAJ,CAAW,wCAAuCqB,MAAO,EAAzD,CAAN;AACD;;AACD,QAAME,gBAAgB,GAAGjC,aAAa,CAACgC,KAAD,CAAtC;AACA,QAAME,qBAAqB,GAAG,MAAMN,WAAW,CAACnB,QAAZ,CAAqBwB,gBAArB,CAApC;AACA,SAAOhC,QAAQ,CAACiC,qBAAD,CAAf;AACD;AAKD,OAAO,eAAe1B,UAAf,CACLuB,MADK,EAELC,KAFK,EAGLG,IAHK,EAIY;AACjB,QAAMP,WAAW,GAAGjB,2BAA2B,CAACoB,MAAD,CAA/C;;AACA,MAAI,CAACH,WAAL,EAAkB;AAChB,UAAM,IAAIlB,KAAJ,CAAW,wCAAuCqB,MAAO,EAAzD,CAAN;AACD;;AACD,QAAME,gBAAgB,GAAGjC,aAAa,CAACgC,KAAD,CAAtC;AACA,QAAME,qBAAqB,GAAG,MAAMN,WAAW,CAACpB,UAAZ,CAAuByB,gBAAvB,EAAyCE,IAAzC,CAApC;AACA,SAAOlC,QAAQ,CAACiC,qBAAD,CAAf;AACD;AAKD,OAAO,SAASE,OAAT,CAAiBL,MAAjB,EAA6CC,KAA7C,EAA4DG,IAA5D,EAAkF;AACvF,MAAI,EAAEJ,MAAM,IAAIpB,2BAAZ,CAAJ,EAA8C;AAC5C,UAAM,IAAID,KAAJ,CAAW,+BAA8BqB,MAAO,EAAhD,CAAN;AACD;;AAED,SAAOpB,2BAA2B,CAACoB,MAAD,CAA3B,CAAoCK,OAApC,CAA4CJ,KAA5C,EAAmDG,IAAnD,CAAP;AACD","sourcesContent":["// Forked from https://github.com/kbajalc/parquets under MIT license (Copyright (c) 2017 ironSource Ltd.)\n/* eslint-disable camelcase */\n// Forked from https://github.com/ironSource/parquetjs under MIT license\n\nimport {\n Compression,\n NoCompression,\n GZipCompression,\n SnappyCompression,\n BrotliCompression,\n LZOCompression,\n LZ4Compression,\n ZstdCompression\n} from '@loaders.gl/compression';\n\nimport {ParquetCompression} from './schema/declare';\nimport {toArrayBuffer, toBuffer} from './utils/buffer-utils';\n\n// TODO switch to worker compression to avoid bundling...\n\n// import brotli from 'brotli'; - brotli has problems with decompress in browsers\nimport brotliDecompress from 'brotli/decompress';\nimport lz4js from 
'lz4js';\nimport lzo from 'lzo';\nimport {ZstdCodec} from 'zstd-codec';\n\n// Inject large dependencies through Compression constructor options\nconst modules = {\n // brotli has problems with decompress in browsers\n brotli: {\n decompress: brotliDecompress,\n compress: () => {\n throw new Error('brotli compress');\n }\n },\n lz4js,\n lzo,\n 'zstd-codec': ZstdCodec\n};\n\n// See https://github.com/apache/parquet-format/blob/master/Compression.md\nexport const PARQUET_COMPRESSION_METHODS: Record<ParquetCompression, Compression> = {\n UNCOMPRESSED: new NoCompression(),\n GZIP: new GZipCompression(),\n SNAPPY: new SnappyCompression(),\n BROTLI: new BrotliCompression({modules}),\n // TODO: Understand difference between LZ4 and LZ4_RAW\n LZ4: new LZ4Compression({modules}),\n LZ4_RAW: new LZ4Compression({modules}),\n LZO: new LZOCompression({modules}),\n ZSTD: new ZstdCompression({modules})\n};\n\n/**\n * Register compressions that have big external libraries\n * @param options.modules External library dependencies\n */\nexport async function preloadCompressions(options?: {modules: {[key: string]: any}}) {\n const compressions = Object.values(PARQUET_COMPRESSION_METHODS);\n return await Promise.all(compressions.map((compression) => compression.preload()));\n}\n\n/**\n * Deflate a value using compression method `method`\n */\nexport async function deflate(method: ParquetCompression, value: Buffer): Promise<Buffer> {\n const compression = PARQUET_COMPRESSION_METHODS[method];\n if (!compression) {\n throw new Error(`parquet: invalid compression method: ${method}`);\n }\n const inputArrayBuffer = toArrayBuffer(value);\n const compressedArrayBuffer = await compression.compress(inputArrayBuffer);\n return toBuffer(compressedArrayBuffer);\n}\n\n/**\n * Inflate a value using compression method `method`\n */\nexport async function decompress(\n method: ParquetCompression,\n value: Buffer,\n size: number\n): Promise<Buffer> {\n const compression = PARQUET_COMPRESSION_METHODS[method];\n if (!compression) {\n throw new Error(`parquet: invalid compression method: ${method}`);\n }\n const inputArrayBuffer = toArrayBuffer(value);\n const compressedArrayBuffer = await compression.decompress(inputArrayBuffer, size);\n return toBuffer(compressedArrayBuffer);\n}\n\n/*\n * Inflate a value using compression method `method`\n */\nexport function inflate(method: ParquetCompression, value: Buffer, size: number): Buffer {\n if (!(method in PARQUET_COMPRESSION_METHODS)) {\n throw new Error(`invalid compression method: ${method}`);\n }\n // @ts-ignore\n return PARQUET_COMPRESSION_METHODS[method].inflate(value, size);\n}\n\n/*\nfunction deflate_identity(value: Buffer): Buffer {\n return value;\n}\n\nfunction deflate_gzip(value: Buffer): Buffer {\n return zlib.gzipSync(value);\n}\n\nfunction deflate_snappy(value: Buffer): Buffer {\n return snappyjs.compress(value);\n}\n\nfunction deflate_lzo(value: Buffer): Buffer {\n lzo = lzo || Util.load('lzo');\n return lzo.compress(value);\n}\n\nfunction deflate_brotli(value: Buffer): Buffer {\n brotli = brotli || Util.load('brotli');\n const result = brotli.compress(value, {\n mode: 0,\n quality: 8,\n lgwin: 22\n });\n return result ? 
Buffer.from(result) : Buffer.alloc(0);\n}\n\nfunction deflate_lz4(value: Buffer): Buffer {\n lz4js = lz4js || Util.load('lz4js');\n try {\n // let result = Buffer.alloc(lz4js.encodeBound(value.length));\n // const compressedSize = lz4.encodeBlock(value, result);\n // // remove unnecessary bytes\n // result = result.slice(0, compressedSize);\n // return result;\n return Buffer.from(lz4js.compress(value));\n } catch (err) {\n throw err;\n }\n}\nfunction inflate_identity(value: Buffer): Buffer {\n return value;\n}\n\nfunction inflate_gzip(value: Buffer): Buffer {\n return zlib.gunzipSync(value);\n}\n\nfunction inflate_snappy(value: Buffer): Buffer {\n return snappyjs.uncompress(value);\n}\n\nfunction inflate_lzo(value: Buffer, size: number): Buffer {\n lzo = lzo || Util.load('lzo');\n return lzo.decompress(value, size);\n}\n\nfunction inflate_lz4(value: Buffer, size: number): Buffer {\n lz4js = lz4js || Util.load('lz4js');\n try {\n // let result = Buffer.alloc(size);\n // const uncompressedSize = lz4js.decodeBlock(value, result);\n // // remove unnecessary bytes\n // result = result.slice(0, uncompressedSize);\n // return result;\n return Buffer.from(lz4js.decompress(value, size));\n } catch (err) {\n throw err;\n }\n}\n\nfunction inflate_brotli(value: Buffer): Buffer {\n brotli = brotli || Util.load('brotli');\n if (!value.length) {\n return Buffer.alloc(0);\n }\n return Buffer.from(brotli.decompress(value));\n}\n*/\n"],"file":"compression.js"}
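Reviewer note: per the sourcesContent above, compression.ts now wraps @loaders.gl/compression classes in a PARQUET_COMPRESSION_METHODS registry, injects the heavy libraries (brotli, lz4js, lzo, zstd-codec) via a `modules` option, and makes deflate/decompress async. Usage sketch (internal import path shortened for illustration):

    import {preloadCompressions, deflate, decompress} from './parquetjs/compression';

    await preloadCompressions();                      // lets codecs load wasm/large deps
    const packed = await deflate('SNAPPY', buffer);   // Buffer in, Buffer out
    const unpacked = await decompress('SNAPPY', packed, buffer.length);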
package/dist/esm/parquetjs/{writer.js → encoder/writer.js}
@@ -1,10 +1,11 @@
 import _defineProperty from "@babel/runtime/helpers/esm/defineProperty";
 import { Transform } from 'stream';
-import { PARQUET_CODECS } from '
-import * as Compression from '
-import * as Shred from '
-import { ColumnChunk, ColumnMetaData, CompressionCodec, ConvertedType, DataPageHeader, DataPageHeaderV2, Encoding, FieldRepetitionType, FileMetaData, KeyValue, PageHeader, PageType, RowGroup, SchemaElement, Type } from '
-import
+import { PARQUET_CODECS } from '../codecs';
+import * as Compression from '../compression';
+import * as Shred from '../schema/shred';
+import { ColumnChunk, ColumnMetaData, CompressionCodec, ConvertedType, DataPageHeader, DataPageHeaderV2, Encoding, FieldRepetitionType, FileMetaData, KeyValue, PageHeader, PageType, RowGroup, SchemaElement, Type } from '../parquet-thrift';
+import { osopen, oswrite, osclose } from '../utils/file-utils';
+import { getBitWidth, serializeThrift } from '../utils/read-utils';
 import Int64 from 'node-int64';
 const PARQUET_MAGIC = 'PAR1';
 const PARQUET_VERSION = 1;
@@ -14,7 +15,7 @@ const PARQUET_RDLVL_TYPE = 'INT32';
 const PARQUET_RDLVL_ENCODING = 'RLE';
 export class ParquetWriter {
   static async openFile(schema, path, opts) {
-    const outputStream = await
+    const outputStream = await osopen(path, opts);
     return ParquetWriter.openStream(schema, outputStream, opts);
   }

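Reviewer note: `osopen` lives in the new utils/file-utils.ts (+96 lines, not shown here). In the parquets lineage it is a promisified fs.createWriteStream; a plausible shape, offered as an assumption rather than the actual source:

    import * as fs from 'fs';
    import type {Writable} from 'stream';

    function osopen(path: string, opts?: object): Promise<Writable> {
      return new Promise((resolve, reject) => {
        const stream = fs.createWriteStream(path, opts as any);
        stream.once('open', () => resolve(stream)); // fd acquired
        stream.once('error', (err) => reject(err));
      });
    }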
@@ -46,11 +47,14 @@ export class ParquetWriter {
     this.rowGroupSize = opts.rowGroupSize || PARQUET_DEFAULT_ROW_GROUP_SIZE;
     this.closed = false;
     this.userMetadata = {};
+    this.writeHeader();
+  }

+  async writeHeader() {
     try {
-      envelopeWriter.writeHeader();
+      await this.envelopeWriter.writeHeader();
     } catch (err) {
-      envelopeWriter.close();
+      await this.envelopeWriter.close();
       throw err;
     }
   }
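Reviewer note: the header write moves into an async writeHeader() because constructors cannot await; the constructor now calls this.writeHeader() un-awaited (the embedded source below even carries an eslint no-floating-promises suppression and a TODO). A common way to keep that safe is to capture the promise and gate later operations on it (sketch, not the package's code):

    class SafeWriter {
      private ready: Promise<void>;
      constructor() {
        this.ready = this.writeHeader(); // capture instead of fire-and-forget
      }
      async writeHeader(): Promise<void> {/* write magic bytes */}
      async appendRow(row: unknown): Promise<void> {
        await this.ready; // header is guaranteed to be flushed first
      }
    }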
@@ -101,8 +105,8 @@ export class ParquetWriter {
 }
 export class ParquetEnvelopeWriter {
   static async openStream(schema, outputStream, opts) {
-    const writeFn =
-    const closeFn =
+    const writeFn = oswrite.bind(undefined, outputStream);
+    const closeFn = osclose.bind(undefined, outputStream);
     return new ParquetEnvelopeWriter(schema, writeFn, closeFn, 0, opts);
   }

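Reviewer note: `oswrite.bind(undefined, outputStream)` partially applies the stream argument, so the resulting writeFn(buf) calls oswrite(outputStream, buf) with `this` unset. Equivalent arrow form:

    const writeFn = (buf: Buffer) => oswrite(outputStream, buf);
    const closeFn = () => osclose(outputStream);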
@@ -143,14 +147,14 @@ export class ParquetEnvelopeWriter {
   }

   async writeRowGroup(records) {
-    const rgroup = encodeRowGroup(this.schema, records, {
+    const rgroup = await encodeRowGroup(this.schema, records, {
       baseOffset: this.offset,
       pageSize: this.pageSize,
       useDataPageV2: this.useDataPageV2
     });
     this.rowCount += records.rowCount;
     this.rowGroups.push(rgroup.metadata);
-    return this.writeSection(rgroup.body);
+    return await this.writeSection(rgroup.body);
   }

   writeFooter(userMetadata) {
@@ -206,12 +210,12 @@ function encodeValues(type, encoding, values, opts) {
   return PARQUET_CODECS[encoding].encodeValues(type, values, opts);
 }

-function encodeDataPage(column, data) {
+async function encodeDataPage(column, data) {
   let rLevelsBuf = Buffer.alloc(0);

   if (column.rLevelMax > 0) {
     rLevelsBuf = encodeValues(PARQUET_RDLVL_TYPE, PARQUET_RDLVL_ENCODING, data.rlevels, {
-      bitWidth:
+      bitWidth: getBitWidth(column.rLevelMax)
     });
   }

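Reviewer note: the previously inlined bit-width math is now `getBitWidth` from utils/read-utils. The width needed to encode levels 0..max is the bit length of max; the conventional implementation in this code's lineage (an assumption, read-utils.ts is not shown in this excerpt):

    function getBitWidth(val: number): number {
      if (val === 0) return 0;
      return Math.ceil(Math.log2(val + 1)); // max 1 -> 1 bit, 2-3 -> 2 bits, 4-7 -> 3 bits
    }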
@@ -219,7 +223,7 @@ function encodeDataPage(column, data) {

   if (column.dLevelMax > 0) {
     dLevelsBuf = encodeValues(PARQUET_RDLVL_TYPE, PARQUET_RDLVL_ENCODING, data.dlevels, {
-      bitWidth:
+      bitWidth: getBitWidth(column.dLevelMax)
     });
   }

@@ -228,7 +232,7 @@ function encodeDataPage(column, data) {
     bitWidth: column.typeLength
   });
   const dataBuf = Buffer.concat([rLevelsBuf, dLevelsBuf, valuesBuf]);
-  const compressedBuf = Compression.deflate(column.compression, dataBuf);
+  const compressedBuf = await Compression.deflate(column.compression, dataBuf);
   const header = new PageHeader({
     type: PageType.DATA_PAGE,
     data_page_header: new DataPageHeader({
@@ -240,7 +244,7 @@ function encodeDataPage(column, data) {
     uncompressed_page_size: dataBuf.length,
     compressed_page_size: compressedBuf.length
   });
-  const headerBuf =
+  const headerBuf = serializeThrift(header);
   const page = Buffer.concat([headerBuf, compressedBuf]);
   return {
     header,
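Reviewer note: `serializeThrift` (moved into utils/read-utils) renders a thrift struct such as PageHeader to bytes. parquetjs-derived code typically does this with TCompactProtocol over a buffered transport; a sketch under that assumption, not the verbatim helper:

    import thrift from 'thrift';

    function serializeThrift(obj: {write(protocol: unknown): void}): Buffer {
      const chunks: Buffer[] = [];
      const transport = new thrift.TBufferedTransport(null, (buf: Buffer) => chunks.push(buf));
      const protocol = new thrift.TCompactProtocol(transport);
      obj.write(protocol); // the generated struct serializes itself
      transport.flush();   // hands buffered bytes to the callback above
      return Buffer.concat(chunks);
    }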
@@ -249,17 +253,17 @@ function encodeDataPage(column, data) {
   };
 }

-function encodeDataPageV2(column, data, rowCount) {
+async function encodeDataPageV2(column, data, rowCount) {
   const valuesBuf = encodeValues(column.primitiveType, column.encoding, data.values, {
     typeLength: column.typeLength,
     bitWidth: column.typeLength
   });
-  const compressedBuf = Compression.deflate(column.compression, valuesBuf);
+  const compressedBuf = await Compression.deflate(column.compression, valuesBuf);
   let rLevelsBuf = Buffer.alloc(0);

   if (column.rLevelMax > 0) {
     rLevelsBuf = encodeValues(PARQUET_RDLVL_TYPE, PARQUET_RDLVL_ENCODING, data.rlevels, {
-      bitWidth:
+      bitWidth: getBitWidth(column.rLevelMax),
       disableEnvelope: true
     });
   }
@@ -268,7 +272,7 @@ function encodeDataPageV2(column, data, rowCount) {

   if (column.dLevelMax > 0) {
     dLevelsBuf = encodeValues(PARQUET_RDLVL_TYPE, PARQUET_RDLVL_ENCODING, data.dlevels, {
-      bitWidth:
+      bitWidth: getBitWidth(column.dLevelMax),
       disableEnvelope: true
     });
   }
@@ -287,7 +291,7 @@ function encodeDataPageV2(column, data, rowCount) {
     uncompressed_page_size: rLevelsBuf.length + dLevelsBuf.length + valuesBuf.length,
     compressed_page_size: rLevelsBuf.length + dLevelsBuf.length + compressedBuf.length
   });
-  const headerBuf =
+  const headerBuf = serializeThrift(header);
   const page = Buffer.concat([headerBuf, rLevelsBuf, dLevelsBuf, compressedBuf]);
   return {
     header,
@@ -296,21 +300,14 @@ function encodeDataPageV2(column, data, rowCount) {
   };
 }

-function encodeColumnChunk(column, buffer, offset, opts) {
+async function encodeColumnChunk(column, buffer, offset, opts) {
   const data = buffer.columnData[column.path.join()];
   const baseOffset = (opts.baseOffset || 0) + offset;
   let pageBuf;
   let total_uncompressed_size = 0;
   let total_compressed_size = 0;
   {
-
-
-    if (opts.useDataPageV2) {
-      result = encodeDataPageV2(column, data, buffer.rowCount);
-    } else {
-      result = encodeDataPage(column, data);
-    }
-
+    const result = opts.useDataPageV2 ? await encodeDataPageV2(column, data, buffer.rowCount) : await encodeDataPage(column, data);
     pageBuf = result.page;
     total_uncompressed_size += result.header.uncompressed_page_size + result.headerSize;
     total_compressed_size += result.header.compressed_page_size + result.headerSize;
@@ -328,7 +325,7 @@ function encodeColumnChunk(column, buffer, offset, opts) {
   metadata.encodings.push(Encoding[PARQUET_RDLVL_ENCODING]);
   metadata.encodings.push(Encoding[column.encoding]);
   const metadataOffset = baseOffset + pageBuf.length;
-  const body = Buffer.concat([pageBuf,
+  const body = Buffer.concat([pageBuf, serializeThrift(metadata)]);
   return {
     body,
     metadata,
@@ -336,7 +333,7 @@ function encodeColumnChunk(column, buffer, offset, opts) {
   };
 }

-function encodeRowGroup(schema, data, opts) {
+async function encodeRowGroup(schema, data, opts) {
   const metadata = new RowGroup({
     num_rows: data.rowCount,
     columns: [],
@@ -349,7 +346,7 @@ function encodeRowGroup(schema, data, opts) {
       continue;
     }

-    const cchunkData = encodeColumnChunk(field, data, body.length, opts);
+    const cchunkData = await encodeColumnChunk(field, data, body.length, opts);
     const cchunk = new ColumnChunk({
       file_offset: cchunkData.metadataOffset,
       meta_data: cchunkData.metadata
@@ -414,7 +411,7 @@ function encodeFooter(schema, rowCount, rowGroups, userMetadata) {
     metadata.schema.push(schemaElem);
   }

-  const metadataEncoded =
+  const metadataEncoded = serializeThrift(metadata);
   const footerEncoded = Buffer.alloc(metadataEncoded.length + 8);
   metadataEncoded.copy(footerEncoded);
   footerEncoded.writeUInt32LE(metadataEncoded.length, metadataEncoded.length);
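Reviewer note: the trailer layout written here is [thrift FileMetaData][UInt32LE metadata length]['PAR1'] — the magic lands at metadataEncoded.length + 4 (see the embedded writer.ts source below). A reader inverts it like this (sketch):

    const trailer = file.slice(file.length - 8);     // length word + magic
    const metadataLength = trailer.readUInt32LE(0);
    const magic = trailer.toString('utf8', 4);       // must be 'PAR1'
    const metadataStart = file.length - 8 - metadataLength;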
package/dist/esm/parquetjs/encoder/writer.js.map
@@ -0,0 +1 @@
+
{"version":3,"sources":["../../../../src/parquetjs/encoder/writer.ts"],"names":["Transform","PARQUET_CODECS","Compression","Shred","ColumnChunk","ColumnMetaData","CompressionCodec","ConvertedType","DataPageHeader","DataPageHeaderV2","Encoding","FieldRepetitionType","FileMetaData","KeyValue","PageHeader","PageType","RowGroup","SchemaElement","Type","osopen","oswrite","osclose","getBitWidth","serializeThrift","Int64","PARQUET_MAGIC","PARQUET_VERSION","PARQUET_DEFAULT_PAGE_SIZE","PARQUET_DEFAULT_ROW_GROUP_SIZE","PARQUET_RDLVL_TYPE","PARQUET_RDLVL_ENCODING","ParquetWriter","openFile","schema","path","opts","outputStream","openStream","envelopeWriter","ParquetEnvelopeWriter","constructor","rowBuffer","rowGroupSize","closed","userMetadata","writeHeader","err","close","appendRow","row","Error","shredRecord","rowCount","callback","writeFooter","setMetadata","key","value","String","setRowGroupSize","cnt","setPageSize","writeFn","bind","undefined","closeFn","fileOffset","write","offset","rowGroups","pageSize","useDataPageV2","Boolean","writeSection","buf","length","Buffer","from","writeRowGroup","records","rgroup","encodeRowGroup","baseOffset","push","metadata","body","encodeFooter","ParquetTransformer","objectMode","writeProxy","t","b","writer","_transform","encoding","then","Promise","resolve","_flush","encodeValues","type","values","encodeDataPage","column","data","rLevelsBuf","alloc","rLevelMax","rlevels","bitWidth","dLevelsBuf","dLevelMax","dlevels","valuesBuf","primitiveType","typeLength","dataBuf","concat","compressedBuf","deflate","compression","header","DATA_PAGE","data_page_header","num_values","count","definition_level_encoding","repetition_level_encoding","uncompressed_page_size","compressed_page_size","headerBuf","page","headerSize","encodeDataPageV2","disableEnvelope","DATA_PAGE_V2","data_page_header_v2","num_nulls","num_rows","definition_levels_byte_length","repetition_levels_byte_length","is_compressed","encodeColumnChunk","buffer","columnData","join","pageBuf","total_uncompressed_size","total_compressed_size","result","path_in_schema","data_page_offset","encodings","codec","metadataOffset","columns","total_byte_size","field","fieldList","isNested","cchunkData","cchunk","file_offset","meta_data","Number","version","created_by","row_groups","key_value_metadata","kv","schemaRoot","name","num_children","Object","keys","fields","relt","repetitionType","schemaElem","repetition_type","fieldCount","originalType","converted_type","type_length","metadataEncoded","footerEncoded","copy","writeUInt32LE"],"mappings":";AAEA,SAAQA,SAAR,QAAkC,QAAlC;AACA,SAA6BC,cAA7B,QAAkD,WAAlD;AACA,OAAO,KAAKC,WAAZ,MAA6B,gBAA7B;AASA,OAAO,KAAKC,KAAZ,MAAuB,iBAAvB;AACA,SACEC,WADF,EAEEC,cAFF,EAGEC,gBAHF,EAIEC,aAJF,EAKEC,cALF,EAMEC,gBANF,EAOEC,QAPF,EAQEC,mBARF,EASEC,YATF,EAUEC,QAVF,EAWEC,UAXF,EAYEC,QAZF,EAaEC,QAbF,EAcEC,aAdF,EAeEC,IAfF,QAgBO,mBAhBP;AAiBA,SAAQC,MAAR,EAAgBC,OAAhB,EAAyBC,OAAzB,QAAuC,qBAAvC;AACA,SAAQC,WAAR,EAAqBC,eAArB,QAA2C,qBAA3C;AACA,OAAOC,KAAP,MAAkB,YAAlB;AAKA,MAAMC,aAAa,GAAG,MAAtB;AAKA,MAAMC,eAAe,GAAG,CAAxB;AAKA,MAAMC,yBAAyB,GAAG,IAAlC;AACA,MAAMC,8BAA8B,GAAG,IAAvC;AAKA,MAAMC,kBAAkB,GAAG,OAA3B;AACA,MAAMC,sBAAsB,GAAG,KAA/B;AAuBA,OAAO,MAAMC,aAAN,CAAuB;AAKP,eAARC,QAAQ,CACnBC,MADmB,EAEnBC,IAFmB,EAGnBC,IAHmB,EAIQ;AAC3B,UAAMC,YAAY,GAAG,MAAMjB,MAAM,CAACe,IAAD,EAAOC,IAAP,CAAjC;AACA,WAAOJ,aAAa,CAACM,UAAd,CAAyBJ,MAAzB,EAAiCG,YAAjC,EAA+CD,IAA/C,CAAP;AACD;;AAMsB,eAAVE,UAAU,CACrBJ,MADqB,EAErBG,YAFqB,EAGrBD,IAHqB,EAIM;AAC3B,QAAI,CAACA,IAAL,EAAW;AAETA,MAAAA,IAAI,GAAG,EAAP;AACD;;AAED,UAAMG,cAAc,GAAG,MAAMC,qBAAqB,CAACF,UA
AtB,CAAiCJ,MAAjC,EAAyCG,YAAzC,EAAuDD,IAAvD,CAA7B;AAEA,WAAO,IAAIJ,aAAJ,CAAkBE,MAAlB,EAA0BK,cAA1B,EAA0CH,IAA1C,CAAP;AACD;;AAYDK,EAAAA,WAAW,CACTP,MADS,EAETK,cAFS,EAGTH,IAHS,EAIT;AAAA;;AAAA;;AAAA;;AAAA;;AAAA;;AAAA;;AACA,SAAKF,MAAL,GAAcA,MAAd;AACA,SAAKK,cAAL,GAAsBA,cAAtB;AAEA,SAAKG,SAAL,GAAiB,EAAjB;AACA,SAAKC,YAAL,GAAoBP,IAAI,CAACO,YAAL,IAAqBd,8BAAzC;AACA,SAAKe,MAAL,GAAc,KAAd;AACA,SAAKC,YAAL,GAAoB,EAApB;AAGA,SAAKC,WAAL;AACD;;AAEgB,QAAXA,WAAW,GAAkB;AAEjC,QAAI;AACF,YAAM,KAAKP,cAAL,CAAoBO,WAApB,EAAN;AACD,KAFD,CAEE,OAAOC,GAAP,EAAY;AACZ,YAAM,KAAKR,cAAL,CAAoBS,KAApB,EAAN;AACA,YAAMD,GAAN;AACD;AACF;;AAMc,QAATE,SAAS,CAAIC,GAAJ,EAA2B;AACxC,QAAI,KAAKN,MAAT,EAAiB;AACf,YAAM,IAAIO,KAAJ,CAAU,mBAAV,CAAN;AACD;;AACD/C,IAAAA,KAAK,CAACgD,WAAN,CAAkB,KAAKlB,MAAvB,EAA+BgB,GAA/B,EAAoC,KAAKR,SAAzC;;AACA,QAAI,KAAKA,SAAL,CAAeW,QAAf,IAA2B,KAAKV,YAApC,EAAkD;AAEhD,WAAKD,SAAL,GAAiB,EAAjB;AACD;AACF;;AAQU,QAALM,KAAK,CAACM,QAAD,EAAuC;AAChD,QAAI,KAAKV,MAAT,EAAiB;AACf,YAAM,IAAIO,KAAJ,CAAU,mBAAV,CAAN;AACD;;AAED,SAAKP,MAAL,GAAc,IAAd;;AAEA,QAAI,KAAKF,SAAL,CAAeW,QAAf,GAA0B,CAA1B,IAA+B,KAAKX,SAAL,CAAeW,QAAf,IAA2B,KAAKV,YAAnE,EAAiF;AAE/E,WAAKD,SAAL,GAAiB,EAAjB;AACD;;AAED,UAAM,KAAKH,cAAL,CAAoBgB,WAApB,CAAgC,KAAKV,YAArC,CAAN;AACA,UAAM,KAAKN,cAAL,CAAoBS,KAApB,EAAN;;AAGA,QAAIM,QAAJ,EAAc;AACZA,MAAAA,QAAQ;AACT;AACF;;AAKDE,EAAAA,WAAW,CAACC,GAAD,EAAcC,KAAd,EAAmC;AAE5C,SAAKb,YAAL,CAAkBc,MAAM,CAACF,GAAD,CAAxB,IAAiCE,MAAM,CAACD,KAAD,CAAvC;AACD;;AAQDE,EAAAA,eAAe,CAACC,GAAD,EAAoB;AACjC,SAAKlB,YAAL,GAAoBkB,GAApB;AACD;;AAMDC,EAAAA,WAAW,CAACD,GAAD,EAAoB;AAC7B,SAAKtB,cAAL,CAAoBuB,WAApB,CAAgCD,GAAhC;AACD;;AAxI2B;AAiJ9B,OAAO,MAAMrB,qBAAN,CAA4B;AAIV,eAAVF,UAAU,CACrBJ,MADqB,EAErBG,YAFqB,EAGrBD,IAHqB,EAIW;AAChC,UAAM2B,OAAO,GAAG1C,OAAO,CAAC2C,IAAR,CAAaC,SAAb,EAAwB5B,YAAxB,CAAhB;AACA,UAAM6B,OAAO,GAAG5C,OAAO,CAAC0C,IAAR,CAAaC,SAAb,EAAwB5B,YAAxB,CAAhB;AACA,WAAO,IAAIG,qBAAJ,CAA0BN,MAA1B,EAAkC6B,OAAlC,EAA2CG,OAA3C,EAAoD,CAApD,EAAuD9B,IAAvD,CAAP;AACD;;AAWDK,EAAAA,WAAW,CACTP,MADS,EAET6B,OAFS,EAGTG,OAHS,EAITC,UAJS,EAKT/B,IALS,EAMT;AAAA;;AAAA;;AAAA;;AAAA;;AAAA;;AAAA;;AAAA;;AAAA;;AACA,SAAKF,MAAL,GAAcA,MAAd;AACA,SAAKkC,KAAL,GAAaL,OAAb;AACA,SAAKf,KAAL,GAAakB,OAAb;AACA,SAAKG,MAAL,GAAcF,UAAd;AACA,SAAKd,QAAL,GAAgB,CAAhB;AACA,SAAKiB,SAAL,GAAiB,EAAjB;AACA,SAAKC,QAAL,GAAgBnC,IAAI,CAACmC,QAAL,IAAiB3C,yBAAjC;AACA,SAAK4C,aAAL,GAAqB,mBAAmBpC,IAAnB,GAA0BqC,OAAO,CAACrC,IAAI,CAACoC,aAAN,CAAjC,GAAwD,KAA7E;AACD;;AAEDE,EAAAA,YAAY,CAACC,GAAD,EAA6B;AACvC,SAAKN,MAAL,IAAeM,GAAG,CAACC,MAAnB;AACA,WAAO,KAAKR,KAAL,CAAWO,GAAX,CAAP;AACD;;AAKD7B,EAAAA,WAAW,GAAkB;AAC3B,WAAO,KAAK4B,YAAL,CAAkBG,MAAM,CAACC,IAAP,CAAYpD,aAAZ,CAAlB,CAAP;AACD;;AAMkB,QAAbqD,aAAa,CAACC,OAAD,EAAwC;AACzD,UAAMC,MAAM,GAAG,MAAMC,cAAc,CAAC,KAAKhD,MAAN,EAAc8C,OAAd,EAAuB;AACxDG,MAAAA,UAAU,EAAE,KAAKd,MADuC;AAExDE,MAAAA,QAAQ,EAAE,KAAKA,QAFyC;AAGxDC,MAAAA,aAAa,EAAE,KAAKA;AAHoC,KAAvB,CAAnC;AAMA,SAAKnB,QAAL,IAAiB2B,OAAO,CAAC3B,QAAzB;AACA,SAAKiB,SAAL,CAAec,IAAf,CAAoBH,MAAM,CAACI,QAA3B;AACA,WAAO,MAAM,KAAKX,YAAL,CAAkBO,MAAM,CAACK,IAAzB,CAAb;AACD;;AAKD/B,EAAAA,WAAW,CAACV,YAAD,EAAsD;AAC/D,QAAI,CAACA,YAAL,EAAmB;AAEjBA,MAAAA,YAAY,GAAG,EAAf;AACD;;AAED,WAAO,KAAK6B,YAAL,CACLa,YAAY,CAAC,KAAKrD,MAAN,EAAc,KAAKmB,QAAnB,EAA6B,KAAKiB,SAAlC,EAA6CzB,YAA7C,CADP,CAAP;AAGD;;AAMDiB,EAAAA,WAAW,CAACD,GAAD,EAAoB;AAC7B,SAAKU,QAAL,GAAgBV,GAAhB;AACD;;AAxFgC;AA8FnC,OAAO,MAAM2B,kBAAN,SAAoCvF,SAApC,CAA8C;AAGnDwC,EAAAA,WAAW,CAACP,MAAD,EAAwBE,IAA0B,GAAG,EAArD,EAAyD;AAClE,UAAM;AAACqD,MAAAA,UAAU,EAAE;AAAb,KAAN;;AADkE;;AAGlE,UAAMC,UAAU,GAAI,UAAUC,CAAV,EAAsC;AACxD,aAAO,gBAAgBC,CAAhB,EAAuC;AAC5CD,QAAAA,CAAC,CAACP,IAAF,CAAOQ,CAAP;AACD,OAFD;AAGD,KAJkB,CAIhB,IAJgB,CAAnB;;AAMA,
SAAKC,MAAL,GAAc,IAAI7D,aAAJ,CACZE,MADY,EAEZ,IAAIM,qBAAJ,CAA0BN,MAA1B,EAAkCwD,UAAlC,EAA8C,YAAY,CAAE,CAA5D,EAA8D,CAA9D,EAAiEtD,IAAjE,CAFY,EAGZA,IAHY,CAAd;AAKD;;AAGD0D,EAAAA,UAAU,CAAC5C,GAAD,EAAW6C,QAAX,EAA6BzC,QAA7B,EAA2E;AACnF,QAAIJ,GAAJ,EAAS;AACP,aAAO,KAAK2C,MAAL,CAAY5C,SAAZ,CAAsBC,GAAtB,EAA2B8C,IAA3B,CAAgC1C,QAAhC,CAAP;AACD;;AACDA,IAAAA,QAAQ;AACR,WAAO2C,OAAO,CAACC,OAAR,EAAP;AACD;;AAGW,QAANC,MAAM,CAAC7C,QAAD,EAAgC;AAC1C,UAAM,KAAKuC,MAAL,CAAY7C,KAAZ,CAAkBM,QAAlB,CAAN;AACD;;AA/BkD;;AAqCrD,SAAS8C,YAAT,CACEC,IADF,EAEEN,QAFF,EAGEO,MAHF,EAIElE,IAJF,EAKE;AACA,MAAI,EAAE2D,QAAQ,IAAI7F,cAAd,CAAJ,EAAmC;AACjC,UAAM,IAAIiD,KAAJ,CAAW,qBAAoB4C,QAAS,EAAxC,CAAN;AACD;;AACD,SAAO7F,cAAc,CAAC6F,QAAD,CAAd,CAAyBK,YAAzB,CAAsCC,IAAtC,EAA4CC,MAA5C,EAAoDlE,IAApD,CAAP;AACD;;AAKD,eAAemE,cAAf,CACEC,MADF,EAEEC,IAFF,EAOG;AAED,MAAIC,UAAU,GAAG7B,MAAM,CAAC8B,KAAP,CAAa,CAAb,CAAjB;;AACA,MAAIH,MAAM,CAACI,SAAP,GAAmB,CAAvB,EAA0B;AACxBF,IAAAA,UAAU,GAAGN,YAAY,CAACtE,kBAAD,EAAqBC,sBAArB,EAA6C0E,IAAI,CAACI,OAAlD,EAA2D;AAClFC,MAAAA,QAAQ,EAAEvF,WAAW,CAACiF,MAAM,CAACI,SAAR;AAD6D,KAA3D,CAAzB;AAID;;AAED,MAAIG,UAAU,GAAGlC,MAAM,CAAC8B,KAAP,CAAa,CAAb,CAAjB;;AACA,MAAIH,MAAM,CAACQ,SAAP,GAAmB,CAAvB,EAA0B;AACxBD,IAAAA,UAAU,GAAGX,YAAY,CAACtE,kBAAD,EAAqBC,sBAArB,EAA6C0E,IAAI,CAACQ,OAAlD,EAA2D;AAClFH,MAAAA,QAAQ,EAAEvF,WAAW,CAACiF,MAAM,CAACQ,SAAR;AAD6D,KAA3D,CAAzB;AAID;;AAGD,QAAME,SAAS,GAAGd,YAAY,CAACI,MAAM,CAACW,aAAR,EAAwBX,MAAM,CAACT,QAA/B,EAA0CU,IAAI,CAACH,MAA/C,EAAuD;AACnFc,IAAAA,UAAU,EAAEZ,MAAM,CAACY,UADgE;AAEnFN,IAAAA,QAAQ,EAAEN,MAAM,CAACY;AAFkE,GAAvD,CAA9B;AAKA,QAAMC,OAAO,GAAGxC,MAAM,CAACyC,MAAP,CAAc,CAACZ,UAAD,EAAaK,UAAb,EAAyBG,SAAzB,CAAd,CAAhB;AAGA,QAAMK,aAAa,GAAG,MAAMpH,WAAW,CAACqH,OAAZ,CAAoBhB,MAAM,CAACiB,WAA3B,EAAyCJ,OAAzC,CAA5B;AAGA,QAAMK,MAAM,GAAG,IAAI3G,UAAJ,CAAe;AAC5BsF,IAAAA,IAAI,EAAErF,QAAQ,CAAC2G,SADa;AAE5BC,IAAAA,gBAAgB,EAAE,IAAInH,cAAJ,CAAmB;AACnCoH,MAAAA,UAAU,EAAEpB,IAAI,CAACqB,KADkB;AAEnC/B,MAAAA,QAAQ,EAAEpF,QAAQ,CAAC6F,MAAM,CAACT,QAAR,CAFiB;AAGnCgC,MAAAA,yBAAyB,EAAEpH,QAAQ,CAACoB,sBAAD,CAHA;AAInCiG,MAAAA,yBAAyB,EAAErH,QAAQ,CAACoB,sBAAD;AAJA,KAAnB,CAFU;AAQ5BkG,IAAAA,sBAAsB,EAAEZ,OAAO,CAACzC,MARJ;AAS5BsD,IAAAA,oBAAoB,EAAEX,aAAa,CAAC3C;AATR,GAAf,CAAf;AAaA,QAAMuD,SAAS,GAAG3G,eAAe,CAACkG,MAAD,CAAjC;AACA,QAAMU,IAAI,GAAGvD,MAAM,CAACyC,MAAP,CAAc,CAACa,SAAD,EAAYZ,aAAZ,CAAd,CAAb;AAEA,SAAO;AAACG,IAAAA,MAAD;AAASW,IAAAA,UAAU,EAAEF,SAAS,CAACvD,MAA/B;AAAuCwD,IAAAA;AAAvC,GAAP;AACD;;AAKD,eAAeE,gBAAf,CACE9B,MADF,EAEEC,IAFF,EAGEpD,QAHF,EAQG;AAED,QAAM6D,SAAS,GAAGd,YAAY,CAACI,MAAM,CAACW,aAAR,EAAwBX,MAAM,CAACT,QAA/B,EAA0CU,IAAI,CAACH,MAA/C,EAAuD;AACnFc,IAAAA,UAAU,EAAEZ,MAAM,CAACY,UADgE;AAEnFN,IAAAA,QAAQ,EAAEN,MAAM,CAACY;AAFkE,GAAvD,CAA9B;AAMA,QAAMG,aAAa,GAAG,MAAMpH,WAAW,CAACqH,OAAZ,CAAoBhB,MAAM,CAACiB,WAA3B,EAAyCP,SAAzC,CAA5B;AAGA,MAAIR,UAAU,GAAG7B,MAAM,CAAC8B,KAAP,CAAa,CAAb,CAAjB;;AACA,MAAIH,MAAM,CAACI,SAAP,GAAmB,CAAvB,EAA0B;AACxBF,IAAAA,UAAU,GAAGN,YAAY,CAACtE,kBAAD,EAAqBC,sBAArB,EAA6C0E,IAAI,CAACI,OAAlD,EAA2D;AAClFC,MAAAA,QAAQ,EAAEvF,WAAW,CAACiF,MAAM,CAACI,SAAR,CAD6D;AAElF2B,MAAAA,eAAe,EAAE;AAFiE,KAA3D,CAAzB;AAID;;AAED,MAAIxB,UAAU,GAAGlC,MAAM,CAAC8B,KAAP,CAAa,CAAb,CAAjB;;AACA,MAAIH,MAAM,CAACQ,SAAP,GAAmB,CAAvB,EAA0B;AACxBD,IAAAA,UAAU,GAAGX,YAAY,CAACtE,kBAAD,EAAqBC,sBAArB,EAA6C0E,IAAI,CAACQ,OAAlD,EAA2D;AAClFH,MAAAA,QAAQ,EAAEvF,WAAW,CAACiF,MAAM,CAACQ,SAAR,CAD6D;AAElFuB,MAAAA,eAAe,EAAE;AAFiE,KAA3D,CAAzB;AAID;;AAGD,QAAMb,MAAM,GAAG,IAAI3G,UAAJ,CAAe;AAC5BsF,IAAAA,IAAI,EAAErF,QAAQ,CAACwH,YADa;AAE5BC,IAAAA,mBAAmB,EAAE,IAAI/H,gBAAJ,CAAqB;AACxCmH,MAAAA,UAAU,EAAEpB,IAAI,CAACqB,KADuB;AAExCY,MAAAA,SAAS,EAAEjC,IAAI,CAACqB,KAAL,GAAarB,IAAI,CAACH,MAAL,CAAY1B,MAFI
;AAGxC+D,MAAAA,QAAQ,EAAEtF,QAH8B;AAIxC0C,MAAAA,QAAQ,EAAEpF,QAAQ,CAAC6F,MAAM,CAACT,QAAR,CAJsB;AAKxC6C,MAAAA,6BAA6B,EAAE7B,UAAU,CAACnC,MALF;AAMxCiE,MAAAA,6BAA6B,EAAEnC,UAAU,CAAC9B,MANF;AAOxCkE,MAAAA,aAAa,EAAEtC,MAAM,CAACiB,WAAP,KAAuB;AAPE,KAArB,CAFO;AAW5BQ,IAAAA,sBAAsB,EAAEvB,UAAU,CAAC9B,MAAX,GAAoBmC,UAAU,CAACnC,MAA/B,GAAwCsC,SAAS,CAACtC,MAX9C;AAY5BsD,IAAAA,oBAAoB,EAAExB,UAAU,CAAC9B,MAAX,GAAoBmC,UAAU,CAACnC,MAA/B,GAAwC2C,aAAa,CAAC3C;AAZhD,GAAf,CAAf;AAgBA,QAAMuD,SAAS,GAAG3G,eAAe,CAACkG,MAAD,CAAjC;AACA,QAAMU,IAAI,GAAGvD,MAAM,CAACyC,MAAP,CAAc,CAACa,SAAD,EAAYzB,UAAZ,EAAwBK,UAAxB,EAAoCQ,aAApC,CAAd,CAAb;AACA,SAAO;AAACG,IAAAA,MAAD;AAASW,IAAAA,UAAU,EAAEF,SAAS,CAACvD,MAA/B;AAAuCwD,IAAAA;AAAvC,GAAP;AACD;;AAKD,eAAeW,iBAAf,CACEvC,MADF,EAEEwC,MAFF,EAGE3E,MAHF,EAIEjC,IAJF,EASG;AACD,QAAMqE,IAAI,GAAGuC,MAAM,CAACC,UAAP,CAAkBzC,MAAM,CAACrE,IAAP,CAAY+G,IAAZ,EAAlB,CAAb;AACA,QAAM/D,UAAU,GAAG,CAAC/C,IAAI,CAAC+C,UAAL,IAAmB,CAApB,IAAyBd,MAA5C;AAGA,MAAI8E,OAAJ;AAEA,MAAIC,uBAAuB,GAAG,CAA9B;AAEA,MAAIC,qBAAqB,GAAG,CAA5B;AACA;AACE,UAAMC,MAAM,GAAGlH,IAAI,CAACoC,aAAL,GACX,MAAM8D,gBAAgB,CAAC9B,MAAD,EAASC,IAAT,EAAeuC,MAAM,CAAC3F,QAAtB,CADX,GAEX,MAAMkD,cAAc,CAACC,MAAD,EAASC,IAAT,CAFxB;AAIA0C,IAAAA,OAAO,GAAGG,MAAM,CAAClB,IAAjB;AACAgB,IAAAA,uBAAuB,IAAIE,MAAM,CAAC5B,MAAP,CAAcO,sBAAd,GAAuCqB,MAAM,CAACjB,UAAzE;AACAgB,IAAAA,qBAAqB,IAAIC,MAAM,CAAC5B,MAAP,CAAcQ,oBAAd,GAAqCoB,MAAM,CAACjB,UAArE;AACD;AAMD,QAAMhD,QAAQ,GAAG,IAAI/E,cAAJ,CAAmB;AAClCiJ,IAAAA,cAAc,EAAE/C,MAAM,CAACrE,IADW;AAElC0F,IAAAA,UAAU,EAAEpB,IAAI,CAACqB,KAFiB;AAGlC0B,IAAAA,gBAAgB,EAAErE,UAHgB;AAIlCsE,IAAAA,SAAS,EAAE,EAJuB;AAKlCL,IAAAA,uBALkC;AAMlCC,IAAAA,qBANkC;AAOlChD,IAAAA,IAAI,EAAElF,IAAI,CAACqF,MAAM,CAACW,aAAR,CAPwB;AAQlCuC,IAAAA,KAAK,EAAEnJ,gBAAgB,CAACiG,MAAM,CAACiB,WAAR;AARW,GAAnB,CAAjB;AAYApC,EAAAA,QAAQ,CAACoE,SAAT,CAAmBrE,IAAnB,CAAwBzE,QAAQ,CAACoB,sBAAD,CAAhC;AACAsD,EAAAA,QAAQ,CAACoE,SAAT,CAAmBrE,IAAnB,CAAwBzE,QAAQ,CAAC6F,MAAM,CAACT,QAAR,CAAhC;AAGA,QAAM4D,cAAc,GAAGxE,UAAU,GAAGgE,OAAO,CAACvE,MAA5C;AACA,QAAMU,IAAI,GAAGT,MAAM,CAACyC,MAAP,CAAc,CAAC6B,OAAD,EAAU3H,eAAe,CAAC6D,QAAD,CAAzB,CAAd,CAAb;AACA,SAAO;AAACC,IAAAA,IAAD;AAAOD,IAAAA,QAAP;AAAiBsE,IAAAA;AAAjB,GAAP;AACD;;AAKD,eAAezE,cAAf,CACEhD,MADF,EAEEuE,IAFF,EAGErE,IAHF,EAOG;AACD,QAAMiD,QAAQ,GAAG,IAAIpE,QAAJ,CAAa;AAC5B0H,IAAAA,QAAQ,EAAElC,IAAI,CAACpD,QADa;AAE5BuG,IAAAA,OAAO,EAAE,EAFmB;AAG5BC,IAAAA,eAAe,EAAE;AAHW,GAAb,CAAjB;AAMA,MAAIvE,IAAI,GAAGT,MAAM,CAAC8B,KAAP,CAAa,CAAb,CAAX;;AACA,OAAK,MAAMmD,KAAX,IAAoB5H,MAAM,CAAC6H,SAA3B,EAAsC;AACpC,QAAID,KAAK,CAACE,QAAV,EAAoB;AAClB;AACD;;AAED,UAAMC,UAAU,GAAG,MAAMlB,iBAAiB,CAACe,KAAD,EAAQrD,IAAR,EAAcnB,IAAI,CAACV,MAAnB,EAA2BxC,IAA3B,CAA1C;AAEA,UAAM8H,MAAM,GAAG,IAAI7J,WAAJ,CAAgB;AAC7B8J,MAAAA,WAAW,EAAEF,UAAU,CAACN,cADK;AAE7BS,MAAAA,SAAS,EAAEH,UAAU,CAAC5E;AAFO,KAAhB,CAAf;AAKAA,IAAAA,QAAQ,CAACuE,OAAT,CAAiBxE,IAAjB,CAAsB8E,MAAtB;AACA7E,IAAAA,QAAQ,CAACwE,eAAT,GAA2B,IAAIpI,KAAJ,CAAU4I,MAAM,CAAChF,QAAQ,CAACwE,eAAV,CAAN,GAAmCI,UAAU,CAAC3E,IAAX,CAAgBV,MAA7D,CAA3B;AAEAU,IAAAA,IAAI,GAAGT,MAAM,CAACyC,MAAP,CAAc,CAAChC,IAAD,EAAO2E,UAAU,CAAC3E,IAAlB,CAAd,CAAP;AACD;;AAED,SAAO;AAACA,IAAAA,IAAD;AAAOD,IAAAA;AAAP,GAAP;AACD;;AAKD,SAASE,YAAT,CACErD,MADF,EAEEmB,QAFF,EAGEiB,SAHF,EAIEzB,YAJF,EAKU;AACR,QAAMwC,QAAQ,GAAG,IAAIxE,YAAJ,CAAiB;AAChCyJ,IAAAA,OAAO,EAAE3I,eADuB;AAEhC4I,IAAAA,UAAU,EAAE,UAFoB;AAGhC5B,IAAAA,QAAQ,EAAEtF,QAHsB;AAIhCmH,IAAAA,UAAU,EAAElG,SAJoB;AAKhCpC,IAAAA,MAAM,EAAE,EALwB;AAMhCuI,IAAAA,kBAAkB,EAAE;AANY,GAAjB,CAAjB;;AASA,OAAK,MAAMhH,GAAX,IAAkBZ,YAAlB,EAAgC;AAAA;;AAC9B,UAAM6H,EAAE,GAAG,IAAI5J,QAAJ,CAAa;AACtB2C,MAAAA,GADsB;AAEtBC,MAAAA,KAAK,EAAEb,YAAY,CAACY,GAAD;AAFG,KAAb,CAAX;AAIA,6BAAA4B,
QAAQ,CAACoF,kBAAT,0GAA6BrF,IAA7B,8GAAoCsF,EAApC;AACD;;AAED;AACE,UAAMC,UAAU,GAAG,IAAIzJ,aAAJ,CAAkB;AACnC0J,MAAAA,IAAI,EAAE,MAD6B;AAEnCC,MAAAA,YAAY,EAAEC,MAAM,CAACC,IAAP,CAAY7I,MAAM,CAAC8I,MAAnB,EAA2BpG;AAFN,KAAlB,CAAnB;AAIAS,IAAAA,QAAQ,CAACnD,MAAT,CAAgBkD,IAAhB,CAAqBuF,UAArB;AACD;;AAED,OAAK,MAAMb,KAAX,IAAoB5H,MAAM,CAAC6H,SAA3B,EAAsC;AACpC,UAAMkB,IAAI,GAAGrK,mBAAmB,CAACkJ,KAAK,CAACoB,cAAP,CAAhC;AACA,UAAMC,UAAU,GAAG,IAAIjK,aAAJ,CAAkB;AACnC0J,MAAAA,IAAI,EAAEd,KAAK,CAACc,IADuB;AAEnCQ,MAAAA,eAAe,EAAEH;AAFkB,KAAlB,CAAnB;;AAKA,QAAInB,KAAK,CAACE,QAAV,EAAoB;AAClBmB,MAAAA,UAAU,CAACN,YAAX,GAA0Bf,KAAK,CAACuB,UAAhC;AACD,KAFD,MAEO;AACLF,MAAAA,UAAU,CAAC9E,IAAX,GAAkBlF,IAAI,CAAC2I,KAAK,CAAC3C,aAAP,CAAtB;AACD;;AAED,QAAI2C,KAAK,CAACwB,YAAV,EAAwB;AACtBH,MAAAA,UAAU,CAACI,cAAX,GAA4B/K,aAAa,CAACsJ,KAAK,CAACwB,YAAP,CAAzC;AACD;;AAEDH,IAAAA,UAAU,CAACK,WAAX,GAAyB1B,KAAK,CAAC1C,UAA/B;AAEA/B,IAAAA,QAAQ,CAACnD,MAAT,CAAgBkD,IAAhB,CAAqB+F,UAArB;AACD;;AAED,QAAMM,eAAe,GAAGjK,eAAe,CAAC6D,QAAD,CAAvC;AACA,QAAMqG,aAAa,GAAG7G,MAAM,CAAC8B,KAAP,CAAa8E,eAAe,CAAC7G,MAAhB,GAAyB,CAAtC,CAAtB;AACA6G,EAAAA,eAAe,CAACE,IAAhB,CAAqBD,aAArB;AACAA,EAAAA,aAAa,CAACE,aAAd,CAA4BH,eAAe,CAAC7G,MAA5C,EAAoD6G,eAAe,CAAC7G,MAApE;AACA8G,EAAAA,aAAa,CAACtH,KAAd,CAAoB1C,aAApB,EAAmC+J,eAAe,CAAC7G,MAAhB,GAAyB,CAA5D;AACA,SAAO8G,aAAP;AACD","sourcesContent":["// Forked from https://github.com/kbajalc/parquets under MIT license (Copyright (c) 2017 ironSource Ltd.)\n/* eslint-disable camelcase */\nimport {Transform, Writable} from 'stream';\nimport {ParquetCodecOptions, PARQUET_CODECS} from '../codecs';\nimport * as Compression from '../compression';\nimport {\n ParquetBuffer,\n ParquetCodec,\n ParquetData,\n ParquetField,\n PrimitiveType\n} from '../schema/declare';\nimport {ParquetSchema} from '../schema/schema';\nimport * as Shred from '../schema/shred';\nimport {\n ColumnChunk,\n ColumnMetaData,\n CompressionCodec,\n ConvertedType,\n DataPageHeader,\n DataPageHeaderV2,\n Encoding,\n FieldRepetitionType,\n FileMetaData,\n KeyValue,\n PageHeader,\n PageType,\n RowGroup,\n SchemaElement,\n Type\n} from '../parquet-thrift';\nimport {osopen, oswrite, osclose} from '../utils/file-utils';\nimport {getBitWidth, serializeThrift} from '../utils/read-utils';\nimport Int64 from 'node-int64';\n\n/**\n * Parquet File Magic String\n */\nconst PARQUET_MAGIC = 'PAR1';\n\n/**\n * Parquet File Format Version\n */\nconst PARQUET_VERSION = 1;\n\n/**\n * Default Page and Row Group sizes\n */\nconst PARQUET_DEFAULT_PAGE_SIZE = 8192;\nconst PARQUET_DEFAULT_ROW_GROUP_SIZE = 4096;\n\n/**\n * Repetition and Definition Level Encoding\n */\nconst PARQUET_RDLVL_TYPE = 'INT32';\nconst PARQUET_RDLVL_ENCODING = 'RLE';\n\nexport interface ParquetWriterOptions {\n baseOffset?: number;\n rowGroupSize?: number;\n pageSize?: number;\n useDataPageV2?: boolean;\n\n // Write Stream Options\n flags?: string;\n encoding?: string;\n fd?: number;\n mode?: number;\n autoClose?: boolean;\n start?: number;\n}\n\n/**\n * Write a parquet file to an output stream. 
The ParquetWriter will perform\n * buffering/batching for performance, so close() must be called after all rows\n * are written.\n */\n// eslint-disable-next-line @typescript-eslint/no-unused-vars\nexport class ParquetWriter<T> {\n /**\n * Convenience method to create a new buffered parquet writer that writes to\n * the specified file\n */\n static async openFile<T>(\n schema: ParquetSchema,\n path: string,\n opts?: ParquetWriterOptions\n ): Promise<ParquetWriter<T>> {\n const outputStream = await osopen(path, opts);\n return ParquetWriter.openStream(schema, outputStream, opts);\n }\n\n /**\n * Convenience method to create a new buffered parquet writer that writes to\n * the specified stream\n */\n static async openStream<T>(\n schema: ParquetSchema,\n outputStream: Writable,\n opts?: ParquetWriterOptions\n ): Promise<ParquetWriter<T>> {\n if (!opts) {\n // tslint:disable-next-line:no-parameter-reassignment\n opts = {};\n }\n\n const envelopeWriter = await ParquetEnvelopeWriter.openStream(schema, outputStream, opts);\n\n return new ParquetWriter(schema, envelopeWriter, opts);\n }\n\n public schema: ParquetSchema;\n public envelopeWriter: ParquetEnvelopeWriter;\n public rowBuffer: ParquetBuffer;\n public rowGroupSize: number;\n public closed: boolean;\n public userMetadata: Record<string, string>;\n\n /**\n * Create a new buffered parquet writer for a given envelope writer\n */\n constructor(\n schema: ParquetSchema,\n envelopeWriter: ParquetEnvelopeWriter,\n opts: ParquetWriterOptions\n ) {\n this.schema = schema;\n this.envelopeWriter = envelopeWriter;\n // @ts-ignore Row buffer typings...\n this.rowBuffer = {};\n this.rowGroupSize = opts.rowGroupSize || PARQUET_DEFAULT_ROW_GROUP_SIZE;\n this.closed = false;\n this.userMetadata = {};\n\n // eslint-disable-next-line @typescript-eslint/no-floating-promises\n this.writeHeader();\n }\n\n async writeHeader(): Promise<void> {\n // TODO - better not mess with promises in the constructor\n try {\n await this.envelopeWriter.writeHeader();\n } catch (err) {\n await this.envelopeWriter.close();\n throw err;\n }\n }\n\n /**\n * Append a single row to the parquet file. Rows are buffered in memory until\n * rowGroupSize rows are in the buffer or close() is called\n */\n async appendRow<T>(row: T): Promise<void> {\n if (this.closed) {\n throw new Error('writer was closed');\n }\n Shred.shredRecord(this.schema, row, this.rowBuffer);\n if (this.rowBuffer.rowCount >= this.rowGroupSize) {\n // @ts-ignore\n this.rowBuffer = {};\n }\n }\n\n /**\n * Finish writing the parquet file and commit the footer to disk. This method\n * MUST be called after you are finished adding rows. You must not call this\n * method twice on the same object or add any rows after the close() method has\n * been called\n */\n async close(callback?: () => void): Promise<void> {\n if (this.closed) {\n throw new Error('writer was closed');\n }\n\n this.closed = true;\n\n if (this.rowBuffer.rowCount > 0 || this.rowBuffer.rowCount >= this.rowGroupSize) {\n // @ts-ignore\n this.rowBuffer = {};\n }\n\n await this.envelopeWriter.writeFooter(this.userMetadata);\n await this.envelopeWriter.close();\n // this.envelopeWriter = null;\n\n if (callback) {\n callback();\n }\n }\n\n /**\n * Add key<>value metadata to the file\n */\n setMetadata(key: string, value: string): void {\n // TODO: value to be any, obj -> JSON\n this.userMetadata[String(key)] = String(value);\n }\n\n /**\n * Set the parquet row group size. 
This values controls the maximum number\n * of rows that are buffered in memory at any given time as well as the number\n * of rows that are co-located on disk. A higher value is generally better for\n * read-time I/O performance at the tradeoff of write-time memory usage.\n */\n setRowGroupSize(cnt: number): void {\n this.rowGroupSize = cnt;\n }\n\n /**\n * Set the parquet data page size. The data page size controls the maximum\n * number of column values that are written to disk as a consecutive array\n */\n setPageSize(cnt: number): void {\n this.envelopeWriter.setPageSize(cnt);\n }\n}\n\n/**\n * Create a parquet file from a schema and a number of row groups. This class\n * performs direct, unbuffered writes to the underlying output stream and is\n * intendend for advanced and internal users; the writeXXX methods must be\n * called in the correct order to produce a valid file.\n */\nexport class ParquetEnvelopeWriter {\n /**\n * Create a new parquet envelope writer that writes to the specified stream\n */\n static async openStream(\n schema: ParquetSchema,\n outputStream: Writable,\n opts: ParquetWriterOptions\n ): Promise<ParquetEnvelopeWriter> {\n const writeFn = oswrite.bind(undefined, outputStream);\n const closeFn = osclose.bind(undefined, outputStream);\n return new ParquetEnvelopeWriter(schema, writeFn, closeFn, 0, opts);\n }\n\n public schema: ParquetSchema;\n public write: (buf: Buffer) => Promise<void>;\n public close: () => Promise<void>;\n public offset: number;\n public rowCount: number;\n public rowGroups: RowGroup[];\n public pageSize: number;\n public useDataPageV2: boolean;\n\n constructor(\n schema: ParquetSchema,\n writeFn: (buf: Buffer) => Promise<void>,\n closeFn: () => Promise<void>,\n fileOffset: number,\n opts: ParquetWriterOptions\n ) {\n this.schema = schema;\n this.write = writeFn;\n this.close = closeFn;\n this.offset = fileOffset;\n this.rowCount = 0;\n this.rowGroups = [];\n this.pageSize = opts.pageSize || PARQUET_DEFAULT_PAGE_SIZE;\n this.useDataPageV2 = 'useDataPageV2' in opts ? Boolean(opts.useDataPageV2) : false;\n }\n\n writeSection(buf: Buffer): Promise<void> {\n this.offset += buf.length;\n return this.write(buf);\n }\n\n /**\n * Encode the parquet file header\n */\n writeHeader(): Promise<void> {\n return this.writeSection(Buffer.from(PARQUET_MAGIC));\n }\n\n /**\n * Encode a parquet row group. The records object should be created using the\n * shredRecord method\n */\n async writeRowGroup(records: ParquetBuffer): Promise<void> {\n const rgroup = await encodeRowGroup(this.schema, records, {\n baseOffset: this.offset,\n pageSize: this.pageSize,\n useDataPageV2: this.useDataPageV2\n });\n\n this.rowCount += records.rowCount;\n this.rowGroups.push(rgroup.metadata);\n return await this.writeSection(rgroup.body);\n }\n\n /**\n * Write the parquet file footer\n */\n writeFooter(userMetadata: Record<string, string>): Promise<void> {\n if (!userMetadata) {\n // tslint:disable-next-line:no-parameter-reassignment\n userMetadata = {};\n }\n\n return this.writeSection(\n encodeFooter(this.schema, this.rowCount, this.rowGroups, userMetadata)\n );\n }\n\n /**\n * Set the parquet data page size. 
The data page size controls the maximum\n * number of column values that are written to disk as a consecutive array\n */\n setPageSize(cnt: number): void {\n this.pageSize = cnt;\n }\n}\n\n/**\n * Create a parquet transform stream\n */\nexport class ParquetTransformer<T> extends Transform {\n public writer: ParquetWriter<T>;\n\n constructor(schema: ParquetSchema, opts: ParquetWriterOptions = {}) {\n super({objectMode: true});\n\n const writeProxy = (function (t: ParquetTransformer<any>) {\n return async function (b: any): Promise<void> {\n t.push(b);\n };\n })(this);\n\n this.writer = new ParquetWriter(\n schema,\n new ParquetEnvelopeWriter(schema, writeProxy, async () => {}, 0, opts),\n opts\n );\n }\n\n // tslint:disable-next-line:function-name\n _transform(row: any, encoding: string, callback: (val?: any) => void): Promise<void> {\n if (row) {\n return this.writer.appendRow(row).then(callback);\n }\n callback();\n return Promise.resolve();\n }\n\n // tslint:disable-next-line:function-name\n async _flush(callback: (val?: any) => void) {\n await this.writer.close(callback);\n }\n}\n\n/**\n * Encode a consecutive array of data using one of the parquet encodings\n */\nfunction encodeValues(\n type: PrimitiveType,\n encoding: ParquetCodec,\n values: any[],\n opts: ParquetCodecOptions\n) {\n if (!(encoding in PARQUET_CODECS)) {\n throw new Error(`invalid encoding: ${encoding}`);\n }\n return PARQUET_CODECS[encoding].encodeValues(type, values, opts);\n}\n\n/**\n * Encode a parquet data page\n */\nasync function encodeDataPage(\n column: ParquetField,\n data: ParquetData\n): Promise<{\n header: PageHeader;\n headerSize: number;\n page: Buffer;\n}> {\n /* encode repetition and definition levels */\n let rLevelsBuf = Buffer.alloc(0);\n if (column.rLevelMax > 0) {\n rLevelsBuf = encodeValues(PARQUET_RDLVL_TYPE, PARQUET_RDLVL_ENCODING, data.rlevels, {\n bitWidth: getBitWidth(column.rLevelMax)\n // disableEnvelope: false\n });\n }\n\n let dLevelsBuf = Buffer.alloc(0);\n if (column.dLevelMax > 0) {\n dLevelsBuf = encodeValues(PARQUET_RDLVL_TYPE, PARQUET_RDLVL_ENCODING, data.dlevels, {\n bitWidth: getBitWidth(column.dLevelMax)\n // disableEnvelope: false\n });\n }\n\n /* encode values */\n const valuesBuf = encodeValues(column.primitiveType!, column.encoding!, data.values, {\n typeLength: column.typeLength,\n bitWidth: column.typeLength\n });\n\n const dataBuf = Buffer.concat([rLevelsBuf, dLevelsBuf, valuesBuf]);\n\n // compression = column.compression === 'UNCOMPRESSED' ? (compression || 'UNCOMPRESSED') : column.compression;\n const compressedBuf = await Compression.deflate(column.compression!, dataBuf);\n\n /* build page header */\n const header = new PageHeader({\n type: PageType.DATA_PAGE,\n data_page_header: new DataPageHeader({\n num_values: data.count,\n encoding: Encoding[column.encoding!] 
as any,\n definition_level_encoding: Encoding[PARQUET_RDLVL_ENCODING], // [PARQUET_RDLVL_ENCODING],\n repetition_level_encoding: Encoding[PARQUET_RDLVL_ENCODING] // [PARQUET_RDLVL_ENCODING]\n }),\n uncompressed_page_size: dataBuf.length,\n compressed_page_size: compressedBuf.length\n });\n\n /* concat page header, repetition and definition levels and values */\n const headerBuf = serializeThrift(header);\n const page = Buffer.concat([headerBuf, compressedBuf]);\n\n return {header, headerSize: headerBuf.length, page};\n}\n\n/**\n * Encode a parquet data page (v2)\n */\nasync function encodeDataPageV2(\n column: ParquetField,\n data: ParquetData,\n rowCount: number\n): Promise<{\n header: PageHeader;\n headerSize: number;\n page: Buffer;\n}> {\n /* encode values */\n const valuesBuf = encodeValues(column.primitiveType!, column.encoding!, data.values, {\n typeLength: column.typeLength,\n bitWidth: column.typeLength\n });\n\n // compression = column.compression === 'UNCOMPRESSED' ? (compression || 'UNCOMPRESSED') : column.compression;\n const compressedBuf = await Compression.deflate(column.compression!, valuesBuf);\n\n /* encode repetition and definition levels */\n let rLevelsBuf = Buffer.alloc(0);\n if (column.rLevelMax > 0) {\n rLevelsBuf = encodeValues(PARQUET_RDLVL_TYPE, PARQUET_RDLVL_ENCODING, data.rlevels, {\n bitWidth: getBitWidth(column.rLevelMax),\n disableEnvelope: true\n });\n }\n\n let dLevelsBuf = Buffer.alloc(0);\n if (column.dLevelMax > 0) {\n dLevelsBuf = encodeValues(PARQUET_RDLVL_TYPE, PARQUET_RDLVL_ENCODING, data.dlevels, {\n bitWidth: getBitWidth(column.dLevelMax),\n disableEnvelope: true\n });\n }\n\n /* build page header */\n const header = new PageHeader({\n type: PageType.DATA_PAGE_V2,\n data_page_header_v2: new DataPageHeaderV2({\n num_values: data.count,\n num_nulls: data.count - data.values.length,\n num_rows: rowCount,\n encoding: Encoding[column.encoding!] as any,\n definition_levels_byte_length: dLevelsBuf.length,\n repetition_levels_byte_length: rLevelsBuf.length,\n is_compressed: column.compression !== 'UNCOMPRESSED'\n }),\n uncompressed_page_size: rLevelsBuf.length + dLevelsBuf.length + valuesBuf.length,\n compressed_page_size: rLevelsBuf.length + dLevelsBuf.length + compressedBuf.length\n });\n\n /* concat page header, repetition and definition levels and values */\n const headerBuf = serializeThrift(header);\n const page = Buffer.concat([headerBuf, rLevelsBuf, dLevelsBuf, compressedBuf]);\n return {header, headerSize: headerBuf.length, page};\n}\n\n/**\n * Encode an array of values into a parquet column chunk\n */\nasync function encodeColumnChunk(\n column: ParquetField,\n buffer: ParquetBuffer,\n offset: number,\n opts: ParquetWriterOptions\n): Promise<{\n body: Buffer;\n metadata: ColumnMetaData;\n metadataOffset: number;\n}> {\n const data = buffer.columnData[column.path.join()];\n const baseOffset = (opts.baseOffset || 0) + offset;\n /* encode data page(s) */\n // const pages: Buffer[] = [];\n let pageBuf: Buffer;\n // tslint:disable-next-line:variable-name\n let total_uncompressed_size = 0;\n // tslint:disable-next-line:variable-name\n let total_compressed_size = 0;\n {\n const result = opts.useDataPageV2\n ? 
await encodeDataPageV2(column, data, buffer.rowCount)\n : await encodeDataPage(column, data);\n // pages.push(result.page);\n pageBuf = result.page;\n total_uncompressed_size += result.header.uncompressed_page_size + result.headerSize;\n total_compressed_size += result.header.compressed_page_size + result.headerSize;\n }\n\n // const pagesBuf = Buffer.concat(pages);\n // const compression = column.compression === 'UNCOMPRESSED' ? (opts.compression || 'UNCOMPRESSED') : column.compression;\n\n /* prepare metadata header */\n const metadata = new ColumnMetaData({\n path_in_schema: column.path,\n num_values: data.count,\n data_page_offset: baseOffset,\n encodings: [],\n total_uncompressed_size, // : pagesBuf.length,\n total_compressed_size,\n type: Type[column.primitiveType!],\n codec: CompressionCodec[column.compression!]\n });\n\n /* list encodings */\n metadata.encodings.push(Encoding[PARQUET_RDLVL_ENCODING]);\n metadata.encodings.push(Encoding[column.encoding!]);\n\n /* concat metadata header and data pages */\n const metadataOffset = baseOffset + pageBuf.length;\n const body = Buffer.concat([pageBuf, serializeThrift(metadata)]);\n return {body, metadata, metadataOffset};\n}\n\n/**\n * Encode a list of column values into a parquet row group\n */\nasync function encodeRowGroup(\n schema: ParquetSchema,\n data: ParquetBuffer,\n opts: ParquetWriterOptions\n): Promise<{\n body: Buffer;\n metadata: RowGroup;\n}> {\n const metadata = new RowGroup({\n num_rows: data.rowCount,\n columns: [],\n total_byte_size: 0\n });\n\n let body = Buffer.alloc(0);\n for (const field of schema.fieldList) {\n if (field.isNested) {\n continue; // eslint-disable-line no-continue\n }\n\n const cchunkData = await encodeColumnChunk(field, data, body.length, opts);\n\n const cchunk = new ColumnChunk({\n file_offset: cchunkData.metadataOffset,\n meta_data: cchunkData.metadata\n });\n\n metadata.columns.push(cchunk);\n metadata.total_byte_size = new Int64(Number(metadata.total_byte_size) + cchunkData.body.length);\n\n body = Buffer.concat([body, cchunkData.body]);\n }\n\n return {body, metadata};\n}\n\n/**\n * Encode a parquet file metadata footer\n */\nfunction encodeFooter(\n schema: ParquetSchema,\n rowCount: number,\n rowGroups: RowGroup[],\n userMetadata: Record<string, string>\n): Buffer {\n const metadata = new FileMetaData({\n version: PARQUET_VERSION,\n created_by: 'parquets',\n num_rows: rowCount,\n row_groups: rowGroups,\n schema: [],\n key_value_metadata: []\n });\n\n for (const key in userMetadata) {\n const kv = new KeyValue({\n key,\n value: userMetadata[key]\n });\n metadata.key_value_metadata?.push?.(kv);\n }\n\n {\n const schemaRoot = new SchemaElement({\n name: 'root',\n num_children: Object.keys(schema.fields).length\n });\n metadata.schema.push(schemaRoot);\n }\n\n for (const field of schema.fieldList) {\n const relt = FieldRepetitionType[field.repetitionType];\n const schemaElem = new SchemaElement({\n name: field.name,\n repetition_type: relt as any\n });\n\n if (field.isNested) {\n schemaElem.num_children = field.fieldCount;\n } else {\n schemaElem.type = Type[field.primitiveType!] 
as Type;\n }\n\n if (field.originalType) {\n schemaElem.converted_type = ConvertedType[field.originalType] as ConvertedType;\n }\n\n schemaElem.type_length = field.typeLength;\n\n metadata.schema.push(schemaElem);\n }\n\n const metadataEncoded = serializeThrift(metadata);\n const footerEncoded = Buffer.alloc(metadataEncoded.length + 8);\n metadataEncoded.copy(footerEncoded);\n footerEncoded.writeUInt32LE(metadataEncoded.length, metadataEncoded.length);\n footerEncoded.write(PARQUET_MAGIC, metadataEncoded.length + 4);\n return footerEncoded;\n}\n"],"file":"writer.js"}
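The writer source embedded in the source map above ends with encodeFooter(), which lays the footer out as the Thrift-serialized FileMetaData, then a 4-byte little-endian metadata length, then the 4-byte PARQUET_MAGIC string 'PAR1'. A minimal sketch of reading that layout back, assuming Node Buffer semantics (readFooterLength is an illustrative helper, not part of this package):

    // Footer layout written by encodeFooter():
    // [thrift FileMetaData][uint32 LE metadata length]['PAR1']
    function readFooterLength(file: Buffer): number {
      if (file.slice(file.length - 4).toString() !== 'PAR1') {
        throw new Error('not a parquet file');
      }
      return file.readUInt32LE(file.length - 8); // byte length of the thrift metadata
    }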
@@ -8,5 +8,6 @@ export let CompressionCodec;
   CompressionCodec[CompressionCodec["BROTLI"] = 4] = "BROTLI";
   CompressionCodec[CompressionCodec["LZ4"] = 5] = "LZ4";
   CompressionCodec[CompressionCodec["ZSTD"] = 6] = "ZSTD";
+  CompressionCodec[CompressionCodec["LZ4_RAW"] = 7] = "LZ4_RAW";
 })(CompressionCodec || (CompressionCodec = {}));
 //# sourceMappingURL=CompressionCodec.js.map
@@ -1 +1 @@
-{"version":3,"sources":["../../../../src/parquetjs/parquet-thrift/CompressionCodec.ts"],"names":["CompressionCodec"],"mappings":"AAMA,WAAYA,gBAAZ;;WAAYA,gB;AAAAA,EAAAA,gB,CAAAA,gB;AAAAA,EAAAA,gB,CAAAA,gB;AAAAA,EAAAA,gB,CAAAA,gB;AAAAA,EAAAA,gB,CAAAA,gB;AAAAA,EAAAA,gB,CAAAA,gB;AAAAA,EAAAA,gB,CAAAA,gB;AAAAA,EAAAA,gB,CAAAA,gB;GAAAA,gB,KAAAA,gB","sourcesContent":["/* tslint:disable */\n/* eslint-disable */\n/*\n * Autogenerated by @creditkarma/thrift-typescript v3.7.2\n * DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING\n */\nexport enum CompressionCodec {\n  UNCOMPRESSED = 0,\n  SNAPPY = 1,\n  GZIP = 2,\n  LZO = 3,\n  BROTLI = 4,\n  LZ4 = 5,\n  ZSTD = 6\n}\n"],"file":"CompressionCodec.js"}
+{"version":3,"sources":["../../../../src/parquetjs/parquet-thrift/CompressionCodec.ts"],"names":["CompressionCodec"],"mappings":"AAMA,WAAYA,gBAAZ;;WAAYA,gB;AAAAA,EAAAA,gB,CAAAA,gB;AAAAA,EAAAA,gB,CAAAA,gB;AAAAA,EAAAA,gB,CAAAA,gB;AAAAA,EAAAA,gB,CAAAA,gB;AAAAA,EAAAA,gB,CAAAA,gB;AAAAA,EAAAA,gB,CAAAA,gB;AAAAA,EAAAA,gB,CAAAA,gB;AAAAA,EAAAA,gB,CAAAA,gB;GAAAA,gB,KAAAA,gB","sourcesContent":["/* tslint:disable */\n/* eslint-disable */\n/*\n * Autogenerated by @creditkarma/thrift-typescript v3.7.2\n * DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING\n */\nexport enum CompressionCodec {\n  UNCOMPRESSED = 0,\n  SNAPPY = 1,\n  GZIP = 2,\n  LZO = 3,\n  BROTLI = 4,\n  LZ4 = 5,\n  ZSTD = 6,\n  LZ4_RAW = 7 // Added in 2.9\n}\n"],"file":"CompressionCodec.js"}
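The added enum member above follows TypeScript's standard enum emit, which registers the mapping in both directions; LZ4_RAW = 7 corresponds to the codec introduced in Parquet format 2.9, per the comment carried in the updated source. A two-line illustration of the double assignment:

    // CompressionCodec[CompressionCodec["LZ4_RAW"] = 7] = "LZ4_RAW" yields:
    CompressionCodec.LZ4_RAW; // 7
    CompressionCodec[7];      // 'LZ4_RAW'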
@@ -0,0 +1,300 @@
+import { PARQUET_CODECS } from '../codecs';
+import { ConvertedType, Encoding, FieldRepetitionType, PageType, Type } from '../parquet-thrift';
+import { decompress } from '../compression';
+import { PARQUET_RDLVL_TYPE, PARQUET_RDLVL_ENCODING } from '../../constants';
+import { decodePageHeader, getThriftEnum, getBitWidth } from '../utils/read-utils';
+export async function decodeDataPages(buffer, options) {
+  const cursor = {
+    buffer,
+    offset: 0,
+    size: buffer.length
+  };
+  const data = {
+    rlevels: [],
+    dlevels: [],
+    values: [],
+    pageHeaders: [],
+    count: 0
+  };
+  let dictionary = options.dictionary || [];
+
+  while (cursor.offset < cursor.size && (!options.numValues || data.dlevels.length < Number(options.numValues))) {
+    const page = await decodePage(cursor, options);
+
+    if (page.dictionary) {
+      dictionary = page.dictionary;
+      continue;
+    }
+
+    if (dictionary.length) {
+      page.values = page.values.map(value => dictionary[value]);
+    }
+
+    for (let index = 0; index < page.rlevels.length; index++) {
+      data.rlevels.push(page.rlevels[index]);
+      data.dlevels.push(page.dlevels[index]);
+      const value = page.values[index];
+
+      if (value !== undefined) {
+        data.values.push(value);
+      }
+    }
+
+    data.count += page.count;
+    data.pageHeaders.push(page.pageHeader);
+  }
+
+  return data;
+}
+export async function decodePage(cursor, options) {
+  let page;
+  const {
+    pageHeader,
+    length
+  } = await decodePageHeader(cursor.buffer, cursor.offset);
+  cursor.offset += length;
+  const pageType = getThriftEnum(PageType, pageHeader.type);
+
+  switch (pageType) {
+    case 'DATA_PAGE':
+      page = await decodeDataPage(cursor, pageHeader, options);
+      break;
+
+    case 'DATA_PAGE_V2':
+      page = await decodeDataPageV2(cursor, pageHeader, options);
+      break;
+
+    case 'DICTIONARY_PAGE':
+      page = {
+        dictionary: await decodeDictionaryPage(cursor, pageHeader, options),
+        pageHeader
+      };
+      break;
+
+    default:
+      throw new Error(`invalid page type: ${pageType}`);
+  }
+
+  return page;
+}
+export function decodeSchema(schemaElements, offset, len) {
+  const schema = {};
+  let next = offset;
+
+  for (let i = 0; i < len; i++) {
+    const schemaElement = schemaElements[next];
+    const repetitionType = next > 0 ? getThriftEnum(FieldRepetitionType, schemaElement.repetition_type) : 'ROOT';
+    let optional = false;
+    let repeated = false;
+
+    switch (repetitionType) {
+      case 'REQUIRED':
+        break;
+
+      case 'OPTIONAL':
+        optional = true;
+        break;
+
+      case 'REPEATED':
+        repeated = true;
+        break;
+
+      default:
+        throw new Error('parquet: unknown repetition type');
+    }
+
+    if (schemaElement.num_children > 0) {
+      const res = decodeSchema(schemaElements, next + 1, schemaElement.num_children);
+      next = res.next;
+      schema[schemaElement.name] = {
+        optional,
+        repeated,
+        fields: res.schema
+      };
+    } else {
+      const type = getThriftEnum(Type, schemaElement.type);
+      let logicalType = type;
+
+      if (schemaElement.converted_type) {
+        logicalType = getThriftEnum(ConvertedType, schemaElement.converted_type);
+      }
+
+      switch (logicalType) {
+        case 'DECIMAL':
+          logicalType = `${logicalType}_${type}`;
+          break;
+
+        default:
+      }
+
+      schema[schemaElement.name] = {
+        type: logicalType,
+        typeLength: schemaElement.type_length,
+        presision: schemaElement.precision,
+        scale: schemaElement.scale,
+        optional,
+        repeated
+      };
+      next++;
+    }
+  }
+
+  return {
+    schema,
+    offset,
+    next
+  };
+}
+
+function decodeValues(type, encoding, cursor, count, opts) {
+  if (!(encoding in PARQUET_CODECS)) {
+    throw new Error(`invalid encoding: ${encoding}`);
+  }
+
+  return PARQUET_CODECS[encoding].decodeValues(type, cursor, count, opts);
+}
+
+async function decodeDataPage(cursor, header, options) {
+  var _header$data_page_hea, _header$data_page_hea2, _header$data_page_hea3, _header$data_page_hea4;
+
+  const cursorEnd = cursor.offset + header.compressed_page_size;
+  const valueCount = (_header$data_page_hea = header.data_page_header) === null || _header$data_page_hea === void 0 ? void 0 : _header$data_page_hea.num_values;
+  let dataCursor = cursor;
+
+  if (options.compression !== 'UNCOMPRESSED') {
+    const valuesBuf = await decompress(options.compression, cursor.buffer.slice(cursor.offset, cursorEnd), header.uncompressed_page_size);
+    dataCursor = {
+      buffer: valuesBuf,
+      offset: 0,
+      size: valuesBuf.length
+    };
+    cursor.offset = cursorEnd;
+  }
+
+  const rLevelEncoding = getThriftEnum(Encoding, (_header$data_page_hea2 = header.data_page_header) === null || _header$data_page_hea2 === void 0 ? void 0 : _header$data_page_hea2.repetition_level_encoding);
+  let rLevels = new Array(valueCount);
+
+  if (options.column.rLevelMax > 0) {
+    rLevels = decodeValues(PARQUET_RDLVL_TYPE, rLevelEncoding, dataCursor, valueCount, {
+      bitWidth: getBitWidth(options.column.rLevelMax),
+      disableEnvelope: false
+    });
+  } else {
+    rLevels.fill(0);
+  }
+
+  const dLevelEncoding = getThriftEnum(Encoding, (_header$data_page_hea3 = header.data_page_header) === null || _header$data_page_hea3 === void 0 ? void 0 : _header$data_page_hea3.definition_level_encoding);
+  let dLevels = new Array(valueCount);
+
+  if (options.column.dLevelMax > 0) {
+    dLevels = decodeValues(PARQUET_RDLVL_TYPE, dLevelEncoding, dataCursor, valueCount, {
+      bitWidth: getBitWidth(options.column.dLevelMax),
+      disableEnvelope: false
+    });
+  } else {
+    dLevels.fill(0);
+  }
+
+  let valueCountNonNull = 0;
+
+  for (const dlvl of dLevels) {
+    if (dlvl === options.column.dLevelMax) {
+      valueCountNonNull++;
+    }
+  }
+
+  const valueEncoding = getThriftEnum(Encoding, (_header$data_page_hea4 = header.data_page_header) === null || _header$data_page_hea4 === void 0 ? void 0 : _header$data_page_hea4.encoding);
+  const decodeOptions = {
+    typeLength: options.column.typeLength,
+    bitWidth: options.column.typeLength
+  };
+  const values = decodeValues(options.column.primitiveType, valueEncoding, dataCursor, valueCountNonNull, decodeOptions);
+  return {
+    dlevels: dLevels,
+    rlevels: rLevels,
+    values,
+    count: valueCount,
+    pageHeader: header
+  };
+}
+
+async function decodeDataPageV2(cursor, header, opts) {
+  var _header$data_page_hea5, _header$data_page_hea6, _header$data_page_hea7, _header$data_page_hea8;
+
+  const cursorEnd = cursor.offset + header.compressed_page_size;
+  const valueCount = (_header$data_page_hea5 = header.data_page_header_v2) === null || _header$data_page_hea5 === void 0 ? void 0 : _header$data_page_hea5.num_values;
+  const valueCountNonNull = valueCount - ((_header$data_page_hea6 = header.data_page_header_v2) === null || _header$data_page_hea6 === void 0 ? void 0 : _header$data_page_hea6.num_nulls);
+  const valueEncoding = getThriftEnum(Encoding, (_header$data_page_hea7 = header.data_page_header_v2) === null || _header$data_page_hea7 === void 0 ? void 0 : _header$data_page_hea7.encoding);
+  let rLevels = new Array(valueCount);
+
+  if (opts.column.rLevelMax > 0) {
+    rLevels = decodeValues(PARQUET_RDLVL_TYPE, PARQUET_RDLVL_ENCODING, cursor, valueCount, {
+      bitWidth: getBitWidth(opts.column.rLevelMax),
+      disableEnvelope: true
+    });
+  } else {
+    rLevels.fill(0);
+  }
+
+  let dLevels = new Array(valueCount);
+
+  if (opts.column.dLevelMax > 0) {
+    dLevels = decodeValues(PARQUET_RDLVL_TYPE, PARQUET_RDLVL_ENCODING, cursor, valueCount, {
+      bitWidth: getBitWidth(opts.column.dLevelMax),
+      disableEnvelope: true
+    });
+  } else {
+    dLevels.fill(0);
+  }
+
+  let valuesBufCursor = cursor;
+
+  if ((_header$data_page_hea8 = header.data_page_header_v2) !== null && _header$data_page_hea8 !== void 0 && _header$data_page_hea8.is_compressed) {
+    const valuesBuf = await decompress(opts.compression, cursor.buffer.slice(cursor.offset, cursorEnd), header.uncompressed_page_size);
+    valuesBufCursor = {
+      buffer: valuesBuf,
+      offset: 0,
+      size: valuesBuf.length
+    };
+    cursor.offset = cursorEnd;
+  }
+
+  const decodeOptions = {
+    typeLength: opts.column.typeLength,
+    bitWidth: opts.column.typeLength
+  };
+  const values = decodeValues(opts.column.primitiveType, valueEncoding, valuesBufCursor, valueCountNonNull, decodeOptions);
+  return {
+    dlevels: dLevels,
+    rlevels: rLevels,
+    values,
+    count: valueCount,
+    pageHeader: header
+  };
+}
+
+async function decodeDictionaryPage(cursor, pageHeader, options) {
+  var _pageHeader$dictionar;
+
+  const cursorEnd = cursor.offset + pageHeader.compressed_page_size;
+  let dictCursor = {
+    offset: 0,
+    buffer: cursor.buffer.slice(cursor.offset, cursorEnd),
+    size: cursorEnd - cursor.offset
+  };
+  cursor.offset = cursorEnd;
+
+  if (options.compression !== 'UNCOMPRESSED') {
+    const valuesBuf = await decompress(options.compression, dictCursor.buffer.slice(dictCursor.offset, cursorEnd), pageHeader.uncompressed_page_size);
+    dictCursor = {
+      buffer: valuesBuf,
+      offset: 0,
+      size: valuesBuf.length
+    };
+    cursor.offset = cursorEnd;
+  }
+
+  const numValues = (pageHeader === null || pageHeader === void 0 ? void 0 : (_pageHeader$dictionar = pageHeader.dictionary_page_header) === null || _pageHeader$dictionar === void 0 ? void 0 : _pageHeader$dictionar.num_values) || 0;
+  return decodeValues(options.column.primitiveType, options.column.encoding, dictCursor, numValues, options).map(d => d.toString());
+}
+//# sourceMappingURL=decoders.js.map
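The new parser/decoders.js module above exports decodeDataPages, which walks a column chunk buffer page by page, substitutes dictionary-encoded indices with values from the most recent DICTIONARY_PAGE, and accumulates repetition levels, definition levels, and non-null values. A minimal usage sketch, assuming a column descriptor carrying the fields the decoders read (primitiveType, encoding, typeLength, rLevelMax, dLevelMax); the import path and readColumnChunk name are illustrative only:

    import {decodeDataPages} from '@loaders.gl/parquet/dist/esm/parquetjs/parser/decoders';

    // Decode all pages in one column chunk's raw byte buffer.
    async function readColumnChunk(chunkBuffer: Buffer, column: any, compression = 'UNCOMPRESSED') {
      // Resolves to {rlevels, dlevels, values, pageHeaders, count}
      return await decodeDataPages(chunkBuffer, {column, compression});
    }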