@loaders.gl/parquet 4.2.0-alpha.4 → 4.2.0-alpha.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +269 -616
- package/dist/index.cjs.map +7 -0
- package/dist/index.d.ts +12 -12
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +6 -1
- package/dist/lib/arrow/convert-columns-to-row-group.js +1 -2
- package/dist/lib/arrow/convert-row-group-to-columns.js +8 -6
- package/dist/lib/arrow/convert-schema-from-parquet.d.ts +3 -3
- package/dist/lib/arrow/convert-schema-from-parquet.d.ts.map +1 -1
- package/dist/lib/arrow/convert-schema-from-parquet.js +71 -82
- package/dist/lib/arrow/convert-schema-to-parquet.d.ts +1 -1
- package/dist/lib/arrow/convert-schema-to-parquet.d.ts.map +1 -1
- package/dist/lib/arrow/convert-schema-to-parquet.js +65 -35
- package/dist/lib/constants.js +14 -1
- package/dist/lib/parsers/get-parquet-schema.d.ts +1 -1
- package/dist/lib/parsers/get-parquet-schema.d.ts.map +1 -1
- package/dist/lib/parsers/get-parquet-schema.js +9 -7
- package/dist/lib/parsers/parse-geoparquet.d.ts +1 -1
- package/dist/lib/parsers/parse-geoparquet.d.ts.map +1 -1
- package/dist/lib/parsers/parse-geoparquet.js +47 -45
- package/dist/lib/parsers/parse-parquet-to-columns.d.ts +1 -1
- package/dist/lib/parsers/parse-parquet-to-columns.d.ts.map +1 -1
- package/dist/lib/parsers/parse-parquet-to-columns.js +36 -25
- package/dist/lib/parsers/parse-parquet.d.ts +1 -1
- package/dist/lib/parsers/parse-parquet.d.ts.map +1 -1
- package/dist/lib/parsers/parse-parquet.js +62 -50
- package/dist/lib/wasm/encode-parquet-wasm.d.ts +1 -1
- package/dist/lib/wasm/encode-parquet-wasm.d.ts.map +1 -1
- package/dist/lib/wasm/encode-parquet-wasm.js +49 -12
- package/dist/lib/wasm/load-wasm.js +13 -10
- package/dist/lib/wasm/parse-parquet-wasm.d.ts +1 -1
- package/dist/lib/wasm/parse-parquet-wasm.d.ts.map +1 -1
- package/dist/lib/wasm/parse-parquet-wasm.js +19 -15
- package/dist/parquet-loader.d.ts +1 -1
- package/dist/parquet-loader.d.ts.map +1 -1
- package/dist/parquet-loader.js +77 -64
- package/dist/parquet-wasm-loader.js +25 -28
- package/dist/parquet-wasm-writer.js +19 -23
- package/dist/parquet-writer.js +16 -12
- package/dist/parquetjs/codecs/declare.d.ts +1 -1
- package/dist/parquetjs/codecs/declare.d.ts.map +1 -1
- package/dist/parquetjs/codecs/declare.js +0 -1
- package/dist/parquetjs/codecs/dictionary.js +4 -8
- package/dist/parquetjs/codecs/index.d.ts +3 -3
- package/dist/parquetjs/codecs/index.d.ts.map +1 -1
- package/dist/parquetjs/codecs/index.js +20 -17
- package/dist/parquetjs/codecs/plain.d.ts +2 -2
- package/dist/parquetjs/codecs/plain.d.ts.map +1 -1
- package/dist/parquetjs/codecs/plain.js +166 -162
- package/dist/parquetjs/codecs/rle.d.ts +2 -2
- package/dist/parquetjs/codecs/rle.d.ts.map +1 -1
- package/dist/parquetjs/codecs/rle.js +124 -105
- package/dist/parquetjs/compression.d.ts +1 -1
- package/dist/parquetjs/compression.d.ts.map +1 -1
- package/dist/parquetjs/compression.js +157 -43
- package/dist/parquetjs/encoder/parquet-encoder.d.ts +3 -3
- package/dist/parquetjs/encoder/parquet-encoder.d.ts.map +1 -1
- package/dist/parquetjs/encoder/parquet-encoder.js +406 -275
- package/dist/parquetjs/parquet-thrift/BoundaryOrder.js +12 -7
- package/dist/parquetjs/parquet-thrift/BsonType.js +31 -27
- package/dist/parquetjs/parquet-thrift/ColumnChunk.d.ts +1 -1
- package/dist/parquetjs/parquet-thrift/ColumnChunk.d.ts.map +1 -1
- package/dist/parquetjs/parquet-thrift/ColumnChunk.js +175 -166
- package/dist/parquetjs/parquet-thrift/ColumnIndex.d.ts +1 -1
- package/dist/parquetjs/parquet-thrift/ColumnIndex.d.ts.map +1 -1
- package/dist/parquetjs/parquet-thrift/ColumnIndex.js +178 -166
- package/dist/parquetjs/parquet-thrift/ColumnMetaData.d.ts +6 -6
- package/dist/parquetjs/parquet-thrift/ColumnMetaData.d.ts.map +1 -1
- package/dist/parquetjs/parquet-thrift/ColumnMetaData.js +359 -333
- package/dist/parquetjs/parquet-thrift/ColumnOrder.d.ts +1 -1
- package/dist/parquetjs/parquet-thrift/ColumnOrder.d.ts.map +1 -1
- package/dist/parquetjs/parquet-thrift/ColumnOrder.js +73 -68
- package/dist/parquetjs/parquet-thrift/CompressionCodec.js +17 -12
- package/dist/parquetjs/parquet-thrift/ConvertedType.js +31 -26
- package/dist/parquetjs/parquet-thrift/DataPageHeader.d.ts +2 -2
- package/dist/parquetjs/parquet-thrift/DataPageHeader.d.ts.map +1 -1
- package/dist/parquetjs/parquet-thrift/DataPageHeader.js +136 -124
- package/dist/parquetjs/parquet-thrift/DataPageHeaderV2.d.ts +2 -2
- package/dist/parquetjs/parquet-thrift/DataPageHeaderV2.d.ts.map +1 -1
- package/dist/parquetjs/parquet-thrift/DataPageHeaderV2.js +198 -181
- package/dist/parquetjs/parquet-thrift/DateType.js +31 -27
- package/dist/parquetjs/parquet-thrift/DecimalType.js +76 -69
- package/dist/parquetjs/parquet-thrift/DictionaryPageHeader.d.ts +1 -1
- package/dist/parquetjs/parquet-thrift/DictionaryPageHeader.d.ts.map +1 -1
- package/dist/parquetjs/parquet-thrift/DictionaryPageHeader.js +93 -86
- package/dist/parquetjs/parquet-thrift/Encoding.js +17 -12
- package/dist/parquetjs/parquet-thrift/EnumType.js +31 -27
- package/dist/parquetjs/parquet-thrift/FieldRepetitionType.js +12 -7
- package/dist/parquetjs/parquet-thrift/FileMetaData.d.ts +4 -4
- package/dist/parquetjs/parquet-thrift/FileMetaData.d.ts.map +1 -1
- package/dist/parquetjs/parquet-thrift/FileMetaData.js +217 -205
- package/dist/parquetjs/parquet-thrift/IndexPageHeader.js +31 -27
- package/dist/parquetjs/parquet-thrift/IntType.js +76 -69
- package/dist/parquetjs/parquet-thrift/JsonType.js +31 -27
- package/dist/parquetjs/parquet-thrift/KeyValue.js +73 -67
- package/dist/parquetjs/parquet-thrift/ListType.js +31 -27
- package/dist/parquetjs/parquet-thrift/LogicalType.d.ts +13 -13
- package/dist/parquetjs/parquet-thrift/LogicalType.d.ts.map +1 -1
- package/dist/parquetjs/parquet-thrift/LogicalType.js +325 -344
- package/dist/parquetjs/parquet-thrift/MapType.js +31 -27
- package/dist/parquetjs/parquet-thrift/MicroSeconds.js +31 -27
- package/dist/parquetjs/parquet-thrift/MilliSeconds.js +31 -27
- package/dist/parquetjs/parquet-thrift/NullType.js +31 -27
- package/dist/parquetjs/parquet-thrift/OffsetIndex.d.ts +1 -1
- package/dist/parquetjs/parquet-thrift/OffsetIndex.d.ts.map +1 -1
- package/dist/parquetjs/parquet-thrift/OffsetIndex.js +66 -60
- package/dist/parquetjs/parquet-thrift/PageEncodingStats.d.ts +2 -2
- package/dist/parquetjs/parquet-thrift/PageEncodingStats.d.ts.map +1 -1
- package/dist/parquetjs/parquet-thrift/PageEncodingStats.js +98 -88
- package/dist/parquetjs/parquet-thrift/PageHeader.d.ts +5 -5
- package/dist/parquetjs/parquet-thrift/PageHeader.d.ts.map +1 -1
- package/dist/parquetjs/parquet-thrift/PageHeader.js +185 -175
- package/dist/parquetjs/parquet-thrift/PageLocation.js +108 -96
- package/dist/parquetjs/parquet-thrift/PageType.js +13 -8
- package/dist/parquetjs/parquet-thrift/RowGroup.d.ts +2 -2
- package/dist/parquetjs/parquet-thrift/RowGroup.d.ts.map +1 -1
- package/dist/parquetjs/parquet-thrift/RowGroup.js +145 -133
- package/dist/parquetjs/parquet-thrift/SchemaElement.d.ts +4 -4
- package/dist/parquetjs/parquet-thrift/SchemaElement.d.ts.map +1 -1
- package/dist/parquetjs/parquet-thrift/SchemaElement.js +211 -205
- package/dist/parquetjs/parquet-thrift/SortingColumn.js +98 -88
- package/dist/parquetjs/parquet-thrift/Statistics.js +143 -137
- package/dist/parquetjs/parquet-thrift/StringType.js +31 -27
- package/dist/parquetjs/parquet-thrift/TimeType.d.ts +1 -1
- package/dist/parquetjs/parquet-thrift/TimeType.d.ts.map +1 -1
- package/dist/parquetjs/parquet-thrift/TimeType.js +76 -69
- package/dist/parquetjs/parquet-thrift/TimeUnit.d.ts +2 -2
- package/dist/parquetjs/parquet-thrift/TimeUnit.d.ts.map +1 -1
- package/dist/parquetjs/parquet-thrift/TimeUnit.js +94 -91
- package/dist/parquetjs/parquet-thrift/TimestampType.d.ts +1 -1
- package/dist/parquetjs/parquet-thrift/TimestampType.d.ts.map +1 -1
- package/dist/parquetjs/parquet-thrift/TimestampType.js +76 -69
- package/dist/parquetjs/parquet-thrift/Type.js +17 -12
- package/dist/parquetjs/parquet-thrift/TypeDefinedOrder.js +31 -27
- package/dist/parquetjs/parquet-thrift/UUIDType.js +31 -27
- package/dist/parquetjs/parquet-thrift/index.d.ts +44 -44
- package/dist/parquetjs/parquet-thrift/index.d.ts.map +1 -1
- package/dist/parquetjs/parquet-thrift/index.js +6 -1
- package/dist/parquetjs/parser/decoders.d.ts +3 -3
- package/dist/parquetjs/parser/decoders.d.ts.map +1 -1
- package/dist/parquetjs/parser/decoders.js +306 -242
- package/dist/parquetjs/parser/parquet-reader.d.ts +3 -3
- package/dist/parquetjs/parser/parquet-reader.d.ts.map +1 -1
- package/dist/parquetjs/parser/parquet-reader.js +179 -157
- package/dist/parquetjs/schema/declare.d.ts +1 -1
- package/dist/parquetjs/schema/declare.d.ts.map +1 -1
- package/dist/parquetjs/schema/declare.js +6 -9
- package/dist/parquetjs/schema/schema.d.ts +1 -1
- package/dist/parquetjs/schema/schema.d.ts.map +1 -1
- package/dist/parquetjs/schema/schema.js +139 -120
- package/dist/parquetjs/schema/shred.d.ts +2 -2
- package/dist/parquetjs/schema/shred.d.ts.map +1 -1
- package/dist/parquetjs/schema/shred.js +301 -204
- package/dist/parquetjs/schema/types.d.ts +1 -1
- package/dist/parquetjs/schema/types.d.ts.map +1 -1
- package/dist/parquetjs/schema/types.js +328 -314
- package/dist/parquetjs/utils/file-utils.js +24 -23
- package/dist/parquetjs/utils/read-utils.d.ts +1 -1
- package/dist/parquetjs/utils/read-utils.d.ts.map +1 -1
- package/dist/parquetjs/utils/read-utils.js +79 -69
- package/dist/polyfills/buffer/buffer-polyfill.browser.js +8 -3
- package/dist/polyfills/buffer/buffer-polyfill.node.js +13 -8
- package/dist/polyfills/buffer/buffer.js +1750 -1367
- package/dist/polyfills/buffer/index.d.ts +3 -3
- package/dist/polyfills/buffer/index.d.ts.map +1 -1
- package/dist/polyfills/buffer/index.js +5 -1
- package/dist/polyfills/buffer/install-buffer-polyfill.js +2 -1
- package/dist/polyfills/util.js +4 -1
- package/dist/workers/parquet-worker.js +3 -1
- package/package.json +15 -13
- package/src/index.ts +1 -1
- package/src/lib/parsers/parse-parquet-to-columns.ts +1 -1
- package/src/lib/parsers/parse-parquet.ts +1 -1
- package/src/parquetjs/encoder/parquet-encoder.ts +2 -2
- package/src/parquetjs/parser/decoders.ts +2 -2
- package/src/parquetjs/parser/parquet-reader.ts +1 -1
- package/src/parquetjs/schema/schema.ts +1 -1
- package/src/parquetjs/utils/read-utils.ts +2 -2
- package/dist/index.js.map +0 -1
- package/dist/lib/arrow/convert-columns-to-row-group.js.map +0 -1
- package/dist/lib/arrow/convert-row-group-to-columns.js.map +0 -1
- package/dist/lib/arrow/convert-schema-from-parquet.js.map +0 -1
- package/dist/lib/arrow/convert-schema-to-parquet.js.map +0 -1
- package/dist/lib/constants.js.map +0 -1
- package/dist/lib/parsers/get-parquet-schema.js.map +0 -1
- package/dist/lib/parsers/parse-geoparquet.js.map +0 -1
- package/dist/lib/parsers/parse-parquet-to-columns.js.map +0 -1
- package/dist/lib/parsers/parse-parquet.js.map +0 -1
- package/dist/lib/wasm/encode-parquet-wasm.js.map +0 -1
- package/dist/lib/wasm/load-wasm.js.map +0 -1
- package/dist/lib/wasm/parse-parquet-wasm.js.map +0 -1
- package/dist/lib/wip/convert-schema-deep.java.disabled +0 -910
- package/dist/lib/wip/convert-schema-deep.rs.disabled +0 -976
- package/dist/parquet-loader.js.map +0 -1
- package/dist/parquet-wasm-loader.js.map +0 -1
- package/dist/parquet-wasm-writer.js.map +0 -1
- package/dist/parquet-writer.js.map +0 -1
- package/dist/parquetjs/LICENSE +0 -20
- package/dist/parquetjs/codecs/declare.js.map +0 -1
- package/dist/parquetjs/codecs/dictionary.js.map +0 -1
- package/dist/parquetjs/codecs/index.js.map +0 -1
- package/dist/parquetjs/codecs/plain.js.map +0 -1
- package/dist/parquetjs/codecs/rle.js.map +0 -1
- package/dist/parquetjs/compression.js.map +0 -1
- package/dist/parquetjs/encoder/parquet-encoder.js.map +0 -1
- package/dist/parquetjs/modules.d.ts +0 -21
- package/dist/parquetjs/parquet-thrift/BoundaryOrder.js.map +0 -1
- package/dist/parquetjs/parquet-thrift/BsonType.js.map +0 -1
- package/dist/parquetjs/parquet-thrift/ColumnChunk.js.map +0 -1
- package/dist/parquetjs/parquet-thrift/ColumnIndex.js.map +0 -1
- package/dist/parquetjs/parquet-thrift/ColumnMetaData.js.map +0 -1
- package/dist/parquetjs/parquet-thrift/ColumnOrder.js.map +0 -1
- package/dist/parquetjs/parquet-thrift/CompressionCodec.js.map +0 -1
- package/dist/parquetjs/parquet-thrift/ConvertedType.js.map +0 -1
- package/dist/parquetjs/parquet-thrift/DataPageHeader.js.map +0 -1
- package/dist/parquetjs/parquet-thrift/DataPageHeaderV2.js.map +0 -1
- package/dist/parquetjs/parquet-thrift/DateType.js.map +0 -1
- package/dist/parquetjs/parquet-thrift/DecimalType.js.map +0 -1
- package/dist/parquetjs/parquet-thrift/DictionaryPageHeader.js.map +0 -1
- package/dist/parquetjs/parquet-thrift/Encoding.js.map +0 -1
- package/dist/parquetjs/parquet-thrift/EnumType.js.map +0 -1
- package/dist/parquetjs/parquet-thrift/FieldRepetitionType.js.map +0 -1
- package/dist/parquetjs/parquet-thrift/FileMetaData.js.map +0 -1
- package/dist/parquetjs/parquet-thrift/IndexPageHeader.js.map +0 -1
- package/dist/parquetjs/parquet-thrift/IntType.js.map +0 -1
- package/dist/parquetjs/parquet-thrift/JsonType.js.map +0 -1
- package/dist/parquetjs/parquet-thrift/KeyValue.js.map +0 -1
- package/dist/parquetjs/parquet-thrift/ListType.js.map +0 -1
- package/dist/parquetjs/parquet-thrift/LogicalType.js.map +0 -1
- package/dist/parquetjs/parquet-thrift/MapType.js.map +0 -1
- package/dist/parquetjs/parquet-thrift/MicroSeconds.js.map +0 -1
- package/dist/parquetjs/parquet-thrift/MilliSeconds.js.map +0 -1
- package/dist/parquetjs/parquet-thrift/NullType.js.map +0 -1
- package/dist/parquetjs/parquet-thrift/OffsetIndex.js.map +0 -1
- package/dist/parquetjs/parquet-thrift/PageEncodingStats.js.map +0 -1
- package/dist/parquetjs/parquet-thrift/PageHeader.js.map +0 -1
- package/dist/parquetjs/parquet-thrift/PageLocation.js.map +0 -1
- package/dist/parquetjs/parquet-thrift/PageType.js.map +0 -1
- package/dist/parquetjs/parquet-thrift/RowGroup.js.map +0 -1
- package/dist/parquetjs/parquet-thrift/SchemaElement.js.map +0 -1
- package/dist/parquetjs/parquet-thrift/SortingColumn.js.map +0 -1
- package/dist/parquetjs/parquet-thrift/Statistics.js.map +0 -1
- package/dist/parquetjs/parquet-thrift/StringType.js.map +0 -1
- package/dist/parquetjs/parquet-thrift/TimeType.js.map +0 -1
- package/dist/parquetjs/parquet-thrift/TimeUnit.js.map +0 -1
- package/dist/parquetjs/parquet-thrift/TimestampType.js.map +0 -1
- package/dist/parquetjs/parquet-thrift/Type.js.map +0 -1
- package/dist/parquetjs/parquet-thrift/TypeDefinedOrder.js.map +0 -1
- package/dist/parquetjs/parquet-thrift/UUIDType.js.map +0 -1
- package/dist/parquetjs/parquet-thrift/index.js.map +0 -1
- package/dist/parquetjs/parser/decoders.js.map +0 -1
- package/dist/parquetjs/parser/parquet-reader.js.map +0 -1
- package/dist/parquetjs/schema/declare.js.map +0 -1
- package/dist/parquetjs/schema/schema.js.map +0 -1
- package/dist/parquetjs/schema/shred.js.map +0 -1
- package/dist/parquetjs/schema/types.js.map +0 -1
- package/dist/parquetjs/utils/file-utils.js.map +0 -1
- package/dist/parquetjs/utils/read-utils.js.map +0 -1
- package/dist/polyfills/buffer/buffer-polyfill.browser.js.map +0 -1
- package/dist/polyfills/buffer/buffer-polyfill.node.js.map +0 -1
- package/dist/polyfills/buffer/buffer.js.map +0 -1
- package/dist/polyfills/buffer/index.js.map +0 -1
- package/dist/polyfills/buffer/install-buffer-polyfill.js.map +0 -1
- package/dist/polyfills/util.js.map +0 -1
- package/dist/workers/parquet-worker.js.map +0 -1
package/dist/parquetjs/encoder/parquet-encoder.js
@@ -5,318 +5,449 @@ import { ColumnChunk, ColumnMetaData, CompressionCodec, ConvertedType, DataPageH
 import { osopen, oswrite, osclose } from "../utils/file-utils.js";
 import { getBitWidth, serializeThrift } from "../utils/read-utils.js";
 import Int64 from 'node-int64';
+/**
+ * Parquet File Magic String
+ */
 const PARQUET_MAGIC = 'PAR1';
+/**
+ * Parquet File Format Version
+ */
 const PARQUET_VERSION = 1;
+/**
+ * Default Page and Row Group sizes
+ */
 const PARQUET_DEFAULT_PAGE_SIZE = 8192;
 const PARQUET_DEFAULT_ROW_GROUP_SIZE = 4096;
+/**
+ * Repetition and Definition Level Encoding
+ */
 const PARQUET_RDLVL_TYPE = 'INT32';
 const PARQUET_RDLVL_ENCODING = 'RLE';
+/**
+ * Write a parquet file to an output stream. The ParquetEncoder will perform
+ * buffering/batching for performance, so close() must be called after all rows
+ * are written.
+ */
+// eslint-disable-next-line @typescript-eslint/no-unused-vars
 export class ParquetEncoder {
-
-
-
-
-
-
-
-    return new ParquetEncoder(schema, envelopeWriter, opts);
-  }
-  constructor(schema, envelopeWriter, opts) {
-    this.schema = void 0;
-    this.envelopeWriter = void 0;
-    this.rowBuffer = void 0;
-    this.rowGroupSize = void 0;
-    this.closed = void 0;
-    this.userMetadata = void 0;
-    this.schema = schema;
-    this.envelopeWriter = envelopeWriter;
-    this.rowBuffer = {};
-    this.rowGroupSize = opts.rowGroupSize || PARQUET_DEFAULT_ROW_GROUP_SIZE;
-    this.closed = false;
-    this.userMetadata = {};
-    this.writeHeader();
-  }
-  async writeHeader() {
-    try {
-      await this.envelopeWriter.writeHeader();
-    } catch (err) {
-      await this.envelopeWriter.close();
-      throw err;
+  /**
+   * Convenience method to create a new buffered parquet writer that writes to
+   * the specified file
+   */
+  static async openFile(schema, path, opts) {
+    const outputStream = await osopen(path, opts);
+    return ParquetEncoder.openStream(schema, outputStream, opts);
   }
-
-
-
-
+  /**
+   * Convenience method to create a new buffered parquet writer that writes to
+   * the specified stream
+   */
+  static async openStream(schema, outputStream, opts = {}) {
+    const envelopeWriter = await ParquetEnvelopeWriter.openStream(schema, outputStream, opts);
+    return new ParquetEncoder(schema, envelopeWriter, opts);
   }
-
-
-
+  /**
+   * Create a new buffered parquet writer for a given envelope writer
+   */
+  constructor(schema, envelopeWriter, opts) {
+    this.schema = schema;
+    this.envelopeWriter = envelopeWriter;
+    // @ts-ignore Row buffer typings...
+    this.rowBuffer = {};
+    this.rowGroupSize = opts.rowGroupSize || PARQUET_DEFAULT_ROW_GROUP_SIZE;
+    this.closed = false;
+    this.userMetadata = {};
+    // eslint-disable-next-line @typescript-eslint/no-floating-promises
+    this.writeHeader();
   }
-
-
-
-
+  async writeHeader() {
+    // TODO - better not mess with promises in the constructor
+    try {
+      await this.envelopeWriter.writeHeader();
+    }
+    catch (err) {
+      await this.envelopeWriter.close();
+      throw err;
+    }
   }
-
-
-
+  /**
+   * Append a single row to the parquet file. Rows are buffered in memory until
+   * rowGroupSize rows are in the buffer or close() is called
+   */
+  async appendRow(row) {
+    if (this.closed) {
+      throw new Error('writer was closed');
+    }
+    Shred.shredRecord(this.schema, row, this.rowBuffer);
+    if (this.rowBuffer.rowCount >= this.rowGroupSize) {
+      // @ts-ignore
+      this.rowBuffer = {};
+    }
   }
-
-
-
-
+  /**
+   * Finish writing the parquet file and commit the footer to disk. This method
+   * MUST be called after you are finished adding rows. You must not call this
+   * method twice on the same object or add any rows after the close() method has
+   * been called
+   */
+  async close(callback) {
+    if (this.closed) {
+      throw new Error('writer was closed');
+    }
+    this.closed = true;
+    if (this.rowBuffer.rowCount > 0 || this.rowBuffer.rowCount >= this.rowGroupSize) {
+      // @ts-ignore
+      this.rowBuffer = {};
+    }
+    await this.envelopeWriter.writeFooter(this.userMetadata);
+    await this.envelopeWriter.close();
+    // this.envelopeWriter = null;
+    if (callback) {
+      callback();
+    }
+  }
+  /**
+   * Add key<>value metadata to the file
+   */
+  setMetadata(key, value) {
+    // TODO: value to be any, obj -> JSON
+    this.userMetadata[String(key)] = String(value);
+  }
+  /**
+   * Set the parquet row group size. This values controls the maximum number
+   * of rows that are buffered in memory at any given time as well as the number
+   * of rows that are co-located on disk. A higher value is generally better for
+   * read-time I/O performance at the tradeoff of write-time memory usage.
+   */
+  setRowGroupSize(cnt) {
+    this.rowGroupSize = cnt;
+  }
+  /**
+   * Set the parquet data page size. The data page size controls the maximum
+   * number of column values that are written to disk as a consecutive array
+   */
+  setPageSize(cnt) {
+    this.envelopeWriter.setPageSize(cnt);
   }
-  }
-  setMetadata(key, value) {
-    this.userMetadata[String(key)] = String(value);
-  }
-  setRowGroupSize(cnt) {
-    this.rowGroupSize = cnt;
-  }
-  setPageSize(cnt) {
-    this.envelopeWriter.setPageSize(cnt);
-  }
 }
+/**
+ * Create a parquet file from a schema and a number of row groups. This class
+ * performs direct, unbuffered writes to the underlying output stream and is
+ * intendend for advanced and internal users; the writeXXX methods must be
+ * called in the correct order to produce a valid file.
+ */
 export class ParquetEnvelopeWriter {
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+  /**
+   * Create a new parquet envelope writer that writes to the specified stream
+   */
+  static async openStream(schema, outputStream, opts) {
+    const writeFn = oswrite.bind(undefined, outputStream);
+    const closeFn = osclose.bind(undefined, outputStream);
+    return new ParquetEnvelopeWriter(schema, writeFn, closeFn, 0, opts);
+  }
+  constructor(schema, writeFn, closeFn, fileOffset, opts) {
+    this.schema = schema;
+    this.write = writeFn;
+    this.close = closeFn;
+    this.offset = fileOffset;
+    this.rowCount = 0;
+    this.rowGroups = [];
+    this.pageSize = opts.pageSize || PARQUET_DEFAULT_PAGE_SIZE;
+    this.useDataPageV2 = 'useDataPageV2' in opts ? Boolean(opts.useDataPageV2) : false;
+  }
+  writeSection(buf) {
+    this.offset += buf.length;
+    return this.write(buf);
+  }
+  /**
+   * Encode the parquet file header
+   */
+  writeHeader() {
+    return this.writeSection(Buffer.from(PARQUET_MAGIC));
+  }
+  /**
+   * Encode a parquet row group. The records object should be created using the
+   * shredRecord method
+   */
+  async writeRowGroup(records) {
+    const rgroup = await encodeRowGroup(this.schema, records, {
+      baseOffset: this.offset,
+      pageSize: this.pageSize,
+      useDataPageV2: this.useDataPageV2
+    });
+    this.rowCount += records.rowCount;
+    this.rowGroups.push(rgroup.metadata);
+    return await this.writeSection(rgroup.body);
+  }
+  /**
+   * Write the parquet file footer
+   */
+  writeFooter(userMetadata) {
+    if (!userMetadata) {
+      // tslint:disable-next-line:no-parameter-reassignment
+      userMetadata = {};
+    }
+    return this.writeSection(encodeFooter(this.schema, this.rowCount, this.rowGroups, userMetadata));
+  }
+  /**
+   * Set the parquet data page size. The data page size controls the maximum
+   * number of column values that are written to disk as a consecutive array
+   */
+  setPageSize(cnt) {
+    this.pageSize = cnt;
+  }
+}
+/**
+ * Create a parquet transform stream
+export class ParquetTransformer<T> extends stream.Transform {
+  public writer: ParquetEncoder<T>;
+
+  constructor(schema: ParquetSchema, opts: ParquetEncoderOptions = {}) {
+    super({objectMode: true});
+
+    const writeProxy = (function (t: ParquetTransformer<any>) {
+      return async function (b: any): Promise<void> {
+        t.push(b);
+      };
+    })(this);
+
+    this.writer = new ParquetEncoder(
+      schema,
+      new ParquetEnvelopeWriter(schema, writeProxy, async () => {}, 0, opts),
+      opts
+    );
   }
-
-
-
+
+  // tslint:disable-next-line:function-name
+  _transform(row: any, encoding: string, callback: (val?: any) => void): Promise<void> {
+    if (row) {
+      return this.writer.appendRow(row).then(callback);
     }
-
+    callback();
+    return Promise.resolve();
   }
-
-
+
+  // tslint:disable-next-line:function-name
+  async _flush(callback: (val?: any) => void) {
+    await this.writer.close(callback);
   }
 }
+ */
+/**
+ * Encode a consecutive array of data using one of the parquet encodings
+ */
 function encodeValues(type, encoding, values, opts) {
-
-
-
-
+  if (!(encoding in PARQUET_CODECS)) {
+    throw new Error(`invalid encoding: ${encoding}`);
+  }
+  return PARQUET_CODECS[encoding].encodeValues(type, values, opts);
 }
+/**
+ * Encode a parquet data page
+ */
 async function encodeDataPage(column, data) {
-
-
-
-
+  /* encode repetition and definition levels */
+  let rLevelsBuf = Buffer.alloc(0);
+  if (column.rLevelMax > 0) {
+    rLevelsBuf = encodeValues(PARQUET_RDLVL_TYPE, PARQUET_RDLVL_ENCODING, data.rlevels, {
+      bitWidth: getBitWidth(column.rLevelMax)
+      // disableEnvelope: false
+    });
+  }
+  let dLevelsBuf = Buffer.alloc(0);
+  if (column.dLevelMax > 0) {
+    dLevelsBuf = encodeValues(PARQUET_RDLVL_TYPE, PARQUET_RDLVL_ENCODING, data.dlevels, {
+      bitWidth: getBitWidth(column.dLevelMax)
+      // disableEnvelope: false
+    });
+  }
+  /* encode values */
+  const valuesBuf = encodeValues(column.primitiveType, column.encoding, data.values, {
+    typeLength: column.typeLength,
+    bitWidth: column.typeLength
   });
-
-
-
-
-
+  const dataBuf = Buffer.concat([rLevelsBuf, dLevelsBuf, valuesBuf]);
+  // compression = column.compression === 'UNCOMPRESSED' ? (compression || 'UNCOMPRESSED') : column.compression;
+  const compressedBuf = await Compression.deflate(column.compression, dataBuf);
+  /* build page header */
+  const header = new PageHeader({
+    type: PageType.DATA_PAGE,
+    data_page_header: new DataPageHeader({
+      num_values: data.count,
+      encoding: Encoding[column.encoding],
+      definition_level_encoding: Encoding[PARQUET_RDLVL_ENCODING], // [PARQUET_RDLVL_ENCODING],
+      repetition_level_encoding: Encoding[PARQUET_RDLVL_ENCODING] // [PARQUET_RDLVL_ENCODING]
+    }),
+    uncompressed_page_size: dataBuf.length,
+    compressed_page_size: compressedBuf.length
   });
-
-
-
-
-  });
-  const dataBuf = Buffer.concat([rLevelsBuf, dLevelsBuf, valuesBuf]);
-  const compressedBuf = await Compression.deflate(column.compression, dataBuf);
-  const header = new PageHeader({
-    type: PageType.DATA_PAGE,
-    data_page_header: new DataPageHeader({
-      num_values: data.count,
-      encoding: Encoding[column.encoding],
-      definition_level_encoding: Encoding[PARQUET_RDLVL_ENCODING],
-      repetition_level_encoding: Encoding[PARQUET_RDLVL_ENCODING]
-    }),
-    uncompressed_page_size: dataBuf.length,
-    compressed_page_size: compressedBuf.length
-  });
-  const headerBuf = serializeThrift(header);
-  const page = Buffer.concat([headerBuf, compressedBuf]);
-  return {
-    header,
-    headerSize: headerBuf.length,
-    page
-  };
+  /* concat page header, repetition and definition levels and values */
+  const headerBuf = serializeThrift(header);
+  const page = Buffer.concat([headerBuf, compressedBuf]);
+  return { header, headerSize: headerBuf.length, page };
 }
+/**
+ * Encode a parquet data page (v2)
+ */
 async function encodeDataPageV2(column, data, rowCount) {
-
-
-
-
-  const compressedBuf = await Compression.deflate(column.compression, valuesBuf);
-  let rLevelsBuf = Buffer.alloc(0);
-  if (column.rLevelMax > 0) {
-    rLevelsBuf = encodeValues(PARQUET_RDLVL_TYPE, PARQUET_RDLVL_ENCODING, data.rlevels, {
-      bitWidth: getBitWidth(column.rLevelMax),
-      disableEnvelope: true
+  /* encode values */
+  const valuesBuf = encodeValues(column.primitiveType, column.encoding, data.values, {
+    typeLength: column.typeLength,
+    bitWidth: column.typeLength
   });
-
-
-
-
-
-
+  // compression = column.compression === 'UNCOMPRESSED' ? (compression || 'UNCOMPRESSED') : column.compression;
+  const compressedBuf = await Compression.deflate(column.compression, valuesBuf);
+  /* encode repetition and definition levels */
+  let rLevelsBuf = Buffer.alloc(0);
+  if (column.rLevelMax > 0) {
+    rLevelsBuf = encodeValues(PARQUET_RDLVL_TYPE, PARQUET_RDLVL_ENCODING, data.rlevels, {
+      bitWidth: getBitWidth(column.rLevelMax),
+      disableEnvelope: true
+    });
+  }
+  let dLevelsBuf = Buffer.alloc(0);
+  if (column.dLevelMax > 0) {
+    dLevelsBuf = encodeValues(PARQUET_RDLVL_TYPE, PARQUET_RDLVL_ENCODING, data.dlevels, {
+      bitWidth: getBitWidth(column.dLevelMax),
+      disableEnvelope: true
+    });
+  }
+  /* build page header */
+  const header = new PageHeader({
+    type: PageType.DATA_PAGE_V2,
+    data_page_header_v2: new DataPageHeaderV2({
+      num_values: data.count,
+      num_nulls: data.count - data.values.length,
+      num_rows: rowCount,
+      encoding: Encoding[column.encoding],
+      definition_levels_byte_length: dLevelsBuf.length,
+      repetition_levels_byte_length: rLevelsBuf.length,
+      is_compressed: column.compression !== 'UNCOMPRESSED'
+    }),
+    uncompressed_page_size: rLevelsBuf.length + dLevelsBuf.length + valuesBuf.length,
+    compressed_page_size: rLevelsBuf.length + dLevelsBuf.length + compressedBuf.length
   });
-
-
-
-
-      num_values: data.count,
-      num_nulls: data.count - data.values.length,
-      num_rows: rowCount,
-      encoding: Encoding[column.encoding],
-      definition_levels_byte_length: dLevelsBuf.length,
-      repetition_levels_byte_length: rLevelsBuf.length,
-      is_compressed: column.compression !== 'UNCOMPRESSED'
-    }),
-    uncompressed_page_size: rLevelsBuf.length + dLevelsBuf.length + valuesBuf.length,
-    compressed_page_size: rLevelsBuf.length + dLevelsBuf.length + compressedBuf.length
-  });
-  const headerBuf = serializeThrift(header);
-  const page = Buffer.concat([headerBuf, rLevelsBuf, dLevelsBuf, compressedBuf]);
-  return {
-    header,
-    headerSize: headerBuf.length,
-    page
-  };
+  /* concat page header, repetition and definition levels and values */
+  const headerBuf = serializeThrift(header);
+  const page = Buffer.concat([headerBuf, rLevelsBuf, dLevelsBuf, compressedBuf]);
+  return { header, headerSize: headerBuf.length, page };
 }
+/**
+ * Encode an array of values into a parquet column chunk
+ */
 async function encodeColumnChunk(column, buffer, offset, opts) {
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+  const data = buffer.columnData[column.path.join()];
+  const baseOffset = (opts.baseOffset || 0) + offset;
+  /* encode data page(s) */
+  // const pages: Buffer[] = [];
+  let pageBuf;
+  // tslint:disable-next-line:variable-name
+  let total_uncompressed_size = 0;
+  // tslint:disable-next-line:variable-name
+  let total_compressed_size = 0;
+  {
+    const result = opts.useDataPageV2
+      ? await encodeDataPageV2(column, data, buffer.rowCount)
+      : await encodeDataPage(column, data);
+    // pages.push(result.page);
+    pageBuf = result.page;
+    total_uncompressed_size += result.header.uncompressed_page_size + result.headerSize;
+    total_compressed_size += result.header.compressed_page_size + result.headerSize;
+  }
+  // const pagesBuf = Buffer.concat(pages);
+  // const compression = column.compression === 'UNCOMPRESSED' ? (opts.compression || 'UNCOMPRESSED') : column.compression;
+  /* prepare metadata header */
+  const metadata = new ColumnMetaData({
+    path_in_schema: column.path,
+    num_values: data.count,
+    data_page_offset: baseOffset,
+    encodings: [],
+    total_uncompressed_size, // : pagesBuf.length,
+    total_compressed_size,
+    type: Type[column.primitiveType],
+    codec: CompressionCodec[column.compression]
+  });
+  /* list encodings */
+  metadata.encodings.push(Encoding[PARQUET_RDLVL_ENCODING]);
+  metadata.encodings.push(Encoding[column.encoding]);
+  /* concat metadata header and data pages */
+  const metadataOffset = baseOffset + pageBuf.length;
+  const body = Buffer.concat([pageBuf, serializeThrift(metadata)]);
+  return { body, metadata, metadataOffset };
 }
+/**
+ * Encode a list of column values into a parquet row group
+ */
 async function encodeRowGroup(schema, data, opts) {
-
-
-
-
-  });
-  let body = Buffer.alloc(0);
-  for (const field of schema.fieldList) {
-    if (field.isNested) {
-      continue;
-    }
-    const cchunkData = await encodeColumnChunk(field, data, body.length, opts);
-    const cchunk = new ColumnChunk({
-      file_offset: cchunkData.metadataOffset,
-      meta_data: cchunkData.metadata
+  const metadata = new RowGroup({
+    num_rows: data.rowCount,
+    columns: [],
+    total_byte_size: 0
   });
-
-
-
-
-
-
-
-
+  let body = Buffer.alloc(0);
+  for (const field of schema.fieldList) {
+    if (field.isNested) {
+      continue; // eslint-disable-line no-continue
+    }
+    const cchunkData = await encodeColumnChunk(field, data, body.length, opts);
+    const cchunk = new ColumnChunk({
+      file_offset: cchunkData.metadataOffset,
+      meta_data: cchunkData.metadata
+    });
+    metadata.columns.push(cchunk);
+    metadata.total_byte_size = new Int64(Number(metadata.total_byte_size) + cchunkData.body.length);
+    body = Buffer.concat([body, cchunkData.body]);
+  }
+  return { body, metadata };
 }
+/**
+ * Encode a parquet file metadata footer
+ */
 function encodeFooter(schema, rowCount, rowGroups, userMetadata) {
-
-
-
-
-
-
-
-  });
-  for (const key in userMetadata) {
-    var _metadata$key_value_m, _metadata$key_value_m2, _metadata$key_value_m3;
-    const kv = new KeyValue({
-      key,
-      value: userMetadata[key]
+  const metadata = new FileMetaData({
+    version: PARQUET_VERSION,
+    created_by: 'parquets',
+    num_rows: rowCount,
+    row_groups: rowGroups,
+    schema: [],
+    key_value_metadata: []
   });
-
-
-
-
-
-
-    });
-    metadata.schema.push(schemaRoot);
-  }
-  for (const field of schema.fieldList) {
-    const relt = FieldRepetitionType[field.repetitionType];
-    const schemaElem = new SchemaElement({
-      name: field.name,
-      repetition_type: relt
-    });
-    if (field.isNested) {
-      schemaElem.num_children = field.fieldCount;
-    } else {
-      schemaElem.type = Type[field.primitiveType];
+  for (const key in userMetadata) {
+    const kv = new KeyValue({
+      key,
+      value: userMetadata[key]
+    });
+    metadata.key_value_metadata?.push?.(kv);
   }
-
-
+  {
+    const schemaRoot = new SchemaElement({
+      name: 'root',
+      num_children: Object.keys(schema.fields).length
+    });
+    metadata.schema.push(schemaRoot);
   }
-
-
-
-
-
-
-
-
-
+  for (const field of schema.fieldList) {
+    const relt = FieldRepetitionType[field.repetitionType];
+    const schemaElem = new SchemaElement({
+      name: field.name,
+      repetition_type: relt
+    });
+    if (field.isNested) {
+      schemaElem.num_children = field.fieldCount;
+    }
+    else {
+      schemaElem.type = Type[field.primitiveType];
+    }
+    if (field.originalType) {
+      schemaElem.converted_type = ConvertedType[field.originalType];
+    }
+    schemaElem.type_length = field.typeLength;
+    metadata.schema.push(schemaElem);
+  }
+  const metadataEncoded = serializeThrift(metadata);
+  const footerEncoded = Buffer.alloc(metadataEncoded.length + 8);
+  metadataEncoded.copy(footerEncoded);
+  footerEncoded.writeUInt32LE(metadataEncoded.length, metadataEncoded.length);
+  footerEncoded.write(PARQUET_MAGIC, metadataEncoded.length + 4);
+  return footerEncoded;
 }
-//# sourceMappingURL=parquet-encoder.js.map
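
For orientation, here is a minimal usage sketch of the buffered writer API shown in the hunk above (openFile/openStream, appendRow, setMetadata, close). The import paths and the ParquetSchema field syntax are assumptions for illustration (parquetjs conventions) and are not part of this diff:

    // Hypothetical usage sketch; paths and schema syntax are assumed, not taken from the diff.
    import {ParquetSchema} from '@loaders.gl/parquet/dist/parquetjs/schema/schema.js';
    import {ParquetEncoder} from '@loaders.gl/parquet/dist/parquetjs/encoder/parquet-encoder.js';

    const schema = new ParquetSchema({id: {type: 'INT64'}, name: {type: 'UTF8'}});
    // openFile opens the output stream and writes the PAR1 header immediately;
    // appended rows are buffered in memory until rowGroupSize is reached.
    const writer = await ParquetEncoder.openFile(schema, '/tmp/example.parquet', {rowGroupSize: 4096});
    await writer.appendRow({id: 1, name: 'alpha'});
    await writer.appendRow({id: 2, name: 'beta'});
    writer.setMetadata('generator', 'example');
    // close() must be called exactly once, after all rows, so the footer is written and the stream closed.
    await writer.close();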