@loaders.gl/parquet 3.0.12 → 3.1.0-alpha.4

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries, and is provided for informational purposes only.
Files changed (143)
  1. package/dist/dist.min.js +7 -18
  2. package/dist/dist.min.js.map +1 -1
  3. package/dist/es5/bundle.js +2 -4
  4. package/dist/es5/bundle.js.map +1 -1
  5. package/dist/es5/constants.js +17 -0
  6. package/dist/es5/constants.js.map +1 -0
  7. package/dist/es5/index.js +53 -21
  8. package/dist/es5/index.js.map +1 -1
  9. package/dist/es5/lib/convert-schema.js +82 -0
  10. package/dist/es5/lib/convert-schema.js.map +1 -0
  11. package/dist/es5/lib/parse-parquet.js +173 -0
  12. package/dist/es5/lib/parse-parquet.js.map +1 -0
  13. package/dist/es5/lib/read-array-buffer.js +53 -0
  14. package/dist/es5/lib/read-array-buffer.js.map +1 -0
  15. package/dist/es5/parquet-loader.js +6 -79
  16. package/dist/es5/parquet-loader.js.map +1 -1
  17. package/dist/es5/parquet-writer.js +1 -1
  18. package/dist/es5/parquet-writer.js.map +1 -1
  19. package/dist/es5/parquetjs/codecs/dictionary.js +30 -0
  20. package/dist/es5/parquetjs/codecs/dictionary.js.map +1 -0
  21. package/dist/es5/parquetjs/codecs/index.js +10 -0
  22. package/dist/es5/parquetjs/codecs/index.js.map +1 -1
  23. package/dist/es5/parquetjs/codecs/rle.js +2 -2
  24. package/dist/es5/parquetjs/codecs/rle.js.map +1 -1
  25. package/dist/es5/parquetjs/compression.js +138 -104
  26. package/dist/es5/parquetjs/compression.js.map +1 -1
  27. package/dist/es5/parquetjs/{writer.js → encoder/writer.js} +397 -228
  28. package/dist/es5/parquetjs/encoder/writer.js.map +1 -0
  29. package/dist/es5/parquetjs/parquet-thrift/CompressionCodec.js +1 -0
  30. package/dist/es5/parquetjs/parquet-thrift/CompressionCodec.js.map +1 -1
  31. package/dist/es5/parquetjs/parser/decoders.js +495 -0
  32. package/dist/es5/parquetjs/parser/decoders.js.map +1 -0
  33. package/dist/es5/parquetjs/parser/parquet-cursor.js +215 -0
  34. package/dist/es5/parquetjs/parser/parquet-cursor.js.map +1 -0
  35. package/dist/es5/parquetjs/parser/parquet-envelope-reader.js +452 -0
  36. package/dist/es5/parquetjs/parser/parquet-envelope-reader.js.map +1 -0
  37. package/dist/es5/parquetjs/parser/parquet-reader.js +413 -0
  38. package/dist/es5/parquetjs/parser/parquet-reader.js.map +1 -0
  39. package/dist/es5/parquetjs/schema/declare.js.map +1 -1
  40. package/dist/es5/parquetjs/schema/schema.js +2 -0
  41. package/dist/es5/parquetjs/schema/schema.js.map +1 -1
  42. package/dist/es5/parquetjs/schema/shred.js +2 -1
  43. package/dist/es5/parquetjs/schema/shred.js.map +1 -1
  44. package/dist/es5/parquetjs/schema/types.js +79 -4
  45. package/dist/es5/parquetjs/schema/types.js.map +1 -1
  46. package/dist/es5/parquetjs/utils/buffer-utils.js +21 -0
  47. package/dist/es5/parquetjs/utils/buffer-utils.js.map +1 -0
  48. package/dist/es5/parquetjs/utils/file-utils.js +108 -0
  49. package/dist/es5/parquetjs/utils/file-utils.js.map +1 -0
  50. package/dist/es5/parquetjs/{util.js → utils/read-utils.js} +13 -113
  51. package/dist/es5/parquetjs/utils/read-utils.js.map +1 -0
  52. package/dist/esm/bundle.js +2 -4
  53. package/dist/esm/bundle.js.map +1 -1
  54. package/dist/esm/constants.js +6 -0
  55. package/dist/esm/constants.js.map +1 -0
  56. package/dist/esm/index.js +14 -4
  57. package/dist/esm/index.js.map +1 -1
  58. package/dist/esm/lib/convert-schema.js +71 -0
  59. package/dist/esm/lib/convert-schema.js.map +1 -0
  60. package/dist/esm/lib/parse-parquet.js +28 -0
  61. package/dist/esm/lib/parse-parquet.js.map +1 -0
  62. package/dist/esm/lib/read-array-buffer.js +9 -0
  63. package/dist/esm/lib/read-array-buffer.js.map +1 -0
  64. package/dist/esm/parquet-loader.js +4 -24
  65. package/dist/esm/parquet-loader.js.map +1 -1
  66. package/dist/esm/parquet-writer.js +1 -1
  67. package/dist/esm/parquet-writer.js.map +1 -1
  68. package/dist/esm/parquetjs/codecs/dictionary.js +12 -0
  69. package/dist/esm/parquetjs/codecs/dictionary.js.map +1 -0
  70. package/dist/esm/parquetjs/codecs/index.js +9 -0
  71. package/dist/esm/parquetjs/codecs/index.js.map +1 -1
  72. package/dist/esm/parquetjs/codecs/rle.js +2 -2
  73. package/dist/esm/parquetjs/codecs/rle.js.map +1 -1
  74. package/dist/esm/parquetjs/compression.js +54 -105
  75. package/dist/esm/parquetjs/compression.js.map +1 -1
  76. package/dist/esm/parquetjs/{writer.js → encoder/writer.js} +32 -35
  77. package/dist/esm/parquetjs/encoder/writer.js.map +1 -0
  78. package/dist/esm/parquetjs/parquet-thrift/CompressionCodec.js +1 -0
  79. package/dist/esm/parquetjs/parquet-thrift/CompressionCodec.js.map +1 -1
  80. package/dist/esm/parquetjs/parser/decoders.js +300 -0
  81. package/dist/esm/parquetjs/parser/decoders.js.map +1 -0
  82. package/dist/esm/parquetjs/parser/parquet-cursor.js +90 -0
  83. package/dist/esm/parquetjs/parser/parquet-cursor.js.map +1 -0
  84. package/dist/esm/parquetjs/parser/parquet-envelope-reader.js +164 -0
  85. package/dist/esm/parquetjs/parser/parquet-envelope-reader.js.map +1 -0
  86. package/dist/esm/parquetjs/parser/parquet-reader.js +133 -0
  87. package/dist/esm/parquetjs/parser/parquet-reader.js.map +1 -0
  88. package/dist/esm/parquetjs/schema/declare.js.map +1 -1
  89. package/dist/esm/parquetjs/schema/schema.js +2 -0
  90. package/dist/esm/parquetjs/schema/schema.js.map +1 -1
  91. package/dist/esm/parquetjs/schema/shred.js +2 -1
  92. package/dist/esm/parquetjs/schema/shred.js.map +1 -1
  93. package/dist/esm/parquetjs/schema/types.js +78 -4
  94. package/dist/esm/parquetjs/schema/types.js.map +1 -1
  95. package/dist/esm/parquetjs/utils/buffer-utils.js +12 -0
  96. package/dist/esm/parquetjs/utils/buffer-utils.js.map +1 -0
  97. package/dist/esm/parquetjs/utils/file-utils.js +79 -0
  98. package/dist/esm/parquetjs/utils/file-utils.js.map +1 -0
  99. package/dist/esm/parquetjs/{util.js → utils/read-utils.js} +11 -89
  100. package/dist/esm/parquetjs/utils/read-utils.js.map +1 -0
  101. package/dist/parquet-worker.js +7 -18
  102. package/dist/parquet-worker.js.map +1 -1
  103. package/package.json +10 -10
  104. package/src/bundle.ts +2 -3
  105. package/src/constants.ts +17 -0
  106. package/src/index.ts +30 -4
  107. package/src/lib/convert-schema.ts +95 -0
  108. package/src/lib/parse-parquet.ts +27 -0
  109. package/{dist/es5/libs → src/lib}/read-array-buffer.ts +0 -0
  110. package/src/parquet-loader.ts +4 -24
  111. package/src/parquetjs/codecs/dictionary.ts +11 -0
  112. package/src/parquetjs/codecs/index.ts +13 -0
  113. package/src/parquetjs/codecs/rle.ts +4 -2
  114. package/src/parquetjs/compression.ts +89 -50
  115. package/src/parquetjs/{writer.ts → encoder/writer.ts} +46 -45
  116. package/src/parquetjs/parquet-thrift/CompressionCodec.ts +2 -1
  117. package/src/parquetjs/parser/decoders.ts +448 -0
  118. package/src/parquetjs/parser/parquet-cursor.ts +94 -0
  119. package/src/parquetjs/parser/parquet-envelope-reader.ts +210 -0
  120. package/src/parquetjs/parser/parquet-reader.ts +179 -0
  121. package/src/parquetjs/schema/declare.ts +48 -2
  122. package/src/parquetjs/schema/schema.ts +2 -0
  123. package/src/parquetjs/schema/shred.ts +3 -1
  124. package/src/parquetjs/schema/types.ts +82 -5
  125. package/src/parquetjs/utils/buffer-utils.ts +18 -0
  126. package/src/parquetjs/utils/file-utils.ts +96 -0
  127. package/src/parquetjs/{util.ts → utils/read-utils.ts} +13 -110
  128. package/dist/dist.es5.min.js +0 -51
  129. package/dist/dist.es5.min.js.map +0 -1
  130. package/dist/es5/parquetjs/compression.ts.disabled +0 -105
  131. package/dist/es5/parquetjs/reader.js +0 -1078
  132. package/dist/es5/parquetjs/reader.js.map +0 -1
  133. package/dist/es5/parquetjs/util.js.map +0 -1
  134. package/dist/es5/parquetjs/writer.js.map +0 -1
  135. package/dist/esm/libs/read-array-buffer.ts +0 -31
  136. package/dist/esm/parquetjs/compression.ts.disabled +0 -105
  137. package/dist/esm/parquetjs/reader.js +0 -524
  138. package/dist/esm/parquetjs/reader.js.map +0 -1
  139. package/dist/esm/parquetjs/util.js.map +0 -1
  140. package/dist/esm/parquetjs/writer.js.map +0 -1
  141. package/src/libs/read-array-buffer.ts +0 -31
  142. package/src/parquetjs/compression.ts.disabled +0 -105
  143. package/src/parquetjs/reader.ts +0 -707
package/src/lib/parse-parquet.ts
@@ -0,0 +1,27 @@
+ // import type {LoaderWithParser, Loader, LoaderOptions} from '@loaders.gl/loader-utils';
+ import type {ParquetLoaderOptions} from '../parquet-loader';
+
+ import {ParquetReader} from '../parquetjs/parser/parquet-reader';
+
+ export async function parseParquet(arrayBuffer: ArrayBuffer, options?: ParquetLoaderOptions) {
+   const blob = new Blob([arrayBuffer]);
+   for await (const batch of parseParquetFileInBatches(blob, options)) {
+     return batch;
+   }
+   return null;
+ }
+
+ export async function* parseParquetFileInBatches(blob: Blob, options?: ParquetLoaderOptions) {
+   const reader = await ParquetReader.openBlob(blob);
+   const rows: any[][] = [];
+   try {
+     const cursor = reader.getCursor();
+     let record: any[] | null;
+     while ((record = await cursor.next())) {
+       rows.push(record);
+     }
+   } finally {
+     await reader.close();
+   }
+   yield rows;
+ }
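For orientation, a minimal usage sketch of the two new functions above, assuming they are exported from the package entry point (the full index.ts changes are not shown here) and using a placeholder URL:

  import {parseParquet, parseParquetFileInBatches} from '@loaders.gl/parquet';

  const response = await fetch('data/example.parquet'); // placeholder URL
  const arrayBuffer = await response.arrayBuffer();

  // Parse the whole file at once (internally wraps the buffer in a Blob)
  const rows = await parseParquet(arrayBuffer);

  // Or iterate over row batches from a Blob/File
  for await (const batch of parseParquetFileInBatches(new Blob([arrayBuffer]))) {
    console.log(batch.length, 'rows in batch');
  }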
package/{dist/es5/libs → src/lib}/read-array-buffer.ts
File without changes
package/src/parquet-loader.ts
@@ -1,5 +1,4 @@
- import type {LoaderWithParser, Loader, LoaderOptions} from '@loaders.gl/loader-utils';
- import {ParquetReader} from './parquetjs/reader';
+ import type {Loader, LoaderOptions} from '@loaders.gl/loader-utils';

  // __VERSION__ is injected by babel-plugin-version-inline
  // @ts-ignore TS2304: Cannot find name '__VERSION__'.
@@ -20,7 +19,7 @@ const DEFAULT_PARQUET_LOADER_OPTIONS: ParquetLoaderOptions = {
  };

  /** ParquetJS table loader */
- export const ParquetWorkerLoader: Loader = {
+ export const ParquetLoader = {
    name: 'Apache Parquet',
    id: 'parquet',
    module: 'parquet',
@@ -30,27 +29,8 @@ export const ParquetWorkerLoader: Loader = {
    extensions: ['parquet'],
    mimeTypes: ['application/octet-stream'],
    binary: true,
-   // tests: ['PARQUET'],
+   tests: ['PAR1', 'PARE'],
    options: DEFAULT_PARQUET_LOADER_OPTIONS
  };

- /** ParquetJS table loader */
- export const ParquetLoader: LoaderWithParser = {
-   ...ParquetWorkerLoader,
-   parse
- };
-
- async function parse(arrayBuffer: ArrayBuffer, options?: ParquetLoaderOptions) {
-   const reader = await ParquetReader.openArrayBuffer(arrayBuffer);
-   const rows: any[][] = [];
-   try {
-     const cursor = reader.getCursor();
-     let record: any[] | null;
-     while ((record = await cursor.next())) {
-       rows.push(record);
-     }
-   } finally {
-     await reader.close();
-   }
-   return rows;
- }
+ export const _typecheckParquetLoader: Loader = ParquetLoader;
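As a sketch of how the loader descriptor above is typically consumed through the loaders.gl core API; this assumes the package index attaches a parser built on parseParquet to ParquetLoader (those index.ts changes are not shown in full here), and the URL is a placeholder:

  import {load} from '@loaders.gl/core';
  import {ParquetLoader} from '@loaders.gl/parquet';

  // load() fetches the resource and dispatches it to the loader's parse function
  const rows = await load('data/example.parquet', ParquetLoader);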
package/src/parquetjs/codecs/dictionary.ts
@@ -0,0 +1,11 @@
+ import {decodeValues as decodeRleValues} from './rle';
+
+ export function decodeValues(type, cursor, count, opts) {
+   opts.bitWidth = cursor.buffer.slice(cursor.offset, cursor.offset + 1).readInt8(0);
+   cursor.offset += 1;
+   return decodeRleValues(type, cursor, count, {...opts, disableEnvelope: true});
+ }
+
+ export function encodeValues(type, cursor, count, opts) {
+   throw new Error('Encode dictionary functionality is not supported');
+ }
package/src/parquetjs/codecs/index.ts
@@ -3,6 +3,7 @@ import type {ParquetCodec} from '../schema/declare';
  import type {ParquetCodecKit} from './declare';
  import * as PLAIN from './plain';
  import * as RLE from './rle';
+ import * as DICTIONARY from './dictionary';

  export * from './declare';

@@ -14,5 +15,17 @@ export const PARQUET_CODECS: Record<ParquetCodec, ParquetCodecKit> = {
    RLE: {
      encodeValues: RLE.encodeValues,
      decodeValues: RLE.decodeValues
+   },
+   // Using the PLAIN_DICTIONARY enum value is deprecated in the Parquet 2.0 specification.
+   PLAIN_DICTIONARY: {
+     // @ts-ignore
+     encodeValues: DICTIONARY.encodeValues,
+     decodeValues: DICTIONARY.decodeValues
+   },
+   // Prefer using RLE_DICTIONARY in a data page and PLAIN in a dictionary page for Parquet 2.0+ files.
+   RLE_DICTIONARY: {
+     // @ts-ignore
+     encodeValues: DICTIONARY.encodeValues,
+     decodeValues: DICTIONARY.decodeValues
    }
  };
package/src/parquetjs/codecs/rle.ts
@@ -138,7 +138,8 @@ function decodeRunRepeated(

    let value = 0;
    for (let i = 0; i < Math.ceil(bitWidth / 8); i++) {
-     value <<= 8;
+     // eslint-disable-next-line
+     value << 8; // TODO - this looks wrong
      value += cursor.buffer[cursor.offset];
      cursor.offset += 1;
    }
@@ -173,7 +174,8 @@ function encodeRunRepeated(value: number, count: number, opts: ParquetCodecOptions

    for (let i = 0; i < buf.length; i++) {
      buf.writeUInt8(value & 0xff, i);
-     value >>= 8;
+     // eslint-disable-next-line
+     value >> 8; // TODO - this looks wrong
    }

    return Buffer.concat([Buffer.from(varint.encode(count << 1)), buf]);
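For reference, a standalone sketch (not part of the package) of the big-endian byte accumulation that the original `value <<= 8` line performs; the replacement `value << 8;` computes the shift but discards its result:

  // Read `numBytes` bytes from `buf` starting at `offset`, most significant byte first
  function readUIntBE(buf: Uint8Array, offset: number, numBytes: number): number {
    let value = 0;
    for (let i = 0; i < numBytes; i++) {
      value = value * 256 + buf[offset + i]; // same as (value << 8) + byte while value stays below 2^31
    }
    return value;
  }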
package/src/parquetjs/compression.ts
@@ -1,57 +1,107 @@
  // Forked from https://github.com/kbajalc/parquets under MIT license (Copyright (c) 2017 ironSource Ltd.)
  /* eslint-disable camelcase */
+ // Forked from https://github.com/ironSource/parquetjs under MIT license
+
+ import {
+   Compression,
+   NoCompression,
+   GZipCompression,
+   SnappyCompression,
+   BrotliCompression,
+   LZOCompression,
+   LZ4Compression,
+   ZstdCompression
+ } from '@loaders.gl/compression';
+
  import {ParquetCompression} from './schema/declare';
- import * as Util from './util';
- import zlib from 'zlib';
- import snappyjs from 'snappyjs';
+ import {toArrayBuffer, toBuffer} from './utils/buffer-utils';
+
+ // TODO switch to worker compression to avoid bundling...
+
+ // import brotli from 'brotli'; - brotli has problems with decompress in browsers
+ import brotliDecompress from 'brotli/decompress';
+ import lz4js from 'lz4js';
+ import lzo from 'lzo';
+ import {ZstdCodec} from 'zstd-codec';
+
+ // Inject large dependencies through Compression constructor options
+ const modules = {
+   // brotli has problems with decompress in browsers
+   brotli: {
+     decompress: brotliDecompress,
+     compress: () => {
+       throw new Error('brotli compress');
+     }
+   },
+   lz4js,
+   lzo,
+   'zstd-codec': ZstdCodec
+ };

- let brotli: any;
- let lzo: any;
- let lz4js: any;
+ // See https://github.com/apache/parquet-format/blob/master/Compression.md
+ export const PARQUET_COMPRESSION_METHODS: Record<ParquetCompression, Compression> = {
+   UNCOMPRESSED: new NoCompression(),
+   GZIP: new GZipCompression(),
+   SNAPPY: new SnappyCompression(),
+   BROTLI: new BrotliCompression({modules}),
+   // TODO: Understand difference between LZ4 and LZ4_RAW
+   LZ4: new LZ4Compression({modules}),
+   LZ4_RAW: new LZ4Compression({modules}),
+   LZO: new LZOCompression({modules}),
+   ZSTD: new ZstdCompression({modules})
+ };

- export interface ParquetCompressionKit {
-   deflate: (value: Buffer) => Buffer;
-   inflate: (value: Buffer, size: number) => Buffer;
+ /**
+  * Register compressions that have big external libraries
+  * @param options.modules External library dependencies
+  */
+ export async function preloadCompressions(options?: {modules: {[key: string]: any}}) {
+   const compressions = Object.values(PARQUET_COMPRESSION_METHODS);
+   return await Promise.all(compressions.map((compression) => compression.preload()));
  }

- export const PARQUET_COMPRESSION_METHODS: Record<ParquetCompression, ParquetCompressionKit> = {
-   UNCOMPRESSED: {
-     deflate: deflate_identity,
-     inflate: inflate_identity
-   },
-   GZIP: {
-     deflate: deflate_gzip,
-     inflate: inflate_gzip
-   },
-   SNAPPY: {
-     deflate: deflate_snappy,
-     inflate: inflate_snappy
-   },
-   LZO: {
-     deflate: deflate_lzo,
-     inflate: inflate_lzo
-   },
-   BROTLI: {
-     deflate: deflate_brotli,
-     inflate: inflate_brotli
-   },
-   LZ4: {
-     deflate: deflate_lz4,
-     inflate: inflate_lz4
+ /**
+  * Deflate a value using compression method `method`
+  */
+ export async function deflate(method: ParquetCompression, value: Buffer): Promise<Buffer> {
+   const compression = PARQUET_COMPRESSION_METHODS[method];
+   if (!compression) {
+     throw new Error(`parquet: invalid compression method: ${method}`);
    }
- };
+   const inputArrayBuffer = toArrayBuffer(value);
+   const compressedArrayBuffer = await compression.compress(inputArrayBuffer);
+   return toBuffer(compressedArrayBuffer);
+ }

  /**
-  * Deflate a value using compression method `method`
+  * Inflate a value using compression method `method`
+  */
+ export async function decompress(
+   method: ParquetCompression,
+   value: Buffer,
+   size: number
+ ): Promise<Buffer> {
+   const compression = PARQUET_COMPRESSION_METHODS[method];
+   if (!compression) {
+     throw new Error(`parquet: invalid compression method: ${method}`);
+   }
+   const inputArrayBuffer = toArrayBuffer(value);
+   const compressedArrayBuffer = await compression.decompress(inputArrayBuffer, size);
+   return toBuffer(compressedArrayBuffer);
+ }
+
+ /*
+  * Inflate a value using compression method `method`
   */
- export function deflate(method: ParquetCompression, value: Buffer): Buffer {
+ export function inflate(method: ParquetCompression, value: Buffer, size: number): Buffer {
    if (!(method in PARQUET_COMPRESSION_METHODS)) {
      throw new Error(`invalid compression method: ${method}`);
    }
-
-   return PARQUET_COMPRESSION_METHODS[method].deflate(value);
+   // @ts-ignore
+   return PARQUET_COMPRESSION_METHODS[method].inflate(value, size);
  }

+ /*
  function deflate_identity(value: Buffer): Buffer {
    return value;
  }
@@ -92,18 +142,6 @@ function deflate_lz4(value: Buffer): Buffer {
      throw err;
    }
  }
-
- /**
-  * Inflate a value using compression method `method`
-  */
- export function inflate(method: ParquetCompression, value: Buffer, size: number): Buffer {
-   if (!(method in PARQUET_COMPRESSION_METHODS)) {
-     throw new Error(`invalid compression method: ${method}`);
-   }
-
-   return PARQUET_COMPRESSION_METHODS[method].inflate(value, size);
- }
-
  function inflate_identity(value: Buffer): Buffer {
    return value;
  }
@@ -142,3 +180,4 @@ function inflate_brotli(value: Buffer): Buffer {
    }
    return Buffer.from(brotli.decompress(value));
  }
+ */
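A short sketch of how the new async compression layer above might be exercised from within the package source (import path shown relative to src/; the exact public export surface is not shown in this diff):

  import {preloadCompressions, deflate, decompress} from './parquetjs/compression';

  // Load the heavyweight codec libraries (brotli, lz4, zstd, ...) before touching pages
  await preloadCompressions();

  // Round-trip a buffer through a codec; decompress needs the uncompressed size
  const input = Buffer.from('hello parquet');
  const compressed = await deflate('GZIP', input);
  const restored = await decompress('GZIP', compressed, input.length);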
package/src/parquetjs/{writer.ts → encoder/writer.ts}
@@ -1,17 +1,17 @@
  // Forked from https://github.com/kbajalc/parquets under MIT license (Copyright (c) 2017 ironSource Ltd.)
  /* eslint-disable camelcase */
  import {Transform, Writable} from 'stream';
- import {ParquetCodecOptions, PARQUET_CODECS} from './codecs';
- import * as Compression from './compression';
+ import {ParquetCodecOptions, PARQUET_CODECS} from '../codecs';
+ import * as Compression from '../compression';
  import {
    ParquetBuffer,
    ParquetCodec,
    ParquetData,
    ParquetField,
    PrimitiveType
- } from './schema/declare';
- import {ParquetSchema} from './schema/schema';
- import * as Shred from './schema/shred';
+ } from '../schema/declare';
+ import {ParquetSchema} from '../schema/schema';
+ import * as Shred from '../schema/shred';
  import {
    ColumnChunk,
    ColumnMetaData,
@@ -28,8 +28,9 @@ import {
    RowGroup,
    SchemaElement,
    Type
- } from './parquet-thrift';
- import * as Util from './util';
+ } from '../parquet-thrift';
+ import {osopen, oswrite, osclose} from '../utils/file-utils';
+ import {getBitWidth, serializeThrift} from '../utils/read-utils';
  import Int64 from 'node-int64';

  /**
@@ -85,7 +86,7 @@ export class ParquetWriter<T> {
      path: string,
      opts?: ParquetWriterOptions
    ): Promise<ParquetWriter<T>> {
-     const outputStream = await Util.osopen(path, opts);
+     const outputStream = await osopen(path, opts);
      return ParquetWriter.openStream(schema, outputStream, opts);
    }

@@ -131,13 +132,16 @@ export class ParquetWriter<T> {
      this.closed = false;
      this.userMetadata = {};

+     // eslint-disable-next-line @typescript-eslint/no-floating-promises
+     this.writeHeader();
+   }
+
+   async writeHeader(): Promise<void> {
      // TODO - better not mess with promises in the constructor
      try {
-       // eslint-disable-next-line @typescript-eslint/no-floating-promises
-       envelopeWriter.writeHeader();
+       await this.envelopeWriter.writeHeader();
      } catch (err) {
-       // eslint-disable-next-line @typescript-eslint/no-floating-promises
-       envelopeWriter.close();
+       await this.envelopeWriter.close();
        throw err;
      }
    }
@@ -226,8 +230,8 @@ export class ParquetEnvelopeWriter {
      outputStream: Writable,
      opts: ParquetWriterOptions
    ): Promise<ParquetEnvelopeWriter> {
-     const writeFn = Util.oswrite.bind(undefined, outputStream);
-     const closeFn = Util.osclose.bind(undefined, outputStream);
+     const writeFn = oswrite.bind(undefined, outputStream);
+     const closeFn = osclose.bind(undefined, outputStream);
      return new ParquetEnvelopeWriter(schema, writeFn, closeFn, 0, opts);
    }

@@ -274,7 +278,7 @@ export class ParquetEnvelopeWriter {
     * shredRecord method
     */
    async writeRowGroup(records: ParquetBuffer): Promise<void> {
-     const rgroup = encodeRowGroup(this.schema, records, {
+     const rgroup = await encodeRowGroup(this.schema, records, {
        baseOffset: this.offset,
        pageSize: this.pageSize,
        useDataPageV2: this.useDataPageV2
@@ -282,7 +286,7 @@

      this.rowCount += records.rowCount;
      this.rowGroups.push(rgroup.metadata);
-     return this.writeSection(rgroup.body);
+     return await this.writeSection(rgroup.body);
    }

    /**
@@ -363,19 +367,19 @@ function encodeValues(
  /**
   * Encode a parquet data page
   */
- function encodeDataPage(
+ async function encodeDataPage(
    column: ParquetField,
    data: ParquetData
- ): {
+ ): Promise<{
    header: PageHeader;
    headerSize: number;
    page: Buffer;
- } {
+ }> {
    /* encode repetition and definition levels */
    let rLevelsBuf = Buffer.alloc(0);
    if (column.rLevelMax > 0) {
      rLevelsBuf = encodeValues(PARQUET_RDLVL_TYPE, PARQUET_RDLVL_ENCODING, data.rlevels, {
-       bitWidth: Util.getBitWidth(column.rLevelMax)
+       bitWidth: getBitWidth(column.rLevelMax)
        // disableEnvelope: false
      });
    }
@@ -383,7 +387,7 @@ function encodeDataPage(
    let dLevelsBuf = Buffer.alloc(0);
    if (column.dLevelMax > 0) {
      dLevelsBuf = encodeValues(PARQUET_RDLVL_TYPE, PARQUET_RDLVL_ENCODING, data.dlevels, {
-       bitWidth: Util.getBitWidth(column.dLevelMax)
+       bitWidth: getBitWidth(column.dLevelMax)
        // disableEnvelope: false
      });
    }
@@ -397,7 +401,7 @@ function encodeDataPage(
    const dataBuf = Buffer.concat([rLevelsBuf, dLevelsBuf, valuesBuf]);

    // compression = column.compression === 'UNCOMPRESSED' ? (compression || 'UNCOMPRESSED') : column.compression;
-   const compressedBuf = Compression.deflate(column.compression!, dataBuf);
+   const compressedBuf = await Compression.deflate(column.compression!, dataBuf);

    /* build page header */
    const header = new PageHeader({
@@ -413,7 +417,7 @@ function encodeDataPage(
    });

    /* concat page header, repetition and definition levels and values */
-   const headerBuf = Util.serializeThrift(header);
+   const headerBuf = serializeThrift(header);
    const page = Buffer.concat([headerBuf, compressedBuf]);

    return {header, headerSize: headerBuf.length, page};
@@ -422,15 +426,15 @@ function encodeDataPage(
  /**
   * Encode a parquet data page (v2)
   */
- function encodeDataPageV2(
+ async function encodeDataPageV2(
    column: ParquetField,
    data: ParquetData,
    rowCount: number
- ): {
+ ): Promise<{
    header: PageHeader;
    headerSize: number;
    page: Buffer;
- } {
+ }> {
    /* encode values */
    const valuesBuf = encodeValues(column.primitiveType!, column.encoding!, data.values, {
      typeLength: column.typeLength,
@@ -438,13 +442,13 @@ function encodeDataPageV2(
    });

    // compression = column.compression === 'UNCOMPRESSED' ? (compression || 'UNCOMPRESSED') : column.compression;
-   const compressedBuf = Compression.deflate(column.compression!, valuesBuf);
+   const compressedBuf = await Compression.deflate(column.compression!, valuesBuf);

    /* encode repetition and definition levels */
    let rLevelsBuf = Buffer.alloc(0);
    if (column.rLevelMax > 0) {
      rLevelsBuf = encodeValues(PARQUET_RDLVL_TYPE, PARQUET_RDLVL_ENCODING, data.rlevels, {
-       bitWidth: Util.getBitWidth(column.rLevelMax),
+       bitWidth: getBitWidth(column.rLevelMax),
        disableEnvelope: true
      });
    }
@@ -452,7 +456,7 @@ function encodeDataPageV2(
    let dLevelsBuf = Buffer.alloc(0);
    if (column.dLevelMax > 0) {
      dLevelsBuf = encodeValues(PARQUET_RDLVL_TYPE, PARQUET_RDLVL_ENCODING, data.dlevels, {
-       bitWidth: Util.getBitWidth(column.dLevelMax),
+       bitWidth: getBitWidth(column.dLevelMax),
        disableEnvelope: true
      });
    }
@@ -474,7 +478,7 @@ function encodeDataPageV2(
    });

    /* concat page header, repetition and definition levels and values */
-   const headerBuf = Util.serializeThrift(header);
+   const headerBuf = serializeThrift(header);
    const page = Buffer.concat([headerBuf, rLevelsBuf, dLevelsBuf, compressedBuf]);
    return {header, headerSize: headerBuf.length, page};
  }
@@ -482,16 +486,16 @@ function encodeDataPageV2(
  /**
   * Encode an array of values into a parquet column chunk
   */
- function encodeColumnChunk(
+ async function encodeColumnChunk(
    column: ParquetField,
    buffer: ParquetBuffer,
    offset: number,
    opts: ParquetWriterOptions
- ): {
+ ): Promise<{
    body: Buffer;
    metadata: ColumnMetaData;
    metadataOffset: number;
- } {
+ }> {
    const data = buffer.columnData[column.path.join()];
    const baseOffset = (opts.baseOffset || 0) + offset;
    /* encode data page(s) */
@@ -502,12 +506,9 @@ function encodeColumnChunk(
    // tslint:disable-next-line:variable-name
    let total_compressed_size = 0;
    {
-     let result: any;
-     if (opts.useDataPageV2) {
-       result = encodeDataPageV2(column, data, buffer.rowCount);
-     } else {
-       result = encodeDataPage(column, data);
-     }
+     const result = opts.useDataPageV2
+       ? await encodeDataPageV2(column, data, buffer.rowCount)
+       : await encodeDataPage(column, data);
      // pages.push(result.page);
      pageBuf = result.page;
      total_uncompressed_size += result.header.uncompressed_page_size + result.headerSize;
@@ -535,21 +536,21 @@ function encodeColumnChunk(

    /* concat metadata header and data pages */
    const metadataOffset = baseOffset + pageBuf.length;
-   const body = Buffer.concat([pageBuf, Util.serializeThrift(metadata)]);
+   const body = Buffer.concat([pageBuf, serializeThrift(metadata)]);
    return {body, metadata, metadataOffset};
  }

  /**
   * Encode a list of column values into a parquet row group
   */
- function encodeRowGroup(
+ async function encodeRowGroup(
    schema: ParquetSchema,
    data: ParquetBuffer,
    opts: ParquetWriterOptions
- ): {
+ ): Promise<{
    body: Buffer;
    metadata: RowGroup;
- } {
+ }> {
    const metadata = new RowGroup({
      num_rows: data.rowCount,
      columns: [],
@@ -562,7 +563,7 @@ function encodeRowGroup(
        continue; // eslint-disable-line no-continue
      }

-     const cchunkData = encodeColumnChunk(field, data, body.length, opts);
+     const cchunkData = await encodeColumnChunk(field, data, body.length, opts);

      const cchunk = new ColumnChunk({
        file_offset: cchunkData.metadataOffset,
@@ -634,7 +635,7 @@ function encodeFooter(
      metadata.schema.push(schemaElem);
    }

-   const metadataEncoded = Util.serializeThrift(metadata);
+   const metadataEncoded = serializeThrift(metadata);
    const footerEncoded = Buffer.alloc(metadataEncoded.length + 8);
    metadataEncoded.copy(footerEncoded);
    footerEncoded.writeUInt32LE(metadataEncoded.length, metadataEncoded.length);
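Finally, a sketch of driving the writer whose encode path became async above, following the upstream parquets/parquetjs API that this fork keeps (field definitions and the output path are illustrative):

  import {ParquetSchema} from './parquetjs/schema/schema';
  import {ParquetWriter} from './parquetjs/encoder/writer';

  const schema = new ParquetSchema({
    name: {type: 'UTF8'},
    quantity: {type: 'INT64', optional: true}
  });

  // Open a file writer, append rows (buffered into row groups), then close to write the footer
  const writer = await ParquetWriter.openFile(schema, '/tmp/rows.parquet');
  await writer.appendRow({name: 'apples', quantity: 10});
  await writer.appendRow({name: 'oranges'});
  await writer.close();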