@loaders.gl/parquet 3.4.6 → 4.0.0-alpha.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/dist.min.js +27 -34
- package/dist/dist.min.js.map +3 -3
- package/dist/es5/index.js +6 -6
- package/dist/es5/index.js.map +1 -1
- package/dist/es5/lib/arrow/convert-row-group-to-columns.js.map +1 -1
- package/dist/es5/lib/arrow/convert-schema-from-parquet.js +58 -42
- package/dist/es5/lib/arrow/convert-schema-from-parquet.js.map +1 -1
- package/dist/es5/lib/arrow/convert-schema-to-parquet.js +33 -31
- package/dist/es5/lib/arrow/convert-schema-to-parquet.js.map +1 -1
- package/dist/es5/lib/geo/decode-geo-metadata.js +12 -8
- package/dist/es5/lib/geo/decode-geo-metadata.js.map +1 -1
- package/dist/es5/lib/parsers/parse-parquet-to-columns.js +11 -7
- package/dist/es5/lib/parsers/parse-parquet-to-columns.js.map +1 -1
- package/dist/es5/lib/parsers/parse-parquet-to-rows.js +51 -29
- package/dist/es5/lib/parsers/parse-parquet-to-rows.js.map +1 -1
- package/dist/es5/lib/wasm/parse-parquet-wasm.js +6 -6
- package/dist/es5/lib/wasm/parse-parquet-wasm.js.map +1 -1
- package/dist/es5/parquet-loader.js +16 -4
- package/dist/es5/parquet-loader.js.map +1 -1
- package/dist/es5/parquet-wasm-loader.js +1 -1
- package/dist/es5/parquet-wasm-loader.js.map +1 -1
- package/dist/es5/parquet-wasm-writer.js +1 -1
- package/dist/es5/parquet-wasm-writer.js.map +1 -1
- package/dist/es5/parquet-writer.js +1 -1
- package/dist/es5/parquet-writer.js.map +1 -1
- package/dist/es5/parquetjs/encoder/parquet-encoder.js.map +1 -1
- package/dist/es5/parquetjs/parser/decoders.js.map +1 -1
- package/dist/es5/parquetjs/parser/parquet-reader.js +1 -1
- package/dist/es5/parquetjs/parser/parquet-reader.js.map +1 -1
- package/dist/es5/parquetjs/schema/declare.js +4 -4
- package/dist/es5/parquetjs/schema/declare.js.map +1 -1
- package/dist/es5/parquetjs/schema/schema.js +7 -7
- package/dist/es5/parquetjs/schema/schema.js.map +1 -1
- package/dist/es5/parquetjs/schema/shred.js +117 -22
- package/dist/es5/parquetjs/schema/shred.js.map +1 -1
- package/dist/esm/index.js +5 -5
- package/dist/esm/index.js.map +1 -1
- package/dist/esm/lib/arrow/convert-row-group-to-columns.js.map +1 -1
- package/dist/esm/lib/arrow/convert-schema-from-parquet.js +57 -41
- package/dist/esm/lib/arrow/convert-schema-from-parquet.js.map +1 -1
- package/dist/esm/lib/arrow/convert-schema-to-parquet.js +33 -31
- package/dist/esm/lib/arrow/convert-schema-to-parquet.js.map +1 -1
- package/dist/esm/lib/geo/decode-geo-metadata.js +12 -8
- package/dist/esm/lib/geo/decode-geo-metadata.js.map +1 -1
- package/dist/esm/lib/parsers/parse-parquet-to-columns.js +12 -8
- package/dist/esm/lib/parsers/parse-parquet-to-columns.js.map +1 -1
- package/dist/esm/lib/parsers/parse-parquet-to-rows.js +14 -3
- package/dist/esm/lib/parsers/parse-parquet-to-rows.js.map +1 -1
- package/dist/esm/lib/wasm/parse-parquet-wasm.js +3 -3
- package/dist/esm/lib/wasm/parse-parquet-wasm.js.map +1 -1
- package/dist/esm/parquet-loader.js +14 -2
- package/dist/esm/parquet-loader.js.map +1 -1
- package/dist/esm/parquet-wasm-loader.js +1 -1
- package/dist/esm/parquet-wasm-loader.js.map +1 -1
- package/dist/esm/parquet-wasm-writer.js +1 -1
- package/dist/esm/parquet-wasm-writer.js.map +1 -1
- package/dist/esm/parquet-writer.js +1 -1
- package/dist/esm/parquet-writer.js.map +1 -1
- package/dist/esm/parquetjs/encoder/parquet-encoder.js.map +1 -1
- package/dist/esm/parquetjs/parser/decoders.js.map +1 -1
- package/dist/esm/parquetjs/parser/parquet-reader.js +2 -2
- package/dist/esm/parquetjs/parser/parquet-reader.js.map +1 -1
- package/dist/esm/parquetjs/schema/declare.js +1 -1
- package/dist/esm/parquetjs/schema/declare.js.map +1 -1
- package/dist/esm/parquetjs/schema/schema.js +6 -6
- package/dist/esm/parquetjs/schema/schema.js.map +1 -1
- package/dist/esm/parquetjs/schema/shred.js +108 -21
- package/dist/esm/parquetjs/schema/shred.js.map +1 -1
- package/dist/index.d.ts +8 -49
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +8 -6
- package/dist/lib/arrow/convert-row-group-to-columns.d.ts +2 -2
- package/dist/lib/arrow/convert-row-group-to-columns.d.ts.map +1 -1
- package/dist/lib/arrow/convert-schema-from-parquet.d.ts +4 -4
- package/dist/lib/arrow/convert-schema-from-parquet.d.ts.map +1 -1
- package/dist/lib/arrow/convert-schema-from-parquet.js +48 -44
- package/dist/lib/arrow/convert-schema-to-parquet.d.ts +1 -1
- package/dist/lib/arrow/convert-schema-to-parquet.d.ts.map +1 -1
- package/dist/lib/arrow/convert-schema-to-parquet.js +30 -31
- package/dist/lib/geo/decode-geo-metadata.js +12 -8
- package/dist/lib/parsers/parse-parquet-to-columns.d.ts +2 -2
- package/dist/lib/parsers/parse-parquet-to-columns.d.ts.map +1 -1
- package/dist/lib/parsers/parse-parquet-to-columns.js +13 -7
- package/dist/lib/parsers/parse-parquet-to-rows.d.ts +3 -2
- package/dist/lib/parsers/parse-parquet-to-rows.d.ts.map +1 -1
- package/dist/lib/parsers/parse-parquet-to-rows.js +16 -19
- package/dist/lib/wasm/parse-parquet-wasm.d.ts +3 -3
- package/dist/lib/wasm/parse-parquet-wasm.d.ts.map +1 -1
- package/dist/lib/wasm/parse-parquet-wasm.js +3 -3
- package/dist/parquet-loader.d.ts +3 -14
- package/dist/parquet-loader.d.ts.map +1 -1
- package/dist/parquet-loader.js +14 -2
- package/dist/parquet-worker.js +31 -38
- package/dist/parquet-worker.js.map +3 -3
- package/dist/parquet-writer.d.ts +2 -1
- package/dist/parquet-writer.d.ts.map +1 -1
- package/dist/parquet-writer.js +1 -0
- package/dist/parquetjs/encoder/parquet-encoder.d.ts +4 -4
- package/dist/parquetjs/encoder/parquet-encoder.d.ts.map +1 -1
- package/dist/parquetjs/parser/decoders.d.ts +2 -2
- package/dist/parquetjs/parser/decoders.d.ts.map +1 -1
- package/dist/parquetjs/parser/parquet-reader.d.ts +6 -6
- package/dist/parquetjs/parser/parquet-reader.d.ts.map +1 -1
- package/dist/parquetjs/parser/parquet-reader.js +1 -1
- package/dist/parquetjs/schema/declare.d.ts +6 -5
- package/dist/parquetjs/schema/declare.d.ts.map +1 -1
- package/dist/parquetjs/schema/declare.js +3 -3
- package/dist/parquetjs/schema/schema.d.ts +4 -4
- package/dist/parquetjs/schema/schema.d.ts.map +1 -1
- package/dist/parquetjs/schema/schema.js +5 -5
- package/dist/parquetjs/schema/shred.d.ts +17 -111
- package/dist/parquetjs/schema/shred.d.ts.map +1 -1
- package/dist/parquetjs/schema/shred.js +127 -119
- package/package.json +8 -8
- package/src/index.ts +32 -9
- package/src/lib/arrow/convert-row-group-to-columns.ts +2 -2
- package/src/lib/arrow/convert-schema-from-parquet.ts +56 -66
- package/src/lib/arrow/convert-schema-to-parquet.ts +32 -44
- package/src/lib/geo/decode-geo-metadata.ts +17 -8
- package/src/lib/parsers/parse-parquet-to-columns.ts +22 -11
- package/src/lib/parsers/parse-parquet-to-rows.ts +28 -23
- package/src/lib/wasm/parse-parquet-wasm.ts +7 -7
- package/src/parquet-loader.ts +25 -2
- package/src/parquet-writer.ts +4 -1
- package/src/parquetjs/encoder/parquet-encoder.ts +11 -10
- package/src/parquetjs/parser/decoders.ts +3 -3
- package/src/parquetjs/parser/parquet-reader.ts +7 -7
- package/src/parquetjs/schema/declare.ts +6 -5
- package/src/parquetjs/schema/schema.ts +8 -8
- package/src/parquetjs/schema/shred.ts +142 -103
package/src/lib/arrow/convert-schema-to-parquet.ts CHANGED
@@ -10,59 +10,47 @@ import {
   Schema,
   // Struct,
   // Field,
-  DataType
-  Bool,
-  Float64,
-  Int32,
-  Float32,
-  Binary,
-  Utf8,
-  Int64,
-  Uint16,
-  Uint32,
-  Uint64,
-  Int8,
-  Int16
+  DataType
 } from '@loaders.gl/schema';
 
-export const PARQUET_TYPE_MAPPING: {[type in ParquetType]:
-  BOOLEAN:
-  INT32:
-  INT64:
-  INT96:
-  FLOAT:
-  DOUBLE:
-  BYTE_ARRAY:
-  FIXED_LEN_BYTE_ARRAY:
-  UTF8:
-  DATE:
-  TIME_MILLIS:
-  TIME_MICROS:
-  TIMESTAMP_MILLIS:
-  TIMESTAMP_MICROS:
-  UINT_8:
-  UINT_16:
-  UINT_32:
-  UINT_64:
-  INT_8:
-  INT_16:
-  INT_32:
-  INT_64:
-  JSON:
-  BSON:
+export const PARQUET_TYPE_MAPPING: {[type in ParquetType]: DataType} = {
+  BOOLEAN: 'bool',
+  INT32: 'int32',
+  INT64: 'float64',
+  INT96: 'float64',
+  FLOAT: 'float32',
+  DOUBLE: 'float64',
+  BYTE_ARRAY: 'binary',
+  FIXED_LEN_BYTE_ARRAY: 'binary',
+  UTF8: 'utf8',
+  DATE: 'int32',
+  TIME_MILLIS: 'int64',
+  TIME_MICROS: 'int64',
+  TIMESTAMP_MILLIS: 'int64',
+  TIMESTAMP_MICROS: 'int64',
+  UINT_8: 'int32',
+  UINT_16: 'uint16',
+  UINT_32: 'uint32',
+  UINT_64: 'uint64',
+  INT_8: 'int8',
+  INT_16: 'int16',
+  INT_32: 'int32',
+  INT_64: 'int64',
+  JSON: 'binary',
+  BSON: 'binary',
   // TODO check interval type
-  INTERVAL:
-  DECIMAL_INT32:
-  DECIMAL_INT64:
-  DECIMAL_BYTE_ARRAY:
-  DECIMAL_FIXED_LEN_BYTE_ARRAY:
+  INTERVAL: 'binary',
+  DECIMAL_INT32: 'float32',
+  DECIMAL_INT64: 'float64',
+  DECIMAL_BYTE_ARRAY: 'float64',
+  DECIMAL_FIXED_LEN_BYTE_ARRAY: 'float64'
 };
 
 export function convertToParquetSchema(schema: Schema): Schema {
   const fields = []; // getFields(schema.fields);
 
   // TODO add metadata if needed.
-  return
+  return {fields, metadata: {}};
 }
 
 // function getFields(schema: Field[]): Definition[] {
package/src/lib/geo/decode-geo-metadata.ts CHANGED
@@ -26,7 +26,7 @@ export type GeoColumnMetadata = {
  * Reads the GeoMetadata object from the metadata
  * @note geoarrow / parquet schema is stringified into a single key-value pair in the parquet metadata */
 export function getGeoMetadata(schema: Schema): GeoMetadata | null {
-  const stringifiedGeoMetadata = schema.metadata.
+  const stringifiedGeoMetadata = schema.metadata.geo;
   if (!stringifiedGeoMetadata) {
     return null;
   }
@@ -45,7 +45,7 @@ export function getGeoMetadata(schema: Schema): GeoMetadata | null {
  */
 export function setGeoMetadata(schema: Schema, geoMetadata: GeoMetadata): void {
   const stringifiedGeoMetadata = JSON.stringify(geoMetadata);
-  schema.metadata.
+  schema.metadata.geo = stringifiedGeoMetadata;
 }
 
 /**
@@ -62,21 +62,21 @@ export function unpackGeoMetadata(schema: Schema): void {
 
   const {version, primary_column, columns} = geoMetadata;
   if (version) {
-    schema.metadata
+    schema.metadata['geo.version'] = version;
   }
 
   if (primary_column) {
-    schema.metadata
+    schema.metadata['geo.primary_column'] = primary_column;
   }
 
   // store column names as comma separated list
-  schema.metadata
+  schema.metadata['geo.columns'] = Object.keys(columns || {}).join('');
 
   for (const [columnName, columnMetadata] of Object.entries(columns || {})) {
     const field = schema.fields.find((field) => field.name === columnName);
     if (field) {
       if (field.name === primary_column) {
-        field
+        setFieldMetadata(field, 'geo.primary_field', 'true');
       }
       unpackGeoFieldMetadata(field, columnMetadata);
     }
@@ -87,13 +87,22 @@ function unpackGeoFieldMetadata(field: Field, columnMetadata): void {
   for (const [key, value] of Object.entries(columnMetadata || {})) {
     switch (key) {
       case 'geometry_type':
-        field
+        setFieldMetadata(field, `geo.${key}`, (value as string[]).join(','));
         break;
       case 'bbox':
       case 'crs':
      case 'edges':
       default:
-
+        setFieldMetadata(
+          field,
+          `geo.${key}`,
+          typeof value === 'string' ? value : JSON.stringify(value)
+        );
     }
   }
 }
+
+function setFieldMetadata(field: Field, key: string, value: string): void {
+  field.metadata = field.metadata || {};
+  field.metadata[key] = value;
+}
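
Note: per the hunks above, getGeoMetadata and setGeoMetadata now read and write the stringified GeoParquet object under schema.metadata.geo, and unpackGeoMetadata flattens it into 'geo.'-prefixed keys on the schema and its fields. An illustrative TypeScript sketch of the resulting layout follows; the column names and values are invented for the example and are not taken from the package:

// Invented example of GeoParquet metadata as stored on a schema
const schemaMetadata: Record<string, string> = {
  geo: JSON.stringify({
    version: '1.0.0-beta.1',
    primary_column: 'geometry',
    columns: {geometry: {encoding: 'WKB', geometry_type: ['Point']}}
  })
};
console.log(JSON.parse(schemaMetadata.geo).primary_column); // 'geometry'

// After unpackGeoMetadata(schema), following the logic in the hunk above:
//   schema.metadata['geo.version']        === '1.0.0-beta.1'
//   schema.metadata['geo.primary_column'] === 'geometry'
//   schema.metadata['geo.columns']        === 'geometry'
// and on the 'geometry' field:
//   field.metadata['geo.primary_field']   === 'true'
//   field.metadata['geo.geometry_type']   === 'Point'
//   field.metadata['geo.encoding']        === 'WKB'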
package/src/lib/parsers/parse-parquet-to-columns.ts CHANGED
@@ -1,24 +1,30 @@
 // loaders.gl, MIT license
 
 // import type {LoaderWithParser, Loader, LoaderOptions} from '@loaders.gl/loader-utils';
-import {ColumnarTableBatch, Schema} from '@loaders.gl/schema';
+import {ColumnarTable, ColumnarTableBatch, Schema} from '@loaders.gl/schema';
 import {makeReadableFile} from '@loaders.gl/loader-utils';
 import type {ParquetLoaderOptions} from '../../parquet-loader';
 import {ParquetReader} from '../../parquetjs/parser/parquet-reader';
-import {
-import {
-import {
+import {ParquetRowGroup} from '../../parquetjs/schema/declare';
+import {ParquetSchema} from '../../parquetjs/schema/schema';
+import {convertParquetSchema} from '../arrow/convert-schema-from-parquet';
+import {materializeColumns} from '../../parquetjs/schema/shred';
+// import {convertParquetRowGroupToColumns} from '../arrow/convert-row-group-to-columns';
 import {unpackGeoMetadata} from '../geo/decode-geo-metadata';
 
 export async function parseParquetInColumns(
   arrayBuffer: ArrayBuffer,
   options?: ParquetLoaderOptions
-) {
+): Promise<ColumnarTable> {
   const blob = new Blob([arrayBuffer]);
   for await (const batch of parseParquetFileInColumnarBatches(blob, options)) {
-    return
+    return {
+      shape: 'columnar-table',
+      schema: batch.schema,
+      data: batch.data
+    };
   }
-
+  throw new Error('empty table');
 }
 
 export async function* parseParquetFileInColumnarBatches(
@@ -29,16 +35,21 @@ export async function* parseParquetFileInColumnarBatches(
   const reader = new ParquetReader(file);
   const parquetSchema = await reader.getSchema();
   const parquetMetadata = await reader.getFileMetadata();
-  const schema =
+  const schema = convertParquetSchema(parquetSchema, parquetMetadata);
   unpackGeoMetadata(schema);
   const rowGroups = reader.rowGroupIterator(options?.parquet);
   for await (const rowGroup of rowGroups) {
-    yield convertRowGroupToTableBatch(
+    yield convertRowGroupToTableBatch(parquetSchema, rowGroup, schema);
   }
 }
 
-function convertRowGroupToTableBatch(
-
+function convertRowGroupToTableBatch(
+  parquetSchema: ParquetSchema,
+  rowGroup: ParquetRowGroup,
+  schema: Schema
+): ColumnarTableBatch {
+  // const data = convertParquetRowGroupToColumns(schema, rowGroup);
+  const data = materializeColumns(parquetSchema, rowGroup);
   return {
     shape: 'columnar-table',
     batchType: 'data',
package/src/lib/parsers/parse-parquet-to-rows.ts CHANGED
@@ -1,40 +1,45 @@
 // import type {LoaderWithParser, Loader, LoaderOptions} from '@loaders.gl/loader-utils';
 // import {ColumnarTableBatch} from '@loaders.gl/schema';
 import {makeReadableFile} from '@loaders.gl/loader-utils';
+import {ObjectRowTable, ObjectRowTableBatch} from '@loaders.gl/schema';
 import type {ParquetLoaderOptions} from '../../parquet-loader';
+import type {ParquetRow} from '../../parquetjs/schema/declare';
 import {ParquetReader} from '../../parquetjs/parser/parquet-reader';
 
-export async function parseParquet(
+export async function parseParquet(
+  arrayBuffer: ArrayBuffer,
+  options?: ParquetLoaderOptions
+): Promise<ObjectRowTable> {
   const blob = new Blob([arrayBuffer]);
+
+  const rows: ParquetRow[] = [];
   for await (const batch of parseParquetFileInBatches(blob, options)) {
-
+    // we have only one input batch so return
+    for (const row of batch.data) {
+      rows.push(row);
+    }
   }
-
+
+  return {
+    shape: 'object-row-table',
+    // TODO - spread can fail for very large number of batches
+    data: rows
+  };
 }
 
-export async function* parseParquetFileInBatches(
+export async function* parseParquetFileInBatches(
+  blob: Blob,
+  options?: ParquetLoaderOptions
+): AsyncIterable<ObjectRowTableBatch> {
   const file = makeReadableFile(blob);
   const reader = new ParquetReader(file);
   const rowBatches = reader.rowBatchIterator(options?.parquet);
   for await (const rows of rowBatches) {
-    yield
+    yield {
+      shape: 'object-row-table',
+      data: rows,
+      batchType: 'data',
+      length: rows.length
+    };
   }
 }
-
-// export async function* parseParquetFileInColumnarBatches(blob: Blob, options?: {columnList?: string[][]}): AsyncIterable<ColumnarTableBatch> {
-// const rowGroupReader = new ParquetRowGroupReader({data: blob, columnList: options?.columnList});
-// try {
-// for await (const rowGroup of rowGroupReader) {
-// yield convertRowGroupToTableBatch(rowGroup);
-// }
-// } finally {
-// await rowGroupReader.close();
-// }
-// }
-
-// function convertRowGroupToTableBatch(rowGroup): ColumnarTableBatch {
-// // @ts-expect-error
-// return {
-// data: rowGroup
-// };
-// }
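
Note: parseParquet now returns a typed ObjectRowTable and parseParquetFileInBatches yields ObjectRowTableBatch objects. A minimal usage sketch based only on the signatures in this hunk; the package-level import path is an assumption, not confirmed by the diff:

// Assumed re-export from the package entry point
import {parseParquet, parseParquetFileInBatches} from '@loaders.gl/parquet';

async function readRows(arrayBuffer: ArrayBuffer): Promise<void> {
  // One-shot parse: collects every row batch into a single object-row table
  const table = await parseParquet(arrayBuffer);
  console.log(table.shape); // 'object-row-table'
  console.log(table.data.length); // total number of rows

  // Streaming parse: one ObjectRowTableBatch per Parquet row batch
  const blob = new Blob([arrayBuffer]);
  for await (const batch of parseParquetFileInBatches(blob)) {
    console.log(`${batch.length} rows in this batch`);
  }
}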
package/src/lib/wasm/parse-parquet-wasm.ts CHANGED
@@ -1,20 +1,20 @@
 // eslint-disable
 import type {RecordBatch} from 'apache-arrow';
 import type {LoaderOptions} from '@loaders.gl/loader-utils';
-import {Table, RecordBatchStreamReader} from 'apache-arrow';
+import {Table as ArrowTable, RecordBatchStreamReader} from 'apache-arrow';
 import {loadWasm} from './load-wasm/load-wasm-node';
 
-export type
+export type ParquetWasmLoaderOptions = LoaderOptions & {
   parquet?: {
     type?: 'arrow-table';
     wasmUrl?: string;
   };
 };
 
-export async function
+export async function parseParquetWasm(
   arrayBuffer: ArrayBuffer,
-  options?:
-): Promise<
+  options?: ParquetWasmLoaderOptions
+): Promise<ArrowTable> {
   const wasmUrl = options?.parquet?.wasmUrl;
   const wasm = await loadWasm(wasmUrl);
 
@@ -32,11 +32,11 @@ export async function parseParquet(
  * Deserialize the IPC format into a {@link Table}. This function is a
  * convenience wrapper for {@link RecordBatchReader}. Opposite of {@link tableToIPC}.
  */
-function tableFromIPC(input: ArrayBuffer):
+function tableFromIPC(input: ArrayBuffer): ArrowTable {
   const reader = RecordBatchStreamReader.from(input);
   const recordBatches: RecordBatch[] = [];
   for (const recordBatch of reader) {
     recordBatches.push(recordBatch);
   }
-  return new
+  return new ArrowTable(recordBatches);
 }
package/src/parquet-loader.ts CHANGED
@@ -1,4 +1,11 @@
+// loaders.gl, MIT license
 import type {Loader, LoaderOptions} from '@loaders.gl/loader-utils';
+import type {
+  ObjectRowTable,
+  ObjectRowTableBatch,
+  ColumnarTable,
+  ColumnarTableBatch
+} from '@loaders.gl/schema';
 
 // __VERSION__ is injected by babel-plugin-version-inline
 // @ts-ignore TS2304: Cannot find name '__VERSION__'.
@@ -23,7 +30,7 @@ const DEFAULT_PARQUET_LOADER_OPTIONS: ParquetLoaderOptions = {
 };
 
 /** ParquetJS table loader */
-export const ParquetLoader = {
+export const ParquetLoader: Loader<ObjectRowTable, ObjectRowTableBatch, ParquetLoaderOptions> = {
   name: 'Apache Parquet',
   id: 'parquet',
   module: 'parquet',
@@ -37,4 +44,20 @@ export const ParquetLoader = {
   options: DEFAULT_PARQUET_LOADER_OPTIONS
 };
 
-export const
+export const ParqueColumnnartLoader: Loader<
+  ColumnarTable,
+  ColumnarTableBatch,
+  ParquetLoaderOptions
+> = {
+  name: 'Apache Parquet',
+  id: 'parquet',
+  module: 'parquet',
+  version: VERSION,
+  worker: true,
+  category: 'table',
+  extensions: ['parquet'],
+  mimeTypes: ['application/octet-stream'],
+  binary: true,
+  tests: ['PAR1', 'PARE'],
+  options: DEFAULT_PARQUET_LOADER_OPTIONS
+};
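
Note: ParquetLoader is now exported with the Loader<ObjectRowTable, ObjectRowTableBatch, ParquetLoaderOptions> type, so the result of a load() call can be inferred. A hedged usage sketch with @loaders.gl/core follows; the URL is a placeholder and the inferred result shape is taken from the typed loader above:

import {load} from '@loaders.gl/core';
import {ParquetLoader} from '@loaders.gl/parquet';

async function loadTable(): Promise<void> {
  // With the typed loader, the awaited result should be inferred as ObjectRowTable
  const table = await load('https://example.com/data.parquet', ParquetLoader);
  // table.shape === 'object-row-table'; table.data is an array of plain row objects
  for (const row of table.data.slice(0, 5)) {
    console.log(row);
  }
}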
package/src/parquet-writer.ts CHANGED
@@ -1,4 +1,7 @@
+// loaders.gl, MIT license
+
 import type {Writer} from '@loaders.gl/loader-utils';
+import {Table, TableBatch} from '@loaders.gl/schema';
 
 // __VERSION__ is injected by babel-plugin-version-inline
 // @ts-ignore TS2304: Cannot find name '__VERSION__'.
@@ -8,7 +11,7 @@ export type ParquetWriterOptions = {};
 
 const DEFAULT_PARQUET_LOADER_OPTIONS = {};
 
-export const ParquetWriter: Writer = {
+export const ParquetWriter: Writer<Table, TableBatch, ParquetWriterOptions> = {
   name: 'Apache Parquet',
   id: 'parquet',
   module: 'parquet',
package/src/parquetjs/encoder/parquet-encoder.ts CHANGED
@@ -4,11 +4,12 @@ import {stream} from '@loaders.gl/loader-utils';
 import {ParquetCodecOptions, PARQUET_CODECS} from '../codecs';
 import * as Compression from '../compression';
 import {
-
+  ParquetRowGroup,
   ParquetCodec,
-
+  ParquetColumnChunk,
   ParquetField,
-  PrimitiveType
+  PrimitiveType,
+  ParquetRow
 } from '../schema/declare';
 import {ParquetSchema} from '../schema/schema';
 import * as Shred from '../schema/shred';
@@ -105,7 +106,7 @@ export class ParquetEncoder<T> {
 
   public schema: ParquetSchema;
   public envelopeWriter: ParquetEnvelopeWriter;
-  public rowBuffer:
+  public rowBuffer: ParquetRowGroup;
   public rowGroupSize: number;
   public closed: boolean;
   public userMetadata: Record<string, string>;
@@ -144,7 +145,7 @@ export class ParquetEncoder<T> {
   * Append a single row to the parquet file. Rows are buffered in memory until
   * rowGroupSize rows are in the buffer or close() is called
   */
-  async appendRow<T>(row: T): Promise<void> {
+  async appendRow<T extends ParquetRow>(row: T): Promise<void> {
     if (this.closed) {
       throw new Error('writer was closed');
     }
@@ -271,7 +272,7 @@ export class ParquetEnvelopeWriter {
   * Encode a parquet row group. The records object should be created using the
   * shredRecord method
   */
-  async writeRowGroup(records:
+  async writeRowGroup(records: ParquetRowGroup): Promise<void> {
     const rgroup = await encodeRowGroup(this.schema, records, {
       baseOffset: this.offset,
       pageSize: this.pageSize,
@@ -363,7 +364,7 @@ function encodeValues(
  */
 async function encodeDataPage(
   column: ParquetField,
-  data:
+  data: ParquetColumnChunk
 ): Promise<{
   header: PageHeader;
   headerSize: number;
@@ -422,7 +423,7 @@ async function encodeDataPage(
  */
 async function encodeDataPageV2(
   column: ParquetField,
-  data:
+  data: ParquetColumnChunk,
   rowCount: number
 ): Promise<{
   header: PageHeader;
@@ -482,7 +483,7 @@ async function encodeDataPageV2(
  */
 async function encodeColumnChunk(
   column: ParquetField,
-  buffer:
+  buffer: ParquetRowGroup,
   offset: number,
   opts: ParquetEncoderOptions
 ): Promise<{
@@ -539,7 +540,7 @@ async function encodeColumnChunk(
  */
 async function encodeRowGroup(
   schema: ParquetSchema,
-  data:
+  data: ParquetRowGroup,
   opts: ParquetEncoderOptions
 ): Promise<{
   body: Buffer;
package/src/parquetjs/parser/decoders.ts CHANGED
@@ -1,7 +1,7 @@
 // Forked from https://github.com/kbajalc/parquets under MIT license (Copyright (c) 2017 ironSource Ltd.)
 import {
   ParquetCodec,
-
+  ParquetColumnChunk,
   ParquetOptions,
   ParquetPageData,
   ParquetType,
@@ -32,14 +32,14 @@ import {decodePageHeader, getThriftEnum, getBitWidth} from '../utils/read-utils'
 export async function decodeDataPages(
   buffer: Buffer,
   options: ParquetOptions
-): Promise<
+): Promise<ParquetColumnChunk> {
   const cursor: CursorBuffer = {
     buffer,
     offset: 0,
     size: buffer.length
   };
 
-  const data:
+  const data: ParquetColumnChunk = {
     rlevels: [],
     dlevels: [],
     values: [],
package/src/parquetjs/parser/parquet-reader.ts CHANGED
@@ -3,14 +3,14 @@ import type {ReadableFile} from '@loaders.gl/loader-utils';
 
 import {ParquetSchema} from '../schema/schema';
 import {decodeSchema} from './decoders';
-import {
+import {materializeRows} from '../schema/shred';
 
 import {PARQUET_MAGIC, PARQUET_MAGIC_ENCRYPTED} from '../../constants';
 import {ColumnChunk, CompressionCodec, FileMetaData, RowGroup, Type} from '../parquet-thrift';
 import {
-
+  ParquetRowGroup,
   ParquetCompression,
-
+  ParquetColumnChunk,
   PrimitiveType,
   ParquetOptions
 } from '../schema/declare';
@@ -68,7 +68,7 @@ export class ParquetReader {
   async *rowBatchIterator(props?: ParquetIterationProps) {
     const schema = await this.getSchema();
     for await (const rowGroup of this.rowGroupIterator(props)) {
-      yield
+      yield materializeRows(schema, rowGroup);
     }
   }
 
@@ -172,8 +172,8 @@ export class ParquetReader {
     schema: ParquetSchema,
     rowGroup: RowGroup,
     columnList: string[][]
-  ): Promise<
-    const buffer:
+  ): Promise<ParquetRowGroup> {
+    const buffer: ParquetRowGroup = {
      rowCount: Number(rowGroup.num_rows),
      columnData: {}
    };
@@ -191,7 +191,7 @@ export class ParquetReader {
   /**
   * Each row group contains column chunks for all the columns.
   */
-  async readColumnChunk(schema: ParquetSchema, colChunk: ColumnChunk): Promise<
+  async readColumnChunk(schema: ParquetSchema, colChunk: ColumnChunk): Promise<ParquetColumnChunk> {
     if (colChunk.file_path !== undefined && colChunk.file_path !== null) {
       throw new Error('external references are not supported');
     }
package/src/parquetjs/schema/declare.ts CHANGED
@@ -120,25 +120,26 @@ export interface ParquetPageData {
   pageHeader: PageHeader;
 }
 
-export interface
+export interface ParquetRow {
   [key: string]: any;
 }
 
 /** @
  * Holds data for one row group (column chunks) */
-export class
+export class ParquetRowGroup {
   /** Number of rows in this page */
   rowCount: number;
+  /** Map of Column chunks */
+  columnData: Record<string, ParquetColumnChunk>;
 
-  columnData: Record<string,
-  constructor(rowCount: number = 0, columnData: Record<string, ParquetData> = {}) {
+  constructor(rowCount: number = 0, columnData: Record<string, ParquetColumnChunk> = {}) {
     this.rowCount = rowCount;
     this.columnData = columnData;
   }
 }
 
 /** Holds the data for one column chunk */
-export interface
+export interface ParquetColumnChunk {
   dlevels: number[];
   rlevels: number[];
   values: any[];
package/src/parquetjs/schema/schema.ts CHANGED
@@ -4,14 +4,14 @@ import {PARQUET_CODECS} from '../codecs';
 import {PARQUET_COMPRESSION_METHODS} from '../compression';
 import {
   FieldDefinition,
-
+  ParquetRowGroup,
   ParquetCompression,
   ParquetField,
-
+  ParquetRow,
   RepetitionType,
   SchemaDefinition
 } from './declare';
-import {
+import {materializeRows, shredBuffer, shredRecord} from './shred';
 import {PARQUET_LOGICAL_TYPES} from './types';
 
 /**
@@ -70,12 +70,12 @@ export class ParquetSchema {
     return branch;
   }
 
-  shredRecord(
-  shredRecord(this,
+  shredRecord(row: ParquetRow, rowGroup: ParquetRowGroup): void {
+    shredRecord(this, row, rowGroup);
   }
 
-
-  return
+  materializeRows(rowGroup: ParquetRowGroup): ParquetRow[] {
+    return materializeRows(this, rowGroup);
   }
 
   compress(type: ParquetCompression): this {
@@ -84,7 +84,7 @@ export class ParquetSchema {
     return this;
   }
 
-
+  rowGroup(): ParquetRowGroup {
     return shredBuffer(this);
   }
 }
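
Note: with these renames the ParquetSchema convenience methods shred ParquetRow objects into a ParquetRowGroup and materialize them back. A rough round-trip sketch based on the signatures in this diff; the import path is a placeholder and the schema-definition syntax follows the forked parquets API, so treat it as an assumption rather than the package's documented usage:

// Placeholder import path; ParquetSchema may not be part of the public API
import {ParquetSchema} from '@loaders.gl/parquet';

// Field types here are assumptions for illustration
const schema = new ParquetSchema({
  name: {type: 'UTF8'},
  count: {type: 'INT32', optional: true}
});

// Shred plain row objects into a row group (per-column chunks)...
const rowGroup = schema.rowGroup();
schema.shredRecord({name: 'a', count: 1}, rowGroup);
schema.shredRecord({name: 'b', count: 2}, rowGroup);

// ...then materialize the row group back into rows
const rows = schema.materializeRows(rowGroup);
// rows should again look roughly like [{name: 'a', count: 1}, {name: 'b', count: 2}]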