@loaders.gl/parquet 3.4.6 → 4.0.0-alpha.10

This diff compares the publicly released contents of the two package versions as they appear in their public registry. It is provided for informational purposes only.
Files changed (130)
  1. package/dist/dist.min.js +27 -34
  2. package/dist/dist.min.js.map +3 -3
  3. package/dist/es5/index.js +6 -6
  4. package/dist/es5/index.js.map +1 -1
  5. package/dist/es5/lib/arrow/convert-row-group-to-columns.js.map +1 -1
  6. package/dist/es5/lib/arrow/convert-schema-from-parquet.js +58 -42
  7. package/dist/es5/lib/arrow/convert-schema-from-parquet.js.map +1 -1
  8. package/dist/es5/lib/arrow/convert-schema-to-parquet.js +33 -31
  9. package/dist/es5/lib/arrow/convert-schema-to-parquet.js.map +1 -1
  10. package/dist/es5/lib/geo/decode-geo-metadata.js +12 -8
  11. package/dist/es5/lib/geo/decode-geo-metadata.js.map +1 -1
  12. package/dist/es5/lib/parsers/parse-parquet-to-columns.js +11 -7
  13. package/dist/es5/lib/parsers/parse-parquet-to-columns.js.map +1 -1
  14. package/dist/es5/lib/parsers/parse-parquet-to-rows.js +51 -29
  15. package/dist/es5/lib/parsers/parse-parquet-to-rows.js.map +1 -1
  16. package/dist/es5/lib/wasm/parse-parquet-wasm.js +6 -6
  17. package/dist/es5/lib/wasm/parse-parquet-wasm.js.map +1 -1
  18. package/dist/es5/parquet-loader.js +16 -4
  19. package/dist/es5/parquet-loader.js.map +1 -1
  20. package/dist/es5/parquet-wasm-loader.js +1 -1
  21. package/dist/es5/parquet-wasm-loader.js.map +1 -1
  22. package/dist/es5/parquet-wasm-writer.js +1 -1
  23. package/dist/es5/parquet-wasm-writer.js.map +1 -1
  24. package/dist/es5/parquet-writer.js +1 -1
  25. package/dist/es5/parquet-writer.js.map +1 -1
  26. package/dist/es5/parquetjs/encoder/parquet-encoder.js.map +1 -1
  27. package/dist/es5/parquetjs/parser/decoders.js.map +1 -1
  28. package/dist/es5/parquetjs/parser/parquet-reader.js +1 -1
  29. package/dist/es5/parquetjs/parser/parquet-reader.js.map +1 -1
  30. package/dist/es5/parquetjs/schema/declare.js +4 -4
  31. package/dist/es5/parquetjs/schema/declare.js.map +1 -1
  32. package/dist/es5/parquetjs/schema/schema.js +7 -7
  33. package/dist/es5/parquetjs/schema/schema.js.map +1 -1
  34. package/dist/es5/parquetjs/schema/shred.js +117 -22
  35. package/dist/es5/parquetjs/schema/shred.js.map +1 -1
  36. package/dist/esm/index.js +5 -5
  37. package/dist/esm/index.js.map +1 -1
  38. package/dist/esm/lib/arrow/convert-row-group-to-columns.js.map +1 -1
  39. package/dist/esm/lib/arrow/convert-schema-from-parquet.js +57 -41
  40. package/dist/esm/lib/arrow/convert-schema-from-parquet.js.map +1 -1
  41. package/dist/esm/lib/arrow/convert-schema-to-parquet.js +33 -31
  42. package/dist/esm/lib/arrow/convert-schema-to-parquet.js.map +1 -1
  43. package/dist/esm/lib/geo/decode-geo-metadata.js +12 -8
  44. package/dist/esm/lib/geo/decode-geo-metadata.js.map +1 -1
  45. package/dist/esm/lib/parsers/parse-parquet-to-columns.js +12 -8
  46. package/dist/esm/lib/parsers/parse-parquet-to-columns.js.map +1 -1
  47. package/dist/esm/lib/parsers/parse-parquet-to-rows.js +14 -3
  48. package/dist/esm/lib/parsers/parse-parquet-to-rows.js.map +1 -1
  49. package/dist/esm/lib/wasm/parse-parquet-wasm.js +3 -3
  50. package/dist/esm/lib/wasm/parse-parquet-wasm.js.map +1 -1
  51. package/dist/esm/parquet-loader.js +14 -2
  52. package/dist/esm/parquet-loader.js.map +1 -1
  53. package/dist/esm/parquet-wasm-loader.js +1 -1
  54. package/dist/esm/parquet-wasm-loader.js.map +1 -1
  55. package/dist/esm/parquet-wasm-writer.js +1 -1
  56. package/dist/esm/parquet-wasm-writer.js.map +1 -1
  57. package/dist/esm/parquet-writer.js +1 -1
  58. package/dist/esm/parquet-writer.js.map +1 -1
  59. package/dist/esm/parquetjs/encoder/parquet-encoder.js.map +1 -1
  60. package/dist/esm/parquetjs/parser/decoders.js.map +1 -1
  61. package/dist/esm/parquetjs/parser/parquet-reader.js +2 -2
  62. package/dist/esm/parquetjs/parser/parquet-reader.js.map +1 -1
  63. package/dist/esm/parquetjs/schema/declare.js +1 -1
  64. package/dist/esm/parquetjs/schema/declare.js.map +1 -1
  65. package/dist/esm/parquetjs/schema/schema.js +6 -6
  66. package/dist/esm/parquetjs/schema/schema.js.map +1 -1
  67. package/dist/esm/parquetjs/schema/shred.js +108 -21
  68. package/dist/esm/parquetjs/schema/shred.js.map +1 -1
  69. package/dist/index.d.ts +8 -49
  70. package/dist/index.d.ts.map +1 -1
  71. package/dist/index.js +8 -6
  72. package/dist/lib/arrow/convert-row-group-to-columns.d.ts +2 -2
  73. package/dist/lib/arrow/convert-row-group-to-columns.d.ts.map +1 -1
  74. package/dist/lib/arrow/convert-schema-from-parquet.d.ts +4 -4
  75. package/dist/lib/arrow/convert-schema-from-parquet.d.ts.map +1 -1
  76. package/dist/lib/arrow/convert-schema-from-parquet.js +48 -44
  77. package/dist/lib/arrow/convert-schema-to-parquet.d.ts +1 -1
  78. package/dist/lib/arrow/convert-schema-to-parquet.d.ts.map +1 -1
  79. package/dist/lib/arrow/convert-schema-to-parquet.js +30 -31
  80. package/dist/lib/geo/decode-geo-metadata.js +12 -8
  81. package/dist/lib/parsers/parse-parquet-to-columns.d.ts +2 -2
  82. package/dist/lib/parsers/parse-parquet-to-columns.d.ts.map +1 -1
  83. package/dist/lib/parsers/parse-parquet-to-columns.js +13 -7
  84. package/dist/lib/parsers/parse-parquet-to-rows.d.ts +3 -2
  85. package/dist/lib/parsers/parse-parquet-to-rows.d.ts.map +1 -1
  86. package/dist/lib/parsers/parse-parquet-to-rows.js +16 -19
  87. package/dist/lib/wasm/parse-parquet-wasm.d.ts +3 -3
  88. package/dist/lib/wasm/parse-parquet-wasm.d.ts.map +1 -1
  89. package/dist/lib/wasm/parse-parquet-wasm.js +3 -3
  90. package/dist/parquet-loader.d.ts +3 -14
  91. package/dist/parquet-loader.d.ts.map +1 -1
  92. package/dist/parquet-loader.js +14 -2
  93. package/dist/parquet-worker.js +31 -38
  94. package/dist/parquet-worker.js.map +3 -3
  95. package/dist/parquet-writer.d.ts +2 -1
  96. package/dist/parquet-writer.d.ts.map +1 -1
  97. package/dist/parquet-writer.js +1 -0
  98. package/dist/parquetjs/encoder/parquet-encoder.d.ts +4 -4
  99. package/dist/parquetjs/encoder/parquet-encoder.d.ts.map +1 -1
  100. package/dist/parquetjs/parser/decoders.d.ts +2 -2
  101. package/dist/parquetjs/parser/decoders.d.ts.map +1 -1
  102. package/dist/parquetjs/parser/parquet-reader.d.ts +6 -6
  103. package/dist/parquetjs/parser/parquet-reader.d.ts.map +1 -1
  104. package/dist/parquetjs/parser/parquet-reader.js +1 -1
  105. package/dist/parquetjs/schema/declare.d.ts +6 -5
  106. package/dist/parquetjs/schema/declare.d.ts.map +1 -1
  107. package/dist/parquetjs/schema/declare.js +3 -3
  108. package/dist/parquetjs/schema/schema.d.ts +4 -4
  109. package/dist/parquetjs/schema/schema.d.ts.map +1 -1
  110. package/dist/parquetjs/schema/schema.js +5 -5
  111. package/dist/parquetjs/schema/shred.d.ts +17 -111
  112. package/dist/parquetjs/schema/shred.d.ts.map +1 -1
  113. package/dist/parquetjs/schema/shred.js +127 -119
  114. package/package.json +8 -8
  115. package/src/index.ts +32 -9
  116. package/src/lib/arrow/convert-row-group-to-columns.ts +2 -2
  117. package/src/lib/arrow/convert-schema-from-parquet.ts +56 -66
  118. package/src/lib/arrow/convert-schema-to-parquet.ts +32 -44
  119. package/src/lib/geo/decode-geo-metadata.ts +17 -8
  120. package/src/lib/parsers/parse-parquet-to-columns.ts +22 -11
  121. package/src/lib/parsers/parse-parquet-to-rows.ts +28 -23
  122. package/src/lib/wasm/parse-parquet-wasm.ts +7 -7
  123. package/src/parquet-loader.ts +25 -2
  124. package/src/parquet-writer.ts +4 -1
  125. package/src/parquetjs/encoder/parquet-encoder.ts +11 -10
  126. package/src/parquetjs/parser/decoders.ts +3 -3
  127. package/src/parquetjs/parser/parquet-reader.ts +7 -7
  128. package/src/parquetjs/schema/declare.ts +6 -5
  129. package/src/parquetjs/schema/schema.ts +8 -8
  130. package/src/parquetjs/schema/shred.ts +142 -103
--- package/src/lib/arrow/convert-schema-to-parquet.ts
+++ package/src/lib/arrow/convert-schema-to-parquet.ts
@@ -10,59 +10,47 @@ import {
   Schema,
   // Struct,
   // Field,
-  DataType,
-  Bool,
-  Float64,
-  Int32,
-  Float32,
-  Binary,
-  Utf8,
-  Int64,
-  Uint16,
-  Uint32,
-  Uint64,
-  Int8,
-  Int16
+  DataType
 } from '@loaders.gl/schema';
 
-export const PARQUET_TYPE_MAPPING: {[type in ParquetType]: typeof DataType} = {
-  BOOLEAN: Bool,
-  INT32: Int32,
-  INT64: Float64,
-  INT96: Float64,
-  FLOAT: Float32,
-  DOUBLE: Float64,
-  BYTE_ARRAY: Binary,
-  FIXED_LEN_BYTE_ARRAY: Binary,
-  UTF8: Utf8,
-  DATE: Int32,
-  TIME_MILLIS: Int64,
-  TIME_MICROS: Int64,
-  TIMESTAMP_MILLIS: Int64,
-  TIMESTAMP_MICROS: Int64,
-  UINT_8: Int32,
-  UINT_16: Uint16,
-  UINT_32: Uint32,
-  UINT_64: Uint64,
-  INT_8: Int8,
-  INT_16: Int16,
-  INT_32: Int32,
-  INT_64: Int64,
-  JSON: Binary,
-  BSON: Binary,
+export const PARQUET_TYPE_MAPPING: {[type in ParquetType]: DataType} = {
+  BOOLEAN: 'bool',
+  INT32: 'int32',
+  INT64: 'float64',
+  INT96: 'float64',
+  FLOAT: 'float32',
+  DOUBLE: 'float64',
+  BYTE_ARRAY: 'binary',
+  FIXED_LEN_BYTE_ARRAY: 'binary',
+  UTF8: 'utf8',
+  DATE: 'int32',
+  TIME_MILLIS: 'int64',
+  TIME_MICROS: 'int64',
+  TIMESTAMP_MILLIS: 'int64',
+  TIMESTAMP_MICROS: 'int64',
+  UINT_8: 'int32',
+  UINT_16: 'uint16',
+  UINT_32: 'uint32',
+  UINT_64: 'uint64',
+  INT_8: 'int8',
+  INT_16: 'int16',
+  INT_32: 'int32',
+  INT_64: 'int64',
+  JSON: 'binary',
+  BSON: 'binary',
   // TODO check interval type
-  INTERVAL: Binary,
-  DECIMAL_INT32: Float32,
-  DECIMAL_INT64: Float64,
-  DECIMAL_BYTE_ARRAY: Float64,
-  DECIMAL_FIXED_LEN_BYTE_ARRAY: Float64
+  INTERVAL: 'binary',
+  DECIMAL_INT32: 'float32',
+  DECIMAL_INT64: 'float64',
+  DECIMAL_BYTE_ARRAY: 'float64',
+  DECIMAL_FIXED_LEN_BYTE_ARRAY: 'float64'
 };
 
 export function convertToParquetSchema(schema: Schema): Schema {
   const fields = []; // getFields(schema.fields);
 
   // TODO add metadata if needed.
-  return new Schema(fields);
+  return {fields, metadata: {}};
 }
 
 // function getFields(schema: Field[]): Definition[] {
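
Note: in 4.0 the mapping above resolves each Parquet type to a plain string type name from `@loaders.gl/schema` rather than to an Arrow `DataType` class constructor. A minimal lookup sketch; the import path for `PARQUET_TYPE_MAPPING` is an assumption (it is defined in the module shown above but may not be part of the public export surface):

import type {DataType} from '@loaders.gl/schema';
// Assumed re-export path; PARQUET_TYPE_MAPPING is defined in convert-schema-to-parquet.ts above
import {PARQUET_TYPE_MAPPING} from '@loaders.gl/parquet';

// 3.x: PARQUET_TYPE_MAPPING.DATE was the Int32 class; 4.0: it is the string 'int32'
const dateType: DataType = PARQUET_TYPE_MAPPING.DATE; // 'int32'
const textType: DataType = PARQUET_TYPE_MAPPING.UTF8; // 'utf8'
const decimalType: DataType = PARQUET_TYPE_MAPPING.DECIMAL_INT64; // 'float64'
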
--- package/src/lib/geo/decode-geo-metadata.ts
+++ package/src/lib/geo/decode-geo-metadata.ts
@@ -26,7 +26,7 @@ export type GeoColumnMetadata = {
  * Reads the GeoMetadata object from the metadata
  * @note geoarrow / parquet schema is stringified into a single key-value pair in the parquet metadata */
 export function getGeoMetadata(schema: Schema): GeoMetadata | null {
-  const stringifiedGeoMetadata = schema.metadata.get('geo');
+  const stringifiedGeoMetadata = schema.metadata.geo;
   if (!stringifiedGeoMetadata) {
     return null;
   }
@@ -45,7 +45,7 @@ export function getGeoMetadata(schema: Schema): GeoMetadata | null {
  */
 export function setGeoMetadata(schema: Schema, geoMetadata: GeoMetadata): void {
   const stringifiedGeoMetadata = JSON.stringify(geoMetadata);
-  schema.metadata.set('geo', stringifiedGeoMetadata);
+  schema.metadata.geo = stringifiedGeoMetadata;
 }
 
 /**
@@ -62,21 +62,21 @@ export function unpackGeoMetadata(schema: Schema): void {
 
   const {version, primary_column, columns} = geoMetadata;
   if (version) {
-    schema.metadata.set('geo.version', version);
+    schema.metadata['geo.version'] = version;
   }
 
   if (primary_column) {
-    schema.metadata.set('geo.primary_column', primary_column);
+    schema.metadata['geo.primary_column'] = primary_column;
   }
 
   // store column names as comma separated list
-  schema.metadata.set('geo.columns', Object.keys(columns || {}).join(''));
+  schema.metadata['geo.columns'] = Object.keys(columns || {}).join('');
 
   for (const [columnName, columnMetadata] of Object.entries(columns || {})) {
     const field = schema.fields.find((field) => field.name === columnName);
     if (field) {
       if (field.name === primary_column) {
-        field.metadata.set('geo.primary_field', 'true');
+        setFieldMetadata(field, 'geo.primary_field', 'true');
       }
       unpackGeoFieldMetadata(field, columnMetadata);
     }
@@ -87,13 +87,22 @@ function unpackGeoFieldMetadata(field: Field, columnMetadata): void {
   for (const [key, value] of Object.entries(columnMetadata || {})) {
     switch (key) {
       case 'geometry_type':
-        field.metadata.set(`geo.${key}`, (value as string[]).join(','));
+        setFieldMetadata(field, `geo.${key}`, (value as string[]).join(','));
         break;
       case 'bbox':
       case 'crs':
       case 'edges':
       default:
-        field.metadata.set(`geo.${key}`, typeof value === 'string' ? value : JSON.stringify(value));
+        setFieldMetadata(
+          field,
+          `geo.${key}`,
+          typeof value === 'string' ? value : JSON.stringify(value)
+        );
     }
   }
 }
+
+function setFieldMetadata(field: Field, key: string, value: string): void {
+  field.metadata = field.metadata || {};
+  field.metadata[key] = value;
+}
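
Note: schema and field metadata are plain string-keyed objects in 4.0 (previously Map-style containers), so the unpacked geo keys are read with ordinary property access. A hypothetical consumer sketch:

import type {Schema} from '@loaders.gl/schema';

// After unpackGeoMetadata(schema) has run:
function getPrimaryGeometryColumn(schema: Schema): string | undefined {
  // 3.x: schema.metadata.get('geo.primary_column')
  // 4.0: plain object access
  return schema.metadata['geo.primary_column'];
}
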
--- package/src/lib/parsers/parse-parquet-to-columns.ts
+++ package/src/lib/parsers/parse-parquet-to-columns.ts
@@ -1,24 +1,30 @@
 // loaders.gl, MIT license
 
 // import type {LoaderWithParser, Loader, LoaderOptions} from '@loaders.gl/loader-utils';
-import {ColumnarTableBatch, Schema} from '@loaders.gl/schema';
+import {ColumnarTable, ColumnarTableBatch, Schema} from '@loaders.gl/schema';
 import {makeReadableFile} from '@loaders.gl/loader-utils';
 import type {ParquetLoaderOptions} from '../../parquet-loader';
 import {ParquetReader} from '../../parquetjs/parser/parquet-reader';
-import {ParquetBuffer} from '../../parquetjs/schema/declare';
-import {convertSchemaFromParquet} from '../arrow/convert-schema-from-parquet';
-import {convertParquetRowGroupToColumns} from '../arrow/convert-row-group-to-columns';
+import {ParquetRowGroup} from '../../parquetjs/schema/declare';
+import {ParquetSchema} from '../../parquetjs/schema/schema';
+import {convertParquetSchema} from '../arrow/convert-schema-from-parquet';
+import {materializeColumns} from '../../parquetjs/schema/shred';
+// import {convertParquetRowGroupToColumns} from '../arrow/convert-row-group-to-columns';
 import {unpackGeoMetadata} from '../geo/decode-geo-metadata';
 
 export async function parseParquetInColumns(
   arrayBuffer: ArrayBuffer,
   options?: ParquetLoaderOptions
-) {
+): Promise<ColumnarTable> {
   const blob = new Blob([arrayBuffer]);
   for await (const batch of parseParquetFileInColumnarBatches(blob, options)) {
-    return batch;
+    return {
+      shape: 'columnar-table',
+      schema: batch.schema,
+      data: batch.data
+    };
   }
-  return null;
+  throw new Error('empty table');
 }
 
 export async function* parseParquetFileInColumnarBatches(
@@ -29,16 +35,21 @@ export async function* parseParquetFileInColumnarBatches(
   const reader = new ParquetReader(file);
   const parquetSchema = await reader.getSchema();
   const parquetMetadata = await reader.getFileMetadata();
-  const schema = convertSchemaFromParquet(parquetSchema, parquetMetadata);
+  const schema = convertParquetSchema(parquetSchema, parquetMetadata);
   unpackGeoMetadata(schema);
   const rowGroups = reader.rowGroupIterator(options?.parquet);
   for await (const rowGroup of rowGroups) {
-    yield convertRowGroupToTableBatch(schema, rowGroup);
+    yield convertRowGroupToTableBatch(parquetSchema, rowGroup, schema);
   }
 }
 
-function convertRowGroupToTableBatch(schema: Schema, rowGroup: ParquetBuffer): ColumnarTableBatch {
-  const data = convertParquetRowGroupToColumns(schema, rowGroup);
+function convertRowGroupToTableBatch(
+  parquetSchema: ParquetSchema,
+  rowGroup: ParquetRowGroup,
+  schema: Schema
+): ColumnarTableBatch {
+  // const data = convertParquetRowGroupToColumns(schema, rowGroup);
+  const data = materializeColumns(parquetSchema, rowGroup);
   return {
     shape: 'columnar-table',
     batchType: 'data',
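
Note: `parseParquetInColumns` now returns a typed `ColumnarTable` and throws on an empty file instead of returning `null`. A minimal usage sketch; the public import path for this parser is an assumption:

import type {ColumnarTable} from '@loaders.gl/schema';
// Assumed export path for the parser shown above
import {parseParquetInColumns} from '@loaders.gl/parquet';

async function loadColumns(arrayBuffer: ArrayBuffer): Promise<ColumnarTable> {
  const table = await parseParquetInColumns(arrayBuffer);
  console.log(table.shape); // 'columnar-table'
  return table;
}
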
--- package/src/lib/parsers/parse-parquet-to-rows.ts
+++ package/src/lib/parsers/parse-parquet-to-rows.ts
@@ -1,40 +1,45 @@
 // import type {LoaderWithParser, Loader, LoaderOptions} from '@loaders.gl/loader-utils';
 // import {ColumnarTableBatch} from '@loaders.gl/schema';
 import {makeReadableFile} from '@loaders.gl/loader-utils';
+import {ObjectRowTable, ObjectRowTableBatch} from '@loaders.gl/schema';
 import type {ParquetLoaderOptions} from '../../parquet-loader';
+import type {ParquetRow} from '../../parquetjs/schema/declare';
 import {ParquetReader} from '../../parquetjs/parser/parquet-reader';
 
-export async function parseParquet(arrayBuffer: ArrayBuffer, options?: ParquetLoaderOptions) {
+export async function parseParquet(
+  arrayBuffer: ArrayBuffer,
+  options?: ParquetLoaderOptions
+): Promise<ObjectRowTable> {
   const blob = new Blob([arrayBuffer]);
+
+  const rows: ParquetRow[] = [];
   for await (const batch of parseParquetFileInBatches(blob, options)) {
-    return batch;
+    // we have only one input batch so return
+    for (const row of batch.data) {
+      rows.push(row);
+    }
   }
-  return null;
+
+  return {
+    shape: 'object-row-table',
+    // TODO - spread can fail for very large number of batches
+    data: rows
+  };
 }
 
-export async function* parseParquetFileInBatches(blob: Blob, options?: ParquetLoaderOptions) {
+export async function* parseParquetFileInBatches(
+  blob: Blob,
+  options?: ParquetLoaderOptions
+): AsyncIterable<ObjectRowTableBatch> {
   const file = makeReadableFile(blob);
   const reader = new ParquetReader(file);
   const rowBatches = reader.rowBatchIterator(options?.parquet);
   for await (const rows of rowBatches) {
-    yield rows;
+    yield {
+      shape: 'object-row-table',
+      data: rows,
+      batchType: 'data',
+      length: rows.length
+    };
   }
 }
-
-// export async function* parseParquetFileInColumnarBatches(blob: Blob, options?: {columnList?: string[][]}): AsyncIterable<ColumnarTableBatch> {
-//   const rowGroupReader = new ParquetRowGroupReader({data: blob, columnList: options?.columnList});
-//   try {
-//     for await (const rowGroup of rowGroupReader) {
-//       yield convertRowGroupToTableBatch(rowGroup);
-//     }
-//   } finally {
-//     await rowGroupReader.close();
-//   }
-// }
-
-// function convertRowGroupToTableBatch(rowGroup): ColumnarTableBatch {
-//   // @ts-expect-error
-//   return {
-//     data: rowGroup
-//   };
-// }
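
Note: `parseParquet` now collects every row batch into a single typed `ObjectRowTable`, and `parseParquetFileInBatches` yields typed `ObjectRowTableBatch`es instead of raw row arrays. A usage sketch; the import path is an assumption:

import type {ObjectRowTable} from '@loaders.gl/schema';
// Assumed export path for the parsers shown above
import {parseParquet, parseParquetFileInBatches} from '@loaders.gl/parquet';

async function loadRows(arrayBuffer: ArrayBuffer): Promise<ObjectRowTable> {
  const table = await parseParquet(arrayBuffer);
  console.log(table.shape, table.data.length); // 'object-row-table', <row count>
  return table;
}

async function streamRows(blob: Blob): Promise<void> {
  for await (const batch of parseParquetFileInBatches(blob)) {
    console.log(batch.batchType, batch.length); // 'data', rows in this batch
  }
}
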
--- package/src/lib/wasm/parse-parquet-wasm.ts
+++ package/src/lib/wasm/parse-parquet-wasm.ts
@@ -1,20 +1,20 @@
 // eslint-disable
 import type {RecordBatch} from 'apache-arrow';
 import type {LoaderOptions} from '@loaders.gl/loader-utils';
-import {Table, RecordBatchStreamReader} from 'apache-arrow';
+import {Table as ArrowTable, RecordBatchStreamReader} from 'apache-arrow';
 import {loadWasm} from './load-wasm/load-wasm-node';
 
-export type ParquetLoaderOptions = LoaderOptions & {
+export type ParquetWasmLoaderOptions = LoaderOptions & {
   parquet?: {
     type?: 'arrow-table';
     wasmUrl?: string;
   };
 };
 
-export async function parseParquet(
+export async function parseParquetWasm(
   arrayBuffer: ArrayBuffer,
-  options?: ParquetLoaderOptions
-): Promise<Table> {
+  options?: ParquetWasmLoaderOptions
+): Promise<ArrowTable> {
   const wasmUrl = options?.parquet?.wasmUrl;
   const wasm = await loadWasm(wasmUrl);
 
@@ -32,11 +32,11 @@ export async function parseParquet(
  * Deserialize the IPC format into a {@link Table}. This function is a
  * convenience wrapper for {@link RecordBatchReader}. Opposite of {@link tableToIPC}.
  */
-function tableFromIPC(input: ArrayBuffer): Table {
+function tableFromIPC(input: ArrayBuffer): ArrowTable {
   const reader = RecordBatchStreamReader.from(input);
   const recordBatches: RecordBatch[] = [];
   for (const recordBatch of reader) {
     recordBatches.push(recordBatch);
   }
-  return new Table(recordBatches);
+  return new ArrowTable(recordBatches);
 }
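
Note: the WASM parser is renamed `parseParquetWasm` and its options type is now `ParquetWasmLoaderOptions`; it still resolves to an Arrow `Table`. A hedged sketch; the import path and wasmUrl value are placeholders:

import type {Table as ArrowTable} from 'apache-arrow';
// Assumed export path for the WASM parser shown above
import {parseParquetWasm} from '@loaders.gl/parquet';

async function loadWithWasm(arrayBuffer: ArrayBuffer): Promise<ArrowTable> {
  return await parseParquetWasm(arrayBuffer, {
    parquet: {type: 'arrow-table', wasmUrl: 'https://example.com/parquet_wasm_bg.wasm'} // placeholder URL
  });
}
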
--- package/src/parquet-loader.ts
+++ package/src/parquet-loader.ts
@@ -1,4 +1,11 @@
+// loaders.gl, MIT license
 import type {Loader, LoaderOptions} from '@loaders.gl/loader-utils';
+import type {
+  ObjectRowTable,
+  ObjectRowTableBatch,
+  ColumnarTable,
+  ColumnarTableBatch
+} from '@loaders.gl/schema';
 
 // __VERSION__ is injected by babel-plugin-version-inline
 // @ts-ignore TS2304: Cannot find name '__VERSION__'.
@@ -23,7 +30,7 @@ const DEFAULT_PARQUET_LOADER_OPTIONS: ParquetLoaderOptions = {
 };
 
 /** ParquetJS table loader */
-export const ParquetLoader = {
+export const ParquetLoader: Loader<ObjectRowTable, ObjectRowTableBatch, ParquetLoaderOptions> = {
   name: 'Apache Parquet',
   id: 'parquet',
   module: 'parquet',
@@ -37,4 +44,20 @@ export const ParquetLoader = {
   options: DEFAULT_PARQUET_LOADER_OPTIONS
 };
 
-export const _typecheckParquetLoader: Loader = ParquetLoader;
+export const ParqueColumnnartLoader: Loader<
+  ColumnarTable,
+  ColumnarTableBatch,
+  ParquetLoaderOptions
+> = {
+  name: 'Apache Parquet',
+  id: 'parquet',
+  module: 'parquet',
+  version: VERSION,
+  worker: true,
+  category: 'table',
+  extensions: ['parquet'],
+  mimeTypes: ['application/octet-stream'],
+  binary: true,
+  tests: ['PAR1', 'PARE'],
+  options: DEFAULT_PARQUET_LOADER_OPTIONS
+};
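
Note: `ParquetLoader` is now parameterized as `Loader<ObjectRowTable, ObjectRowTableBatch, ParquetLoaderOptions>` (the separate `_typecheckParquetLoader` export is gone), so results can be typed without casts. A usage sketch with `@loaders.gl/core`, assuming the loader exported from the package index carries a parser:

import {parse} from '@loaders.gl/core';
import {ParquetLoader} from '@loaders.gl/parquet';

async function loadTable(url: string) {
  // parse() accepts a fetch Response promise plus a typed loader
  const table = await parse(fetch(url), ParquetLoader);
  // table.shape === 'object-row-table'; table.data is an array of row objects
  return table;
}
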
--- package/src/parquet-writer.ts
+++ package/src/parquet-writer.ts
@@ -1,4 +1,7 @@
+// loaders.gl, MIT license
+
 import type {Writer} from '@loaders.gl/loader-utils';
+import {Table, TableBatch} from '@loaders.gl/schema';
 
 // __VERSION__ is injected by babel-plugin-version-inline
 // @ts-ignore TS2304: Cannot find name '__VERSION__'.
@@ -8,7 +11,7 @@ export type ParquetWriterOptions = {};
 
 const DEFAULT_PARQUET_LOADER_OPTIONS = {};
 
-export const ParquetWriter: Writer = {
+export const ParquetWriter: Writer<Table, TableBatch, ParquetWriterOptions> = {
   name: 'Apache Parquet',
   id: 'parquet',
   module: 'parquet',
--- package/src/parquetjs/encoder/parquet-encoder.ts
+++ package/src/parquetjs/encoder/parquet-encoder.ts
@@ -4,11 +4,12 @@ import {stream} from '@loaders.gl/loader-utils';
 import {ParquetCodecOptions, PARQUET_CODECS} from '../codecs';
 import * as Compression from '../compression';
 import {
-  ParquetBuffer,
+  ParquetRowGroup,
   ParquetCodec,
-  ParquetData,
+  ParquetColumnChunk,
   ParquetField,
-  PrimitiveType
+  PrimitiveType,
+  ParquetRow
 } from '../schema/declare';
 import {ParquetSchema} from '../schema/schema';
 import * as Shred from '../schema/shred';
@@ -105,7 +106,7 @@ export class ParquetEncoder<T> {
 
   public schema: ParquetSchema;
   public envelopeWriter: ParquetEnvelopeWriter;
-  public rowBuffer: ParquetBuffer;
+  public rowBuffer: ParquetRowGroup;
   public rowGroupSize: number;
   public closed: boolean;
   public userMetadata: Record<string, string>;
@@ -144,7 +145,7 @@ export class ParquetEncoder<T> {
   * Append a single row to the parquet file. Rows are buffered in memory until
   * rowGroupSize rows are in the buffer or close() is called
   */
-  async appendRow<T>(row: T): Promise<void> {
+  async appendRow<T extends ParquetRow>(row: T): Promise<void> {
    if (this.closed) {
      throw new Error('writer was closed');
    }
@@ -271,7 +272,7 @@ export class ParquetEnvelopeWriter {
   * Encode a parquet row group. The records object should be created using the
   * shredRecord method
   */
-  async writeRowGroup(records: ParquetBuffer): Promise<void> {
+  async writeRowGroup(records: ParquetRowGroup): Promise<void> {
    const rgroup = await encodeRowGroup(this.schema, records, {
      baseOffset: this.offset,
      pageSize: this.pageSize,
@@ -363,7 +364,7 @@ function encodeValues(
 */
 async function encodeDataPage(
   column: ParquetField,
-  data: ParquetData
+  data: ParquetColumnChunk
 ): Promise<{
   header: PageHeader;
   headerSize: number;
@@ -422,7 +423,7 @@ async function encodeDataPage(
 */
 async function encodeDataPageV2(
   column: ParquetField,
-  data: ParquetData,
+  data: ParquetColumnChunk,
   rowCount: number
 ): Promise<{
   header: PageHeader;
@@ -482,7 +483,7 @@ async function encodeDataPageV2(
 */
 async function encodeColumnChunk(
   column: ParquetField,
-  buffer: ParquetBuffer,
+  buffer: ParquetRowGroup,
   offset: number,
   opts: ParquetEncoderOptions
 ): Promise<{
@@ -539,7 +540,7 @@ async function encodeColumnChunk(
 */
 async function encodeRowGroup(
   schema: ParquetSchema,
-  data: ParquetBuffer,
+  data: ParquetRowGroup,
   opts: ParquetEncoderOptions
 ): Promise<{
   body: Buffer;
--- package/src/parquetjs/parser/decoders.ts
+++ package/src/parquetjs/parser/decoders.ts
@@ -1,7 +1,7 @@
 // Forked from https://github.com/kbajalc/parquets under MIT license (Copyright (c) 2017 ironSource Ltd.)
 import {
   ParquetCodec,
-  ParquetData,
+  ParquetColumnChunk,
   ParquetOptions,
   ParquetPageData,
   ParquetType,
@@ -32,14 +32,14 @@ import {decodePageHeader, getThriftEnum, getBitWidth} from '../utils/read-utils'
 export async function decodeDataPages(
   buffer: Buffer,
   options: ParquetOptions
-): Promise<ParquetData> {
+): Promise<ParquetColumnChunk> {
   const cursor: CursorBuffer = {
     buffer,
     offset: 0,
     size: buffer.length
   };
 
-  const data: ParquetData = {
+  const data: ParquetColumnChunk = {
     rlevels: [],
     dlevels: [],
     values: [],
--- package/src/parquetjs/parser/parquet-reader.ts
+++ package/src/parquetjs/parser/parquet-reader.ts
@@ -3,14 +3,14 @@ import type {ReadableFile} from '@loaders.gl/loader-utils';
 
 import {ParquetSchema} from '../schema/schema';
 import {decodeSchema} from './decoders';
-import {materializeRecords} from '../schema/shred';
+import {materializeRows} from '../schema/shred';
 
 import {PARQUET_MAGIC, PARQUET_MAGIC_ENCRYPTED} from '../../constants';
 import {ColumnChunk, CompressionCodec, FileMetaData, RowGroup, Type} from '../parquet-thrift';
 import {
-  ParquetBuffer,
+  ParquetRowGroup,
   ParquetCompression,
-  ParquetData,
+  ParquetColumnChunk,
   PrimitiveType,
   ParquetOptions
 } from '../schema/declare';
@@ -68,7 +68,7 @@ export class ParquetReader {
   async *rowBatchIterator(props?: ParquetIterationProps) {
     const schema = await this.getSchema();
     for await (const rowGroup of this.rowGroupIterator(props)) {
-      yield materializeRecords(schema, rowGroup);
+      yield materializeRows(schema, rowGroup);
     }
   }
 
@@ -172,8 +172,8 @@ export class ParquetReader {
     schema: ParquetSchema,
     rowGroup: RowGroup,
     columnList: string[][]
-  ): Promise<ParquetBuffer> {
-    const buffer: ParquetBuffer = {
+  ): Promise<ParquetRowGroup> {
+    const buffer: ParquetRowGroup = {
       rowCount: Number(rowGroup.num_rows),
       columnData: {}
     };
@@ -191,7 +191,7 @@ export class ParquetReader {
   /**
   * Each row group contains column chunks for all the columns.
   */
-  async readColumnChunk(schema: ParquetSchema, colChunk: ColumnChunk): Promise<ParquetData> {
+  async readColumnChunk(schema: ParquetSchema, colChunk: ColumnChunk): Promise<ParquetColumnChunk> {
    if (colChunk.file_path !== undefined && colChunk.file_path !== null) {
      throw new Error('external references are not supported');
    }
--- package/src/parquetjs/schema/declare.ts
+++ package/src/parquetjs/schema/declare.ts
@@ -120,25 +120,26 @@ export interface ParquetPageData {
   pageHeader: PageHeader;
 }
 
-export interface ParquetRecord {
+export interface ParquetRow {
   [key: string]: any;
 }
 
 /** @
  * Holds data for one row group (column chunks) */
-export class ParquetBuffer {
+export class ParquetRowGroup {
   /** Number of rows in this page */
   rowCount: number;
+  /** Map of Column chunks */
+  columnData: Record<string, ParquetColumnChunk>;
 
-  columnData: Record<string, ParquetData>;
-  constructor(rowCount: number = 0, columnData: Record<string, ParquetData> = {}) {
+  constructor(rowCount: number = 0, columnData: Record<string, ParquetColumnChunk> = {}) {
     this.rowCount = rowCount;
     this.columnData = columnData;
   }
 }
 
 /** Holds the data for one column chunk */
-export interface ParquetData {
+export interface ParquetColumnChunk {
   dlevels: number[];
   rlevels: number[];
   values: any[];
--- package/src/parquetjs/schema/schema.ts
+++ package/src/parquetjs/schema/schema.ts
@@ -4,14 +4,14 @@ import {PARQUET_CODECS} from '../codecs';
 import {PARQUET_COMPRESSION_METHODS} from '../compression';
 import {
   FieldDefinition,
-  ParquetBuffer,
+  ParquetRowGroup,
   ParquetCompression,
   ParquetField,
-  ParquetRecord,
+  ParquetRow,
   RepetitionType,
   SchemaDefinition
 } from './declare';
-import {materializeRecords, shredBuffer, shredRecord} from './shred';
+import {materializeRows, shredBuffer, shredRecord} from './shred';
 import {PARQUET_LOGICAL_TYPES} from './types';
 
 /**
@@ -70,12 +70,12 @@ export class ParquetSchema {
     return branch;
   }
 
-  shredRecord(record: ParquetRecord, buffer: ParquetBuffer): void {
-    shredRecord(this, record, buffer);
+  shredRecord(row: ParquetRow, rowGroup: ParquetRowGroup): void {
+    shredRecord(this, row, rowGroup);
   }
 
-  materializeRecords(buffer: ParquetBuffer): ParquetRecord[] {
-    return materializeRecords(this, buffer);
+  materializeRows(rowGroup: ParquetRowGroup): ParquetRow[] {
+    return materializeRows(this, rowGroup);
   }
 
   compress(type: ParquetCompression): this {
@@ -84,7 +84,7 @@ export class ParquetSchema {
     return this;
   }
 
-  buffer(): ParquetBuffer {
+  rowGroup(): ParquetRowGroup {
     return shredBuffer(this);
   }
 }
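
Note: the parquetjs helpers are renamed across the board: `ParquetBuffer` → `ParquetRowGroup`, `ParquetRecord` → `ParquetRow`, `ParquetData` → `ParquetColumnChunk`, `materializeRecords` → `materializeRows`, and `ParquetSchema.buffer()` → `ParquetSchema.rowGroup()`. A hedged round-trip sketch; the internal import path and the schema definition below are illustrative only:

// Internal module path; not necessarily part of the public export surface
import {ParquetSchema} from '@loaders.gl/parquet/parquetjs/schema/schema';

const schema = new ParquetSchema({name: {type: 'UTF8'}, value: {type: 'INT32'}});

// 3.x: schema.buffer() / schema.materializeRecords()
const rowGroup = schema.rowGroup();
schema.shredRecord({name: 'a', value: 1}, rowGroup);
const rows = schema.materializeRows(rowGroup); // expected: [{name: 'a', value: 1}]
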