@engine9-io/input-tools 1.8.1 → 1.8.3
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- package/file/FileUtilities.js +1 -1
- package/file/Parquet.js +45 -36
- package/package.json +1 -1
package/file/FileUtilities.js
CHANGED
|
@@ -7,7 +7,7 @@ const { Readable, Transform, PassThrough, Writable } = require('node:stream');
|
|
|
7
7
|
const { pipeline } = require('node:stream/promises');
|
|
8
8
|
const { stringify } = require('csv');
|
|
9
9
|
|
|
10
|
-
const debug = require('debug')('
|
|
10
|
+
const debug = require('debug')('@engine9-io/file');
|
|
11
11
|
|
|
12
12
|
const { getXlsxStream } = require('xlstream');
|
|
13
13
|
const csv = require('csv');
|
package/file/Parquet.js
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
const parquet = require('@dsnp/parquetjs');
|
|
2
2
|
|
|
3
3
|
const { Readable } = require('node:stream');
|
|
4
|
-
const debug = require('debug')('
|
|
4
|
+
const debug = require('debug')('Parquet');
|
|
5
5
|
const { S3Client } = require('@aws-sdk/client-s3');
|
|
6
6
|
const FileWorker = require('./FileUtilities');
|
|
7
7
|
|
|
@@ -16,7 +16,7 @@ async function getReader(options) {
|
|
|
16
16
|
|
|
17
17
|
return parquet.ParquetReader.openS3(client, {
|
|
18
18
|
Bucket: parts[2],
|
|
19
|
-
Key: parts.slice(3).join('/')
|
|
19
|
+
Key: parts.slice(3).join('/')
|
|
20
20
|
});
|
|
21
21
|
}
|
|
22
22
|
return parquet.ParquetReader.openFile(filename);
|
|
@@ -25,14 +25,14 @@ async function getReader(options) {
|
|
|
25
25
|
Worker.prototype.meta = async function (options) {
|
|
26
26
|
const reader = await getReader(options);
|
|
27
27
|
return {
|
|
28
|
-
records: String(reader.metadata?.num_rows)
|
|
28
|
+
records: String(reader.metadata?.num_rows)
|
|
29
29
|
};
|
|
30
30
|
// getMetadata();
|
|
31
31
|
};
|
|
32
32
|
Worker.prototype.meta.metadata = {
|
|
33
33
|
options: {
|
|
34
|
-
path: {}
|
|
35
|
-
}
|
|
34
|
+
path: {}
|
|
35
|
+
}
|
|
36
36
|
};
|
|
37
37
|
Worker.prototype.schema = async function (options) {
|
|
38
38
|
const reader = await getReader(options);
|
|
@@ -40,8 +40,8 @@ Worker.prototype.schema = async function (options) {
|
|
|
40
40
|
};
|
|
41
41
|
Worker.prototype.schema.metadata = {
|
|
42
42
|
options: {
|
|
43
|
-
path: {}
|
|
44
|
-
}
|
|
43
|
+
path: {}
|
|
44
|
+
}
|
|
45
45
|
};
|
|
46
46
|
|
|
47
47
|
function cleanColumnName(name) {
|
|
@@ -49,8 +49,6 @@ function cleanColumnName(name) {
|
|
|
49
49
|
}
|
|
50
50
|
|
|
51
51
|
Worker.prototype.stream = async function (options) {
|
|
52
|
-
const stream = new Readable({ objectMode: true });
|
|
53
|
-
|
|
54
52
|
const reader = await getReader(options);
|
|
55
53
|
let columns;
|
|
56
54
|
if (options.columns) {
|
|
@@ -60,9 +58,9 @@ Worker.prototype.stream = async function (options) {
|
|
|
60
58
|
if (typeof options.columns === 'string') requestedColumns = options.columns.split(',').map((d) => d.trim());
|
|
61
59
|
else requestedColumns = options.columns.map((d) => (d.name ? d.name.trim() : d.trim()));
|
|
62
60
|
requestedColumns.forEach((c) => {
|
|
63
|
-
const matchingCols = fieldList
|
|
64
|
-
f.name === c || cleanColumnName(f.name) === cleanColumnName(c)
|
|
65
|
-
|
|
61
|
+
const matchingCols = fieldList
|
|
62
|
+
.filter((f) => f.name === c || cleanColumnName(f.name) === cleanColumnName(c))
|
|
63
|
+
.map((f) => f.name);
|
|
66
64
|
columns = columns.concat(matchingCols);
|
|
67
65
|
});
|
|
68
66
|
}
|
|
@@ -72,35 +70,46 @@ Worker.prototype.stream = async function (options) {
|
|
|
72
70
|
debug(`Reading parquet file ${options.filename} with columns ${columns?.join(',')} and limit ${limit}`);
|
|
73
71
|
const cursor = reader.getCursor(columns);
|
|
74
72
|
|
|
75
|
-
// read all records from the file and print them
|
|
76
|
-
let record = null;
|
|
77
73
|
let counter = 0;
|
|
78
74
|
|
|
79
75
|
const start = new Date().getTime();
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
76
|
+
|
|
77
|
+
const stream = new Readable({
|
|
78
|
+
objectMode: true,
|
|
79
|
+
async read() {
|
|
80
|
+
const token = await cursor.next();
|
|
81
|
+
if (token) {
|
|
82
|
+
counter += 1;
|
|
83
|
+
if (limit && counter > limit) {
|
|
84
|
+
debug(`Reached limit of ${limit}, stopping`);
|
|
85
|
+
this.push(null);
|
|
86
|
+
await reader.close();
|
|
87
|
+
return;
|
|
88
|
+
}
|
|
89
|
+
if (counter % 10000 === 0) {
|
|
90
|
+
let m = process.memoryUsage().heapTotal;
|
|
91
|
+
const end = new Date().getTime();
|
|
92
|
+
debug(
|
|
93
|
+
`Read ${counter} ${(counter * 1000) / (end - start)}/sec, Node reported memory usage: ${
|
|
94
|
+
m / 1024 / 1024
|
|
95
|
+
} MBs`
|
|
96
|
+
);
|
|
97
|
+
}
|
|
98
|
+
this.push(token);
|
|
99
|
+
} else {
|
|
100
|
+
await reader.close();
|
|
101
|
+
this.push(null);
|
|
102
|
+
}
|
|
91
103
|
}
|
|
92
|
-
|
|
93
|
-
} while (record);
|
|
94
|
-
stream.push(null);
|
|
95
|
-
await reader.close();
|
|
104
|
+
});
|
|
96
105
|
|
|
97
106
|
return { stream };
|
|
98
107
|
};
|
|
99
108
|
|
|
100
109
|
Worker.prototype.stream.metadata = {
|
|
101
110
|
options: {
|
|
102
|
-
path: {}
|
|
103
|
-
}
|
|
111
|
+
path: {}
|
|
112
|
+
}
|
|
104
113
|
};
|
|
105
114
|
|
|
106
115
|
Worker.prototype.toFile = async function (options) {
|
|
@@ -110,8 +119,8 @@ Worker.prototype.toFile = async function (options) {
|
|
|
110
119
|
};
|
|
111
120
|
Worker.prototype.toFile.metadata = {
|
|
112
121
|
options: {
|
|
113
|
-
path: {}
|
|
114
|
-
}
|
|
122
|
+
path: {}
|
|
123
|
+
}
|
|
115
124
|
};
|
|
116
125
|
|
|
117
126
|
Worker.prototype.stats = async function (options) {
|
|
@@ -125,13 +134,13 @@ Worker.prototype.stats = async function (options) {
|
|
|
125
134
|
return {
|
|
126
135
|
schema,
|
|
127
136
|
fileMetadata,
|
|
128
|
-
rowGroups
|
|
137
|
+
rowGroups
|
|
129
138
|
};
|
|
130
139
|
};
|
|
131
140
|
Worker.prototype.stats.metadata = {
|
|
132
141
|
options: {
|
|
133
|
-
path: {}
|
|
134
|
-
}
|
|
142
|
+
path: {}
|
|
143
|
+
}
|
|
135
144
|
};
|
|
136
145
|
|
|
137
146
|
module.exports = Worker;
|