@engine9-io/input-tools 1.9.11 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/ForEachEntry.js +18 -43
- package/ValidatingReadable.js +3 -6
- package/buildSamplePackets.js +11 -16
- package/eslint.config.mjs +15 -11
- package/file/FileUtilities.js +976 -1048
- package/file/GoogleDrive.js +32 -38
- package/file/Parquet.js +112 -124
- package/file/R2.js +27 -32
- package/file/S3.js +259 -293
- package/file/tools.js +334 -326
- package/index.js +60 -75
- package/package.json +2 -1
- package/test/cli.js +3 -4
- package/test/file.js +6 -7
- package/test/processing/bigDataMessage.js +8 -10
- package/test/processing/forEach.js +6 -8
- package/test/processing/forEachResume.js +6 -8
- package/test/processing/message.js +31 -39
- package/test/processing/zip.js +6 -7
- package/test/uuid.js +6 -11
- package/timelineTypes.js +2 -24
package/file/GoogleDrive.js
CHANGED
@@ -1,45 +1,39 @@
+import { google } from "googleapis";
+import fs from "node:fs";
 const fsp = fs.promises;
-function Worker() {}
+function Worker() { }
 Worker.prototype.setAuth = async function () {
+  const keyFile = process.env.GOOGLE_APPLICATION_CREDENTIALS;
+  const settings = JSON.parse(await fsp.readFile(keyFile));
+  if (!settings.subject_to_impersonate)
+    throw new Error(`You should include subject_to_impersonate in file ${keyFile}`);
+  const auth = new google.auth.GoogleAuth({
+    clientOptions: {
+      subject: settings.subject_to_impersonate,
+    },
+    keyFile,
+    scopes: ['https://www.googleapis.com/auth/drive'],
+  });
+  google.options({
+    auth,
+  });
 };
 Worker.prototype.list = async function ({ path }) {
-  return raw.data?.files;
+  await this.setAuth();
+  const drive = google.drive({ version: 'v3' });
+  const folderId = path;
+  const q = `'${folderId}' in parents and trashed=false`;
+  const raw = await drive.files.list({
+    pageSize: 150,
+    q,
+    supportsAllDrives: true, // include share drives as well
+    includeItemsFromAllDrives: true,
+  });
+  return raw.data?.files;
 };
 Worker.prototype.list.metadata = {
+  options: {
+    path: {},
+  },
 };
-module.exports = Worker;
+export default Worker;
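For context, a minimal usage sketch of the rewritten worker (the import path and folder id are hypothetical, not part of the diff; setAuth() assumes GOOGLE_APPLICATION_CREDENTIALS points at a service-account key file that also carries a subject_to_impersonate field):

import GoogleDriveWorker from './file/GoogleDrive.js'; // hypothetical specifier

const worker = new GoogleDriveWorker();
// list() authenticates via setAuth(), then returns the folder's non-trashed files.
const files = await worker.list({ path: '<drive-folder-id>' }); // hypothetical folder id
console.log(files?.map((f) => f.name));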
package/file/Parquet.js
CHANGED
@@ -1,149 +1,137 @@
+import parquet from "@dsnp/parquetjs";
+import nodestream from "node:stream";
+import debug$0 from "debug";
+import clientS3 from "@aws-sdk/client-s3";
+import FileWorker from "./FileUtilities.js";
+const { Readable } = nodestream;
+const debug = debug$0('Parquet');
+const { S3Client } = clientS3;
+function Worker() { }
 async function getReader(options) {
+  const { filename } = options;
+  if (!filename)
+    throw new Error('filename is required');
+  if (filename.indexOf('s3://') === 0) {
+    const client = new S3Client({});
+    const parts = filename.split('/');
+    return parquet.ParquetReader.openS3(client, {
+      Bucket: parts[2],
+      Key: parts.slice(3).join('/')
+    });
+  }
+  return parquet.ParquetReader.openFile(filename);
 }
 Worker.prototype.meta = async function (options) {
+  const reader = await getReader(options);
+  const schema = reader.getSchema();
+  return {
+    //stored as a buffer
+    schema,
+    records: parseInt(reader.metadata?.num_rows?.toString(), 10)
+  };
+  // getMetadata();
 };
 Worker.prototype.meta.metadata = {
+  options: {
+    path: {}
+  }
 };
 Worker.prototype.schema = async function (options) {
+  const reader = await getReader(options);
+  return reader.getSchema();
 };
 Worker.prototype.schema.metadata = {
+  options: {
+    path: {}
+  }
 };
 function cleanColumnName(name) {
+  return name.toLowerCase().replace(/[^a-z0-9_]/g, '_');
 }
 Worker.prototype.stream = async function (options) {
-  if (parseInt(options.limit, 10) === options.limit) limit = parseInt(options.limit, 10);
-  // create a new cursor
-  debug(`Reading parquet file ${options.filename} with columns ${columns?.join(',')} and limit ${limit}`);
-  const cursor = reader.getCursor(columns);
-  let counter = 0;
-  const start = new Date().getTime();
-  const stream = new Readable({
-    objectMode: true,
-    async read() {
-      const token = await cursor.next();
-      if (token) {
-        counter += 1;
-        if (limit && counter > limit) {
-          debug(`Reached limit of ${limit}, stopping`);
-          this.push(null);
-          await reader.close();
-          return;
-        }
-        if (counter % 10000 === 0) {
-          let m = process.memoryUsage().heapTotal;
-          const end = new Date().getTime();
-          debug(
-            `Read ${counter} ${(counter * 1000) / (end - start)}/sec, Node reported memory usage: ${
-              m / 1024 / 1024
-            } MBs`
-          );
-        }
-        this.push(token);
-      } else {
-        await reader.close();
-        this.push(null);
-      }
+  const reader = await getReader(options);
+  let columns;
+  if (options.columns) {
+    const { fieldList } = await this.schema(options);
+    columns = [];
+    let requestedColumns = options.columns;
+    if (typeof options.columns === 'string')
+      requestedColumns = options.columns.split(',').map((d) => d.trim());
+    else
+      requestedColumns = options.columns.map((d) => (d.name ? d.name.trim() : d.trim()));
+    requestedColumns.forEach((c) => {
+      const matchingCols = fieldList
+        .filter((f) => f.name === c || cleanColumnName(f.name) === cleanColumnName(c))
+        .map((f) => f.name);
+      columns = columns.concat(matchingCols);
+    });
   }
+  let limit = 0;
+  if (parseInt(options.limit, 10) === options.limit)
+    limit = parseInt(options.limit, 10);
+  // create a new cursor
+  debug(`Reading parquet file ${options.filename} with columns ${columns?.join(',')} and limit ${limit}`);
+  const cursor = reader.getCursor(columns);
+  let counter = 0;
+  const start = new Date().getTime();
+  const stream = new Readable({
+    objectMode: true,
+    async read() {
+      const token = await cursor.next();
+      if (token) {
+        counter += 1;
+        if (limit && counter > limit) {
+          debug(`Reached limit of ${limit}, stopping`);
+          this.push(null);
+          await reader.close();
+          return;
+        }
+        if (counter % 10000 === 0) {
+          let m = process.memoryUsage().heapTotal;
+          const end = new Date().getTime();
+          debug(`Read ${counter} ${(counter * 1000) / (end - start)}/sec, Node reported memory usage: ${m / 1024 / 1024} MBs`);
+        }
+        this.push(token);
+      }
+      else {
+        await reader.close();
+        this.push(null);
+      }
+    }
+  });
+  return { stream };
 };
 Worker.prototype.stream.metadata = {
+  options: {
+    path: {}
+  }
 };
 Worker.prototype.toFile = async function (options) {
+  const { stream } = await this.stream(options);
+  const fworker = new FileWorker(this);
+  return fworker.objectStreamToFile({ ...options, stream });
 };
 Worker.prototype.toFile.metadata = {
+  options: {
+    path: {}
+  }
 };
 Worker.prototype.stats = async function (options) {
-};
+  const reader = await getReader(options);
+  const schema = reader.getSchema();
+  const fileMetadata = reader.getFileMetaData();
+  const rowGroups = reader.getRowGroups();
+  // const reader = await parquet.ParquetReader.openS3(client, getParams(options));
+  // return reader.getSchema();
+  return {
+    schema,
+    fileMetadata,
+    rowGroups
+  };
 };
 Worker.prototype.stats.metadata = {
+  options: {
+    path: {}
+  }
 };
-module.exports = Worker;
+export default Worker;
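A minimal usage sketch of the new streaming API (the import path, filename, and column names are hypothetical): columns may be passed as a comma-separated string and are matched against the schema's fieldList case- and punctuation-insensitively via cleanColumnName(), and a numeric limit ends the stream early.

import ParquetWorker from './file/Parquet.js'; // hypothetical specifier

const worker = new ParquetWorker();
const { records } = await worker.meta({ filename: 'events.parquet' }); // row count from footer metadata
console.log(`file holds ${records} rows`);
const { stream } = await worker.stream({
  filename: 'events.parquet', // local path or s3://bucket/key
  columns: 'id,email', // resolved against the parquet schema's fieldList
  limit: 100, // must be an actual number; string values are ignored
});
for await (const row of stream) {
  console.log(row); // one object per parquet row
}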
package/file/R2.js
CHANGED
@@ -1,37 +1,32 @@
-const S3 = require('./S3');
+import util from "node:util";
+import clientS3 from "@aws-sdk/client-s3";
+import S3 from "./S3.js";
+const { S3Client, } = clientS3;
 function R2(worker) {
+  S3.call(this, worker);
+  this.prefix = 'r2';
 }
 util.inherits(R2, S3);
 R2.prototype.getClient = function () {
-  }
-  return this.client;
+  const missing = ['CLOUDFLARE_R2_ACCOUNT_ID', 'CLOUDFLARE_R2_ACCESS_KEY_ID', 'CLOUDFLARE_R2_SECRET_ACCESS_KEY']
+    .filter((r) => !process.env[r]);
+  if (missing.length > 0)
+    throw new Error(`Missing environment variables for Cloudflare access:${missing.join(',')}`);
+  const ACCOUNT_ID = process.env.CLOUDFLARE_R2_ACCOUNT_ID;
+  const ACCESS_KEY_ID = process.env.CLOUDFLARE_R2_ACCESS_KEY_ID;
+  const SECRET_ACCESS_KEY = process.env.CLOUDFLARE_R2_SECRET_ACCESS_KEY;
+  if (!this.client) {
+    this.client = new S3Client({
+      // R2 does not strictly require a region, but the SDK expects one. 'auto' works fine.
+      region: 'auto',
+      endpoint: `https://${ACCOUNT_ID}.r2.cloudflarestorage.com`,
+      credentials: {
+        accessKeyId: ACCESS_KEY_ID,
+        secretAccessKey: SECRET_ACCESS_KEY,
+      },
+      forcePathStyle: true, // Important for R2 compatibility
+    });
+  }
+  return this.client;
 };
-module.exports = R2;
+export default R2;
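Since R2 inherits from the S3 worker, getClient() hands back an ordinary S3Client pointed at Cloudflare's S3-compatible endpoint, so any AWS SDK v3 command works against it. A minimal sketch, assuming the three CLOUDFLARE_R2_* environment variables are set; the import path, constructor argument, and bucket name are hypothetical (the parent worker's shape is not shown in this diff):

import { ListObjectsV2Command } from '@aws-sdk/client-s3';
import R2 from './file/R2.js'; // hypothetical specifier

const r2 = new R2({}); // parent-worker argument assumed; R2 forwards it to S3.call()
const client = r2.getClient(); // throws if any CLOUDFLARE_R2_* variable is missing
const { Contents } = await client.send(
  new ListObjectsV2Command({ Bucket: 'my-bucket' }), // hypothetical bucket
);
console.log(Contents?.map((o) => o.Key));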