@engine9-io/input-tools 1.4.1 → 1.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/file/FileUtilities.js +54 -23
- package/file/R2.js +36 -0
- package/file/S3.js +49 -9
- package/index.js +4 -0
- package/package.json +1 -1
package/file/FileUtilities.js
CHANGED
@@ -15,6 +15,7 @@ const debug = require('debug')('FileWorker');
 const csv = require('csv');
 const JSON5 = require('json5');// Useful for parsing extended JSON
 const languageEncoding = require('detect-file-encoding-and-language');
+const R2Worker = require('./R2');
 const S3Worker = require('./S3');
 const ParquetWorker = require('./Parquet');
 const { streamPacket } = require('./tools');
@@ -401,7 +402,7 @@ Worker.prototype.objectStreamToFile = async function (options) {
 Worker.prototype.transform = async function (options) {
   const worker = this;

-  const filename =
+  const { filename } = options;

   debug(`Transforming ${filename}`);

@@ -489,19 +490,23 @@ Worker.prototype.stream = async function (
   } else if (filename) {
     if (filename.startsWith('engine9-accounts/')) {
       filename = `${process.env.ENGINE9_ACCOUNT_DIR}/${filename.slice('engine9-accounts/'.length)}`;
-      debug(`Prepending file with ${process.env.ENGINE9_ACCOUNT_DIR}, filename=${filename}`);
+      // debug(`Prepending file with ${process.env.ENGINE9_ACCOUNT_DIR}, filename=${filename}`);
     } else {
-      debug(`Not prepending filename:${filename}`);
+      // debug(`Not prepending filename:${filename}`);
     }
     let encoding; let stream;
     if (filename.slice(-8) === '.parquet') {
       const pq = new ParquetWorker(this);
       stream = (await pq.stream({ filename, columns, limit })).stream;
       encoding = 'object';
-    } else if (filename.
+    } else if (filename.startsWith('s3://')) {
       const s3Worker = new S3Worker(this);
       stream = (await s3Worker.stream({ filename, columns, limit })).stream;
       encoding = 'UTF-8';
+    } else if (filename.startsWith('r2://')) {
+      const r2Worker = new R2Worker(this);
+      stream = (await r2Worker.stream({ filename, columns, limit })).stream;
+      encoding = 'UTF-8';
     } else {
       // Check if the file exists, and fast fail if not
       // Otherwise the stream hangs out as a handle
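For context, a minimal usage sketch of the new r2:// branch of stream(). The require path, bucket name, constructor options, and the returned { stream } shape are assumptions based on the code in this diff, not documented API.

```js
// Hypothetical usage sketch, assuming FileUtilities.js exports the Worker
// constructor shown above and R2 credentials are present in the environment.
const FileWorker = require('@engine9-io/input-tools/file/FileUtilities');

async function main() {
  const fileWorker = new FileWorker(); // constructor options omitted here
  // In 1.5.0, r2:// filenames are routed to the new R2Worker the same way
  // s3:// filenames are routed to S3Worker.
  const { stream } = await fileWorker.stream({
    filename: 'r2://my-bucket/exports/people.csv', // example bucket/key
    limit: 100,
  });
  stream.on('data', (chunk) => process.stdout.write(chunk));
  stream.on('end', () => process.stdout.write('\n'));
}

main().catch((e) => { console.error(e); process.exit(1); });
```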
@@ -541,13 +546,13 @@ Worker.prototype.sample.metadata = {

 Worker.prototype.write = async function (opts) {
   const { filename, content } = opts;
-  if (filename.
-  const
+  if (filename.startsWith('s3://') || filename.startsWith('r2://')) {
+    const worker = new (filename.startsWith('r2://') ? R2Worker : S3Worker)(this);
     const parts = filename.split('/');
     const directory = parts.slice(0, -1).join('/');
     const file = parts.slice(-1)[0];
     // debug(JSON.stringify({ parts, directory, file }));
-    await
+    await worker.write({
       directory,
       file,
       content,
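The pattern above, repeated in list, listAll, stat, move, and download below, picks the remote worker class from the URL scheme. A minimal standalone sketch of that dispatch; getRemoteWorker and writeRemote are invented helper names, not part of the package.

```js
// Illustration only: mirrors the scheme-based dispatch used in this diff.
const R2Worker = require('./R2');
const S3Worker = require('./S3');

function getRemoteWorker(parent, filename) {
  if (!filename.startsWith('s3://') && !filename.startsWith('r2://')) return null;
  // r2:// gets the Cloudflare-backed subclass; any other remote scheme is S3.
  const RemoteWorker = filename.startsWith('r2://') ? R2Worker : S3Worker;
  return new RemoteWorker(parent);
}

async function writeRemote(parent, filename, content) {
  const worker = getRemoteWorker(parent, filename);
  if (!worker) throw new Error(`Not a remote filename: ${filename}`);
  const parts = filename.split('/');
  return worker.write({
    directory: parts.slice(0, -1).join('/'), // e.g. 'r2://my-bucket/exports'
    file: parts.slice(-1)[0],                // e.g. 'people.csv'
    content,
  });
}
```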
@@ -559,7 +564,7 @@ Worker.prototype.write = async function (opts) {
 };
 Worker.prototype.write.metadata = {
   options: {
-    filename: { description: 'Location to write content to, can be local or s3://' },
+    filename: { description: 'Location to write content to, can be local or s3:// or r2://' },
     content: {},
   },
 };
@@ -596,9 +601,9 @@ Worker.prototype.json.metadata = {

 Worker.prototype.list = async function ({ directory }) {
   if (!directory) throw new Error('directory is required');
-  if (directory.
-  const
-  return
+  if (directory.startsWith('s3://') || directory.startsWith('r2://')) {
+    const worker = new (directory.startsWith('r2://') ? R2Worker : S3Worker)(this);
+    return worker.list({ directory });
   }
   const a = await fsp.readdir(directory, { withFileTypes: true });
   return a.map((f) => ({
@@ -614,9 +619,9 @@ Worker.prototype.list.metadata = {

 Worker.prototype.listAll = async function ({ directory }) {
   if (!directory) throw new Error('directory is required');
-  if (directory.
-  const
-  return
+  if (directory.startsWith('s3://') || directory.startsWith('r2://')) {
+    const worker = new (directory.startsWith('r2://') ? R2Worker : S3Worker)(this);
+    return worker.listAll({ directory });
   }
   const a = await fsp.readdir(directory, { recursive: true });

@@ -630,9 +635,9 @@ Worker.prototype.listAll.metadata = {

 Worker.prototype.empty = async function ({ directory }) {
   if (!directory) throw new Error('directory is required');
-  if (directory.
+  if (directory.startsWith('s3://') || directory.startsWith('r2://')) {
     // currently not emptying S3 this way -- dangerous
-    throw new Error('Cannot empty an s3:// directory');
+    throw new Error('Cannot empty an s3:// or r2:// directory');
   }
   const removed = [];
   // eslint-disable-next-line no-restricted-syntax
@@ -649,11 +654,23 @@ Worker.prototype.empty.metadata = {
 };

 Worker.prototype.move = async function ({ filename, target }) {
-  if (!target) throw new Error('
-  if (target.
-
+  if (!target) throw new Error('target is required');
+  if (target.startsWith('s3://') || target.startsWith('r2://')) {
+    if ((target.startsWith('s3://') && filename.startsWith('r2://'))
+      || (target.startsWith('r2://') && filename.startsWith('s3://'))) {
+      throw new Error('Cowardly not copying between services');
+    }
+
+    const worker = new (filename.startsWith('r2://') ? R2Worker : S3Worker)(this);
+
+    if (filename.startsWith('s3://') || filename.startsWith('r2://')) {
+      // We need to copy and delete
+      const output = await worker.copy({ filename, target });
+      await worker.remove({ filename });
+      return output;
+    }
     const parts = target.split('/');
-    return
+    return worker.put({ filename, directory: parts.slice(0, -1).join('/'), file: parts.slice(-1)[0] });
   }
   await fsp.mkdir(path.dirname(target), { recursive: true });
   await fsp.rename(filename, target);
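Neither S3 nor R2 exposes a rename, so the remote branch of move() is a server-side copy followed by a delete of the source. A minimal sketch of that sequence; `worker` is an S3Worker or R2Worker instance from this diff, and the object names are illustrative.

```js
// Sketch only: mirrors the copy-then-delete sequence in move() above.
async function moveRemoteObject(worker, filename, target) {
  const output = await worker.copy({ filename, target }); // CopyObjectCommand
  await worker.remove({ filename }); // DeleteObjectCommand
  return output;
}

// e.g. moveRemoteObject(new S3Worker(parent),
//   's3://my-bucket/tmp/people.csv', 's3://my-bucket/archive/people.csv');
```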
@@ -668,9 +685,9 @@ Worker.prototype.move.metadata = {

 Worker.prototype.stat = async function ({ filename }) {
   if (!filename) throw new Error('filename is required');
-  if (filename.
-  const
-  return
+  if (filename.startsWith('s3://') || filename.startsWith('r2://')) {
+    const worker = new (filename.startsWith('r2://') ? R2Worker : S3Worker)(this);
+    return worker.stat({ filename });
   }
   const {
     ctime,
@@ -693,6 +710,20 @@ Worker.prototype.stat.metadata = {
   },
 };

+Worker.prototype.download = async function ({ filename }) {
+  if (!filename) throw new Error('filename is required');
+  if (filename.startsWith('s3://') || filename.startsWith('r2://')) {
+    const worker = new (filename.startsWith('r2://') ? R2Worker : S3Worker)(this);
+    return worker.download({ filename });
+  }
+  throw new Error('Cannot download a local file');
+};
+Worker.prototype.download.metadata = {
+  options: {
+    filename: {},
+  },
+};
+
 Worker.prototype.head = async function (options) {
   const { stream } = await this.fileToObjectStream(options);
   const chunks = [];
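A hedged usage sketch of the new download() dispatcher. The bucket and key are hypothetical, and the resolved value is whatever S3Worker.download returns after writing the object to a temp file (the exact return shape is not shown in this diff).

```js
// Hypothetical usage: fileWorker is an instance of the FileUtilities Worker.
async function pullReport(fileWorker) {
  // Remote filenames are handed to the matching S3Worker/R2Worker;
  // a plain local path throws 'Cannot download a local file'.
  return fileWorker.download({ filename: 's3://my-bucket/reports/2024-06.parquet' });
}
```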
package/file/R2.js
ADDED
@@ -0,0 +1,36 @@
+const util = require('node:util');
+const {
+  S3Client,
+} = require('@aws-sdk/client-s3');
+const S3 = require('./S3');
+
+function R2(worker) {
+  S3.call(this, worker);
+}
+util.inherits(R2, S3);
+
+R2.prototype.getClient = function () {
+  const missing = ['CLOUDFLARE_R2_ACCOUNT_ID', 'CLOUDFLARE_R2_ACCESS_KEY_ID', 'CLOUDFLARE_R2_SECRET_ACCESS_KEY']
+    .filter((r) => !process.env[r]);
+  if (missing.length > 0) throw new Error(`Missing environment variables for Cloudflare access:${missing.join(',')}`);
+  const ACCOUNT_ID = process.env.CLOUDFLARE_R2_ACCOUNT_ID;
+  const ACCESS_KEY_ID = process.env.CLOUDFLARE_R2_ACCESS_KEY_ID;
+  const SECRET_ACCESS_KEY = process.env.CLOUDFLARE_R2_SECRET_ACCESS_KEY;
+
+  if (!this.client) {
+    this.client = new S3Client({
+      // R2 does not strictly require a region, but the SDK expects one. 'auto' works fine.
+      region: 'auto',
+      endpoint: `https://${ACCOUNT_ID}.r2.cloudflarestorage.com`,
+      credentials: {
+        accessKeyId: ACCESS_KEY_ID,
+        secretAccessKey: SECRET_ACCESS_KEY,
+      },
+      forcePathStyle: true, // Important for R2 compatibility
+
+    });
+  }
+  return this.client;
+};
+
+module.exports = R2;
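The whole subclass is the getClient override: Cloudflare R2 speaks the S3 API, so every method R2 inherits from the S3 worker (stream, put, write, list, listAll, stat, copy, remove, download) works unchanged once the client points at the account-specific endpoint. A standalone sketch of the same configuration outside this package; the bucket and prefix are examples.

```js
// Sketch: a stock AWS SDK v3 client pointed at Cloudflare R2, matching the
// settings R2.prototype.getClient uses above.
const { S3Client, ListObjectsV2Command } = require('@aws-sdk/client-s3');

const client = new S3Client({
  region: 'auto', // R2 ignores the region, but the SDK requires one
  endpoint: `https://${process.env.CLOUDFLARE_R2_ACCOUNT_ID}.r2.cloudflarestorage.com`,
  credentials: {
    accessKeyId: process.env.CLOUDFLARE_R2_ACCESS_KEY_ID,
    secretAccessKey: process.env.CLOUDFLARE_R2_SECRET_ACCESS_KEY,
  },
  forcePathStyle: true,
});

async function listExports() {
  const { Contents = [] } = await client.send(
    new ListObjectsV2Command({ Bucket: 'my-bucket', Prefix: 'exports/' }),
  );
  return Contents.map((o) => o.Key);
}
```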
package/file/S3.js
CHANGED
@@ -1,9 +1,11 @@
-const debug = require('debug')('
+const debug = require('debug')('@engine9-io/input/S3');
 const fs = require('node:fs');
 // eslint-disable-next-line import/no-unresolved
 const { mimeType: mime } = require('mime-type/with-db');
 const {
   S3Client,
+  CopyObjectCommand,
+  DeleteObjectCommand,
   GetObjectCommand,
   HeadObjectCommand,
   GetObjectAttributesCommand, PutObjectCommand,
@@ -14,7 +16,10 @@ const { getTempFilename } = require('./tools');
 function Worker() {}

 function getParts(filename) {
-  if (!filename
+  if (!filename) throw new Error(`Invalid filename: ${filename}`);
+  if (!filename.startsWith('r2://') && !filename.startsWith('s3://')) {
+    throw new Error(`Invalid filename, must start with r2:// or s3://: ${filename}`);
+  }
   const parts = filename.split('/');
   const Bucket = parts[2];
   const Key = parts.slice(3).join('/');
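For reference, a worked example of what getParts() yields; the filename is hypothetical. The scheme check is the only new behavior: r2:// is now accepted alongside s3://.

```js
// Worked example of getParts(): the bucket is the third '/'-separated part,
// the key is everything after it.
const filename = 'r2://my-bucket/exports/2024/people.csv'; // example name
const parts = filename.split('/'); // ['r2:', '', 'my-bucket', 'exports', '2024', 'people.csv']
const Bucket = parts[2]; // 'my-bucket'
const Key = parts.slice(3).join('/'); // 'exports/2024/people.csv'
```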
@@ -44,7 +49,7 @@ Worker.prototype.getMetadata.metadata = {
 };

 Worker.prototype.stream = async function ({ filename }) {
-  const s3Client =
+  const s3Client = this.getClient();
   const { Bucket, Key } = getParts(filename);
   const command = new GetObjectCommand({ Bucket, Key });
   try {
@@ -62,10 +67,45 @@ Worker.prototype.stream.metadata = {
   },
 };

+Worker.prototype.copy = async function ({ filename, target }) {
+  if (!filename.startsWith('s3://')) throw new Error('Cowardly not copying a file not from s3 -- use put instead');
+  const s3Client = this.getClient();
+  const { Bucket, Key } = getParts(target);
+
+  debug(`Copying ${filename} to ${JSON.stringify({ Bucket, Key })}}`);
+
+  const command = new CopyObjectCommand({
+    CopySource: filename.slice(4), // remove the s3:/
+    Bucket,
+    Key,
+  });
+
+  return s3Client.send(command);
+};
+
+Worker.prototype.copy.metadata = {
+  options: {
+    filename: {},
+    target: {},
+  },
+};
+
+Worker.prototype.remove = async function ({ filename }) {
+  const s3Client = this.getClient();
+  const { Bucket, Key } = getParts(filename);
+  const command = new DeleteObjectCommand({ Bucket, Key });
+  return s3Client.send(command);
+};
+Worker.prototype.remove.metadata = {
+  options: {
+    filename: {},
+  },
+};
+
 Worker.prototype.download = async function ({ filename }) {
   const file = filename.split('/').pop();
   const localPath = await getTempFilename({ targetFilename: file });
-  const s3Client =
+  const s3Client = this.getClient();
   const { Bucket, Key } = getParts(filename);
   const command = new GetObjectCommand({ Bucket, Key });
   debug(`Downloading ${file} to ${localPath}`);
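The new copy() sends the source as a CopySource string and the destination as separate Bucket/Key fields. A sketch of what the commands look like for a concrete, hypothetical source/target pair; note that `filename.slice(4)` leaves a leading slash, which S3 commonly accepts in CopySource alongside the documented "bucket/key" form.

```js
// Sketch of the CopyObjectCommand/DeleteObjectCommand pair behind copy() and
// remove() above; bucket and key names are examples.
const { CopyObjectCommand, DeleteObjectCommand } = require('@aws-sdk/client-s3');

const filename = 's3://source-bucket/exports/people.csv'; // example source
const copy = new CopyObjectCommand({
  CopySource: filename.slice(4), // '/source-bucket/exports/people.csv'
  Bucket: 'dest-bucket',         // destination bucket
  Key: 'backups/people.csv',     // destination key
});
const del = new DeleteObjectCommand({
  Bucket: 'source-bucket',
  Key: 'exports/people.csv',
});
// await s3Client.send(copy); await s3Client.send(del); // as move() composes them
```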
@@ -102,7 +142,7 @@ Worker.prototype.put = async function (options) {
   const ContentType = mime.lookup(file);

   debug(`Putting ${filename} to ${JSON.stringify({ Bucket, Key, ContentType })}}`);
-  const s3Client =
+  const s3Client = this.getClient();

   const command = new PutObjectCommand({
     Bucket, Key, Body, ContentType,
@@ -129,7 +169,7 @@ Worker.prototype.write = async function (options) {
   const Body = content;

   debug(`Writing content of length ${content.length} to ${JSON.stringify({ Bucket, Key })}}`);
-  const s3Client =
+  const s3Client = this.getClient();
   const ContentType = mime.lookup(file);

   const command = new PutObjectCommand({
@@ -151,7 +191,7 @@ Worker.prototype.list = async function ({ directory }) {
   let dir = directory;
   while (dir.slice(-1) === '/') dir = dir.slice(0, -1);
   const { Bucket, Key: Prefix } = getParts(dir);
-  const s3Client =
+  const s3Client = this.getClient();
   const command = new ListObjectsV2Command({
     Bucket,
     Prefix: `${Prefix}/`,
@@ -182,7 +222,7 @@ Worker.prototype.listAll = async function ({ directory }) {
   let dir = directory;
   while (dir.slice(-1) === '/') dir = dir.slice(0, -1);
   const { Bucket, Key: Prefix } = getParts(dir);
-  const s3Client =
+  const s3Client = this.getClient();
   const files = [];
   let ContinuationToken = null;
   do {
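listAll pages through results with a ContinuationToken loop (the do/while visible above). The body of that loop is not shown in this diff, so here is a generic ListObjectsV2 pagination sketch under the same client configuration, with invented variable names.

```js
// Generic ListObjectsV2 pagination sketch; `client` is a configured S3Client
// (see the R2/S3 getClient code above), bucket and prefix are examples.
const { ListObjectsV2Command } = require('@aws-sdk/client-s3');

async function listAllKeys(client, Bucket, Prefix) {
  const keys = [];
  let ContinuationToken;
  do {
    // eslint-disable-next-line no-await-in-loop
    const page = await client.send(new ListObjectsV2Command({ Bucket, Prefix, ContinuationToken }));
    (page.Contents || []).forEach((o) => keys.push(o.Key));
    ContinuationToken = page.IsTruncated ? page.NextContinuationToken : undefined;
  } while (ContinuationToken);
  return keys;
}
```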
@@ -211,7 +251,7 @@ Worker.prototype.listAll.metadata = {
 Worker.prototype.stat = async function ({ filename }) {
   if (!filename) throw new Error('filename is required');

-  const s3Client =
+  const s3Client = this.getClient();
   const { Bucket, Key } = getParts(filename);
   const command = new HeadObjectCommand({ Bucket, Key });
   const response = await s3Client.send(command);
package/index.js
CHANGED
@@ -364,6 +364,10 @@ function getTimelineEntryUUID(inputObject, { defaults = {} } = {}) {
   // eslint-disable-next-line no-restricted-globals
   if (isNaN(ts)) throw new Error(`getTimelineEntryUUID got an invalid date:${o.ts || '<blank>'}`);
   const idString = `${ts.toISOString()}-${o.person_id}-${o.entry_type_id}-${o.source_code_id || 0}`;
+
+  if (!uuidIsValid(o.input_id)) {
+    throw new Error(`Invalid input_id:'${o.input_id}', type ${typeof o.input_id} -- should be a uuid`);
+  }
   // get a temp ID
   const uuid = uuidv5(idString, o.input_id);
   // Change out the ts to match the v7 sorting.
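The guard matters because input_id is used as the uuidv5 namespace on the next line, and v5 requires a valid UUID (or 16-byte array) there; a bad input_id would otherwise surface as a cryptic error deep inside the uuid library. A minimal sketch, assuming uuidIsValid and uuidv5 map to the uuid package's validate and v5 exports as the names suggest; the id string and namespace below are examples.

```js
// Minimal sketch of the validate-then-derive step added above.
const { v5: uuidv5, validate: uuidIsValid } = require('uuid');

const idString = '2024-01-01T00:00:00.000Z-12345-1-0'; // example timeline id string
const inputId = '6fa459ea-ee8a-3ca4-894e-db77e160355e'; // example namespace uuid

if (!uuidIsValid(inputId)) throw new Error(`Invalid input_id: ${inputId}`);
const uuid = uuidv5(idString, inputId); // deterministic v5 uuid for this entry
```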