@engine9/input-tools 2.0.8 → 2.0.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/file/FileUtilities.js +54 -2
- package/file/S3.js +321 -256
- package/file/tools.js +14 -0
- package/package.json +1 -1
- package/skills/timeline/SKILL.md +152 -0
package/file/FileUtilities.js
CHANGED
|
@@ -11,12 +11,12 @@ import languageEncoding from 'detect-file-encoding-and-language';
|
|
|
11
11
|
import R2Worker from './R2.js';
|
|
12
12
|
import S3Worker from './S3.js';
|
|
13
13
|
import ParquetWorker from './Parquet.js';
|
|
14
|
-
import { bool, getTempFilename, getStringArray, getTempDir, makeStrings, streamPacket, relativeDate } from './tools.js';
|
|
14
|
+
import { bool, getTempFilename, getStringArray, getTempDir, getFilePostfix, makeStrings, streamPacket, relativeDate } from './tools.js';
|
|
15
15
|
const fsp = fs.promises;
|
|
16
16
|
const { Readable, Transform, PassThrough, Writable } = nodestream;
|
|
17
17
|
const { pipeline } = promises;
|
|
18
18
|
|
|
19
|
-
const debug = debug$0('@engine9
|
|
19
|
+
const debug = debug$0('@engine9/file');
|
|
20
20
|
const { getXlsxStream } = xlstream;
|
|
21
21
|
|
|
22
22
|
function Worker({ accountId }) {
|
|
@@ -657,6 +657,58 @@ Worker.prototype.list.metadata = {
|
|
|
657
657
|
directory: { required: true }
|
|
658
658
|
}
|
|
659
659
|
};
|
|
660
|
+
Worker.prototype.analyze = async function ({ directory }) {
|
|
661
|
+
if (!directory) throw new Error('directory is required');
|
|
662
|
+
if (directory.startsWith('s3://') || directory.startsWith('r2://')) {
|
|
663
|
+
const worker = new (directory.startsWith('r2://') ? R2Worker : S3Worker)(this);
|
|
664
|
+
return worker.analyze({ directory });
|
|
665
|
+
}
|
|
666
|
+
let fileCount = 0;
|
|
667
|
+
let directoryCount = 0;
|
|
668
|
+
let firstModified = null;
|
|
669
|
+
let lastModified = null;
|
|
670
|
+
let firstTime = null;
|
|
671
|
+
let lastTime = null;
|
|
672
|
+
const postfixCounts = Object.create(null);
|
|
673
|
+
const walk = async (dir) => {
|
|
674
|
+
const entries = await fsp.readdir(dir, { withFileTypes: true });
|
|
675
|
+
for (const ent of entries) {
|
|
676
|
+
const fullPath = path.join(dir, ent.name);
|
|
677
|
+
if (ent.isDirectory()) {
|
|
678
|
+
directoryCount += 1;
|
|
679
|
+
await walk(fullPath);
|
|
680
|
+
} else {
|
|
681
|
+
fileCount += 1;
|
|
682
|
+
const postfix = getFilePostfix(fullPath);
|
|
683
|
+
postfixCounts[postfix] = (postfixCounts[postfix] || 0) + 1;
|
|
684
|
+
const stats = await fsp.stat(fullPath);
|
|
685
|
+
const mtime = stats.mtimeMs;
|
|
686
|
+
const modifiedAt = new Date(stats.mtime).toISOString();
|
|
687
|
+
if (firstTime === null || mtime < firstTime) {
|
|
688
|
+
firstTime = mtime;
|
|
689
|
+
firstModified = { filename: fullPath, modifiedAt };
|
|
690
|
+
}
|
|
691
|
+
if (lastTime === null || mtime > lastTime) {
|
|
692
|
+
lastTime = mtime;
|
|
693
|
+
lastModified = { filename: fullPath, modifiedAt };
|
|
694
|
+
}
|
|
695
|
+
}
|
|
696
|
+
}
|
|
697
|
+
};
|
|
698
|
+
await walk(directory);
|
|
699
|
+
return {
|
|
700
|
+
fileCount,
|
|
701
|
+
directoryCount,
|
|
702
|
+
postfixCounts,
|
|
703
|
+
firstModified: fileCount ? firstModified : null,
|
|
704
|
+
lastModified: fileCount ? lastModified : null
|
|
705
|
+
};
|
|
706
|
+
};
|
|
707
|
+
Worker.prototype.analyze.metadata = {
|
|
708
|
+
options: {
|
|
709
|
+
directory: { required: true }
|
|
710
|
+
}
|
|
711
|
+
};
|
|
660
712
|
Worker.prototype.listAll = async function ({ directory, start: s, end: e }) {
|
|
661
713
|
if (!directory) throw new Error('directory is required');
|
|
662
714
|
let start = null;
|
package/file/S3.js
CHANGED
|
@@ -1,308 +1,373 @@
|
|
|
1
|
-
import debug$0 from
|
|
2
|
-
import fs from
|
|
3
|
-
import withDb from
|
|
4
|
-
import clientS3 from
|
|
5
|
-
import { getTempFilename, relativeDate } from
|
|
6
|
-
const debug = debug$0('@engine9
|
|
1
|
+
import debug$0 from 'debug';
|
|
2
|
+
import fs from 'node:fs';
|
|
3
|
+
import withDb from 'mime-type/with-db';
|
|
4
|
+
import clientS3 from '@aws-sdk/client-s3';
|
|
5
|
+
import { getTempFilename, getFilePostfix, relativeDate } from './tools.js';
|
|
6
|
+
const debug = debug$0('@engine9/input/S3');
|
|
7
7
|
const { mimeType: mime } = withDb;
|
|
8
|
-
const {
|
|
8
|
+
const {
|
|
9
|
+
S3Client,
|
|
10
|
+
CopyObjectCommand,
|
|
11
|
+
DeleteObjectCommand,
|
|
12
|
+
GetObjectCommand,
|
|
13
|
+
HeadObjectCommand,
|
|
14
|
+
GetObjectAttributesCommand,
|
|
15
|
+
PutObjectCommand,
|
|
16
|
+
ListObjectsV2Command
|
|
17
|
+
} = clientS3;
|
|
9
18
|
function Worker() {
|
|
10
|
-
|
|
19
|
+
this.prefix = 's3';
|
|
11
20
|
}
|
|
12
21
|
function getParts(filename) {
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
return { Bucket, Key };
|
|
22
|
+
if (!filename) throw new Error(`Invalid filename: ${filename}`);
|
|
23
|
+
if (!filename.startsWith('r2://') && !filename.startsWith('s3://')) {
|
|
24
|
+
throw new Error(`Invalid filename, must start with r2:// or s3://: ${filename}`);
|
|
25
|
+
}
|
|
26
|
+
const parts = filename.split('/');
|
|
27
|
+
const Bucket = parts[2];
|
|
28
|
+
const Key = parts.slice(3).join('/');
|
|
29
|
+
return { Bucket, Key };
|
|
22
30
|
}
|
|
23
31
|
Worker.prototype.getClient = function () {
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
return this.client;
|
|
32
|
+
if (!this.client) this.client = new S3Client({});
|
|
33
|
+
return this.client;
|
|
27
34
|
};
|
|
28
35
|
Worker.prototype.getMetadata = async function ({ filename }) {
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
36
|
+
const s3Client = this.getClient();
|
|
37
|
+
const { Bucket, Key } = getParts(filename);
|
|
38
|
+
const resp = await s3Client.send(
|
|
39
|
+
new GetObjectAttributesCommand({
|
|
40
|
+
Bucket,
|
|
41
|
+
Key,
|
|
42
|
+
ObjectAttributes: ['ETag', 'Checksum', 'ObjectParts', 'StorageClass', 'ObjectSize']
|
|
43
|
+
})
|
|
44
|
+
);
|
|
45
|
+
return resp;
|
|
37
46
|
};
|
|
38
47
|
Worker.prototype.getMetadata.metadata = {
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
48
|
+
options: {
|
|
49
|
+
filename: {}
|
|
50
|
+
}
|
|
42
51
|
};
|
|
43
52
|
Worker.prototype.stream = async function ({ filename }) {
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
}
|
|
53
|
+
const s3Client = this.getClient();
|
|
54
|
+
const { Bucket, Key } = getParts(filename);
|
|
55
|
+
const command = new GetObjectCommand({ Bucket, Key });
|
|
56
|
+
try {
|
|
57
|
+
debug(`Streaming file s3://${Bucket}/${Key}`);
|
|
58
|
+
const response = await s3Client.send(command);
|
|
59
|
+
return { stream: response.Body };
|
|
60
|
+
} catch (e) {
|
|
61
|
+
debug(`Could not stream filename:${filename}`);
|
|
62
|
+
throw e;
|
|
63
|
+
}
|
|
56
64
|
};
|
|
57
65
|
Worker.prototype.stream.metadata = {
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
66
|
+
options: {
|
|
67
|
+
filename: {}
|
|
68
|
+
}
|
|
61
69
|
};
|
|
62
70
|
Worker.prototype.copy = async function ({ filename, target }) {
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
return s3Client.send(command);
|
|
71
|
+
if (filename.startsWith('s3://') || filename.startsWith('r2://')) {
|
|
72
|
+
//we're fine
|
|
73
|
+
} else {
|
|
74
|
+
throw new Error('Cowardly not copying a file not from s3 -- use put instead');
|
|
75
|
+
}
|
|
76
|
+
const s3Client = this.getClient();
|
|
77
|
+
const { Bucket, Key } = getParts(target);
|
|
78
|
+
debug(`Copying ${filename} to ${JSON.stringify({ Bucket, Key })}}`);
|
|
79
|
+
const command = new CopyObjectCommand({
|
|
80
|
+
CopySource: filename.slice(4), // remove the s3:/
|
|
81
|
+
Bucket,
|
|
82
|
+
Key
|
|
83
|
+
});
|
|
84
|
+
return s3Client.send(command);
|
|
78
85
|
};
|
|
79
86
|
Worker.prototype.copy.metadata = {
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
87
|
+
options: {
|
|
88
|
+
filename: {},
|
|
89
|
+
target: {}
|
|
90
|
+
}
|
|
84
91
|
};
|
|
85
92
|
Worker.prototype.move = async function ({ filename, target }) {
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
93
|
+
await this.copy({ filename, target });
|
|
94
|
+
await this.remove({ filename });
|
|
95
|
+
return { filename: target };
|
|
89
96
|
};
|
|
90
97
|
Worker.prototype.move.metadata = {
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
98
|
+
options: {
|
|
99
|
+
filename: {},
|
|
100
|
+
target: {}
|
|
101
|
+
}
|
|
95
102
|
};
|
|
96
103
|
Worker.prototype.remove = async function ({ filename }) {
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
104
|
+
const s3Client = this.getClient();
|
|
105
|
+
const { Bucket, Key } = getParts(filename);
|
|
106
|
+
const command = new DeleteObjectCommand({ Bucket, Key });
|
|
107
|
+
return s3Client.send(command);
|
|
101
108
|
};
|
|
102
109
|
Worker.prototype.remove.metadata = {
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
110
|
+
options: {
|
|
111
|
+
filename: {}
|
|
112
|
+
}
|
|
106
113
|
};
|
|
107
114
|
Worker.prototype.download = async function ({ filename }) {
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
});
|
|
122
|
-
fileStream.on('error', reject);
|
|
115
|
+
const file = filename.split('/').pop();
|
|
116
|
+
const localPath = await getTempFilename({ targetFilename: file });
|
|
117
|
+
const s3Client = this.getClient();
|
|
118
|
+
const { Bucket, Key } = getParts(filename);
|
|
119
|
+
const command = new GetObjectCommand({ Bucket, Key });
|
|
120
|
+
debug(`Downloading ${file} to ${localPath}`);
|
|
121
|
+
const response = await s3Client.send(command);
|
|
122
|
+
const fileStream = fs.createWriteStream(localPath);
|
|
123
|
+
response.Body.pipe(fileStream);
|
|
124
|
+
return new Promise((resolve, reject) => {
|
|
125
|
+
fileStream.on('finish', async () => {
|
|
126
|
+
const { size } = await fs.promises.stat(localPath);
|
|
127
|
+
resolve({ size, filename: localPath });
|
|
123
128
|
});
|
|
129
|
+
fileStream.on('error', reject);
|
|
130
|
+
});
|
|
124
131
|
};
|
|
125
132
|
Worker.prototype.download.metadata = {
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
133
|
+
options: {
|
|
134
|
+
filename: {}
|
|
135
|
+
}
|
|
129
136
|
};
|
|
130
137
|
Worker.prototype.put = async function (options) {
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
return s3Client.send(command);
|
|
138
|
+
const { filename, directory } = options;
|
|
139
|
+
if (!filename) throw new Error('Local filename required');
|
|
140
|
+
if (directory?.indexOf('s3://') !== 0 && directory?.indexOf('r2://') !== 0)
|
|
141
|
+
throw new Error(`directory path must start with s3:// or r2://, is ${directory}`);
|
|
142
|
+
const file = options.file || filename.split('/').pop();
|
|
143
|
+
const parts = directory.split('/');
|
|
144
|
+
const Bucket = parts[2];
|
|
145
|
+
const Key = parts.slice(3).filter(Boolean).concat(file).join('/');
|
|
146
|
+
const Body = fs.createReadStream(filename);
|
|
147
|
+
const ContentType = mime.lookup(file);
|
|
148
|
+
debug(`Putting ${filename} to ${JSON.stringify({ Bucket, Key, ContentType })}}`);
|
|
149
|
+
const s3Client = this.getClient();
|
|
150
|
+
const command = new PutObjectCommand({
|
|
151
|
+
Bucket,
|
|
152
|
+
Key,
|
|
153
|
+
Body,
|
|
154
|
+
ContentType
|
|
155
|
+
});
|
|
156
|
+
return s3Client.send(command);
|
|
151
157
|
};
|
|
152
158
|
Worker.prototype.put.metadata = {
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
159
|
+
options: {
|
|
160
|
+
filename: {},
|
|
161
|
+
directory: { description: 'Directory to put file, e.g. s3://foo-bar/dir/xyz' },
|
|
162
|
+
file: { description: 'Name of file, defaults to the filename' }
|
|
163
|
+
}
|
|
158
164
|
};
|
|
159
165
|
Worker.prototype.write = async function (options) {
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
return s3Client.send(command);
|
|
166
|
+
const { directory, file, content } = options;
|
|
167
|
+
if (!directory?.indexOf('s3://') === 0) throw new Error('directory must start with s3://');
|
|
168
|
+
const parts = directory.split('/');
|
|
169
|
+
const Bucket = parts[2];
|
|
170
|
+
const Key = parts.slice(3).filter(Boolean).concat(file).join('/');
|
|
171
|
+
const Body = content;
|
|
172
|
+
debug(`Writing content of length ${content.length} to ${JSON.stringify({ Bucket, Key })}}`);
|
|
173
|
+
const s3Client = this.getClient();
|
|
174
|
+
const ContentType = mime.lookup(file);
|
|
175
|
+
const command = new PutObjectCommand({
|
|
176
|
+
Bucket,
|
|
177
|
+
Key,
|
|
178
|
+
Body,
|
|
179
|
+
ContentType
|
|
180
|
+
});
|
|
181
|
+
return s3Client.send(command);
|
|
177
182
|
};
|
|
178
183
|
Worker.prototype.write.metadata = {
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
+
options: {
|
|
185
|
+
directory: { description: 'Directory to put file, e.g. s3://foo-bar/dir/xyz' },
|
|
186
|
+
file: { description: 'Name of file, defaults to the filename' },
|
|
187
|
+
content: { description: 'Contents of file' }
|
|
188
|
+
}
|
|
184
189
|
};
|
|
185
190
|
Worker.prototype.list = async function ({ directory, start, end, raw }) {
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
const output = []
|
|
203
|
-
.concat((CommonPrefixes || []).map((f) => ({
|
|
191
|
+
if (!directory) throw new Error('directory is required');
|
|
192
|
+
let dir = directory;
|
|
193
|
+
while (dir.slice(-1) === '/') dir = dir.slice(0, -1);
|
|
194
|
+
const { Bucket, Key: Prefix } = getParts(dir);
|
|
195
|
+
const s3Client = this.getClient();
|
|
196
|
+
const command = new ListObjectsV2Command({
|
|
197
|
+
Bucket,
|
|
198
|
+
Prefix: `${Prefix}/`,
|
|
199
|
+
Delimiter: '/'
|
|
200
|
+
});
|
|
201
|
+
const { Contents: files, CommonPrefixes } = await s3Client.send(command);
|
|
202
|
+
if (raw) return files;
|
|
203
|
+
// debug('Prefixes:', { CommonPrefixes });
|
|
204
|
+
const output = []
|
|
205
|
+
.concat(
|
|
206
|
+
(CommonPrefixes || []).map((f) => ({
|
|
204
207
|
name: f.Prefix.slice(Prefix.length + 1, -1),
|
|
205
208
|
type: 'directory'
|
|
206
|
-
|
|
207
|
-
|
|
209
|
+
}))
|
|
210
|
+
)
|
|
211
|
+
.concat(
|
|
212
|
+
(files || [])
|
|
208
213
|
.filter(({ LastModified }) => {
|
|
209
|
-
|
|
214
|
+
if (start && new Date(LastModified) < start) {
|
|
210
215
|
return false;
|
|
211
|
-
|
|
212
|
-
else if (end && new Date(LastModified) > end) {
|
|
216
|
+
} else if (end && new Date(LastModified) > end) {
|
|
213
217
|
return false;
|
|
214
|
-
|
|
215
|
-
else {
|
|
218
|
+
} else {
|
|
216
219
|
return true;
|
|
217
|
-
|
|
218
|
-
|
|
220
|
+
}
|
|
221
|
+
})
|
|
219
222
|
.map(({ Key, Size, LastModified }) => ({
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
223
|
+
name: Key.slice(Prefix.length + 1),
|
|
224
|
+
type: 'file',
|
|
225
|
+
size: Size,
|
|
226
|
+
modifiedAt: new Date(LastModified).toISOString()
|
|
227
|
+
}))
|
|
228
|
+
);
|
|
229
|
+
return output;
|
|
226
230
|
};
|
|
227
231
|
Worker.prototype.list.metadata = {
|
|
228
|
-
|
|
229
|
-
|
|
232
|
+
options: {
|
|
233
|
+
directory: { required: true }
|
|
234
|
+
}
|
|
235
|
+
};
|
|
236
|
+
Worker.prototype.analyze = async function ({ directory }) {
|
|
237
|
+
if (!directory) throw new Error('directory is required');
|
|
238
|
+
let dir = directory;
|
|
239
|
+
while (dir.slice(-1) === '/') dir = dir.slice(0, -1);
|
|
240
|
+
const { Bucket, Key } = getParts(dir);
|
|
241
|
+
const s3Client = this.getClient();
|
|
242
|
+
let Prefix = '';
|
|
243
|
+
if (Key) Prefix = `${Key}/`;
|
|
244
|
+
const dirsSeen = new Set();
|
|
245
|
+
let fileCount = 0;
|
|
246
|
+
let firstModified = null;
|
|
247
|
+
let lastModified = null;
|
|
248
|
+
let firstTime = null;
|
|
249
|
+
let lastTime = null;
|
|
250
|
+
const postfixCounts = Object.create(null);
|
|
251
|
+
let ContinuationToken = undefined;
|
|
252
|
+
do {
|
|
253
|
+
const result = await s3Client.send(
|
|
254
|
+
new ListObjectsV2Command({
|
|
255
|
+
Bucket,
|
|
256
|
+
Prefix,
|
|
257
|
+
ContinuationToken
|
|
258
|
+
})
|
|
259
|
+
);
|
|
260
|
+
for (const content of result.Contents || []) {
|
|
261
|
+
const objectKey = content.Key;
|
|
262
|
+
let rel = Prefix ? (objectKey.startsWith(Prefix) ? objectKey.slice(Prefix.length) : objectKey) : objectKey;
|
|
263
|
+
if (!rel) continue;
|
|
264
|
+
const isFolderMarker = rel.endsWith('/');
|
|
265
|
+
const parts = rel.replace(/\/$/, '').split('/').filter(Boolean);
|
|
266
|
+
for (let i = 0; i < parts.length - 1; i++) {
|
|
267
|
+
dirsSeen.add(parts.slice(0, i + 1).join('/'));
|
|
268
|
+
}
|
|
269
|
+
if (isFolderMarker) {
|
|
270
|
+
if (parts.length) dirsSeen.add(parts.join('/'));
|
|
271
|
+
continue;
|
|
272
|
+
}
|
|
273
|
+
fileCount++;
|
|
274
|
+
const postfix = getFilePostfix(objectKey);
|
|
275
|
+
postfixCounts[postfix] = (postfixCounts[postfix] || 0) + 1;
|
|
276
|
+
const mtime = new Date(content.LastModified).getTime();
|
|
277
|
+
const modifiedAt = new Date(content.LastModified).toISOString();
|
|
278
|
+
const filename = `${this.prefix}://${Bucket}/${objectKey}`;
|
|
279
|
+
if (firstTime === null || mtime < firstTime) {
|
|
280
|
+
firstTime = mtime;
|
|
281
|
+
firstModified = { filename, modifiedAt };
|
|
282
|
+
}
|
|
283
|
+
if (lastTime === null || mtime > lastTime) {
|
|
284
|
+
lastTime = mtime;
|
|
285
|
+
lastModified = { filename, modifiedAt };
|
|
286
|
+
}
|
|
230
287
|
}
|
|
288
|
+
ContinuationToken = result.IsTruncated ? result.NextContinuationToken : undefined;
|
|
289
|
+
} while (ContinuationToken);
|
|
290
|
+
return {
|
|
291
|
+
fileCount,
|
|
292
|
+
directoryCount: dirsSeen.size,
|
|
293
|
+
postfixCounts,
|
|
294
|
+
firstModified: fileCount ? firstModified : null,
|
|
295
|
+
lastModified: fileCount ? lastModified : null
|
|
296
|
+
};
|
|
297
|
+
};
|
|
298
|
+
Worker.prototype.analyze.metadata = {
|
|
299
|
+
options: {
|
|
300
|
+
directory: { required: true }
|
|
301
|
+
}
|
|
231
302
|
};
|
|
232
303
|
/* List everything with the prefix */
|
|
233
304
|
Worker.prototype.listAll = async function (options) {
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
files.push(...newFiles);
|
|
271
|
-
ContinuationToken = result.NextContinuationToken;
|
|
272
|
-
} while (ContinuationToken);
|
|
273
|
-
return files;
|
|
305
|
+
const { directory } = options;
|
|
306
|
+
if (!directory) throw new Error('directory is required');
|
|
307
|
+
let dir = directory;
|
|
308
|
+
const start = options.start && relativeDate(options.start);
|
|
309
|
+
const end = options.end && relativeDate(options.end);
|
|
310
|
+
while (dir.slice(-1) === '/') dir = dir.slice(0, -1);
|
|
311
|
+
const { Bucket, Key } = getParts(dir);
|
|
312
|
+
const s3Client = this.getClient();
|
|
313
|
+
const files = [];
|
|
314
|
+
let ContinuationToken = null;
|
|
315
|
+
let Prefix = null;
|
|
316
|
+
if (Key) Prefix = `${Key}/`;
|
|
317
|
+
do {
|
|
318
|
+
const command = new ListObjectsV2Command({
|
|
319
|
+
Bucket,
|
|
320
|
+
Prefix,
|
|
321
|
+
ContinuationToken
|
|
322
|
+
// Delimiter: '/',
|
|
323
|
+
});
|
|
324
|
+
debug(`Sending List command with prefix ${Prefix} with ContinuationToken ${ContinuationToken}`);
|
|
325
|
+
const result = await s3Client.send(command);
|
|
326
|
+
const newFiles =
|
|
327
|
+
result.Contents?.filter(({ LastModified }) => {
|
|
328
|
+
if (start && new Date(LastModified) < start) {
|
|
329
|
+
return false;
|
|
330
|
+
} else if (end && new Date(LastModified) > end) {
|
|
331
|
+
return false;
|
|
332
|
+
} else {
|
|
333
|
+
return true;
|
|
334
|
+
}
|
|
335
|
+
})?.map((d) => `${this.prefix}://${Bucket}/${d.Key}`) || [];
|
|
336
|
+
debug(`Retrieved ${newFiles.length} new files, total ${files.length},sample ${newFiles.slice(0, 3).join(',')}`);
|
|
337
|
+
files.push(...newFiles);
|
|
338
|
+
ContinuationToken = result.NextContinuationToken;
|
|
339
|
+
} while (ContinuationToken);
|
|
340
|
+
return files;
|
|
274
341
|
};
|
|
275
342
|
Worker.prototype.listAll.metadata = {
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
343
|
+
options: {
|
|
344
|
+
directory: { required: true }
|
|
345
|
+
}
|
|
279
346
|
};
|
|
280
347
|
Worker.prototype.moveAll = async function ({ directory, targetDirectory }) {
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
return Promise.all(configs.map(({ filename, target }) => limitedMethod(async () => this.move({ filename, target }))));
|
|
348
|
+
if (!directory || !targetDirectory) throw new Error('directory and targetDirectory required');
|
|
349
|
+
const files = await this.listAll({ directory });
|
|
350
|
+
const configs = files.map((d) => ({
|
|
351
|
+
filename: d,
|
|
352
|
+
target: d.replace(directory, targetDirectory)
|
|
353
|
+
}));
|
|
354
|
+
const pLimit = await import('p-limit');
|
|
355
|
+
const limitedMethod = pLimit.default(10);
|
|
356
|
+
return Promise.all(configs.map(({ filename, target }) => limitedMethod(async () => this.move({ filename, target }))));
|
|
291
357
|
};
|
|
292
358
|
Worker.prototype.moveAll.metadata = {
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
359
|
+
options: {
|
|
360
|
+
directory: { required: true },
|
|
361
|
+
targetDirectory: { required: true }
|
|
362
|
+
}
|
|
297
363
|
};
|
|
298
364
|
Worker.prototype.stat = async function ({ filename }) {
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
const {
|
|
365
|
+
if (!filename) throw new Error('filename is required');
|
|
366
|
+
const s3Client = this.getClient();
|
|
367
|
+
const { Bucket, Key } = getParts(filename);
|
|
368
|
+
const command = new HeadObjectCommand({ Bucket, Key });
|
|
369
|
+
const response = await s3Client.send(command);
|
|
370
|
+
const {
|
|
306
371
|
// "AcceptRanges": "bytes",
|
|
307
372
|
ContentLength, // : "3191",
|
|
308
373
|
ContentType, // : "image/jpeg",
|
|
@@ -310,20 +375,20 @@ Worker.prototype.stat = async function ({ filename }) {
|
|
|
310
375
|
LastModified // : "2016-12-15T01:19:41.000Z",
|
|
311
376
|
// Metadata": {},
|
|
312
377
|
// VersionId": "null"
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
378
|
+
} = response;
|
|
379
|
+
const modifiedAt = new Date(LastModified);
|
|
380
|
+
const createdAt = modifiedAt; // Same for S3
|
|
381
|
+
const size = parseInt(ContentLength, 10);
|
|
382
|
+
return {
|
|
383
|
+
createdAt,
|
|
384
|
+
modifiedAt,
|
|
385
|
+
contentType: ContentType,
|
|
386
|
+
size
|
|
387
|
+
};
|
|
323
388
|
};
|
|
324
389
|
Worker.prototype.stat.metadata = {
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
390
|
+
options: {
|
|
391
|
+
filename: {}
|
|
392
|
+
}
|
|
328
393
|
};
|
|
329
394
|
export default Worker;
|
package/file/tools.js
CHANGED
|
@@ -300,6 +300,18 @@ function makeStrings(o) {
|
|
|
300
300
|
return a;
|
|
301
301
|
}, {});
|
|
302
302
|
}
|
|
303
|
+
/** Basename postfix with leading dot, e.g. `.txt`, `.csv.gz`; empty string if none. */
|
|
304
|
+
function getFilePostfix(filename) {
|
|
305
|
+
const base = path.basename(filename).toLowerCase();
|
|
306
|
+
if (!base || !base.includes('.')) return '';
|
|
307
|
+
if (base.endsWith('.gz')) {
|
|
308
|
+
const withoutGz = base.slice(0, -3);
|
|
309
|
+
const i = withoutGz.lastIndexOf('.');
|
|
310
|
+
if (i >= 0) return `${withoutGz.slice(i)}.gz`;
|
|
311
|
+
return '.gz';
|
|
312
|
+
}
|
|
313
|
+
return base.slice(base.lastIndexOf('.'));
|
|
314
|
+
}
|
|
303
315
|
function appendPostfix(filename, postfix) {
|
|
304
316
|
const filenameParts = filename.split('/');
|
|
305
317
|
const fileParts = filenameParts
|
|
@@ -331,6 +343,7 @@ export { getTempDir };
|
|
|
331
343
|
export { getBatchTransform };
|
|
332
344
|
export { getDebatchTransform };
|
|
333
345
|
export { getFile };
|
|
346
|
+
export { getFilePostfix };
|
|
334
347
|
export { getManifest };
|
|
335
348
|
export { getPacketFiles };
|
|
336
349
|
export { getStringArray };
|
|
@@ -348,6 +361,7 @@ export default {
|
|
|
348
361
|
getBatchTransform,
|
|
349
362
|
getDebatchTransform,
|
|
350
363
|
getFile,
|
|
364
|
+
getFilePostfix,
|
|
351
365
|
getManifest,
|
|
352
366
|
getPacketFiles,
|
|
353
367
|
getStringArray,
|
package/skills/timeline/SKILL.md
ADDED
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: timeline
|
|
3
|
+
description: Describes Engine9 timeline file formats (Timeline ID vs Timeline Raw) and how to construct them using only the utilities exported by @engine9/input-tools. Use when creating or transforming timeline-shaped data in plugins or ingestion code.
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
## Timeline files in Engine9
|
|
7
|
+
|
|
8
|
+
Use this skill whenever you are:
|
|
9
|
+
|
|
10
|
+
- **Designing timeline outputs** from plugins or ingestion code.
|
|
11
|
+
- **Producing files** that will eventually land in an Engine9 `timeline`-style table (for example via server workers).
|
|
12
|
+
|
|
13
|
+
Engine9 models person-level activity as **timeline entries**. A timeline entry is a single fact about a person at a point in time (email send, open, click, transaction, signup, etc.), identified by:
|
|
14
|
+
|
|
15
|
+
- **`ts`**: timestamp of the event.
|
|
16
|
+
- **`entry_type_id`**: numeric type from `TIMELINE_ENTRY_TYPES` (`input-tools/timelineTypes.js`).
|
|
17
|
+
- **`person_id`**: internal person identifier.
|
|
18
|
+
- **`input_id`**: which input/source this event came from.
|
|
19
|
+
- **`id`**: a deterministic UUID derived from the above (via `getTimelineEntryUUID`).
|
|
20
|
+
|
|
21
|
+
There are **two main on-disk timeline file shapes**:
|
|
22
|
+
|
|
23
|
+
- **Timeline ID files** – already resolved to `person_id` and `id`. These are ready to load into the `timeline` table.
|
|
24
|
+
- **Timeline Raw files** – do **not** contain `person_id`. They must go through person resolution and ID assignment before they can be loaded.
|
|
25
|
+
|
|
26
|
+
## Timeline ID files
|
|
27
|
+
|
|
28
|
+
**Use when** you want data that is ready to be:
|
|
29
|
+
|
|
30
|
+
- Loaded into a downstream `timeline` table.
|
|
31
|
+
- Joined against plugin-specific detail tables.
|
|
32
|
+
- De-duplicated by `id` (UUID).
|
|
33
|
+
|
|
34
|
+
### Core shape
|
|
35
|
+
|
|
36
|
+
Timeline ID files are typically produced by:
|
|
37
|
+
|
|
38
|
+
- Plugin or ingestion code that has already resolved a stable `person_id` and `input_id`.
|
|
39
|
+
- Mappers that call `getEntryTypeId` and `getTimelineEntryUUID` from `@engine9/input-tools`.
|
|
40
|
+
|
|
41
|
+
**Minimum required fields** for a Timeline ID file that downstream workers will accept:
|
|
42
|
+
|
|
43
|
+
- **`id`**: UUID for the timeline entry.
|
|
44
|
+
- Generated by `getTimelineEntryUUID`, or provided as a stable `remote_entry_uuid`.
|
|
45
|
+
- In `InputWorker.id`, `appendTimelineId` writes this into the `id` column.
|
|
46
|
+
- **`ts`**: timestamp (string or number) that can be parsed into a `Date`.
|
|
47
|
+
- **`person_id`**: internal numeric person id.
|
|
48
|
+
- **`entry_type_id`**: integer from `TIMELINE_ENTRY_TYPES`.
|
|
49
|
+
|
|
50
|
+
**Common optional fields**:
|
|
51
|
+
|
|
52
|
+
- **`source_code_id`**: numeric source code identifier.
|
|
53
|
+
- **`email_domain`**: lower-cased domain, often derived from `email`.
|
|
54
|
+
- Any number of **extra columns** (detail fields); these can be copied into plugin-specific detail tables.
|
|
55
|
+
|
|
56
|
+
The downstream `timeline` table schema usually includes:
|
|
57
|
+
|
|
58
|
+
- A primary key `id` column (UUID stored as text).
|
|
59
|
+
- A timestamp column `ts` (millis since epoch).
|
|
60
|
+
- Integer columns `entry_type_id` and `person_id`.
|
|
61
|
+
- Optional columns such as `source_code_id` and `email_domain`.
|
|
62
|
+
|
|
63
|
+
### How to construct Timeline ID files with input-tools
|
|
64
|
+
|
|
65
|
+
When authoring a Timeline ID-producing job or plugin using `@engine9/input-tools`:
|
|
66
|
+
|
|
67
|
+
- **Always include** `id`, `ts`, `person_id`, and `entry_type_id` on each emitted row.
|
|
68
|
+
- **Prefer numeric `entry_type_id`**, but you may also keep a string `entry_type` for debugging; resolution between the two happens via `TIMELINE_ENTRY_TYPES`, `getEntryTypeId`, and `getEntryType`.
|
|
69
|
+
- **Keep `input_id` stable** per logical input stream; `getTimelineEntryUUID` uses it as the UUID namespace when generating `id`.
|
|
70
|
+
|
|
71
|
+
## Timeline Raw files
|
|
72
|
+
|
|
73
|
+
**Use when** you have raw events from an external system and **cannot yet** assign `person_id` but still want to capture structured activity.
|
|
74
|
+
|
|
75
|
+
Examples:
|
|
76
|
+
|
|
77
|
+
- Raw web or email events that only know an email or other external identifier.
|
|
78
|
+
- Logs from external APIs where person resolution happens later in the pipeline.
|
|
79
|
+
|
|
80
|
+
### Core shape
|
|
81
|
+
|
|
82
|
+
Timeline Raw files:
|
|
83
|
+
|
|
84
|
+
- **Must not contain** `person_id` (by definition for this skill).
|
|
85
|
+
- **May or may not contain** `id`.
|
|
86
|
+
- If they do contain an `id`, it is usually an external event ID or `remote_entry_uuid`, not necessarily the final Engine9 `id`.
|
|
87
|
+
- **Should contain enough information** to derive:
|
|
88
|
+
- A timestamp: **`ts`** (or a field that is mapped to `ts`).
|
|
89
|
+
- An entry type: **`entry_type`** (string) or **`entry_type_id`** (numeric).
|
|
90
|
+
- A person identifier that can be resolved later (e.g. `remote_person_id`, `email`, or similar).
|
|
91
|
+
|
|
92
|
+
Typical fields you will see:
|
|
93
|
+
|
|
94
|
+
- **`ts`** or a source-specific timestamp (later mapped to `ts`).
|
|
95
|
+
- **`entry_type`** or **`entry_type_id`** (e.g. `'EMAIL_UNSUBSCRIBE'`, `'EMAIL_OPEN'`, etc.).
|
|
96
|
+
- **Contact fields**: `email`, `remote_person_id`, phone number, etc.
|
|
97
|
+
- **Source metadata**: `account_id`, `plugin_id`, `url`, `user_agent`, `ip_address`, etc.
|
|
98
|
+
|
|
99
|
+
For example, a plugin may map an inbound event into a row with:
|
|
100
|
+
|
|
101
|
+
- `ts`, `account_id`, `entry_type_id`, `email`, `email_domain`, `url`, `user_agent`
|
|
102
|
+
|
|
103
|
+
and **no `person_id`** yet.
|
|
104
|
+
|
|
105
|
+
### Converting Timeline Raw → Timeline ID
|
|
106
|
+
|
|
107
|
+
The usual pathway for Raw → ID is:
|
|
108
|
+
|
|
109
|
+
1. **Map raw events into a timeline-shaped object** (with `ts`, `entry_type`/`entry_type_id`, and contact info).
|
|
110
|
+
2. **Resolve people** (outside of input-tools):
|
|
111
|
+
- Use your application’s person resolution or a server worker to:
|
|
112
|
+
- Look up or create `person` rows.
|
|
113
|
+
- Attach a canonical `person_id` to each row.
|
|
114
|
+
3. **Assign timeline IDs** with input-tools:
|
|
115
|
+
- Use `getEntryTypeId` (if needed) to ensure `entry_type_id` is set from `TIMELINE_ENTRY_TYPES` when only `entry_type` is present.
|
|
116
|
+
- Call `getTimelineEntryUUID` to:
|
|
117
|
+
- Require `ts`, `entry_type_id`, `input_id`, and `person_id`.
|
|
118
|
+
- Produce a deterministic, sortable UUID for `id`.
|
|
119
|
+
4. **Write out a Timeline ID file** (for example, parquet or CSV) with the full set of fields (`id`, `ts`, `person_id`, `entry_type_id`, optional `source_code_id`, etc.).
|
|
120
|
+
|
|
121
|
+
## Choosing between Timeline ID and Timeline Raw
|
|
122
|
+
|
|
123
|
+
- **Choose Timeline ID files when**:
|
|
124
|
+
- You can resolve `person_id` and `input_id` in the current process.
|
|
125
|
+
- You want files that are **immediately loadable** into a `timeline` table.
|
|
126
|
+
- You need **deduplication** by a stable `id`.
|
|
127
|
+
|
|
128
|
+
- **Choose Timeline Raw files when**:
|
|
129
|
+
- You are at the **edge of the system** (plugins, collectors, ETL jobs) and only have partial identity information.
|
|
130
|
+
- You plan a **later enrichment step** that will attach `person_id` and compute final `id` values.
|
|
131
|
+
- You want to keep the ingestion simpler and defer canonicalization.
|
|
132
|
+
|
|
133
|
+
In practice:
|
|
134
|
+
|
|
135
|
+
- **Plugins and edge collectors** often emit **Timeline Raw** shaped data first.
|
|
136
|
+
- **Downstream services or server workers** then:
|
|
137
|
+
- Resolve people (`person_id`).
|
|
138
|
+
- Generate `id` via `getTimelineEntryUUID`.
|
|
139
|
+
- Persist **Timeline ID** files and load them into the `timeline` and detail tables.
|
|
140
|
+
|
|
141
|
+
## Reference helpers
|
|
142
|
+
|
|
143
|
+
When working with any timeline format, prefer the utilities in `@engine9/input-tools`:
|
|
144
|
+
|
|
145
|
+
- **`TIMELINE_ENTRY_TYPES`** (`timelineTypes.js`): bidirectional map between string entry types and numeric `entry_type_id`.
|
|
146
|
+
- **`getEntryTypeId`**: resolve `entry_type` → `entry_type_id` with validation.
|
|
147
|
+
- **`getEntryType`**: resolve `entry_type_id` → `entry_type`.
|
|
148
|
+
- **`getTimelineEntryUUID`**: generate or normalize `id` given `ts`, `entry_type_id`, `input_id`, and `person_id`, respecting `remote_entry_uuid` / `remote_entry_id` when present.
|
|
149
|
+
- **`uuidIsValid`**: validate that a string is a proper UUID.
|
|
150
|
+
|
|
151
|
+
Use these helpers instead of hard-coding IDs or types whenever you construct timeline rows, whether Raw or ID.
|
|
152
|
+
|