@engine9/input-tools 2.0.7 → 2.0.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/ForEachEntry.js +1 -1
- package/README.md +4 -3
- package/file/FileUtilities.js +49 -1
- package/file/S3.js +317 -256
- package/package.json +2 -2
- package/skills/timeline/SKILL.md +152 -0
- package/skills/transaction-mapping/SKILL.md +1 -1
package/ForEachEntry.js
CHANGED
|
@@ -12,7 +12,7 @@ import FileUtilities from './file/FileUtilities.js';
|
|
|
12
12
|
import { getTempFilename, getBatchTransform, getFile, streamPacket } from './file/tools.js';
|
|
13
13
|
const { Transform, Writable } = nodestream;
|
|
14
14
|
const { pipeline } = promises;
|
|
15
|
-
const debug = debug$0('@engine9
|
|
15
|
+
const debug = debug$0('@engine9/input-tools');
|
|
16
16
|
const debugThrottle = throttle(1000, debug, { noLeading: false, noTrailing: false });
|
|
17
17
|
class ForEachEntry {
|
|
18
18
|
constructor({ accountId } = {}) {
|
package/README.md
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
# @engine9/input-tools
|
|
2
|
+
|
|
2
3
|
Tools for reading, writing, and management of Engine9 style inputs.
|
|
3
4
|
|
|
4
|
-
The @engine9
|
|
5
|
-
records, appending statistics and other zip files.
|
|
6
|
-
third parties to interact with engine9 instances.
|
|
5
|
+
The @engine9/input-tools are utilities for iterating through
|
|
6
|
+
records, appending statistics and other zip files. They're intended to be used by
|
|
7
|
+
third parties to interact with engine9 instances.
|
package/file/FileUtilities.js
CHANGED
|
@@ -16,7 +16,7 @@ const fsp = fs.promises;
|
|
|
16
16
|
const { Readable, Transform, PassThrough, Writable } = nodestream;
|
|
17
17
|
const { pipeline } = promises;
|
|
18
18
|
|
|
19
|
-
const debug = debug$0('@engine9
|
|
19
|
+
const debug = debug$0('@engine9/file');
|
|
20
20
|
const { getXlsxStream } = xlstream;
|
|
21
21
|
|
|
22
22
|
function Worker({ accountId }) {
|
|
@@ -657,6 +657,54 @@ Worker.prototype.list.metadata = {
|
|
|
657
657
|
directory: { required: true }
|
|
658
658
|
}
|
|
659
659
|
};
|
|
660
|
+
/**
 * Summarize a directory tree: how many files and sub-directories it holds and
 * which files carry the oldest and newest modification times.
 *
 * `s3://` and `r2://` directories are delegated to the matching remote worker's
 * own `analyze`; anything else is walked recursively on the local filesystem.
 *
 * @param {object} options
 * @param {string} options.directory - Local path or s3:// / r2:// URI to analyze.
 * @returns {Promise<{fileCount: number, directoryCount: number,
 *   firstModified: ?{filename: string, modifiedAt: string},
 *   lastModified: ?{filename: string, modifiedAt: string}}>}
 *   `firstModified` / `lastModified` are null when no file was found.
 * @throws {Error} When `directory` is missing.
 */
Worker.prototype.analyze = async function ({ directory }) {
  if (!directory) throw new Error('directory is required');

  const isS3 = directory.startsWith('s3://');
  const isR2 = directory.startsWith('r2://');
  if (isS3 || isR2) {
    const RemoteWorker = isR2 ? R2Worker : S3Worker;
    const remote = new RemoteWorker(this);
    return remote.analyze({ directory });
  }

  const totals = { files: 0, directories: 0 };
  // Each tracker keeps the numeric mtime used for comparison next to the reported summary.
  let oldest = null;
  let newest = null;

  const visit = async (current) => {
    const children = await fsp.readdir(current, { withFileTypes: true });
    for (const child of children) {
      const childPath = path.join(current, child.name);
      if (child.isDirectory()) {
        totals.directories += 1;
        await visit(childPath);
        continue;
      }
      totals.files += 1;
      const info = await fsp.stat(childPath);
      const time = info.mtimeMs;
      const modifiedAt = new Date(info.mtime).toISOString();
      // Strict comparisons: on a tie the first file encountered is kept.
      if (oldest === null || time < oldest.time) {
        oldest = { time, summary: { filename: childPath, modifiedAt } };
      }
      if (newest === null || time > newest.time) {
        newest = { time, summary: { filename: childPath, modifiedAt } };
      }
    }
  };

  await visit(directory);

  return {
    fileCount: totals.files,
    directoryCount: totals.directories,
    firstModified: oldest ? oldest.summary : null,
    lastModified: newest ? newest.summary : null
  };
};
Worker.prototype.analyze.metadata = {
  options: {
    directory: { required: true }
  }
};
|
|
660
708
|
Worker.prototype.listAll = async function ({ directory, start: s, end: e }) {
|
|
661
709
|
if (!directory) throw new Error('directory is required');
|
|
662
710
|
let start = null;
|
package/file/S3.js
CHANGED
|
@@ -1,308 +1,369 @@
|
|
|
1
|
-
import debug$0 from
|
|
2
|
-
import fs from
|
|
3
|
-
import withDb from
|
|
4
|
-
import clientS3 from
|
|
5
|
-
import { getTempFilename, relativeDate } from
|
|
6
|
-
const debug = debug$0('@engine9
|
|
1
|
+
import debug$0 from 'debug';
|
|
2
|
+
import fs from 'node:fs';
|
|
3
|
+
import withDb from 'mime-type/with-db';
|
|
4
|
+
import clientS3 from '@aws-sdk/client-s3';
|
|
5
|
+
import { getTempFilename, relativeDate } from './tools.js';
|
|
6
|
+
const debug = debug$0('@engine9/input/S3');
|
|
7
7
|
const { mimeType: mime } = withDb;
|
|
8
|
-
const {
|
|
8
|
+
const {
|
|
9
|
+
S3Client,
|
|
10
|
+
CopyObjectCommand,
|
|
11
|
+
DeleteObjectCommand,
|
|
12
|
+
GetObjectCommand,
|
|
13
|
+
HeadObjectCommand,
|
|
14
|
+
GetObjectAttributesCommand,
|
|
15
|
+
PutObjectCommand,
|
|
16
|
+
ListObjectsV2Command
|
|
17
|
+
} = clientS3;
|
|
9
18
|
/**
 * S3 file worker constructor.
 *
 * Sets `prefix`, the URI scheme that `analyze` and `listAll` use when they
 * build `${prefix}://bucket/key` file names for their results.
 */
function Worker() {
  this.prefix = 's3';
}
|
|
12
21
|
/**
 * Split an `s3://bucket/key` or `r2://bucket/key` URI into its bucket and key.
 *
 * @param {string} filename - URI starting with `s3://` or `r2://`.
 * @returns {{Bucket: string, Key: string}} Bucket name and object key; `Key` is
 *   the empty string when the URI names only a bucket.
 * @throws {Error} When `filename` is empty, not a string, uses another scheme,
 *   or has no bucket name.
 */
function getParts(filename) {
  // Reject non-strings up front so callers get this error, not a TypeError from startsWith.
  if (!filename || typeof filename !== 'string') throw new Error(`Invalid filename: ${filename}`);
  if (!filename.startsWith('r2://') && !filename.startsWith('s3://')) {
    throw new Error(`Invalid filename, must start with r2:// or s3://: ${filename}`);
  }
  // Both schemes are five characters long, so one offset strips either of them.
  const [Bucket, ...keySegments] = filename.slice('s3://'.length).split('/');
  // An empty bucket (e.g. "s3:///key") can never address an object; fail here instead of in the SDK.
  if (!Bucket) throw new Error(`Invalid filename, missing bucket name: ${filename}`);
  return { Bucket, Key: keySegments.join('/') };
}
|
|
23
31
|
/**
 * Return this worker's S3 client, creating it with default configuration on
 * first use and caching it on `this.client`.
 *
 * @returns {S3Client} The cached client instance.
 */
Worker.prototype.getClient = function () {
  if (this.client) return this.client;
  this.client = new S3Client({});
  return this.client;
};
|
|
28
35
|
/**
 * Fetch object attributes (ETag, checksum, parts, storage class, size) for a file.
 *
 * @param {object} options
 * @param {string} options.filename - s3:// or r2:// URI of the object.
 * @returns {Promise<object>} The GetObjectAttributes response, unmodified.
 */
Worker.prototype.getMetadata = async function ({ filename }) {
  const client = this.getClient();
  const location = getParts(filename);
  const request = new GetObjectAttributesCommand({
    Bucket: location.Bucket,
    Key: location.Key,
    ObjectAttributes: ['ETag', 'Checksum', 'ObjectParts', 'StorageClass', 'ObjectSize']
  });
  return client.send(request);
};
Worker.prototype.getMetadata.metadata = {
  options: {
    filename: {}
  }
};
|
|
43
52
|
/**
 * Open an object for reading.
 *
 * @param {object} options
 * @param {string} options.filename - s3:// or r2:// URI of the object.
 * @returns {Promise<{stream: object}>} The GetObject response body under `stream`.
 * @throws Re-throws whatever the client raised, after logging the filename.
 */
Worker.prototype.stream = async function ({ filename }) {
  const client = this.getClient();
  const location = getParts(filename);
  const request = new GetObjectCommand({ Bucket: location.Bucket, Key: location.Key });
  try {
    debug(`Streaming file s3://${location.Bucket}/${location.Key}`);
    const { Body: stream } = await client.send(request);
    return { stream };
  } catch (err) {
    debug(`Could not stream filename:${filename}`);
    throw err;
  }
};
Worker.prototype.stream.metadata = {
  options: {
    filename: {}
  }
};
|
|
62
70
|
/**
 * Copy one remote object to another remote location.
 *
 * @param {object} options
 * @param {string} options.filename - Source s3:// or r2:// URI.
 * @param {string} options.target - Destination s3:// or r2:// URI.
 * @returns {Promise<object>} The CopyObject response.
 * @throws {Error} When `filename` is not an s3:// or r2:// URI (local files
 *   must be uploaded with `put`), or when `target` cannot be parsed.
 */
Worker.prototype.copy = async function ({ filename, target }) {
  const isRemote = typeof filename === 'string' && (filename.startsWith('s3://') || filename.startsWith('r2://'));
  if (!isRemote) {
    throw new Error('Cowardly not copying a file not from s3 -- use put instead');
  }
  const s3Client = this.getClient();
  const source = getParts(filename);
  const { Bucket, Key } = getParts(target);
  debug(`Copying ${filename} to ${JSON.stringify({ Bucket, Key })}`);
  // CopySource must be URL-encoded; encode each path segment so '/' separators
  // survive while characters such as '+', '%', '?' or non-ASCII stay literal.
  const encodedSource = [source.Bucket, ...source.Key.split('/')]
    .map((segment) => encodeURIComponent(segment))
    .join('/');
  const command = new CopyObjectCommand({
    CopySource: `/${encodedSource}`,
    Bucket,
    Key
  });
  return s3Client.send(command);
};
Worker.prototype.copy.metadata = {
  options: {
    filename: {},
    target: {}
  }
};
|
|
85
92
|
/**
 * Move an object by copying it to `target` and then removing the source.
 * The source is only removed once the copy has resolved.
 *
 * @param {object} options
 * @param {string} options.filename - Source URI.
 * @param {string} options.target - Destination URI.
 * @returns {Promise<{filename: string}>} The new location of the object.
 */
Worker.prototype.move = async function ({ filename, target }) {
  const source = { filename };
  await this.copy({ ...source, target });
  await this.remove(source);
  const moved = { filename: target };
  return moved;
};
Worker.prototype.move.metadata = {
  options: {
    filename: {},
    target: {}
  }
};
|
|
96
103
|
/**
 * Delete a single object.
 *
 * @param {object} options
 * @param {string} options.filename - s3:// or r2:// URI of the object to delete.
 * @returns {Promise<object>} The DeleteObject response.
 */
Worker.prototype.remove = async function ({ filename }) {
  const client = this.getClient();
  const location = getParts(filename);
  const request = new DeleteObjectCommand({ Bucket: location.Bucket, Key: location.Key });
  return client.send(request);
};
Worker.prototype.remove.metadata = {
  options: {
    filename: {}
  }
};
|
|
107
114
|
/**
 * Download an object to a local temporary file.
 *
 * @param {object} options
 * @param {string} options.filename - s3:// or r2:// URI of the object.
 * @returns {Promise<{size: number, filename: string}>} Size in bytes and local
 *   path of the downloaded copy.
 * @throws Rejects when the request, the download stream, the local write or
 *   the final stat fails.
 */
Worker.prototype.download = async function ({ filename }) {
  const file = filename.split('/').pop();
  const localPath = await getTempFilename({ targetFilename: file });
  const s3Client = this.getClient();
  const { Bucket, Key } = getParts(filename);
  const command = new GetObjectCommand({ Bucket, Key });
  debug(`Downloading ${file} to ${localPath}`);
  const response = await s3Client.send(command);
  const fileStream = fs.createWriteStream(localPath);
  await new Promise((resolve, reject) => {
    // pipe() does not forward source errors to the destination, so listen on
    // both ends; otherwise a dropped connection would leave this promise pending.
    response.Body.on('error', (err) => {
      fileStream.destroy();
      reject(err);
    });
    fileStream.on('error', (err) => {
      response.Body.destroy();
      reject(err);
    });
    fileStream.on('finish', resolve);
    response.Body.pipe(fileStream);
  });
  // Stat outside the event handler so a failure rejects this call instead of
  // surfacing as an unhandled rejection.
  const { size } = await fs.promises.stat(localPath);
  return { size, filename: localPath };
};
Worker.prototype.download.metadata = {
  options: {
    filename: {}
  }
};
|
|
130
137
|
/**
 * Upload a local file into a remote directory.
 *
 * @param {object} options
 * @param {string} options.filename - Path of the local file to upload.
 * @param {string} options.directory - Destination directory, s3:// or r2:// URI.
 * @param {string} [options.file] - Object name; defaults to the last path
 *   segment of `filename`.
 * @returns {Promise<object>} The PutObject response.
 * @throws {Error} When `filename` is missing or `directory` is not an s3:// or
 *   r2:// URI.
 */
Worker.prototype.put = async function (options) {
  const { filename, directory } = options;
  if (!filename) throw new Error('Local filename required');
  const isRemoteDirectory =
    typeof directory === 'string' && (directory.startsWith('s3://') || directory.startsWith('r2://'));
  if (!isRemoteDirectory) {
    throw new Error(`directory path must start with s3:// or r2://, is ${directory}`);
  }
  const file = options.file || filename.split('/').pop();
  const parts = directory.split('/');
  const Bucket = parts[2];
  // filter(Boolean) drops empty segments so trailing or doubled slashes in
  // `directory` do not end up in the key.
  const Key = parts.slice(3).filter(Boolean).concat(file).join('/');
  const Body = fs.createReadStream(filename);
  // A falsy lookup result means no known type; omit the header rather than
  // forwarding a non-string value to the SDK.
  const ContentType = mime.lookup(file) || undefined;
  debug(`Putting ${filename} to ${JSON.stringify({ Bucket, Key, ContentType })}`);
  const s3Client = this.getClient();
  const command = new PutObjectCommand({
    Bucket,
    Key,
    Body,
    ContentType
  });
  return s3Client.send(command);
};
Worker.prototype.put.metadata = {
  options: {
    filename: {},
    directory: { description: 'Directory to put file, e.g. s3://foo-bar/dir/xyz' },
    file: { description: 'Name of file, defaults to the filename' }
  }
};
|
|
159
165
|
/**
 * Write in-memory content to an object inside a remote directory.
 *
 * @param {object} options
 * @param {string} options.directory - Destination directory, s3:// or r2:// URI.
 * @param {string} options.file - Object name to create inside `directory`.
 * @param {*} options.content - Payload passed to PutObject as `Body`.
 * @returns {Promise<object>} The PutObject response.
 * @throws {Error} When `directory` is not an s3:// or r2:// URI.
 */
Worker.prototype.write = async function (options) {
  const { directory, file, content } = options;
  // The previous guard, `!directory?.indexOf('s3://') === 0`, compared a boolean
  // with 0 and could never fire. r2:// is accepted because the rest of this
  // file treats both schemes alike.
  const isRemoteDirectory =
    typeof directory === 'string' && (directory.startsWith('s3://') || directory.startsWith('r2://'));
  if (!isRemoteDirectory) {
    throw new Error(`directory must start with s3:// or r2://, is ${directory}`);
  }
  const parts = directory.split('/');
  const Bucket = parts[2];
  // filter(Boolean) drops empty segments so trailing or doubled slashes in
  // `directory` do not end up in the key.
  const Key = parts.slice(3).filter(Boolean).concat(file).join('/');
  const Body = content;
  // Optional chaining keeps the log line from throwing when content is missing.
  debug(`Writing content of length ${content?.length} to ${JSON.stringify({ Bucket, Key })}`);
  const s3Client = this.getClient();
  // A falsy lookup result means no known type; omit the header in that case.
  const ContentType = mime.lookup(file) || undefined;
  const command = new PutObjectCommand({
    Bucket,
    Key,
    Body,
    ContentType
  });
  return s3Client.send(command);
};
Worker.prototype.write.metadata = {
  options: {
    directory: { description: 'Directory to put file, e.g. s3://foo-bar/dir/xyz' },
    file: { description: 'Name of file, defaults to the filename' },
    content: { description: 'Contents of file' }
  }
};
|
|
185
190
|
/**
 * List the immediate children (sub-directories and files) of a remote directory.
 *
 * Only a single ListObjectsV2 request is issued, so the result holds at most
 * one page of entries.
 *
 * @param {object} options
 * @param {string} options.directory - s3:// or r2:// URI; trailing slashes are ignored.
 * @param {Date} [options.start] - Exclude files modified before this moment.
 * @param {Date} [options.end] - Exclude files modified after this moment.
 * @param {boolean} [options.raw] - When truthy, return the response `Contents`
 *   as-is (undefined when the listing has none).
 * @returns {Promise<Array<object>|undefined>} Directory entries
 *   `{name, type: 'directory'}` followed by file entries
 *   `{name, type: 'file', size, modifiedAt}`.
 * @throws {Error} When `directory` is missing or is not an s3:// / r2:// URI.
 */
Worker.prototype.list = async function ({ directory, start, end, raw }) {
  if (!directory) throw new Error('directory is required');
  let dir = directory;
  while (dir.slice(-1) === '/') dir = dir.slice(0, -1);
  const { Bucket, Key } = getParts(dir);
  // At the bucket root the key is empty; a bare '/' prefix would match nothing
  // because object keys do not start with a slash.
  const prefix = Key ? `${Key}/` : '';
  const s3Client = this.getClient();
  const command = new ListObjectsV2Command({
    Bucket,
    Prefix: prefix || undefined,
    Delimiter: '/'
  });
  const { Contents: files, CommonPrefixes } = await s3Client.send(command);
  if (raw) return files;
  const directories = (CommonPrefixes || []).map((f) => ({
    // Strip the listing prefix and the trailing delimiter.
    name: f.Prefix.slice(prefix.length, -1),
    type: 'directory'
  }));
  const entries = (files || [])
    .filter(({ LastModified }) => {
      const modified = new Date(LastModified);
      if (start && modified < start) return false;
      if (end && modified > end) return false;
      return true;
    })
    .map(({ Key: objectKey, Size, LastModified }) => ({
      name: objectKey.slice(prefix.length),
      type: 'file',
      size: Size,
      modifiedAt: new Date(LastModified).toISOString()
    }));
  return [...directories, ...entries];
};
Worker.prototype.list.metadata = {
  options: {
    directory: { required: true }
  }
};
|
|
236
|
+
/**
 * Summarize everything stored under a remote directory: file count, number of
 * distinct sub-directories implied by the keys, and the oldest and newest files.
 *
 * Every page of the listing is read. Keys ending in '/' are treated as folder
 * markers: they count as directories, never as files.
 *
 * @param {object} options
 * @param {string} options.directory - s3:// or r2:// URI; trailing slashes are ignored.
 * @returns {Promise<{fileCount: number, directoryCount: number,
 *   firstModified: ?{filename: string, modifiedAt: string},
 *   lastModified: ?{filename: string, modifiedAt: string}}>}
 * @throws {Error} When `directory` is missing or is not an s3:// / r2:// URI.
 */
Worker.prototype.analyze = async function ({ directory }) {
  if (!directory) throw new Error('directory is required');
  let trimmed = directory;
  while (trimmed.slice(-1) === '/') trimmed = trimmed.slice(0, -1);
  const { Bucket, Key } = getParts(trimmed);
  const client = this.getClient();
  const Prefix = Key ? `${Key}/` : '';

  const directories = new Set();
  let fileCount = 0;
  // Each tracker keeps the numeric time used for comparison plus the reported fields.
  let oldest = null;
  let newest = null;
  let ContinuationToken;

  do {
    const page = await client.send(new ListObjectsV2Command({ Bucket, Prefix, ContinuationToken }));
    for (const { Key: objectKey, LastModified } of page.Contents || []) {
      const relative = Prefix && objectKey.startsWith(Prefix) ? objectKey.slice(Prefix.length) : objectKey;
      if (!relative) continue;

      const isFolderMarker = relative.endsWith('/');
      const segments = relative.replace(/\/$/, '').split('/').filter(Boolean);
      // A file contributes its ancestor directories; a folder marker also contributes itself.
      const directoryDepth = isFolderMarker ? segments.length : segments.length - 1;
      for (let depth = 1; depth <= directoryDepth; depth += 1) {
        directories.add(segments.slice(0, depth).join('/'));
      }
      if (isFolderMarker) continue;

      fileCount += 1;
      const time = new Date(LastModified).getTime();
      const modifiedAt = new Date(LastModified).toISOString();
      const filename = `${this.prefix}://${Bucket}/${objectKey}`;
      // Strict comparisons: on a tie the first object encountered is kept.
      if (oldest === null || time < oldest.time) oldest = { time, filename, modifiedAt };
      if (newest === null || time > newest.time) newest = { time, filename, modifiedAt };
    }
    ContinuationToken = page.IsTruncated ? page.NextContinuationToken : undefined;
  } while (ContinuationToken);

  return {
    fileCount,
    directoryCount: directories.size,
    firstModified: oldest ? { filename: oldest.filename, modifiedAt: oldest.modifiedAt } : null,
    lastModified: newest ? { filename: newest.filename, modifiedAt: newest.modifiedAt } : null
  };
};
Worker.prototype.analyze.metadata = {
  options: {
    directory: { required: true }
  }
};
|
|
232
299
|
/**
 * List every object stored under a directory prefix, following continuation
 * tokens until the listing is exhausted.
 *
 * @param {object} options
 * @param {string} options.directory - s3:// or r2:// URI; trailing slashes are ignored.
 * @param {*} [options.start] - Lower bound on LastModified, passed through `relativeDate`.
 * @param {*} [options.end] - Upper bound on LastModified, passed through `relativeDate`.
 * @returns {Promise<string[]>} Full `${prefix}://bucket/key` names of the matching objects.
 * @throws {Error} When `directory` is missing or is not an s3:// / r2:// URI.
 */
Worker.prototype.listAll = async function (options) {
  const { directory } = options;
  if (!directory) throw new Error('directory is required');
  const start = options.start && relativeDate(options.start);
  const end = options.end && relativeDate(options.end);

  let trimmed = directory;
  while (trimmed.slice(-1) === '/') trimmed = trimmed.slice(0, -1);
  const { Bucket, Key } = getParts(trimmed);
  const client = this.getClient();
  const Prefix = Key ? `${Key}/` : null;

  // True when an object's modification time lies inside the optional [start, end] window.
  const inWindow = ({ LastModified }) => {
    if (start && new Date(LastModified) < start) return false;
    if (end && new Date(LastModified) > end) return false;
    return true;
  };
  const toName = (entry) => `${this.prefix}://${Bucket}/${entry.Key}`;

  const files = [];
  let ContinuationToken = null;
  do {
    const request = new ListObjectsV2Command({
      Bucket,
      Prefix,
      ContinuationToken
    });
    debug(`Sending List command with prefix ${Prefix} with ContinuationToken ${ContinuationToken}`);
    const page = await client.send(request);
    const newFiles = page.Contents?.filter(inWindow)?.map(toName) || [];
    debug(`Retrieved ${newFiles.length} new files, total ${files.length},sample ${newFiles.slice(0, 3).join(',')}`);
    for (const name of newFiles) files.push(name);
    ContinuationToken = page.NextContinuationToken;
  } while (ContinuationToken);
  return files;
};
Worker.prototype.listAll.metadata = {
  options: {
    directory: { required: true }
  }
};
|
|
280
343
|
/**
 * Move every object under `directory` to the same relative path under
 * `targetDirectory`, running at most ten moves concurrently.
 *
 * @param {object} options
 * @param {string} options.directory - Source directory URI.
 * @param {string} options.targetDirectory - Destination directory URI.
 * @returns {Promise<Array<{filename: string}>>} One `move` result per object.
 * @throws {Error} When either directory is missing, or when a listed file does
 *   not sit under `directory` (for example a URI scheme that differs from this
 *   worker's `prefix`).
 */
Worker.prototype.moveAll = async function ({ directory, targetDirectory }) {
  if (!directory || !targetDirectory) throw new Error('directory and targetDirectory required');
  const stripTrailingSlashes = (value) => {
    let result = value;
    while (result.endsWith('/')) result = result.slice(0, -1);
    return result;
  };
  // Normalize both roots so a trailing slash on only one of them cannot glue
  // or double path separators in the computed targets.
  const sourceRoot = stripTrailingSlashes(directory);
  const targetRoot = stripTrailingSlashes(targetDirectory);
  const files = await this.listAll({ directory });
  const configs = files.map((filename) => {
    // Re-root by slicing rather than String.replace: replace would interpret
    // '$' patterns in the target and silently return the source unchanged when
    // the prefix does not match, i.e. a move of a file onto itself.
    if (!filename.startsWith(`${sourceRoot}/`)) {
      throw new Error(`Cannot move ${filename}: it is not inside ${sourceRoot}`);
    }
    return { filename, target: `${targetRoot}${filename.slice(sourceRoot.length)}` };
  });
  const pLimit = await import('p-limit');
  const limitedMethod = pLimit.default(10);
  return Promise.all(configs.map(({ filename, target }) => limitedMethod(async () => this.move({ filename, target }))));
};
Worker.prototype.moveAll.metadata = {
  options: {
    directory: { required: true },
    targetDirectory: { required: true }
  }
};
|
|
298
360
|
Worker.prototype.stat = async function ({ filename }) {
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
const {
|
|
361
|
+
if (!filename) throw new Error('filename is required');
|
|
362
|
+
const s3Client = this.getClient();
|
|
363
|
+
const { Bucket, Key } = getParts(filename);
|
|
364
|
+
const command = new HeadObjectCommand({ Bucket, Key });
|
|
365
|
+
const response = await s3Client.send(command);
|
|
366
|
+
const {
|
|
306
367
|
// "AcceptRanges": "bytes",
|
|
307
368
|
ContentLength, // : "3191",
|
|
308
369
|
ContentType, // : "image/jpeg",
|
|
@@ -310,20 +371,20 @@ Worker.prototype.stat = async function ({ filename }) {
|
|
|
310
371
|
LastModified // : "2016-12-15T01:19:41.000Z",
|
|
311
372
|
// Metadata": {},
|
|
312
373
|
// VersionId": "null"
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
374
|
+
} = response;
|
|
375
|
+
const modifiedAt = new Date(LastModified);
|
|
376
|
+
const createdAt = modifiedAt; // Same for S3
|
|
377
|
+
const size = parseInt(ContentLength, 10);
|
|
378
|
+
return {
|
|
379
|
+
createdAt,
|
|
380
|
+
modifiedAt,
|
|
381
|
+
contentType: ContentType,
|
|
382
|
+
size
|
|
383
|
+
};
|
|
323
384
|
};
|
|
324
385
|
Worker.prototype.stat.metadata = {
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
386
|
+
options: {
|
|
387
|
+
filename: {}
|
|
388
|
+
}
|
|
328
389
|
};
|
|
329
390
|
export default Worker;
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@engine9/input-tools",
|
|
3
|
-
"version": "2.0.
|
|
3
|
+
"version": "2.0.9",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"description": "Tools for dealing with Engine9 inputs",
|
|
6
6
|
"main": "index.js",
|
|
@@ -11,7 +11,7 @@
|
|
|
11
11
|
"author": "Engine9",
|
|
12
12
|
"license": "GPL-3.0-or-later",
|
|
13
13
|
"devDependencies": {
|
|
14
|
-
"eslint": "^9.
|
|
14
|
+
"eslint": "^9.39.2"
|
|
15
15
|
},
|
|
16
16
|
"dependencies": {
|
|
17
17
|
"@aws-sdk/client-s3": "^3.893.0",
|
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: timeline
|
|
3
|
+
description: Describes Engine9 timeline file formats (Timeline ID vs Timeline Raw) and how to construct them using only the utilities exported by @engine9/input-tools. Use when creating or transforming timeline-shaped data in plugins or ingestion code.
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
## Timeline files in Engine9
|
|
7
|
+
|
|
8
|
+
Use this skill whenever you are:
|
|
9
|
+
|
|
10
|
+
- **Designing timeline outputs** from plugins or ingestion code.
|
|
11
|
+
- **Producing files** that will eventually land in an Engine9 `timeline`-style table (for example via server workers).
|
|
12
|
+
|
|
13
|
+
Engine9 models person-level activity as **timeline entries**. A timeline entry is a single fact about a person at a point in time (email send, open, click, transaction, signup, etc.), identified by:
|
|
14
|
+
|
|
15
|
+
- **`ts`**: timestamp of the event.
|
|
16
|
+
- **`entry_type_id`**: numeric type from `TIMELINE_ENTRY_TYPES` (`input-tools/timelineTypes.js`).
|
|
17
|
+
- **`person_id`**: internal person identifier.
|
|
18
|
+
- **`input_id`**: which input/source this event came from.
|
|
19
|
+
- **`id`**: a deterministic UUID derived from the above (via `getTimelineEntryUUID`).
|
|
20
|
+
|
|
21
|
+
There are **two main on-disk timeline file shapes**:
|
|
22
|
+
|
|
23
|
+
- **Timeline ID files** – already resolved to `person_id` and `id`. These are ready to load into the `timeline` table.
|
|
24
|
+
- **Timeline Raw files** – do **not** contain `person_id`. They must go through person resolution and ID assignment before they can be loaded.
|
|
25
|
+
|
|
26
|
+
## Timeline ID files
|
|
27
|
+
|
|
28
|
+
**Use when** you want data that is ready to be:
|
|
29
|
+
|
|
30
|
+
- Loaded into a downstream `timeline` table.
|
|
31
|
+
- Joined against plugin-specific detail tables.
|
|
32
|
+
- De-duplicated by `id` (UUID).
|
|
33
|
+
|
|
34
|
+
### Core shape
|
|
35
|
+
|
|
36
|
+
Timeline ID files are typically produced by:
|
|
37
|
+
|
|
38
|
+
- Plugin or ingestion code that has already resolved a stable `person_id` and `input_id`.
|
|
39
|
+
- Mappers that call `getEntryTypeId` and `getTimelineEntryUUID` from `@engine9/input-tools`.
|
|
40
|
+
|
|
41
|
+
**Minimum required fields** for a Timeline ID file that downstream workers will accept:
|
|
42
|
+
|
|
43
|
+
- **`id`**: UUID for the timeline entry.
|
|
44
|
+
- Generated by `getTimelineEntryUUID`, or provided as a stable `remote_entry_uuid`.
|
|
45
|
+
- In `InputWorker.id`, `appendTimelineId` writes this into the `id` column.
|
|
46
|
+
- **`ts`**: timestamp (string or number) that can be parsed into a `Date`.
|
|
47
|
+
- **`person_id`**: internal numeric person id.
|
|
48
|
+
- **`entry_type_id`**: integer from `TIMELINE_ENTRY_TYPES`.
|
|
49
|
+
|
|
50
|
+
**Common optional fields**:
|
|
51
|
+
|
|
52
|
+
- **`source_code_id`**: numeric source code identifier.
|
|
53
|
+
- **`email_domain`**: lower-cased domain, often derived from `email`.
|
|
54
|
+
- Any number of **extra columns** (detail fields); these can be copied into plugin-specific detail tables.
|
|
55
|
+
|
|
56
|
+
The downstream `timeline` table schema usually includes:
|
|
57
|
+
|
|
58
|
+
- A primary key `id` column (UUID stored as text).
|
|
59
|
+
- A timestamp column `ts` (millis since epoch).
|
|
60
|
+
- Integer columns `entry_type_id` and `person_id`.
|
|
61
|
+
- Optional columns such as `source_code_id` and `email_domain`.
|
|
62
|
+
|
|
63
|
+
### How to construct Timeline ID files with input-tools
|
|
64
|
+
|
|
65
|
+
When authoring a Timeline ID-producing job or plugin using `@engine9/input-tools`:
|
|
66
|
+
|
|
67
|
+
- **Always include** `id`, `ts`, `person_id`, and `entry_type_id` on each emitted row.
|
|
68
|
+
- **Prefer numeric `entry_type_id`**, but you may also keep a string `entry_type` for debugging; resolution between the two happens via `TIMELINE_ENTRY_TYPES`, `getEntryTypeId`, and `getEntryType`.
|
|
69
|
+
- **Keep `input_id` stable** per logical input stream; `getTimelineEntryUUID` uses it as the UUID namespace when generating `id`.
|
|
70
|
+
|
|
71
|
+
## Timeline Raw files
|
|
72
|
+
|
|
73
|
+
**Use when** you have raw events from an external system and **cannot yet** assign `person_id` but still want to capture structured activity.
|
|
74
|
+
|
|
75
|
+
Examples:
|
|
76
|
+
|
|
77
|
+
- Raw web or email events that only know an email or other external identifier.
|
|
78
|
+
- Logs from external APIs where person resolution happens later in the pipeline.
|
|
79
|
+
|
|
80
|
+
### Core shape
|
|
81
|
+
|
|
82
|
+
Timeline Raw files:
|
|
83
|
+
|
|
84
|
+
- **Must not contain** `person_id` (by definition for this skill).
|
|
85
|
+
- **May or may not contain** `id`.
|
|
86
|
+
- If they do contain an `id`, it is usually an external event ID or `remote_entry_uuid`, not necessarily the final Engine9 `id`.
|
|
87
|
+
- **Should contain enough information** to derive:
|
|
88
|
+
- A timestamp: **`ts`** (or a field that is mapped to `ts`).
|
|
89
|
+
- An entry type: **`entry_type`** (string) or **`entry_type_id`** (numeric).
|
|
90
|
+
- A person identifier that can be resolved later (e.g. `remote_person_id`, `email`, or similar).
|
|
91
|
+
|
|
92
|
+
Typical fields you will see:
|
|
93
|
+
|
|
94
|
+
- **`ts`** or a source-specific timestamp (later mapped to `ts`).
|
|
95
|
+
- **`entry_type`** or **`entry_type_id`** (e.g. `'EMAIL_UNSUBSCRIBE'`, `'EMAIL_OPEN'`, etc.).
|
|
96
|
+
- **Contact fields**: `email`, `remote_person_id`, phone number, etc.
|
|
97
|
+
- **Source metadata**: `account_id`, `plugin_id`, `url`, `user_agent`, `ip_address`, etc.
|
|
98
|
+
|
|
99
|
+
For example, a plugin may map an inbound event into a row with:
|
|
100
|
+
|
|
101
|
+
- `ts`, `account_id`, `entry_type_id`, `email`, `email_domain`, `url`, `user_agent`
|
|
102
|
+
|
|
103
|
+
and **no `person_id`** yet.
|
|
104
|
+
|
|
105
|
+
### Converting Timeline Raw → Timeline ID
|
|
106
|
+
|
|
107
|
+
The usual pathway for Raw → ID is:
|
|
108
|
+
|
|
109
|
+
1. **Map raw events into a timeline-shaped object** (with `ts`, `entry_type`/`entry_type_id`, and contact info).
|
|
110
|
+
2. **Resolve people** (outside of input-tools):
|
|
111
|
+
- Use your application’s person resolution or a server worker to:
|
|
112
|
+
- Look up or create `person` rows.
|
|
113
|
+
- Attach a canonical `person_id` to each row.
|
|
114
|
+
3. **Assign timeline IDs** with input-tools:
|
|
115
|
+
- Use `getEntryTypeId` (if needed) to ensure `entry_type_id` is set from `TIMELINE_ENTRY_TYPES` when only `entry_type` is present.
|
|
116
|
+
- Call `getTimelineEntryUUID` to:
|
|
117
|
+
- Require `ts`, `entry_type_id`, `input_id`, and `person_id`.
|
|
118
|
+
- Produce a deterministic, sortable UUID for `id`.
|
|
119
|
+
4. **Write out a Timeline ID file** (for example, parquet or CSV) with the full set of fields (`id`, `ts`, `person_id`, `entry_type_id`, optional `source_code_id`, etc.).
|
|
120
|
+
|
|
121
|
+
## Choosing between Timeline ID and Timeline Raw
|
|
122
|
+
|
|
123
|
+
- **Choose Timeline ID files when**:
|
|
124
|
+
- You can resolve `person_id` and `input_id` in the current process.
|
|
125
|
+
- You want files that are **immediately loadable** into a `timeline` table.
|
|
126
|
+
- You need **deduplication** by a stable `id`.
|
|
127
|
+
|
|
128
|
+
- **Choose Timeline Raw files when**:
|
|
129
|
+
- You are at the **edge of the system** (plugins, collectors, ETL jobs) and only have partial identity information.
|
|
130
|
+
- You plan a **later enrichment step** that will attach `person_id` and compute final `id` values.
|
|
131
|
+
- You want to keep the ingestion simpler and defer canonicalization.
|
|
132
|
+
|
|
133
|
+
In practice:
|
|
134
|
+
|
|
135
|
+
- **Plugins and edge collectors** often emit **Timeline Raw** shaped data first.
|
|
136
|
+
- **Downstream services or server workers** then:
|
|
137
|
+
- Resolve people (`person_id`).
|
|
138
|
+
- Generate `id` via `getTimelineEntryUUID`.
|
|
139
|
+
- Persist **Timeline ID** files and load them into the `timeline` and detail tables.
|
|
140
|
+
|
|
141
|
+
## Reference helpers
|
|
142
|
+
|
|
143
|
+
When working with any timeline format, prefer the utilities in `@engine9/input-tools`:
|
|
144
|
+
|
|
145
|
+
- **`TIMELINE_ENTRY_TYPES`** (`timelineTypes.js`): bidirectional map between string entry types and numeric `entry_type_id`.
|
|
146
|
+
- **`getEntryTypeId`**: resolve `entry_type` → `entry_type_id` with validation.
|
|
147
|
+
- **`getEntryType`**: resolve `entry_type_id` → `entry_type`.
|
|
148
|
+
- **`getTimelineEntryUUID`**: generate or normalize `id` given `ts`, `entry_type_id`, `input_id`, and `person_id`, respecting `remote_entry_uuid` / `remote_entry_id` when present.
|
|
149
|
+
- **`uuidIsValid`**: validate that a string is a proper UUID.
|
|
150
|
+
|
|
151
|
+
Use these helpers instead of hard-coding IDs or types whenever you construct timeline rows, whether Raw or ID.
|
|
152
|
+
|
|
@@ -43,7 +43,7 @@ Use this skill when writing a JavaScript function that maps 3rd-party payment or
|
|
|
43
43
|
|
|
44
44
|
## Transaction entry types
|
|
45
45
|
|
|
46
|
-
Use exactly these `entry_type` string values (or the numeric `entry_type_id`). Import `TIMELINE_ENTRY_TYPES` from `@engine9
|
|
46
|
+
Use exactly these `entry_type` string values (or the numeric `entry_type_id`). Import `TIMELINE_ENTRY_TYPES` from `@engine9/input-tools` for the full map.
|
|
47
47
|
|
|
48
48
|
| entry_type | entry_type_id | Use when |
|
|
49
49
|
| ------------------------ | ------------- | ------------------------------------------------------------ |
|