@engine9/input-tools 2.0.8 → 2.0.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -16,7 +16,7 @@ const fsp = fs.promises;
16
16
  const { Readable, Transform, PassThrough, Writable } = nodestream;
17
17
  const { pipeline } = promises;
18
18
 
19
- const debug = debug$0('@engine9-io/file');
19
+ const debug = debug$0('@engine9/file');
20
20
  const { getXlsxStream } = xlstream;
21
21
 
22
22
  function Worker({ accountId }) {
@@ -657,6 +657,54 @@ Worker.prototype.list.metadata = {
657
657
  directory: { required: true }
658
658
  }
659
659
  };
660
+ Worker.prototype.analyze = async function ({ directory }) {
661
+ if (!directory) throw new Error('directory is required');
662
+ if (directory.startsWith('s3://') || directory.startsWith('r2://')) {
663
+ const worker = new (directory.startsWith('r2://') ? R2Worker : S3Worker)(this);
664
+ return worker.analyze({ directory });
665
+ }
666
+ let fileCount = 0;
667
+ let directoryCount = 0;
668
+ let firstModified = null;
669
+ let lastModified = null;
670
+ let firstTime = null;
671
+ let lastTime = null;
672
+ const walk = async (dir) => {
673
+ const entries = await fsp.readdir(dir, { withFileTypes: true });
674
+ for (const ent of entries) {
675
+ const fullPath = path.join(dir, ent.name);
676
+ if (ent.isDirectory()) {
677
+ directoryCount += 1;
678
+ await walk(fullPath);
679
+ } else {
680
+ fileCount += 1;
681
+ const stats = await fsp.stat(fullPath);
682
+ const mtime = stats.mtimeMs;
683
+ const modifiedAt = new Date(stats.mtime).toISOString();
684
+ if (firstTime === null || mtime < firstTime) {
685
+ firstTime = mtime;
686
+ firstModified = { filename: fullPath, modifiedAt };
687
+ }
688
+ if (lastTime === null || mtime > lastTime) {
689
+ lastTime = mtime;
690
+ lastModified = { filename: fullPath, modifiedAt };
691
+ }
692
+ }
693
+ }
694
+ };
695
+ await walk(directory);
696
+ return {
697
+ fileCount,
698
+ directoryCount,
699
+ firstModified: fileCount ? firstModified : null,
700
+ lastModified: fileCount ? lastModified : null
701
+ };
702
+ };
703
+ Worker.prototype.analyze.metadata = {
704
+ options: {
705
+ directory: { required: true }
706
+ }
707
+ };
660
708
  Worker.prototype.listAll = async function ({ directory, start: s, end: e }) {
661
709
  if (!directory) throw new Error('directory is required');
662
710
  let start = null;
package/file/S3.js CHANGED
@@ -1,308 +1,369 @@
1
- import debug$0 from "debug";
2
- import fs from "node:fs";
3
- import withDb from "mime-type/with-db";
4
- import clientS3 from "@aws-sdk/client-s3";
5
- import { getTempFilename, relativeDate } from "./tools.js";
6
- const debug = debug$0('@engine9-io/input/S3');
1
+ import debug$0 from 'debug';
2
+ import fs from 'node:fs';
3
+ import withDb from 'mime-type/with-db';
4
+ import clientS3 from '@aws-sdk/client-s3';
5
+ import { getTempFilename, relativeDate } from './tools.js';
6
+ const debug = debug$0('@engine9/input/S3');
7
7
  const { mimeType: mime } = withDb;
8
- const { S3Client, CopyObjectCommand, DeleteObjectCommand, GetObjectCommand, HeadObjectCommand, GetObjectAttributesCommand, PutObjectCommand, ListObjectsV2Command } = clientS3;
8
+ const {
9
+ S3Client,
10
+ CopyObjectCommand,
11
+ DeleteObjectCommand,
12
+ GetObjectCommand,
13
+ HeadObjectCommand,
14
+ GetObjectAttributesCommand,
15
+ PutObjectCommand,
16
+ ListObjectsV2Command
17
+ } = clientS3;
9
18
  function Worker() {
10
- this.prefix = 's3';
19
+ this.prefix = 's3';
11
20
  }
12
21
  function getParts(filename) {
13
- if (!filename)
14
- throw new Error(`Invalid filename: ${filename}`);
15
- if (!filename.startsWith('r2://') && !filename.startsWith('s3://')) {
16
- throw new Error(`Invalid filename, must start with r2:// or s3://: ${filename}`);
17
- }
18
- const parts = filename.split('/');
19
- const Bucket = parts[2];
20
- const Key = parts.slice(3).join('/');
21
- return { Bucket, Key };
22
+ if (!filename) throw new Error(`Invalid filename: ${filename}`);
23
+ if (!filename.startsWith('r2://') && !filename.startsWith('s3://')) {
24
+ throw new Error(`Invalid filename, must start with r2:// or s3://: ${filename}`);
25
+ }
26
+ const parts = filename.split('/');
27
+ const Bucket = parts[2];
28
+ const Key = parts.slice(3).join('/');
29
+ return { Bucket, Key };
22
30
  }
23
31
  Worker.prototype.getClient = function () {
24
- if (!this.client)
25
- this.client = new S3Client({});
26
- return this.client;
32
+ if (!this.client) this.client = new S3Client({});
33
+ return this.client;
27
34
  };
28
35
  Worker.prototype.getMetadata = async function ({ filename }) {
29
- const s3Client = this.getClient();
30
- const { Bucket, Key } = getParts(filename);
31
- const resp = await s3Client.send(new GetObjectAttributesCommand({
32
- Bucket,
33
- Key,
34
- ObjectAttributes: ['ETag', 'Checksum', 'ObjectParts', 'StorageClass', 'ObjectSize']
35
- }));
36
- return resp;
36
+ const s3Client = this.getClient();
37
+ const { Bucket, Key } = getParts(filename);
38
+ const resp = await s3Client.send(
39
+ new GetObjectAttributesCommand({
40
+ Bucket,
41
+ Key,
42
+ ObjectAttributes: ['ETag', 'Checksum', 'ObjectParts', 'StorageClass', 'ObjectSize']
43
+ })
44
+ );
45
+ return resp;
37
46
  };
38
47
  Worker.prototype.getMetadata.metadata = {
39
- options: {
40
- filename: {}
41
- }
48
+ options: {
49
+ filename: {}
50
+ }
42
51
  };
43
52
  Worker.prototype.stream = async function ({ filename }) {
44
- const s3Client = this.getClient();
45
- const { Bucket, Key } = getParts(filename);
46
- const command = new GetObjectCommand({ Bucket, Key });
47
- try {
48
- debug(`Streaming file ${Key}`);
49
- const response = await s3Client.send(command);
50
- return { stream: response.Body };
51
- }
52
- catch (e) {
53
- debug(`Could not stream filename:${filename}`);
54
- throw e;
55
- }
53
+ const s3Client = this.getClient();
54
+ const { Bucket, Key } = getParts(filename);
55
+ const command = new GetObjectCommand({ Bucket, Key });
56
+ try {
57
+ debug(`Streaming file s3://${Bucket}/${Key}`);
58
+ const response = await s3Client.send(command);
59
+ return { stream: response.Body };
60
+ } catch (e) {
61
+ debug(`Could not stream filename:${filename}`);
62
+ throw e;
63
+ }
56
64
  };
57
65
  Worker.prototype.stream.metadata = {
58
- options: {
59
- filename: {}
60
- }
66
+ options: {
67
+ filename: {}
68
+ }
61
69
  };
62
70
  Worker.prototype.copy = async function ({ filename, target }) {
63
- if (filename.startsWith('s3://') || filename.startsWith('r2://')) {
64
- //we're fine
65
- }
66
- else {
67
- throw new Error('Cowardly not copying a file not from s3 -- use put instead');
68
- }
69
- const s3Client = this.getClient();
70
- const { Bucket, Key } = getParts(target);
71
- debug(`Copying ${filename} to ${JSON.stringify({ Bucket, Key })}}`);
72
- const command = new CopyObjectCommand({
73
- CopySource: filename.slice(4), // remove the s3:/
74
- Bucket,
75
- Key
76
- });
77
- return s3Client.send(command);
71
+ if (filename.startsWith('s3://') || filename.startsWith('r2://')) {
72
+ //we're fine
73
+ } else {
74
+ throw new Error('Cowardly not copying a file not from s3 -- use put instead');
75
+ }
76
+ const s3Client = this.getClient();
77
+ const { Bucket, Key } = getParts(target);
78
+ debug(`Copying ${filename} to ${JSON.stringify({ Bucket, Key })}}`);
79
+ const command = new CopyObjectCommand({
80
+ CopySource: filename.slice(4), // remove the s3:/
81
+ Bucket,
82
+ Key
83
+ });
84
+ return s3Client.send(command);
78
85
  };
79
86
  Worker.prototype.copy.metadata = {
80
- options: {
81
- filename: {},
82
- target: {}
83
- }
87
+ options: {
88
+ filename: {},
89
+ target: {}
90
+ }
84
91
  };
85
92
  Worker.prototype.move = async function ({ filename, target }) {
86
- await this.copy({ filename, target });
87
- await this.remove({ filename });
88
- return { filename: target };
93
+ await this.copy({ filename, target });
94
+ await this.remove({ filename });
95
+ return { filename: target };
89
96
  };
90
97
  Worker.prototype.move.metadata = {
91
- options: {
92
- filename: {},
93
- target: {}
94
- }
98
+ options: {
99
+ filename: {},
100
+ target: {}
101
+ }
95
102
  };
96
103
  Worker.prototype.remove = async function ({ filename }) {
97
- const s3Client = this.getClient();
98
- const { Bucket, Key } = getParts(filename);
99
- const command = new DeleteObjectCommand({ Bucket, Key });
100
- return s3Client.send(command);
104
+ const s3Client = this.getClient();
105
+ const { Bucket, Key } = getParts(filename);
106
+ const command = new DeleteObjectCommand({ Bucket, Key });
107
+ return s3Client.send(command);
101
108
  };
102
109
  Worker.prototype.remove.metadata = {
103
- options: {
104
- filename: {}
105
- }
110
+ options: {
111
+ filename: {}
112
+ }
106
113
  };
107
114
  Worker.prototype.download = async function ({ filename }) {
108
- const file = filename.split('/').pop();
109
- const localPath = await getTempFilename({ targetFilename: file });
110
- const s3Client = this.getClient();
111
- const { Bucket, Key } = getParts(filename);
112
- const command = new GetObjectCommand({ Bucket, Key });
113
- debug(`Downloading ${file} to ${localPath}`);
114
- const response = await s3Client.send(command);
115
- const fileStream = fs.createWriteStream(localPath);
116
- response.Body.pipe(fileStream);
117
- return new Promise((resolve, reject) => {
118
- fileStream.on('finish', async () => {
119
- const { size } = await fs.promises.stat(localPath);
120
- resolve({ size, filename: localPath });
121
- });
122
- fileStream.on('error', reject);
115
+ const file = filename.split('/').pop();
116
+ const localPath = await getTempFilename({ targetFilename: file });
117
+ const s3Client = this.getClient();
118
+ const { Bucket, Key } = getParts(filename);
119
+ const command = new GetObjectCommand({ Bucket, Key });
120
+ debug(`Downloading ${file} to ${localPath}`);
121
+ const response = await s3Client.send(command);
122
+ const fileStream = fs.createWriteStream(localPath);
123
+ response.Body.pipe(fileStream);
124
+ return new Promise((resolve, reject) => {
125
+ fileStream.on('finish', async () => {
126
+ const { size } = await fs.promises.stat(localPath);
127
+ resolve({ size, filename: localPath });
123
128
  });
129
+ fileStream.on('error', reject);
130
+ });
124
131
  };
125
132
  Worker.prototype.download.metadata = {
126
- options: {
127
- filename: {}
128
- }
133
+ options: {
134
+ filename: {}
135
+ }
129
136
  };
130
137
  Worker.prototype.put = async function (options) {
131
- const { filename, directory } = options;
132
- if (!filename)
133
- throw new Error('Local filename required');
134
- if (directory?.indexOf('s3://') !== 0 && directory?.indexOf('r2://') !== 0)
135
- throw new Error(`directory path must start with s3:// or r2://, is ${directory}`);
136
- const file = options.file || filename.split('/').pop();
137
- const parts = directory.split('/');
138
- const Bucket = parts[2];
139
- const Key = parts.slice(3).filter(Boolean).concat(file).join('/');
140
- const Body = fs.createReadStream(filename);
141
- const ContentType = mime.lookup(file);
142
- debug(`Putting ${filename} to ${JSON.stringify({ Bucket, Key, ContentType })}}`);
143
- const s3Client = this.getClient();
144
- const command = new PutObjectCommand({
145
- Bucket,
146
- Key,
147
- Body,
148
- ContentType
149
- });
150
- return s3Client.send(command);
138
+ const { filename, directory } = options;
139
+ if (!filename) throw new Error('Local filename required');
140
+ if (directory?.indexOf('s3://') !== 0 && directory?.indexOf('r2://') !== 0)
141
+ throw new Error(`directory path must start with s3:// or r2://, is ${directory}`);
142
+ const file = options.file || filename.split('/').pop();
143
+ const parts = directory.split('/');
144
+ const Bucket = parts[2];
145
+ const Key = parts.slice(3).filter(Boolean).concat(file).join('/');
146
+ const Body = fs.createReadStream(filename);
147
+ const ContentType = mime.lookup(file);
148
+ debug(`Putting ${filename} to ${JSON.stringify({ Bucket, Key, ContentType })}}`);
149
+ const s3Client = this.getClient();
150
+ const command = new PutObjectCommand({
151
+ Bucket,
152
+ Key,
153
+ Body,
154
+ ContentType
155
+ });
156
+ return s3Client.send(command);
151
157
  };
152
158
  Worker.prototype.put.metadata = {
153
- options: {
154
- filename: {},
155
- directory: { description: 'Directory to put file, e.g. s3://foo-bar/dir/xyz' },
156
- file: { description: 'Name of file, defaults to the filename' }
157
- }
159
+ options: {
160
+ filename: {},
161
+ directory: { description: 'Directory to put file, e.g. s3://foo-bar/dir/xyz' },
162
+ file: { description: 'Name of file, defaults to the filename' }
163
+ }
158
164
  };
159
165
  Worker.prototype.write = async function (options) {
160
- const { directory, file, content } = options;
161
- if (!directory?.indexOf('s3://') === 0)
162
- throw new Error('directory must start with s3://');
163
- const parts = directory.split('/');
164
- const Bucket = parts[2];
165
- const Key = parts.slice(3).filter(Boolean).concat(file).join('/');
166
- const Body = content;
167
- debug(`Writing content of length ${content.length} to ${JSON.stringify({ Bucket, Key })}}`);
168
- const s3Client = this.getClient();
169
- const ContentType = mime.lookup(file);
170
- const command = new PutObjectCommand({
171
- Bucket,
172
- Key,
173
- Body,
174
- ContentType
175
- });
176
- return s3Client.send(command);
166
+ const { directory, file, content } = options;
167
+ if (!directory?.indexOf('s3://') === 0) throw new Error('directory must start with s3://');
168
+ const parts = directory.split('/');
169
+ const Bucket = parts[2];
170
+ const Key = parts.slice(3).filter(Boolean).concat(file).join('/');
171
+ const Body = content;
172
+ debug(`Writing content of length ${content.length} to ${JSON.stringify({ Bucket, Key })}}`);
173
+ const s3Client = this.getClient();
174
+ const ContentType = mime.lookup(file);
175
+ const command = new PutObjectCommand({
176
+ Bucket,
177
+ Key,
178
+ Body,
179
+ ContentType
180
+ });
181
+ return s3Client.send(command);
177
182
  };
178
183
  Worker.prototype.write.metadata = {
179
- options: {
180
- directory: { description: 'Directory to put file, e.g. s3://foo-bar/dir/xyz' },
181
- file: { description: 'Name of file, defaults to the filename' },
182
- content: { description: 'Contents of file' }
183
- }
184
+ options: {
185
+ directory: { description: 'Directory to put file, e.g. s3://foo-bar/dir/xyz' },
186
+ file: { description: 'Name of file, defaults to the filename' },
187
+ content: { description: 'Contents of file' }
188
+ }
184
189
  };
185
190
  Worker.prototype.list = async function ({ directory, start, end, raw }) {
186
- if (!directory)
187
- throw new Error('directory is required');
188
- let dir = directory;
189
- while (dir.slice(-1) === '/')
190
- dir = dir.slice(0, -1);
191
- const { Bucket, Key: Prefix } = getParts(dir);
192
- const s3Client = this.getClient();
193
- const command = new ListObjectsV2Command({
194
- Bucket,
195
- Prefix: `${Prefix}/`,
196
- Delimiter: '/'
197
- });
198
- const { Contents: files, CommonPrefixes } = await s3Client.send(command);
199
- if (raw)
200
- return files;
201
- // debug('Prefixes:', { CommonPrefixes });
202
- const output = []
203
- .concat((CommonPrefixes || []).map((f) => ({
191
+ if (!directory) throw new Error('directory is required');
192
+ let dir = directory;
193
+ while (dir.slice(-1) === '/') dir = dir.slice(0, -1);
194
+ const { Bucket, Key: Prefix } = getParts(dir);
195
+ const s3Client = this.getClient();
196
+ const command = new ListObjectsV2Command({
197
+ Bucket,
198
+ Prefix: `${Prefix}/`,
199
+ Delimiter: '/'
200
+ });
201
+ const { Contents: files, CommonPrefixes } = await s3Client.send(command);
202
+ if (raw) return files;
203
+ // debug('Prefixes:', { CommonPrefixes });
204
+ const output = []
205
+ .concat(
206
+ (CommonPrefixes || []).map((f) => ({
204
207
  name: f.Prefix.slice(Prefix.length + 1, -1),
205
208
  type: 'directory'
206
- })))
207
- .concat((files || [])
209
+ }))
210
+ )
211
+ .concat(
212
+ (files || [])
208
213
  .filter(({ LastModified }) => {
209
- if (start && new Date(LastModified) < start) {
214
+ if (start && new Date(LastModified) < start) {
210
215
  return false;
211
- }
212
- else if (end && new Date(LastModified) > end) {
216
+ } else if (end && new Date(LastModified) > end) {
213
217
  return false;
214
- }
215
- else {
218
+ } else {
216
219
  return true;
217
- }
218
- })
220
+ }
221
+ })
219
222
  .map(({ Key, Size, LastModified }) => ({
220
- name: Key.slice(Prefix.length + 1),
221
- type: 'file',
222
- size: Size,
223
- modifiedAt: new Date(LastModified).toISOString()
224
- })));
225
- return output;
223
+ name: Key.slice(Prefix.length + 1),
224
+ type: 'file',
225
+ size: Size,
226
+ modifiedAt: new Date(LastModified).toISOString()
227
+ }))
228
+ );
229
+ return output;
226
230
  };
227
231
  Worker.prototype.list.metadata = {
228
- options: {
229
- directory: { required: true }
232
+ options: {
233
+ directory: { required: true }
234
+ }
235
+ };
236
+ Worker.prototype.analyze = async function ({ directory }) {
237
+ if (!directory) throw new Error('directory is required');
238
+ let dir = directory;
239
+ while (dir.slice(-1) === '/') dir = dir.slice(0, -1);
240
+ const { Bucket, Key } = getParts(dir);
241
+ const s3Client = this.getClient();
242
+ let Prefix = '';
243
+ if (Key) Prefix = `${Key}/`;
244
+ const dirsSeen = new Set();
245
+ let fileCount = 0;
246
+ let firstModified = null;
247
+ let lastModified = null;
248
+ let firstTime = null;
249
+ let lastTime = null;
250
+ let ContinuationToken = undefined;
251
+ do {
252
+ const result = await s3Client.send(
253
+ new ListObjectsV2Command({
254
+ Bucket,
255
+ Prefix,
256
+ ContinuationToken
257
+ })
258
+ );
259
+ for (const content of result.Contents || []) {
260
+ const objectKey = content.Key;
261
+ let rel = Prefix ? (objectKey.startsWith(Prefix) ? objectKey.slice(Prefix.length) : objectKey) : objectKey;
262
+ if (!rel) continue;
263
+ const isFolderMarker = rel.endsWith('/');
264
+ const parts = rel.replace(/\/$/, '').split('/').filter(Boolean);
265
+ for (let i = 0; i < parts.length - 1; i++) {
266
+ dirsSeen.add(parts.slice(0, i + 1).join('/'));
267
+ }
268
+ if (isFolderMarker) {
269
+ if (parts.length) dirsSeen.add(parts.join('/'));
270
+ continue;
271
+ }
272
+ fileCount++;
273
+ const mtime = new Date(content.LastModified).getTime();
274
+ const modifiedAt = new Date(content.LastModified).toISOString();
275
+ const filename = `${this.prefix}://${Bucket}/${objectKey}`;
276
+ if (firstTime === null || mtime < firstTime) {
277
+ firstTime = mtime;
278
+ firstModified = { filename, modifiedAt };
279
+ }
280
+ if (lastTime === null || mtime > lastTime) {
281
+ lastTime = mtime;
282
+ lastModified = { filename, modifiedAt };
283
+ }
230
284
  }
285
+ ContinuationToken = result.IsTruncated ? result.NextContinuationToken : undefined;
286
+ } while (ContinuationToken);
287
+ return {
288
+ fileCount,
289
+ directoryCount: dirsSeen.size,
290
+ firstModified: fileCount ? firstModified : null,
291
+ lastModified: fileCount ? lastModified : null
292
+ };
293
+ };
294
+ Worker.prototype.analyze.metadata = {
295
+ options: {
296
+ directory: { required: true }
297
+ }
231
298
  };
232
299
  /* List everything with the prefix */
233
300
  Worker.prototype.listAll = async function (options) {
234
- const { directory } = options;
235
- if (!directory)
236
- throw new Error('directory is required');
237
- let dir = directory;
238
- const start = options.start && relativeDate(options.start);
239
- const end = options.end && relativeDate(options.end);
240
- while (dir.slice(-1) === '/')
241
- dir = dir.slice(0, -1);
242
- const { Bucket, Key } = getParts(dir);
243
- const s3Client = this.getClient();
244
- const files = [];
245
- let ContinuationToken = null;
246
- let Prefix = null;
247
- if (Key)
248
- Prefix = `${Key}/`;
249
- do {
250
- const command = new ListObjectsV2Command({
251
- Bucket,
252
- Prefix,
253
- ContinuationToken
254
- // Delimiter: '/',
255
- });
256
- debug(`Sending List command with prefix ${Prefix} with ContinuationToken ${ContinuationToken}`);
257
- const result = await s3Client.send(command);
258
- const newFiles = result.Contents?.filter(({ LastModified }) => {
259
- if (start && new Date(LastModified) < start) {
260
- return false;
261
- }
262
- else if (end && new Date(LastModified) > end) {
263
- return false;
264
- }
265
- else {
266
- return true;
267
- }
268
- })?.map((d) => `${this.prefix}://${Bucket}/${d.Key}`) || [];
269
- debug(`Retrieved ${newFiles.length} new files, total ${files.length},sample ${newFiles.slice(0, 3).join(',')}`);
270
- files.push(...newFiles);
271
- ContinuationToken = result.NextContinuationToken;
272
- } while (ContinuationToken);
273
- return files;
301
+ const { directory } = options;
302
+ if (!directory) throw new Error('directory is required');
303
+ let dir = directory;
304
+ const start = options.start && relativeDate(options.start);
305
+ const end = options.end && relativeDate(options.end);
306
+ while (dir.slice(-1) === '/') dir = dir.slice(0, -1);
307
+ const { Bucket, Key } = getParts(dir);
308
+ const s3Client = this.getClient();
309
+ const files = [];
310
+ let ContinuationToken = null;
311
+ let Prefix = null;
312
+ if (Key) Prefix = `${Key}/`;
313
+ do {
314
+ const command = new ListObjectsV2Command({
315
+ Bucket,
316
+ Prefix,
317
+ ContinuationToken
318
+ // Delimiter: '/',
319
+ });
320
+ debug(`Sending List command with prefix ${Prefix} with ContinuationToken ${ContinuationToken}`);
321
+ const result = await s3Client.send(command);
322
+ const newFiles =
323
+ result.Contents?.filter(({ LastModified }) => {
324
+ if (start && new Date(LastModified) < start) {
325
+ return false;
326
+ } else if (end && new Date(LastModified) > end) {
327
+ return false;
328
+ } else {
329
+ return true;
330
+ }
331
+ })?.map((d) => `${this.prefix}://${Bucket}/${d.Key}`) || [];
332
+ debug(`Retrieved ${newFiles.length} new files, total ${files.length},sample ${newFiles.slice(0, 3).join(',')}`);
333
+ files.push(...newFiles);
334
+ ContinuationToken = result.NextContinuationToken;
335
+ } while (ContinuationToken);
336
+ return files;
274
337
  };
275
338
  Worker.prototype.listAll.metadata = {
276
- options: {
277
- directory: { required: true }
278
- }
339
+ options: {
340
+ directory: { required: true }
341
+ }
279
342
  };
280
343
  Worker.prototype.moveAll = async function ({ directory, targetDirectory }) {
281
- if (!directory || !targetDirectory)
282
- throw new Error('directory and targetDirectory required');
283
- const files = await this.listAll({ directory });
284
- const configs = files.map((d) => ({
285
- filename: d,
286
- target: d.replace(directory, targetDirectory)
287
- }));
288
- const pLimit = await import('p-limit');
289
- const limitedMethod = pLimit.default(10);
290
- return Promise.all(configs.map(({ filename, target }) => limitedMethod(async () => this.move({ filename, target }))));
344
+ if (!directory || !targetDirectory) throw new Error('directory and targetDirectory required');
345
+ const files = await this.listAll({ directory });
346
+ const configs = files.map((d) => ({
347
+ filename: d,
348
+ target: d.replace(directory, targetDirectory)
349
+ }));
350
+ const pLimit = await import('p-limit');
351
+ const limitedMethod = pLimit.default(10);
352
+ return Promise.all(configs.map(({ filename, target }) => limitedMethod(async () => this.move({ filename, target }))));
291
353
  };
292
354
  Worker.prototype.moveAll.metadata = {
293
- options: {
294
- directory: { required: true },
295
- targetDirectory: { required: true }
296
- }
355
+ options: {
356
+ directory: { required: true },
357
+ targetDirectory: { required: true }
358
+ }
297
359
  };
298
360
  Worker.prototype.stat = async function ({ filename }) {
299
- if (!filename)
300
- throw new Error('filename is required');
301
- const s3Client = this.getClient();
302
- const { Bucket, Key } = getParts(filename);
303
- const command = new HeadObjectCommand({ Bucket, Key });
304
- const response = await s3Client.send(command);
305
- const {
361
+ if (!filename) throw new Error('filename is required');
362
+ const s3Client = this.getClient();
363
+ const { Bucket, Key } = getParts(filename);
364
+ const command = new HeadObjectCommand({ Bucket, Key });
365
+ const response = await s3Client.send(command);
366
+ const {
306
367
  // "AcceptRanges": "bytes",
307
368
  ContentLength, // : "3191",
308
369
  ContentType, // : "image/jpeg",
@@ -310,20 +371,20 @@ Worker.prototype.stat = async function ({ filename }) {
310
371
  LastModified // : "2016-12-15T01:19:41.000Z",
311
372
  // Metadata": {},
312
373
  // VersionId": "null"
313
- } = response;
314
- const modifiedAt = new Date(LastModified);
315
- const createdAt = modifiedAt; // Same for S3
316
- const size = parseInt(ContentLength, 10);
317
- return {
318
- createdAt,
319
- modifiedAt,
320
- contentType: ContentType,
321
- size
322
- };
374
+ } = response;
375
+ const modifiedAt = new Date(LastModified);
376
+ const createdAt = modifiedAt; // Same for S3
377
+ const size = parseInt(ContentLength, 10);
378
+ return {
379
+ createdAt,
380
+ modifiedAt,
381
+ contentType: ContentType,
382
+ size
383
+ };
323
384
  };
324
385
  Worker.prototype.stat.metadata = {
325
- options: {
326
- filename: {}
327
- }
386
+ options: {
387
+ filename: {}
388
+ }
328
389
  };
329
390
  export default Worker;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@engine9/input-tools",
3
- "version": "2.0.8",
3
+ "version": "2.0.9",
4
4
  "type": "module",
5
5
  "description": "Tools for dealing with Engine9 inputs",
6
6
  "main": "index.js",
@@ -0,0 +1,152 @@
1
+ ---
2
+ name: timeline
3
+ description: Describes Engine9 timeline file formats (Timeline ID vs Timeline Raw) and how to construct them using only the utilities exported by @engine9/input-tools. Use when creating or transforming timeline-shaped data in plugins or ingestion code.
4
+ ---
5
+
6
+ ## Timeline files in Engine9
7
+
8
+ Use this skill whenever you are:
9
+
10
+ - **Designing timeline outputs** from plugins or ingestion code.
11
+ - **Producing files** that will eventually land in an Engine9 `timeline`-style table (for example via server workers).
12
+
13
+ Engine9 models person-level activity as **timeline entries**. A timeline entry is a single fact about a person at a point in time (email send, open, click, transaction, signup, etc.), identified by:
14
+
15
+ - **`ts`**: timestamp of the event.
16
+ - **`entry_type_id`**: numeric type from `TIMELINE_ENTRY_TYPES` (`input-tools/timelineTypes.js`).
17
+ - **`person_id`**: internal person identifier.
18
+ - **`input_id`**: which input/source this event came from.
19
+ - **`id`**: a deterministic UUID derived from the above (via `getTimelineEntryUUID`).
20
+
21
+ There are **two main on-disk timeline file shapes**:
22
+
23
+ - **Timeline ID files** – already resolved to `person_id` and `id`. These are ready to load into the `timeline` table.
24
+ - **Timeline Raw files** – do **not** contain `person_id`. They must go through person resolution and ID assignment before they can be loaded.
25
+
26
+ ## Timeline ID files
27
+
28
+ **Use when** you want data that is ready to be:
29
+
30
+ - Loaded into a downstream `timeline` table.
31
+ - Joined against plugin-specific detail tables.
32
+ - De-duplicated by `id` (UUID).
33
+
34
+ ### Core shape
35
+
36
+ Timeline ID files are typically produced by:
37
+
38
+ - Plugin or ingestion code that has already resolved a stable `person_id` and `input_id`.
39
+ - Mappers that call `getEntryTypeId` and `getTimelineEntryUUID` from `@engine9/input-tools`.
40
+
41
+ **Minimum required fields** for a Timeline ID file that downstream workers will accept:
42
+
43
+ - **`id`**: UUID for the timeline entry.
44
+ - Generated by `getTimelineEntryUUID`, or provided as a stable `remote_entry_uuid`.
45
+ - In `InputWorker.id`, `appendTimelineId` writes this into the `id` column.
46
+ - **`ts`**: timestamp (string or number) that can be parsed into a `Date`.
47
+ - **`person_id`**: internal numeric person id.
48
+ - **`entry_type_id`**: integer from `TIMELINE_ENTRY_TYPES`.
49
+
50
+ **Common optional fields**:
51
+
52
+ - **`source_code_id`**: numeric source code identifier.
53
+ - **`email_domain`**: lower-cased domain, often derived from `email`.
54
+ - Any number of **extra columns** (detail fields); these can be copied into plugin-specific detail tables.
55
+
56
+ The downstream `timeline` table schema usually includes:
57
+
58
+ - A primary key `id` column (UUID stored as text).
59
+ - A timestamp column `ts` (millis since epoch).
60
+ - Integer columns `entry_type_id` and `person_id`.
61
+ - Optional columns such as `source_code_id` and `email_domain`.
62
+
63
+ ### How to construct Timeline ID files with input-tools
64
+
65
+ When authoring a Timeline ID-producing job or plugin using `@engine9/input-tools`:
66
+
67
+ - **Always include** `id`, `ts`, `person_id`, and `entry_type_id` on each emitted row.
68
+ - **Prefer numeric `entry_type_id`**, but you may also keep a string `entry_type` for debugging; resolution between the two happens via `TIMELINE_ENTRY_TYPES`, `getEntryTypeId`, and `getEntryType`.
69
+ - **Keep `input_id` stable** per logical input stream; `getTimelineEntryUUID` uses it as the UUID namespace when generating `id`.
70
+
71
+ ## Timeline Raw files
72
+
73
+ **Use when** you have raw events from an external system and **cannot yet** assign `person_id` but still want to capture structured activity.
74
+
75
+ Examples:
76
+
77
+ - Raw web or email events that only know an email or other external identifier.
78
+ - Logs from external APIs where person resolution happens later in the pipeline.
79
+
80
+ ### Core shape
81
+
82
+ Timeline Raw files:
83
+
84
+ - **Must not contain** `person_id` (by definition for this skill).
85
+ - **May or may not contain** `id`.
86
+ - If they do contain an `id`, it is usually an external event ID or `remote_entry_uuid`, not necessarily the final Engine9 `id`.
87
+ - **Should contain enough information** to derive:
88
+ - A timestamp: **`ts`** (or a field that is mapped to `ts`).
89
+ - An entry type: **`entry_type`** (string) or **`entry_type_id`** (numeric).
90
+ - A person identifier that can be resolved later (e.g. `remote_person_id`, `email`, or similar).
91
+
92
+ Typical fields you will see:
93
+
94
+ - **`ts`** or a source-specific timestamp (later mapped to `ts`).
95
+ - **`entry_type`** or **`entry_type_id`** (e.g. `'EMAIL_UNSUBSCRIBE'`, `'EMAIL_OPEN'`, etc.).
96
+ - **Contact fields**: `email`, `remote_person_id`, phone number, etc.
97
+ - **Source metadata**: `account_id`, `plugin_id`, `url`, `user_agent`, `ip_address`, etc.
98
+
99
+ For example, a plugin may map an inbound event into a row with:
100
+
101
+ - `ts`, `account_id`, `entry_type_id`, `email`, `email_domain`, `url`, `user_agent`
102
+
103
+ and **no `person_id`** yet.
104
+
105
+ ### Converting Timeline Raw → Timeline ID
106
+
107
+ The usual pathway for Raw → ID is:
108
+
109
+ 1. **Map raw events into a timeline-shaped object** (with `ts`, `entry_type`/`entry_type_id`, and contact info).
110
+ 2. **Resolve people** (outside of input-tools):
111
+ - Use your application’s person resolution or a server worker to:
112
+ - Look up or create `person` rows.
113
+ - Attach a canonical `person_id` to each row.
114
+ 3. **Assign timeline IDs** with input-tools:
115
+ - Use `getEntryTypeId` (if needed) to ensure `entry_type_id` is set from `TIMELINE_ENTRY_TYPES` when only `entry_type` is present.
116
+ - Call `getTimelineEntryUUID` to:
117
+ - Require `ts`, `entry_type_id`, `input_id`, and `person_id`.
118
+ - Produce a deterministic, sortable UUID for `id`.
119
+ 4. **Write out a Timeline ID file** (for example, parquet or CSV) with the full set of fields (`id`, `ts`, `person_id`, `entry_type_id`, optional `source_code_id`, etc.).
120
+
121
+ ## Choosing between Timeline ID and Timeline Raw
122
+
123
+ - **Choose Timeline ID files when**:
124
+ - You can resolve `person_id` and `input_id` in the current process.
125
+ - You want files that are **immediately loadable** into a `timeline` table.
126
+ - You need **deduplication** by a stable `id`.
127
+
128
+ - **Choose Timeline Raw files when**:
129
+ - You are at the **edge of the system** (plugins, collectors, ETL jobs) and only have partial identity information.
130
+ - You plan a **later enrichment step** that will attach `person_id` and compute final `id` values.
131
+ - You want to keep the ingestion simpler and defer canonicalization.
132
+
133
+ In practice:
134
+
135
+ - **Plugins and edge collectors** often emit **Timeline Raw** shaped data first.
136
+ - **Downstream services or server workers** then:
137
+ - Resolve people (`person_id`).
138
+ - Generate `id` via `getTimelineEntryUUID`.
139
+ - Persist **Timeline ID** files and load them into the `timeline` and detail tables.
140
+
141
+ ## Reference helpers
142
+
143
+ When working with any timeline format, prefer the utilities in `@engine9/input-tools`:
144
+
145
+ - **`TIMELINE_ENTRY_TYPES`** (`timelineTypes.js`): bidirectional map between string entry types and numeric `entry_type_id`.
146
+ - **`getEntryTypeId`**: resolve `entry_type` → `entry_type_id` with validation.
147
+ - **`getEntryType`**: resolve `entry_type_id` → `entry_type`.
148
+ - **`getTimelineEntryUUID`**: generate or normalize `id` given `ts`, `entry_type_id`, `input_id`, and `person_id`, respecting `remote_entry_uuid` / `remote_entry_id` when present.
149
+ - **`uuidIsValid`**: validate that a string is a proper UUID.
150
+
151
+ Use these helpers instead of hard-coding IDs or types whenever you construct timeline rows, whether Raw or ID.
152
+