@engine9/input-tools 2.0.8 → 2.0.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/file/FileUtilities.js +54 -2
- package/file/S3.js +321 -256
- package/file/tools.js +14 -0
- package/package.json +1 -1
- package/skills/timeline/SKILL.md +152 -0
package/file/FileUtilities.js
CHANGED
|
@@ -11,12 +11,12 @@ import languageEncoding from 'detect-file-encoding-and-language';
|
|
|
11
11
|
import R2Worker from './R2.js';
|
|
12
12
|
import S3Worker from './S3.js';
|
|
13
13
|
import ParquetWorker from './Parquet.js';
|
|
14
|
-
import { bool, getTempFilename, getStringArray, getTempDir, makeStrings, streamPacket, relativeDate } from './tools.js';
|
|
14
|
+
import { bool, getTempFilename, getStringArray, getTempDir, getFilePostfix, makeStrings, streamPacket, relativeDate } from './tools.js';
|
|
15
15
|
const fsp = fs.promises;
|
|
16
16
|
const { Readable, Transform, PassThrough, Writable } = nodestream;
|
|
17
17
|
const { pipeline } = promises;
|
|
18
18
|
|
|
19
|
-
const debug = debug$0('@engine9
|
|
19
|
+
const debug = debug$0('@engine9/file');
|
|
20
20
|
const { getXlsxStream } = xlstream;
|
|
21
21
|
|
|
22
22
|
function Worker({ accountId }) {
|
|
@@ -657,6 +657,58 @@ Worker.prototype.list.metadata = {
|
|
|
657
657
|
directory: { required: true }
|
|
658
658
|
}
|
|
659
659
|
};
|
|
660
|
+
Worker.prototype.analyze = async function ({ directory }) {
|
|
661
|
+
if (!directory) throw new Error('directory is required');
|
|
662
|
+
if (directory.startsWith('s3://') || directory.startsWith('r2://')) {
|
|
663
|
+
const worker = new (directory.startsWith('r2://') ? R2Worker : S3Worker)(this);
|
|
664
|
+
return worker.analyze({ directory });
|
|
665
|
+
}
|
|
666
|
+
let fileCount = 0;
|
|
667
|
+
let directoryCount = 0;
|
|
668
|
+
let firstModified = null;
|
|
669
|
+
let lastModified = null;
|
|
670
|
+
let firstTime = null;
|
|
671
|
+
let lastTime = null;
|
|
672
|
+
const postfixCounts = Object.create(null);
|
|
673
|
+
const walk = async (dir) => {
|
|
674
|
+
const entries = await fsp.readdir(dir, { withFileTypes: true });
|
|
675
|
+
for (const ent of entries) {
|
|
676
|
+
const fullPath = path.join(dir, ent.name);
|
|
677
|
+
if (ent.isDirectory()) {
|
|
678
|
+
directoryCount += 1;
|
|
679
|
+
await walk(fullPath);
|
|
680
|
+
} else {
|
|
681
|
+
fileCount += 1;
|
|
682
|
+
const postfix = getFilePostfix(fullPath);
|
|
683
|
+
postfixCounts[postfix] = (postfixCounts[postfix] || 0) + 1;
|
|
684
|
+
const stats = await fsp.stat(fullPath);
|
|
685
|
+
const mtime = stats.mtimeMs;
|
|
686
|
+
const modifiedAt = new Date(stats.mtime).toISOString();
|
|
687
|
+
if (firstTime === null || mtime < firstTime) {
|
|
688
|
+
firstTime = mtime;
|
|
689
|
+
firstModified = { filename: fullPath, modifiedAt };
|
|
690
|
+
}
|
|
691
|
+
if (lastTime === null || mtime > lastTime) {
|
|
692
|
+
lastTime = mtime;
|
|
693
|
+
lastModified = { filename: fullPath, modifiedAt };
|
|
694
|
+
}
|
|
695
|
+
}
|
|
696
|
+
}
|
|
697
|
+
};
|
|
698
|
+
await walk(directory);
|
|
699
|
+
return {
|
|
700
|
+
fileCount,
|
|
701
|
+
directoryCount,
|
|
702
|
+
postfixCounts,
|
|
703
|
+
firstModified: fileCount ? firstModified : null,
|
|
704
|
+
lastModified: fileCount ? lastModified : null
|
|
705
|
+
};
|
|
706
|
+
};
|
|
707
|
+
Worker.prototype.analyze.metadata = {
|
|
708
|
+
options: {
|
|
709
|
+
directory: { required: true }
|
|
710
|
+
}
|
|
711
|
+
};
|
|
660
712
|
Worker.prototype.listAll = async function ({ directory, start: s, end: e }) {
|
|
661
713
|
if (!directory) throw new Error('directory is required');
|
|
662
714
|
let start = null;
|
package/file/S3.js
CHANGED
|
@@ -1,308 +1,373 @@
|
|
|
1
|
-
import debug$0 from
|
|
2
|
-
import fs from
|
|
3
|
-
import withDb from
|
|
4
|
-
import clientS3 from
|
|
5
|
-
import { getTempFilename, relativeDate } from
|
|
6
|
-
const debug = debug$0('@engine9
|
|
1
|
+
import debug$0 from 'debug';
|
|
2
|
+
import fs from 'node:fs';
|
|
3
|
+
import withDb from 'mime-type/with-db';
|
|
4
|
+
import clientS3 from '@aws-sdk/client-s3';
|
|
5
|
+
import { getTempFilename, getFilePostfix, relativeDate } from './tools.js';
|
|
6
|
+
const debug = debug$0('@engine9/input/S3');
|
|
7
7
|
const { mimeType: mime } = withDb;
|
|
8
|
-
const {
|
|
8
|
+
const {
|
|
9
|
+
S3Client,
|
|
10
|
+
CopyObjectCommand,
|
|
11
|
+
DeleteObjectCommand,
|
|
12
|
+
GetObjectCommand,
|
|
13
|
+
HeadObjectCommand,
|
|
14
|
+
GetObjectAttributesCommand,
|
|
15
|
+
PutObjectCommand,
|
|
16
|
+
ListObjectsV2Command
|
|
17
|
+
} = clientS3;
|
|
9
18
|
function Worker() {
|
|
10
|
-
|
|
19
|
+
this.prefix = 's3';
|
|
11
20
|
}
|
|
12
21
|
function getParts(filename) {
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
return { Bucket, Key };
|
|
22
|
+
if (!filename) throw new Error(`Invalid filename: ${filename}`);
|
|
23
|
+
if (!filename.startsWith('r2://') && !filename.startsWith('s3://')) {
|
|
24
|
+
throw new Error(`Invalid filename, must start with r2:// or s3://: ${filename}`);
|
|
25
|
+
}
|
|
26
|
+
const parts = filename.split('/');
|
|
27
|
+
const Bucket = parts[2];
|
|
28
|
+
const Key = parts.slice(3).join('/');
|
|
29
|
+
return { Bucket, Key };
|
|
22
30
|
}
|
|
23
31
|
Worker.prototype.getClient = function () {
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
return this.client;
|
|
32
|
+
if (!this.client) this.client = new S3Client({});
|
|
33
|
+
return this.client;
|
|
27
34
|
};
|
|
28
35
|
Worker.prototype.getMetadata = async function ({ filename }) {
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
36
|
+
const s3Client = this.getClient();
|
|
37
|
+
const { Bucket, Key } = getParts(filename);
|
|
38
|
+
const resp = await s3Client.send(
|
|
39
|
+
new GetObjectAttributesCommand({
|
|
40
|
+
Bucket,
|
|
41
|
+
Key,
|
|
42
|
+
ObjectAttributes: ['ETag', 'Checksum', 'ObjectParts', 'StorageClass', 'ObjectSize']
|
|
43
|
+
})
|
|
44
|
+
);
|
|
45
|
+
return resp;
|
|
37
46
|
};
|
|
38
47
|
Worker.prototype.getMetadata.metadata = {
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
48
|
+
options: {
|
|
49
|
+
filename: {}
|
|
50
|
+
}
|
|
42
51
|
};
|
|
43
52
|
Worker.prototype.stream = async function ({ filename }) {
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
}
|
|
53
|
+
const s3Client = this.getClient();
|
|
54
|
+
const { Bucket, Key } = getParts(filename);
|
|
55
|
+
const command = new GetObjectCommand({ Bucket, Key });
|
|
56
|
+
try {
|
|
57
|
+
debug(`Streaming file s3://${Bucket}/${Key}`);
|
|
58
|
+
const response = await s3Client.send(command);
|
|
59
|
+
return { stream: response.Body };
|
|
60
|
+
} catch (e) {
|
|
61
|
+
debug(`Could not stream filename:${filename}`);
|
|
62
|
+
throw e;
|
|
63
|
+
}
|
|
56
64
|
};
|
|
57
65
|
Worker.prototype.stream.metadata = {
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
66
|
+
options: {
|
|
67
|
+
filename: {}
|
|
68
|
+
}
|
|
61
69
|
};
|
|
62
70
|
Worker.prototype.copy = async function ({ filename, target }) {
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
return s3Client.send(command);
|
|
71
|
+
if (filename.startsWith('s3://') || filename.startsWith('r2://')) {
|
|
72
|
+
//we're fine
|
|
73
|
+
} else {
|
|
74
|
+
throw new Error('Cowardly not copying a file not from s3 -- use put instead');
|
|
75
|
+
}
|
|
76
|
+
const s3Client = this.getClient();
|
|
77
|
+
const { Bucket, Key } = getParts(target);
|
|
78
|
+
debug(`Copying ${filename} to ${JSON.stringify({ Bucket, Key })}}`);
|
|
79
|
+
const command = new CopyObjectCommand({
|
|
80
|
+
CopySource: filename.slice(4), // remove the s3:/
|
|
81
|
+
Bucket,
|
|
82
|
+
Key
|
|
83
|
+
});
|
|
84
|
+
return s3Client.send(command);
|
|
78
85
|
};
|
|
79
86
|
Worker.prototype.copy.metadata = {
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
87
|
+
options: {
|
|
88
|
+
filename: {},
|
|
89
|
+
target: {}
|
|
90
|
+
}
|
|
84
91
|
};
|
|
85
92
|
Worker.prototype.move = async function ({ filename, target }) {
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
93
|
+
await this.copy({ filename, target });
|
|
94
|
+
await this.remove({ filename });
|
|
95
|
+
return { filename: target };
|
|
89
96
|
};
|
|
90
97
|
Worker.prototype.move.metadata = {
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
98
|
+
options: {
|
|
99
|
+
filename: {},
|
|
100
|
+
target: {}
|
|
101
|
+
}
|
|
95
102
|
};
|
|
96
103
|
Worker.prototype.remove = async function ({ filename }) {
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
104
|
+
const s3Client = this.getClient();
|
|
105
|
+
const { Bucket, Key } = getParts(filename);
|
|
106
|
+
const command = new DeleteObjectCommand({ Bucket, Key });
|
|
107
|
+
return s3Client.send(command);
|
|
101
108
|
};
|
|
102
109
|
Worker.prototype.remove.metadata = {
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
110
|
+
options: {
|
|
111
|
+
filename: {}
|
|
112
|
+
}
|
|
106
113
|
};
|
|
107
114
|
Worker.prototype.download = async function ({ filename }) {
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
});
|
|
122
|
-
fileStream.on('error', reject);
|
|
115
|
+
const file = filename.split('/').pop();
|
|
116
|
+
const localPath = await getTempFilename({ targetFilename: file });
|
|
117
|
+
const s3Client = this.getClient();
|
|
118
|
+
const { Bucket, Key } = getParts(filename);
|
|
119
|
+
const command = new GetObjectCommand({ Bucket, Key });
|
|
120
|
+
debug(`Downloading ${file} to ${localPath}`);
|
|
121
|
+
const response = await s3Client.send(command);
|
|
122
|
+
const fileStream = fs.createWriteStream(localPath);
|
|
123
|
+
response.Body.pipe(fileStream);
|
|
124
|
+
return new Promise((resolve, reject) => {
|
|
125
|
+
fileStream.on('finish', async () => {
|
|
126
|
+
const { size } = await fs.promises.stat(localPath);
|
|
127
|
+
resolve({ size, filename: localPath });
|
|
123
128
|
});
|
|
129
|
+
fileStream.on('error', reject);
|
|
130
|
+
});
|
|
124
131
|
};
|
|
125
132
|
Worker.prototype.download.metadata = {
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
133
|
+
options: {
|
|
134
|
+
filename: {}
|
|
135
|
+
}
|
|
129
136
|
};
|
|
130
137
|
Worker.prototype.put = async function (options) {
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
return s3Client.send(command);
|
|
138
|
+
const { filename, directory } = options;
|
|
139
|
+
if (!filename) throw new Error('Local filename required');
|
|
140
|
+
if (directory?.indexOf('s3://') !== 0 && directory?.indexOf('r2://') !== 0)
|
|
141
|
+
throw new Error(`directory path must start with s3:// or r2://, is ${directory}`);
|
|
142
|
+
const file = options.file || filename.split('/').pop();
|
|
143
|
+
const parts = directory.split('/');
|
|
144
|
+
const Bucket = parts[2];
|
|
145
|
+
const Key = parts.slice(3).filter(Boolean).concat(file).join('/');
|
|
146
|
+
const Body = fs.createReadStream(filename);
|
|
147
|
+
const ContentType = mime.lookup(file);
|
|
148
|
+
debug(`Putting ${filename} to ${JSON.stringify({ Bucket, Key, ContentType })}}`);
|
|
149
|
+
const s3Client = this.getClient();
|
|
150
|
+
const command = new PutObjectCommand({
|
|
151
|
+
Bucket,
|
|
152
|
+
Key,
|
|
153
|
+
Body,
|
|
154
|
+
ContentType
|
|
155
|
+
});
|
|
156
|
+
return s3Client.send(command);
|
|
151
157
|
};
|
|
152
158
|
Worker.prototype.put.metadata = {
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
159
|
+
options: {
|
|
160
|
+
filename: {},
|
|
161
|
+
directory: { description: 'Directory to put file, e.g. s3://foo-bar/dir/xyz' },
|
|
162
|
+
file: { description: 'Name of file, defaults to the filename' }
|
|
163
|
+
}
|
|
158
164
|
};
|
|
159
165
|
Worker.prototype.write = async function (options) {
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
return s3Client.send(command);
|
|
166
|
+
const { directory, file, content } = options;
|
|
167
|
+
if (!directory?.indexOf('s3://') === 0) throw new Error('directory must start with s3://');
|
|
168
|
+
const parts = directory.split('/');
|
|
169
|
+
const Bucket = parts[2];
|
|
170
|
+
const Key = parts.slice(3).filter(Boolean).concat(file).join('/');
|
|
171
|
+
const Body = content;
|
|
172
|
+
debug(`Writing content of length ${content.length} to ${JSON.stringify({ Bucket, Key })}}`);
|
|
173
|
+
const s3Client = this.getClient();
|
|
174
|
+
const ContentType = mime.lookup(file);
|
|
175
|
+
const command = new PutObjectCommand({
|
|
176
|
+
Bucket,
|
|
177
|
+
Key,
|
|
178
|
+
Body,
|
|
179
|
+
ContentType
|
|
180
|
+
});
|
|
181
|
+
return s3Client.send(command);
|
|
177
182
|
};
|
|
178
183
|
Worker.prototype.write.metadata = {
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
+
options: {
|
|
185
|
+
directory: { description: 'Directory to put file, e.g. s3://foo-bar/dir/xyz' },
|
|
186
|
+
file: { description: 'Name of file, defaults to the filename' },
|
|
187
|
+
content: { description: 'Contents of file' }
|
|
188
|
+
}
|
|
184
189
|
};
|
|
185
190
|
Worker.prototype.list = async function ({ directory, start, end, raw }) {
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
const output = []
|
|
203
|
-
.concat((CommonPrefixes || []).map((f) => ({
|
|
191
|
+
if (!directory) throw new Error('directory is required');
|
|
192
|
+
let dir = directory;
|
|
193
|
+
while (dir.slice(-1) === '/') dir = dir.slice(0, -1);
|
|
194
|
+
const { Bucket, Key: Prefix } = getParts(dir);
|
|
195
|
+
const s3Client = this.getClient();
|
|
196
|
+
const command = new ListObjectsV2Command({
|
|
197
|
+
Bucket,
|
|
198
|
+
Prefix: `${Prefix}/`,
|
|
199
|
+
Delimiter: '/'
|
|
200
|
+
});
|
|
201
|
+
const { Contents: files, CommonPrefixes } = await s3Client.send(command);
|
|
202
|
+
if (raw) return files;
|
|
203
|
+
// debug('Prefixes:', { CommonPrefixes });
|
|
204
|
+
const output = []
|
|
205
|
+
.concat(
|
|
206
|
+
(CommonPrefixes || []).map((f) => ({
|
|
204
207
|
name: f.Prefix.slice(Prefix.length + 1, -1),
|
|
205
208
|
type: 'directory'
|
|
206
|
-
|
|
207
|
-
|
|
209
|
+
}))
|
|
210
|
+
)
|
|
211
|
+
.concat(
|
|
212
|
+
(files || [])
|
|
208
213
|
.filter(({ LastModified }) => {
|
|
209
|
-
|
|
214
|
+
if (start && new Date(LastModified) < start) {
|
|
210
215
|
return false;
|
|
211
|
-
|
|
212
|
-
else if (end && new Date(LastModified) > end) {
|
|
216
|
+
} else if (end && new Date(LastModified) > end) {
|
|
213
217
|
return false;
|
|
214
|
-
|
|
215
|
-
else {
|
|
218
|
+
} else {
|
|
216
219
|
return true;
|
|
217
|
-
|
|
218
|
-
|
|
220
|
+
}
|
|
221
|
+
})
|
|
219
222
|
.map(({ Key, Size, LastModified }) => ({
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
223
|
+
name: Key.slice(Prefix.length + 1),
|
|
224
|
+
type: 'file',
|
|
225
|
+
size: Size,
|
|
226
|
+
modifiedAt: new Date(LastModified).toISOString()
|
|
227
|
+
}))
|
|
228
|
+
);
|
|
229
|
+
return output;
|
|
226
230
|
};
|
|
227
231
|
Worker.prototype.list.metadata = {
|
|
228
|
-
|
|
229
|
-
|
|
232
|
+
options: {
|
|
233
|
+
directory: { required: true }
|
|
234
|
+
}
|
|
235
|
+
};
|
|
236
|
+
Worker.prototype.analyze = async function ({ directory }) {
|
|
237
|
+
if (!directory) throw new Error('directory is required');
|
|
238
|
+
let dir = directory;
|
|
239
|
+
while (dir.slice(-1) === '/') dir = dir.slice(0, -1);
|
|
240
|
+
const { Bucket, Key } = getParts(dir);
|
|
241
|
+
const s3Client = this.getClient();
|
|
242
|
+
let Prefix = '';
|
|
243
|
+
if (Key) Prefix = `${Key}/`;
|
|
244
|
+
const dirsSeen = new Set();
|
|
245
|
+
let fileCount = 0;
|
|
246
|
+
let firstModified = null;
|
|
247
|
+
let lastModified = null;
|
|
248
|
+
let firstTime = null;
|
|
249
|
+
let lastTime = null;
|
|
250
|
+
const postfixCounts = Object.create(null);
|
|
251
|
+
let ContinuationToken = undefined;
|
|
252
|
+
do {
|
|
253
|
+
const result = await s3Client.send(
|
|
254
|
+
new ListObjectsV2Command({
|
|
255
|
+
Bucket,
|
|
256
|
+
Prefix,
|
|
257
|
+
ContinuationToken
|
|
258
|
+
})
|
|
259
|
+
);
|
|
260
|
+
for (const content of result.Contents || []) {
|
|
261
|
+
const objectKey = content.Key;
|
|
262
|
+
let rel = Prefix ? (objectKey.startsWith(Prefix) ? objectKey.slice(Prefix.length) : objectKey) : objectKey;
|
|
263
|
+
if (!rel) continue;
|
|
264
|
+
const isFolderMarker = rel.endsWith('/');
|
|
265
|
+
const parts = rel.replace(/\/$/, '').split('/').filter(Boolean);
|
|
266
|
+
for (let i = 0; i < parts.length - 1; i++) {
|
|
267
|
+
dirsSeen.add(parts.slice(0, i + 1).join('/'));
|
|
268
|
+
}
|
|
269
|
+
if (isFolderMarker) {
|
|
270
|
+
if (parts.length) dirsSeen.add(parts.join('/'));
|
|
271
|
+
continue;
|
|
272
|
+
}
|
|
273
|
+
fileCount++;
|
|
274
|
+
const postfix = getFilePostfix(objectKey);
|
|
275
|
+
postfixCounts[postfix] = (postfixCounts[postfix] || 0) + 1;
|
|
276
|
+
const mtime = new Date(content.LastModified).getTime();
|
|
277
|
+
const modifiedAt = new Date(content.LastModified).toISOString();
|
|
278
|
+
const filename = `${this.prefix}://${Bucket}/${objectKey}`;
|
|
279
|
+
if (firstTime === null || mtime < firstTime) {
|
|
280
|
+
firstTime = mtime;
|
|
281
|
+
firstModified = { filename, modifiedAt };
|
|
282
|
+
}
|
|
283
|
+
if (lastTime === null || mtime > lastTime) {
|
|
284
|
+
lastTime = mtime;
|
|
285
|
+
lastModified = { filename, modifiedAt };
|
|
286
|
+
}
|
|
230
287
|
}
|
|
288
|
+
ContinuationToken = result.IsTruncated ? result.NextContinuationToken : undefined;
|
|
289
|
+
} while (ContinuationToken);
|
|
290
|
+
return {
|
|
291
|
+
fileCount,
|
|
292
|
+
directoryCount: dirsSeen.size,
|
|
293
|
+
postfixCounts,
|
|
294
|
+
firstModified: fileCount ? firstModified : null,
|
|
295
|
+
lastModified: fileCount ? lastModified : null
|
|
296
|
+
};
|
|
297
|
+
};
|
|
298
|
+
Worker.prototype.analyze.metadata = {
|
|
299
|
+
options: {
|
|
300
|
+
directory: { required: true }
|
|
301
|
+
}
|
|
231
302
|
};
|
|
232
303
|
/* List everything with the prefix */
|
|
233
304
|
Worker.prototype.listAll = async function (options) {
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
files.push(...newFiles);
|
|
271
|
-
ContinuationToken = result.NextContinuationToken;
|
|
272
|
-
} while (ContinuationToken);
|
|
273
|
-
return files;
|
|
305
|
+
const { directory } = options;
|
|
306
|
+
if (!directory) throw new Error('directory is required');
|
|
307
|
+
let dir = directory;
|
|
308
|
+
const start = options.start && relativeDate(options.start);
|
|
309
|
+
const end = options.end && relativeDate(options.end);
|
|
310
|
+
while (dir.slice(-1) === '/') dir = dir.slice(0, -1);
|
|
311
|
+
const { Bucket, Key } = getParts(dir);
|
|
312
|
+
const s3Client = this.getClient();
|
|
313
|
+
const files = [];
|
|
314
|
+
let ContinuationToken = null;
|
|
315
|
+
let Prefix = null;
|
|
316
|
+
if (Key) Prefix = `${Key}/`;
|
|
317
|
+
do {
|
|
318
|
+
const command = new ListObjectsV2Command({
|
|
319
|
+
Bucket,
|
|
320
|
+
Prefix,
|
|
321
|
+
ContinuationToken
|
|
322
|
+
// Delimiter: '/',
|
|
323
|
+
});
|
|
324
|
+
debug(`Sending List command with prefix ${Prefix} with ContinuationToken ${ContinuationToken}`);
|
|
325
|
+
const result = await s3Client.send(command);
|
|
326
|
+
const newFiles =
|
|
327
|
+
result.Contents?.filter(({ LastModified }) => {
|
|
328
|
+
if (start && new Date(LastModified) < start) {
|
|
329
|
+
return false;
|
|
330
|
+
} else if (end && new Date(LastModified) > end) {
|
|
331
|
+
return false;
|
|
332
|
+
} else {
|
|
333
|
+
return true;
|
|
334
|
+
}
|
|
335
|
+
})?.map((d) => `${this.prefix}://${Bucket}/${d.Key}`) || [];
|
|
336
|
+
debug(`Retrieved ${newFiles.length} new files, total ${files.length},sample ${newFiles.slice(0, 3).join(',')}`);
|
|
337
|
+
files.push(...newFiles);
|
|
338
|
+
ContinuationToken = result.NextContinuationToken;
|
|
339
|
+
} while (ContinuationToken);
|
|
340
|
+
return files;
|
|
274
341
|
};
|
|
275
342
|
Worker.prototype.listAll.metadata = {
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
343
|
+
options: {
|
|
344
|
+
directory: { required: true }
|
|
345
|
+
}
|
|
279
346
|
};
|
|
280
347
|
Worker.prototype.moveAll = async function ({ directory, targetDirectory }) {
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
return Promise.all(configs.map(({ filename, target }) => limitedMethod(async () => this.move({ filename, target }))));
|
|
348
|
+
if (!directory || !targetDirectory) throw new Error('directory and targetDirectory required');
|
|
349
|
+
const files = await this.listAll({ directory });
|
|
350
|
+
const configs = files.map((d) => ({
|
|
351
|
+
filename: d,
|
|
352
|
+
target: d.replace(directory, targetDirectory)
|
|
353
|
+
}));
|
|
354
|
+
const pLimit = await import('p-limit');
|
|
355
|
+
const limitedMethod = pLimit.default(10);
|
|
356
|
+
return Promise.all(configs.map(({ filename, target }) => limitedMethod(async () => this.move({ filename, target }))));
|
|
291
357
|
};
|
|
292
358
|
Worker.prototype.moveAll.metadata = {
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
359
|
+
options: {
|
|
360
|
+
directory: { required: true },
|
|
361
|
+
targetDirectory: { required: true }
|
|
362
|
+
}
|
|
297
363
|
};
|
|
298
364
|
Worker.prototype.stat = async function ({ filename }) {
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
const {
|
|
365
|
+
if (!filename) throw new Error('filename is required');
|
|
366
|
+
const s3Client = this.getClient();
|
|
367
|
+
const { Bucket, Key } = getParts(filename);
|
|
368
|
+
const command = new HeadObjectCommand({ Bucket, Key });
|
|
369
|
+
const response = await s3Client.send(command);
|
|
370
|
+
const {
|
|
306
371
|
// "AcceptRanges": "bytes",
|
|
307
372
|
ContentLength, // : "3191",
|
|
308
373
|
ContentType, // : "image/jpeg",
|
|
@@ -310,20 +375,20 @@ Worker.prototype.stat = async function ({ filename }) {
|
|
|
310
375
|
LastModified // : "2016-12-15T01:19:41.000Z",
|
|
311
376
|
// Metadata": {},
|
|
312
377
|
// VersionId": "null"
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
378
|
+
} = response;
|
|
379
|
+
const modifiedAt = new Date(LastModified);
|
|
380
|
+
const createdAt = modifiedAt; // Same for S3
|
|
381
|
+
const size = parseInt(ContentLength, 10);
|
|
382
|
+
return {
|
|
383
|
+
createdAt,
|
|
384
|
+
modifiedAt,
|
|
385
|
+
contentType: ContentType,
|
|
386
|
+
size
|
|
387
|
+
};
|
|
323
388
|
};
|
|
324
389
|
Worker.prototype.stat.metadata = {
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
390
|
+
options: {
|
|
391
|
+
filename: {}
|
|
392
|
+
}
|
|
328
393
|
};
|
|
329
394
|
export default Worker;
|
package/file/tools.js
CHANGED
|
@@ -300,6 +300,18 @@ function makeStrings(o) {
|
|
|
300
300
|
return a;
|
|
301
301
|
}, {});
|
|
302
302
|
}
|
|
303
|
+
/** Basename postfix with leading dot, e.g. `.txt`, `.csv.gz`; empty string if none. */
|
|
304
|
+
function getFilePostfix(filename) {
|
|
305
|
+
const base = path.basename(filename).toLowerCase();
|
|
306
|
+
if (!base || !base.includes('.')) return '';
|
|
307
|
+
if (base.endsWith('.gz')) {
|
|
308
|
+
const withoutGz = base.slice(0, -3);
|
|
309
|
+
const i = withoutGz.lastIndexOf('.');
|
|
310
|
+
if (i >= 0) return `${withoutGz.slice(i)}.gz`;
|
|
311
|
+
return '.gz';
|
|
312
|
+
}
|
|
313
|
+
return base.slice(base.lastIndexOf('.'));
|
|
314
|
+
}
|
|
303
315
|
function appendPostfix(filename, postfix) {
|
|
304
316
|
const filenameParts = filename.split('/');
|
|
305
317
|
const fileParts = filenameParts
|
|
@@ -331,6 +343,7 @@ export { getTempDir };
|
|
|
331
343
|
export { getBatchTransform };
|
|
332
344
|
export { getDebatchTransform };
|
|
333
345
|
export { getFile };
|
|
346
|
+
export { getFilePostfix };
|
|
334
347
|
export { getManifest };
|
|
335
348
|
export { getPacketFiles };
|
|
336
349
|
export { getStringArray };
|
|
@@ -348,6 +361,7 @@ export default {
|
|
|
348
361
|
getBatchTransform,
|
|
349
362
|
getDebatchTransform,
|
|
350
363
|
getFile,
|
|
364
|
+
getFilePostfix,
|
|
351
365
|
getManifest,
|
|
352
366
|
getPacketFiles,
|
|
353
367
|
getStringArray,
|
package/skills/timeline/SKILL.md
ADDED
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: timeline
|
|
3
|
+
description: Describes Engine9 timeline file formats (Timeline ID vs Timeline Raw) and how to construct them using only the utilities exported by @engine9/input-tools. Use when creating or transforming timeline-shaped data in plugins or ingestion code.
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
## Timeline files in Engine9
|
|
7
|
+
|
|
8
|
+
Use this skill whenever you are:
|
|
9
|
+
|
|
10
|
+
- **Designing timeline outputs** from plugins or ingestion code.
|
|
11
|
+
- **Producing files** that will eventually land in an Engine9 `timeline`-style table (for example via server workers).
|
|
12
|
+
|
|
13
|
+
Engine9 models person-level activity as **timeline entries**. A timeline entry is a single fact about a person at a point in time (email send, open, click, transaction, signup, etc.), identified by:
|
|
14
|
+
|
|
15
|
+
- **`ts`**: timestamp of the event.
|
|
16
|
+
- **`entry_type_id`**: numeric type from `TIMELINE_ENTRY_TYPES` (`input-tools/timelineTypes.js`).
|
|
17
|
+
- **`person_id`**: internal person identifier.
|
|
18
|
+
- **`input_id`**: which input/source this event came from.
|
|
19
|
+
- **`id`**: a deterministic UUID derived from the above (via `getTimelineEntryUUID`).
|
|
20
|
+
|
|
21
|
+
There are **two main on-disk timeline file shapes**:
|
|
22
|
+
|
|
23
|
+
- **Timeline ID files** – already resolved to `person_id` and `id`. These are ready to load into the `timeline` table.
|
|
24
|
+
- **Timeline Raw files** – do **not** contain `person_id`. They must go through person resolution and ID assignment before they can be loaded.
|
|
25
|
+
|
|
26
|
+
## Timeline ID files
|
|
27
|
+
|
|
28
|
+
**Use when** you want data that is ready to be:
|
|
29
|
+
|
|
30
|
+
- Loaded into a downstream `timeline` table.
|
|
31
|
+
- Joined against plugin-specific detail tables.
|
|
32
|
+
- De-duplicated by `id` (UUID).
|
|
33
|
+
|
|
34
|
+
### Core shape
|
|
35
|
+
|
|
36
|
+
Timeline ID files are typically produced by:
|
|
37
|
+
|
|
38
|
+
- Plugin or ingestion code that has already resolved a stable `person_id` and `input_id`.
|
|
39
|
+
- Mappers that call `getEntryTypeId` and `getTimelineEntryUUID` from `@engine9/input-tools`.
|
|
40
|
+
|
|
41
|
+
**Minimum required fields** for a Timeline ID file that downstream workers will accept:
|
|
42
|
+
|
|
43
|
+
- **`id`**: UUID for the timeline entry.
|
|
44
|
+
- Generated by `getTimelineEntryUUID`, or provided as a stable `remote_entry_uuid`.
|
|
45
|
+
- In `InputWorker.id`, `appendTimelineId` writes this into the `id` column.
|
|
46
|
+
- **`ts`**: timestamp (string or number) that can be parsed into a `Date`.
|
|
47
|
+
- **`person_id`**: internal numeric person id.
|
|
48
|
+
- **`entry_type_id`**: integer from `TIMELINE_ENTRY_TYPES`.
|
|
49
|
+
|
|
50
|
+
**Common optional fields**:
|
|
51
|
+
|
|
52
|
+
- **`source_code_id`**: numeric source code identifier.
|
|
53
|
+
- **`email_domain`**: lower-cased domain, often derived from `email`.
|
|
54
|
+
- Any number of **extra columns** (detail fields); these can be copied into plugin-specific detail tables.
|
|
55
|
+
|
|
56
|
+
The downstream `timeline` table schema usually includes:
|
|
57
|
+
|
|
58
|
+
- A primary key `id` column (UUID stored as text).
|
|
59
|
+
- A timestamp column `ts` (millis since epoch).
|
|
60
|
+
- Integer columns `entry_type_id` and `person_id`.
|
|
61
|
+
- Optional columns such as `source_code_id` and `email_domain`.
|
|
62
|
+
|
|
63
|
+
### How to construct Timeline ID files with input-tools
|
|
64
|
+
|
|
65
|
+
When authoring a Timeline ID-producing job or plugin using `@engine9/input-tools`:
|
|
66
|
+
|
|
67
|
+
- **Always include** `id`, `ts`, `person_id`, and `entry_type_id` on each emitted row.
|
|
68
|
+
- **Prefer numeric `entry_type_id`**, but you may also keep a string `entry_type` for debugging; resolution between the two happens via `TIMELINE_ENTRY_TYPES`, `getEntryTypeId`, and `getEntryType`.
|
|
69
|
+
- **Keep `input_id` stable** per logical input stream; `getTimelineEntryUUID` uses it as the UUID namespace when generating `id`.
|
|
70
|
+
|
|
71
|
+
## Timeline Raw files
|
|
72
|
+
|
|
73
|
+
**Use when** you have raw events from an external system and **cannot yet** assign `person_id` but still want to capture structured activity.
|
|
74
|
+
|
|
75
|
+
Examples:
|
|
76
|
+
|
|
77
|
+
- Raw web or email events that only know an email or other external identifier.
|
|
78
|
+
- Logs from external APIs where person resolution happens later in the pipeline.
|
|
79
|
+
|
|
80
|
+
### Core shape
|
|
81
|
+
|
|
82
|
+
Timeline Raw files:
|
|
83
|
+
|
|
84
|
+
- **Must not contain** `person_id` (by definition for this skill).
|
|
85
|
+
- **May or may not contain** `id`.
|
|
86
|
+
- If they do contain an `id`, it is usually an external event ID or `remote_entry_uuid`, not necessarily the final Engine9 `id`.
|
|
87
|
+
- **Should contain enough information** to derive:
|
|
88
|
+
- A timestamp: **`ts`** (or a field that is mapped to `ts`).
|
|
89
|
+
- An entry type: **`entry_type`** (string) or **`entry_type_id`** (numeric).
|
|
90
|
+
- A person identifier that can be resolved later (e.g. `remote_person_id`, `email`, or similar).
|
|
91
|
+
|
|
92
|
+
Typical fields you will see:
|
|
93
|
+
|
|
94
|
+
- **`ts`** or a source-specific timestamp (later mapped to `ts`).
|
|
95
|
+
- **`entry_type`** or **`entry_type_id`** (e.g. `'EMAIL_UNSUBSCRIBE'`, `'EMAIL_OPEN'`, etc.).
|
|
96
|
+
- **Contact fields**: `email`, `remote_person_id`, phone number, etc.
|
|
97
|
+
- **Source metadata**: `account_id`, `plugin_id`, `url`, `user_agent`, `ip_address`, etc.
|
|
98
|
+
|
|
99
|
+
For example, a plugin may map an inbound event into a row with:
|
|
100
|
+
|
|
101
|
+
- `ts`, `account_id`, `entry_type_id`, `email`, `email_domain`, `url`, `user_agent`
|
|
102
|
+
|
|
103
|
+
and **no `person_id`** yet.
|
|
104
|
+
|
|
105
|
+
### Converting Timeline Raw → Timeline ID
|
|
106
|
+
|
|
107
|
+
The usual pathway for Raw → ID is:
|
|
108
|
+
|
|
109
|
+
1. **Map raw events into a timeline-shaped object** (with `ts`, `entry_type`/`entry_type_id`, and contact info).
|
|
110
|
+
2. **Resolve people** (outside of input-tools):
|
|
111
|
+
- Use your application’s person resolution or a server worker to:
|
|
112
|
+
- Look up or create `person` rows.
|
|
113
|
+
- Attach a canonical `person_id` to each row.
|
|
114
|
+
3. **Assign timeline IDs** with input-tools:
|
|
115
|
+
- Use `getEntryTypeId` (if needed) to ensure `entry_type_id` is set from `TIMELINE_ENTRY_TYPES` when only `entry_type` is present.
|
|
116
|
+
- Call `getTimelineEntryUUID` to:
|
|
117
|
+
- Require `ts`, `entry_type_id`, `input_id`, and `person_id`.
|
|
118
|
+
- Produce a deterministic, sortable UUID for `id`.
|
|
119
|
+
4. **Write out a Timeline ID file** (for example, parquet or CSV) with the full set of fields (`id`, `ts`, `person_id`, `entry_type_id`, optional `source_code_id`, etc.).
|
|
120
|
+
|
|
121
|
+
## Choosing between Timeline ID and Timeline Raw
|
|
122
|
+
|
|
123
|
+
- **Choose Timeline ID files when**:
|
|
124
|
+
- You can resolve `person_id` and `input_id` in the current process.
|
|
125
|
+
- You want files that are **immediately loadable** into a `timeline` table.
|
|
126
|
+
- You need **deduplication** by a stable `id`.
|
|
127
|
+
|
|
128
|
+
- **Choose Timeline Raw files when**:
|
|
129
|
+
- You are at the **edge of the system** (plugins, collectors, ETL jobs) and only have partial identity information.
|
|
130
|
+
- You plan a **later enrichment step** that will attach `person_id` and compute final `id` values.
|
|
131
|
+
- You want to keep the ingestion simpler and defer canonicalization.
|
|
132
|
+
|
|
133
|
+
In practice:
|
|
134
|
+
|
|
135
|
+
- **Plugins and edge collectors** often emit **Timeline Raw** shaped data first.
|
|
136
|
+
- **Downstream services or server workers** then:
|
|
137
|
+
- Resolve people (`person_id`).
|
|
138
|
+
- Generate `id` via `getTimelineEntryUUID`.
|
|
139
|
+
- Persist **Timeline ID** files and load them into the `timeline` and detail tables.
|
|
140
|
+
|
|
141
|
+
## Reference helpers
|
|
142
|
+
|
|
143
|
+
When working with any timeline format, prefer the utilities in `@engine9/input-tools`:
|
|
144
|
+
|
|
145
|
+
- **`TIMELINE_ENTRY_TYPES`** (`timelineTypes.js`): bidirectional map between string entry types and numeric `entry_type_id`.
|
|
146
|
+
- **`getEntryTypeId`**: resolve `entry_type` → `entry_type_id` with validation.
|
|
147
|
+
- **`getEntryType`**: resolve `entry_type_id` → `entry_type`.
|
|
148
|
+
- **`getTimelineEntryUUID`**: generate or normalize `id` given `ts`, `entry_type_id`, `input_id`, and `person_id`, respecting `remote_entry_uuid` / `remote_entry_id` when present.
|
|
149
|
+
- **`uuidIsValid`**: validate that a string is a proper UUID.
|
|
150
|
+
|
|
151
|
+
Use these helpers instead of hard-coding IDs or types whenever you construct timeline rows, whether Raw or ID.
|
|
152
|
+
|