@engine9/input-tools 2.0.9 → 2.0.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -11,7 +11,7 @@ import languageEncoding from 'detect-file-encoding-and-language';
11
11
  import R2Worker from './R2.js';
12
12
  import S3Worker from './S3.js';
13
13
  import ParquetWorker from './Parquet.js';
14
- import { bool, getTempFilename, getStringArray, getTempDir, makeStrings, streamPacket, relativeDate } from './tools.js';
14
+ import { bool, getTempFilename, getStringArray, getTempDir, getFilePostfix, makeStrings, streamPacket, relativeDate } from './tools.js';
15
15
  const fsp = fs.promises;
16
16
  const { Readable, Transform, PassThrough, Writable } = nodestream;
17
17
  const { pipeline } = promises;
@@ -669,6 +669,7 @@ Worker.prototype.analyze = async function ({ directory }) {
669
669
  let lastModified = null;
670
670
  let firstTime = null;
671
671
  let lastTime = null;
672
+ const postfixCounts = Object.create(null);
672
673
  const walk = async (dir) => {
673
674
  const entries = await fsp.readdir(dir, { withFileTypes: true });
674
675
  for (const ent of entries) {
@@ -678,6 +679,8 @@ Worker.prototype.analyze = async function ({ directory }) {
678
679
  await walk(fullPath);
679
680
  } else {
680
681
  fileCount += 1;
682
+ const postfix = getFilePostfix(fullPath);
683
+ postfixCounts[postfix] = (postfixCounts[postfix] || 0) + 1;
681
684
  const stats = await fsp.stat(fullPath);
682
685
  const mtime = stats.mtimeMs;
683
686
  const modifiedAt = new Date(stats.mtime).toISOString();
@@ -696,6 +699,7 @@ Worker.prototype.analyze = async function ({ directory }) {
696
699
  return {
697
700
  fileCount,
698
701
  directoryCount,
702
+ postfixCounts,
699
703
  firstModified: fileCount ? firstModified : null,
700
704
  lastModified: fileCount ? lastModified : null
701
705
  };
package/file/S3.js CHANGED
@@ -2,7 +2,7 @@ import debug$0 from 'debug';
2
2
  import fs from 'node:fs';
3
3
  import withDb from 'mime-type/with-db';
4
4
  import clientS3 from '@aws-sdk/client-s3';
5
- import { getTempFilename, relativeDate } from './tools.js';
5
+ import { getTempFilename, getFilePostfix, relativeDate } from './tools.js';
6
6
  const debug = debug$0('@engine9/input/S3');
7
7
  const { mimeType: mime } = withDb;
8
8
  const {
@@ -247,6 +247,7 @@ Worker.prototype.analyze = async function ({ directory }) {
247
247
  let lastModified = null;
248
248
  let firstTime = null;
249
249
  let lastTime = null;
250
+ const postfixCounts = Object.create(null);
250
251
  let ContinuationToken = undefined;
251
252
  do {
252
253
  const result = await s3Client.send(
@@ -270,6 +271,8 @@ Worker.prototype.analyze = async function ({ directory }) {
270
271
  continue;
271
272
  }
272
273
  fileCount++;
274
+ const postfix = getFilePostfix(objectKey);
275
+ postfixCounts[postfix] = (postfixCounts[postfix] || 0) + 1;
273
276
  const mtime = new Date(content.LastModified).getTime();
274
277
  const modifiedAt = new Date(content.LastModified).toISOString();
275
278
  const filename = `${this.prefix}://${Bucket}/${objectKey}`;
@@ -287,6 +290,7 @@ Worker.prototype.analyze = async function ({ directory }) {
287
290
  return {
288
291
  fileCount,
289
292
  directoryCount: dirsSeen.size,
293
+ postfixCounts,
290
294
  firstModified: fileCount ? firstModified : null,
291
295
  lastModified: fileCount ? lastModified : null
292
296
  };
package/file/tools.js CHANGED
@@ -300,6 +300,18 @@ function makeStrings(o) {
300
300
  return a;
301
301
  }, {});
302
302
  }
303
/**
 * Return the lower-cased postfix (extension) of a filename's basename,
 * including the leading dot, e.g. `.txt` or `.csv.gz`.
 *
 * A trailing `.gz` is combined with the extension before it, so
 * `people.csv.gz` yields `.csv.gz`; a bare `archive.gz` yields `.gz`.
 *
 * Names with no real extension yield an empty string. That includes
 * dotfiles such as `.gitignore` (the leading dot is part of the name, not an
 * extension) and names that merely end in a dot such as `notes.`.
 *
 * @param {string} filename - A file path or object key; only the basename is inspected.
 * @returns {string} The postfix with its leading dot, or `''` if there is none.
 */
function getFilePostfix(filename) {
  const base = path.basename(filename).toLowerCase();
  // path.extname already ignores a leading dot (dotfiles) and returns '' when
  // there is no dot at all; a bare '.' result means the name ends in a dot.
  const ext = path.extname(base);
  if (ext.length <= 1) return '';
  if (ext !== '.gz') return ext;
  // Gzip wrapper: report the wrapped format as well when the stem has one.
  const inner = path.extname(base.slice(0, -ext.length));
  return inner.length > 1 ? `${inner}.gz` : '.gz';
}
303
315
  function appendPostfix(filename, postfix) {
304
316
  const filenameParts = filename.split('/');
305
317
  const fileParts = filenameParts
@@ -331,6 +343,7 @@ export { getTempDir };
331
343
  export { getBatchTransform };
332
344
  export { getDebatchTransform };
333
345
  export { getFile };
346
+ export { getFilePostfix };
334
347
  export { getManifest };
335
348
  export { getPacketFiles };
336
349
  export { getStringArray };
@@ -348,6 +361,7 @@ export default {
348
361
  getBatchTransform,
349
362
  getDebatchTransform,
350
363
  getFile,
364
+ getFilePostfix,
351
365
  getManifest,
352
366
  getPacketFiles,
353
367
  getStringArray,
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@engine9/input-tools",
3
- "version": "2.0.9",
3
+ "version": "2.0.10",
4
4
  "type": "module",
5
5
  "description": "Tools for dealing with Engine9 inputs",
6
6
  "main": "index.js",