@engine9/input-tools 2.0.9 → 2.0.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/file/FileUtilities.js +5 -1
- package/file/S3.js +5 -1
- package/file/tools.js +14 -0
- package/package.json +1 -1
package/file/FileUtilities.js
CHANGED
|
@@ -11,7 +11,7 @@ import languageEncoding from 'detect-file-encoding-and-language';
|
|
|
11
11
|
import R2Worker from './R2.js';
|
|
12
12
|
import S3Worker from './S3.js';
|
|
13
13
|
import ParquetWorker from './Parquet.js';
|
|
14
|
-
import { bool, getTempFilename, getStringArray, getTempDir, makeStrings, streamPacket, relativeDate } from './tools.js';
|
|
14
|
+
import { bool, getTempFilename, getStringArray, getTempDir, getFilePostfix, makeStrings, streamPacket, relativeDate } from './tools.js';
|
|
15
15
|
const fsp = fs.promises;
|
|
16
16
|
const { Readable, Transform, PassThrough, Writable } = nodestream;
|
|
17
17
|
const { pipeline } = promises;
|
|
@@ -669,6 +669,7 @@ Worker.prototype.analyze = async function ({ directory }) {
|
|
|
669
669
|
let lastModified = null;
|
|
670
670
|
let firstTime = null;
|
|
671
671
|
let lastTime = null;
|
|
672
|
+
const postfixCounts = Object.create(null);
|
|
672
673
|
const walk = async (dir) => {
|
|
673
674
|
const entries = await fsp.readdir(dir, { withFileTypes: true });
|
|
674
675
|
for (const ent of entries) {
|
|
@@ -678,6 +679,8 @@ Worker.prototype.analyze = async function ({ directory }) {
|
|
|
678
679
|
await walk(fullPath);
|
|
679
680
|
} else {
|
|
680
681
|
fileCount += 1;
|
|
682
|
+
const postfix = getFilePostfix(fullPath);
|
|
683
|
+
postfixCounts[postfix] = (postfixCounts[postfix] || 0) + 1;
|
|
681
684
|
const stats = await fsp.stat(fullPath);
|
|
682
685
|
const mtime = stats.mtimeMs;
|
|
683
686
|
const modifiedAt = new Date(stats.mtime).toISOString();
|
|
@@ -696,6 +699,7 @@ Worker.prototype.analyze = async function ({ directory }) {
|
|
|
696
699
|
return {
|
|
697
700
|
fileCount,
|
|
698
701
|
directoryCount,
|
|
702
|
+
postfixCounts,
|
|
699
703
|
firstModified: fileCount ? firstModified : null,
|
|
700
704
|
lastModified: fileCount ? lastModified : null
|
|
701
705
|
};
|
package/file/S3.js
CHANGED
|
@@ -2,7 +2,7 @@ import debug$0 from 'debug';
|
|
|
2
2
|
import fs from 'node:fs';
|
|
3
3
|
import withDb from 'mime-type/with-db';
|
|
4
4
|
import clientS3 from '@aws-sdk/client-s3';
|
|
5
|
-
import { getTempFilename, relativeDate } from './tools.js';
|
|
5
|
+
import { getTempFilename, getFilePostfix, relativeDate } from './tools.js';
|
|
6
6
|
const debug = debug$0('@engine9/input/S3');
|
|
7
7
|
const { mimeType: mime } = withDb;
|
|
8
8
|
const {
|
|
@@ -247,6 +247,7 @@ Worker.prototype.analyze = async function ({ directory }) {
|
|
|
247
247
|
let lastModified = null;
|
|
248
248
|
let firstTime = null;
|
|
249
249
|
let lastTime = null;
|
|
250
|
+
const postfixCounts = Object.create(null);
|
|
250
251
|
let ContinuationToken = undefined;
|
|
251
252
|
do {
|
|
252
253
|
const result = await s3Client.send(
|
|
@@ -270,6 +271,8 @@ Worker.prototype.analyze = async function ({ directory }) {
|
|
|
270
271
|
continue;
|
|
271
272
|
}
|
|
272
273
|
fileCount++;
|
|
274
|
+
const postfix = getFilePostfix(objectKey);
|
|
275
|
+
postfixCounts[postfix] = (postfixCounts[postfix] || 0) + 1;
|
|
273
276
|
const mtime = new Date(content.LastModified).getTime();
|
|
274
277
|
const modifiedAt = new Date(content.LastModified).toISOString();
|
|
275
278
|
const filename = `${this.prefix}://${Bucket}/${objectKey}`;
|
|
@@ -287,6 +290,7 @@ Worker.prototype.analyze = async function ({ directory }) {
|
|
|
287
290
|
return {
|
|
288
291
|
fileCount,
|
|
289
292
|
directoryCount: dirsSeen.size,
|
|
293
|
+
postfixCounts,
|
|
290
294
|
firstModified: fileCount ? firstModified : null,
|
|
291
295
|
lastModified: fileCount ? lastModified : null
|
|
292
296
|
};
|
package/file/tools.js
CHANGED
|
@@ -300,6 +300,18 @@ function makeStrings(o) {
|
|
|
300
300
|
return a;
|
|
301
301
|
}, {});
|
|
302
302
|
}
|
|
303
|
+
/**
 * Returns the lower-cased postfix (extension) of a file name, including the
 * leading dot, e.g. `.txt` or `.csv.gz`.
 *
 * Only the basename of `filename` is inspected, so directory names that
 * contain dots do not affect the result. A trailing `.gz` is treated as a
 * compression wrapper: the extension underneath it (if any) is kept, so
 * `export.csv.gz` yields `.csv.gz` while `archive.gz` yields `.gz`.
 *
 * The leading dot of a hidden file (`.gitignore`, `.env`) is part of the name,
 * not an extension separator, so such files have no postfix. This follows the
 * `path.extname` convention and keeps callers that bucket files by postfix
 * from creating one bucket per distinct dotfile name.
 *
 * @param {string} filename - File path or object key.
 * @returns {string} The postfix with its leading dot, or `''` if there is none.
 * @throws {TypeError} If `filename` is not a string (raised by `path.basename`).
 */
function getFilePostfix(filename) {
  const base = path.basename(filename).toLowerCase();
  const ext = path.extname(base);
  // Anything other than a gzip wrapper (including "no extension") is final.
  if (ext !== '.gz') return ext;
  // Look underneath the `.gz` for the real content type, e.g. `.csv`.
  const inner = path.extname(base.slice(0, -ext.length));
  return `${inner}.gz`;
}
|
|
303
315
|
function appendPostfix(filename, postfix) {
|
|
304
316
|
const filenameParts = filename.split('/');
|
|
305
317
|
const fileParts = filenameParts
|
|
@@ -331,6 +343,7 @@ export { getTempDir };
|
|
|
331
343
|
export { getBatchTransform };
|
|
332
344
|
export { getDebatchTransform };
|
|
333
345
|
export { getFile };
|
|
346
|
+
export { getFilePostfix };
|
|
334
347
|
export { getManifest };
|
|
335
348
|
export { getPacketFiles };
|
|
336
349
|
export { getStringArray };
|
|
@@ -348,6 +361,7 @@ export default {
|
|
|
348
361
|
getBatchTransform,
|
|
349
362
|
getDebatchTransform,
|
|
350
363
|
getFile,
|
|
364
|
+
getFilePostfix,
|
|
351
365
|
getManifest,
|
|
352
366
|
getPacketFiles,
|
|
353
367
|
getStringArray,
|