@engine9-io/input-tools 1.9.11 → 2.0.1
This diff shows the changes between publicly released versions of the package, as published to their respective public registries, and is provided for informational purposes only.
- package/ForEachEntry.js +18 -45
- package/ValidatingReadable.js +3 -6
- package/buildSamplePackets.js +11 -16
- package/eslint.config.mjs +15 -11
- package/file/FileUtilities.js +29 -153
- package/file/GoogleDrive.js +32 -38
- package/file/Parquet.js +112 -124
- package/file/R2.js +27 -32
- package/file/S3.js +259 -293
- package/file/tools.js +33 -54
- package/index.js +59 -74
- package/package.json +2 -1
- package/test/cli.js +3 -4
- package/test/file.js +6 -7
- package/test/processing/bigDataMessage.js +8 -10
- package/test/processing/forEach.js +6 -8
- package/test/processing/forEachResume.js +6 -8
- package/test/processing/message.js +31 -39
- package/test/processing/zip.js +6 -7
- package/test/uuid.js +6 -11
- package/timelineTypes.js +2 -24
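The headline change in 2.x is a CommonJS-to-ESM migration of every module listed above: `require()` calls become `import` statements, relative requires gain explicit `.js` extensions, and `module.exports` becomes `export default`. A minimal sketch of the translation pattern, using names taken from the FileUtilities.js diff below (the `debug$0` alias is how the migration binds a default import before invoking it with arguments):

  // Before (1.9.11, CommonJS):
  //   const { pipeline } = require('node:stream/promises');
  //   const debug = require('debug')('@engine9-io/file');
  //   const S3Worker = require('./S3');

  // After (2.0.1, ESM) -- a default import is bound first, then
  // destructured or invoked on its own line:
  import promises from 'node:stream/promises';
  import debug$0 from 'debug';
  import S3Worker from './S3.js'; // relative specifiers need explicit extensions under ESM

  const { pipeline } = promises;
  const debug = debug$0('@engine9-io/file');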
package/file/FileUtilities.js
CHANGED
@@ -1,43 +1,32 @@
-
-
+import fs from 'node:fs';
+import path from 'node:path';
+import zlib from 'node:zlib';
+import nodestream from 'node:stream';
+import promises from 'node:stream/promises';
+import { parse, stringify } from 'csv';
+import debug$0 from 'debug';
+import xlstream from 'xlstream';
+import JSON5 from 'json5';
+import languageEncoding from 'detect-file-encoding-and-language';
+import R2Worker from './R2.js';
+import S3Worker from './S3.js';
+import ParquetWorker from './Parquet.js';
+import { bool, getTempFilename, getStringArray, getTempDir, makeStrings, streamPacket, relativeDate } from './tools.js';
 const fsp = fs.promises;
-const
-const
-const { Readable, Transform, PassThrough, Writable } = require('node:stream');
-const { pipeline } = require('node:stream/promises');
-const { stringify } = require('csv');
-
-const debug = require('debug')('@engine9-io/file');
-
-const { getXlsxStream } = require('xlstream');
-const csv = require('csv');
-const JSON5 = require('json5');
-
-const languageEncoding = require('detect-file-encoding-and-language');
-const R2Worker = require('./R2');
-const S3Worker = require('./S3');
-const ParquetWorker = require('./Parquet');
+const { Readable, Transform, PassThrough, Writable } = nodestream;
+const { pipeline } = promises;
 
-const
-
-  getTempFilename,
-  getStringArray,
-  getTempDir,
-  makeStrings,
-  streamPacket,
-  relativeDate
-} = require('./tools');
+const debug = debug$0('@engine9-io/file');
+const { getXlsxStream } = xlstream;
 
 function Worker({ accountId }) {
   this.accountId = accountId;
 }
-
 class LineReaderTransform extends Transform {
   constructor(options = {}) {
     super({ ...options, readableObjectMode: true });
     this.buffer = '';
   }
-
   _transform(chunk, encoding, callback) {
     this.buffer += chunk.toString();
     const lines = this.buffer.split(/\r?\n/);
@@ -45,7 +34,6 @@ class LineReaderTransform extends Transform {
     lines.forEach((line) => this.push(line));
     callback();
   }
-
   _flush(callback) {
     if (this.buffer) {
       this.push(this.buffer);
@@ -53,11 +41,9 @@ class LineReaderTransform extends Transform {
     callback();
   }
 }
-
 Worker.prototype.csvToObjectTransforms = function (options) {
   const transforms = [];
   const delimiter = options.delimiter || ',';
-
   const headerMapping =
     options.headerMapping ||
     function (d) {
@@ -65,7 +51,6 @@ Worker.prototype.csvToObjectTransforms = function (options) {
     };
   let lastLine = null;
   let head = null;
-
   const skipLinesWithError = bool(options.skip_lines_with_error, false);
   const parserOptions = {
     relax: true,
@@ -82,27 +67,23 @@ Worker.prototype.csvToObjectTransforms = function (options) {
   if (options.limit) {
     parserOptions.to = options.limit;
   }
-
   debug('Parser options=', parserOptions);
-  const parser =
+  const parser = parse(parserOptions);
   parser.on('error', (error) => {
     debug('fileToObjectStream: Error parsing csv file');
     debug(lastLine);
     throw new Error(error);
   });
-
   const blankAndHeaderCheck = new Transform({
     objectMode: true,
     transform(row, enc, cb) {
       // Blank rows
       if (row.length === 0) return cb();
       if (row.length === 1 && !row[0]) return cb();
-
       if (!head) {
         head = row.map(headerMapping);
         return cb();
       }
-
       const o = {};
       head.forEach((_h, i) => {
         const h = _h.trim();
@@ -110,18 +91,14 @@ Worker.prototype.csvToObjectTransforms = function (options) {
           o[h] = row[i];
         }
       });
-
       lastLine = row.join(delimiter);
       return cb(null, o);
     }
   });
-
   transforms.push(parser);
   transforms.push(blankAndHeaderCheck);
-
   return { transforms };
 };
-
 Worker.prototype.detectEncoding = async function (options) {
   if (options.encoding_override) return { encoding: options.encoding_override };
   // Limit to only the top N bytes -- for perfomance
@@ -154,19 +131,15 @@ Worker.prototype.detectEncoding = async function (options) {
       decompressStream.end();
     });
   }
-
   return languageEncoding(finalBuff);
 };
-
 Worker.prototype.detectEncoding.metadata = {
   options: {
     filename: { required: true }
   }
 };
-
 Worker.prototype.xlsxToObjectStream = async function (options) {
   let { filename } = options;
-
   if (filename.startsWith('s3://') || filename.startsWith('r2://')) {
     // We need to copy and delete
     let worker = null;
@@ -176,7 +149,6 @@ Worker.prototype.xlsxToObjectStream = async function (options) {
       worker = new S3Worker(this);
     }
     const target = getTempFilename({ targetFilename: filename.split('/').pop() });
-
     await worker.copy({ filename, target });
     filename = target;
   }
@@ -202,27 +174,22 @@ Worker.prototype.xlsxToObjectStream = async function (options) {
       }
     })
   );
-
   return { stream };
 };
-
 Worker.prototype.getFormat = async function (options) {
   const { sourcePostfix, filename, format: formatOverride } = options;
   let postfix = sourcePostfix || filename.toLowerCase().split('.').pop();
-
   if (postfix === 'gz') {
     postfix = filename.toLowerCase().split('.');
     postfix = postfix[postfix.length - 2];
   }
   return formatOverride || postfix;
 };
-
 /*
   Commonly used method to transform a file into a stream of objects.
 */
 Worker.prototype.fileToObjectStream = async function (options) {
   const { filename, columns, limit: limitOption, format: formatOverride } = options;
-
   // handle stream item
   if (options.stream) {
     if (Array.isArray(options.stream)) {
@@ -243,7 +210,6 @@ Worker.prototype.fileToObjectStream = async function (options) {
     debug('Invalid filename:', { filename });
     throw new Error('Cowardly refusing to turn a .zip file into an object stream, turn into a csv first');
   }
-
   const streamInfo = await this.stream({
     filename,
     columns,
@@ -256,11 +222,8 @@ Worker.prototype.fileToObjectStream = async function (options) {
     // already an object
     return { stream };
   }
-
   let count = 0;
-
   let transforms = [];
-
   if (postfix === 'gz') {
     const gunzip = zlib.createGunzip();
     transforms.push(gunzip);
@@ -273,9 +236,7 @@ Worker.prototype.fileToObjectStream = async function (options) {
     stream.setEncoding(encoding);
   }
   let format = formatOverride || postfix;
-
   debug(`Reading file ${filename} with encoding: ${encoding} and format ${format}`);
-
   if (format === 'csv') {
     const csvTransforms = this.csvToObjectTransforms({ ...options });
     transforms = transforms.concat(csvTransforms.transforms);
@@ -284,12 +245,10 @@ Worker.prototype.fileToObjectStream = async function (options) {
     transforms = transforms.concat(csvTransforms.transforms);
   } else if (format === 'jsonl') {
     /* Type of JSON that has the names in an array in the first record,
-
-
+      and the values in JSON arrays thereafter
+    */
     let headers = null;
-
     const lineReader = new LineReaderTransform();
-
     const jsonlTransform = new Transform({
       objectMode: true,
       transform(d, enc, cb) {
@@ -303,8 +262,8 @@ Worker.prototype.fileToObjectStream = async function (options) {
           throw e;
         }
        /* JSONL could potentially start with an array of names,
-
-
+          in which case we need to map the subsequent values
+        */
        if (headers === null) {
          if (Array.isArray(obj)) {
            headers = obj;
@@ -324,7 +283,6 @@ Worker.prototype.fileToObjectStream = async function (options) {
        return cb();
      }
    });
-
    transforms.push(lineReader);
    transforms.push(jsonlTransform);
  } else {
@@ -348,20 +306,18 @@ Worker.prototype.fileToObjectStream = async function (options) {
      // Don't push dummy records anymore -- legacy cruft
      debug(`Completed reading file, records=${count}`);
      /* if (count === 0) {
-
-
-
-
-
+        const o = { _is_placeholder: true };
+
+        if (head) head.forEach((c) => { o[c] = null; });
+        this.push(o);
+      } */
      cb();
    }
  });
-
  transforms.push(countAndDebug);
  transforms.forEach((t) => {
    stream = stream.pipe(t);
  });
-
  return { stream };
};
Worker.prototype.getFileWriterStream = async function (options = {}) {
@@ -380,13 +336,10 @@ Worker.prototype.getFileWriterStream = async function (options = {}) {
  if (bool(options.gzip, false)) filename += '.gz';
  const stream = fs.createWriteStream(filename);
  debug('FileWriterStream writing to file ', filename);
-
  return { filename, stream };
};
-
Worker.prototype.getOutputStreams = async function (options) {
  const { filename, stream: fileWriterStream } = await this.getFileWriterStream(options);
-
  let { transform } = options;
  if (typeof options.transform === 'function') {
    if (options.transform.length === 3) {
@@ -409,7 +362,6 @@ Worker.prototype.getOutputStreams = async function (options) {
  }
  const { flatten } = options;
  let flattenTransform = null;
-
  if (bool(flatten, false)) {
    flattenTransform = new Transform({
      objectMode: true,
@@ -431,7 +383,6 @@ Worker.prototype.getOutputStreams = async function (options) {
      }
    });
  }
-
  const stats = {
    records: 0
  };
@@ -473,23 +424,17 @@ Worker.prototype.objectStreamToFile = async function (options) {
  await pipeline(streams);
  return { filename, records: stats.records };
};
-
Worker.prototype.transform = async function (options) {
  const worker = this;
-
  const { filename } = options;
-
  debug(`Transforming ${filename}`);
-
  options.filename = filename;
  let { stream } = await worker.fileToObjectStream(options);
  if (typeof stream.pipe !== 'function') {
    debug(stream);
    throw new Error('No pipe in stream');
  }
-
  let t = options.transform;
-
  // No longer need this
  delete options.transform;
  if (!t) {
@@ -498,7 +443,6 @@ Worker.prototype.transform = async function (options) {
      cb(null, d);
    };
  }
-
  if (!Array.isArray(t)) t = [t];
  Object.keys(t).forEach((key) => {
    let f = t[key];
@@ -508,22 +452,17 @@ Worker.prototype.transform = async function (options) {
        transform: f
      });
    }
-
    stream = stream.pipe(f);
  });
-
  const { targetFormat } = options;
-
  if (
    !targetFormat &&
    (filename.toLowerCase().slice(-4) === '.csv' || filename.toLowerCase().slice(-7) === '.csv.gz')
  ) {
    options.targetFormat = 'csv';
  }
-
  return worker.objectStreamToFile({ ...options, stream });
};
-
Worker.prototype.transform.metadata = {
  options: {
    sourcePostfix: { description: "Override the source postfix, if for example it's a csv" },
@@ -553,12 +492,10 @@ Worker.prototype.testTransform.metadata = {
    filename: true
  }
};
-
/* Get a stream from an actual stream, or an array, or a file */
Worker.prototype.stream = async function (options) {
  const { stream: inputStream, packet, type, columns, limit, filename: filenameOpt } = options;
  let filename = filenameOpt;
-
  if (inputStream) {
    if (Array.isArray(inputStream)) {
      return { stream: Readable.from(inputStream) };
@@ -611,7 +548,6 @@ Worker.prototype.stream = async function (options) {
    throw new Error('stream must be passed a stream, filename, or packet');
  }
};
-
Worker.prototype.sample = async function (opts) {
  opts.limit = opts.limit || 10;
  const { stream } = await this.fileToObjectStream(opts);
@@ -631,7 +567,6 @@ Worker.prototype.toArray.metadata = {
    filename: {}
  }
};
-
Worker.prototype.write = async function (opts) {
  const { filename, content } = opts;
  if (filename.startsWith('s3://') || filename.startsWith('r2://')) {
@@ -658,15 +593,12 @@ Worker.prototype.write.metadata = {
    content: {}
  }
};
-
async function streamToString(stream) {
  // lets have a ReadableStream as a stream variable
  const chunks = [];
-
  for await (const chunk of stream) {
    chunks.push(Buffer.from(chunk));
  }
-
  return Buffer.concat(chunks).toString('utf-8');
}
/*
@@ -687,20 +619,17 @@ Worker.prototype.json.metadata = {
    filename: { description: 'Get a javascript object from a file' }
  }
};
-
Worker.prototype.list = async function ({ directory, start: s, end: e }) {
  if (!directory) throw new Error('directory is required');
  let start = null;
  let end = null;
  if (s) start = relativeDate(s);
  if (e) end = relativeDate(e);
-
  if (directory.startsWith('s3://') || directory.startsWith('r2://')) {
    const worker = new (directory.startsWith('r2://') ? R2Worker : S3Worker)(this);
    return worker.list({ directory, start, end });
  }
  const a = await fsp.readdir(directory, { withFileTypes: true });
-
  const withModified = [];
  for (const file of a) {
    const fullPath = path.join(directory, file.name);
@@ -717,7 +646,6 @@ Worker.prototype.list = async function ({ directory, start: s, end: e }) {
      });
    }
  }
-
  return withModified;
};
Worker.prototype.list.metadata = {
@@ -725,7 +653,6 @@ Worker.prototype.list.metadata = {
    directory: { required: true }
  }
};
-
Worker.prototype.listAll = async function ({ directory, start: s, end: e }) {
  if (!directory) throw new Error('directory is required');
  let start = null;
@@ -737,16 +664,13 @@ Worker.prototype.listAll = async function ({ directory, start: s, end: e }) {
    return worker.listAll({ directory, start, end });
  }
  const a = await fsp.readdir(directory, { recursive: true });
-
  let files = a.map((f) => `${directory}/${f}`);
  if (!start && !end) {
    return files;
  }
  const pLimit = await import('p-limit');
-
  const limitedMethod = pLimit.default(10);
  const filesWithinLimit = [];
-
  await Promise.all(
    files.map((filename) =>
      limitedMethod(async () => {
@@ -774,7 +698,6 @@ Worker.prototype.listAll.metadata = {
    end: {}
  }
};
-
Worker.prototype.moveAll = async function (options) {
  const { directory, targetDirectory } = options;
  if (!directory) throw new Error('directory is required');
@@ -783,7 +706,6 @@ Worker.prototype.moveAll = async function (options) {
    return worker.moveAll(options);
  }
  const a = await this.listAll(options);
-
  let configs = a.map((f) => {
    let filename = typeof f === 'string' ? f : f.filename;
    return {
@@ -792,9 +714,7 @@ Worker.prototype.moveAll = async function (options) {
    };
  });
  const pLimit = await import('p-limit');
-
  const limitedMethod = pLimit.default(10);
-
  return Promise.all(configs.map(({ filename, target }) => limitedMethod(async () => this.move({ filename, target }))));
};
Worker.prototype.moveAll.metadata = {
@@ -803,7 +723,6 @@ Worker.prototype.moveAll.metadata = {
    targetDirectory: { required: true }
  }
};
-
Worker.prototype.empty = async function ({ directory }) {
  if (!directory) throw new Error('directory is required');
  if (directory.startsWith('s3://') || directory.startsWith('r2://')) {
@@ -811,7 +730,6 @@ Worker.prototype.empty = async function ({ directory }) {
    throw new Error('Cannot empty an s3:// or r2:// directory');
  }
  const removed = [];
-
  for (const file of await fsp.readdir(directory)) {
    removed.push(file);
    await fsp.unlink(path.join(directory, file));
@@ -823,14 +741,10 @@ Worker.prototype.empty.metadata = {
    directory: { required: true }
  }
};
-
Worker.prototype.removeAll = async function (options) {
  const filenames = await this.listAll(options);
-
  const pLimit = await import('p-limit');
-
  const limitedMethod = pLimit.default(10);
-
  return Promise.all(filenames.map((filename) => limitedMethod(async () => this.remove({ filename }))));
};
Worker.prototype.removeAll.metadata = {
@@ -840,7 +754,6 @@ Worker.prototype.removeAll.metadata = {
    end: {}
  }
};
-
Worker.prototype.remove = async function ({ filename }) {
  if (!filename) throw new Error('filename is required');
  if (typeof filename !== 'string') throw new Error(`filename isn't a string:${JSON.stringify(filename)}`);
@@ -851,12 +764,10 @@ Worker.prototype.remove = async function ({ filename }) {
    } else {
      worker = new S3Worker(this);
    }
-
    await worker.remove({ filename });
  } else {
    fsp.unlink(filename);
  }
-
  return { removed: filename };
};
Worker.prototype.remove.metadata = {
@@ -864,7 +775,6 @@ Worker.prototype.remove.metadata = {
    filename: {}
  }
};
-
Worker.prototype.move = async function ({ filename, target, remove = true }) {
  if (!target) throw new Error('target is required');
  if (typeof target !== 'string') throw new Error(`target isn't a string:${JSON.stringify(target)}`);
@@ -875,14 +785,12 @@ Worker.prototype.move = async function ({ filename, target, remove = true }) {
  ) {
    throw new Error('Cowardly not copying between services');
  }
-
  let worker = null;
  if (target.startsWith('r2://')) {
    worker = new R2Worker(this);
  } else {
    worker = new S3Worker(this);
  }
-
  if (filename.startsWith('s3://') || filename.startsWith('r2://')) {
    // We need to copy and delete
    const output = await worker.copy({ filename, target });
@@ -913,7 +821,6 @@ Worker.prototype.move.metadata = {
    target: {}
  }
};
-
Worker.prototype.copy = async function (opts) {
  return this.move({ ...opts, remove: false });
};
@@ -923,17 +830,14 @@ Worker.prototype.copy.metadata = {
    target: {}
  }
};
-
Worker.prototype.stat = async function ({ filename }) {
  if (!filename) throw new Error('filename is required');
  const output = {};
-
  if (filename.slice(-8) === '.parquet') {
    const pq = new ParquetWorker(this);
    output.schema = (await pq.schema({ filename }))?.schema;
    output.records = (await pq.meta({ filename }))?.records;
  }
-
  if (filename.startsWith('s3://') || filename.startsWith('r2://')) {
    const worker = new (filename.startsWith('r2://') ? R2Worker : S3Worker)(this);
    Object.assign(output, await worker.stat({ filename }));
@@ -956,7 +860,6 @@ Worker.prototype.stat.metadata = {
    filename: {}
  }
};
-
Worker.prototype.download = async function ({ filename }) {
  if (!filename) throw new Error('filename is required');
  if (filename.startsWith('s3://') || filename.startsWith('r2://')) {
@@ -970,29 +873,23 @@ Worker.prototype.download.metadata = {
    filename: {}
  }
};
-
Worker.prototype.head = async function (options) {
  const limit = options.limit || 3;
  const { stream } = await this.fileToObjectStream({ ...options, limit });
  const chunks = [];
-
  let counter = 0;
-
  for await (const chunk of stream) {
    chunks.push(chunk);
    counter += 1;
    if (counter >= limit) break;
  }
-
  return chunks;
};
-
Worker.prototype.head.metadata = {
  options: {
    filename: { required: true }
  }
};
-
Worker.prototype.columns = async function (options) {
  const head = await this.head(options);
  if (head.length == 0) {
@@ -1002,7 +899,6 @@ Worker.prototype.columns = async function (options) {
      columns: []
    };
  }
-
  let likelyHeaderLines = 1;
  const columns = Object.keys(head[0]);
  let s = columns.join(',');
@@ -1014,48 +910,39 @@ Worker.prototype.columns = async function (options) {
    columns
  };
};
-
Worker.prototype.columns.metadata = {
  options: {
    filename: { required: true }
  }
};
-
Worker.prototype.count = async function (options) {
  const { stream } = await this.fileToObjectStream(options);
  const sample = [];
-
  const limit = options.limit || 5;
  let records = 0;
-
  for await (const chunk of stream) {
    records += 1;
    if (records < limit) {
      sample.push(chunk);
    }
  }
-
  return { sample, records };
};
-
Worker.prototype.count.metadata = {
  options: {
    filename: { required: true }
  }
};
-
// Get a set of unique entries from a uniqueFunction
// This could be large
Worker.prototype.getUniqueSet = async function (options) {
  const existingFiles = getStringArray(options.filenames);
  const sample = {};
-
  let { uniqueFunction } = options;
  if (!uniqueFunction) {
    uniqueFunction = (o) => JSON.stringify(o);
  }
  const uniqueSet = new Set();
-
  for (const filename of existingFiles) {
    const { stream: existsStream } = await this.fileToObjectStream({ filename });
    await pipeline(
@@ -1082,22 +969,18 @@ Worker.prototype.getUniqueSet = async function (options) {
  }
  return { uniqueFunction, uniqueSet, sample };
};
-
Worker.prototype.getUniqueStream = async function (options) {
  const includeDuplicateSourceRecords = bool(options.includeDuplicateSourceRecords, false);
-
  const { uniqueSet, uniqueFunction, sample } = await this.getUniqueSet({
    filenames: options.existingFiles,
    uniqueFunction: options.uniqueFunction
  });
-
  const { stream: inStream } = await this.fileToObjectStream(options);
  const uniqueStream = inStream.pipe(
    new Transform({
      objectMode: true,
      transform(d, enc, cb) {
        const v = uniqueFunction(makeStrings(d)) || '';
-
        if (!v) {
          // falsey unique function includes
          // by default
@@ -1117,7 +1000,6 @@ Worker.prototype.getUniqueStream = async function (options) {
  );
  return { stream: uniqueStream, sample };
};
-
Worker.prototype.getUniqueStream.metadata = {
  options: {
    existingFiles: {},
@@ -1134,7 +1016,6 @@ Worker.prototype.getUniqueFile = async function (options) {
  const { filename, records } = await this.objectStreamToFile({ stream });
  return { filename, records, sample };
};
-
Worker.prototype.getUniqueFile.metadata = {
  options: {
    existingFiles: {},
@@ -1146,7 +1027,6 @@ Worker.prototype.getUniqueFile.metadata = {
    }
  }
};
-
/*
diff that allows for unordered files, and doesn't store full objects in memory.
Requires 2 passes of the files,
@@ -1155,14 +1035,12 @@ but that's a better tradeoff than trying to store huge files in memory
Worker.prototype.diff = async function (options) {
  const { fileA, fileB, uniqueFunction: ufOpt, columns, includeDuplicateSourceRecords } = options;
  if (options.fields) throw new Error('fields is deprecated, use columns');
-
  if (ufOpt && columns) throw new Error('fields and uniqueFunction cannot both be specified');
  let uniqueFunction = ufOpt;
  if (!uniqueFunction && columns) {
    const farr = getStringArray(columns);
    uniqueFunction = (o) => farr.map((f) => o[f] || '').join('.');
  }
-
  const left = await this.getUniqueFile({
    existingFiles: [fileB],
    filename: fileA,
@@ -1175,7 +1053,6 @@ Worker.prototype.diff = async function (options) {
    uniqueFunction,
    includeDuplicateSourceRecords
  });
-
  return {
    left,
    right
@@ -1192,5 +1069,4 @@ Worker.prototype.diff.metadata = {
    }
  }
};
-
-module.exports = Worker;
+export default Worker;
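Because `module.exports = Worker` is now `export default Worker`, consumers of 2.x load this module as an ES module. A hedged sketch of the consumer-side change; the deep-import specifier is illustrative and depends on the package's exports map, and the +2/-1 change to package/package.json is consistent with, but does not by itself confirm, a `"type": "module"` switch:

  // ESM consumers import the default export directly:
  import FileWorker from '@engine9-io/input-tools/file/FileUtilities.js'; // illustrative specifier

  const worker = new FileWorker({ accountId: 'acme' }); // hypothetical accountId
  const { stream } = await worker.fileToObjectStream({ filename: 'contacts.csv' }); // hypothetical file

  // CommonJS consumers on most Node versions cannot require() an ES module;
  // they would switch to dynamic import(), with the worker on .default:
  //   const { default: FileWorker } = await import('@engine9-io/input-tools/file/FileUtilities.js');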