@engine9/input-tools 2.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.prettierrc +7 -0
- package/ForEachEntry.js +194 -0
- package/LICENSE +674 -0
- package/README.md +6 -0
- package/ValidatingReadable.js +21 -0
- package/buildSamplePackets.js +13 -0
- package/eslint.config.mjs +17 -0
- package/file/FileUtilities.js +1076 -0
- package/file/GoogleDrive.js +39 -0
- package/file/Parquet.js +137 -0
- package/file/R2.js +32 -0
- package/file/S3.js +329 -0
- package/file/tools.js +359 -0
- package/index.js +426 -0
- package/package.json +54 -0
- package/skills/transaction-mapping/SKILL.md +105 -0
- package/skills/transaction-mapping/reference.md +72 -0
- package/test/cli.js +9 -0
- package/test/file.js +23 -0
- package/test/processing/bigDataMessage.js +52 -0
- package/test/processing/forEach.js +53 -0
- package/test/processing/forEachResume.js +54 -0
- package/test/processing/message.js +40 -0
- package/test/processing/zip.js +21 -0
- package/test/sample/1000_message.packet.zip +0 -0
- package/test/sample/5_message.packet.zip +0 -0
- package/test/sample/fileWithHead.csv +3 -0
- package/test/sample/fileWithoutHead.csv +2 -0
- package/test/sample/message/1000_fake_people.csv +1001 -0
- package/test/sample/message/5_fake_people.csv +6 -0
- package/test/sample/message/message.json5 +41 -0
- package/test/uuid.js +20 -0
- package/timelineTypes.js +139 -0
package/.prettierrc
ADDED
package/ForEachEntry.js
ADDED
|
@@ -0,0 +1,194 @@
|
|
|
1
|
+
import fs from 'node:fs';
|
|
2
|
+
import nodestream from 'node:stream';
|
|
3
|
+
import promises from 'node:stream/promises';
|
|
4
|
+
import { throttle } from 'throttle-debounce';
|
|
5
|
+
import parallelTransform from 'parallel-transform';
|
|
6
|
+
import debug$0 from 'debug';
|
|
7
|
+
import { Mutex } from 'async-mutex';
|
|
8
|
+
import { stringify, parse } from 'csv';
|
|
9
|
+
import handlebars from 'handlebars';
|
|
10
|
+
import ValidatingReadable from './ValidatingReadable.js';
|
|
11
|
+
import FileUtilities from './file/FileUtilities.js';
|
|
12
|
+
import { getTempFilename, getBatchTransform, getFile, streamPacket } from './file/tools.js';
|
|
13
|
+
// Named stream constructors and the promise-based pipeline helper.
const { Transform, Writable } = nodestream;
const { pipeline } = promises;

// Namespaced debug logger for this package.
const debug = debug$0('@engine9-io/input-tools');

// Rate-limited variant of the logger: fires at most once per 1000 ms,
// on both the leading and trailing edge of the burst.
const debugThrottle = throttle(1000, debug, { noLeading: false, noTrailing: false });
|
|
17
|
+
/**
 * Streams entries (CSV rows) from a file or a message packet and feeds them,
 * in parallel batches, to a user-supplied async transform. Named output
 * streams can be bound into the transform; anything pushed to them is
 * validated, counted, CSV-stringified, and written to disk.
 */
class ForEachEntry {
  /**
   * @param {Object} [opts]
   * @param {string} [opts.accountId] - forwarded to FileUtilities for file access
   */
  constructor({ accountId } = {}) {
    this.fileUtilities = new FileUtilities({ accountId });
  }

  /**
   * Returns (creating at most once per `name`) the output-stream bundle for a
   * named output: `{ stream, promises, files }`. Pushed objects flow through
   * `validatorFunction`, a record counter, and a CSV stringifier into a write
   * stream on `filename` (or a fresh temp file with `postfix`).
   *
   * @param {Object} opts
   * @param {string} opts.name - key identifying this output; bundles are memoized per name
   * @param {string} [opts.filename] - explicit output path; defaults to a temp file
   * @param {string} [opts.postfix='.timeline.csv'] - temp-file suffix when no filename given
   * @param {Function} [opts.validatorFunction] - receives each pushed object; throw to reject it
   * @returns {Promise<{stream: ValidatingReadable, promises: Promise[], files: Object[]}>|Object}
   *   the memoized bundle directly when it already exists, otherwise a promise for it
   */
  getOutputStream({ name, filename, postfix = '.timeline.csv', validatorFunction = () => true }) {
    this.outputStreams = this.outputStreams || {};
    if (this.outputStreams[name]?.items) return this.outputStreams[name].items;
    this.outputStreams[name] = this.outputStreams[name] || { mutex: new Mutex() };
    return this.outputStreams[name].mutex.runExclusive(async () => {
      // Re-check under the lock: a concurrent caller may have built the bundle
      // while we were queued on the mutex. Without this check the second caller
      // would recreate the streams, clobbering the first caller's bundle and
      // leaking its write stream.
      if (this.outputStreams[name].items) return this.outputStreams[name].items;
      const f = filename || (await getTempFilename({ postfix }));
      const fileInfo = { filename: f, records: 0 };
      debug(`Output file requested ${name}, writing output to: ${fileInfo.filename}`);
      const outputStream = new ValidatingReadable({ objectMode: true }, validatorFunction);
      // Data is pushed in externally; there is nothing to pull on demand.
      outputStream._read = () => {};
      const writeStream = fs.createWriteStream(fileInfo.filename);
      // Settles when the file has been fully flushed (or errors).
      const finishWritingOutputPromise = new Promise((resolve, reject) => {
        writeStream.on('finish', () => resolve()).on('error', (err) => reject(err));
      });
      this.outputStreams[name].items = {
        stream: outputStream,
        promises: [finishWritingOutputPromise],
        files: [fileInfo]
      };
      outputStream
        .pipe(
          // Pass-through counter so callers can report how many records landed.
          new Transform({
            objectMode: true,
            transform(o, enc, cb) {
              fileInfo.records += 1;
              cb(null, o);
            }
          })
        )
        .pipe(stringify({ header: true }))
        .pipe(writeStream);
      return this.outputStreams[name].items;
    });
  }

  /**
   * Reads CSV entries from `filename` or the `person` file of `packet`,
   * groups them into batches, and runs `transform({ ...bindings, batch })`
   * over the batches with bounded concurrency.
   *
   * @param {Object} opts
   * @param {string} [opts.packet] - message packet to stream the person file from
   * @param {string} [opts.filename] - direct file to stream (takes precedence over packet)
   * @param {Function} [opts.progress] - called (throttled to 2s) with { records, message }
   * @param {Function} opts.transform - async function accepting one object argument
   * @param {number} [opts.batchSize=500] - rows per batch
   * @param {number} [opts.concurrency=10] - batches processed in parallel
   * @param {Object} [opts.bindings={}] - named resources to expose to the transform;
   *   each binding's `path` selects the kind: 'output.timeline', 'output.stream',
   *   'file', or 'handlebars'
   * @returns {Promise<{outputFiles: Object}>} map of binding name -> written file info
   * @throws {Error} on missing input, invalid transform, or an unsupported binding path
   */
  async process({
    packet,
    filename,
    progress,
    transform: userTransform,
    batchSize = 500,
    concurrency = 10,
    bindings = {}
  }) {
    let inStream = null;
    if (filename) {
      debug(`Processing file ${filename}`);
      inStream = (await this.fileUtilities.stream({ filename })).stream;
    } else if (packet) {
      debug(`Processing person file from packet ${packet}`);
      inStream = (await streamPacket({ packet, type: 'person' })).stream;
    }
    // Fail loudly up front instead of letting pipeline() throw an opaque
    // "stream is not readable" style error on a null source.
    if (!inStream) throw new Error('process requires either a filename or a packet');
    if (typeof userTransform !== 'function') throw new Error('async transform function is required');
    if (userTransform.length > 1) throw new Error('transform should be an async function that accepts one argument');

    let progressThrottle = () => {};
    if (typeof progress === 'function') {
      const startTime = new Date().getTime();
      progressThrottle = throttle(
        2000,
        function ({ records, batches }) {
          const message = `Processed ${records} across ${batches} batches, ${(
            (records * 60 * 1000) /
            (new Date().getTime() - startTime)
          ).toFixed(1)} records/minute`;
          progress({ records, message });
        },
        { noLeading: false, noTrailing: false }
      );
    }

    let records = 0;
    let batches = 0;
    const outputFiles = {};
    const transformArguments = {};
    // An array of promises that must be completed, such as writing to disk
    let bindingPromises = [];
    // New streams may be created, and they have to be completed when the file is completed
    const newStreams = [];
    const bindingNames = Object.keys(bindings);
    await Promise.all(
      bindingNames.map(async (bindingName) => {
        const binding = bindings[bindingName];
        if (!binding.path) throw new Error(`Invalid binding: path is required for binding ${bindingName}`);
        if (binding.path === 'output.timeline' || binding.path === 'output.stream') {
          // The two output kinds share setup; only 'output.timeline'
          // enforces a shape on pushed records.
          const streamOptions = {
            name: bindingName,
            postfix: binding.options?.postfix || '.timeline.csv'
          };
          if (binding.path === 'output.timeline') {
            streamOptions.validatorFunction = (data) => {
              if (!data) return true;
              if (typeof data !== 'object') throw new Error('Invalid timeline data push, must be an object');
              // Is this necessary?
              if (!data.person_id) throw new Error('Invalid timeline data push, must have a person_id, even if 0');
              if (!data.ts) data.ts = new Date().toISOString();
              return true;
            };
          }
          const { stream: streamImpl, promises, files } = await this.getOutputStream(streamOptions);
          newStreams.push(streamImpl);
          transformArguments[bindingName] = streamImpl;
          bindingPromises = bindingPromises.concat(promises || []);
          outputFiles[bindingName] = files;
        } else if (binding.path === 'file') {
          transformArguments[bindingName] = await getFile(binding);
        } else if (binding.path === 'handlebars') {
          transformArguments[bindingName] = handlebars;
        } else {
          throw new Error(`Unsupported binding path for binding ${bindingName}: ${binding.path}`);
        }
      })
    );

    await pipeline(
      inStream,
      parse({
        relax: true,
        skip_empty_lines: true,
        max_limit_on_data_read: 10000000,
        columns: true
      }),
      getBatchTransform({ batchSize }).transform,
      parallelTransform(concurrency, (batch, cb) => {
        userTransform({ ...transformArguments, batch })
          .then((d) => {
            batches += 1;
            records += batch?.length || 0;
            progressThrottle({ records, batches });
            debugThrottle(`Processed ${batches} batches for a total of ${records} outbound records`);
            cb(null, d);
          })
          .catch(cb);
      }),
      // Terminal sink: batches are consumed for their side effects only.
      new Writable({
        objectMode: true,
        write(batch, enc, cb) {
          cb();
        }
      })
    );
    debug('Completed all batches');
    // Signal EOF on every output stream we opened, then wait for the files to flush.
    newStreams.forEach((s) => s.push(null));
    await Promise.all(bindingPromises);
    return { outputFiles };
  }
}
|
|
194
|
+
export default ForEachEntry;
|