@engine9-io/input-tools 1.9.2 → 1.9.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/ForEachEntry.js +107 -77
- package/file/FileUtilities.js +1 -1
- package/index.js +75 -0
- package/package.json +1 -1
- package/test/processing/bigDataMessage.js +35 -43
- package/timelineTypes.js +2 -0
package/ForEachEntry.js
CHANGED
|
@@ -17,9 +17,7 @@ const handlebars = require('handlebars');
|
|
|
17
17
|
const ValidatingReadable = require('./ValidatingReadable');
|
|
18
18
|
const FileUtilities = require('./file/FileUtilities');
|
|
19
19
|
|
|
20
|
-
const {
|
|
21
|
-
getTempFilename, getBatchTransform, getFile, streamPacket,
|
|
22
|
-
} = require('./file/tools');
|
|
20
|
+
const { getTempFilename, getBatchTransform, getFile, streamPacket } = require('./file/tools');
|
|
23
21
|
|
|
24
22
|
class ForEachEntry {
|
|
25
23
|
constructor({ accountId } = {}) {
|
|
@@ -31,45 +29,52 @@ class ForEachEntry {
|
|
|
31
29
|
if (this.outputStreams[name]?.items) return this.outputStreams[name].items;
|
|
32
30
|
|
|
33
31
|
this.outputStreams[name] = this.outputStreams[name] || {
|
|
34
|
-
mutex: new Mutex()
|
|
32
|
+
mutex: new Mutex()
|
|
35
33
|
};
|
|
36
34
|
|
|
37
35
|
return this.outputStreams[name].mutex.runExclusive(async () => {
|
|
38
36
|
const fileInfo = {
|
|
39
37
|
filename: await getTempFilename({ postfix }),
|
|
40
|
-
records: 0
|
|
38
|
+
records: 0
|
|
41
39
|
};
|
|
42
40
|
|
|
43
41
|
debug(`Output file requested, writing output to to: ${fileInfo.filename}`);
|
|
44
|
-
const outputStream = new ValidatingReadable(
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
42
|
+
const outputStream = new ValidatingReadable(
|
|
43
|
+
{
|
|
44
|
+
objectMode: true
|
|
45
|
+
},
|
|
46
|
+
validatorFunction
|
|
47
|
+
);
|
|
48
|
+
|
|
48
49
|
outputStream._read = () => {};
|
|
49
50
|
|
|
50
51
|
const writeStream = fs.createWriteStream(fileInfo.filename);
|
|
51
52
|
const finishWritingOutputPromise = new Promise((resolve, reject) => {
|
|
52
|
-
writeStream
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
53
|
+
writeStream
|
|
54
|
+
.on('finish', () => {
|
|
55
|
+
resolve();
|
|
56
|
+
})
|
|
57
|
+
.on('error', (err) => {
|
|
58
|
+
reject(err);
|
|
59
|
+
});
|
|
57
60
|
});
|
|
58
61
|
|
|
59
62
|
this.outputStreams[name].items = {
|
|
60
63
|
stream: outputStream,
|
|
61
64
|
promises: [finishWritingOutputPromise],
|
|
62
|
-
files: [fileInfo]
|
|
65
|
+
files: [fileInfo]
|
|
63
66
|
};
|
|
64
67
|
|
|
65
68
|
outputStream
|
|
66
|
-
.pipe(
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
69
|
+
.pipe(
|
|
70
|
+
new Transform({
|
|
71
|
+
objectMode: true,
|
|
72
|
+
transform(o, enc, cb) {
|
|
73
|
+
fileInfo.records += 1;
|
|
74
|
+
cb(null, o);
|
|
75
|
+
}
|
|
76
|
+
})
|
|
77
|
+
)
|
|
73
78
|
.pipe(csv.stringify({ header: true }))
|
|
74
79
|
.pipe(writeStream);
|
|
75
80
|
|
|
@@ -80,10 +85,11 @@ class ForEachEntry {
|
|
|
80
85
|
async process({
|
|
81
86
|
packet,
|
|
82
87
|
filename,
|
|
88
|
+
progress,
|
|
83
89
|
transform: userTransform,
|
|
84
90
|
batchSize = 500,
|
|
85
91
|
concurrency = 10,
|
|
86
|
-
bindings = {}
|
|
92
|
+
bindings = {}
|
|
87
93
|
}) {
|
|
88
94
|
let inStream = null;
|
|
89
95
|
|
|
@@ -97,6 +103,22 @@ class ForEachEntry {
|
|
|
97
103
|
if (typeof userTransform !== 'function') throw new Error('async transform function is required');
|
|
98
104
|
if (userTransform.length > 1) throw new Error('transform should be an async function that accepts one argument');
|
|
99
105
|
|
|
106
|
+
let progressThrottle = () => {};
|
|
107
|
+
if (typeof progress === 'function') {
|
|
108
|
+
const startTime = new Date().getTime();
|
|
109
|
+
progressThrottle = throttle(
|
|
110
|
+
2000,
|
|
111
|
+
function ({ records, batches }) {
|
|
112
|
+
let message = `Processed ${records} across ${batches} batches,${(
|
|
113
|
+
(records * 60 * 1000) /
|
|
114
|
+
(new Date().getTime() - startTime)
|
|
115
|
+
).toFixed(1)} records/minute`;
|
|
116
|
+
progress({ records, message });
|
|
117
|
+
},
|
|
118
|
+
{ noLeading: false, noTrailing: false }
|
|
119
|
+
);
|
|
120
|
+
}
|
|
121
|
+
|
|
100
122
|
let records = 0;
|
|
101
123
|
let batches = 0;
|
|
102
124
|
|
|
@@ -110,72 +132,80 @@ class ForEachEntry {
|
|
|
110
132
|
const newStreams = [];
|
|
111
133
|
|
|
112
134
|
const bindingNames = Object.keys(bindings);
|
|
113
|
-
|
|
114
|
-
await Promise.all(
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
135
|
+
|
|
136
|
+
await Promise.all(
|
|
137
|
+
bindingNames.map(async (bindingName) => {
|
|
138
|
+
const binding = bindings[bindingName];
|
|
139
|
+
if (!binding.path) throw new Error(`Invalid binding: path is required for binding ${bindingName}`);
|
|
140
|
+
if (binding.path === 'output.timeline') {
|
|
141
|
+
const {
|
|
142
|
+
stream: streamImpl,
|
|
143
|
+
promises,
|
|
144
|
+
files
|
|
145
|
+
} = await this.getOutputStream({
|
|
146
|
+
name: bindingName,
|
|
147
|
+
postfix: binding.options?.postfix || '.timeline.csv',
|
|
148
|
+
validatorFunction: (data) => {
|
|
149
|
+
if (!data) return true;
|
|
150
|
+
if (typeof data !== 'object') throw new Error('Invalid timeline data push, must be an object');
|
|
151
|
+
// Is this necessary?
|
|
152
|
+
if (!data.person_id) throw new Error('Invalid timeline data push, must have a person_id, even if 0');
|
|
153
|
+
if (!data.ts) data.ts = new Date().toISOString();
|
|
154
|
+
return true;
|
|
155
|
+
}
|
|
156
|
+
});
|
|
157
|
+
newStreams.push(streamImpl);
|
|
158
|
+
transformArguments[bindingName] = streamImpl;
|
|
159
|
+
bindingPromises = bindingPromises.concat(promises || []);
|
|
160
|
+
outputFiles[bindingName] = files;
|
|
161
|
+
} else if (binding.path === 'output.stream') {
|
|
162
|
+
const {
|
|
163
|
+
stream: streamImpl,
|
|
164
|
+
promises,
|
|
165
|
+
files
|
|
166
|
+
} = await this.getOutputStream({
|
|
167
|
+
name: bindingName,
|
|
168
|
+
postfix: binding.options?.postfix || '.timeline.csv'
|
|
169
|
+
});
|
|
170
|
+
newStreams.push(streamImpl);
|
|
171
|
+
transformArguments[bindingName] = streamImpl;
|
|
172
|
+
bindingPromises = bindingPromises.concat(promises || []);
|
|
173
|
+
outputFiles[bindingName] = files;
|
|
174
|
+
} else if (binding.path === 'file') {
|
|
175
|
+
transformArguments[bindingName] = await getFile(binding);
|
|
176
|
+
} else if (binding.path === 'handlebars') {
|
|
177
|
+
transformArguments[bindingName] = handlebars;
|
|
178
|
+
} else {
|
|
179
|
+
throw new Error(`Unsupported binding path for binding ${bindingName}: ${binding.path}`);
|
|
180
|
+
}
|
|
181
|
+
})
|
|
182
|
+
);
|
|
151
183
|
await pipeline(
|
|
152
184
|
inStream,
|
|
153
185
|
csv.parse({
|
|
154
186
|
relax: true,
|
|
155
187
|
skip_empty_lines: true,
|
|
156
188
|
max_limit_on_data_read: 10000000,
|
|
157
|
-
columns: true
|
|
189
|
+
columns: true
|
|
158
190
|
}),
|
|
159
191
|
getBatchTransform({ batchSize }).transform,
|
|
160
|
-
parallelTransform(
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
192
|
+
parallelTransform(concurrency, (batch, cb) => {
|
|
193
|
+
userTransform({ ...transformArguments, batch })
|
|
194
|
+
.then((d) => {
|
|
195
|
+
batches += 1;
|
|
196
|
+
records += batch?.length || 0;
|
|
197
|
+
progressThrottle({ records, batches });
|
|
198
|
+
debugThrottle(`Processed ${batches} batches for a total of ${records} outbound records`);
|
|
199
|
+
cb(null, d);
|
|
200
|
+
})
|
|
201
|
+
.catch(cb);
|
|
202
|
+
}),
|
|
169
203
|
new Writable({
|
|
170
204
|
objectMode: true,
|
|
171
205
|
write(batch, enc, cb) {
|
|
172
|
-
batches += 1;
|
|
173
|
-
records += batch?.length || 0;
|
|
174
|
-
|
|
175
|
-
debugThrottle(`Processed ${batches} batches for a total of ${records} outbound records`);
|
|
176
206
|
cb();
|
|
177
|
-
}
|
|
178
|
-
})
|
|
207
|
+
}
|
|
208
|
+
})
|
|
179
209
|
);
|
|
180
210
|
debug('Completed all batches');
|
|
181
211
|
|
package/file/FileUtilities.js
CHANGED
|
@@ -890,7 +890,7 @@ Worker.prototype.stat = async function ({ filename }) {
|
|
|
890
890
|
|
|
891
891
|
if (filename.slice(-8) === '.parquet') {
|
|
892
892
|
const pq = new ParquetWorker(this);
|
|
893
|
-
output.schema = await pq.schema({ filename });
|
|
893
|
+
output.schema = (await pq.schema({ filename }))?.schema;
|
|
894
894
|
output.records = (await pq.meta({ filename }))?.records;
|
|
895
895
|
}
|
|
896
896
|
|
package/index.js
CHANGED
|
@@ -319,6 +319,79 @@ function getEntryType(o, defaults = {}) {
|
|
|
319
319
|
return etype;
|
|
320
320
|
}
|
|
321
321
|
|
|
322
|
+
/**
 * Build an ascending list of Date objects spanning startDate..endDate, with a
 * step size chosen from the length of the range:
 *
 *   - fewer than 10 days         -> every day
 *   - fewer than 32 days         -> every 3 days
 *   - fewer than 4 months apart  -> every 7 days
 *   - fewer than 2 years apart   -> every month
 *   - fewer than 4 years apart   -> every 3 months
 *   - otherwise                  -> every year
 *
 * Month and year differences are calendar differences (year/month fields),
 * not elapsed time. All stepping is done in local time.
 *
 * The final element is always the end date itself: if stepping does not land
 * exactly on it, it is appended. As a consequence, when the start is after
 * the end (or the start is not a valid date) the result is just `[end]`.
 *
 * @param {Date|string|number} startDate - anything accepted by `new Date()`
 * @param {Date|string|number} endDate - anything accepted by `new Date()`
 * @returns {Date[]} fresh Date instances; the inputs are never mutated
 */
function getDateRangeArray(startDate, endDate) {
  const start = new Date(startDate);
  const end = new Date(endDate);
  const msInDay = 24 * 60 * 60 * 1000;

  function addDays(date, days) {
    const d = new Date(date);
    d.setDate(d.getDate() + days);
    return d;
  }

  // Month arithmetic that clamps the day of month instead of overflowing:
  // a plain setMonth() on Jan 31 yields "Feb 31", which Date normalises into
  // early March. Move to the 1st first, shift the month, then restore the
  // original day capped at the target month's length.
  function addMonths(date, months) {
    const d = new Date(date);
    const dayOfMonth = d.getDate();
    d.setDate(1);
    d.setMonth(d.getMonth() + months);
    const lastDayOfMonth = new Date(d.getFullYear(), d.getMonth() + 1, 0).getDate();
    d.setDate(Math.min(dayOfMonth, lastDayOfMonth));
    return d;
  }

  // Years are stepped as 12-month jumps so Feb 29 gets the same clamping.
  function addYears(date, years) {
    return addMonths(date, years * 12);
  }

  const diffDays = Math.floor((end - start) / msInDay);
  const diffMonths = (end.getFullYear() - start.getFullYear()) * 12 + (end.getMonth() - start.getMonth());
  const diffYears = end.getFullYear() - start.getFullYear();

  // Each step is computed from the original start (index * step size) rather
  // than from the previous element, so a clamped day (e.g. Feb 28) does not
  // stick for the rest of the series.
  let stepFn;
  if (diffDays < 10) {
    stepFn = (index) => addDays(start, index);
  } else if (diffDays < 32) {
    stepFn = (index) => addDays(start, index * 3);
  } else if (diffMonths < 4) {
    stepFn = (index) => addDays(start, index * 7);
  } else if (diffYears < 2) {
    stepFn = (index) => addMonths(start, index);
  } else if (diffYears < 4) {
    stepFn = (index) => addMonths(start, index * 3);
  } else {
    stepFn = (index) => addYears(start, index);
  }

  const result = [];
  for (let index = 0; ; index += 1) {
    const current = stepFn(index);
    // Written as a negated <= so an Invalid Date (NaN comparison) also stops the loop.
    if (!(current <= end)) break;
    result.push(current);
  }

  // Ensure the last date is exactly the end date
  if (result.length === 0 || result[result.length - 1].getTime() !== end.getTime()) {
    result.push(new Date(end));
  }
  return result;
}
|
|
377
|
+
|
|
378
|
+
/**
 * An Error that can be constructed from either a plain message string or an
 * object carrying extra fields.
 *
 *   - string: behaves like `new Error(message)`.
 *   - object: `data.message` becomes the error message, every own enumerable
 *     key of `data` is copied onto the error, and `status` is always assigned
 *     from `data.status` (so it is `undefined` when absent).
 *   - anything else, including `null` and `undefined`: the message is
 *     '(No error message)'.
 */
class ObjectError extends Error {
  /**
   * @param {string|object} [data] - message string, or an object with an
   *   optional `message`, optional `status`, and any other fields to attach
   */
  constructor(data) {
    // typeof null is 'object', so null must be excluded explicitly; otherwise
    // reading data.message throws a TypeError instead of building an error.
    const isObject = data !== null && typeof data === 'object';

    let message = '(No error message)';
    if (typeof data === 'string') {
      // normal behavior
      message = data;
    } else if (isObject) {
      message = data.message;
    }
    super(message);

    if (isObject) {
      Object.keys(data).forEach((k) => {
        this[k] = data[k];
      });
      this.status = data.status;
    }
  }
}
|
|
394
|
+
|
|
322
395
|
module.exports = {
|
|
323
396
|
appendPostfix,
|
|
324
397
|
bool,
|
|
@@ -329,6 +402,7 @@ module.exports = {
|
|
|
329
402
|
ForEachEntry,
|
|
330
403
|
FileUtilities,
|
|
331
404
|
getBatchTransform,
|
|
405
|
+
getDateRangeArray,
|
|
332
406
|
getDebatchTransform,
|
|
333
407
|
getEntryType,
|
|
334
408
|
getEntryTypeId,
|
|
@@ -346,6 +420,7 @@ module.exports = {
|
|
|
346
420
|
handlebars,
|
|
347
421
|
isValidDate,
|
|
348
422
|
makeStrings,
|
|
423
|
+
ObjectError,
|
|
349
424
|
relativeDate,
|
|
350
425
|
streamPacket,
|
|
351
426
|
TIMELINE_ENTRY_TYPES,
|
package/package.json
CHANGED
package/test/processing/bigDataMessage.js
CHANGED
|
@@ -1,10 +1,8 @@
|
|
|
1
|
-
const {
|
|
2
|
-
describe, it,
|
|
3
|
-
} = require('node:test');
|
|
1
|
+
const { describe, it } = require('node:test');
|
|
4
2
|
const assert = require('node:assert');
|
|
5
3
|
const debug = require('debug')('test:big-data');
|
|
6
4
|
const { setTimeout } = require('node:timers/promises');
|
|
7
|
-
const { v7: uuidv7 } = require('uuid');
|
|
5
|
+
//const { v7: uuidv7 } = require('uuid');
|
|
8
6
|
|
|
9
7
|
const { ForEachEntry } = require('../../index');
|
|
10
8
|
|
|
@@ -14,46 +12,40 @@ describe('big-data message: forEachPerson', async () => {
|
|
|
14
12
|
let counter = 0;
|
|
15
13
|
const forEach = new ForEachEntry();
|
|
16
14
|
|
|
17
|
-
const output = await forEach.process(
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
},
|
|
28
|
-
async transform({
|
|
29
|
-
batch,
|
|
30
|
-
message,
|
|
31
|
-
handlebars,
|
|
32
|
-
timelineOutputStream,
|
|
33
|
-
}) {
|
|
34
|
-
const id = uuidv7();
|
|
35
|
-
debug(`Processing batch of ${batch.length} - ${id}`);
|
|
36
|
-
if (!message?.content?.text) throw new Error(`Sample message has no content.text:${JSON.stringify(message)}`);
|
|
37
|
-
const template = handlebars.compile(message.content.text);
|
|
38
|
-
batch.forEach((person) => {
|
|
39
|
-
messageContent.push(template(person));
|
|
40
|
-
});
|
|
41
|
-
batch.forEach((p) => {
|
|
42
|
-
const o = {
|
|
43
|
-
person_id: p.person_id,
|
|
44
|
-
email: p.email,
|
|
45
|
-
entry_type: 'EMAIL_DELIVERED',
|
|
46
|
-
};
|
|
47
|
-
counter += 1;
|
|
48
|
-
if (counter % 10000 === 0) debug(`*** Processed ${counter} items, last person_id=${p.person_id}`, o);
|
|
49
|
-
timelineOutputStream.push(o);
|
|
50
|
-
});
|
|
51
|
-
// debug(`Processed batch of size ${batch.length}`);
|
|
52
|
-
await setTimeout(Math.random() * 3000);
|
|
53
|
-
debug(`Completed processing ${id}`);
|
|
54
|
-
},
|
|
15
|
+
const output = await forEach.process({
|
|
16
|
+
// packet: '../1000000_person_message.packet.zip',
|
|
17
|
+
filename: '../1000000_person_message.packet/person/1000000_fake_people.csv',
|
|
18
|
+
batchSize: 10000,
|
|
19
|
+
concurrency: 1000,
|
|
20
|
+
progress: debug,
|
|
21
|
+
bindings: {
|
|
22
|
+
timelineOutputStream: { path: 'output.timeline' },
|
|
23
|
+
message: { path: 'file', filename: '../1000000_person_message.packet/message/message.json5' },
|
|
24
|
+
handlebars: { path: 'handlebars' }
|
|
55
25
|
},
|
|
56
|
-
|
|
26
|
+
async transform({ batch, message, handlebars, timelineOutputStream }) {
|
|
27
|
+
//const id = uuidv7();
|
|
28
|
+
//debug(`Processing batch of ${batch.length} - ${id}`);
|
|
29
|
+
if (!message?.content?.text) throw new Error(`Sample message has no content.text:${JSON.stringify(message)}`);
|
|
30
|
+
const template = handlebars.compile(message.content.text);
|
|
31
|
+
batch.forEach((person) => {
|
|
32
|
+
messageContent.push(template(person));
|
|
33
|
+
});
|
|
34
|
+
batch.forEach((p) => {
|
|
35
|
+
const o = {
|
|
36
|
+
person_id: p.person_id,
|
|
37
|
+
email: p.email,
|
|
38
|
+
entry_type: 'EMAIL_DELIVERED'
|
|
39
|
+
};
|
|
40
|
+
counter += 1;
|
|
41
|
+
//if (counter % 10000 === 0) debug(`*** Processed ${counter} items, last person_id=${p.person_id}`, o);
|
|
42
|
+
timelineOutputStream.push(o);
|
|
43
|
+
});
|
|
44
|
+
// debug(`Processed batch of size ${batch.length}`);
|
|
45
|
+
await setTimeout(Math.random() * 3000);
|
|
46
|
+
//debug(`Completed processing ${id}`);
|
|
47
|
+
}
|
|
48
|
+
});
|
|
57
49
|
debug(output);
|
|
58
50
|
|
|
59
51
|
assert.equal(counter, 1000000, `Expected to loop through 1000000 people, actual:${counter}`);
|
package/timelineTypes.js
CHANGED
|
@@ -31,6 +31,7 @@ const SMS_DELIVERED = 31;
|
|
|
31
31
|
const SMS_CLICK = 33;
|
|
32
32
|
const SMS_UNSUBSCRIBE = 34;
|
|
33
33
|
const SMS_BOUNCE = 37;
|
|
34
|
+
const SMS_SPAM = 38;
|
|
34
35
|
const SMS_REPLY = 39;
|
|
35
36
|
|
|
36
37
|
const EMAIL_SEND = 40;
|
|
@@ -112,6 +113,7 @@ const TIMELINE_ENTRY_TYPES = {
|
|
|
112
113
|
SMS_CLICK,
|
|
113
114
|
SMS_UNSUBSCRIBE,
|
|
114
115
|
SMS_BOUNCE,
|
|
116
|
+
SMS_SPAM,
|
|
115
117
|
SMS_REPLY,
|
|
116
118
|
|
|
117
119
|
// email interactions
|