@engine9-io/input-tools 1.9.1 → 1.9.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/ForEachEntry.js CHANGED
@@ -17,9 +17,7 @@ const handlebars = require('handlebars');
 const ValidatingReadable = require('./ValidatingReadable');
 const FileUtilities = require('./file/FileUtilities');
 
-const {
-  getTempFilename, getBatchTransform, getFile, streamPacket,
-} = require('./file/tools');
+const { getTempFilename, getBatchTransform, getFile, streamPacket } = require('./file/tools');
 
 class ForEachEntry {
   constructor({ accountId } = {}) {
@@ -31,45 +29,52 @@ class ForEachEntry {
     if (this.outputStreams[name]?.items) return this.outputStreams[name].items;
 
     this.outputStreams[name] = this.outputStreams[name] || {
-      mutex: new Mutex(),
+      mutex: new Mutex()
     };
 
     return this.outputStreams[name].mutex.runExclusive(async () => {
       const fileInfo = {
         filename: await getTempFilename({ postfix }),
-        records: 0,
+        records: 0
       };
 
       debug(`Output file requested, writing output to to: ${fileInfo.filename}`);
-      const outputStream = new ValidatingReadable({
-        objectMode: true,
-      }, validatorFunction);
-      // eslint-disable-next-line no-underscore-dangle
+      const outputStream = new ValidatingReadable(
+        {
+          objectMode: true
+        },
+        validatorFunction
+      );
+
       outputStream._read = () => {};
 
       const writeStream = fs.createWriteStream(fileInfo.filename);
       const finishWritingOutputPromise = new Promise((resolve, reject) => {
-        writeStream.on('finish', () => {
-          resolve();
-        }).on('error', (err) => {
-          reject(err);
-        });
+        writeStream
+          .on('finish', () => {
+            resolve();
+          })
+          .on('error', (err) => {
+            reject(err);
+          });
       });
 
       this.outputStreams[name].items = {
         stream: outputStream,
         promises: [finishWritingOutputPromise],
-        files: [fileInfo],
+        files: [fileInfo]
       };
 
       outputStream
-        .pipe(new Transform({
-          objectMode: true,
-          transform(o, enc, cb) {
-            fileInfo.records += 1;
-            cb(null, o);
-          },
-        }))
+        .pipe(
+          new Transform({
+            objectMode: true,
+            transform(o, enc, cb) {
+              fileInfo.records += 1;
+              cb(null, o);
+            }
+          })
+        )
         .pipe(csv.stringify({ header: true }))
         .pipe(writeStream);
 
@@ -80,10 +85,11 @@ class ForEachEntry {
   async process({
     packet,
     filename,
+    progress,
     transform: userTransform,
     batchSize = 500,
     concurrency = 10,
-    bindings = {},
+    bindings = {}
   }) {
     let inStream = null;
 
@@ -97,6 +103,22 @@ class ForEachEntry {
     if (typeof userTransform !== 'function') throw new Error('async transform function is required');
     if (userTransform.length > 1) throw new Error('transform should be an async function that accepts one argument');
 
+    let progressThrottle = () => {};
+    if (typeof progress === 'function') {
+      const startTime = new Date().getTime();
+      progressThrottle = throttle(
+        2000,
+        function ({ records, batches }) {
+          let message = `Processed ${records} across ${batches} batches,${(
+            (records * 60 * 1000) /
+            (new Date().getTime() - startTime)
+          ).toFixed(1)} records/minute`;
+          progress({ records, message });
+        },
+        { noLeading: false, noTrailing: false }
+      );
+    }
+
     let records = 0;
     let batches = 0;
 
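Note: the new progress option is throttled to fire at most once every 2000 ms with a { records, message } payload (the delay/callback/options argument order is consistent with the throttle-debounce API). A minimal caller-side sketch, with the file path and logging purely illustrative:

const { ForEachEntry } = require('@engine9-io/input-tools');

async function run() {
  const forEach = new ForEachEntry();
  await forEach.process({
    filename: './people.csv', // hypothetical input file
    batchSize: 500,
    concurrency: 10,
    // invoked at most every ~2s with { records, message }
    progress: ({ records, message }) => console.log(message),
    // must be an async function taking exactly one argument
    transform: async ({ batch }) => {
      // handle up to batchSize parsed CSV rows per call
    },
  });
}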
@@ -110,72 +132,80 @@ class ForEachEntry {
     const newStreams = [];
 
     const bindingNames = Object.keys(bindings);
-    // eslint-disable-next-line no-await-in-loop
-    await Promise.all(bindingNames.map(async (bindingName) => {
-      const binding = bindings[bindingName];
-      if (!binding.path) throw new Error(`Invalid binding: path is required for binding ${bindingName}`);
-      if (binding.path === 'output.timeline') {
-        const { stream: streamImpl, promises, files } = await this.getOutputStream({
-          name: bindingName,
-          postfix: binding.options?.postfix || '.timeline.csv',
-          validatorFunction: (data) => {
-            if (!data) return true;
-            if (typeof data !== 'object') throw new Error('Invalid timeline data push, must be an object');
-            // Is this necessary?
-            if (!data.person_id) throw new Error('Invalid timeline data push, must have a person_id, even if 0');
-            if (!data.ts) data.ts = new Date().toISOString();
-            return true;
-          },
-        });
-        newStreams.push(streamImpl);
-        transformArguments[bindingName] = streamImpl;
-        bindingPromises = bindingPromises.concat(promises || []);
-        outputFiles[bindingName] = files;
-      } else if (binding.path === 'output.stream') {
-        const { stream: streamImpl, promises, files } = await this.getOutputStream({
-          name: bindingName,
-          postfix: binding.options?.postfix || '.timeline.csv',
-        });
-        newStreams.push(streamImpl);
-        transformArguments[bindingName] = streamImpl;
-        bindingPromises = bindingPromises.concat(promises || []);
-        outputFiles[bindingName] = files;
-      } else if (binding.path === 'file') {
-        transformArguments[bindingName] = await getFile(binding);
-      } else if (binding.path === 'handlebars') {
-        transformArguments[bindingName] = handlebars;
-      } else {
-        throw new Error(`Unsupported binding path for binding ${bindingName}: ${binding.path}`);
-      }
-    }));
+
+    await Promise.all(
+      bindingNames.map(async (bindingName) => {
+        const binding = bindings[bindingName];
+        if (!binding.path) throw new Error(`Invalid binding: path is required for binding ${bindingName}`);
+        if (binding.path === 'output.timeline') {
+          const {
+            stream: streamImpl,
+            promises,
+            files
+          } = await this.getOutputStream({
+            name: bindingName,
+            postfix: binding.options?.postfix || '.timeline.csv',
+            validatorFunction: (data) => {
+              if (!data) return true;
+              if (typeof data !== 'object') throw new Error('Invalid timeline data push, must be an object');
+              // Is this necessary?
+              if (!data.person_id) throw new Error('Invalid timeline data push, must have a person_id, even if 0');
+              if (!data.ts) data.ts = new Date().toISOString();
+              return true;
+            }
+          });
+          newStreams.push(streamImpl);
+          transformArguments[bindingName] = streamImpl;
+          bindingPromises = bindingPromises.concat(promises || []);
+          outputFiles[bindingName] = files;
+        } else if (binding.path === 'output.stream') {
+          const {
+            stream: streamImpl,
+            promises,
+            files
+          } = await this.getOutputStream({
+            name: bindingName,
+            postfix: binding.options?.postfix || '.timeline.csv'
+          });
+          newStreams.push(streamImpl);
+          transformArguments[bindingName] = streamImpl;
+          bindingPromises = bindingPromises.concat(promises || []);
+          outputFiles[bindingName] = files;
+        } else if (binding.path === 'file') {
+          transformArguments[bindingName] = await getFile(binding);
+        } else if (binding.path === 'handlebars') {
+          transformArguments[bindingName] = handlebars;
+        } else {
+          throw new Error(`Unsupported binding path for binding ${bindingName}: ${binding.path}`);
+        }
+      })
+    );
     await pipeline(
       inStream,
       csv.parse({
         relax: true,
         skip_empty_lines: true,
         max_limit_on_data_read: 10000000,
-        columns: true,
+        columns: true
       }),
       getBatchTransform({ batchSize }).transform,
-      parallelTransform(
-        concurrency,
-        (batch, cb) => {
-          userTransform({ ...transformArguments, batch })
-            .then((d) => cb(null, d))
-            .catch(cb);
-        },
-
-      ),
+      parallelTransform(concurrency, (batch, cb) => {
+        userTransform({ ...transformArguments, batch })
+          .then((d) => {
+            batches += 1;
+            records += batch?.length || 0;
+            progressThrottle({ records, batches });
+            debugThrottle(`Processed ${batches} batches for a total of ${records} outbound records`);
+            cb(null, d);
+          })
+          .catch(cb);
+      }),
       new Writable({
         objectMode: true,
         write(batch, enc, cb) {
-          batches += 1;
-          records += batch?.length || 0;
-
-          debugThrottle(`Processed ${batches} batches for a total of ${records} outbound records`);
           cb();
-        },
-      }),
+        }
+      })
     );
     debug('Completed all batches');
 
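For reference, the output.timeline validator above means anything pushed to a timeline binding must be an object carrying a person_id (even 0), and ts is auto-filled with the current ISO timestamp when absent. A hypothetical push from inside a transform:

// inside async transform({ batch, timelineOutputStream }) { ... }
timelineOutputStream.push({
  person_id: 123, // required, even if 0
  entry_type: 'EMAIL_DELIVERED',
  // ts omitted: the validator sets it to new Date().toISOString()
});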
@@ -248,8 +248,6 @@ Worker.prototype.fileToObjectStream = async function (options) {
 
   let count = 0;
 
-  debug(`Reading file ${filename} with encoding:`, encoding);
-
   let transforms = [];
 
   if (postfix === 'gz') {
@@ -265,6 +263,8 @@ Worker.prototype.fileToObjectStream = async function (options) {
   }
   let format = formatOverride || postfix;
 
+  debug(`Reading file ${filename} with encoding: ${encoding} and format ${format}`);
+
   if (format === 'csv') {
     const csvTransforms = this.csvToObjectTransforms({ ...options });
     transforms = transforms.concat(csvTransforms.transforms);
@@ -890,7 +890,7 @@ Worker.prototype.stat = async function ({ filename }) {
 
   if (filename.slice(-8) === '.parquet') {
     const pq = new ParquetWorker(this);
-    output.schema = await pq.schema({ filename });
+    output.schema = (await pq.schema({ filename }))?.schema;
     output.records = (await pq.meta({ filename }))?.records;
   }
 
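The stat fix suggests pq.schema() resolves to a wrapper object whose schema property carries the actual parquet schema, so 1.9.1 was assigning the whole wrapper. Assuming a Worker instance named worker (both names illustrative), a caller now gets the unwrapped values:

// sketch; 'worker' and the file name are illustrative
const { schema, records } = await worker.stat({ filename: 'events.parquet' });
// schema: the parquet schema itself; records: row count from pq.meta()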
package/index.js CHANGED
@@ -313,16 +313,85 @@ function getEntryType(o, defaults = {}) {
   if (etype) return etype;
 
   const id = o.entry_type_id || defaults.entry_type_id;
-  if (id) return id;
 
-  if (!id) {
-    throw new Error('No entry_type, nor entry_type_id specified, specify a default.entry_type');
-  }
   etype = TIMELINE_ENTRY_TYPES[id];
   if (etype === undefined) throw new Error(`Invalid entry_type: ${etype}`);
   return etype;
 }
 
+function getDateRangeArray(startDate, endDate) {
+  const start = new Date(startDate);
+  const end = new Date(endDate);
+  const result = [];
+  const msInDay = 24 * 60 * 60 * 1000;
+
+  function addDays(date, days) {
+    const d = new Date(date);
+    d.setDate(d.getDate() + days);
+    return d;
+  }
+  function addMonths(date, months) {
+    const d = new Date(date);
+    d.setMonth(d.getMonth() + months);
+    return d;
+  }
+  function addYears(date, years) {
+    const d = new Date(date);
+    d.setFullYear(d.getFullYear() + years);
+    return d;
+  }
+
+  const diffDays = Math.floor((end - start) / msInDay);
+  const diffMonths = (end.getFullYear() - start.getFullYear()) * 12 + (end.getMonth() - start.getMonth());
+  const diffYears = end.getFullYear() - start.getFullYear();
+
+  let current = new Date(start);
+
+  let stepFn;
+  if (diffDays < 10) {
+    stepFn = (date) => addDays(date, 1);
+  } else if (diffDays < 32) {
+    stepFn = (date) => addDays(date, 3);
+  } else if (diffMonths < 4) {
+    stepFn = (date) => addDays(date, 7);
+  } else if (diffYears < 2) {
+    stepFn = (date) => addMonths(date, 1);
+  } else if (diffYears < 4) {
+    stepFn = (date) => addMonths(date, 3);
+  } else {
+    stepFn = (date) => addYears(date, 1);
+  }
+
+  while (current <= end) {
+    result.push(new Date(current));
+    const next = stepFn(current);
+    if (next > end) break;
+    current = next;
+  }
+  // Ensure the last date is exactly the end date
+  if (result.length === 0 || result[result.length - 1].getTime() !== end.getTime()) {
+    result.push(new Date(end));
+  }
+  return result;
+}
+
+class ObjectError extends Error {
+  constructor(data) {
+    if (typeof data === 'string') {
+      // normal behavior
+      super(data);
+    } else if (typeof data === 'object') {
+      super(data.message);
+      Object.keys(data).forEach((k) => {
+        this[k] = data[k];
+      });
+      this.status = data.status;
+    } else {
+      super('(No error message)');
+    }
+  }
+}
+
 module.exports = {
   appendPostfix,
   bool,
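The new getDateRangeArray(startDate, endDate) helper returns an array of Dates from start to end, picking a step size from the overall span: daily under 10 days, every 3 days under 32 days, weekly under 4 months, monthly under 2 years, quarterly under 4 years, yearly beyond that, always finishing exactly on the end date. A quick usage sketch:

const { getDateRangeArray } = require('@engine9-io/input-tools');

// ~6.5-month span: diffMonths >= 4 and diffYears < 2, so monthly steps
const dates = getDateRangeArray('2024-01-01', '2024-07-15');
// -> Jan 1, Feb 1, ..., Jul 1, plus Jul 15 (the exact end date)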
@@ -333,6 +402,7 @@ module.exports = {
   ForEachEntry,
   FileUtilities,
   getBatchTransform,
+  getDateRangeArray,
   getDebatchTransform,
   getEntryType,
   getEntryTypeId,
@@ -350,6 +420,7 @@ module.exports = {
   handlebars,
   isValidDate,
   makeStrings,
+  ObjectError,
   relativeDate,
   streamPacket,
   TIMELINE_ENTRY_TYPES,
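ObjectError behaves like a plain Error when constructed with a string, but given an object it uses data.message as the message and copies every key onto the instance, so callers can attach structured fields such as a status code:

const { ObjectError } = require('@engine9-io/input-tools');

try {
  throw new ObjectError({ message: 'Invalid input', status: 422, field: 'email' });
} catch (e) {
  // e instanceof Error, and e.message === 'Invalid input'
  console.log(e.status, e.field); // 422 'email'
}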
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@engine9-io/input-tools",
-  "version": "1.9.1",
+  "version": "1.9.3",
   "description": "Tools for dealing with Engine9 inputs",
   "main": "index.js",
   "scripts": {
@@ -1,10 +1,8 @@
-const {
-  describe, it,
-} = require('node:test');
+const { describe, it } = require('node:test');
 const assert = require('node:assert');
 const debug = require('debug')('test:big-data');
 const { setTimeout } = require('node:timers/promises');
-const { v7: uuidv7 } = require('uuid');
+//const { v7: uuidv7 } = require('uuid');
 
 const { ForEachEntry } = require('../../index');
 
@@ -14,46 +12,40 @@ describe('big-data message: forEachPerson', async () => {
   let counter = 0;
   const forEach = new ForEachEntry();
 
-  const output = await forEach.process(
-    {
-      // packet: '../1000000_person_message.packet.zip',
-      filename: '../1000000_fake_people.csv',
-      batchSize: 10000,
-      concurrency: 1000,
-      bindings: {
-        timelineOutputStream: { path: 'output.timeline' },
-        message: { path: 'file', filename: '../1000000_person_message.packet/message/message.json5' },
-        handlebars: { path: 'handlebars' },
-      },
-      async transform({
-        batch,
-        message,
-        handlebars,
-        timelineOutputStream,
-      }) {
-        const id = uuidv7();
-        debug(`Processing batch of ${batch.length} - ${id}`);
-        if (!message?.content?.text) throw new Error(`Sample message has no content.text:${JSON.stringify(message)}`);
-        const template = handlebars.compile(message.content.text);
-        batch.forEach((person) => {
-          messageContent.push(template(person));
-        });
-        batch.forEach((p) => {
-          const o = {
-            person_id: p.person_id,
-            email: p.email,
-            entry_type: 'EMAIL_DELIVERED',
-          };
-          counter += 1;
-          if (counter % 10000 === 0) debug(`*** Processed ${counter} items, last person_id=${p.person_id}`, o);
-          timelineOutputStream.push(o);
-        });
-        // debug(`Processed batch of size ${batch.length}`);
-        await setTimeout(Math.random() * 3000);
-        debug(`Completed processing ${id}`);
-      },
+  const output = await forEach.process({
+    // packet: '../1000000_person_message.packet.zip',
+    filename: '../1000000_person_message.packet/person/1000000_fake_people.csv',
+    batchSize: 10000,
+    concurrency: 1000,
+    progress: debug,
+    bindings: {
+      timelineOutputStream: { path: 'output.timeline' },
+      message: { path: 'file', filename: '../1000000_person_message.packet/message/message.json5' },
+      handlebars: { path: 'handlebars' }
     },
-  );
+    async transform({ batch, message, handlebars, timelineOutputStream }) {
+      //const id = uuidv7();
+      //debug(`Processing batch of ${batch.length} - ${id}`);
+      if (!message?.content?.text) throw new Error(`Sample message has no content.text:${JSON.stringify(message)}`);
+      const template = handlebars.compile(message.content.text);
+      batch.forEach((person) => {
+        messageContent.push(template(person));
+      });
+      batch.forEach((p) => {
+        const o = {
+          person_id: p.person_id,
+          email: p.email,
+          entry_type: 'EMAIL_DELIVERED'
+        };
+        counter += 1;
+        //if (counter % 10000 === 0) debug(`*** Processed ${counter} items, last person_id=${p.person_id}`, o);
+        timelineOutputStream.push(o);
+      });
+      // debug(`Processed batch of size ${batch.length}`);
+      await setTimeout(Math.random() * 3000);
+      //debug(`Completed processing ${id}`);
+    }
+  });
   debug(output);
 
   assert.equal(counter, 1000000, `Expected to loop through 1000000 people, actual:${counter}`);
package/timelineTypes.js CHANGED
@@ -31,6 +31,7 @@ const SMS_DELIVERED = 31;
 const SMS_CLICK = 33;
 const SMS_UNSUBSCRIBE = 34;
 const SMS_BOUNCE = 37;
+const SMS_SPAM = 38;
 const SMS_REPLY = 39;
 
 const EMAIL_SEND = 40;
@@ -112,6 +113,7 @@ const TIMELINE_ENTRY_TYPES = {
   SMS_CLICK,
   SMS_UNSUBSCRIBE,
   SMS_BOUNCE,
+  SMS_SPAM,
   SMS_REPLY,
 
   // email interactions