make-mp-data 2.0.21 → 2.0.23

@@ -32,13 +32,24 @@ export async function createHookArray(arr = [], opts) {
   } = opts || {};
 
   const FILE_CONN = pLimit(concurrency);
-  const { config = {}, runtime = {} } = context;
+  const {
+    config = {},
+    runtime = {
+      operations: 0,
+      eventCount: 0,
+      userCount: 0,
+      isBatchMode: false,
+      verbose: false,
+      isCLI: false
+    }
+  } = context;
   const BATCH_SIZE = config.batchSize || 1_000_000;
   const NODE_ENV = process.env.NODE_ENV || "unknown";
 
   let batch = 0;
   let writeDir;
   let isBatchMode = runtime.isBatchMode || false;
+  let isWriting = false; // Prevent concurrent writes
 
   // Determine write directory
   const dataFolder = path.resolve("./data");
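
Note: the runtime default above is a destructuring default, so it only applies when context.runtime is undefined altogether; a partially populated runtime object is taken as-is, not merged with the defaults. A minimal standalone illustration of that semantic:

    // Defaults apply per binding, not per field: a partial runtime wins wholesale
    const { runtime = { operations: 0, isBatchMode: false } } = { runtime: { operations: 5 } };
    console.log(runtime.operations);  // 5
    console.log(runtime.isBatchMode); // undefined (the default object was never used)
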
@@ -54,13 +65,15 @@
   }
 
   function getWritePath() {
+    const gzipSuffix = (config.gzip && !writeDir?.startsWith('gs://')) ? '.gz' : '';
+
     if (isBatchMode) {
-      if (writeDir?.startsWith('gs://')) return `${writeDir}/${filepath}-part-${batch.toString()}.${format}`;
-      return path.join(writeDir, `${filepath}-part-${batch.toString()}.${format}`);
+      if (writeDir?.startsWith('gs://')) return `${writeDir}/${filepath}-part-${batch.toString()}.${format}${gzipSuffix}`;
+      return path.join(writeDir, `${filepath}-part-${batch.toString()}.${format}${gzipSuffix}`);
     }
     else {
-      if (writeDir?.startsWith('gs://')) return `${writeDir}/${filepath}.${format}`;
-      return path.join(writeDir, `${filepath}.${format}`);
+      if (writeDir?.startsWith('gs://')) return `${writeDir}/${filepath}.${format}${gzipSuffix}`;
+      return path.join(writeDir, `${filepath}.${format}${gzipSuffix}`);
     }
   }
 
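The suffix rule above in isolation, as a sketch (writeDir and config shapes assumed from the hunk): '.gz' is appended only for local writes; gs:// destinations never receive it.

    // Sketch mirroring the gzipSuffix logic in getWritePath()
    function gzipSuffixFor(writeDir, config) {
      return (config.gzip && !writeDir?.startsWith('gs://')) ? '.gz' : '';
    }

    gzipSuffixFor('./data', { gzip: true });      // ".gz"  → e.g. foo.csv.gz
    gzipSuffixFor('gs://bucket', { gzip: true }); // ""     → GCS objects keep plain names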
 
@@ -109,15 +122,26 @@
     }
   }
 
-  if (arr.length > BATCH_SIZE) {
+  // Check batch size and handle writes synchronously to prevent race conditions
+  if (arr.length > BATCH_SIZE && !isWriting) {
+    isWriting = true; // Lock to prevent concurrent writes
     isBatchMode = true;
     runtime.isBatchMode = true; // Update runtime state
     batch++;
     const writePath = getWritePath();
-    const writeResult = await FILE_CONN(() => writeToDisk(arr, { writePath }));
-    // Ensure array is cleared after successful write
-    arr.length = 0;
-    return writeResult;
+
+    try {
+      // Create a copy of the data to write
+      const dataToWrite = [...arr];
+      // Clear the array immediately to prevent race conditions
+      arr.length = 0;
+
+      // Write to disk (this is now synchronous from the perspective of batch management)
+      const writeResult = await FILE_CONN(() => writeToDisk(dataToWrite, { writePath }));
+      return writeResult;
+    } finally {
+      isWriting = false; // Release the lock
+    }
   } else {
     return Promise.resolve(false);
   }
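
The copy-then-clear pattern above snapshots the shared array and empties it before the awaited write, so rows pushed while the write is in flight accumulate toward the next batch instead of being lost or written twice. The lock as a standalone sketch (writeToDisk is a stand-in):

    let isWriting = false;

    async function maybeFlush(arr, writeToDisk) {
      if (isWriting) return false;   // another write is in flight; skip
      isWriting = true;              // take the lock
      try {
        const snapshot = [...arr];   // copy the pending rows
        arr.length = 0;              // clear the shared array immediately
        return await writeToDisk(snapshot);
      } finally {
        isWriting = false;           // release even if the write throws
      }
    }
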
@@ -131,12 +155,19 @@
     console.log(`\n\twriting ${writePath}\n`);
   }
 
+  const streamOptions = {
+    gzip: config.gzip || false
+  };
+
   switch (format) {
     case "csv":
-      writeResult = await u.streamCSV(writePath, data);
+      writeResult = await u.streamCSV(writePath, data, streamOptions);
       break;
     case "json":
-      writeResult = await u.streamJSON(writePath, data);
+      writeResult = await u.streamJSON(writePath, data, streamOptions);
+      break;
+    case "parquet":
+      writeResult = await u.streamParquet(writePath, data, streamOptions);
       break;
     default:
       throw new Error(`format ${format} is not supported`);
@@ -148,10 +179,21 @@
 
   async function flush() {
     if (arr.length > 0) {
-      batch++;
-      const writePath = getWritePath();
-      await FILE_CONN(() => writeToDisk(arr, { writePath }));
-      if (isBatchMode) arr.length = 0; // free up memory for batch mode
+      // Wait for any ongoing writes to complete
+      while (isWriting) {
+        await new Promise(resolve => setTimeout(resolve, 10));
+      }
+
+      isWriting = true;
+      try {
+        batch++;
+        const writePath = getWritePath();
+        const dataToWrite = [...arr];
+        arr.length = 0; // Clear array after copying data
+        await FILE_CONN(() => writeToDisk(dataToWrite, { writePath }));
+      } finally {
+        isWriting = false;
+      }
     }
   }
 
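The while loop in flush() is a poll-based wait: it yields to the event loop in 10 ms ticks until any in-flight batch write releases isWriting, then takes the lock itself. The same idea as a reusable sketch:

    // Sketch: poll until a condition holds, yielding between checks
    async function waitUntil(predicate, intervalMs = 10) {
      while (!predicate()) {
        await new Promise(resolve => setTimeout(resolve, intervalMs));
      }
    }

    // usage: await waitUntil(() => !isWriting);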
 
@@ -186,12 +228,15 @@ export class StorageManager {
   async initializeContainers() {
     const { config } = this.context;
 
+    // Validate configuration for potential data loss scenarios
+    this.validateConfiguration(config);
+
     /** @type {Storage} */
     const storage = {
       eventData: await createHookArray([], {
         hook: config.hook,
         type: "event",
-        filepath: `${config.simulationName || 'events'}-EVENTS`,
+        filepath: `${config.name}-EVENTS`,
         format: config.format || "csv",
         concurrency: config.concurrency || 1,
         context: this.context
@@ -200,7 +245,7 @@
       userProfilesData: await createHookArray([], {
         hook: config.hook,
         type: "user",
-        filepath: `${config.simulationName || 'users'}-USERS`,
+        filepath: `${config.name}-USERS`,
         format: config.format || "csv",
         concurrency: config.concurrency || 1,
         context: this.context
@@ -209,7 +254,7 @@
       adSpendData: await createHookArray([], {
         hook: config.hook,
         type: "ad-spend",
-        filepath: `${config.simulationName || 'adspend'}-ADSPEND`,
+        filepath: `${config.name}-ADSPEND`,
         format: config.format || "csv",
         concurrency: config.concurrency || 1,
         context: this.context
@@ -222,7 +267,7 @@
       mirrorEventData: await createHookArray([], {
         hook: config.hook,
         type: "mirror",
-        filepath: `${config.simulationName || 'mirror'}-MIRROR`,
+        filepath: `${config.name}-MIRROR`,
         format: config.format || "csv",
         concurrency: config.concurrency || 1,
         context: this.context
@@ -235,7 +280,7 @@
       const scdArray = await createHookArray([], {
         hook: config.hook,
         type: "scd",
-        filepath: `${config.simulationName || 'scd'}-${scdKey}-SCD`,
+        filepath: `${config.name}-${scdKey}-SCD`,
         format: config.format || "csv",
         concurrency: config.concurrency || 1,
         context: this.context
@@ -251,7 +296,7 @@
       const groupArray = await createHookArray([], {
         hook: config.hook,
         type: "group",
-        filepath: `${config.simulationName || 'groups'}-${groupKey}-GROUPS`,
+        filepath: `${config.name}-${groupKey}-GROUPS`,
         format: config.format || "csv",
         concurrency: config.concurrency || 1,
         context: this.context
@@ -267,7 +312,7 @@
       const lookupArray = await createHookArray([], {
         hook: config.hook,
         type: "lookup",
-        filepath: `${config.simulationName || 'lookup'}-${lookupConfig.key}-LOOKUP`,
+        filepath: `${config.name}-${lookupConfig.key}-LOOKUP`,
         format: "csv", // Always force CSV for lookup tables
         concurrency: config.concurrency || 1,
         context: this.context
@@ -279,4 +324,24 @@
 
     return storage;
   }
+
+  /**
+   * Validates configuration to prevent data loss scenarios
+   * @param {Object} config - Configuration object
+   */
+  validateConfiguration(config) {
+    // Check for potential data loss scenario: writeToDisk=false with low batchSize
+    if (config.writeToDisk === false) {
+      const batchSize = config.batchSize || 1_000_000;
+      const numEvents = config.numEvents || 0;
+
+      if (batchSize < numEvents) {
+        throw new Error(
+          `Configuration error: writeToDisk is explicitly set to false but batchSize (${batchSize}) is lower than numEvents (${numEvents}). ` +
+          `This would result in data loss as batched data would be discarded. ` +
+          `Either set writeToDisk to true, increase batchSize to be >= numEvents, or provide a Mixpanel token to send data directly.`
+        );
+      }
+    }
+  }
 }
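
Tracing the guard with illustrative values shows when it throws (same defaults as the method above):

    const config = { writeToDisk: false, batchSize: 1_000, numEvents: 50_000 };
    const batchSize = config.batchSize || 1_000_000; // 1_000
    const numEvents = config.numEvents || 0;         // 50_000
    batchSize < numEvents;                           // true → Error is thrown
    // Safe variants: leave writeToDisk unset, set it true, or raise batchSize >= numEvents
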
@@ -72,7 +72,6 @@ export async function makeEvent(
   };
 
   let defaultProps = {};
-  let devicePool = [];
 
   // Add default properties based on configuration
   if (hasLocation) {
@@ -82,32 +81,30 @@ export async function makeEvent(
   if (hasBrowser) {
     defaultProps.browser = u.choose(defaults.browsers());
   }
-
-  // Build device pool based on enabled device types
-  if (hasAndroidDevices) devicePool.push(defaults.androidDevices());
-  if (hasIOSDevices) devicePool.push(defaults.iOSDevices());
-  if (hasDesktopDevices) devicePool.push(defaults.desktopDevices());
 
   // Add campaigns with attribution likelihood
   if (hasCampaigns && chance.bool({ likelihood: 25 })) {
     defaultProps.campaigns = u.pickRandom(defaults.campaigns());
   }
 
-  // Select device from pool
-  const devices = devicePool.flat();
-  if (devices.length) {
-    defaultProps.device = u.pickRandom(devices);
+  // PERFORMANCE: Use pre-computed device pool instead of rebuilding every time
+  if (defaults.allDevices.length) {
+    defaultProps.device = u.pickRandom(defaults.allDevices);
   }
 
   // Set event time using TimeSoup for realistic distribution
   if (earliestTime) {
     if (isFirstEvent) {
-      // Apply time shift to move to present day using precomputed value
-      eventTemplate.time = dayjs.unix(earliestTime).add(context.TIME_SHIFT_SECONDS, 'seconds').toISOString();
+      // PERFORMANCE: Direct numeric calculation instead of dayjs object creation
+      const shiftedTimestamp = earliestTime + context.TIME_SHIFT_SECONDS;
+      eventTemplate.time = dayjs.unix(shiftedTimestamp).toISOString();
     } else {
-      // Get time from TimeSoup and apply precomputed time shift
+      // Get time from TimeSoup (returns ISO string) and apply precomputed time shift
       const soupTime = u.TimeSoup(earliestTime, context.FIXED_NOW, peaks, deviation, mean);
-      eventTemplate.time = dayjs(soupTime).add(context.TIME_SHIFT_SECONDS, 'seconds').toISOString();
+      // PERFORMANCE: Parse ISO directly to milliseconds, add shift, convert back to ISO with one dayjs call
+      const soupTimestamp = new Date(soupTime).getTime() / 1000; // Convert to unix seconds
+      const shiftedTimestamp = soupTimestamp + context.TIME_SHIFT_SECONDS;
+      eventTemplate.time = dayjs.unix(shiftedTimestamp).toISOString();
     }
   }
 
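The numeric path is equivalent to the dayjs .add() chain; it just shifts the raw unix seconds before the single dayjs call instead of allocating an intermediate dayjs object. A worked check (values illustrative):

    import dayjs from 'dayjs';

    const earliestTime = 1_700_000_000;     // unix seconds
    const TIME_SHIFT_SECONDS = 86_400 * 30; // shift forward 30 days

    const viaAdd = dayjs.unix(earliestTime).add(TIME_SHIFT_SECONDS, 'seconds').toISOString();
    const viaShift = dayjs.unix(earliestTime + TIME_SHIFT_SECONDS).toISOString();
    console.assert(viaAdd === viaShift);    // same instant, one fewer dayjs allocation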
 
@@ -133,7 +130,9 @@ export async function makeEvent(
   // PERFORMANCE: Process properties directly without creating intermediate object
   // Add custom properties from event configuration
   if (chosenEvent.properties) {
-    for (const key in chosenEvent.properties) {
+    const eventKeys = Object.keys(chosenEvent.properties);
+    for (let i = 0; i < eventKeys.length; i++) {
+      const key = eventKeys[i];
       try {
         eventTemplate[key] = u.choose(chosenEvent.properties[key]);
       } catch (e) {
@@ -145,7 +144,9 @@ export async function makeEvent(
 
   // Add super properties (override event properties if needed)
   if (superProps) {
-    for (const key in superProps) {
+    const superKeys = Object.keys(superProps);
+    for (let i = 0; i < superKeys.length; i++) {
+      const key = superKeys[i];
       try {
         eventTemplate[key] = u.choose(superProps[key]);
       } catch (e) {
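
Both loops trade for...in for an indexed walk over Object.keys(): for...in also enumerates inherited enumerable properties and produces keys lazily, while Object.keys() snapshots own keys once into a dense array. The pattern in isolation:

    const superProps = { plan: 'pro', source: 'ads' };
    const keys = Object.keys(superProps); // own enumerable keys, computed once
    for (let i = 0; i < keys.length; i++) {
      const key = keys[i];
      // ... use superProps[key]
    }
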
@@ -128,7 +128,13 @@ function buildFunnelEvents(context, sequence, chosenFunnelProps) {
 
   return sequence.map((eventName) => {
     const foundEvent = config.events?.find((e) => e.event === eventName);
-    const eventSpec = u.deepClone(foundEvent) || { event: eventName, properties: {} };
+
+    // PERFORMANCE: Shallow copy instead of deepClone for better performance
+    // We only need to copy the top-level structure since we're rebuilding properties anyway
+    const eventSpec = foundEvent ? {
+      event: foundEvent.event,
+      properties: { ...foundEvent.properties }
+    } : { event: eventName, properties: {} };
 
     // Process event properties
     for (const key in eventSpec.properties) {
@@ -139,11 +145,7 @@ function buildFunnelEvents(context, sequence, chosenFunnelProps) {
       }
     }
 
-    // Clean up funnel-specific properties
-    delete eventSpec.isFirstEvent;
-    delete eventSpec.weight;
-
-    // Merge funnel properties
+    // Merge funnel properties (no need to delete properties since we're creating a new object)
     eventSpec.properties = { ...eventSpec.properties, ...chosenFunnelProps };
 
     return eventSpec;
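
The shallow copy is safe here because, per the comments above, each funnel step only reassigns entries on its own properties object, which is re-spread into a fresh object; the entries in config.events are never mutated. A quick demonstration:

    const foundEvent = { event: 'signup', properties: { plan: ['free', 'pro'] } };
    const eventSpec = {
      event: foundEvent.event,
      properties: { ...foundEvent.properties } // new object, shared values
    };

    eventSpec.properties.plan = 'pro';       // reassign on the copy
    console.log(foundEvent.properties.plan); // ['free', 'pro'] (original untouched)
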
@@ -58,9 +58,10 @@ export async function sendToMixpanel(context) {
     epochEnd: dayjs().unix(),
     dryRun: false,
     abridged: false,
-    fixJson: true,
+    fixJson: false,
     showProgress: NODE_ENV === "dev" ? true : false,
-    streamFormat: mpImportFormat
+    streamFormat: mpImportFormat,
+    workers: 35
   };
 
   if (isCLI) commonOpts.showProgress = true;
@@ -134,6 +135,8 @@ export async function sendToMixpanel(context) {
   const imported = await mp({ token, groupKey }, groupProfilesToImport, {
     recordType: "group",
     ...commonOpts,
+    groupKey,
+    //dryRun: true
   });
   log(`\tsent ${comma(imported.success)} ${groupKey} profiles\n`);
   importResults.groups.push(imported);