make-mp-data 2.0.19 → 2.0.22

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between those versions as they appear in their respective public registries.
@@ -3,7 +3,10 @@
3
3
  * Extracted from index.js to eliminate global dependencies
4
4
  */
5
5
 
6
- /** @typedef {import('../../types').Context} Context */
6
+ /** @typedef {import('../../types.js').Context} Context */
7
+ /** @typedef {import('../../types.js').HookedArray<any>} HookedArray */
8
+ /** @typedef {import('../../types.js').Storage} Storage */
9
+ /** @typedef {import('../../types.js').hookArrayOptions<any>} hookArrayOptions */
7
10
 
8
11
  import { existsSync } from "fs";
9
12
  import pLimit from 'p-limit';
@@ -14,34 +17,39 @@ import * as u from "../utils/utils.js";
14
17
  /**
15
18
  * Creates a hooked array that transforms data on push and handles batching/disk writes
16
19
  * @param {Array} arr - Base array to enhance
17
- * @param {Object} opts - Configuration options
18
- * @param {Function} opts.hook - Transform function applied to each item
19
- * @param {string} opts.type - Type identifier for the hook function
20
- * @param {string} opts.filepath - Base filename for disk writes
21
- * @param {string} opts.format - Output format ('csv' or 'json')
22
- * @param {number} opts.concurrency - Max concurrent file operations
23
- * @param {Context} opts.context - Context object with config, batchSize, etc.
24
- * @returns {Promise<Array>} Enhanced array with hookPush and flush methods
20
+ * @param {hookArrayOptions} opts - Configuration options
21
+ * @returns {Promise<HookedArray>} Enhanced array with hookPush and flush methods
25
22
  */
26
- export async function createHookArray(arr = [], opts = {}) {
23
+ export async function createHookArray(arr = [], opts) {
27
24
  const {
28
25
  hook = a => a,
29
26
  type = "",
30
27
  filepath = "./defaultFile",
31
28
  format = "csv",
32
29
  concurrency = 1,
33
- context = {},
30
+ context = /** @type {Context} */ ({}),
34
31
  ...rest
35
- } = opts;
32
+ } = opts || {};
36
33
 
37
34
  const FILE_CONN = pLimit(concurrency);
38
- const { config = {}, runtime = {} } = context;
35
+ const {
36
+ config = {},
37
+ runtime = {
38
+ operations: 0,
39
+ eventCount: 0,
40
+ userCount: 0,
41
+ isBatchMode: false,
42
+ verbose: false,
43
+ isCLI: false
44
+ }
45
+ } = context;
39
46
  const BATCH_SIZE = config.batchSize || 1_000_000;
40
47
  const NODE_ENV = process.env.NODE_ENV || "unknown";
41
48
 
42
49
  let batch = 0;
43
50
  let writeDir;
44
51
  let isBatchMode = runtime.isBatchMode || false;
52
+ let isWriting = false; // Prevent concurrent writes
45
53
 
46
54
  // Determine write directory
47
55
  const dataFolder = path.resolve("./data");
@@ -77,7 +85,7 @@ export async function createHookArray(arr = [], opts = {}) {
77
85
 
78
86
  // Performance optimization: skip hook overhead for passthrough hooks
79
87
  const isPassthroughHook = hook.toString().includes('return record') || hook.length === 1;
80
-
88
+
81
89
  if (isPassthroughHook) {
82
90
  // Fast path for passthrough hooks - no transformation needed
83
91
  if (Array.isArray(item)) {
@@ -112,13 +120,26 @@ export async function createHookArray(arr = [], opts = {}) {
112
120
  }
113
121
  }
114
122
 
115
- if (arr.length > BATCH_SIZE) {
123
+ // Check batch size and handle writes synchronously to prevent race conditions
124
+ if (arr.length > BATCH_SIZE && !isWriting) {
125
+ isWriting = true; // Lock to prevent concurrent writes
116
126
  isBatchMode = true;
117
127
  runtime.isBatchMode = true; // Update runtime state
118
128
  batch++;
119
129
  const writePath = getWritePath();
120
- const writeResult = await FILE_CONN(() => writeToDisk(arr, { writePath }));
121
- return writeResult;
130
+
131
+ try {
132
+ // Create a copy of the data to write
133
+ const dataToWrite = [...arr];
134
+ // Clear the array immediately to prevent race conditions
135
+ arr.length = 0;
136
+
137
+ // Write to disk (this is now synchronous from the perspective of batch management)
138
+ const writeResult = await FILE_CONN(() => writeToDisk(dataToWrite, { writePath }));
139
+ return writeResult;
140
+ } finally {
141
+ isWriting = false; // Release the lock
142
+ }
122
143
  } else {
123
144
  return Promise.resolve(false);
124
145
  }
@@ -129,7 +150,7 @@ export async function createHookArray(arr = [], opts = {}) {
129
150
  let writeResult;
130
151
 
131
152
  if (config.verbose) {
132
- console.log(`\n\n\twriting ${writePath}\n\n`);
153
+ console.log(`\n\twriting ${writePath}\n`);
133
154
  }
134
155
 
135
156
  switch (format) {
@@ -143,21 +164,33 @@ export async function createHookArray(arr = [], opts = {}) {
143
164
  throw new Error(`format ${format} is not supported`);
144
165
  }
145
166
 
146
- if (isBatchMode) data.length = 0;
167
+ // Array clearing now handled in transformThenPush to ensure proper timing
147
168
  return writeResult;
148
169
  }
149
170
 
150
171
  async function flush() {
151
172
  if (arr.length > 0) {
152
- batch++;
153
- const writePath = getWritePath();
154
- await FILE_CONN(() => writeToDisk(arr, { writePath }));
155
- if (isBatchMode) arr.length = 0; // free up memory for batch mode
173
+ // Wait for any ongoing writes to complete
174
+ while (isWriting) {
175
+ await new Promise(resolve => setTimeout(resolve, 10));
176
+ }
177
+
178
+ isWriting = true;
179
+ try {
180
+ batch++;
181
+ const writePath = getWritePath();
182
+ const dataToWrite = [...arr];
183
+ arr.length = 0; // Clear array after copying data
184
+ await FILE_CONN(() => writeToDisk(dataToWrite, { writePath }));
185
+ } finally {
186
+ isWriting = false;
187
+ }
156
188
  }
157
189
  }
158
190
 
159
191
  // Enhance the array with our methods
160
- const enrichedArray = arr;
192
+ /** @type {HookedArray} */
193
+ const enrichedArray = /** @type {any} */ (arr);
161
194
  enrichedArray.hookPush = transformThenPush;
162
195
  enrichedArray.flush = flush;
163
196
  enrichedArray.getWriteDir = getWriteDir;
@@ -181,11 +214,12 @@ export class StorageManager {
181
214
 
182
215
  /**
183
216
  * Initialize all storage containers for the data generation process
184
- * @returns {import('../../types').Storage} Storage containers object
217
+ * @returns {Promise<Storage>} Storage containers object
185
218
  */
186
219
  async initializeContainers() {
187
220
  const { config } = this.context;
188
221
 
222
+ /** @type {Storage} */
189
223
  const storage = {
190
224
  eventData: await createHookArray([], {
191
225
  hook: config.hook,
@@ -207,7 +241,7 @@ export class StorageManager {
207
241
 
208
242
  adSpendData: await createHookArray([], {
209
243
  hook: config.hook,
210
- type: "adspend",
244
+ type: "ad-spend",
211
245
  filepath: `${config.simulationName || 'adspend'}-ADSPEND`,
212
246
  format: config.format || "csv",
213
247
  concurrency: config.concurrency || 1,
@@ -267,7 +301,7 @@ export class StorageManager {
267
301
  hook: config.hook,
268
302
  type: "lookup",
269
303
  filepath: `${config.simulationName || 'lookup'}-${lookupConfig.key}-LOOKUP`,
270
- format: config.format || "csv",
304
+ format: "csv", // Always force CSV for lookup tables
271
305
  concurrency: config.concurrency || 1,
272
306
  context: this.context
273
307
  });
@@ -102,19 +102,22 @@ export async function makeEvent(
102
102
  // Set event time using TimeSoup for realistic distribution
103
103
  if (earliestTime) {
104
104
  if (isFirstEvent) {
105
- eventTemplate.time = dayjs.unix(earliestTime).toISOString();
105
+ // Apply time shift to move to present day using precomputed value
106
+ eventTemplate.time = dayjs.unix(earliestTime).add(context.TIME_SHIFT_SECONDS, 'seconds').toISOString();
106
107
  } else {
107
- eventTemplate.time = u.TimeSoup(earliestTime, context.FIXED_NOW, peaks, deviation, mean);
108
+ // Get time from TimeSoup and apply precomputed time shift
109
+ const soupTime = u.TimeSoup(earliestTime, context.FIXED_NOW, peaks, deviation, mean);
110
+ eventTemplate.time = dayjs(soupTime).add(context.TIME_SHIFT_SECONDS, 'seconds').toISOString();
108
111
  }
109
112
  }
110
113
 
111
114
  // Add anonymous and session identifiers
112
115
  if (anonymousIds.length) {
113
- eventTemplate.device_id = chance.pickone(anonymousIds);
116
+ eventTemplate.device_id = u.pickRandom(anonymousIds);
114
117
  }
115
118
 
116
119
  if (sessionIds.length) {
117
- eventTemplate.session_id = chance.pickone(sessionIds);
120
+ eventTemplate.session_id = u.pickRandom(sessionIds);
118
121
  }
119
122
 
120
123
  // Sometimes add user_id (for attribution modeling)
@@ -127,16 +130,28 @@ export async function makeEvent(
127
130
  eventTemplate.user_id = distinct_id;
128
131
  }
129
132
 
130
- // Merge custom properties with super properties
131
- const props = Object.assign({}, chosenEvent.properties, superProps);
132
-
133
+ // PERFORMANCE: Process properties directly without creating intermediate object
133
134
  // Add custom properties from event configuration
134
- for (const key in props) {
135
- try {
136
- eventTemplate[key] = u.choose(props[key]);
137
- } catch (e) {
138
- console.error(`error with ${key} in ${chosenEvent.event} event`, e);
139
- // Continue processing other properties
135
+ if (chosenEvent.properties) {
136
+ for (const key in chosenEvent.properties) {
137
+ try {
138
+ eventTemplate[key] = u.choose(chosenEvent.properties[key]);
139
+ } catch (e) {
140
+ console.error(`error with ${key} in ${chosenEvent.event} event`, e);
141
+ // Continue processing other properties
142
+ }
143
+ }
144
+ }
145
+
146
+ // Add super properties (override event properties if needed)
147
+ if (superProps) {
148
+ for (const key in superProps) {
149
+ try {
150
+ eventTemplate[key] = u.choose(superProps[key]);
151
+ } catch (e) {
152
+ console.error(`error with ${key} in super props`, e);
153
+ // Continue processing other properties
154
+ }
140
155
  }
141
156
  }
142
157
 
@@ -153,13 +168,21 @@ export async function makeEvent(
153
168
  const tuple = `${eventTemplate.event}-${eventTemplate.time}-${distinctId}`;
154
169
  eventTemplate.insert_id = u.quickHash(tuple);
155
170
 
156
- // Apply time shift to move events to current timeline
157
- if (earliestTime) {
158
- const timeShift = dayjs().add(2, "day").diff(dayjs.unix(context.FIXED_NOW), "seconds");
159
- const timeShifted = dayjs(eventTemplate.time).add(timeShift, "seconds").toISOString();
160
- eventTemplate.time = timeShifted;
171
+ // Call hook if configured (before returning the event)
172
+ const { hook } = config;
173
+ if (hook) {
174
+ const hookedEvent = await hook(eventTemplate, "event", {
175
+ user: { distinct_id },
176
+ config
177
+ });
178
+ // If hook returns a modified event, use it; otherwise use original
179
+ if (hookedEvent && typeof hookedEvent === 'object') {
180
+ return hookedEvent;
181
+ }
161
182
  }
162
183
 
184
+ // Note: Time shift already applied above during timestamp calculation
185
+
163
186
  return eventTemplate;
164
187
  }
165
188
 
@@ -58,9 +58,10 @@ export async function sendToMixpanel(context) {
58
58
  epochEnd: dayjs().unix(),
59
59
  dryRun: false,
60
60
  abridged: false,
61
- fixJson: true,
61
+ fixJson: false,
62
62
  showProgress: NODE_ENV === "dev" ? true : false,
63
- streamFormat: mpImportFormat
63
+ streamFormat: mpImportFormat,
64
+ workers: 35
64
65
  };
65
66
 
66
67
  if (isCLI) commonOpts.showProgress = true;
@@ -134,6 +135,8 @@ export async function sendToMixpanel(context) {
134
135
  const imported = await mp({ token, groupKey }, groupProfilesToImport, {
135
136
  recordType: "group",
136
137
  ...commonOpts,
138
+ groupKey,
139
+ //dryRun: true
137
140
  });
138
141
  log(`\tsent ${comma(imported.success)} ${groupKey} profiles\n`);
139
142
  importResults.groups.push(imported);