make-mp-data 2.0.21 → 2.0.23
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dungeons/student-teacher.js +38 -87
- package/entry.js +7 -1
- package/index.js +90 -8
- package/lib/cli/cli.js +15 -1
- package/lib/core/config-validator.js +230 -219
- package/lib/core/context.js +13 -1
- package/lib/core/storage.js +88 -23
- package/lib/generators/events.js +17 -16
- package/lib/generators/funnels.js +8 -6
- package/lib/orchestrators/mixpanel-sender.js +5 -2
- package/lib/orchestrators/user-loop.js +212 -181
- package/lib/templates/abbreviated.d.ts +4 -3
- package/lib/templates/instructions.txt +1 -0
- package/lib/templates/{dungeon-template.js → scratch-dungeon-template.js} +9 -3
- package/lib/templates/verbose-schema.js +31 -4
- package/lib/utils/utils.js +178 -14
- package/package.json +5 -4
- package/types.d.ts +9 -4
package/lib/core/storage.js
CHANGED
|
@@ -32,13 +32,24 @@ export async function createHookArray(arr = [], opts) {
|
|
|
32
32
|
} = opts || {};
|
|
33
33
|
|
|
34
34
|
const FILE_CONN = pLimit(concurrency);
|
|
35
|
-
const {
|
|
35
|
+
const {
|
|
36
|
+
config = {},
|
|
37
|
+
runtime = {
|
|
38
|
+
operations: 0,
|
|
39
|
+
eventCount: 0,
|
|
40
|
+
userCount: 0,
|
|
41
|
+
isBatchMode: false,
|
|
42
|
+
verbose: false,
|
|
43
|
+
isCLI: false
|
|
44
|
+
}
|
|
45
|
+
} = context;
|
|
36
46
|
const BATCH_SIZE = config.batchSize || 1_000_000;
|
|
37
47
|
const NODE_ENV = process.env.NODE_ENV || "unknown";
|
|
38
48
|
|
|
39
49
|
let batch = 0;
|
|
40
50
|
let writeDir;
|
|
41
51
|
let isBatchMode = runtime.isBatchMode || false;
|
|
52
|
+
let isWriting = false; // Prevent concurrent writes
|
|
42
53
|
|
|
43
54
|
// Determine write directory
|
|
44
55
|
const dataFolder = path.resolve("./data");
|
|
@@ -54,13 +65,15 @@ export async function createHookArray(arr = [], opts) {
|
|
|
54
65
|
}
|
|
55
66
|
|
|
56
67
|
function getWritePath() {
|
|
68
|
+
const gzipSuffix = (config.gzip && !writeDir?.startsWith('gs://')) ? '.gz' : '';
|
|
69
|
+
|
|
57
70
|
if (isBatchMode) {
|
|
58
|
-
if (writeDir?.startsWith('gs://')) return `${writeDir}/${filepath}-part-${batch.toString()}.${format}`;
|
|
59
|
-
return path.join(writeDir, `${filepath}-part-${batch.toString()}.${format}`);
|
|
71
|
+
if (writeDir?.startsWith('gs://')) return `${writeDir}/${filepath}-part-${batch.toString()}.${format}${gzipSuffix}`;
|
|
72
|
+
return path.join(writeDir, `${filepath}-part-${batch.toString()}.${format}${gzipSuffix}`);
|
|
60
73
|
}
|
|
61
74
|
else {
|
|
62
|
-
if (writeDir?.startsWith('gs://')) return `${writeDir}/${filepath}.${format}`;
|
|
63
|
-
return path.join(writeDir, `${filepath}.${format}`);
|
|
75
|
+
if (writeDir?.startsWith('gs://')) return `${writeDir}/${filepath}.${format}${gzipSuffix}`;
|
|
76
|
+
return path.join(writeDir, `${filepath}.${format}${gzipSuffix}`);
|
|
64
77
|
}
|
|
65
78
|
}
|
|
66
79
|
|
|
@@ -109,15 +122,26 @@ export async function createHookArray(arr = [], opts) {
|
|
|
109
122
|
}
|
|
110
123
|
}
|
|
111
124
|
|
|
112
|
-
|
|
125
|
+
// Check batch size and handle writes synchronously to prevent race conditions
|
|
126
|
+
if (arr.length > BATCH_SIZE && !isWriting) {
|
|
127
|
+
isWriting = true; // Lock to prevent concurrent writes
|
|
113
128
|
isBatchMode = true;
|
|
114
129
|
runtime.isBatchMode = true; // Update runtime state
|
|
115
130
|
batch++;
|
|
116
131
|
const writePath = getWritePath();
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
132
|
+
|
|
133
|
+
try {
|
|
134
|
+
// Create a copy of the data to write
|
|
135
|
+
const dataToWrite = [...arr];
|
|
136
|
+
// Clear the array immediately to prevent race conditions
|
|
137
|
+
arr.length = 0;
|
|
138
|
+
|
|
139
|
+
// Write to disk (this is now synchronous from the perspective of batch management)
|
|
140
|
+
const writeResult = await FILE_CONN(() => writeToDisk(dataToWrite, { writePath }));
|
|
141
|
+
return writeResult;
|
|
142
|
+
} finally {
|
|
143
|
+
isWriting = false; // Release the lock
|
|
144
|
+
}
|
|
121
145
|
} else {
|
|
122
146
|
return Promise.resolve(false);
|
|
123
147
|
}
|
|
@@ -131,12 +155,19 @@ export async function createHookArray(arr = [], opts) {
|
|
|
131
155
|
console.log(`\n\twriting ${writePath}\n`);
|
|
132
156
|
}
|
|
133
157
|
|
|
158
|
+
const streamOptions = {
|
|
159
|
+
gzip: config.gzip || false
|
|
160
|
+
};
|
|
161
|
+
|
|
134
162
|
switch (format) {
|
|
135
163
|
case "csv":
|
|
136
|
-
writeResult = await u.streamCSV(writePath, data);
|
|
164
|
+
writeResult = await u.streamCSV(writePath, data, streamOptions);
|
|
137
165
|
break;
|
|
138
166
|
case "json":
|
|
139
|
-
writeResult = await u.streamJSON(writePath, data);
|
|
167
|
+
writeResult = await u.streamJSON(writePath, data, streamOptions);
|
|
168
|
+
break;
|
|
169
|
+
case "parquet":
|
|
170
|
+
writeResult = await u.streamParquet(writePath, data, streamOptions);
|
|
140
171
|
break;
|
|
141
172
|
default:
|
|
142
173
|
throw new Error(`format ${format} is not supported`);
|
|
@@ -148,10 +179,21 @@ export async function createHookArray(arr = [], opts) {
|
|
|
148
179
|
|
|
149
180
|
async function flush() {
|
|
150
181
|
if (arr.length > 0) {
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
182
|
+
// Wait for any ongoing writes to complete
|
|
183
|
+
while (isWriting) {
|
|
184
|
+
await new Promise(resolve => setTimeout(resolve, 10));
|
|
185
|
+
}
|
|
186
|
+
|
|
187
|
+
isWriting = true;
|
|
188
|
+
try {
|
|
189
|
+
batch++;
|
|
190
|
+
const writePath = getWritePath();
|
|
191
|
+
const dataToWrite = [...arr];
|
|
192
|
+
arr.length = 0; // Clear array after copying data
|
|
193
|
+
await FILE_CONN(() => writeToDisk(dataToWrite, { writePath }));
|
|
194
|
+
} finally {
|
|
195
|
+
isWriting = false;
|
|
196
|
+
}
|
|
155
197
|
}
|
|
156
198
|
}
|
|
157
199
|
|
|
@@ -186,12 +228,15 @@ export class StorageManager {
|
|
|
186
228
|
async initializeContainers() {
|
|
187
229
|
const { config } = this.context;
|
|
188
230
|
|
|
231
|
+
// Validate configuration for potential data loss scenarios
|
|
232
|
+
this.validateConfiguration(config);
|
|
233
|
+
|
|
189
234
|
/** @type {Storage} */
|
|
190
235
|
const storage = {
|
|
191
236
|
eventData: await createHookArray([], {
|
|
192
237
|
hook: config.hook,
|
|
193
238
|
type: "event",
|
|
194
|
-
filepath: `${config.
|
|
239
|
+
filepath: `${config.name}-EVENTS`,
|
|
195
240
|
format: config.format || "csv",
|
|
196
241
|
concurrency: config.concurrency || 1,
|
|
197
242
|
context: this.context
|
|
@@ -200,7 +245,7 @@ export class StorageManager {
|
|
|
200
245
|
userProfilesData: await createHookArray([], {
|
|
201
246
|
hook: config.hook,
|
|
202
247
|
type: "user",
|
|
203
|
-
filepath: `${config.
|
|
248
|
+
filepath: `${config.name}-USERS`,
|
|
204
249
|
format: config.format || "csv",
|
|
205
250
|
concurrency: config.concurrency || 1,
|
|
206
251
|
context: this.context
|
|
@@ -209,7 +254,7 @@ export class StorageManager {
|
|
|
209
254
|
adSpendData: await createHookArray([], {
|
|
210
255
|
hook: config.hook,
|
|
211
256
|
type: "ad-spend",
|
|
212
|
-
filepath: `${config.
|
|
257
|
+
filepath: `${config.name}-ADSPEND`,
|
|
213
258
|
format: config.format || "csv",
|
|
214
259
|
concurrency: config.concurrency || 1,
|
|
215
260
|
context: this.context
|
|
@@ -222,7 +267,7 @@ export class StorageManager {
|
|
|
222
267
|
mirrorEventData: await createHookArray([], {
|
|
223
268
|
hook: config.hook,
|
|
224
269
|
type: "mirror",
|
|
225
|
-
filepath: `${config.
|
|
270
|
+
filepath: `${config.name}-MIRROR`,
|
|
226
271
|
format: config.format || "csv",
|
|
227
272
|
concurrency: config.concurrency || 1,
|
|
228
273
|
context: this.context
|
|
@@ -235,7 +280,7 @@ export class StorageManager {
|
|
|
235
280
|
const scdArray = await createHookArray([], {
|
|
236
281
|
hook: config.hook,
|
|
237
282
|
type: "scd",
|
|
238
|
-
filepath: `${config.
|
|
283
|
+
filepath: `${config.name}-${scdKey}-SCD`,
|
|
239
284
|
format: config.format || "csv",
|
|
240
285
|
concurrency: config.concurrency || 1,
|
|
241
286
|
context: this.context
|
|
@@ -251,7 +296,7 @@ export class StorageManager {
|
|
|
251
296
|
const groupArray = await createHookArray([], {
|
|
252
297
|
hook: config.hook,
|
|
253
298
|
type: "group",
|
|
254
|
-
filepath: `${config.
|
|
299
|
+
filepath: `${config.name}-${groupKey}-GROUPS`,
|
|
255
300
|
format: config.format || "csv",
|
|
256
301
|
concurrency: config.concurrency || 1,
|
|
257
302
|
context: this.context
|
|
@@ -267,7 +312,7 @@ export class StorageManager {
|
|
|
267
312
|
const lookupArray = await createHookArray([], {
|
|
268
313
|
hook: config.hook,
|
|
269
314
|
type: "lookup",
|
|
270
|
-
filepath: `${config.
|
|
315
|
+
filepath: `${config.name}-${lookupConfig.key}-LOOKUP`,
|
|
271
316
|
format: "csv", // Always force CSV for lookup tables
|
|
272
317
|
concurrency: config.concurrency || 1,
|
|
273
318
|
context: this.context
|
|
@@ -279,4 +324,24 @@ export class StorageManager {
|
|
|
279
324
|
|
|
280
325
|
return storage;
|
|
281
326
|
}
|
|
327
|
+
|
|
328
|
+
/**
|
|
329
|
+
* Validates configuration to prevent data loss scenarios
|
|
330
|
+
* @param {Object} config - Configuration object
|
|
331
|
+
*/
|
|
332
|
+
validateConfiguration(config) {
|
|
333
|
+
// Check for potential data loss scenario: writeToDisk=false with low batchSize
|
|
334
|
+
if (config.writeToDisk === false) {
|
|
335
|
+
const batchSize = config.batchSize || 1_000_000;
|
|
336
|
+
const numEvents = config.numEvents || 0;
|
|
337
|
+
|
|
338
|
+
if (batchSize < numEvents) {
|
|
339
|
+
throw new Error(
|
|
340
|
+
`Configuration error: writeToDisk is explicitly set to false but batchSize (${batchSize}) is lower than numEvents (${numEvents}). ` +
|
|
341
|
+
`This would result in data loss as batched data would be discarded. ` +
|
|
342
|
+
`Either set writeToDisk to true, increase batchSize to be >= numEvents, or provide a Mixpanel token to send data directly.`
|
|
343
|
+
);
|
|
344
|
+
}
|
|
345
|
+
}
|
|
346
|
+
}
|
|
282
347
|
}
|
package/lib/generators/events.js
CHANGED
|
@@ -72,7 +72,6 @@ export async function makeEvent(
|
|
|
72
72
|
};
|
|
73
73
|
|
|
74
74
|
let defaultProps = {};
|
|
75
|
-
let devicePool = [];
|
|
76
75
|
|
|
77
76
|
// Add default properties based on configuration
|
|
78
77
|
if (hasLocation) {
|
|
@@ -82,32 +81,30 @@ export async function makeEvent(
|
|
|
82
81
|
if (hasBrowser) {
|
|
83
82
|
defaultProps.browser = u.choose(defaults.browsers());
|
|
84
83
|
}
|
|
85
|
-
|
|
86
|
-
// Build device pool based on enabled device types
|
|
87
|
-
if (hasAndroidDevices) devicePool.push(defaults.androidDevices());
|
|
88
|
-
if (hasIOSDevices) devicePool.push(defaults.iOSDevices());
|
|
89
|
-
if (hasDesktopDevices) devicePool.push(defaults.desktopDevices());
|
|
90
84
|
|
|
91
85
|
// Add campaigns with attribution likelihood
|
|
92
86
|
if (hasCampaigns && chance.bool({ likelihood: 25 })) {
|
|
93
87
|
defaultProps.campaigns = u.pickRandom(defaults.campaigns());
|
|
94
88
|
}
|
|
95
89
|
|
|
96
|
-
//
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
defaultProps.device = u.pickRandom(devices);
|
|
90
|
+
// PERFORMANCE: Use pre-computed device pool instead of rebuilding every time
|
|
91
|
+
if (defaults.allDevices.length) {
|
|
92
|
+
defaultProps.device = u.pickRandom(defaults.allDevices);
|
|
100
93
|
}
|
|
101
94
|
|
|
102
95
|
// Set event time using TimeSoup for realistic distribution
|
|
103
96
|
if (earliestTime) {
|
|
104
97
|
if (isFirstEvent) {
|
|
105
|
-
//
|
|
106
|
-
|
|
98
|
+
// PERFORMANCE: Direct numeric calculation instead of dayjs object creation
|
|
99
|
+
const shiftedTimestamp = earliestTime + context.TIME_SHIFT_SECONDS;
|
|
100
|
+
eventTemplate.time = dayjs.unix(shiftedTimestamp).toISOString();
|
|
107
101
|
} else {
|
|
108
|
-
// Get time from TimeSoup and apply precomputed time shift
|
|
102
|
+
// Get time from TimeSoup (returns ISO string) and apply precomputed time shift
|
|
109
103
|
const soupTime = u.TimeSoup(earliestTime, context.FIXED_NOW, peaks, deviation, mean);
|
|
110
|
-
|
|
104
|
+
// PERFORMANCE: Parse ISO directly to milliseconds, add shift, convert back to ISO with one dayjs call
|
|
105
|
+
const soupTimestamp = new Date(soupTime).getTime() / 1000; // Convert to unix seconds
|
|
106
|
+
const shiftedTimestamp = soupTimestamp + context.TIME_SHIFT_SECONDS;
|
|
107
|
+
eventTemplate.time = dayjs.unix(shiftedTimestamp).toISOString();
|
|
111
108
|
}
|
|
112
109
|
}
|
|
113
110
|
|
|
@@ -133,7 +130,9 @@ export async function makeEvent(
|
|
|
133
130
|
// PERFORMANCE: Process properties directly without creating intermediate object
|
|
134
131
|
// Add custom properties from event configuration
|
|
135
132
|
if (chosenEvent.properties) {
|
|
136
|
-
|
|
133
|
+
const eventKeys = Object.keys(chosenEvent.properties);
|
|
134
|
+
for (let i = 0; i < eventKeys.length; i++) {
|
|
135
|
+
const key = eventKeys[i];
|
|
137
136
|
try {
|
|
138
137
|
eventTemplate[key] = u.choose(chosenEvent.properties[key]);
|
|
139
138
|
} catch (e) {
|
|
@@ -145,7 +144,9 @@ export async function makeEvent(
|
|
|
145
144
|
|
|
146
145
|
// Add super properties (override event properties if needed)
|
|
147
146
|
if (superProps) {
|
|
148
|
-
|
|
147
|
+
const superKeys = Object.keys(superProps);
|
|
148
|
+
for (let i = 0; i < superKeys.length; i++) {
|
|
149
|
+
const key = superKeys[i];
|
|
149
150
|
try {
|
|
150
151
|
eventTemplate[key] = u.choose(superProps[key]);
|
|
151
152
|
} catch (e) {
|
|
@@ -128,7 +128,13 @@ function buildFunnelEvents(context, sequence, chosenFunnelProps) {
|
|
|
128
128
|
|
|
129
129
|
return sequence.map((eventName) => {
|
|
130
130
|
const foundEvent = config.events?.find((e) => e.event === eventName);
|
|
131
|
-
|
|
131
|
+
|
|
132
|
+
// PERFORMANCE: Shallow copy instead of deepClone for better performance
|
|
133
|
+
// We only need to copy the top-level structure since we're rebuilding properties anyway
|
|
134
|
+
const eventSpec = foundEvent ? {
|
|
135
|
+
event: foundEvent.event,
|
|
136
|
+
properties: { ...foundEvent.properties }
|
|
137
|
+
} : { event: eventName, properties: {} };
|
|
132
138
|
|
|
133
139
|
// Process event properties
|
|
134
140
|
for (const key in eventSpec.properties) {
|
|
@@ -139,11 +145,7 @@ function buildFunnelEvents(context, sequence, chosenFunnelProps) {
|
|
|
139
145
|
}
|
|
140
146
|
}
|
|
141
147
|
|
|
142
|
-
//
|
|
143
|
-
delete eventSpec.isFirstEvent;
|
|
144
|
-
delete eventSpec.weight;
|
|
145
|
-
|
|
146
|
-
// Merge funnel properties
|
|
148
|
+
// Merge funnel properties (no need to delete properties since we're creating a new object)
|
|
147
149
|
eventSpec.properties = { ...eventSpec.properties, ...chosenFunnelProps };
|
|
148
150
|
|
|
149
151
|
return eventSpec;
|
|
@@ -58,9 +58,10 @@ export async function sendToMixpanel(context) {
|
|
|
58
58
|
epochEnd: dayjs().unix(),
|
|
59
59
|
dryRun: false,
|
|
60
60
|
abridged: false,
|
|
61
|
-
fixJson:
|
|
61
|
+
fixJson: false,
|
|
62
62
|
showProgress: NODE_ENV === "dev" ? true : false,
|
|
63
|
-
streamFormat: mpImportFormat
|
|
63
|
+
streamFormat: mpImportFormat,
|
|
64
|
+
workers: 35
|
|
64
65
|
};
|
|
65
66
|
|
|
66
67
|
if (isCLI) commonOpts.showProgress = true;
|
|
@@ -134,6 +135,8 @@ export async function sendToMixpanel(context) {
|
|
|
134
135
|
const imported = await mp({ token, groupKey }, groupProfilesToImport, {
|
|
135
136
|
recordType: "group",
|
|
136
137
|
...commonOpts,
|
|
138
|
+
groupKey,
|
|
139
|
+
//dryRun: true
|
|
137
140
|
});
|
|
138
141
|
log(`\tsent ${comma(imported.success)} ${groupKey} profiles\n`);
|
|
139
142
|
importResults.groups.push(imported);
|