make-mp-data 2.0.22 → 2.0.23
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/entry.js +1 -0
- package/index.js +58 -5
- package/lib/cli/cli.js +7 -1
- package/lib/core/config-validator.js +5 -5
- package/lib/core/context.js +13 -1
- package/lib/core/storage.js +45 -13
- package/lib/generators/events.js +17 -16
- package/lib/generators/funnels.js +8 -6
- package/lib/utils/utils.js +131 -4
- package/package.json +2 -1
- package/types.d.ts +3 -3
package/entry.js
CHANGED
package/index.js
CHANGED
|
@@ -194,8 +194,10 @@ async function main(config) {
|
|
|
194
194
|
await generateCharts(context);
|
|
195
195
|
}
|
|
196
196
|
|
|
197
|
-
// Step 11a:
|
|
198
|
-
|
|
197
|
+
// Step 11a: flush lookup tables to disk (always as CSVs)
|
|
198
|
+
if (validatedConfig.writeToDisk) {
|
|
199
|
+
await flushLookupTablesToDisk(storage, validatedConfig);
|
|
200
|
+
}
|
|
199
201
|
|
|
200
202
|
// Step 11b: Flush other storage containers to disk (if writeToDisk enabled)
|
|
201
203
|
if (validatedConfig.writeToDisk) {
|
|
@@ -217,7 +219,7 @@ async function main(config) {
|
|
|
217
219
|
return {
|
|
218
220
|
...extractedData,
|
|
219
221
|
importResults,
|
|
220
|
-
files: extractFileInfo(storage),
|
|
222
|
+
files: await extractFileInfo(storage, validatedConfig),
|
|
221
223
|
time: { start, end, delta, human },
|
|
222
224
|
operations: context.getOperations(),
|
|
223
225
|
eventCount: context.getEventCount(),
|
|
@@ -426,7 +428,7 @@ async function generateCharts(context) {
|
|
|
426
428
|
if (config.makeChart && storage.eventData?.length > 0) {
|
|
427
429
|
const chartPath = typeof config.makeChart === 'string'
|
|
428
430
|
? config.makeChart
|
|
429
|
-
: `./${config.
|
|
431
|
+
: `./${config.name}-timeline`;
|
|
430
432
|
|
|
431
433
|
await generateLineChart(storage.eventData, undefined, chartPath);
|
|
432
434
|
|
|
@@ -502,11 +504,13 @@ async function flushStorageToDisk(storage, config) {
|
|
|
502
504
|
/**
|
|
503
505
|
* Extract file information from storage containers
|
|
504
506
|
* @param {import('./types').Storage} storage - Storage object
|
|
507
|
+
* @param {import('./types').Dungeon} config - Configuration object
|
|
505
508
|
* @returns {string[]} Array of file paths
|
|
506
509
|
*/
|
|
507
|
-
function extractFileInfo(storage) {
|
|
510
|
+
async function extractFileInfo(storage, config) {
|
|
508
511
|
const files = [];
|
|
509
512
|
|
|
513
|
+
// Try to get paths from containers first
|
|
510
514
|
Object.values(storage).forEach(container => {
|
|
511
515
|
if (Array.isArray(container)) {
|
|
512
516
|
container.forEach(subContainer => {
|
|
@@ -519,6 +523,55 @@ function extractFileInfo(storage) {
|
|
|
519
523
|
}
|
|
520
524
|
});
|
|
521
525
|
|
|
526
|
+
// If no files found from containers and writeToDisk is enabled, scan the data directory
|
|
527
|
+
if (files.length === 0 && config.writeToDisk) {
|
|
528
|
+
try {
|
|
529
|
+
const fs = await import('fs');
|
|
530
|
+
const path = await import('path');
|
|
531
|
+
|
|
532
|
+
let dataDir = path.resolve("./data");
|
|
533
|
+
if (!fs.existsSync(dataDir)) {
|
|
534
|
+
dataDir = path.resolve("./");
|
|
535
|
+
}
|
|
536
|
+
|
|
537
|
+
if (fs.existsSync(dataDir)) {
|
|
538
|
+
const allFiles = fs.readdirSync(dataDir);
|
|
539
|
+
const simulationName = config.name;
|
|
540
|
+
|
|
541
|
+
// Filter files that match our patterns and were likely created by this run
|
|
542
|
+
const relevantFiles = allFiles.filter(file => {
|
|
543
|
+
// Skip system files
|
|
544
|
+
if (file.startsWith('.')) return false;
|
|
545
|
+
|
|
546
|
+
// If we have a simulation name, only include files with that prefix
|
|
547
|
+
if (simulationName && !file.startsWith(simulationName)) {
|
|
548
|
+
return false;
|
|
549
|
+
}
|
|
550
|
+
|
|
551
|
+
// Check for common patterns
|
|
552
|
+
const hasEventPattern = file.includes('-EVENTS.');
|
|
553
|
+
const hasUserPattern = file.includes('-USERS.');
|
|
554
|
+
const hasScdPattern = file.includes('-SCD.');
|
|
555
|
+
const hasGroupPattern = file.includes('-GROUPS.');
|
|
556
|
+
const hasLookupPattern = file.includes('-LOOKUP.');
|
|
557
|
+
const hasAdspendPattern = file.includes('-ADSPEND.');
|
|
558
|
+
const hasMirrorPattern = file.includes('-MIRROR.');
|
|
559
|
+
|
|
560
|
+
return hasEventPattern || hasUserPattern || hasScdPattern ||
|
|
561
|
+
hasGroupPattern || hasLookupPattern || hasAdspendPattern || hasMirrorPattern;
|
|
562
|
+
});
|
|
563
|
+
|
|
564
|
+
// Convert to full paths
|
|
565
|
+
relevantFiles.forEach(file => {
|
|
566
|
+
files.push(path.join(dataDir, file));
|
|
567
|
+
});
|
|
568
|
+
}
|
|
569
|
+
} catch (error) {
|
|
570
|
+
// If scanning fails, just return empty array
|
|
571
|
+
console.warn('Warning: Could not scan data directory for files:', error.message);
|
|
572
|
+
}
|
|
573
|
+
}
|
|
574
|
+
|
|
522
575
|
return files;
|
|
523
576
|
}
|
|
524
577
|
|
package/lib/cli/cli.js
CHANGED
|
@@ -223,6 +223,12 @@ DATA MODEL: https://github.com/ak--47/make-mp-data/blob/main/default.js
|
|
|
223
223
|
type: 'boolean',
|
|
224
224
|
coerce: boolCoerce
|
|
225
225
|
})
|
|
226
|
+
.option("name", {
|
|
227
|
+
alias: 'n',
|
|
228
|
+
demandOption: false,
|
|
229
|
+
describe: 'custom name for generated files (prefix)',
|
|
230
|
+
type: 'string'
|
|
231
|
+
})
|
|
226
232
|
|
|
227
233
|
.help()
|
|
228
234
|
.wrap(null)
|
|
@@ -236,7 +242,7 @@ DATA MODEL: https://github.com/ak--47/make-mp-data/blob/main/default.js
|
|
|
236
242
|
}
|
|
237
243
|
|
|
238
244
|
|
|
239
|
-
function boolCoerce(value
|
|
245
|
+
function boolCoerce(value) {
|
|
240
246
|
if (typeof value === 'boolean') return value;
|
|
241
247
|
if (typeof value === 'string') {
|
|
242
248
|
return value.toLowerCase() === 'true';
|
|
package/lib/core/config-validator.js
CHANGED
|
@@ -136,9 +136,10 @@ export function validateDungeonConfig(config) {
|
|
|
136
136
|
throw new Error("Either epochStart or numDays must be provided");
|
|
137
137
|
}
|
|
138
138
|
|
|
139
|
-
//
|
|
140
|
-
|
|
141
|
-
|
|
139
|
+
// Use provided name if non-empty string, otherwise generate one
|
|
140
|
+
if (!name || name === "") {
|
|
141
|
+
name = makeName();
|
|
142
|
+
}
|
|
142
143
|
|
|
143
144
|
// Validate events
|
|
144
145
|
if (!events || !events.length) events = [{ event: "foo" }, { event: "bar" }, { event: "baz" }];
|
|
@@ -242,8 +243,7 @@ export function validateDungeonConfig(config) {
|
|
|
242
243
|
hasAndroidDevices,
|
|
243
244
|
hasDesktopDevices,
|
|
244
245
|
hasIOSDevices,
|
|
245
|
-
|
|
246
|
-
name: config.name
|
|
246
|
+
name
|
|
247
247
|
};
|
|
248
248
|
|
|
249
249
|
return validatedConfig;
|
package/lib/core/context.js
CHANGED
|
@@ -40,6 +40,14 @@ function createDefaults(config, campaignData) {
|
|
|
40
40
|
const weighedBrowsers = u.weighArray(devices.browsers);
|
|
41
41
|
const weighedCampaigns = u.weighArray(campaignData);
|
|
42
42
|
|
|
43
|
+
// PERFORMANCE: Pre-compute device pools based on config to avoid rebuilding in makeEvent
|
|
44
|
+
const devicePools = {
|
|
45
|
+
android: config.hasAndroidDevices ? weighedAndroidDevices : [],
|
|
46
|
+
ios: config.hasIOSDevices ? weighedIOSDevices : [],
|
|
47
|
+
desktop: config.hasDesktopDevices ? weighedDesktopDevices : []
|
|
48
|
+
};
|
|
49
|
+
const allDevices = [...devicePools.android, ...devicePools.ios, ...devicePools.desktop];
|
|
50
|
+
|
|
43
51
|
return {
|
|
44
52
|
locationsUsers: () => weighedLocationsUsers,
|
|
45
53
|
locationsEvents: () => weighedLocationsEvents,
|
|
@@ -47,7 +55,11 @@ function createDefaults(config, campaignData) {
|
|
|
47
55
|
androidDevices: () => weighedAndroidDevices,
|
|
48
56
|
desktopDevices: () => weighedDesktopDevices,
|
|
49
57
|
browsers: () => weighedBrowsers,
|
|
50
|
-
campaigns: () => weighedCampaigns
|
|
58
|
+
campaigns: () => weighedCampaigns,
|
|
59
|
+
|
|
60
|
+
// PERFORMANCE: Pre-computed device pools
|
|
61
|
+
devicePools,
|
|
62
|
+
allDevices
|
|
51
63
|
};
|
|
52
64
|
}
|
|
53
65
|
|
package/lib/core/storage.js
CHANGED
|
@@ -65,13 +65,15 @@ export async function createHookArray(arr = [], opts) {
|
|
|
65
65
|
}
|
|
66
66
|
|
|
67
67
|
function getWritePath() {
|
|
68
|
+
const gzipSuffix = (config.gzip && !writeDir?.startsWith('gs://')) ? '.gz' : '';
|
|
69
|
+
|
|
68
70
|
if (isBatchMode) {
|
|
69
|
-
if (writeDir?.startsWith('gs://')) return `${writeDir}/${filepath}-part-${batch.toString()}.${format}`;
|
|
70
|
-
return path.join(writeDir, `${filepath}-part-${batch.toString()}.${format}`);
|
|
71
|
+
if (writeDir?.startsWith('gs://')) return `${writeDir}/${filepath}-part-${batch.toString()}.${format}${gzipSuffix}`;
|
|
72
|
+
return path.join(writeDir, `${filepath}-part-${batch.toString()}.${format}${gzipSuffix}`);
|
|
71
73
|
}
|
|
72
74
|
else {
|
|
73
|
-
if (writeDir?.startsWith('gs://')) return `${writeDir}/${filepath}.${format}`;
|
|
74
|
-
return path.join(writeDir, `${filepath}.${format}`);
|
|
75
|
+
if (writeDir?.startsWith('gs://')) return `${writeDir}/${filepath}.${format}${gzipSuffix}`;
|
|
76
|
+
return path.join(writeDir, `${filepath}.${format}${gzipSuffix}`);
|
|
75
77
|
}
|
|
76
78
|
}
|
|
77
79
|
|
|
@@ -153,12 +155,19 @@ export async function createHookArray(arr = [], opts) {
|
|
|
153
155
|
console.log(`\n\twriting ${writePath}\n`);
|
|
154
156
|
}
|
|
155
157
|
|
|
158
|
+
const streamOptions = {
|
|
159
|
+
gzip: config.gzip || false
|
|
160
|
+
};
|
|
161
|
+
|
|
156
162
|
switch (format) {
|
|
157
163
|
case "csv":
|
|
158
|
-
writeResult = await u.streamCSV(writePath, data);
|
|
164
|
+
writeResult = await u.streamCSV(writePath, data, streamOptions);
|
|
159
165
|
break;
|
|
160
166
|
case "json":
|
|
161
|
-
writeResult = await u.streamJSON(writePath, data);
|
|
167
|
+
writeResult = await u.streamJSON(writePath, data, streamOptions);
|
|
168
|
+
break;
|
|
169
|
+
case "parquet":
|
|
170
|
+
writeResult = await u.streamParquet(writePath, data, streamOptions);
|
|
162
171
|
break;
|
|
163
172
|
default:
|
|
164
173
|
throw new Error(`format ${format} is not supported`);
|
|
@@ -219,12 +228,15 @@ export class StorageManager {
|
|
|
219
228
|
async initializeContainers() {
|
|
220
229
|
const { config } = this.context;
|
|
221
230
|
|
|
231
|
+
// Validate configuration for potential data loss scenarios
|
|
232
|
+
this.validateConfiguration(config);
|
|
233
|
+
|
|
222
234
|
/** @type {Storage} */
|
|
223
235
|
const storage = {
|
|
224
236
|
eventData: await createHookArray([], {
|
|
225
237
|
hook: config.hook,
|
|
226
238
|
type: "event",
|
|
227
|
-
filepath: `${config.
|
|
239
|
+
filepath: `${config.name}-EVENTS`,
|
|
228
240
|
format: config.format || "csv",
|
|
229
241
|
concurrency: config.concurrency || 1,
|
|
230
242
|
context: this.context
|
|
@@ -233,7 +245,7 @@ export class StorageManager {
|
|
|
233
245
|
userProfilesData: await createHookArray([], {
|
|
234
246
|
hook: config.hook,
|
|
235
247
|
type: "user",
|
|
236
|
-
filepath: `${config.
|
|
248
|
+
filepath: `${config.name}-USERS`,
|
|
237
249
|
format: config.format || "csv",
|
|
238
250
|
concurrency: config.concurrency || 1,
|
|
239
251
|
context: this.context
|
|
@@ -242,7 +254,7 @@ export class StorageManager {
|
|
|
242
254
|
adSpendData: await createHookArray([], {
|
|
243
255
|
hook: config.hook,
|
|
244
256
|
type: "ad-spend",
|
|
245
|
-
filepath: `${config.
|
|
257
|
+
filepath: `${config.name}-ADSPEND`,
|
|
246
258
|
format: config.format || "csv",
|
|
247
259
|
concurrency: config.concurrency || 1,
|
|
248
260
|
context: this.context
|
|
@@ -255,7 +267,7 @@ export class StorageManager {
|
|
|
255
267
|
mirrorEventData: await createHookArray([], {
|
|
256
268
|
hook: config.hook,
|
|
257
269
|
type: "mirror",
|
|
258
|
-
filepath: `${config.
|
|
270
|
+
filepath: `${config.name}-MIRROR`,
|
|
259
271
|
format: config.format || "csv",
|
|
260
272
|
concurrency: config.concurrency || 1,
|
|
261
273
|
context: this.context
|
|
@@ -268,7 +280,7 @@ export class StorageManager {
|
|
|
268
280
|
const scdArray = await createHookArray([], {
|
|
269
281
|
hook: config.hook,
|
|
270
282
|
type: "scd",
|
|
271
|
-
filepath: `${config.
|
|
283
|
+
filepath: `${config.name}-${scdKey}-SCD`,
|
|
272
284
|
format: config.format || "csv",
|
|
273
285
|
concurrency: config.concurrency || 1,
|
|
274
286
|
context: this.context
|
|
@@ -284,7 +296,7 @@ export class StorageManager {
|
|
|
284
296
|
const groupArray = await createHookArray([], {
|
|
285
297
|
hook: config.hook,
|
|
286
298
|
type: "group",
|
|
287
|
-
filepath: `${config.
|
|
299
|
+
filepath: `${config.name}-${groupKey}-GROUPS`,
|
|
288
300
|
format: config.format || "csv",
|
|
289
301
|
concurrency: config.concurrency || 1,
|
|
290
302
|
context: this.context
|
|
@@ -300,7 +312,7 @@ export class StorageManager {
|
|
|
300
312
|
const lookupArray = await createHookArray([], {
|
|
301
313
|
hook: config.hook,
|
|
302
314
|
type: "lookup",
|
|
303
|
-
filepath: `${config.
|
|
315
|
+
filepath: `${config.name}-${lookupConfig.key}-LOOKUP`,
|
|
304
316
|
format: "csv", // Always force CSV for lookup tables
|
|
305
317
|
concurrency: config.concurrency || 1,
|
|
306
318
|
context: this.context
|
|
@@ -312,4 +324,24 @@ export class StorageManager {
|
|
|
312
324
|
|
|
313
325
|
return storage;
|
|
314
326
|
}
|
|
327
|
+
|
|
328
|
+
/**
|
|
329
|
+
* Validates configuration to prevent data loss scenarios
|
|
330
|
+
* @param {Object} config - Configuration object
|
|
331
|
+
*/
|
|
332
|
+
validateConfiguration(config) {
|
|
333
|
+
// Check for potential data loss scenario: writeToDisk=false with low batchSize
|
|
334
|
+
if (config.writeToDisk === false) {
|
|
335
|
+
const batchSize = config.batchSize || 1_000_000;
|
|
336
|
+
const numEvents = config.numEvents || 0;
|
|
337
|
+
|
|
338
|
+
if (batchSize < numEvents) {
|
|
339
|
+
throw new Error(
|
|
340
|
+
`Configuration error: writeToDisk is explicitly set to false but batchSize (${batchSize}) is lower than numEvents (${numEvents}). ` +
|
|
341
|
+
`This would result in data loss as batched data would be discarded. ` +
|
|
342
|
+
`Either set writeToDisk to true, increase batchSize to be >= numEvents, or provide a Mixpanel token to send data directly.`
|
|
343
|
+
);
|
|
344
|
+
}
|
|
345
|
+
}
|
|
346
|
+
}
|
|
315
347
|
}
|
package/lib/generators/events.js
CHANGED
|
@@ -72,7 +72,6 @@ export async function makeEvent(
|
|
|
72
72
|
};
|
|
73
73
|
|
|
74
74
|
let defaultProps = {};
|
|
75
|
-
let devicePool = [];
|
|
76
75
|
|
|
77
76
|
// Add default properties based on configuration
|
|
78
77
|
if (hasLocation) {
|
|
@@ -82,32 +81,30 @@ export async function makeEvent(
|
|
|
82
81
|
if (hasBrowser) {
|
|
83
82
|
defaultProps.browser = u.choose(defaults.browsers());
|
|
84
83
|
}
|
|
85
|
-
|
|
86
|
-
// Build device pool based on enabled device types
|
|
87
|
-
if (hasAndroidDevices) devicePool.push(defaults.androidDevices());
|
|
88
|
-
if (hasIOSDevices) devicePool.push(defaults.iOSDevices());
|
|
89
|
-
if (hasDesktopDevices) devicePool.push(defaults.desktopDevices());
|
|
90
84
|
|
|
91
85
|
// Add campaigns with attribution likelihood
|
|
92
86
|
if (hasCampaigns && chance.bool({ likelihood: 25 })) {
|
|
93
87
|
defaultProps.campaigns = u.pickRandom(defaults.campaigns());
|
|
94
88
|
}
|
|
95
89
|
|
|
96
|
-
//
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
defaultProps.device = u.pickRandom(devices);
|
|
90
|
+
// PERFORMANCE: Use pre-computed device pool instead of rebuilding every time
|
|
91
|
+
if (defaults.allDevices.length) {
|
|
92
|
+
defaultProps.device = u.pickRandom(defaults.allDevices);
|
|
100
93
|
}
|
|
101
94
|
|
|
102
95
|
// Set event time using TimeSoup for realistic distribution
|
|
103
96
|
if (earliestTime) {
|
|
104
97
|
if (isFirstEvent) {
|
|
105
|
-
//
|
|
106
|
-
|
|
98
|
+
// PERFORMANCE: Direct numeric calculation instead of dayjs object creation
|
|
99
|
+
const shiftedTimestamp = earliestTime + context.TIME_SHIFT_SECONDS;
|
|
100
|
+
eventTemplate.time = dayjs.unix(shiftedTimestamp).toISOString();
|
|
107
101
|
} else {
|
|
108
|
-
// Get time from TimeSoup and apply precomputed time shift
|
|
102
|
+
// Get time from TimeSoup (returns ISO string) and apply precomputed time shift
|
|
109
103
|
const soupTime = u.TimeSoup(earliestTime, context.FIXED_NOW, peaks, deviation, mean);
|
|
110
|
-
|
|
104
|
+
// PERFORMANCE: Parse ISO directly to milliseconds, add shift, convert back to ISO with one dayjs call
|
|
105
|
+
const soupTimestamp = new Date(soupTime).getTime() / 1000; // Convert to unix seconds
|
|
106
|
+
const shiftedTimestamp = soupTimestamp + context.TIME_SHIFT_SECONDS;
|
|
107
|
+
eventTemplate.time = dayjs.unix(shiftedTimestamp).toISOString();
|
|
111
108
|
}
|
|
112
109
|
}
|
|
113
110
|
|
|
@@ -133,7 +130,9 @@ export async function makeEvent(
|
|
|
133
130
|
// PERFORMANCE: Process properties directly without creating intermediate object
|
|
134
131
|
// Add custom properties from event configuration
|
|
135
132
|
if (chosenEvent.properties) {
|
|
136
|
-
|
|
133
|
+
const eventKeys = Object.keys(chosenEvent.properties);
|
|
134
|
+
for (let i = 0; i < eventKeys.length; i++) {
|
|
135
|
+
const key = eventKeys[i];
|
|
137
136
|
try {
|
|
138
137
|
eventTemplate[key] = u.choose(chosenEvent.properties[key]);
|
|
139
138
|
} catch (e) {
|
|
@@ -145,7 +144,9 @@ export async function makeEvent(
|
|
|
145
144
|
|
|
146
145
|
// Add super properties (override event properties if needed)
|
|
147
146
|
if (superProps) {
|
|
148
|
-
|
|
147
|
+
const superKeys = Object.keys(superProps);
|
|
148
|
+
for (let i = 0; i < superKeys.length; i++) {
|
|
149
|
+
const key = superKeys[i];
|
|
149
150
|
try {
|
|
150
151
|
eventTemplate[key] = u.choose(superProps[key]);
|
|
151
152
|
} catch (e) {
|
|
package/lib/generators/funnels.js
CHANGED
|
@@ -128,7 +128,13 @@ function buildFunnelEvents(context, sequence, chosenFunnelProps) {
|
|
|
128
128
|
|
|
129
129
|
return sequence.map((eventName) => {
|
|
130
130
|
const foundEvent = config.events?.find((e) => e.event === eventName);
|
|
131
|
-
|
|
131
|
+
|
|
132
|
+
// PERFORMANCE: Shallow copy instead of deepClone for better performance
|
|
133
|
+
// We only need to copy the top-level structure since we're rebuilding properties anyway
|
|
134
|
+
const eventSpec = foundEvent ? {
|
|
135
|
+
event: foundEvent.event,
|
|
136
|
+
properties: { ...foundEvent.properties }
|
|
137
|
+
} : { event: eventName, properties: {} };
|
|
132
138
|
|
|
133
139
|
// Process event properties
|
|
134
140
|
for (const key in eventSpec.properties) {
|
|
@@ -139,11 +145,7 @@ function buildFunnelEvents(context, sequence, chosenFunnelProps) {
|
|
|
139
145
|
}
|
|
140
146
|
}
|
|
141
147
|
|
|
142
|
-
//
|
|
143
|
-
delete eventSpec.isFirstEvent;
|
|
144
|
-
delete eventSpec.weight;
|
|
145
|
-
|
|
146
|
-
// Merge funnel properties
|
|
148
|
+
// Merge funnel properties (no need to delete properties since we're creating a new object)
|
|
147
149
|
eventSpec.properties = { ...eventSpec.properties, ...chosenFunnelProps };
|
|
148
150
|
|
|
149
151
|
return eventSpec;
|
package/lib/utils/utils.js
CHANGED
|
@@ -8,6 +8,7 @@ import utc from 'dayjs/plugin/utc.js';
|
|
|
8
8
|
import path from 'path';
|
|
9
9
|
import { mkdir, parseGCSUri } from 'ak-tools';
|
|
10
10
|
import { existsSync } from 'fs';
|
|
11
|
+
import zlib from 'zlib';
|
|
11
12
|
dayjs.extend(utc);
|
|
12
13
|
import 'dotenv/config';
|
|
13
14
|
import { domainSuffix, domainPrefix } from '../templates/defaults.js';
|
|
@@ -483,15 +484,22 @@ STREAMERS
|
|
|
483
484
|
----
|
|
484
485
|
*/
|
|
485
486
|
|
|
486
|
-
function streamJSON(filePath, data) {
|
|
487
|
+
function streamJSON(filePath, data, options = {}) {
|
|
487
488
|
return new Promise((resolve, reject) => {
|
|
488
489
|
let writeStream;
|
|
490
|
+
const { gzip = false } = options;
|
|
491
|
+
|
|
489
492
|
if (filePath?.startsWith('gs://')) {
|
|
490
493
|
const { uri, bucket, file } = parseGCSUri(filePath);
|
|
491
494
|
writeStream = storage.bucket(bucket).file(file).createWriteStream({ gzip: true });
|
|
492
495
|
}
|
|
493
496
|
else {
|
|
494
497
|
writeStream = fs.createWriteStream(filePath, { encoding: 'utf8' });
|
|
498
|
+
if (gzip) {
|
|
499
|
+
const gzipStream = zlib.createGzip();
|
|
500
|
+
gzipStream.pipe(writeStream);
|
|
501
|
+
writeStream = gzipStream;
|
|
502
|
+
}
|
|
495
503
|
}
|
|
496
504
|
data.forEach(item => {
|
|
497
505
|
writeStream.write(JSON.stringify(item) + '\n');
|
|
@@ -504,15 +512,22 @@ function streamJSON(filePath, data) {
|
|
|
504
512
|
});
|
|
505
513
|
}
|
|
506
514
|
|
|
507
|
-
function streamCSV(filePath, data) {
|
|
515
|
+
function streamCSV(filePath, data, options = {}) {
|
|
508
516
|
return new Promise((resolve, reject) => {
|
|
509
517
|
let writeStream;
|
|
518
|
+
const { gzip = false } = options;
|
|
519
|
+
|
|
510
520
|
if (filePath?.startsWith('gs://')) {
|
|
511
521
|
const { uri, bucket, file } = parseGCSUri(filePath);
|
|
512
522
|
writeStream = storage.bucket(bucket).file(file).createWriteStream({ gzip: true });
|
|
513
523
|
}
|
|
514
524
|
else {
|
|
515
525
|
writeStream = fs.createWriteStream(filePath, { encoding: 'utf8' });
|
|
526
|
+
if (gzip) {
|
|
527
|
+
const gzipStream = zlib.createGzip();
|
|
528
|
+
gzipStream.pipe(writeStream);
|
|
529
|
+
writeStream = gzipStream;
|
|
530
|
+
}
|
|
516
531
|
}
|
|
517
532
|
|
|
518
533
|
// Extract all unique keys from the data array
|
|
@@ -539,6 +554,117 @@ function streamCSV(filePath, data) {
|
|
|
539
554
|
});
|
|
540
555
|
}
|
|
541
556
|
|
|
557
|
+
async function streamParquet(filePath, data, options = {}) {
|
|
558
|
+
const { gzip = false } = options;
|
|
559
|
+
|
|
560
|
+
// Dynamically import hyparquet-writer
|
|
561
|
+
const { parquetWriteFile, parquetWriteBuffer } = await import('hyparquet-writer');
|
|
562
|
+
|
|
563
|
+
if (data.length === 0) {
|
|
564
|
+
throw new Error('Cannot write parquet file with empty data');
|
|
565
|
+
}
|
|
566
|
+
|
|
567
|
+
// Extract column names and data from the input array
|
|
568
|
+
const columns = getUniqueKeys(data);
|
|
569
|
+
const columnData = columns.map(columnName => {
|
|
570
|
+
const columnValues = data.map(row => {
|
|
571
|
+
let value = row[columnName];
|
|
572
|
+
|
|
573
|
+
// Handle null/undefined values
|
|
574
|
+
if (value === null || value === undefined) {
|
|
575
|
+
return null;
|
|
576
|
+
}
|
|
577
|
+
|
|
578
|
+
// Convert objects to strings
|
|
579
|
+
if (typeof value === 'object') {
|
|
580
|
+
value = JSON.stringify(value);
|
|
581
|
+
}
|
|
582
|
+
|
|
583
|
+
return value;
|
|
584
|
+
});
|
|
585
|
+
|
|
586
|
+
// Determine the type based on the first non-null value
|
|
587
|
+
let type = 'STRING'; // default
|
|
588
|
+
const firstValue = columnValues.find(v => v !== null && v !== undefined);
|
|
589
|
+
|
|
590
|
+
if (firstValue !== undefined) {
|
|
591
|
+
if (typeof firstValue === 'boolean') {
|
|
592
|
+
type = 'BOOLEAN';
|
|
593
|
+
} else if (typeof firstValue === 'number') {
|
|
594
|
+
// For parquet compatibility, convert numbers to appropriate types
|
|
595
|
+
if (Number.isInteger(firstValue)) {
|
|
596
|
+
// Use INT32 for smaller integers, convert to BigInt for INT64 if needed
|
|
597
|
+
if (firstValue >= -2147483648 && firstValue <= 2147483647) {
|
|
598
|
+
type = 'INT32';
|
|
599
|
+
} else {
|
|
600
|
+
type = 'INT64';
|
|
601
|
+
// Convert all values to BigInt for INT64
|
|
602
|
+
for (let i = 0; i < columnValues.length; i++) {
|
|
603
|
+
if (columnValues[i] !== null && columnValues[i] !== undefined) {
|
|
604
|
+
columnValues[i] = BigInt(columnValues[i]);
|
|
605
|
+
}
|
|
606
|
+
}
|
|
607
|
+
}
|
|
608
|
+
} else {
|
|
609
|
+
type = 'DOUBLE';
|
|
610
|
+
}
|
|
611
|
+
} else if (firstValue instanceof Date) {
|
|
612
|
+
type = 'TIMESTAMP';
|
|
613
|
+
}
|
|
614
|
+
}
|
|
615
|
+
|
|
616
|
+
return {
|
|
617
|
+
name: columnName,
|
|
618
|
+
data: columnValues,
|
|
619
|
+
type: type
|
|
620
|
+
};
|
|
621
|
+
});
|
|
622
|
+
|
|
623
|
+
if (filePath?.startsWith('gs://')) {
|
|
624
|
+
// For GCS, write to buffer first, then upload
|
|
625
|
+
const arrayBuffer = parquetWriteBuffer({ columnData });
|
|
626
|
+
const { bucket, file } = parseGCSUri(filePath);
|
|
627
|
+
|
|
628
|
+
const writeStream = storage.bucket(bucket).file(file).createWriteStream({
|
|
629
|
+
gzip: gzip || true // Always gzip for GCS
|
|
630
|
+
});
|
|
631
|
+
|
|
632
|
+
return new Promise((resolve, reject) => {
|
|
633
|
+
writeStream.write(Buffer.from(arrayBuffer));
|
|
634
|
+
writeStream.end();
|
|
635
|
+
writeStream.on('finish', () => resolve(filePath));
|
|
636
|
+
writeStream.on('error', reject);
|
|
637
|
+
});
|
|
638
|
+
} else {
|
|
639
|
+
// For local files
|
|
640
|
+
let actualFilePath = filePath;
|
|
641
|
+
if (gzip && !filePath.endsWith('.gz')) {
|
|
642
|
+
actualFilePath = filePath + '.gz';
|
|
643
|
+
}
|
|
644
|
+
|
|
645
|
+
if (gzip) {
|
|
646
|
+
// Write to buffer then gzip to disk
|
|
647
|
+
const arrayBuffer = parquetWriteBuffer({ columnData });
|
|
648
|
+
const buffer = Buffer.from(arrayBuffer);
|
|
649
|
+
const gzippedBuffer = zlib.gzipSync(buffer);
|
|
650
|
+
|
|
651
|
+
return new Promise((resolve, reject) => {
|
|
652
|
+
fs.writeFile(actualFilePath, gzippedBuffer, (err) => {
|
|
653
|
+
if (err) reject(err);
|
|
654
|
+
else resolve(actualFilePath);
|
|
655
|
+
});
|
|
656
|
+
});
|
|
657
|
+
} else {
|
|
658
|
+
// Direct write to disk
|
|
659
|
+
parquetWriteFile({
|
|
660
|
+
filename: filePath,
|
|
661
|
+
columnData
|
|
662
|
+
});
|
|
663
|
+
return Promise.resolve(filePath);
|
|
664
|
+
}
|
|
665
|
+
}
|
|
666
|
+
}
|
|
667
|
+
|
|
542
668
|
|
|
543
669
|
/*
|
|
544
670
|
----
|
|
@@ -870,7 +996,7 @@ function buildFileNames(config) {
|
|
|
870
996
|
let extension = "";
|
|
871
997
|
extension = format === "csv" ? "csv" : "json";
|
|
872
998
|
// const current = dayjs.utc().format("MM-DD-HH");
|
|
873
|
-
let simName = config.
|
|
999
|
+
let simName = config.name;
|
|
874
1000
|
let writeDir = typeof config.writeToDisk === 'string' ? config.writeToDisk : "./";
|
|
875
1001
|
if (config.writeToDisk) {
|
|
876
1002
|
const dataFolder = path.resolve("./data");
|
|
@@ -1328,7 +1454,7 @@ export {
|
|
|
1328
1454
|
TimeSoup,
|
|
1329
1455
|
companyName,
|
|
1330
1456
|
generateEmoji,
|
|
1331
|
-
hasSameKeys
|
|
1457
|
+
hasSameKeys,
|
|
1332
1458
|
deepClone,
|
|
1333
1459
|
initChance,
|
|
1334
1460
|
getChance,
|
|
@@ -1362,6 +1488,7 @@ export {
|
|
|
1362
1488
|
buildFileNames,
|
|
1363
1489
|
streamJSON,
|
|
1364
1490
|
streamCSV,
|
|
1491
|
+
streamParquet,
|
|
1365
1492
|
datesBetween,
|
|
1366
1493
|
weighChoices,
|
|
1367
1494
|
wrapFunc,
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "make-mp-data",
|
|
3
|
-
"version": "2.0.22",
|
|
3
|
+
"version": "2.0.23",
|
|
4
4
|
"description": "builds all mixpanel primitives for a given project",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "index.js",
|
|
@@ -70,6 +70,7 @@
|
|
|
70
70
|
"dayjs": "^1.11.11",
|
|
71
71
|
"dotenv": "^16.4.5",
|
|
72
72
|
"google-auth-library": "^9.15.0",
|
|
73
|
+
"hyparquet-writer": "^0.6.1",
|
|
73
74
|
"mixpanel-import": "^2.8.162",
|
|
74
75
|
"p-limit": "^3.1.0",
|
|
75
76
|
"yargs": "^17.7.2"
|
package/types.d.ts
CHANGED
|
@@ -20,7 +20,7 @@ export interface Dungeon {
|
|
|
20
20
|
epochEnd?: number;
|
|
21
21
|
numEvents?: number;
|
|
22
22
|
numUsers?: number;
|
|
23
|
-
format?: "csv" | "json" | string;
|
|
23
|
+
format?: "csv" | "json" | "parquet" | string;
|
|
24
24
|
region?: "US" | "EU";
|
|
25
25
|
concurrency?: number;
|
|
26
26
|
batchSize?: number;
|
|
@@ -30,7 +30,6 @@ export interface Dungeon {
|
|
|
30
30
|
projectId?: string;
|
|
31
31
|
|
|
32
32
|
// ids
|
|
33
|
-
simulationName?: string;
|
|
34
33
|
name?: string;
|
|
35
34
|
|
|
36
35
|
//switches
|
|
@@ -44,6 +43,7 @@ export interface Dungeon {
|
|
|
44
43
|
hasDesktopDevices?: boolean;
|
|
45
44
|
hasBrowser?: boolean;
|
|
46
45
|
writeToDisk?: boolean | string;
|
|
46
|
+
gzip?: boolean;
|
|
47
47
|
verbose?: boolean;
|
|
48
48
|
hasAnonIds?: boolean;
|
|
49
49
|
hasSessionIds?: boolean;
|
|
@@ -117,7 +117,7 @@ export interface hookArrayOptions<T> {
|
|
|
117
117
|
hook?: Hook<T>;
|
|
118
118
|
type?: hookTypes;
|
|
119
119
|
filename?: string;
|
|
120
|
-
format?: "csv" | "json" | string;
|
|
120
|
+
format?: "csv" | "json" | "parquet" | string;
|
|
121
121
|
concurrency?: number;
|
|
122
122
|
context?: Context;
|
|
123
123
|
[key: string]: any;
|