make-mp-data 2.0.21 → 2.0.23
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dungeons/student-teacher.js +38 -87
- package/entry.js +7 -1
- package/index.js +90 -8
- package/lib/cli/cli.js +15 -1
- package/lib/core/config-validator.js +230 -219
- package/lib/core/context.js +13 -1
- package/lib/core/storage.js +88 -23
- package/lib/generators/events.js +17 -16
- package/lib/generators/funnels.js +8 -6
- package/lib/orchestrators/mixpanel-sender.js +5 -2
- package/lib/orchestrators/user-loop.js +212 -181
- package/lib/templates/abbreviated.d.ts +4 -3
- package/lib/templates/instructions.txt +1 -0
- package/lib/templates/{dungeon-template.js → scratch-dungeon-template.js} +9 -3
- package/lib/templates/verbose-schema.js +31 -4
- package/lib/utils/utils.js +178 -14
- package/package.json +5 -4
- package/types.d.ts +9 -4
package/lib/utils/utils.js
CHANGED
```diff
@@ -8,9 +8,11 @@ import utc from 'dayjs/plugin/utc.js';
 import path from 'path';
 import { mkdir, parseGCSUri } from 'ak-tools';
 import { existsSync } from 'fs';
+import zlib from 'zlib';
 dayjs.extend(utc);
 import 'dotenv/config';
 import { domainSuffix, domainPrefix } from '../templates/defaults.js';
+const {NODE_ENV = "unknown"} = process.env;

 /** @typedef {import('../../types').Dungeon} Config */
 /** @typedef {import('../../types').EventConfig} EventConfig */
@@ -482,15 +484,22 @@ STREAMERS
 ----
 */

-function streamJSON(filePath, data) {
+function streamJSON(filePath, data, options = {}) {
 	return new Promise((resolve, reject) => {
 		let writeStream;
+		const { gzip = false } = options;
+
 		if (filePath?.startsWith('gs://')) {
 			const { uri, bucket, file } = parseGCSUri(filePath);
 			writeStream = storage.bucket(bucket).file(file).createWriteStream({ gzip: true });
 		}
 		else {
 			writeStream = fs.createWriteStream(filePath, { encoding: 'utf8' });
+			if (gzip) {
+				const gzipStream = zlib.createGzip();
+				gzipStream.pipe(writeStream);
+				writeStream = gzipStream;
+			}
 		}
 		data.forEach(item => {
 			writeStream.write(JSON.stringify(item) + '\n');
@@ -503,15 +512,22 @@ function streamJSON(filePath, data) {
 	});
 }

-function streamCSV(filePath, data) {
+function streamCSV(filePath, data, options = {}) {
 	return new Promise((resolve, reject) => {
 		let writeStream;
+		const { gzip = false } = options;
+
 		if (filePath?.startsWith('gs://')) {
 			const { uri, bucket, file } = parseGCSUri(filePath);
 			writeStream = storage.bucket(bucket).file(file).createWriteStream({ gzip: true });
 		}
 		else {
 			writeStream = fs.createWriteStream(filePath, { encoding: 'utf8' });
+			if (gzip) {
+				const gzipStream = zlib.createGzip();
+				gzipStream.pipe(writeStream);
+				writeStream = gzipStream;
+			}
 		}

 		// Extract all unique keys from the data array
```
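Both streamers now take an optional third `options` argument; with `{ gzip: true }`, local writes are piped through `zlib.createGzip()` before hitting the file stream, while `gs://` destinations keep using `createWriteStream({ gzip: true })` as before. A minimal usage sketch (the import path, sample rows, and output filenames are illustrative, not taken from the package docs):

```js
// hypothetical caller of the updated streamers; note that the caller picks the
// filename — streamJSON/streamCSV do not append ".gz" themselves
import { streamJSON, streamCSV } from 'make-mp-data/lib/utils/utils.js';

const rows = [
	{ event: 'page_view', distinct_id: 'user-1', time: 1719000000 },
	{ event: 'purchase', distinct_id: 'user-2', time: 1719000042 },
];

await streamJSON('./data/events.ndjson', rows);                    // uncompressed, same as before
await streamJSON('./data/events.ndjson.gz', rows, { gzip: true }); // gzipped NDJSON
await streamCSV('./data/events.csv.gz', rows, { gzip: true });     // gzipped CSV
```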
```diff
@@ -538,6 +554,117 @@ function streamCSV(filePath, data) {
 	});
 }

+async function streamParquet(filePath, data, options = {}) {
+	const { gzip = false } = options;
+
+	// Dynamically import hyparquet-writer
+	const { parquetWriteFile, parquetWriteBuffer } = await import('hyparquet-writer');
+
+	if (data.length === 0) {
+		throw new Error('Cannot write parquet file with empty data');
+	}
+
+	// Extract column names and data from the input array
+	const columns = getUniqueKeys(data);
+	const columnData = columns.map(columnName => {
+		const columnValues = data.map(row => {
+			let value = row[columnName];
+
+			// Handle null/undefined values
+			if (value === null || value === undefined) {
+				return null;
+			}
+
+			// Convert objects to strings
+			if (typeof value === 'object') {
+				value = JSON.stringify(value);
+			}
+
+			return value;
+		});
+
+		// Determine the type based on the first non-null value
+		let type = 'STRING'; // default
+		const firstValue = columnValues.find(v => v !== null && v !== undefined);
+
+		if (firstValue !== undefined) {
+			if (typeof firstValue === 'boolean') {
+				type = 'BOOLEAN';
+			} else if (typeof firstValue === 'number') {
+				// For parquet compatibility, convert numbers to appropriate types
+				if (Number.isInteger(firstValue)) {
+					// Use INT32 for smaller integers, convert to BigInt for INT64 if needed
+					if (firstValue >= -2147483648 && firstValue <= 2147483647) {
+						type = 'INT32';
+					} else {
+						type = 'INT64';
+						// Convert all values to BigInt for INT64
+						for (let i = 0; i < columnValues.length; i++) {
+							if (columnValues[i] !== null && columnValues[i] !== undefined) {
+								columnValues[i] = BigInt(columnValues[i]);
+							}
+						}
+					}
+				} else {
+					type = 'DOUBLE';
+				}
+			} else if (firstValue instanceof Date) {
+				type = 'TIMESTAMP';
+			}
+		}
+
+		return {
+			name: columnName,
+			data: columnValues,
+			type: type
+		};
+	});
+
+	if (filePath?.startsWith('gs://')) {
+		// For GCS, write to buffer first, then upload
+		const arrayBuffer = parquetWriteBuffer({ columnData });
+		const { bucket, file } = parseGCSUri(filePath);
+
+		const writeStream = storage.bucket(bucket).file(file).createWriteStream({
+			gzip: gzip || true // Always gzip for GCS
+		});
+
+		return new Promise((resolve, reject) => {
+			writeStream.write(Buffer.from(arrayBuffer));
+			writeStream.end();
+			writeStream.on('finish', () => resolve(filePath));
+			writeStream.on('error', reject);
+		});
+	} else {
+		// For local files
+		let actualFilePath = filePath;
+		if (gzip && !filePath.endsWith('.gz')) {
+			actualFilePath = filePath + '.gz';
+		}
+
+		if (gzip) {
+			// Write to buffer then gzip to disk
+			const arrayBuffer = parquetWriteBuffer({ columnData });
+			const buffer = Buffer.from(arrayBuffer);
+			const gzippedBuffer = zlib.gzipSync(buffer);
+
+			return new Promise((resolve, reject) => {
+				fs.writeFile(actualFilePath, gzippedBuffer, (err) => {
+					if (err) reject(err);
+					else resolve(actualFilePath);
+				});
+			});
+		} else {
+			// Direct write to disk
+			parquetWriteFile({
+				filename: filePath,
+				columnData
+			});
+			return Promise.resolve(filePath);
+		}
+	}
+}
+

 /*
 ----
```
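The new `streamParquet` streamer dynamically imports `hyparquet-writer`, infers each column's type (`BOOLEAN`, `INT32`/`INT64`, `DOUBLE`, `TIMESTAMP`, defaulting to `STRING`) from the first non-null value, and JSON-stringifies nested objects. Two design notes visible above: locally, `{ gzip: true }` appends a `.gz` suffix when the path lacks one and writes a gzipped buffer; on the GCS branch, `gzip: gzip || true` always evaluates truthy, so GCS uploads are compressed regardless of the option. A rough usage sketch (paths and rows are hypothetical):

```js
// hypothetical calls into the new parquet streamer
import { streamParquet } from 'make-mp-data/lib/utils/utils.js';

const rows = [
	{ event: 'sign_up', revenue: 19.99, premium: false, meta: { source: 'ad' } },
	{ event: 'purchase', revenue: 42.5, premium: true, meta: { source: 'organic' } },
];

await streamParquet('./data/events.parquet', rows);                 // plain parquet on disk
await streamParquet('./data/events.parquet', rows, { gzip: true }); // written as ./data/events.parquet.gz
await streamParquet('gs://my-bucket/events.parquet', rows);         // buffered, then uploaded (always gzipped)
```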
```diff
@@ -545,13 +672,7 @@ WEIGHERS
 ----
 */

-
-	const weight = funnel?.weight || 1;
-	for (let i = 0; i < weight; i++) {
-		acc.push(funnel);
-	}
-	return acc;
-}
+

 /**
  * a utility function to generate a range of numbers within a given skew
@@ -875,7 +996,7 @@ function buildFileNames(config) {
 	let extension = "";
 	extension = format === "csv" ? "csv" : "json";
 	// const current = dayjs.utc().format("MM-DD-HH");
-	let simName = config.
+	let simName = config.name;
 	let writeDir = typeof config.writeToDisk === 'string' ? config.writeToDisk : "./";
 	if (config.writeToDisk) {
 		const dataFolder = path.resolve("./data");
@@ -1020,6 +1141,46 @@ let soupHits = 0;
  * @param {number} [peaks=5]
  */
 function TimeSoup(earliestTime, latestTime, peaks = 5, deviation = 2, mean = 0) {
+	if (!earliestTime) earliestTime = global.FIXED_BEGIN ? global.FIXED_BEGIN : dayjs().subtract(30, 'd').unix(); // 30 days ago
+	if (!latestTime) latestTime = global.FIXED_NOW ? global.FIXED_NOW : dayjs().unix();
+	const chance = getChance();
+	const totalRange = latestTime - earliestTime;
+	const chunkSize = totalRange / peaks;
+
+	// Select a random chunk based on the number of peaks
+	const peakIndex = integer(0, peaks - 1);
+	const chunkStart = earliestTime + peakIndex * chunkSize;
+	const chunkEnd = chunkStart + chunkSize;
+	const chunkMid = (chunkStart + chunkEnd) / 2;
+
+	// Generate a single timestamp within this chunk using a normal distribution centered at chunkMid
+	let offset;
+	let iterations = 0;
+	let isValidTime = false;
+	do {
+		iterations++;
+		soupHits++;
+		offset = chance.normal({ mean: mean, dev: chunkSize / deviation });
+		isValidTime = validTime(chunkMid + offset, earliestTime, latestTime);
+		if (iterations > 25000) {
+			throw `${iterations} iterations... exceeded`;
+		}
+	} while (chunkMid + offset < chunkStart || chunkMid + offset > chunkEnd);
+
+	try {
+		return dayjs.unix(chunkMid + offset).toISOString();
+	}
+
+	catch (e) {
+		//escape hatch
+		// console.log('BAD TIME', e?.message);
+		if (NODE_ENV === 'dev') debugger;
+		return dayjs.unix(integer(earliestTime, latestTime)).toISOString();
+	}
+}
+
+
+function NewTimeSoup(earliestTime, latestTime, peaks = 5, deviation = 2, mean = 0) {
 	if (!earliestTime) earliestTime = global.FIXED_BEGIN ? global.FIXED_BEGIN : dayjs().subtract(30, 'd').unix(); // 30 days ago
 	if (!latestTime) latestTime = global.FIXED_NOW ? global.FIXED_NOW : dayjs().unix();
 	const chance = getChance();
```
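`TimeSoup` gets a rewritten body: it splits the `[earliestTime, latestTime]` unix-second window into `peaks` equal chunks, picks one at random, then draws a normally distributed offset (`dev = chunkSize / deviation`) around the chunk midpoint until the sample lands inside the chunk (throwing after 25,000 attempts), falling back to a uniformly random timestamp if the ISO conversion fails; missing bounds default to `global.FIXED_BEGIN`/`global.FIXED_NOW`, or to a 30-days-ago/now window. The previous implementation is kept under the new `NewTimeSoup` name. A small sketch of a call (dates and output are illustrative):

```js
// hypothetical call: cluster event timestamps around 7 "peaks" in January 2024
import dayjs from 'dayjs';
import { TimeSoup } from 'make-mp-data/lib/utils/utils.js';

const begin = dayjs('2024-01-01').unix();
const end = dayjs('2024-01-31').unix();

const timestamp = TimeSoup(begin, end, 7); // deviation = 2, mean = 0 by default
console.log(timestamp);                    // e.g. "2024-01-19T04:12:33.000Z"
```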
```diff
@@ -1199,7 +1360,10 @@ function wrapFunc(obj, func, recursion = 0, parentKey = null, grandParentKey = n

 // }

-
+const chance = getChance();
+function odds(num) {
+	return chance.bool({ likelihood: num });
+}

 /**
  * makes a random-sized array of emojis
```
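A small module-level `odds(num)` helper is added (and exported below): it wraps `chance.bool({ likelihood: num })`, returning `true` roughly `num` percent of the time. For example:

```js
// hypothetical use of the new odds() helper
import { odds } from 'make-mp-data/lib/utils/utils.js';

if (odds(25)) {
	console.log('this branch is taken on roughly 25% of calls');
}
```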
```diff
@@ -1290,12 +1454,12 @@ export {
 	TimeSoup,
 	companyName,
 	generateEmoji,
-	hasSameKeys
+	hasSameKeys,
 	deepClone,
 	initChance,
 	getChance,
 	decimal,
-
+	odds,
 	validTime,
 	validEvent,

@@ -1311,7 +1475,6 @@ export {
 	pickAWinner,
 	quickHash,
 	weighArray,
-	weighFunnels,
 	validateEventConfig,
 	shuffleArray,
 	shuffleExceptFirst,
@@ -1325,6 +1488,7 @@ export {
 	buildFileNames,
 	streamJSON,
 	streamCSV,
+	streamParquet,
 	datesBetween,
 	weighChoices,
 	wrapFunc,
```
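Net effect on the export surface: `streamParquet` and `odds` are added, the `hasSameKeys` entry now carries a trailing comma, and `weighFunnels` (whose reducer body was deleted above) is removed, which breaks any code that imported it. A hypothetical downstream import, before and after:

```js
// before (2.0.21) — weighFunnels was part of the export list
// import { streamJSON, streamCSV, weighFunnels } from 'make-mp-data/lib/utils/utils.js';

// after (2.0.23) — weighFunnels is gone; streamParquet and odds are available instead
import { streamJSON, streamCSV, streamParquet, odds } from 'make-mp-data/lib/utils/utils.js';
```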
package/package.json
CHANGED
```diff
@@ -1,6 +1,6 @@
 {
   "name": "make-mp-data",
-  "version": "2.0.21",
+  "version": "2.0.23",
   "description": "builds all mixpanel primitives for a given project",
   "type": "module",
   "main": "index.js",
@@ -63,14 +63,15 @@
     "@google-cloud/storage": "^7.14.0",
     "ak-fetch": "^2.0.12",
     "ak-gemini": "^1.0.59",
-    "ak-tools": "^1.1.
+    "ak-tools": "^1.1.12",
     "chance": "^1.1.11",
     "chart.js": "^3.9.1",
     "chartjs-node-canvas": "^4.1.6",
     "dayjs": "^1.11.11",
     "dotenv": "^16.4.5",
     "google-auth-library": "^9.15.0",
-    "
+    "hyparquet-writer": "^0.6.1",
+    "mixpanel-import": "^2.8.162",
     "p-limit": "^3.1.0",
     "yargs": "^17.7.2"
   },
@@ -86,4 +87,4 @@
     "tmp/"
   ]
 }
-}
+}
```
package/types.d.ts
CHANGED
```diff
@@ -20,7 +20,7 @@ export interface Dungeon {
 	epochEnd?: number;
 	numEvents?: number;
 	numUsers?: number;
-	format?: "csv" | "json" | string;
+	format?: "csv" | "json" | "parquet" | string;
 	region?: "US" | "EU";
 	concurrency?: number;
 	batchSize?: number;
@@ -30,7 +30,6 @@ export interface Dungeon {
 	projectId?: string;

 	// ids
-	simulationName?: string;
 	name?: string;

 	//switches
@@ -44,6 +43,7 @@ export interface Dungeon {
 	hasDesktopDevices?: boolean;
 	hasBrowser?: boolean;
 	writeToDisk?: boolean | string;
+	gzip?: boolean;
 	verbose?: boolean;
 	hasAnonIds?: boolean;
 	hasSessionIds?: boolean;
```
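On the config type, `format` now admits `"parquet"`, a `gzip` switch is added alongside `writeToDisk`, and `simulationName` is dropped in favor of the existing `name` field (matching the `buildFileNames` change above, which now reads `config.name`). A hedged sketch of a partial config using the new fields, in the JSDoc style the package already uses (the type import path is illustrative):

```js
// hypothetical partial Dungeon config exercising the new fields
/** @type {Partial<import('make-mp-data/types').Dungeon>} */
const config = {
	name: 'acme-sim',   // use name; simulationName no longer exists on the type
	format: 'parquet',  // newly accepted format value
	gzip: true,         // new switch, pairing with the gzip-aware streamers
	writeToDisk: './data',
};
```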
```diff
@@ -97,7 +97,7 @@ export type hookTypes =
 	| "user"
 	| "group"
 	| "lookup"
-
+	| "scd"
 	| "scd-pre"
 	| "mirror"
 	| "funnel-pre"
@@ -117,7 +117,7 @@ export interface hookArrayOptions<T> {
 	hook?: Hook<T>;
 	type?: hookTypes;
 	filename?: string;
-	format?: "csv" | "json" | string;
+	format?: "csv" | "json" | "parquet" | string;
 	concurrency?: number;
 	context?: Context;
 	[key: string]: any;
@@ -311,6 +311,11 @@ export interface Funnel {
 	 * funnel properties go onto each event in the funnel and are held constant
 	 */
 	props?: Record<string, ValueValid>;
+	/**
+	 * funnel conditions (user properties) are used to filter users who are eligible for the funnel
+	 * these conditions must match the current user's profile for the user to be eligible for the funnel
+	 */
+	conditions?: Record<string, ValueValid>;
 }

 /**
```