make-mp-data 2.0.21 → 2.0.23

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -8,9 +8,11 @@ import utc from 'dayjs/plugin/utc.js';
8
8
  import path from 'path';
9
9
  import { mkdir, parseGCSUri } from 'ak-tools';
10
10
  import { existsSync } from 'fs';
11
+ import zlib from 'zlib';
11
12
  dayjs.extend(utc);
12
13
  import 'dotenv/config';
13
14
  import { domainSuffix, domainPrefix } from '../templates/defaults.js';
15
+ const {NODE_ENV = "unknown"} = process.env;
14
16
 
15
17
  /** @typedef {import('../../types').Dungeon} Config */
16
18
  /** @typedef {import('../../types').EventConfig} EventConfig */
@@ -482,15 +484,22 @@ STREAMERS
482
484
  ----
483
485
  */
484
486
 
485
- function streamJSON(filePath, data) {
487
+ function streamJSON(filePath, data, options = {}) {
486
488
  return new Promise((resolve, reject) => {
487
489
  let writeStream;
490
+ const { gzip = false } = options;
491
+
488
492
  if (filePath?.startsWith('gs://')) {
489
493
  const { uri, bucket, file } = parseGCSUri(filePath);
490
494
  writeStream = storage.bucket(bucket).file(file).createWriteStream({ gzip: true });
491
495
  }
492
496
  else {
493
497
  writeStream = fs.createWriteStream(filePath, { encoding: 'utf8' });
498
+ if (gzip) {
499
+ const gzipStream = zlib.createGzip();
500
+ gzipStream.pipe(writeStream);
501
+ writeStream = gzipStream;
502
+ }
494
503
  }
495
504
  data.forEach(item => {
496
505
  writeStream.write(JSON.stringify(item) + '\n');
@@ -503,15 +512,22 @@ function streamJSON(filePath, data) {
503
512
  });
504
513
  }
505
514
 
506
- function streamCSV(filePath, data) {
515
+ function streamCSV(filePath, data, options = {}) {
507
516
  return new Promise((resolve, reject) => {
508
517
  let writeStream;
518
+ const { gzip = false } = options;
519
+
509
520
  if (filePath?.startsWith('gs://')) {
510
521
  const { uri, bucket, file } = parseGCSUri(filePath);
511
522
  writeStream = storage.bucket(bucket).file(file).createWriteStream({ gzip: true });
512
523
  }
513
524
  else {
514
525
  writeStream = fs.createWriteStream(filePath, { encoding: 'utf8' });
526
+ if (gzip) {
527
+ const gzipStream = zlib.createGzip();
528
+ gzipStream.pipe(writeStream);
529
+ writeStream = gzipStream;
530
+ }
515
531
  }
516
532
 
517
533
  // Extract all unique keys from the data array
@@ -538,6 +554,117 @@ function streamCSV(filePath, data) {
538
554
  });
539
555
  }
540
556
 
557
/**
 * Writes an array of row objects to a Parquet file, either on local disk or
 * in Google Cloud Storage (when `filePath` starts with `gs://`).
 *
 * Columns are the unique keys across all rows (via `getUniqueKeys`). Each
 * column's Parquet type is inferred from ALL of its non-null values, so a
 * column is only typed INT32/INT64/DOUBLE/BOOLEAN/TIMESTAMP when every value
 * fits; anything mixed falls back to STRING.
 *
 * @param {string} filePath - local path or `gs://bucket/file` URI
 * @param {Object[]} data - rows to write; must be a non-empty array
 * @param {{ gzip?: boolean }} [options] - `gzip: true` gzips the local file
 *   and appends `.gz` to the path if it is missing. GCS uploads always use
 *   `gzip: true`, regardless of this option.
 * @returns {Promise<string>} the path that was written
 * @throws {Error} when `data` is not an array or is empty
 */
async function streamParquet(filePath, data, options = {}) {
  const { gzip = false } = options;

  // Validate before loading the writer so bad input fails fast and cheaply.
  if (!Array.isArray(data) || data.length === 0) {
    throw new Error('Cannot write parquet file with empty data');
  }

  // Dynamically import hyparquet-writer so it is only loaded when parquet is used
  const { parquetWriteFile, parquetWriteBuffer } = await import('hyparquet-writer');

  const columnData = getUniqueKeys(data).map(columnName =>
    buildParquetColumn(columnName, data.map(row => row[columnName]))
  );

  if (filePath?.startsWith('gs://')) {
    // For GCS, serialize to a buffer first, then upload it in one write
    const buffer = Buffer.from(parquetWriteBuffer({ columnData }));
    const { bucket, file } = parseGCSUri(filePath);
    // Always gzip for GCS (same as streamJSON / streamCSV)
    const writeStream = storage.bucket(bucket).file(file).createWriteStream({ gzip: true });

    return new Promise((resolve, reject) => {
      // Attach listeners before ending the stream so no event can be missed
      writeStream.on('finish', () => resolve(filePath));
      writeStream.on('error', reject);
      writeStream.end(buffer);
    });
  }

  if (!gzip) {
    // Direct write to disk
    parquetWriteFile({ filename: filePath, columnData });
    return filePath;
  }

  // Write to buffer, gzip it, then write the compressed bytes to disk
  const actualFilePath = filePath.endsWith('.gz') ? filePath : `${filePath}.gz`;
  const gzippedBuffer = zlib.gzipSync(Buffer.from(parquetWriteBuffer({ columnData })));
  await fs.promises.writeFile(actualFilePath, gzippedBuffer);
  return actualFilePath;
}

/**
 * Builds one column descriptor (`{ name, data, type }`) for streamParquet.
 *
 * @param {string} name - column name
 * @param {any[]} rawValues - one value per row (may contain null/undefined)
 * @returns {{ name: string, data: any[], type: string }}
 */
function buildParquetColumn(name, rawValues) {
  const values = rawValues.map(value => {
    if (value === null || value === undefined) return null;
    // Dates are kept as-is so they can be written as TIMESTAMP;
    // every other object is serialized to a JSON string.
    if (typeof value === 'object' && !(value instanceof Date)) {
      return JSON.stringify(value) ?? null;
    }
    return value;
  });

  const present = values.filter(value => value !== null);
  const everyPresent = predicate => present.length > 0 && present.every(predicate);
  const isInt32 = value => Number.isInteger(value) && value >= -2147483648 && value <= 2147483647;

  if (everyPresent(value => typeof value === 'boolean')) {
    return { name, data: values, type: 'BOOLEAN' };
  }

  if (everyPresent(value => typeof value === 'number')) {
    if (present.every(isInt32)) {
      return { name, data: values, type: 'INT32' };
    }
    if (present.every(Number.isSafeInteger)) {
      // INT64 columns are written as BigInt values
      return {
        name,
        data: values.map(value => (value === null ? null : BigInt(value))),
        type: 'INT64'
      };
    }
    return { name, data: values, type: 'DOUBLE' };
  }

  if (everyPresent(value => typeof value === 'bigint')) {
    return { name, data: values, type: 'INT64' };
  }

  if (everyPresent(value => value instanceof Date)) {
    return { name, data: values, type: 'TIMESTAMP' };
  }

  // Default (also covers all-null and mixed-type columns): coerce every
  // remaining non-string value so the column really contains only strings.
  const asString = value => {
    if (value === null || typeof value === 'string') return value;
    if (value instanceof Date) return value.toISOString();
    return String(value);
  };
  return { name, data: values.map(asString), type: 'STRING' };
}
667
+
541
668
 
542
669
  /*
543
670
  ----
@@ -545,13 +672,7 @@ WEIGHERS
545
672
  ----
546
673
  */
547
674
 
548
- function weighFunnels(acc, funnel) {
549
- const weight = funnel?.weight || 1;
550
- for (let i = 0; i < weight; i++) {
551
- acc.push(funnel);
552
- }
553
- return acc;
554
- }
675
+
555
676
 
556
677
  /**
557
678
  * a utility function to generate a range of numbers within a given skew
@@ -875,7 +996,7 @@ function buildFileNames(config) {
875
996
  let extension = "";
876
997
  extension = format === "csv" ? "csv" : "json";
877
998
  // const current = dayjs.utc().format("MM-DD-HH");
878
- let simName = config.simulationName;
999
+ let simName = config.name;
879
1000
  let writeDir = typeof config.writeToDisk === 'string' ? config.writeToDisk : "./";
880
1001
  if (config.writeToDisk) {
881
1002
  const dataFolder = path.resolve("./data");
@@ -1020,6 +1141,46 @@ let soupHits = 0;
1020
1141
  * @param {number} [peaks=5]
1021
1142
  */
1022
1143
  function TimeSoup(earliestTime, latestTime, peaks = 5, deviation = 2, mean = 0) {
1144
+ if (!earliestTime) earliestTime = global.FIXED_BEGIN ? global.FIXED_BEGIN : dayjs().subtract(30, 'd').unix(); // 30 days ago
1145
+ if (!latestTime) latestTime = global.FIXED_NOW ? global.FIXED_NOW : dayjs().unix();
1146
+ const chance = getChance();
1147
+ const totalRange = latestTime - earliestTime;
1148
+ const chunkSize = totalRange / peaks;
1149
+
1150
+ // Select a random chunk based on the number of peaks
1151
+ const peakIndex = integer(0, peaks - 1);
1152
+ const chunkStart = earliestTime + peakIndex * chunkSize;
1153
+ const chunkEnd = chunkStart + chunkSize;
1154
+ const chunkMid = (chunkStart + chunkEnd) / 2;
1155
+
1156
+ // Generate a single timestamp within this chunk using a normal distribution centered at chunkMid
1157
+ let offset;
1158
+ let iterations = 0;
1159
+ let isValidTime = false;
1160
+ do {
1161
+ iterations++;
1162
+ soupHits++;
1163
+ offset = chance.normal({ mean: mean, dev: chunkSize / deviation });
1164
+ isValidTime = validTime(chunkMid + offset, earliestTime, latestTime);
1165
+ if (iterations > 25000) {
1166
+ throw `${iterations} iterations... exceeded`;
1167
+ }
1168
+ } while (chunkMid + offset < chunkStart || chunkMid + offset > chunkEnd);
1169
+
1170
+ try {
1171
+ return dayjs.unix(chunkMid + offset).toISOString();
1172
+ }
1173
+
1174
+ catch (e) {
1175
+ //escape hatch
1176
+ // console.log('BAD TIME', e?.message);
1177
+ if (NODE_ENV === 'dev') debugger;
1178
+ return dayjs.unix(integer(earliestTime, latestTime)).toISOString();
1179
+ }
1180
+ }
1181
+
1182
+
1183
+ function NewTimeSoup(earliestTime, latestTime, peaks = 5, deviation = 2, mean = 0) {
1023
1184
  if (!earliestTime) earliestTime = global.FIXED_BEGIN ? global.FIXED_BEGIN : dayjs().subtract(30, 'd').unix(); // 30 days ago
1024
1185
  if (!latestTime) latestTime = global.FIXED_NOW ? global.FIXED_NOW : dayjs().unix();
1025
1186
  const chance = getChance();
@@ -1199,7 +1360,10 @@ function wrapFunc(obj, func, recursion = 0, parentKey = null, grandParentKey = n
1199
1360
 
1200
1361
  // }
1201
1362
 
1202
-
1363
/**
 * Flips a weighted coin.
 *
 * @param {number} num - likelihood handed to `chance.bool` (chance.js treats
 *   it as a percentage, 0–100)
 * @returns {boolean} result of `chance.bool({ likelihood: num })`
 */
function odds(num) {
  // Resolve the generator on every call, the same way TimeSoup does, instead
  // of capturing it once when the module is first imported. Presumably this
  // lets a generator configured later (e.g. through initChance) be honored —
  // assumes getChance() returns the current shared instance; TODO confirm.
  return getChance().bool({ likelihood: num });
}
1203
1367
 
1204
1368
  /**
1205
1369
  * makes a random-sized array of emojis
@@ -1290,12 +1454,12 @@ export {
1290
1454
  TimeSoup,
1291
1455
  companyName,
1292
1456
  generateEmoji,
1293
- hasSameKeys as haveSameKeys,
1457
+ hasSameKeys,
1294
1458
  deepClone,
1295
1459
  initChance,
1296
1460
  getChance,
1297
1461
  decimal,
1298
-
1462
+ odds,
1299
1463
  validTime,
1300
1464
  validEvent,
1301
1465
 
@@ -1311,7 +1475,6 @@ export {
1311
1475
  pickAWinner,
1312
1476
  quickHash,
1313
1477
  weighArray,
1314
- weighFunnels,
1315
1478
  validateEventConfig,
1316
1479
  shuffleArray,
1317
1480
  shuffleExceptFirst,
@@ -1325,6 +1488,7 @@ export {
1325
1488
  buildFileNames,
1326
1489
  streamJSON,
1327
1490
  streamCSV,
1491
+ streamParquet,
1328
1492
  datesBetween,
1329
1493
  weighChoices,
1330
1494
  wrapFunc,
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "make-mp-data",
3
- "version": "2.0.21",
3
+ "version": "2.0.23",
4
4
  "description": "builds all mixpanel primitives for a given project",
5
5
  "type": "module",
6
6
  "main": "index.js",
@@ -63,14 +63,15 @@
63
63
  "@google-cloud/storage": "^7.14.0",
64
64
  "ak-fetch": "^2.0.12",
65
65
  "ak-gemini": "^1.0.59",
66
- "ak-tools": "^1.1.1",
66
+ "ak-tools": "^1.1.12",
67
67
  "chance": "^1.1.11",
68
68
  "chart.js": "^3.9.1",
69
69
  "chartjs-node-canvas": "^4.1.6",
70
70
  "dayjs": "^1.11.11",
71
71
  "dotenv": "^16.4.5",
72
72
  "google-auth-library": "^9.15.0",
73
- "mixpanel-import": "^2.8.15",
73
+ "hyparquet-writer": "^0.6.1",
74
+ "mixpanel-import": "^2.8.162",
74
75
  "p-limit": "^3.1.0",
75
76
  "yargs": "^17.7.2"
76
77
  },
@@ -86,4 +87,4 @@
86
87
  "tmp/"
87
88
  ]
88
89
  }
89
- }
90
+ }
package/types.d.ts CHANGED
@@ -20,7 +20,7 @@ export interface Dungeon {
20
20
  epochEnd?: number;
21
21
  numEvents?: number;
22
22
  numUsers?: number;
23
- format?: "csv" | "json" | string;
23
+ format?: "csv" | "json" | "parquet" | string;
24
24
  region?: "US" | "EU";
25
25
  concurrency?: number;
26
26
  batchSize?: number;
@@ -30,7 +30,6 @@ export interface Dungeon {
30
30
  projectId?: string;
31
31
 
32
32
  // ids
33
- simulationName?: string;
34
33
  name?: string;
35
34
 
36
35
  //switches
@@ -44,6 +43,7 @@ export interface Dungeon {
44
43
  hasDesktopDevices?: boolean;
45
44
  hasBrowser?: boolean;
46
45
  writeToDisk?: boolean | string;
46
+ gzip?: boolean;
47
47
  verbose?: boolean;
48
48
  hasAnonIds?: boolean;
49
49
  hasSessionIds?: boolean;
@@ -97,7 +97,7 @@ export type hookTypes =
97
97
  | "user"
98
98
  | "group"
99
99
  | "lookup"
100
- // | "scd"
100
+ | "scd"
101
101
  | "scd-pre"
102
102
  | "mirror"
103
103
  | "funnel-pre"
@@ -117,7 +117,7 @@ export interface hookArrayOptions<T> {
117
117
  hook?: Hook<T>;
118
118
  type?: hookTypes;
119
119
  filename?: string;
120
- format?: "csv" | "json" | string;
120
+ format?: "csv" | "json" | "parquet" | string;
121
121
  concurrency?: number;
122
122
  context?: Context;
123
123
  [key: string]: any;
@@ -311,6 +311,11 @@ export interface Funnel {
311
311
  * funnel properties go onto each event in the funnel and are held constant
312
312
  */
313
313
  props?: Record<string, ValueValid>;
314
+ /**
315
+ * funnel conditions (user properties) are used to filter users who are eligible for the funnel
316
+ * these conditions must match the current user's profile for the user to be eligible for the funnel
317
+ */
318
+ conditions?: Record<string, ValueValid>;
314
319
  }
315
320
 
316
321
  /**