ga4-export-fixer 0.9.0-dev.6 → 0.9.0-dev.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "ga4-export-fixer",
3
- "version": "0.9.0-dev.6",
3
+ "version": "0.9.0-dev.8",
4
4
  "description": "",
5
5
  "main": "index.js",
6
6
  "files": [
@@ -320,74 +320,23 @@ ${excludedEventsSQL}`,
320
320
  } : {};
321
321
  const itemListExcludedColumns = itemListSteps ? ['_item_row_id'] : [];
322
322
 
323
- // Build enrichment-source CTEs and gather event-level join/column data.
324
- // Item-level enrichments throw "not yet supported" — they will arrive in a later release.
325
- const enrichments = mergedConfig.enrichments ?? [];
326
- const enrichmentSteps = [];
327
- const enrichmentJoins = [];
328
- const enrichmentColumns = {}; // column name SQL expression for select.columns
329
- const enrichmentColumnNames = new Set(); // column names for excludedColumns of wildcards
330
- const enrichmentColumnOwner = {}; // column name → { i, name } for collision errors
331
- for (const [i, e] of enrichments.entries()) {
332
- const level = e.level ?? 'event';
333
- if (level === 'item') {
334
- throw new Error(
335
- `config.enrichments[${i}] uses level: 'item', which is not yet supported in this version. ` +
336
- `Item-level enrichments will ship in a future release; see design_docs/planned/data-enrichments.md.`
337
- );
338
- }
339
- const joinKeys = Array.isArray(e.joinKey) ? e.joinKey : [e.joinKey];
340
- const cteName = `enrich_${e.name}`;
341
- // Source CTE selects joinKey columns plus the requested columns. key === value
342
- // shape skips the alias clause in queryBuilder's columnsToSQL.
343
- const cteCols = {};
344
- for (const k of joinKeys) cteCols[k] = k;
345
- for (const c of e.columns) cteCols[c] = c;
346
- const sourceStep = {
347
- name: cteName,
348
- select: { columns: cteCols },
349
- from: e.source,
350
- };
351
- // Opt-in dedupe: which row wins is non-deterministic — users with strict needs
352
- // pre-aggregate in their source SQL.
353
- if (e.dedupe) {
354
- sourceStep.qualify = `row_number() over (partition by ${joinKeys.join(', ')}) = 1`;
355
- }
356
- enrichmentSteps.push(sourceStep);
357
-
358
- enrichmentJoins.push({
359
- type: 'left',
360
- table: cteName,
361
- on: `using(${joinKeys.join(', ')})`,
362
- });
363
-
364
- // Replace-or-add: each enrichment column overrides explicit select columns via JS object
365
- // spread, AND joins the excludedColumns set so it suppresses overlap with the wildcard
366
- // event_data.* / session_data.* expansions below.
367
- for (const c of e.columns) {
368
- if (enrichmentColumnNames.has(c)) {
369
- const owner = enrichmentColumnOwner[c];
370
- throw new Error(
371
- `config.enrichments[${i}] (name: '${e.name}') and config.enrichments[${owner.i}] ` +
372
- `(name: '${owner.name}') both target column '${c}'. ` +
373
- `Two enrichments cannot write the same column; rename one in source SQL or pick a different name.`
374
- );
375
- }
376
- enrichmentColumns[c] = `${cteName}.${c}`;
377
- enrichmentColumnNames.add(c);
378
- enrichmentColumnOwner[c] = { i, name: e.name };
379
- }
380
- }
381
- const enrichmentExcludedColumns = [...enrichmentColumnNames];
382
-
383
- // Only forward enrichment columns to each wildcard's EXCEPT input if they actually exist
384
- // in that wildcard's source CTE. Otherwise BigQuery rejects with "Column X in SELECT *
385
- // EXCEPT list does not exist". After M1, Object.keys(step.select.columns) is the complete
386
- // column set of both event_data and session_data — so the same predicate works for both.
387
- const eventDataExplicit = new Set(Object.keys(eventDataStep.select.columns));
388
- const sessionDataExplicit = new Set(Object.keys(sessionDataStep.select.columns));
389
- const eventDataEnrichmentExcept = enrichmentExcludedColumns.filter(c => eventDataExplicit.has(c));
390
- const sessionDataEnrichmentExcept = enrichmentExcludedColumns.filter(c => sessionDataExplicit.has(c));
323
+ // Build enrichment-source CTEs and gather event-level join/column data. Item-level
324
+ // enrichments throw "not yet supported" inside the utility — they will arrive in a later release.
325
+ const { steps: enrichmentSteps, joins: enrichmentJoins, columns: enrichmentColumns,
326
+ columnNames: enrichmentColumnNames } = utils.buildEnrichments(mergedConfig.enrichments);
327
+
328
+ // Build the set of columns the outer SELECT already maps explicitly (so wildcards skip them)
329
+ // plus internal-only columns that should never reach enhanced_events.
330
+ const alreadyMapped = [
331
+ ...Object.keys(finalColumnOrder),
332
+ ...Object.keys(itemListOverrides),
333
+ ...enrichmentColumnNames,
334
+ 'entrances',
335
+ mergedConfig.sessionParams.length > 0 ? 'session_params_prep' : undefined,
336
+ 'data_is_final',
337
+ 'export_type',
338
+ ...itemListExcludedColumns,
339
+ ];
391
340
 
392
341
  // Join event_data and session_data, include additional logic
393
342
  // Named 'enhanced_events' so user-supplied customSteps can reference it as a stable handle.
@@ -399,27 +348,10 @@ ${excludedEventsSQL}`,
399
348
  ...finalColumnOrder,
400
349
  ...itemListOverrides,
401
350
  // event-level enrichment columns: override matching explicit columns; new columns added.
402
- // Wildcard-column overlap is handled below via excludedColumns.
403
351
  ...enrichmentColumns,
404
- // get the rest of the event_data columns
405
- '[sql]event_data': utils.selectOtherColumns(
406
- eventDataStep,
407
- Object.keys(finalColumnOrder),
408
- [
409
- 'entrances',
410
- mergedConfig.sessionParams.length > 0 ? 'session_params_prep' : undefined,
411
- 'data_is_final',
412
- 'export_type',
413
- ...itemListExcludedColumns,
414
- ...eventDataEnrichmentExcept,
415
- ]
416
- ),
417
- // get the rest of the session_data columns
418
- '[sql]session_data': utils.selectOtherColumns(
419
- sessionDataStep,
420
- Object.keys(finalColumnOrder),
421
- sessionDataEnrichmentExcept,
422
- ),
352
+ // explicit pass-throughs for the rest of event_data and session_data
353
+ ...utils.buildQualifiedPassThroughs(eventDataStep, alreadyMapped),
354
+ ...utils.buildQualifiedPassThroughs(sessionDataStep, alreadyMapped),
423
355
  // include additional columns
424
356
  row_inserted_timestamp: 'current_timestamp()',
425
357
  data_is_final: 'data_is_final',
package/utils.js CHANGED
@@ -474,53 +474,6 @@ const mergeDataformTableConfigurations = (defaultConfig, inputConfig = {}) => {
474
474
  return deepMerge(defaultConfig, inputConfig);
475
475
  };
476
476
 
477
- /**
478
- * Generates a SQL selection string for a given query step, excluding columns already defined elsewhere
479
- * or columns that should be excluded.
480
- *
481
- * This utility is helpful when joining tables/CTEs to avoid selecting duplicate or already-present columns.
482
- *
483
- * @param {Object} step - A queryBuilder structured step containing a `name` (CTE/table alias) and a `select.columns` object.
484
- * @param {string[]} [alreadyDefinedColumns=[]] - Columns that have already been defined and should be excluded from selection.
485
- * @param {string[]} [excludedColumns=[]] - Additional columns to explicitly exclude from selection. Entries that are not non-empty strings (e.g. `undefined`) are ignored; names not enumerated in `step.select.columns` are still forwarded to the EXCEPT list.
486
- * @returns {string|undefined} A SQL select string (e.g. 'stepName.*' or 'stepName.* except (col1, col2)'), or undefined when at least one column is excluded, every column enumerated in `step.select.columns` is excluded, and `excludedColumns` names no column outside that set.
487
- */
488
- const selectOtherColumns = (step, alreadyDefinedColumns = [], excludedColumns = []) => {
489
- const stepName = step.name;
490
- const stepColumns = Object.keys(step.select.columns);
491
-
492
- // Columns in step.select.columns that should be excluded (already-defined or explicitly listed)
493
- const internalExcept = stepColumns.filter(
494
- column => alreadyDefinedColumns.includes(column) || excludedColumns.includes(column)
495
- );
496
-
497
- // Columns in excludedColumns that aren't enumerated in step.select.columns. These are
498
- // wildcard-sourced columns (e.g. default GA4 export columns coming through `event_data.*`
499
- // inside event_data's own select). The caller knows what to exclude; trust them.
500
- // BigQuery throws at dry-run if the column doesn't exist in the source — surfaces typos.
501
- // Filter out undefined/null entries (callers can pass conditional values like
502
- // `cond ? 'col' : undefined` for ergonomics).
503
- const externalExcept = excludedColumns.filter(
504
- c => typeof c === 'string' && c.length > 0 && !stepColumns.includes(c)
505
- );
506
-
507
- const allExcept = [...internalExcept, ...externalExcept];
508
-
509
- // If nothing is excluded, select everything
510
- if (allExcept.length === 0) {
511
- return `${stepName}.*`;
512
- }
513
-
514
- // If every enumerated column is excluded and there are no external excepts to apply,
515
- // there's nothing to select via the wildcard
516
- if (internalExcept.length === stepColumns.length && externalExcept.length === 0) {
517
- return;
518
- }
519
-
520
- return `${stepName}.* except (${allExcept.join(', ')})`;
521
- };
522
-
523
-
524
477
  /**
525
478
  * Builds a queryBuilder `select.columns` fragment that passes through every source column
526
479
  * not already covered by an explicit columns object.
@@ -560,6 +513,119 @@ const buildPassThroughs = (explicitColumns, sourceColumns) => {
560
513
  };
561
514
 
562
515
 
516
+ /**
517
+ * Builds the per-enrichment CTE definitions, JOIN clauses, and column-name mappings for the
518
+ * declarative `enrichments` feature.
519
+ *
520
+ * Pure config-to-data mapping. No knowledge of downstream CTEs or specific table modules —
521
+ * intended to be called by any table module that exposes an `enrichments` config field.
522
+ *
523
+ * Encapsulates two generation-time throws:
524
+ * - level: 'item' (not yet supported; deferred per design_docs/planned/data-enrichments.md Q15).
525
+ * - Column collisions: the same column name seen twice, whether across two enrichments or repeated within one entry's `columns` list.
526
+ *
527
+ * @param {Array<Object>} [enrichments] - Validated enrichment entries; null/undefined is treated as an empty list. Each entry has fields:
528
+ * { name, level, source, joinKey, columns, dedupe? } per data-enrichments.md Q8. `level` defaults to 'event' when omitted; `joinKey` may be a single column name or an array of names.
529
+ * @returns {Object} A struct with five fields:
530
+ * - `steps` — array of queryBuilder source-CTE step definitions (one `enrich_<name>` per entry).
531
+ * - `joins` — array of LEFT JOIN clauses to attach downstream (one per entry).
532
+ * - `columns` — map of `{ <enrichmentColumn>: 'enrich_<name>.<col>' }` for spreading into a
533
+ * downstream SELECT's `select.columns`.
534
+ * - `columnNames` — Set of all enrichment column names (used by callers for overlap detection
535
+ * against downstream CTEs).
536
+ * - `columnOwner` — map of `{ <column>: { i, name } }` recording which enrichment owns each
537
+ * column; preserved for diagnostics.
538
+ *
539
+ * @throws {Error} If any entry has `level: 'item'` (with a pointer to data-enrichments.md).
540
+ * @throws {Error} If a column name is targeted twice (the message names both owning entries; a column repeated within one entry reports that same entry twice).
541
+ *
542
+ * @example
543
+ * const { steps, joins, columns, columnNames } = buildEnrichments(config.enrichments);
544
+ */
545
+ const buildEnrichments = (enrichments) => {
546
+ const steps = [];
547
+ const joins = [];
548
+ const columns = {};
549
+ const columnNames = new Set();
550
+ const columnOwner = {};
551
+
552
+ for (const [i, e] of (enrichments ?? []).entries()) {
553
+ const level = e.level ?? 'event';
554
+ if (level === 'item') {
555
+ throw new Error(
556
+ `config.enrichments[${i}] uses level: 'item', which is not yet supported in this version. ` +
557
+ `Item-level enrichments will ship in a future release; see design_docs/planned/data-enrichments.md.`
558
+ );
559
+ }
560
+ const joinKeys = Array.isArray(e.joinKey) ? e.joinKey : [e.joinKey];
561
+ const cteName = `enrich_${e.name}`;
562
+
563
+ // Source CTE selects joinKey columns plus the requested columns. key === value
564
+ // shape skips the alias clause in queryBuilder's columnsToSQL.
565
+ const cteCols = {};
566
+ for (const k of joinKeys) cteCols[k] = k;
567
+ for (const c of e.columns) cteCols[c] = c;
568
+ const sourceStep = { name: cteName, select: { columns: cteCols }, from: e.source };
569
+ // Opt-in dedupe: keeps one row per joinKey combination. The window has no ORDER BY, so which row wins is non-deterministic — users with strict needs
570
+ // pre-aggregate in their source SQL.
571
+ if (e.dedupe) {
572
+ sourceStep.qualify = `row_number() over (partition by ${joinKeys.join(', ')}) = 1`;
573
+ }
574
+ steps.push(sourceStep);
575
+
576
+ joins.push({ type: 'left', table: cteName, on: `using(${joinKeys.join(', ')})` });
577
+
578
+ for (const c of e.columns) {
579
+ if (columnNames.has(c)) {
580
+ const owner = columnOwner[c];
581
+ throw new Error(
582
+ `config.enrichments[${i}] (name: '${e.name}') and config.enrichments[${owner.i}] ` +
583
+ `(name: '${owner.name}') both target column '${c}'. ` +
584
+ `Two enrichments cannot write the same column; rename one in source SQL or pick a different name.`
585
+ );
586
+ }
587
+ columns[c] = `${cteName}.${c}`;
588
+ columnNames.add(c);
589
+ columnOwner[c] = { i, name: e.name };
590
+ }
591
+ }
592
+
593
+ return { steps, joins, columns, columnNames, columnOwner };
594
+ };
595
+
596
+
597
+ /**
598
+ * Builds a qualified pass-through fragment for spreading into a downstream SELECT's
599
+ * `select.columns`. For each column in `step.select.columns` not already in `alreadyCovered`,
600
+ * emits an entry of the form `{ <col>: '<step.name>.<col>' }`.
601
+ *
602
+ * Columns whose values in `step.select.columns` are `undefined` (the user-exclusion sentinel
603
+ * shape from getExcludedColumns) are skipped. Names in `alreadyCovered` that don't exist in
604
+ * `step.select.columns` are silently ignored — the loop only iterates `step.select.columns`,
605
+ * so unknown names cause no harm. This is the safety property that lets callers pass
606
+ * "everything that might collide" without pre-filtering. Only strictly `undefined` values are skipped; other falsy values such as `null` or `''` still produce a pass-through entry.
607
+ *
608
+ * @param {Object} step - A queryBuilder step with a `name` and `select.columns` object.
609
+ * @param {Iterable<string>} alreadyCovered - Column names already mapped elsewhere in the
610
+ * downstream SELECT, plus any internal-only columns the downstream SELECT shouldn't re-emit. Non-string entries (e.g. `undefined` from a conditional expression) never match a column key and are effectively ignored.
611
+ * @returns {Object} A map of `{ <col>: '<step.name>.<col>' }` entries.
612
+ *
613
+ * @example
614
+ * buildQualifiedPassThroughs(eventDataStep, ['event_date', 'session_id', 'entrances']);
615
+ * // → { event_name: 'event_data.event_name', user_pseudo_id: 'event_data.user_pseudo_id', ... }
616
+ */
617
+ const buildQualifiedPassThroughs = (step, alreadyCovered) => {
618
+ const covered = new Set(alreadyCovered);
619
+ const passThroughs = {};
620
+ for (const [col, expr] of Object.entries(step.select.columns)) {
621
+ if (expr === undefined) continue;
622
+ if (covered.has(col)) continue;
623
+ passThroughs[col] = `${step.name}.${col}`;
624
+ }
625
+ return passThroughs;
626
+ };
627
+
628
+
563
629
  /**
564
630
  * Processes a date input string and returns a corresponding SQL date casting expression,
565
631
  * or passes through BigQuery SQL statements as-is.
@@ -634,8 +700,9 @@ module.exports = {
634
700
  queryBuilder,
635
701
  isDataformTableReferenceObject,
636
702
  setDataformContext,
637
- selectOtherColumns,
638
703
  buildPassThroughs,
704
+ buildEnrichments,
705
+ buildQualifiedPassThroughs,
639
706
  processDate,
640
707
  getDatasetName
641
708
  };