ga4-export-fixer 0.9.0-dev.1 → 0.9.0-dev.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -201,11 +201,11 @@ const validateEnhancedEventsConfig = (config, options = {}) => {
201
201
  }
202
202
  }
203
203
 
204
- // customSteps - optional array of queryBuilder step objects appended to the pipeline
205
- // Layer 1 (config shape): array, objects with non-empty name, no duplicates within customSteps.
204
+ // customSteps - optional array of queryBuilder step objects appended to the pipeline.
205
+ // Config-shape checks only: array, objects with non-empty name, no duplicates within customSteps.
206
206
  // Step-shape validation (clause keys, etc.) deferred to queryBuilder.
207
- // Collision-with-package-names check deferred to _generateEnhancedEventsSQL (Layer 2),
208
- // since the reserved set is config-dependent (e.g. item_list_* only exist when itemListAttribution is on).
207
+ // Collision-with-package-names check deferred to _generateEnhancedEventsSQL, since the
208
+ // reserved set is config-dependent (e.g. item_list_* only exist when itemListAttribution is on).
209
209
  if (config.customSteps !== undefined) {
210
210
  if (!Array.isArray(config.customSteps)) {
211
211
  throw new Error(`config.customSteps must be an array. Received: ${JSON.stringify(config.customSteps)}`);
@@ -225,6 +225,101 @@ const validateEnhancedEventsConfig = (config, options = {}) => {
225
225
  seenNames.add(step.name);
226
226
  }
227
227
  }
228
+
229
+ // enrichments - optional array of declarative external-data enrichment specs.
230
+ // Config-shape checks only. Reserved-name collision and item-level joinKey resolution
231
+ // happen in _generateEnhancedEventsSQL, where the reserved set and item-level join targets
232
+ // are derived from the resolved config.
233
+ if (config.enrichments !== undefined) {
234
+ if (!Array.isArray(config.enrichments)) {
235
+ throw new Error(`config.enrichments must be an array. Received: ${JSON.stringify(config.enrichments)}`);
236
+ }
237
+ const validLevels = ['event', 'item'];
238
+ const seenNames = new Set();
239
+ for (let i = 0; i < config.enrichments.length; i++) {
240
+ const entry = config.enrichments[i];
241
+ if (!entry || typeof entry !== 'object' || Array.isArray(entry)) {
242
+ throw new Error(`config.enrichments[${i}] must be a non-null object. Received: ${JSON.stringify(entry)}`);
243
+ }
244
+ if (typeof entry.name !== 'string' || !entry.name.trim()) {
245
+ throw new Error(`config.enrichments[${i}].name must be a non-empty string. Received: ${JSON.stringify(entry.name)}`);
246
+ }
247
+ if (seenNames.has(entry.name)) {
248
+ throw new Error(`config.enrichments contains duplicate name '${entry.name}'. Each enrichments entry must have a unique name.`);
249
+ }
250
+ seenNames.add(entry.name);
251
+ if (entry.level !== undefined && !validLevels.includes(entry.level)) {
252
+ throw new Error(`config.enrichments[${i}].level must be one of: ${validLevels.join(', ')}. Received: ${JSON.stringify(entry.level)}`);
253
+ }
254
+ // source: Dataform table reference object or backtick-quoted string
255
+ if (entry.source === undefined || entry.source === null) {
256
+ throw new Error(`config.enrichments[${i}].source is required.`);
257
+ }
258
+ if (isDataformTableReferenceObject(entry.source)) {
259
+ // Valid Dataform reference
260
+ } else if (typeof entry.source === 'string') {
261
+ if (!entry.source.trim()) {
262
+ throw new Error(`config.enrichments[${i}].source must be a non-empty string. Received empty string.`);
263
+ }
264
+ if (!/^`[^\.]+\.[^\.]+\.[^\.]+`$/.test(entry.source.trim())) {
265
+ throw new Error(`config.enrichments[${i}].source must be in the format '\`project.dataset.table\`' (with backticks) or a Dataform table reference. Received: ${JSON.stringify(entry.source)}`);
266
+ }
267
+ } else {
268
+ throw new Error(`config.enrichments[${i}].source must be a Dataform table reference object or a string in format '\`project.dataset.table\`'. Received: ${JSON.stringify(entry.source)}`);
269
+ }
270
+ // joinKey: required, plain SQL identifier OR non-empty array of plain SQL identifiers.
271
+ // Plain identifier = ^[a-zA-Z_][a-zA-Z0-9_]*$ — no aliases (`id as user_id`), no backticks,
272
+ // no dotted paths. Users with mismatched dim-column names alias in an upstream Dataform view.
273
+ const sqlIdentifier = /^[a-zA-Z_][a-zA-Z0-9_]*$/;
274
+ const aliasingHint = ' Aliases like \'id as user_id\' are not supported here; alias in an upstream Dataform view if your dim has a different column name.';
275
+ if (entry.joinKey === undefined || entry.joinKey === null) {
276
+ throw new Error(`config.enrichments[${i}].joinKey is required.`);
277
+ }
278
+ if (typeof entry.joinKey === 'string') {
279
+ if (!entry.joinKey.trim()) {
280
+ throw new Error(`config.enrichments[${i}].joinKey must be a non-empty string. Received empty string.`);
281
+ }
282
+ if (!sqlIdentifier.test(entry.joinKey)) {
283
+ throw new Error(`config.enrichments[${i}].joinKey must be a plain SQL identifier. Received: ${JSON.stringify(entry.joinKey)}.${aliasingHint}`);
284
+ }
285
+ } else if (Array.isArray(entry.joinKey)) {
286
+ if (entry.joinKey.length === 0) {
287
+ throw new Error(`config.enrichments[${i}].joinKey must be a non-empty array when provided as an array.`);
288
+ }
289
+ for (let j = 0; j < entry.joinKey.length; j++) {
290
+ const k = entry.joinKey[j];
291
+ if (typeof k !== 'string' || !k.trim()) {
292
+ throw new Error(`config.enrichments[${i}].joinKey[${j}] must be a non-empty string. Received: ${JSON.stringify(k)}`);
293
+ }
294
+ if (!sqlIdentifier.test(k)) {
295
+ throw new Error(`config.enrichments[${i}].joinKey[${j}] must be a plain SQL identifier. Received: ${JSON.stringify(k)}.${aliasingHint}`);
296
+ }
297
+ }
298
+ } else {
299
+ throw new Error(`config.enrichments[${i}].joinKey must be a string or a non-empty array of strings. Received: ${JSON.stringify(entry.joinKey)}`);
300
+ }
301
+ // columns: required, non-empty array of plain SQL identifiers (no aliasing).
302
+ if (!Array.isArray(entry.columns)) {
303
+ throw new Error(`config.enrichments[${i}].columns must be an array. Received: ${JSON.stringify(entry.columns)}`);
304
+ }
305
+ if (entry.columns.length === 0) {
306
+ throw new Error(`config.enrichments[${i}].columns must be non-empty. List the source columns to add to the output (excluding joinKey).`);
307
+ }
308
+ for (let j = 0; j < entry.columns.length; j++) {
309
+ const c = entry.columns[j];
310
+ if (typeof c !== 'string' || !c.trim()) {
311
+ throw new Error(`config.enrichments[${i}].columns[${j}] must be a non-empty string. Received: ${JSON.stringify(c)}`);
312
+ }
313
+ if (!sqlIdentifier.test(c)) {
314
+ throw new Error(`config.enrichments[${i}].columns[${j}] must be a plain SQL identifier. Received: ${JSON.stringify(c)}.${aliasingHint}`);
315
+ }
316
+ }
317
+ // dedupe: optional boolean
318
+ if (entry.dedupe !== undefined && typeof entry.dedupe !== 'boolean') {
319
+ throw new Error(`config.enrichments[${i}].dedupe must be a boolean when provided. Received: ${JSON.stringify(entry.dedupe)}`);
320
+ }
321
+ }
322
+ }
228
323
  } catch (e) {
229
324
  e.message = `Config validation: ${e.message}`;
230
325
  throw e;
package/utils.js CHANGED
@@ -389,6 +389,16 @@ const setDataformContext = (ctx, config) => {
389
389
  }
390
390
  }
391
391
 
392
+ // resolve Dataform refs in enrichments[].source the same way as sourceTable
393
+ if (Array.isArray(config.enrichments)) {
394
+ config.enrichments = config.enrichments.map(e => {
395
+ if (isDataformTableReferenceObject(e.source)) {
396
+ return { ...e, source: ctx.ref(e.source) };
397
+ }
398
+ return e;
399
+ });
400
+ }
401
+
392
402
  config.self = ctx.self();
393
403
  config.incremental = ctx.incremental();
394
404
 
@@ -465,37 +475,162 @@ const mergeDataformTableConfigurations = (defaultConfig, inputConfig = {}) => {
465
475
  };
466
476
 
/**
 * Builds a queryBuilder `select.columns` fragment that passes through every source column
 * not already covered by an explicit columns object.
 *
 * A source column is considered "covered" and is skipped when it appears as:
 *   - a KEY in `explicitColumns` (this includes keys whose value is `undefined`, such as
 *     an exclusion entry like `{ event_dimensions: undefined }`), OR
 *   - a VALUE in `explicitColumns` that is exactly the column name (a value-side rename,
 *     e.g. `{ user_traffic_source: 'traffic_source' }` covers 'traffic_source').
 *
 * Values are compared as whole values (SameValueZero, the comparison `Set` uses), never by
 * substring. An expression such as 'extract(datetime from event_timestamp)' therefore does
 * not cover 'event_timestamp'; that column is still emitted as a pass-through.
 *
 * A nullish `explicitColumns` is treated as "nothing is covered" and a nullish
 * `sourceColumns` as "no source columns", so both yield a well-defined result instead of
 * a TypeError.
 *
 * @param {Object} [explicitColumns] - A queryBuilder step's explicit `select.columns` entries.
 * @param {Iterable<string>} [sourceColumns] - Column names available on the source schema.
 * @returns {Object} A map of `{ column: column }` entries for every source column not covered,
 *   in the iteration order of `sourceColumns`.
 *
 * @example
 * buildPassThroughs(
 *     { event_name: 'event_name', user_traffic_source: 'traffic_source' },
 *     ['event_name', 'traffic_source', 'device', 'geo']
 * );
 * // → { device: 'device', geo: 'geo' }
 */
const buildPassThroughs = (explicitColumns, sourceColumns) => {
    const explicit = explicitColumns ?? {};
    // One Set holding both keys and values gives O(1) membership per source column,
    // instead of two linear array scans per column.
    const covered = new Set([...Object.keys(explicit), ...Object.values(explicit)]);
    const passThroughs = {};
    for (const column of sourceColumns ?? []) {
        if (!covered.has(column)) {
            passThroughs[column] = column;
        }
    }
    return passThroughs;
};
491
514
 
492
- // If all columns have been defined or excluded, do not select any
493
- if (exceptColumns.length === stepColumns.length) {
494
- return;
515
+
/**
 * Builds the per-enrichment source-CTE steps, JOIN clauses, and column mappings for the
 * declarative `enrichments` feature. Event-level and item-level entries are routed into
 * separate output channels so the caller can attach them to different downstream CTEs.
 *
 * This is a pure config-to-data mapping: it only reads the entries it is given and returns
 * plain data structures.
 *
 * Collision rules:
 *   - Two enrichments at the SAME level may not list the same column; this throws.
 *   - One enrichment may not list the same column twice; this throws.
 *   - The same column name at DIFFERENT levels is allowed (the event channel and the item
 *     channel are kept separately).
 *
 * @param {Array<Object>} [enrichments] - Enrichment entries, assumed to be already validated.
 *   Each entry has `{ name, level, source, joinKey, columns, dedupe? }`. `level` is 'event'
 *   (the default when omitted) or 'item'. `joinKey` is a string or an array of strings.
 *   A nullish argument is treated as an empty list.
 * @returns {Object} A struct with four fields:
 *   - `steps`: one source step per entry, named `enrich_<name>`, selecting the join key(s)
 *     plus the requested columns from `source`. When `dedupe` is truthy the step also carries
 *     a `qualify` clause keeping one row per join-key combination.
 *   - `event`: `{ joins, columns, columnNames }` for event-level entries.
 *   - `item`: `{ joins, columns, columnNames }` for item-level entries.
 *     In both channels, `joins` holds one left join per entry (`using(<join keys>)`),
 *     `columns` maps `<col>` to `enrich_<name>.<col>`, and `columnNames` is a Set of the
 *     column names.
 *   - `columnOwner`: map of `{ <column>: { i, name, level } }`. It is keyed by column name
 *     only, so when the same name is used at both levels the entry written last wins; use
 *     `level` to tell which one is recorded.
 *
 * @throws {Error} If two same-level enrichments target the same column (the message names
 *   both enrichments, the column, and the level), or if one enrichment lists a column twice.
 *
 * @example
 * const { steps, event, item } = buildEnrichments(config.enrichments);
 */
const buildEnrichments = (enrichments) => {
    const steps = [];
    const channels = {
        event: { joins: [], columns: {}, columnNames: new Set() },
        item: { joins: [], columns: {}, columnNames: new Set() },
    };
    // Ownership is tracked per level for collision diagnostics. The returned `columnOwner`
    // is keyed by column name alone, so a cross-level reuse of a name overwrites its entry;
    // reading the owner from it could blame an enrichment that lives at the other level.
    const ownersByLevel = { event: new Map(), item: new Map() };
    const columnOwner = {};

    for (const [i, e] of (enrichments ?? []).entries()) {
        const level = e.level ?? 'event';
        const channel = channels[level];
        const owners = ownersByLevel[level];
        const joinKeys = Array.isArray(e.joinKey) ? e.joinKey : [e.joinKey];
        const joinKeyList = joinKeys.join(', ');
        const cteName = `enrich_${e.name}`;

        // The source step selects the join key(s) plus the requested columns, each mapped
        // to itself (key === value).
        const cteCols = {};
        for (const k of joinKeys) cteCols[k] = k;
        for (const c of e.columns) cteCols[c] = c;
        const sourceStep = { name: cteName, select: { columns: cteCols }, from: e.source };
        // Opt-in dedupe: there is no ORDER BY in the window, so which row survives per
        // join-key combination is not deterministic.
        if (e.dedupe) {
            sourceStep.qualify = `row_number() over (partition by ${joinKeyList}) = 1`;
        }
        steps.push(sourceStep);

        channel.joins.push({ type: 'left', table: cteName, on: `using(${joinKeyList})` });

        for (const c of e.columns) {
            const owner = owners.get(c);
            if (owner !== undefined) {
                if (owner.i === i) {
                    // Same entry listing the column twice: say so directly instead of
                    // reporting the entry as colliding with itself.
                    throw new Error(
                        `config.enrichments[${i}] (name: '${e.name}') lists column '${c}' more than once. ` +
                        `Each column may appear only once in an enrichment's columns.`
                    );
                }
                throw new Error(
                    `config.enrichments[${i}] (name: '${e.name}') and config.enrichments[${owner.i}] ` +
                    `(name: '${owner.name}') both target column '${c}' at level '${level}'. ` +
                    `Two enrichments cannot write the same column at the same level; rename one in source SQL or pick a different name.`
                );
            }
            channel.columns[c] = `${cteName}.${c}`;
            channel.columnNames.add(c);
            owners.set(c, { i, name: e.name });
            // Last writer wins across levels; `level` records which one is stored.
            columnOwner[c] = { i, name: e.name, level };
        }
    }

    return { steps, event: channels.event, item: channels.item, columnOwner };
};
603
+
604
+
/**
 * Produces a qualified pass-through fragment that can be spread into a downstream SELECT's
 * `select.columns`.
 *
 * Every key of `step.select.columns` becomes an entry `{ <col>: '<step.name>.<col>' }`,
 * except:
 *   - keys whose value in `step.select.columns` is `undefined`, and
 *   - keys that are listed in `alreadyCovered`.
 *
 * Only the keys of `step.select.columns` are iterated, so names in `alreadyCovered` that
 * the step does not define have no effect; callers can pass a superset without filtering.
 *
 * @param {Object} step - A queryBuilder step with a `name` and a `select.columns` object.
 * @param {Iterable<string>} alreadyCovered - Column names the downstream SELECT should not
 *   receive from this step.
 * @returns {Object} A map of `{ <col>: '<step.name>.<col>' }` entries, in the key order of
 *   `step.select.columns`.
 *
 * @example
 * buildQualifiedPassThroughs(eventDataStep, ['event_date', 'session_id', 'entrances']);
 * // → { event_name: 'event_data.event_name', user_pseudo_id: 'event_data.user_pseudo_id', ... }
 */
const buildQualifiedPassThroughs = (step, alreadyCovered) => {
    const skipped = new Set(alreadyCovered);
    const { name: stepName, select: { columns: stepColumns } } = step;

    return Object.keys(stepColumns)
        .filter((columnName) => stepColumns[columnName] !== undefined && !skipped.has(columnName))
        .reduce((fragment, columnName) => {
            fragment[columnName] = `${stepName}.${columnName}`;
            return fragment;
        }, {});
};
500
635
 
501
636
 
@@ -573,7 +708,9 @@ module.exports = {
573
708
  queryBuilder,
574
709
  isDataformTableReferenceObject,
575
710
  setDataformContext,
576
- selectOtherColumns,
711
+ buildPassThroughs,
712
+ buildEnrichments,
713
+ buildQualifiedPassThroughs,
577
714
  processDate,
578
715
  getDatasetName
579
716
  };