ga4-export-fixer 0.8.0 → 0.9.0-dev.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -230,7 +230,7 @@ const _generateEnhancedEventsSQL = (mergedConfig) => {
230
230
  // ecommerce
231
231
  ecommerce: helpers.fixEcommerceStruct('ecommerce'),
232
232
  items: 'items',
233
- _item_list_attribution_row_id: itemListAttribution ? helpers.itemListAttributionRowId(ecommerceEventsFilter) : undefined,
233
+ _item_row_id: itemListAttribution ? helpers.itemRowId(ecommerceEventsFilter) : undefined,
234
234
  // flag if the data is "final" and is not expected to change anymore
235
235
  data_is_final: helpers.isFinalData(mergedConfig.dataIsFinal.detectionMethod, mergedConfig.dataIsFinal.dayThreshold),
236
236
  export_type: helpers.getGa4ExportType('_table_suffix'),
@@ -268,9 +268,9 @@ ${excludedEventsSQL}`,
268
268
  'group by': 'session_id',
269
269
  };
270
270
 
271
- // item list attribution CTEs:
272
- // 1. item_list_unnest: unnest items from ecommerce events, compute attribution via window function
273
- // 2. item_list_data: re-aggregate items with attributed list fields
271
+ // Shared item-array CTEs:
272
+ // 1. items_unnested: unnest items from ecommerce events, compute attribution via window function
273
+ // 2. items_rebuilt: re-aggregate items with attributed list fields
274
274
  const itemListSteps = itemListAttribution ? (() => {
275
275
  const attrExpr = helpers.itemListAttributionExpr(
276
276
  itemListAttribution.lookbackType,
@@ -279,12 +279,14 @@ ${excludedEventsSQL}`,
279
279
  );
280
280
  const passthroughEvents = `event_name in ('view_item_list', 'select_item', 'view_promotion', 'select_promotion')`;
281
281
 
282
- const attributionStep = {
283
- name: 'item_list_attribution',
282
+ const unnestedStep = {
283
+ name: 'items_unnested',
284
284
  select: {
285
285
  columns: {
286
- '_item_list_attribution_row_id': '_item_list_attribution_row_id',
286
+ '_item_row_id': '_item_row_id',
287
287
  'event_name': 'event_name',
288
+ // event_date is carried forward so it can be used in data enrichment joins
289
+ 'event_date': 'event_date',
288
290
  'item': 'item',
289
291
  '_item_list_attr': attrExpr,
290
292
  },
@@ -293,11 +295,11 @@ ${excludedEventsSQL}`,
293
295
  where: `event_name in (${ecommerceEventsFilter})`,
294
296
  };
295
297
 
296
- const dataStep = {
297
- name: 'item_list_data',
298
+ const rebuiltStep = {
299
+ name: 'items_rebuilt',
298
300
  select: {
299
301
  columns: {
300
- '_item_list_attribution_row_id': '_item_list_attribution_row_id',
302
+ '_item_row_id': '_item_row_id',
301
303
  'items': `array_agg(
302
304
  (select as struct item.* replace(
303
305
  coalesce(if(${passthroughEvents}, item.item_list_name, _item_list_attr.item_list_name), '(not set)') as item_list_name,
@@ -307,21 +309,81 @@ ${excludedEventsSQL}`,
307
309
  )`,
308
310
  },
309
311
  },
310
- from: 'item_list_attribution',
311
- 'group by': '_item_list_attribution_row_id',
312
+ from: 'items_unnested',
313
+ 'group by': '_item_row_id',
312
314
  };
313
315
 
314
- return [attributionStep, dataStep];
316
+ return [unnestedStep, rebuiltStep];
315
317
  })() : null;
316
318
 
317
319
  const finalColumnOrder = getFinalColumnOrder(eventDataStep, sessionDataStep);
318
320
 
319
- // When item list attribution is enabled, override the items column and exclude _item_list_attribution_row_id
321
+ // When item list attribution is enabled, override the items column and exclude _item_row_id
320
322
  // COALESCE handles events without items (not in ecommerce filter) where the LEFT JOIN returns NULL
321
323
  const itemListOverrides = itemListSteps ? {
322
- items: 'coalesce(item_list_data.items, event_data.items)',
324
+ items: 'coalesce(items_rebuilt.items, event_data.items)',
323
325
  } : {};
324
- const itemListExcludedColumns = itemListSteps ? ['_item_list_attribution_row_id'] : [];
326
+ const itemListExcludedColumns = itemListSteps ? ['_item_row_id'] : [];
327
+
328
+ // Build enrichment-source CTEs and gather event-level join/column data.
329
+ // Item-level enrichments throw "not yet supported" — they will arrive in a later release.
330
+ const enrichments = mergedConfig.enrichments ?? [];
331
+ const enrichmentSteps = [];
332
+ const enrichmentJoins = [];
333
+ const enrichmentColumns = {}; // column name → SQL expression for select.columns
334
+ const enrichmentColumnNames = new Set(); // column names for excludedColumns of wildcards
335
+ const enrichmentColumnOwner = {}; // column name → { i, name } for collision errors
336
+ for (const [i, e] of enrichments.entries()) {
337
+ const level = e.level ?? 'event';
338
+ if (level === 'item') {
339
+ throw new Error(
340
+ `config.enrichments[${i}] uses level: 'item', which is not yet supported in this version. ` +
341
+ `Item-level enrichments will ship in a future release; see design_docs/planned/data-enrichments.md.`
342
+ );
343
+ }
344
+ const joinKeys = Array.isArray(e.joinKey) ? e.joinKey : [e.joinKey];
345
+ const cteName = `enrich_${e.name}`;
346
+ // Source CTE selects joinKey columns plus the requested columns. key === value
347
+ // shape skips the alias clause in queryBuilder's columnsToSQL.
348
+ const cteCols = {};
349
+ for (const k of joinKeys) cteCols[k] = k;
350
+ for (const c of e.columns) cteCols[c] = c;
351
+ const sourceStep = {
352
+ name: cteName,
353
+ select: { columns: cteCols },
354
+ from: e.source,
355
+ };
356
+ // Opt-in dedupe: which row wins is non-deterministic — users with strict needs
357
+ // pre-aggregate in their source SQL.
358
+ if (e.dedupe) {
359
+ sourceStep.qualify = `row_number() over (partition by ${joinKeys.join(', ')}) = 1`;
360
+ }
361
+ enrichmentSteps.push(sourceStep);
362
+
363
+ enrichmentJoins.push({
364
+ type: 'left',
365
+ table: cteName,
366
+ on: `using(${joinKeys.join(', ')})`,
367
+ });
368
+
369
+ // Replace-or-add: each enrichment column overrides explicit select columns via JS object
370
+ // spread, AND joins the excludedColumns set so it suppresses overlap with the wildcard
371
+ // event_data.* / session_data.* expansions below.
372
+ for (const c of e.columns) {
373
+ if (enrichmentColumnNames.has(c)) {
374
+ const owner = enrichmentColumnOwner[c];
375
+ throw new Error(
376
+ `config.enrichments[${i}] (name: '${e.name}') and config.enrichments[${owner.i}] ` +
377
+ `(name: '${owner.name}') both target column '${c}'. ` +
378
+ `Two enrichments cannot write the same column; rename one in source SQL or pick a different name.`
379
+ );
380
+ }
381
+ enrichmentColumns[c] = `${cteName}.${c}`;
382
+ enrichmentColumnNames.add(c);
383
+ enrichmentColumnOwner[c] = { i, name: e.name };
384
+ }
385
+ }
386
+ const enrichmentExcludedColumns = [...enrichmentColumnNames];
325
387
 
326
388
  // Join event_data and session_data, include additional logic
327
389
  // Named 'enhanced_events' so user-supplied customSteps can reference it as a stable handle.
@@ -332,6 +394,9 @@ ${excludedEventsSQL}`,
332
394
  // get the most important columns in the correct order
333
395
  ...finalColumnOrder,
334
396
  ...itemListOverrides,
397
+ // event-level enrichment columns: override matching explicit columns; new columns added.
398
+ // Wildcard-column overlap is handled below via excludedColumns.
399
+ ...enrichmentColumns,
335
400
  // get the rest of the event_data columns
336
401
  '[sql]event_data': utils.selectOtherColumns(
337
402
  eventDataStep,
@@ -342,13 +407,14 @@ ${excludedEventsSQL}`,
342
407
  'data_is_final',
343
408
  'export_type',
344
409
  ...itemListExcludedColumns,
410
+ ...enrichmentExcludedColumns,
345
411
  ]
346
412
  ),
347
413
  // get the rest of the session_data columns
348
414
  '[sql]session_data': utils.selectOtherColumns(
349
415
  sessionDataStep,
350
416
  Object.keys(finalColumnOrder),
351
- []
417
+ [...enrichmentExcludedColumns],
352
418
  ),
353
419
  // include additional columns
354
420
  row_inserted_timestamp: 'current_timestamp()',
@@ -360,19 +426,22 @@ ${excludedEventsSQL}`,
360
426
  joins: [
361
427
  ...(itemListSteps ? [{
362
428
  type: 'left',
363
- table: 'item_list_data',
364
- on: 'using(_item_list_attribution_row_id)'
429
+ table: 'items_rebuilt',
430
+ on: 'using(_item_row_id)'
365
431
  }] : []),
366
432
  {
367
433
  type: 'left',
368
434
  table: 'session_data',
369
435
  on: 'using(session_id)'
370
- }
436
+ },
437
+ // Event-level enrichment joins go last so they apply on top of the package's own joins.
438
+ ...enrichmentJoins,
371
439
  ],
372
440
  where: helpers.incrementalDateFilter(mergedConfig)
373
441
  };
374
442
 
375
443
  const packageSteps = [
444
+ ...enrichmentSteps,
376
445
  eventDataStep,
377
446
  ...(itemListSteps ?? []),
378
447
  sessionDataStep,
@@ -381,7 +450,8 @@ ${excludedEventsSQL}`,
381
450
 
382
451
  // Layer 2 validation: customSteps name must not collide with package step names.
383
452
  // Reserved set is derived from packageSteps at runtime (single source of truth) — what
384
- // is reserved depends on config (e.g. item_list_* exist only when itemListAttribution is on).
453
+ // is reserved depends on config (e.g. items_unnested/items_rebuilt exist only when itemListAttribution is on,
454
+ // and enrich_* names exist only when enrichments are configured).
385
455
  const customSteps = mergedConfig.customSteps ?? [];
386
456
  if (customSteps.length > 0) {
387
457
  const reservedNames = new Set(packageSteps.map(s => s.name));
@@ -225,6 +225,101 @@ const validateEnhancedEventsConfig = (config, options = {}) => {
225
225
  seenNames.add(step.name);
226
226
  }
227
227
  }
228
+
229
+ // enrichments - optional array of declarative external-data enrichment specs.
230
+ // This block performs Layer 1 (config-shape) checks. Layer 2 checks (reserved-name collision
231
+ // + item-level deferral throw) live in _generateEnhancedEventsSQL — the reserved set is
232
+ // config-dependent and the item-level deferral throws there once the SQL is built.
233
+ if (config.enrichments !== undefined) {
234
+ if (!Array.isArray(config.enrichments)) {
235
+ throw new Error(`config.enrichments must be an array. Received: ${JSON.stringify(config.enrichments)}`);
236
+ }
237
+ const validLevels = ['event', 'item'];
238
+ const seenNames = new Set();
239
+ for (let i = 0; i < config.enrichments.length; i++) {
240
+ const entry = config.enrichments[i];
241
+ if (!entry || typeof entry !== 'object' || Array.isArray(entry)) {
242
+ throw new Error(`config.enrichments[${i}] must be a non-null object. Received: ${JSON.stringify(entry)}`);
243
+ }
244
+ if (typeof entry.name !== 'string' || !entry.name.trim()) {
245
+ throw new Error(`config.enrichments[${i}].name must be a non-empty string. Received: ${JSON.stringify(entry.name)}`);
246
+ }
247
+ if (seenNames.has(entry.name)) {
248
+ throw new Error(`config.enrichments contains duplicate name '${entry.name}'. Each enrichments entry must have a unique name.`);
249
+ }
250
+ seenNames.add(entry.name);
251
+ if (entry.level !== undefined && !validLevels.includes(entry.level)) {
252
+ throw new Error(`config.enrichments[${i}].level must be one of: ${validLevels.join(', ')}. Received: ${JSON.stringify(entry.level)}`);
253
+ }
254
+ // source: Dataform table reference object or backtick-quoted string
255
+ if (entry.source === undefined || entry.source === null) {
256
+ throw new Error(`config.enrichments[${i}].source is required.`);
257
+ }
258
+ if (isDataformTableReferenceObject(entry.source)) {
259
+ // Valid Dataform reference
260
+ } else if (typeof entry.source === 'string') {
261
+ if (!entry.source.trim()) {
262
+ throw new Error(`config.enrichments[${i}].source must be a non-empty string. Received empty string.`);
263
+ }
264
+ if (!/^`[^\.]+\.[^\.]+\.[^\.]+`$/.test(entry.source.trim())) {
265
+ throw new Error(`config.enrichments[${i}].source must be in the format '\`project.dataset.table\`' (with backticks) or a Dataform table reference. Received: ${JSON.stringify(entry.source)}`);
266
+ }
267
+ } else {
268
+ throw new Error(`config.enrichments[${i}].source must be a Dataform table reference object or a string in format '\`project.dataset.table\`'. Received: ${JSON.stringify(entry.source)}`);
269
+ }
270
+ // joinKey: required, plain SQL identifier OR non-empty array of plain SQL identifiers.
271
+ // Plain identifier = ^[a-zA-Z_][a-zA-Z0-9_]*$ — no aliases (`id as user_id`), no backticks,
272
+ // no dotted paths. Users with mismatched dim-column names alias in an upstream Dataform view.
273
+ const sqlIdentifier = /^[a-zA-Z_][a-zA-Z0-9_]*$/;
274
+ const aliasingHint = ' Aliases like \'id as user_id\' are not supported here; alias in an upstream Dataform view if your dim has a different column name.';
275
+ if (entry.joinKey === undefined || entry.joinKey === null) {
276
+ throw new Error(`config.enrichments[${i}].joinKey is required.`);
277
+ }
278
+ if (typeof entry.joinKey === 'string') {
279
+ if (!entry.joinKey.trim()) {
280
+ throw new Error(`config.enrichments[${i}].joinKey must be a non-empty string. Received empty string.`);
281
+ }
282
+ if (!sqlIdentifier.test(entry.joinKey)) {
283
+ throw new Error(`config.enrichments[${i}].joinKey must be a plain SQL identifier. Received: ${JSON.stringify(entry.joinKey)}.${aliasingHint}`);
284
+ }
285
+ } else if (Array.isArray(entry.joinKey)) {
286
+ if (entry.joinKey.length === 0) {
287
+ throw new Error(`config.enrichments[${i}].joinKey must be a non-empty array when provided as an array.`);
288
+ }
289
+ for (let j = 0; j < entry.joinKey.length; j++) {
290
+ const k = entry.joinKey[j];
291
+ if (typeof k !== 'string' || !k.trim()) {
292
+ throw new Error(`config.enrichments[${i}].joinKey[${j}] must be a non-empty string. Received: ${JSON.stringify(k)}`);
293
+ }
294
+ if (!sqlIdentifier.test(k)) {
295
+ throw new Error(`config.enrichments[${i}].joinKey[${j}] must be a plain SQL identifier. Received: ${JSON.stringify(k)}.${aliasingHint}`);
296
+ }
297
+ }
298
+ } else {
299
+ throw new Error(`config.enrichments[${i}].joinKey must be a string or a non-empty array of strings. Received: ${JSON.stringify(entry.joinKey)}`);
300
+ }
301
+ // columns: required, non-empty array of plain SQL identifiers (no aliasing).
302
+ if (!Array.isArray(entry.columns)) {
303
+ throw new Error(`config.enrichments[${i}].columns must be an array. Received: ${JSON.stringify(entry.columns)}`);
304
+ }
305
+ if (entry.columns.length === 0) {
306
+ throw new Error(`config.enrichments[${i}].columns must be non-empty. List the source columns to add to the output (excluding joinKey).`);
307
+ }
308
+ for (let j = 0; j < entry.columns.length; j++) {
309
+ const c = entry.columns[j];
310
+ if (typeof c !== 'string' || !c.trim()) {
311
+ throw new Error(`config.enrichments[${i}].columns[${j}] must be a non-empty string. Received: ${JSON.stringify(c)}`);
312
+ }
313
+ if (!sqlIdentifier.test(c)) {
314
+ throw new Error(`config.enrichments[${i}].columns[${j}] must be a plain SQL identifier. Received: ${JSON.stringify(c)}.${aliasingHint}`);
315
+ }
316
+ }
317
+ // dedupe: optional boolean
318
+ if (entry.dedupe !== undefined && typeof entry.dedupe !== 'boolean') {
319
+ throw new Error(`config.enrichments[${i}].dedupe must be a boolean when provided. Received: ${JSON.stringify(entry.dedupe)}`);
320
+ }
321
+ }
322
+ }
228
323
  } catch (e) {
229
324
  e.message = `Config validation: ${e.message}`;
230
325
  throw e;
package/utils.js CHANGED
@@ -389,6 +389,16 @@ const setDataformContext = (ctx, config) => {
389
389
  }
390
390
  }
391
391
 
392
+ // resolve Dataform refs in enrichments[].source the same way as sourceTable
393
+ if (Array.isArray(config.enrichments)) {
394
+ config.enrichments = config.enrichments.map(e => {
395
+ if (isDataformTableReferenceObject(e.source)) {
396
+ return { ...e, source: ctx.ref(e.source) };
397
+ }
398
+ return e;
399
+ });
400
+ }
401
+
392
402
  config.self = ctx.self();
393
403
  config.incremental = ctx.incremental();
394
404
 
@@ -479,23 +489,35 @@ const selectOtherColumns = (step, alreadyDefinedColumns = [], excludedColumns =
479
489
  const stepName = step.name;
480
490
  const stepColumns = Object.keys(step.select.columns);
481
491
 
482
- // Determine which columns to exclude: those already defined or explicitly excluded
483
- const exceptColumns = stepColumns.filter(
492
+ // Columns in step.select.columns that should be excluded (already-defined or explicitly listed)
493
+ const internalExcept = stepColumns.filter(
484
494
  column => alreadyDefinedColumns.includes(column) || excludedColumns.includes(column)
485
495
  );
486
496
 
487
- // If none of the columns have been defined or excluded, select them all
488
- if (exceptColumns.length === 0) {
497
+ // Columns in excludedColumns that aren't enumerated in step.select.columns. These are
498
+ // wildcard-sourced columns (e.g. default GA4 export columns coming through `event_data.*`
499
+ // inside event_data's own select). The caller knows what to exclude; trust them.
500
+ // BigQuery throws at dry-run if the column doesn't exist in the source — surfaces typos.
501
+ // Filter out undefined/null entries (callers can pass conditional values like
502
+ // `cond ? 'col' : undefined` for ergonomics).
503
+ const externalExcept = excludedColumns.filter(
504
+ c => typeof c === 'string' && c.length > 0 && !stepColumns.includes(c)
505
+ );
506
+
507
+ const allExcept = [...internalExcept, ...externalExcept];
508
+
509
+ // If nothing is excluded, select everything
510
+ if (allExcept.length === 0) {
489
511
  return `${stepName}.*`;
490
512
  }
491
513
 
492
- // If all columns have been defined or excluded, do not select any
493
- if (exceptColumns.length === stepColumns.length) {
514
+ // If every enumerated column is excluded and there are no external excepts to apply,
515
+ // there's nothing to select via the wildcard
516
+ if (internalExcept.length === stepColumns.length && externalExcept.length === 0) {
494
517
  return;
495
518
  }
496
519
 
497
- // Otherwise, select all except the excluded/defined ones
498
- return `${stepName}.* except (${exceptColumns.join(', ')})`;
520
+ return `${stepName}.* except (${allExcept.join(', ')})`;
499
521
  };
500
522
 
501
523