ga4-export-fixer 0.8.0-dev.2 → 0.9.0-dev.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -128,6 +128,12 @@ The goal of the package is to **speed up development** when building data models
128
128
  <b>🕐 Timezone-Aware Datetime</b><br>
129
129
  <code>event_datetime</code> converted to a configurable IANA timezone
130
130
  </td>
131
+ <td valign="top">
132
+ <b>🧩 Custom Processing Steps</b><br>
133
+ Append user-defined CTEs via <code>customSteps</code> to derive new columns or join external tables
134
+ </td>
135
+ </tr>
136
+ <tr>
131
137
  <td valign="top">
132
138
  <b>🛡️ Zero Dependencies</b><br>
133
139
  No additional external dependencies added to your Dataform repository
@@ -139,10 +145,10 @@ The goal of the package is to **speed up development** when building data models
139
145
 
140
146
  Features under consideration for future releases:
141
147
 
148
+ - Data enrichment (item-level, session-level, event-level)
149
+ - Aggregated tables (ga4_session, ga4_ecommerce...)
142
150
  - Web and app specific default configurations
143
151
  - Custom channel grouping
144
- - Data enrichment (item-level, session-level, event-level)
145
- - Custom processing steps (additional CTEs)
146
152
  - Custom traffic source attribution
147
153
 
148
154
  ## Installation
@@ -163,7 +169,7 @@ Include the package in the package.json file in your Dataform repository.
163
169
  {
164
170
  "dependencies": {
165
171
  "@dataform/core": "3.0.42",
166
- "ga4-export-fixer": "0.7.1"
172
+ "ga4-export-fixer": "0.8.0"
167
173
  }
168
174
  }
169
175
  ```
@@ -465,10 +471,11 @@ itemListAttribution: { lookbackType: 'TIME', lookbackTimeMs: 86400000 }
465
471
  | ------------------------ | ------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------- |
466
472
  | `event_data` | yes | Extracted and shaped events from `sourceTable`, with date filtering and column promotions applied. *Unfiltered for the buffer-days range.* |
467
473
  | `session_data` | yes | Session-level aggregations (grouped by `session_id`). |
468
- | `item_list_attribution` | only when `itemListAttribution` is on | Per-event item attribution rows. |
469
- | `item_list_data` | only when `itemListAttribution` is on | Re-aggregated items with attributed list fields. |
470
- | `enhanced_events` | yes | The package's standard output shape (joined event_data + session_data + item_list_data, columns ordered, incremental date filter applied). The natural starting point for most custom CTEs. |
474
+ | `items_unnested` | only when `itemListAttribution` is on | Per-event item rows (one row per item per ecommerce event), with attribution window function applied. |
475
+ | `items_rebuilt` | only when `itemListAttribution` is on | Re-aggregated items with attributed list fields, joined back to events via `_item_row_id`. |
476
+ | `enhanced_events` | yes | The package's standard output shape (joined event_data + session_data + items_rebuilt, columns ordered, incremental date filter applied). The natural starting point for most custom CTEs. |
471
477
 
478
+ Example custom step using the raw SQL format:
472
479
 
473
480
  ```javascript
474
481
  // Add a content_group column derived from page.path
@@ -488,6 +495,28 @@ from enhanced_events`,
488
495
  ],
489
496
  ```
490
497
 
498
+ The same example in the structured shape:
499
+
500
+ ```javascript
501
+ customSteps: [
502
+ {
503
+ name: 'final',
504
+ select: {
505
+ columns: {
506
+ '[sql]passthrough': 'enhanced_events.*',
507
+ content_group: `case
508
+ when page.path like '/blog/%' then 'blog'
509
+ when page.path like '/products/%' then 'product'
510
+ when page.path = '/' then 'home'
511
+ else 'other'
512
+ end`,
513
+ },
514
+ },
515
+ from: 'enhanced_events',
516
+ },
517
+ ],
518
+ ```
519
+
491
520
  > **Note:** Custom columns aren't auto-documented. Use `dataformTableConfig.columns` to add descriptions — it's deep-merged with the package's defaults, so your keys are added or override matching defaults, and untouched defaults stay.
492
521
 
493
522
  > **Note:** Built-in assertions assume the package's standard schema. If your custom CTEs rename, drop, or filter rows in ways that break those assumptions, disable the affected assertions explicitly via the `assertions` config option.
@@ -1,262 +1,263 @@
1
- const { unnestEventParam } = require('./params');
2
-
3
- /**
4
- * SQL expression that builds a session ID by concatenating `user_pseudo_id` with the `ga_session_id` event parameter.
5
- */
6
- const sessionId = `concat(user_pseudo_id, (select value.int_value from unnest(event_params) where key = 'ga_session_id'))`;
7
-
8
- /*
9
- Ecommerce
10
- */
11
-
12
- /**
13
- * Fixes and normalizes the ecommerce struct extracted from GA4 event data.
14
- *
15
- * This helper returns a SQL expression that:
16
- * - Ensures `ecommerce.transaction_id` is set to NULL if it has the placeholder string '(not set)';
17
- * - For 'purchase' events, normalizes `ecommerce.purchase_revenue` by:
18
- * * Removing NaN values;
19
- * * Filling missing purchase revenue (an old GA4 bug) with the event parameter 'value', safely cast as FLOAT64;
20
- * - Leaves other fields in the ecommerce struct unchanged.
21
- *
22
- * The result is a new struct with the same shape as 'ecommerce' but with cleaned transaction_id and purchase_revenue.
23
- *
24
- * @returns {string} A SQL snippet for SELECT AS STRUCT ... REPLACE to normalize ecommerce fields.
25
- *
26
- * @example
27
- * fixEcommerceStruct()
28
- * // => SQL string that can be used in a SELECT list to normalize ecommerce columns
29
- */
30
- const fixEcommerceStruct = () => {
31
- return `(select as struct ecommerce.* replace(
32
- if(ecommerce.transaction_id <> '(not set)', ecommerce.transaction_id, null) as transaction_id,
33
- if(
34
- event_name = 'purchase',
35
- coalesce(
36
- -- fix possible NaN values
37
- if(is_nan(ecommerce.purchase_revenue), null, ecommerce.purchase_revenue),
38
- -- fix an old ga4 bug where purchase_revenue was missing
39
- safe_cast(${unnestEventParam('value')} as float64)
40
- ),
41
- null
42
- ) as purchase_revenue
43
- ))`;
44
- };
45
-
46
- /*
47
- Check if GA4 data is "final" and is not expected to change anymore
48
- */
49
-
50
- /**
51
- * Generates a SQL expression to determine whether GA4 export data can be considered "final" (not subject to further change).
52
- *
53
- * Two detection methods are supported:
54
- * - 'EXPORT_TYPE': Checks the table suffix; returns FALSE for intraday or "fresh" tables, TRUE for finalized data.
55
- * - 'DAY_THRESHOLD': Considers data final if a configurable number of days has passed since event_date.
56
- *
57
- * @param {'EXPORT_TYPE'|'DAY_THRESHOLD'} detectionMethod - The method to use for finality determination.
58
- * 'EXPORT_TYPE': Uses patterns in _table_suffix (e.g., 'intraday_%', 'fresh_%').
59
- * 'DAY_THRESHOLD': Uses date difference between the current date and event_date.
60
- * @param {number} [dayThreshold] - (Only for 'DAY_THRESHOLD') Number of days after which data is considered final. Required when detectionMethod is 'DAY_THRESHOLD'.
61
- * @returns {string} SQL expression that evaluates to TRUE if the data is final, otherwise FALSE.
62
- *
63
- * @throws {Error} If an unsupported detectionMethod is provided.
64
- *
65
- * @example
66
- * // Checks based on export type
67
- * isFinalData('EXPORT_TYPE')
68
- * // => "if(_table_suffix like 'intraday_%' or _table_suffix like 'fresh_%', false, true)"
69
- *
70
- * // Checks using a custom day threshold
71
- * isFinalData('DAY_THRESHOLD', 5)
72
- * // => "if(date_diff(current_date(), cast(event_date as date format 'YYYYMMDD'), day) > 5, true, false)"
73
- */
74
- const isFinalData = (detectionMethod, dayThreshold) => {
75
- if (detectionMethod !== 'EXPORT_TYPE' && detectionMethod !== 'DAY_THRESHOLD') {
76
- throw new Error(`isFinalData: Unsupported detectionMethod '${detectionMethod}'. Supported values are 'EXPORT_TYPE' and 'DAY_THRESHOLD'.`);
77
- }
78
-
79
- if (detectionMethod === 'DAY_THRESHOLD') {
80
- if (typeof dayThreshold === 'undefined') {
81
- throw new Error("isFinalData: 'dayThreshold' is required when using 'DAY_THRESHOLD' detectionMethod.");
82
- }
83
- if (!Number.isInteger(dayThreshold) || dayThreshold < 0) {
84
- throw new Error("isFinalData: 'dayThreshold' must be an integer greater than or equal to 0 when using 'DAY_THRESHOLD' detectionMethod.");
85
- }
86
- }
87
-
88
- if (detectionMethod === 'EXPORT_TYPE') {
89
- return 'if(_table_suffix like \'intraday_%\' or _table_suffix like \'fresh_%\', false, true)';
90
- }
91
-
92
- if (detectionMethod === 'DAY_THRESHOLD') {
93
- return `if(date_diff(current_date(), cast(event_date as date format 'YYYYMMDD'), day) > ${dayThreshold}, true, false)`;
94
- }
95
- };
96
-
97
- /**
98
- * Checks whether a given column name is part of the standard/expected GA4 BigQuery export columns.
99
- *
100
- * The list of recognized GA4 export columns is based on the official schema as of 2026-02-18.
101
- * This function can be used to filter or validate column names when processing GA4 data exports.
102
- *
103
- * @param {string} columnName - The name of the column to check.
104
- * @returns {boolean} True if the column name is a GA4 export column, otherwise false.
105
- */
106
- const isGa4ExportColumn = (columnName) => {
107
- // list updated 2026-02-18
108
- const ga4ExportColumns = [
109
- "event_date",
110
- "event_timestamp",
111
- "event_name",
112
- "event_params",
113
- "event_previous_timestamp",
114
- "event_value_in_usd",
115
- "event_bundle_sequence_id",
116
- "event_server_timestamp_offset",
117
- "user_id",
118
- "user_pseudo_id",
119
- "privacy_info",
120
- "user_properties",
121
- "user_first_touch_timestamp",
122
- "user_ltv",
123
- "device",
124
- "geo",
125
- "app_info",
126
- "traffic_source",
127
- "stream_id",
128
- "platform",
129
- "event_dimensions",
130
- "ecommerce",
131
- "items",
132
- "collected_traffic_source",
133
- "is_active_user",
134
- "batch_event_index",
135
- "batch_page_id",
136
- "batch_ordering_id",
137
- "session_traffic_source_last_click",
138
- "publisher"
139
- ];
140
- return ga4ExportColumns.includes(columnName);
141
- };
142
-
143
- /**
144
- * Generates a SQL CASE expression that determines the GA4 export type from a table suffix.
145
- *
146
- * Returns 'intraday' for suffixes like 'intraday_%', 'fresh' for 'fresh_%',
147
- * and 'daily' for 8-digit date suffixes (YYYYMMDD).
148
- *
149
- * @param {string} tableSuffix - SQL expression or column reference for the table suffix (e.g., '_table_suffix').
150
- * @returns {string} SQL CASE expression that evaluates to 'intraday', 'fresh', or 'daily'.
151
- */
152
- const getGa4ExportType = (tableSuffix) => {
153
- return `case
154
- when ${tableSuffix} like 'intraday_%' then 'intraday'
155
- when ${tableSuffix} like 'fresh_%' then 'fresh'
156
- when regexp_contains(${tableSuffix}, r'^\\d{8}$') then 'daily'
157
- end`;
158
- };
159
-
160
- /**
161
- * Generates a SQL LAST_VALUE window function that attributes item list fields
162
- * (item_list_name, item_list_id, item_list_index) from select_item/select_promotion
163
- * events to downstream ecommerce events using a lookback window.
164
- *
165
- * Returns a struct containing all three attributed fields via a single window sort.
166
- *
167
- * @param {'SESSION'|'TIME'} lookbackType - Window scope: session-based or time-based
168
- * @param {string} timestampColumn - Column to order by ('event_timestamp' or 'event_custom_timestamp')
169
- * @param {number} [lookbackTimeMs] - Lookback window in milliseconds (required when lookbackType is 'TIME')
170
- * @returns {string} SQL expression that evaluates to a struct with item_list_name, item_list_id, item_list_index
171
- */
172
- const itemListAttributionExpr = (lookbackType, timestampColumn, lookbackTimeMs) => {
173
- const selectEvents = `event_name in ('select_item', 'select_promotion')`;
174
- const structExpr = `struct(item.item_list_name, item.item_list_id, item.item_list_index)`;
175
-
176
- let partitionBy;
177
- let frameBounds;
178
-
179
- if (lookbackType === 'SESSION') {
180
- partitionBy = 'session_id, item.item_id';
181
- frameBounds = 'rows between unbounded preceding and current row';
182
- } else {
183
- // TIME-based: range window in microseconds
184
- const lookbackMicros = lookbackTimeMs * 1000;
185
- partitionBy = 'user_pseudo_id, item.item_id';
186
- frameBounds = `range between ${lookbackMicros} preceding and current row`;
187
- }
188
-
189
- return `last_value(
190
- if(${selectEvents}, ${structExpr}, null) ignore nulls
191
- ) over(
192
- partition by ${partitionBy}
193
- order by ${timestampColumn} asc
194
- ${frameBounds}
195
- )`;
196
- };
197
-
198
- /**
199
- * Generates a SQL expression for a deterministic hash-based row id used by the
200
- * item list attribution join. Only computed for events in `ecommerceEventsFilter`;
201
- * other events get NULL.
202
- *
203
- * The row_number() window keeps the id stable across CTE re-evaluations:
204
- * BigQuery may inline the CTE and re-run the window per reference, so without
205
- * a stable ordering the two sides of the downstream join could hash differently.
206
- * partition by event_name avoids a single-partition bottleneck.
207
- * Residual collisions (identical event_timestamp + identical items) are safe —
208
- * the rows are interchangeable, so arbitrary row number assignment between them
209
- * produces the same result.
210
- *
211
- * @param {string} ecommerceEventsFilter - Comma-separated, quoted list of event names
212
- * (e.g., "'purchase', 'add_to_cart'").
213
- * @returns {string} SQL expression that evaluates to the row id or NULL.
214
- */
215
- const itemListAttributionRowId = (ecommerceEventsFilter) => {
216
- return `if(
217
- event_name in (${ecommerceEventsFilter}),
218
- farm_fingerprint(concat(
219
- user_pseudo_id,
220
- cast(event_timestamp as string),
221
- event_name,
222
- to_json_string(items),
223
- cast(row_number() over(
224
- partition by event_name, user_pseudo_id
225
- order by event_timestamp, to_json_string(items)
226
- ) as string)
227
- )),
228
- null
229
- )`;
230
- };
231
-
232
- /**
233
- * Official GA4 ecommerce events that carry item data.
234
- * Based on: https://developers.google.com/analytics/devguides/collection/ga4/ecommerce
235
- */
236
- const ga4EcommerceEvents = [
237
- 'view_item_list',
238
- 'select_item',
239
- 'view_promotion',
240
- 'select_promotion',
241
- 'view_item',
242
- 'add_to_wishlist',
243
- 'add_to_cart',
244
- 'remove_from_cart',
245
- 'view_cart',
246
- 'begin_checkout',
247
- 'add_shipping_info',
248
- 'add_payment_info',
249
- 'purchase',
250
- 'refund',
251
- ];
252
-
253
- module.exports = {
254
- sessionId,
255
- fixEcommerceStruct,
256
- isFinalData,
257
- isGa4ExportColumn,
258
- getGa4ExportType,
259
- itemListAttributionExpr,
260
- itemListAttributionRowId,
261
- ga4EcommerceEvents
262
- };
1
+ const { unnestEventParam } = require('./params');
2
+
3
+ /**
4
+ * SQL expression that builds a session ID by concatenating `user_pseudo_id` with the `ga_session_id` event parameter.
5
+ */
6
+ const sessionId = `concat(user_pseudo_id, (select value.int_value from unnest(event_params) where key = 'ga_session_id'))`;
7
+
8
+ /*
9
+ Ecommerce
10
+ */
11
+
12
+ /**
13
+ * Fixes and normalizes the ecommerce struct extracted from GA4 event data.
14
+ *
15
+ * This helper returns a SQL expression that:
16
+ * - Ensures `ecommerce.transaction_id` is set to NULL if it has the placeholder string '(not set)';
17
+ * - For 'purchase' events, normalizes `ecommerce.purchase_revenue` by:
18
+ * * Removing NaN values;
19
+ * * Filling missing purchase revenue (an old GA4 bug) with the event parameter 'value', safely cast as FLOAT64;
20
+ * - Leaves other fields in the ecommerce struct unchanged.
21
+ *
22
+ * The result is a new struct with the same shape as 'ecommerce' but with cleaned transaction_id and purchase_revenue.
23
+ *
24
+ * @returns {string} A SQL snippet for SELECT AS STRUCT ... REPLACE to normalize ecommerce fields.
25
+ *
26
+ * @example
27
+ * fixEcommerceStruct()
28
+ * // => SQL string that can be used in a SELECT list to normalize ecommerce columns
29
+ */
30
+ const fixEcommerceStruct = () => {
31
+ return `(select as struct ecommerce.* replace(
32
+ if(ecommerce.transaction_id <> '(not set)', ecommerce.transaction_id, null) as transaction_id,
33
+ if(
34
+ event_name = 'purchase',
35
+ coalesce(
36
+ -- fix possible NaN values
37
+ if(is_nan(ecommerce.purchase_revenue), null, ecommerce.purchase_revenue),
38
+ -- fix an old ga4 bug where purchase_revenue was missing
39
+ safe_cast(${unnestEventParam('value')} as float64)
40
+ ),
41
+ null
42
+ ) as purchase_revenue
43
+ ))`;
44
+ };
45
+
46
+ /*
47
+ Check if GA4 data is "final" and is not expected to change anymore
48
+ */
49
+
50
+ /**
51
+ * Generates a SQL expression to determine whether GA4 export data can be considered "final" (not subject to further change).
52
+ *
53
+ * Two detection methods are supported:
54
+ * - 'EXPORT_TYPE': Checks the table suffix; returns FALSE for intraday or "fresh" tables, TRUE for finalized data.
55
+ * - 'DAY_THRESHOLD': Considers data final if a configurable number of days has passed since event_date.
56
+ *
57
+ * @param {'EXPORT_TYPE'|'DAY_THRESHOLD'} detectionMethod - The method to use for finality determination.
58
+ * 'EXPORT_TYPE': Uses patterns in _table_suffix (e.g., 'intraday_%', 'fresh_%').
59
+ * 'DAY_THRESHOLD': Uses date difference between the current date and event_date.
60
+ * @param {number} [dayThreshold] - (Only for 'DAY_THRESHOLD') Number of days after which data is considered final. Required when detectionMethod is 'DAY_THRESHOLD'.
61
+ * @returns {string} SQL expression that evaluates to TRUE if the data is final, otherwise FALSE.
62
+ *
63
+ * @throws {Error} If an unsupported detectionMethod is provided, or if dayThreshold is missing or is not an integer >= 0 when detectionMethod is 'DAY_THRESHOLD'.
64
+ *
65
+ * @example
66
+ * // Checks based on export type
67
+ * isFinalData('EXPORT_TYPE')
68
+ * // => "if(_table_suffix like 'intraday_%' or _table_suffix like 'fresh_%', false, true)"
69
+ *
70
+ * // Checks using a custom day threshold
71
+ * isFinalData('DAY_THRESHOLD', 5)
72
+ * // => "if(date_diff(current_date(), cast(event_date as date format 'YYYYMMDD'), day) > 5, true, false)"
73
+ */
74
+ const isFinalData = (detectionMethod, dayThreshold) => {
75
+ if (detectionMethod !== 'EXPORT_TYPE' && detectionMethod !== 'DAY_THRESHOLD') {
76
+ throw new Error(`isFinalData: Unsupported detectionMethod '${detectionMethod}'. Supported values are 'EXPORT_TYPE' and 'DAY_THRESHOLD'.`);
77
+ }
78
+
79
+ if (detectionMethod === 'DAY_THRESHOLD') {
80
+ if (typeof dayThreshold === 'undefined') {
81
+ throw new Error("isFinalData: 'dayThreshold' is required when using 'DAY_THRESHOLD' detectionMethod.");
82
+ }
83
+ if (!Number.isInteger(dayThreshold) || dayThreshold < 0) {
84
+ throw new Error("isFinalData: 'dayThreshold' must be an integer greater than or equal to 0 when using 'DAY_THRESHOLD' detectionMethod.");
85
+ }
86
+ }
87
+
88
+ if (detectionMethod === 'EXPORT_TYPE') {
89
+ return 'if(_table_suffix like \'intraday_%\' or _table_suffix like \'fresh_%\', false, true)';
90
+ }
91
+
92
+ if (detectionMethod === 'DAY_THRESHOLD') {
93
+ return `if(date_diff(current_date(), cast(event_date as date format 'YYYYMMDD'), day) > ${dayThreshold}, true, false)`;
94
+ }
95
+ };
96
+
97
+ /**
98
+ * Checks whether a given column name is part of the standard/expected GA4 BigQuery export columns.
99
+ *
100
+ * The list of recognized GA4 export columns is based on the official schema as of 2026-02-18.
101
+ * This function can be used to filter or validate column names when processing GA4 data exports.
102
+ *
103
+ * @param {string} columnName - The name of the column to check.
104
+ * @returns {boolean} True if the column name is a GA4 export column, otherwise false.
105
+ */
106
+ const isGa4ExportColumn = (columnName) => {
107
+ // list updated 2026-02-18
108
+ const ga4ExportColumns = [
109
+ "event_date",
110
+ "event_timestamp",
111
+ "event_name",
112
+ "event_params",
113
+ "event_previous_timestamp",
114
+ "event_value_in_usd",
115
+ "event_bundle_sequence_id",
116
+ "event_server_timestamp_offset",
117
+ "user_id",
118
+ "user_pseudo_id",
119
+ "privacy_info",
120
+ "user_properties",
121
+ "user_first_touch_timestamp",
122
+ "user_ltv",
123
+ "device",
124
+ "geo",
125
+ "app_info",
126
+ "traffic_source",
127
+ "stream_id",
128
+ "platform",
129
+ "event_dimensions",
130
+ "ecommerce",
131
+ "items",
132
+ "collected_traffic_source",
133
+ "is_active_user",
134
+ "batch_event_index",
135
+ "batch_page_id",
136
+ "batch_ordering_id",
137
+ "session_traffic_source_last_click",
138
+ "publisher"
139
+ ];
140
+ return ga4ExportColumns.includes(columnName);
141
+ };
142
+
143
+ /**
144
+ * Generates a SQL CASE expression that determines the GA4 export type from a table suffix.
145
+ *
146
+ * Returns 'intraday' for suffixes like 'intraday_%', 'fresh' for 'fresh_%',
147
+ * and 'daily' for 8-digit date suffixes (YYYYMMDD).
148
+ *
149
+ * @param {string} tableSuffix - SQL expression or column reference for the table suffix (e.g., '_table_suffix').
150
+ * @returns {string} SQL CASE expression that evaluates to 'intraday', 'fresh', or 'daily' (NULL when the suffix matches none of these patterns, since the CASE has no ELSE branch).
151
+ */
152
+ const getGa4ExportType = (tableSuffix) => {
153
+ return `case
154
+ when ${tableSuffix} like 'intraday_%' then 'intraday'
155
+ when ${tableSuffix} like 'fresh_%' then 'fresh'
156
+ when regexp_contains(${tableSuffix}, r'^\\d{8}$') then 'daily'
157
+ end`;
158
+ };
159
+
160
+ /**
161
+ * Generates a SQL LAST_VALUE window function that attributes item list fields
162
+ * (item_list_name, item_list_id, item_list_index) from select_item/select_promotion
163
+ * events to downstream ecommerce events using a lookback window.
164
+ *
165
+ * Returns a struct containing all three attributed fields via a single window sort.
166
+ *
167
+ * @param {'SESSION'|'TIME'} lookbackType - Window scope: session-based or time-based
168
+ * @param {string} timestampColumn - Column to order by ('event_timestamp' or 'event_custom_timestamp')
169
+ * @param {number} [lookbackTimeMs] - Lookback window in milliseconds (required when lookbackType is 'TIME')
170
+ * @returns {string} SQL expression that evaluates to a struct with item_list_name, item_list_id, item_list_index
171
+ */
172
+ const itemListAttributionExpr = (lookbackType, timestampColumn, lookbackTimeMs) => {
173
+ const selectEvents = `event_name in ('select_item', 'select_promotion')`;
174
+ const structExpr = `struct(item.item_list_name, item.item_list_id, item.item_list_index)`;
175
+
176
+ let partitionBy;
177
+ let frameBounds;
178
+
179
+ if (lookbackType === 'SESSION') {
180
+ partitionBy = 'session_id, item.item_id';
181
+ frameBounds = 'rows between unbounded preceding and current row';
182
+ } else {
183
+ // TIME-based: range window in microseconds
184
+ const lookbackMicros = lookbackTimeMs * 1000;
185
+ partitionBy = 'user_pseudo_id, item.item_id';
186
+ frameBounds = `range between ${lookbackMicros} preceding and current row`;
187
+ }
188
+
189
+ return `last_value(
190
+ if(${selectEvents}, ${structExpr}, null) ignore nulls
191
+ ) over(
192
+ partition by ${partitionBy}
193
+ order by ${timestampColumn} asc
194
+ ${frameBounds}
195
+ )`;
196
+ };
197
+
198
+ /**
199
+ * Generates a SQL expression for a deterministic hash-based row id used by the
200
+ * shared items_unnested / items_rebuilt scaffold (item-list attribution and,
201
+ * eventually, item-level data enrichments). Only computed for events in
202
+ * `ecommerceEventsFilter`; other events get NULL.
203
+ *
204
+ * The row_number() window keeps the id stable across CTE re-evaluations:
205
+ * BigQuery may inline the CTE and re-run the window per reference, so without
206
+ * a stable ordering the two sides of the downstream join could hash differently.
207
+ * Partitioning by event_name and user_pseudo_id avoids a single-partition bottleneck.
208
+ * Residual collisions (identical event_timestamp + identical items) are safe —
209
+ * the rows are interchangeable, so arbitrary row number assignment between them
210
+ * produces the same result.
211
+ *
212
+ * @param {string} ecommerceEventsFilter - Comma-separated, quoted list of event names
213
+ * (e.g., "'purchase', 'add_to_cart'").
214
+ * @returns {string} SQL expression that evaluates to the row id or NULL.
215
+ */
216
+ const itemRowId = (ecommerceEventsFilter) => {
217
+ return `if(
218
+ event_name in (${ecommerceEventsFilter}),
219
+ farm_fingerprint(concat(
220
+ user_pseudo_id,
221
+ cast(event_timestamp as string),
222
+ event_name,
223
+ to_json_string(items),
224
+ cast(row_number() over(
225
+ partition by event_name, user_pseudo_id
226
+ order by event_timestamp, to_json_string(items)
227
+ ) as string)
228
+ )),
229
+ null
230
+ )`;
231
+ };
232
+
233
+ /**
234
+ * Official GA4 ecommerce events that carry item data.
235
+ * Based on: https://developers.google.com/analytics/devguides/collection/ga4/ecommerce
236
+ */
237
+ const ga4EcommerceEvents = [
238
+ 'view_item_list',
239
+ 'select_item',
240
+ 'view_promotion',
241
+ 'select_promotion',
242
+ 'view_item',
243
+ 'add_to_wishlist',
244
+ 'add_to_cart',
245
+ 'remove_from_cart',
246
+ 'view_cart',
247
+ 'begin_checkout',
248
+ 'add_shipping_info',
249
+ 'add_payment_info',
250
+ 'purchase',
251
+ 'refund',
252
+ ];
253
+
254
+ module.exports = {
255
+ sessionId,
256
+ fixEcommerceStruct,
257
+ isFinalData,
258
+ isGa4ExportColumn,
259
+ getGa4ExportType,
260
+ itemListAttributionExpr,
261
+ itemRowId,
262
+ ga4EcommerceEvents
263
+ };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "ga4-export-fixer",
3
- "version": "0.8.0-dev.2",
3
+ "version": "0.9.0-dev.1",
4
4
  "description": "",
5
5
  "main": "index.js",
6
6
  "files": [
@@ -44,8 +44,8 @@
44
44
  },
45
45
  "homepage": "https://github.com/tanelytics/ga4-export-fixer#readme",
46
46
  "devDependencies": {
47
- "@google-cloud/bigquery": "^8.1.1",
48
- "@google-cloud/dataform": "^2.2.1",
49
- "dotenv": "^17.3.1"
47
+ "@google-cloud/bigquery": "^8.3.0",
48
+ "@google-cloud/dataform": "^2.2.2",
49
+ "dotenv": "^17.4.2"
50
50
  }
51
51
  }
@@ -230,7 +230,7 @@ const _generateEnhancedEventsSQL = (mergedConfig) => {
230
230
  // ecommerce
231
231
  ecommerce: helpers.fixEcommerceStruct('ecommerce'),
232
232
  items: 'items',
233
- _item_list_attribution_row_id: itemListAttribution ? helpers.itemListAttributionRowId(ecommerceEventsFilter) : undefined,
233
+ _item_row_id: itemListAttribution ? helpers.itemRowId(ecommerceEventsFilter) : undefined,
234
234
  // flag if the data is "final" and is not expected to change anymore
235
235
  data_is_final: helpers.isFinalData(mergedConfig.dataIsFinal.detectionMethod, mergedConfig.dataIsFinal.dayThreshold),
236
236
  export_type: helpers.getGa4ExportType('_table_suffix'),
@@ -268,9 +268,10 @@ ${excludedEventsSQL}`,
268
268
  'group by': 'session_id',
269
269
  };
270
270
 
271
- // item list attribution CTEs:
272
- // 1. item_list_unnest: unnest items from ecommerce events, compute attribution via window function
273
- // 2. item_list_data: re-aggregate items with attributed list fields
271
+ // Shared item-array CTEs (currently used by item-list attribution; will also be used by
272
+ // item-level data enrichments — see design_docs/planned/data-enrichments.md, Q16):
273
+ // 1. items_unnested: unnest items from ecommerce events, compute attribution via window function
274
+ // 2. items_rebuilt: re-aggregate items with attributed list fields
274
275
  const itemListSteps = itemListAttribution ? (() => {
275
276
  const attrExpr = helpers.itemListAttributionExpr(
276
277
  itemListAttribution.lookbackType,
@@ -279,12 +280,14 @@ ${excludedEventsSQL}`,
279
280
  );
280
281
  const passthroughEvents = `event_name in ('view_item_list', 'select_item', 'view_promotion', 'select_promotion')`;
281
282
 
282
- const attributionStep = {
283
- name: 'item_list_attribution',
283
+ const unnestedStep = {
284
+ name: 'items_unnested',
284
285
  select: {
285
286
  columns: {
286
- '_item_list_attribution_row_id': '_item_list_attribution_row_id',
287
+ '_item_row_id': '_item_row_id',
287
288
  'event_name': 'event_name',
289
+ // event_date is carried forward so that it can be used in data enrichment joins
290
+ 'event_date': 'event_date',
288
291
  'item': 'item',
289
292
  '_item_list_attr': attrExpr,
290
293
  },
@@ -293,11 +296,11 @@ ${excludedEventsSQL}`,
293
296
  where: `event_name in (${ecommerceEventsFilter})`,
294
297
  };
295
298
 
296
- const dataStep = {
297
- name: 'item_list_data',
299
+ const rebuiltStep = {
300
+ name: 'items_rebuilt',
298
301
  select: {
299
302
  columns: {
300
- '_item_list_attribution_row_id': '_item_list_attribution_row_id',
303
+ '_item_row_id': '_item_row_id',
301
304
  'items': `array_agg(
302
305
  (select as struct item.* replace(
303
306
  coalesce(if(${passthroughEvents}, item.item_list_name, _item_list_attr.item_list_name), '(not set)') as item_list_name,
@@ -307,21 +310,21 @@ ${excludedEventsSQL}`,
307
310
  )`,
308
311
  },
309
312
  },
310
- from: 'item_list_attribution',
311
- 'group by': '_item_list_attribution_row_id',
313
+ from: 'items_unnested',
314
+ 'group by': '_item_row_id',
312
315
  };
313
316
 
314
- return [attributionStep, dataStep];
317
+ return [unnestedStep, rebuiltStep];
315
318
  })() : null;
316
319
 
317
320
  const finalColumnOrder = getFinalColumnOrder(eventDataStep, sessionDataStep);
318
321
 
319
- // When item list attribution is enabled, override the items column and exclude _item_list_attribution_row_id
322
+ // When item list attribution is enabled, override the items column and exclude _item_row_id
320
323
  // COALESCE handles events without items (not in ecommerce filter) where the LEFT JOIN returns NULL
321
324
  const itemListOverrides = itemListSteps ? {
322
- items: 'coalesce(item_list_data.items, event_data.items)',
325
+ items: 'coalesce(items_rebuilt.items, event_data.items)',
323
326
  } : {};
324
- const itemListExcludedColumns = itemListSteps ? ['_item_list_attribution_row_id'] : [];
327
+ const itemListExcludedColumns = itemListSteps ? ['_item_row_id'] : [];
325
328
 
326
329
  // Join event_data and session_data, include additional logic
327
330
  // Named 'enhanced_events' so user-supplied customSteps can reference it as a stable handle.
@@ -360,8 +363,8 @@ ${excludedEventsSQL}`,
360
363
  joins: [
361
364
  ...(itemListSteps ? [{
362
365
  type: 'left',
363
- table: 'item_list_data',
364
- on: 'using(_item_list_attribution_row_id)'
366
+ table: 'items_rebuilt',
367
+ on: 'using(_item_row_id)'
365
368
  }] : []),
366
369
  {
367
370
  type: 'left',