ga4-export-fixer 0.8.0-dev.2 → 0.9.0-dev.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +35 -6
- package/helpers/ga4Transforms.js +263 -262
- package/package.json +4 -4
- package/tables/ga4EventsEnhanced/index.js +21 -18
package/README.md
CHANGED
|
@@ -128,6 +128,12 @@ The goal of the package is to **speed up development** when building data models
|
|
|
128
128
|
<b>🕐 Timezone-Aware Datetime</b><br>
|
|
129
129
|
<code>event_datetime</code> converted to a configurable IANA timezone
|
|
130
130
|
</td>
|
|
131
|
+
<td valign="top">
|
|
132
|
+
<b>🧩 Custom Processing Steps</b><br>
|
|
133
|
+
Append user-defined CTEs via <code>customSteps</code> to derive new columns or join external tables
|
|
134
|
+
</td>
|
|
135
|
+
</tr>
|
|
136
|
+
<tr>
|
|
131
137
|
<td valign="top">
|
|
132
138
|
<b>🛡️ Zero Dependencies</b><br>
|
|
133
139
|
No additional external dependencies added to your Dataform repository
|
|
@@ -139,10 +145,10 @@ The goal of the package is to **speed up development** when building data models
|
|
|
139
145
|
|
|
140
146
|
Features under consideration for future releases:
|
|
141
147
|
|
|
148
|
+
- Data enrichment (item-level, session-level, event-level)
|
|
149
|
+
- Aggregated tables (ga4_session, ga4_ecommerce...)
|
|
142
150
|
- Web and app specific default configurations
|
|
143
151
|
- Custom channel grouping
|
|
144
|
-
- Data enrichment (item-level, session-level, event-level)
|
|
145
|
-
- Custom processing steps (additional CTEs)
|
|
146
152
|
- Custom traffic source attribution
|
|
147
153
|
|
|
148
154
|
## Installation
|
|
@@ -163,7 +169,7 @@ Include the package in the package.json file in your Dataform repository.
|
|
|
163
169
|
{
|
|
164
170
|
"dependencies": {
|
|
165
171
|
"@dataform/core": "3.0.42",
|
|
166
|
-
"ga4-export-fixer": "0.
|
|
172
|
+
"ga4-export-fixer": "0.8.0"
|
|
167
173
|
}
|
|
168
174
|
}
|
|
169
175
|
```
|
|
@@ -465,10 +471,11 @@ itemListAttribution: { lookbackType: 'TIME', lookbackTimeMs: 86400000 }
|
|
|
465
471
|
| ------------------------ | ------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------- |
|
|
466
472
|
| `event_data` | yes | Extracted and shaped events from `sourceTable`, with date filtering and column promotions applied. *Unfiltered for the buffer-days range.* |
|
|
467
473
|
| `session_data` | yes | Session-level aggregations (grouped by `session_id`). |
|
|
468
|
-
| `
|
|
469
|
-
| `
|
|
470
|
-
| `enhanced_events` | yes | The package's standard output shape (joined event_data + session_data +
|
|
474
|
+
| `items_unnested` | only when `itemListAttribution` is on | Per-event item rows (one row per item per ecommerce event), with attribution window function applied. |
|
|
475
|
+
| `items_rebuilt` | only when `itemListAttribution` is on | Re-aggregated items with attributed list fields, joined back to events via `_item_row_id`. |
|
|
476
|
+
| `enhanced_events` | yes | The package's standard output shape (joined event_data + session_data + items_rebuilt, columns ordered, incremental date filter applied). The natural starting point for most custom CTEs. |
|
|
471
477
|
|
|
478
|
+
Example custom step using the raw SQL format:
|
|
472
479
|
|
|
473
480
|
```javascript
|
|
474
481
|
// Add a content_group column derived from page.path
|
|
@@ -488,6 +495,28 @@ from enhanced_events`,
|
|
|
488
495
|
],
|
|
489
496
|
```
|
|
490
497
|
|
|
498
|
+
The same example in the structured shape:
|
|
499
|
+
|
|
500
|
+
```javascript
|
|
501
|
+
customSteps: [
|
|
502
|
+
{
|
|
503
|
+
name: 'final',
|
|
504
|
+
select: {
|
|
505
|
+
columns: {
|
|
506
|
+
'[sql]passthrough': 'enhanced_events.*',
|
|
507
|
+
content_group: `case
|
|
508
|
+
when page.path like '/blog/%' then 'blog'
|
|
509
|
+
when page.path like '/products/%' then 'product'
|
|
510
|
+
when page.path = '/' then 'home'
|
|
511
|
+
else 'other'
|
|
512
|
+
end`,
|
|
513
|
+
},
|
|
514
|
+
},
|
|
515
|
+
from: 'enhanced_events',
|
|
516
|
+
},
|
|
517
|
+
],
|
|
518
|
+
```
|
|
519
|
+
|
|
491
520
|
> **Note:** Custom columns aren't auto-documented. Use `dataformTableConfig.columns` to add descriptions — it's deep-merged with the package's defaults, so your keys are added or override matching defaults, and untouched defaults stay.
|
|
492
521
|
|
|
493
522
|
> **Note:** Built-in assertions assume the package's standard schema. If your custom CTEs rename or drop columns, or filter rows, in ways that break those assumptions, disable the affected assertions explicitly via the `assertions` config option.
|
package/helpers/ga4Transforms.js
CHANGED
|
@@ -1,262 +1,263 @@
|
|
|
1
|
-
const { unnestEventParam } = require('./params');
|
|
2
|
-
|
|
3
|
-
/**
|
|
4
|
-
* SQL expression that builds a session ID by concatenating `user_pseudo_id` with the `ga_session_id` event parameter.
|
|
5
|
-
*/
|
|
6
|
-
const sessionId = `concat(user_pseudo_id, (select value.int_value from unnest(event_params) where key = 'ga_session_id'))`;
|
|
7
|
-
|
|
8
|
-
/*
|
|
9
|
-
Ecommerce
|
|
10
|
-
*/
|
|
11
|
-
|
|
12
|
-
/**
|
|
13
|
-
* Fixes and normalizes the ecommerce struct extracted from GA4 event data.
|
|
14
|
-
*
|
|
15
|
-
* This helper returns a SQL expression that:
|
|
16
|
-
* - Ensures `ecommerce.transaction_id` is set to NULL if it has the placeholder string '(not set)';
|
|
17
|
-
* - For 'purchase' events, normalizes `ecommerce.purchase_revenue` by:
|
|
18
|
-
* * Removing NaN values;
|
|
19
|
-
* * Filling missing purchase revenue (an old GA4 bug) with the event parameter 'value', safely cast as FLOAT64;
|
|
20
|
-
* - Leaves other fields in the ecommerce struct unchanged.
|
|
21
|
-
*
|
|
22
|
-
* The result is a new struct with the same shape as 'ecommerce' but with cleaned transaction_id and purchase_revenue.
|
|
23
|
-
*
|
|
24
|
-
* @returns {string} A SQL snippet for SELECT AS STRUCT ... REPLACE to normalize ecommerce fields.
|
|
25
|
-
*
|
|
26
|
-
* @example
|
|
27
|
-
* fixEcommerceStruct()
|
|
28
|
-
* // => SQL string that can be used in a SELECT list to normalize ecommerce columns
|
|
29
|
-
*/
|
|
30
|
-
const fixEcommerceStruct = () => {
|
|
31
|
-
return `(select as struct ecommerce.* replace(
|
|
32
|
-
if(ecommerce.transaction_id <> '(not set)', ecommerce.transaction_id, null) as transaction_id,
|
|
33
|
-
if(
|
|
34
|
-
event_name = 'purchase',
|
|
35
|
-
coalesce(
|
|
36
|
-
-- fix possible NaN values
|
|
37
|
-
if(is_nan(ecommerce.purchase_revenue), null, ecommerce.purchase_revenue),
|
|
38
|
-
-- fix an old ga4 bug where purchase_revenue was missing
|
|
39
|
-
safe_cast(${unnestEventParam('value')} as float64)
|
|
40
|
-
),
|
|
41
|
-
null
|
|
42
|
-
) as purchase_revenue
|
|
43
|
-
))`;
|
|
44
|
-
};
|
|
45
|
-
|
|
46
|
-
/*
|
|
47
|
-
Check if GA4 data is "final" and is not expected to change anymore
|
|
48
|
-
*/
|
|
49
|
-
|
|
50
|
-
/**
|
|
51
|
-
* Generates a SQL expression to determine whether GA4 export data can be considered "final" (not subject to further change).
|
|
52
|
-
*
|
|
53
|
-
* Two detection methods are supported:
|
|
54
|
-
* - 'EXPORT_TYPE': Checks the table suffix; returns FALSE for intraday or "fresh" tables, TRUE for finalized data.
|
|
55
|
-
* - 'DAY_THRESHOLD': Considers data final if a configurable number of days has passed since event_date.
|
|
56
|
-
*
|
|
57
|
-
* @param {'EXPORT_TYPE'|'DAY_THRESHOLD'} detectionMethod - The method to use for finality determination.
|
|
58
|
-
* 'EXPORT_TYPE': Uses patterns in _table_suffix (e.g., 'intraday_%', 'fresh_%').
|
|
59
|
-
* 'DAY_THRESHOLD': Uses date difference between the current date and event_date.
|
|
60
|
-
* @param {number} [dayThreshold] - (Only for 'DAY_THRESHOLD') Number of days after which data is considered final. Required when detectionMethod is 'DAY_THRESHOLD'.
|
|
61
|
-
* @returns {string} SQL expression that evaluates to TRUE if the data is final, otherwise FALSE.
|
|
62
|
-
*
|
|
63
|
-
* @throws {Error} If an unsupported detectionMethod is provided.
|
|
64
|
-
*
|
|
65
|
-
* @example
|
|
66
|
-
* // Checks based on export type
|
|
67
|
-
* isFinalData('EXPORT_TYPE')
|
|
68
|
-
* // => "if(_table_suffix like 'intraday_%' or _table_suffix like 'fresh_%', false, true)"
|
|
69
|
-
*
|
|
70
|
-
* // Checks using a custom day threshold
|
|
71
|
-
* isFinalData('DAY_THRESHOLD', 5)
|
|
72
|
-
* // => "if(date_diff(current_date(), cast(event_date as date format 'YYYYMMDD'), day) > 5, true, false)"
|
|
73
|
-
*/
|
|
74
|
-
const isFinalData = (detectionMethod, dayThreshold) => {
|
|
75
|
-
if (detectionMethod !== 'EXPORT_TYPE' && detectionMethod !== 'DAY_THRESHOLD') {
|
|
76
|
-
throw new Error(`isFinalData: Unsupported detectionMethod '${detectionMethod}'. Supported values are 'EXPORT_TYPE' and 'DAY_THRESHOLD'.`);
|
|
77
|
-
}
|
|
78
|
-
|
|
79
|
-
if (detectionMethod === 'DAY_THRESHOLD') {
|
|
80
|
-
if (typeof dayThreshold === 'undefined') {
|
|
81
|
-
throw new Error("isFinalData: 'dayThreshold' is required when using 'DAY_THRESHOLD' detectionMethod.");
|
|
82
|
-
}
|
|
83
|
-
if (!Number.isInteger(dayThreshold) || dayThreshold < 0) {
|
|
84
|
-
throw new Error("isFinalData: 'dayThreshold' must be an integer greater than or equal to 0 when using 'DAY_THRESHOLD' detectionMethod.");
|
|
85
|
-
}
|
|
86
|
-
}
|
|
87
|
-
|
|
88
|
-
if (detectionMethod === 'EXPORT_TYPE') {
|
|
89
|
-
return 'if(_table_suffix like \'intraday_%\' or _table_suffix like \'fresh_%\', false, true)';
|
|
90
|
-
}
|
|
91
|
-
|
|
92
|
-
if (detectionMethod === 'DAY_THRESHOLD') {
|
|
93
|
-
return `if(date_diff(current_date(), cast(event_date as date format 'YYYYMMDD'), day) > ${dayThreshold}, true, false)`;
|
|
94
|
-
}
|
|
95
|
-
};
|
|
96
|
-
|
|
97
|
-
/**
|
|
98
|
-
* Checks whether a given column name is part of the standard/expected GA4 BigQuery export columns.
|
|
99
|
-
*
|
|
100
|
-
* The list of recognized GA4 export columns is based on the official schema as of 2026-02-18.
|
|
101
|
-
* This function can be used to filter or validate column names when processing GA4 data exports.
|
|
102
|
-
*
|
|
103
|
-
* @param {string} columnName - The name of the column to check.
|
|
104
|
-
* @returns {boolean} True if the column name is a GA4 export column, otherwise false.
|
|
105
|
-
*/
|
|
106
|
-
const isGa4ExportColumn = (columnName) => {
|
|
107
|
-
// list updated 2026-02-18
|
|
108
|
-
const ga4ExportColumns = [
|
|
109
|
-
"event_date",
|
|
110
|
-
"event_timestamp",
|
|
111
|
-
"event_name",
|
|
112
|
-
"event_params",
|
|
113
|
-
"event_previous_timestamp",
|
|
114
|
-
"event_value_in_usd",
|
|
115
|
-
"event_bundle_sequence_id",
|
|
116
|
-
"event_server_timestamp_offset",
|
|
117
|
-
"user_id",
|
|
118
|
-
"user_pseudo_id",
|
|
119
|
-
"privacy_info",
|
|
120
|
-
"user_properties",
|
|
121
|
-
"user_first_touch_timestamp",
|
|
122
|
-
"user_ltv",
|
|
123
|
-
"device",
|
|
124
|
-
"geo",
|
|
125
|
-
"app_info",
|
|
126
|
-
"traffic_source",
|
|
127
|
-
"stream_id",
|
|
128
|
-
"platform",
|
|
129
|
-
"event_dimensions",
|
|
130
|
-
"ecommerce",
|
|
131
|
-
"items",
|
|
132
|
-
"collected_traffic_source",
|
|
133
|
-
"is_active_user",
|
|
134
|
-
"batch_event_index",
|
|
135
|
-
"batch_page_id",
|
|
136
|
-
"batch_ordering_id",
|
|
137
|
-
"session_traffic_source_last_click",
|
|
138
|
-
"publisher"
|
|
139
|
-
];
|
|
140
|
-
return ga4ExportColumns.includes(columnName);
|
|
141
|
-
};
|
|
142
|
-
|
|
143
|
-
/**
|
|
144
|
-
* Generates a SQL CASE expression that determines the GA4 export type from a table suffix.
|
|
145
|
-
*
|
|
146
|
-
* Returns 'intraday' for suffixes like 'intraday_%', 'fresh' for 'fresh_%',
|
|
147
|
-
* and 'daily' for 8-digit date suffixes (YYYYMMDD).
|
|
148
|
-
*
|
|
149
|
-
* @param {string} tableSuffix - SQL expression or column reference for the table suffix (e.g., '_table_suffix').
|
|
150
|
-
* @returns {string} SQL CASE expression that evaluates to 'intraday', 'fresh', or 'daily'.
|
|
151
|
-
*/
|
|
152
|
-
const getGa4ExportType = (tableSuffix) => {
|
|
153
|
-
return `case
|
|
154
|
-
when ${tableSuffix} like 'intraday_%' then 'intraday'
|
|
155
|
-
when ${tableSuffix} like 'fresh_%' then 'fresh'
|
|
156
|
-
when regexp_contains(${tableSuffix}, r'^\\d{8}$') then 'daily'
|
|
157
|
-
end`;
|
|
158
|
-
};
|
|
159
|
-
|
|
160
|
-
/**
|
|
161
|
-
* Generates a SQL LAST_VALUE window function that attributes item list fields
|
|
162
|
-
* (item_list_name, item_list_id, item_list_index) from select_item/select_promotion
|
|
163
|
-
* events to downstream ecommerce events using a lookback window.
|
|
164
|
-
*
|
|
165
|
-
* Returns a struct containing all three attributed fields via a single window sort.
|
|
166
|
-
*
|
|
167
|
-
* @param {'SESSION'|'TIME'} lookbackType - Window scope: session-based or time-based
|
|
168
|
-
* @param {string} timestampColumn - Column to order by ('event_timestamp' or 'event_custom_timestamp')
|
|
169
|
-
* @param {number} [lookbackTimeMs] - Lookback window in milliseconds (required when lookbackType is 'TIME')
|
|
170
|
-
* @returns {string} SQL expression that evaluates to a struct with item_list_name, item_list_id, item_list_index
|
|
171
|
-
*/
|
|
172
|
-
const itemListAttributionExpr = (lookbackType, timestampColumn, lookbackTimeMs) => {
|
|
173
|
-
const selectEvents = `event_name in ('select_item', 'select_promotion')`;
|
|
174
|
-
const structExpr = `struct(item.item_list_name, item.item_list_id, item.item_list_index)`;
|
|
175
|
-
|
|
176
|
-
let partitionBy;
|
|
177
|
-
let frameBounds;
|
|
178
|
-
|
|
179
|
-
if (lookbackType === 'SESSION') {
|
|
180
|
-
partitionBy = 'session_id, item.item_id';
|
|
181
|
-
frameBounds = 'rows between unbounded preceding and current row';
|
|
182
|
-
} else {
|
|
183
|
-
// TIME-based: range window in microseconds
|
|
184
|
-
const lookbackMicros = lookbackTimeMs * 1000;
|
|
185
|
-
partitionBy = 'user_pseudo_id, item.item_id';
|
|
186
|
-
frameBounds = `range between ${lookbackMicros} preceding and current row`;
|
|
187
|
-
}
|
|
188
|
-
|
|
189
|
-
return `last_value(
|
|
190
|
-
if(${selectEvents}, ${structExpr}, null) ignore nulls
|
|
191
|
-
) over(
|
|
192
|
-
partition by ${partitionBy}
|
|
193
|
-
order by ${timestampColumn} asc
|
|
194
|
-
${frameBounds}
|
|
195
|
-
)`;
|
|
196
|
-
};
|
|
197
|
-
|
|
198
|
-
/**
|
|
199
|
-
* Generates a SQL expression for a deterministic hash-based row id used by the
|
|
200
|
-
*
|
|
201
|
-
*
|
|
202
|
-
*
|
|
203
|
-
*
|
|
204
|
-
*
|
|
205
|
-
*
|
|
206
|
-
*
|
|
207
|
-
*
|
|
208
|
-
*
|
|
209
|
-
*
|
|
210
|
-
*
|
|
211
|
-
*
|
|
212
|
-
*
|
|
213
|
-
*
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
*
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
'
|
|
239
|
-
'
|
|
240
|
-
'
|
|
241
|
-
'
|
|
242
|
-
'
|
|
243
|
-
'
|
|
244
|
-
'
|
|
245
|
-
'
|
|
246
|
-
'
|
|
247
|
-
'
|
|
248
|
-
'
|
|
249
|
-
'
|
|
250
|
-
'
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
1
|
+
const { unnestEventParam } = require('./params');
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* SQL expression that builds a session ID by concatenating `user_pseudo_id` with the `ga_session_id` event parameter.
|
|
5
|
+
*/
|
|
6
|
+
const sessionId = `concat(user_pseudo_id, (select value.int_value from unnest(event_params) where key = 'ga_session_id'))`;
|
|
7
|
+
|
|
8
|
+
/*
|
|
9
|
+
Ecommerce
|
|
10
|
+
*/
|
|
11
|
+
|
|
12
|
+
/**
|
|
13
|
+
* Fixes and normalizes the ecommerce struct extracted from GA4 event data.
|
|
14
|
+
*
|
|
15
|
+
* This helper returns a SQL expression that:
|
|
16
|
+
* - Ensures `ecommerce.transaction_id` is set to NULL if it has the placeholder string '(not set)';
|
|
17
|
+
* - For 'purchase' events, normalizes `ecommerce.purchase_revenue` by:
|
|
18
|
+
* * Removing NaN values;
|
|
19
|
+
* * Filling missing purchase revenue (an old GA4 bug) with the event parameter 'value', safely cast as FLOAT64;
|
|
20
|
+
* - Leaves other fields in the ecommerce struct unchanged.
|
|
21
|
+
*
|
|
22
|
+
* The result is a new struct with the same shape as 'ecommerce' but with cleaned transaction_id and purchase_revenue.
|
|
23
|
+
*
|
|
24
|
+
* @returns {string} A SQL snippet for SELECT AS STRUCT ... REPLACE to normalize ecommerce fields.
|
|
25
|
+
*
|
|
26
|
+
* @example
|
|
27
|
+
* fixEcommerceStruct()
|
|
28
|
+
* // => SQL string that can be used in a SELECT list to normalize ecommerce columns
|
|
29
|
+
*/
|
|
30
|
+
const fixEcommerceStruct = () => {
|
|
31
|
+
return `(select as struct ecommerce.* replace(
|
|
32
|
+
if(ecommerce.transaction_id <> '(not set)', ecommerce.transaction_id, null) as transaction_id,
|
|
33
|
+
if(
|
|
34
|
+
event_name = 'purchase',
|
|
35
|
+
coalesce(
|
|
36
|
+
-- fix possible NaN values
|
|
37
|
+
if(is_nan(ecommerce.purchase_revenue), null, ecommerce.purchase_revenue),
|
|
38
|
+
-- fix an old ga4 bug where purchase_revenue was missing
|
|
39
|
+
safe_cast(${unnestEventParam('value')} as float64)
|
|
40
|
+
),
|
|
41
|
+
null
|
|
42
|
+
) as purchase_revenue
|
|
43
|
+
))`;
|
|
44
|
+
};
|
|
45
|
+
|
|
46
|
+
/*
|
|
47
|
+
Check if GA4 data is "final" and is not expected to change anymore
|
|
48
|
+
*/
|
|
49
|
+
|
|
50
|
+
/**
|
|
51
|
+
* Generates a SQL expression to determine whether GA4 export data can be considered "final" (not subject to further change).
|
|
52
|
+
*
|
|
53
|
+
* Two detection methods are supported:
|
|
54
|
+
* - 'EXPORT_TYPE': Checks the table suffix; returns FALSE for intraday or "fresh" tables, TRUE for finalized data.
|
|
55
|
+
* - 'DAY_THRESHOLD': Considers data final if a configurable number of days has passed since event_date.
|
|
56
|
+
*
|
|
57
|
+
* @param {'EXPORT_TYPE'|'DAY_THRESHOLD'} detectionMethod - The method to use for finality determination.
|
|
58
|
+
* 'EXPORT_TYPE': Uses patterns in _table_suffix (e.g., 'intraday_%', 'fresh_%').
|
|
59
|
+
* 'DAY_THRESHOLD': Uses date difference between the current date and event_date.
|
|
60
|
+
* @param {number} [dayThreshold] - (Only for 'DAY_THRESHOLD') Number of days after which data is considered final. Required when detectionMethod is 'DAY_THRESHOLD'.
|
|
61
|
+
* @returns {string} SQL expression that evaluates to TRUE if the data is final, otherwise FALSE.
|
|
62
|
+
*
|
|
63
|
+
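* @throws {Error} If an unsupported detectionMethod is provided, or if dayThreshold is missing or is not a non-negative integer when detectionMethod is 'DAY_THRESHOLD'.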
|
|
64
|
+
*
|
|
65
|
+
* @example
|
|
66
|
+
* // Checks based on export type
|
|
67
|
+
* isFinalData('EXPORT_TYPE')
|
|
68
|
+
* // => "if(_table_suffix like 'intraday_%' or _table_suffix like 'fresh_%', false, true)"
|
|
69
|
+
*
|
|
70
|
+
* // Checks using a custom day threshold
|
|
71
|
+
* isFinalData('DAY_THRESHOLD', 5)
|
|
72
|
+
* // => "if(date_diff(current_date(), cast(event_date as date format 'YYYYMMDD'), day) > 5, true, false)"
|
|
73
|
+
*/
|
|
74
|
+
const isFinalData = (detectionMethod, dayThreshold) => {
|
|
75
|
+
if (detectionMethod !== 'EXPORT_TYPE' && detectionMethod !== 'DAY_THRESHOLD') {
|
|
76
|
+
throw new Error(`isFinalData: Unsupported detectionMethod '${detectionMethod}'. Supported values are 'EXPORT_TYPE' and 'DAY_THRESHOLD'.`);
|
|
77
|
+
}
|
|
78
|
+
|
|
79
|
+
if (detectionMethod === 'DAY_THRESHOLD') {
|
|
80
|
+
if (typeof dayThreshold === 'undefined') {
|
|
81
|
+
throw new Error("isFinalData: 'dayThreshold' is required when using 'DAY_THRESHOLD' detectionMethod.");
|
|
82
|
+
}
|
|
83
|
+
if (!Number.isInteger(dayThreshold) || dayThreshold < 0) {
|
|
84
|
+
throw new Error("isFinalData: 'dayThreshold' must be an integer greater than or equal to 0 when using 'DAY_THRESHOLD' detectionMethod.");
|
|
85
|
+
}
|
|
86
|
+
}
|
|
87
|
+
|
|
88
|
+
if (detectionMethod === 'EXPORT_TYPE') {
|
|
89
|
+
return 'if(_table_suffix like \'intraday_%\' or _table_suffix like \'fresh_%\', false, true)';
|
|
90
|
+
}
|
|
91
|
+
|
|
92
|
+
if (detectionMethod === 'DAY_THRESHOLD') {
|
|
93
|
+
return `if(date_diff(current_date(), cast(event_date as date format 'YYYYMMDD'), day) > ${dayThreshold}, true, false)`;
|
|
94
|
+
}
|
|
95
|
+
};
|
|
96
|
+
|
|
97
|
+
/**
|
|
98
|
+
* Checks whether a given column name is part of the standard/expected GA4 BigQuery export columns.
|
|
99
|
+
*
|
|
100
|
+
* The list of recognized GA4 export columns is based on the official schema as of 2026-02-18.
|
|
101
|
+
* This function can be used to filter or validate column names when processing GA4 data exports.
|
|
102
|
+
*
|
|
103
|
+
* @param {string} columnName - The name of the column to check.
|
|
104
|
+
* @returns {boolean} True if the column name is a GA4 export column, otherwise false.
|
|
105
|
+
*/
|
|
106
|
+
const isGa4ExportColumn = (columnName) => {
|
|
107
|
+
// list updated 2026-02-18
|
|
108
|
+
const ga4ExportColumns = [
|
|
109
|
+
"event_date",
|
|
110
|
+
"event_timestamp",
|
|
111
|
+
"event_name",
|
|
112
|
+
"event_params",
|
|
113
|
+
"event_previous_timestamp",
|
|
114
|
+
"event_value_in_usd",
|
|
115
|
+
"event_bundle_sequence_id",
|
|
116
|
+
"event_server_timestamp_offset",
|
|
117
|
+
"user_id",
|
|
118
|
+
"user_pseudo_id",
|
|
119
|
+
"privacy_info",
|
|
120
|
+
"user_properties",
|
|
121
|
+
"user_first_touch_timestamp",
|
|
122
|
+
"user_ltv",
|
|
123
|
+
"device",
|
|
124
|
+
"geo",
|
|
125
|
+
"app_info",
|
|
126
|
+
"traffic_source",
|
|
127
|
+
"stream_id",
|
|
128
|
+
"platform",
|
|
129
|
+
"event_dimensions",
|
|
130
|
+
"ecommerce",
|
|
131
|
+
"items",
|
|
132
|
+
"collected_traffic_source",
|
|
133
|
+
"is_active_user",
|
|
134
|
+
"batch_event_index",
|
|
135
|
+
"batch_page_id",
|
|
136
|
+
"batch_ordering_id",
|
|
137
|
+
"session_traffic_source_last_click",
|
|
138
|
+
"publisher"
|
|
139
|
+
];
|
|
140
|
+
return ga4ExportColumns.includes(columnName);
|
|
141
|
+
};
|
|
142
|
+
|
|
143
|
+
/**
|
|
144
|
+
* Generates a SQL CASE expression that determines the GA4 export type from a table suffix.
|
|
145
|
+
*
|
|
146
|
+
* Returns 'intraday' for suffixes like 'intraday_%', 'fresh' for 'fresh_%',
|
|
147
|
+
* and 'daily' for 8-digit date suffixes (YYYYMMDD).
|
|
148
|
+
*
|
|
149
|
+
* @param {string} tableSuffix - SQL expression or column reference for the table suffix (e.g., '_table_suffix').
|
|
150
|
+
* @returns {string} SQL CASE expression that evaluates to 'intraday', 'fresh', or 'daily'.
|
|
151
|
+
*/
|
|
152
|
+
const getGa4ExportType = (tableSuffix) => {
|
|
153
|
+
return `case
|
|
154
|
+
when ${tableSuffix} like 'intraday_%' then 'intraday'
|
|
155
|
+
when ${tableSuffix} like 'fresh_%' then 'fresh'
|
|
156
|
+
when regexp_contains(${tableSuffix}, r'^\\d{8}$') then 'daily'
|
|
157
|
+
end`;
|
|
158
|
+
};
|
|
159
|
+
|
|
160
|
+
/**
|
|
161
|
+
* Generates a SQL LAST_VALUE window function that attributes item list fields
|
|
162
|
+
* (item_list_name, item_list_id, item_list_index) from select_item/select_promotion
|
|
163
|
+
* events to downstream ecommerce events using a lookback window.
|
|
164
|
+
*
|
|
165
|
+
* Returns a struct containing all three attributed fields via a single window sort.
|
|
166
|
+
*
|
|
167
|
+
* @param {'SESSION'|'TIME'} lookbackType - Window scope: session-based or time-based
|
|
168
|
+
* @param {string} timestampColumn - Column to order by ('event_timestamp' or 'event_custom_timestamp')
|
|
169
|
+
* @param {number} [lookbackTimeMs] - Lookback window in milliseconds (required when lookbackType is 'TIME')
|
|
170
|
+
* @returns {string} SQL expression that evaluates to a struct with item_list_name, item_list_id, item_list_index
|
|
171
|
+
*/
|
|
172
|
+
const itemListAttributionExpr = (lookbackType, timestampColumn, lookbackTimeMs) => {
|
|
173
|
+
const selectEvents = `event_name in ('select_item', 'select_promotion')`;
|
|
174
|
+
const structExpr = `struct(item.item_list_name, item.item_list_id, item.item_list_index)`;
|
|
175
|
+
|
|
176
|
+
let partitionBy;
|
|
177
|
+
let frameBounds;
|
|
178
|
+
|
|
179
|
+
if (lookbackType === 'SESSION') {
|
|
180
|
+
partitionBy = 'session_id, item.item_id';
|
|
181
|
+
frameBounds = 'rows between unbounded preceding and current row';
|
|
182
|
+
} else {
|
|
183
|
+
// TIME-based: range window in microseconds
|
|
184
|
+
const lookbackMicros = lookbackTimeMs * 1000;
|
|
185
|
+
partitionBy = 'user_pseudo_id, item.item_id';
|
|
186
|
+
frameBounds = `range between ${lookbackMicros} preceding and current row`;
|
|
187
|
+
}
|
|
188
|
+
|
|
189
|
+
return `last_value(
|
|
190
|
+
if(${selectEvents}, ${structExpr}, null) ignore nulls
|
|
191
|
+
) over(
|
|
192
|
+
partition by ${partitionBy}
|
|
193
|
+
order by ${timestampColumn} asc
|
|
194
|
+
${frameBounds}
|
|
195
|
+
)`;
|
|
196
|
+
};
|
|
197
|
+
|
|
198
|
+
/**
|
|
199
|
+
* Generates a SQL expression for a deterministic hash-based row id used by the
|
|
200
|
+
* shared items_unnested / items_rebuilt scaffold (item-list attribution and,
|
|
201
|
+
* eventually, item-level data enrichments). Only computed for events in
|
|
202
|
+
* `ecommerceEventsFilter`; other events get NULL.
|
|
203
|
+
*
|
|
204
|
+
* The row_number() window keeps the id stable across CTE re-evaluations:
|
|
205
|
+
* BigQuery may inline the CTE and re-run the window per reference, so without
|
|
206
|
+
* a stable ordering the two sides of the downstream join could hash differently.
|
|
207
|
+
* Partitioning by event_name and user_pseudo_id avoids a single-partition bottleneck.
|
|
208
|
+
* Residual collisions (identical event_timestamp + identical items) are safe —
|
|
209
|
+
* the rows are interchangeable, so arbitrary row number assignment between them
|
|
210
|
+
* produces the same result.
|
|
211
|
+
*
|
|
212
|
+
* @param {string} ecommerceEventsFilter - Comma-separated, quoted list of event names
|
|
213
|
+
* (e.g., "'purchase', 'add_to_cart'").
|
|
214
|
+
* @returns {string} SQL expression that evaluates to the row id or NULL.
|
|
215
|
+
*/
|
|
216
|
+
const itemRowId = (ecommerceEventsFilter) => {
|
|
217
|
+
return `if(
|
|
218
|
+
event_name in (${ecommerceEventsFilter}),
|
|
219
|
+
farm_fingerprint(concat(
|
|
220
|
+
user_pseudo_id,
|
|
221
|
+
cast(event_timestamp as string),
|
|
222
|
+
event_name,
|
|
223
|
+
to_json_string(items),
|
|
224
|
+
cast(row_number() over(
|
|
225
|
+
partition by event_name, user_pseudo_id
|
|
226
|
+
order by event_timestamp, to_json_string(items)
|
|
227
|
+
) as string)
|
|
228
|
+
)),
|
|
229
|
+
null
|
|
230
|
+
)`;
|
|
231
|
+
};
|
|
232
|
+
|
|
233
|
+
/**
|
|
234
|
+
* Official GA4 ecommerce events that carry item data.
|
|
235
|
+
* Based on: https://developers.google.com/analytics/devguides/collection/ga4/ecommerce
|
|
236
|
+
*/
|
|
237
|
+
const ga4EcommerceEvents = [
|
|
238
|
+
'view_item_list',
|
|
239
|
+
'select_item',
|
|
240
|
+
'view_promotion',
|
|
241
|
+
'select_promotion',
|
|
242
|
+
'view_item',
|
|
243
|
+
'add_to_wishlist',
|
|
244
|
+
'add_to_cart',
|
|
245
|
+
'remove_from_cart',
|
|
246
|
+
'view_cart',
|
|
247
|
+
'begin_checkout',
|
|
248
|
+
'add_shipping_info',
|
|
249
|
+
'add_payment_info',
|
|
250
|
+
'purchase',
|
|
251
|
+
'refund',
|
|
252
|
+
];
|
|
253
|
+
|
|
254
|
+
module.exports = {
|
|
255
|
+
sessionId,
|
|
256
|
+
fixEcommerceStruct,
|
|
257
|
+
isFinalData,
|
|
258
|
+
isGa4ExportColumn,
|
|
259
|
+
getGa4ExportType,
|
|
260
|
+
itemListAttributionExpr,
|
|
261
|
+
itemRowId,
|
|
262
|
+
ga4EcommerceEvents
|
|
263
|
+
};
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "ga4-export-fixer",
|
|
3
|
-
"version": "0.
|
|
3
|
+
"version": "0.9.0-dev.1",
|
|
4
4
|
"description": "",
|
|
5
5
|
"main": "index.js",
|
|
6
6
|
"files": [
|
|
@@ -44,8 +44,8 @@
|
|
|
44
44
|
},
|
|
45
45
|
"homepage": "https://github.com/tanelytics/ga4-export-fixer#readme",
|
|
46
46
|
"devDependencies": {
|
|
47
|
-
"@google-cloud/bigquery": "^8.
|
|
48
|
-
"@google-cloud/dataform": "^2.2.
|
|
49
|
-
"dotenv": "^17.
|
|
47
|
+
"@google-cloud/bigquery": "^8.3.0",
|
|
48
|
+
"@google-cloud/dataform": "^2.2.2",
|
|
49
|
+
"dotenv": "^17.4.2"
|
|
50
50
|
}
|
|
51
51
|
}
|
package/tables/ga4EventsEnhanced/index.js
CHANGED
|
@@ -230,7 +230,7 @@ const _generateEnhancedEventsSQL = (mergedConfig) => {
|
|
|
230
230
|
// ecommerce
|
|
231
231
|
ecommerce: helpers.fixEcommerceStruct('ecommerce'),
|
|
232
232
|
items: 'items',
|
|
233
|
-
|
|
233
|
+
_item_row_id: itemListAttribution ? helpers.itemRowId(ecommerceEventsFilter) : undefined,
|
|
234
234
|
// flag if the data is "final" and is not expected to change anymore
|
|
235
235
|
data_is_final: helpers.isFinalData(mergedConfig.dataIsFinal.detectionMethod, mergedConfig.dataIsFinal.dayThreshold),
|
|
236
236
|
export_type: helpers.getGa4ExportType('_table_suffix'),
|
|
@@ -268,9 +268,10 @@ ${excludedEventsSQL}`,
|
|
|
268
268
|
'group by': 'session_id',
|
|
269
269
|
};
|
|
270
270
|
|
|
271
|
-
// item list attribution
|
|
272
|
-
//
|
|
273
|
-
//
|
|
271
|
+
// Shared item-array CTEs (currently used by item-list attribution; will also be used by
|
|
272
|
+
// item-level data enrichments — see design_docs/planned/data-enrichments.md, Q16):
|
|
273
|
+
// 1. items_unnested: unnest items from ecommerce events, compute attribution via window function
|
|
274
|
+
// 2. items_rebuilt: re-aggregate items with attributed list fields
|
|
274
275
|
const itemListSteps = itemListAttribution ? (() => {
|
|
275
276
|
const attrExpr = helpers.itemListAttributionExpr(
|
|
276
277
|
itemListAttribution.lookbackType,
|
|
@@ -279,12 +280,14 @@ ${excludedEventsSQL}`,
|
|
|
279
280
|
);
|
|
280
281
|
const passthroughEvents = `event_name in ('view_item_list', 'select_item', 'view_promotion', 'select_promotion')`;
|
|
281
282
|
|
|
282
|
-
const
|
|
283
|
-
name: '
|
|
283
|
+
const unnestedStep = {
|
|
284
|
+
name: 'items_unnested',
|
|
284
285
|
select: {
|
|
285
286
|
columns: {
|
|
286
|
-
'
|
|
287
|
+
'_item_row_id': '_item_row_id',
|
|
287
288
|
'event_name': 'event_name',
|
|
289
|
+
// event_date is carried forward so it can be used in data enrichment joins
|
|
290
|
+
'event_date': 'event_date',
|
|
288
291
|
'item': 'item',
|
|
289
292
|
'_item_list_attr': attrExpr,
|
|
290
293
|
},
|
|
@@ -293,11 +296,11 @@ ${excludedEventsSQL}`,
|
|
|
293
296
|
where: `event_name in (${ecommerceEventsFilter})`,
|
|
294
297
|
};
|
|
295
298
|
|
|
296
|
-
const
|
|
297
|
-
name: '
|
|
299
|
+
const rebuiltStep = {
|
|
300
|
+
name: 'items_rebuilt',
|
|
298
301
|
select: {
|
|
299
302
|
columns: {
|
|
300
|
-
'
|
|
303
|
+
'_item_row_id': '_item_row_id',
|
|
301
304
|
'items': `array_agg(
|
|
302
305
|
(select as struct item.* replace(
|
|
303
306
|
coalesce(if(${passthroughEvents}, item.item_list_name, _item_list_attr.item_list_name), '(not set)') as item_list_name,
|
|
@@ -307,21 +310,21 @@ ${excludedEventsSQL}`,
|
|
|
307
310
|
)`,
|
|
308
311
|
},
|
|
309
312
|
},
|
|
310
|
-
from: '
|
|
311
|
-
'group by': '
|
|
313
|
+
from: 'items_unnested',
|
|
314
|
+
'group by': '_item_row_id',
|
|
312
315
|
};
|
|
313
316
|
|
|
314
|
-
return [
|
|
317
|
+
return [unnestedStep, rebuiltStep];
|
|
315
318
|
})() : null;
|
|
316
319
|
|
|
317
320
|
const finalColumnOrder = getFinalColumnOrder(eventDataStep, sessionDataStep);
|
|
318
321
|
|
|
319
|
-
// When item list attribution is enabled, override the items column and exclude
|
|
322
|
+
// When item list attribution is enabled, override the items column and exclude _item_row_id
|
|
320
323
|
// COALESCE handles events without items (not in ecommerce filter) where the LEFT JOIN returns NULL
|
|
321
324
|
const itemListOverrides = itemListSteps ? {
|
|
322
|
-
items: 'coalesce(
|
|
325
|
+
items: 'coalesce(items_rebuilt.items, event_data.items)',
|
|
323
326
|
} : {};
|
|
324
|
-
const itemListExcludedColumns = itemListSteps ? ['
|
|
327
|
+
const itemListExcludedColumns = itemListSteps ? ['_item_row_id'] : [];
|
|
325
328
|
|
|
326
329
|
// Join event_data and session_data, include additional logic
|
|
327
330
|
// Named 'enhanced_events' so user-supplied customSteps can reference it as a stable handle.
|
|
@@ -360,8 +363,8 @@ ${excludedEventsSQL}`,
|
|
|
360
363
|
joins: [
|
|
361
364
|
...(itemListSteps ? [{
|
|
362
365
|
type: 'left',
|
|
363
|
-
table: '
|
|
364
|
-
on: 'using(
|
|
366
|
+
table: 'items_rebuilt',
|
|
367
|
+
on: 'using(_item_row_id)'
|
|
365
368
|
}] : []),
|
|
366
369
|
{
|
|
367
370
|
type: 'left',
|