ga4-export-fixer 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +20 -0
- package/README.md +0 -0
- package/constants.js +9 -0
- package/helpers.js +720 -0
- package/index.js +7 -0
- package/package.json +32 -0
- package/preOperations.js +194 -0
- package/tables/ga4EventsEnhanced.js +445 -0
- package/utils.js +611 -0
package/helpers.js
ADDED
|
@@ -0,0 +1,720 @@
|
|
|
1
|
+
const constants = require('./constants');
|
|
2
|
+
|
|
3
|
+
/*
|
|
4
|
+
Unnesting parameters
|
|
5
|
+
*/
|
|
6
|
+
|
|
7
|
+
// unnest any parameter from the selected params array
|
|
8
|
+
/**
 * Builds a BigQuery SQL snippet that extracts a single parameter value from a
 * GA4-style key/value parameter array (e.g. event_params).
 *
 * @param {string} keyName - Parameter key to look up; must be a non-empty string.
 * @param {string} paramsArray - SQL reference to the array to unnest; must be a non-empty string.
 * @param {string} [dataType] - Optional value type: 'string', 'int'/'int64', 'double', or 'float'/'float64'.
 *   When omitted, the snippet coalesces all typed value columns and casts the result to string.
 * @returns {string} SQL subquery selecting the requested value.
 * @throws {Error} On an invalid keyName/paramsArray or an unsupported dataType.
 */
const unnestParam = (keyName, paramsArray, dataType) => {
  if (typeof keyName !== 'string' || keyName.trim() === '') {
    throw new Error("unnestParam: 'keyName' is required and must be a non-empty string.");
  }
  if (typeof paramsArray !== 'string' || paramsArray.trim() === '') {
    throw new Error("unnestParam: 'paramsArray' is required and must be a non-empty string.");
  }

  if (dataType) {
    // map each supported data type name to its typed value column
    const valueColumns = {
      string: 'string_value',
      int: 'int_value',
      int64: 'int_value',
      double: 'double_value',
      float: 'float_value',
      float64: 'float_value',
    };
    const valueColumn = valueColumns[dataType];
    if (!valueColumn) {
      throw new Error(`unnestParam: Unsupported dataType '${dataType}'. Supported values are 'string', 'int', 'int64', 'double', 'float', and 'float64'.`);
    }
    return `(select value.${valueColumn} from unnest(${paramsArray}) where key = '${keyName}')`;
  }

  // no explicit type: take whichever typed column has data, cast as string
  return `(select coalesce(value.string_value, cast(value.int_value as string), cast(value.double_value as string), cast(value.float_value as string)) from unnest(${paramsArray}) where key = '${keyName}')`;
};
|
|
34
|
+
|
|
35
|
+
// event_params
|
|
36
|
+
|
|
37
|
+
// unnest a param from the event_params array
|
|
38
|
+
/**
 * Convenience wrapper around unnestParam for the event_params array.
 *
 * @param {string} keyName - Parameter key to look up.
 * @param {string} [dataType] - Optional value type, forwarded to unnestParam.
 * @returns {string} SQL subquery selecting the value from event_params.
 */
const unnestEventParam = (keyName, dataType) => unnestParam(keyName, 'event_params', dataType);
|
|
41
|
+
|
|
42
|
+
/*
|
|
43
|
+
Common identifiers
|
|
44
|
+
*/
|
|
45
|
+
|
|
46
|
+
// Pseudo session identifier: user_pseudo_id combined with the ga_session_id
// event parameter (which is only unique per user, not globally).
// NOTE: the int_value must be cast to string explicitly - BigQuery's concat()
// accepts only all-STRING (or all-BYTES) arguments and does not implicitly
// coerce INT64, so concatenating the raw int_value raises a SQL error.
const sessionId = `concat(user_pseudo_id, cast((select value.int_value from unnest(event_params) where key = 'ga_session_id') as string))`;

/*
  Date and time
*/

// event_date is exported as a 'YYYYMMDD' string; parse it into a DATE
const eventDate = `cast(event_date as date format 'YYYYMMDD')`;
|
|
53
|
+
|
|
54
|
+
// get the most accurate event timestamp
/**
 * Builds a SQL expression yielding the event timestamp in microseconds.
 *
 * When customTimestampParameter is given, it names an event parameter whose
 * int value is a JS timestamp in milliseconds (e.g. collected via Date.now());
 * the expression multiplies it by 1000 to get microseconds and falls back to
 * the exported event_timestamp field when the parameter is absent or null.
 *
 * @param {string} [customTimestampParameter] - Event parameter holding a millisecond timestamp.
 * @returns {string} SQL expression for the event timestamp in microseconds.
 * @throws {Error} If customTimestampParameter is defined but not a non-empty string.
 */
const getEventTimestampMicros = (customTimestampParameter) => {
  const hasCustomParam = typeof customTimestampParameter !== 'undefined';
  const isValidName = typeof customTimestampParameter === 'string' && customTimestampParameter.trim() !== '';

  if (hasCustomParam && !isValidName) {
    throw new Error("getEventTimestampMicros: customTimestampParameter must be undefined or a non-empty string.");
  }

  // validation guarantees a non-empty string whenever the parameter is defined
  return hasCustomParam
    ? `coalesce((select value.int_value from unnest(event_params) where key = '${customTimestampParameter}')*1000, event_timestamp)`
    : 'event_timestamp';
};
|
|
77
|
+
|
|
78
|
+
// datetime in the local time zone
/**
 * Builds a SQL expression for the event's local DATETIME in a given time zone,
 * always derived from the exported event_timestamp field (no custom
 * timestamp parameter is used).
 *
 * @param {Object} [config] - Optional configuration.
 * @param {string} [config.timezone] - IANA time zone name (e.g. 'Europe/Helsinki'); defaults to 'Etc/UTC'.
 * @returns {string} SQL expression extracting the local datetime of the event.
 *
 * @example
 * getEventDateTime({ timezone: 'Europe/Helsinki' })
 * // => "extract(datetime from timestamp_micros(event_timestamp) at time zone 'Europe/Helsinki')"
 */
const getEventDateTime = (config) => {
  const tz = (config && config.timezone) || 'Etc/UTC';
  return `extract(datetime from timestamp_micros(${getEventTimestampMicros()}) at time zone '${tz}')`;
};
|
|
99
|
+
|
|
100
|
+
// Filter the export tables by date range
/**
 * Builds a SQL condition restricting _table_suffix to a date range for one
 * GA4 export type.
 *
 * - 'daily': suffixes are plain YYYYMMDD dates (e.g. 20240101).
 * - 'intraday': suffixes are 'intraday_' followed by the date (e.g. intraday_20240101).
 *
 * @param {'intraday'|'daily'} exportType - Export table type to match.
 * @param {string} start - SQL date expression for the range start (e.g. 'current_date()-1').
 * @param {string} end - SQL date expression for the range end (e.g. 'current_date()').
 * @returns {string} SQL condition on _table_suffix for the given range and type.
 * @throws {Error} For an unsupported exportType or undefined start/end.
 *
 * @example
 * ga4ExportDateFilter('daily', 'current_date()-1', 'current_date()')
 * // => "(_table_suffix >= cast(current_date()-1 as string format \"YYYYMMDD\") and _table_suffix <= cast(current_date() as string format \"YYYYMMDD\"))"
 */
const ga4ExportDateFilter = (exportType, start, end) => {
  if (exportType !== 'intraday' && exportType !== 'daily') {
    throw new Error(
      `ga4ExportDateFilter: Unsupported exportType '${exportType}'. Supported values are 'intraday' and 'daily'.`
    );
  }
  if (typeof start === 'undefined' || typeof end === 'undefined') {
    throw new Error("ga4ExportDateFilter: 'start' and 'end' parameters must be defined.");
  }

  // render both range boundaries as YYYYMMDD suffix strings
  const startSuffix = `cast(${start} as string format "YYYYMMDD")`;
  const endSuffix = `cast(${end} as string format "YYYYMMDD")`;

  if (exportType === 'intraday') {
    return `(_table_suffix >= 'intraday_' || ${startSuffix} and _table_suffix <= 'intraday_' || ${endSuffix})`;
  }
  return `(_table_suffix >= ${startSuffix} and _table_suffix <= ${endSuffix})`;
};
|
|
141
|
+
|
|
142
|
+
// Filter the export tables by date range for both intraday and daily exports
/**
 * Builds a composite SQL condition restricting GA4 export tables (daily and,
 * optionally, intraday) to the configured date range.
 *
 * Date sources, in priority order:
 * - Test mode (config.test): testConfig.dateRangeStart / testConfig.dateRangeEnd.
 * - Incremental mode (config.incremental): variable placeholders from constants.
 * - Otherwise (full refresh): static values from config.preOperations.
 *
 * bufferDays widens the daily export's start boundary so sessions spanning a
 * date boundary are fully captured; intraday tables are only included when
 * config.includedExportTypes.intraday is true.
 *
 * @param {Object} config - Configuration governing the date filtering logic.
 * @param {boolean} [config.test] - Use test configuration dates.
 * @param {Object} [config.testConfig] - Holds dateRangeStart and dateRangeEnd for tests.
 * @param {boolean} [config.incremental] - Use incremental variable placeholders.
 * @param {Object} [config.preOperations] - Holds dateRangeStartFullRefresh and dateRangeEnd.
 * @param {Object} [config.includedExportTypes] - Holds the intraday boolean flag.
 * @param {number} [config.bufferDays] - Buffer days applied to the daily start boundary.
 * @returns {string} SQL condition as a string that can be injected into a WHERE clause.
 */
const ga4ExportDateFilters = (config) => {
  const bufferDays = config.bufferDays || 0;

  // resolve the range start by mode: test > incremental > full refresh
  const getStartDate = () => {
    if (config.test) {
      return config.testConfig.dateRangeStart;
    }
    if (config.incremental) {
      return constants.DATE_RANGE_START_VARIABLE;
    }
    return config.preOperations.dateRangeStartFullRefresh;
  };

  // resolve the range end by the same mode priority
  const getEndDate = () => {
    if (config.test) {
      return config.testConfig.dateRangeEnd;
    }
    if (config.incremental) {
      return constants.DATE_RANGE_END_VARIABLE;
    }
    return config.preOperations.dateRangeEnd;
  };

  const start = getStartDate();
  const end = getEndDate();
  const intradayStart = config.test ? config.testConfig.dateRangeStart : constants.INTRADAY_DATE_RANGE_START_VARIABLE;

  // includedExportTypes is documented as optional, so guard the dereference
  // with optional chaining instead of throwing a TypeError when it is absent
  const includeIntraday = config.includedExportTypes?.intraday;

  return `(
    ${ga4ExportDateFilter('daily', `${start}-${bufferDays}`, end)}
    ${includeIntraday ? `or ${ga4ExportDateFilter('intraday', intradayStart, end)}` : ''}
  )`;
};
|
|
204
|
+
|
|
205
|
+
/**
 * Builds a SQL condition restricting event data to the configured date range.
 *
 * Date sources, in priority order:
 * 1. Test mode (config.test): explicit testConfig dates.
 * 2. Incremental refresh (config.incremental): variable placeholders
 *    (constants.DATE_RANGE_START_VARIABLE / constants.DATE_RANGE_END_VARIABLE).
 * 3. Full refresh (default): static dates from config.preOperations. Static
 *    values keep BigQuery cost estimation accurate for non-incremental queries.
 *
 * @param {Object} config - Configuration controlling the date filter logic.
 * @param {boolean} [config.test] - If true, uses explicit test dates.
 * @param {Object} [config.testConfig] - Holds dateRangeStart and dateRangeEnd for testing.
 * @param {boolean} [config.incremental] - If true, uses variable placeholders for incremental queries.
 * @param {Object} [config.preOperations] - Holds the full-refresh date range values.
 * @returns {string} SQL condition string to filter the query by event_date.
 */
const finalDataFilter = (config) => {
  let start;
  let end;

  if (config.test) {
    start = config.testConfig.dateRangeStart;
    end = config.testConfig.dateRangeEnd;
  } else if (config.incremental) {
    start = constants.DATE_RANGE_START_VARIABLE;
    end = constants.DATE_RANGE_END_VARIABLE;
  } else {
    start = config.preOperations.dateRangeStartFullRefresh;
    end = config.preOperations.dateRangeEnd;
  }

  return `(event_date >= ${start} and event_date <= ${end})`;
};
|
|
241
|
+
|
|
242
|
+
/*
|
|
243
|
+
Page details
|
|
244
|
+
*/
|
|
245
|
+
|
|
246
|
+
/**
 * Builds a BigQuery SQL expression extracting the hostname from a URL.
 *
 * The expression first strips a leading http:// or https:// scheme with
 * regexp_replace, then captures everything up to the first '/' with
 * regexp_extract.
 *
 * Example usage (in SQL context):
 *   SELECT ${extractUrlHostname('my_url_column')} AS hostname
 *
 * @param {string} url - SQL expression or column reference containing the URL.
 * @returns {string} BigQuery SQL expression yielding the hostname.
 */
const extractUrlHostname = (url) => {
  // drop the scheme first so the hostname sits at the start of the string
  const schemeless = `regexp_replace(
      ${url},
      r'^https?://',
      ''
    )`;
  return `regexp_extract(
    ${schemeless},
    r'^[^/]+'
  )`;
};
|
|
269
|
+
|
|
270
|
+
/**
 * Builds a BigQuery SQL expression extracting the path component from a URL.
 *
 * The expression strips the scheme and hostname, removes any query ('?...')
 * or fragment ('#...') tail, and trims surrounding whitespace.
 *
 * Example usage (in SQL context):
 *   SELECT ${extractUrlPath('my_url_column')} AS path
 *
 * @param {string} url - SQL expression or column reference containing the URL.
 * @returns {string} BigQuery SQL expression yielding the URL path.
 */
const extractUrlPath = (url) => {
  // remove scheme + hostname, leaving the path plus any query/fragment tail
  const pathWithTail = `regexp_replace(
        ${url},
        r'^https?://[^/]+',
        ''
      )`;
  // cut the query/fragment tail and trim the result
  return `trim(
    regexp_replace(
      ${pathWithTail},
      r'[\\?#].*',
      ''
    )
  )`;
};
|
|
297
|
+
|
|
298
|
+
/**
 * Builds a BigQuery SQL expression extracting the query string from a URL.
 *
 * regexp_extract captures the part starting at '?' and stops before any '#'
 * fragment; trim removes surrounding whitespace from the match.
 *
 * Example usage (in SQL context):
 *   SELECT ${extractUrlQuery('my_url_column')} AS url_query
 *
 * @param {string} url - SQL expression or column reference containing the URL.
 * @returns {string} BigQuery SQL expression yielding the query string, including the leading '?' if present.
 */
const extractUrlQuery = (url) => {
  // '\\?' matches a literal '?'; '[^#]+' stops before a '#' fragment
  const queryPattern = `r'\\?[^#]+'`;
  return `trim(regexp_extract(${url}, ${queryPattern}))`;
};
|
|
314
|
+
|
|
315
|
+
/**
 * Builds a BigQuery SQL expression parsing a URL's query parameters into an
 * array of key/value structs.
 *
 * The query string (obtained via extractUrlQuery) is split on '&' into
 * key=value pairs, and each pair is split on '=' using safe_offset so
 * malformed pairs yield NULL fields instead of errors.
 *
 * Example usage (in SQL context):
 *   SELECT ${extractUrlQueryParams('my_url_column')} AS query_params
 *
 * Output schema:
 *   ARRAY<STRUCT<key STRING, value STRING>>
 *
 * @param {string} url - SQL expression or column reference containing the URL.
 * @returns {string} BigQuery SQL expression producing an array of key/value structs.
 */
const extractUrlQueryParams = (url) => {
  // one row per '&'-separated key=value pair of the query string
  const pairs = `unnest(
        split(
          ${extractUrlQuery(url)},
          '&'
        )
      ) as keyval`;
  return `array(
    (
      select
        as struct split(keyval, '=') [safe_offset(0)] as key,
        split(keyval, '=') [safe_offset(1)] as value
      from
        ${pairs}
    )
  )`;
};
|
|
349
|
+
|
|
350
|
+
/**
 * Builds a BigQuery SQL expression yielding a STRUCT of page details for a URL.
 *
 * Struct fields:
 * - hostname: the hostname part of the URL (e.g. 'www.example.com')
 * - path: the path portion of the URL (e.g. '/about/team')
 * - query: the raw query string including the leading '?', if present
 * - query_params: ARRAY<STRUCT<key STRING, value STRING>> parsed from the query
 *
 * When no URL expression is given, the 'page_location' event parameter is used.
 *
 * Example usage (in SQL context):
 *   SELECT ${extractPageDetails('my_url_column')} AS page_details
 *
 * @param {string} [url] - Optional SQL expression or column reference for the URL;
 *   defaults to unnesting the 'page_location' event parameter as a string.
 * @returns {string} BigQuery SQL expression yielding the page-details struct.
 */
const extractPageDetails = (url) => {
  // fall back to the page_location event parameter when no URL is supplied
  const pageUrl = url || unnestEventParam('page_location', 'string');

  return `(select as struct
    ${extractUrlHostname(pageUrl)} as hostname,
    ${extractUrlPath(pageUrl)} as path,
    ${extractUrlQuery(pageUrl)} as query,
    ${extractUrlQueryParams(pageUrl)} as query_params
  )`;
};
|
|
387
|
+
|
|
388
|
+
/*
|
|
389
|
+
Handling event and session parameters
|
|
390
|
+
*/
|
|
391
|
+
|
|
392
|
+
// filter the event_params array by the selected parameters
/**
 * Builds a SQL expression filtering the event_params array by parameter keys.
 *
 * - 'include': keeps only the listed keys. An empty list yields an expression
 *   returning an empty array (the previous implementation produced invalid
 *   SQL, "key in ()", for this case).
 * - 'exclude': removes the listed keys; an empty list returns event_params unchanged.
 *
 * @param {string[]} params - Parameter keys to include or exclude (may be empty).
 * @param {'include'|'exclude'} filterType - Whether to keep or drop the listed keys.
 * @returns {string} SQL expression producing the filtered parameter array.
 * @throws {Error} If params is not an array of strings or filterType is unsupported.
 */
const filterEventParams = (params, filterType) => {
  if (!Array.isArray(params) || !params.every(p => typeof p === 'string')) {
    throw new Error("filterEventParams: 'params' must be an array of strings (empty array allowed).");
  }

  if (filterType !== 'include' && filterType !== 'exclude') {
    throw new Error("filterEventParams: 'filterType' must be 'include' or 'exclude'.");
  }

  if (params.length === 0) {
    // 'key in ()' is not valid SQL, so handle empty lists explicitly:
    // including nothing yields an empty array, excluding nothing is a no-op
    return filterType === 'include'
      ? 'array(select as struct * from unnest(event_params) where false)'
      : 'event_params';
  }

  const filterParams = params.map(p => `'${p}'`).join(', ');

  if (filterType === 'include') {
    return `array(select as struct * from unnest(event_params) where key in (${filterParams}))`;
  }

  return `array(select as struct * from unnest(event_params) where key not in (${filterParams}))`;
};
|
|
416
|
+
|
|
417
|
+
/**
 * Generates a BigQuery SQL expression that aggregates the named session
 * parameters across events, returning for each parameter the last non-null
 * struct ordered by timestampColumn descending. When a parameter never
 * appears, a placeholder struct with null values for every typed field is
 * emitted instead, so all requested keys are always present in the output.
 *
 * The resulting SQL yields
 * ARRAY<STRUCT<key STRING, value STRUCT<string_value STRING, int_value INT64, float_value FLOAT64, double_value FLOAT64>>>.
 *
 * @param {string[]} paramNames - Parameter names (keys) to aggregate; an empty array is allowed.
 * @param {string} paramsArray - SQL expression or column reference for the parameter array to aggregate.
 * @param {string} timestampColumn - SQL expression or column used to order values ('last' wins).
 * @returns {string} SQL expression producing the array of parameter structs.
 * @throws {Error} If any argument fails validation.
 */
const aggregateSessionParams = (paramNames, paramsArray, timestampColumn) => {
  // Validate paramNames
  if (!Array.isArray(paramNames) || !paramNames.every(p => typeof p === 'string')) {
    throw new Error("aggregateSessionParams: 'paramNames' must be an array of strings (empty array allowed).");
  }
  // Validate paramsArray
  if (typeof paramsArray !== 'string' || paramsArray.trim() === '') {
    throw new Error("aggregateSessionParams: 'paramsArray' must be a non-empty string reference to a SQL field or expression.");
  }
  // Validate timestampColumn
  if (typeof timestampColumn !== 'string' || timestampColumn.trim() === '') {
    throw new Error("aggregateSessionParams: 'timestampColumn' must be a non-empty string reference to a SQL field or expression.");
  }

  if (paramNames.length > 0) {
    // one ifnull(...) expression per requested parameter key
    const sessionParamStructs = paramNames.map(p => {
      return `ifnull(
        -- get the last non-null value for the parameter
        array_agg(
          (select as struct * from unnest(${paramsArray}) where key = '${p}') ignore nulls
          order by ${timestampColumn} desc
          limit 1
        )[safe_offset(0)],
        -- if no value is found, return a dummy value
        (
          select as struct
            '${p}' as key,
            (
              select as struct
                cast(null as string) as string_value,
                cast(null as int64) as int_value,
                cast(null as float64) as float_value,
                cast(null as float64) as double_value
            ) as value
        )
      )`;
    });

    // assemble the per-key expressions into a SQL array literal
    return `[
      ${sessionParamStructs.join(',\n      ')}
    ]`;
  } else {
    // declare the session_params in the schema even if no session params are specified
    return `cast([] as array<struct<key string, value struct<string_value string, int_value int64, float_value float64, double_value float64>>>)`;
  }
};
|
|
478
|
+
|
|
479
|
+
/**
 * Builds a SQL expression returning only the parameter structs whose value has
 * at least one non-null typed field (string_value, int_value, float_value, or
 * double_value); entries whose value is entirely null are dropped.
 *
 * Useful for cleaning up session_params or event_params arrays.
 *
 * @param {string} paramsArray - Name of the array field or SQL expression to unnest
 *   (e.g. 'session_params' or 'event_params').
 * @returns {string} SQL expression yielding an array of non-null parameter structs.
 * @throws {Error} If paramsArray is not a non-empty string.
 *
 * @example
 * excludeNullSessionParams('session_params')
 * // => "array(select as struct * from unnest(session_params) where value.string_value is not null or value.int_value is not null or value.float_value is not null or value.double_value is not null)"
 */
const excludeNullSessionParams = (paramsArray) => {
  const isValidRef = typeof paramsArray === 'string' && paramsArray.trim() !== '';
  if (!isValidRef) {
    throw new Error("excludeNullSessionParams: 'paramsArray' is required and must be a non-empty string.");
  }

  // a struct is kept when any of its typed value fields is populated
  const hasAnyValue = [
    'value.string_value is not null',
    'value.int_value is not null',
    'value.float_value is not null',
    'value.double_value is not null',
  ].join(' or ');

  return `array(select as struct * from unnest(${paramsArray}) where ${hasAnyValue})`;
};
|
|
502
|
+
|
|
503
|
+
/*
|
|
504
|
+
Aggregation
|
|
505
|
+
*/
|
|
506
|
+
|
|
507
|
+
/**
 * Generates a SQL aggregation expression for a column.
 *
 * Supported aggregation types:
 * - 'max': maximum value of the column.
 * - 'min': minimum value of the column.
 * - 'first': first non-null value, ordered by timestampColumn ascending.
 * - 'last': last non-null value, ordered by timestampColumn descending.
 * - 'any': any value of the column (BigQuery's any_value).
 *
 * timestampColumn is only used for ordering and is therefore only required for
 * 'first' and 'last' (a backward-compatible relaxation: previously it was
 * required for every aggregation type even though it was unused).
 *
 * @param {string} column - The name of the column to aggregate.
 * @param {string} aggregateType - Type of aggregation ('max', 'min', 'first', 'last', or 'any').
 * @param {string} [timestampColumn] - Ordering column, required when aggregateType is 'first' or 'last'.
 * @returns {string} A SQL expression for the requested aggregation.
 * @throws {Error} If column is undefined, timestampColumn is missing for 'first'/'last',
 *   or an unsupported aggregateType is provided.
 *
 * @example
 * aggregateValue('user_id', 'last', 'event_timestamp')
 * // => SQL expression for the last user_id by event_timestamp.
 */
const aggregateValue = (column, aggregateType, timestampColumn) => {
  if (typeof column === 'undefined') {
    throw new Error("aggregateValue: 'column' is a required parameter and must be defined.");
  }

  switch (aggregateType) {
    case 'max':
      return `max(${column})`;
    case 'min':
      return `min(${column})`;
    case 'any':
      return `any_value(${column})`;
    case 'first':
    case 'last': {
      // only ordered aggregations need the ordering column
      if (typeof timestampColumn === 'undefined') {
        throw new Error("aggregateValue: 'timestampColumn' is required when aggregateType is 'first' or 'last'.");
      }
      const direction = aggregateType === 'last' ? ' desc' : '';
      return `array_agg(
        ${column} ignore nulls
        order by ${timestampColumn}${direction}
        limit 1
      )[safe_offset(0)]`;
    }
    default:
      throw new Error(`aggregateValue: Unsupported aggregateType '${aggregateType}'. Supported values are 'max', 'min', 'first', 'last', and 'any'.`);
  }
};
|
|
565
|
+
|
|
566
|
+
/*
|
|
567
|
+
Ecommerce
|
|
568
|
+
*/
|
|
569
|
+
|
|
570
|
+
/**
 * Builds a SQL expression normalizing the GA4 ecommerce struct.
 *
 * The returned SELECT AS STRUCT ... REPLACE:
 * - nulls out ecommerce.transaction_id when it holds the '(not set)' placeholder;
 * - for 'purchase' events, cleans ecommerce.purchase_revenue by dropping NaN
 *   values and falling back to the 'value' event parameter (safe_cast to
 *   FLOAT64) to cover an old GA4 bug where purchase_revenue was missing;
 * - sets purchase_revenue to null for non-purchase events; other struct fields
 *   pass through unchanged.
 *
 * @returns {string} SQL snippet usable in a SELECT list to normalize ecommerce columns.
 */
const fixEcommerceStruct = () => {
  // fallback source for the old GA4 bug where purchase_revenue was missing
  const revenueFallback = `safe_cast(${unnestEventParam('value')} as float64)`;

  return `(select as struct ecommerce.* replace(
    if(ecommerce.transaction_id <> '(not set)', ecommerce.transaction_id, null) as transaction_id,
    if(
      event_name = 'purchase',
      coalesce(
        -- fix possible NaN values
        if(is_nan(ecommerce.purchase_revenue), null, ecommerce.purchase_revenue),
        -- fix an old ga4 bug where purchase_revenue was missing
        ${revenueFallback}
      ),
      null
    ) as purchase_revenue
  ))`;
};
|
|
603
|
+
|
|
604
|
+
/*
Check if GA4 data is "final" and is not expected to change anymore
*/

/**
 * Generates a SQL expression to determine whether GA4 export data can be considered "final" (not subject to further change).
 *
 * Two detection methods are supported:
 * - 'EXPORT_TYPE': Checks the table suffix; returns FALSE for intraday or "fresh" tables, TRUE for finalized data.
 * - 'DAY_THRESHOLD': Considers data final if a configurable number of days has passed since event_date.
 *
 * @param {'EXPORT_TYPE'|'DAY_THRESHOLD'} detectionMethod - The method to use for finality determination.
 *   'EXPORT_TYPE': Uses patterns in _table_suffix (e.g., 'intraday_%', 'fresh_%').
 *   'DAY_THRESHOLD': Uses date difference between the current date and event_date.
 * @param {number} [dayThreshold=3] - (Only for 'DAY_THRESHOLD') Number of days after which data is considered final.
 * @returns {string} SQL expression that evaluates to TRUE if the data is final, otherwise FALSE.
 *
 * @throws {Error} If an unsupported detectionMethod is provided, or dayThreshold is not a number.
 *
 * @example
 * // Checks based on export type
 * isFinalData('EXPORT_TYPE')
 * // => "if(_table_suffix like 'intraday_%' or _table_suffix like 'fresh_%', false, true)"
 *
 * // Checks using a custom day threshold
 * isFinalData('DAY_THRESHOLD', 5)
 * // => "if(date_diff(current_date(), cast(event_date as date format 'YYYYMMDD'), day) > 5, true, false)"
 */
const isFinalData = (detectionMethod, dayThreshold = 3) => {
  if (detectionMethod !== 'EXPORT_TYPE' && detectionMethod !== 'DAY_THRESHOLD') {
    throw new Error(`isFinalData: Unsupported detectionMethod '${detectionMethod}'. Supported values are 'EXPORT_TYPE' and 'DAY_THRESHOLD'.`);
  }

  // The default (3) is itself a number, so this also rejects explicit bad values.
  // Number.isNaN (non-coercing) is safe here because of the typeof guard.
  if (typeof dayThreshold !== 'number' || Number.isNaN(dayThreshold)) {
    throw new Error("isFinalData: 'dayThreshold' must be a number if provided.");
  }

  if (detectionMethod === 'EXPORT_TYPE') {
    // Intraday/fresh tables are still being written by the export and may change.
    return "if(_table_suffix like 'intraday_%' or _table_suffix like 'fresh_%', false, true)";
  }

  // detectionMethod === 'DAY_THRESHOLD'
  return `if(date_diff(current_date(), cast(event_date as date format 'YYYYMMDD'), day) > ${dayThreshold}, true, false)`;
};
|
|
652
|
+
|
|
653
|
+
/**
 * Checks whether a given column name is part of the standard/expected GA4 BigQuery export columns.
 *
 * The list of recognized GA4 export columns is based on the official schema as of 2026-02-18.
 * This function can be used to filter or validate column names when processing GA4 data exports.
 *
 * @param {string} columnName - The name of the column to check.
 * @returns {boolean} True if the column name is a GA4 export column, otherwise false.
 */
const isGa4ExportColumn = (() => {
  // Built once via a closure: the original rebuilt this array on every call and
  // scanned it with O(n) includes(); Set.has() is O(1) and uses the same
  // SameValueZero comparison, so results are identical for any input.
  // list updated 2026-02-18
  const ga4ExportColumns = new Set([
    'event_date',
    'event_timestamp',
    'event_name',
    'event_params',
    'event_previous_timestamp',
    'event_value_in_usd',
    'event_bundle_sequence_id',
    'event_server_timestamp_offset',
    'user_id',
    'user_pseudo_id',
    'privacy_info',
    'user_properties',
    'user_first_touch_timestamp',
    'user_ltv',
    'device',
    'geo',
    'app_info',
    'traffic_source',
    'stream_id',
    'platform',
    'event_dimensions',
    'ecommerce',
    'items',
    'collected_traffic_source',
    'is_active_user',
    'batch_event_index',
    'batch_page_id',
    'batch_ordering_id',
    'session_traffic_source_last_click',
    'publisher',
  ]);

  return (columnName) => ga4ExportColumns.has(columnName);
})();
|
|
698
|
+
|
|
699
|
+
module.exports = {
|
|
700
|
+
eventDate,
|
|
701
|
+
getEventDateTime,
|
|
702
|
+
getEventTimestampMicros,
|
|
703
|
+
unnestEventParam,
|
|
704
|
+
sessionId,
|
|
705
|
+
aggregateValue,
|
|
706
|
+
fixEcommerceStruct,
|
|
707
|
+
isFinalData,
|
|
708
|
+
ga4ExportDateFilter,
|
|
709
|
+
ga4ExportDateFilters,
|
|
710
|
+
filterEventParams,
|
|
711
|
+
aggregateSessionParams,
|
|
712
|
+
excludeNullSessionParams,
|
|
713
|
+
finalDataFilter,
|
|
714
|
+
extractPageDetails,
|
|
715
|
+
extractUrlHostname,
|
|
716
|
+
extractUrlPath,
|
|
717
|
+
extractUrlQuery,
|
|
718
|
+
extractUrlQueryParams,
|
|
719
|
+
isGa4ExportColumn
|
|
720
|
+
}
|