ga4-export-fixer 0.4.1 → 0.4.2-dev.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +92 -20
- package/helpers/aggregation.js +226 -0
- package/helpers/dateFilters.js +206 -0
- package/helpers/dateTime.js +58 -0
- package/helpers/ga4Transforms.js +166 -0
- package/helpers/index.js +8 -0
- package/helpers/params.js +77 -0
- package/helpers/urlParsing.js +155 -0
- package/index.js +1 -1
- package/package.json +2 -2
- package/tables/ga4EventsEnhanced.js +1 -1
- package/helpers.js +0 -812
package/README.md
CHANGED
|
@@ -27,26 +27,98 @@ The goal of the package is to **speed up development** when building data models
|
|
|
27
27
|
|
|
28
28
|
### Main Features
|
|
29
29
|
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
30
|
+
<table>
|
|
31
|
+
<tr>
|
|
32
|
+
<td width="50%" valign="top">
|
|
33
|
+
<b>📦 Best Available Data</b><br>
|
|
34
|
+
Combines daily, fresh (360) & intraday exports so the most complete version is always available
|
|
35
|
+
</td>
|
|
36
|
+
<td width="50%" valign="top">
|
|
37
|
+
<b>🔄 Incremental Updates</b><br>
|
|
38
|
+
Run on any schedule — daily, hourly, or custom
|
|
39
|
+
</td>
|
|
40
|
+
</tr>
|
|
41
|
+
<tr>
|
|
42
|
+
<td valign="top">
|
|
43
|
+
<b>📐 Flexible Schema</b><br>
|
|
44
|
+
Keeps the flexible structure of the original export with key fields promoted to columns for better query performance; partitioning & clustering enabled
|
|
45
|
+
</td>
|
|
46
|
+
<td valign="top">
|
|
47
|
+
<b>🤖 AI Agent Ready</b><br>
|
|
48
|
+
Extensive table & column descriptions for AI agents and humans
|
|
49
|
+
</td>
|
|
50
|
+
</tr>
|
|
51
|
+
<tr>
|
|
52
|
+
<td valign="top">
|
|
53
|
+
<b>🔑 Session Identity Resolution</b><br>
|
|
54
|
+
<code>user_id</code> resolved per session; <code>merged_user_id</code> coalesces with <code>user_pseudo_id</code>
|
|
55
|
+
</td>
|
|
56
|
+
<td valign="top">
|
|
57
|
+
<b>📡 Session Traffic Sources</b><br>
|
|
58
|
+
<code>session_first_traffic_source</code> and <code>session_traffic_source_last_click</code> computed automatically, adjusting for sessions that span midnight
|
|
59
|
+
</td>
|
|
60
|
+
</tr>
|
|
61
|
+
<tr>
|
|
62
|
+
<td valign="top">
|
|
63
|
+
<b>📍 Landing Page Detection</b><br>
|
|
64
|
+
Derived per session from the first page where <code>entrances > 0</code>
|
|
65
|
+
</td>
|
|
66
|
+
<td valign="top">
|
|
67
|
+
<b>🔗 Page URL Parsing</b><br>
|
|
68
|
+
Parsed <code>hostname</code>, <code>path</code>, <code>query</code>, and <code>query_params</code> from <code>page_location</code>
|
|
69
|
+
</td>
|
|
70
|
+
</tr>
|
|
71
|
+
<tr>
|
|
72
|
+
<td valign="top">
|
|
73
|
+
<b>🛒 Ecommerce Data Fixes</b><br>
|
|
74
|
+
Nullifies placeholder <code>transaction_id</code>; corrects <code>purchase_revenue</code> bugs
|
|
75
|
+
</td>
|
|
76
|
+
<td valign="top">
|
|
77
|
+
<b>⚙️ Event Parameter Handling</b><br>
|
|
78
|
+
Promote event params to columns; include or exclude by name
|
|
79
|
+
</td>
|
|
80
|
+
</tr>
|
|
81
|
+
<tr>
|
|
82
|
+
<td valign="top">
|
|
83
|
+
<b>📊 Session Parameters</b><br>
|
|
84
|
+
Promote selected event parameters as <code>session_params</code>
|
|
85
|
+
</td>
|
|
86
|
+
<td valign="top">
|
|
87
|
+
<b>⏱️ Custom Timestamp</b><br>
|
|
88
|
+
Use a custom event parameter as primary timestamp with automatic fallback
|
|
89
|
+
</td>
|
|
90
|
+
</tr>
|
|
91
|
+
<tr>
|
|
92
|
+
<td valign="top">
|
|
93
|
+
<b>🔒 Schema Lock</b><br>
|
|
94
|
+
Lock table schema to a specific GA4 export date to prevent schema drift
|
|
95
|
+
</td>
|
|
96
|
+
<td valign="top">
|
|
97
|
+
<b>✅ Data Freshness Tracking</b><br>
|
|
98
|
+
<code>data_is_final</code> flag and <code>export_type</code> label on every row
|
|
99
|
+
</td>
|
|
100
|
+
</tr>
|
|
101
|
+
<tr>
|
|
102
|
+
<td valign="top">
|
|
103
|
+
<b>🔃 Selective Re-processing</b><br>
|
|
104
|
+
Re-process a date range without full table rebuild using <code>incrementalStartOverride</code> and <code>incrementalEndOverride</code>
|
|
105
|
+
</td>
|
|
106
|
+
<td valign="top">
|
|
107
|
+
<b>📑 Batch Processing</b><br>
|
|
108
|
+
Process large exports in smaller batches via <code>numberOfDaysToProcess</code>
|
|
109
|
+
</td>
|
|
110
|
+
</tr>
|
|
111
|
+
<tr>
|
|
112
|
+
<td valign="top">
|
|
113
|
+
<b>🕐 Timezone-Aware Datetime</b><br>
|
|
114
|
+
<code>event_datetime</code> converted to a configurable IANA timezone
|
|
115
|
+
</td>
|
|
116
|
+
<td valign="top">
|
|
117
|
+
<b>🛡️ Zero Dependencies</b><br>
|
|
118
|
+
No additional external dependencies added to your Dataform repository
|
|
119
|
+
</td>
|
|
120
|
+
</tr>
|
|
121
|
+
</table>
|
|
50
122
|
|
|
51
123
|
### Planned, Upcoming Features
|
|
52
124
|
|
|
@@ -0,0 +1,226 @@
|
|
|
1
|
+
/*
|
|
2
|
+
Handling event and session parameters
|
|
3
|
+
*/
|
|
4
|
+
|
|
5
|
+
/**
 * Builds a BigQuery SQL expression that filters the `event_params` array by parameter name.
 *
 * - 'include': keeps only the parameters whose key is listed in `params`. An empty list
 *   keeps nothing, so the expression evaluates to an empty array.
 * - 'exclude': drops the parameters whose key is listed in `params`. An empty list
 *   returns the `event_params` column unfiltered.
 *
 * Parameter names are placed inside single-quoted SQL string literals as-is (no escaping).
 *
 * @param {string[]} params - Event parameter names to include or exclude (may be empty).
 * @param {'include'|'exclude'} filterType - Whether the listed parameters are kept or removed.
 * @returns {string} SQL expression that produces the filtered event_params array.
 * @throws {Error} If params is not an array of strings, or if filterType is not 'include' or 'exclude'.
 */
const filterEventParams = (params, filterType) => {
    if (!Array.isArray(params) || !params.every(p => typeof p === 'string')) {
        throw new Error("filterEventParams: 'params' must be an array of strings (empty array allowed).");
    }

    if (filterType !== 'include' && filterType !== 'exclude') {
        throw new Error("filterEventParams: 'filterType' must be 'include' or 'exclude'.");
    }

    if (params.length === 0) {
        // An empty list would render as `key in ()` / `key not in ()`, which is not valid SQL,
        // so both empty cases are resolved here without building an IN list.
        return filterType === 'include'
            ? 'array(select as struct * from unnest(event_params) where false)'
            : 'event_params';
    }

    const keyList = params.map(p => `'${p}'`).join(', ');
    const operator = filterType === 'include' ? 'in' : 'not in';

    return `array(select as struct * from unnest(event_params) where key ${operator} (${keyList}))`;
};
|
|
40
|
+
|
|
41
|
+
/**
 * Builds a BigQuery SQL array expression holding one struct per requested session parameter.
 *
 * For every name in `paramNames` the generated SQL aggregates the matching entry of
 * `paramsArray` across rows, ignoring nulls and ordering by `timestampColumn` descending,
 * and keeps the first result. When nothing is found, `ifnull` substitutes a placeholder
 * struct carrying the key and null string/int/float/double values.
 *
 * With an empty `paramNames` the result is an empty array cast to
 * array<struct<key string, value struct<string_value string, int_value int64, float_value float64, double_value float64>>>,
 * so the column type is still declared.
 *
 * @param {string[]} paramNames - Parameter keys to aggregate (may be empty).
 * @param {string} paramsArray - SQL column or expression holding the parameter array to unnest.
 * @param {string} timestampColumn - SQL column or expression used to order the values.
 * @returns {string} SQL expression producing the array of parameter structs.
 * @throws {Error} If paramNames is not an array of strings, or paramsArray/timestampColumn is not a non-empty string.
 */
const aggregateSessionParams = (paramNames, paramsArray, timestampColumn) => {
    const isBlank = (value) => typeof value !== 'string' || value.trim() === '';

    if (!Array.isArray(paramNames) || paramNames.some(name => typeof name !== 'string')) {
        throw new Error("aggregateSessionParams: 'paramNames' must be an array of strings (empty array allowed).");
    }
    if (isBlank(paramsArray)) {
        throw new Error("aggregateSessionParams: 'paramsArray' must be a non-empty string reference to a SQL field or expression.");
    }
    if (isBlank(timestampColumn)) {
        throw new Error("aggregateSessionParams: 'timestampColumn' must be a non-empty string reference to a SQL field or expression.");
    }

    // No parameters requested: emit a typed empty array so session_params still appears in the schema
    if (paramNames.length === 0) {
        return 'cast([] as array<struct<key string, value struct<string_value string, int_value int64, float_value float64, double_value float64>>>)';
    }

    // SQL for a single parameter: latest non-null struct, or a null-valued placeholder
    const latestOrPlaceholder = (name) => [
        'ifnull(',
        '-- get the last non-null value for the parameter',
        'array_agg(',
        `(select as struct * from unnest(${paramsArray}) where key = '${name}') ignore nulls`,
        `order by ${timestampColumn} desc`,
        'limit 1',
        ')[safe_offset(0)],',
        '-- if no value is found, return a dummy value',
        '(',
        'select as struct',
        `'${name}' as key,`,
        '(',
        'select as struct',
        'cast(null as string) as string_value,',
        'cast(null as int64) as int_value,',
        'cast(null as float64) as float_value,',
        'cast(null as float64) as double_value',
        ') as value',
        ')',
        ')',
    ].join('\n');

    const elements = paramNames.map(latestOrPlaceholder).join(',\n ');

    return ['[', elements, ']'].join('\n');
};
|
|
102
|
+
|
|
103
|
+
/**
 * Builds a SQL expression that rebuilds a parameter array without its all-null entries.
 *
 * An entry is kept when at least one of value.string_value, value.int_value,
 * value.float_value or value.double_value is not null.
 *
 * @param {string} paramsArray - Array column or SQL expression to unnest (e.g. 'session_params').
 * @returns {string} SQL expression yielding only the entries that carry a value.
 * @throws {Error} If paramsArray is not a non-empty string.
 *
 * @example
 * excludeNullSessionParams('session_params')
 * // => "array(select as struct * from unnest(session_params) where value.string_value is not null or value.int_value is not null or value.float_value is not null or value.double_value is not null)"
 */
const excludeNullSessionParams = (paramsArray) => {
    const hasReference = typeof paramsArray === 'string' && paramsArray.trim() !== '';
    if (!hasReference) {
        throw new Error("excludeNullSessionParams: 'paramsArray' is required and must be a non-empty string.");
    }

    const valueFields = ['string_value', 'int_value', 'float_value', 'double_value'];
    const anyValuePresent = valueFields
        .map(field => `value.${field} is not null`)
        .join(' or ');

    return `array(select as struct * from unnest(${paramsArray}) where ${anyValuePresent})`;
};
|
|
126
|
+
|
|
127
|
+
/*
|
|
128
|
+
Aggregation
|
|
129
|
+
*/
|
|
130
|
+
|
|
131
|
+
/**
 * Builds a SQL aggregation expression for one column.
 *
 * Aggregation types and the SQL they produce:
 * - 'max'   -> max(column)
 * - 'min'   -> min(column)
 * - 'any'   -> any_value(column)
 * - 'first' -> array_agg over the column, ignoring nulls, ordered by timestampColumn, first element
 * - 'last'  -> same as 'first' but ordered by timestampColumn descending
 *
 * @param {string} column - Column (or SQL expression) to aggregate.
 * @param {'max'|'min'|'first'|'last'|'any'} aggregateType - Aggregation to apply.
 * @param {string} [timestampColumn] - Ordering column; must be defined for 'first' and 'last'.
 * @returns {string} SQL expression for the requested aggregation.
 * @throws {Error} If column or aggregateType is undefined, if timestampColumn is undefined
 *   for 'first'/'last', or if aggregateType is not one of the supported values.
 *
 * @example
 * aggregateValue('user_id', 'last', 'event_timestamp')
 * // => SQL expression for the last non-null user_id by event_timestamp.
 */
const aggregateValue = (column, aggregateType, timestampColumn) => {
    if (column === undefined) {
        throw new Error("aggregateValue: 'column' is a required parameter and must be defined.");
    }
    if (aggregateType === undefined) {
        throw new Error("aggregateValue: 'aggregateType' is a required parameter and must be defined.");
    }

    const needsOrdering = aggregateType === 'first' || aggregateType === 'last';
    if (needsOrdering && timestampColumn === undefined) {
        throw new Error(`aggregateValue: 'timestampColumn' is required when aggregateType is '${aggregateType}'.`);
    }

    // Shared shape of the 'first'/'last' expressions; only the ordering clause differs
    const pickByOrder = (ordering) => [
        'array_agg(',
        `${column} ignore nulls`,
        `order by ${ordering}`,
        'limit 1',
        ')[safe_offset(0)]',
    ].join('\n');

    switch (aggregateType) {
        case 'max':
            return `max(${column})`;
        case 'min':
            return `min(${column})`;
        case 'first':
            return pickByOrder(`${timestampColumn}`);
        case 'last':
            return pickByOrder(`${timestampColumn} desc`);
        case 'any':
            return `any_value(${column})`;
        default:
            throw new Error(`aggregateValue: Unsupported aggregateType '${aggregateType}'. Supported values are 'max', 'min', 'first', 'last', and 'any'.`);
    }
};
|
|
195
|
+
|
|
196
|
+
/**
 * Builds a comma-separated list of SQL aggregation expressions for use in a SELECT clause.
 *
 * Every specification is turned into SQL by {@link aggregateValue}; when the specification
 * has a truthy `alias`, ` as <alias>` is appended to that expression.
 *
 * @param {Object[]} values - Aggregation specifications.
 * @param {string} values[].column - Column to aggregate.
 * @param {'max'|'min'|'first'|'last'|'any'} values[].aggregateType - Aggregation to apply.
 * @param {string} [values[].timestampColumn] - Ordering column, needed for 'first'/'last'.
 * @param {string} [values[].alias] - Optional output alias.
 * @returns {string} The expressions joined by a comma and a line break.
 * @throws {Error} If values is not an array (errors raised by aggregateValue propagate as well).
 */
const aggregateValues = (values) => {
    if (!Array.isArray(values)) {
        throw new Error("aggregateValues: 'values' must be an array of objects with 'column', 'aggregateType', and 'timestampColumn' properties.");
    }

    const toSelectExpression = (spec) => {
        const aggregated = aggregateValue(spec.column, spec.aggregateType, spec.timestampColumn);
        const aliasSuffix = spec.alias ? ` as ${spec.alias}` : '';
        return `${aggregated}${aliasSuffix}`;
    };

    return values.map(spec => toSelectExpression(spec)).join(',\n ');
};
|
|
219
|
+
|
|
220
|
+
module.exports = {
|
|
221
|
+
filterEventParams,
|
|
222
|
+
aggregateSessionParams,
|
|
223
|
+
excludeNullSessionParams,
|
|
224
|
+
aggregateValue,
|
|
225
|
+
aggregateValues
|
|
226
|
+
};
|
|
@@ -0,0 +1,206 @@
|
|
|
1
|
+
const constants = require('../constants');
|
|
2
|
+
const { baseConfig } = require('../defaultConfig');
|
|
3
|
+
|
|
4
|
+
// Filter the export tables by date range
|
|
5
|
+
/**
 * Builds a `_table_suffix` range condition for one GA4 export type.
 *
 * Both boundaries are rendered as `cast(<date> as string format "YYYYMMDD")`. For the
 * 'fresh' and 'intraday' export types the literal prefix `'<exportType>_'` is concatenated
 * in front of each boundary; 'daily' boundaries carry no prefix.
 *
 * @param {'daily'|'fresh'|'intraday'} exportType - Export table type to match.
 * @param {string} start - SQL date expression for the lower bound (e.g. 'current_date()-1').
 * @param {string} end - SQL date expression for the upper bound (e.g. 'current_date()').
 * @returns {string} Parenthesised SQL condition on _table_suffix.
 * @throws {Error} If exportType is not supported, or if start or end is undefined.
 */
const ga4ExportDateFilter = (exportType, start, end) => {
    const supportedTypes = ['daily', 'fresh', 'intraday'];
    if (!supportedTypes.includes(exportType)) {
        throw new Error(
            `ga4ExportDateFilter: Unsupported exportType '${exportType}'. Supported values are 'daily', 'fresh', and 'intraday'.`
        );
    }
    if (start === undefined || end === undefined) {
        throw new Error("ga4ExportDateFilter: 'start' and 'end' parameters must be defined.");
    }

    // Renders one boundary, prefixed for the non-daily export tables
    const suffixFor = (dateExpression) => {
        const formatted = `cast(${dateExpression} as string format "YYYYMMDD")`;
        return exportType === 'daily' ? formatted : `'${exportType}_' || ${formatted}`;
    };

    return `(_table_suffix >= ${suffixFor(start)} and _table_suffix <= ${suffixFor(end)})`;
};
|
|
34
|
+
|
|
35
|
+
/**
 * Builds the combined `_table_suffix` condition for every enabled GA4 export type
 * (daily, fresh, intraday), OR-ing the per-type conditions together.
 *
 * How the date boundaries are chosen:
 * - test mode (`config.test`): `config.testConfig.dateRangeStart` / `dateRangeEnd`
 * - incremental mode (`config.incremental`): `constants.DATE_RANGE_START_VARIABLE` /
 *   `constants.DATE_RANGE_END_VARIABLE`
 * - otherwise: `config.preOperations.dateRangeStartFullRefresh` as start; the end is
 *   `least(start + numberOfDaysToProcess - 1, current_date())` when
 *   `numberOfDaysToProcess` is defined, else `config.preOperations.dateRangeEnd`
 *
 * Per export type:
 * - daily: start minus `bufferDays`.
 * - fresh: `constants.FRESH_DATE_RANGE_START_VARIABLE` when daily is also enabled
 *   (in every mode, test included); otherwise the plain start date.
 * - intraday: when fresh is enabled, the fresh start date is reused and the condition is
 *   extended with `event_timestamp > coalesce(constants.FRESH_MAX_EVENT_TIMESTAMP_VARIABLE, 0)`;
 *   without fresh, test mode uses the test start date, daily+intraday uses
 *   `constants.INTRADAY_DATE_RANGE_START_VARIABLE`, and intraday-only uses the start date
 *   minus `bufferDays`.
 *
 * @param {Object} config
 * @param {boolean} config.test - Use the literal test dates.
 * @param {Object} config.testConfig - `{ dateRangeStart, dateRangeEnd }`.
 * @param {boolean} config.incremental - Use the BigQuery variable placeholders.
 * @param {Object} config.preOperations - `{ dateRangeStartFullRefresh, dateRangeEnd, numberOfDaysToProcess }`.
 * @param {Object} config.includedExportTypes - `{ daily: boolean, fresh: boolean, intraday: boolean }`.
 * @param {number} [config.bufferDays=0] - Extra days subtracted from the start date.
 * @returns {string} SQL fragment for a WHERE clause.
 */
const ga4ExportDateFilters = (config) => {
    const bufferDays = config.bufferDays || 0;

    // Lower bound shared by the export types, before any buffer or checkpoint is applied
    const resolveStart = () => {
        if (config.test) {
            return config.testConfig.dateRangeStart;
        }
        return config.incremental
            ? constants.DATE_RANGE_START_VARIABLE
            : config.preOperations.dateRangeStartFullRefresh;
    };

    // Upper bound shared by every export type
    const resolveEnd = () => {
        if (config.test) {
            return config.testConfig.dateRangeEnd;
        }
        if (config.incremental) {
            return constants.DATE_RANGE_END_VARIABLE;
        }
        if (config.preOperations.numberOfDaysToProcess !== undefined) {
            return `least(${config.preOperations.dateRangeStartFullRefresh}+${config.preOperations.numberOfDaysToProcess}-1, current_date())`;
        }
        return config.preOperations.dateRangeEnd;
    };

    // Fresh tables persist alongside daily tables (unlike intraday which gets deleted),
    // so the checkpoint variable is used even in test mode to avoid duplicate data.
    const resolveFreshStart = () => {
        if (!config.includedExportTypes.fresh) {
            return undefined;
        }
        return config.includedExportTypes.daily
            ? constants.FRESH_DATE_RANGE_START_VARIABLE
            : resolveStart();
    };

    const resolveIntradayStart = () => {
        // With fresh enabled, intraday starts from the same point as fresh (test mode included)
        if (config.includedExportTypes.fresh) {
            return resolveFreshStart();
        }
        // Non-fresh paths: test mode skips the pre-operation variables
        if (config.test) {
            return config.testConfig.dateRangeStart;
        }
        if (!config.includedExportTypes.intraday) {
            return undefined;
        }
        // daily+intraday uses the date-based checkpoint; intraday-only uses the buffered start date
        return config.includedExportTypes.daily
            ? constants.INTRADAY_DATE_RANGE_START_VARIABLE
            : `${resolveStart()}-${bufferDays}`;
    };

    const buildIntradayFilter = (endDate) => {
        const tableFilter = ga4ExportDateFilter('intraday', resolveIntradayStart(), endDate);

        // With fresh also enabled, keep only intraday rows newer than the fresh data.
        // Applied in test mode too because fresh and intraday tables coexist for the same days.
        if (config.includedExportTypes.fresh) {
            return `(${tableFilter} and event_timestamp > coalesce(${constants.FRESH_MAX_EVENT_TIMESTAMP_VARIABLE}, 0))`;
        }
        return tableFilter;
    };

    const dailyStart = `${resolveStart()}-${bufferDays}`;
    const freshStart = resolveFreshStart();
    const end = resolveEnd();

    const clauses = [];
    if (config.includedExportTypes.daily) {
        clauses.push(ga4ExportDateFilter('daily', dailyStart, end));
    }
    if (config.includedExportTypes.fresh) {
        clauses.push(ga4ExportDateFilter('fresh', freshStart, end));
    }
    if (config.includedExportTypes.intraday) {
        clauses.push(buildIntradayFilter(end));
    }

    return ['(', clauses.filter(Boolean).join(' or '), ')'].join('\n');
};
|
|
152
|
+
|
|
153
|
+
/**
 * Builds the `event_date` range condition `(event_date >= <start> and event_date <= <end>)`.
 *
 * The boundaries depend on the mode:
 * 1. Test mode (`config.test`): `config.testConfig.dateRangeStart` / `dateRangeEnd`,
 *    each falling back to the matching `baseConfig.testConfig` value when falsy.
 * 2. Incremental mode (`config.incremental`): `constants.DATE_RANGE_START_VARIABLE` and
 *    `constants.DATE_RANGE_END_VARIABLE`.
 * 3. Full refresh (default): `config.preOperations.dateRangeStartFullRefresh` as start
 *    (falling back to `baseConfig.preOperations`). The end is
 *    `least(start + numberOfDaysToProcess - 1, current_date())` when
 *    `numberOfDaysToProcess` is defined, otherwise `dateRangeEnd` with the same fallback.
 *
 * @param {Object} config - Configuration object controlling the date filter logic.
 * @param {boolean} [config.test] - If true, uses the test dates.
 * @param {Object} [config.testConfig] - Contains `dateRangeStart` and `dateRangeEnd` for testing.
 * @param {boolean} [config.incremental] - If true, uses the variable placeholders.
 * @param {Object} [config.preOperations] - Contains the full refresh date range values.
 * @returns {string} SQL condition string to filter the query by date range.
 */
const incrementalDateFilter = (config) => {
    const between = (start, end) => `(event_date >= ${start} and event_date <= ${end})`;

    // test mode
    if (config.test) {
        const testDates = config.testConfig;
        return between(
            testDates?.dateRangeStart || baseConfig.testConfig.dateRangeStart,
            testDates?.dateRangeEnd || baseConfig.testConfig.dateRangeEnd
        );
    }

    // incremental mode
    if (config.incremental) {
        return between(constants.DATE_RANGE_START_VARIABLE, constants.DATE_RANGE_END_VARIABLE);
    }

    // full refresh mode
    const preOps = config.preOperations;
    const refreshStart = preOps?.dateRangeStartFullRefresh || baseConfig.preOperations.dateRangeStartFullRefresh;

    let refreshEnd;
    if (preOps?.numberOfDaysToProcess !== undefined) {
        refreshEnd = `least(${refreshStart}+${preOps.numberOfDaysToProcess}-1, current_date())`;
    } else {
        refreshEnd = preOps?.dateRangeEnd || baseConfig.preOperations.dateRangeEnd;
    }

    return between(refreshStart, refreshEnd);
};
|
|
201
|
+
|
|
202
|
+
module.exports = {
|
|
203
|
+
ga4ExportDateFilter,
|
|
204
|
+
ga4ExportDateFilters,
|
|
205
|
+
incrementalDateFilter
|
|
206
|
+
};
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
/*
|
|
2
|
+
Date and time
|
|
3
|
+
*/
|
|
4
|
+
|
|
5
|
+
/**
 * SQL fragment that converts the GA4 `event_date` column to a DATE, parsing it with the
 * 'YYYYMMDD' format.
 */
const eventDate = "cast(event_date as date format 'YYYYMMDD')";
|
|
9
|
+
|
|
10
|
+
/**
 * Returns the SQL expression used as the event timestamp in microseconds.
 *
 * Without a custom parameter this is simply the `event_timestamp` column. With one, the
 * expression reads the parameter's `value.int_value` from `event_params`, multiplies it
 * by 1000 and falls back to `event_timestamp` through `coalesce` when that yields null.
 * The multiplication matches a parameter holding a JavaScript millisecond timestamp
 * (for example one collected with Date.now()).
 *
 * @param {string} [customTimestampParameter] - Name of the event parameter carrying a millisecond timestamp.
 * @returns {string} SQL expression for the event timestamp in microseconds.
 * @throws {Error} If customTimestampParameter is defined but is not a non-empty string.
 */
const getEventTimestampMicros = (customTimestampParameter) => {
    // Nothing custom requested: use the exported timestamp column directly
    if (customTimestampParameter === undefined) {
        return 'event_timestamp';
    }

    const isUsableName = typeof customTimestampParameter === 'string' && customTimestampParameter.trim() !== '';
    if (!isUsableName) {
        throw new Error("getEventTimestampMicros: customTimestampParameter must be undefined or a non-empty string.");
    }

    const customMicros = `(select value.int_value from unnest(event_params) where key = '${customTimestampParameter}')*1000`;

    return `coalesce(${customMicros}, event_timestamp)`;
};
|
|
32
|
+
|
|
33
|
+
/**
 * Returns the SQL expression for the event's local DATETIME in a given time zone.
 *
 * The timestamp expression comes from getEventTimestampMicros() called without a custom
 * parameter; it is wrapped in timestamp_micros(...) and the DATETIME is extracted
 * at the configured time zone.
 *
 * @param {Object} [config] - Optional configuration.
 * @param {string} [config.timezone] - IANA time zone string (e.g., 'Europe/Helsinki'); 'Etc/UTC' is used when missing or falsy.
 * @returns {string} SQL expression for the local datetime of the event.
 *
 * @example
 * getEventDateTime({ timezone: 'Europe/Helsinki' })
 * // => "extract(datetime from timestamp_micros(event_timestamp) at time zone 'Europe/Helsinki')"
 */
const getEventDateTime = (config) => {
    const zone = config?.timezone || 'Etc/UTC';
    const eventTimestamp = `timestamp_micros(${getEventTimestampMicros()})`;

    return `extract(datetime from ${eventTimestamp} at time zone '${zone}')`;
};
|
|
53
|
+
|
|
54
|
+
module.exports = {
|
|
55
|
+
eventDate,
|
|
56
|
+
getEventTimestampMicros,
|
|
57
|
+
getEventDateTime
|
|
58
|
+
};
|