ga4-export-fixer 0.1.1 → 0.1.3-dev.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -17,13 +17,15 @@ Include the package in the package.json file in your Dataform repository.
17
17
  **`package.json`**
18
18
  ```json
19
19
  {
20
- "name": "my_dataform_repo",
21
20
  "dependencies": {
22
- "@dataform/core": "3.0.39",
23
- "ga4-export-fixer": "0.1.0"
21
+ "@dataform/core": "3.0.42",
22
+ "ga4-export-fixer": "0.1.2"
24
23
  }
25
24
  }
26
25
  ```
26
+
27
+ **Note:** The best practice is to specify the package version explicitly (e.g. `"0.1.2"`) rather than using `"latest"` or `"*"`, to avoid unexpected breaking changes when the package is updated.
28
+
27
29
  In Google Cloud Dataform, click "Install Packages" to install it in your development workspace.
28
30
 
29
31
  If your Dataform repository does not have a package.json file, see this guide: https://docs.cloud.google.com/dataform/docs/manage-repository#move-to-package-json
@@ -46,6 +48,8 @@ The main features include:
46
48
 
47
49
  Create a new **ga4_events_enhanced** table using a **.js** file in your repository's **definitions** folder.
48
50
 
51
+ ##### Using Defaults
52
+
49
53
  **`definitions/ga4/ga4_events_enhanced.js`**
50
54
  ```javascript
51
55
  const { ga4EventsEnhanced } = require('ga4-export-fixer');
@@ -57,6 +61,53 @@ const config = {
57
61
  ga4EventsEnhanced.createTable(publish, config);
58
62
  ```
59
63
 
64
+ ##### With Custom Configuration
65
+
66
+ **`definitions/ga4/ga4_events_enhanced.js`**
67
+ ```javascript
68
+ const { ga4EventsEnhanced } = require('ga4-export-fixer');
69
+
70
+ const config = {
71
+ sourceTable: constants.GA4_TABLES.MY_GA4_EXPORT,
72
+ schemaLock: '20260101', // prevent possible issues from updates to the export schema
73
+ customTimestampParam: 'custom_event_timestamp', // custom timestamp collected as an event param
74
+ timezone: 'Europe/Helsinki',
75
+ // not needed data
76
+ excludedColumns: [
77
+ 'app_info',
78
+ 'publisher'
79
+ ],
80
+ // not needed events
81
+ excludedEvents: [
82
+ 'user_engagement'
83
+ ],
84
+ // transform to session-level
85
+ sessionParams: [
86
+ 'user_agent'
87
+ ],
88
+ // promote as columns
89
+ eventParamsToColumns: [
90
+ {name: 'session_engaged'},
91
+ {name: 'ga_session_number', type: 'int'},
92
+ {name: 'page_type', type: 'string'},
93
+ ],
94
+ // not needed in the event_params array
95
+ excludedEventParams: [
96
+ 'session_engaged',
97
+ 'ga_session_number',
98
+ 'page_type',
99
+ 'user_agent'
100
+ ],
101
+ // use day threshold for data_is_final
102
+ dataIsFinal: {
103
+ detectionMethod: 'DAY_THRESHOLD',
104
+ dayThreshold: 4
105
+ },
106
+ };
107
+
108
+ ga4EventsEnhanced.createTable(publish, config);
109
+ ```
110
+
60
111
  #### SQLX Deployment
61
112
 
62
113
  Alternatively, you can create the **ga4_events_enhanced** table using a .SQLX file.
@@ -91,19 +142,132 @@ pre_operations {
91
142
  }
92
143
  ```
93
144
 
145
+
146
+
147
+ #### Configuration Object
148
+
149
+ All fields are optional except `sourceTable`. Default values are applied automatically, so you only need to specify the fields you want to override.
150
+
151
+ | Field | Type | Default | Description |
152
+ |-------|------|---------|-------------|
153
+ | `sourceTable` | Dataform ref() / string | **required** | Source GA4 export table. Use `ref()` in Dataform or a string in the format `` `project.dataset.table` `` |
154
+ | `self` | Dataform self() | **required for .SQLX deployment** | Reference to the table itself. Use `self()` in Dataform |
155
+ | `incremental` | Dataform incremental() | **required for .SQLX deployment** | Switch between incremental and full refresh logic. Use `incremental()` in Dataform |
156
+ | `schemaLock` | string (YYYYMMDD) | `undefined` | Lock the table schema to a specific date. Must be a valid date >= `"20241009"` |
157
+ | `timezone` | string | `'Etc/UTC'` | IANA timezone for event datetime (e.g. `'Europe/Helsinki'`) |
158
+ | `customTimestampParam` | string | `undefined` | Name of a custom event parameter containing a JS timestamp in milliseconds (e.g. collected via `Date.now()`) |
159
+ | `bufferDays` | integer | `1` | Extra days to include for sessions that span midnight |
160
+ | `test` | boolean | `false` | Enable test mode (uses `testConfig` date range instead of pre-operations) |
161
+ | `excludedEventParams` | string[] | `[]` | Event parameter names to exclude from the `event_params` array |
162
+ | `excludedEvents` | string[] | `[]` | Event names to exclude from the table |
163
+ | `excludedColumns` | string[] | `[]` | Default GA4 export columns to exclude from the final table, for example `'app_info'` or `'publisher'` |
164
+ | `sessionParams` | string[] | `[]` | Event parameter names to aggregate as session-level parameters |
165
+
166
+ **`includedExportTypes`** — which GA4 export types to include:
167
+
168
+ | Field | Type | Default | Description |
169
+ |-------|------|---------|-------------|
170
+ | `includedExportTypes.daily` | boolean | `true` | Include daily (processed) export |
171
+ | `includedExportTypes.intraday` | boolean | `true` | Include intraday export |
172
+
173
+ **`dataIsFinal`** — how to determine whether data is final (not expected to change):
174
+
175
+ | Field | Type | Default | Description |
176
+ |-------|------|---------|-------------|
177
+ | `dataIsFinal.detectionMethod` | string | `'EXPORT_TYPE'` | `'EXPORT_TYPE'` (uses table suffix, all data from the daily export is considered final) or `'DAY_THRESHOLD'` (uses days since event) |
178
+ | `dataIsFinal.dayThreshold` | integer | `4` | Days after which data is considered final. Required when `detectionMethod` is `'DAY_THRESHOLD'` |
179
+
180
+ **`testConfig`** — date range used when `test` is `true`:
181
+
182
+ | Field | Type | Default | Description |
183
+ |-------|------|---------|-------------|
184
+ | `testConfig.dateRangeStart` | string (SQL date) | `'current_date()-1'` | Start date for test queries |
185
+ | `testConfig.dateRangeEnd` | string (SQL date) | `'current_date()'` | End date for test queries |
186
+
187
+ **`preOperations`** — date range and incremental refresh configuration:
188
+
189
+ | Field | Type | Default | Description |
190
+ |-------|------|---------|-------------|
191
+ | `preOperations.dateRangeStartFullRefresh` | string (SQL date) | `'date(2000, 1, 1)'` | Start date for full refresh |
192
+ | `preOperations.dateRangeEnd` | string (SQL date) | `'current_date()'` | End date for queries |
193
+ | `preOperations.numberOfPreviousDaysToScan` | integer | `10` | Number of previous days to scan from the result table when determining the incremental refresh start checkpoint. A higher value is required if the table updates have fallen behind for some reason |
194
+ | `preOperations.incrementalStartOverride` | string (SQL date) | `undefined` | Override the incremental start date to re-process a specific range |
195
+ | `preOperations.incrementalEndOverride` | string (SQL date) | `undefined` | Override the incremental end date to re-process a specific range |
196
+
197
+ **`eventParamsToColumns`** — each item in the array is an object:
198
+
199
+ | Field | Type | Required | Description |
200
+ |-------|------|----------|-------------|
201
+ | `name` | string | Yes | Event parameter name |
202
+ | `type` | string | No | Data type: `'string'`, `'int'`, `'int64'`, `'double'`, `'float'`, or `'float64'`. If omitted, returns the value converted to a string |
203
+ | `columnName` | string | No | Column name in the output. Defaults to the parameter `name` |
204
+
205
+ Date fields (`dateRangeStart`, `dateRangeEnd`, etc.) accept string dates in `YYYYMMDD` or `YYYY-MM-DD` format, or BigQuery SQL expressions (e.g. `'current_date()'`, `'date(2026, 1, 1)'`).
206
+
94
207
  ### Helpers
95
208
 
96
- The helpers contain templates for common SQL expression needed when working with GA4 data.
209
+ The helpers contain templates for common SQL expressions. The functions are referenced by **ga4EventsEnhanced** but can also be imported as utility functions for working with GA4 data.
97
210
 
98
211
  ```javascript
99
212
  const { helpers } = require('ga4-export-fixer');
100
-
101
- // Unnest event parameters, date filters, URL extraction, session aggregation, etc.
102
- helpers.unnestEventParam('page_location', 'string');
103
- helpers.ga4ExportDateFilter('daily', 'current_date()-7', 'current_date()');
104
- helpers.extractPageDetails();
105
213
  ```
106
214
 
215
+ #### SQL Templates
216
+
217
+ | Name | Example | Description |
218
+ |------|---------|-------------|
219
+ | `eventDate` | `helpers.eventDate` | Casts `event_date` string to a DATE using YYYYMMDD format |
220
+ | `sessionId` | `helpers.sessionId` | Builds a session ID by concatenating `user_pseudo_id` and `ga_session_id` |
221
+
222
+ #### Functions
223
+
224
+ **Unnesting parameters**
225
+
226
+ | Function | Example | Description |
227
+ |----------|---------|-------------|
228
+ | `unnestEventParam` | `unnestEventParam('page_location', 'string')` | Extracts a value from the `event_params` array by key. Supported types: `'string'`, `'int'`, `'int64'`, `'double'`, `'float'`, `'float64'`. Omit the type to get the value converted to a string |
229
+
230
+ **Date and time**
231
+
232
+ | Function | Example | Description |
233
+ |----------|---------|-------------|
234
+ | `getEventTimestampMicros` | `getEventTimestampMicros('custom_ts')` | Returns SQL for event timestamp in microseconds. With a custom parameter, uses it (converted from ms) with fallback to `event_timestamp` |
235
+ | `getEventDateTime` | `getEventDateTime({ timezone: 'Europe/Helsinki' })` | Returns SQL for event datetime in the given timezone. Defaults to `'Etc/UTC'` |
236
+
237
+ **Date filters**
238
+
239
+ | Function | Example | Description |
240
+ |----------|---------|-------------|
241
+ | `ga4ExportDateFilter` | `ga4ExportDateFilter('daily', 'current_date()-7', 'current_date()')` | Generates a `_table_suffix` filter for a single export type (`'daily'` or `'intraday'`) and date range |
242
+
243
+ **Page details**
244
+
245
+ | Function | Example | Description |
246
+ |----------|---------|-------------|
247
+ | `extractUrlHostname` | `extractUrlHostname('page_location')` | Extracts hostname from a URL column |
248
+ | `extractUrlPath` | `extractUrlPath('page_location')` | Extracts the path component from a URL column |
249
+ | `extractUrlQuery` | `extractUrlQuery('page_location')` | Extracts the query string (including `?`) from a URL column |
250
+ | `extractUrlQueryParams` | `extractUrlQueryParams('page_location')` | Parses URL query parameters into `ARRAY<STRUCT<key STRING, value STRING>>` |
251
+ | `extractPageDetails` | `extractPageDetails()` | Returns a struct with `hostname`, `path`, `query`, and `query_params`. Defaults to the `page_location` event parameter |
252
+
253
+ **Aggregation**
254
+
255
+ | Function | Example | Description |
256
+ |----------|---------|-------------|
257
+ | `aggregateValue` | `aggregateValue('user_id', 'last', 'event_timestamp')` | Aggregates a column using `'max'`, `'min'`, `'first'`, `'last'`, or `'any'`. `'first'` and `'last'` use the timestamp column for ordering |
258
+
259
+ **Ecommerce**
260
+
261
+ | Function | Example | Description |
262
+ |----------|---------|-------------|
263
+ | `fixEcommerceStruct` | `fixEcommerceStruct()` | Cleans the ecommerce struct: sets `transaction_id` to null when `'(not set)'`, and fixes missing/NaN `purchase_revenue` for purchase events |
264
+
265
+ **Data freshness**
266
+
267
+ | Function | Example | Description |
268
+ |----------|---------|-------------|
269
+ | `isFinalData` | `isFinalData('DAY_THRESHOLD', 4)` | Returns SQL that evaluates to `true` when data is final. `'EXPORT_TYPE'` checks table suffix; `'DAY_THRESHOLD'` uses days since event (`dayThreshold` is required and must be a non-negative integer) |
270
+
107
271
  ## License
108
272
 
109
273
  MIT
package/helpers.js CHANGED
@@ -634,19 +634,21 @@ const isFinalData = (detectionMethod, dayThreshold) => {
634
634
  throw new Error(`isFinalData: Unsupported detectionMethod '${detectionMethod}'. Supported values are 'EXPORT_TYPE' and 'DAY_THRESHOLD'.`);
635
635
  }
636
636
 
637
- if (typeof dayThreshold !== 'undefined' && (typeof dayThreshold !== 'number' || isNaN(dayThreshold))) {
638
- throw new Error("isFinalData: 'dayThreshold' must be a number if provided.");
637
+ if (detectionMethod === 'DAY_THRESHOLD') {
638
+ if (typeof dayThreshold === 'undefined') {
639
+ throw new Error("isFinalData: 'dayThreshold' is required when using 'DAY_THRESHOLD' detectionMethod.");
640
+ }
641
+ if (!Number.isInteger(dayThreshold) || dayThreshold < 0) {
642
+ throw new Error("isFinalData: 'dayThreshold' must be an integer greater than or equal to 0 when using 'DAY_THRESHOLD' detectionMethod.");
643
+ }
639
644
  }
640
645
 
641
- const defaultDayThreshold = 3;
642
- const threshold = typeof dayThreshold !== 'undefined' ? dayThreshold : defaultDayThreshold;
643
-
644
646
  if (detectionMethod === 'EXPORT_TYPE') {
645
647
  return 'if(_table_suffix like \'intraday_%\' or _table_suffix like \'fresh_%\', false, true)';
646
648
  }
647
649
 
648
650
  if (detectionMethod === 'DAY_THRESHOLD') {
649
- return `if(date_diff(current_date(), cast(event_date as date format 'YYYYMMDD'), day) > ${threshold}, true, false)`;
651
+ return `if(date_diff(current_date(), cast(event_date as date format 'YYYYMMDD'), day) > ${dayThreshold}, true, false)`;
650
652
  }
651
653
  };
652
654
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "ga4-export-fixer",
3
- "version": "0.1.1",
3
+ "version": "0.1.3-dev.0",
4
4
  "description": "",
5
5
  "main": "index.js",
6
6
  "files": [
@@ -13,7 +13,8 @@
13
13
  ],
14
14
  "scripts": {
15
15
  "test": "node tests/ga4EventsEnhanced.test.js",
16
- "test:events": "node tests/ga4EventsEnhanced.test.js"
16
+ "test:events": "node tests/ga4EventsEnhanced.test.js",
17
+ "prepublishOnly": "node scripts/updateReadme.js"
17
18
  },
18
19
  "repository": {
19
20
  "type": "git",
@@ -39,13 +39,13 @@ const defaultConfig = {
39
39
  // this is useful if you want to re-process only a specific date range
40
40
  incrementalStartOverride: undefined,
41
41
  incrementalEndOverride: undefined,
42
- numberOfPreviousDaysToScan: 5,
42
+ numberOfPreviousDaysToScan: 10,
43
43
  },
44
44
  // these parameters are excluded by default because they've been made available in other columns
45
45
  defaultExcludedEventParams: [
46
46
  'page_location',
47
47
  'ga_session_id',
48
- //'custom_event_timestamp', // poistetaan, jos käytössä
48
+ //'custom_event_timestamp', // removed if customTimestampParam is used
49
49
  ],
50
50
  excludedEventParams: [],
51
51
  eventParamsToColumns: [
@@ -57,12 +57,13 @@ const defaultConfig = {
57
57
  'first_visit'
58
58
  ],
59
59
  excludedEvents: [],
60
- // exclude these columns when extracting raw data from the export tables
61
- excludedColumns: [
60
+ defaultExcludedColumns: [
62
61
  'event_dimensions', // legacy column, not needed
63
62
  'traffic_source', // renamed to user_traffic_source
64
63
  'session_id'
65
64
  ],
65
+ // exclude these columns when extracting raw data from the export tables
66
+ excludedColumns: [],
66
67
  };
67
68
 
68
69
  // List the columns in the order they should be in the final table
@@ -227,8 +228,9 @@ const generateEnhancedEventsSQL = (config) => {
227
228
  };
228
229
 
229
230
  const getExcludedColumns = () => {
231
+ const allExcludedColumns = utils.mergeUniqueArrays(mergedConfig.defaultExcludedColumns, mergedConfig.excludedColumns);
230
232
  const excludedColumns = {};
231
- mergedConfig.excludedColumns.forEach(c => {
233
+ allExcludedColumns.forEach(c => {
232
234
  excludedColumns[c] = undefined;
233
235
  });
234
236
  return excludedColumns;
package/utils.js CHANGED
@@ -105,6 +105,7 @@ const mergeSQLConfigurations = (defaultConfig, inputConfig = {}) => {
105
105
  return defaultConfig;
106
106
  }
107
107
 
108
+ // the merged configuration object
108
109
  const result = { ...defaultConfig };
109
110
 
110
111
  for (const key in inputConfig) {
@@ -170,6 +171,27 @@ const mergeSQLConfigurations = (defaultConfig, inputConfig = {}) => {
170
171
  }
171
172
  }
172
173
 
174
+ // support different formats for passing the sourceTable path
175
+ const fixSourceTable = (sourceTable) => {
176
+ if (isDataformTableReferenceObject(sourceTable)) {
177
+ return sourceTable;
178
+ }
179
+ if (typeof sourceTable === 'string') {
180
+ const tablePath = sourceTable.replace(/[`"']/g, '').trim();
181
+ if (/^[a-zA-Z0-9-]+\.[a-zA-Z0-9_]+(\.[^\.]+)?$/.test(tablePath)) {
182
+ const project = tablePath.split('.')[0];
183
+ const dataset = tablePath.split('.')[1];
184
+ return `\`${project}.${dataset}.events_*\``;
185
+ }
186
+ }
187
+ throw new Error(`sourceTable must be a Dataform table reference or a string in the format '\`project.dataset.table\`'. Received: ${JSON.stringify(sourceTable)}`);
188
+ };
189
+
190
+ // process the sourceTable to support different formats
191
+ if (result.sourceTable) {
192
+ result.sourceTable = fixSourceTable(result.sourceTable);
193
+ }
194
+
173
195
  return result;
174
196
  };
175
197
 
@@ -178,15 +200,15 @@ const mergeSQLConfigurations = (defaultConfig, inputConfig = {}) => {
178
200
  *
179
201
  * A Dataform table reference object is expected to have the properties: 'name', 'project', and 'dataset'.
180
202
  *
181
- * @param {Object} table - The object to check.
203
+ * @param {Object} obj - The object to check.
182
204
  * @returns {boolean} True if the object is a Dataform table reference, false otherwise.
183
205
  */
184
- const isDataformTableReferenceObject = (table) => {
185
- return table &&
186
- typeof table === 'object' &&
187
- Object.hasOwn(table, 'name') &&
188
- Object.hasOwn(table, 'project') &&
189
- Object.hasOwn(table, 'dataset');
206
+ const isDataformTableReferenceObject = (obj) => {
207
+ return obj &&
208
+ typeof obj === 'object' &&
209
+ Object.hasOwn(obj, 'name') &&
210
+ Object.hasOwn(obj, 'project') &&
211
+ Object.hasOwn(obj, 'dataset');
190
212
  };
191
213
 
192
214