ga4-export-fixer 0.4.1 → 0.4.2-dev.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,158 @@
1
+ const { unnestEventParam } = require('./params');
2
+
3
+ /*
4
+ Common identifiers
5
+ */
6
+
7
// SQL expression for a session identifier: user_pseudo_id concatenated with the
// int_value of the 'ga_session_id' entry in event_params.
const sessionId = "concat(user_pseudo_id, (select value.int_value from unnest(event_params) where key = 'ga_session_id'))";
8
+
9
+ /*
10
+ Ecommerce
11
+ */
12
+
13
/**
 * Builds the SQL expression that yields a cleaned copy of the GA4 `ecommerce` struct.
 *
 * The expression is a `select as struct ecommerce.* replace(...)` that overrides two fields
 * and passes every other field of `ecommerce` through untouched:
 *  - `transaction_id`: kept only when it differs from the placeholder '(not set)', otherwise NULL.
 *  - `purchase_revenue`:
 *      * when event_name = 'purchase': the first non-null of
 *          1. ecommerce.purchase_revenue with NaN turned into NULL,
 *          2. the 'value' event parameter (via unnestEventParam from './params') safe_cast to float64;
 *      * for every other event name: NULL.
 *
 * @returns {string} SQL snippet usable as a column expression in a SELECT list.
 *
 * @example
 * `select ${fixEcommerceStruct()} as ecommerce from ...`
 */
const fixEcommerceStruct = () => {
    // Fallback revenue source for rows where purchase_revenue is missing.
    const valueParamSql = unnestEventParam('value');

    // The SQL is assembled line by line; the newlines matter because the statement
    // contains `--` line comments.
    const sqlLines = [
        "(select as struct ecommerce.* replace(",
        "if(ecommerce.transaction_id <> '(not set)', ecommerce.transaction_id, null) as transaction_id,",
        "if(",
        "event_name = 'purchase',",
        "coalesce(",
        "-- fix possible NaN values",
        "if(is_nan(ecommerce.purchase_revenue), null, ecommerce.purchase_revenue),",
        "-- fix an old ga4 bug where purchase_revenue was missing",
        `safe_cast(${valueParamSql} as float64)`,
        "),",
        "null",
        ") as purchase_revenue",
        "))",
    ];

    return sqlLines.join('\n');
};
46
+
47
+ /*
48
+ Check if GA4 data is "final" and is not expected to change anymore
49
+ */
50
+
51
/**
 * Generates a SQL expression that tells whether GA4 export data can be considered "final"
 * (not expected to change anymore).
 *
 * Two detection methods are supported:
 *  - 'EXPORT_TYPE': inspects `_table_suffix`; FALSE for suffixes matching 'intraday_%' or
 *    'fresh_%', TRUE otherwise. `dayThreshold` is ignored (and not validated) for this method.
 *  - 'DAY_THRESHOLD': TRUE when more than `dayThreshold` days lie between `event_date`
 *    (parsed as 'YYYYMMDD') and `current_date()`.
 *
 * @param {'EXPORT_TYPE'|'DAY_THRESHOLD'} detectionMethod - The method used to decide finality.
 * @param {number} [dayThreshold] - Required when detectionMethod is 'DAY_THRESHOLD' (there is
 *   no default); must be an integer >= 0.
 * @returns {string} SQL expression that evaluates to TRUE if the data is final, otherwise FALSE.
 *
 * @throws {Error} If detectionMethod is neither 'EXPORT_TYPE' nor 'DAY_THRESHOLD'.
 * @throws {Error} If detectionMethod is 'DAY_THRESHOLD' and dayThreshold is undefined,
 *   not an integer, or negative.
 *
 * @example
 * // Checks based on export type
 * isFinalData('EXPORT_TYPE')
 * // => "if(_table_suffix like 'intraday_%' or _table_suffix like 'fresh_%', false, true)"
 *
 * // Checks using a custom day threshold
 * isFinalData('DAY_THRESHOLD', 5)
 * // => "if(date_diff(current_date(), cast(event_date as date format 'YYYYMMDD'), day) > 5, true, false)"
 */
const isFinalData = (detectionMethod, dayThreshold) => {
    if (detectionMethod === 'EXPORT_TYPE') {
        return "if(_table_suffix like 'intraday_%' or _table_suffix like 'fresh_%', false, true)";
    }

    if (detectionMethod === 'DAY_THRESHOLD') {
        if (typeof dayThreshold === 'undefined') {
            throw new Error("isFinalData: 'dayThreshold' is required when using 'DAY_THRESHOLD' detectionMethod.");
        }
        // The integer check also guarantees that the value interpolated into the SQL is a plain number.
        if (!Number.isInteger(dayThreshold) || dayThreshold < 0) {
            throw new Error("isFinalData: 'dayThreshold' must be an integer greater than or equal to 0 when using 'DAY_THRESHOLD' detectionMethod.");
        }
        return `if(date_diff(current_date(), cast(event_date as date format 'YYYYMMDD'), day) > ${dayThreshold}, true, false)`;
    }

    throw new Error(`isFinalData: Unsupported detectionMethod '${detectionMethod}'. Supported values are 'EXPORT_TYPE' and 'DAY_THRESHOLD'.`);
};
97
+
98
/**
 * Tells whether a column name belongs to the known GA4 BigQuery export columns.
 *
 * The comparison is an exact, case-sensitive match against the top-level column names
 * listed below (list last updated 2026-02-18).
 *
 * @param {string} columnName - The column name to look up.
 * @returns {boolean} True when columnName is one of the listed GA4 export columns, otherwise false.
 */
const isGa4ExportColumn = (columnName) => {
    // list updated 2026-02-18
    const knownColumns = new Set([
        'event_date',
        'event_timestamp',
        'event_name',
        'event_params',
        'event_previous_timestamp',
        'event_value_in_usd',
        'event_bundle_sequence_id',
        'event_server_timestamp_offset',
        'user_id',
        'user_pseudo_id',
        'privacy_info',
        'user_properties',
        'user_first_touch_timestamp',
        'user_ltv',
        'device',
        'geo',
        'app_info',
        'traffic_source',
        'stream_id',
        'platform',
        'event_dimensions',
        'ecommerce',
        'items',
        'collected_traffic_source',
        'is_active_user',
        'batch_event_index',
        'batch_page_id',
        'batch_ordering_id',
        'session_traffic_source_last_click',
        'publisher',
    ]);

    return knownColumns.has(columnName);
};
143
+
144
/**
 * Generates a SQL CASE expression that classifies a GA4 export table suffix.
 *
 * The expression yields:
 *  - 'intraday' when the suffix matches 'intraday_%',
 *  - 'fresh' when the suffix matches 'fresh_%',
 *  - 'daily' when the suffix consists of exactly eight digits.
 * There is no ELSE branch, so any other suffix produces no value from the CASE.
 *
 * @param {string} tableSuffix - SQL expression or column reference holding the table suffix.
 * @returns {string} SQL CASE expression.
 */
const getGa4ExportType = (tableSuffix) => {
    const branches = [
        `when ${tableSuffix} like 'intraday_%' then 'intraday'`,
        `when ${tableSuffix} like 'fresh_%' then 'fresh'`,
        `when regexp_contains(${tableSuffix}, r'^\\d{8}$') then 'daily'`,
    ];

    return ['case', ...branches, 'end'].join('\n');
};
151
+
152
+ module.exports = {
153
+ sessionId,
154
+ fixEcommerceStruct,
155
+ isFinalData,
156
+ isGa4ExportColumn,
157
+ getGa4ExportType
158
+ };
@@ -0,0 +1,8 @@
1
+ module.exports = {
2
+ ...require('./params'),
3
+ ...require('./dateTime'),
4
+ ...require('./dateFilters'),
5
+ ...require('./urlParsing'),
6
+ ...require('./aggregation'),
7
+ ...require('./ga4Transforms')
8
+ };
@@ -0,0 +1,48 @@
1
+ /*
2
+ Unnesting parameters
3
+ */
4
+
5
/**
 * Builds a scalar subquery that reads one parameter out of a GA4-style params array.
 *
 * @param {string} keyName - Parameter key to look up. It is embedded as a SQL string literal;
 *   backslashes and single quotes are escaped so the key cannot break out of the literal.
 * @param {string} paramsArray - SQL expression or column reference of the array to unnest
 *   (inserted into the SQL as-is).
 * @param {string} [dataType] - Which value column to read:
 *   'string' -> value.string_value, 'int'/'int64' -> value.int_value,
 *   'double' -> value.double_value, 'float'/'float64' -> value.float_value.
 *   When omitted (or otherwise falsy), the first non-null of the four value columns is
 *   returned, cast to string.
 * @returns {string} SQL subquery wrapped in parentheses.
 * @throws {Error} If keyName or paramsArray is not a non-empty string, or dataType is unsupported.
 */
const unnestParam = (keyName, paramsArray, dataType) => {
    if (typeof keyName !== 'string' || keyName.trim() === '') {
        throw new Error("unnestParam: 'keyName' is required and must be a non-empty string.");
    }
    if (typeof paramsArray !== 'string' || paramsArray.trim() === '') {
        throw new Error("unnestParam: 'paramsArray' is required and must be a non-empty string.");
    }

    // Escape backslashes first, then single quotes, so the key stays inside its SQL string literal.
    const escapedKey = keyName.replace(/\\/g, '\\\\').replace(/'/g, "\\'");
    const fromClause = `from unnest(${paramsArray}) where key = '${escapedKey}'`;

    if (!dataType) {
        // return the value from the column that has data, cast as string
        return `(select coalesce(value.string_value, cast(value.int_value as string), cast(value.double_value as string), cast(value.float_value as string)) ${fromClause})`;
    }

    // A Map (not a plain object) so that names like 'constructor' are not mistaken for a supported type.
    const valueColumns = new Map([
        ['string', 'string_value'],
        ['int', 'int_value'],
        ['int64', 'int_value'],
        ['double', 'double_value'],
        ['float', 'float_value'],
        ['float64', 'float_value'],
    ]);
    const valueColumn = valueColumns.get(dataType);

    if (valueColumn === undefined) {
        throw new Error(`unnestParam: Unsupported dataType '${dataType}'. Supported values are 'string', 'int', 'int64', 'double', 'float', and 'float64'.`);
    }

    // return the value from the selected column
    return `(select value.${valueColumn} ${fromClause})`;
};
32
+
33
+ // event_params and session_params
34
+
35
/**
 * Unnests a parameter from the `event_params` array.
 *
 * @param {string} keyName - Parameter key to look up.
 * @param {string} [dataType] - Optional value type, forwarded unchanged to unnestParam.
 * @returns {string} SQL subquery produced by unnestParam for 'event_params'.
 */
const unnestEventParam = (keyName, dataType) => unnestParam(keyName, 'event_params', dataType);
39
+
40
/**
 * Unnests a parameter from the `session_params` array.
 *
 * @param {string} keyName - Parameter key to look up.
 * @param {string} [dataType] - Optional value type, forwarded unchanged to unnestParam.
 * @returns {string} SQL subquery produced by unnestParam for 'session_params'.
 */
const unnestSessionParam = (keyName, dataType) => unnestParam(keyName, 'session_params', dataType);
44
+
45
+ module.exports = {
46
+ unnestEventParam,
47
+ unnestSessionParam
48
+ };
@@ -0,0 +1,155 @@
1
+ const { unnestEventParam } = require('./params');
2
+
3
+ /*
4
+ Page details
5
+ */
6
+
7
/**
 * Generates a SQL expression to extract the hostname from a URL.
 *
 * The returned BigQuery SQL string:
 *  1. Removes a leading 'http://' or 'https://' from the URL using regexp_replace.
 *  2. Extracts the leading run of characters up to (not including) the first '/', '?' or '#'
 *     using regexp_extract, so a URL without a path (e.g. 'https://example.com?x=1') does not
 *     leak its query string or fragment into the hostname.
 *
 * A port, if present, is not stripped (it contains none of the stop characters).
 *
 * Example usage (in SQL context):
 *   SELECT ${extractUrlHostname('my_url_column')} AS hostname
 *
 * @param {string} url - The SQL expression or column reference containing the URL.
 * @returns {string} - BigQuery SQL expression for extracting the hostname from the input URL.
 */
const extractUrlHostname = (url) => {
    return [
        "regexp_extract(",
        "regexp_replace(",
        `${url},`,
        "r'^https?://',",
        "''",
        "),",
        // stop at the first path, query or fragment delimiter
        "r'^[^/?#]+'",
        ")",
    ].join('\n');
};
30
+
31
/**
 * Generates a SQL expression to extract the path component from a URL.
 *
 * The returned BigQuery SQL string:
 *  1. Removes the scheme and host ('http(s)://' followed by everything up to the first '/',
 *     '?' or '#') using regexp_replace. Stopping at '?' and '#' as well as '/' prevents a
 *     slash inside the query string (e.g. 'https://example.com?next=/foo') from being
 *     mistaken for the start of the path.
 *  2. Removes everything from the first '?' or '#' onwards.
 *  3. Trims whitespace from the result.
 *
 * Example usage (in SQL context):
 *   SELECT ${extractUrlPath('my_url_column')} AS path
 *
 * @param {string} url - The SQL expression or column reference containing the URL.
 * @returns {string} - BigQuery SQL expression for extracting the path component from the input URL.
 */
const extractUrlPath = (url) => {
    return [
        "trim(",
        "regexp_replace(",
        "regexp_replace(",
        `${url},`,
        // scheme + host; the host ends at the first '/', '?' or '#'
        "r'^https?://[^/?#]+',",
        "''",
        "),",
        // query string and fragment
        "r'[\\?#].*',",
        "''",
        ")",
        ")",
    ].join('\n');
};
58
+
59
/**
 * Generates a SQL expression to extract the query component from a URL.
 *
 * The returned BigQuery SQL string applies regexp_extract with a pattern that matches a '?'
 * followed by one or more characters other than '#', and wraps the result in trim().
 * The extracted text therefore includes the leading '?' and stops before any fragment.
 *
 * Example usage (in SQL context):
 *   SELECT ${extractUrlQuery('my_url_column')} AS url_query
 *
 * @param {string} url - The SQL expression or column reference containing the URL.
 * @returns {string} - BigQuery SQL expression for extracting the query string (with leading '?').
 */
const extractUrlQuery = (url) => {
    // '?' followed by at least one non-'#' character
    const queryPattern = "r'\\?[^#]+'";
    const extracted = `regexp_extract(${url}, ${queryPattern})`;
    return `trim(${extracted})`;
};
75
+
76
/**
 * Generates a SQL expression to parse the query parameters of a URL into an array of structs
 * (key-value pairs).
 *
 * The generated SQL:
 *  1. Extracts the query string from the URL using {@link extractUrlQuery}.
 *  2. Strips the leading '?' that extractUrlQuery keeps, so the first key is not returned
 *     as '?key'.
 *  3. Splits the query string on '&' to separate individual key-value pairs.
 *  4. Splits each pair on '=' and takes element 0 as the key and element 1 as the value
 *     (safe_offset, so a pair without '=' yields a NULL value).
 *  5. Returns an array of STRUCTs with fields "key" and "value".
 *
 * Example usage (in SQL context):
 *   SELECT ${extractUrlQueryParams('my_url_column')} AS query_params
 *
 * Output schema:
 *   ARRAY<STRUCT<key STRING, value STRING>>
 *
 * @param {string} url - The SQL expression or column reference containing the URL.
 * @returns {string} - BigQuery SQL expression producing an array of key/value structs for the query parameters.
 */
const extractUrlQueryParams = (url) => {
    return [
        "array(",
        "(",
        "select",
        "as struct split(keyval, '=') [safe_offset(0)] as key,",
        "split(keyval, '=') [safe_offset(1)] as value",
        "from",
        "unnest(",
        "split(",
        // drop the leading '?' so it does not end up in the first key
        `ltrim(${extractUrlQuery(url)}, '?'),`,
        "'&'",
        ")",
        ") as keyval",
        ")",
        ")",
    ].join('\n');
};
110
+
111
/**
 * Generates a SQL expression that bundles page details derived from a URL into one struct.
 *
 * The struct is assembled from the sibling helpers in this module:
 *  - hostname:     extractUrlHostname(url)
 *  - path:         extractUrlPath(url)
 *  - query:        extractUrlQuery(url)
 *  - query_params: extractUrlQueryParams(url)
 *
 * When `url` is omitted (or otherwise falsy), the URL expression defaults to the
 * 'page_location' event parameter read as a string via unnestEventParam.
 *
 * Example usage (in SQL context):
 *   SELECT ${extractPageDetails('my_url_column')} AS page_details
 *
 * Output schema (STRUCT):
 *   {
 *     hostname: STRING,
 *     path: STRING,
 *     query: STRING,
 *     query_params: ARRAY<STRUCT<key STRING, value STRING>>
 *   }
 *
 * @param {string} [url] - (Optional) SQL expression or column reference for the URL.
 * @returns {string} BigQuery SQL expression yielding a STRUCT of hostname, path, query, and query_params.
 */
const extractPageDetails = (url) => {
    // Falsy input (undefined, '', ...) falls back to the page_location event parameter.
    const urlExpression = url || unnestEventParam('page_location', 'string');

    const structFields = [
        `${extractUrlHostname(urlExpression)} as hostname,`,
        `${extractUrlPath(urlExpression)} as path,`,
        `${extractUrlQuery(urlExpression)} as query,`,
        `${extractUrlQueryParams(urlExpression)} as query_params`,
    ];

    return ['(select as struct', ...structFields, ')'].join('\n');
};
148
+
149
+ module.exports = {
150
+ extractUrlHostname,
151
+ extractUrlPath,
152
+ extractUrlQuery,
153
+ extractUrlQueryParams,
154
+ extractPageDetails
155
+ };
package/index.js CHANGED
@@ -1,4 +1,4 @@
1
- const helpers = require('./helpers.js');
1
+ const helpers = require('./helpers/index.js');
2
2
  const ga4EventsEnhanced = require('./tables/ga4EventsEnhanced.js');
3
3
  const preOperations = require('./preOperations.js');
4
4
  const { validateBaseConfig, validateEnhancedEventsConfig } = require('./inputValidation.js');
package/package.json CHANGED
@@ -1,11 +1,11 @@
1
1
  {
2
2
  "name": "ga4-export-fixer",
3
- "version": "0.4.1",
3
+ "version": "0.4.2-dev.0",
4
4
  "description": "",
5
5
  "main": "index.js",
6
6
  "files": [
7
7
  "index.js",
8
- "helpers.js",
8
+ "helpers",
9
9
  "utils.js",
10
10
  "preOperations.js",
11
11
  "constants.js",
@@ -1,4 +1,4 @@
1
- const helpers = require('../helpers.js');
1
+ const helpers = require('../helpers/index.js');
2
2
  const utils = require('../utils.js');
3
3
  const inputValidation = require('../inputValidation.js');
4
4
  const constants = require('../constants.js');