ga4-export-fixer 0.4.1 → 0.4.2-dev.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,166 @@
1
+ const { unnestEventParam } = require('./params');
2
+
3
/**
 * SQL expression for a session identifier.
 *
 * The expression concatenates `user_pseudo_id` with the integer value of the
 * `ga_session_id` event parameter, looked up through a scalar subquery over `event_params`.
 */
const sessionId = [
  'concat(user_pseudo_id, ',
  "(select value.int_value from unnest(event_params) where key = 'ga_session_id')",
  ')',
].join('');
7
+
8
+ /*
9
+ Ecommerce
10
+ */
11
+
12
/**
 * Builds the SQL expression that returns a cleaned copy of the GA4 `ecommerce` struct.
 *
 * The generated `SELECT AS STRUCT ecommerce.* REPLACE(...)` keeps every field of `ecommerce`
 * and overrides two of them:
 * - `transaction_id`: kept only when it differs from the placeholder '(not set)', otherwise NULL.
 * - `purchase_revenue`: for 'purchase' events, the first non-null of
 *     1. `ecommerce.purchase_revenue` with NaN turned into NULL, and
 *     2. the `value` event parameter safely cast to FLOAT64 (covers an old GA4 bug where
 *        purchase_revenue was missing);
 *   for every other event the field is NULL.
 *
 * @returns {string} SQL snippet usable in a SELECT list.
 *
 * @example
 * fixEcommerceStruct()
 * // => "(select as struct ecommerce.* replace( ... ))"
 */
const fixEcommerceStruct = () => {
  const valueParam = unnestEventParam('value');

  // The '--' entries are SQL line comments; each must stay on its own line of the output.
  const sqlLines = [
    '(select as struct ecommerce.* replace(',
    "if(ecommerce.transaction_id <> '(not set)', ecommerce.transaction_id, null) as transaction_id,",
    'if(',
    "event_name = 'purchase',",
    'coalesce(',
    '-- fix possible NaN values',
    'if(is_nan(ecommerce.purchase_revenue), null, ecommerce.purchase_revenue),',
    '-- fix an old ga4 bug where purchase_revenue was missing',
    `safe_cast(${valueParam} as float64)`,
    '),',
    'null',
    ') as purchase_revenue',
    '))',
  ];

  return sqlLines.join('\n');
};
45
+
46
+ /*
47
+ Check if GA4 data is "final" and is not expected to change anymore
48
+ */
49
+
50
/**
 * Returns a SQL boolean expression telling whether a GA4 export row is "final",
 * i.e. no longer expected to change.
 *
 * Detection methods:
 * - 'EXPORT_TYPE': FALSE when `_table_suffix` matches 'intraday_%' or 'fresh_%', TRUE otherwise.
 * - 'DAY_THRESHOLD': TRUE when strictly more than `dayThreshold` days lie between
 *   `current_date()` and `event_date` (parsed as 'YYYYMMDD').
 *
 * @param {'EXPORT_TYPE'|'DAY_THRESHOLD'} detectionMethod - How finality is decided.
 * @param {number} [dayThreshold] - Non-negative integer number of days; required for 'DAY_THRESHOLD'
 *   and ignored for 'EXPORT_TYPE'.
 * @returns {string} SQL expression evaluating to TRUE for final data, FALSE otherwise.
 *
 * @throws {Error} If detectionMethod is not one of the supported values.
 * @throws {Error} If 'DAY_THRESHOLD' is used and dayThreshold is undefined, not an integer, or negative.
 *
 * @example
 * isFinalData('EXPORT_TYPE')
 * // => "if(_table_suffix like 'intraday_%' or _table_suffix like 'fresh_%', false, true)"
 *
 * isFinalData('DAY_THRESHOLD', 5)
 * // => "if(date_diff(current_date(), cast(event_date as date format 'YYYYMMDD'), day) > 5, true, false)"
 */
const isFinalData = (detectionMethod, dayThreshold) => {
  switch (detectionMethod) {
    case 'EXPORT_TYPE':
      return "if(_table_suffix like 'intraday_%' or _table_suffix like 'fresh_%', false, true)";

    case 'DAY_THRESHOLD': {
      if (dayThreshold === undefined) {
        throw new Error("isFinalData: 'dayThreshold' is required when using 'DAY_THRESHOLD' detectionMethod.");
      }

      const isNonNegativeInteger = Number.isInteger(dayThreshold) && dayThreshold >= 0;
      if (!isNonNegativeInteger) {
        throw new Error("isFinalData: 'dayThreshold' must be an integer greater than or equal to 0 when using 'DAY_THRESHOLD' detectionMethod.");
      }

      return `if(date_diff(current_date(), cast(event_date as date format 'YYYYMMDD'), day) > ${dayThreshold}, true, false)`;
    }

    default:
      throw new Error(`isFinalData: Unsupported detectionMethod '${detectionMethod}'. Supported values are 'EXPORT_TYPE' and 'DAY_THRESHOLD'.`);
  }
};
96
+
97
// Column names recognized as part of the GA4 BigQuery export (list updated 2026-02-18).
const GA4_EXPORT_COLUMNS = new Set([
  'event_date',
  'event_timestamp',
  'event_name',
  'event_params',
  'event_previous_timestamp',
  'event_value_in_usd',
  'event_bundle_sequence_id',
  'event_server_timestamp_offset',
  'user_id',
  'user_pseudo_id',
  'privacy_info',
  'user_properties',
  'user_first_touch_timestamp',
  'user_ltv',
  'device',
  'geo',
  'app_info',
  'traffic_source',
  'stream_id',
  'platform',
  'event_dimensions',
  'ecommerce',
  'items',
  'collected_traffic_source',
  'is_active_user',
  'batch_event_index',
  'batch_page_id',
  'batch_ordering_id',
  'session_traffic_source_last_click',
  'publisher',
]);

/**
 * Tells whether a column name belongs to the known GA4 BigQuery export columns.
 *
 * The comparison is an exact, case-sensitive match against the list above; any value
 * that is not one of the listed strings (including non-string values) yields false.
 *
 * @param {string} columnName - The column name to check.
 * @returns {boolean} True if the name is a GA4 export column, otherwise false.
 */
const isGa4ExportColumn = (columnName) => GA4_EXPORT_COLUMNS.has(columnName);
142
+
143
/**
 * Builds a SQL CASE expression that classifies a GA4 export table by its suffix.
 *
 * Branches, checked in order:
 * - suffix like 'intraday_%'        -> 'intraday'
 * - suffix like 'fresh_%'           -> 'fresh'
 * - suffix matching exactly 8 digits (YYYYMMDD) -> 'daily'
 *
 * The CASE has no ELSE branch, so any other suffix produces NULL in SQL.
 *
 * @param {string} tableSuffix - SQL expression or column reference for the table suffix (e.g., '_table_suffix').
 *   It is interpolated into the SQL as-is.
 * @returns {string} SQL CASE expression evaluating to 'intraday', 'fresh', or 'daily'.
 */
const getGa4ExportType = (tableSuffix) => {
  const branches = [
    `when ${tableSuffix} like 'intraday_%' then 'intraday'`,
    `when ${tableSuffix} like 'fresh_%' then 'fresh'`,
    `when regexp_contains(${tableSuffix}, r'^\\d{8}$') then 'daily'`,
  ];

  return ['case', ...branches, 'end'].join('\n');
};
159
+
160
+ module.exports = {
161
+ sessionId,
162
+ fixEcommerceStruct,
163
+ isFinalData,
164
+ isGa4ExportColumn,
165
+ getGa4ExportType
166
+ };
@@ -0,0 +1,8 @@
1
+ module.exports = {
2
+ ...require('./params'),
3
+ ...require('./dateTime'),
4
+ ...require('./dateFilters'),
5
+ ...require('./urlParsing'),
6
+ ...require('./aggregation'),
7
+ ...require('./ga4Transforms')
8
+ };
@@ -0,0 +1,77 @@
1
+ /*
2
+ Unnesting parameters
3
+ */
4
+
5
/**
 * Generates a SQL scalar subquery that extracts a value from a parameter array by key.
 *
 * When a dataType is provided, the value is read from the corresponding typed column
 * (`value.string_value`, `value.int_value`, `value.double_value` or `value.float_value`).
 * When it is omitted (or falsy), a coalesce across all four value columns is returned,
 * with the non-string columns cast to string.
 *
 * The key is embedded as a single-quoted SQL string literal. Backslashes and single quotes
 * in `keyName` are escaped so that keys containing them cannot break out of the literal.
 * `paramsArray` is a SQL expression and is interpolated as-is.
 *
 * @param {string} keyName - The parameter key to look up in the array.
 * @param {string} paramsArray - The SQL expression for the parameter array to unnest (e.g., 'event_params').
 * @param {string} [dataType] - Optional data type: 'string', 'int', 'int64', 'double', 'float', or 'float64'.
 *   If omitted, returns the value converted to a string.
 * @returns {string} SQL subquery expression that extracts the parameter value.
 * @throws {Error} If keyName or paramsArray is not a non-empty string, or if dataType is unsupported.
 */
const unnestParam = (keyName, paramsArray, dataType) => {
  if (typeof keyName !== 'string' || keyName.trim() === '') {
    throw new Error("unnestParam: 'keyName' is required and must be a non-empty string.");
  }
  if (typeof paramsArray !== 'string' || paramsArray.trim() === '') {
    throw new Error("unnestParam: 'paramsArray' is required and must be a non-empty string.");
  }

  // Escape backslashes first so the backslash added for a quote is not escaped again.
  const escapedKey = keyName.replace(/\\/g, '\\\\').replace(/'/g, "\\'");
  const fromClause = `from unnest(${paramsArray}) where key = '${escapedKey}'`;

  if (!dataType) {
    // No type requested: take whichever value column has data, cast as string.
    return `(select coalesce(value.string_value, cast(value.int_value as string), cast(value.double_value as string), cast(value.float_value as string)) ${fromClause})`;
  }

  // Accepted dataType aliases mapped to the value column they read from.
  const valueColumns = new Map([
    ['string', 'string_value'],
    ['int', 'int_value'],
    ['int64', 'int_value'],
    ['double', 'double_value'],
    ['float', 'float_value'],
    ['float64', 'float_value'],
  ]);

  const column = valueColumns.get(dataType);
  if (column === undefined) {
    throw new Error(`unnestParam: Unsupported dataType '${dataType}'. Supported values are 'string', 'int', 'int64', 'double', 'float', and 'float64'.`);
  }

  return `(select value.${column} ${fromClause})`;
};
45
+
46
/**
 * Builds the SQL subquery that reads one value from the `event_params` array.
 *
 * Thin wrapper around `unnestParam` with the array fixed to 'event_params'.
 * Accepted data types: 'string', 'int', 'int64', 'double', 'float', 'float64';
 * without a data type the value is returned converted to a string.
 *
 * @param {string} keyName - The event parameter key to look up.
 * @param {string} [dataType] - Optional data type for the extracted value.
 * @returns {string} SQL subquery expression that extracts the event parameter value.
 */
const unnestEventParam = (keyName, dataType) => unnestParam(keyName, 'event_params', dataType);
59
+
60
/**
 * Builds the SQL subquery that reads one value from the `session_params` array.
 *
 * Thin wrapper around `unnestParam` with the array fixed to 'session_params'.
 * Accepted data types: 'string', 'int', 'int64', 'double', 'float', 'float64';
 * without a data type the value is returned converted to a string.
 *
 * @param {string} keyName - The session parameter key to look up.
 * @param {string} [dataType] - Optional data type for the extracted value.
 * @returns {string} SQL subquery expression that extracts the session parameter value.
 */
const unnestSessionParam = (keyName, dataType) => unnestParam(keyName, 'session_params', dataType);
73
+
74
+ module.exports = {
75
+ unnestEventParam,
76
+ unnestSessionParam
77
+ };
@@ -0,0 +1,155 @@
1
+ const { unnestEventParam } = require('./params');
2
+
3
+ /*
4
+ Page details
5
+ */
6
+
7
/**
 * Generates a SQL expression to extract the hostname from a URL.
 *
 * The returned BigQuery SQL:
 *   1. Removes a leading `http://` or `https://` scheme with regexp_replace.
 *   2. Extracts everything up to (not including) the first '/', '?' or '#' with regexp_extract.
 *
 * Stopping at '?' and '#' as well as '/' keeps the query string or fragment out of the
 * hostname for URLs without a path, e.g. 'https://example.com?x=1' -> 'example.com'.
 * A port, if present, is part of the extracted value (e.g. 'example.com:8080').
 *
 * Example usage (in SQL context):
 *   SELECT ${extractUrlHostname('my_url_column')} AS hostname
 *
 * @param {string} url - The SQL expression or column reference containing the URL.
 * @returns {string} BigQuery SQL expression for extracting the hostname from the input URL.
 */
const extractUrlHostname = (url) => {
  return `regexp_extract(
    regexp_replace(
      ${url},
      r'^https?://',
      ''
    ),
    r'^[^/?#]+'
  )`;
};
30
+
31
/**
 * Generates a SQL expression to extract the path component from a URL.
 *
 * The returned BigQuery SQL:
 *   1. Removes the scheme and hostname (`http(s)://` followed by everything up to the first
 *      '/', '?' or '#') with regexp_replace.
 *   2. Removes the query ('?...') or fragment ('#...') from the remainder.
 *   3. Trims whitespace from the result.
 *
 * The hostname match stops at '?' and '#' so that, for a URL without a path such as
 * 'https://example.com?next=/foo', the query text is not mistaken for part of the host
 * (which would leave '/foo' behind as a bogus path); the result for that URL is ''.
 *
 * Example usage (in SQL context):
 *   SELECT ${extractUrlPath('my_url_column')} AS path
 *
 * @param {string} url - The SQL expression or column reference containing the URL.
 * @returns {string} BigQuery SQL expression for extracting the path component from the input URL.
 */
const extractUrlPath = (url) => {
  return `trim(
    regexp_replace(
      regexp_replace(
        ${url},
        r'^https?://[^/?#]+',
        ''
      ),
      r'[\\?#].*',
      ''
    )
  )`;
};
58
+
59
/**
 * Generates a SQL expression to extract the query component from a URL.
 *
 * The returned BigQuery SQL:
 *   1. Uses regexp_extract to take the text from the first '?' up to, but not including,
 *      the next '#'. At least one character must follow the '?' for the pattern to match.
 *   2. Trims leading/trailing whitespace from the extracted text.
 *
 * The result keeps the leading '?' (e.g. '?id=123').
 *
 * Example usage (in SQL context):
 *   SELECT ${extractUrlQuery('my_url_column')} AS url_query
 *
 * @param {string} url - The SQL expression or column reference containing the URL.
 * @returns {string} BigQuery SQL expression for the query string of the input URL, including the leading '?'.
 */
const extractUrlQuery = (url) => {
  const rawQuery = `regexp_extract(${url}, r'\\?[^#]+')`;
  return `trim(${rawQuery})`;
};
75
+
76
/**
 * Generates a SQL expression to parse the query parameters of a URL into an array of
 * structs (key-value pairs).
 *
 * The returned BigQuery SQL:
 *   1. Extracts the query string from the given URL using {@link extractUrlQuery}.
 *   2. Removes the leading '?' that extractUrlQuery keeps, so the first key is 'id'
 *      rather than '?id'.
 *   3. Splits the query string on '&' to separate individual key-value pairs.
 *   4. Splits each pair on '=' and takes the first part as key and the second as value.
 *   5. Returns an array of STRUCTs with fields "key" and "value".
 *
 * NOTE: because each pair is split on every '=', a value that itself contains '=' is cut
 * at its first '='; a pair without '=' gets a NULL value (safe_offset).
 *
 * Example usage (in SQL context):
 *   SELECT ${extractUrlQueryParams('my_url_column')} AS query_params
 *
 * Output schema:
 *   ARRAY<STRUCT<key STRING, value STRING>>
 *
 * @param {string} url - The SQL expression or column reference containing the URL.
 * @returns {string} BigQuery SQL expression producing an array of key/value structs for the query parameters.
 */
const extractUrlQueryParams = (url) => {
  // extractUrlQuery returns the query including its leading '?'; drop it before splitting.
  const queryWithoutPrefix = `regexp_replace(${extractUrlQuery(url)}, r'^\\?', '')`;

  return `array(
    (
      select
        as struct split(keyval, '=') [safe_offset(0)] as key,
        split(keyval, '=') [safe_offset(1)] as value
      from
        unnest(
          split(
            ${queryWithoutPrefix},
            '&'
          )
        ) as keyval
    )
  )`;
};
110
+
111
/**
 * Generates a SQL expression that bundles the parts of a URL into one struct.
 *
 * Fields of the produced BigQuery struct, each generated by the matching helper:
 *   - hostname:     from extractUrlHostname (e.g., 'www.example.com')
 *   - path:         from extractUrlPath (e.g., '/about/team')
 *   - query:        from extractUrlQuery, the raw query string including the leading '?'
 *   - query_params: from extractUrlQueryParams, ARRAY<STRUCT<key STRING, value STRING>>
 *
 * When `url` is omitted (or any falsy value), the URL is taken from the `page_location`
 * event parameter, read as a string.
 *
 * Example usage (in SQL context):
 *   SELECT ${extractPageDetails('my_url_column')} AS page_details
 *
 * Output schema (STRUCT):
 *   {
 *     hostname: STRING,
 *     path: STRING,
 *     query: STRING,
 *     query_params: ARRAY<STRUCT<key STRING, value STRING>>
 *   }
 *
 * @param {string} [url] - (Optional) SQL expression or column reference for the URL.
 *   Defaults to the 'page_location' event parameter unnested as a string.
 * @returns {string} BigQuery SQL expression yielding a STRUCT of hostname, path, query, and query_params.
 */
const extractPageDetails = (url) => {
  const source = url || `${unnestEventParam('page_location', 'string')}`;

  const fields = [
    `${extractUrlHostname(source)} as hostname,`,
    `${extractUrlPath(source)} as path,`,
    `${extractUrlQuery(source)} as query,`,
    `${extractUrlQueryParams(source)} as query_params`,
  ];

  return ['(select as struct', ...fields, ')'].join('\n');
};
148
+
149
+ module.exports = {
150
+ extractUrlHostname,
151
+ extractUrlPath,
152
+ extractUrlQuery,
153
+ extractUrlQueryParams,
154
+ extractPageDetails
155
+ };
package/index.js CHANGED
@@ -1,4 +1,4 @@
1
- const helpers = require('./helpers.js');
1
+ const helpers = require('./helpers/index.js');
2
2
  const ga4EventsEnhanced = require('./tables/ga4EventsEnhanced.js');
3
3
  const preOperations = require('./preOperations.js');
4
4
  const { validateBaseConfig, validateEnhancedEventsConfig } = require('./inputValidation.js');
package/package.json CHANGED
@@ -1,11 +1,11 @@
1
1
  {
2
2
  "name": "ga4-export-fixer",
3
- "version": "0.4.1",
3
+ "version": "0.4.2-dev.1",
4
4
  "description": "",
5
5
  "main": "index.js",
6
6
  "files": [
7
7
  "index.js",
8
- "helpers.js",
8
+ "helpers",
9
9
  "utils.js",
10
10
  "preOperations.js",
11
11
  "constants.js",
@@ -1,4 +1,4 @@
1
- const helpers = require('../helpers.js');
1
+ const helpers = require('../helpers/index.js');
2
2
  const utils = require('../utils.js');
3
3
  const inputValidation = require('../inputValidation.js');
4
4
  const constants = require('../constants.js');