ga4-export-fixer 0.6.2-dev.1 → 0.6.2-dev.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md
CHANGED
|
@@ -457,7 +457,7 @@ This creates the table along with the default-enabled assertions, using the same
|
|
|
457
457
|
|
|
458
458
|
| Assertion | Name | Enabled by default | Description |
|
|
459
459
|
| --------- | ---- | ------------------ | ----------- |
|
|
460
|
-
| `dailyQuality` | `{tableName}_daily_quality` | Yes | Compares session count, event count, and
|
|
460
|
+
| `dailyQuality` | `{tableName}_daily_quality` | Yes | Compares session count, event count, item revenue, and ecommerce purchase revenue per day between the enhanced table and raw export. Detects missing days, count and revenue mismatches, and non-final data inflation |
|
|
461
461
|
| `itemRevenue` | `{tableName}_item_revenue` | No (opt-in) | Reconciles item_revenue at the (event_date, item_id) grain between the enhanced table and raw export |
|
|
462
462
|
|
|
463
463
|
Assertions inherit the table's schema and tags from `dataformTableConfig`. Each assertion queries the last 5 days of data.
|
package/package.json
CHANGED
|
@@ -2,46 +2,27 @@ const helpers = require('../../../helpers/index.js');
|
|
|
2
2
|
const utils = require('../../../utils.js');
|
|
3
3
|
const { ga4EventsEnhancedConfig } = require('../config.js');
|
|
4
4
|
const { validateEnhancedEventsConfig } = require('../validation.js');
|
|
5
|
+
const { buildDedupedRawSource } = require('./shared.js');
|
|
5
6
|
|
|
6
7
|
const defaultConfig = { ...ga4EventsEnhancedConfig };
|
|
7
8
|
|
|
8
|
-
|
|
9
|
-
* Builds a _table_suffix date filter for the assertion's raw-side query.
|
|
10
|
-
*
|
|
11
|
-
* Uses the low-level ga4ExportDateFilter() helper per enabled export type
|
|
12
|
-
* with a fixed 5-day lookback window. This is intentionally separate from
|
|
13
|
-
* the pipeline's ga4ExportDateFilters() which depends on incremental state
|
|
14
|
-
* and BigQuery pre-operation variables.
|
|
15
|
-
*
|
|
16
|
-
* @param {Object} includedExportTypes - { daily: boolean, fresh: boolean, intraday: boolean }
|
|
17
|
-
* @returns {string} SQL fragment for a WHERE clause
|
|
18
|
-
*/
|
|
19
|
-
const buildAssertionDateFilter = (includedExportTypes) => {
|
|
20
|
-
const start = 'date_sub(current_date(), interval 5 day)';
|
|
21
|
-
const end = 'current_date()';
|
|
22
|
-
|
|
23
|
-
const filters = [
|
|
24
|
-
includedExportTypes.daily ? helpers.ga4ExportDateFilter('daily', start, end) : null,
|
|
25
|
-
includedExportTypes.fresh ? helpers.ga4ExportDateFilter('fresh', start, end) : null,
|
|
26
|
-
includedExportTypes.intraday ? helpers.ga4ExportDateFilter('intraday', start, end) : null,
|
|
27
|
-
].filter(Boolean);
|
|
28
|
-
|
|
29
|
-
return filters.join(' or ');
|
|
30
|
-
};
|
|
9
|
+
const ASSERTION_LOOKBACK_DAYS = 5;
|
|
31
10
|
|
|
32
11
|
/**
|
|
33
12
|
* Generates a SQL assertion query that validates daily data quality between the
|
|
34
13
|
* enhanced events table and the raw GA4 export data.
|
|
35
14
|
*
|
|
36
|
-
* The query compares session count, event count, and total item_revenue
|
|
37
|
-
* aggregated per (event_date, data_is_final) for the last 5 days.
|
|
38
|
-
* Returns violating rows -- 0 rows means the assertion passes.
|
|
15
|
+
* The query compares session count, event count, total item_revenue, and total
|
|
16
|
+
* purchase_revenue aggregated per (event_date, data_is_final) for the last 5
|
|
17
|
+
* days. Returns violating rows -- 0 rows means the assertion passes.
|
|
39
18
|
*
|
|
40
|
-
*
|
|
19
|
+
* Six violation types are detected:
|
|
41
20
|
* - MISSING_DAY: Raw data has events but enhanced table has none for this day
|
|
42
21
|
* - SESSION_COUNT_MISMATCH: Final data session count differs
|
|
43
22
|
* - EVENT_COUNT_MISMATCH: Final data event count differs
|
|
44
|
-
* -
|
|
23
|
+
* - ITEM_REVENUE_MISMATCH: Final data total item_revenue differs
|
|
24
|
+
* - PURCHASE_REVENUE_MISMATCH: Final data total ecommerce.purchase_revenue differs
|
|
25
|
+
* (raw side applies fixEcommerceStruct() to mirror the enhanced pipeline's fix)
|
|
45
26
|
* - NON_FINAL_EXCESS_EVENTS: Non-final enhanced data has more events than raw
|
|
46
27
|
*
|
|
47
28
|
* @param {string} tableRef - Fully qualified reference to the enhanced table
|
|
@@ -51,15 +32,15 @@ const buildAssertionDateFilter = (includedExportTypes) => {
|
|
|
51
32
|
const _generateDailyQualityAssertionSql = (tableRef, mergedConfig) => {
|
|
52
33
|
const excludedEvents = mergedConfig.excludedEvents;
|
|
53
34
|
const excludedEventsSQL = excludedEvents.length > 0
|
|
54
|
-
? `
|
|
55
|
-
: '';
|
|
35
|
+
? `event_name not in (${excludedEvents.map(e => `'${e}'`).join(', ')})`
|
|
36
|
+
: 'true';
|
|
56
37
|
|
|
57
38
|
const dataIsFinalCondition = helpers.isFinalData(
|
|
58
39
|
mergedConfig.dataIsFinal.detectionMethod,
|
|
59
40
|
mergedConfig.dataIsFinal.dayThreshold
|
|
60
41
|
);
|
|
61
42
|
|
|
62
|
-
const
|
|
43
|
+
const dedupedRawSource = buildDedupedRawSource(mergedConfig, ASSERTION_LOOKBACK_DAYS);
|
|
63
44
|
|
|
64
45
|
return `with enhanced_daily as (
|
|
65
46
|
select
|
|
@@ -67,11 +48,12 @@ const _generateDailyQualityAssertionSql = (tableRef, mergedConfig) => {
|
|
|
67
48
|
data_is_final,
|
|
68
49
|
count(distinct session_id) as session_count,
|
|
69
50
|
count(*) as event_count,
|
|
70
|
-
coalesce(sum((select sum(item.item_revenue) from unnest(items) as item)), 0) as total_item_revenue
|
|
51
|
+
coalesce(sum((select sum(item.item_revenue) from unnest(items) as item)), 0) as total_item_revenue,
|
|
52
|
+
coalesce(sum(ecommerce.purchase_revenue), 0) as total_purchase_revenue
|
|
71
53
|
from
|
|
72
54
|
${tableRef}
|
|
73
55
|
where
|
|
74
|
-
event_date >= date_sub(current_date(), interval
|
|
56
|
+
event_date >= date_sub(current_date(), interval ${ASSERTION_LOOKBACK_DAYS} day)
|
|
75
57
|
group by event_date, data_is_final
|
|
76
58
|
),
|
|
77
59
|
raw_daily as (
|
|
@@ -80,13 +62,12 @@ raw_daily as (
|
|
|
80
62
|
${dataIsFinalCondition} as data_is_final,
|
|
81
63
|
count(distinct concat(user_pseudo_id, cast((select value.int_value from unnest(event_params) where key = 'ga_session_id') as string))) as session_count,
|
|
82
64
|
count(*) as event_count,
|
|
83
|
-
coalesce(sum((select sum(item.item_revenue) from unnest(items) as item)), 0) as total_item_revenue
|
|
65
|
+
coalesce(sum((select sum(item.item_revenue) from unnest(items) as item)), 0) as total_item_revenue,
|
|
66
|
+
coalesce(sum(${helpers.fixEcommerceStruct()}.purchase_revenue), 0) as total_purchase_revenue
|
|
84
67
|
from
|
|
85
|
-
${
|
|
68
|
+
${dedupedRawSource}
|
|
86
69
|
where
|
|
87
|
-
(${dateFilter})
|
|
88
70
|
${excludedEventsSQL}
|
|
89
|
-
and cast(event_date as date format 'YYYYMMDD') >= date_sub(current_date(), interval 5 day)
|
|
90
71
|
group by event_date, data_is_final
|
|
91
72
|
),
|
|
92
73
|
daily_comparison as (
|
|
@@ -97,8 +78,10 @@ daily_comparison as (
|
|
|
97
78
|
r.session_count as raw_sessions,
|
|
98
79
|
e.event_count as enhanced_events,
|
|
99
80
|
r.event_count as raw_events,
|
|
100
|
-
round(e.total_item_revenue, 2) as
|
|
101
|
-
round(r.total_item_revenue, 2) as
|
|
81
|
+
round(e.total_item_revenue, 2) as enhanced_item_revenue,
|
|
82
|
+
round(r.total_item_revenue, 2) as raw_item_revenue,
|
|
83
|
+
round(e.total_purchase_revenue, 2) as enhanced_purchase_revenue,
|
|
84
|
+
round(r.total_purchase_revenue, 2) as raw_purchase_revenue
|
|
102
85
|
from
|
|
103
86
|
enhanced_daily e
|
|
104
87
|
full outer join
|
|
@@ -111,8 +94,10 @@ select
|
|
|
111
94
|
raw_sessions,
|
|
112
95
|
enhanced_events,
|
|
113
96
|
raw_events,
|
|
114
|
-
|
|
115
|
-
|
|
97
|
+
enhanced_item_revenue,
|
|
98
|
+
raw_item_revenue,
|
|
99
|
+
enhanced_purchase_revenue,
|
|
100
|
+
raw_purchase_revenue,
|
|
116
101
|
violation_type
|
|
117
102
|
from
|
|
118
103
|
daily_comparison,
|
|
@@ -120,7 +105,8 @@ from
|
|
|
120
105
|
if(enhanced_events is null and raw_events > 0, 'MISSING_DAY', null),
|
|
121
106
|
if(data_is_final = true and enhanced_sessions != raw_sessions, 'SESSION_COUNT_MISMATCH', null),
|
|
122
107
|
if(data_is_final = true and enhanced_events != raw_events, 'EVENT_COUNT_MISMATCH', null),
|
|
123
|
-
if(data_is_final = true and
|
|
108
|
+
if(data_is_final = true and enhanced_item_revenue != raw_item_revenue, 'ITEM_REVENUE_MISMATCH', null),
|
|
109
|
+
if(data_is_final = true and enhanced_purchase_revenue != raw_purchase_revenue, 'PURCHASE_REVENUE_MISMATCH', null),
|
|
124
110
|
if(data_is_final = false and coalesce(enhanced_events, 0) > coalesce(raw_events, 0), 'NON_FINAL_EXCESS_EVENTS', null)
|
|
125
111
|
]) as violation_type
|
|
126
112
|
where
|
|
@@ -131,9 +117,9 @@ where
|
|
|
131
117
|
* Generates a daily quality assertion SQL query.
|
|
132
118
|
*
|
|
133
119
|
* Merges the provided config with defaults, validates, then generates a SQL
|
|
134
|
-
* query comparing daily aggregates (session count, event count, item_revenue)
|
|
135
|
-
* between the enhanced table and raw export data, and checks for missing days
|
|
136
|
-
* and non-final data inflation.
|
|
120
|
+
* query comparing daily aggregates (session count, event count, item_revenue,
|
|
121
|
+
* ecommerce.purchase_revenue) between the enhanced table and raw export data,
|
|
122
|
+
* and checks for missing days and non-final data inflation.
|
|
137
123
|
*
|
|
138
124
|
* @param {string} tableRef - Fully qualified reference to the enhanced table.
|
|
139
125
|
* @param {Object} config - User-provided table configuration.
|
|
@@ -2,38 +2,17 @@ const helpers = require('../../../helpers/index.js');
|
|
|
2
2
|
const utils = require('../../../utils.js');
|
|
3
3
|
const { ga4EventsEnhancedConfig } = require('../config.js');
|
|
4
4
|
const { validateEnhancedEventsConfig } = require('../validation.js');
|
|
5
|
+
const { buildDedupedRawSource } = require('./shared.js');
|
|
5
6
|
|
|
6
7
|
const defaultConfig = { ...ga4EventsEnhancedConfig };
|
|
7
8
|
|
|
9
|
+
const ASSERTION_LOOKBACK_DAYS = 5;
|
|
10
|
+
|
|
8
11
|
// Ecommerce events that carry item data (excluding refund — refunds reverse revenue
|
|
9
12
|
// and are handled separately in some pipelines, but item_revenue on refund rows
|
|
10
13
|
// should still reconcile 1:1 between enhanced and raw).
|
|
11
14
|
const ecommerceEvents = helpers.ga4EcommerceEvents.map(e => `'${e}'`).join(', ');
|
|
12
15
|
|
|
13
|
-
/**
|
|
14
|
-
* Builds a _table_suffix date filter for the assertion's raw-side query.
|
|
15
|
-
*
|
|
16
|
-
* Uses the low-level ga4ExportDateFilter() helper per enabled export type
|
|
17
|
-
* with a fixed 5-day lookback window. This is intentionally separate from
|
|
18
|
-
* the pipeline's ga4ExportDateFilters() which depends on incremental state
|
|
19
|
-
* and BigQuery pre-operation variables.
|
|
20
|
-
*
|
|
21
|
-
* @param {Object} includedExportTypes - { daily: boolean, fresh: boolean, intraday: boolean }
|
|
22
|
-
* @returns {string} SQL fragment for a WHERE clause
|
|
23
|
-
*/
|
|
24
|
-
const buildAssertionDateFilter = (includedExportTypes) => {
|
|
25
|
-
const start = 'date_sub(current_date(), interval 5 day)';
|
|
26
|
-
const end = 'current_date()';
|
|
27
|
-
|
|
28
|
-
const filters = [
|
|
29
|
-
includedExportTypes.daily ? helpers.ga4ExportDateFilter('daily', start, end) : null,
|
|
30
|
-
includedExportTypes.fresh ? helpers.ga4ExportDateFilter('fresh', start, end) : null,
|
|
31
|
-
includedExportTypes.intraday ? helpers.ga4ExportDateFilter('intraday', start, end) : null,
|
|
32
|
-
].filter(Boolean);
|
|
33
|
-
|
|
34
|
-
return filters.join(' or ');
|
|
35
|
-
};
|
|
36
|
-
|
|
37
16
|
/**
|
|
38
17
|
* Generates a SQL assertion query that reconciles item_revenue between the
|
|
39
18
|
* enhanced events table and the raw GA4 export data.
|
|
@@ -51,8 +30,8 @@ const _generateItemRevenueAssertionSql = (tableRef, mergedConfig) => {
|
|
|
51
30
|
// excluded events filter (same logic as the enhanced table pipeline)
|
|
52
31
|
const excludedEvents = mergedConfig.excludedEvents;
|
|
53
32
|
const excludedEventsSQL = excludedEvents.length > 0
|
|
54
|
-
? `
|
|
55
|
-
: '';
|
|
33
|
+
? `event_name not in (${excludedEvents.map(e => `'${e}'`).join(', ')})`
|
|
34
|
+
: 'true';
|
|
56
35
|
|
|
57
36
|
// data_is_final condition for the raw side
|
|
58
37
|
const dataIsFinalCondition = helpers.isFinalData(
|
|
@@ -60,8 +39,8 @@ const _generateItemRevenueAssertionSql = (tableRef, mergedConfig) => {
|
|
|
60
39
|
mergedConfig.dataIsFinal.dayThreshold
|
|
61
40
|
);
|
|
62
41
|
|
|
63
|
-
//
|
|
64
|
-
const
|
|
42
|
+
// deduplicated raw-source subquery (mirrors pipeline setPreOperations dedup)
|
|
43
|
+
const dedupedRawSource = buildDedupedRawSource(mergedConfig, ASSERTION_LOOKBACK_DAYS);
|
|
65
44
|
|
|
66
45
|
return `with enhanced_revenue as (
|
|
67
46
|
select
|
|
@@ -74,7 +53,7 @@ const _generateItemRevenueAssertionSql = (tableRef, mergedConfig) => {
|
|
|
74
53
|
unnest(items) as item
|
|
75
54
|
where
|
|
76
55
|
data_is_final = true
|
|
77
|
-
and event_date >= date_sub(current_date(), interval
|
|
56
|
+
and event_date >= date_sub(current_date(), interval ${ASSERTION_LOOKBACK_DAYS} day)
|
|
78
57
|
and event_name in (${ecommerceEvents})
|
|
79
58
|
group by event_date, item.item_id
|
|
80
59
|
),
|
|
@@ -85,14 +64,12 @@ raw_revenue as (
|
|
|
85
64
|
sum(item.item_revenue) as total_item_revenue,
|
|
86
65
|
count(*) as item_count
|
|
87
66
|
from
|
|
88
|
-
${
|
|
67
|
+
${dedupedRawSource},
|
|
89
68
|
unnest(items) as item
|
|
90
69
|
where
|
|
91
|
-
(${dateFilter})
|
|
92
70
|
${excludedEventsSQL}
|
|
93
71
|
and event_name in (${ecommerceEvents})
|
|
94
72
|
and ${dataIsFinalCondition}
|
|
95
|
-
and cast(event_date as date format 'YYYYMMDD') >= date_sub(current_date(), interval 5 day)
|
|
96
73
|
group by event_date, item.item_id
|
|
97
74
|
)
|
|
98
75
|
select
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
const helpers = require('../../../helpers/index.js');
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* Builds a _table_suffix date filter for the assertion's raw-side query.
|
|
5
|
+
*
|
|
6
|
+
* Uses the low-level ga4ExportDateFilter() helper per enabled export type
|
|
7
|
+
* over a caller-provided lookback window. Intentionally separate from the
|
|
8
|
+
* pipeline's ga4ExportDateFilters() which depends on incremental state
|
|
9
|
+
* and BigQuery pre-operation variables.
|
|
10
|
+
*
|
|
11
|
+
* @param {Object} includedExportTypes - { daily: boolean, fresh: boolean, intraday: boolean }
|
|
12
|
+
* @param {number} lookbackDays - Number of days to look back from current_date().
|
|
13
|
+
* @returns {string} SQL fragment for a WHERE clause
|
|
14
|
+
*/
|
|
15
|
+
const buildAssertionDateFilter = (includedExportTypes, lookbackDays) => {
|
|
16
|
+
const start = `date_sub(current_date(), interval ${lookbackDays} day)`;
|
|
17
|
+
const end = 'current_date()';
|
|
18
|
+
|
|
19
|
+
const filters = [
|
|
20
|
+
includedExportTypes.daily ? helpers.ga4ExportDateFilter('daily', start, end) : null,
|
|
21
|
+
includedExportTypes.fresh ? helpers.ga4ExportDateFilter('fresh', start, end) : null,
|
|
22
|
+
includedExportTypes.intraday ? helpers.ga4ExportDateFilter('intraday', start, end) : null,
|
|
23
|
+
].filter(Boolean);
|
|
24
|
+
|
|
25
|
+
return filters.join(' or ');
|
|
26
|
+
};
|
|
27
|
+
|
|
28
|
+
/**
|
|
29
|
+
* Builds a deduplicated raw-source subquery for assertion use.
|
|
30
|
+
*
|
|
31
|
+
* Replicates what setPreOperations() does at pipeline time, without access
|
|
32
|
+
* to its BigQuery variables. Covers all seven non-empty combinations of
|
|
33
|
+
* includedExportTypes {daily, fresh, intraday}:
|
|
34
|
+
*
|
|
35
|
+
* - qualify dense_rank() over (partition by date, order by _table_suffix) = 1
|
|
36
|
+
* picks the highest-priority table per day. Alphabetical order gives
|
|
37
|
+
* daily ('20260115') < fresh ('fresh_20260115') < intraday ('intraday_20260115'),
|
|
38
|
+
* matching the pipeline's daily > fresh > intraday priority.
|
|
39
|
+
* - When fresh and intraday are both enabled, intraday rows with
|
|
40
|
+
* event_timestamp > max(fresh.event_timestamp) for the same date are
|
|
41
|
+
* additionally admitted — matching the FRESH_MAX_EVENT_TIMESTAMP boundary.
|
|
42
|
+
*
|
|
43
|
+
* @param {Object} mergedConfig - Merged table configuration.
|
|
44
|
+
* @param {number} lookbackDays - Number of days to look back from current_date().
|
|
45
|
+
* @returns {string} SQL fragment: a parenthesized subquery usable in a FROM clause.
|
|
46
|
+
*/
|
|
47
|
+
const buildDedupedRawSource = (mergedConfig, lookbackDays) => {
|
|
48
|
+
const dateFilter = buildAssertionDateFilter(mergedConfig.includedExportTypes, lookbackDays);
|
|
49
|
+
const freshAndIntraday = mergedConfig.includedExportTypes.fresh && mergedConfig.includedExportTypes.intraday;
|
|
50
|
+
|
|
51
|
+
const intradayException = freshAndIntraday
|
|
52
|
+
? `
|
|
53
|
+
or (
|
|
54
|
+
starts_with(_table_suffix, 'intraday_')
|
|
55
|
+
and dense_rank() over (
|
|
56
|
+
partition by regexp_extract(_table_suffix, r'[0-9]+')
|
|
57
|
+
order by _table_suffix
|
|
58
|
+
) = 2
|
|
59
|
+
and event_timestamp > max(if(starts_with(_table_suffix, 'fresh_'), event_timestamp, null)) over (
|
|
60
|
+
partition by regexp_extract(_table_suffix, r'[0-9]+')
|
|
61
|
+
)
|
|
62
|
+
)`
|
|
63
|
+
: '';
|
|
64
|
+
|
|
65
|
+
// _table_suffix is a pseudo-column and not propagated by SELECT *; select it
|
|
66
|
+
// explicitly so downstream CTEs (e.g., isFinalData('EXPORT_TYPE')) can still reference it.
|
|
67
|
+
return `(
|
|
68
|
+
select
|
|
69
|
+
*,
|
|
70
|
+
_table_suffix
|
|
71
|
+
from
|
|
72
|
+
${mergedConfig.sourceTable}
|
|
73
|
+
where
|
|
74
|
+
(${dateFilter})
|
|
75
|
+
qualify
|
|
76
|
+
dense_rank() over (
|
|
77
|
+
partition by regexp_extract(_table_suffix, r'[0-9]+')
|
|
78
|
+
order by _table_suffix
|
|
79
|
+
) = 1${intradayException}
|
|
80
|
+
)`;
|
|
81
|
+
};
|
|
82
|
+
|
|
83
|
+
module.exports = { buildAssertionDateFilter, buildDedupedRawSource };
|