ga4-export-fixer 0.3.2-dev.1 → 0.3.2-dev.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -208,7 +208,9 @@
208
208
  "campaign_id": "Cross-channel campaign ID",
209
209
  "source_platform": "Cross-channel source platform",
210
210
  "source": "Cross-channel source",
211
- "medium": "Cross-channel medium"
211
+ "medium": "Cross-channel medium",
212
+ "primary_channel_group": "Custom primary channel group for the session's last non-direct click",
213
+ "default_channel_group": "Default channel group for the session's last non-direct click"
212
214
  }
213
215
  },
214
216
  "sa360_campaign": {
@@ -13,7 +13,7 @@
13
13
  "event_params": "Nested array of event parameters. Unnest with CROSS JOIN UNNEST(event_params) to access individual parameter values",
14
14
  "session_params": "Session-scoped parameters. Unnest with CROSS JOIN UNNEST(session_params) to access session-level parameter values",
15
15
  "ecommerce": "Transaction and revenue data. Filter to purchase or refund events (WHERE event_name = 'purchase') for ecommerce reporting",
16
- "items": "Product-level data array. Unnest with CROSS JOIN UNNEST(items) for item-level analysis in ecommerce reports",
16
+ "items": "Product-level data array. Unnest with CROSS JOIN UNNEST(items) for item-level analysis. For product revenue and units sold, filter to event_name = 'purchase' and use item_revenue and quantity. Use item_category to filter or group by product category",
17
17
  "collected_traffic_source": "Event-level UTM parameters and click identifiers. For session-level attribution, prefer session_first_traffic_source instead",
18
18
  "session_first_traffic_source": "First-touch traffic source for the session. Use for session-level acquisition and campaign reporting",
19
19
  "session_traffic_source_last_click": "Google-attributed session traffic source. Use for last-click attribution analysis across Google Ads and manual channels",
@@ -0,0 +1,141 @@
1
+ {
2
+ "synonyms": [
3
+ {
4
+ "terms": ["users", "unique users", "visitors", "unique visitors"],
5
+ "sql": "COUNT(DISTINCT user_pseudo_id) or COUNT(DISTINCT merged_user_id) for cross-device",
6
+ "dependsOn": ["user_pseudo_id"]
7
+ },
8
+ {
9
+ "terms": ["sessions", "visits", "unique sessions"],
10
+ "sql": "COUNT(DISTINCT session_id)",
11
+ "dependsOn": ["session_id"]
12
+ },
13
+ {
14
+ "terms": ["page views", "pageviews", "page view count"],
15
+ "sql": "COUNT(*) WHERE event_name = 'page_view'",
16
+ "dependsOn": ["event_name"]
17
+ },
18
+ {
19
+ "terms": ["transactions", "purchases", "orders"],
20
+ "sql": "COUNT(*) WHERE event_name = 'purchase'",
21
+ "dependsOn": ["ecommerce"]
22
+ },
23
+ {
24
+ "terms": ["revenue", "sales", "purchase revenue", "total revenue"],
25
+ "sql": "SUM(ecommerce.purchase_revenue) WHERE event_name = 'purchase' for transaction-level. For product-level revenue: SUM(items.item_revenue) after CROSS JOIN UNNEST(items) WHERE event_name = 'purchase'",
26
+ "dependsOn": ["ecommerce"]
27
+ },
28
+ {
29
+ "terms": ["products", "items", "product revenue", "item revenue", "product sales", "items sold"],
30
+ "sql": "CROSS JOIN UNNEST(items) to access product-level data. Use items.item_name, items.item_category for identification, items.item_revenue for revenue, items.quantity for units sold. Filter to event_name = 'purchase' for sold products",
31
+ "dependsOn": ["items"]
32
+ },
33
+ {
34
+ "terms": ["page URL", "page location", "URL", "current page"],
35
+ "sql": "page_location (full URL) or page.hostname, page.path, page.query (parsed components)",
36
+ "dependsOn": ["page_location"]
37
+ },
38
+ {
39
+ "terms": ["landing page", "entry page", "first page"],
40
+ "sql": "landing_page.hostname, landing_page.path",
41
+ "dependsOn": ["landing_page"]
42
+ },
43
+ {
44
+ "terms": ["traffic source", "channel", "channel group", "source of traffic"],
45
+ "sql": "session_traffic_source_last_click.cross_channel_campaign.primary_channel_group for top-level channel grouping. session_traffic_source_last_click.cross_channel_campaign.source, session_traffic_source_last_click.cross_channel_campaign.medium, and session_traffic_source_last_click.cross_channel_campaign.campaign_name for detailed source/medium/campaign analysis.",
46
+ "dependsOn": ["session_traffic_source_last_click"]
47
+ },
48
+ {
49
+ "terms": ["device type", "device category", "device", "mobile vs desktop"],
50
+ "sql": "device.category (values: mobile, tablet, desktop)",
51
+ "dependsOn": ["device"]
52
+ },
53
+ {
54
+ "terms": ["country", "location", "region", "city", "geography", "geo"],
55
+ "sql": "geo.country, geo.region, geo.city",
56
+ "dependsOn": ["geo"]
57
+ }
58
+ ],
59
+ "keyFields": [
60
+ {
61
+ "field": "event_date",
62
+ "note": "Partition column. Always include in WHERE to limit scanned data and cost.",
63
+ "dependsOn": []
64
+ },
65
+ {
66
+ "field": "event_name",
67
+ "note": "Primary event type filter (e.g. 'page_view', 'purchase', 'scroll').",
68
+ "dependsOn": []
69
+ },
70
+ {
71
+ "field": "user_pseudo_id",
72
+ "note": "Device-level user identifier. COUNT(DISTINCT) for unique users.",
73
+ "dependsOn": []
74
+ },
75
+ {
76
+ "field": "session_id",
77
+ "note": "Unique session identifier. COUNT(DISTINCT) for sessions, GROUP BY for session metrics.",
78
+ "dependsOn": ["session_id"]
79
+ },
80
+ {
81
+ "field": "merged_user_id",
82
+ "note": "Cross-device user identifier. Prefer over user_pseudo_id for user-level analysis.",
83
+ "dependsOn": []
84
+ },
85
+ {
86
+ "field": "page_location",
87
+ "note": "Full page URL. Use page.hostname and page.path for structured analysis.",
88
+ "dependsOn": ["page_location"]
89
+ },
90
+ {
91
+ "field": "ecommerce.purchase_revenue",
92
+ "note": "Transaction-level revenue. Filter to event_name = 'purchase' before aggregating.",
93
+ "dependsOn": ["ecommerce"]
94
+ },
95
+ {
96
+ "field": "items",
97
+ "note": "Product-level data (nested array). CROSS JOIN UNNEST(items) to access item_name, item_category, item_revenue, quantity. Filter to event_name = 'purchase' for sold products.",
98
+ "dependsOn": ["items"]
99
+ },
100
+ {
101
+ "field": "session_traffic_source_last_click",
102
+ "note": "Primary field for traffic source analysis. Use .cross_channel_campaign.primary_channel_group for channel grouping. When asked about popular traffic sources, default to channel group unless specified otherwise.",
103
+ "dependsOn": ["session_traffic_source_last_click"]
104
+ },
105
+ {
106
+ "field": "device.category",
107
+ "note": "Device type segmentation (mobile, tablet, desktop).",
108
+ "dependsOn": ["device"]
109
+ },
110
+ {
111
+ "field": "geo.country",
112
+ "note": "Geographic segmentation by country.",
113
+ "dependsOn": ["geo"]
114
+ },
115
+ {
116
+ "field": "event_datetime",
117
+ "note": "Event timestamp in the configured timezone. Use for time-of-day analysis.",
118
+ "dependsOn": []
119
+ },
120
+ {
121
+ "field": "data_is_final",
122
+ "note": "Data stability flag. WHERE data_is_final = true for stable data only.",
123
+ "dependsOn": []
124
+ }
125
+ ],
126
+ "filteringGuidance": [
127
+ {"text": "Always filter on event_date (partition column) to reduce query cost.", "dependsOn": []},
128
+ {"text": "Use event_name to select specific event types (e.g. WHERE event_name = 'page_view').", "dependsOn": []},
129
+ {"text": "For user counts: COUNT(DISTINCT user_pseudo_id) for device-level, COUNT(DISTINCT merged_user_id) for cross-device.", "dependsOn": []},
130
+ {"text": "For session metrics: GROUP BY session_id.", "dependsOn": ["session_id"]},
131
+ {"text": "For transaction-level ecommerce revenue: filter WHERE event_name = 'purchase' before aggregating ecommerce.purchase_revenue.", "dependsOn": ["ecommerce"]},
132
+ {"text": "For product-level analysis: CROSS JOIN UNNEST(items) WHERE event_name = 'purchase'. Use items.item_revenue for product revenue, items.quantity for units sold, items.item_category for product categories.", "dependsOn": ["items"]},
133
+ {"text": "For traffic source analysis: use session_traffic_source_last_click.cross_channel_campaign.primary_channel_group for channel grouping. When asked about popular traffic sources, default to channel group unless specified otherwise. For detailed attribution: session_traffic_source_last_click (last non-direct click), session_first_traffic_source (last click without non-direct attribution), collected_traffic_source (event-level data).", "dependsOn": ["session_traffic_source_last_click"]},
134
+ {"text": "Nested arrays (event_params, items): use CROSS JOIN UNNEST(...) to access individual values.", "dependsOn": []},
135
+ {"text": "Use data_is_final = true to exclude data that may still change in future refreshes.", "dependsOn": []}
136
+ ],
137
+ "eventVocabulary": {
138
+ "autoCollectedAndEnhanced": ["page_view", "session_start", "first_visit", "user_engagement", "scroll", "click", "file_download", "video_start", "video_progress", "video_complete", "view_search_results", "form_start", "form_submit"],
139
+ "ecommerce": ["purchase", "add_to_cart", "begin_checkout", "view_item", "view_item_list", "add_to_wishlist", "remove_from_cart", "add_payment_info", "add_shipping_info", "refund"]
140
+ }
141
+ }
package/documentation.js CHANGED
@@ -1,6 +1,8 @@
1
1
  const columnDescriptions = require('./columns/columnDescriptions.json');
2
2
  const columnLineage = require('./columns/columnLineage.json');
3
3
  const columnTypicalUse = require('./columns/columnTypicalUse.json');
4
+ const tableAgentInstructions = require('./columns/tableAgentInstructions.json');
5
+ const constants = require('./constants');
4
6
 
5
7
  /**
6
8
  * Composes a multi-section column description string from individual sections.
@@ -168,9 +170,134 @@ const getColumnDescriptions = (config) => {
168
170
  return descriptions;
169
171
  };
170
172
 
173
+ /**
174
+ * Reports whether an entry should be dropped because every column it depends on is excluded.
175
+ *
176
+ * @param {string[]} dependsOn - Column names this entry depends on, matched by exact name. Empty or missing means the entry is never excluded.
177
+ * @param {string[]} excludedColumns - Excluded column names to check against (the caller decides which exclusion lists to pass in).
178
+ * @returns {boolean} True only if ALL dependsOn columns are excluded; false if at least one is not, or if dependsOn is empty/missing.
179
+ */
180
+ const isExcluded = (dependsOn, excludedColumns) => {
181
+ if (!dependsOn || dependsOn.length === 0) return false;
182
+ return dependsOn.every(col => excludedColumns.includes(col));
183
+ };
184
+
185
+ /**
186
+ * Composes the full table description for ga4_events_enhanced: an overview, the
187
+ * AI agent instructions (key fields, synonyms, filtering guidance, event vocabulary),
188
+ * the table features, the package attribution and a JSON dump of the config.
189
+ * Sections are joined with a blank line; instruction entries whose dependsOn columns are all in config.excludedColumns are left out.
190
+ * @param {Object} config - The merged configuration object. Reads excludedColumns, defaultExcludedEvents, excludedEvents, timezone and eventParamsToColumns (each may be absent).
191
+ * @returns {string} The composed table description.
192
+ */
193
+ const getTableDescription = (config) => {
194
+ // Only use user-configured excludedColumns for filtering AI instructions.
195
+ // defaultExcludedColumns refers to raw GA4 export columns excluded during extraction
196
+ // (e.g. session_id is excluded from the raw export but exists as a derived column in the final table).
197
+ const excludedColumns = config.excludedColumns || [];
198
+
199
+ const excludedEvents = [ // unlike columns, default and user-configured event exclusions are both applied
200
+ ...(config.defaultExcludedEvents || []),
201
+ ...(config.excludedEvents || []),
202
+ ];
203
+
204
+ const sections = [];
205
+
206
+ // 1. Overview (the timezone line is only added when config.timezone is set)
207
+ const overviewLines = [
208
+ 'GA4 Events Enhanced',
209
+ '',
210
+ 'An enhanced version of the GA4 BigQuery export. Each row is one event.',
211
+ ];
212
+ if (config.timezone) {
213
+ overviewLines.push(`Timezone: ${config.timezone}.`);
214
+ }
215
+ sections.push(overviewLines.join('\n'));
216
+
217
+ // 2. Key Fields (entries whose dependsOn columns are all excluded are dropped)
218
+ const keyFieldLines = tableAgentInstructions.keyFields
219
+ .filter(kf => !isExcluded(kf.dependsOn, excludedColumns))
220
+ .map(kf => `- ${kf.field}: ${kf.note}`);
221
+
222
+ // Add promoted event params; the listed column is columnName when given, otherwise the param name
223
+ if (config.eventParamsToColumns && config.eventParamsToColumns.length > 0) {
224
+ config.eventParamsToColumns.forEach(p => {
225
+ const columnName = p.columnName || p.name;
226
+ keyFieldLines.push(`- ${columnName}: Promoted event parameter '${p.name}'. Available as a top-level column for direct filtering.`);
227
+ });
228
+ }
229
+
230
+ if (keyFieldLines.length > 0) {
231
+ sections.push('KEY FIELDS:\n' + keyFieldLines.join('\n'));
232
+ }
233
+
234
+ // 3. Synonyms (one line per entry: the quoted terms, then the SQL hint they map to)
235
+ const synonymLines = tableAgentInstructions.synonyms
236
+ .filter(s => !isExcluded(s.dependsOn, excludedColumns))
237
+ .map(s => `- "${s.terms.join('" / "')}" → ${s.sql}`);
238
+
239
+ if (synonymLines.length > 0) {
240
+ sections.push('SYNONYMS:\n' + synonymLines.join('\n'));
241
+ }
242
+
243
+ // 4. Filtering and Grouping (same dependsOn-based filtering as the sections above)
244
+ const guidanceLines = tableAgentInstructions.filteringGuidance
245
+ .filter(g => !isExcluded(g.dependsOn, excludedColumns))
246
+ .map(g => `- ${g.text}`);
247
+
248
+ if (guidanceLines.length > 0) {
249
+ sections.push('FILTERING AND GROUPING:\n' + guidanceLines.join('\n'));
250
+ }
251
+
252
+ // 5. Event Vocabulary (excluded events are removed; the ecommerce list is skipped entirely when the ecommerce column is excluded)
253
+ const vocabParts = [];
254
+ const autoEvents = tableAgentInstructions.eventVocabulary.autoCollectedAndEnhanced
255
+ .filter(e => !excludedEvents.includes(e));
256
+ if (autoEvents.length > 0) {
257
+ vocabParts.push(`Auto-collected and enhanced measurement: ${autoEvents.join(', ')}`);
258
+ }
259
+
260
+ if (!isExcluded(['ecommerce'], excludedColumns)) {
261
+ const ecomEvents = tableAgentInstructions.eventVocabulary.ecommerce
262
+ .filter(e => !excludedEvents.includes(e));
263
+ if (ecomEvents.length > 0) {
264
+ vocabParts.push(`Ecommerce (recommended): ${ecomEvents.join(', ')}`);
265
+ }
266
+ }
267
+
268
+ if (vocabParts.length > 0) {
269
+ sections.push('COMMON EVENT NAMES:\n' + vocabParts.join('\n'));
270
+ }
271
+
272
+ // 6. Table Features (static text; always included, independent of the config)
273
+ const featureLines = [
274
+ 'Combines daily, intraday, and fresh exports; the best available version of each event is used.',
275
+ 'Incremental updates: non-final data is replaced with the latest available data on every run.',
276
+ 'Promotes key fields (e.g. page_location, session_id) to top-level columns for faster queries.',
277
+ 'Session-level fields: landing_page, user_id resolution, and configurable session parameters.',
278
+ ];
279
+ sections.push('TABLE FEATURES:\n' + featureLines.map(f => `- ${f}`).join('\n'));
280
+
281
+ // 7. Package Attribution (suffix and documentation link come from the constants module)
282
+ sections.push(`${constants.TABLE_DESCRIPTION_SUFFIX}\n${constants.TABLE_DESCRIPTION_DOCUMENTATION_LINK}`);
283
+
284
+ // 8. Config JSON dump: 'default*' keys are omitted (their contents are merged into the main arrays), as is dataformTableConfig (a deployment detail, not relevant to SQL generation)
285
+ const configJson = JSON.stringify(
286
+ Object.fromEntries(
287
+ Object.entries(config).filter(([key]) => !key.startsWith('default') && key !== 'dataformTableConfig')
288
+ ),
289
+ null,
290
+ 2
291
+ );
292
+ sections.push(`The last full table refresh was done using this configuration:\n${configJson}`);
293
+
294
+ return sections.join('\n\n');
295
+ };
296
+
171
297
  module.exports = {
172
298
  columnDescriptions,
173
299
  getColumnDescriptions,
300
+ getTableDescription,
174
301
  composeDescription,
175
302
  getLineageText,
176
303
  buildConfigNotes,
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "ga4-export-fixer",
3
- "version": "0.3.2-dev.1",
3
+ "version": "0.3.2-dev.3",
4
4
  "description": "",
5
5
  "main": "index.js",
6
6
  "files": [
@@ -321,29 +321,7 @@ ${excludedEventsSQL}`,
321
321
  const createEnhancedEventsTable = (dataformPublish, config) => {
322
322
  const mergedConfig = utils.mergeSQLConfigurations(defaultConfig, config);
323
323
 
324
- const tableDescription = `GA4 Events Enhanced
325
-
326
- - Combines daily (processed) and intraday exports so the best available version of each event is always used.
327
- - Incremental updates: All data with "data_is_final" flag set to false is deleted and replaced with the latest available data on every run (supports any schedule: daily, hourly, or custom).
328
- - Keeps the flexible schema of the original export while promoting key fields (e.g. page_location, session_id) to columns for faster queries.
329
- - Partitioned by event_date and clustered for optimal query performance.
330
- - Event parameter handling: promote params to columns or exclude by name.
331
- - Session-level fields: landing_page, fixed user_id, and configurable session parameters.
332
- - Other improvements and refinements based on configuration
333
-
334
- ${constants.TABLE_DESCRIPTION_SUFFIX}
335
- ${constants.TABLE_DESCRIPTION_DOCUMENTATION_LINK}
336
-
337
- The last full table refresh was done using this configuration:
338
- ${JSON.stringify(
339
- Object.fromEntries(
340
- // don't display the default arrays here, their contents are included in the main arrays via the mergeSQLConfigurations function
341
- // dataformTableConfig is also excluded since it's not relevant for the SQL generation and is more of a deployment detail
342
- Object.entries(mergedConfig).filter(([key]) => !key.startsWith('default') && key !== 'dataformTableConfig')
343
- ),
344
- null,
345
- 2
346
- )}`;
324
+ const tableDescription = documentation.getTableDescription(mergedConfig);
347
325
 
348
326
  // the defaults for the dataform table config
349
327
  const defaultDataformTableConfig = {