ga4-export-fixer 0.3.2-dev.0 → 0.3.2-dev.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,130 @@
1
+ {
2
+ "synonyms": [
3
+ {
4
+ "terms": ["users", "unique users", "visitors", "unique visitors"],
5
+ "sql": "COUNT(DISTINCT user_pseudo_id) or COUNT(DISTINCT merged_user_id) for cross-device",
6
+ "dependsOn": ["user_pseudo_id"]
7
+ },
8
+ {
9
+ "terms": ["sessions", "visits", "unique sessions"],
10
+ "sql": "COUNT(DISTINCT session_id)",
11
+ "dependsOn": ["session_id"]
12
+ },
13
+ {
14
+ "terms": ["page views", "pageviews", "page view count"],
15
+ "sql": "COUNT(*) WHERE event_name = 'page_view'",
16
+ "dependsOn": ["event_name"]
17
+ },
18
+ {
19
+ "terms": ["transactions", "purchases", "orders"],
20
+ "sql": "COUNT(*) WHERE event_name = 'purchase'",
21
+ "dependsOn": ["ecommerce"]
22
+ },
23
+ {
24
+ "terms": ["revenue", "sales", "purchase revenue", "total revenue"],
25
+ "sql": "SUM(ecommerce.purchase_revenue) WHERE event_name = 'purchase'",
26
+ "dependsOn": ["ecommerce"]
27
+ },
28
+ {
29
+ "terms": ["page URL", "page location", "URL", "current page"],
30
+ "sql": "page_location (full URL) or page.hostname, page.path, page.query (parsed components)",
31
+ "dependsOn": ["page_location"]
32
+ },
33
+ {
34
+ "terms": ["landing page", "entry page", "first page"],
35
+ "sql": "landing_page.hostname, landing_page.path",
36
+ "dependsOn": ["landing_page"]
37
+ },
38
+ {
39
+ "terms": ["traffic source", "channel", "medium", "campaign", "source", "UTM"],
40
+ "sql": "collected_traffic_source (event-level), session_first_traffic_source (session first-touch), session_traffic_source_last_click (session last-click attribution)",
41
+ "dependsOn": ["collected_traffic_source", "session_first_traffic_source"]
42
+ },
43
+ {
44
+ "terms": ["device type", "device category", "device", "mobile vs desktop"],
45
+ "sql": "device.category (values: mobile, tablet, desktop)",
46
+ "dependsOn": ["device"]
47
+ },
48
+ {
49
+ "terms": ["country", "location", "region", "city", "geography", "geo"],
50
+ "sql": "geo.country, geo.region, geo.city",
51
+ "dependsOn": ["geo"]
52
+ }
53
+ ],
54
+ "keyFields": [
55
+ {
56
+ "field": "event_date",
57
+ "note": "Partition column. Always include in WHERE to limit scanned data and cost.",
58
+ "dependsOn": []
59
+ },
60
+ {
61
+ "field": "event_name",
62
+ "note": "Primary event type filter (e.g. 'page_view', 'purchase', 'scroll').",
63
+ "dependsOn": []
64
+ },
65
+ {
66
+ "field": "user_pseudo_id",
67
+ "note": "Device-level user identifier. COUNT(DISTINCT) for unique users.",
68
+ "dependsOn": []
69
+ },
70
+ {
71
+ "field": "session_id",
72
+ "note": "Unique session identifier. COUNT(DISTINCT) for sessions, GROUP BY for session metrics.",
73
+ "dependsOn": ["session_id"]
74
+ },
75
+ {
76
+ "field": "merged_user_id",
77
+ "note": "Cross-device user identifier. Prefer over user_pseudo_id for user-level analysis.",
78
+ "dependsOn": []
79
+ },
80
+ {
81
+ "field": "page_location",
82
+ "note": "Full page URL. Use page.hostname and page.path for structured analysis.",
83
+ "dependsOn": ["page_location"]
84
+ },
85
+ {
86
+ "field": "ecommerce.purchase_revenue",
87
+ "note": "Revenue per transaction. Filter to event_name = 'purchase' before aggregating.",
88
+ "dependsOn": ["ecommerce"]
89
+ },
90
+ {
91
+ "field": "session_first_traffic_source",
92
+ "note": "Session acquisition source. Use .manual_source and .manual_medium for UTM data.",
93
+ "dependsOn": ["session_first_traffic_source"]
94
+ },
95
+ {
96
+ "field": "device.category",
97
+ "note": "Device type segmentation (mobile, tablet, desktop).",
98
+ "dependsOn": ["device"]
99
+ },
100
+ {
101
+ "field": "geo.country",
102
+ "note": "Geographic segmentation by country.",
103
+ "dependsOn": ["geo"]
104
+ },
105
+ {
106
+ "field": "event_datetime",
107
+ "note": "Event timestamp in the configured timezone. Use for time-of-day analysis.",
108
+ "dependsOn": []
109
+ },
110
+ {
111
+ "field": "data_is_final",
112
+ "note": "Data stability flag. WHERE data_is_final = true for stable data only.",
113
+ "dependsOn": []
114
+ }
115
+ ],
116
+ "filteringGuidance": [
117
+ {"text": "Always filter on event_date (partition column) to reduce query cost.", "dependsOn": []},
118
+ {"text": "Use event_name to select specific event types (e.g. WHERE event_name = 'page_view').", "dependsOn": []},
119
+ {"text": "For user counts: COUNT(DISTINCT user_pseudo_id) for device-level, COUNT(DISTINCT merged_user_id) for cross-device.", "dependsOn": []},
120
+ {"text": "For session metrics: GROUP BY session_id.", "dependsOn": ["session_id"]},
121
+ {"text": "For ecommerce revenue: filter WHERE event_name = 'purchase' before aggregating ecommerce.purchase_revenue.", "dependsOn": ["ecommerce"]},
122
+ {"text": "For traffic source analysis: use session_first_traffic_source for first-touch, session_traffic_source_last_click for last-click attribution.", "dependsOn": ["session_first_traffic_source"]},
123
+ {"text": "Nested arrays (event_params, items): use CROSS JOIN UNNEST(...) to access individual values.", "dependsOn": []},
124
+ {"text": "Use data_is_final = true to exclude data that may still change in future refreshes.", "dependsOn": []}
125
+ ],
126
+ "eventVocabulary": {
127
+ "autoCollectedAndEnhanced": ["page_view", "session_start", "first_visit", "user_engagement", "scroll", "click", "file_download", "video_start", "video_progress", "video_complete", "view_search_results", "form_start", "form_submit"],
128
+ "ecommerce": ["purchase", "add_to_cart", "begin_checkout", "view_item", "view_item_list", "add_to_wishlist", "remove_from_cart", "add_payment_info", "add_shipping_info", "refund"]
129
+ }
130
+ }
package/documentation.js CHANGED
@@ -1,6 +1,8 @@
1
1
  const columnDescriptions = require('./columns/columnDescriptions.json');
2
2
  const columnLineage = require('./columns/columnLineage.json');
3
3
  const columnTypicalUse = require('./columns/columnTypicalUse.json');
4
+ const tableAgentInstructions = require('./columns/tableAgentInstructions.json');
5
+ const constants = require('./constants');
4
6
 
5
7
  /**
6
8
  * Composes a multi-section column description string from individual sections.
@@ -168,9 +170,134 @@ const getColumnDescriptions = (config) => {
168
170
  return descriptions;
169
171
  };
170
172
 
173
/**
 * Decides whether an agent-instruction entry should be hidden because every
 * column it depends on has been excluded by the config.
 *
 * A dependency counts as excluded when it is listed in excludedColumns itself,
 * or when any of its parent structs is listed (e.g. a dependency on
 * 'ecommerce.purchase_revenue' is excluded when 'ecommerce' is excluded).
 * Excluding a nested field does NOT exclude its parent struct.
 *
 * @param {string[]} dependsOn - Column names this entry depends on. An empty or
 *   missing list means the entry has no dependencies and is never excluded.
 * @param {string[]} excludedColumns - Excluded column names from the config.
 *   A missing list is treated as "nothing excluded".
 * @returns {boolean} True if ALL dependsOn columns are excluded.
 */
const isExcluded = (dependsOn, excludedColumns) => {
    if (!dependsOn || dependsOn.length === 0) return false;

    // Build the lookup once instead of rescanning the array per dependency.
    const excluded = new Set(excludedColumns || []);
    if (excluded.size === 0) return false;

    const isColumnExcluded = (column) => {
        if (excluded.has(column)) return true;
        if (typeof column !== 'string') return false;

        // Walk up the struct path: 'a.b.c' is excluded when 'a.b' or 'a' is.
        let separatorIndex = column.lastIndexOf('.');
        while (separatorIndex > 0) {
            if (excluded.has(column.slice(0, separatorIndex))) return true;
            separatorIndex = column.lastIndexOf('.', separatorIndex - 1);
        }
        return false;
    };

    return dependsOn.every(isColumnExcluded);
};
184
+
185
/**
 * Builds the table description text for ga4_events_enhanced.
 *
 * The description is assembled from blank-line separated sections, in order:
 * overview, key fields, synonyms, filtering guidance, common event names,
 * table features, package attribution, and a JSON dump of the config.
 * Instruction entries whose dependencies are all excluded by the config are
 * left out, and a bullet section with no remaining lines is omitted entirely.
 *
 * @param {Object} config - The merged configuration object.
 * @returns {string} The composed table description.
 */
const getTableDescription = (config) => {
    // Only the user-configured excludedColumns drive the filtering of AI instructions.
    // defaultExcludedColumns lists raw GA4 export columns dropped during extraction
    // (e.g. session_id is dropped from the raw export but exists as a derived column in the final table).
    const userExcludedColumns = config.excludedColumns || [];

    const hiddenEvents = [
        ...(config.defaultExcludedEvents || []),
        ...(config.excludedEvents || []),
    ];

    const isVisible = (entry) => !isExcluded(entry.dependsOn, userExcludedColumns);
    const isKeptEvent = (eventName) => !hiddenEvents.includes(eventName);

    const blocks = [];
    // Appends "<heading>\n<lines...>" unless there is nothing to list.
    const addBlock = (heading, lines) => {
        if (lines.length > 0) {
            blocks.push(`${heading}\n${lines.join('\n')}`);
        }
    };

    // 1. Overview
    const overview = [
        'GA4 Events Enhanced',
        '',
        'An enhanced version of the GA4 BigQuery export. Each row is one event.',
    ];
    if (config.timezone) {
        overview.push(`Timezone: ${config.timezone}.`);
    }
    blocks.push(overview.join('\n'));

    // 2. Key Fields (static entries first, then promoted event params)
    const keyFields = tableAgentInstructions.keyFields
        .filter(isVisible)
        .map((entry) => `- ${entry.field}: ${entry.note}`);

    const promotedParams = config.eventParamsToColumns;
    if (promotedParams && promotedParams.length > 0) {
        promotedParams.forEach((param) => {
            const label = param.columnName || param.name;
            keyFields.push(`- ${label}: Promoted event parameter '${param.name}'. Available as a top-level column for direct filtering.`);
        });
    }
    addBlock('KEY FIELDS:', keyFields);

    // 3. Synonyms
    const synonyms = tableAgentInstructions.synonyms
        .filter(isVisible)
        .map((entry) => {
            const quotedTerms = entry.terms.join('" / "');
            return `- "${quotedTerms}" → ${entry.sql}`;
        });
    addBlock('SYNONYMS:', synonyms);

    // 4. Filtering and Grouping
    const guidance = tableAgentInstructions.filteringGuidance
        .filter(isVisible)
        .map((entry) => `- ${entry.text}`);
    addBlock('FILTERING AND GROUPING:', guidance);

    // 5. Event Vocabulary
    const vocabulary = [];
    const autoCollected = tableAgentInstructions.eventVocabulary.autoCollectedAndEnhanced
        .filter(isKeptEvent);
    if (autoCollected.length > 0) {
        vocabulary.push(`Auto-collected and enhanced measurement: ${autoCollected.join(', ')}`);
    }

    const ecommerceHidden = isExcluded(['ecommerce'], userExcludedColumns);
    if (!ecommerceHidden) {
        const ecommerceEvents = tableAgentInstructions.eventVocabulary.ecommerce
            .filter(isKeptEvent);
        if (ecommerceEvents.length > 0) {
            vocabulary.push(`Ecommerce (recommended): ${ecommerceEvents.join(', ')}`);
        }
    }
    addBlock('COMMON EVENT NAMES:', vocabulary);

    // 6. Table Features
    const features = [
        'Combines daily, intraday, and fresh exports; the best available version of each event is used.',
        'Incremental updates: non-final data is replaced with the latest available data on every run.',
        'Promotes key fields (e.g. page_location, session_id) to top-level columns for faster queries.',
        'Session-level fields: landing_page, user_id resolution, and configurable session parameters.',
    ];
    const featureBullets = features.map((feature) => `- ${feature}`);
    blocks.push(`TABLE FEATURES:\n${featureBullets.join('\n')}`);

    // 7. Package Attribution
    blocks.push(`${constants.TABLE_DESCRIPTION_SUFFIX}\n${constants.TABLE_DESCRIPTION_DOCUMENTATION_LINK}`);

    // 8. Config JSON dump (keys starting with "default" and dataformTableConfig are left out)
    const isHiddenConfigKey = (key) => key.startsWith('default') || key === 'dataformTableConfig';
    const shownEntries = Object.entries(config).filter(([key]) => !isHiddenConfigKey(key));
    const configJson = JSON.stringify(Object.fromEntries(shownEntries), null, 2);
    blocks.push(`The last full table refresh was done using this configuration:\n${configJson}`);

    return blocks.join('\n\n');
};
296
+
171
297
  module.exports = {
172
298
  columnDescriptions,
173
299
  getColumnDescriptions,
300
+ getTableDescription,
174
301
  composeDescription,
175
302
  getLineageText,
176
303
  buildConfigNotes,
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "ga4-export-fixer",
3
- "version": "0.3.2-dev.0",
3
+ "version": "0.3.2-dev.2",
4
4
  "description": "",
5
5
  "main": "index.js",
6
6
  "files": [
@@ -321,28 +321,7 @@ ${excludedEventsSQL}`,
321
321
  const createEnhancedEventsTable = (dataformPublish, config) => {
322
322
  const mergedConfig = utils.mergeSQLConfigurations(defaultConfig, config);
323
323
 
324
- const tableDescription = `GA4 Events Enhanced
325
-
326
- - Combines daily (processed) and intraday exports so the best available version of each event is always used.
327
- - Incremental updates: All data with "data_is_final" flag set to false is deleted and replaced with the latest available data on every run (supports any schedule: daily, hourly, or custom).
328
- - Keeps the flexible schema of the original export while promoting key fields (e.g. page_location, session_id) to columns for faster queries.
329
- - Partitioned by event_date and clustered for optimal query performance.
330
- - Event parameter handling: promote params to columns or exclude by name.
331
- - Session-level fields: landing_page, fixed user_id, and configurable session parameters.
332
- - Other improvements and refinements based on configuration
333
-
334
- ${constants.TABLE_DESCRIPTION_SUFFIX}
335
- ${constants.TABLE_DESCRIPTION_DOCUMENTATION_LINK}
336
-
337
- The last full table refresh was done using this configuration:
338
- ${JSON.stringify(
339
- Object.fromEntries(
340
- // don't display the default arrays here, their contents are included in the main arrays via the mergeSQLConfigurations function
341
- Object.entries(mergedConfig).filter(([key]) => !key.startsWith('default'))
342
- ),
343
- null,
344
- 2
345
- )}`;
324
+ const tableDescription = documentation.getTableDescription(mergedConfig);
346
325
 
347
326
  // the defaults for the dataform table config
348
327
  const defaultDataformTableConfig = {