ga4-export-fixer 0.8.0 → 0.9.0-dev.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +129 -8
- package/documentation.js +272 -223
- package/helpers/ga4Transforms.js +315 -262
- package/package.json +8 -5
- package/tables/ga4EventsEnhanced/config.js +4 -0
- package/tables/ga4EventsEnhanced/index.js +216 -100
- package/tables/ga4EventsEnhanced/validation.js +99 -4
- package/utils.js +163 -26
package/README.md
CHANGED
|
@@ -128,6 +128,12 @@ The goal of the package is to **speed up development** when building data models
|
|
|
128
128
|
<b>🕐 Timezone-Aware Datetime</b><br>
|
|
129
129
|
<code>event_datetime</code> converted to a configurable IANA timezone
|
|
130
130
|
</td>
|
|
131
|
+
<td valign="top">
|
|
132
|
+
<b>🧩 Custom Processing Steps</b><br>
|
|
133
|
+
Append user-defined CTEs via <code>customSteps</code> to derive new columns or join external tables
|
|
134
|
+
</td>
|
|
135
|
+
</tr>
|
|
136
|
+
<tr>
|
|
131
137
|
<td valign="top">
|
|
132
138
|
<b>🛡️ Zero Dependencies</b><br>
|
|
133
139
|
No additional external dependencies added to your Dataform repository
|
|
@@ -139,10 +145,10 @@ The goal of the package is to **speed up development** when building data models
|
|
|
139
145
|
|
|
140
146
|
Features under consideration for future releases:
|
|
141
147
|
|
|
148
|
+
- Data enrichment (item-level, session-level, event-level)
|
|
149
|
+
- Aggregated tables (ga4_session, ga4_ecommerce...)
|
|
142
150
|
- Web and app specific default configurations
|
|
143
151
|
- Custom channel grouping
|
|
144
|
-
- Data enrichment (item-level, session-level, event-level)
|
|
145
|
-
- Custom processing steps (additional CTEs)
|
|
146
152
|
- Custom traffic source attribution
|
|
147
153
|
|
|
148
154
|
## Installation
|
|
@@ -192,7 +198,8 @@ Create a new **ga4_events_enhanced** table using a **.js** file in your reposito
|
|
|
192
198
|
const { ga4EventsEnhanced } = require('ga4-export-fixer');
|
|
193
199
|
|
|
194
200
|
const config = {
|
|
195
|
-
|
|
201
|
+
// using hard-coded GA4 export path
|
|
202
|
+
sourceTable: '`project.analytics_12345.events_*`'
|
|
196
203
|
};
|
|
197
204
|
|
|
198
205
|
ga4EventsEnhanced.createTable(publish, config);
|
|
@@ -206,6 +213,7 @@ ga4EventsEnhanced.createTable(publish, config);
|
|
|
206
213
|
const { ga4EventsEnhanced } = require('ga4-export-fixer');
|
|
207
214
|
|
|
208
215
|
const config = {
|
|
216
|
+
// GA4 export path declared, using the table reference object
|
|
209
217
|
sourceTable: constants.GA4_TABLES.MY_GA4_EXPORT,
|
|
210
218
|
// use dataformTableConfig to make changes to the default Dataform table configuration
|
|
211
219
|
dataformTableConfig: {
|
|
@@ -284,7 +292,8 @@ js {
|
|
|
284
292
|
const { ga4EventsEnhanced } = require('ga4-export-fixer');
|
|
285
293
|
|
|
286
294
|
const config = {
|
|
287
|
-
|
|
295
|
+
// using hard-coded GA4 export path
|
|
296
|
+
sourceTable: '`project.analytics_12345.events_*`',
|
|
288
297
|
self: self(),
|
|
289
298
|
incremental: incremental()
|
|
290
299
|
};
|
|
@@ -308,7 +317,7 @@ All fields are optional except `sourceTable`. Default values are applied automat
|
|
|
308
317
|
|
|
309
318
|
| Field | Type | Default/Required | Description |
|
|
310
319
|
| ---------------------- | ----------------------- | ---------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
|
311
|
-
| `sourceTable` | Dataform ref
|
|
320
|
+
| `sourceTable` | Dataform ref / object / string | **required** | Source GA4 export table. Inside an SQLX `js { }` block use `ref(...)`. From a `.js` definition file use a `{ schema, name }` ref object (resolved later via `ctx.ref()`) or a backtick-quoted ``` `project.dataset.events_*` ``` string for an external table. |
|
|
312
321
|
| `self` | Dataform self() | **required for .SQLX deployment** | Reference to the table itself. Use `self()` in Dataform |
|
|
313
322
|
| `incremental` | Dataform incremental() | **required for .SQLX deployment** | Switch between incremental and full refresh logic. Use `incremental()` in Dataform |
|
|
314
323
|
| `dataformTableConfig` | object | **In JS deployment only.** [See default](#default-dataformtableconfig) | Override the default Dataform table configuration for JS deployment. See: [ITableConfig reference](https://docs.cloud.google.com/dataform/docs/reference/dataform-core-reference#itableconfig) |
|
|
@@ -328,6 +337,7 @@ All fields are optional except `sourceTable`. Default values are applied automat
|
|
|
328
337
|
| `preOperations` | object | [See details](#preOperations) | Date range and incremental refresh configuration |
|
|
329
338
|
| `eventParamsToColumns` | object[] | `[]` | Event parameters to promote to columns. [See item schema](#eventParamsToColumns) |
|
|
330
339
|
| `customSteps` | object[] | `[]` | User-defined CTEs appended to the pipeline after `enhanced_events`. [See Custom CTEs](#custom-ctes) |
|
|
340
|
+
| `enrichments` | object[] | `[]` | Declarative external-data enrichments joined into `enhanced_events`. [See Data Enrichments](#data-enrichments) |
|
|
331
341
|
|
|
332
342
|
<a id="default-dataformtableconfig"></a>
|
|
333
343
|
<details>
|
|
@@ -465,10 +475,12 @@ itemListAttribution: { lookbackType: 'TIME', lookbackTimeMs: 86400000 }
|
|
|
465
475
|
| ------------------------ | ------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------- |
|
|
466
476
|
| `event_data` | yes | Extracted and shaped events from `sourceTable`, with date filtering and column promotions applied. *Unfiltered for the buffer-days range.* |
|
|
467
477
|
| `session_data` | yes | Session-level aggregations (grouped by `session_id`). |
|
|
468
|
-
| `
|
|
469
|
-
| `
|
|
470
|
-
| `
|
|
478
|
+
| `items_unnested` | only when `itemListAttribution` is on | Per-event item rows (one row per item per ecommerce event), with attribution window function applied. |
|
|
479
|
+
| `items_rebuilt` | only when `itemListAttribution` is on | Re-aggregated items with attributed list fields, joined back to events via `_item_row_id`. |
|
|
480
|
+
| `enrich_<name>` | only when configured via `enrichments` | One CTE per [enrichment](#data-enrichments) entry, providing dim data for joining into `enhanced_events`. |
|
|
481
|
+
| `enhanced_events` | yes | The package's standard output shape (joined event_data + session_data + items_rebuilt + enrich_*, columns ordered, incremental date filter applied). The natural starting point for most custom CTEs. |
|
|
471
482
|
|
|
483
|
+
Example custom step using the raw SQL format:
|
|
472
484
|
|
|
473
485
|
```javascript
|
|
474
486
|
// Add a content_group column derived from page.path
|
|
@@ -488,10 +500,119 @@ from enhanced_events`,
|
|
|
488
500
|
],
|
|
489
501
|
```
|
|
490
502
|
|
|
503
|
+
The same example in the structured shape:
|
|
504
|
+
|
|
505
|
+
```javascript
|
|
506
|
+
customSteps: [
|
|
507
|
+
{
|
|
508
|
+
name: 'final',
|
|
509
|
+
select: {
|
|
510
|
+
columns: {
|
|
511
|
+
'[sql]passthrough': 'enhanced_events.*',
|
|
512
|
+
content_group: `case
|
|
513
|
+
when page.path like '/blog/%' then 'blog'
|
|
514
|
+
when page.path like '/products/%' then 'product'
|
|
515
|
+
when page.path = '/' then 'home'
|
|
516
|
+
else 'other'
|
|
517
|
+
end`,
|
|
518
|
+
},
|
|
519
|
+
},
|
|
520
|
+
from: 'enhanced_events',
|
|
521
|
+
},
|
|
522
|
+
],
|
|
523
|
+
```
|
|
524
|
+
|
|
491
525
|
> **Note:** Custom columns aren't auto-documented. Use `dataformTableConfig.columns` to add descriptions — it's deep-merged with the package's defaults, so your keys are added or override matching defaults, and untouched defaults stay.
|
|
492
526
|
|
|
493
527
|
> **Note:** Built-in assertions assume the package's standard schema. If your custom CTEs rename, drop, or filter rows in ways that break those assumptions, disable the affected assertions explicitly via the `assertions` config option.
|
|
494
528
|
|
|
529
|
+
<a id="data-enrichments"></a>
|
|
530
|
+
|
|
531
|
+
**`enrichments`** — declaratively join external dimension data into `enhanced_events` (cohort labels, page metadata, marketing attribution, etc.). Each entry describes one dim source plus the join — the package generates the source CTE, the `LEFT JOIN`, and column descriptions automatically.
|
|
532
|
+
|
|
533
|
+
For typical use cases this is the right tool; reach for `customSteps` only when you need a transformation that doesn't fit a flat dim join.
|
|
534
|
+
|
|
535
|
+
**Per-enrichment shape:**
|
|
536
|
+
|
|
537
|
+
| Field | Type | Required | Description |
|
|
538
|
+
| --- | --- | --- | --- |
|
|
539
|
+
| `name` | string | Yes | Used in the generated `enrich_<name>` CTE name. Unique within `enrichments`. |
|
|
540
|
+
| `level` | `'event'` / `'item'` | No, defaults to `'event'` | Join grain. `'event'` joins external dim data onto each event row (any column on `enhanced_events` as the key). `'item'` joins external dim data onto each item inside the `items` array (any field on the items struct or any event_data column as the key). |
|
|
541
|
+
| `source` | Dataform ref / object / string | Yes | Source dim table. Inside an SQLX `js { }` block use `ref(...)`. From a `.js` definition file use a `{ schema, name }` ref object (resolved later via `ctx.ref()`) or a backtick-quoted ``` `project.dataset.table` ``` string for an external table. |
|
|
542
|
+
| `joinKey` | string / string[] | Yes | For `level: 'event'`: column name(s) on `enhanced_events`. For `level: 'item'`: field name(s) on the items struct (e.g. `'item_id'`) or column name(s) on `event_data` (e.g. `'user_pseudo_id'`). Composite keys (array) compile to `USING(col1, col2, ...)`. |
|
|
543
|
+
| `columns` | string[] | Yes | Source columns to add to the output (excluding `joinKey`). Names matching existing columns are coalesced with the original (`coalesce(enrich_<name>.<col>, <original>)`), so rows without a JOIN match fall back to the existing value. |
|
|
544
|
+
| `dedupe` | boolean | No, defaults to `false` | When `true`, wraps the source CTE in `qualify row_number() over (partition by <joinKey>) = 1` so that dim sources with non-unique join keys yield one row per key. Which row is kept is non-deterministic; if you need a specific row, pre-aggregate in the source SQL. |
|
|
545
|
+
|
|
546
|
+
**Coalesce-or-add semantics.** If an enrichment column name matches an existing column on `enhanced_events` (a column promoted via `eventParamsToColumns`, a package-generated column, or a default GA4 column from the export), the enrichment value is coalesced with the original: `coalesce(enrich_<name>.<col>, <original>) as <col>`. Rows where the JOIN matches get the enrichment value; rows where it misses fall back to the existing value rather than going NULL. If there is no overlap, the column is added as a plain `enrich_<name>.<col>`.
|
|
547
|
+
|
|
548
|
+
**Example** — attach user cohort labels by `user_pseudo_id` (Dataform-declared table referenced by `{ schema, name }`):
|
|
549
|
+
|
|
550
|
+
```javascript
|
|
551
|
+
enrichments: [
|
|
552
|
+
{
|
|
553
|
+
name: 'cohorts',
|
|
554
|
+
level: 'event',
|
|
555
|
+
source: { schema: 'analytics', name: 'user_cohorts' },
|
|
556
|
+
joinKey: 'user_pseudo_id',
|
|
557
|
+
columns: ['cohort_label', 'lifecycle_stage'],
|
|
558
|
+
},
|
|
559
|
+
],
|
|
560
|
+
```
|
|
561
|
+
|
|
562
|
+
**Example** — composite key (date + user) for daily-varying dim data, with a `dedupe` safety net (external table referenced by a backtick-quoted fully qualified name):
|
|
563
|
+
|
|
564
|
+
```javascript
|
|
565
|
+
enrichments: [
|
|
566
|
+
{
|
|
567
|
+
name: 'segments',
|
|
568
|
+
level: 'event',
|
|
569
|
+
source: '`my-project.analytics.daily_user_segments`',
|
|
570
|
+
joinKey: ['event_date', 'user_pseudo_id'],
|
|
571
|
+
columns: ['segment'],
|
|
572
|
+
dedupe: true,
|
|
573
|
+
},
|
|
574
|
+
],
|
|
575
|
+
```
|
|
576
|
+
|
|
577
|
+
**Example** — fix a promoted event parameter via enrichment (coalesce case: enrichment value wins where the JOIN matches, original kept where it doesn't):
|
|
578
|
+
|
|
579
|
+
```javascript
|
|
580
|
+
{
|
|
581
|
+
eventParamsToColumns: [{ name: 'page_title', type: 'string' }],
|
|
582
|
+
enrichments: [
|
|
583
|
+
{
|
|
584
|
+
name: 'titles',
|
|
585
|
+
level: 'event',
|
|
586
|
+
source: { schema: 'analytics', name: 'page_title_overrides' },
|
|
587
|
+
joinKey: 'page_location',
|
|
588
|
+
columns: ['page_title'], // overlaps the promoted column → coalesce(enrich_titles.page_title, event_data.page_title)
|
|
589
|
+
},
|
|
590
|
+
],
|
|
591
|
+
}
|
|
592
|
+
```
|
|
593
|
+
|
|
594
|
+
**Example** — item-level enrichment: attach product master data to each item via `item_id`. The enrichment flows into the `items` array struct; `margin_bucket` is added as a new item-struct field, and `item_category` overlap-coalesces against the original. Item-level enrichment columns do NOT appear at the event grain — they live inside `items[].<col>`:
|
|
595
|
+
|
|
596
|
+
```javascript
|
|
597
|
+
enrichments: [
|
|
598
|
+
{
|
|
599
|
+
name: 'products',
|
|
600
|
+
level: 'item',
|
|
601
|
+
source: { schema: 'analytics', name: 'product_master' },
|
|
602
|
+
joinKey: 'item_id', // joins on item.item_id
|
|
603
|
+
columns: ['margin_bucket', 'item_category'], // margin_bucket is additive; item_category overlap-coalesces
|
|
604
|
+
},
|
|
605
|
+
],
|
|
606
|
+
```
|
|
607
|
+
|
|
608
|
+
For `level: 'item'`, valid `joinKey` values are any field on the GA4 items struct (`item_id`, `item_category`, etc.) or any column on `event_data` (`user_pseudo_id`, `event_date`, etc.). An event-level and an item-level enrichment may share the same column name (e.g. both writing `cohort`) — the two columns target structurally distinct slots (`enhanced_events.cohort` at event grain vs `items[].cohort` inside the items array) and are not in collision.
|
|
609
|
+
|
|
610
|
+
> **Note:** Each enrichment generates a CTE named `enrich_<name>` at the top of the pipeline. The `enrich_*` namespace is part of the reserved-names contract — `customSteps` cannot use these names. The active reserved set includes only the names of enrichments actually configured.
|
|
611
|
+
|
|
612
|
+
> **Note:** Event-level enrichment columns get auto-generated descriptions (`Added by enrichment '<name>' (joined on <joinKey> from <source>).` for new columns; `Coalesced by enrichment '<name>' (...; falls back to original on missed JOIN). Original: <description>` for overlapping columns). User-supplied `dataformTableConfig.columns` overrides win — the auto-generated description is the default. Item-level enrichment columns do not receive auto-generated descriptions (BigQuery does not surface per-field descriptions on STRUCT-array fields cleanly through Dataform's column-description mechanism).
|
|
613
|
+
|
|
614
|
+
> **Note:** `joinKey` and `columns` entries must be plain SQL identifiers — inline aliases like `'id as user_id'` are rejected at validation time. If your dim source uses a different column name, alias it in an upstream Dataform view and point `source` at that view.
|
|
615
|
+
|
|
495
616
|
<br>
|
|
496
617
|
|
|
497
618
|
---
|