wraith-modelgen 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- modelgen/__init__.py +1 -0
- modelgen/cli.py +373 -0
- modelgen/config.py +62 -0
- modelgen/contract.py +265 -0
- modelgen/generator.py +363 -0
- modelgen/introspect.py +134 -0
- wraith_modelgen-0.4.0.dist-info/METADATA +220 -0
- wraith_modelgen-0.4.0.dist-info/RECORD +11 -0
- wraith_modelgen-0.4.0.dist-info/WHEEL +4 -0
- wraith_modelgen-0.4.0.dist-info/entry_points.txt +2 -0
- wraith_modelgen-0.4.0.dist-info/licenses/LICENSE +21 -0
modelgen/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.4.0"  # must match the released distribution version (wraith_modelgen-0.4.0)
|
modelgen/cli.py
ADDED
|
@@ -0,0 +1,373 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
import typer
|
|
7
|
+
from rich.console import Console
|
|
8
|
+
from rich.logging import RichHandler
|
|
9
|
+
from rich.panel import Panel
|
|
10
|
+
from rich.table import Table
|
|
11
|
+
|
|
12
|
+
from modelgen import __version__
|
|
13
|
+
from modelgen.contract import ContractError, EventContract
|
|
14
|
+
from modelgen.generator import GenerationError, Layer, generate
|
|
15
|
+
from modelgen.introspect import IntrospectionError
|
|
16
|
+
|
|
17
|
+
# Root Typer application. The gen, validate and run commands below register
# themselves on it through @app.command().
app = typer.Typer(
    name="modelgen",
    help=(
        "BigQuery dbt model scaffolder. Generates one layer"
        " at a time from a YAML data contract."
    ),
    # Invoking the program with no arguments prints help instead of erroring.
    no_args_is_help=True,
    rich_markup_mode="rich",
)
# Shared Rich console: used for all command output and handed to the
# RichHandler in _configure_logging so log lines go through the same console.
console = Console()
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def _version_callback(value: bool) -> None:
    """Print ``modelgen <version>`` and exit when the flag is set.

    Does nothing when ``value`` is false, so normal command processing
    continues.

    Raises:
        typer.Exit: after printing the version.
    """
    if not value:
        return
    console.print(f"modelgen {__version__}")
    raise typer.Exit()
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
# Root callback for the Typer app. It has no behaviour of its own; it exists
# to host the --version/-V option, whose work is done by _version_callback.
# (Documented with comments rather than a docstring so no help text changes.)
@app.callback()
def _main(
    version: bool = typer.Option(
        False,
        "--version",
        "-V",
        callback=_version_callback,
        # Eager so the version is handled before other parameters.
        is_eager=True,
        help="Show version and exit.",
    ),
) -> None:
    pass
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def _configure_logging(verbose: bool) -> None:
    """Configure root logging to emit through the shared Rich console.

    Args:
        verbose: use DEBUG level when true, INFO otherwise.
    """
    rich_handler = RichHandler(
        console=console, rich_tracebacks=False, show_path=False
    )
    if verbose:
        level = logging.DEBUG
    else:
        level = logging.INFO
    logging.basicConfig(
        level=level,
        format="%(message)s",
        datefmt="[%X]",
        handlers=[rich_handler],
    )
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
# `gen` renders exactly one layer (raw or staging) for a single contract.
# Exit codes: 2 when the layer flags are ambiguous, 1 on contract, generation
# or unexpected errors. The docstring below is the CLI help text, so
# implementation notes live in comments.
@app.command()
def gen(
    contract: Path = typer.Argument(
        ...,
        help="Path to the YAML data contract file.",
        exists=True,
        readable=True,
    ),
    raw: bool = typer.Option(
        False, "--raw", help="Generate the raw (Origin Data Product) layer."
    ),
    staging: bool = typer.Option(
        False,
        "--staging",
        help="Generate the staging (Consumption Data Product) layer.",
    ),
    output: Path = typer.Option(
        Path("./models"),
        "--output",
        "-o",
        help="Directory to write generated files into.",
    ),
    dry_run: bool = typer.Option(
        False,
        "--dry-run",
        help="Print what would be generated without writing.",
    ),
    verbose: bool = typer.Option(
        False, "--verbose", "-v", help="Enable debug logging."
    ),
):
    """
    Generate a dbt model layer from a YAML data contract.

    \b
    Examples:
      modelgen gen contract.yml --raw -o ./models/raw
      modelgen gen contract.yml --staging -o ./models/staging
    """
    from rich.markup import escape

    def _plain(value: object) -> str:
        # Contract values and exception text are arbitrary user data; escape
        # them so bracketed text is printed literally instead of being parsed
        # (and possibly swallowed or rejected) as Rich markup.
        return escape(str(value))

    _configure_logging(verbose)

    # Both flags set or both unset is ambiguous: exactly one layer per call.
    if raw == staging:
        console.print("[red]Specify exactly one of --raw or --staging.[/red]")
        raise typer.Exit(2)

    layer = Layer.RAW if raw else Layer.STAGING

    try:
        event = EventContract.from_file(contract)
    except ContractError as e:
        console.print(f"[red]Invalid contract:[/red] {_plain(e)}")
        raise typer.Exit(1)

    from modelgen.introspect import BigQueryIntrospector

    introspector = BigQueryIntrospector()

    target_dataset = (
        event.raw.dataset if layer == Layer.RAW else event.staging.dataset
    )
    target_model = (
        event.raw_model_name
        if layer == Layer.RAW
        else event.staging_model_name
    )

    console.print(
        Panel(
            f"[bold]Event:[/bold] {_plain(event.name)}\n"
            f"[bold]Layer:[/bold] {layer.value}\n"
            f"[bold]Source:[/bold] {_plain(event.source.fqn)}\n"
            f"[bold]Target:[/bold] {_plain(target_dataset)}.{_plain(target_model)}\n"
            f"[bold]Unique key:[/bold] {_plain(', '.join(event.unique_key))} | "
            f"[bold]Loaded at:[/bold] {_plain(event.loaded_at_field)}",
            title="[bold green]modelgen[/bold green]",
            expand=False,
        )
    )

    try:
        result = generate(
            contract=event,
            introspector=introspector,
            output_dir=output,
            layer=layer,
            contract_path=str(contract),
            dry_run=dry_run,
        )
    except (GenerationError, IntrospectionError) as e:
        console.print(f"[red]Generation failed:[/red] {_plain(e)}")
        raise typer.Exit(1)
    except Exception as e:
        # Top-level boundary: report anything unexpected as a clean CLI
        # failure; the traceback is shown only with --verbose.
        console.print(
            f"[red]Unexpected error:[/red] {type(e).__name__}: {_plain(e)}"
        )
        if verbose:
            console.print_exception()
        raise typer.Exit(1)

    table = Table(show_header=True, header_style="bold cyan")
    table.add_column("File")
    table.add_column("Status")
    status = "[yellow]dry-run[/yellow]" if dry_run else "[green]written[/green]"
    for path in result.written:
        table.add_row(_plain(path), status)
    console.print(table)

    if result.new_source_columns:
        console.print(
            Panel(
                f"Source has {len(result.new_source_columns)} column(s)"
                " not yet in the staging contract:\n"
                f" {_plain(', '.join(result.new_source_columns))}\n\n"
                "[dim]These flow through raw automatically. "
                "Add to the staging contract when "
                "downstream consumers need them.[/dim]",
                title="[yellow]Schema evolution detected[/yellow]",
                border_style="yellow",
            )
        )
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
# `validate` only parses the contract; it never contacts BigQuery.
# Exits with status 1 when the contract is rejected.
@app.command()
def validate(
    contract: Path = typer.Argument(
        ...,
        help="Path to the YAML data contract file.",
        exists=True,
        readable=True,
    ),
):
    """Validate a contract's YAML structure (does not introspect BigQuery)."""
    from rich.markup import escape

    # Only the parse can raise ContractError, so keep the try body minimal.
    try:
        event = EventContract.from_file(contract)
    except ContractError as e:
        # Escape so brackets in the message are printed literally rather than
        # being interpreted as Rich markup tags.
        console.print(f"[red]Invalid contract:[/red] {escape(str(e))}")
        raise typer.Exit(1)

    console.print(
        f"[green]OK[/green] Contract valid: [bold]{escape(str(event.name))}[/bold]\n"
        f" Source: {escape(str(event.source.fqn))}\n"
        f" Unique key: {escape(', '.join(event.unique_key))}\n"
        f" Staging columns declared: {len(event.staging.columns)}"
    )
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
# `run` processes every discovered contract: raw is always generated, staging
# only when the contract declares staging columns. Per-contract failures are
# collected and reported at the end (exit status 1); finding no contracts is
# not an error (exit status 0). The docstring below is the CLI help text, so
# implementation notes live in comments.
@app.command()
def run(
    config: Path | None = typer.Option(
        None,
        "--config",
        "-c",
        help=(
            "Path to .modelgen.yml. Auto-discovered from cwd"
            " upward if not given."
        ),
    ),
    dry_run: bool = typer.Option(
        False,
        "--dry-run",
        help="Print what would be generated without writing.",
    ),
    verbose: bool = typer.Option(
        False, "--verbose", "-v", help="Enable debug logging."
    ),
):
    """
    Discover all contracts in the repo and generate dbt models for every layer.

    \b
    Reads .modelgen.yml (or uses opinionated defaults) to find contracts
    and write to:
      models/raw/ — raw (Origin Data Product) layer
      models/staging/ — staging layer (skipped if no columns declared)

    \b
    Default layout expected in the consuming repo:
      contracts/ <- YAML data contracts
      models/raw/ <- generated raw models (output)
      models/staging/ <- generated staging models (output)
    """
    _configure_logging(verbose)

    from rich.markup import escape

    from modelgen.config import ConfigError, ModelgenConfig
    from modelgen.introspect import BigQueryIntrospector

    def _plain(value: object) -> str:
        # File names, glob patterns and error messages are arbitrary text;
        # escape them so "[...]" is printed literally instead of being parsed
        # as a Rich markup tag.
        return escape(str(value))

    try:
        if config is not None:
            cfg = ModelgenConfig.from_file(config)
        else:
            cfg = ModelgenConfig.discover(Path.cwd())
    except ConfigError as e:
        console.print(f"[red]Config error:[/red] {_plain(e)}")
        raise typer.Exit(1)

    contract_files = cfg.contract_files()
    if not contract_files:
        # A glob such as 'contracts/[a-z]*.yml' contains markup-like brackets.
        console.print(
            f"[yellow]No contracts found.[/yellow] "
            f"Pattern [dim]{_plain(repr(cfg.contracts_glob))}[/dim]"
            f" matched nothing in {_plain(cfg.root)}"
        )
        raise typer.Exit(0)

    console.print(
        Panel(
            f"[bold]Contracts:[/bold] {len(contract_files)}\n"
            f"[bold]Root:[/bold] {_plain(cfg.root)}\n"
            f"[bold]Raw output:[/bold] {_plain(cfg.raw_output)}\n"
            f"[bold]Staging output:[/bold] {_plain(cfg.staging_output)}",
            title="[bold green]modelgen run[/bold green]",
            expand=False,
        )
    )

    introspector = BigQueryIntrospector()

    table = Table(show_header=True, header_style="bold cyan")
    table.add_column("Contract")
    table.add_column("Layer")
    table.add_column("File")
    table.add_column("Status")

    # (contract file name, stage, message) for every failure encountered.
    errors: list[tuple[str, str, str]] = []
    # (event name, new source columns) reported by staging generation.
    new_cols_notices: list[tuple[str, list[str]]] = []

    status = "[yellow]dry-run[/yellow]" if dry_run else "[green]written[/green]"

    for contract_path in contract_files:
        try:
            event = EventContract.from_file(contract_path)
        except ContractError as e:
            errors.append((contract_path.name, "parse", str(e)))
            continue

        # Prefer a root-relative path; fall back to the path as given when
        # the contract does not live under the config root.
        try:
            rel = contract_path.relative_to(cfg.root)
        except ValueError:
            rel = contract_path

        # Raw — always
        try:
            raw_result = generate(
                contract=event,
                introspector=introspector,
                output_dir=cfg.raw_output,
                layer=Layer.RAW,
                contract_path=str(rel),
                dry_run=dry_run,
            )
            for p in raw_result.written:
                table.add_row(
                    _plain(contract_path.name), "raw", _plain(p.name), status
                )
        except (GenerationError, IntrospectionError) as e:
            errors.append((contract_path.name, "raw", str(e)))

        # Staging — skip if no columns declared in the contract
        if not event.staging.columns:
            continue

        try:
            stg_result = generate(
                contract=event,
                introspector=introspector,
                output_dir=cfg.staging_output,
                layer=Layer.STAGING,
                contract_path=str(rel),
                dry_run=dry_run,
            )
            for p in stg_result.written:
                table.add_row(
                    _plain(contract_path.name),
                    "staging",
                    _plain(p.name),
                    status,
                )
            if stg_result.new_source_columns:
                new_cols_notices.append(
                    (event.name, stg_result.new_source_columns)
                )
        except (GenerationError, IntrospectionError) as e:
            errors.append((contract_path.name, "staging", str(e)))

    console.print(table)

    for event_name, cols in new_cols_notices:
        console.print(
            Panel(
                f"[bold]{_plain(event_name)}[/bold]: {len(cols)} source"
                " column(s) not yet in staging contract:\n"
                f" {_plain(', '.join(cols))}\n\n"
                "[dim]These flow through raw automatically. "
                "Add to staging when consumers need them.[/dim]",
                title="[yellow]Schema evolution detected[/yellow]",
                border_style="yellow",
            )
        )

    if errors:
        for name, layer_name, msg in errors:
            # "[raw]" / "[staging]" / "[parse]" would be consumed as markup
            # tags and disappear from the output unless escaped.
            label = _plain(f"[{layer_name}]")
            console.print(
                f"[red]Error[/red] {label} {_plain(name)}: {_plain(msg)}"
            )
        raise typer.Exit(1)
|
|
366
|
+
|
|
367
|
+
|
|
368
|
+
def main():
    """Run the modelgen Typer application."""
    app()


# Allow running this module directly as a script.
if __name__ == "__main__":
    main()
|
modelgen/config.py
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Project-level configuration for modelgen.
|
|
3
|
+
|
|
4
|
+
Reads .modelgen.yml from the repo root (or walks up from cwd to find it).
|
|
5
|
+
All paths in the config file are relative to the .modelgen.yml location.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from dataclasses import dataclass
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
|
|
13
|
+
import yaml
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class ConfigError(ValueError):
    """Raised when .modelgen.yml is malformed or cannot be read."""


# Defaults applied when .modelgen.yml is absent or omits a key. All values are
# interpreted relative to the configuration root directory.
_DEFAULT_CONTRACTS = "contracts/*.yml"
_DEFAULT_RAW_OUTPUT = "models/raw"
_DEFAULT_STAGING_OUTPUT = "models/staging"


@dataclass
class ModelgenConfig:
    """Resolved project configuration.

    Attributes are set by ``_build``: ``root`` is the directory paths are
    resolved against, ``contracts_glob`` is a glob pattern evaluated under
    ``root``, and the two output paths are ``root`` joined with the configured
    (or default) relative locations.
    """

    root: Path
    contracts_glob: str
    raw_output: Path
    staging_output: Path

    def contract_files(self) -> list[Path]:
        """Return the paths matching ``contracts_glob`` under ``root``, sorted."""
        return sorted(self.root.glob(self.contracts_glob))

    @staticmethod
    def _string_option(raw: dict, key: str, default: str) -> str:
        """Return ``raw[key]`` (or ``default``), insisting that it is a string.

        Raises:
            ConfigError: if the configured value is not a string (for example
                a YAML null or list), which would otherwise surface later as
                an unrelated ``TypeError``.
        """
        value = raw.get(key, default)
        if not isinstance(value, str):
            raise ConfigError(f"'{key}' must be a string, got {value!r}")
        return value

    @classmethod
    def _build(cls, root: Path, raw: dict) -> ModelgenConfig:
        """Create a config rooted at ``root`` from parsed YAML ``raw``.

        Missing keys fall back to the module-level defaults.

        Raises:
            ConfigError: if a configured value is not a string.
        """
        contracts = cls._string_option(raw, "contracts", _DEFAULT_CONTRACTS)
        raw_out = cls._string_option(raw, "raw_output", _DEFAULT_RAW_OUTPUT)
        staging_out = cls._string_option(
            raw, "staging_output", _DEFAULT_STAGING_OUTPUT
        )
        return cls(
            root=root,
            contracts_glob=contracts,
            raw_output=root / raw_out,
            staging_output=root / staging_out,
        )

    @classmethod
    def from_file(cls, path: Path) -> ModelgenConfig:
        """Load configuration from the YAML file at ``path``.

        Paths in the file are resolved relative to ``path.parent``. An empty
        file yields the defaults.

        Raises:
            ConfigError: if the file cannot be read or decoded, is not valid
                YAML, is not a mapping, or holds non-string option values.
        """
        # Read separately from parsing so I/O problems (missing file,
        # permissions, bad encoding) are reported as ConfigError too; the CLI
        # only handles ConfigError for configuration problems.
        try:
            text = path.read_text(encoding="utf-8")
        except (OSError, UnicodeDecodeError) as e:
            raise ConfigError(f"Failed to read {path}: {e}") from e

        try:
            raw = yaml.safe_load(text) or {}
        except yaml.YAMLError as e:
            raise ConfigError(f"Failed to parse {path}: {e}") from e
        if not isinstance(raw, dict):
            raise ConfigError(f"{path} must be a YAML mapping.")
        return cls._build(path.parent, raw)

    @classmethod
    def discover(cls, start: Path) -> ModelgenConfig:
        """Find .modelgen.yml walking up from start; use defaults if absent.

        The first ``.modelgen.yml`` found in ``start`` or any of its parents
        is loaded; when none exists, defaults rooted at ``start`` are used.

        Raises:
            ConfigError: if a discovered file is unreadable or malformed.
        """
        for directory in [start, *start.parents]:
            candidate = directory / ".modelgen.yml"
            if candidate.exists():
                return cls.from_file(candidate)
        return cls._build(start, {})
|
modelgen/contract.py
ADDED
|
@@ -0,0 +1,265 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Data contract schema and parsing.
|
|
3
|
+
|
|
4
|
+
The contract is the source of truth. Source teams maintain it as part of
|
|
5
|
+
their release process. modelgen propagates contract changes to dbt models;
|
|
6
|
+
downstream FDPs see a stable interface because staging absorbs the drift.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from dataclasses import dataclass, field
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
from typing import Any
|
|
14
|
+
|
|
15
|
+
import yaml
|
|
16
|
+
|
|
17
|
+
# Contract `version` values accepted by EventContract.from_file. The value read
# from YAML is converted with str() before the membership check.
SUPPORTED_CONTRACT_VERSIONS = {"1"}
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class ContractError(ValueError):
    """Signals a data contract that is malformed or refers to invalid state."""
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
@dataclass(frozen=True)
class ColumnTest:
    """A named test attached to a staging column, with optional arguments."""

    name: str
    args: dict[str, Any] = field(default_factory=dict)

    @classmethod
    def from_raw(cls, raw: Any) -> ColumnTest:
        """Build a test from its YAML form.

        Accepted shapes:
          * a string: the test name, no arguments;
          * a single-entry mapping ``{name: args}`` where ``args`` is a
            mapping or empty/null.

        Raises:
            ContractError: for any other shape, a mapping with more or fewer
                than one key, or non-mapping arguments.
        """
        if isinstance(raw, str):
            return cls(name=raw)
        if isinstance(raw, dict):
            if len(raw) != 1:
                raise ContractError(
                    "Test entry must have exactly one key, "
                    f"got: {list(raw.keys())}"
                )
            # Take key and value together. Looking the value up again via the
            # stringified name raised KeyError for non-string YAML keys
            # (e.g. `1:` or `true:`).
            key, value = next(iter(raw.items()))
            name = str(key)
            args = value or {}
            if not isinstance(args, dict):
                raise ContractError(
                    f"Test args for {name!r} must be a mapping, "
                    f"got {type(args).__name__}"
                )
            return cls(name=name, args=args)
        raise ContractError(f"Unrecognised test format: {raw!r}")
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
@dataclass
class StagingColumn:
    """One column in staging: source ref + rename + cast + tests."""

    source: str  # column name in raw (== source column name)
    name: str = ""  # column name in staging (default: source.lower())
    type: str = ""  # BigQuery type to cast to (default: no cast)
    description: str = ""
    tests: list[ColumnTest] = field(default_factory=list)

    def __post_init__(self):
        """Validate ``source`` and default ``name`` to the lower-cased source.

        Raises:
            ContractError: if ``source`` is empty or not a string.
        """
        if not self.source:
            raise ContractError("Staging column must declare a 'source'.")
        if not isinstance(self.source, str):
            # Without this check a numeric YAML value fails below with
            # AttributeError on .lower(), which the CLI does not handle.
            raise ContractError(
                f"Staging column 'source' must be a string, got {self.source!r}"
            )
        if not self.name:
            self.name = self.source.lower()

    @property
    def needs_cast(self) -> bool:
        """True when a target type is declared."""
        return bool(self.type)

    @property
    def needs_rename(self) -> bool:
        """True when the staging name differs from the source name."""
        return self.source != self.name

    @classmethod
    def from_raw(cls, raw: dict) -> StagingColumn:
        """Build a column from its YAML mapping.

        Keys that are omitted or null take the field defaults.

        Raises:
            ContractError: if ``raw`` is not a mapping or lacks ``source``.
        """
        # A bare string would make the `in` test below a substring check and
        # then fail with TypeError on indexing.
        if not isinstance(raw, dict):
            raise ContractError(
                f"Staging column must be a mapping, got {raw!r}"
            )
        if "source" not in raw:
            raise ContractError(f"Staging column missing 'source': {raw}")
        # `key:` with no value parses as None; treat it like an omitted key so
        # iterating tests cannot raise TypeError.
        raw_tests = raw.get("tests") or []
        return cls(
            source=raw["source"],
            name=raw.get("name") or "",
            type=raw.get("type") or "",
            description=raw.get("description") or "",
            tests=[ColumnTest.from_raw(t) for t in raw_tests],
        )
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
@dataclass(frozen=True)
class SourceRef:
    """Fully-qualified BigQuery table reference."""

    project: str
    dataset: str
    table: str

    def __post_init__(self):
        """Require every part of the reference to be a non-empty string.

        Raises:
            ContractError: naming the first offending part.
        """
        for label, value in [
            ("project", self.project),
            ("dataset", self.dataset),
            ("table", self.table),
        ]:
            if not value or not isinstance(value, str):
                raise ContractError(
                    f"source.{label} must be a non-empty string."
                )

    @property
    def fqn(self) -> str:
        """The reference rendered as ``project.dataset.table``."""
        return f"{self.project}.{self.dataset}.{self.table}"

    @classmethod
    def from_raw(cls, raw: dict) -> SourceRef:
        """Build a reference from the contract's ``source`` mapping.

        Raises:
            ContractError: if the block is missing or empty, is not a mapping,
                or lacks a non-empty project, dataset or table.
        """
        if not raw:
            raise ContractError("Contract missing 'source' block.")
        if not isinstance(raw, dict):
            raise ContractError(
                "'source' must be a mapping with project, dataset and table, "
                f"got {type(raw).__name__}"
            )
        # Use .get so an absent key reaches __post_init__ as None and is
        # reported as ContractError instead of leaking a KeyError.
        return cls(
            project=raw.get("project"),
            dataset=raw.get("dataset"),
            table=raw.get("table"),
        )
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def _normalise_cluster_by(value: Any) -> list[str]:
    """Return ``cluster_by`` as a list: null -> [], a string -> [string].

    Raises:
        ContractError: if the value is neither null, a string nor a list.
    """
    if value is None:
        # `cluster_by:` with no value parses as None; treat it as "not set".
        return []
    if isinstance(value, str):
        return [value]
    if isinstance(value, (list, tuple)):
        return list(value)
    raise ContractError(
        f"cluster_by must be a string or list of strings, got {value!r}"
    )


@dataclass(frozen=True)
class PartitionConfig:
    """BigQuery partition spec. Maps directly to dbt's partition_by config."""

    field: str
    data_type: str = "timestamp"
    granularity: str = "day"

    @classmethod
    def from_raw(cls, raw: dict | None) -> PartitionConfig | None:
        """Build a partition spec, or return None when none is configured.

        Raises:
            ContractError: if ``raw`` is not a mapping or has no ``field``.
        """
        if raw is None:
            return None
        if not isinstance(raw, dict):
            raise ContractError(
                f"partition_by must be a mapping, got {type(raw).__name__}"
            )
        if "field" not in raw:
            raise ContractError("partition_by must specify a 'field'.")
        return cls(
            field=raw["field"],
            data_type=raw.get("data_type", "timestamp"),
            granularity=raw.get("granularity", "day"),
        )


@dataclass
class RawConfig:
    """Settings for the raw layer; every key is optional."""

    dataset: str = "raw"
    incremental_strategy: str = "merge"
    dedup: bool = False
    partition_by: PartitionConfig | None = None
    cluster_by: list[str] = field(default_factory=list)

    @classmethod
    def from_raw(cls, raw: dict | None) -> RawConfig:
        """Build the raw-layer settings from the contract's ``raw`` mapping.

        A missing or empty mapping yields the defaults.

        Raises:
            ContractError: if ``partition_by`` or ``cluster_by`` is malformed.
        """
        raw = raw or {}
        return cls(
            dataset=raw.get("dataset", "raw"),
            incremental_strategy=raw.get("incremental_strategy", "merge"),
            dedup=raw.get("dedup", False),
            partition_by=PartitionConfig.from_raw(raw.get("partition_by")),
            cluster_by=_normalise_cluster_by(raw.get("cluster_by")),
        )


@dataclass
class StagingConfig:
    """Settings for the staging layer, including its declared columns."""

    dataset: str = "staging"
    incremental_strategy: str = "merge"
    columns: list[StagingColumn] = field(default_factory=list)
    partition_by: PartitionConfig | None = None
    cluster_by: list[str] = field(default_factory=list)

    @classmethod
    def from_raw(cls, raw: dict | None) -> StagingConfig:
        """Build the staging settings from the contract's ``staging`` mapping.

        A missing or empty mapping yields the defaults, with no columns.

        Raises:
            ContractError: if ``columns`` is not a list, or a column,
                ``partition_by`` or ``cluster_by`` entry is malformed.
        """
        raw = raw or {}
        # `columns:` with no value parses as None; treat it as "no columns"
        # instead of failing with TypeError while iterating.
        raw_columns = raw.get("columns") or []
        if not isinstance(raw_columns, list):
            raise ContractError(
                "staging.columns must be a list, "
                f"got {type(raw_columns).__name__}"
            )
        return cls(
            dataset=raw.get("dataset", "staging"),
            incremental_strategy=raw.get("incremental_strategy", "merge"),
            columns=[StagingColumn.from_raw(c) for c in raw_columns],
            partition_by=PartitionConfig.from_raw(raw.get("partition_by")),
            cluster_by=_normalise_cluster_by(raw.get("cluster_by")),
        )
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
def _normalise_unique_key(raw: Any) -> list[str]:
    """Coerce a contract's ``unique_key`` into a list of column names.

    A single string becomes a one-element list; a non-empty list of strings
    is returned as a new list.

    Raises:
        ContractError: if the value is an empty list, or is neither a string
            nor a list made up solely of strings.
    """
    if isinstance(raw, str):
        return [raw]

    is_string_list = isinstance(raw, list) and not any(
        not isinstance(item, str) for item in raw
    )
    if not is_string_list:
        raise ContractError(
            f"unique_key must be a string or list of strings, got {raw!r}"
        )
    if len(raw) == 0:
        raise ContractError("unique_key list cannot be empty.")
    return [str(item) for item in raw]
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
@dataclass
class EventContract:
    """A parsed data contract for one event.

    Holds the event identity (``name``, ``unique_key``, ``loaded_at_field``),
    its BigQuery ``source`` and the per-layer ``raw`` / ``staging`` settings.
    """

    name: str
    description: str
    unique_key: list[str]
    loaded_at_field: str
    source: SourceRef
    raw: RawConfig
    staging: StagingConfig

    @property
    def raw_model_name(self) -> str:
        """Name of the raw-layer model: ``raw__<name>``."""
        return f"raw__{self.name}"

    @property
    def staging_model_name(self) -> str:
        """Name of the staging-layer model: ``stg__<name>``."""
        return f"stg__{self.name}"

    @property
    def unique_key_dbt(self) -> str:
        """Render unique_key in the form dbt expects: a string for single key,
        a list literal for composite."""
        quoted = [f"'{k}'" for k in self.unique_key]
        if len(quoted) == 1:
            return quoted[0]
        return "[" + ", ".join(quoted) + "]"

    @classmethod
    def from_file(cls, path: Path) -> EventContract:
        """Load and validate the contract stored at ``path``.

        Raises:
            ContractError: if the file cannot be read or decoded, is not valid
                YAML, has an unsupported ``version``, lacks the ``event``
                block or one of its required fields, or any nested section is
                malformed.
        """
        # Read separately from parsing so I/O and decoding failures are
        # reported as ContractError, the only error type callers handle for
        # contract problems. YAML is read as UTF-8 regardless of locale.
        try:
            text = path.read_text(encoding="utf-8")
        except (OSError, UnicodeDecodeError) as e:
            raise ContractError(
                f"Failed to read contract at {path}: {e}"
            ) from e

        try:
            raw_yaml = yaml.safe_load(text)
        except yaml.YAMLError as e:
            raise ContractError(f"Failed to parse YAML at {path}: {e}") from e

        if not isinstance(raw_yaml, dict):
            raise ContractError(
                "Contract root must be a mapping, "
                f"got {type(raw_yaml).__name__}"
            )

        # str() so an unquoted YAML integer (version: 1) matches "1".
        version = str(raw_yaml.get("version", ""))
        if version not in SUPPORTED_CONTRACT_VERSIONS:
            raise ContractError(
                f"Unsupported contract version {version!r}. "
                f"Supported: {sorted(SUPPORTED_CONTRACT_VERSIONS)}"
            )

        event = raw_yaml.get("event")
        if not isinstance(event, dict):
            raise ContractError("Contract missing 'event' block.")

        required = ["name", "unique_key", "loaded_at_field", "source"]
        missing = [k for k in required if k not in event]
        if missing:
            raise ContractError(f"event missing required fields: {missing}")

        # A key that is present but null/blank would otherwise produce model
        # names such as "raw__None".
        for key in ("name", "loaded_at_field"):
            value = event[key]
            if value is None or not str(value).strip():
                raise ContractError(f"event.{key} must be a non-empty value.")

        return cls(
            name=event["name"],
            description=event.get("description", ""),
            unique_key=_normalise_unique_key(event["unique_key"]),
            loaded_at_field=event["loaded_at_field"],
            source=SourceRef.from_raw(event["source"]),
            raw=RawConfig.from_raw(event.get("raw")),
            staging=StagingConfig.from_raw(event.get("staging")),
        )
|