dbt-scribe 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dbt_scribe-0.1.0/.env.example +6 -0
- dbt_scribe-0.1.0/.github/workflows/ci.yml +30 -0
- dbt_scribe-0.1.0/.gitignore +68 -0
- dbt_scribe-0.1.0/CONTEXT.md +91 -0
- dbt_scribe-0.1.0/DECISIONS.md +363 -0
- dbt_scribe-0.1.0/NEXT_STEPS.md +240 -0
- dbt_scribe-0.1.0/PKG-INFO +432 -0
- dbt_scribe-0.1.0/README.md +408 -0
- dbt_scribe-0.1.0/STRUCTURE.md +269 -0
- dbt_scribe-0.1.0/dbt_scribe/__init__.py +1 -0
- dbt_scribe-0.1.0/dbt_scribe/analyzer.py +348 -0
- dbt_scribe-0.1.0/dbt_scribe/cli.py +254 -0
- dbt_scribe-0.1.0/dbt_scribe/config.py +157 -0
- dbt_scribe-0.1.0/dbt_scribe/coverage.py +1 -0
- dbt_scribe-0.1.0/dbt_scribe/generators/__init__.py +0 -0
- dbt_scribe-0.1.0/dbt_scribe/generators/base_generator.py +41 -0
- dbt_scribe-0.1.0/dbt_scribe/generators/docs_generator.py +192 -0
- dbt_scribe-0.1.0/dbt_scribe/generators/providers/__init__.py +0 -0
- dbt_scribe-0.1.0/dbt_scribe/generators/providers/anthropic_provider.py +30 -0
- dbt_scribe-0.1.0/dbt_scribe/generators/providers/google_provider.py +31 -0
- dbt_scribe-0.1.0/dbt_scribe/generators/providers/openai_provider.py +31 -0
- dbt_scribe-0.1.0/dbt_scribe/generators/tests_generator.py +248 -0
- dbt_scribe-0.1.0/dbt_scribe/parsers/__init__.py +0 -0
- dbt_scribe-0.1.0/dbt_scribe/parsers/manifest_parser.py +188 -0
- dbt_scribe-0.1.0/dbt_scribe/parsers/yaml_parser.py +64 -0
- dbt_scribe-0.1.0/dbt_scribe/prompts/docs_intermediate.j2 +26 -0
- dbt_scribe-0.1.0/dbt_scribe/prompts/docs_mart.j2 +31 -0
- dbt_scribe-0.1.0/dbt_scribe/prompts/docs_staging.j2 +32 -0
- dbt_scribe-0.1.0/dbt_scribe/prompts/tests_generic.j2 +78 -0
- dbt_scribe-0.1.0/dbt_scribe/prompts/tests_singular.j2 +1 -0
- dbt_scribe-0.1.0/dbt_scribe/resolver.py +51 -0
- dbt_scribe-0.1.0/dbt_scribe/writers/__init__.py +0 -0
- dbt_scribe-0.1.0/dbt_scribe/writers/docs_writer.py +65 -0
- dbt_scribe-0.1.0/dbt_scribe/writers/singular_test_writer.py +1 -0
- dbt_scribe-0.1.0/dbt_scribe/writers/yaml_writer.py +126 -0
- dbt_scribe-0.1.0/pyproject.toml +53 -0
- dbt_scribe-0.1.0/tests/fixtures/dbt_project/dbt-scribe.yml +34 -0
- dbt_scribe-0.1.0/tests/fixtures/dbt_project/dbt_project.yml +28 -0
- dbt_scribe-0.1.0/tests/fixtures/dbt_project/models/intermediate/rugby/int_fixtures_enriched_with_teams.sql +28 -0
- dbt_scribe-0.1.0/tests/fixtures/dbt_project/models/marts/rugby/fixtures.sql +22 -0
- dbt_scribe-0.1.0/tests/fixtures/dbt_project/models/staging/api_sports/stg_api_sports__fixtures.sql +21 -0
- dbt_scribe-0.1.0/tests/fixtures/dbt_project/models/staging/api_sports/stg_api_sports__fixtures.yml +31 -0
- dbt_scribe-0.1.0/tests/fixtures/dbt_project/target/manifest.json +266 -0
- dbt_scribe-0.1.0/tests/test_analyzer.py +115 -0
- dbt_scribe-0.1.0/tests/test_bootstrap.py +72 -0
- dbt_scribe-0.1.0/tests/test_cli_commands.py +151 -0
- dbt_scribe-0.1.0/tests/test_config.py +160 -0
- dbt_scribe-0.1.0/tests/test_docs_generator.py +114 -0
- dbt_scribe-0.1.0/tests/test_docs_writer.py +111 -0
- dbt_scribe-0.1.0/tests/test_integration_pipeline.py +45 -0
- dbt_scribe-0.1.0/tests/test_manifest_parser.py +50 -0
- dbt_scribe-0.1.0/tests/test_providers.py +128 -0
- dbt_scribe-0.1.0/tests/test_resolver.py +55 -0
- dbt_scribe-0.1.0/tests/test_tests_generator.py +116 -0
- dbt_scribe-0.1.0/tests/test_yaml_parser.py +75 -0
- dbt_scribe-0.1.0/tests/test_yaml_writer.py +169 -0
- dbt_scribe-0.1.0/uv.lock +1455 -0
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
pull_request:
|
|
6
|
+
|
|
7
|
+
jobs:
|
|
8
|
+
test:
|
|
9
|
+
runs-on: ubuntu-latest
|
|
10
|
+
|
|
11
|
+
steps:
|
|
12
|
+
- name: Check out repository
|
|
13
|
+
uses: actions/checkout@v4
|
|
14
|
+
|
|
15
|
+
- name: Set up Python
|
|
16
|
+
uses: actions/setup-python@v5
|
|
17
|
+
with:
|
|
18
|
+
python-version: "3.11"
|
|
19
|
+
|
|
20
|
+
- name: Install package
|
|
21
|
+
run: python -m pip install -e ".[dev]"
|
|
22
|
+
|
|
23
|
+
- name: Run ruff
|
|
24
|
+
run: ruff check .
|
|
25
|
+
|
|
26
|
+
- name: Run mypy
|
|
27
|
+
run: mypy dbt_scribe
|
|
28
|
+
|
|
29
|
+
- name: Run pytest
|
|
30
|
+
run: pytest
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
# =============================================================================
|
|
2
|
+
# dbt-scribe — .gitignore
|
|
3
|
+
# =============================================================================
|
|
4
|
+
|
|
5
|
+
# ── Python ────────────────────────────────────────────────────────────────────
|
|
6
|
+
__pycache__/
|
|
7
|
+
*.py[cod]
|
|
8
|
+
*$py.class
|
|
9
|
+
*.so
|
|
10
|
+
*.pyd
|
|
11
|
+
|
|
12
|
+
# Virtual environments
|
|
13
|
+
.venv/
|
|
14
|
+
venv/
|
|
15
|
+
env/
|
|
16
|
+
ENV/
|
|
17
|
+
|
|
18
|
+
# Distribution / packaging
|
|
19
|
+
dist/
|
|
20
|
+
build/
|
|
21
|
+
*.egg-info/
|
|
22
|
+
*.egg
|
|
23
|
+
.eggs/
|
|
24
|
+
MANIFEST
|
|
25
|
+
|
|
26
|
+
# Type checking
|
|
27
|
+
.mypy_cache/
|
|
28
|
+
.dmypy.json
|
|
29
|
+
dmypy.json
|
|
30
|
+
.pytype/
|
|
31
|
+
|
|
32
|
+
# Test / coverage
|
|
33
|
+
.pytest_cache/
|
|
34
|
+
.coverage
|
|
35
|
+
.coverage.*
|
|
36
|
+
coverage.xml
|
|
37
|
+
htmlcov/
|
|
38
|
+
|
|
39
|
+
# ── dbt-scribe specific ───────────────────────────────────────────────────────
|
|
40
|
+
# LLM response cache — keyed on compiled SQL + config fingerprint
|
|
41
|
+
.dbt-scribe-cache/
|
|
42
|
+
|
|
43
|
+
# DuckDB files from test projects
|
|
44
|
+
*.duckdb
|
|
45
|
+
*.duckdb.wal
|
|
46
|
+
|
|
47
|
+
# ── Environment / secrets ─────────────────────────────────────────────────────
|
|
48
|
+
.env
|
|
49
|
+
.env.local
|
|
50
|
+
.env.*.local
|
|
51
|
+
|
|
52
|
+
# ── Editors and OS ────────────────────────────────────────────────────────────
|
|
53
|
+
.DS_Store
|
|
54
|
+
Thumbs.db
|
|
55
|
+
.idea/
|
|
56
|
+
.vscode/
|
|
57
|
+
*.swp
|
|
58
|
+
*.swo
|
|
59
|
+
*~
|
|
60
|
+
|
|
61
|
+
# ── Claude Code ───────────────────────────────────────────────────────────────
|
|
62
|
+
.claude/
|
|
63
|
+
|
|
64
|
+
# ── Misc ──────────────────────────────────────────────────────────────────────
|
|
65
|
+
*.log
|
|
66
|
+
|
|
67
|
+
# NOTE: duplicate entry removed — `.claude/` is already ignored in the
# "── Claude Code ──" section above.
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
# CONTEXT.md — dbt-scribe
|
|
2
|
+
|
|
3
|
+
## What this project is
|
|
4
|
+
|
|
5
|
+
`dbt-scribe` is a Python CLI that analyses a dbt Core project and uses an LLM
|
|
6
|
+
(Anthropic Claude, OpenAI, or Google Gemini) to automatically generate:
|
|
7
|
+
|
|
8
|
+
- **Documentation** — model and column descriptions in YAML, plus long-form docs
|
|
9
|
+
blocks in Markdown (`*__docs.md` files), following dbt's two-tier convention
|
|
10
|
+
- **Tests** — named generic tests in YAML (`not_null`, `unique`, `accepted_values`,
|
|
11
|
+
`relationships`) and singular SQL tests in `tests/`, inferred from column semantics
|
|
12
|
+
|
|
13
|
+
The tool is non-destructive by default: it only fills in what is missing and never
|
|
14
|
+
overwrites existing descriptions or tests unless `--force` is explicitly passed.
|
|
15
|
+
A `{{ doc("...") }}` reference is treated as a filled description and is preserved.
|
|
16
|
+
|
|
17
|
+
## Problem it solves
|
|
18
|
+
|
|
19
|
+
Writing thorough dbt documentation and tests manually is time-consuming. A typical
|
|
20
|
+
staging model with 15 columns takes 30–45 minutes to document properly when following
|
|
21
|
+
strict conventions (English descriptions, two-tier docs, named tests, shared blocks,
|
|
22
|
+
four-section mart template, etc.). This debt accumulates quickly across multiple
|
|
23
|
+
projects and degrades portfolio quality.
|
|
24
|
+
|
|
25
|
+
Existing tools (`dbt-osmosis`, `dbt-codegen`, `dbt Assist`) either perform
|
|
26
|
+
mechanical propagation without LLM understanding, generate empty boilerplate, or
|
|
27
|
+
require dbt Cloud. `dbt-scribe` fills the gap: LLM-powered generation, local,
|
|
28
|
+
configurable per project, compatible with dbt Core.
|
|
29
|
+
|
|
30
|
+
## How it works
|
|
31
|
+
|
|
32
|
+
1. **Bootstrap** — validates that the current directory is a dbt project root
|
|
33
|
+
(`dbt_project.yml` + `target/manifest.json` + `dbt-scribe.yml` must all be present)
|
|
34
|
+
2. **Manifest parsing** — reads `target/manifest.json` to extract compiled SQL
|
|
35
|
+
(Jinja2-resolved), column lists, lineage, adapter type, and fully-qualified node names
|
|
36
|
+
3. **YAML parsing** — reads existing `.yml` files to determine what is already documented
|
|
37
|
+
4. **Analysis** — detects the layer (staging / intermediate / marts) and infers column
|
|
38
|
+
types (pk, fk, enum, timestamp, boolean, metric, shared, text)
|
|
39
|
+
5. **Generation** — calls the configured LLM provider with structured Jinja2 prompts;
|
|
40
|
+
all responses are JSON for reliable parsing
|
|
41
|
+
6. **Writing** — creates `.yml` files from scratch or merges into existing ones;
|
|
42
|
+
creates or updates `*__docs.md` files; writes singular test SQL files
|
|
43
|
+
|
|
44
|
+
> **Prerequisite:** `dbt compile` must be run before `dbt-scribe` so that
|
|
45
|
+
> `target/manifest.json` is up to date. The tool validates this at startup.
|
|
46
|
+
|
|
47
|
+
## Configuration
|
|
48
|
+
|
|
49
|
+
Each dbt project that uses `dbt-scribe` has its own `dbt-scribe.yml` at its root.
|
|
50
|
+
This file is versioned with the dbt project and controls: LLM provider and model,
|
|
51
|
+
documentation rules (two-tier, shared columns, mart template), test generation patterns
|
|
52
|
+
(PK/FK/enum column name regexes), coverage thresholds, and layer conventions.
|
|
53
|
+
|
|
54
|
+
A default config is generated by `dbt-scribe init`.
|
|
55
|
+
|
|
56
|
+
## Design principles
|
|
57
|
+
|
|
58
|
+
- **Manifest-first** — compiled SQL from `manifest.json` is the only source of truth;
|
|
59
|
+
raw `.sql` files (which contain unresolved Jinja2) are never parsed directly
|
|
60
|
+
- **Non-destructive** — existing documentation and tests are preserved unless `--force`
|
|
61
|
+
- **Convention-aware** — the tool understands dbt layer conventions and adapts its
|
|
62
|
+
output accordingly (different doc templates for staging vs. marts, etc.)
|
|
63
|
+
- **Provider-agnostic** — an `LLMProvider` abstraction isolates all generator code
|
|
64
|
+
from the specifics of Anthropic, OpenAI, or Google SDKs
|
|
65
|
+
- **JSON output from LLM** — all prompts instruct the model to return JSON only,
|
|
66
|
+
eliminating fragile regex-based text parsing
|
|
67
|
+
|
|
68
|
+
## Supported dbt adapters
|
|
69
|
+
|
|
70
|
+
DuckDB (default), BigQuery, PostgreSQL. Auto-detected from `manifest.json` metadata.
|
|
71
|
+
|
|
72
|
+
## Target users (V1)
|
|
73
|
+
|
|
74
|
+
Solo analytics engineers and small teams running dbt Core locally, with a three-layer
|
|
75
|
+
medallion architecture (staging / intermediate / marts). The tool is designed to be
|
|
76
|
+
dropped into any existing dbt project with minimal setup.
|
|
77
|
+
|
|
78
|
+
## Repository
|
|
79
|
+
|
|
80
|
+
- GitHub: https://github.com/jeremy6680/dbt-scribe
|
|
81
|
+
- Author: Jeremy Marchandeau
|
|
82
|
+
- License: MIT
|
|
83
|
+
- Part of the Web2Data portfolio (web2data.jeremymarchandeau.com)
|
|
84
|
+
|
|
85
|
+
## Related projects
|
|
86
|
+
|
|
87
|
+
- `w2d-scaffold` — project scaffolding CLI; `dbt-scribe init` may eventually be called
|
|
88
|
+
automatically when scaffolding a new `data` project type
|
|
89
|
+
- Metrigator — shares the same "audit and automatically improve project quality" philosophy
|
|
90
|
+
- TasteBase, BrickMetrics, DraftLab — dbt projects in the portfolio that serve as
|
|
91
|
+
real-world test targets for `dbt-scribe`
|
|
@@ -0,0 +1,363 @@
|
|
|
1
|
+
# DECISIONS.md — dbt-scribe
|
|
2
|
+
|
|
3
|
+
Architectural and technical decisions log.
|
|
4
|
+
Each entry documents what was decided, why, and what alternatives were considered.
|
|
5
|
+
|
|
6
|
+
---
|
|
7
|
+
|
|
8
|
+
## ADR-001 — Multi-provider LLM abstraction from V1
|
|
9
|
+
|
|
10
|
+
**Date:** 2026-05-07
|
|
11
|
+
**Status:** Accepted
|
|
12
|
+
|
|
13
|
+
**Decision:** Support three LLM providers from V1 (Anthropic Claude, OpenAI, Google
|
|
14
|
+
Gemini) through a shared `LLMProvider` abstract interface.
|
|
15
|
+
|
|
16
|
+
**Rationale:** Provider preference is a legitimate user choice (cost, access, model
|
|
17
|
+
quality). Implementing the abstraction upfront costs little (one interface + three
|
|
18
|
+
simple adapters) and avoids a painful refactor later. All providers receive identical
|
|
19
|
+
prompts (structured JSON, temperature 0.2), making the abstraction natural.
|
|
20
|
+
|
|
21
|
+
**Providers:**
|
|
22
|
+
|
|
23
|
+
| Provider | Default model | Environment variable | SDK package |
|
|
24
|
+
| --------------------- | -------------------------- | -------------------- | -------------- |
|
|
25
|
+
| `anthropic` (default) | `claude-sonnet-4-20250514` | `ANTHROPIC_API_KEY` | `anthropic` |
|
|
26
|
+
| `openai` | `gpt-4o` | `OPENAI_API_KEY` | `openai` |
|
|
27
|
+
| `google` | `gemini-2.5-pro` | `GOOGLE_API_KEY` | `google-genai` |
|
|
28
|
+
|
|
29
|
+
**Note on Google SDK:** The originally planned `google-generativeai` package was
|
|
30
|
+
deprecated before implementation. The `google-genai` package (Google's replacement,
|
|
31
|
+
`from google import genai`) is used instead. The interface is functionally equivalent.
|
|
32
|
+
|
|
33
|
+
**Consequence:** Three SDK dependencies instead of one. All three are installed by
|
|
34
|
+
default; optional extras may be introduced in V2 if package size becomes a concern.
|
|
35
|
+
|
|
36
|
+
**Alternative rejected:** Claude-only in V1, abstraction in V2. Rejected because the
|
|
37
|
+
interface is trivial to write now and would cause an API-breaking change later.
|
|
38
|
+
|
|
39
|
+
---
|
|
40
|
+
|
|
41
|
+
## ADR-002 — Structured JSON output from LLM
|
|
42
|
+
|
|
43
|
+
**Date:** 2026-05-07
|
|
44
|
+
**Status:** Accepted
|
|
45
|
+
|
|
46
|
+
**Decision:** All LLM calls must return valid JSON only. The system prompt explicitly
|
|
47
|
+
forbids any text outside the JSON structure (no preamble, no markdown fences,
|
|
48
|
+
no explanation).
|
|
49
|
+
|
|
50
|
+
**Rationale:** JSON parsing is reliable and deterministic. Free-text responses would
|
|
51
|
+
require fragile regex-based extraction that breaks on minor model output variations.
|
|
52
|
+
|
|
53
|
+
**Consequence:** Prompts must be carefully engineered to prevent the model from adding
|
|
54
|
+
explanatory text. A JSON parse failure triggers a retry (up to 3 attempts).
|
|
55
|
+
|
|
56
|
+
---
|
|
57
|
+
|
|
58
|
+
## ADR-003 — One LLM call per model, not per column
|
|
59
|
+
|
|
60
|
+
**Date:** 2026-05-07
|
|
61
|
+
**Status:** Accepted
|
|
62
|
+
|
|
63
|
+
**Decision:** Generate documentation and tests for all columns of a model in a single
|
|
64
|
+
LLM call.
|
|
65
|
+
|
|
66
|
+
**Rationale:** A single call gives the model full inter-column context (e.g., it can
|
|
67
|
+
infer that `home_score` and `away_score` are paired concepts). It also reduces API
|
|
68
|
+
cost and latency compared to one call per column.
|
|
69
|
+
|
|
70
|
+
**Consequence:** Prompts can be long for wide models. If a model exceeds ~50 columns,
|
|
71
|
+
the call will be split into two sequential calls to stay within context limits.
|
|
72
|
+
|
|
73
|
+
---
|
|
74
|
+
|
|
75
|
+
## ADR-004 — `ruamel.yaml` instead of `PyYAML`
|
|
76
|
+
|
|
77
|
+
**Date:** 2026-05-07
|
|
78
|
+
**Status:** Accepted (deferred to Phase 2 — `PyYAML` used in Phase 1 for simplicity)
|
|
79
|
+
|
|
80
|
+
**Decision:** Use `ruamel.yaml` for all YAML read/write operations in the final tool.
|
|
81
|
+
|
|
82
|
+
**Rationale:** `PyYAML` does not preserve comments or key ordering in round-trip
|
|
83
|
+
operations. dbt YAML files use structured comments (`# ── Primary key ──`) as
|
|
84
|
+
visual separators that must be preserved when merging into existing files.
|
|
85
|
+
|
|
86
|
+
**Consequence:** `ruamel.yaml` has a slightly more verbose API than `PyYAML`.
|
|
87
|
+
Phase 1 uses `PyYAML` to keep the initial scope small; migration to `ruamel.yaml`
|
|
88
|
+
is scheduled for Phase 2 (tracked in the NEXT_STEPS.md backlog).
|
|
89
|
+
|
|
90
|
+
---
|
|
91
|
+
|
|
92
|
+
## ADR-005 — Cache key = SHA-256(compiled_sql + config_fingerprint)
|
|
93
|
+
|
|
94
|
+
**Date:** 2026-05-07
|
|
95
|
+
**Status:** Accepted (deferred to Phase 2)
|
|
96
|
+
|
|
97
|
+
**Decision:** LLM response cache keys are computed from a composite hash of the
|
|
98
|
+
compiled SQL and the relevant sections of `dbt-scribe.yml`.
|
|
99
|
+
|
|
100
|
+
**Rationale:** Hashing compiled SQL alone is insufficient — if `dbt-scribe.yml`
|
|
101
|
+
changes (new patterns, new `default_owner`, new thresholds), cached results may no
|
|
102
|
+
longer match the expected output even if the SQL is unchanged.
|
|
103
|
+
|
|
104
|
+
**Implementation:**
|
|
105
|
+
|
|
106
|
+
```python
|
|
107
|
+
config_fingerprint = json.dumps({
|
|
108
|
+
"llm": config.llm.model_dump(),
|
|
109
|
+
"docs": config.docs.model_dump(),
|
|
110
|
+
"tests": config.tests.model_dump(),
|
|
111
|
+
"conventions": config.conventions.model_dump(),
|
|
112
|
+
}, sort_keys=True)
|
|
113
|
+
cache_key = hashlib.sha256((compiled_sql + config_fingerprint).encode()).hexdigest()
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
**Consequence:** `.dbt-scribe-cache/` directory must be added to `.gitignore`.
|
|
117
|
+
A config change invalidates all cache entries for affected models.
|
|
118
|
+
|
|
119
|
+
---
|
|
120
|
+
|
|
121
|
+
## ADR-006 — `dbt-scribe.yml` lives in the dbt project root
|
|
122
|
+
|
|
123
|
+
**Date:** 2026-05-07
|
|
124
|
+
**Status:** Accepted
|
|
125
|
+
|
|
126
|
+
**Decision:** The configuration file is versioned inside the target dbt project,
|
|
127
|
+
not in a global user directory.
|
|
128
|
+
|
|
129
|
+
**Rationale:** Each dbt project has its own conventions (adapter, coverage thresholds,
|
|
130
|
+
shared column names, owner details). The config must evolve with the project and be
|
|
131
|
+
visible to anyone cloning the repo.
|
|
132
|
+
|
|
133
|
+
**Consequence:** `dbt-scribe` looks for `dbt-scribe.yml` in the current working
|
|
134
|
+
directory. A missing config file triggers a `BootstrapError` with an explicit
|
|
135
|
+
suggestion to run `dbt-scribe init`.
|
|
136
|
+
|
|
137
|
+
---
|
|
138
|
+
|
|
139
|
+
## ADR-007 — `sqlglot` applied to compiled SQL only (never to raw `.sql` files)
|
|
140
|
+
|
|
141
|
+
**Date:** 2026-05-07
|
|
142
|
+
**Status:** Accepted
|
|
143
|
+
|
|
144
|
+
**Decision:** `sqlglot` is used exclusively to parse the compiled SQL extracted from
|
|
145
|
+
`manifest.json`. Raw `.sql` files (which contain unresolved Jinja2 templates) are
|
|
146
|
+
never passed to `sqlglot`.
|
|
147
|
+
|
|
148
|
+
**Rationale:** `sqlglot` is an excellent multi-dialect SQL parser, but dbt `.sql`
|
|
149
|
+
files contain Jinja2 (`{{ ref('...') }}`, `{{ var('...') }}`, custom macros) that
|
|
150
|
+
`sqlglot` cannot handle. Using compiled SQL (where Jinja2 is fully resolved by dbt)
|
|
151
|
+
eliminates this friction entirely.
|
|
152
|
+
|
|
153
|
+
**Consequence:** `sql_parser.py` does not exist. All SQL parsing logic lives in
|
|
154
|
+
`manifest_parser.py` (extraction from manifest) and `analyzer.py` (calling `sqlglot`
|
|
155
|
+
on the extracted compiled SQL).
|
|
156
|
+
|
|
157
|
+
**Alternative rejected:** Naive Jinja2 pre-processing (replace `{{ ... }}` with
|
|
158
|
+
placeholders). Rejected because it is fragile and impossible to test exhaustively.
|
|
159
|
+
|
|
160
|
+
---
|
|
161
|
+
|
|
162
|
+
## ADR-008 — YAML Writer creates missing files from scratch
|
|
163
|
+
|
|
164
|
+
**Date:** 2026-05-07
|
|
165
|
+
**Status:** Accepted
|
|
166
|
+
|
|
167
|
+
**Decision:** `dbt-scribe` creates `.yml` files from scratch when they do not exist,
|
|
168
|
+
rather than requiring a pre-existing skeleton from `dbt-codegen`.
|
|
169
|
+
|
|
170
|
+
**Rationale:** Makes the tool fully autonomous. Users can run `dbt-scribe generate`
|
|
171
|
+
on a dbt project that has no YAML documentation at all and get a complete, correctly
|
|
172
|
+
structured result. `dbt-codegen` remains compatible but is no longer required.
|
|
173
|
+
|
|
174
|
+
**Consequence:** The YAML Writer must know the canonical structure expected per layer
|
|
175
|
+
(section comments, column ordering, conditional `persist_docs`, tags inferred from
|
|
176
|
+
the manifest `fqn`).
|
|
177
|
+
|
|
178
|
+
**Alternative rejected:** Require a pre-existing empty `.yml` file. Rejected because
|
|
179
|
+
it introduces unnecessary friction (two-tool workflow for a task that should be one command).
|
|
180
|
+
|
|
181
|
+
---
|
|
182
|
+
|
|
183
|
+
## ADR-009 — `manifest.json` is a mandatory prerequisite
|
|
184
|
+
|
|
185
|
+
**Date:** 2026-05-07
|
|
186
|
+
**Status:** Accepted
|
|
187
|
+
|
|
188
|
+
**Decision:** `target/manifest.json` is required for all commands except `init`.
|
|
189
|
+
There is no degraded mode that operates without the manifest in V1.
|
|
190
|
+
|
|
191
|
+
**Rationale:** Raw `.sql` files contain Jinja2 that cannot be reliably parsed without
|
|
192
|
+
running dbt. The manifest, produced by `dbt compile`, contains fully-resolved compiled
|
|
193
|
+
SQL — the only reliable source for column extraction.
|
|
194
|
+
|
|
195
|
+
**User impact:** The user must run `dbt compile` before using `dbt-scribe`. This is
|
|
196
|
+
documented prominently in the README and enforced by the bootstrap check with an
|
|
197
|
+
explicit error message including the command to run.
|
|
198
|
+
|
|
199
|
+
**Alternative rejected:** Best-effort mode on raw SQL (Jinja2 replaced by placeholders).
|
|
200
|
+
Rejected because results would be unreliable and difficult to test. The `dbt compile`
|
|
201
|
+
prerequisite is a low-friction, well-understood workflow step.
|
|
202
|
+
|
|
203
|
+
**V2 consideration:** A `--no-manifest` flag could enable partial analysis on raw SQL
|
|
204
|
+
for cases where compiling the project is not possible (e.g., missing warehouse credentials).
|
|
205
|
+
|
|
206
|
+
---
|
|
207
|
+
|
|
208
|
+
## ADR-010 — `{{ doc("...") }}` reference treated as a filled description
|
|
209
|
+
|
|
210
|
+
**Date:** 2026-05-07
|
|
211
|
+
**Status:** Accepted
|
|
212
|
+
|
|
213
|
+
**Decision:** The YAML Writer treats a column description containing `{{ doc("...") }}`
|
|
214
|
+
as "already documented" and will not overwrite it unless `--force` is passed.
|
|
215
|
+
|
|
216
|
+
**Rationale:** A `doc()` reference is intentional documentation. Overwriting it
|
|
217
|
+
automatically would silently break carefully hand-crafted YAML files.
|
|
218
|
+
|
|
219
|
+
**Implementation:** `is_description_set(description)` in `yaml_parser.py` returns
|
|
220
|
+
`True` for any non-empty string OR any string matching the pattern `{{\s*doc\(`.
|
|
221
|
+
|
|
222
|
+
**Consequence:** `--force` is the only mechanism to replace an existing `doc()` reference
|
|
223
|
+
with a generated inline description.
|
|
224
|
+
|
|
225
|
+
---
|
|
226
|
+
|
|
227
|
+
## ADR-011 — Run from dbt project root, no `--project-dir` flag in V1
|
|
228
|
+
|
|
229
|
+
**Date:** 2026-05-07
|
|
230
|
+
**Status:** Accepted
|
|
231
|
+
|
|
232
|
+
**Decision:** `dbt-scribe` must be invoked from the dbt project root directory.
|
|
233
|
+
No `--project-dir` flag is provided in V1.
|
|
234
|
+
|
|
235
|
+
**Rationale:** Keeps all path resolution relative to `os.getcwd()`, consistent with
|
|
236
|
+
how `dbt` itself works. Simpler bootstrap logic.
|
|
237
|
+
|
|
238
|
+
**Consequence:** The bootstrap check validates `dbt_project.yml` in the current
|
|
239
|
+
directory and exits with a clear error if it is absent.
|
|
240
|
+
|
|
241
|
+
**Alternative rejected:** Auto-discovery (walking up parent directories to find
|
|
242
|
+
`dbt_project.yml`). Rejected because implicit behavior is hard to debug and
|
|
243
|
+
could produce surprising results in nested project structures.
|
|
244
|
+
|
|
245
|
+
**V2:** A `--project-dir` flag will be added to support CI workflows that invoke
|
|
246
|
+
tools from a repository root that is not the dbt project root.
|
|
247
|
+
|
|
248
|
+
---
|
|
249
|
+
|
|
250
|
+
## ADR-012 — `model_root` read from `dbt_project.yml`, not hardcoded
|
|
251
|
+
|
|
252
|
+
**Date:** 2026-05-08
|
|
253
|
+
**Status:** Accepted
|
|
254
|
+
|
|
255
|
+
**Decision:** The path prefix used to locate model YAML files (default `"models"`) is
|
|
256
|
+
read from the `model-paths` key of `dbt_project.yml` at startup, rather than being
|
|
257
|
+
hardcoded as the string `"models"`.
|
|
258
|
+
|
|
259
|
+
**Rationale:** dbt's `model-paths` config is the authoritative source for where models
|
|
260
|
+
live. Hardcoding `"models"` would silently create files in the wrong directory (or miss
|
|
261
|
+
existing ones) for any project that sets `model-paths: [dbt_models]` or similar —
|
|
262
|
+
without raising an error, causing documentation corruption.
|
|
263
|
+
|
|
264
|
+
**Implementation:** `_read_model_root()` in `cli.py` reads `dbt_project.yml` (already
|
|
265
|
+
required by `_bootstrap()`) and extracts `model-paths[0]`, defaulting to `"models"` if
|
|
266
|
+
the key is absent. The value is stored in `ScribeConfig.model_root` via `model_copy()`
|
|
267
|
+
and threaded through to `yaml_writer`, `docs_writer`, and `resolver`.
|
|
268
|
+
|
|
269
|
+
**Consequence:** `ScribeConfig` gains a `model_root: str` field. It is not
|
|
270
|
+
user-configurable in `dbt-scribe.yml` — it is always derived from `dbt_project.yml` to
|
|
271
|
+
avoid duplicating config the user already maintains.
|
|
272
|
+
|
|
273
|
+
**Alternative rejected:** A `model_root` key in `dbt-scribe.yml`. Rejected because it
|
|
274
|
+
would require users to keep two files in sync.
|
|
275
|
+
|
|
276
|
+
---
|
|
277
|
+
|
|
278
|
+
## ADR-013 — Non-retryable HTTP status codes bypass the retry loop
|
|
279
|
+
|
|
280
|
+
**Date:** 2026-05-08
|
|
281
|
+
**Status:** Accepted
|
|
282
|
+
|
|
283
|
+
**Decision:** `LLMProvider.complete()` does not retry exceptions whose `status_code`
|
|
284
|
+
attribute is in `{400, 401, 403, 404}`. These are re-raised immediately.
|
|
285
|
+
|
|
286
|
+
**Rationale:** Retrying authentication errors (401), permission errors (403), or
|
|
287
|
+
not-found errors (404) is wasteful and misleading — the result is identical on every
|
|
288
|
+
attempt, and the 0 + 2 + 4 second backoff adds 6 seconds of delay before the same
|
|
289
|
+
failure. The real root cause (missing API key, wrong model name) is also obscured by
|
|
290
|
+
the final "LLM call failed after 3 attempts" message.
|
|
291
|
+
|
|
292
|
+
**Implementation:** Provider-agnostic duck typing: `getattr(exc, "status_code", None)`.
|
|
293
|
+
The Anthropic, OpenAI, and Google SDKs all expose `status_code` on their HTTP exception
|
|
294
|
+
classes. No provider SDK is imported in `base_generator.py`.
|
|
295
|
+
|
|
296
|
+
**Consequence:** Only transient errors (network timeout, 429 rate limit, 5xx) benefit
|
|
297
|
+
from the retry loop.
|
|
298
|
+
|
|
299
|
+
---
|
|
300
|
+
|
|
301
|
+
## ADR-014 — sqlglot column extraction fallback when manifest columns dict is empty
|
|
302
|
+
|
|
303
|
+
**Date:** 2026-05-09
|
|
304
|
+
**Status:** Accepted
|
|
305
|
+
|
|
306
|
+
**Decision:** When `manifest.json` has an empty `columns` dict for a node (which happens
|
|
307
|
+
when no YAML existed at `dbt compile` time), extract column names from the compiled SQL
|
|
308
|
+
using sqlglot. Try the adapter dialect first (e.g. `bigquery` for backtick quoting), then
|
|
309
|
+
fall back through `bigquery → duckdb → postgres → None` until one succeeds.
|
|
310
|
+
|
|
311
|
+
**Rationale:** dbt only populates `columns` in the manifest from existing YAML declarations.
|
|
312
|
+
On a greenfield project with no YAML, the dict is always empty — which is exactly the
|
|
313
|
+
situation `dbt-scribe` is designed to fix. The multi-dialect fallback handles BigQuery's
|
|
314
|
+
backtick quoting which sqlglot rejects without the correct dialect.
|
|
315
|
+
|
|
316
|
+
**Consequence:** `manifest_parser.py` now imports sqlglot. Column names extracted from SQL
|
|
317
|
+
are lowercased and have no data_type or description (None). This is acceptable — the
|
|
318
|
+
generator only needs column names to produce descriptions and tests.
|
|
319
|
+
|
|
320
|
+
---
|
|
321
|
+
|
|
322
|
+
## ADR-015 — Mart docs block assembled in Python, not by LLM
|
|
323
|
+
|
|
324
|
+
**Date:** 2026-05-09
|
|
325
|
+
**Status:** Accepted
|
|
326
|
+
|
|
327
|
+
**Decision:** The four-section mart docs block structure is assembled in Python in
|
|
328
|
+
`docs_generator._assemble_mart_docs_block()`. The LLM is only asked to provide the
|
|
329
|
+
content of two sections (`description_and_motivation` and `known_limitations`) as
|
|
330
|
+
separate JSON fields. The section headers, blank lines, and stakeholder sections are
|
|
331
|
+
added by code.
|
|
332
|
+
|
|
333
|
+
**Rationale:** Prompt-only enforcement of the four-section template proved unreliable —
|
|
334
|
+
the LLM consistently returned a `docs_block_content` key with free-form markdown
|
|
335
|
+
regardless of how the prompt was worded. Moving structure to code guarantees the template
|
|
336
|
+
is always respected. The fallback in `_assemble_mart_docs_block` handles the case where
|
|
337
|
+
the LLM still returns `docs_block_content` by using it as the description section content.
|
|
338
|
+
|
|
339
|
+
**Consequence:** `docs_mart.j2` now requests separate JSON fields. The LLM response shape
|
|
340
|
+
changed from `{model_description, docs_block_content, columns}` to
|
|
341
|
+
`{model_description, description_and_motivation, known_limitations, columns}`.
|
|
342
|
+
The Python assembly guarantees the four sections regardless of LLM output.
|
|
343
|
+
|
|
344
|
+
---
|
|
345
|
+
|
|
346
|
+
## ADR-016 — Layer detection uses alternate spelling fallbacks
|
|
347
|
+
|
|
348
|
+
**Date:** 2026-05-09
|
|
349
|
+
**Status:** Accepted
|
|
350
|
+
|
|
351
|
+
**Decision:** `detect_layer()` in `analyzer.py` accepts common alternate spellings of
|
|
352
|
+
layer folder names (`mart` vs `marts`, `int` vs `intermediate`) in addition to exact
|
|
353
|
+
config matches.
|
|
354
|
+
|
|
355
|
+
**Rationale:** Real dbt projects use `mart/` (without s) while the CDC and default config
|
|
356
|
+
specify `marts/`. Requiring an exact match caused `Layer.UNKNOWN` on the test project,
|
|
357
|
+
which silently routed mart models through the staging prompt and bypassed
|
|
358
|
+
`_assemble_mart_docs_block`. Alternate spelling fallbacks make the tool robust to the
|
|
359
|
+
most common naming variations without requiring config changes.
|
|
360
|
+
|
|
361
|
+
**Consequence:** Users with non-standard folder names should still set `marts_prefix`
|
|
362
|
+
correctly in `dbt-scribe.yml` — the fallbacks are a safety net, not a replacement for
|
|
363
|
+
correct config.
|