dbt-scribe 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57) hide show
  1. dbt_scribe-0.1.0/.env.example +6 -0
  2. dbt_scribe-0.1.0/.github/workflows/ci.yml +30 -0
  3. dbt_scribe-0.1.0/.gitignore +68 -0
  4. dbt_scribe-0.1.0/CONTEXT.md +91 -0
  5. dbt_scribe-0.1.0/DECISIONS.md +363 -0
  6. dbt_scribe-0.1.0/NEXT_STEPS.md +240 -0
  7. dbt_scribe-0.1.0/PKG-INFO +432 -0
  8. dbt_scribe-0.1.0/README.md +408 -0
  9. dbt_scribe-0.1.0/STRUCTURE.md +269 -0
  10. dbt_scribe-0.1.0/dbt_scribe/__init__.py +1 -0
  11. dbt_scribe-0.1.0/dbt_scribe/analyzer.py +348 -0
  12. dbt_scribe-0.1.0/dbt_scribe/cli.py +254 -0
  13. dbt_scribe-0.1.0/dbt_scribe/config.py +157 -0
  14. dbt_scribe-0.1.0/dbt_scribe/coverage.py +1 -0
  15. dbt_scribe-0.1.0/dbt_scribe/generators/__init__.py +0 -0
  16. dbt_scribe-0.1.0/dbt_scribe/generators/base_generator.py +41 -0
  17. dbt_scribe-0.1.0/dbt_scribe/generators/docs_generator.py +192 -0
  18. dbt_scribe-0.1.0/dbt_scribe/generators/providers/__init__.py +0 -0
  19. dbt_scribe-0.1.0/dbt_scribe/generators/providers/anthropic_provider.py +30 -0
  20. dbt_scribe-0.1.0/dbt_scribe/generators/providers/google_provider.py +31 -0
  21. dbt_scribe-0.1.0/dbt_scribe/generators/providers/openai_provider.py +31 -0
  22. dbt_scribe-0.1.0/dbt_scribe/generators/tests_generator.py +248 -0
  23. dbt_scribe-0.1.0/dbt_scribe/parsers/__init__.py +0 -0
  24. dbt_scribe-0.1.0/dbt_scribe/parsers/manifest_parser.py +188 -0
  25. dbt_scribe-0.1.0/dbt_scribe/parsers/yaml_parser.py +64 -0
  26. dbt_scribe-0.1.0/dbt_scribe/prompts/docs_intermediate.j2 +26 -0
  27. dbt_scribe-0.1.0/dbt_scribe/prompts/docs_mart.j2 +31 -0
  28. dbt_scribe-0.1.0/dbt_scribe/prompts/docs_staging.j2 +32 -0
  29. dbt_scribe-0.1.0/dbt_scribe/prompts/tests_generic.j2 +78 -0
  30. dbt_scribe-0.1.0/dbt_scribe/prompts/tests_singular.j2 +1 -0
  31. dbt_scribe-0.1.0/dbt_scribe/resolver.py +51 -0
  32. dbt_scribe-0.1.0/dbt_scribe/writers/__init__.py +0 -0
  33. dbt_scribe-0.1.0/dbt_scribe/writers/docs_writer.py +65 -0
  34. dbt_scribe-0.1.0/dbt_scribe/writers/singular_test_writer.py +1 -0
  35. dbt_scribe-0.1.0/dbt_scribe/writers/yaml_writer.py +126 -0
  36. dbt_scribe-0.1.0/pyproject.toml +53 -0
  37. dbt_scribe-0.1.0/tests/fixtures/dbt_project/dbt-scribe.yml +34 -0
  38. dbt_scribe-0.1.0/tests/fixtures/dbt_project/dbt_project.yml +28 -0
  39. dbt_scribe-0.1.0/tests/fixtures/dbt_project/models/intermediate/rugby/int_fixtures_enriched_with_teams.sql +28 -0
  40. dbt_scribe-0.1.0/tests/fixtures/dbt_project/models/marts/rugby/fixtures.sql +22 -0
  41. dbt_scribe-0.1.0/tests/fixtures/dbt_project/models/staging/api_sports/stg_api_sports__fixtures.sql +21 -0
  42. dbt_scribe-0.1.0/tests/fixtures/dbt_project/models/staging/api_sports/stg_api_sports__fixtures.yml +31 -0
  43. dbt_scribe-0.1.0/tests/fixtures/dbt_project/target/manifest.json +266 -0
  44. dbt_scribe-0.1.0/tests/test_analyzer.py +115 -0
  45. dbt_scribe-0.1.0/tests/test_bootstrap.py +72 -0
  46. dbt_scribe-0.1.0/tests/test_cli_commands.py +151 -0
  47. dbt_scribe-0.1.0/tests/test_config.py +160 -0
  48. dbt_scribe-0.1.0/tests/test_docs_generator.py +114 -0
  49. dbt_scribe-0.1.0/tests/test_docs_writer.py +111 -0
  50. dbt_scribe-0.1.0/tests/test_integration_pipeline.py +45 -0
  51. dbt_scribe-0.1.0/tests/test_manifest_parser.py +50 -0
  52. dbt_scribe-0.1.0/tests/test_providers.py +128 -0
  53. dbt_scribe-0.1.0/tests/test_resolver.py +55 -0
  54. dbt_scribe-0.1.0/tests/test_tests_generator.py +116 -0
  55. dbt_scribe-0.1.0/tests/test_yaml_parser.py +75 -0
  56. dbt_scribe-0.1.0/tests/test_yaml_writer.py +169 -0
  57. dbt_scribe-0.1.0/uv.lock +1455 -0
@@ -0,0 +1,6 @@
1
+ # Copy this file to .env and fill in the keys for the provider(s) you use.
2
+ # Only the key for your configured provider is required.
3
+
4
+ ANTHROPIC_API_KEY=sk-ant-...
5
+ OPENAI_API_KEY=sk-...
6
+ GOOGLE_API_KEY=...
@@ -0,0 +1,30 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ pull_request:
6
+
7
+ jobs:
8
+ test:
9
+ runs-on: ubuntu-latest
10
+
11
+ steps:
12
+ - name: Check out repository
13
+ uses: actions/checkout@v4
14
+
15
+ - name: Set up Python
16
+ uses: actions/setup-python@v5
17
+ with:
18
+ python-version: "3.11"
19
+
20
+ - name: Install package
21
+ run: python -m pip install -e ".[dev]"
22
+
23
+ - name: Run ruff
24
+ run: ruff check .
25
+
26
+ - name: Run mypy
27
+ run: mypy dbt_scribe
28
+
29
+ - name: Run pytest
30
+ run: pytest
@@ -0,0 +1,68 @@
1
+ # =============================================================================
2
+ # dbt-scribe — .gitignore
3
+ # =============================================================================
4
+
5
+ # ── Python ────────────────────────────────────────────────────────────────────
6
+ __pycache__/
7
+ *.py[cod]
8
+ *$py.class
9
+ *.so
10
+ *.pyd
11
+
12
+ # Virtual environments
13
+ .venv/
14
+ venv/
15
+ env/
16
+ ENV/
17
+
18
+ # Distribution / packaging
19
+ dist/
20
+ build/
21
+ *.egg-info/
22
+ *.egg
23
+ .eggs/
24
+ MANIFEST
25
+
26
+ # Type checking
27
+ .mypy_cache/
28
+ .dmypy.json
29
+ dmypy.json
30
+ .pytype/
31
+
32
+ # Test / coverage
33
+ .pytest_cache/
34
+ .coverage
35
+ .coverage.*
36
+ coverage.xml
37
+ htmlcov/
38
+
39
+ # ── dbt-scribe specific ───────────────────────────────────────────────────────
40
+ # LLM response cache — keyed on compiled SQL + config fingerprint
41
+ .dbt-scribe-cache/
42
+
43
+ # DuckDB files from test projects
44
+ *.duckdb
45
+ *.duckdb.wal
46
+
47
+ # ── Environment / secrets ─────────────────────────────────────────────────────
48
+ .env
49
+ .env.local
50
+ .env.*.local
51
+
52
+ # ── Editors and OS ────────────────────────────────────────────────────────────
53
+ .DS_Store
54
+ Thumbs.db
55
+ .idea/
56
+ .vscode/
57
+ *.swp
58
+ *.swo
59
+ *~
60
+
61
+ # ── Claude Code ───────────────────────────────────────────────────────────────
62
+ .claude/
63
+
64
+ # ── Misc ──────────────────────────────────────────────────────────────────────
65
+ *.log
66
+
67
+ # Claude Code local settings (duplicate — .claude/ is already ignored in the Claude Code section above)
68
+ .claude/
@@ -0,0 +1,91 @@
1
+ # CONTEXT.md — dbt-scribe
2
+
3
+ ## What this project is
4
+
5
+ `dbt-scribe` is a Python CLI that analyses a dbt Core project and uses an LLM
6
+ (Anthropic Claude, OpenAI, or Google Gemini) to automatically generate:
7
+
8
+ - **Documentation** — model and column descriptions in YAML, plus long-form docs
9
+ blocks in Markdown (`*__docs.md` files), following dbt's two-tier convention
10
+ - **Tests** — named generic tests in YAML (`not_null`, `unique`, `accepted_values`,
11
+ `relationships`) and singular SQL tests in `tests/`, inferred from column semantics
12
+
13
+ The tool is non-destructive by default: it only fills in what is missing and never
14
+ overwrites existing descriptions or tests unless `--force` is explicitly passed.
15
+ A `{{ doc("...") }}` reference is treated as a filled description and is preserved.
16
+
17
+ ## Problem it solves
18
+
19
+ Writing thorough dbt documentation and tests manually is time-consuming. A typical
20
+ staging model with 15 columns takes 30–45 minutes to document properly when following
21
+ strict conventions (English descriptions, two-tier docs, named tests, shared blocks,
22
+ four-section mart template, etc.). This debt accumulates quickly across multiple
23
+ projects and degrades portfolio quality.
24
+
25
+ Existing tools (`dbt-osmosis`, `dbt-codegen`, `dbt Assist`) either perform
26
+ mechanical propagation without LLM understanding, generate empty boilerplate, or
27
+ require dbt Cloud. `dbt-scribe` fills the gap: LLM-powered generation, local,
28
+ configurable per project, compatible with dbt Core.
29
+
30
+ ## How it works
31
+
32
+ 1. **Bootstrap** — validates that the current directory is a dbt project root
33
+ (`dbt_project.yml` + `target/manifest.json` + `dbt-scribe.yml` must all be present)
34
+ 2. **Manifest parsing** — reads `target/manifest.json` to extract compiled SQL
35
+ (Jinja2-resolved), column lists, lineage, adapter type, and fully-qualified node names
36
+ 3. **YAML parsing** — reads existing `.yml` files to determine what is already documented
37
+ 4. **Analysis** — detects the layer (staging / intermediate / marts) and infers column
38
+ types (pk, fk, enum, timestamp, boolean, metric, shared, text)
39
+ 5. **Generation** — calls the configured LLM provider with structured Jinja2 prompts;
40
+ all responses are JSON for reliable parsing
41
+ 6. **Writing** — creates `.yml` files from scratch or merges into existing ones;
42
+ creates or updates `*__docs.md` files; writes singular test SQL files
43
+
44
+ > **Prerequisite:** `dbt compile` must be run before `dbt-scribe` so that
45
+ > `target/manifest.json` is up to date. The tool validates this at startup.
46
+
47
+ ## Configuration
48
+
49
+ Each dbt project that uses `dbt-scribe` has its own `dbt-scribe.yml` at its root.
50
+ This file is versioned with the dbt project and controls: LLM provider and model,
51
+ documentation rules (two-tier, shared columns, mart template), test generation patterns
52
+ (PK/FK/enum column name regexes), coverage thresholds, and layer conventions.
53
+
54
+ A default config is generated by `dbt-scribe init`.
55
+
56
+ ## Design principles
57
+
58
+ - **Manifest-first** — compiled SQL from `manifest.json` is the only source of truth;
59
+ raw `.sql` files (which contain unresolved Jinja2) are never parsed directly
60
+ - **Non-destructive** — existing documentation and tests are preserved unless `--force`
61
+ - **Convention-aware** — the tool understands dbt layer conventions and adapts its
62
+ output accordingly (different doc templates for staging vs. marts, etc.)
63
+ - **Provider-agnostic** — an `LLMProvider` abstraction isolates all generator code
64
+ from the specifics of Anthropic, OpenAI, or Google SDKs
65
+ - **JSON output from LLM** — all prompts instruct the model to return JSON only,
66
+ eliminating fragile regex-based text parsing
67
+
68
+ ## Supported dbt adapters
69
+
70
+ DuckDB (default), BigQuery, PostgreSQL. Auto-detected from `manifest.json` metadata.
71
+
72
+ ## Target users (V1)
73
+
74
+ Solo analytics engineers and small teams running dbt Core locally, with a three-layer
75
+ medallion architecture (staging / intermediate / marts). The tool is designed to be
76
+ dropped into any existing dbt project with minimal setup.
77
+
78
+ ## Repository
79
+
80
+ - GitHub: https://github.com/jeremy6680/dbt-scribe
81
+ - Author: Jeremy Marchandeau
82
+ - License: MIT
83
+ - Part of the Web2Data portfolio (web2data.jeremymarchandeau.com)
84
+
85
+ ## Related projects
86
+
87
+ - `w2d-scaffold` — project scaffolding CLI; `dbt-scribe init` may eventually be called
88
+ automatically when scaffolding a new `data` project type
89
+ - Metrigator — shares the same "audit and automatically improve project quality" philosophy
90
+ - TasteBase, BrickMetrics, DraftLab — dbt projects in the portfolio that serve as
91
+ real-world test targets for `dbt-scribe`
@@ -0,0 +1,363 @@
1
+ # DECISIONS.md — dbt-scribe
2
+
3
+ Architectural and technical decisions log.
4
+ Each entry documents what was decided, why, and what alternatives were considered.
5
+
6
+ ---
7
+
8
+ ## ADR-001 — Multi-provider LLM abstraction from V1
9
+
10
+ **Date:** 2026-05-07
11
+ **Status:** Accepted
12
+
13
+ **Decision:** Support three LLM providers from V1 (Anthropic Claude, OpenAI, Google
14
+ Gemini) through a shared `LLMProvider` abstract interface.
15
+
16
+ **Rationale:** Provider preference is a legitimate user choice (cost, access, model
17
+ quality). Implementing the abstraction upfront costs little (one interface + three
18
+ simple adapters) and avoids a painful refactor later. All providers receive identical
19
+ prompts (structured JSON, temperature 0.2), making the abstraction natural.
20
+
21
+ **Providers:**
22
+
23
+ | Provider | Default model | Environment variable | SDK package |
24
+ | --------------------- | -------------------------- | -------------------- | -------------- |
25
+ | `anthropic` (default) | `claude-sonnet-4-20250514` | `ANTHROPIC_API_KEY` | `anthropic` |
26
+ | `openai` | `gpt-4o` | `OPENAI_API_KEY` | `openai` |
27
+ | `google` | `gemini-2.5-pro` | `GOOGLE_API_KEY` | `google-genai` |
28
+
29
+ **Note on Google SDK:** The originally planned `google-generativeai` package was
30
+ deprecated before implementation. The `google-genai` package (Google's replacement,
31
+ `from google import genai`) is used instead. The interface is functionally equivalent.
32
+
33
+ **Consequence:** Three SDK dependencies instead of one. All three are installed by
34
+ default; optional extras may be introduced in V2 if package size becomes a concern.
35
+
36
+ **Alternative rejected:** Claude-only in V1, abstraction in V2. Rejected because the
37
+ interface is trivial to write now and would cause an API-breaking change later.
38
+
39
+ ---
40
+
41
+ ## ADR-002 — Structured JSON output from LLM
42
+
43
+ **Date:** 2026-05-07
44
+ **Status:** Accepted
45
+
46
+ **Decision:** All LLM calls must return valid JSON only. The system prompt explicitly
47
+ forbids any text outside the JSON structure (no preamble, no markdown fences,
48
+ no explanation).
49
+
50
+ **Rationale:** JSON parsing is reliable and deterministic. Free-text responses would
51
+ require fragile regex-based extraction that breaks on minor model output variations.
52
+
53
+ **Consequence:** Prompts must be carefully engineered to prevent the model from adding
54
+ explanatory text. A JSON parse failure triggers a retry (up to 3 attempts).
55
+
56
+ ---
57
+
58
+ ## ADR-003 — One LLM call per model, not per column
59
+
60
+ **Date:** 2026-05-07
61
+ **Status:** Accepted
62
+
63
+ **Decision:** Generate documentation and tests for all columns of a model in a single
64
+ LLM call.
65
+
66
+ **Rationale:** A single call gives the model full inter-column context (e.g., it can
67
+ infer that `home_score` and `away_score` are paired concepts). It also reduces API
68
+ cost and latency compared to one call per column.
69
+
70
+ **Consequence:** Prompts can be long for wide models. If a model exceeds ~50 columns,
71
+ the call will be split into two sequential calls to stay within context limits.
72
+
73
+ ---
74
+
75
+ ## ADR-004 — `ruamel.yaml` instead of `PyYAML`
76
+
77
+ **Date:** 2026-05-07
78
+ **Status:** Accepted (deferred to Phase 2 — `PyYAML` used in Phase 1 for simplicity)
79
+
80
+ **Decision:** Use `ruamel.yaml` for all YAML read/write operations in the final tool.
81
+
82
+ **Rationale:** `PyYAML` does not preserve comments or key ordering in round-trip
83
+ operations. dbt YAML files use structured comments (`# ── Primary key ──`) as
84
+ visual separators that must be preserved when merging into existing files.
85
+
86
+ **Consequence:** `ruamel.yaml` has a slightly more verbose API than `PyYAML`.
87
+ Phase 1 uses `PyYAML` to keep the initial scope small; migration to `ruamel.yaml`
88
+ is scheduled for Phase 2 (Step tracked in NEXT_STEPS.md backlog).
89
+
90
+ ---
91
+
92
+ ## ADR-005 — Cache key = SHA-256(compiled_sql + config_fingerprint)
93
+
94
+ **Date:** 2026-05-07
95
+ **Status:** Accepted (deferred to Phase 2)
96
+
97
+ **Decision:** LLM response cache keys are computed from a composite hash of the
98
+ compiled SQL and the relevant sections of `dbt-scribe.yml`.
99
+
100
+ **Rationale:** Hashing compiled SQL alone is insufficient — if `dbt-scribe.yml`
101
+ changes (new patterns, new `default_owner`, new thresholds), cached results may no
102
+ longer match the expected output even if the SQL is unchanged.
103
+
104
+ **Implementation:**
105
+
106
+ ```python
107
+ config_fingerprint = json.dumps({
108
+ "llm": config.llm.model_dump(),
109
+ "docs": config.docs.model_dump(),
110
+ "tests": config.tests.model_dump(),
111
+ "conventions": config.conventions.model_dump(),
112
+ }, sort_keys=True)
113
+ cache_key = hashlib.sha256((compiled_sql + config_fingerprint).encode()).hexdigest()
114
+ ```
115
+
116
+ **Consequence:** `.dbt-scribe-cache/` directory must be added to `.gitignore`.
117
+ A config change invalidates all cache entries for affected models.
118
+
119
+ ---
120
+
121
+ ## ADR-006 — `dbt-scribe.yml` lives in the dbt project root
122
+
123
+ **Date:** 2026-05-07
124
+ **Status:** Accepted
125
+
126
+ **Decision:** The configuration file is versioned inside the target dbt project,
127
+ not in a global user directory.
128
+
129
+ **Rationale:** Each dbt project has its own conventions (adapter, coverage thresholds,
130
+ shared column names, owner details). The config must evolve with the project and be
131
+ visible to anyone cloning the repo.
132
+
133
+ **Consequence:** `dbt-scribe` looks for `dbt-scribe.yml` in the current working
134
+ directory. A missing config file triggers a `BootstrapError` with an explicit
135
+ suggestion to run `dbt-scribe init`.
136
+
137
+ ---
138
+
139
+ ## ADR-007 — `sqlglot` applied to compiled SQL only (never to raw `.sql` files)
140
+
141
+ **Date:** 2026-05-07
142
+ **Status:** Accepted
143
+
144
+ **Decision:** `sqlglot` is used exclusively to parse the compiled SQL extracted from
145
+ `manifest.json`. Raw `.sql` files (which contain unresolved Jinja2 templates) are
146
+ never passed to `sqlglot`.
147
+
148
+ **Rationale:** `sqlglot` is an excellent multi-dialect SQL parser, but dbt `.sql`
149
+ files contain Jinja2 (`{{ ref('...') }}`, `{{ var('...') }}`, custom macros) that
150
+ `sqlglot` cannot handle. Using compiled SQL (where Jinja2 is fully resolved by dbt)
151
+ eliminates this friction entirely.
152
+
153
+ **Consequence:** `sql_parser.py` does not exist. All SQL parsing logic lives in
154
+ `manifest_parser.py` (extraction from manifest) and `analyzer.py` (calling `sqlglot`
155
+ on the extracted compiled SQL).
156
+
157
+ **Alternative rejected:** Naive Jinja2 pre-processing (replace `{{ ... }}` with
158
+ placeholders). Rejected because it is fragile and impossible to test exhaustively.
159
+
160
+ ---
161
+
162
+ ## ADR-008 — YAML Writer creates missing files from scratch
163
+
164
+ **Date:** 2026-05-07
165
+ **Status:** Accepted
166
+
167
+ **Decision:** `dbt-scribe` creates `.yml` files from scratch when they do not exist,
168
+ rather than requiring a pre-existing skeleton from `dbt-codegen`.
169
+
170
+ **Rationale:** Makes the tool fully autonomous. Users can run `dbt-scribe generate`
171
+ on a dbt project that has no YAML documentation at all and get a complete, correctly
172
+ structured result. `dbt-codegen` remains compatible but is no longer required.
173
+
174
+ **Consequence:** The YAML Writer must know the canonical structure expected per layer
175
+ (section comments, column ordering, conditional `persist_docs`, tags inferred from
176
+ the manifest `fqn`).
177
+
178
+ **Alternative rejected:** Require a pre-existing empty `.yml` file. Rejected because
179
+ it introduces unnecessary friction (two-tool workflow for a task that should be one command).
180
+
181
+ ---
182
+
183
+ ## ADR-009 — `manifest.json` is a mandatory prerequisite
184
+
185
+ **Date:** 2026-05-07
186
+ **Status:** Accepted
187
+
188
+ **Decision:** `target/manifest.json` is required for all commands except `init`.
189
+ There is no degraded mode that operates without the manifest in V1.
190
+
191
+ **Rationale:** Raw `.sql` files contain Jinja2 that cannot be reliably parsed without
192
+ running dbt. The manifest, produced by `dbt compile`, contains fully-resolved compiled
193
+ SQL — the only reliable source for column extraction.
194
+
195
+ **User impact:** The user must run `dbt compile` before using `dbt-scribe`. This is
196
+ documented prominently in the README and enforced by the bootstrap check with an
197
+ explicit error message including the command to run.
198
+
199
+ **Alternative rejected:** Best-effort mode on raw SQL (Jinja2 replaced by placeholders).
200
+ Rejected because results would be unreliable and difficult to test. The `dbt compile`
201
+ prerequisite is a low-friction, well-understood workflow step.
202
+
203
+ **V2 consideration:** A `--no-manifest` flag could enable partial analysis on raw SQL
204
+ for cases where compiling the project is not possible (e.g., missing warehouse credentials).
205
+
206
+ ---
207
+
208
+ ## ADR-010 — `{{ doc("...") }}` reference treated as a filled description
209
+
210
+ **Date:** 2026-05-07
211
+ **Status:** Accepted
212
+
213
+ **Decision:** The YAML Writer treats a column description containing `{{ doc("...") }}`
214
+ as "already documented" and will not overwrite it unless `--force` is passed.
215
+
216
+ **Rationale:** A `doc()` reference is intentional documentation. Overwriting it
217
+ automatically would silently break carefully hand-crafted YAML files.
218
+
219
+ **Implementation:** `is_description_set(description)` in `yaml_parser.py` returns
220
+ `True` for any non-empty string OR any string matching the pattern `{{\s*doc\(`.
221
+
222
+ **Consequence:** `--force` is the only mechanism to replace an existing `doc()` reference
223
+ with a generated inline description.
224
+
225
+ ---
226
+
227
+ ## ADR-011 — Run from dbt project root, no `--project-dir` flag in V1
228
+
229
+ **Date:** 2026-05-07
230
+ **Status:** Accepted
231
+
232
+ **Decision:** `dbt-scribe` must be invoked from the dbt project root directory.
233
+ No `--project-dir` flag is provided in V1.
234
+
235
+ **Rationale:** Keeps all path resolution relative to `os.getcwd()`, consistent with
236
+ how `dbt` itself works. Simpler bootstrap logic.
237
+
238
+ **Consequence:** The bootstrap check validates `dbt_project.yml` in the current
239
+ directory and exits with a clear error if it is absent.
240
+
241
+ **Alternative rejected:** Auto-discovery (walking up parent directories to find
242
+ `dbt_project.yml`). Rejected because implicit behavior is hard to debug and
243
+ could produce surprising results in nested project structures.
244
+
245
+ **V2:** A `--project-dir` flag will be added to support CI workflows that invoke
246
+ tools from a repository root that is not the dbt project root.
247
+
248
+ ---
249
+
250
+ ## ADR-012 — `model_root` read from `dbt_project.yml`, not hardcoded
251
+
252
+ **Date:** 2026-05-08
253
+ **Status:** Accepted
254
+
255
+ **Decision:** The path prefix used to locate model YAML files (default `"models"`) is
256
+ read from the `model-paths` key of `dbt_project.yml` at startup, rather than being
257
+ hardcoded as the string `"models"`.
258
+
259
+ **Rationale:** dbt's `model-paths` config is the authoritative source for where models
260
+ live. Hardcoding `"models"` would silently create files in the wrong directory (or miss
261
+ existing ones) for any project that sets `model-paths: [dbt_models]` or similar —
262
+ without raising an error, causing documentation corruption.
263
+
264
+ **Implementation:** `_read_model_root()` in `cli.py` reads `dbt_project.yml` (already
265
+ required by `_bootstrap()`) and extracts `model-paths[0]`, defaulting to `"models"` if
266
+ the key is absent. The value is stored in `ScribeConfig.model_root` via `model_copy()`
267
+ and threaded through to `yaml_writer`, `docs_writer`, and `resolver`.
268
+
269
+ **Consequence:** `ScribeConfig` gains a `model_root: str` field. It is not
270
+ user-configurable in `dbt-scribe.yml` — it is always derived from `dbt_project.yml` to
271
+ avoid duplicating config the user already maintains.
272
+
273
+ **Alternative rejected:** A `model_root` key in `dbt-scribe.yml`. Rejected because it
274
+ would require users to keep two files in sync.
275
+
276
+ ---
277
+
278
+ ## ADR-013 — Non-retryable HTTP status codes bypass the retry loop
279
+
280
+ **Date:** 2026-05-08
281
+ **Status:** Accepted
282
+
283
+ **Decision:** `LLMProvider.complete()` does not retry exceptions whose `status_code`
284
+ attribute is in `{400, 401, 403, 404}`. These are re-raised immediately.
285
+
286
+ **Rationale:** Retrying authentication errors (401), permission errors (403), or
287
+ not-found errors (404) is wasteful and misleading — the result is identical on every
288
+ attempt, and the 0 + 2 + 4 second backoff adds 6 seconds of delay before the same
289
+ failure. The real root cause (missing API key, wrong model name) is also obscured by
290
+ the final "LLM call failed after 3 attempts" message.
291
+
292
+ **Implementation:** Provider-agnostic duck typing: `getattr(exc, "status_code", None)`.
293
+ The Anthropic, OpenAI, and Google SDKs all expose `status_code` on their HTTP exception
294
+ classes. No provider SDK is imported in `base_generator.py`.
295
+
296
+ **Consequence:** Only transient errors (network timeout, 429 rate limit, 5xx) benefit
297
+ from the retry loop.
298
+
299
+ ---
300
+
301
+ ## ADR-014 — sqlglot column extraction fallback when manifest columns dict is empty
302
+
303
+ **Date:** 2026-05-09
304
+ **Status:** Accepted
305
+
306
+ **Decision:** When `manifest.json` has an empty `columns` dict for a node (which happens
307
+ when no YAML existed at `dbt compile` time), extract column names from the compiled SQL
308
+ using sqlglot. Try the adapter dialect first (e.g. `bigquery` for backtick quoting), then
309
+ fall back through `bigquery → duckdb → postgres → None` until one succeeds.
310
+
311
+ **Rationale:** dbt only populates `columns` in the manifest from existing YAML declarations.
312
+ On a greenfield project with no YAML, the dict is always empty — which is exactly the
313
+ situation `dbt-scribe` is designed to fix. The multi-dialect fallback handles BigQuery's
314
+ backtick quoting which sqlglot rejects without the correct dialect.
315
+
316
+ **Consequence:** `manifest_parser.py` now imports sqlglot. Column names extracted from SQL
317
+ are lowercased and have no data_type or description (None). This is acceptable — the
318
+ generator only needs column names to produce descriptions and tests.
319
+
320
+ ---
321
+
322
+ ## ADR-015 — Mart docs block assembled in Python, not by LLM
323
+
324
+ **Date:** 2026-05-09
325
+ **Status:** Accepted
326
+
327
+ **Decision:** The four-section mart docs block structure is assembled in Python in
328
+ `docs_generator._assemble_mart_docs_block()`. The LLM is only asked to provide the
329
+ content of two sections (`description_and_motivation` and `known_limitations`) as
330
+ separate JSON fields. The section headers, blank lines, and stakeholder sections are
331
+ added by code.
332
+
333
+ **Rationale:** Prompt-only enforcement of the four-section template proved unreliable —
334
+ the LLM consistently returned a `docs_block_content` key with free-form markdown
335
+ regardless of how the prompt was worded. Moving structure to code guarantees the template
336
+ is always respected. The fallback in `_assemble_mart_docs_block` handles the case where
337
+ the LLM still returns `docs_block_content` by using it as the description section content.
338
+
339
+ **Consequence:** `docs_mart.j2` now requests separate JSON fields. The LLM response shape
340
+ changed from `{model_description, docs_block_content, columns}` to
341
+ `{model_description, description_and_motivation, known_limitations, columns}`.
342
+ The Python assembly guarantees the four sections regardless of LLM output.
343
+
344
+ ---
345
+
346
+ ## ADR-016 — Layer detection uses alternate spelling fallbacks
347
+
348
+ **Date:** 2026-05-09
349
+ **Status:** Accepted
350
+
351
+ **Decision:** `detect_layer()` in `analyzer.py` accepts common alternate spellings of
352
+ layer folder names (`mart` vs `marts`, `int` vs `intermediate`) in addition to exact
353
+ config matches.
354
+
355
+ **Rationale:** Real dbt projects use `mart/` (without s) while the CDC and default config
356
+ specify `marts/`. Requiring an exact match caused `Layer.UNKNOWN` on the test project,
357
+ which silently routed mart models through the staging prompt and bypassed
358
+ `_assemble_mart_docs_block`. Alternate spelling fallbacks make the tool robust to the
359
+ most common naming variations without requiring config changes.
360
+
361
+ **Consequence:** Users with non-standard folder names should still set `marts_prefix`
362
+ correctly in `dbt-scribe.yml` — the fallbacks are a safety net, not a replacement for
363
+ correct config.