coop-data-doc 0.15.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86)
  1. coop_data_doc-0.15.0/.github/workflows/ci.yml +33 -0
  2. coop_data_doc-0.15.0/.github/workflows/publish.yml +38 -0
  3. coop_data_doc-0.15.0/.gitignore +8 -0
  4. coop_data_doc-0.15.0/ARCHITECTURE.md +167 -0
  5. coop_data_doc-0.15.0/CLAUDE.md +50 -0
  6. coop_data_doc-0.15.0/CONTRIBUTING.md +67 -0
  7. coop_data_doc-0.15.0/LICENSE +21 -0
  8. coop_data_doc-0.15.0/PKG-INFO +612 -0
  9. coop_data_doc-0.15.0/README.md +580 -0
  10. coop_data_doc-0.15.0/pyproject.toml +53 -0
  11. coop_data_doc-0.15.0/src/coop_data_doc/__init__.py +3 -0
  12. coop_data_doc-0.15.0/src/coop_data_doc/__main__.py +4 -0
  13. coop_data_doc-0.15.0/src/coop_data_doc/cli.py +586 -0
  14. coop_data_doc-0.15.0/src/coop_data_doc/config.py +362 -0
  15. coop_data_doc-0.15.0/src/coop_data_doc/crawler.py +149 -0
  16. coop_data_doc-0.15.0/src/coop_data_doc/diagnostics.py +157 -0
  17. coop_data_doc-0.15.0/src/coop_data_doc/graph/__init__.py +23 -0
  18. coop_data_doc-0.15.0/src/coop_data_doc/graph/model.py +246 -0
  19. coop_data_doc-0.15.0/src/coop_data_doc/graph/serialize.py +33 -0
  20. coop_data_doc-0.15.0/src/coop_data_doc/layering.py +149 -0
  21. coop_data_doc-0.15.0/src/coop_data_doc/linker/__init__.py +1 -0
  22. coop_data_doc-0.15.0/src/coop_data_doc/linker/cache.py +110 -0
  23. coop_data_doc-0.15.0/src/coop_data_doc/linker/interactive.py +50 -0
  24. coop_data_doc-0.15.0/src/coop_data_doc/linker/resolver.py +234 -0
  25. coop_data_doc-0.15.0/src/coop_data_doc/parsers/__init__.py +1 -0
  26. coop_data_doc-0.15.0/src/coop_data_doc/parsers/bim.py +150 -0
  27. coop_data_doc-0.15.0/src/coop_data_doc/parsers/dax.py +105 -0
  28. coop_data_doc-0.15.0/src/coop_data_doc/parsers/mcode.py +98 -0
  29. coop_data_doc-0.15.0/src/coop_data_doc/parsers/pbir.py +329 -0
  30. coop_data_doc-0.15.0/src/coop_data_doc/parsers/pbix.py +168 -0
  31. coop_data_doc-0.15.0/src/coop_data_doc/parsers/sql_common.py +255 -0
  32. coop_data_doc-0.15.0/src/coop_data_doc/parsers/sql_objects.py +271 -0
  33. coop_data_doc-0.15.0/src/coop_data_doc/parsers/sql_procs.py +343 -0
  34. coop_data_doc-0.15.0/src/coop_data_doc/parsers/tmdl.py +347 -0
  35. coop_data_doc-0.15.0/src/coop_data_doc/progress.py +101 -0
  36. coop_data_doc-0.15.0/src/coop_data_doc/render/__init__.py +1 -0
  37. coop_data_doc-0.15.0/src/coop_data_doc/render/markdown.py +308 -0
  38. coop_data_doc-0.15.0/src/coop_data_doc/render/mermaid.py +162 -0
  39. coop_data_doc-0.15.0/src/coop_data_doc/render/site.py +346 -0
  40. coop_data_doc-0.15.0/src/coop_data_doc/templates/assets/README.md +13 -0
  41. coop_data_doc-0.15.0/src/coop_data_doc/templates/assets/custom.css +22 -0
  42. coop_data_doc-0.15.0/src/coop_data_doc/templates/assets/iframe-worker-shim.js +1 -0
  43. coop_data_doc-0.15.0/src/coop_data_doc/templates/assets/mermaid.min.js +3405 -0
  44. coop_data_doc-0.15.0/src/coop_data_doc/upgrade.py +338 -0
  45. coop_data_doc-0.15.0/src/coop_data_doc/wizard.py +359 -0
  46. coop_data_doc-0.15.0/tasks/README.md +27 -0
  47. coop_data_doc-0.15.0/tasks/_shared-context.md +76 -0
  48. coop_data_doc-0.15.0/tasks/module-0.md +20 -0
  49. coop_data_doc-0.15.0/tasks/module-1.md +100 -0
  50. coop_data_doc-0.15.0/tasks/module-2.md +104 -0
  51. coop_data_doc-0.15.0/tasks/module-3.md +146 -0
  52. coop_data_doc-0.15.0/tasks/module-4.md +111 -0
  53. coop_data_doc-0.15.0/tasks/module-5.md +154 -0
  54. coop_data_doc-0.15.0/tasks/module-6.md +61 -0
  55. coop_data_doc-0.15.0/tasks/module-7.md +55 -0
  56. coop_data_doc-0.15.0/tests/conftest.py +8 -0
  57. coop_data_doc-0.15.0/tests/fixtures/repo_pbi/LegacyThing/report.json +13 -0
  58. coop_data_doc-0.15.0/tests/fixtures/repo_pbi/Sales.Report/definition/pages/page1/page.json +6 -0
  59. coop_data_doc-0.15.0/tests/fixtures/repo_pbi/Sales.Report/definition/pages/page1/visuals/abc123/visual.json +25 -0
  60. coop_data_doc-0.15.0/tests/fixtures/repo_pbi/Sales.SemanticModel/definition/model.tmdl +7 -0
  61. coop_data_doc-0.15.0/tests/fixtures/repo_pbi/Sales.SemanticModel/definition/tables/dim_customer.tmdl +27 -0
  62. coop_data_doc-0.15.0/tests/fixtures/repo_pbi/Sales.SemanticModel/definition/tables/ext_unresolved.tmdl +15 -0
  63. coop_data_doc-0.15.0/tests/fixtures/repo_pbi/Sales.SemanticModel/definition/tables/fact_sales.tmdl +34 -0
  64. coop_data_doc-0.15.0/tests/fixtures/repo_pbi/Sales.SemanticModel/definition/tables/orders_native.tmdl +16 -0
  65. coop_data_doc-0.15.0/tests/fixtures/repo_sql/archive/old_proc.sql +3 -0
  66. coop_data_doc-0.15.0/tests/fixtures/repo_sql/procs/usp_cursor_legacy.sql +25 -0
  67. coop_data_doc-0.15.0/tests/fixtures/repo_sql/procs/usp_dynamic_refresh.sql +8 -0
  68. coop_data_doc-0.15.0/tests/fixtures/repo_sql/procs/usp_load_fact_sales.sql +44 -0
  69. coop_data_doc-0.15.0/tests/fixtures/repo_sql/tables/dbo.agg_sales_daily.sql +7 -0
  70. coop_data_doc-0.15.0/tests/fixtures/repo_sql/tables/dbo.fact_sales.sql +8 -0
  71. coop_data_doc-0.15.0/tests/fixtures/repo_sql/views/sales/dim_customer.sql +10 -0
  72. coop_data_doc-0.15.0/tests/fixtures/repo_sql/views/sales/v_orders_star.sql +5 -0
  73. coop_data_doc-0.15.0/tests/test_cli.py +283 -0
  74. coop_data_doc-0.15.0/tests/test_config.py +122 -0
  75. coop_data_doc-0.15.0/tests/test_crawler.py +107 -0
  76. coop_data_doc-0.15.0/tests/test_determinism.py +36 -0
  77. coop_data_doc-0.15.0/tests/test_diagnostics.py +46 -0
  78. coop_data_doc-0.15.0/tests/test_graph.py +138 -0
  79. coop_data_doc-0.15.0/tests/test_layering.py +134 -0
  80. coop_data_doc-0.15.0/tests/test_linker.py +192 -0
  81. coop_data_doc-0.15.0/tests/test_pbi_parsers.py +368 -0
  82. coop_data_doc-0.15.0/tests/test_progress.py +115 -0
  83. coop_data_doc-0.15.0/tests/test_render.py +389 -0
  84. coop_data_doc-0.15.0/tests/test_sql_parsers.py +194 -0
  85. coop_data_doc-0.15.0/tests/test_upgrade.py +297 -0
  86. coop_data_doc-0.15.0/tests/test_wizard.py +202 -0
@@ -0,0 +1,33 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ pull_request:
7
+
8
+ jobs:
9
+ lint:
10
+ runs-on: ubuntu-latest
11
+ steps:
12
+ - uses: actions/checkout@v4
13
+ - uses: actions/setup-python@v5
14
+ with:
15
+ python-version: "3.12"
16
+ - run: pip install ruff
17
+ - run: ruff check src tests
18
+ - run: ruff format --check src tests
19
+
20
+ test:
21
+ strategy:
22
+ fail-fast: false
23
+ matrix:
24
+ python-version: ["3.10", "3.11", "3.12", "3.13"]
25
+ os: [ubuntu-latest, windows-latest]
26
+ runs-on: ${{ matrix.os }}
27
+ steps:
28
+ - uses: actions/checkout@v4
29
+ - uses: actions/setup-python@v5
30
+ with:
31
+ python-version: ${{ matrix.python-version }}
32
+ - run: pip install -e ".[dev]"
33
+ - run: pytest -q
@@ -0,0 +1,38 @@
1
+ name: Publish to PyPI
2
+
3
+ on:
4
+ push:
5
+ tags: ["v*"]
6
+
7
+ jobs:
8
+ build:
9
+ runs-on: ubuntu-latest
10
+ steps:
11
+ - uses: actions/checkout@v4
12
+ - uses: actions/setup-python@v5
13
+ with:
14
+ python-version: "3.12"
15
+ - run: pip install build
16
+ - run: python -m build
17
+ - name: Smoke-test the wheel
18
+ run: |
19
+ python -m venv /tmp/smoke
20
+ /tmp/smoke/bin/pip install dist/*.whl
21
+ /tmp/smoke/bin/coop-data-doc --version
22
+ - uses: actions/upload-artifact@v4
23
+ with:
24
+ name: dist
25
+ path: dist/
26
+
27
+ publish:
28
+ needs: build
29
+ runs-on: ubuntu-latest
30
+ environment: pypi
31
+ permissions:
32
+ id-token: write # PyPI trusted publishing
33
+ steps:
34
+ - uses: actions/download-artifact@v4
35
+ with:
36
+ name: dist
37
+ path: dist/
38
+ - uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,8 @@
1
+ __pycache__/
2
+ *.pyc
3
+ .venv/
4
+ dist/
5
+ build/
6
+ *.egg-info/
7
+ .pytest_cache/
8
+ .ruff_cache/
@@ -0,0 +1,167 @@
1
+ # Architecture
2
+
3
+ How `coop-data-doc` turns two git repos into lineage documentation. Read this
4
+ before changing code; `CONTRIBUTING.md` has the rules, this file has the *why*
5
+ and the *how*.
6
+
7
+ ## The pipeline
8
+
9
+ ```
10
+ coop-data-doc.yml ──► [crawler] ──► FileInventory (classified files)
11
+
12
+ ┌─────────────┴───────────────┐
13
+ ▼ ▼
14
+ [SQL parsers (M2)] [Power BI parsers (M3)]
15
+ sql_objects: CREATE tmdl: tables/columns/measures/partitions
16
+ TABLE/VIEW + columns bim: same, from model.bim JSON
17
+ sql_procs: proc DML mcode: partition M ──► SourceRef
18
+ (INSERT/MERGE/UPDATE/ dax: measure ──► measure/table refs
19
+ SELECT INTO/EXEC) pbir: report/page/visual + field bindings
20
+ (PBIR folders AND legacy report.json)
21
+ pbix: best-effort zip extraction
22
+ │ │
23
+ ▼ ▼
24
+ resolve_stub_references() link_visual_bindings()
25
+ └──────────────┬───────────────┘
26
+ prune_schemas() — drop system/ignored schemas
27
+ assign_layers() — bronze/silver/gold (rules + heuristic)
28
+
29
+ [linker (M4)]
30
+ cache → exact → config rule → fuzzy → interactive prompt
31
+ answers persist to .lineage-cache.json (commit it)
32
+
33
+
34
+ LineageGraph ── graph.json + diagnostics.json (artifacts)
35
+
36
+ ┌──────────────┴──────────────┐
37
+ ▼ ▼
38
+ [render/markdown (M5)] [render/site (M5)]
39
+ per-node .md + index.md MkDocs Material, dark default,
40
+ + manifest.json (for agents) offline search + vendored mermaid
41
+ ```
42
+
43
+ `cli.run_pipeline()` (`src/coop_data_doc/cli.py`) is the executable version of
44
+ this diagram — read it first when tracing behavior.
45
+
46
+ ## The data model (everything flows through one graph)
47
+
48
+ `src/coop_data_doc/graph/model.py`:
49
+
50
+ - **Node** — `id`, `node_type`, `name`, `schema_name`, `source_file`,
51
+ `columns`, free-form `metadata`. Ids are stable slugs:
52
+ `"{type}:{schema}.{name}"`, lowercased, brackets stripped
53
+ (`[dbo].[Fact Sales]` → `gold_table:dbo.fact sales`). For Power BI nodes,
54
+ `schema_name` holds the normalized semantic-model name.
55
+ - **NodeType** — `silver_table, gold_table, view, stored_proc,
56
+ semantic_model, pbi_table, measure, report, report_page, visual`.
57
+ - **Edge** — `source_id`, `target_id`, `edge_type`, `evidence`
58
+ (a `"file: snippet"` string proving the edge — every edge is auditable).
59
+ - **Edge direction.** Edges are *authored* in the parser-natural direction,
60
+ which is not always the data-flow direction. `Edge.flow()` normalizes:
61
+
62
+ | edge_type | authored as | data flows |
63
+ | --- | --- | --- |
64
+ | `reads` | proc/view → table it reads | target → source |
65
+ | `writes` | proc → table it writes | source → target |
66
+ | `feeds` | view → pbi_table; pbi_table → model; visual → page → report | source → target |
67
+ | `defines` | proc → table it CREATEs | source → target |
68
+ | `references` | measure → measure/table; proc → proc (EXEC) | target → source |
69
+ | `visualizes` | visual → pbi_table/measure | target → source |
70
+
71
+ All traversal (`upstream()` / `downstream()`) uses `flow()`, so callers
72
+ never think about authoring direction.
73
+
74
+ ## Key design decisions
75
+
76
+ **Deterministic by construction.** Every iteration is sorted, ids are
77
+ normalized, serialization sorts keys and edges, and nothing embeds a
78
+ timestamp. Same inputs + same cache ⇒ byte-identical output
79
+ (`tests/test_determinism.py` enforces this). This is what makes
80
+ `coop-data-doc check` a valid CI freshness gate.
81
+
82
+ **Never guess lineage.** Anything not statically provable becomes a warning
83
+ or an `unresolved` marker, never an invented edge: dynamic SQL
84
+ (`sp_executesql`), opaque .pbix models, unrecognized M partitions.
85
+
86
+ **AST first, regex fallback second (SQL).** T-SQL proc bodies routinely
87
+ defeat sqlglot (cursors, WHILE, TRY/CATCH). `sql_procs` therefore processes
88
+ *statement by statement*: split the body on `;` (string/comment-aware
89
+ scanner), try sqlglot per chunk, and only for unparseable chunks apply
90
+ documented regex patterns — marking the proc
91
+ `metadata.parse_quality = "regex_fallback"` plus a warning so humans know to
92
+ eyeball it. Two sqlglot subtleties are handled centrally: temp tables lose
93
+ their `#` in the AST (flagged `temporary=True` instead — see
94
+ `is_temp_table`), and `UPDATE alias ... FROM table AS alias` needs alias
95
+ resolution for the write target.
96
+
97
+ **Layer assignment is a post-pass** (`layering.assign_layers`). Object *type*
98
+ comes from the SQL; the medallion *layer* (bronze/silver/gold) is assigned
99
+ from `config.layers` rules — by schema and/or source-path glob, precedence
100
+ gold → silver → bronze — with a read/write heuristic fallback (a table only
101
+ ever read → silver source; one created here → gold). `display_name` carries
102
+ the original-case name for rendering while ids stay normalized.
103
+ `prune_schemas` first drops system schemas (`sys`/`information_schema`/
104
+ `tempdb`/`db_*`) and any `ignore_schemas`, which would otherwise appear as
105
+ phantom nodes from catalog references.
106
+
107
+ **Name gaps are a first-class problem.** View schemas and semantic-model
108
+ names are similar but not identical (e.g. schema `sales` feeds the
109
+ "Sales Analytics" model). The linker ladder
110
+ (`linker/resolver.py`) goes: cache → exact id match → `schema_mappings`
111
+ config rule → fuzzy (`difflib`, auto-accept ≥ 0.92, prompt from 0.60 up to but not including 0.92) →
112
+ interactive `questionary` prompt. Every interactive answer is written to
113
+ `.lineage-cache.json` immediately (crash-safe), so the second run asks
114
+ nothing.
115
+
116
+ **Two renderers, one graph.** `render/markdown.py` emits strict fixed-order
117
+ YAML front-matter (`id`, `type`, `name`, `schema`, `source_file`, `path`,
118
+ `upstream_inputs`, `downstream_dependents`, `tags`) so agents can parse pages
119
+ without heuristics; `manifest.json` is the whole serialized graph for
120
+ programmatic consumers. Page filenames come from `slug()` (filesystem-safe,
121
+ length-bounded, hash-suffixed for uniqueness — not derivable from the id), so
122
+ the `path` field is the source of truth for where a node's page lives. `render/site.py` synthesizes a Material config and
123
+ post-processes the built HTML so the portal works over `file://` with zero
124
+ network: vendored `mermaid.min.js` (Material skips its CDN fetch when
125
+ `window.mermaid` exists), vendored iframe-worker shim (URL rewritten in the
126
+ HTML), `font: false`, `use_directory_urls: false`.
127
+
128
+ **Human content survives regeneration.** Each page has a Business Intent
129
+ block between `<!-- intent:begin/end -->` markers; the renderer carries the
130
+ existing block forward verbatim. `check` copies the committed tree before
131
+ re-rendering for the same reason.
132
+
133
+ ## For agents: answering questions from the output
134
+
135
+ - *"What breaks if I drop column X from view Y?"* — open the view's page
136
+ (find it via the `path` field, not by computing a filename), read
137
+ `downstream_dependents`, follow each page's front-matter transitively
138
+ (or walk `manifest.json` edges with the
139
+ flow table above).
140
+ - *"Where does this report number come from?"* — visual page →
141
+ `visualizes` → measure (DAX shown on the measure page) → `references` →
142
+ pbi_table → `feeds` → view → `reads` → gold table → `writes` ← proc →
143
+ `reads` → silver sources.
144
+ - Trust levels: edges carry `evidence`; nodes parsed via fallback carry
145
+ `metadata.parse_quality = "regex_fallback"`; DAX/measure edges are
146
+ heuristic (`dax_refs_heuristic`).
147
+
148
+ ## Repo layout
149
+
150
+ ```
151
+ src/coop_data_doc/
152
+ ├── cli.py entrypoints + run_pipeline (the orchestration) + interactive menu
153
+ ├── config.py coop-data-doc.yml model (repos/layers/ignore_schemas) + ParseWarning
154
+ ├── crawler.py repo walk + FileKind classification
155
+ ├── graph/ model.py (Node/Edge/LineageGraph, display_name), serialize.py
156
+ ├── parsers/ sql_common/sql_objects/sql_procs, tmdl/bim/mcode/dax/pbir/pbix
157
+ ├── layering.py medallion layer assignment + system/ignored-schema pruning
158
+ ├── linker/ resolver.py (ladder), cache.py, interactive.py
159
+ ├── diagnostics.py severity-classified warnings → console / JSON / HTML page
160
+ ├── progress.py stderr progress bars + spinner (TTY-only)
161
+ ├── wizard.py interactive `setup` (repos, layers, ignore, mappings)
162
+ ├── upgrade.py `upgrade` — the only networked command (PyPI/git)
163
+ ├── render/ markdown.py, mermaid.py, site.py (layer-grouped nav)
164
+ └── templates/assets/ vendored mermaid + iframe-worker + custom.css
165
+ tasks/ original builder briefs — double as interface docs
166
+ tests/ fixtures/repo_sql + fixtures/repo_pbi drive everything
167
+ ```
@@ -0,0 +1,50 @@
1
+ # coop-data-doc — agent guide
2
+
3
+ Offline, deterministic data-lineage doc generator for SQL + Power BI estates.
4
+ Start with `ARCHITECTURE.md` (pipeline, data model, design decisions), then
5
+ `CONTRIBUTING.md` (rules). The `tasks/` briefs document each module's
6
+ interface in depth.
7
+
8
+ ## Commands
9
+
10
+ ```bash
11
+ .venv/bin/python -m pytest -q # full suite (fast, <1s)
12
+ .venv/bin/ruff check src tests # lint
13
+ .venv/bin/ruff format --check src tests # formatting (CI enforces this too)
14
+ .venv/bin/coop-data-doc build --non-interactive # run the tool itself
15
+ ```
16
+
17
+ If `.venv` is missing: `python3 -m venv .venv && .venv/bin/pip install -e ".[dev]"`.
18
+
19
+ ## Hard rules
20
+
21
+ 1. **Determinism** (CI-enforced) — sorted iteration everywhere, no
22
+ timestamps/randomness in output, and `newline="\n"` on every generated
23
+ `write_text` (cross-OS byte-identity). `tests/test_determinism.py`
24
+ byte-compares two full builds.
25
+ 2. **Offline pipeline** — no network/DB/LLM anywhere in doc generation;
26
+ built HTML must work over `file://` (vendored assets in
27
+ `src/coop_data_doc/templates/assets/`). Sole exception: the explicit
28
+ `upgrade` command (`upgrade.py`); the pipeline never imports it.
29
+ 3. **Pure parsers** (convention, reviewed not CI-enforced) — no print/exit
30
+ outside `cli.py`, `wizard.py`, `progress.py`, and `linker/interactive.py`;
31
+ warnings are returned as `ParseWarning` values.
32
+ 4. **Never guess lineage** — un-provable things become warnings or
33
+ `unresolved` markers, not edges.
34
+
35
+ ## Orientation shortcuts
36
+
37
+ - Orchestration: `cli.run_pipeline()` — the whole pipeline in one function
38
+ (crawl → SQL parse → PBI parse → prune_schemas → assign_layers → link).
39
+ - Data model + edge-direction semantics: `graph/model.py` (read `Edge.flow()`
40
+ before touching traversal — `reads`/`references`/`visualizes` are authored
41
+ opposite to data flow). `id`/`name` are normalized (lowercase) for matching;
42
+ `display_name` keeps original case for rendering (`Node.qualified_display`).
43
+ - Layers (bronze/silver/gold) come from `config.layers` rules in `layering.py`,
44
+ not from node type; object type is parser-detected.
45
+ - Diagnostics (`diagnostics.py`) classify every warning by severity and render
46
+ the console summary + `diagnostics.json` + the HTML Diagnostics page.
47
+ - Tests are fixture-driven: `tests/fixtures/repo_sql` and `repo_pbi` are
48
+ miniature real repos; most tests assert exact node-id/edge-key sets.
49
+ - When adding a parser case, extend the fixtures rather than writing inline
50
+ SQL/JSON strings, so the crawler/CLI/determinism suites cover it too.
@@ -0,0 +1,67 @@
1
+ # Contributing
2
+
3
+ ## Module map
4
+
5
+ | Module | Files | Role |
6
+ | --- | --- | --- |
7
+ | M0 core graph | `graph/model.py`, `graph/serialize.py` | `Node`/`Edge`/`LineageGraph`; the only data structure modules share |
8
+ | M1 config + crawler | `config.py`, `crawler.py` | `coop-data-doc.yml` loading, repo walking, `FileKind` classification |
9
+ | M2 SQL parser | `parsers/sql_common.py`, `sql_objects.py`, `sql_procs.py` | sqlglot (tsql) AST lineage with a regex fallback ladder |
10
+ | M3 Power BI extractor | `parsers/tmdl.py`, `bim.py`, `mcode.py`, `dax.py`, `pbir.py`, `pbix.py` | semantic models, measures, reports, best-effort pbix |
11
+ | M4 linker | `linker/resolver.py`, `cache.py`, `interactive.py` | joins SQL ↔ PBI: cache → exact → config rule → fuzzy → prompt |
12
+ | M4½ layering | `layering.py` | medallion layer (bronze/silver/gold) from `config.layers` rules (schema/path), heuristic fallback; object *type* stays parser-detected |
13
+ | M4¾ diagnostics | `diagnostics.py` | severity-classified warnings/unresolved → console summary, `diagnostics.json`, and the HTML Diagnostics page |
14
+ | M5 renderers | `render/markdown.py`, `mermaid.py`, `site.py` | agent Markdown + offline MkDocs Material portal; nav grouped by layer→type; `schema.Object` (original-case `display_name`) titles |
15
+ | M6 CLI | `cli.py`, `wizard.py`, `upgrade.py`, `progress.py` | interactive menu (bare invocation), `setup` / `init` / `scan` / `build` / `update` / `check` / `help` / `upgrade`; stderr progress bars + spinner |
16
+
17
+ The original builder briefs live in `tasks/` and double as interface documentation.
18
+
19
+ ## Non-negotiable rules
20
+
21
+ 1. **Deterministic output.** Iterate everything in sorted order; never embed
22
+ timestamps or randomness; pass `newline="\n"` to every `write_text` of a
23
+ generated artifact (Windows would otherwise emit CRLF and break
24
+ cross-platform byte-identity). `tests/test_determinism.py` builds twice
25
+ and byte-compares.
26
+ 2. **Offline at runtime.** No network, no DB connections, no LLM calls
27
+ anywhere in the documentation pipeline. The built HTML must work over
28
+ `file://` (vendored assets live in `src/coop_data_doc/templates/assets/`).
29
+ The single sanctioned exception is the explicit `upgrade` command
30
+ (`upgrade.py`), which checks PyPI/git for tool and dependency updates —
31
+ nothing in the pipeline may import it.
32
+ 3. **Parsers are pure.** No printing or exiting outside `cli.py`,
33
+ `wizard.py`, `progress.py`, and `linker/interactive.py`; warnings are returned as
34
+ `ParseWarning` values. Parsers/renderers may accept an optional
35
+ `on_file`/`on_node` callback for progress reporting (the CLI supplies
36
+ it) — that's a reporting hook, not printing; the parser never renders.
37
+ 4. **Page filenames go through `slug()`** (`render/mermaid.py`): always
38
+ filesystem-safe (Windows-illegal chars stripped), length-bounded, and
39
+ uniquified with a short id-hash. Never build a page path by hand; agents
40
+ read the `path` front-matter field.
41
+ 5. **Never guess lineage.** Dynamic SQL, opaque pbix models, and
42
+ unrecognized partition sources produce warnings/unresolved markers, not
43
+ invented edges.
44
+
45
+ ## Adding a parser
46
+
47
+ Implement `(entries: list[FileEntry], graph: LineageGraph) -> list[ParseWarning]`,
48
+ register the file kind in `crawler.py`, wire it into `cli.run_pipeline`, and add
49
+ fixtures under `tests/fixtures/` with node/edge-set assertions.
50
+
51
+ ## Developing
52
+
53
+ ```bash
54
+ pip install -e ".[dev]"
55
+ pytest
56
+ ruff check src tests && ruff format --check src tests
57
+ ```
58
+
59
+ ## Releasing
60
+
61
+ Bump the version in **both** `pyproject.toml` and `src/coop_data_doc/__init__.py`
62
+ on every release. This is mandatory: installs from the git URL (pipx/uv/pip) are
63
+ **version-gated** — `pipx upgrade` re-clones the branch but only installs when the
64
+ version number increased. Ship code changes under the same version and users'
65
+ `coop-data-doc upgrade` will silently report "already at latest" and skip them.
66
+ Use semver: patch for fixes, minor for new commands/features, major for breaking
67
+ config/output changes.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Aaron Jennings
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.