coop-data-doc 0.15.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- coop_data_doc-0.15.0/.github/workflows/ci.yml +33 -0
- coop_data_doc-0.15.0/.github/workflows/publish.yml +38 -0
- coop_data_doc-0.15.0/.gitignore +8 -0
- coop_data_doc-0.15.0/ARCHITECTURE.md +167 -0
- coop_data_doc-0.15.0/CLAUDE.md +50 -0
- coop_data_doc-0.15.0/CONTRIBUTING.md +67 -0
- coop_data_doc-0.15.0/LICENSE +21 -0
- coop_data_doc-0.15.0/PKG-INFO +612 -0
- coop_data_doc-0.15.0/README.md +580 -0
- coop_data_doc-0.15.0/pyproject.toml +53 -0
- coop_data_doc-0.15.0/src/coop_data_doc/__init__.py +3 -0
- coop_data_doc-0.15.0/src/coop_data_doc/__main__.py +4 -0
- coop_data_doc-0.15.0/src/coop_data_doc/cli.py +586 -0
- coop_data_doc-0.15.0/src/coop_data_doc/config.py +362 -0
- coop_data_doc-0.15.0/src/coop_data_doc/crawler.py +149 -0
- coop_data_doc-0.15.0/src/coop_data_doc/diagnostics.py +157 -0
- coop_data_doc-0.15.0/src/coop_data_doc/graph/__init__.py +23 -0
- coop_data_doc-0.15.0/src/coop_data_doc/graph/model.py +246 -0
- coop_data_doc-0.15.0/src/coop_data_doc/graph/serialize.py +33 -0
- coop_data_doc-0.15.0/src/coop_data_doc/layering.py +149 -0
- coop_data_doc-0.15.0/src/coop_data_doc/linker/__init__.py +1 -0
- coop_data_doc-0.15.0/src/coop_data_doc/linker/cache.py +110 -0
- coop_data_doc-0.15.0/src/coop_data_doc/linker/interactive.py +50 -0
- coop_data_doc-0.15.0/src/coop_data_doc/linker/resolver.py +234 -0
- coop_data_doc-0.15.0/src/coop_data_doc/parsers/__init__.py +1 -0
- coop_data_doc-0.15.0/src/coop_data_doc/parsers/bim.py +150 -0
- coop_data_doc-0.15.0/src/coop_data_doc/parsers/dax.py +105 -0
- coop_data_doc-0.15.0/src/coop_data_doc/parsers/mcode.py +98 -0
- coop_data_doc-0.15.0/src/coop_data_doc/parsers/pbir.py +329 -0
- coop_data_doc-0.15.0/src/coop_data_doc/parsers/pbix.py +168 -0
- coop_data_doc-0.15.0/src/coop_data_doc/parsers/sql_common.py +255 -0
- coop_data_doc-0.15.0/src/coop_data_doc/parsers/sql_objects.py +271 -0
- coop_data_doc-0.15.0/src/coop_data_doc/parsers/sql_procs.py +343 -0
- coop_data_doc-0.15.0/src/coop_data_doc/parsers/tmdl.py +347 -0
- coop_data_doc-0.15.0/src/coop_data_doc/progress.py +101 -0
- coop_data_doc-0.15.0/src/coop_data_doc/render/__init__.py +1 -0
- coop_data_doc-0.15.0/src/coop_data_doc/render/markdown.py +308 -0
- coop_data_doc-0.15.0/src/coop_data_doc/render/mermaid.py +162 -0
- coop_data_doc-0.15.0/src/coop_data_doc/render/site.py +346 -0
- coop_data_doc-0.15.0/src/coop_data_doc/templates/assets/README.md +13 -0
- coop_data_doc-0.15.0/src/coop_data_doc/templates/assets/custom.css +22 -0
- coop_data_doc-0.15.0/src/coop_data_doc/templates/assets/iframe-worker-shim.js +1 -0
- coop_data_doc-0.15.0/src/coop_data_doc/templates/assets/mermaid.min.js +3405 -0
- coop_data_doc-0.15.0/src/coop_data_doc/upgrade.py +338 -0
- coop_data_doc-0.15.0/src/coop_data_doc/wizard.py +359 -0
- coop_data_doc-0.15.0/tasks/README.md +27 -0
- coop_data_doc-0.15.0/tasks/_shared-context.md +76 -0
- coop_data_doc-0.15.0/tasks/module-0.md +20 -0
- coop_data_doc-0.15.0/tasks/module-1.md +100 -0
- coop_data_doc-0.15.0/tasks/module-2.md +104 -0
- coop_data_doc-0.15.0/tasks/module-3.md +146 -0
- coop_data_doc-0.15.0/tasks/module-4.md +111 -0
- coop_data_doc-0.15.0/tasks/module-5.md +154 -0
- coop_data_doc-0.15.0/tasks/module-6.md +61 -0
- coop_data_doc-0.15.0/tasks/module-7.md +55 -0
- coop_data_doc-0.15.0/tests/conftest.py +8 -0
- coop_data_doc-0.15.0/tests/fixtures/repo_pbi/LegacyThing/report.json +13 -0
- coop_data_doc-0.15.0/tests/fixtures/repo_pbi/Sales.Report/definition/pages/page1/page.json +6 -0
- coop_data_doc-0.15.0/tests/fixtures/repo_pbi/Sales.Report/definition/pages/page1/visuals/abc123/visual.json +25 -0
- coop_data_doc-0.15.0/tests/fixtures/repo_pbi/Sales.SemanticModel/definition/model.tmdl +7 -0
- coop_data_doc-0.15.0/tests/fixtures/repo_pbi/Sales.SemanticModel/definition/tables/dim_customer.tmdl +27 -0
- coop_data_doc-0.15.0/tests/fixtures/repo_pbi/Sales.SemanticModel/definition/tables/ext_unresolved.tmdl +15 -0
- coop_data_doc-0.15.0/tests/fixtures/repo_pbi/Sales.SemanticModel/definition/tables/fact_sales.tmdl +34 -0
- coop_data_doc-0.15.0/tests/fixtures/repo_pbi/Sales.SemanticModel/definition/tables/orders_native.tmdl +16 -0
- coop_data_doc-0.15.0/tests/fixtures/repo_sql/archive/old_proc.sql +3 -0
- coop_data_doc-0.15.0/tests/fixtures/repo_sql/procs/usp_cursor_legacy.sql +25 -0
- coop_data_doc-0.15.0/tests/fixtures/repo_sql/procs/usp_dynamic_refresh.sql +8 -0
- coop_data_doc-0.15.0/tests/fixtures/repo_sql/procs/usp_load_fact_sales.sql +44 -0
- coop_data_doc-0.15.0/tests/fixtures/repo_sql/tables/dbo.agg_sales_daily.sql +7 -0
- coop_data_doc-0.15.0/tests/fixtures/repo_sql/tables/dbo.fact_sales.sql +8 -0
- coop_data_doc-0.15.0/tests/fixtures/repo_sql/views/sales/dim_customer.sql +10 -0
- coop_data_doc-0.15.0/tests/fixtures/repo_sql/views/sales/v_orders_star.sql +5 -0
- coop_data_doc-0.15.0/tests/test_cli.py +283 -0
- coop_data_doc-0.15.0/tests/test_config.py +122 -0
- coop_data_doc-0.15.0/tests/test_crawler.py +107 -0
- coop_data_doc-0.15.0/tests/test_determinism.py +36 -0
- coop_data_doc-0.15.0/tests/test_diagnostics.py +46 -0
- coop_data_doc-0.15.0/tests/test_graph.py +138 -0
- coop_data_doc-0.15.0/tests/test_layering.py +134 -0
- coop_data_doc-0.15.0/tests/test_linker.py +192 -0
- coop_data_doc-0.15.0/tests/test_pbi_parsers.py +368 -0
- coop_data_doc-0.15.0/tests/test_progress.py +115 -0
- coop_data_doc-0.15.0/tests/test_render.py +389 -0
- coop_data_doc-0.15.0/tests/test_sql_parsers.py +194 -0
- coop_data_doc-0.15.0/tests/test_upgrade.py +297 -0
- coop_data_doc-0.15.0/tests/test_wizard.py +202 -0
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
pull_request:
|
|
7
|
+
|
|
8
|
+
jobs:
|
|
9
|
+
lint:
|
|
10
|
+
runs-on: ubuntu-latest
|
|
11
|
+
steps:
|
|
12
|
+
- uses: actions/checkout@v4
|
|
13
|
+
- uses: actions/setup-python@v5
|
|
14
|
+
with:
|
|
15
|
+
python-version: "3.12"
|
|
16
|
+
- run: pip install ruff
|
|
17
|
+
- run: ruff check src tests
|
|
18
|
+
- run: ruff format --check src tests
|
|
19
|
+
|
|
20
|
+
test:
|
|
21
|
+
strategy:
|
|
22
|
+
fail-fast: false
|
|
23
|
+
matrix:
|
|
24
|
+
python-version: ["3.10", "3.11", "3.12", "3.13"]
|
|
25
|
+
os: [ubuntu-latest, windows-latest]
|
|
26
|
+
runs-on: ${{ matrix.os }}
|
|
27
|
+
steps:
|
|
28
|
+
- uses: actions/checkout@v4
|
|
29
|
+
- uses: actions/setup-python@v5
|
|
30
|
+
with:
|
|
31
|
+
python-version: ${{ matrix.python-version }}
|
|
32
|
+
- run: pip install -e ".[dev]"
|
|
33
|
+
- run: pytest -q
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
name: Publish to PyPI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
tags: ["v*"]
|
|
6
|
+
|
|
7
|
+
jobs:
|
|
8
|
+
build:
|
|
9
|
+
runs-on: ubuntu-latest
|
|
10
|
+
steps:
|
|
11
|
+
- uses: actions/checkout@v4
|
|
12
|
+
- uses: actions/setup-python@v5
|
|
13
|
+
with:
|
|
14
|
+
python-version: "3.12"
|
|
15
|
+
- run: pip install build
|
|
16
|
+
- run: python -m build
|
|
17
|
+
- name: Smoke-test the wheel
|
|
18
|
+
run: |
|
|
19
|
+
python -m venv /tmp/smoke
|
|
20
|
+
/tmp/smoke/bin/pip install dist/*.whl
|
|
21
|
+
/tmp/smoke/bin/coop-data-doc --version
|
|
22
|
+
- uses: actions/upload-artifact@v4
|
|
23
|
+
with:
|
|
24
|
+
name: dist
|
|
25
|
+
path: dist/
|
|
26
|
+
|
|
27
|
+
publish:
|
|
28
|
+
needs: build
|
|
29
|
+
runs-on: ubuntu-latest
|
|
30
|
+
environment: pypi
|
|
31
|
+
permissions:
|
|
32
|
+
id-token: write # PyPI trusted publishing
|
|
33
|
+
steps:
|
|
34
|
+
- uses: actions/download-artifact@v4
|
|
35
|
+
with:
|
|
36
|
+
name: dist
|
|
37
|
+
path: dist/
|
|
38
|
+
- uses: pypa/gh-action-pypi-publish@release/v1
|
|
@@ -0,0 +1,167 @@
|
|
|
1
|
+
# Architecture
|
|
2
|
+
|
|
3
|
+
How `coop-data-doc` turns two git repos into lineage documentation. Read this
|
|
4
|
+
before changing code; `CONTRIBUTING.md` has the rules, this file has the *why*
|
|
5
|
+
and the *how*.
|
|
6
|
+
|
|
7
|
+
## The pipeline
|
|
8
|
+
|
|
9
|
+
```
|
|
10
|
+
coop-data-doc.yml ──► [crawler] ──► FileInventory (classified files)
|
|
11
|
+
│
|
|
12
|
+
┌─────────────┴───────────────┐
|
|
13
|
+
▼ ▼
|
|
14
|
+
[SQL parsers (M2)] [Power BI parsers (M3)]
|
|
15
|
+
sql_objects: CREATE tmdl: tables/columns/measures/partitions
|
|
16
|
+
TABLE/VIEW + columns bim: same, from model.bim JSON
|
|
17
|
+
sql_procs: proc DML mcode: partition M ──► SourceRef
|
|
18
|
+
(INSERT/MERGE/UPDATE/ dax: measure ──► measure/table refs
|
|
19
|
+
SELECT INTO/EXEC) pbir: report/page/visual + field bindings
|
|
20
|
+
(PBIR folders AND legacy report.json)
|
|
21
|
+
pbix: best-effort zip extraction
|
|
22
|
+
│ │
|
|
23
|
+
▼ ▼
|
|
24
|
+
resolve_stub_references() link_visual_bindings()
|
|
25
|
+
└──────────────┬───────────────┘
|
|
26
|
+
prune_schemas() — drop system/ignored schemas
|
|
27
|
+
assign_layers() — bronze/silver/gold (rules + heuristic)
|
|
28
|
+
│
|
|
29
|
+
[linker (M4)]
|
|
30
|
+
cache → exact → config rule → fuzzy → interactive prompt
|
|
31
|
+
answers persist to .lineage-cache.json (commit it)
|
|
32
|
+
│
|
|
33
|
+
▼
|
|
34
|
+
LineageGraph ── graph.json + diagnostics.json (artifacts)
|
|
35
|
+
│
|
|
36
|
+
┌──────────────┴──────────────┐
|
|
37
|
+
▼ ▼
|
|
38
|
+
[render/markdown (M5)] [render/site (M5)]
|
|
39
|
+
per-node .md + index.md MkDocs Material, dark default,
|
|
40
|
+
+ manifest.json (for agents) offline search + vendored mermaid
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
`cli.run_pipeline()` (`src/coop_data_doc/cli.py`) is the executable version of
|
|
44
|
+
this diagram — read it first when tracing behavior.
|
|
45
|
+
|
|
46
|
+
## The data model (everything flows through one graph)
|
|
47
|
+
|
|
48
|
+
`src/coop_data_doc/graph/model.py`:
|
|
49
|
+
|
|
50
|
+
- **Node** — `id`, `node_type`, `name`, `schema_name`, `source_file`,
|
|
51
|
+
`columns`, free-form `metadata`. Ids are stable slugs:
|
|
52
|
+
`"{type}:{schema}.{name}"`, lowercased, brackets stripped
|
|
53
|
+
(`[dbo].[Fact Sales]` → `gold_table:dbo.fact sales`). For Power BI nodes,
|
|
54
|
+
`schema_name` holds the normalized semantic-model name.
|
|
55
|
+
- **NodeType** — `silver_table, gold_table, view, stored_proc,
|
|
56
|
+
semantic_model, pbi_table, measure, report, report_page, visual`.
|
|
57
|
+
- **Edge** — `source_id`, `target_id`, `edge_type`, `evidence`
|
|
58
|
+
(a `"file: snippet"` string proving the edge — every edge is auditable).
|
|
59
|
+
- **Edge direction.** Edges are *authored* in the parser-natural direction,
|
|
60
|
+
which is not always the data-flow direction. `Edge.flow()` normalizes:
|
|
61
|
+
|
|
62
|
+
| edge_type | authored as | data flows |
|
|
63
|
+
| --- | --- | --- |
|
|
64
|
+
| `reads` | proc/view → table it reads | target → source |
|
|
65
|
+
| `writes` | proc → table it writes | source → target |
|
|
66
|
+
| `feeds` | view → pbi_table; pbi_table → model; visual → page → report | source → target |
|
|
67
|
+
| `defines` | proc → table it CREATEs | source → target |
|
|
68
|
+
| `references` | measure → measure/table; proc → proc (EXEC) | target → source |
|
|
69
|
+
| `visualizes` | visual → pbi_table/measure | target → source |
|
|
70
|
+
|
|
71
|
+
All traversal (`upstream()` / `downstream()`) uses `flow()`, so callers
|
|
72
|
+
never think about authoring direction.
|
|
73
|
+
|
|
74
|
+
## Key design decisions
|
|
75
|
+
|
|
76
|
+
**Deterministic by construction.** Every iteration is sorted, ids are
|
|
77
|
+
normalized, serialization sorts keys and edges, and nothing embeds a
|
|
78
|
+
timestamp. Same inputs + same cache ⇒ byte-identical output
|
|
79
|
+
(`tests/test_determinism.py` enforces this). This is what makes
|
|
80
|
+
`coop-data-doc check` a valid CI freshness gate.
|
|
81
|
+
|
|
82
|
+
**Never guess lineage.** Anything not statically provable becomes a warning
|
|
83
|
+
or an `unresolved` marker, never an invented edge: dynamic SQL
|
|
84
|
+
(`sp_executesql`), opaque .pbix models, unrecognized M partitions.
|
|
85
|
+
|
|
86
|
+
**AST first, regex fallback second (SQL).** T-SQL proc bodies routinely
|
|
87
|
+
defeat sqlglot (cursors, WHILE, TRY/CATCH). `sql_procs` therefore processes
|
|
88
|
+
*statement by statement*: split the body on `;` (string/comment-aware
|
|
89
|
+
scanner), try sqlglot per chunk, and only for unparseable chunks apply
|
|
90
|
+
documented regex patterns — marking the proc
|
|
91
|
+
`metadata.parse_quality = "regex_fallback"` plus a warning so humans know to
|
|
92
|
+
eyeball it. Two sqlglot subtleties are handled centrally: temp tables lose
|
|
93
|
+
their `#` in the AST (flagged `temporary=True` instead — see
|
|
94
|
+
`is_temp_table`), and `UPDATE alias ... FROM table AS alias` needs alias
|
|
95
|
+
resolution for the write target.
|
|
96
|
+
|
|
97
|
+
**Layer assignment is a post-pass** (`layering.assign_layers`). Object *type*
|
|
98
|
+
comes from the SQL; the medallion *layer* (bronze/silver/gold) is assigned
|
|
99
|
+
from `config.layers` rules — by schema and/or source-path glob, precedence
|
|
100
|
+
gold → silver → bronze — with a read/write heuristic fallback (a table only
|
|
101
|
+
ever read → silver source; one created here → gold). `display_name` carries
|
|
102
|
+
the original-case name for rendering while ids stay normalized.
|
|
103
|
+
`prune_schemas` first drops system schemas (`sys`/`information_schema`/
|
|
104
|
+
`tempdb`/`db_*`) and any `ignore_schemas`, which would otherwise appear as
|
|
105
|
+
phantom nodes from catalog references.
|
|
106
|
+
|
|
107
|
+
**Name gaps are a first-class problem.** View schemas and semantic-model
|
|
108
|
+
names are similar but not identical (e.g. schema `sales` feeds the
|
|
109
|
+
"Sales Analytics" model). The linker ladder
|
|
110
|
+
(`linker/resolver.py`) goes: cache → exact id match → `schema_mappings`
|
|
111
|
+
config rule → fuzzy (`difflib`, auto-accept ≥ 0.92, prompt 0.60–0.92) →
|
|
112
|
+
interactive `questionary` prompt. Every interactive answer is written to
|
|
113
|
+
`.lineage-cache.json` immediately (crash-safe), so the second run asks
|
|
114
|
+
nothing.
|
|
115
|
+
|
|
116
|
+
**Two renderers, one graph.** `render/markdown.py` emits strict fixed-order
|
|
117
|
+
YAML front-matter (`id`, `type`, `name`, `schema`, `source_file`, `path`,
|
|
118
|
+
`upstream_inputs`, `downstream_dependents`, `tags`) so agents can parse pages
|
|
119
|
+
without heuristics; `manifest.json` is the whole serialized graph for
|
|
120
|
+
programmatic consumers. Page filenames come from `slug()` (filesystem-safe,
|
|
121
|
+
length-bounded, hash-suffixed for uniqueness — not derivable from the id), so
|
|
122
|
+
the `path` field is the source of truth for where a node's page lives. `render/site.py` synthesizes a Material config and
|
|
123
|
+
post-processes the built HTML so the portal works over `file://` with zero
|
|
124
|
+
network: vendored `mermaid.min.js` (Material skips its CDN fetch when
|
|
125
|
+
`window.mermaid` exists), vendored iframe-worker shim (URL rewritten in the
|
|
126
|
+
HTML), `font: false`, `use_directory_urls: false`.
|
|
127
|
+
|
|
128
|
+
**Human content survives regeneration.** Each page has a Business Intent
|
|
129
|
+
block between `<!-- intent:begin/end -->` markers; the renderer carries the
|
|
130
|
+
existing block forward verbatim. `check` copies the committed tree before
|
|
131
|
+
re-rendering for the same reason.
|
|
132
|
+
|
|
133
|
+
## For agents: answering questions from the output
|
|
134
|
+
|
|
135
|
+
- *"What breaks if I drop column X from view Y?"* — open the view's page
|
|
136
|
+
(find it via the `path` field, not by computing a filename), read
|
|
137
|
+
`downstream_dependents`, follow each page's front-matter transitively
|
|
138
|
+
(or walk `manifest.json` edges with the
|
|
139
|
+
flow table above).
|
|
140
|
+
- *"Where does this report number come from?"* — visual page →
|
|
141
|
+
`visualizes` → measure (DAX shown on the measure page) → `references` →
|
|
142
|
+
pbi_table ← `feeds` ← view → `reads` → gold table ← `writes` ← proc →
|
|
143
|
+
`reads` → silver sources.
|
|
144
|
+
- Trust levels: edges carry `evidence`; nodes parsed via fallback carry
|
|
145
|
+
`metadata.parse_quality = "regex_fallback"`; DAX/measure edges are
|
|
146
|
+
heuristic (`dax_refs_heuristic`).
|
|
147
|
+
|
|
148
|
+
## Repo layout
|
|
149
|
+
|
|
150
|
+
```
|
|
151
|
+
src/coop_data_doc/
|
|
152
|
+
├── cli.py entrypoints + run_pipeline (the orchestration) + interactive menu
|
|
153
|
+
├── config.py coop-data-doc.yml model (repos/layers/ignore_schemas) + ParseWarning
|
|
154
|
+
├── crawler.py repo walk + FileKind classification
|
|
155
|
+
├── graph/ model.py (Node/Edge/LineageGraph, display_name), serialize.py
|
|
156
|
+
├── parsers/ sql_common/sql_objects/sql_procs, tmdl/bim/mcode/dax/pbir/pbix
|
|
157
|
+
├── layering.py medallion layer assignment + system/ignored-schema pruning
|
|
158
|
+
├── linker/ resolver.py (ladder), cache.py, interactive.py
|
|
159
|
+
├── diagnostics.py severity-classified warnings → console / JSON / HTML page
|
|
160
|
+
├── progress.py stderr progress bars + spinner (TTY-only)
|
|
161
|
+
├── wizard.py interactive `setup` (repos, layers, ignore, mappings)
|
|
162
|
+
├── upgrade.py `upgrade` — the only networked command (PyPI/git)
|
|
163
|
+
├── render/ markdown.py, mermaid.py, site.py (layer-grouped nav)
|
|
164
|
+
└── templates/assets/ vendored mermaid + iframe-worker + custom.css
|
|
165
|
+
tasks/ original builder briefs — double as interface docs
|
|
166
|
+
tests/ fixtures/repo_sql + fixtures/repo_pbi drive everything
|
|
167
|
+
```
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
# coop-data-doc — agent guide
|
|
2
|
+
|
|
3
|
+
Offline, deterministic data-lineage doc generator for SQL + Power BI estates.
|
|
4
|
+
Start with `ARCHITECTURE.md` (pipeline, data model, design decisions), then
|
|
5
|
+
`CONTRIBUTING.md` (rules). The `tasks/` briefs document each module's
|
|
6
|
+
interface in depth.
|
|
7
|
+
|
|
8
|
+
## Commands
|
|
9
|
+
|
|
10
|
+
```bash
|
|
11
|
+
.venv/bin/python -m pytest -q # full suite (fast, <1s)
|
|
12
|
+
.venv/bin/ruff check src tests # lint
|
|
13
|
+
.venv/bin/ruff format --check src tests # formatting (CI enforces this too)
|
|
14
|
+
.venv/bin/coop-data-doc build --non-interactive # run the tool itself
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
If `.venv` is missing: `python3 -m venv .venv && .venv/bin/pip install -e ".[dev]"`.
|
|
18
|
+
|
|
19
|
+
## Hard rules
|
|
20
|
+
|
|
21
|
+
1. **Determinism** (CI-enforced) — sorted iteration everywhere, no
|
|
22
|
+
timestamps/randomness in output, and `newline="\n"` on every generated
|
|
23
|
+
`write_text` (cross-OS byte-identity). `tests/test_determinism.py`
|
|
24
|
+
byte-compares two full builds.
|
|
25
|
+
2. **Offline pipeline** — no network/DB/LLM anywhere in doc generation;
|
|
26
|
+
built HTML must work over `file://` (vendored assets in
|
|
27
|
+
`src/coop_data_doc/templates/assets/`). Sole exception: the explicit
|
|
28
|
+
`upgrade` command (`upgrade.py`); the pipeline never imports it.
|
|
29
|
+
3. **Pure parsers** (convention, reviewed not CI-enforced) — no print/exit
|
|
30
|
+
outside `cli.py`, `wizard.py`, `progress.py`, and `linker/interactive.py`;
|
|
31
|
+
warnings are returned as `ParseWarning` values.
|
|
32
|
+
4. **Never guess lineage** — un-provable things become warnings or
|
|
33
|
+
`unresolved` markers, not edges.
|
|
34
|
+
|
|
35
|
+
## Orientation shortcuts
|
|
36
|
+
|
|
37
|
+
- Orchestration: `cli.run_pipeline()` — the whole pipeline in one function
|
|
38
|
+
(crawl → SQL parse → PBI parse → prune_schemas → assign_layers → link).
|
|
39
|
+
- Data model + edge-direction semantics: `graph/model.py` (read `Edge.flow()`
|
|
40
|
+
before touching traversal — `reads`/`references`/`visualizes` are authored
|
|
41
|
+
opposite to data flow). `id`/`name` are normalized (lowercase) for matching;
|
|
42
|
+
`display_name` keeps original case for rendering (`Node.qualified_display`).
|
|
43
|
+
- Layers (bronze/silver/gold) come from `config.layers` rules in `layering.py`,
|
|
44
|
+
not from node type; object type is parser-detected.
|
|
45
|
+
- Diagnostics (`diagnostics.py`) classify every warning by severity and render
|
|
46
|
+
the console summary + `diagnostics.json` + the HTML Diagnostics page.
|
|
47
|
+
- Tests are fixture-driven: `tests/fixtures/repo_sql` and `repo_pbi` are
|
|
48
|
+
miniature real repos; most tests assert exact node-id/edge-key sets.
|
|
49
|
+
- When adding a parser case, extend the fixtures rather than writing inline
|
|
50
|
+
SQL/JSON strings, so the crawler/CLI/determinism suites cover it too.
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
# Contributing
|
|
2
|
+
|
|
3
|
+
## Module map
|
|
4
|
+
|
|
5
|
+
| Module | Files | Role |
|
|
6
|
+
| --- | --- | --- |
|
|
7
|
+
| M0 core graph | `graph/model.py`, `graph/serialize.py` | `Node`/`Edge`/`LineageGraph`; the only data structure modules share |
|
|
8
|
+
| M1 config + crawler | `config.py`, `crawler.py` | `coop-data-doc.yml` loading, repo walking, `FileKind` classification |
|
|
9
|
+
| M2 SQL parser | `parsers/sql_common.py`, `sql_objects.py`, `sql_procs.py` | sqlglot (tsql) AST lineage with a regex fallback ladder |
|
|
10
|
+
| M3 Power BI extractor | `parsers/tmdl.py`, `bim.py`, `mcode.py`, `dax.py`, `pbir.py`, `pbix.py` | semantic models, measures, reports, best-effort pbix |
|
|
11
|
+
| M4 linker | `linker/resolver.py`, `cache.py`, `interactive.py` | joins SQL ↔ PBI: cache → exact → config rule → fuzzy → prompt |
|
|
12
|
+
| M4½ layering | `layering.py` | medallion layer (bronze/silver/gold) from `config.layers` rules (schema/path), heuristic fallback; object *type* stays parser-detected |
|
|
13
|
+
| M4¾ diagnostics | `diagnostics.py` | severity-classified warnings/unresolved → console summary, `diagnostics.json`, and the HTML Diagnostics page |
|
|
14
|
+
| M5 renderers | `render/markdown.py`, `mermaid.py`, `site.py` | agent Markdown + offline MkDocs Material portal; nav grouped by layer→type; `schema.Object` (original-case `display_name`) titles |
|
|
15
|
+
| M6 CLI | `cli.py`, `wizard.py`, `upgrade.py`, `progress.py` | interactive menu (bare invocation), `setup` / `init` / `scan` / `build` / `update` / `check` / `help` / `upgrade`; stderr progress bars + spinner |
|
|
16
|
+
|
|
17
|
+
The original builder briefs live in `tasks/` and double as interface documentation.
|
|
18
|
+
|
|
19
|
+
## Non-negotiable rules
|
|
20
|
+
|
|
21
|
+
1. **Deterministic output.** Iterate everything in sorted order; never embed
|
|
22
|
+
timestamps or randomness; pass `newline="\n"` to every `write_text` of a
|
|
23
|
+
generated artifact (Windows would otherwise emit CRLF and break
|
|
24
|
+
cross-platform byte-identity). `tests/test_determinism.py` builds twice
|
|
25
|
+
and byte-compares.
|
|
26
|
+
2. **Offline at runtime.** No network, no DB connections, no LLM calls
|
|
27
|
+
anywhere in the documentation pipeline. The built HTML must work over
|
|
28
|
+
`file://` (vendored assets live in `src/coop_data_doc/templates/assets/`).
|
|
29
|
+
The single sanctioned exception is the explicit `upgrade` command
|
|
30
|
+
(`upgrade.py`), which checks PyPI/git for tool and dependency updates —
|
|
31
|
+
nothing in the pipeline may import it.
|
|
32
|
+
3. **Parsers are pure.** No printing or exiting outside `cli.py`,
|
|
33
|
+
`wizard.py`, `progress.py`, and `linker/interactive.py`; warnings are returned as
|
|
34
|
+
`ParseWarning` values. Parsers/renderers may accept an optional
|
|
35
|
+
`on_file`/`on_node` callback for progress reporting (the CLI supplies
|
|
36
|
+
it) — that's a reporting hook, not printing; the parser never renders.
|
|
37
|
+
4. **Page filenames go through `slug()`** (`render/mermaid.py`): always
|
|
38
|
+
filesystem-safe (Windows-illegal chars stripped), length-bounded, and
|
|
39
|
+
uniquified with a short id-hash. Never build a page path by hand; agents
|
|
40
|
+
read the `path` front-matter field.
|
|
41
|
+
5. **Never guess lineage.** Dynamic SQL, opaque pbix models, and
|
|
42
|
+
unrecognized partition sources produce warnings/unresolved markers, not
|
|
43
|
+
invented edges.
|
|
44
|
+
|
|
45
|
+
## Adding a parser
|
|
46
|
+
|
|
47
|
+
Implement `(entries: list[FileEntry], graph: LineageGraph) -> list[ParseWarning]`,
|
|
48
|
+
register the file kind in `crawler.py`, wire it into `cli.run_pipeline`, and add
|
|
49
|
+
fixtures under `tests/fixtures/` with node/edge-set assertions.
|
|
50
|
+
|
|
51
|
+
## Developing
|
|
52
|
+
|
|
53
|
+
```bash
|
|
54
|
+
pip install -e ".[dev]"
|
|
55
|
+
pytest
|
|
56
|
+
ruff check src tests
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
## Releasing
|
|
60
|
+
|
|
61
|
+
Bump the version in **both** `pyproject.toml` and `src/coop_data_doc/__init__.py`
|
|
62
|
+
on every release. This is mandatory: installs from the git URL (pipx/uv/pip) are
|
|
63
|
+
**version-gated** — `pipx upgrade` re-clones the branch but only installs when the
|
|
64
|
+
version number has increased. Ship code changes under the same version and users'
|
|
65
|
+
`coop-data-doc upgrade` will silently report "already at latest" and skip them.
|
|
66
|
+
Use semver: patch for fixes, minor for new commands/features, major for breaking
|
|
67
|
+
config/output changes.
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Aaron Jennings
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|