codd-dev 0.2.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
codd/graph.py ADDED
@@ -0,0 +1,288 @@
1
+ """CEG (Conditioned Evidence Graph) — JSONL file-backed dependency graph.
2
+
3
+ Design: All data lives in JSONL files (one record per line).
4
+ Files are loaded into memory on init, flushed to disk on close().
5
+ Git-friendly: every change is a line-level diff.
6
+ """
7
+
8
+ import json
9
+ from pathlib import Path
10
+ from typing import Optional
11
+
12
+
13
class CEG:
    """Conditioned Evidence Graph — JSONL-backed dependency graph.

    Storage:
      {scan_dir}/nodes.jsonl — one JSON object per line
      {scan_dir}/edges.jsonl — one JSON object per line

    All data is loaded into memory on init and flushed back to disk by
    close(). Records are written sorted by id so every change produces a
    stable, line-level git diff.
    """

    def __init__(self, scan_dir: Path):
        """Open (or create) the graph stored under *scan_dir*."""
        self.scan_dir = Path(scan_dir)
        self.scan_dir.mkdir(parents=True, exist_ok=True)

        self.nodes_path = self.scan_dir / "nodes.jsonl"
        self.edges_path = self.scan_dir / "edges.jsonl"

        # In-memory stores
        self.nodes: dict[str, dict] = {}  # keyed by node_id
        self.edges: list[dict] = []       # list of edge dicts
        self._next_edge_id = 1            # next id handed out by add_edge()
        self._dirty = False               # True when memory differs from disk

        # Load existing data
        self._load()

    def _load(self) -> None:
        """Load JSONL files into memory, skipping blank lines.

        Reads as UTF-8 explicitly: the files are written with
        ensure_ascii=False, so relying on the platform default encoding
        would break round-tripping of non-ASCII content (e.g. on Windows).
        """
        if self.nodes_path.exists():
            for line in self.nodes_path.read_text(encoding="utf-8").splitlines():
                line = line.strip()
                if line:
                    node = json.loads(line)
                    self.nodes[node["id"]] = node

        if self.edges_path.exists():
            for line in self.edges_path.read_text(encoding="utf-8").splitlines():
                line = line.strip()
                if line:
                    edge = json.loads(line)
                    self.edges.append(edge)
                    # Keep the id counter ahead of every persisted edge id.
                    if edge.get("id", 0) >= self._next_edge_id:
                        self._next_edge_id = edge["id"] + 1

    def close(self) -> None:
        """Flush to disk (no-op when nothing changed)."""
        if self._dirty:
            self._flush()

    def _flush(self) -> None:
        """Write all data back to JSONL files, sorted by id for stable diffs."""
        sorted_nodes = sorted(self.nodes.values(), key=lambda n: n["id"])
        # Explicit UTF-8 to match _load() and ensure_ascii=False below.
        with open(self.nodes_path, "w", encoding="utf-8") as f:
            for node in sorted_nodes:
                f.write(json.dumps(node, ensure_ascii=False) + "\n")

        sorted_edges = sorted(self.edges, key=lambda e: e.get("id", 0))
        with open(self.edges_path, "w", encoding="utf-8") as f:
            for edge in sorted_edges:
                f.write(json.dumps(edge, ensure_ascii=False) + "\n")

        self._dirty = False

    # ── Node operations ──

    def upsert_node(self, node_id: str, node_type: str, path: Optional[str] = None,
                    name: Optional[str] = None, module: Optional[str] = None) -> None:
        """Create or update a node; unset optional fields are left untouched."""
        node = self.nodes.get(node_id, {"id": node_id})
        node["type"] = node_type
        if path is not None:
            node["path"] = path
        if name is not None:
            node["name"] = name
        if module is not None:
            node["module"] = module
        self.nodes[node_id] = node
        self._dirty = True

    def get_node(self, node_id: str) -> Optional[dict]:
        """Return the node dict for *node_id*, or None if unknown."""
        return self.nodes.get(node_id)

    def count_nodes(self) -> int:
        """Total number of nodes in the graph."""
        return len(self.nodes)

    def find_nodes_by_path(self, path: str) -> list:
        """Return all nodes whose 'path' field equals *path*."""
        return [n for n in self.nodes.values() if n.get("path") == path]

    def get_convention_edges(self, node_id: str) -> list:
        """Active 'must_review' edges out of *node_id*, enriched with target
        name/type, sorted by descending confidence."""
        results = []
        for e in self.edges:
            if e["source_id"] == node_id and e["relation"] == "must_review" and e.get("is_active", True):
                target = self.nodes.get(e["target_id"], {})
                result = {**e, "target_name": target.get("name"), "target_type": target.get("type")}
                results.append(result)
        results.sort(key=lambda x: x.get("confidence", 0), reverse=True)
        return results

    # ── Edge operations ──

    def add_edge(self, source_id: str, target_id: str, relation: str,
                 semantic: str, confidence: float = 0.5,
                 condition: Optional[str] = None) -> int:
        """Append a new active edge and return its id.

        Endpoints are not required to exist as nodes yet; evidence starts
        empty and can be attached later via add_evidence().
        """
        edge_id = self._next_edge_id
        self._next_edge_id += 1
        edge = {
            "id": edge_id,
            "source_id": source_id,
            "target_id": target_id,
            "relation": relation,
            "semantic": semantic,
            "confidence": confidence,
            "is_active": True,
            "evidence": [],
        }
        if condition:
            edge["condition"] = condition
        self.edges.append(edge)
        self._dirty = True
        return edge_id

    def get_outgoing_edges(self, node_id: str, min_confidence: float = 0.0) -> list:
        """Active edges leaving *node_id* with confidence >= *min_confidence*,
        enriched with target name/type, sorted by descending confidence."""
        results = []
        for e in self.edges:
            if (e["source_id"] == node_id and e.get("is_active", True)
                    and e.get("confidence", 0) >= min_confidence):
                target = self.nodes.get(e["target_id"], {})
                result = {**e, "target_name": target.get("name"), "target_type": target.get("type")}
                results.append(result)
        results.sort(key=lambda x: x.get("confidence", 0), reverse=True)
        return results

    def get_incoming_edges(self, node_id: str, min_confidence: float = 0.0) -> list:
        """Active edges arriving at *node_id* with confidence >= *min_confidence*,
        enriched with source name/type, sorted by descending confidence."""
        results = []
        for e in self.edges:
            if (e["target_id"] == node_id and e.get("is_active", True)
                    and e.get("confidence", 0) >= min_confidence):
                source = self.nodes.get(e["source_id"], {})
                result = {**e, "source_name": source.get("name"), "source_type": source.get("type")}
                results.append(result)
        results.sort(key=lambda x: x.get("confidence", 0), reverse=True)
        return results

    def count_edges(self) -> int:
        """Number of active edges (inactive ones are kept but not counted)."""
        return sum(1 for e in self.edges if e.get("is_active", True))

    # ── Evidence operations ──

    def add_evidence(self, edge_id: int, source_type: str, method: str,
                     score: float, detail: Optional[str] = None,
                     is_negative: bool = False) -> int:
        """Attach an evidence record to edge *edge_id* and recompute its
        confidence via Noisy-OR.

        Returns the edge's new evidence count, or 0 if the edge id is unknown.
        """
        for edge in self.edges:
            if edge["id"] == edge_id:
                ev = {"source_type": source_type, "method": method, "score": score}
                if detail:
                    ev["detail"] = detail
                if is_negative:
                    ev["is_negative"] = True
                edge.setdefault("evidence", []).append(ev)
                # Recalculate confidence via Noisy-OR
                edge["confidence"] = self._noisy_or(edge["evidence"])
                self._dirty = True
                return len(edge["evidence"])
        return 0

    @staticmethod
    def _noisy_or(evidence: list) -> float:
        """Noisy-OR: P(at least one fires) = 1 - product(1 - p_i).

        Positive and negative evidence are combined separately, then the
        negative mass is subtracted from the positive; the result is
        floored at 0.0 and rounded to 4 decimal places.
        """
        positive_product = 1.0
        negative_product = 1.0
        for ev in evidence:
            if ev.get("is_negative"):
                negative_product *= (1.0 - ev["score"])
            else:
                positive_product *= (1.0 - ev["score"])
        return round(max(0.0, (1.0 - positive_product) - (1.0 - negative_product)), 4)

    # ── Propagation ──

    def propagate_impact(self, start_node_id: str, max_depth: int = 10,
                         min_confidence: float = 0.0) -> dict:
        """BFS propagation from a changed node.

        Returns {node_id: {"depth": int, "path": [ids...]}} for every node
        reachable via active outgoing edges, excluding the start node.
        """
        visited: dict[str, dict] = {}
        queue = [(start_node_id, 0, [start_node_id])]
        head = 0  # pop-from-front cursor: O(1) vs list.pop(0)'s O(n) shift

        while head < len(queue):
            current, depth, path = queue[head]
            head += 1
            if depth > max_depth:
                continue
            if current in visited:
                continue
            visited[current] = {"depth": depth, "path": path}

            for edge in self.get_outgoing_edges(current, min_confidence):
                target = edge["target_id"]
                if target not in visited:
                    queue.append((target, depth + 1, path + [target]))

        # The start node itself is not part of the impact set.
        if start_node_id in visited:
            del visited[start_node_id]
        return visited

    # ── Band classification ──

    def classify_band(self, confidence: float, evidence_count: int,
                      green_threshold: float = 0.90,
                      green_min_evidence: int = 2,
                      amber_threshold: float = 0.50) -> str:
        """Classify an edge as 'green' (high confidence AND enough evidence),
        'amber' (medium confidence), or 'gray' (everything else)."""
        if confidence >= green_threshold and evidence_count >= green_min_evidence:
            return "green"
        elif confidence >= amber_threshold:
            return "amber"
        else:
            return "gray"

    # ── Selective refresh ──

    # Evidence from these sources can be regenerated by a rescan.
    AUTO_SOURCE_TYPES = ("static", "framework", "frontmatter", "inferred")
    # Evidence from these sources is irreplaceable and must survive a purge.
    HUMAN_SOURCE_TYPES = ("human", "dynamic", "history")

    def purge_auto_generated(self) -> dict:
        """Delete auto-generated evidence/edges/nodes, preserve human knowledge.

        Returns counts of what was removed: {"evidence", "edges", "nodes"}.
        """
        deleted_evidence = 0
        deleted_edges = 0

        # Remove auto evidence from edges; an edge with no remaining
        # evidence is dropped entirely.
        surviving_edges = []
        for edge in self.edges:
            original_count = len(edge.get("evidence", []))
            edge["evidence"] = [
                ev for ev in edge.get("evidence", [])
                if ev.get("source_type") not in self.AUTO_SOURCE_TYPES
            ]
            deleted_evidence += original_count - len(edge["evidence"])

            if edge["evidence"]:
                edge["confidence"] = self._noisy_or(edge["evidence"])
                surviving_edges.append(edge)
            else:
                deleted_edges += 1

        self.edges = surviving_edges

        # Remove nodes no longer referenced by any surviving edge.
        referenced = set()
        for edge in self.edges:
            referenced.add(edge["source_id"])
            referenced.add(edge["target_id"])

        orphans = [nid for nid in self.nodes if nid not in referenced]
        for nid in orphans:
            del self.nodes[nid]

        self._dirty = True
        return {
            "evidence": deleted_evidence,
            "edges": deleted_edges,
            "nodes": len(orphans),
        }

    def count_human_evidence(self) -> int:
        """Total evidence records whose source_type is human-originated."""
        count = 0
        for edge in self.edges:
            for ev in edge.get("evidence", []):
                if ev.get("source_type") in self.HUMAN_SOURCE_TYPES:
                    count += 1
        return count

    # ── Stats ──

    def stats(self) -> dict:
        """Summary counts: nodes, active edges, evidence, human evidence."""
        total_evidence = sum(len(e.get("evidence", [])) for e in self.edges)
        return {
            "nodes": self.count_nodes(),
            "edges": self.count_edges(),
            "evidence": total_evidence,
            "human_evidence": self.count_human_evidence(),
        }
codd/hooks.py ADDED
@@ -0,0 +1,104 @@
1
+ """Git hook helpers for CoDD pre-commit enforcement."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import subprocess
6
+ from pathlib import Path, PurePosixPath
7
+
8
+ import yaml
9
+
10
+ from codd.scanner import _extract_frontmatter
11
+ from codd.validator import run_validate
12
+
13
+
14
# Packaged pre-commit hook script, expected at <repo root>/hooks/pre-commit
# (two levels up from this module) — TODO confirm against the built wheel layout.
HOOK_SOURCE = Path(__file__).parent.parent / "hooks" / "pre-commit"
15
+
16
+
17
def install_pre_commit_hook(project_root: Path) -> tuple[Path, bool]:
    """Install the packaged pre-commit hook into a Git repository.

    Returns (hook_path, created); created is False when the hook is
    already a symlink to the packaged script.

    Raises FileNotFoundError when the CoDD config, the .git directory,
    or the packaged hook is missing, and FileExistsError when an
    unrelated pre-commit hook is already installed.
    """
    # Preconditions, checked in order: CoDD config, git repo, packaged hook.
    for required in (
        project_root / "codd" / "codd.yaml",
        project_root / ".git",
        HOOK_SOURCE,
    ):
        if not required.exists():
            raise FileNotFoundError(f"{required} not found")

    destination = project_root / ".git" / "hooks" / "pre-commit"
    source = HOOK_SOURCE.resolve()
    # Make sure the packaged script is executable before linking to it.
    source.chmod(source.stat().st_mode | 0o111)

    if destination.is_symlink():
        if destination.resolve() == source:
            # Already installed and pointing at our script: idempotent no-op.
            return destination, False
        raise FileExistsError(f"{destination} already exists and points to {destination.resolve()}")

    if destination.exists():
        raise FileExistsError(f"{destination} already exists")

    destination.parent.mkdir(parents=True, exist_ok=True)
    destination.symlink_to(source)
    return destination, True
45
+
46
+
47
def run_pre_commit(project_root: Path) -> int:
    """Validate staged CoDD documents before commit.

    Returns a process exit code: 0 when validation passes, 1 on any failure.
    """
    config_path = project_root / "codd" / "codd.yaml"
    if not config_path.exists():
        print("ERROR: codd/codd.yaml not found.")
        return 1

    config = yaml.safe_load(config_path.read_text(encoding="utf-8")) or {}

    try:
        staged_docs = _get_staged_markdown_files(project_root, config)
    except RuntimeError as exc:
        print(f"ERROR: {exc}")
        return 1

    # Every staged doc must carry CoDD YAML frontmatter; stop at the first
    # one that does not.
    missing = next(
        (doc for doc in staged_docs
         if _extract_frontmatter(project_root / doc) is None),
        None,
    )
    if missing is not None:
        print(f"ERROR: {missing} is missing CoDD YAML frontmatter")
        return 1

    return run_validate(project_root, project_root / "codd")
68
+
69
+
70
def _get_staged_markdown_files(project_root: Path, config: dict) -> list[Path]:
    """Return staged Markdown files (added/copied/modified/renamed) that
    live under one of the configured doc dirs.

    Raises RuntimeError when the git invocation itself fails.
    """
    proc = subprocess.run(
        ["git", "diff", "--cached", "--name-only", "--diff-filter=ACMR"],
        cwd=project_root,
        capture_output=True,
        text=True,
        check=False,
    )
    if proc.returncode != 0:
        raise RuntimeError(proc.stderr.strip() or "git diff --cached failed")

    doc_dirs = (config.get("scan") or {}).get("doc_dirs") or []

    # Keep only .md paths inside a configured doc dir, in git's output order.
    return [
        Path(name)
        for name in (line.strip() for line in proc.stdout.splitlines())
        if name.endswith(".md") and _is_in_doc_dirs(name, doc_dirs)
    ]
93
+
94
+
95
+ def _is_in_doc_dirs(relative_path: str, doc_dirs: list[str]) -> bool:
96
+ rel = PurePosixPath(relative_path)
97
+ for doc_dir in doc_dirs:
98
+ base = PurePosixPath(str(doc_dir).rstrip("/"))
99
+ try:
100
+ rel.relative_to(base)
101
+ return True
102
+ except ValueError:
103
+ continue
104
+ return False