codd-dev 0.2.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
codd/scanner.py ADDED
@@ -0,0 +1,445 @@
1
+ """CoDD Scanner — Extract dependency data from document frontmatter + source code.
2
+
3
+ Design principle: Documents ARE the data (Single Source of Truth).
4
+ Dependency metadata is embedded as YAML frontmatter in deliverable documents.
5
+ Auto-generated data (frontmatter, AST) is refreshed on scan.
6
+ Human knowledge (manual annotations, overrides) is NEVER deleted.
7
+ """
8
+
9
import fnmatch
import os
import re
from pathlib import Path
from typing import Any

import yaml

from codd.graph import CEG
17
+
18
+
19
def run_scan(project_root: Path, codd_dir: Path):
    """Scan all project documents and source code, refresh auto-generated data.

    Human knowledge (source_type='human') is preserved.
    Auto-generated data (frontmatter, static, framework) is purged and rebuilt.

    Args:
        project_root: Root directory of the project being scanned.
        codd_dir: The project's ``codd/`` directory (holds ``codd.yaml``
            and the ``scan`` graph directory).

    Raises:
        SystemExit: when ``codd.yaml`` is missing.
    """
    config_path = codd_dir / "codd.yaml"
    if not config_path.exists():
        # Report the path we actually looked for, not a hard-coded one.
        print(f"Error: {config_path} not found.")
        raise SystemExit(1)

    # An empty/blank YAML file parses to None; normalize to an empty mapping
    # so partial configs degrade to "nothing to scan" instead of crashing.
    config = yaml.safe_load(config_path.read_text()) or {}
    scan_config = config.get("scan", {})
    scan_dir = codd_dir / "scan"

    ceg = CEG(scan_dir)

    # Purge auto-generated data, keep human knowledge
    purged = ceg.purge_auto_generated()
    human_count = ceg.count_human_evidence()
    print(f"Purged auto-generated: {purged['evidence']} evidence, {purged['edges']} edges, {purged['nodes']} nodes")
    if human_count > 0:
        print(f"Preserved: {human_count} human evidence records")

    # Phase 1: Scan document frontmatter (all .md in doc_dirs)
    doc_dirs = scan_config.get("doc_dirs", [])
    frontmatter_count = 0
    warnings = []
    for doc_dir in doc_dirs:
        full_path = project_root / doc_dir
        if full_path.exists():
            count, doc_warnings = _scan_frontmatter(ceg, project_root, full_path)
            frontmatter_count += count
            warnings.extend(doc_warnings)

    # Phase 1b: Also scan codd/annotations/ if it exists (backward compat)
    annotations_dir = codd_dir / "annotations"
    if annotations_dir.exists():
        _load_legacy_annotations(ceg, annotations_dir)

    # Phase 2: Scan source code (imports, calls)
    language = config.get("project", {}).get("language", "python")
    source_dirs = scan_config.get("source_dirs", [])
    exclude_patterns = scan_config.get("exclude", [])

    for src_dir in source_dirs:
        full_path = project_root / src_dir
        if full_path.exists():
            _scan_source_directory(ceg, project_root, full_path, language, exclude_patterns)

    warnings.extend(_collect_wave_config_warnings(project_root, config))
    for warning in warnings:
        print(f"WARNING: {warning}")

    stats = ceg.stats()
    print("Scan complete:")
    print(f" Documents with frontmatter: {frontmatter_count}")
    print(f" Graph: {stats['nodes']} nodes, {stats['edges']} edges")
    print(f" Evidence: {stats['evidence']} total ({stats['human_evidence']} human, {stats['evidence'] - stats['human_evidence']} auto)")
    ceg.close()
78
+
79
+
80
+ # ═══════════════════════════════════════════════════════════
81
+ # Phase 1: Document frontmatter scanning
82
+ # ═══════════════════════════════════════════════════════════
83
+
84
def _scan_frontmatter(ceg: CEG, project_root: Path, doc_dir: Path) -> tuple[int, list[str]]:
    """Scan every Markdown file under *doc_dir* for CoDD frontmatter.

    Returns a ``(documents_loaded, warnings)`` tuple. Files under ``docs/``
    without frontmatter produce a warning; other files are silently skipped.
    """
    loaded = 0
    issues: list[str] = []
    for current_root, _subdirs, filenames in os.walk(doc_dir):
        for filename in filenames:
            if not filename.endswith(".md"):
                continue
            doc_file = Path(current_root) / filename
            rel_path = doc_file.relative_to(project_root).as_posix()
            metadata = _extract_frontmatter(doc_file)
            if metadata:
                _load_frontmatter(ceg, rel_path, metadata)
                loaded += 1
                issues.extend(_collect_document_warnings(rel_path, metadata))
            elif rel_path.startswith("docs/"):
                issues.append(f"{rel_path}: missing CoDD YAML frontmatter")
    if loaded:
        print(f" Frontmatter: {loaded} documents in {doc_dir.relative_to(project_root)}")
    return loaded, issues
104
+
105
+
106
def _extract_frontmatter(file_path: Path) -> dict | None:
    """Return the ``codd:`` mapping from a Markdown file's YAML frontmatter.

    Expected layout::

        ---
        codd:
          node_id: "req:FR-03"
          ...
        ---
        # Document content

    Returns None when the file is unreadable, has no frontmatter block,
    the YAML fails to parse, or the frontmatter is not a mapping.
    """
    try:
        text = file_path.read_text(errors="ignore")
    except Exception:
        return None

    # Frontmatter is the text between leading --- delimiters.
    delimited = re.match(r'^---\s*\n(.*?)\n---', text, re.DOTALL)
    if delimited is None:
        return None

    try:
        parsed = yaml.safe_load(delimited.group(1))
    except yaml.YAMLError:
        return None

    return parsed.get("codd") if isinstance(parsed, dict) else None
136
+
137
+
138
def build_document_node_path_map(project_root: Path, config: dict[str, Any]) -> dict[str, Path]:
    """Resolve document node IDs to project-relative paths."""
    mapping: dict[str, Path] = {}

    for doc_dir in config.get("scan", {}).get("doc_dirs", []):
        base = project_root / doc_dir
        if not base.exists():
            continue

        for current_root, _subdirs, filenames in os.walk(base):
            for filename in filenames:
                if not filename.endswith(".md"):
                    continue

                doc_file = Path(current_root) / filename
                rel = doc_file.relative_to(project_root)
                metadata = _extract_frontmatter(doc_file)
                if not metadata:
                    continue

                node_id = metadata.get("node_id", f"doc:{rel.as_posix()}")
                mapping[str(node_id)] = rel

    from codd.generator import _load_wave_artifacts

    try:
        artifacts = _load_wave_artifacts(config)
    except ValueError:
        artifacts = []

    # setdefault: frontmatter-derived paths win over wave_config entries.
    for artifact in artifacts:
        mapping.setdefault(artifact.node_id, Path(artifact.output))

    return mapping
172
+
173
+
174
def _load_frontmatter(ceg: CEG, doc_path: str, codd: dict):
    """Load CoDD frontmatter data into the graph."""
    doc_node = codd.get("node_id", f"doc:{doc_path}")
    ceg.upsert_node(doc_node, codd.get("type", "document"), path=doc_path, name=doc_node)

    def _link(entry: dict, *, incoming: bool):
        # Shared handling for depends_on / depended_by entries; direction
        # of the edge is the only difference between the two lists.
        other = entry.get("id")
        if not other:
            return
        ceg.upsert_node(other, _infer_node_type(other), name=other)
        relation = entry.get("relation", "depends_on")
        semantic = entry.get("semantic", "governance")
        src, dst = (other, doc_node) if incoming else (doc_node, other)
        edge_id = ceg.add_edge(src, dst, relation, semantic)
        ceg.add_evidence(edge_id, "frontmatter", "frontmatter", 0.9,
                         detail=f"from {doc_path}")

    # Outgoing edges: this document depends on other nodes.
    for entry in codd.get("depends_on", []):
        _link(entry, incoming=False)

    # Incoming edges: other nodes that depend on this document.
    for entry in codd.get("depended_by", []):
        _link(entry, incoming=True)

    # Conventions: must_review rules embedded in the document.
    for conv in codd.get("conventions", []):
        targets = conv.get("targets", [])
        if isinstance(targets, str):
            targets = [targets]
        reason = conv.get("reason", "")
        for target in targets:
            ceg.upsert_node(target, _infer_node_type(target), name=target)
            edge_id = ceg.add_edge(doc_node, target, "must_review", "governance")
            ceg.add_evidence(edge_id, "frontmatter", "convention", 0.8, detail=reason)

    # Data dependencies: behavioral edges from a db column to affected nodes.
    for data_dep in codd.get("data_dependencies", []):
        table = data_dep.get("table", "")
        column = data_dep.get("column", "")
        column_node = f"db_column:{table}.{column}"
        ceg.upsert_node(column_node, "db_column", name=f"{table}.{column}")
        for affected in data_dep.get("affects", []):
            ceg.upsert_node(affected, _infer_node_type(affected), name=affected)
            edge_id = ceg.add_edge(column_node, affected, "behavioral_dependency", "behavioral")
            ceg.add_evidence(edge_id, "frontmatter", "frontmatter", 0.75,
                             detail=data_dep.get("condition", ""))
229
+
230
+
231
+ # ═══════════════════════════════════════════════════════════
232
+ # Legacy: annotations/ YAML support (backward compatibility)
233
+ # ═══════════════════════════════════════════════════════════
234
+
235
def _load_legacy_annotations(ceg: CEG, annotations_dir: Path):
    """Load legacy annotations/*.yaml files (backward compat with v0.1)."""
    # (filename, top-level key, per-entry loader) for each legacy file kind.
    legacy_sources = (
        ("conventions.yaml", "conventions", _load_legacy_convention),
        ("doc_links.yaml", "links", _load_legacy_doc_link),
        ("data_dependencies.yaml", "data_dependencies", _load_legacy_data_dependency),
    )

    loaded_any = False
    for filename, key, loader in legacy_sources:
        path = annotations_dir / filename
        if not path.exists():
            continue
        data = yaml.safe_load(path.read_text())
        for entry in (data or {}).get(key, []):
            loader(ceg, entry)
        # Presence of the file counts as "loaded" even when its list is empty.
        loaded_any = True

    if loaded_any:
        print(" Legacy annotations/ loaded (consider migrating to frontmatter)")
262
+
263
+
264
def _load_legacy_convention(ceg: CEG, conv: dict):
    """Translate a v0.1 convention entry into must_review edges."""
    def _as_list(value):
        # Legacy YAML allowed a bare string where a list was expected.
        return [value] if isinstance(value, str) else value

    sources = _as_list(conv.get("when_changed", []))
    targets = _as_list(conv.get("must_review", []))
    reason = conv.get("reason", "")

    for src in sources:
        ceg.upsert_node(src, _infer_node_type(src), name=src)
        for dst in targets:
            ceg.upsert_node(dst, _infer_node_type(dst), name=dst)
            edge_id = ceg.add_edge(src, dst, "must_review", "governance", confidence=0.5)
            ceg.add_evidence(edge_id, "frontmatter", "legacy_annotation", 0.8, detail=reason)
278
+
279
+
280
def _load_legacy_doc_link(ceg: CEG, link: dict):
    """Translate a v0.1 doc_links entry into traceability edges."""
    requirement = link.get("requirement")
    design = link.get("design")
    code_files = link.get("code", [])
    test_files = link.get("test", [])
    db_tables = link.get("db", [])

    def _legacy_evidence(edge_id, score):
        ceg.add_evidence(edge_id, "frontmatter", "legacy_annotation", score)

    # Requirement-level edges: specifies (design) and implements (code).
    if requirement:
        ceg.upsert_node(requirement, "requirement", name=requirement)
        if design:
            ceg.upsert_node(design, "design", name=design)
            _legacy_evidence(ceg.add_edge(requirement, design, "specifies", "governance"), 0.9)
        for code in code_files:
            ceg.upsert_node(code, "file", path=code, name=code)
            _legacy_evidence(ceg.add_edge(requirement, code, "implements", "governance"), 0.9)

    # Code-level edges: tests (validation) and writes_table (structural),
    # recorded even when no requirement is named.
    for code in code_files:
        ceg.upsert_node(code, "file", path=code, name=code)
        for test in test_files:
            ceg.upsert_node(test, "test_case", path=test, name=test)
            _legacy_evidence(ceg.add_edge(code, test, "tests", "validation"), 0.85)
        for table in db_tables:
            ceg.upsert_node(table, "db_table", name=table)
            _legacy_evidence(ceg.add_edge(code, table, "writes_table", "structural"), 0.8)
308
+
309
+
310
def _load_legacy_data_dependency(ceg: CEG, dep: dict):
    """Translate a v0.1 data_dependencies entry into behavioral edges."""
    qualified = f"{dep.get('table', '')}.{dep.get('column', '')}"
    column_node = f"db_column:{qualified}"
    ceg.upsert_node(column_node, "db_column", name=qualified)

    condition = dep.get("condition", "")
    for affected in dep.get("affects", []):
        ceg.upsert_node(affected, "file", path=affected, name=affected)
        edge_id = ceg.add_edge(column_node, affected, "behavioral_dependency", "behavioral")
        ceg.add_evidence(edge_id, "frontmatter", "legacy_annotation", 0.75, detail=condition)
319
+
320
+
321
+ # ═══════════════════════════════════════════════════════════
322
+ # Phase 2: Source code scanning
323
+ # ═══════════════════════════════════════════════════════════
324
+
325
def _scan_source_directory(ceg: CEG, project_root: Path, src_dir: Path,
                           language: str, exclude_patterns: list):
    """Scan source files for import/call dependencies."""
    # Recognized file suffixes per language; unknown languages scan nothing
    # (endswith with an empty tuple is always False).
    suffix_map = {
        "python": (".py",),
        "typescript": (".ts", ".tsx"),
        "javascript": (".js", ".jsx"),
        "java": (".java",),
        "go": (".go",),
    }
    suffixes = suffix_map.get(language, ())

    scanned = 0
    for current_root, _subdirs, filenames in os.walk(src_dir):
        for filename in filenames:
            if not filename.endswith(suffixes):
                continue
            source_file = Path(current_root) / filename
            rel = source_file.relative_to(project_root).as_posix()

            if any(_match_glob(rel, pattern) for pattern in exclude_patterns):
                continue

            ceg.upsert_node(f"file:{rel}", "file", path=rel, name=filename)
            scanned += 1
            _extract_imports_basic(ceg, project_root, source_file, rel, language)

    if scanned:
        print(f" Source: {scanned} {language} files in {src_dir.relative_to(project_root)}")
354
+
355
+
356
def _extract_imports_basic(ceg: CEG, project_root: Path, file_path: Path,
                           rel_path: str, language: str):
    """Basic import extraction using regex (to be replaced with Tree-sitter).

    Records ``imports`` edges from ``file:{rel_path}`` to either resolved
    project files (TS/JS relative imports) or bare module nodes (Python).
    Unreadable files are skipped silently.
    """
    try:
        content = file_path.read_text(errors="ignore")
    except Exception:
        return

    source_id = f"file:{rel_path}"

    if language in ("typescript", "javascript"):
        for match in re.finditer(r'''(?:import|from)\s+['"]([^'"]+)['"]''', content):
            target_module = match.group(1)
            if not target_module.startswith("."):
                continue  # only relative imports resolve to project files
            resolved = (file_path.parent / target_module).resolve()
            # Try each extension/index form until a file exists on disk.
            for ext in [".ts", ".tsx", ".js", ".jsx", "/index.ts", "/index.tsx"]:
                candidate = Path(str(resolved) + ext)
                if not candidate.exists():
                    continue
                try:
                    target_rel = candidate.relative_to(project_root).as_posix()
                except ValueError:
                    # Import resolved outside the project root; skip it
                    # instead of crashing the whole scan.
                    break
                target_id = f"file:{target_rel}"
                ceg.upsert_node(target_id, "file", path=target_rel)
                edge_id = ceg.add_edge(source_id, target_id, "imports", "structural")
                ceg.add_evidence(edge_id, "static", "regex_import", 0.95)
                break

    elif language == "python":
        # Anchor to line starts so that words inside strings/comments and the
        # "import y" tail of "from x import y" are not misread as modules.
        for match in re.finditer(r'^\s*(?:from|import)\s+([\w.]+)', content, re.MULTILINE):
            target_module = match.group(1)
            target_id = f"module:{target_module}"
            ceg.upsert_node(target_id, "module", name=target_module)
            edge_id = ceg.add_edge(source_id, target_id, "imports", "structural")
            ceg.add_evidence(edge_id, "static", "regex_import", 0.90)
388
+
389
+
390
+ # ═══════════════════════════════════════════════════════════
391
+ # Utilities
392
+ # ═══════════════════════════════════════════════════════════
393
+
394
+ def _match_glob(path: str, pattern: str) -> bool:
395
+ import fnmatch
396
+ return fnmatch.fnmatch(path, pattern)
397
+
398
+
399
def _collect_document_warnings(rel_path: str, codd_data: dict) -> list[str]:
    """Flag per-document metadata problems found in frontmatter."""
    is_design = codd_data.get("type") == "design"
    if is_design and not _has_dependency_refs(codd_data.get("depends_on")):
        return [f"{rel_path}: design document has empty depends_on"]
    return []
404
+
405
+
406
+ def _has_dependency_refs(entries) -> bool:
407
+ if not entries:
408
+ return False
409
+ for entry in entries:
410
+ if isinstance(entry, str) and entry:
411
+ return True
412
+ if isinstance(entry, dict) and (entry.get("id") or entry.get("node_id")):
413
+ return True
414
+ return False
415
+
416
+
417
def _collect_wave_config_warnings(project_root: Path, config: dict) -> list[str]:
    """Warn about wave_config artifacts whose output files do not exist yet."""
    if not config.get("wave_config"):
        return []

    from codd.generator import _load_wave_artifacts

    return [
        f"{artifact.output}: wave_config defines {artifact.node_id} but the file has not been generated"
        for artifact in _load_wave_artifacts(config)
        if not (project_root / artifact.output).exists()
    ]
432
+
433
+
434
+ def _infer_node_type(node_id: str) -> str:
435
+ prefixes = {
436
+ "db_table:": "db_table", "db_column:": "db_column",
437
+ "module:": "module", "file:": "file", "test:": "test_case",
438
+ "config:": "config_key", "endpoint:": "endpoint",
439
+ "infra:": "infrastructure", "db:": "db_object",
440
+ "req:": "requirement", "design:": "design", "doc:": "document",
441
+ }
442
+ for prefix, node_type in prefixes.items():
443
+ if node_id.startswith(prefix):
444
+ return node_type
445
+ return "unknown"