polycodegraph 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. codegraph/__init__.py +10 -0
  2. codegraph/analysis/__init__.py +30 -0
  3. codegraph/analysis/_common.py +125 -0
  4. codegraph/analysis/blast_radius.py +63 -0
  5. codegraph/analysis/cycles.py +79 -0
  6. codegraph/analysis/dataflow.py +861 -0
  7. codegraph/analysis/dead_code.py +165 -0
  8. codegraph/analysis/hotspots.py +68 -0
  9. codegraph/analysis/infrastructure.py +439 -0
  10. codegraph/analysis/metrics.py +52 -0
  11. codegraph/analysis/report.py +222 -0
  12. codegraph/analysis/roles.py +323 -0
  13. codegraph/analysis/untested.py +79 -0
  14. codegraph/cli.py +1506 -0
  15. codegraph/config.py +64 -0
  16. codegraph/embed/__init__.py +35 -0
  17. codegraph/embed/chunker.py +120 -0
  18. codegraph/embed/embedder.py +113 -0
  19. codegraph/embed/query.py +181 -0
  20. codegraph/embed/store.py +360 -0
  21. codegraph/graph/__init__.py +0 -0
  22. codegraph/graph/builder.py +212 -0
  23. codegraph/graph/schema.py +69 -0
  24. codegraph/graph/store_networkx.py +55 -0
  25. codegraph/graph/store_sqlite.py +249 -0
  26. codegraph/mcp_server/__init__.py +6 -0
  27. codegraph/mcp_server/server.py +933 -0
  28. codegraph/parsers/__init__.py +0 -0
  29. codegraph/parsers/base.py +70 -0
  30. codegraph/parsers/go.py +570 -0
  31. codegraph/parsers/python.py +1707 -0
  32. codegraph/parsers/typescript.py +1397 -0
  33. codegraph/py.typed +0 -0
  34. codegraph/resolve/__init__.py +4 -0
  35. codegraph/resolve/calls.py +480 -0
  36. codegraph/review/__init__.py +31 -0
  37. codegraph/review/baseline.py +32 -0
  38. codegraph/review/differ.py +211 -0
  39. codegraph/review/hook.py +70 -0
  40. codegraph/review/risk.py +219 -0
  41. codegraph/review/rules.py +342 -0
  42. codegraph/viz/__init__.py +17 -0
  43. codegraph/viz/_style.py +45 -0
  44. codegraph/viz/dashboard.py +740 -0
  45. codegraph/viz/diagrams.py +370 -0
  46. codegraph/viz/explore.py +453 -0
  47. codegraph/viz/hld.py +683 -0
  48. codegraph/viz/html.py +115 -0
  49. codegraph/viz/mermaid.py +111 -0
  50. codegraph/viz/svg.py +77 -0
  51. codegraph/web/__init__.py +4 -0
  52. codegraph/web/server.py +165 -0
  53. codegraph/web/static/app.css +664 -0
  54. codegraph/web/static/app.js +919 -0
  55. codegraph/web/static/index.html +112 -0
  56. codegraph/web/static/views/architecture.js +1671 -0
  57. codegraph/web/static/views/graph3d.css +564 -0
  58. codegraph/web/static/views/graph3d.js +999 -0
  59. codegraph/web/static/views/graph3d_transform.js +984 -0
  60. codegraph/workspace/__init__.py +34 -0
  61. codegraph/workspace/config.py +110 -0
  62. codegraph/workspace/operations.py +294 -0
  63. polycodegraph-0.1.0.dist-info/METADATA +687 -0
  64. polycodegraph-0.1.0.dist-info/RECORD +67 -0
  65. polycodegraph-0.1.0.dist-info/WHEEL +4 -0
  66. polycodegraph-0.1.0.dist-info/entry_points.txt +2 -0
  67. polycodegraph-0.1.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,360 @@
1
+ """On-disk vector store.
2
+
3
+ Tries LanceDB first (the production backend) and falls back to a tiny JSON
4
+ file when the optional ``embed`` extra isn't installed. The fallback is good
5
+ enough for unit tests and for repos that just want a quick local index without
6
+ pulling the full Arrow / LanceDB stack.
7
+ """
8
+ from __future__ import annotations
9
+
10
+ import contextlib
11
+ import json
12
+ import math
13
+ from collections.abc import Iterable, Sequence
14
+ from dataclasses import asdict, dataclass, field
15
+ from pathlib import Path
16
+ from typing import Any
17
+
18
+ from codegraph.embed.chunker import Chunk # noqa: F401 (re-export friendly)
19
+ from codegraph.embed.embedder import DEFAULT_DIM, DEFAULT_MODEL, Embedder
20
+
21
+ _STORE_FILENAME = "embeddings.lance"
22
+ _FALLBACK_FILENAME = "embeddings.json"
23
+
24
+
25
@dataclass
class StoredChunk:
    """One embedded chunk row, the record type shared by both storage backends.

    The scalar fields mirror the chunk the row was built from; ``vector`` holds
    the embedding and defaults to an empty list.
    """

    id: str
    qualname: str
    file: str
    line_start: int
    line_end: int
    kind: str
    role: str | None
    text: str
    vector: list[float] = field(default_factory=list)

    # pragma: codegraph-public-api
    def to_json(self) -> dict[str, Any]:
        """Return a plain dict of this row; ``vector`` is copied into a new list."""
        scalar_fields = (
            "id",
            "qualname",
            "file",
            "line_start",
            "line_end",
            "kind",
            "role",
            "text",
        )
        payload: dict[str, Any] = {name: getattr(self, name) for name in scalar_fields}
        payload["vector"] = list(self.vector)
        return payload

    @classmethod
    def from_json(cls, data: dict[str, Any]) -> StoredChunk:
        """Build a row from a dict shaped like :meth:`to_json` output.

        Values are coerced to the declared field types. A missing or falsy
        ``role`` becomes ``None``; a missing or falsy ``vector`` becomes ``[]``.
        Raises ``KeyError`` when any other key is absent.
        """
        # Keys are read in declaration order so the first failure matches the
        # first offending field.
        kwargs: dict[str, Any] = {}
        for key in ("id", "qualname", "file"):
            kwargs[key] = str(data[key])
        for key in ("line_start", "line_end"):
            kwargs[key] = int(data[key])
        kwargs["kind"] = str(data["kind"])
        raw_role = data.get("role")
        kwargs["role"] = str(raw_role) if raw_role else None
        kwargs["text"] = str(data["text"])
        kwargs["vector"] = [float(component) for component in data.get("vector") or []]
        return cls(**kwargs)
64
+
65
+
66
+ def _cosine(a: Sequence[float], b: Sequence[float]) -> float:
67
+ if not a or not b:
68
+ return 0.0
69
+ dot = 0.0
70
+ na = 0.0
71
+ nb = 0.0
72
+ for x, y in zip(a, b, strict=False):
73
+ dot += x * y
74
+ na += x * x
75
+ nb += y * y
76
+ if na == 0.0 or nb == 0.0:
77
+ return 0.0
78
+ return dot / (math.sqrt(na) * math.sqrt(nb))
79
+
80
+
81
+ # ---------------------------------------------------------------------------
82
+ # Backend abstraction
83
+ # ---------------------------------------------------------------------------
84
+
85
+ class _JsonBackend:
86
+ """JSON-backed backend. Used in tests and as the no-deps fallback."""
87
+
88
+ def __init__(self, path: Path) -> None:
89
+ self.path = path
90
+ self.path.parent.mkdir(parents=True, exist_ok=True)
91
+ self._rows: list[StoredChunk] = []
92
+ if self.path.exists():
93
+ try:
94
+ raw = json.loads(self.path.read_text(encoding="utf-8"))
95
+ self._rows = [StoredChunk.from_json(r) for r in raw]
96
+ except (OSError, json.JSONDecodeError, KeyError, TypeError):
97
+ self._rows = []
98
+
99
+ def upsert(self, rows: Iterable[StoredChunk]) -> None:
100
+ new = list(rows)
101
+ new_ids = {r.id for r in new}
102
+ kept = [r for r in self._rows if r.id not in new_ids]
103
+ self._rows = kept + new
104
+ self._flush()
105
+
106
+ def replace_all(self, rows: Iterable[StoredChunk]) -> None:
107
+ self._rows = list(rows)
108
+ self._flush()
109
+
110
+ def _flush(self) -> None:
111
+ payload = [r.to_json() for r in self._rows]
112
+ self.path.write_text(json.dumps(payload), encoding="utf-8")
113
+
114
+ def all(self) -> list[StoredChunk]:
115
+ return list(self._rows)
116
+
117
+ def query(self, vector: Sequence[float], k: int) -> list[tuple[StoredChunk, float]]:
118
+ scored = [(row, _cosine(vector, row.vector)) for row in self._rows]
119
+ scored.sort(key=lambda pair: pair[1], reverse=True)
120
+ return scored[:k]
121
+
122
+ def size_bytes(self) -> int:
123
+ return self.path.stat().st_size if self.path.exists() else 0
124
+
125
+
126
+ class _LanceBackend:
127
+ """LanceDB backend. Schema mirrors :class:`StoredChunk`."""
128
+
129
+ def __init__(self, path: Path, dim: int) -> None:
130
+ import lancedb
131
+ import pyarrow as pa
132
+
133
+ self.path = path
134
+ self._dim = dim
135
+ self._pa = pa
136
+ self._db = lancedb.connect(str(path))
137
+ self._schema = self._make_schema(dim)
138
+ if "chunks" in self._db.table_names():
139
+ self._table = self._db.open_table("chunks")
140
+ else:
141
+ self._table = self._db.create_table("chunks", schema=self._schema, mode="create")
142
+
143
+ def _make_schema(self, dim: int) -> Any:
144
+ pa = self._pa
145
+ return pa.schema(
146
+ [
147
+ ("id", pa.string()),
148
+ ("qualname", pa.string()),
149
+ ("file", pa.string()),
150
+ ("line_start", pa.int64()),
151
+ ("line_end", pa.int64()),
152
+ ("kind", pa.string()),
153
+ ("role", pa.string()),
154
+ ("text", pa.string()),
155
+ ("vector", pa.list_(pa.float32(), dim)),
156
+ ]
157
+ )
158
+
159
+ def _to_dict(self, row: StoredChunk) -> dict[str, Any]:
160
+ return {
161
+ "id": row.id,
162
+ "qualname": row.qualname,
163
+ "file": row.file,
164
+ "line_start": row.line_start,
165
+ "line_end": row.line_end,
166
+ "kind": row.kind,
167
+ "role": row.role or "",
168
+ "text": row.text,
169
+ "vector": row.vector,
170
+ }
171
+
172
+ def upsert(self, rows: Iterable[StoredChunk]) -> None:
173
+ batch = [self._to_dict(r) for r in rows]
174
+ if not batch:
175
+ return
176
+ ids = ", ".join(f"'{r['id']}'" for r in batch)
177
+ with contextlib.suppress(Exception):
178
+ self._table.delete(f"id IN ({ids})")
179
+ self._table.add(batch)
180
+
181
+ def replace_all(self, rows: Iterable[StoredChunk]) -> None:
182
+ batch = [self._to_dict(r) for r in rows]
183
+ with contextlib.suppress(Exception):
184
+ self._db.drop_table("chunks", ignore_missing=True)
185
+ self._table = self._db.create_table("chunks", schema=self._schema, mode="create")
186
+ if batch:
187
+ self._table.add(batch)
188
+
189
+ def _row_from_record(self, r: dict[str, Any]) -> StoredChunk:
190
+ return StoredChunk(
191
+ id=str(r["id"]),
192
+ qualname=str(r["qualname"]),
193
+ file=str(r["file"]),
194
+ line_start=int(r["line_start"]),
195
+ line_end=int(r["line_end"]),
196
+ kind=str(r["kind"]),
197
+ role=str(r["role"]) or None,
198
+ text=str(r["text"]),
199
+ vector=list(r["vector"]),
200
+ )
201
+
202
+ def all(self) -> list[StoredChunk]:
203
+ rows = self._table.to_pandas().to_dict(orient="records")
204
+ return [self._row_from_record(r) for r in rows]
205
+
206
+ def query(self, vector: Sequence[float], k: int) -> list[tuple[StoredChunk, float]]:
207
+ results = self._table.search(list(vector)).limit(k).to_pandas()
208
+ out: list[tuple[StoredChunk, float]] = []
209
+ for r in results.to_dict(orient="records"):
210
+ chunk = self._row_from_record(r)
211
+ distance = float(r.get("_distance", 0.0))
212
+ similarity = 1.0 / (1.0 + distance)
213
+ out.append((chunk, similarity))
214
+ return out
215
+
216
+ def size_bytes(self) -> int:
217
+ total = 0
218
+ for p in self.path.rglob("*"):
219
+ if p.is_file():
220
+ total += p.stat().st_size
221
+ return total
222
+
223
+
224
+ # ---------------------------------------------------------------------------
225
+ # Public store
226
+ # ---------------------------------------------------------------------------
227
+
228
+
229
class EmbeddingStore:
    """High-level interface that auto-selects a backend.

    ``backend='auto'`` (default) tries LanceDB and falls back to JSON.
    ``backend='json'`` forces the lightweight backend (used in tests).
    ``backend='lancedb'`` re-raises the ``ImportError`` instead of falling back.
    """

    def __init__(
        self,
        data_dir: Path,
        *,
        dim: int = DEFAULT_DIM,
        backend: str = "auto",
    ) -> None:
        """Create ``data_dir`` if needed and open the selected backend inside it."""
        self.data_dir = data_dir
        self.dim = dim
        self.backend_name: str
        self._backend: _LanceBackend | _JsonBackend
        data_dir.mkdir(parents=True, exist_ok=True)

        if backend != "json":
            try:
                self._backend = _LanceBackend(data_dir / _STORE_FILENAME, dim=dim)
            except ImportError:
                # Only an explicit request for LanceDB turns the missing
                # dependency into an error; anything else falls through.
                if backend == "lancedb":
                    raise
            else:
                self.backend_name = "lancedb"
                return

        self._backend = _JsonBackend(data_dir / _FALLBACK_FILENAME)
        self.backend_name = "json"

    # ------------------------------------------------------------------
    # pragma: codegraph-public-api
    def upsert(self, rows: Iterable[StoredChunk]) -> None:
        """Forward ``rows`` to the backend's ``upsert``."""
        self._backend.upsert(rows)

    # pragma: codegraph-public-api
    def replace_all(self, rows: Iterable[StoredChunk]) -> None:
        """Forward ``rows`` to the backend's ``replace_all``."""
        self._backend.replace_all(rows)

    # pragma: codegraph-public-api
    def all(self) -> list[StoredChunk]:
        """Return every row held by the backend."""
        return self._backend.all()

    # pragma: codegraph-public-api
    def query(self, vector: Sequence[float], k: int = 5) -> list[tuple[StoredChunk, float]]:
        """Return the backend's top ``k`` ``(row, score)`` pairs for ``vector``."""
        return self._backend.query(vector, k)

    # pragma: codegraph-public-api
    def size_bytes(self) -> int:
        """Return the backend's on-disk size in bytes."""
        return self._backend.size_bytes()
283
+
284
+
285
+ # ---------------------------------------------------------------------------
286
+ # build_index — orchestrator wired up to chunker + embedder
287
+ # ---------------------------------------------------------------------------
288
+
289
+
290
@dataclass
class IndexStats:
    """Summary returned by :func:`build_index` for one indexing run."""

    chunks_indexed: int
    model: str
    dim: int
    backend: str
    on_disk_bytes: int

    # pragma: codegraph-public-api
    def as_dict(self) -> dict[str, Any]:
        """Return the stats as a plain ``{field_name: value}`` dict."""
        snapshot: dict[str, Any] = asdict(self)
        return snapshot
301
+
302
+
303
def build_index(
    repo_root: Path,
    *,
    db_path: Path | None = None,
    embeddings_dir: Path | None = None,
    embedder: Embedder | None = None,
    model: str = DEFAULT_MODEL,
    force: bool = False,
    progress: Any | None = None,
    backend: str = "auto",
) -> IndexStats:
    """Chunk + embed + persist.

    Args:
        repo_root: Repository to chunk.
        db_path: Forwarded to ``chunk_repo``.
        embeddings_dir: Where the store lives; defaults to ``repo_root/.codegraph``.
        embedder: Embedder to use; when omitted one is built from ``model``.
        model: Model name for the default embedder.
        force: Replace the whole store instead of upserting into it.
        progress: Anything with an ``advance(step: int)`` method — typically a
            ``rich.progress.Progress`` task. Pass ``None`` to disable. Errors
            raised by ``advance`` are ignored.
        backend: Passed through to :class:`EmbeddingStore`.

    Returns:
        An :class:`IndexStats` describing what was written.
    """
    from codegraph.embed.chunker import chunk_repo

    chunks = list(chunk_repo(repo_root, db_path=db_path))
    emb = embedder or Embedder(model)

    rows: list[StoredChunk] = []
    dim = DEFAULT_DIM
    if chunks:
        texts = [chunk.text for chunk in chunks]
        vectors = emb.embed(texts, batch_size=32)
        if vectors:
            # The first vector's length decides the store dimension.
            dim = len(vectors[0])
        for chunk, vector in zip(chunks, vectors, strict=False):
            stored = StoredChunk(
                id=chunk.id,
                qualname=chunk.qualname,
                file=chunk.file,
                line_start=chunk.line_start,
                line_end=chunk.line_end,
                kind=chunk.kind,
                role=chunk.role,
                text=chunk.text,
                vector=vector,
            )
            rows.append(stored)
            if progress is None:
                continue
            with contextlib.suppress(Exception):
                progress.advance(1)

    out_dir = embeddings_dir or (repo_root / ".codegraph")
    store = EmbeddingStore(out_dir, dim=dim, backend=backend)
    write_rows = store.replace_all if force else store.upsert
    write_rows(rows)

    return IndexStats(
        chunks_indexed=len(rows),
        model=emb.model,
        dim=dim,
        backend=store.backend_name,
        on_disk_bytes=store.size_bytes(),
    )
File without changes
@@ -0,0 +1,212 @@
1
+ """Repo walker and incremental graph builder."""
2
+ from __future__ import annotations
3
+
4
+ import hashlib
5
+ import logging
6
+ import subprocess
7
+ from dataclasses import dataclass, field
8
+ from datetime import datetime, timezone
9
+ from pathlib import Path
10
+ from typing import Any
11
+
12
+ import pathspec
13
+
14
+ # Ensure extractors register themselves.
15
+ import codegraph.parsers.go
16
+ import codegraph.parsers.python
17
+ import codegraph.parsers.typescript # noqa: F401
18
+ from codegraph.config import CodegraphConfig
19
+ from codegraph.graph.schema import Node, NodeKind, make_node_id
20
+ from codegraph.graph.store_sqlite import SQLiteGraphStore
21
+ from codegraph.parsers.base import get_extractor_for
22
+ from codegraph.parsers.python import PythonExtractor
23
+
24
+ logger = logging.getLogger(__name__)
25
+
26
+ _BUILTIN_IGNORES = [
27
+ ".git", ".venv", "venv", "node_modules", ".codegraph",
28
+ "dist", "build", "__pycache__", ".next", ".pytest_cache",
29
+ ".mypy_cache", ".ruff_cache", ".tox", ".eggs", "*.egg-info",
30
+ ".DS_Store", "*.pyc", "*.pyo",
31
+ ]
32
+
33
+ _IGNORE_DIRS: set[str] = {
34
+ ".git", ".venv", "venv", "node_modules", ".codegraph",
35
+ "dist", "build", "__pycache__", ".next", ".pytest_cache",
36
+ ".mypy_cache", ".ruff_cache", ".tox",
37
+ }
38
+
39
+
40
@dataclass
class BuildStats:
    """Counters and error messages collected during one ``GraphBuilder.build`` run."""

    # Files that survived the ignore filters.
    files_scanned: int = 0
    # Files for which an extractor ran.
    files_parsed: int = 0
    # File nodes plus extractor-produced nodes written to the store.
    nodes_added: int = 0
    # Extractor-produced edges written to the store.
    edges_added: int = 0
    # Files left untouched because their content hash was unchanged.
    files_skipped: int = 0
    # One "<where>: <exception>" string per failure; the build keeps going.
    errors: list[str] = field(default_factory=list)
48
+
49
+
50
+ def _sha256(path: Path) -> str:
51
+ h = hashlib.sha256()
52
+ with path.open("rb") as f:
53
+ for chunk in iter(lambda: f.read(65536), b""):
54
+ h.update(chunk)
55
+ return h.hexdigest()
56
+
57
+
58
+ def _get_git_sha(repo_root: Path) -> str | None:
59
+ try:
60
+ result = subprocess.run(
61
+ ["git", "rev-parse", "--short", "HEAD"],
62
+ cwd=repo_root,
63
+ capture_output=True,
64
+ text=True,
65
+ timeout=5,
66
+ )
67
+ if result.returncode == 0:
68
+ return result.stdout.strip()
69
+ except Exception:
70
+ pass
71
+ return None
72
+
73
+
74
class GraphBuilder:
    """Walk a repository and load its files and symbols into a graph store.

    ``build`` hashes every non-ignored file, re-indexes the ones that need it,
    records build metadata, and then runs two best-effort passes over the whole
    store: cross-file edge resolution and architectural role classification.
    """

    def __init__(
        self,
        repo_root: Path,
        store: SQLiteGraphStore,
        ignore: list[str] | None = None,
        config: CodegraphConfig | None = None,
    ) -> None:
        """Bind the builder to a repository and a store.

        Args:
            repo_root: Directory to scan.
            store: Store that receives nodes, edges and build metadata.
            ignore: Extra gitwildmatch patterns applied on top of the built-ins.
            config: Project configuration; a default ``CodegraphConfig`` is
                used when omitted.
        """
        self._repo_root = repo_root
        self._store = store
        self._ignore = ignore or []
        self._config = config or CodegraphConfig()
        self._apply_config_to_extractors()

    def _apply_config_to_extractors(self) -> None:
        """Forward user dead-code patterns onto the singleton extractors."""
        extra = tuple(self._config.dead_code.entry_point_decorators)
        # PythonExtractor is registered as a singleton in the registry; we
        # mutate its class attribute so subsequent parse_file calls pick up
        # the user patterns.
        PythonExtractor.extra_entry_point_decorators = extra

    def build(self, incremental: bool = True) -> BuildStats:
        """Index the repository and return the run's statistics.

        Args:
            incremental: When true, a file whose stored content hash equals its
                current hash is skipped.

        Returns:
            A ``BuildStats``. Per-file and post-pass failures are logged and
            collected in ``errors`` rather than raised.
        """
        stats = BuildStats()
        # Unpacking (rather than ``+``) also accepts a tuple for ``ignore``.
        patterns = [*_BUILTIN_IGNORES, *self._ignore]
        spec = pathspec.PathSpec.from_lines("gitwildmatch", patterns)

        files = self._walk_repo(spec)
        stats.files_scanned = len(files)

        for file_path in files:
            rel = file_path.relative_to(self._repo_root).as_posix()
            try:
                self._index_file(file_path, rel, incremental, stats)
            except Exception as exc:
                # One unreadable or unparsable file must not abort the build.
                logger.warning("Error parsing %s: %s", rel, exc)
                stats.errors.append(f"{rel}: {exc}")

        now = datetime.now(tz=timezone.utc).isoformat()
        self._store.set_meta("last_build_time", now)
        git_sha = _get_git_sha(self._repo_root)
        if git_sha:
            self._store.set_meta("last_git_sha", git_sha)

        self._resolve_edges(stats)
        self._stamp_roles(stats)
        return stats

    def _index_file(
        self,
        file_path: Path,
        rel: str,
        incremental: bool,
        stats: BuildStats,
    ) -> None:
        """Hash one file and, unless it is unchanged, rewrite its nodes and edges."""
        content_hash = _sha256(file_path)

        extractor = get_extractor_for(file_path)
        language = extractor.language if extractor else "unknown"

        file_node_id = make_node_id(NodeKind.FILE, rel, rel)
        if incremental:
            existing = self._store.get_node(file_node_id)
            if existing and existing.content_hash == content_hash:
                stats.files_skipped += 1
                return

        # Clear whatever the store already holds for this path before
        # inserting the fresh file node and parse results.
        self._store.delete_file(rel)

        file_node = Node(
            id=file_node_id,
            kind=NodeKind.FILE,
            name=file_path.name,
            qualname=rel,
            file=rel,
            line_start=1,
            line_end=0,
            content_hash=content_hash,
            language=language,
            metadata={"size": file_path.stat().st_size},
        )
        self._store.upsert_node(file_node)
        stats.nodes_added += 1

        if extractor is None:
            # No extractor for this file: it keeps only its FILE node.
            return
        nodes, edges = extractor.parse_file(file_path, self._repo_root)
        self._store.upsert_nodes(nodes)
        self._store.upsert_edges(edges)
        stats.nodes_added += len(nodes)
        stats.edges_added += len(edges)
        stats.files_parsed += 1

    def _resolve_edges(self, stats: BuildStats) -> None:
        """Best-effort cross-file resolution of unresolved CALLS/IMPORTS edges."""
        try:
            from codegraph.resolve import resolve_unresolved_edges

            rstats = resolve_unresolved_edges(self._store)
            self._store.set_meta(
                "last_resolve",
                f"{rstats.resolved}/{rstats.inspected} resolved",
            )
        except Exception as exc:  # pragma: no cover - defensive
            logger.warning("resolver failed: %s", exc)
            stats.errors.append(f"resolver: {exc}")

    def _stamp_roles(self, stats: BuildStats) -> None:
        """Best-effort architectural role classification (DF1.5).

        Stamps HANDLER/SERVICE/COMPONENT/REPO onto FUNCTION/METHOD/CLASS nodes:
        every graph node that carries a ``role`` in its metadata after
        ``classify_roles`` ran has that role copied onto the stored node.
        """
        try:
            from codegraph.analysis.roles import classify_roles
            from codegraph.graph.store_networkx import to_digraph

            graph = to_digraph(self._store)
            count = classify_roles(graph)
            if count:
                updated: list[Node] = []
                for nid, attrs in graph.nodes(data=True):
                    role = (attrs.get("metadata") or {}).get("role")
                    if not role:
                        continue
                    existing = self._store.get_node(nid)
                    if existing is None:
                        continue
                    existing.metadata["role"] = role
                    updated.append(existing)
                if updated:
                    self._store.upsert_nodes(updated)
            self._store.set_meta("last_roles", str(count))
            logger.info("roles: %d nodes classified", count)
        except Exception as exc:  # pragma: no cover - defensive
            logger.warning("role classifier failed: %s", exc)
            stats.errors.append(f"roles: {exc}")

    def _walk_repo(self, spec: Any) -> list[Path]:
        """Return every indexable file under the repo root, sorted by path.

        A file is excluded when ``spec`` matches its repo-relative POSIX path
        or when any parent directory name is in ``_IGNORE_DIRS``.
        """
        import os

        result: list[Path] = []
        for dirpath, dirnames, filenames in os.walk(self._repo_root):
            # Prune ignored directories in place so their contents (.git,
            # node_modules, virtualenvs, caches) are never traversed at all.
            dirnames[:] = [name for name in dirnames if name not in _IGNORE_DIRS]
            for name in filenames:
                file_path = Path(dirpath) / name
                if not file_path.is_file():
                    continue
                try:
                    rel = file_path.relative_to(self._repo_root).as_posix()
                except ValueError:
                    continue
                if spec.match_file(rel):
                    continue
                result.append(file_path)
        # Sort Path objects (not strings) to keep the processing order stable.
        result.sort()
        return result
@@ -0,0 +1,69 @@
1
+ """Graph schema: Node, Edge, and ID generation."""
2
+ from __future__ import annotations
3
+
4
+ import hashlib
5
+ from enum import Enum
6
+ from typing import Any
7
+
8
+ from pydantic import BaseModel, Field
9
+
10
+
11
class NodeKind(str, Enum):
    """Kinds of graph node; every member's value is its own name as a string."""

    FILE = "FILE"
    MODULE = "MODULE"
    CLASS = "CLASS"
    FUNCTION = "FUNCTION"
    METHOD = "METHOD"
    VARIABLE = "VARIABLE"
    PARAMETER = "PARAMETER"
    IMPORT = "IMPORT"
    TEST = "TEST"
21
+
22
+
23
class EdgeKind(str, Enum):
    """Kinds of graph edge; every member's value is its own name as a string."""

    DEFINED_IN = "DEFINED_IN"
    IMPORTS = "IMPORTS"
    CALLS = "CALLS"
    INHERITS = "INHERITS"
    IMPLEMENTS = "IMPLEMENTS"
    READS = "READS"
    WRITES = "WRITES"
    RETURNS = "RETURNS"
    PARAM_OF = "PARAM_OF"
    TESTED_BY = "TESTED_BY"
    # v0.2 cross-stack data-flow edges (populated by DF1 / DF2 extractors).
    # Reserved here so DF1/DF2 agents don't both edit this enum in parallel.
    ROUTE = "ROUTE"  # HANDLER → URL pattern (DF1, FastAPI/Flask)
    READS_FROM = "READS_FROM"  # function → SQLAlchemy model on read (DF1)
    WRITES_TO = "WRITES_TO"  # function → SQLAlchemy model on write (DF1)
    FETCH_CALL = "FETCH_CALL"  # frontend call site → URL string (DF2, fetch/axios)
40
+
41
+
42
class Node(BaseModel):
    """A graph node: one file or one symbol extracted from a file."""

    # Identity and classification.
    id: str
    kind: NodeKind
    name: str
    qualname: str

    # Source location.
    file: str
    line_start: int
    line_end: int

    # Optional descriptive fields; absent unless whoever creates the node sets them.
    signature: str | None = None
    docstring: str | None = None
    content_hash: str | None = None

    language: str
    # Free-form extras; each instance gets its own dict.
    metadata: dict[str, Any] = Field(default_factory=dict)
55
+
56
+
57
class Edge(BaseModel):
    """A directed, typed edge between two nodes, identified by their ids."""

    src: str
    dst: str
    kind: EdgeKind

    # Optional location associated with the edge.
    file: str | None = None
    line: int | None = None

    # Free-form extras; each instance gets its own dict.
    metadata: dict[str, Any] = Field(default_factory=dict)
64
+
65
+
66
def make_node_id(kind: NodeKind, qualname: str, file: str) -> str:
    """Stable BLAKE2b-128 hex hash of (kind, qualname, file).

    The three parts are joined with ``:`` and hashed; the result is a
    32-character hex string.
    """
    payload = f"{kind.value}:{qualname}:{file}".encode()
    hasher = hashlib.blake2b(digest_size=16)
    hasher.update(payload)
    return hasher.hexdigest()
@@ -0,0 +1,55 @@
1
+ """NetworkX adapter for the SQLiteGraphStore."""
2
+ from __future__ import annotations
3
+
4
+ from collections.abc import Iterable
5
+ from typing import cast
6
+
7
+ import networkx as nx
8
+
9
+ from codegraph.graph.schema import EdgeKind
10
+ from codegraph.graph.store_sqlite import SQLiteGraphStore
11
+
12
+
13
def to_digraph(store: SQLiteGraphStore) -> nx.MultiDiGraph:
    """Load every node and edge from ``store`` into a new ``MultiDiGraph``.

    Nodes are keyed by their id and edges by their kind value; in both cases
    the JSON-mode model dump is attached as attributes.
    """
    graph: nx.MultiDiGraph = nx.MultiDiGraph()

    for node in store.iter_nodes():
        node_attrs = node.model_dump(mode="json")
        graph.add_node(node.id, **node_attrs)

    for edge in store.iter_edges():
        edge_attrs = edge.model_dump(mode="json")
        graph.add_edge(edge.src, edge.dst, key=edge.kind.value, **edge_attrs)

    return graph
20
+
21
+
22
def subgraph_around(
    g: nx.MultiDiGraph,
    node_id: str,
    depth: int,
    direction: str = "both",
    edge_kinds: Iterable[EdgeKind] | None = None,
) -> nx.MultiDiGraph:
    """Return a MultiDiGraph of nodes within `depth` BFS hops from node_id.

    Args:
        g: Graph to search.
        node_id: Starting node.
        depth: Number of hops to expand.
        direction: ``"out"`` follows outgoing edges, ``"in"`` incoming ones,
            ``"both"`` follows either; any other value follows none.
        edge_kinds: When given, only edges whose ``kind`` attribute equals one
            of these kinds' values are followed.

    Returns:
        A copy of the subgraph of ``g`` induced by the reached nodes.
    """
    kinds: set[str] | None = (
        None if edge_kinds is None else {kind.value for kind in edge_kinds}
    )
    follow_out = direction in ("out", "both")
    follow_in = direction in ("in", "both")

    def _allowed(data: dict) -> bool:
        return kinds is None or data.get("kind") in kinds

    seen: set[str] = set()
    frontier: set[str] = {node_id}
    for _ in range(depth):
        discovered: set[str] = set()
        for current in frontier:
            if current not in g:
                continue
            if follow_out:
                discovered.update(
                    dst
                    for _src, dst, data in g.out_edges(current, data=True)
                    if _allowed(data)
                )
            if follow_in:
                discovered.update(
                    src
                    for src, _dst, data in g.in_edges(current, data=True)
                    if _allowed(data)
                )
        # The finished level joins the visited set; only unseen nodes move on.
        seen |= frontier
        frontier = discovered - seen
    seen |= frontier
    return cast(nx.MultiDiGraph, g.subgraph(seen).copy())