polycodegraph 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- codegraph/__init__.py +10 -0
- codegraph/analysis/__init__.py +30 -0
- codegraph/analysis/_common.py +125 -0
- codegraph/analysis/blast_radius.py +63 -0
- codegraph/analysis/cycles.py +79 -0
- codegraph/analysis/dataflow.py +861 -0
- codegraph/analysis/dead_code.py +165 -0
- codegraph/analysis/hotspots.py +68 -0
- codegraph/analysis/infrastructure.py +439 -0
- codegraph/analysis/metrics.py +52 -0
- codegraph/analysis/report.py +222 -0
- codegraph/analysis/roles.py +323 -0
- codegraph/analysis/untested.py +79 -0
- codegraph/cli.py +1506 -0
- codegraph/config.py +64 -0
- codegraph/embed/__init__.py +35 -0
- codegraph/embed/chunker.py +120 -0
- codegraph/embed/embedder.py +113 -0
- codegraph/embed/query.py +181 -0
- codegraph/embed/store.py +360 -0
- codegraph/graph/__init__.py +0 -0
- codegraph/graph/builder.py +212 -0
- codegraph/graph/schema.py +69 -0
- codegraph/graph/store_networkx.py +55 -0
- codegraph/graph/store_sqlite.py +249 -0
- codegraph/mcp_server/__init__.py +6 -0
- codegraph/mcp_server/server.py +933 -0
- codegraph/parsers/__init__.py +0 -0
- codegraph/parsers/base.py +70 -0
- codegraph/parsers/go.py +570 -0
- codegraph/parsers/python.py +1707 -0
- codegraph/parsers/typescript.py +1397 -0
- codegraph/py.typed +0 -0
- codegraph/resolve/__init__.py +4 -0
- codegraph/resolve/calls.py +480 -0
- codegraph/review/__init__.py +31 -0
- codegraph/review/baseline.py +32 -0
- codegraph/review/differ.py +211 -0
- codegraph/review/hook.py +70 -0
- codegraph/review/risk.py +219 -0
- codegraph/review/rules.py +342 -0
- codegraph/viz/__init__.py +17 -0
- codegraph/viz/_style.py +45 -0
- codegraph/viz/dashboard.py +740 -0
- codegraph/viz/diagrams.py +370 -0
- codegraph/viz/explore.py +453 -0
- codegraph/viz/hld.py +683 -0
- codegraph/viz/html.py +115 -0
- codegraph/viz/mermaid.py +111 -0
- codegraph/viz/svg.py +77 -0
- codegraph/web/__init__.py +4 -0
- codegraph/web/server.py +165 -0
- codegraph/web/static/app.css +664 -0
- codegraph/web/static/app.js +919 -0
- codegraph/web/static/index.html +112 -0
- codegraph/web/static/views/architecture.js +1671 -0
- codegraph/web/static/views/graph3d.css +564 -0
- codegraph/web/static/views/graph3d.js +999 -0
- codegraph/web/static/views/graph3d_transform.js +984 -0
- codegraph/workspace/__init__.py +34 -0
- codegraph/workspace/config.py +110 -0
- codegraph/workspace/operations.py +294 -0
- polycodegraph-0.1.0.dist-info/METADATA +687 -0
- polycodegraph-0.1.0.dist-info/RECORD +67 -0
- polycodegraph-0.1.0.dist-info/WHEEL +4 -0
- polycodegraph-0.1.0.dist-info/entry_points.txt +2 -0
- polycodegraph-0.1.0.dist-info/licenses/LICENSE +21 -0
codegraph/embed/store.py
ADDED
|
@@ -0,0 +1,360 @@
|
|
|
1
|
+
"""On-disk vector store.
|
|
2
|
+
|
|
3
|
+
Tries LanceDB first (the production backend) and falls back to a tiny JSON
|
|
4
|
+
file when the optional ``embed`` extra isn't installed. The fallback is good
|
|
5
|
+
enough for unit tests and for repos that just want a quick local index without
|
|
6
|
+
pulling the full Arrow / LanceDB stack.
|
|
7
|
+
"""
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import contextlib
|
|
11
|
+
import json
|
|
12
|
+
import math
|
|
13
|
+
from collections.abc import Iterable, Sequence
|
|
14
|
+
from dataclasses import asdict, dataclass, field
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
from typing import Any
|
|
17
|
+
|
|
18
|
+
from codegraph.embed.chunker import Chunk # noqa: F401 (re-export friendly)
|
|
19
|
+
from codegraph.embed.embedder import DEFAULT_DIM, DEFAULT_MODEL, Embedder
|
|
20
|
+
|
|
21
|
+
# Name of the LanceDB store created inside the embeddings directory.
_STORE_FILENAME = "embeddings.lance"
# Name of the JSON file used by the dependency-free fallback backend.
_FALLBACK_FILENAME = "embeddings.json"
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
@dataclass
class StoredChunk:
    """One embedded chunk: its identity, source location, text and vector."""

    id: str
    qualname: str
    file: str
    line_start: int
    line_end: int
    kind: str
    role: str | None
    text: str
    vector: list[float] = field(default_factory=list)

    # pragma: codegraph-public-api
    def to_json(self) -> dict[str, Any]:
        """Return a plain dict of every field; ``vector`` is copied into a new list."""
        payload: dict[str, Any] = {
            name: getattr(self, name)
            for name in (
                "id",
                "qualname",
                "file",
                "line_start",
                "line_end",
                "kind",
                "role",
                "text",
            )
        }
        payload["vector"] = list(self.vector)
        return payload

    @classmethod
    def from_json(cls, data: dict[str, Any]) -> StoredChunk:
        """Build a chunk from a dict shaped like :meth:`to_json` output.

        Values are coerced to the declared field types. A missing or falsy
        ``role`` becomes ``None``; a missing or falsy ``vector`` becomes ``[]``.
        Every other key is required and raises ``KeyError`` when absent.
        """
        # Keys are read in declaration order so the first failing field is
        # the one that raises.
        kwargs: dict[str, Any] = {}
        for key in ("id", "qualname", "file"):
            kwargs[key] = str(data[key])
        for key in ("line_start", "line_end"):
            kwargs[key] = int(data[key])
        kwargs["kind"] = str(data["kind"])
        raw_role = data.get("role")
        kwargs["role"] = str(raw_role) if raw_role else None
        kwargs["text"] = str(data["text"])
        kwargs["vector"] = [float(v) for v in data.get("vector") or []]
        return cls(**kwargs)
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def _cosine(a: Sequence[float], b: Sequence[float]) -> float:
|
|
67
|
+
if not a or not b:
|
|
68
|
+
return 0.0
|
|
69
|
+
dot = 0.0
|
|
70
|
+
na = 0.0
|
|
71
|
+
nb = 0.0
|
|
72
|
+
for x, y in zip(a, b, strict=False):
|
|
73
|
+
dot += x * y
|
|
74
|
+
na += x * x
|
|
75
|
+
nb += y * y
|
|
76
|
+
if na == 0.0 or nb == 0.0:
|
|
77
|
+
return 0.0
|
|
78
|
+
return dot / (math.sqrt(na) * math.sqrt(nb))
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
# ---------------------------------------------------------------------------
|
|
82
|
+
# Backend abstraction
|
|
83
|
+
# ---------------------------------------------------------------------------
|
|
84
|
+
|
|
85
|
+
class _JsonBackend:
    """JSON-backed backend. Used in tests and as the no-deps fallback.

    All rows are held in memory and the whole list is rewritten to ``path``
    after every mutation.
    """

    def __init__(self, path: Path) -> None:
        """Open (or lazily create) the JSON index at *path*.

        The parent directory is created if needed. An unreadable or malformed
        file is treated as an empty index rather than an error.
        """
        self.path = path
        self.path.parent.mkdir(parents=True, exist_ok=True)
        self._rows: list[StoredChunk] = []
        if self.path.exists():
            try:
                raw = json.loads(self.path.read_text(encoding="utf-8"))
                self._rows = [StoredChunk.from_json(r) for r in raw]
            # ValueError covers json.JSONDecodeError, UnicodeDecodeError and
            # failed int()/float() coercions inside StoredChunk.from_json.
            except (OSError, ValueError, KeyError, TypeError):
                self._rows = []

    def upsert(self, rows: Iterable[StoredChunk]) -> None:
        """Insert *rows*, replacing any stored row that shares an ``id``."""
        new = list(rows)
        new_ids = {r.id for r in new}
        kept = [r for r in self._rows if r.id not in new_ids]
        self._rows = kept + new
        self._flush()

    def replace_all(self, rows: Iterable[StoredChunk]) -> None:
        """Discard every stored row and persist *rows* instead."""
        self._rows = list(rows)
        self._flush()

    def _flush(self) -> None:
        """Write all rows to disk.

        The payload goes to a sibling temp file which is then renamed over the
        target, so an interrupted write cannot leave a truncated index behind.
        """
        payload = [r.to_json() for r in self._rows]
        tmp_path = self.path.with_name(self.path.name + ".tmp")
        tmp_path.write_text(json.dumps(payload), encoding="utf-8")
        tmp_path.replace(self.path)

    def all(self) -> list[StoredChunk]:
        """Return a shallow copy of the stored rows."""
        return list(self._rows)

    def query(self, vector: Sequence[float], k: int) -> list[tuple[StoredChunk, float]]:
        """Return the *k* rows most similar to *vector*, best first.

        Scores are cosine similarities. A non-positive *k* yields an empty
        list (a negative *k* would otherwise slice from the wrong end).
        """
        if k <= 0:
            return []
        scored = [(row, _cosine(vector, row.vector)) for row in self._rows]
        scored.sort(key=lambda pair: pair[1], reverse=True)
        return scored[:k]

    def size_bytes(self) -> int:
        """Return the size of the JSON file in bytes, or 0 if it does not exist."""
        return self.path.stat().st_size if self.path.exists() else 0
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
class _LanceBackend:
    """LanceDB backend. Schema mirrors :class:`StoredChunk`.

    ``lancedb`` and ``pyarrow`` are imported inside ``__init__`` so that
    constructing this class raises ``ImportError`` when they are not installed.
    """

    def __init__(self, path: Path, dim: int) -> None:
        """Connect to the database at *path* and open or create the ``chunks`` table."""
        import lancedb
        import pyarrow as pa

        self.path = path
        self._dim = dim
        self._pa = pa
        self._db = lancedb.connect(str(path))
        self._schema = self._make_schema(dim)
        if "chunks" in self._db.table_names():
            self._table = self._db.open_table("chunks")
        else:
            self._table = self._db.create_table("chunks", schema=self._schema, mode="create")

    def _make_schema(self, dim: int) -> Any:
        """Return the Arrow schema for the table; ``vector`` is a fixed-size float32 list."""
        pa = self._pa
        return pa.schema(
            [
                ("id", pa.string()),
                ("qualname", pa.string()),
                ("file", pa.string()),
                ("line_start", pa.int64()),
                ("line_end", pa.int64()),
                ("kind", pa.string()),
                ("role", pa.string()),
                ("text", pa.string()),
                ("vector", pa.list_(pa.float32(), dim)),
            ]
        )

    def _to_dict(self, row: StoredChunk) -> dict[str, Any]:
        """Convert *row* to a table record; a missing role is stored as ``""``."""
        return {
            "id": row.id,
            "qualname": row.qualname,
            "file": row.file,
            "line_start": row.line_start,
            "line_end": row.line_end,
            "kind": row.kind,
            "role": row.role or "",
            "text": row.text,
            "vector": row.vector,
        }

    def upsert(self, rows: Iterable[StoredChunk]) -> None:
        """Insert *rows*, first deleting stored rows with the same ids."""
        batch = [self._to_dict(r) for r in rows]
        if not batch:
            return
        # Double any embedded single quote so an id cannot terminate the SQL
        # string literal in the delete predicate.
        ids = ", ".join("'" + str(r["id"]).replace("'", "''") + "'" for r in batch)
        # Best-effort delete: a failure here must not block the insert.
        with contextlib.suppress(Exception):
            self._table.delete(f"id IN ({ids})")
        self._table.add(batch)

    def replace_all(self, rows: Iterable[StoredChunk]) -> None:
        """Drop and recreate the table, then insert *rows*."""
        batch = [self._to_dict(r) for r in rows]
        with contextlib.suppress(Exception):
            self._db.drop_table("chunks", ignore_missing=True)
        self._table = self._db.create_table("chunks", schema=self._schema, mode="create")
        if batch:
            self._table.add(batch)

    def _row_from_record(self, r: dict[str, Any]) -> StoredChunk:
        """Convert a table record back into a :class:`StoredChunk`."""
        raw_role = r["role"]
        # Only a non-empty string is a real role. A null value would otherwise
        # be stringified ("None") and leak through as a bogus role.
        role = raw_role if isinstance(raw_role, str) and raw_role else None
        return StoredChunk(
            id=str(r["id"]),
            qualname=str(r["qualname"]),
            file=str(r["file"]),
            line_start=int(r["line_start"]),
            line_end=int(r["line_end"]),
            kind=str(r["kind"]),
            role=role,
            text=str(r["text"]),
            vector=list(r["vector"]),
        )

    def all(self) -> list[StoredChunk]:
        """Return every stored row."""
        rows = self._table.to_pandas().to_dict(orient="records")
        return [self._row_from_record(r) for r in rows]

    def query(self, vector: Sequence[float], k: int) -> list[tuple[StoredChunk, float]]:
        """Return up to *k* nearest rows paired with a similarity score.

        The score is ``1 / (1 + _distance)``, so it falls in ``(0, 1]`` for
        non-negative distances and is not a cosine value.
        """
        results = self._table.search(list(vector)).limit(k).to_pandas()
        out: list[tuple[StoredChunk, float]] = []
        for r in results.to_dict(orient="records"):
            chunk = self._row_from_record(r)
            distance = float(r.get("_distance", 0.0))
            similarity = 1.0 / (1.0 + distance)
            out.append((chunk, similarity))
        return out

    def size_bytes(self) -> int:
        """Return the total size in bytes of all files under the store path."""
        total = 0
        for p in self.path.rglob("*"):
            if p.is_file():
                total += p.stat().st_size
        return total
|
|
222
|
+
|
|
223
|
+
|
|
224
|
+
# ---------------------------------------------------------------------------
|
|
225
|
+
# Public store
|
|
226
|
+
# ---------------------------------------------------------------------------
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
class EmbeddingStore:
    """Facade over the vector backends that picks one at construction time.

    ``backend='auto'`` (default) tries LanceDB and falls back to JSON.
    ``backend='json'`` forces the lightweight backend (used in tests).
    ``backend='lancedb'`` re-raises the ``ImportError`` instead of falling back.
    """

    def __init__(
        self,
        data_dir: Path,
        *,
        dim: int = DEFAULT_DIM,
        backend: str = "auto",
    ) -> None:
        """Create *data_dir* if needed and bind the selected backend."""
        self.data_dir = data_dir
        self.dim = dim
        self.backend_name: str
        self._backend: _LanceBackend | _JsonBackend
        data_dir.mkdir(parents=True, exist_ok=True)

        if backend != "json":
            try:
                self._backend = _LanceBackend(data_dir / _STORE_FILENAME, dim=dim)
            except ImportError:
                # Only an explicit request for LanceDB turns the missing
                # dependency into an error; anything else falls through to JSON.
                if backend == "lancedb":
                    raise
            else:
                self.backend_name = "lancedb"
                return

        self._backend = _JsonBackend(data_dir / _FALLBACK_FILENAME)
        self.backend_name = "json"

    # ------------------------------------------------------------------
    # pragma: codegraph-public-api
    def upsert(self, rows: Iterable[StoredChunk]) -> None:
        """Forward *rows* to the backend's ``upsert``."""
        self._backend.upsert(rows)

    # pragma: codegraph-public-api
    def replace_all(self, rows: Iterable[StoredChunk]) -> None:
        """Forward *rows* to the backend's ``replace_all``."""
        self._backend.replace_all(rows)

    # pragma: codegraph-public-api
    def all(self) -> list[StoredChunk]:
        """Return whatever the backend's ``all`` returns."""
        stored = self._backend.all()
        return stored

    # pragma: codegraph-public-api
    def query(self, vector: Sequence[float], k: int = 5) -> list[tuple[StoredChunk, float]]:
        """Return the backend's top-*k* ``(chunk, score)`` matches for *vector*."""
        matches = self._backend.query(vector, k)
        return matches

    # pragma: codegraph-public-api
    def size_bytes(self) -> int:
        """Return the on-disk size reported by the backend."""
        return self._backend.size_bytes()
|
|
283
|
+
|
|
284
|
+
|
|
285
|
+
# ---------------------------------------------------------------------------
|
|
286
|
+
# build_index — orchestrator wired up to chunker + embedder
|
|
287
|
+
# ---------------------------------------------------------------------------
|
|
288
|
+
|
|
289
|
+
|
|
290
|
+
@dataclass
class IndexStats:
    """Summary returned by ``build_index``."""

    # Number of rows handed to the store.
    chunks_indexed: int
    # Embedding model name.
    model: str
    # Vector dimensionality.
    dim: int
    # Name of the backend that was selected.
    backend: str
    # Size of the persisted index in bytes.
    on_disk_bytes: int

    # pragma: codegraph-public-api
    def as_dict(self) -> dict[str, Any]:
        """Return the stats as a plain ``dict`` keyed by field name."""
        as_mapping: dict[str, Any] = asdict(self)
        return as_mapping
|
|
301
|
+
|
|
302
|
+
|
|
303
|
+
def build_index(
    repo_root: Path,
    *,
    db_path: Path | None = None,
    embeddings_dir: Path | None = None,
    embedder: Embedder | None = None,
    model: str = DEFAULT_MODEL,
    force: bool = False,
    progress: Any | None = None,
    backend: str = "auto",
) -> IndexStats:
    """Chunk the repository, embed every chunk, and persist the vectors.

    ``embedder`` defaults to ``Embedder(model)``. The index is written to
    ``embeddings_dir``, or ``<repo_root>/.codegraph`` when that is not given.
    ``force=True`` replaces the stored index; otherwise rows are upserted.

    ``progress`` (optional) is anything with an ``advance(step: int)`` method
    — typically a ``rich.progress.Progress`` task. Pass ``None`` to disable.
    Errors raised by ``advance`` are ignored.
    """
    from codegraph.embed.chunker import chunk_repo

    chunks = list(chunk_repo(repo_root, db_path=db_path))
    emb = embedder or Embedder(model)

    dim = DEFAULT_DIM
    rows: list[StoredChunk] = []
    if chunks:
        texts = [chunk.text for chunk in chunks]
        vectors = emb.embed(texts, batch_size=32)
        # The dimension comes from the first vector when any were produced.
        if vectors:
            dim = len(vectors[0])
        else:
            dim = DEFAULT_DIM
        for chunk, vector in zip(chunks, vectors, strict=False):
            stored = StoredChunk(
                id=chunk.id,
                qualname=chunk.qualname,
                file=chunk.file,
                line_start=chunk.line_start,
                line_end=chunk.line_end,
                kind=chunk.kind,
                role=chunk.role,
                text=chunk.text,
                vector=vector,
            )
            rows.append(stored)
            if progress is None:
                continue
            with contextlib.suppress(Exception):
                progress.advance(1)

    if embeddings_dir:
        out_dir = embeddings_dir
    else:
        out_dir = repo_root / ".codegraph"
    store = EmbeddingStore(out_dir, dim=dim, backend=backend)
    persist = store.replace_all if force else store.upsert
    persist(rows)

    return IndexStats(
        chunks_indexed=len(rows),
        model=emb.model,
        dim=dim,
        backend=store.backend_name,
        on_disk_bytes=store.size_bytes(),
    )
|
|
File without changes
|
|
@@ -0,0 +1,212 @@
|
|
|
1
|
+
"""Repo walker and incremental graph builder."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import hashlib
|
|
5
|
+
import logging
|
|
6
|
+
import subprocess
|
|
7
|
+
from dataclasses import dataclass, field
|
|
8
|
+
from datetime import datetime, timezone
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import Any
|
|
11
|
+
|
|
12
|
+
import pathspec
|
|
13
|
+
|
|
14
|
+
# Ensure extractors register themselves.
|
|
15
|
+
import codegraph.parsers.go
|
|
16
|
+
import codegraph.parsers.python
|
|
17
|
+
import codegraph.parsers.typescript # noqa: F401
|
|
18
|
+
from codegraph.config import CodegraphConfig
|
|
19
|
+
from codegraph.graph.schema import Node, NodeKind, make_node_id
|
|
20
|
+
from codegraph.graph.store_sqlite import SQLiteGraphStore
|
|
21
|
+
from codegraph.parsers.base import get_extractor_for
|
|
22
|
+
from codegraph.parsers.python import PythonExtractor
|
|
23
|
+
|
|
24
|
+
logger = logging.getLogger(__name__)
|
|
25
|
+
|
|
26
|
+
# Patterns that are always excluded from a build, in addition to any
# caller-supplied ignore list. They are compiled as "gitwildmatch" patterns.
_BUILTIN_IGNORES = [
    ".git",
    ".venv",
    "venv",
    "node_modules",
    ".codegraph",
    "dist",
    "build",
    "__pycache__",
    ".next",
    ".pytest_cache",
    ".mypy_cache",
    ".ruff_cache",
    ".tox",
    ".eggs",
    "*.egg-info",
    ".DS_Store",
    "*.pyc",
    "*.pyo",
]

# Directory names that exclude a file whenever they appear anywhere in its
# parent path.
_IGNORE_DIRS: set[str] = {
    ".git",
    ".venv",
    "venv",
    "node_modules",
    ".codegraph",
    "dist",
    "build",
    "__pycache__",
    ".next",
    ".pytest_cache",
    ".mypy_cache",
    ".ruff_cache",
    ".tox",
}
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
@dataclass
class BuildStats:
    """Counters and error messages collected during one ``GraphBuilder.build`` run."""

    # Files that survived the ignore filters.
    files_scanned: int = 0
    # Files for which an extractor ran.
    files_parsed: int = 0
    # Nodes written to the store (file nodes plus extracted nodes).
    nodes_added: int = 0
    # Edges written to the store.
    edges_added: int = 0
    # Files skipped because their content hash was unchanged.
    files_skipped: int = 0
    # Human-readable "<what>: <error>" messages.
    errors: list[str] = field(default_factory=list)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def _sha256(path: Path) -> str:
    """Return the hex SHA-256 digest of the file at *path*, read in 64 KiB blocks."""
    digest = hashlib.sha256()
    with path.open("rb") as stream:
        while block_bytes := stream.read(65536):
            digest.update(block_bytes)
    return digest.hexdigest()
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def _get_git_sha(repo_root: Path) -> str | None:
    """Return the abbreviated HEAD commit hash of *repo_root*, or ``None``.

    ``None`` is returned when ``git`` cannot be run, times out (5 s), exits
    with a non-zero status, or prints nothing. This lookup is best-effort and
    never raises for those cases.
    """
    try:
        result = subprocess.run(
            ["git", "rev-parse", "--short", "HEAD"],
            cwd=repo_root,
            capture_output=True,
            text=True,
            timeout=5,
            check=False,
        )
    # OSError: git missing or cwd unusable. SubprocessError: timeout.
    # ValueError: undecodable output (UnicodeDecodeError).
    except (OSError, subprocess.SubprocessError, ValueError) as exc:
        logging.getLogger(__name__).debug(
            "could not read git HEAD for %s: %s", repo_root, exc
        )
        return None
    if result.returncode != 0:
        return None
    # Normalise empty output to None so callers get one "unknown" value.
    return result.stdout.strip() or None
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
class GraphBuilder:
    """Walk a repository and populate a :class:`SQLiteGraphStore` from it."""

    def __init__(
        self,
        repo_root: Path,
        store: SQLiteGraphStore,
        ignore: list[str] | None = None,
        config: CodegraphConfig | None = None,
    ) -> None:
        """Bind the builder to *repo_root* and *store*.

        ``ignore`` adds patterns on top of the built-in ignore list;
        ``config`` defaults to ``CodegraphConfig()``.
        """
        self._repo_root = repo_root
        self._store = store
        self._ignore = ignore or []
        self._config = config or CodegraphConfig()
        self._apply_config_to_extractors()

    def _apply_config_to_extractors(self) -> None:
        """Forward user dead-code patterns onto the singleton extractors."""
        # PythonExtractor is registered as a singleton in the registry, so the
        # class attribute is mutated and later parse_file calls pick up the
        # user patterns.
        PythonExtractor.extra_entry_point_decorators = tuple(
            self._config.dead_code.entry_point_decorators
        )

    def build(self, incremental: bool = True) -> BuildStats:
        """Index the repository and return counters for the run.

        With ``incremental=True`` a file whose stored content hash matches the
        current one is skipped. Per-file failures and failures of the resolver
        or role classifier are logged and recorded in ``stats.errors`` instead
        of being raised.
        """
        stats = BuildStats()
        spec = pathspec.PathSpec.from_lines(
            "gitwildmatch", _BUILTIN_IGNORES + self._ignore
        )

        files = list(self._walk_repo(spec))
        stats.files_scanned = len(files)

        for file_path in files:
            rel = file_path.relative_to(self._repo_root).as_posix()
            try:
                self._index_file(file_path, rel, incremental, stats)
            except Exception as exc:
                logger.warning("Error parsing %s: %s", rel, exc)
                stats.errors.append(f"{rel}: {exc}")

        self._record_build_meta()
        self._resolve_edges(stats)
        self._stamp_roles(stats)
        return stats

    def _index_file(
        self, file_path: Path, rel: str, incremental: bool, stats: BuildStats
    ) -> None:
        """Hash, (re)register and parse one file, updating *stats* in place."""
        content_hash = _sha256(file_path)

        extractor = get_extractor_for(file_path)
        language = extractor.language if extractor else "unknown"

        file_node_id = make_node_id(NodeKind.FILE, rel, rel)
        if incremental:
            existing = self._store.get_node(file_node_id)
            if existing and existing.content_hash == content_hash:
                stats.files_skipped += 1
                return

        # Remove whatever was stored for this file before re-adding it.
        self._store.delete_file(rel)

        self._store.upsert_node(
            Node(
                id=file_node_id,
                kind=NodeKind.FILE,
                name=file_path.name,
                qualname=rel,
                file=rel,
                line_start=1,
                line_end=0,
                content_hash=content_hash,
                language=language,
                metadata={"size": file_path.stat().st_size},
            )
        )
        stats.nodes_added += 1

        if extractor is None:
            return
        nodes, edges = extractor.parse_file(file_path, self._repo_root)
        self._store.upsert_nodes(nodes)
        self._store.upsert_edges(edges)
        stats.nodes_added += len(nodes)
        stats.edges_added += len(edges)
        stats.files_parsed += 1

    def _record_build_meta(self) -> None:
        """Store the build timestamp (UTC, ISO format) and, if known, the git SHA."""
        now = datetime.now(tz=timezone.utc).isoformat()
        self._store.set_meta("last_build_time", now)
        git_sha = _get_git_sha(self._repo_root)
        if git_sha:
            self._store.set_meta("last_git_sha", git_sha)

    def _resolve_edges(self, stats: BuildStats) -> None:
        """Best-effort cross-file resolution of unresolved CALLS/IMPORTS edges."""
        try:
            from codegraph.resolve import resolve_unresolved_edges

            rstats = resolve_unresolved_edges(self._store)
            self._store.set_meta(
                "last_resolve",
                f"{rstats.resolved}/{rstats.inspected} resolved",
            )
        except Exception as exc:  # pragma: no cover - defensive
            logger.warning("resolver failed: %s", exc)
            stats.errors.append(f"resolver: {exc}")

    def _stamp_roles(self, stats: BuildStats) -> None:
        """Architectural role classification (DF1.5).

        Stamps HANDLER/SERVICE/COMPONENT/REPO onto FUNCTION/METHOD/CLASS nodes
        by copying the ``role`` metadata set by the classifier back into the
        store.
        """
        try:
            from codegraph.analysis.roles import classify_roles
            from codegraph.graph.store_networkx import to_digraph

            graph = to_digraph(self._store)
            count = classify_roles(graph)
            if count:
                updated: list[Node] = []
                for nid, attrs in graph.nodes(data=True):
                    role = (attrs.get("metadata") or {}).get("role")
                    if not role:
                        continue
                    stored = self._store.get_node(nid)
                    if stored is None:
                        continue
                    stored.metadata["role"] = role
                    updated.append(stored)
                if updated:
                    self._store.upsert_nodes(updated)
                self._store.set_meta("last_roles", str(count))
                logger.info("roles: %d nodes classified", count)
        except Exception as exc:  # pragma: no cover - defensive
            logger.warning("role classifier failed: %s", exc)
            stats.errors.append(f"roles: {exc}")

    def _walk_repo(self, spec: Any) -> list[Path]:
        """Return every non-ignored file under the repo root, in sorted order.

        A file is dropped when *spec* matches its repo-relative POSIX path or
        when any of its parent directory names is in ``_IGNORE_DIRS``.
        """
        kept: list[Path] = []
        for candidate in sorted(self._repo_root.rglob("*")):
            if not candidate.is_file():
                continue
            try:
                rel = candidate.relative_to(self._repo_root).as_posix()
            except ValueError:
                continue
            if spec.match_file(rel):
                continue
            parent_names = Path(rel).parts[:-1]
            if any(name in _IGNORE_DIRS for name in parent_names):
                continue
            kept.append(candidate)
        return kept
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
"""Graph schema: Node, Edge, and ID generation."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import hashlib
|
|
5
|
+
from enum import Enum
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
from pydantic import BaseModel, Field
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class NodeKind(str, Enum):
    """Kinds of graph node. Each member's value is its own upper-case name."""

    FILE = "FILE"
    MODULE = "MODULE"
    CLASS = "CLASS"
    FUNCTION = "FUNCTION"
    METHOD = "METHOD"
    VARIABLE = "VARIABLE"
    PARAMETER = "PARAMETER"
    IMPORT = "IMPORT"
    TEST = "TEST"
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class EdgeKind(str, Enum):
    """Kinds of graph edge. Each member's value is its own upper-case name."""

    DEFINED_IN = "DEFINED_IN"
    IMPORTS = "IMPORTS"
    CALLS = "CALLS"
    INHERITS = "INHERITS"
    IMPLEMENTS = "IMPLEMENTS"
    READS = "READS"
    WRITES = "WRITES"
    RETURNS = "RETURNS"
    PARAM_OF = "PARAM_OF"
    TESTED_BY = "TESTED_BY"
    # v0.2 cross-stack data-flow edges (populated by DF1 / DF2 extractors).
    # Reserved here so DF1/DF2 agents don't both edit this enum in parallel.
    ROUTE = "ROUTE"  # HANDLER → URL pattern (DF1, FastAPI/Flask)
    READS_FROM = "READS_FROM"  # function → SQLAlchemy model on read (DF1)
    WRITES_TO = "WRITES_TO"  # function → SQLAlchemy model on write (DF1)
    FETCH_CALL = "FETCH_CALL"  # frontend call site → URL string (DF2, fetch/axios)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class Node(BaseModel):
    """A graph node: one code entity together with its source location."""

    # Identity and classification.
    id: str
    kind: NodeKind
    name: str
    qualname: str

    # Source location.
    file: str
    line_start: int
    line_end: int

    # Optional descriptive fields.
    signature: str | None = None
    docstring: str | None = None
    content_hash: str | None = None

    language: str
    # Free-form extra attributes; each instance gets its own dict.
    metadata: dict[str, Any] = Field(default_factory=dict)
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
class Edge(BaseModel):
    """A directed, typed relationship from node ``src`` to node ``dst``."""

    # Endpoints and relationship type.
    src: str
    dst: str
    kind: EdgeKind

    # Optional location of the edge in source.
    file: str | None = None
    line: int | None = None

    # Free-form extra attributes; each instance gets its own dict.
    metadata: dict[str, Any] = Field(default_factory=dict)
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def make_node_id(kind: NodeKind, qualname: str, file: str) -> str:
    """Return a stable 32-character hex id for ``(kind, qualname, file)``.

    The id is the BLAKE2b digest (16 bytes) of ``"<kind value>:<qualname>:<file>"``.
    """
    key = ":".join((f"{kind.value}", f"{qualname}", f"{file}"))
    hasher = hashlib.blake2b(key.encode(), digest_size=16)
    return hasher.hexdigest()
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
"""NetworkX adapter for the SQLiteGraphStore."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from collections.abc import Iterable
|
|
5
|
+
from typing import cast
|
|
6
|
+
|
|
7
|
+
import networkx as nx
|
|
8
|
+
|
|
9
|
+
from codegraph.graph.schema import EdgeKind
|
|
10
|
+
from codegraph.graph.store_sqlite import SQLiteGraphStore
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def to_digraph(store: SQLiteGraphStore) -> nx.MultiDiGraph:
    """Load every node and edge from *store* into a new ``MultiDiGraph``.

    Node and edge attributes are the JSON-mode dumps of the models. Edges are
    keyed by their kind value.
    """
    graph: nx.MultiDiGraph = nx.MultiDiGraph()
    for node in store.iter_nodes():
        node_attrs = node.model_dump(mode="json")
        graph.add_node(node.id, **node_attrs)
    for edge in store.iter_edges():
        edge_attrs = edge.model_dump(mode="json")
        graph.add_edge(edge.src, edge.dst, key=edge.kind.value, **edge_attrs)
    return graph
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def subgraph_around(
    g: nx.MultiDiGraph,
    node_id: str,
    depth: int,
    direction: str = "both",
    edge_kinds: Iterable[EdgeKind] | None = None,
) -> nx.MultiDiGraph:
    """Return a copy of the subgraph within `depth` BFS hops of `node_id`.

    ``direction`` selects which edges are followed: ``"out"``, ``"in"`` or
    ``"both"``; any other value follows none. When ``edge_kinds`` is given,
    only edges whose ``kind`` attribute is one of those values are traversed.
    """
    allowed: set[str] | None = (
        None if edge_kinds is None else {kind.value for kind in edge_kinds}
    )
    follow_out = direction in ("out", "both")
    follow_in = direction in ("in", "both")

    def _traversable(data: dict) -> bool:
        return allowed is None or data.get("kind") in allowed

    seen: set[str] = set()
    current: set[str] = {node_id}
    for _ in range(depth):
        discovered: set[str] = set()
        for node in current:
            if node not in g:
                continue
            if follow_out:
                discovered.update(
                    dst
                    for _src, dst, data in g.out_edges(node, data=True)
                    if _traversable(data)
                )
            if follow_in:
                discovered.update(
                    src
                    for src, _dst, data in g.in_edges(node, data=True)
                    if _traversable(data)
                )
        seen.update(current)
        # Only nodes not reached on an earlier hop form the next frontier.
        current = discovered - seen
    seen.update(current)
    return cast(nx.MultiDiGraph, g.subgraph(seen).copy())
|