polycodegraph 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. codegraph/__init__.py +10 -0
  2. codegraph/analysis/__init__.py +30 -0
  3. codegraph/analysis/_common.py +125 -0
  4. codegraph/analysis/blast_radius.py +63 -0
  5. codegraph/analysis/cycles.py +79 -0
  6. codegraph/analysis/dataflow.py +861 -0
  7. codegraph/analysis/dead_code.py +165 -0
  8. codegraph/analysis/hotspots.py +68 -0
  9. codegraph/analysis/infrastructure.py +439 -0
  10. codegraph/analysis/metrics.py +52 -0
  11. codegraph/analysis/report.py +222 -0
  12. codegraph/analysis/roles.py +323 -0
  13. codegraph/analysis/untested.py +79 -0
  14. codegraph/cli.py +1506 -0
  15. codegraph/config.py +64 -0
  16. codegraph/embed/__init__.py +35 -0
  17. codegraph/embed/chunker.py +120 -0
  18. codegraph/embed/embedder.py +113 -0
  19. codegraph/embed/query.py +181 -0
  20. codegraph/embed/store.py +360 -0
  21. codegraph/graph/__init__.py +0 -0
  22. codegraph/graph/builder.py +212 -0
  23. codegraph/graph/schema.py +69 -0
  24. codegraph/graph/store_networkx.py +55 -0
  25. codegraph/graph/store_sqlite.py +249 -0
  26. codegraph/mcp_server/__init__.py +6 -0
  27. codegraph/mcp_server/server.py +933 -0
  28. codegraph/parsers/__init__.py +0 -0
  29. codegraph/parsers/base.py +70 -0
  30. codegraph/parsers/go.py +570 -0
  31. codegraph/parsers/python.py +1707 -0
  32. codegraph/parsers/typescript.py +1397 -0
  33. codegraph/py.typed +0 -0
  34. codegraph/resolve/__init__.py +4 -0
  35. codegraph/resolve/calls.py +480 -0
  36. codegraph/review/__init__.py +31 -0
  37. codegraph/review/baseline.py +32 -0
  38. codegraph/review/differ.py +211 -0
  39. codegraph/review/hook.py +70 -0
  40. codegraph/review/risk.py +219 -0
  41. codegraph/review/rules.py +342 -0
  42. codegraph/viz/__init__.py +17 -0
  43. codegraph/viz/_style.py +45 -0
  44. codegraph/viz/dashboard.py +740 -0
  45. codegraph/viz/diagrams.py +370 -0
  46. codegraph/viz/explore.py +453 -0
  47. codegraph/viz/hld.py +683 -0
  48. codegraph/viz/html.py +115 -0
  49. codegraph/viz/mermaid.py +111 -0
  50. codegraph/viz/svg.py +77 -0
  51. codegraph/web/__init__.py +4 -0
  52. codegraph/web/server.py +165 -0
  53. codegraph/web/static/app.css +664 -0
  54. codegraph/web/static/app.js +919 -0
  55. codegraph/web/static/index.html +112 -0
  56. codegraph/web/static/views/architecture.js +1671 -0
  57. codegraph/web/static/views/graph3d.css +564 -0
  58. codegraph/web/static/views/graph3d.js +999 -0
  59. codegraph/web/static/views/graph3d_transform.js +984 -0
  60. codegraph/workspace/__init__.py +34 -0
  61. codegraph/workspace/config.py +110 -0
  62. codegraph/workspace/operations.py +294 -0
  63. polycodegraph-0.1.0.dist-info/METADATA +687 -0
  64. polycodegraph-0.1.0.dist-info/RECORD +67 -0
  65. polycodegraph-0.1.0.dist-info/WHEEL +4 -0
  66. polycodegraph-0.1.0.dist-info/entry_points.txt +2 -0
  67. polycodegraph-0.1.0.dist-info/licenses/LICENSE +21 -0
codegraph/config.py ADDED
@@ -0,0 +1,64 @@
1
+ """Codegraph configuration model and helpers."""
2
+ from __future__ import annotations
3
+
4
+ from pathlib import Path
5
+ from typing import Any
6
+
7
+ import yaml
8
+ from pydantic import BaseModel, Field
9
+
10
+
11
class DeadCodeConfig(BaseModel):
    """User-supplied dead-code analysis tweaks.

    Extends the built-in entry-point catalog. All fields are optional;
    user patterns are merged with the built-ins at parse time.
    """

    # Extra decorator strings (e.g. ``"@my.handler"``) that mark a definition
    # as an entry point.  Matched as a substring of the raw decorator text.
    entry_point_decorators: list[str] = Field(default_factory=list)

    # Extra function/method/class name globs (fnmatch syntax) that count as
    # entry points.  Reserved for future use.
    entry_point_names: list[str] = Field(default_factory=list)

    # File-path globs; every definition inside a matching file is treated as
    # an entry point.  Reserved for future use.
    entry_point_files: list[str] = Field(default_factory=list)
29
+
30
+
31
def _default_languages() -> list[str]:
    # Fresh list per model instance so defaults are never shared.
    return ["python", "typescript", "javascript"]


def _default_baseline() -> dict[str, Any]:
    return {"backend": "local"}


def _default_mcp() -> dict[str, Any]:
    return {"enabled": False}


class CodegraphConfig(BaseModel):
    # Top-level model for the repository's ``.codegraph.yml`` file.  Every
    # field has a default, so an empty mapping validates to a usable config.
    version: int = 1
    languages: list[str] = Field(default_factory=_default_languages)
    default_branch: str = "main"
    ignore: list[str] = Field(default_factory=list)
    baseline: dict[str, Any] = Field(default_factory=_default_baseline)
    critical_paths: list[dict[str, Any]] = Field(default_factory=list)
    mcp: dict[str, Any] = Field(default_factory=_default_mcp)
    install_hook: bool = False
    register_mcp: bool = False
    # Nested dead-code settings; defaults to an all-empty DeadCodeConfig.
    dead_code: DeadCodeConfig = Field(default_factory=DeadCodeConfig)
46
+
47
+
48
def load_config(repo_root: Path) -> CodegraphConfig:
    """Load ``<repo_root>/.codegraph.yml`` into a :class:`CodegraphConfig`.

    Args:
        repo_root: Directory expected to contain ``.codegraph.yml``.

    Returns:
        The validated configuration.  A missing file, or a file whose YAML
        document is empty/falsy, yields an all-defaults configuration.

    Raises:
        Whatever ``yaml.safe_load`` or ``CodegraphConfig.model_validate``
        raise for malformed YAML or data that does not fit the model.
    """
    cfg_path = repo_root / ".codegraph.yml"
    if not cfg_path.exists():
        return CodegraphConfig()
    # Decode explicitly as UTF-8 instead of the platform's locale encoding,
    # so the same config file parses identically on every machine.
    with cfg_path.open(encoding="utf-8") as f:
        data = yaml.safe_load(f) or {}
    return CodegraphConfig.model_validate(data)
55
+
56
+
57
def save_config(repo_root: Path, cfg: CodegraphConfig) -> None:
    """Write *cfg* to ``<repo_root>/.codegraph.yml`` as block-style YAML.

    Args:
        repo_root: Directory that receives ``.codegraph.yml``.
        cfg: Configuration to persist; serialised via ``cfg.model_dump()``
            with keys sorted.

    Raises:
        yaml.YAMLError: If the dumped data holds a value the safe dumper
            cannot represent.  Nothing is written in that case.
        OSError: If the file cannot be written.
    """
    cfg_path = repo_root / ".codegraph.yml"
    # Serialise first: a failure here must not truncate an existing config.
    # safe_dump keeps the output readable by the safe_load used in
    # load_config (no Python-specific YAML tags).
    text = yaml.safe_dump(cfg.model_dump(), default_flow_style=False, sort_keys=True)
    # Explicit UTF-8 so the file does not depend on the locale encoding.
    cfg_path.write_text(text, encoding="utf-8")
61
+
62
+
63
def default_data_dir(repo_root: Path) -> Path:
    """Return the ``.codegraph`` directory path under *repo_root*.

    Only the path is computed; nothing is created on disk.
    """
    data_dir = repo_root.joinpath(".codegraph")
    return data_dir
@@ -0,0 +1,35 @@
1
+ """Local, open-weight embedding layer for codegraph (v0.3).
2
+
3
+ Public surface:
4
+
5
+ * :class:`Chunk` — a single embeddable code unit (function/method/class).
6
+ * :class:`Hit` — a search result.
7
+ * :func:`chunk_repo` — turn graph nodes into chunks.
8
+ * :func:`build_index` — chunk + embed + write index to ``.codegraph/embeddings.lance``.
9
+ * :func:`query` — convenience wrapper around :func:`semantic_query`.
10
+
11
+ The heavy dependencies (``sentence-transformers``, ``lancedb``) are imported
12
+ lazily. Install with ``pip install -e ".[embed]"``.
13
+ """
14
+ from __future__ import annotations
15
+
16
+ from codegraph.embed.chunker import Chunk, chunk_repo
17
+ from codegraph.embed.query import Hit, hybrid_query, semantic_query
18
+ from codegraph.embed.store import EmbeddingStore, build_index
19
+
20
+ __all__ = [
21
+ "Chunk",
22
+ "EmbeddingStore",
23
+ "Hit",
24
+ "build_index",
25
+ "chunk_repo",
26
+ "hybrid_query",
27
+ "query",
28
+ "semantic_query",
29
+ ]
30
+
31
+
32
+ # pragma: codegraph-public-api
33
def query(text: str, *, k: int = 5) -> list[Hit]:
    """Convenience wrapper: semantic search with default index location.

    Delegates to :func:`semantic_query` with only ``text`` and ``k``, so the
    index is resolved relative to the current working directory.
    """
    hits = semantic_query(text, k=k)
    return hits
@@ -0,0 +1,120 @@
1
+ """Chunk a codegraph SQLite store into embeddable units."""
2
+ from __future__ import annotations
3
+
4
+ from collections.abc import Iterable, Iterator
5
+ from dataclasses import dataclass, field
6
+ from pathlib import Path
7
+ from typing import Any
8
+
9
+ from codegraph.graph.schema import Node, NodeKind
10
+ from codegraph.graph.store_sqlite import SQLiteGraphStore
11
+
12
+ # Node kinds that produce a chunk by default. MODULE / FILE / TEST / IMPORT /
13
+ # VARIABLE / PARAMETER are skipped: they're either coarse, generated, or
14
+ # duplicate the chunks of the symbols they contain.
15
+ _DEFAULT_KINDS: frozenset[NodeKind] = frozenset(
16
+ {NodeKind.FUNCTION, NodeKind.METHOD, NodeKind.CLASS}
17
+ )
18
+
19
+
20
@dataclass(frozen=True)
class Chunk:
    """One embeddable unit of code, built from a graph node plus its source."""

    qualname: str
    file: str
    line_start: int
    line_end: int
    kind: str
    text: str
    params: list[str] = field(default_factory=list)
    returns: str | None = None
    role: str | None = None

    @property
    def id(self) -> str:
        """Identifier of the form ``file::qualname::line_start`` (for upsert / dedupe)."""
        return "{}::{}::{}".format(self.file, self.qualname, self.line_start)
38
+
39
+
40
+ def _read_lines(repo_root: Path, file: str) -> list[str]:
41
+ """Best-effort read; returns ``[]`` on any IO error."""
42
+ try:
43
+ return (repo_root / file).read_text(encoding="utf-8").splitlines()
44
+ except (OSError, UnicodeDecodeError):
45
+ return []
46
+
47
+
48
+ def _slice(lines: list[str], start: int, end: int) -> str:
49
+ if not lines:
50
+ return ""
51
+ s = max(0, start - 1)
52
+ e = max(s, min(len(lines), end))
53
+ return "\n".join(lines[s:e])
54
+
55
+
56
+ def _md_list(value: Any) -> list[str]:
57
+ if isinstance(value, list):
58
+ return [str(v) for v in value]
59
+ return []
60
+
61
+
62
def _node_to_chunk(node: Node, repo_root: Path, _cache: dict[str, list[str]]) -> Chunk | None:
    """Build a :class:`Chunk` for *node*, or ``None`` if there is no text to index.

    ``_cache`` maps file path -> lines and is filled in as files are read, so
    each source file is loaded at most once per cache.
    """
    if node.file not in _cache:
        _cache[node.file] = _read_lines(repo_root, node.file)
    source_lines = _cache[node.file]

    text = _slice(source_lines, node.line_start, node.line_end)
    if not text.strip():
        # No usable source span: index the signature and/or docstring instead.
        fallback = [part for part in (node.signature, node.docstring) if part]
        text = "\n".join(fallback)
        if not text.strip():
            return None

    # Metadata is only consulted when it is actually a dict.
    raw_meta = node.metadata or {}
    meta = raw_meta if isinstance(raw_meta, dict) else {}
    role = meta.get("role")
    returns = meta.get("returns")

    return Chunk(
        qualname=node.qualname,
        file=node.file,
        line_start=node.line_start,
        line_end=node.line_end,
        kind=node.kind.value,
        text=text,
        params=_md_list(meta.get("params")),
        returns=None if returns is None else str(returns),
        role=None if role is None else str(role),
    )
90
+
91
+
92
def chunk_repo(
    repo_root: Path,
    *,
    db_path: Path | None = None,
    kinds: Iterable[NodeKind] | None = None,
) -> Iterator[Chunk]:
    """Generate a :class:`Chunk` for every graph node of a selected kind.

    Args:
        repo_root: Repository root; source files are read relative to it.
        db_path: Graph database to read.  Defaults to
            ``repo_root/.codegraph/graph.db``.
        kinds: Node kinds to include.  Defaults to ``_DEFAULT_KINDS``
            (FUNCTION / METHOD / CLASS).

    Raises:
        FileNotFoundError: If the database file does not exist.  Because this
            is a generator, the error surfaces on first iteration.
    """
    db = db_path or (repo_root / ".codegraph" / "graph.db")
    if not db.exists():
        raise FileNotFoundError(
            f"No graph database at {db}. Run `codegraph build` first."
        )

    wanted = _DEFAULT_KINDS if kinds is None else frozenset(kinds)
    lines_by_file: dict[str, list[str]] = {}

    store = SQLiteGraphStore(db)
    try:
        matching = (n for n in store.iter_nodes() if n.kind in wanted)
        for node in matching:
            chunk = _node_to_chunk(node, repo_root, lines_by_file)
            if chunk is None:
                continue
            yield chunk
    finally:
        # Runs when the generator is exhausted or closed.
        store.close()
@@ -0,0 +1,113 @@
1
+ """Sentence-transformers wrapper.
2
+
3
+ The model is lazy-loaded so unit tests can substitute a deterministic fake
4
+ without pulling the real dependency tree.
5
+ """
6
+ from __future__ import annotations
7
+
8
+ import os
9
+ from collections.abc import Sequence
10
+ from pathlib import Path
11
+ from typing import Any, Protocol
12
+
13
+ DEFAULT_MODEL = "nomic-ai/CodeRankEmbed"
14
+ DEFAULT_DIM = 768
15
+
16
+
17
+ class _EncoderLike(Protocol):
18
+ def encode(
19
+ self,
20
+ sentences: Sequence[str],
21
+ *,
22
+ batch_size: int = ...,
23
+ show_progress_bar: bool = ...,
24
+ convert_to_numpy: bool = ...,
25
+ normalize_embeddings: bool = ...,
26
+ ) -> Any:
27
+ ...
28
+
29
+
30
+ def _cache_dir() -> Path:
31
+ """Where downloaded models live. Honours ``XDG_CACHE_HOME`` if set."""
32
+ xdg = os.environ.get("XDG_CACHE_HOME")
33
+ base = Path(xdg) if xdg else Path.home() / ".cache"
34
+ return base / "codegraph" / "models"
35
+
36
+
37
class MissingDependencyError(RuntimeError):
    """Signals that a package from the optional ``embed`` extra is unavailable."""
39
+
40
+
41
def _load_sentence_transformer(model: str) -> _EncoderLike:
    """Instantiate a ``SentenceTransformer`` for *model* using the local cache dir.

    The import happens here, not at module import time, so this module can
    be imported without ``sentence-transformers`` installed.

    Raises:
        MissingDependencyError: If ``sentence_transformers`` cannot be imported.
    """
    try:
        from sentence_transformers import SentenceTransformer
    except ImportError as exc:  # pragma: no cover — exercised via mock test
        message = (
            "sentence-transformers is not installed.\n"
            "Run: pip install -e \".[embed]\""
        )
        raise MissingDependencyError(message) from exc

    models_dir = _cache_dir()
    models_dir.mkdir(parents=True, exist_ok=True)
    # NOTE(review): trust_remote_code=True is passed for every model name.
    # Per the Hugging Face docs this permits running code shipped with the
    # model repository — confirm callers only pass trusted model ids.
    return SentenceTransformer(
        model,
        cache_folder=str(models_dir),
        trust_remote_code=True,
    )
56
+
57
+
58
class Embedder:
    """Tiny wrapper around a sentence-transformer model.

    The actual encoder is loaded lazily on first :meth:`embed` call so that:

    * Construction is cheap and side-effect free (good for tests).
    * No model is loaded unless something is actually embedded.

    Pass ``encoder`` to inject a fake / mock for tests.
    """

    def __init__(
        self,
        model: str = DEFAULT_MODEL,
        *,
        dim: int | None = None,
        encoder: _EncoderLike | None = None,
    ) -> None:
        """Store settings only; nothing is loaded here.

        Args:
            model: Model identifier handed to the loader on first use.
            dim: Known embedding width.  When given, :attr:`dim` returns it
                without probing the encoder.
            encoder: Pre-built encoder to use instead of loading *model*.
        """
        self.model = model
        self._encoder: _EncoderLike | None = encoder
        self._dim: int | None = dim

    # ------------------------------------------------------------------
    # Lifecycle
    # ------------------------------------------------------------------
    def _ensure_loaded(self) -> _EncoderLike:
        """Return the encoder, loading it the first time it is needed."""
        if self._encoder is None:
            self._encoder = _load_sentence_transformer(self.model)
        return self._encoder

    @property
    def dim(self) -> int:
        """Embedding width: the configured value, else measured once and cached."""
        if self._dim is not None:
            return self._dim
        # Probe the encoder with a single short text so we don't have to assume.
        vecs = self.embed(["probe"])
        self._dim = len(vecs[0])
        return self._dim

    # ------------------------------------------------------------------
    # Encoding
    # ------------------------------------------------------------------
    def embed(self, texts: Sequence[str], *, batch_size: int = 32) -> list[list[float]]:
        """Return one row of floats per input string.

        Args:
            texts: Strings to encode.  A single non-empty ``str`` is treated
                as one text rather than as a sequence of characters.
            batch_size: Forwarded to the encoder.

        Returns:
            One ``list[float]`` per input, in input order; ``[]`` for empty
            input (the encoder is not loaded in that case).
        """
        if not texts:
            return []
        if isinstance(texts, str):
            # A str is itself a Sequence[str]; without this guard
            # ``list(texts)`` would embed every character separately.
            texts = [texts]
        enc = self._ensure_loaded()
        out = enc.encode(
            list(texts),
            batch_size=batch_size,
            show_progress_bar=False,
            convert_to_numpy=True,
            normalize_embeddings=True,
        )
        # Accept numpy arrays, lists, or anything that iterates rows.
        return [[float(value) for value in row] for row in out]
@@ -0,0 +1,181 @@
1
+ """Query helpers for the embeddings index."""
2
+ from __future__ import annotations
3
+
4
+ from dataclasses import dataclass
5
+ from pathlib import Path
6
+ from typing import Any
7
+
8
+ from codegraph.embed.embedder import Embedder
9
+ from codegraph.embed.store import EmbeddingStore, StoredChunk
10
+
11
+
12
@dataclass(frozen=True)
class Hit:
    """One ranked result returned by the query helpers."""

    qualname: str
    file: str
    line: int
    kind: str
    role: str | None
    score: float
    text_snippet: str

    # pragma: codegraph-public-api
    def as_dict(self, *, score_field: str = "score") -> dict[str, Any]:
        """Return the hit as a plain dict, with the score rounded to 6 decimals.

        ``score_field`` names the key under which the score is stored.
        """
        payload: dict[str, Any] = {}
        payload["qualname"] = self.qualname
        payload["file"] = self.file
        payload["line"] = self.line
        payload["kind"] = self.kind
        payload["role"] = self.role
        payload[score_field] = round(float(self.score), 6)
        payload["text_snippet"] = self.text_snippet
        return payload
35
+
36
+
37
+ def _snippet(text: str, max_lines: int = 6, max_chars: int = 400) -> str:
38
+ lines = text.splitlines()[:max_lines]
39
+ snippet = "\n".join(lines)
40
+ if len(snippet) > max_chars:
41
+ snippet = snippet[:max_chars] + "…"
42
+ return snippet
43
+
44
+
45
+ def _index_dir(repo_root: Path | None = None) -> Path:
46
+ return (repo_root or Path.cwd()) / ".codegraph"
47
+
48
+
49
class IndexMissingError(RuntimeError):
    """Signals that no embeddings index was found on disk."""
51
+
52
+
53
def _open_store(repo_root: Path | None = None) -> EmbeddingStore:
    """Open the embedding store under the repository's ``.codegraph`` directory.

    The backend is ``"auto"`` when ``embeddings.lance`` exists and ``"json"``
    when only ``embeddings.json`` exists.

    Raises:
        IndexMissingError: If neither index file exists.
    """
    base = _index_dir(repo_root)
    lance_path = base / "embeddings.lance"
    json_path = base / "embeddings.json"
    has_lance = lance_path.exists()
    if not has_lance and not json_path.exists():
        raise IndexMissingError(
            "no embedding index — run `codegraph embed` first"
        )
    if has_lance:
        return EmbeddingStore(base, backend="auto")
    return EmbeddingStore(base, backend="json")
63
+
64
+
65
def _to_hit(row: StoredChunk, score: float) -> Hit:
    """Convert a stored chunk and its score into a :class:`Hit`.

    The hit's line is the chunk's ``line_start``; its snippet is a shortened
    form of the chunk text.
    """
    preview = _snippet(row.text)
    return Hit(
        qualname=row.qualname, file=row.file, line=row.line_start,
        kind=row.kind, role=row.role,
        score=score, text_snippet=preview,
    )
75
+
76
+
77
def semantic_query(
    text: str,
    *,
    k: int = 5,
    repo_root: Path | None = None,
    embedder: Embedder | None = None,
    store: EmbeddingStore | None = None,
) -> list[Hit]:
    """Rank indexed chunks against *text* using the store's similarity query.

    Args:
        text: Query text to embed.
        k: Number of results requested from the store.
        repo_root: Where to find the index when *store* is not supplied.
        embedder: Embedder to use; a default :class:`Embedder` is created
            when omitted.
        store: Already-open store; the on-disk index is opened when omitted.

    Returns:
        Hits in the order the store returned them.
    """
    # Compare against None explicitly: an injected store or embedder that is
    # falsy (for example one defining ``__len__`` while empty) must still be
    # used, not silently swapped for the on-disk default.
    s = store if store is not None else _open_store(repo_root)
    emb = embedder if embedder is not None else Embedder()
    vector = emb.embed([text])[0]
    hits = s.query(vector, k=k)
    return [_to_hit(row, score) for row, score in hits]
91
+
92
+
93
def hybrid_query(
    text: str,
    *,
    k: int = 5,
    role: str | None = None,
    focus_qn: str | None = None,
    repo_root: Path | None = None,
    embedder: Embedder | None = None,
    store: EmbeddingStore | None = None,
    graph: Any | None = None,
) -> list[Hit]:
    """Blend semantic similarity with graph distance from a focus node.

    ``final_score = 0.6 * cosine + 0.4 * (1 / (1 + graph_hops))``

    Args:
        text: Query text to embed.
        k: Maximum number of hits to return.
        role: When given, keep only candidates whose ``role`` equals it.  The
            filter runs on an already-fetched pool of ``max(4 * k, 20)``
            candidates, so fewer than *k* hits may come back.
        focus_qn: Qualified name of the focus node.  When omitted, no
            blending happens and the store's own scores are returned.
        repo_root: Where to find the index / graph when not supplied.
        embedder: Embedder to use; a default one is created when omitted.
        store: Already-open store; the on-disk index is opened when omitted.
        graph: Pre-loaded graph; loaded from disk when omitted.

    Returns:
        Up to *k* hits, best first.  A candidate with no known distance to
        the focus node gets a graph component of ``0``.
    """
    # Explicit None checks: a falsy but valid injected store or embedder
    # (e.g. an empty one defining ``__len__``) must not be replaced.
    s = store if store is not None else _open_store(repo_root)
    emb = embedder if embedder is not None else Embedder()
    vector = emb.embed([text])[0]

    # Over-fetch so filtering and re-ranking still leave enough candidates.
    pool_size = max(k * 4, 20)
    raw = s.query(vector, k=pool_size)

    if role is not None:
        raw = [(row, score) for row, score in raw if row.role == role]

    if focus_qn is None:
        return [_to_hit(row, score) for row, score in raw[:k]]

    g = graph if graph is not None else _load_graph(repo_root)
    focus_id = _find_node_by_qualname(g, focus_qn) if g is not None else None

    rescored: list[tuple[StoredChunk, float]] = []
    for row, semantic in raw:
        hops = None
        # Without a graph or a resolved focus node every distance is unknown,
        # so skip the per-candidate node lookup entirely.
        if g is not None and focus_id is not None:
            target_id = _find_node_by_qualname(g, row.qualname)
            hops = _graph_distance(g, focus_id, target_id)
        graph_score = 0.0 if hops is None else 1.0 / (1.0 + float(hops))
        rescored.append((row, 0.6 * float(semantic) + 0.4 * graph_score))

    # Stable sort: ties keep the store's original ordering.
    rescored.sort(key=lambda pair: pair[1], reverse=True)
    return [_to_hit(row, final) for row, final in rescored[:k]]
134
+
135
+
136
+ # ---------------------------------------------------------------------------
137
+ # Graph helpers (lazy nx import; tolerate missing graph)
138
+ # ---------------------------------------------------------------------------
139
+
140
+
141
+ def _load_graph(repo_root: Path | None) -> Any | None:
142
+ try:
143
+ from codegraph.graph.store_networkx import to_digraph
144
+ from codegraph.graph.store_sqlite import SQLiteGraphStore
145
+ except Exception: # pragma: no cover
146
+ return None
147
+
148
+ db = (repo_root or Path.cwd()) / ".codegraph" / "graph.db"
149
+ if not db.exists():
150
+ return None
151
+ store = SQLiteGraphStore(db)
152
+ try:
153
+ return to_digraph(store)
154
+ finally:
155
+ store.close()
156
+
157
+
158
+ def _find_node_by_qualname(graph: Any, qualname: str) -> str | None:
159
+ if graph is None:
160
+ return None
161
+ if qualname in graph:
162
+ return qualname
163
+ q = qualname.lower()
164
+ for nid, attrs in graph.nodes(data=True):
165
+ if str(attrs.get("qualname") or "").lower() == q:
166
+ return str(nid)
167
+ return None
168
+
169
+
170
+ def _graph_distance(graph: Any, src: str | None, dst: str | None) -> int | None:
171
+ if graph is None or src is None or dst is None:
172
+ return None
173
+ if src == dst:
174
+ return 0
175
+ try:
176
+ import networkx as nx
177
+
178
+ ug = graph.to_undirected(as_view=True) if hasattr(graph, "to_undirected") else graph
179
+ return int(nx.shortest_path_length(ug, src, dst))
180
+ except Exception:
181
+ return None