polycodegraph 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- codegraph/__init__.py +10 -0
- codegraph/analysis/__init__.py +30 -0
- codegraph/analysis/_common.py +125 -0
- codegraph/analysis/blast_radius.py +63 -0
- codegraph/analysis/cycles.py +79 -0
- codegraph/analysis/dataflow.py +861 -0
- codegraph/analysis/dead_code.py +165 -0
- codegraph/analysis/hotspots.py +68 -0
- codegraph/analysis/infrastructure.py +439 -0
- codegraph/analysis/metrics.py +52 -0
- codegraph/analysis/report.py +222 -0
- codegraph/analysis/roles.py +323 -0
- codegraph/analysis/untested.py +79 -0
- codegraph/cli.py +1506 -0
- codegraph/config.py +64 -0
- codegraph/embed/__init__.py +35 -0
- codegraph/embed/chunker.py +120 -0
- codegraph/embed/embedder.py +113 -0
- codegraph/embed/query.py +181 -0
- codegraph/embed/store.py +360 -0
- codegraph/graph/__init__.py +0 -0
- codegraph/graph/builder.py +212 -0
- codegraph/graph/schema.py +69 -0
- codegraph/graph/store_networkx.py +55 -0
- codegraph/graph/store_sqlite.py +249 -0
- codegraph/mcp_server/__init__.py +6 -0
- codegraph/mcp_server/server.py +933 -0
- codegraph/parsers/__init__.py +0 -0
- codegraph/parsers/base.py +70 -0
- codegraph/parsers/go.py +570 -0
- codegraph/parsers/python.py +1707 -0
- codegraph/parsers/typescript.py +1397 -0
- codegraph/py.typed +0 -0
- codegraph/resolve/__init__.py +4 -0
- codegraph/resolve/calls.py +480 -0
- codegraph/review/__init__.py +31 -0
- codegraph/review/baseline.py +32 -0
- codegraph/review/differ.py +211 -0
- codegraph/review/hook.py +70 -0
- codegraph/review/risk.py +219 -0
- codegraph/review/rules.py +342 -0
- codegraph/viz/__init__.py +17 -0
- codegraph/viz/_style.py +45 -0
- codegraph/viz/dashboard.py +740 -0
- codegraph/viz/diagrams.py +370 -0
- codegraph/viz/explore.py +453 -0
- codegraph/viz/hld.py +683 -0
- codegraph/viz/html.py +115 -0
- codegraph/viz/mermaid.py +111 -0
- codegraph/viz/svg.py +77 -0
- codegraph/web/__init__.py +4 -0
- codegraph/web/server.py +165 -0
- codegraph/web/static/app.css +664 -0
- codegraph/web/static/app.js +919 -0
- codegraph/web/static/index.html +112 -0
- codegraph/web/static/views/architecture.js +1671 -0
- codegraph/web/static/views/graph3d.css +564 -0
- codegraph/web/static/views/graph3d.js +999 -0
- codegraph/web/static/views/graph3d_transform.js +984 -0
- codegraph/workspace/__init__.py +34 -0
- codegraph/workspace/config.py +110 -0
- codegraph/workspace/operations.py +294 -0
- polycodegraph-0.1.0.dist-info/METADATA +687 -0
- polycodegraph-0.1.0.dist-info/RECORD +67 -0
- polycodegraph-0.1.0.dist-info/WHEEL +4 -0
- polycodegraph-0.1.0.dist-info/entry_points.txt +2 -0
- polycodegraph-0.1.0.dist-info/licenses/LICENSE +21 -0
codegraph/config.py
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
"""Codegraph configuration model and helpers."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
import yaml
|
|
8
|
+
from pydantic import BaseModel, Field
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class DeadCodeConfig(BaseModel):
    """Optional, user-provided adjustments to dead-code analysis.

    The fields add to the built-in entry-point catalog; user patterns are
    merged with the built-ins at parse time.  Every field may be omitted and
    then defaults to an empty list.
    """

    entry_point_decorators: list[str] = Field(default_factory=list)
    """Additional decorator strings (for example ``"@my.handler"``) that mark
    an entry point. A decorator matches when the string occurs anywhere in
    its raw text."""

    entry_point_names: list[str] = Field(default_factory=list)
    """Additional function/method/class name globs (fnmatch syntax) that mark
    an entry point. Reserved for future use."""

    entry_point_files: list[str] = Field(default_factory=list)
    """File-path globs; every definition in a matching file counts as an
    entry point. Reserved for future use."""
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class CodegraphConfig(BaseModel):
    """Model that ``.codegraph.yml`` is validated against.

    Every field carries a default, so an empty mapping validates to a fully
    populated configuration.
    """

    # Plain scalar settings.
    version: int = 1
    languages: list[str] = Field(
        default_factory=lambda: ["python", "typescript", "javascript"],
    )
    default_branch: str = "main"
    ignore: list[str] = Field(default_factory=list)

    # Free-form sections; their inner keys are not validated by this model.
    baseline: dict[str, Any] = Field(
        default_factory=lambda: {"backend": "local"},
    )
    critical_paths: list[dict[str, Any]] = Field(default_factory=list)
    mcp: dict[str, Any] = Field(
        default_factory=lambda: {"enabled": False},
    )

    # Opt-in switches, all off by default.
    install_hook: bool = False
    register_mcp: bool = False

    # Nested, typed section for dead-code analysis tweaks.
    dead_code: DeadCodeConfig = Field(default_factory=DeadCodeConfig)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def load_config(repo_root: Path) -> CodegraphConfig:
    """Load ``<repo_root>/.codegraph.yml`` into a :class:`CodegraphConfig`.

    Args:
        repo_root: Directory expected to contain ``.codegraph.yml``.

    Returns:
        The validated configuration.  When the file does not exist, or is
        empty (``yaml.safe_load`` yields ``None``), an all-defaults
        configuration is returned.

    Raises:
        Whatever ``yaml.safe_load`` or ``CodegraphConfig.model_validate``
        raise for malformed or invalid content; those errors are not caught
        here.
    """
    cfg_path = repo_root / ".codegraph.yml"
    try:
        # Decode explicitly as UTF-8 instead of the platform's locale
        # encoding, so non-ASCII content reads the same on every machine.
        with cfg_path.open(encoding="utf-8") as f:
            data = yaml.safe_load(f)
    except (FileNotFoundError, NotADirectoryError):
        # Opening directly (rather than testing ``exists()`` first) avoids a
        # race with a file that disappears between the check and the read.
        return CodegraphConfig()
    return CodegraphConfig.model_validate(data or {})
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def save_config(repo_root: Path, cfg: CodegraphConfig) -> None:
    """Write ``cfg`` to ``<repo_root>/.codegraph.yml`` as block-style YAML.

    Args:
        repo_root: Directory that receives ``.codegraph.yml``; an existing
            file is overwritten.
        cfg: Configuration to serialise (via ``model_dump``).

    Keys are emitted sorted so repeated saves of the same configuration
    produce the same text.
    """
    cfg_path = repo_root / ".codegraph.yml"
    # Explicit UTF-8 keeps the written file independent of the platform's
    # locale encoding.
    with cfg_path.open("w", encoding="utf-8") as f:
        yaml.dump(cfg.model_dump(), f, default_flow_style=False, sort_keys=True)
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def default_data_dir(repo_root: Path) -> Path:
    """Return the ``.codegraph`` directory path under ``repo_root``.

    The path is only computed; nothing is created on disk.
    """
    return repo_root.joinpath(".codegraph")
|
codegraph/embed/__init__.py
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
"""Local, open-weight embedding layer for codegraph (v0.3).
|
|
2
|
+
|
|
3
|
+
Public surface:
|
|
4
|
+
|
|
5
|
+
* :class:`Chunk` — a single embeddable code unit (function/method/class).
|
|
6
|
+
* :class:`Hit` — a search result.
|
|
7
|
+
* :func:`chunk_repo` — turn graph nodes into chunks.
|
|
8
|
+
* :func:`build_index` — chunk + embed + write index to ``.codegraph/embeddings.lance``.
|
|
9
|
+
* :func:`query` — convenience wrapper around :func:`semantic_query`.
|
|
10
|
+
|
|
11
|
+
The heavy dependencies (``sentence-transformers``, ``lancedb``) are imported
|
|
12
|
+
lazily. Install with ``pip install -e ".[embed]"``.
|
|
13
|
+
"""
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
from codegraph.embed.chunker import Chunk, chunk_repo
|
|
17
|
+
from codegraph.embed.query import Hit, hybrid_query, semantic_query
|
|
18
|
+
from codegraph.embed.store import EmbeddingStore, build_index
|
|
19
|
+
|
|
20
|
+
__all__ = [
|
|
21
|
+
"Chunk",
|
|
22
|
+
"EmbeddingStore",
|
|
23
|
+
"Hit",
|
|
24
|
+
"build_index",
|
|
25
|
+
"chunk_repo",
|
|
26
|
+
"hybrid_query",
|
|
27
|
+
"query",
|
|
28
|
+
"semantic_query",
|
|
29
|
+
]
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
# pragma: codegraph-public-api
def query(text: str, *, k: int = 5) -> list[Hit]:
    """Convenience wrapper: semantic search over the cwd ``.codegraph`` index.

    Forwards ``text`` and ``k`` to :func:`semantic_query` and leaves every
    other option at its default.
    """
    hits = semantic_query(text, k=k)
    return hits
|
codegraph/embed/chunker.py
ADDED
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
"""Chunk a codegraph SQLite store into embeddable units."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from collections.abc import Iterable, Iterator
|
|
5
|
+
from dataclasses import dataclass, field
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
from codegraph.graph.schema import Node, NodeKind
|
|
10
|
+
from codegraph.graph.store_sqlite import SQLiteGraphStore
|
|
11
|
+
|
|
12
|
+
# Kinds chunked when the caller does not choose any. MODULE / FILE / TEST /
# IMPORT / VARIABLE / PARAMETER are left out: they are either coarse,
# generated, or would duplicate the chunks of the symbols they contain.
_DEFAULT_KINDS: frozenset[NodeKind] = frozenset(
    (NodeKind.CLASS, NodeKind.FUNCTION, NodeKind.METHOD)
)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@dataclass(frozen=True)
class Chunk:
    """One embeddable unit of code, built from a graph node and its source."""

    # Where the symbol lives.
    qualname: str
    file: str
    line_start: int
    line_end: int

    # What it is and the text that gets embedded.
    kind: str
    text: str

    # Optional details copied from the node's metadata.
    params: list[str] = field(default_factory=list)
    returns: str | None = None
    role: str | None = None

    @property
    def id(self) -> str:
        """Identifier used for upsert / dedupe: ``file::qualname::line_start``."""
        return "{}::{}::{}".format(self.file, self.qualname, self.line_start)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _read_lines(repo_root: Path, file: str) -> list[str]:
|
|
41
|
+
"""Best-effort read; returns ``[]`` on any IO error."""
|
|
42
|
+
try:
|
|
43
|
+
return (repo_root / file).read_text(encoding="utf-8").splitlines()
|
|
44
|
+
except (OSError, UnicodeDecodeError):
|
|
45
|
+
return []
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _slice(lines: list[str], start: int, end: int) -> str:
|
|
49
|
+
if not lines:
|
|
50
|
+
return ""
|
|
51
|
+
s = max(0, start - 1)
|
|
52
|
+
e = max(s, min(len(lines), end))
|
|
53
|
+
return "\n".join(lines[s:e])
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _md_list(value: Any) -> list[str]:
|
|
57
|
+
if isinstance(value, list):
|
|
58
|
+
return [str(v) for v in value]
|
|
59
|
+
return []
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def _node_to_chunk(node: Node, repo_root: Path, _cache: dict[str, list[str]]) -> Chunk | None:
    """Build the :class:`Chunk` for ``node``, or ``None`` if it has no text.

    Args:
        node: Graph node to convert.
        repo_root: Root that ``node.file`` is resolved against.
        _cache: Per-file line cache shared across calls; filled in place so
            each source file is read at most once.

    Returns:
        A chunk whose text is the node's source lines, falling back to its
        signature and docstring when those lines are blank or unreadable.
        ``None`` when neither source yields any non-whitespace text.
    """
    if (source_lines := _cache.get(node.file)) is None:
        source_lines = _read_lines(repo_root, node.file)
        _cache[node.file] = source_lines

    text = _slice(source_lines, node.line_start, node.line_end)
    if not text.strip():
        # Nothing usable on disk: index the signature / docstring instead.
        fallback = [part for part in (node.signature or "", node.docstring or "") if part]
        text = "\n".join(fallback)
        if not text.strip():
            return None

    meta = node.metadata or {}
    if isinstance(meta, dict):
        role_raw = meta.get("role")
        returns_raw = meta.get("returns")
        params = _md_list(meta.get("params"))
    else:
        # Non-dict metadata carries nothing we know how to read.
        role_raw = None
        returns_raw = None
        params = []

    return Chunk(
        qualname=node.qualname,
        file=node.file,
        line_start=node.line_start,
        line_end=node.line_end,
        kind=node.kind.value,
        text=text,
        params=params,
        returns=None if returns_raw is None else str(returns_raw),
        role=None if role_raw is None else str(role_raw),
    )
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def chunk_repo(
    repo_root: Path,
    *,
    db_path: Path | None = None,
    kinds: Iterable[NodeKind] | None = None,
) -> Iterator[Chunk]:
    """Lazily produce a :class:`Chunk` for every graph node of a wanted kind.

    Args:
        repo_root: Repository root; source files are read relative to it.
        db_path: Graph database to read.  Defaults to
            ``<repo_root>/.codegraph/graph.db``.
        kinds: Node kinds to include.  Defaults to FUNCTION / METHOD / CLASS.

    Yields:
        One chunk per matching node; nodes that produce no text are skipped.

    Raises:
        FileNotFoundError: If the graph database is missing.  Because this is
            a generator, the check runs when iteration starts, not when the
            function is called.
    """
    graph_db = db_path if db_path else repo_root / ".codegraph" / "graph.db"
    if not graph_db.exists():
        raise FileNotFoundError(
            f"No graph database at {graph_db}. Run `codegraph build` first."
        )

    wanted = _DEFAULT_KINDS if kinds is None else frozenset(kinds)
    lines_by_file: dict[str, list[str]] = {}

    store = SQLiteGraphStore(graph_db)
    try:
        for node in store.iter_nodes():
            if node.kind in wanted:
                chunk = _node_to_chunk(node, repo_root, lines_by_file)
                if chunk is None:
                    continue
                yield chunk
    finally:
        # Runs on exhaustion, on error, and when the generator is closed.
        store.close()
|
codegraph/embed/embedder.py
ADDED
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
"""Sentence-transformers wrapper.
|
|
2
|
+
|
|
3
|
+
The model is lazy-loaded so unit tests can substitute a deterministic fake
|
|
4
|
+
without pulling the real dependency tree.
|
|
5
|
+
"""
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
import os
|
|
9
|
+
from collections.abc import Sequence
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
from typing import Any, Protocol
|
|
12
|
+
|
|
13
|
+
# Model name used by ``Embedder`` when the caller does not pass ``model``.
DEFAULT_MODEL = "nomic-ai/CodeRankEmbed"
# NOTE(review): presumably the vector width of DEFAULT_MODEL; nothing in this
# module reads it — confirm against callers.
DEFAULT_DIM = 768
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class _EncoderLike(Protocol):
|
|
18
|
+
def encode(
|
|
19
|
+
self,
|
|
20
|
+
sentences: Sequence[str],
|
|
21
|
+
*,
|
|
22
|
+
batch_size: int = ...,
|
|
23
|
+
show_progress_bar: bool = ...,
|
|
24
|
+
convert_to_numpy: bool = ...,
|
|
25
|
+
normalize_embeddings: bool = ...,
|
|
26
|
+
) -> Any:
|
|
27
|
+
...
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def _cache_dir() -> Path:
|
|
31
|
+
"""Where downloaded models live. Honours ``XDG_CACHE_HOME`` if set."""
|
|
32
|
+
xdg = os.environ.get("XDG_CACHE_HOME")
|
|
33
|
+
base = Path(xdg) if xdg else Path.home() / ".cache"
|
|
34
|
+
return base / "codegraph" / "models"
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class MissingDependencyError(RuntimeError):
    """Signals that the optional ``embed`` extra is not installed."""
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def _load_sentence_transformer(model: str) -> _EncoderLike:
    """Import ``sentence_transformers`` and instantiate ``model``.

    The import happens inside the function so the dependency is only needed
    when an encoder is actually loaded.  The model cache directory is created
    if necessary before the model is constructed.

    Raises:
        MissingDependencyError: If ``sentence_transformers`` cannot be
            imported.
    """
    try:
        from sentence_transformers import SentenceTransformer
    except ImportError as exc:  # pragma: no cover — exercised via mock test
        message = (
            "sentence-transformers is not installed.\n"
            'Run: pip install -e ".[embed]"'
        )
        raise MissingDependencyError(message) from exc

    models_dir = _cache_dir()
    models_dir.mkdir(parents=True, exist_ok=True)
    # NOTE(review): trust_remote_code=True allows Python code shipped with the
    # model repository to run locally. Confirm this is acceptable for any
    # ``model`` value a user can configure, not only the default.
    loaded: _EncoderLike = SentenceTransformer(
        model,
        cache_folder=str(models_dir),
        trust_remote_code=True,
    )
    return loaded
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
class Embedder:
    """Thin, lazily-initialised wrapper around a sentence-transformer model.

    The encoder is loaded only on the first call that needs it
    (:meth:`embed`, or :attr:`dim` when no ``dim`` was supplied), so:

    * Construction is cheap and side-effect free (good for tests).
    * Multiple embedders can co-exist without re-downloading.

    Pass ``encoder`` to inject a fake / mock for tests.
    """

    def __init__(
        self,
        model: str = DEFAULT_MODEL,
        *,
        dim: int | None = None,
        encoder: _EncoderLike | None = None,
    ) -> None:
        """Store the settings; no model is loaded here.

        Args:
            model: Model name handed to the loader on first use.
            dim: Known embedding width.  When ``None`` it is measured lazily
                by :attr:`dim`.
            encoder: Pre-built encoder to use instead of loading ``model``.
        """
        self.model = model
        self._encoder: _EncoderLike | None = encoder
        self._dim: int | None = dim

    # ------------------------------------------------------------------
    # Lifecycle
    # ------------------------------------------------------------------
    def _ensure_loaded(self) -> _EncoderLike:
        """Return the encoder, loading it on first use."""
        if self._encoder is None:
            self._encoder = _load_sentence_transformer(self.model)
        return self._encoder

    @property
    def dim(self) -> int:
        """Embedding width; measured once with a probe text if not given."""
        if self._dim is not None:
            return self._dim
        # Probe the encoder with a single token so we don't have to assume.
        vecs = self.embed(["probe"])
        self._dim = len(vecs[0])
        return self._dim

    # ------------------------------------------------------------------
    # Encoding
    # ------------------------------------------------------------------
    def embed(self, texts: Sequence[str], *, batch_size: int = 32) -> list[list[float]]:
        """Return one row of floats per input string.

        Args:
            texts: Strings to embed.  A bare ``str`` is treated as a single
                text rather than being split into characters.
            batch_size: Batch size forwarded to the encoder.

        Returns:
            A list with one ``list[float]`` per input, in input order; ``[]``
            for empty input (the encoder is not loaded in that case).
        """
        if not texts:
            return []
        # ``list("abc")`` would be ``["a", "b", "c"]``; wrap a lone string so
        # it is embedded as one text.
        batch = [texts] if isinstance(texts, str) else list(texts)
        enc = self._ensure_loaded()
        out = enc.encode(
            batch,
            batch_size=batch_size,
            show_progress_bar=False,
            convert_to_numpy=True,
            normalize_embeddings=True,
        )
        # Accept numpy arrays, lists, or anything that iterates rows.
        return [list(map(float, row)) for row in out]
|
codegraph/embed/query.py
ADDED
|
@@ -0,0 +1,181 @@
|
|
|
1
|
+
"""Query helpers for the embeddings index."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
from codegraph.embed.embedder import Embedder
|
|
9
|
+
from codegraph.embed.store import EmbeddingStore, StoredChunk
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@dataclass(frozen=True)
class Hit:
    """One ranked result returned by a query."""

    qualname: str
    file: str
    line: int
    kind: str
    role: str | None
    score: float
    text_snippet: str

    # pragma: codegraph-public-api
    def as_dict(self, *, score_field: str = "score") -> dict[str, Any]:
        """Return the hit as a plain dict.

        The score is rounded to six decimals and stored under
        ``score_field``, which lets callers pick the key name.
        """
        payload: dict[str, Any] = {
            "qualname": self.qualname,
            "file": self.file,
            "line": self.line,
            "kind": self.kind,
            "role": self.role,
        }
        payload[score_field] = round(float(self.score), 6)
        payload["text_snippet"] = self.text_snippet
        return payload
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _snippet(text: str, max_lines: int = 6, max_chars: int = 400) -> str:
|
|
38
|
+
lines = text.splitlines()[:max_lines]
|
|
39
|
+
snippet = "\n".join(lines)
|
|
40
|
+
if len(snippet) > max_chars:
|
|
41
|
+
snippet = snippet[:max_chars] + "…"
|
|
42
|
+
return snippet
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def _index_dir(repo_root: Path | None = None) -> Path:
|
|
46
|
+
return (repo_root or Path.cwd()) / ".codegraph"
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
class IndexMissingError(RuntimeError):
    """Signals that no embeddings index was found on disk."""
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _open_store(repo_root: Path | None = None) -> EmbeddingStore:
    """Open the embedding store under the repository's index directory.

    ``embeddings.lance`` is preferred (backend ``"auto"``); if only
    ``embeddings.json`` exists the ``"json"`` backend is used.

    Raises:
        IndexMissingError: If neither index file exists.
    """
    index_root = _index_dir(repo_root)
    if (index_root / "embeddings.lance").exists():
        return EmbeddingStore(index_root, backend="auto")
    if (index_root / "embeddings.json").exists():
        return EmbeddingStore(index_root, backend="json")
    raise IndexMissingError(
        "no embedding index — run `codegraph embed` first"
    )
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def _to_hit(row: StoredChunk, score: float) -> Hit:
    """Convert a stored chunk and its score into a :class:`Hit`.

    The hit's ``line`` is the chunk's ``line_start`` and its snippet is a
    shortened copy of the chunk text.
    """
    preview = _snippet(row.text)
    return Hit(
        qualname=row.qualname,
        file=row.file,
        line=row.line_start,
        kind=row.kind,
        role=row.role,
        score=score,
        text_snippet=preview,
    )
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def semantic_query(
    text: str,
    *,
    k: int = 5,
    repo_root: Path | None = None,
    embedder: Embedder | None = None,
    store: EmbeddingStore | None = None,
) -> list[Hit]:
    """Pure cosine-similarity ranking against the index.

    Args:
        text: Query text to embed.
        k: Number of results requested from the store.
        repo_root: Repository whose index is opened when ``store`` is not
            given; the current directory is used when this is ``None``.
        embedder: Embedder to use; a default :class:`Embedder` is created
            when ``None``.
        store: Already-open store to query; opened from disk when ``None``.

    Returns:
        Hits in the order the store returns them.

    Raises:
        IndexMissingError: If no ``store`` is given and no index exists.
    """
    # Compare against None rather than relying on truthiness: an injected
    # store/embedder that happens to be falsy (e.g. defines ``__len__`` and
    # is empty) must still be used, not silently replaced by the default.
    s = store if store is not None else _open_store(repo_root)
    emb = embedder if embedder is not None else Embedder()
    vector = emb.embed([text])[0]
    return [_to_hit(row, score) for row, score in s.query(vector, k=k)]
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def hybrid_query(
    text: str,
    *,
    k: int = 5,
    role: str | None = None,
    focus_qn: str | None = None,
    repo_root: Path | None = None,
    embedder: Embedder | None = None,
    store: EmbeddingStore | None = None,
    graph: Any | None = None,
) -> list[Hit]:
    """Blend semantic similarity with graph distance from a focus node.

    ``final_score = 0.6 * cosine + 0.4 * (1 / (1 + graph_hops))``

    Args:
        text: Query text to embed.
        k: Maximum number of hits to return.
        role: When given, keep only candidates whose ``role`` equals it.  The
            filter runs on the candidate pool, so fewer than ``k`` hits may
            come back.
        focus_qn: Qualified name of the node distances are measured from.
            When ``None`` the result is the (optionally role-filtered)
            semantic ranking with unmodified scores.
        repo_root: Repository used to locate the index and graph when they
            are not passed in.
        embedder: Embedder to use; a default one is created when ``None``.
        store: Already-open store to query; opened from disk when ``None``.
        graph: Pre-loaded graph; loaded from disk when ``None``.

    Returns:
        Up to ``k`` hits, best blended score first.  Candidates with no
        measurable graph distance get a graph component of ``0``.

    Raises:
        IndexMissingError: If no ``store`` is given and no index exists.
    """
    # ``is not None`` (not truthiness) so an injected-but-falsy store or
    # embedder is honoured, matching how ``graph`` is handled below.
    s = store if store is not None else _open_store(repo_root)
    emb = embedder if embedder is not None else Embedder()
    vector = emb.embed([text])[0]

    # Over-fetch so the role filter and re-ranking have candidates to work on.
    pool_size = max(k * 4, 20)
    raw = s.query(vector, k=pool_size)

    if role is not None:
        raw = [(row, score) for row, score in raw if row.role == role]

    if focus_qn is None:
        return [_to_hit(row, score) for row, score in raw[:k]]

    g = graph if graph is not None else _load_graph(repo_root)
    focus_id = _find_node_by_qualname(g, focus_qn) if g is not None else None

    rescored: list[tuple[StoredChunk, float]] = []
    for row, semantic in raw:
        if focus_id is None:
            # No graph, or the focus node is unknown: no distance can exist,
            # so skip the per-row node lookup and path search entirely.
            hops = None
        else:
            target_id = _find_node_by_qualname(g, row.qualname)
            hops = _graph_distance(g, focus_id, target_id)
        graph_score = 0.0 if hops is None else 1.0 / (1.0 + float(hops))
        rescored.append((row, 0.6 * float(semantic) + 0.4 * graph_score))

    # Stable sort: ties keep the store's original (semantic) order.
    rescored.sort(key=lambda pair: pair[1], reverse=True)
    return [_to_hit(row, final) for row, final in rescored[:k]]
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
# ---------------------------------------------------------------------------
|
|
137
|
+
# Graph helpers (lazy nx import; tolerate missing graph)
|
|
138
|
+
# ---------------------------------------------------------------------------
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def _load_graph(repo_root: Path | None) -> Any | None:
|
|
142
|
+
try:
|
|
143
|
+
from codegraph.graph.store_networkx import to_digraph
|
|
144
|
+
from codegraph.graph.store_sqlite import SQLiteGraphStore
|
|
145
|
+
except Exception: # pragma: no cover
|
|
146
|
+
return None
|
|
147
|
+
|
|
148
|
+
db = (repo_root or Path.cwd()) / ".codegraph" / "graph.db"
|
|
149
|
+
if not db.exists():
|
|
150
|
+
return None
|
|
151
|
+
store = SQLiteGraphStore(db)
|
|
152
|
+
try:
|
|
153
|
+
return to_digraph(store)
|
|
154
|
+
finally:
|
|
155
|
+
store.close()
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
def _find_node_by_qualname(graph: Any, qualname: str) -> str | None:
|
|
159
|
+
if graph is None:
|
|
160
|
+
return None
|
|
161
|
+
if qualname in graph:
|
|
162
|
+
return qualname
|
|
163
|
+
q = qualname.lower()
|
|
164
|
+
for nid, attrs in graph.nodes(data=True):
|
|
165
|
+
if str(attrs.get("qualname") or "").lower() == q:
|
|
166
|
+
return str(nid)
|
|
167
|
+
return None
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
def _graph_distance(graph: Any, src: str | None, dst: str | None) -> int | None:
|
|
171
|
+
if graph is None or src is None or dst is None:
|
|
172
|
+
return None
|
|
173
|
+
if src == dst:
|
|
174
|
+
return 0
|
|
175
|
+
try:
|
|
176
|
+
import networkx as nx
|
|
177
|
+
|
|
178
|
+
ug = graph.to_undirected(as_view=True) if hasattr(graph, "to_undirected") else graph
|
|
179
|
+
return int(nx.shortest_path_length(ug, src, dst))
|
|
180
|
+
except Exception:
|
|
181
|
+
return None
|