agentforge-graph 0.3.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agentforge_graph/__init__.py +6 -0
- agentforge_graph/chunking/__init__.py +12 -0
- agentforge_graph/chunking/cast.py +159 -0
- agentforge_graph/chunking/chunk.py +19 -0
- agentforge_graph/chunking/tokens.py +15 -0
- agentforge_graph/cli.py +607 -0
- agentforge_graph/config.py +259 -0
- agentforge_graph/core/__init__.py +54 -0
- agentforge_graph/core/conformance.py +270 -0
- agentforge_graph/core/contracts.py +163 -0
- agentforge_graph/core/kinds.py +68 -0
- agentforge_graph/core/models.py +134 -0
- agentforge_graph/core/provenance.py +62 -0
- agentforge_graph/core/symbols.py +116 -0
- agentforge_graph/embed/__init__.py +28 -0
- agentforge_graph/embed/base.py +22 -0
- agentforge_graph/embed/bedrock.py +85 -0
- agentforge_graph/embed/fake.py +34 -0
- agentforge_graph/embed/openai.py +67 -0
- agentforge_graph/embed/pipeline.py +184 -0
- agentforge_graph/embed/registry.py +66 -0
- agentforge_graph/embed/report.py +15 -0
- agentforge_graph/enrich/__init__.py +70 -0
- agentforge_graph/enrich/anthropic.py +38 -0
- agentforge_graph/enrich/anthropic_client.py +109 -0
- agentforge_graph/enrich/bedrock.py +24 -0
- agentforge_graph/enrich/bedrock_client.py +115 -0
- agentforge_graph/enrich/bedrock_summarizer.py +23 -0
- agentforge_graph/enrich/claude.py +172 -0
- agentforge_graph/enrich/enricher.py +108 -0
- agentforge_graph/enrich/governs.py +173 -0
- agentforge_graph/enrich/governs_enricher.py +152 -0
- agentforge_graph/enrich/heuristics.py +224 -0
- agentforge_graph/enrich/judge.py +63 -0
- agentforge_graph/enrich/registry.py +133 -0
- agentforge_graph/enrich/report.py +60 -0
- agentforge_graph/enrich/summarizer.py +62 -0
- agentforge_graph/enrich/summary_enricher.py +211 -0
- agentforge_graph/enrich/taxonomy.py +38 -0
- agentforge_graph/frameworks/__init__.py +29 -0
- agentforge_graph/frameworks/base.py +75 -0
- agentforge_graph/frameworks/detect.py +124 -0
- agentforge_graph/frameworks/extractor.py +63 -0
- agentforge_graph/frameworks/orm.py +93 -0
- agentforge_graph/frameworks/packs/_js_ast.py +56 -0
- agentforge_graph/frameworks/packs/_python_ast.py +157 -0
- agentforge_graph/frameworks/packs/django/__init__.py +240 -0
- agentforge_graph/frameworks/packs/django/models.scm +7 -0
- agentforge_graph/frameworks/packs/express/__init__.py +133 -0
- agentforge_graph/frameworks/packs/express/routes.scm +8 -0
- agentforge_graph/frameworks/packs/fastapi/__init__.py +210 -0
- agentforge_graph/frameworks/packs/fastapi/depends.scm +6 -0
- agentforge_graph/frameworks/packs/fastapi/routes.scm +10 -0
- agentforge_graph/frameworks/packs/flask/__init__.py +143 -0
- agentforge_graph/frameworks/packs/flask/routes.scm +11 -0
- agentforge_graph/frameworks/packs/nestjs/__init__.py +205 -0
- agentforge_graph/frameworks/packs/nestjs/routes.scm +6 -0
- agentforge_graph/frameworks/packs/spring/__init__.py +267 -0
- agentforge_graph/frameworks/packs/spring/routes.scm +6 -0
- agentforge_graph/frameworks/packs/sqlalchemy/__init__.py +250 -0
- agentforge_graph/frameworks/packs/sqlalchemy/models.scm +7 -0
- agentforge_graph/frameworks/registry.py +44 -0
- agentforge_graph/ingest/__init__.py +30 -0
- agentforge_graph/ingest/codegraph.py +847 -0
- agentforge_graph/ingest/extractor.py +353 -0
- agentforge_graph/ingest/incremental/__init__.py +25 -0
- agentforge_graph/ingest/incremental/detect.py +118 -0
- agentforge_graph/ingest/incremental/dirty.py +61 -0
- agentforge_graph/ingest/incremental/indexer.py +218 -0
- agentforge_graph/ingest/incremental/meta.py +72 -0
- agentforge_graph/ingest/incremental/ports.py +39 -0
- agentforge_graph/ingest/pack.py +160 -0
- agentforge_graph/ingest/packs/__init__.py +34 -0
- agentforge_graph/ingest/packs/cpp/__init__.py +35 -0
- agentforge_graph/ingest/packs/cpp/references.scm +15 -0
- agentforge_graph/ingest/packs/cpp/structure.scm +49 -0
- agentforge_graph/ingest/packs/csharp/__init__.py +35 -0
- agentforge_graph/ingest/packs/csharp/references.scm +12 -0
- agentforge_graph/ingest/packs/csharp/structure.scm +45 -0
- agentforge_graph/ingest/packs/go/__init__.py +38 -0
- agentforge_graph/ingest/packs/go/references.scm +12 -0
- agentforge_graph/ingest/packs/go/structure.scm +64 -0
- agentforge_graph/ingest/packs/java/__init__.py +35 -0
- agentforge_graph/ingest/packs/java/references.scm +12 -0
- agentforge_graph/ingest/packs/java/structure.scm +38 -0
- agentforge_graph/ingest/packs/javascript/__init__.py +34 -0
- agentforge_graph/ingest/packs/javascript/references.scm +11 -0
- agentforge_graph/ingest/packs/javascript/structure.scm +166 -0
- agentforge_graph/ingest/packs/php/__init__.py +35 -0
- agentforge_graph/ingest/packs/php/references.scm +15 -0
- agentforge_graph/ingest/packs/php/structure.scm +44 -0
- agentforge_graph/ingest/packs/python/__init__.py +25 -0
- agentforge_graph/ingest/packs/python/references.scm +14 -0
- agentforge_graph/ingest/packs/python/structure.scm +57 -0
- agentforge_graph/ingest/packs/ruby/__init__.py +37 -0
- agentforge_graph/ingest/packs/ruby/references.scm +12 -0
- agentforge_graph/ingest/packs/ruby/structure.scm +37 -0
- agentforge_graph/ingest/packs/rust/__init__.py +39 -0
- agentforge_graph/ingest/packs/rust/references.scm +12 -0
- agentforge_graph/ingest/packs/rust/structure.scm +46 -0
- agentforge_graph/ingest/packs/typescript/__init__.py +31 -0
- agentforge_graph/ingest/packs/typescript/references.scm +11 -0
- agentforge_graph/ingest/packs/typescript/structure.scm +99 -0
- agentforge_graph/ingest/pipeline.py +134 -0
- agentforge_graph/ingest/report.py +84 -0
- agentforge_graph/ingest/resolver.py +467 -0
- agentforge_graph/ingest/source.py +79 -0
- agentforge_graph/knowledge/__init__.py +28 -0
- agentforge_graph/knowledge/adr.py +136 -0
- agentforge_graph/knowledge/commits.py +152 -0
- agentforge_graph/knowledge/ingest.py +312 -0
- agentforge_graph/knowledge/mentions.py +71 -0
- agentforge_graph/knowledge/report.py +32 -0
- agentforge_graph/main.py +21 -0
- agentforge_graph/providers.py +36 -0
- agentforge_graph/repomap/__init__.py +14 -0
- agentforge_graph/repomap/rank.py +161 -0
- agentforge_graph/repomap/render.py +55 -0
- agentforge_graph/repomap/repomap.py +66 -0
- agentforge_graph/retrieve/__init__.py +21 -0
- agentforge_graph/retrieve/pack.py +76 -0
- agentforge_graph/retrieve/rerank.py +251 -0
- agentforge_graph/retrieve/retriever.py +286 -0
- agentforge_graph/retrieve/scoring.py +36 -0
- agentforge_graph/serve/__init__.py +19 -0
- agentforge_graph/serve/engine.py +204 -0
- agentforge_graph/serve/http_runner.py +133 -0
- agentforge_graph/serve/server.py +110 -0
- agentforge_graph/serve/tools.py +307 -0
- agentforge_graph/store/__init__.py +32 -0
- agentforge_graph/store/_rowmap.py +102 -0
- agentforge_graph/store/errors.py +22 -0
- agentforge_graph/store/facade.py +89 -0
- agentforge_graph/store/kuzu_store.py +380 -0
- agentforge_graph/store/lance_store.py +146 -0
- agentforge_graph/store/neo4j_store.py +294 -0
- agentforge_graph/store/pgvector_store.py +170 -0
- agentforge_graph/store/registry.py +45 -0
- agentforge_graph/temporal/__init__.py +36 -0
- agentforge_graph/temporal/backfill.py +338 -0
- agentforge_graph/temporal/events.py +82 -0
- agentforge_graph/temporal/index.py +190 -0
- agentforge_graph/temporal/mining.py +190 -0
- agentforge_graph/temporal/recorder.py +114 -0
- agentforge_graph/temporal/store.py +282 -0
- agentforge_graph-0.3.2.dist-info/METADATA +291 -0
- agentforge_graph-0.3.2.dist-info/RECORD +151 -0
- agentforge_graph-0.3.2.dist-info/WHEEL +4 -0
- agentforge_graph-0.3.2.dist-info/entry_points.txt +3 -0
- agentforge_graph-0.3.2.dist-info/licenses/LICENSE +202 -0
- agentforge_graph-0.3.2.dist-info/licenses/NOTICE +14 -0
|
@@ -0,0 +1,251 @@
|
|
|
1
|
+
"""Reranking hook (feat-006 seam; ENH-009).
|
|
2
|
+
|
|
3
|
+
Three modes, all behind ``retrieve.rerank``:
|
|
4
|
+
|
|
5
|
+
- ``off`` (default) — identity; pure cosine + graph score.
|
|
6
|
+
- ``lexical`` — a deterministic, dependency-free blend of the base score with the
|
|
7
|
+
**subtoken overlap** between the query and the candidate's name + code, so a
|
|
8
|
+
chunk whose symbol the query names (``ZodObject``, ``_parse``, ``res.send``)
|
|
9
|
+
sorts up even when its raw cosine landed *near* the answer. Useful for
|
|
10
|
+
keyword/symbol-naming queries; measured mixed on prose (hence opt-in).
|
|
11
|
+
- ``cross_encoder`` — a real semantic re-score: a cross-encoder relevance model
|
|
12
|
+
(``sentence-transformers``, the ``rerank`` extra) scores each (query,
|
|
13
|
+
candidate) pair, blended with the base score. The highest-leverage lever for
|
|
14
|
+
natural-language → symbol precision. The model is lazy-loaded so the base
|
|
15
|
+
install / CI never import torch; the blend logic is injectable (``CrossScorer``)
|
|
16
|
+
so it is tested without the model. Third-party only — no ``agentforge`` import
|
|
17
|
+
(ADR-0001).
|
|
18
|
+
|
|
19
|
+
All rerankers are deterministic given their inputs; the re-sorting ones break score ties by item id."""
|
|
20
|
+
|
|
21
|
+
from __future__ import annotations
|
|
22
|
+
|
|
23
|
+
import asyncio
|
|
24
|
+
import math
|
|
25
|
+
import re
|
|
26
|
+
from typing import Protocol
|
|
27
|
+
|
|
28
|
+
from .pack import ContextItem
|
|
29
|
+
|
|
30
|
+
# A capable, small default cross-encoder; overridable via ``retrieve.rerank_model``.
|
|
31
|
+
DEFAULT_CROSS_ENCODER = "cross-encoder/ms-marco-MiniLM-L-6-v2"
|
|
32
|
+
_MAX_CANDIDATE_CHARS = 2000 # cross-encoders truncate anyway; bound the payload
|
|
33
|
+
|
|
34
|
+
# Function words that carry no retrieval signal — dropped from both sides.
|
|
35
|
+
_STOP = frozenset(
|
|
36
|
+
[
|
|
37
|
+
"a",
|
|
38
|
+
"an",
|
|
39
|
+
"the",
|
|
40
|
+
"is",
|
|
41
|
+
"are",
|
|
42
|
+
"was",
|
|
43
|
+
"were",
|
|
44
|
+
"be",
|
|
45
|
+
"how",
|
|
46
|
+
"do",
|
|
47
|
+
"does",
|
|
48
|
+
"did",
|
|
49
|
+
"of",
|
|
50
|
+
"to",
|
|
51
|
+
"in",
|
|
52
|
+
"on",
|
|
53
|
+
"for",
|
|
54
|
+
"and",
|
|
55
|
+
"or",
|
|
56
|
+
"with",
|
|
57
|
+
"this",
|
|
58
|
+
"that",
|
|
59
|
+
"it",
|
|
60
|
+
"as",
|
|
61
|
+
"at",
|
|
62
|
+
"by",
|
|
63
|
+
"from",
|
|
64
|
+
"into",
|
|
65
|
+
"than",
|
|
66
|
+
"then",
|
|
67
|
+
"over",
|
|
68
|
+
"under",
|
|
69
|
+
"not",
|
|
70
|
+
"no",
|
|
71
|
+
"your",
|
|
72
|
+
"you",
|
|
73
|
+
"we",
|
|
74
|
+
"i",
|
|
75
|
+
"me",
|
|
76
|
+
"my",
|
|
77
|
+
"our",
|
|
78
|
+
"their",
|
|
79
|
+
"its",
|
|
80
|
+
"they",
|
|
81
|
+
"them",
|
|
82
|
+
"he",
|
|
83
|
+
"she",
|
|
84
|
+
"where",
|
|
85
|
+
"what",
|
|
86
|
+
"which",
|
|
87
|
+
"who",
|
|
88
|
+
"when",
|
|
89
|
+
"why",
|
|
90
|
+
"can",
|
|
91
|
+
"could",
|
|
92
|
+
"should",
|
|
93
|
+
"would",
|
|
94
|
+
"will",
|
|
95
|
+
"shall",
|
|
96
|
+
"may",
|
|
97
|
+
"might",
|
|
98
|
+
"must",
|
|
99
|
+
"have",
|
|
100
|
+
"has",
|
|
101
|
+
"had",
|
|
102
|
+
"get",
|
|
103
|
+
"set",
|
|
104
|
+
"up",
|
|
105
|
+
"out",
|
|
106
|
+
"off",
|
|
107
|
+
"via",
|
|
108
|
+
"per",
|
|
109
|
+
"use",
|
|
110
|
+
"used",
|
|
111
|
+
"using",
|
|
112
|
+
"return",
|
|
113
|
+
"returns",
|
|
114
|
+
]
|
|
115
|
+
)
|
|
116
|
+
# Split identifiers into subtokens: ALLCAPS, CamelChunk, or a run of lower/digits.
|
|
117
|
+
_CAMEL = re.compile(r"[A-Z]+(?![a-z])|[A-Z][a-z0-9]*|[a-z0-9]+")
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def _tokens(text: str) -> set[str]:
    """Return the lowercased subtokens of ``text`` used for lexical matching.

    The text is first cut on every run of non-alphanumeric characters, then
    each piece is cut again on camelCase boundaries (``_CAMEL``), so
    ``ZodObject._parse`` yields ``{zod, object, parse}``. Subtokens shorter
    than two characters and stopwords (``_STOP``) are discarded.
    """
    pieces = (
        part.lower()
        for word in re.split(r"[^A-Za-z0-9]+", text)
        for part in _CAMEL.findall(word)
    )
    return {piece for piece in pieces if len(piece) > 1 and piece not in _STOP}
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
class Reranker(Protocol):
    """Structural interface shared by every reranker in this module.

    An implementation exposes one coroutine, ``rerank``, that receives the
    query text plus the candidate items and returns a list of items.
    """

    async def rerank(
        self,
        query: str,
        items: list[ContextItem],
    ) -> list[ContextItem]:
        """Return the candidates for ``query`` in their final order."""
        ...
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
class NoopReranker:
    """Pass-through reranker (``rerank: off``)."""

    async def rerank(
        self,
        query: str,
        items: list[ContextItem],
    ) -> list[ContextItem]:
        """Hand back ``items`` untouched (the same list object); ``query`` is ignored."""
        return items
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
class LexicalReranker:
    """Re-rank by mixing each item's base score with lexical overlap.

    The blended score is ``(1 - weight)·base + weight·overlap``, where
    ``overlap`` is the share of the query's subtokens (as produced by
    ``_tokens``, so stopwords are already excluded) that also occur in the
    candidate's name + code. The result is sorted by descending score with the
    item id as tiebreak, which makes the output deterministic."""

    def __init__(self, weight: float = 0.5) -> None:
        # Clamp the blend weight into [0, 1].
        capped = min(1.0, weight)
        self._w = max(0.0, capped)

    async def rerank(self, query: str, items: list[ContextItem]) -> list[ContextItem]:
        """Return re-scored copies of ``items`` ordered by the blended score.

        When the query has no usable subtokens, or there are no items, the
        input list is returned as-is. Each returned item is a ``model_copy``
        carrying the new score and an extra ``lexical <overlap>`` why-entry.
        """
        query_tokens = _tokens(query)
        if not (query_tokens and items):
            return items

        total = len(query_tokens)
        base_share = 1.0 - self._w

        def blend(item: ContextItem) -> ContextItem:
            # Fraction of query subtokens found in this candidate's name + code.
            item_tokens = _tokens(f"{item.name} {item.code or ''}")
            overlap = len(query_tokens & item_tokens) / total
            final = base_share * item.score + self._w * overlap
            return item.model_copy(
                update={"score": final, "why": [*item.why, f"lexical {overlap:.2f}"]}
            )

        # Highest score first; the id tiebreak keeps the order deterministic.
        return sorted(
            (blend(item) for item in items),
            key=lambda entry: (-entry.score, entry.id),
        )
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
class CrossScorer(Protocol):
    """Pairwise relevance scorer: rates candidate texts against a query, a
    larger number meaning more relevant. This is the injection seam that keeps
    the model out of the blend logic (and out of CI)."""

    def score(
        self,
        query: str,
        texts: list[str],
    ) -> list[float]:
        """Return the relevance scores of ``texts`` for ``query``."""
        ...
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
def _candidate_text(it: ContextItem) -> str:
    """Build the text that represents one candidate for the cross-encoder.

    The item's name, a newline, then its code — or ``it.signature()`` when the
    code is empty/``None`` — truncated to ``_MAX_CANDIDATE_CHARS`` characters.
    """
    body = it.code if it.code else it.signature()
    text = f"{it.name}\n{body}"
    return text[:_MAX_CANDIDATE_CHARS]
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
class CrossEncoderReranker:
    """Re-score the candidates with a cross-encoder, then blend and re-sort.

    ``final = (1 - weight)·base + weight·σ(cross_score)`` — the scorer's raw
    relevance value is squashed to ``[0, 1]`` (so it is comparable to the
    cosine-scale base score) and blended. The scorer call runs off the event
    loop (``asyncio.to_thread``); the reranker is the identity on an empty
    query or an empty candidate list."""

    def __init__(self, scorer: CrossScorer, weight: float = 0.5) -> None:
        self._scorer = scorer
        self._w = max(0.0, min(1.0, weight))  # clamp the blend weight to [0, 1]

    @staticmethod
    def _sigmoid(logit: float) -> float:
        """Logistic function ``1 / (1 + e^-x)`` that cannot overflow.

        ``math.exp`` raises ``OverflowError`` once its argument exceeds roughly
        709.78, i.e. for a strongly negative ``logit``. The mathematical limit
        there is 0, so that value is returned instead of letting one extreme
        score abort the whole rerank. All other inputs give exactly the value
        of the plain formula.
        """
        try:
            return 1.0 / (1.0 + math.exp(-logit))
        except OverflowError:
            return 0.0

    async def rerank(self, query: str, items: list[ContextItem]) -> list[ContextItem]:
        """Return re-scored copies of ``items`` ordered by the blended score.

        Each returned item is a ``model_copy`` carrying the new score and an
        extra ``cross-encoder <σ>`` why-entry; ties are broken by item id.

        Raises:
            ValueError: if the scorer returns a different number of scores
                than there are items (``zip(..., strict=True)``).
        """
        if not query or not items:
            return items
        texts = [_candidate_text(it) for it in items]
        # The scorer is synchronous and may be slow: keep it off the event loop.
        raw = await asyncio.to_thread(self._scorer.score, query, texts)
        rescored: list[ContextItem] = []
        for it, r in zip(items, raw, strict=True):
            ce = self._sigmoid(r)  # σ → [0, 1]
            final = (1.0 - self._w) * it.score + self._w * ce
            rescored.append(
                it.model_copy(update={"score": final, "why": [*it.why, f"cross-encoder {ce:.2f}"]})
            )
        rescored.sort(key=lambda i: (-i.score, i.id))  # id tiebreak = deterministic
        return rescored
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
class SentenceTransformerScorer:
    """``CrossScorer`` implementation on top of ``sentence_transformers.CrossEncoder``.

    Nothing is imported or loaded until the first scoring call, so importing
    this module does not import ``sentence_transformers``. When the package is
    missing, the raised ``ImportError`` names the extra to install."""

    def __init__(self, model_name: str = DEFAULT_CROSS_ENCODER) -> None:
        self._model: object | None = None  # created on first use
        self._model_name = model_name

    def _ensure_model(self) -> object:
        """Return the cross-encoder, constructing and caching it on first call."""
        if self._model is not None:
            return self._model
        try:
            from sentence_transformers import CrossEncoder
        except ImportError as exc:  # the extra isn't installed
            raise ImportError(
                "cross-encoder rerank needs the 'rerank' extra (uv sync --extra rerank)"
            ) from exc
        self._model = CrossEncoder(self._model_name)
        return self._model

    def score(self, query: str, texts: list[str]) -> list[float]:
        """Score every ``(query, text)`` pair; an empty ``texts`` never touches the model."""
        if not texts:
            return []
        encoder = self._ensure_model()
        pairs = [(query, text) for text in texts]
        predictions = encoder.predict(pairs)  # type: ignore[attr-defined]
        return [float(value) for value in predictions]
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
def reranker_from_config(rerank: str, weight: float = 0.5, model: str = "") -> Reranker:
    """Resolve the ``retrieve.rerank`` config value to a reranker.

    ``off``/empty → identity; ``lexical`` → :class:`LexicalReranker`;
    ``cross_encoder`` → :class:`CrossEncoderReranker` over a lazily-loaded
    sentence-transformers model (``model`` overrides the default).

    The mode is matched after stripping whitespace, ignoring case and treating
    ``-`` as ``_``, so hand-written values such as ``"Lexical"`` or
    ``"cross-encoder"`` resolve instead of being rejected.

    Raises:
        ValueError: if the value names no known reranker; the message shows
            the value as given (whitespace-stripped).
    """
    shown = (rerank or "off").strip()
    ref = shown.lower().replace("-", "_")
    if ref in ("", "off"):
        return NoopReranker()
    if ref == "lexical":
        return LexicalReranker(weight)
    if ref == "cross_encoder":
        return CrossEncoderReranker(
            SentenceTransformerScorer(model or DEFAULT_CROSS_ENCODER), weight
        )
    raise ValueError(f"unknown reranker {shown!r}; use 'off', 'lexical' or 'cross_encoder'")
|
|
@@ -0,0 +1,286 @@
|
|
|
1
|
+
"""``Retriever`` — vector entry → typed graph expansion → provenance-weighted
|
|
2
|
+
merge. Deterministic and LLM-free; the single retrieval surface feat-008 and
|
|
3
|
+
the enrichers ride on.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
import re
|
|
9
|
+
from typing import Literal
|
|
10
|
+
|
|
11
|
+
from agentforge_graph.config import RetrieveConfig
|
|
12
|
+
from agentforge_graph.core import Direction, EdgeKind, Node, NodeKind, Source, SymbolID
|
|
13
|
+
from agentforge_graph.embed import Embedder
|
|
14
|
+
from agentforge_graph.store import Store
|
|
15
|
+
|
|
16
|
+
from .pack import ContextItem, ContextPack
|
|
17
|
+
from .rerank import NoopReranker, Reranker
|
|
18
|
+
from .scoring import dedupe_max, edge_weight, step_score
|
|
19
|
+
|
|
20
|
+
Mode = Literal["context", "impact", "definition", "similar"]
|
|
21
|
+
|
|
22
|
+
# code-symbol kinds an as_of allow-filter constrains (feat-009)
|
|
23
|
+
_SYMBOL_KINDS = frozenset({NodeKind.CLASS, NodeKind.FUNCTION, NodeKind.METHOD})
|
|
24
|
+
|
|
25
|
+
# A query "smells architectural" when it asks about decisions/rationale/design — the
|
|
26
|
+
# case where ADR/doc prose SHOULD rank with code. Else docs are down-weighted (feat-010).
|
|
27
|
+
_ARCH_TERMS = frozenset(
|
|
28
|
+
{
|
|
29
|
+
"why",
|
|
30
|
+
"decision",
|
|
31
|
+
"decisions",
|
|
32
|
+
"rationale",
|
|
33
|
+
"architecture",
|
|
34
|
+
"architectural",
|
|
35
|
+
"design",
|
|
36
|
+
"convention",
|
|
37
|
+
"conventions",
|
|
38
|
+
"adr",
|
|
39
|
+
"govern",
|
|
40
|
+
"governs",
|
|
41
|
+
"principle",
|
|
42
|
+
"tradeoff",
|
|
43
|
+
"trade-off",
|
|
44
|
+
"policy",
|
|
45
|
+
"constraint",
|
|
46
|
+
"supposed",
|
|
47
|
+
"allowed",
|
|
48
|
+
"forbidden",
|
|
49
|
+
"deprecated",
|
|
50
|
+
}
|
|
51
|
+
)
|
|
52
|
+
|
|
53
|
+
_WORD_RE = re.compile(r"[a-z0-9-]+")
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _is_architectural(query: str) -> bool:
    """Tell whether ``query`` contains at least one ``_ARCH_TERMS`` word.

    The query is lowercased and split into words with ``_WORD_RE`` before the
    membership check.
    """
    words = _WORD_RE.findall(query.lower())
    return not _ARCH_TERMS.isdisjoint(words)
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
# Trust order for the min_provenance filter: llm < parsed < resolved <= manual
|
|
61
|
+
# (human-asserted facts are trusted; ADR-0004 / spec §2). Distinct from
|
|
62
|
+
# GraphQuery.min_source and from the scoring edge_weights.
|
|
63
|
+
_RANK: dict[Source, int] = {Source.LLM: 0, Source.PARSED: 1, Source.RESOLVED: 2, Source.MANUAL: 3}
|
|
64
|
+
_FLOOR: dict[str, int] = {"parsed": 1, "resolved": 2}
|
|
65
|
+
|
|
66
|
+
# feat-009 churn/authorship fields denormalised onto a symbol's node.attrs; the
|
|
67
|
+
# Retriever surfaces them on the item without joining the temporal sidecar (it
|
|
68
|
+
# stays in the deterministic core, ADR-0001). Empty → item.temporal stays None.
|
|
69
|
+
_TEMPORAL_KEYS = (
|
|
70
|
+
"introduced",
|
|
71
|
+
"introduced_ts",
|
|
72
|
+
"last_changed",
|
|
73
|
+
"last_changed_ts",
|
|
74
|
+
"churn_30d",
|
|
75
|
+
"churn_90d",
|
|
76
|
+
"top_authors",
|
|
77
|
+
)
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def _temporal_attrs(node: Node) -> dict[str, object] | None:
    """Collect the ``_TEMPORAL_KEYS`` entries present in ``node.attrs``.

    Keys come out in ``_TEMPORAL_KEYS`` order; ``None`` is returned when the
    node carries none of them.
    """
    attrs = node.attrs
    found: dict[str, object] = {}
    for key in _TEMPORAL_KEYS:
        if key in attrs:
            found[key] = attrs[key]
    return found if found else None
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
_MODE_EDGES: dict[Mode, tuple[list[EdgeKind], Direction]] = {
|
|
86
|
+
# GOVERNS/DESCRIBES (feat-010) surface the decision/doc governing a retrieved
|
|
87
|
+
# symbol; TAGGED + SUMMARIZES (feat-012) surface its design-pattern role and
|
|
88
|
+
# the module summary. The differentiators; llm items obey include_llm_facts.
|
|
89
|
+
"context": (
|
|
90
|
+
[
|
|
91
|
+
EdgeKind.CALLS,
|
|
92
|
+
EdgeKind.CONTAINS,
|
|
93
|
+
EdgeKind.INHERITS,
|
|
94
|
+
EdgeKind.REFERENCES,
|
|
95
|
+
EdgeKind.GOVERNS,
|
|
96
|
+
EdgeKind.DESCRIBES,
|
|
97
|
+
EdgeKind.TAGGED,
|
|
98
|
+
EdgeKind.SUMMARIZES,
|
|
99
|
+
],
|
|
100
|
+
"both",
|
|
101
|
+
),
|
|
102
|
+
"impact": ([EdgeKind.CALLS, EdgeKind.IMPORTS, EdgeKind.IMPLEMENTS], "in"),
|
|
103
|
+
"definition": ([EdgeKind.CONTAINS, EdgeKind.CHUNK_OF], "both"),
|
|
104
|
+
"similar": ([], "both"),
|
|
105
|
+
}
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
class Retriever:
    """Vector entry → typed graph expansion → filtered, de-duplicated merge.

    Holds the store (vector index + graph), the query embedder, the retrieval
    config and an optional reranker (identity when none is supplied).
    """

    def __init__(
        self,
        store: Store,
        embedder: Embedder,
        config: RetrieveConfig,
        reranker: Reranker | None = None,
    ) -> None:
        self.store = store
        self.embedder = embedder
        self.config = config
        self.reranker: Reranker = reranker or NoopReranker()

    async def retrieve(
        self,
        query: str | None = None,
        symbol: str | None = None,
        mode: Mode = "context",
        k: int | None = None,
        depth: int | None = None,
        edge_kinds: list[EdgeKind] | None = None,
        min_provenance: Literal["parsed", "resolved"] | None = None,
        include_llm_facts: bool = True,
        allow_ids: set[str] | None = None,
    ) -> ContextPack:
        """Build a ``ContextPack`` for a text ``query`` and/or a ``symbol`` id.

        Steps: (1) *entry* — embed the query, take the top-``k`` vector hits
        and derive graph seeds from them; an explicit ``symbol`` is seeded with
        score 1.0; (2) *expand* — walk the graph from the seeds for up to
        ``depth`` hops (skipped in ``"similar"`` mode); (3) *merge* — apply the
        provenance / ``allow_ids`` filters, collapse duplicates and rerank.

        Args:
            query: free-text query; ``None`` skips the vector entry.
            symbol: node id to seed directly; ``None`` adds no explicit seed.
            mode: selects the default edge kinds and direction (``_MODE_EDGES``).
            k: number of vector hits; defaults to ``config.k``.
            depth: expansion hops; defaults to ``config.depth``.
            edge_kinds: overrides the mode's edge kinds when given.
            min_provenance: drops items whose provenance ranks below this floor.
            include_llm_facts: when false, drops items with ``Source.LLM`` provenance.
            allow_ids: when given, class/function/method items must have their
                id in this set; other kinds pass through.
        """
        cfg = self.config
        k = cfg.k if k is None else k
        depth = cfg.depth if depth is None else depth
        items: list[ContextItem] = []
        notes: list[str] = []
        seeds: dict[str, float] = {}  # node id → best entry score

        # --- entry ---
        if query is not None:
            qvec = (await self.embedder.embed([query], "query"))[0]
            # down-weight ADR/doc prose so code outranks equally-similar docs, unless
            # the query smells architectural (then docs keep their full score). feat-010.
            doc_w = 1.0 if _is_architectural(query) else cfg.doc_weight
            for hit in await self.store.vectors.search(qvec, k):
                node = await self.store.graph.get(hit.ref)
                if node is None:
                    # the vector index references a node the graph does not have
                    continue
                score = hit.score * doc_w if node.kind is NodeKind.DOC_CHUNK else hit.score
                items.append(self._item(node, score, [f"vector hit {score:.2f}"]))
                if mode != "similar":
                    # a chunk hit seeds its symbols; a summary hit (feat-012)
                    # seeds the code it summarizes — concept query → code.
                    for edge in await self.store.graph.adjacent(
                        hit.ref, [EdgeKind.CHUNK_OF, EdgeKind.SUMMARIZES], "out"
                    ):
                        seeds[edge.dst] = max(seeds.get(edge.dst, 0.0), score)
                    # a doc-chunk hit (feat-010) seeds the code it attaches to: an
                    # ADR section seeds its containing Decision (CONTAINS in → then
                    # GOVERNS to governed code); a docstring seeds the symbol it
                    # DESCRIBES (out). Either way a prose query reaches the code.
                    if node.kind is NodeKind.DOC_CHUNK:
                        for edge in await self.store.graph.adjacent(
                            hit.ref, [EdgeKind.CONTAINS], "in"
                        ):
                            seeds[edge.src] = max(seeds.get(edge.src, 0.0), score)
                        for edge in await self.store.graph.adjacent(
                            hit.ref, [EdgeKind.DESCRIBES], "out"
                        ):
                            seeds[edge.dst] = max(seeds.get(edge.dst, 0.0), score)
        if symbol is not None:
            seeds[symbol] = max(seeds.get(symbol, 0.0), 1.0)

        # Iterate over a snapshot: seeds whose node is missing are deleted in the loop.
        for sid, score in list(seeds.items()):
            node = await self.store.graph.get(sid)
            if node is None:
                del seeds[sid]
                continue
            items.append(self._item(node, score, ["entry"]))

        # --- expand ---
        kinds, direction = _MODE_EDGES[mode]
        if edge_kinds is not None:
            kinds = edge_kinds
        if mode != "similar" and depth > 0 and seeds:
            await self._expand(seeds, kinds, direction, depth, items, notes)

        # --- merge ---
        items = self._filter(items, min_provenance, include_llm_facts)
        if allow_ids is not None:  # feat-009 as_of: drop symbols not alive at the commit
            items = [it for it in items if it.kind not in _SYMBOL_KINDS or it.id in allow_ids]
        items = dedupe_max(items)
        items = await self.reranker.rerank(query or "", items)
        return ContextPack(query=query, symbol=symbol, mode=mode, items=items, notes=notes)

    async def _expand(
        self,
        seeds: dict[str, float],
        kinds: list[EdgeKind],
        direction: Direction,
        depth: int,
        items: list[ContextItem],
        notes: list[str],
    ) -> None:
        """Hop-by-hop expansion from ``seeds``; appends to ``items`` and ``notes``.

        For every frontier node the adjacent edges of the requested kinds and
        direction are fetched (truncated to ``config.fanout_cap``, with a note
        when that happens). Each neighbour that exists in the graph becomes an
        item scored ``parent × decay × edge weight``. A neighbour not visited
        in an earlier hop joins the next frontier with the best score over all
        paths that reach it in the current hop.
        """
        cfg = self.config
        frontier = dict(seeds)
        visited = set(seeds)
        for hop in range(1, depth + 1):
            nxt: dict[str, float] = {}
            for sid, score in frontier.items():
                parent = await self.store.graph.get(sid)
                pname = parent.name if parent else sid
                edges = await self.store.graph.adjacent(sid, kinds, direction)
                if len(edges) > cfg.fanout_cap:
                    notes.append(f"fan-out cap {cfg.fanout_cap} at {pname} ({len(edges)} edges)")
                    edges = edges[: cfg.fanout_cap]
                for edge in edges:
                    # the endpoint that is not the node being expanded
                    other = edge.dst if edge.src == sid else edge.src
                    onode = await self.store.graph.get(other)
                    if onode is None:
                        continue
                    oscore = step_score(
                        score, cfg.decay, edge_weight(cfg.edge_weights, edge.provenance.source)
                    )
                    items.append(
                        self._item(onode, oscore, [f"{edge.kind.value} of {pname} (hop {hop})"])
                    )
                    if other not in visited:
                        nxt[other] = max(nxt.get(other, 0.0), oscore)
            # Mark the new nodes only once the hop is complete. Marking them on
            # first sight (as before) made the ``max`` above a no-op: a node
            # reached by several paths in one hop kept whichever score came
            # first, so deeper scores depended on edge order.
            visited.update(nxt)
            frontier = nxt
            if not frontier:
                break

    def _item(self, node: Node, score: float, why: list[str]) -> ContextItem:
        """Wrap ``node`` as a ``ContextItem`` with the given score and why-trace.

        ``path`` is taken from the parsed node id; ``why`` is copied so the
        caller's list is not shared with the item.
        """
        return ContextItem(
            id=node.id,
            kind=node.kind,
            name=node.name,
            score=score,
            path=SymbolID.parse(node.id).path,
            span=node.span,
            code=self._render_code(node),
            provenance=node.provenance.source,
            why=list(why),
            temporal=_temporal_attrs(node),
        )

    @staticmethod
    def _render_code(node: Node) -> str | None:
        """The verbatim block a retrieved item renders. A Decision (feat-010)
        shows its status/date inline so the agent sees governance at a glance.

        Pattern tags, summaries and doc chunks get a bracketed prefix
        (``[llm]``, ``[summary]``, ``[doc]``); any other kind returns
        ``node.attrs.get("code")``, which is ``None`` when the key is absent.
        """
        if node.kind is NodeKind.DECISION:
            status = node.attrs.get("status", "")
            date = node.attrs.get("date", "")
            adr = node.attrs.get("adr_id", "")
            stamp = ", ".join(x for x in (status, date) if x)
            prefix = f"[{stamp}] " if stamp else ""
            label = f"{adr}: " if adr else ""
            return f"{prefix}{label}{node.attrs.get('title', node.name)}"
        if node.kind is NodeKind.PATTERN_TAG:
            return f"[llm] design pattern: {node.name}"
        if node.kind is NodeKind.SUMMARY:
            return f"[summary] {node.attrs.get('text', '')}"
        if node.kind is NodeKind.DOC_CHUNK:  # feat-010 — ADR/doc prose
            heading = node.attrs.get("heading", "")
            text = node.attrs.get("text", "")
            return f"[doc] {heading}\n{text}".strip() if heading else f"[doc] {text}".strip()
        return node.attrs.get("code")

    def _filter(
        self,
        items: list[ContextItem],
        min_provenance: Literal["parsed", "resolved"] | None,
        include_llm_facts: bool,
    ) -> list[ContextItem]:
        """Return the items that pass the provenance filters, in input order.

        Drops ``Source.LLM`` items when ``include_llm_facts`` is false, and
        items whose ``_RANK`` is below the ``_FLOOR`` of ``min_provenance``
        when one is given.
        """
        floor = _FLOOR[min_provenance] if min_provenance else None
        out: list[ContextItem] = []
        for it in items:
            if not include_llm_facts and it.provenance is Source.LLM:
                continue
            if floor is not None and _RANK[it.provenance] < floor:
                continue
            out.append(it)
        return out
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
"""Scoring math for retrieval: provenance edge weights, per-hop decay, and
|
|
2
|
+
dedupe (max score wins, why-traces unioned)."""
|
|
3
|
+
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
6
|
+
from agentforge_graph.core import Source
|
|
7
|
+
|
|
8
|
+
from .pack import ContextItem
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def edge_weight(weights: dict[str, float], source: Source) -> float:
    """Look up the expansion weight configured for an edge's provenance.

    ``weights`` is keyed by ``source.value``; a provenance without an entry
    gets the neutral fallback 0.5 (ADR-0004: resolved > parsed > llm).
    """
    key = source.value
    return weights.get(key, 0.5)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def step_score(parent_score: float, decay: float, weight: float) -> float:
    """Score of a node one hop away from its parent.

    Computed as ``parent × decay × edge_weight``; applying it hop after hop
    produces the ``decay^hop`` falloff.
    """
    decayed = parent_score * decay
    return decayed * weight
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def dedupe_max(items: list[ContextItem]) -> list[ContextItem]:
    """Merge items that share an id and order the result by score.

    For each id the highest-scoring item is kept (the earliest one on equal
    scores) and receives the union of all why-entries seen for that id, in
    first-seen order without repeats. The merged items are returned sorted by
    score, highest first.
    """
    top: dict[str, ContextItem] = {}
    traces: dict[str, list[str]] = {}
    for item in items:
        key = item.id
        trace = traces.setdefault(key, [])
        for reason in item.why:
            if reason not in trace:
                trace.append(reason)
        current = top.get(key)
        if current is None or item.score > current.score:
            top[key] = item

    merged = [winner.model_copy(update={"why": traces[key]}) for key, winner in top.items()]
    merged.sort(key=lambda entry: entry.score, reverse=True)
    return merged
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
"""agentforge_graph.serve — MCP server & AgentForge tool API (feat-008).
|
|
2
|
+
|
|
3
|
+
The framework-facing serving layer: the nine read-only tools over feat-006/007,
|
|
4
|
+
bound both as native AgentForge ``Tool`` instances (``code_graph_tools``) and an
|
|
5
|
+
MCP server (``serve_mcp``) over **stdio or streamable-HTTP**, from one definition.
|
|
6
|
+
This package imports ``agentforge`` (the deliberate ADR-0001 exception).
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from .engine import TOOL_API_VERSION
|
|
12
|
+
from .server import build_mcp_server, code_graph_tools, serve_mcp
|
|
13
|
+
|
|
14
|
+
__all__ = [
|
|
15
|
+
"TOOL_API_VERSION",
|
|
16
|
+
"code_graph_tools",
|
|
17
|
+
"serve_mcp",
|
|
18
|
+
"build_mcp_server",
|
|
19
|
+
]
|