apexgraph 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- apexgraph-0.1.0.dist-info/METADATA +175 -0
- apexgraph-0.1.0.dist-info/RECORD +32 -0
- apexgraph-0.1.0.dist-info/WHEEL +4 -0
- apexgraph-0.1.0.dist-info/entry_points.txt +2 -0
- apexgraph-0.1.0.dist-info/licenses/LICENSE +21 -0
- graphex/__init__.py +8 -0
- graphex/audit.py +111 -0
- graphex/benchmark.py +297 -0
- graphex/budget.py +336 -0
- graphex/cache.py +116 -0
- graphex/cli.py +675 -0
- graphex/diff.py +176 -0
- graphex/exporter.py +119 -0
- graphex/formatter.py +287 -0
- graphex/ignore.py +87 -0
- graphex/indexer/__init__.py +6 -0
- graphex/indexer/go.py +139 -0
- graphex/indexer/project.py +266 -0
- graphex/indexer/python.py +190 -0
- graphex/indexer/typescript.py +267 -0
- graphex/injector.py +207 -0
- graphex/loader.py +670 -0
- graphex/mcp.py +463 -0
- graphex/models.py +290 -0
- graphex/retrieval/__init__.py +6 -0
- graphex/retrieval/base.py +31 -0
- graphex/retrieval/bm25.py +223 -0
- graphex/retrieval/dense.py +81 -0
- graphex/retrieval/fusion.py +71 -0
- graphex/retrieval/ppr.py +225 -0
- graphex/scorer.py +113 -0
- graphex/viz.py +321 -0
|
@@ -0,0 +1,175 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: apexgraph
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Apex-relevance subgraph retrieval for AI agents. Feed your LLM the peak of your knowledge graph, sized to a token budget.
|
|
5
|
+
Project-URL: Homepage, https://github.com/alfonsomayoral/graphex
|
|
6
|
+
Project-URL: Repository, https://github.com/alfonsomayoral/graphex
|
|
7
|
+
Project-URL: Issues, https://github.com/alfonsomayoral/graphex/issues
|
|
8
|
+
Author-email: Alfonso Mayoral <alfonsomayoral29@gmail.com>
|
|
9
|
+
License: MIT
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Keywords: ai-tools,bm25,cli,graphify,knowledge-graph,llm,mcp,pagerank,rag,token-budget
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
17
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
18
|
+
Classifier: Topic :: Software Development :: Documentation
|
|
19
|
+
Requires-Python: >=3.12
|
|
20
|
+
Requires-Dist: click>=8.1
|
|
21
|
+
Requires-Dist: networkx>=3.2
|
|
22
|
+
Requires-Dist: pathspec>=0.12
|
|
23
|
+
Requires-Dist: rich>=13.7
|
|
24
|
+
Requires-Dist: tiktoken>=0.7
|
|
25
|
+
Provides-Extra: dense
|
|
26
|
+
Requires-Dist: anthropic>=0.34; extra == 'dense'
|
|
27
|
+
Requires-Dist: openai>=1.40; extra == 'dense'
|
|
28
|
+
Provides-Extra: ts
|
|
29
|
+
Requires-Dist: tree-sitter-typescript>=0.23; extra == 'ts'
|
|
30
|
+
Requires-Dist: tree-sitter>=0.22; extra == 'ts'
|
|
31
|
+
Provides-Extra: viz
|
|
32
|
+
Requires-Dist: watchdog>=4.0; extra == 'viz'
|
|
33
|
+
Description-Content-Type: text/markdown
|
|
34
|
+
|
|
35
|
+
<div align="center">
|
|
36
|
+
|
|
37
|
+
# Graphex
|
|
38
|
+
|
|
39
|
+
**Apex-relevance subgraph retrieval for AI agents.**
|
|
40
|
+
|
|
41
|
+
Feed your LLM the *peak* of your knowledge graph — sized to a token budget.
|
|
42
|
+
|
|
43
|
+
[![CI](https://github.com/alfonsomayoral/graphex/actions/workflows/ci.yml/badge.svg)](https://github.com/alfonsomayoral/graphex/actions/workflows/ci.yml)
|
|
44
|
+

|
|
45
|
+

|
|
46
|
+
|
|
47
|
+
</div>
|
|
48
|
+
|
|
49
|
+
---
|
|
50
|
+
|
|
51
|
+
Knowledge graphs grow large. When an agent needs context about one corner of a
|
|
52
|
+
codebase, dumping the whole graph into the prompt wastes tokens and money — and
|
|
53
|
+
buries the relevant nodes in noise. **Graphex scores every node against your
|
|
54
|
+
query and returns the most relevant, connected subgraph that fits within a token
|
|
55
|
+
budget**, ready to paste into a prompt or serve over MCP.
|
|
56
|
+
|
|
57
|
+
```bash
|
|
58
|
+
graphex index . # build a graph from your code (no LLM)
|
|
59
|
+
graphex "how does auth work" --budget 4000 # retrieve the apex subgraph
|
|
60
|
+
graphex serve # expose it to agents over MCP
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
Graphex reads the graphs produced by **graphify** and uses the rich signals
|
|
64
|
+
graphify emits — edge weights, confidence, hyperedges, communities, and god
|
|
65
|
+
nodes — that simpler tools throw away.
|
|
66
|
+
|
|
67
|
+
## Install
|
|
68
|
+
|
|
69
|
+
```bash
|
|
70
|
+
uv tool install apexgraph # or: pipx install apexgraph
|
|
71
|
+
# optional extras:
|
|
72
|
+
uv tool install "apexgraph[ts]" # better TypeScript indexing (tree-sitter)
|
|
73
|
+
uv tool install "apexgraph[dense]" # OpenAI/Anthropic embedding backend
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
The PyPI distribution is `apexgraph`; the command and import name are `graphex`.
|
|
77
|
+
Requires Python 3.12+.
|
|
78
|
+
|
|
79
|
+
## How it works
|
|
80
|
+
|
|
81
|
+
A five-stage pipeline, each stage a single-responsibility module:
|
|
82
|
+
|
|
83
|
+
```
|
|
84
|
+
load ─▶ score ─▶ select ─▶ inject ─▶ render
|
|
85
|
+
│ │ │ │ │
|
|
86
|
+
multi- BM25 → cost-aware source- markdown /
|
|
87
|
+
format PPR + MMR under code json / yaml
|
|
88
|
+
loader prior budget bodies
|
|
89
|
+
▲
|
|
90
|
+
index ───────────────┘ build a graph straight from code (no graphify)
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
**Relevance is one principled number, not a hand-tuned mix.** BM25 finds the
|
|
94
|
+
nodes the query is literally about; those seed a **Personalized PageRank** walk
|
|
95
|
+
that spreads relevance across the weighted graph (edge `weight × confidence`,
|
|
96
|
+
plus hyperedge cliques); a light importance/god-node prior nudges genuinely
|
|
97
|
+
central entities up. The query-independent half — global PageRank, the BM25
|
|
98
|
+
inverted index — is precomputed once and cached, invalidated by content hash, so
|
|
99
|
+
a query is just a lookup plus one walk.
|
|
100
|
+
|
|
101
|
+
**Selection is a budgeted knapsack, solved as one.** Picking the highest-value
|
|
102
|
+
set of nodes under a token ceiling is the 0/1 knapsack problem. Graphex selects
|
|
103
|
+
by *marginal value per token* and shapes the result with two terms — an MMR
|
|
104
|
+
penalty so it doesn't say the same thing twice, and a connectivity bonus so the
|
|
105
|
+
result is a coherent connected subgraph, not a bag of redundant islands. An exact
|
|
106
|
+
DP-knapsack mode is available for benchmarking the value ceiling.
|
|
107
|
+
|
|
108
|
+
**Token accounting is honest.** A node's cost is the size of its *final rendered
|
|
109
|
+
form*, including any injected source code — so `tokens_used` never lies and the
|
|
110
|
+
output never overflows the budget you asked for.
|
|
111
|
+
|
|
112
|
+
## Usage
|
|
113
|
+
|
|
114
|
+
```bash
|
|
115
|
+
# Index a project into a graphify-compatible graph.json (Python / TS / Go)
|
|
116
|
+
graphex index ./src -o graph.json
|
|
117
|
+
graphex index ./src --incremental # re-index only changed files
|
|
118
|
+
|
|
119
|
+
# Query (any unrecognised first arg routes here)
|
|
120
|
+
graphex "session token validation" -b 2000
|
|
121
|
+
graphex "auth flow" --explain # per-node BM25 / PPR / prior breakdown
|
|
122
|
+
graphex "auth flow" --inject-code # include real function bodies, still in budget
|
|
123
|
+
graphex "auth flow" --viz # interactive force-directed HTML
|
|
124
|
+
|
|
125
|
+
# Inspect (node ids come from your indexed graph; these match examples/)
|
|
126
|
+
graphex stats -g examples/sample_graph.json
|
|
127
|
+
graphex explain auth_service_login -g examples/sample_graph.json
|
|
128
|
+
graphex path auth_service auth_service_login -g examples/sample_graph.json
|
|
129
|
+
|
|
130
|
+
# Export a context block to paste into a system prompt / CLAUDE.md
|
|
131
|
+
graphex export "auth flow" -f claudemd -o CONTEXT.md
|
|
132
|
+
|
|
133
|
+
# Measure quality honestly (recall@budget, not just tokens saved)
|
|
134
|
+
graphex benchmark -q "auth flow" -q "db pooling" -b 1000 -b 4000
|
|
135
|
+
|
|
136
|
+
# Compare two graph versions and see the change impact
|
|
137
|
+
graphex diff old.json new.json --budget 2000
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
See [`examples/`](examples/) for a full walkthrough on a sample project.
|
|
141
|
+
|
|
142
|
+
## MCP server
|
|
143
|
+
|
|
144
|
+
Graphex speaks the Model Context Protocol over stdio (stdlib only, no SDK):
|
|
145
|
+
|
|
146
|
+
```bash
|
|
147
|
+
graphex serve --graph graph.json
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
It exposes four tools: `graphex_query`, `graphex_explain`, `graphex_path`,
|
|
151
|
+
`graphex_stats`. Register it with Claude Code:
|
|
152
|
+
|
|
153
|
+
```bash
|
|
154
|
+
claude mcp add graphex -- graphex serve --graph /abs/path/to/graph.json
|
|
155
|
+
```
|
|
156
|
+
|
|
157
|
+
## Honest benchmarking
|
|
158
|
+
|
|
159
|
+
"Tokens saved" is a vanity metric — a tool that returns nothing saves 100%.
|
|
160
|
+
Graphex reports **recall@budget** alongside it: how much of the relevant set the
|
|
161
|
+
budgeted subgraph actually captures. High savings with low recall means
|
|
162
|
+
under-retrieval, and the benchmark makes that trade-off visible.
|
|
163
|
+
|
|
164
|
+
## Development
|
|
165
|
+
|
|
166
|
+
```bash
|
|
167
|
+
uv sync
|
|
168
|
+
uv run pytest # test suite
|
|
169
|
+
uv run ruff check . # lint
|
|
170
|
+
uv run black . # format
|
|
171
|
+
```
|
|
172
|
+
|
|
173
|
+
## License
|
|
174
|
+
|
|
175
|
+
MIT © Alfonso Mayoral
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
graphex/__init__.py,sha256=mjenbSPSekR-ZERd5ZCgcEt1PAdsNDEDHhZmLRNOzFk,304
|
|
2
|
+
graphex/audit.py,sha256=-JtFuA5Lckh2ePdVgff4MC85rZiEjQpO9mk3SKwxyYU,3696
|
|
3
|
+
graphex/benchmark.py,sha256=1SSAtJ4VjfCCOZn4H142aUfA-9CQR_1Jh02UshKg3dk,10374
|
|
4
|
+
graphex/budget.py,sha256=DNYgM3nkBY0zmxvRUj9wb-1-Jn1M6LwU51m4N2UmWpE,12281
|
|
5
|
+
graphex/cache.py,sha256=Y2krFu3zqYuzVPWZWgWHuODAe_ZOsgeAfS9qsm-X3KM,3723
|
|
6
|
+
graphex/cli.py,sha256=6qbNO00ip4UQcwiJD0F9MfokA8y8zrbq8R0b97KVrk4,23493
|
|
7
|
+
graphex/diff.py,sha256=EUo8aq2KDxPcZvKTqDCyVyDnUmdA1glBOgQvuiBVuhI,6565
|
|
8
|
+
graphex/exporter.py,sha256=WwrVJ7MeOhD0zfC7dOviChKPjOPgHKzTIJIFCZPiQfw,4011
|
|
9
|
+
graphex/formatter.py,sha256=cB_TiCiZir4UqzCBU2EJcavvSzrboZUlIsKMKRE-kc8,10232
|
|
10
|
+
graphex/ignore.py,sha256=b22Jdj5xdc7UCrQNKW2K_omHJ5z1Vg1AZd1HPP7B5FU,3294
|
|
11
|
+
graphex/injector.py,sha256=vXKBQwtLUOFbjZcVCbzYWGU1pcsWIkvOgv9IQWtoIP8,7378
|
|
12
|
+
graphex/loader.py,sha256=dZFv1FyO-51tH7Dc1DYmLru_DrWPuT3TA47o1_kxdjU,23514
|
|
13
|
+
graphex/mcp.py,sha256=h39GgLWxSbdfuNifyke31b8SpJqmRRDNHPuIk7WiupM,16847
|
|
14
|
+
graphex/models.py,sha256=1L2pBzJlhlytGrHFpLFOdO9Cl7f0DSB9IfyIkbeF0OM,10979
|
|
15
|
+
graphex/scorer.py,sha256=8h3iPIHVFL7dpeAXF76iOF-pk4djE295yEemvLIzMgs,3490
|
|
16
|
+
graphex/viz.py,sha256=uOJjAcIHyp6itT0xAtS4y_5Gjp__erb-tsRf3o48xEY,11938
|
|
17
|
+
graphex/indexer/__init__.py,sha256=hi9JbBond8GYj3uQ3wNnD_Z59b167UBLwghiHQeq2Tg,305
|
|
18
|
+
graphex/indexer/go.py,sha256=1U0oxzOWp3EJoOdMfgglNIiIZhMboBZca-IhrsxMNRM,4466
|
|
19
|
+
graphex/indexer/project.py,sha256=jfhaVMJPDevxOQfHlkQ0QzMLVZIWBxMY52zkhFOIXcQ,8886
|
|
20
|
+
graphex/indexer/python.py,sha256=-8s_KSRc0SOmEPP9DD_lK0fWumJUUjc3uJ2obHIgLYg,6705
|
|
21
|
+
graphex/indexer/typescript.py,sha256=Ua9BxONinoOa0so4GWsD6d_OsxJV-fGb_hB0W9-LCoI,9928
|
|
22
|
+
graphex/retrieval/__init__.py,sha256=zlC4ekCbuUPBSD4YFcCUrW1Hh76rVnABCLTGwzEu2uA,295
|
|
23
|
+
graphex/retrieval/base.py,sha256=WSsUnnRFs4v1I1zj5NrBJn-7c-n-9_K6Jkg67HaSFJU,1002
|
|
24
|
+
graphex/retrieval/bm25.py,sha256=HIsWYVsILGzAmAC0Te_7zO7xMeGawdslAcxLxoK6i08,8650
|
|
25
|
+
graphex/retrieval/dense.py,sha256=C6KajmcMWkNytSXMOO1Pvl7P4bpNEt4GPRviKfXy3zg,3136
|
|
26
|
+
graphex/retrieval/fusion.py,sha256=LW-b44Vskm24PQ1i56m7R-iHFSb5zCUCqn9WvCKwIHU,2535
|
|
27
|
+
graphex/retrieval/ppr.py,sha256=Ai15NuFdKFVdmg_NGu2aD-CEx39FFuve7IVwC_Rab7E,8325
|
|
28
|
+
apexgraph-0.1.0.dist-info/METADATA,sha256=PbEVjDz_Q-cS8VUkflEjCzWRGLhYH-MBPFf5g5JXM2g,6848
|
|
29
|
+
apexgraph-0.1.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
|
|
30
|
+
apexgraph-0.1.0.dist-info/entry_points.txt,sha256=8EFHKOF-yAm-QT7PeQCam3OKvjZzDqXP6hy30eNBQMI,44
|
|
31
|
+
apexgraph-0.1.0.dist-info/licenses/LICENSE,sha256=Q88Aj987s01Cmk0M57h7w7_6V-vRtRCTPKZVQkscY8E,1072
|
|
32
|
+
apexgraph-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Alfonso Mayoral
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
graphex/__init__.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
"""Graphex — apex-relevance subgraph retrieval for AI agents.
|
|
2
|
+
|
|
3
|
+
Given a knowledge graph (e.g. the output of `graphify`) and a natural-language
|
|
4
|
+
query, Graphex selects the highest-relevance subgraph that fits within a token
|
|
5
|
+
budget, ready to inject into an LLM's context window.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
__version__ = "0.1.0"
|
graphex/audit.py
ADDED
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
"""Append-only query audit log.
|
|
2
|
+
|
|
3
|
+
Every retrieval can be recorded as one JSON line in ``<audit_dir>/audit.jsonl``
|
|
4
|
+
(default ``.graphex/audit.jsonl``). The log is *best-effort*: writing it must
|
|
5
|
+
never break a query, so I/O errors are swallowed. Reading is tolerant of partial
|
|
6
|
+
writes — malformed lines are skipped rather than raised on.
|
|
7
|
+
|
|
8
|
+
Beyond the raw round-trip, :func:`top_nodes_from_audit` aggregates the most
|
|
9
|
+
frequently selected nodes across the whole history, a cheap proxy for "what does
|
|
10
|
+
this graph keep surfacing".
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import json
|
|
16
|
+
from collections import Counter
|
|
17
|
+
from datetime import UTC, datetime
|
|
18
|
+
from pathlib import Path
|
|
19
|
+
|
|
20
|
+
# Default directory and file name of the audit log, i.e. ``.graphex/audit.jsonl``.
AUDIT_DIRNAME = ".graphex"
AUDIT_FILENAME = "audit.jsonl"

# Upper bound on the number of top node ids stored in a single audit entry.
_TOP_NODES_CAP = 10
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _audit_path(audit_dir: Path | str) -> Path:
    """Return the path of the audit log file located inside ``audit_dir``."""
    base = Path(audit_dir)
    return base.joinpath(AUDIT_FILENAME)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def log_query(
    query: str,
    graph_path: Path,
    stats: dict,
    top_nodes: list[str],
    audit_dir: Path | str = AUDIT_DIRNAME,
) -> None:
    """Append a single JSON line describing a finished query to the audit log.

    Args:
        query: The query text that was run.
        graph_path: The graph the query ran against; stored as ``str(graph_path)``.
        stats: Mapping read with ``.get`` for ``nodes_selected``,
            ``nodes_total``, ``tokens_used`` and ``tokens_budget``; missing
            keys are recorded as ``None``.
        top_nodes: Node ids to record; only the first ``_TOP_NODES_CAP`` are kept.
        audit_dir: Directory holding the log file; created when missing.

    The entry also carries a timezone-aware UTC ISO-8601 ``timestamp``. Any
    :class:`OSError` raised while creating the directory or writing the line is
    swallowed, so a failing audit write does not propagate to the caller.
    """
    entry: dict = {
        "timestamp": datetime.now(UTC).isoformat(),
        "query": query,
        "graph": str(graph_path),
    }
    # Copy the statistics in a fixed order so every log line has the same layout.
    for key in ("nodes_selected", "nodes_total", "tokens_used", "tokens_budget"):
        entry[key] = stats.get(key)
    entry["top_nodes"] = list(top_nodes[:_TOP_NODES_CAP])

    try:
        target_dir = Path(audit_dir)
        target_dir.mkdir(parents=True, exist_ok=True)
        # ``default=str`` stringifies any value json cannot encode natively.
        payload = json.dumps(entry, default=str) + "\n"
        with _audit_path(target_dir).open("a", encoding="utf-8") as log_file:
            log_file.write(payload)
    except OSError:
        # Best-effort logging: I/O failures are deliberately ignored.
        pass
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def read_audit(audit_dir: Path | str = AUDIT_DIRNAME) -> list[dict]:
    """Read and parse every audit record.

    Args:
        audit_dir: Directory that contains the audit log file.

    Returns:
        The parsed records in file order. An empty list is returned when the
        file is missing or cannot be read. Blank lines, lines that are not
        valid JSON, and JSON values that are not objects are skipped rather
        than raised on.
    """
    try:
        # EAFP: a missing file raises FileNotFoundError, which is an OSError,
        # so no separate existence check is needed. ``errors="replace"`` keeps
        # invalid UTF-8 bytes (e.g. from a torn write) from raising
        # UnicodeDecodeError for the whole file; the damage stays confined to
        # the affected line.
        text = _audit_path(audit_dir).read_text(encoding="utf-8", errors="replace")
    except OSError:
        return []

    records: list[dict] = []
    for raw_line in text.splitlines():
        line = raw_line.strip()
        if not line:
            continue
        try:
            obj = json.loads(line)
        except (ValueError, RecursionError):
            # json.JSONDecodeError subclasses ValueError; RecursionError covers
            # pathologically nested input. Either way the line is malformed.
            continue
        if isinstance(obj, dict):
            records.append(obj)
    return records
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def top_nodes_from_audit(
    audit_dir: Path | str = AUDIT_DIRNAME, n: int = 10
) -> list[tuple[str, int]]:
    """Count how often each node id occurs in the logged ``top_nodes`` lists.

    Args:
        audit_dir: Directory that contains the audit log file.
        n: How many of the most frequent ids to return.

    Returns:
        Up to ``n`` ``(node_id, count)`` pairs, most frequent first. Records
        whose ``top_nodes`` value is not a list are ignored, and every id is
        converted with ``str`` before counting.
    """
    tally: Counter[str] = Counter()
    for entry in read_audit(audit_dir):
        selected = entry.get("top_nodes")
        if isinstance(selected, list):
            tally.update(map(str, selected))
    return tally.most_common(n)
|
graphex/benchmark.py
ADDED
|
@@ -0,0 +1,297 @@
|
|
|
1
|
+
"""Measure the *honest* value of token-budgeted graph retrieval.
|
|
2
|
+
|
|
3
|
+
The temptation with a tool like Graphex is to report a single flattering number:
|
|
4
|
+
"we saved 92% of your tokens!" That number is meaningless on its own — a tool
|
|
5
|
+
that returns nothing saves 100% of the tokens and answers 0% of the questions.
|
|
6
|
+
The only number that matters is the trade-off: what fraction of the *relevant*
|
|
7
|
+
content did the budgeted subgraph actually keep, for the tokens it spent?
|
|
8
|
+
|
|
9
|
+
So every ``(query, budget)`` pair is scored on two axes:
|
|
10
|
+
|
|
11
|
+
1. ``token_savings`` — ``1 − tokens_used / full_graph_tokens``, where
|
|
12
|
+
``full_graph_tokens`` is the cost of injecting the whole graph (the naive
|
|
13
|
+
baseline). Higher is cheaper. Easy to game by retrieving less.
|
|
14
|
+
2. ``recall_at_budget`` — of the top-``k`` nodes by full-graph relevance (the
|
|
15
|
+
retrieval target), what fraction did the budgeted subgraph capture? Higher is
|
|
16
|
+
better. *This* is the metric that exposes under-retrieval: a tiny budget posts
|
|
17
|
+
a gorgeous ``token_savings`` and a damning ``recall_at_budget``.
|
|
18
|
+
|
|
19
|
+
Read them together. Savings without recall is just throwing the answer away.
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
from __future__ import annotations
|
|
23
|
+
|
|
24
|
+
from dataclasses import dataclass, field
|
|
25
|
+
from typing import Any
|
|
26
|
+
|
|
27
|
+
from graphex.budget import (
|
|
28
|
+
_NODE_OVERHEAD_TOKENS,
|
|
29
|
+
_node_body,
|
|
30
|
+
count_tokens,
|
|
31
|
+
select_subgraph,
|
|
32
|
+
)
|
|
33
|
+
from graphex.cache import CachedArtifacts, load_or_build
|
|
34
|
+
from graphex.models import KnowledgeGraph
|
|
35
|
+
from graphex.scorer import score_nodes
|
|
36
|
+
|
|
37
|
+
# Token ceilings swept by ``run_benchmark`` when the caller passes none.
DEFAULT_BUDGETS: tuple[int, ...] = (2000, 4000, 8000)
# Default size of the per-query relevant top-k target set.
DEFAULT_K_RELEVANT = 10
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
@dataclass(slots=True)
|
|
42
|
+
class BenchmarkRow:
|
|
43
|
+
"""One ``(query, budget)`` measurement.
|
|
44
|
+
|
|
45
|
+
Attributes:
|
|
46
|
+
query: The query that was scored.
|
|
47
|
+
budget: The token ceiling the subgraph was selected under.
|
|
48
|
+
nodes_selected: How many nodes the budgeted subgraph kept.
|
|
49
|
+
tokens_used: Rendered token cost of the budgeted subgraph.
|
|
50
|
+
full_graph_tokens: Token cost of injecting every node (the baseline).
|
|
51
|
+
token_savings: ``1 − tokens_used / full_graph_tokens`` in ``[0, 1]``.
|
|
52
|
+
recall_at_budget: Fraction of the relevant top-k set captured, in ``[0, 1]``.
|
|
53
|
+
"""
|
|
54
|
+
|
|
55
|
+
query: str
|
|
56
|
+
budget: int
|
|
57
|
+
nodes_selected: int
|
|
58
|
+
tokens_used: int
|
|
59
|
+
full_graph_tokens: int
|
|
60
|
+
token_savings: float
|
|
61
|
+
recall_at_budget: float
|
|
62
|
+
|
|
63
|
+
def to_dict(self) -> dict[str, Any]:
|
|
64
|
+
"""Serialize to a plain JSON-friendly mapping."""
|
|
65
|
+
return {
|
|
66
|
+
"query": self.query,
|
|
67
|
+
"budget": self.budget,
|
|
68
|
+
"nodes_selected": self.nodes_selected,
|
|
69
|
+
"tokens_used": self.tokens_used,
|
|
70
|
+
"full_graph_tokens": self.full_graph_tokens,
|
|
71
|
+
"token_savings": self.token_savings,
|
|
72
|
+
"recall_at_budget": self.recall_at_budget,
|
|
73
|
+
}
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
@dataclass(slots=True)
|
|
77
|
+
class BenchmarkResult:
|
|
78
|
+
"""The full benchmark: one :class:`BenchmarkRow` per ``(query, budget)``."""
|
|
79
|
+
|
|
80
|
+
rows: list[BenchmarkRow] = field(default_factory=list)
|
|
81
|
+
|
|
82
|
+
@property
|
|
83
|
+
def mean_token_savings(self) -> float:
|
|
84
|
+
"""Average ``token_savings`` across all rows (0.0 if empty)."""
|
|
85
|
+
if not self.rows:
|
|
86
|
+
return 0.0
|
|
87
|
+
return sum(r.token_savings for r in self.rows) / len(self.rows)
|
|
88
|
+
|
|
89
|
+
@property
|
|
90
|
+
def mean_recall(self) -> float:
|
|
91
|
+
"""Average ``recall_at_budget`` across all rows (0.0 if empty)."""
|
|
92
|
+
if not self.rows:
|
|
93
|
+
return 0.0
|
|
94
|
+
return sum(r.recall_at_budget for r in self.rows) / len(self.rows)
|
|
95
|
+
|
|
96
|
+
def to_dict(self) -> dict[str, Any]:
|
|
97
|
+
"""Serialize to a JSON-friendly dict (rows plus the aggregate means)."""
|
|
98
|
+
return {
|
|
99
|
+
"rows": [r.to_dict() for r in self.rows],
|
|
100
|
+
"mean_token_savings": self.mean_token_savings,
|
|
101
|
+
"mean_recall": self.mean_recall,
|
|
102
|
+
}
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def _full_graph_tokens(graph: KnowledgeGraph, model: str) -> int:
    """Sum the token cost of every node in ``graph`` — the whole-graph baseline.

    Each node contributes ``count_tokens`` of its ``_node_body`` plus the fixed
    ``_NODE_OVERHEAD_TOKENS``; both helpers come from :mod:`graphex.budget`, so
    the baseline uses the same per-node accounting helpers as that module.

    Args:
        graph: The graph whose nodes are costed.
        model: Encoding name forwarded to ``count_tokens``.

    Returns:
        The total token count over ``graph.node_ids`` (``0`` for no nodes).
    """
    total = 0
    for node_id in graph.node_ids:
        body = _node_body(graph, node_id, None)
        total += count_tokens(body, model) + _NODE_OVERHEAD_TOKENS
    return total
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def _relevant_set(scores: dict[str, float], k: int) -> set[str]:
|
|
119
|
+
"""The retrieval target: the top-``k`` nodes by full-graph relevance.
|
|
120
|
+
|
|
121
|
+
Ties are broken by node id (ascending) so the set is deterministic. Nodes
|
|
122
|
+
with a non-positive score are never relevant — an empty graph (or one where
|
|
123
|
+
nothing matched) yields an empty target rather than padding with noise.
|
|
124
|
+
"""
|
|
125
|
+
ranked = sorted(
|
|
126
|
+
(nid for nid, s in scores.items() if s > 0.0),
|
|
127
|
+
key=lambda nid: (-scores[nid], nid),
|
|
128
|
+
)
|
|
129
|
+
return set(ranked[:k])
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def _measure(
    graph: KnowledgeGraph,
    query: str,
    budget: int,
    scores: dict[str, float],
    relevant: set[str],
    full_tokens: int,
    *,
    model: str,
    min_score: float,
) -> BenchmarkRow:
    """Build the :class:`BenchmarkRow` for one ``(query, budget)`` pair.

    Args:
        graph: Graph the subgraph is selected from.
        query: Query text, copied into the row.
        budget: Token ceiling passed to ``select_subgraph``.
        scores: Relevance score per node id for ``query``.
        relevant: Target node ids that recall is measured against.
        full_tokens: Whole-graph baseline token cost.
        model: Encoding name forwarded to ``select_subgraph``.
        min_score: Score threshold forwarded to ``select_subgraph``.

    Returns:
        A row whose ``token_savings`` is clamped to ``[0, 1]`` and whose ratios
        are rounded to four decimals. Savings are ``0.0`` when ``full_tokens``
        is not positive; recall is ``0.0`` when ``relevant`` is empty.
    """
    sub, stats = select_subgraph(graph, scores, budget, model=model, min_score=min_score)
    tokens_used = int(stats["tokens_used"])

    if full_tokens > 0:
        savings = 1.0 - tokens_used / full_tokens
    else:
        savings = 0.0
    savings = max(0.0, min(1.0, savings))

    # Recall is the share of the target set that made it into the subgraph.
    hit_ratio = len(relevant.intersection(sub.node_ids)) / len(relevant) if relevant else 0.0

    return BenchmarkRow(
        query=query,
        budget=budget,
        nodes_selected=int(stats["nodes_selected"]),
        tokens_used=tokens_used,
        full_graph_tokens=full_tokens,
        token_savings=round(savings, 4),
        recall_at_budget=round(hit_ratio, 4),
    )
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
def run_benchmark(
    graph: KnowledgeGraph,
    queries: list[str],
    budgets: list[int] | tuple[int, ...] = DEFAULT_BUDGETS,
    *,
    model: str = "cl100k_base",
    k_relevant: int = DEFAULT_K_RELEVANT,
    min_score: float = 0.0,
    cache: CachedArtifacts | None = None,
) -> BenchmarkResult:
    """Measure budgeted retrieval for every ``(query, budget)`` combination.

    Node scores and the relevant target set are computed once per query and
    shared by all of that query's budgets. The cached artifacts and the
    whole-graph token baseline are computed once for the entire run.

    Args:
        graph: The knowledge graph to benchmark against.
        queries: Free-text queries to evaluate.
        budgets: Token ceilings to sweep (default ``(2000, 4000, 8000)``).
        model: Encoding name used for token counting.
        k_relevant: Size of the relevant top-k target set per query.
        min_score: Score threshold forwarded to the subgraph selection.
        cache: Artifacts to reuse; when ``None`` they are obtained from
            ``load_or_build(graph, use_cache=False)``.

    Returns:
        A :class:`BenchmarkResult` holding one row per pair, ordered by query
        and then by budget.
    """
    artifacts = load_or_build(graph, use_cache=False) if cache is None else cache
    full_tokens = _full_graph_tokens(graph, model)

    measured: list[BenchmarkRow] = []
    for query in queries:
        scores = score_nodes(graph, query, cache=artifacts)
        relevant = _relevant_set(scores, k_relevant)
        measured.extend(
            _measure(
                graph,
                query,
                budget,
                scores,
                relevant,
                full_tokens,
                model=model,
                min_score=min_score,
            )
            for budget in budgets
        )
    return BenchmarkResult(rows=measured)
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
# -- formatting --------------------------------------------------------------
|
|
222
|
+
|
|
223
|
+
_HEADERS: tuple[str, ...] = (
|
|
224
|
+
"query",
|
|
225
|
+
"budget",
|
|
226
|
+
"nodes",
|
|
227
|
+
"tokens_used",
|
|
228
|
+
"full_tokens",
|
|
229
|
+
"token_savings",
|
|
230
|
+
"recall@budget",
|
|
231
|
+
)
|
|
232
|
+
|
|
233
|
+
|
|
234
|
+
def _truncate(text: str, width: int) -> str:
|
|
235
|
+
"""Clip ``text`` to ``width`` columns, marking elision with an ellipsis."""
|
|
236
|
+
if len(text) <= width:
|
|
237
|
+
return text
|
|
238
|
+
if width <= 1:
|
|
239
|
+
return text[:width]
|
|
240
|
+
return text[: width - 1] + "…"
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
def format_benchmark(result: BenchmarkResult, *, max_query_width: int = 28) -> str:
    """Lay a benchmark out as an aligned plain-text table.

    The output has a title, a header row, one line per ``(query, budget)`` row
    (or ``(no rows)`` when there are none), the aggregate means, and a closing
    note about reading savings together with recall.

    Args:
        result: The benchmark to render.
        max_query_width: Maximum number of characters shown for the query text.

    Returns:
        A multi-line string ending in a newline.
    """
    table = [
        (
            _truncate(row.query, max_query_width),
            str(row.budget),
            str(row.nodes_selected),
            str(row.tokens_used),
            str(row.full_graph_tokens),
            f"{row.token_savings:.1%}",
            f"{row.recall_at_budget:.1%}",
        )
        for row in result.rows
    ]

    # Transpose header + rows into columns; each column is as wide as its
    # longest entry.
    widths = [max(map(len, column)) for column in zip(_HEADERS, *table)]

    def _render(values: tuple[str, ...] | list[str]) -> str:
        # Text column flush left, numeric columns flush right.
        label, *numbers = values
        aligned = [label.ljust(widths[0])]
        aligned += [text.rjust(width) for text, width in zip(numbers, widths[1:])]
        return " ".join(aligned)

    rule = " ".join("-" * width for width in widths)
    out: list[str] = ["Graphex retrieval benchmark", "", _render(_HEADERS), rule]
    out += [_render(entry) for entry in table] if table else ["(no rows)"]
    out += [
        rule,
        f"mean token_savings: {result.mean_token_savings:.1%} "
        f"mean recall@budget: {result.mean_recall:.1%}",
        "",
        "Note: high token-savings with low recall@budget means under-retrieval — "
        "the budget is too tight to keep the relevant nodes.",
    ]
    return "\n".join(out) + "\n"
|