graphify-0.1.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
graphify/__init__.py ADDED
@@ -0,0 +1,27 @@
1
+ """graphify — extract · build · cluster · analyze · report."""
2
+
3
+
4
def __getattr__(name):
    """Resolve public API names lazily on first attribute access (PEP 562).

    Imports are deferred so `graphify install` works before the heavy
    dependencies used by extract/build/cluster are installed.
    """
    lazy = {
        "extract": ("graphify.extract", "extract"),
        "collect_files": ("graphify.extract", "collect_files"),
        "build_from_json": ("graphify.build", "build_from_json"),
        "cluster": ("graphify.cluster", "cluster"),
        "score_all": ("graphify.cluster", "score_all"),
        "cohesion_score": ("graphify.cluster", "cohesion_score"),
        "god_nodes": ("graphify.analyze", "god_nodes"),
        "surprising_connections": ("graphify.analyze", "surprising_connections"),
        "suggest_questions": ("graphify.analyze", "suggest_questions"),
        "generate": ("graphify.report", "generate"),
        "to_json": ("graphify.export", "to_json"),
        "to_html": ("graphify.export", "to_html"),
        "to_svg": ("graphify.export", "to_svg"),
        "to_canvas": ("graphify.export", "to_canvas"),
    }
    try:
        mod_name, attr = lazy[name]
    except KeyError:
        raise AttributeError(f"module 'graphify' has no attribute {name!r}") from None
    import importlib
    return getattr(importlib.import_module(mod_name), attr)
graphify/__main__.py ADDED
@@ -0,0 +1,89 @@
1
+ """graphify CLI — `graphify install` sets up the Claude Code skill."""
2
+ from __future__ import annotations
3
+ import json
4
+ import shutil
5
+ import sys
6
+ from pathlib import Path
7
+
8
# Text appended to ~/.claude/CLAUDE.md so Claude Code advertises the skill
# and routes the `/graphify` slash command to it. The leading "\n" separates
# it from existing CLAUDE.md content when appended.
_SKILL_REGISTRATION = (
    "\n# graphify\n"
    "- **graphify** (`~/.claude/skills/graphify/SKILL.md`) "
    "— any input to knowledge graph. Trigger: `/graphify`\n"
    "When the user types `/graphify`, invoke the Skill tool "
    "with `skill: \"graphify\"` before doing anything else.\n"
)
15
+
16
+
17
def _bundled_skill() -> Path:
    """Return the path of the skill.md file shipped inside this package."""
    package_dir = Path(__file__).parent
    return package_dir / "skill.md"
20
+
21
+
22
def install() -> None:
    """Install the graphify skill for Claude Code.

    Copies the bundled skill.md to ~/.claude/skills/graphify/SKILL.md and
    registers the skill in ~/.claude/CLAUDE.md (creating that file when it
    does not exist). Exits with status 1 if the bundled skill.md is missing
    from the installed package.
    """
    skill_src = _bundled_skill()
    if not skill_src.exists():
        print("error: skill.md not found in package — reinstall graphify", file=sys.stderr)
        sys.exit(1)

    # Copy skill to ~/.claude/skills/graphify/SKILL.md
    skill_dst = Path.home() / ".claude" / "skills" / "graphify" / "SKILL.md"
    skill_dst.parent.mkdir(parents=True, exist_ok=True)
    shutil.copy(skill_src, skill_dst)
    print(f" skill installed → {skill_dst}")

    # Register in ~/.claude/CLAUDE.md
    claude_md = Path.home() / ".claude" / "CLAUDE.md"
    if claude_md.exists():
        content = claude_md.read_text()
        # NOTE(review): any mention of "graphify" anywhere in CLAUDE.md
        # suppresses registration — a coarse idempotence guard; confirm
        # this is intended before tightening.
        if "graphify" in content:
            # Fixed: was an f-string with no placeholders (ruff F541).
            print(" CLAUDE.md → already registered (no change)")
        else:
            claude_md.write_text(content.rstrip() + _SKILL_REGISTRATION)
            print(f" CLAUDE.md → skill registered in {claude_md}")
    else:
        claude_md.parent.mkdir(parents=True, exist_ok=True)
        claude_md.write_text(_SKILL_REGISTRATION.lstrip())
        print(f" CLAUDE.md → created at {claude_md}")

    print()
    print("Done. Open Claude Code in any directory and type:")
    print()
    print(" /graphify .")
    print()
53
+
54
+
55
def main() -> None:
    """CLI entry point: dispatch `install`, `benchmark`, or print usage."""
    argv = sys.argv
    if len(argv) < 2 or argv[1] in ("-h", "--help"):
        for line in (
            "Usage: graphify <command>",
            "",
            "Commands:",
            " install copy skill to ~/.claude/skills/ and register in CLAUDE.md",
            " benchmark [graph.json] measure token reduction vs naive full-corpus approach",
            "",
        ):
            print(line)
        return

    command = argv[1]
    if command == "install":
        install()
    elif command == "benchmark":
        from graphify.benchmark import run_benchmark, print_benchmark
        graph_path = argv[2] if len(argv) > 2 else ".graphify/graph.json"
        # Reuse the corpus word count from a prior detect run when available.
        corpus_words = None
        detect_file = Path(".graphify_detect.json")
        if detect_file.exists():
            try:
                corpus_words = json.loads(detect_file.read_text()).get("total_words")
            except Exception:
                # Best-effort: a corrupt detect file just means no corpus size.
                pass
        print_benchmark(run_benchmark(graph_path, corpus_words=corpus_words))
    else:
        print(f"error: unknown command '{command}'", file=sys.stderr)
        print("Run 'graphify --help' for usage.", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()
graphify/analyze.py ADDED
@@ -0,0 +1,429 @@
1
+ """Graph analysis: god nodes (most connected), surprising connections (cross-community), suggested questions."""
2
+ from __future__ import annotations
3
+ import networkx as nx
4
+
5
+
6
def _is_file_node(G: nx.Graph, node_id: str) -> bool:
    """
    True for synthetic nodes created by the AST extractor:

    - file-level hubs whose label is a filename with a code extension
      (e.g. 'client', 'models'),
    - method stubs labeled '.method_name()',
    - module-level function stubs 'name()' with at most one edge
      (structurally isolated by construction, not a real knowledge gap).

    Such nodes are excluded from god nodes, surprising connections, and
    knowledge-gap reporting.
    """
    label = G.nodes[node_id].get("label", "")
    if not label:
        return False
    code_extensions = ("py", "ts", "js", "go", "rs", "java", "rb", "cpp", "c", "h")
    # File-level hub: final dot-segment is a known code extension.
    if label.rsplit(".", 1)[-1] in code_extensions:
        return True
    # Method stub from the AST extractor: '.method_name()'
    if label.startswith(".") and label.endswith("()"):
        return True
    # Module-level function stub tied in by at most a single contains edge.
    return label.endswith("()") and G.degree(node_id) <= 1
28
+
29
+
30
def god_nodes(G: nx.Graph, top_n: int = 10) -> list[dict]:
    """Return the top_n most-connected real entities — the core abstractions.

    File-level hubs and concept nodes are skipped: they accumulate
    import/contains edges mechanically and don't represent meaningful
    architectural abstractions.
    """
    ranked = sorted(dict(G.degree()).items(), key=lambda pair: pair[1], reverse=True)
    picks: list[dict] = []
    for node_id, edge_count in ranked:
        if _is_file_node(G, node_id) or _is_concept_node(G, node_id):
            continue
        picks.append({
            "id": node_id,
            "label": G.nodes[node_id].get("label", node_id),
            "edges": edge_count,
        })
        if len(picks) >= top_n:
            break
    return picks
50
+
51
+
52
def surprising_connections(
    G: nx.Graph,
    communities: dict[int, list[str]] | None = None,
    top_n: int = 5,
) -> list[dict]:
    """
    Find connections that are genuinely surprising — not obvious from file
    structure.

    Multi-file corpora get cross-file edges between real entities (sorted
    AMBIGUOUS → INFERRED → EXTRACTED); single-source corpora get
    cross-community / high-betweenness bridge edges instead. Concept nodes
    are excluded throughout: injected annotations are intentional, not
    discovered.
    """
    # Count distinct non-empty source files to decide which strategy applies.
    distinct_sources: set[str] = set()
    for _, attrs in G.nodes(data=True):
        src = attrs.get("source_file", "")
        if src:
            distinct_sources.add(src)

    comm = communities or {}
    if len(distinct_sources) > 1:
        return _cross_file_surprises(G, comm, top_n)
    return _cross_community_surprises(G, comm, top_n)
82
+
83
+
84
def _is_concept_node(G: nx.Graph, node_id: str) -> bool:
    """
    True for manually-injected semantic concept nodes, as opposed to real
    entities found in source code.

    Signals: an empty source_file, or a source_file whose final path
    segment carries no extension (so it doesn't look like a real file).
    """
    source = G.nodes[node_id].get("source_file", "")
    if not source:
        return True
    basename = source.split("/")[-1]
    return "." not in basename
101
+
102
+
103
def _cross_file_surprises(G: nx.Graph, communities: dict[int, list[str]], top_n: int) -> list[dict]:
    """
    Cross-file edges between real code/doc entities, sorted
    AMBIGUOUS → INFERRED → EXTRACTED.

    Concept nodes, file hub nodes, and structural scaffolding edges
    (imports / imports_from / contains / method) are skipped. When no
    semantic cross-file edge exists (a pure AST corpus), falls back to
    cross-community bridge edges.
    """
    structural = ("imports", "imports_from", "contains", "method")
    rank = {"AMBIGUOUS": 0, "INFERRED": 1, "EXTRACTED": 2}
    found: list[dict] = []

    for u, v, data in G.edges(data=True):
        relation = data.get("relation", "")
        # Structural scaffolding is mechanical, not an insight.
        if relation in structural:
            continue
        if _is_concept_node(G, u) or _is_concept_node(G, v):
            continue
        if _is_file_node(G, u) or _is_file_node(G, v):
            continue

        file_u = G.nodes[u].get("source_file", "")
        file_v = G.nodes[v].get("source_file", "")
        if not file_u or not file_v or file_u == file_v:
            continue

        # _src/_tgt (when present) preserve the original edge direction,
        # which an undirected graph would otherwise lose.
        src_id = data.get("_src", u)
        tgt_id = data.get("_tgt", v)
        found.append({
            "source": G.nodes[src_id].get("label", src_id),
            "target": G.nodes[tgt_id].get("label", tgt_id),
            "source_files": [
                G.nodes[src_id].get("source_file", ""),
                G.nodes[tgt_id].get("source_file", ""),
            ],
            "confidence": data.get("confidence", "EXTRACTED"),
            "relation": relation,
        })

    if not found:
        # Fallback: surface structural bridge edges instead.
        return _cross_community_surprises(G, communities, top_n)

    found.sort(key=lambda item: rank.get(item["confidence"], 3))
    return found[:top_n]
149
+
150
+
151
def _cross_community_surprises(
    G: nx.Graph,
    communities: dict[int, list[str]],
    top_n: int,
) -> list[dict]:
    """
    For single-source corpora: find edges that bridge different communities.
    These are surprising because Leiden grouped everything else tightly —
    these edges cut across the natural structure.

    Falls back to high-betweenness edges if no community info is provided.
    """
    if not communities:
        # No community info — use edge betweenness centrality
        if G.number_of_edges() == 0:
            return []
        betweenness = nx.edge_betweenness_centrality(G)
        top_edges = sorted(betweenness.items(), key=lambda x: x[1], reverse=True)[:top_n]
        result = []
        for (u, v), score in top_edges:
            data = G.edges[u, v]
            result.append({
                "source": G.nodes[u].get("label", u),
                "target": G.nodes[v].get("label", v),
                "source_files": [
                    G.nodes[u].get("source_file", ""),
                    G.nodes[v].get("source_file", ""),
                ],
                "confidence": data.get("confidence", "EXTRACTED"),
                "relation": data.get("relation", ""),
                "note": f"Bridges graph structure (betweenness={score:.3f})",
            })
        return result

    # Build node → community map
    node_community = {n: cid for cid, nodes in communities.items() for n in nodes}

    surprises = []
    for u, v, data in G.edges(data=True):
        cid_u = node_community.get(u)
        cid_v = node_community.get(v)
        # Same community (or unassigned node) — not a bridge.
        if cid_u is None or cid_v is None or cid_u == cid_v:
            continue
        # Skip file hub nodes and plain structural edges
        if _is_file_node(G, u) or _is_file_node(G, v):
            continue
        relation = data.get("relation", "")
        if relation in ("imports", "imports_from", "contains", "method"):
            continue
        # This edge crosses community boundaries — interesting
        confidence = data.get("confidence", "EXTRACTED")
        # _src/_tgt (when present) preserve the original edge direction,
        # which an undirected nx.Graph would otherwise lose.
        src_id = data.get("_src", u)
        tgt_id = data.get("_tgt", v)
        surprises.append({
            "source": G.nodes[src_id].get("label", src_id),
            "target": G.nodes[tgt_id].get("label", tgt_id),
            "source_files": [
                G.nodes[src_id].get("source_file", ""),
                G.nodes[tgt_id].get("source_file", ""),
            ],
            "confidence": confidence,
            "relation": relation,
            "note": f"Bridges community {cid_u} → community {cid_v}",
            # Unordered community pair, consumed (popped) by the dedup pass below.
            "_pair": tuple(sorted([cid_u, cid_v])),
        })

    # Sort: AMBIGUOUS first, then INFERRED, then EXTRACTED
    order = {"AMBIGUOUS": 0, "INFERRED": 1, "EXTRACTED": 2}
    surprises.sort(key=lambda x: order.get(x["confidence"], 3))

    # Deduplicate by community pair — one representative edge per (A→B) boundary.
    # Without this, a single high-betweenness god node dominates all results.
    seen_pairs: set[tuple] = set()
    deduped = []
    for s in surprises:
        pair = s.pop("_pair")
        if pair not in seen_pairs:
            seen_pairs.add(pair)
            deduped.append(s)
    return deduped[:top_n]
231
+
232
+
233
def suggest_questions(
    G: nx.Graph,
    communities: dict[int, list[str]],
    community_labels: dict[int, str],
    top_n: int = 7,
) -> list[dict]:
    """
    Generate questions the graph is uniquely positioned to answer.
    Based on: AMBIGUOUS edges, bridge nodes, underexplored god nodes, isolated nodes.
    Each question has a 'type', 'question', and 'why' field.

    Question sources, appended in priority order (the final list is
    truncated to top_n, so earlier sources win):
      1. AMBIGUOUS edges          → resolve an uncertain relationship
      2. high-betweenness nodes   → explain a cross-community bridge
      3. INFERRED edges on hubs   → verify model-reasoned connections
      4. degree ≤ 1 real nodes    → find missing links / doc gaps
      5. low-cohesion communities → question a cluster's coherence
    """
    questions = []
    node_community = {n: cid for cid, nodes in communities.items() for n in nodes}

    # 1. AMBIGUOUS edges → unresolved relationship questions
    for u, v, data in G.edges(data=True):
        if data.get("confidence") == "AMBIGUOUS":
            ul = G.nodes[u].get("label", u)
            vl = G.nodes[v].get("label", v)
            relation = data.get("relation", "related to")
            questions.append({
                "type": "ambiguous_edge",
                "question": f"What is the exact relationship between `{ul}` and `{vl}`?",
                "why": f"Edge tagged AMBIGUOUS (relation: {relation}) — confidence is low.",
            })

    # 2. Bridge nodes (high betweenness) → cross-cutting concern questions
    if G.number_of_edges() > 0:
        betweenness = nx.betweenness_centrality(G)
        # Top bridge nodes that are NOT file-level hubs
        bridges = sorted(
            [(n, s) for n, s in betweenness.items()
             if not _is_file_node(G, n) and not _is_concept_node(G, n) and s > 0],
            key=lambda x: x[1],
            reverse=True,
        )[:3]
        for node_id, score in bridges:
            label = G.nodes[node_id].get("label", node_id)
            cid = node_community.get(node_id)
            comm_label = community_labels.get(cid, f"Community {cid}") if cid is not None else "unknown"
            neighbors = list(G.neighbors(node_id))
            # Communities on the far side of the bridge (own community excluded).
            neighbor_comms = {node_community.get(n) for n in neighbors if node_community.get(n) != cid}
            if neighbor_comms:
                other_labels = [community_labels.get(c, f"Community {c}") for c in neighbor_comms]
                questions.append({
                    "type": "bridge_node",
                    "question": f"Why does `{label}` connect `{comm_label}` to {', '.join(f'`{l}`' for l in other_labels)}?",
                    "why": f"High betweenness centrality ({score:.3f}) — this node is a cross-community bridge.",
                })

    # 3. God nodes with many INFERRED edges → verification questions
    degree = dict(G.degree())
    top_nodes = sorted(
        [(n, d) for n, d in degree.items() if not _is_file_node(G, n)],
        key=lambda x: x[1],
        reverse=True,
    )[:5]
    for node_id, _ in top_nodes:
        inferred = [
            (u, v, d) for u, v, d in G.edges(node_id, data=True)
            if d.get("confidence") == "INFERRED"
        ]
        # Only worth asking when at least two edges need verification
        # (the question template also cites two example neighbors).
        if len(inferred) >= 2:
            label = G.nodes[node_id].get("label", node_id)
            # Use _src/_tgt to get the correct direction; fall back to v (the other node)
            others = []
            for u, v, d in inferred[:2]:
                src_id = d.get("_src", u)
                tgt_id = d.get("_tgt", v)
                other_id = tgt_id if src_id == node_id else src_id
                others.append(G.nodes[other_id].get("label", other_id))
            questions.append({
                "type": "verify_inferred",
                "question": f"Are the {len(inferred)} inferred relationships involving `{label}` (e.g. with `{others[0]}` and `{others[1]}`) actually correct?",
                "why": f"`{label}` has {len(inferred)} INFERRED edges — model-reasoned connections that need verification.",
            })

    # 4. Isolated or weakly-connected nodes → exploration questions
    isolated = [
        n for n in G.nodes()
        if G.degree(n) <= 1 and not _is_file_node(G, n) and not _is_concept_node(G, n)
    ]
    if isolated:
        labels = [G.nodes[n].get("label", n) for n in isolated[:3]]
        questions.append({
            "type": "isolated_nodes",
            "question": f"What connects {', '.join(f'`{l}`' for l in labels)} to the rest of the system?",
            "why": f"{len(isolated)} weakly-connected nodes found — possible documentation gaps or missing edges.",
        })

    # 5. Low-cohesion communities → structural questions
    # NOTE(review): local import — presumably avoids an import cycle with
    # graphify.cluster; confirm before hoisting to module level.
    from .cluster import cohesion_score
    for cid, nodes in communities.items():
        score = cohesion_score(G, nodes)
        # Thresholds: flag only reasonably large (≥5 nodes), weakly knit (<0.15) groups.
        if score < 0.15 and len(nodes) >= 5:
            label = community_labels.get(cid, f"Community {cid}")
            questions.append({
                "type": "low_cohesion",
                "question": f"Should `{label}` be split into smaller, more focused modules?",
                "why": f"Cohesion score {score} — nodes in this community are weakly interconnected.",
            })

    if not questions:
        # Sentinel entry so callers always receive at least one item.
        return [{
            "type": "no_signal",
            "question": None,
            "why": (
                "Not enough signal to generate questions. "
                "This usually means the corpus has no AMBIGUOUS edges, no bridge nodes, "
                "no INFERRED relationships, and all communities are tightly cohesive. "
                "Add more files or run with --mode deep to extract richer edges."
            ),
        }]

    return questions[:top_n]
348
+
349
+
350
def graph_diff(G_old: nx.Graph, G_new: nx.Graph) -> dict:
    """Compare two graph snapshots and return what changed.

    Edges are keyed by (source, target, relation), so a relation change
    appears as one removal plus one addition.

    Returns:
        {
            "new_nodes": [{"id": ..., "label": ...}],
            "removed_nodes": [{"id": ..., "label": ...}],
            "new_edges": [{"source": ..., "target": ..., "relation": ..., "confidence": ...}],
            "removed_edges": [...],
            "summary": "3 new nodes, 5 new edges, 1 node removed"
        }
    """
    prev_ids = set(G_old.nodes())
    curr_ids = set(G_new.nodes())

    def _node_entries(G: nx.Graph, ids) -> list[dict]:
        # One {"id", "label"} record per node id, label defaulting to the id.
        return [{"id": n, "label": G.nodes[n].get("label", n)} for n in ids]

    new_nodes_list = _node_entries(G_new, curr_ids - prev_ids)
    removed_nodes_list = _node_entries(G_old, prev_ids - curr_ids)

    def _keys(G: nx.Graph) -> set:
        return {(u, v, d.get("relation", "")) for u, v, d in G.edges(data=True)}

    def _edge_entries(G: nx.Graph, known: set) -> list[dict]:
        # Records for every edge of G whose key is NOT already in `known`.
        entries = []
        for u, v, d in G.edges(data=True):
            if (u, v, d.get("relation", "")) in known:
                continue
            entries.append({
                "source": u,
                "target": v,
                "relation": d.get("relation", ""),
                "confidence": d.get("confidence", ""),
            })
        return entries

    new_edges_list = _edge_entries(G_new, _keys(G_old))
    removed_edges_list = _edge_entries(G_old, _keys(G_new))

    def _plural(count: int, noun: str) -> str:
        return f"{count} {noun}{'s' if count != 1 else ''}"

    parts = []
    if new_nodes_list:
        parts.append(_plural(len(new_nodes_list), "new node"))
    if new_edges_list:
        parts.append(_plural(len(new_edges_list), "new edge"))
    if removed_nodes_list:
        parts.append(_plural(len(removed_nodes_list), "node") + " removed")
    if removed_edges_list:
        parts.append(_plural(len(removed_edges_list), "edge") + " removed")
    summary = ", ".join(parts) if parts else "no changes"

    return {
        "new_nodes": new_nodes_list,
        "removed_nodes": removed_nodes_list,
        "new_edges": new_edges_list,
        "removed_edges": removed_edges_list,
        "summary": summary,
    }
graphify/benchmark.py ADDED
@@ -0,0 +1,126 @@
1
+ """Token-reduction benchmark — measures how much context graphify saves vs naive full-corpus approach."""
2
+ from __future__ import annotations
3
+ import json
4
+ from pathlib import Path
5
+ import networkx as nx
6
+ from networkx.readwrite import json_graph
7
+
8
+
9
_CHARS_PER_TOKEN = 4  # standard approximation


def _estimate_tokens(text: str) -> int:
    """Approximate token count for *text*: ~4 chars per token, minimum 1."""
    approx = len(text) // _CHARS_PER_TOKEN
    return max(1, approx)
14
+
15
+
16
def _query_subgraph_tokens(G: nx.Graph, question: str, depth: int = 3) -> int:
    """
    Estimate the token cost of answering *question* from the graph.

    Picks up to 3 nodes whose labels best match the question's words
    (words of length > 2 only), expands `depth` BFS hops from them, and
    measures the serialized NODE/EDGE context. Returns 0 when no node
    label matches at all.
    """
    terms = [word.lower() for word in question.split() if len(word) > 2]
    matches = []
    for node_id, attrs in G.nodes(data=True):
        label = attrs.get("label", "").lower()
        hits = sum(term in label for term in terms)
        if hits:
            matches.append((hits, node_id))
    matches.sort(reverse=True)
    seeds = [node_id for _, node_id in matches[:3]]
    if not seeds:
        return 0

    # Breadth-first expansion, recording the traversal edges as we go.
    visited: set[str] = set(seeds)
    frontier = set(seeds)
    edges_seen: list[tuple] = []
    for _ in range(depth):
        next_frontier: set[str] = set()
        for node in frontier:
            for nb in G.neighbors(node):
                if nb not in visited:
                    next_frontier.add(nb)
                    edges_seen.append((node, nb))
        visited.update(next_frontier)
        frontier = next_frontier

    # Serialize the subgraph the way a query would present it.
    context = []
    for node_id in visited:
        attrs = G.nodes[node_id]
        context.append(f"NODE {attrs.get('label', node_id)} src={attrs.get('source_file', '')} loc={attrs.get('source_location', '')}")
    for u, v in edges_seen:
        if u in visited and v in visited:
            attrs = G.edges[u, v]
            context.append(f"EDGE {G.nodes[u].get('label', u)} --{attrs.get('relation', '')}--> {G.nodes[v].get('label', v)}")

    return _estimate_tokens("\n".join(context))
53
+
54
+
55
# Default benchmark questions: generic enough to match common node labels
# (auth, entry point, errors, data/api layers, core abstractions) in most
# corpora. run_benchmark() silently skips any question with zero matches.
_SAMPLE_QUESTIONS = [
    "how does authentication work",
    "what is the main entry point",
    "how are errors handled",
    "what connects the data layer to the api",
    "what are the core abstractions",
]
62
+
63
+
64
def run_benchmark(
    graph_path: str = ".graphify/graph.json",
    corpus_words: int | None = None,
    questions: list[str] | None = None,
) -> dict:
    """Measure token reduction: corpus tokens vs graphify query tokens.

    Args:
        graph_path: path to the built graph (node-link JSON)
        corpus_words: total word count from detect() output; if None, estimated from graph
        questions: list of questions to benchmark; defaults to _SAMPLE_QUESTIONS

    Returns dict with: corpus_tokens, corpus_words, nodes, edges,
    avg_query_tokens, reduction_ratio, per_question — or {"error": ...}
    when no question matches any node.
    """
    payload = json.loads(Path(graph_path).read_text())
    G = json_graph.node_link_graph(payload, edges="links")

    if corpus_words is None:
        # Rough estimate: each node label is ~3 words, plus source context
        corpus_words = 50 * G.number_of_nodes()

    # words → tokens (100 words ≈ 133 tokens)
    corpus_tokens = corpus_words * 100 // 75

    per_question = []
    for question in (questions or _SAMPLE_QUESTIONS):
        cost = _query_subgraph_tokens(G, question)
        if cost > 0:
            per_question.append({
                "question": question,
                "query_tokens": cost,
                "reduction": round(corpus_tokens / cost, 1),
            })

    if not per_question:
        return {"error": "No matching nodes found for sample questions. Build the graph first."}

    avg_query_tokens = sum(item["query_tokens"] for item in per_question) // len(per_question)
    reduction_ratio = round(corpus_tokens / avg_query_tokens, 1) if avg_query_tokens > 0 else 0

    return {
        "corpus_tokens": corpus_tokens,
        "corpus_words": corpus_words,
        "nodes": G.number_of_nodes(),
        "edges": G.number_of_edges(),
        "avg_query_tokens": avg_query_tokens,
        "reduction_ratio": reduction_ratio,
        "per_question": per_question,
    }
109
+
110
+
111
def print_benchmark(result: dict) -> None:
    """Print a human-readable report for a run_benchmark() result dict.

    Prints a single error line when the result carries an "error" key;
    otherwise prints the corpus/graph sizes, average query cost, overall
    reduction ratio, and one line per benchmarked question.
    """
    if "error" in result:
        print(f"Benchmark error: {result['error']}")
        return

    # Fixed: the title and "Per question" lines were f-strings with no
    # placeholders (ruff F541); the rule line needed no f-string either.
    print("\ngraphify token reduction benchmark")
    print("─" * 50)
    print(f" Corpus: {result['corpus_words']:,} words → ~{result['corpus_tokens']:,} tokens (naive)")
    print(f" Graph: {result['nodes']:,} nodes, {result['edges']:,} edges")
    print(f" Avg query cost: ~{result['avg_query_tokens']:,} tokens")
    print(f" Reduction: {result['reduction_ratio']}x fewer tokens per query")
    print("\n Per question:")
    for p in result["per_question"]:
        # Truncate long questions so each row stays on one line.
        print(f" [{p['reduction']}x] {p['question'][:55]}")
    print()