graphnav 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,10 @@
1
class GraphNotFoundError(Exception):
    """Raised when the knowledge-graph JSON file does not exist on disk."""
3
+
4
+
5
class CodexNotFoundError(Exception):
    """Error for a codex command that could not be found.

    The CLI entry point reports it on stderr and exits with status 127.
    """
7
+
8
+
9
class CodexTimeoutError(Exception):
    """Error for a codex invocation that timed out.

    The CLI entry point reports it on stderr and exits with status 124.
    """
codex_graph/cli.py ADDED
@@ -0,0 +1,238 @@
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import os
5
+ import sys
6
+
7
+ from codex_graph import CodexNotFoundError, CodexTimeoutError, GraphNotFoundError
8
+ from codex_graph.config import load_config
9
+ from codex_graph.graph_query import load_index, query_files
10
+ from codex_graph import runner
11
+
12
+
13
def _run_mono_command(cmd: str, argv: list[str]) -> None:
    """Run the ``map`` or ``watch`` subcommand and exit with its status.

    Args:
        cmd: Either ``"map"`` or ``"watch"``; any other value raises
            ``KeyError`` while the parser description is looked up.
        argv: Command-line arguments that follow the subcommand name.

    The process always terminates through ``sys.exit`` with the return code
    of ``multirepo.run_map`` / ``multirepo.run_watch``.
    """
    from codex_graph import multirepo

    descriptions = {
        "map": "Build per-service graphs and cross-service bridge notes for a monorepo",
        "watch": "Watch for file changes and keep per-service graphs and bridge notes up-to-date",
    }
    parser = argparse.ArgumentParser(
        prog=f"codex-graph {cmd}",
        description=descriptions[cmd],
    )
    parser.add_argument(
        "--root",
        default=".",
        metavar="PATH",
        help="Monorepo root directory (default: current directory)",
    )
    parser.add_argument(
        "--backend",
        default=None,
        metavar="BACKEND",
        help="graphify LLM backend (claude|openai|gemini|deepseek|ollama)",
    )
    parser.add_argument("--config", default=None, metavar="PATH", help="Path to config.toml")

    # Only ``map`` understands --dry-run; ``watch`` rejects it at parse time.
    mapping = cmd == "map"
    if mapping:
        parser.add_argument(
            "--dry-run",
            action="store_true",
            help="Detect services and print the plan without invoking graphify",
        )

    args = parser.parse_args(argv)
    cfg = load_config(args.config)

    shared = {
        "root": args.root,
        "mono_cfg": cfg.mono,
        "backend_override": args.backend,
    }
    if mapping:
        status = multirepo.run_map(dry_run=args.dry_run, **shared)
    else:
        status = multirepo.run_watch(**shared)
    sys.exit(status)
46
+
47
+
48
def _auto_map_if_needed(cfg_path: str | None) -> None:
    """Build graphs automatically when the working directory holds services.

    Loads the configuration, asks ``multirepo.detect_services`` for services
    under the current working directory and returns quietly when none are
    found. Otherwise it announces the detected services, runs
    ``multirepo.run_map`` and terminates the process with its return code.

    Args:
        cfg_path: Optional explicit config path, forwarded to ``load_config``.

    NOTE(review): nothing here checks whether graphs already exist, so every
    call that detects services triggers a map run — confirm that is intended.
    """
    from codex_graph import multirepo

    cfg = load_config(cfg_path)
    root = os.path.abspath(".")
    services = multirepo.detect_services(root, cfg.mono.marker_files)
    if not services:
        return

    names = ", ".join(s.name for s in services)
    # Both progress lines go to stderr so stdout stays free for real output
    # (previously the first line went to stdout and the second to stderr).
    print(f"[codex-graph] Detected {len(services)} service(s): {names}", file=sys.stderr)
    print("[codex-graph] Running 'codex-graph map' to build knowledge graphs ...", file=sys.stderr)
    sys.exit(multirepo.run_map(root=root, mono_cfg=cfg.mono))
63
+
64
+
65
def _run_context_command(argv: list[str]) -> None:
    """Run the ``context`` subcommand: print a context pack and exit.

    The task text comes from the positional argument or, when that is
    missing and stdin is not a terminal, from stdin. Without any task the
    help text is printed and the process exits with status 1. On success
    the pack returned by ``multirepo.build_context_pack`` is printed and
    the process exits with status 0.

    Args:
        argv: Command-line arguments that follow ``context``.
    """
    from codex_graph import multirepo

    parser = argparse.ArgumentParser(
        prog="codex-graph context",
        description="Print a token-budgeted context pack (files + symbol locations + cross-service impact) for a coding task",
    )
    parser.add_argument("task", nargs="?", help="The coding task, in natural language")
    parser.add_argument(
        "--root", default=".", metavar="PATH", help="Repo root (default: current directory)"
    )
    parser.add_argument(
        "--budget", type=int, default=None, metavar="N", help="Approx token budget for the pack"
    )
    parser.add_argument(
        "--files", type=int, default=None, metavar="N", help="Max number of files to include"
    )
    parser.add_argument("--config", default=None, metavar="PATH", help="Path to config.toml")
    args = parser.parse_args(argv)

    # Fall back to piped input only when no task was given on the command line.
    task = args.task or ("" if sys.stdin.isatty() else sys.stdin.read().strip())
    if not task:
        parser.print_help()
        sys.exit(1)

    cfg = load_config(args.config)
    # Explicit flags win over the [mono] defaults from the config file.
    file_limit = cfg.mono.context_top_files if args.files is None else args.files
    token_budget = cfg.mono.context_budget_tokens if args.budget is None else args.budget

    print(
        multirepo.build_context_pack(
            root=args.root,
            task=task,
            top_files=file_limit,
            budget_tokens=token_budget,
            skip_patterns=cfg.graph.skip_patterns,
        )
    )
    sys.exit(0)
96
+
97
+
98
def _run_graph_query_command(kind: str, argv: list[str]) -> None:
    """Run the ``find`` or ``neighbors`` subcommand against the graph.

    Args:
        kind: ``"find"`` lists matching symbols; any other value takes the
            neighbors path (the caller only passes ``"find"`` or
            ``"neighbors"``).
        argv: Command-line arguments that follow the subcommand name.

    Exit status: 1 when no term was given (help is printed), 2 when the
    graph file is missing, 0 otherwise.
    """
    from codex_graph import multirepo
    from codex_graph.graph_nav import GraphNav

    parser = argparse.ArgumentParser(prog=f"codex-graph {kind}")
    parser.add_argument("term", nargs="?", help="query (find) or symbol (neighbors)")
    parser.add_argument("--root", default=".", metavar="PATH")
    parser.add_argument("--config", default=None, metavar="PATH")
    args = parser.parse_args(argv)
    if not args.term:
        parser.print_help()
        sys.exit(1)

    cfg = load_config(args.config)
    graph_path = multirepo._overarching_graph_path(os.path.abspath(args.root))
    if not os.path.exists(graph_path):
        print(f"Error: no knowledge graph at {graph_path}. Run `codex-graph map` first.", file=sys.stderr)
        sys.exit(2)
    nav = GraphNav(graph_path, cfg.graph.skip_patterns)

    if kind == "find":
        matches = nav.find_symbols(args.term, k=10)
        if not matches:
            print("(no matches)")
        for hit in matches:
            print(f"{hit['symbol']} — {hit['file']}:{hit['loc']}")
        sys.exit(0)

    info = nav.neighbors(args.term)
    # A result without a "found" key is treated as a successful lookup.
    if not info.get("found", True):
        print("(symbol not found)")
        sys.exit(0)

    print(f"{info['symbol']} defined at {info['defined_at']}")
    for heading, key in (("callers:", "callers"), ("calls:", "callees")):
        rows = info.get(key)
        if not rows:
            continue
        print(heading)
        for row in rows:
            print(" " + row)
    sys.exit(0)
139
+
140
+
141
def main() -> None:
    """Entry point of the ``codex-graph`` command line.

    When the first argument is one of ``map``, ``watch``, ``context``,
    ``find`` or ``neighbors`` the matching subcommand handler runs.
    Otherwise the arguments are parsed as a prompt invocation: the prompt
    (positional or piped through stdin) is ranked against the graph and
    handed to the runner.

    Exit status: 1 when no prompt is available, 2 when the graph file is
    missing, 127 / 124 for ``CodexNotFoundError`` / ``CodexTimeoutError``,
    0 after ``--list-files`` or ``--dry-run``, otherwise whatever
    ``runner.run`` returns.
    """
    subcommand = sys.argv[1] if len(sys.argv) > 1 else None
    if subcommand in ("map", "watch"):
        _run_mono_command(subcommand, sys.argv[2:])
        return
    if subcommand == "context":
        _run_context_command(sys.argv[2:])
        return
    if subcommand in ("find", "neighbors"):
        _run_graph_query_command(subcommand, sys.argv[2:])
        return

    description = "\n".join(
        [
            "Codex CLI with knowledge-graph context injection for monorepos.",
            "",
            "First-run (after pip install): just run 'codex-graph' or 'codex-graph map'",
            "in your monorepo root — services are auto-detected and graphs are built.",
            "",
            "Subcommands:",
            " map Build per-service graphs and cross-service bridge notes",
            " watch Keep graphs and bridge notes up-to-date as files change",
        ]
    )
    parser = argparse.ArgumentParser(
        prog="codex-graph",
        description=description,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("prompt", nargs="?", help="Natural language task prompt")
    parser.add_argument("--config", default=None, metavar="PATH", help="Path to config.toml")
    parser.add_argument(
        "--top-k", type=int, default=None, metavar="N", help="Number of files to inject as context"
    )
    parser.add_argument("--graph", default=None, metavar="PATH", help="Path to graph.json")
    parser.add_argument(
        "--dry-run", action="store_true", help="Print enriched prompt without calling codex"
    )
    parser.add_argument(
        "--list-files", action="store_true", help="Print ranked files and scores, then exit"
    )
    parser.add_argument(
        "--no-context", action="store_true", help="Pass prompt to codex with no graph context"
    )

    args = parser.parse_args()

    prompt = args.prompt
    if not prompt and sys.stdin.isatty():
        # Interactive run without a prompt: try the first-run auto-map
        # (which exits itself when it finds services), else show help.
        _auto_map_if_needed(args.config)
        parser.print_help()
        sys.exit(1)
    if not prompt:
        prompt = sys.stdin.read().strip()
    if not prompt:
        parser.print_help()
        sys.exit(1)

    cfg = load_config(args.config)

    # Command-line flags override the values loaded from the config file.
    if args.top_k is not None:
        cfg.query.top_k = args.top_k
    if args.graph is not None:
        cfg.graph.path = args.graph

    project_root = os.path.abspath(cfg.graph.project_root)
    # A relative graph path is resolved against the current working directory.
    if os.path.isabs(cfg.graph.path):
        graph_path = cfg.graph.path
    else:
        graph_path = os.path.join(os.getcwd(), cfg.graph.path)

    ranked = []
    if not args.no_context:
        try:
            index = load_index(graph_path, cfg.graph.skip_patterns)
        except GraphNotFoundError as e:
            print(f"Error: {e}", file=sys.stderr)
            sys.exit(2)

        ranked = query_files(
            prompt,
            index,
            cfg.query.top_k,
            cfg.query.community_boost_weight,
            cfg.query.bm25_k1,
            cfg.query.bm25_b,
        )

    if args.list_files:
        for entry in ranked:
            print(f"{entry.score:.3f} {entry.source_file}")
        sys.exit(0)

    if args.dry_run:
        print(runner.build_prompt(prompt, ranked, cfg, project_root))
        sys.exit(0)

    try:
        exit_code = runner.run(prompt, ranked, cfg, project_root)
    except CodexNotFoundError as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(127)
    except CodexTimeoutError as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(124)
    else:
        sys.exit(exit_code)
235
+
236
+
237
if __name__ == "__main__":
    # Executed as a script rather than imported: hand control to the CLI.
    main()
codex_graph/config.py ADDED
@@ -0,0 +1,127 @@
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ import tomllib
5
+ from dataclasses import dataclass, field
6
+
7
+
8
@dataclass
class GraphConfig:
    """Values of the ``[graph]`` config table: where the graph lives and what to ignore."""

    # Path of the graph JSON file.
    path: str = "graphify-out/graph.json"
    # Root directory of the project the graph describes.
    project_root: str = "."
    # Substrings; a source file whose path contains any of them is ignored.
    skip_patterns: list[str] = field(
        default_factory=lambda: [
            "playwright-report",
            "node_modules",
            ".git",
        ]
    )
13
+
14
+
15
@dataclass
class QueryConfig:
    """Values of the ``[query]`` config table: ranking parameters."""

    # Maximum number of ranked files to return.
    top_k: int = 5
    # Multiplier applied to the community-overlap boost.
    community_boost_weight: float = 2.0
    # BM25 term-frequency saturation parameter (k1).
    bm25_k1: float = 1.5
    # BM25 document-length normalisation parameter (b).
    bm25_b: float = 0.75
21
+
22
+
23
@dataclass
class ContextConfig:
    """Values of the ``[context]`` config table.

    NOTE(review): these fields are not read anywhere in the visible code;
    presumably the runner uses them when it builds the prompt — confirm.
    """

    max_file_chars: int = 8000
    show_scores: bool = False
27
+
28
+
29
@dataclass
class CodexConfig:
    """Values of the ``[codex]`` config table.

    NOTE(review): the consumer of these fields is not part of the visible
    code; presumably the runner builds the codex invocation from them —
    confirm before relying on a specific meaning.
    """

    command: str = "codex"
    subcommand: str = "exec"
    # Fresh list per instance so configs never share mutable state.
    extra_args: list[str] = field(default_factory=list)
    inject_via: str = "stdin"
    timeout_seconds: int = 300
36
+
37
+
38
@dataclass
class MonoConfig:
    """Values of the ``[mono]`` config table: monorepo mapping and context-pack defaults."""

    # File names handed to service detection.
    marker_files: list[str] = field(
        default_factory=lambda: [
            "package.json",
            "pyproject.toml",
            "go.mod",
            "Cargo.toml",
            "pom.xml",
            "build.gradle",
            "setup.py",
            "setup.cfg",
            "requirements.txt",
            "Gemfile",
            "composer.json",
            "tsconfig.json",
        ]
    )
    graphify_backend: str = "claude"
    # NOTE(review): presumably seconds between watch polls — confirm in multirepo.
    watch_poll_interval: float = 3.0
    # Defaults for the `context` subcommand when --budget / --files are omitted.
    context_budget_tokens: int = 2000
    context_top_files: int = 8
49
+
50
+
51
@dataclass
class Config:
    """Complete configuration: one section object per config table.

    Each section starts from that section's defaults, and every instance
    gets its own section objects.
    """

    graph: GraphConfig = field(default_factory=GraphConfig)      # [graph]
    query: QueryConfig = field(default_factory=QueryConfig)      # [query]
    context: ContextConfig = field(default_factory=ContextConfig)  # [context]
    codex: CodexConfig = field(default_factory=CodexConfig)      # [codex]
    mono: MonoConfig = field(default_factory=MonoConfig)        # [mono]
58
+
59
+
60
def _apply_toml(cfg: Config, data: dict) -> Config:
    """Overlay parsed TOML tables onto ``cfg`` and return it.

    For each of the ``graph``, ``query``, ``context``, ``codex`` and ``mono``
    tables present in ``data`` the matching section of ``cfg`` is replaced by
    a new section object. Keys missing from a table keep the value currently
    held by ``cfg``; keys that do not correspond to a known field are ignored.

    Args:
        cfg: Configuration to update (mutated in place).
        data: Mapping of table name to table contents, as parsed from TOML.

    Returns:
        The same ``cfg`` object.
    """

    def _overlay(section_cls, table, current, names):
        # A value given in the table wins; otherwise keep the current one.
        return section_cls(
            **{name: table.get(name, getattr(current, name)) for name in names}
        )

    if "graph" in data:
        cfg.graph = _overlay(
            GraphConfig,
            data["graph"],
            cfg.graph,
            ("path", "project_root", "skip_patterns"),
        )
    if "query" in data:
        cfg.query = _overlay(
            QueryConfig,
            data["query"],
            cfg.query,
            ("top_k", "community_boost_weight", "bm25_k1", "bm25_b"),
        )
    if "context" in data:
        cfg.context = _overlay(
            ContextConfig,
            data["context"],
            cfg.context,
            ("max_file_chars", "show_scores"),
        )
    if "codex" in data:
        cfg.codex = _overlay(
            CodexConfig,
            data["codex"],
            cfg.codex,
            ("command", "subcommand", "extra_args", "inject_via", "timeout_seconds"),
        )
    if "mono" in data:
        cfg.mono = _overlay(
            MonoConfig,
            data["mono"],
            cfg.mono,
            (
                "marker_files",
                "graphify_backend",
                "watch_poll_interval",
                "context_budget_tokens",
                "context_top_files",
            ),
        )
    return cfg
101
+
102
+
103
def load_config(explicit_path: str | None = None) -> Config:
    """Load configuration from the first config file that exists.

    With ``explicit_path`` only that file is considered. Otherwise the
    search order is: the path in ``$CODEX_GRAPH_CONFIG`` (if set),
    ``config.toml`` in the current working directory, then
    ``~/.codex-graph/config.toml``.

    Args:
        explicit_path: Config file requested by the user, if any.

    Returns:
        Defaults overlaid with the first existing file, or plain defaults
        when no file exists. A missing ``explicit_path`` only produces a
        warning on stderr.
    """
    if explicit_path:
        search_order = [explicit_path]
    else:
        search_order = []
        from_env = os.environ.get("CODEX_GRAPH_CONFIG")
        if from_env:
            search_order.append(from_env)
        search_order += [
            os.path.join(os.getcwd(), "config.toml"),
            os.path.expanduser("~/.codex-graph/config.toml"),
        ]

    cfg = Config()
    # Only the first existing candidate is read; later ones are never merged.
    found = next((p for p in search_order if os.path.exists(p)), None)
    if found is not None:
        with open(found, "rb") as fh:
            parsed = tomllib.load(fh)
        return _apply_toml(cfg, parsed)

    if explicit_path:
        import sys

        print(f"Warning: config file not found: {explicit_path}", file=sys.stderr)
    return cfg
@@ -0,0 +1,113 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import os
5
+ from collections import defaultdict
6
+
7
+ from codex_graph.graph_query import _tokenize
8
+
9
+
10
+ class GraphNav:
11
+ def __init__(self, graph_path: str, skip_patterns: list[str] | None = None):
12
+ with open(graph_path) as f:
13
+ graph = json.load(f)
14
+ self.skip = skip_patterns or []
15
+ self.id2node: dict = {}
16
+ self.file2ids: dict[str, list] = defaultdict(list)
17
+ for n in graph.get("nodes", []):
18
+ nid = n.get("id")
19
+ if nid is None:
20
+ continue
21
+ self.id2node[nid] = n
22
+ sf = n.get("source_file", "")
23
+ if sf:
24
+ self.file2ids[sf].append(nid)
25
+ self.in_edges: dict[object, list] = defaultdict(list)
26
+ self.out_edges: dict[object, list] = defaultdict(list)
27
+ for e in graph.get("links", []):
28
+ s, t = e.get("source"), e.get("target")
29
+ if s is None or t is None:
30
+ continue
31
+ rel = e.get("relation", "")
32
+ self.out_edges[s].append((t, rel))
33
+ self.in_edges[t].append((s, rel))
34
+
35
+ def _skipped(self, sf: str) -> bool:
36
+ return (not sf) or any(p in sf for p in self.skip)
37
+
38
+ def _loc(self, nid) -> str:
39
+ n = self.id2node.get(nid, {})
40
+ sf = n.get("source_file", "?")
41
+ loc = n.get("source_location", "")
42
+ return f"{sf}:{loc}" if loc else sf
43
+
44
+ def find_symbols(self, query: str, k: int = 8) -> list[dict]:
45
+ q = set(_tokenize(query))
46
+ if not q:
47
+ return []
48
+ scored = []
49
+ for n in self.id2node.values():
50
+ if n.get("file_type") != "code":
51
+ continue
52
+ sf = n.get("source_file", "")
53
+ label = n.get("label", "")
54
+ if self._skipped(sf) or not label:
55
+ continue
56
+ toks = set(_tokenize(label)) | set(_tokenize(os.path.basename(os.path.splitext(sf)[0])))
57
+ overlap = len(q & toks)
58
+ if overlap:
59
+ scored.append((overlap, label, sf, n.get("source_location", "")))
60
+ scored.sort(key=lambda x: -x[0])
61
+ return [{"symbol": l, "file": sf, "loc": loc} for _, l, sf, loc in scored[:k]]
62
+
63
+ def neighbors(self, symbol: str, k: int = 12) -> dict:
64
+ q = set(_tokenize(symbol))
65
+ best, best_ov = None, 0
66
+ for nid, n in self.id2node.items():
67
+ ov = len(q & set(_tokenize(n.get("label", ""))))
68
+ if ov > best_ov:
69
+ best, best_ov = nid, ov
70
+ if best is None:
71
+ return {"symbol": symbol, "found": False}
72
+ callers, callees = [], []
73
+ for s, rel in self.in_edges.get(best, []):
74
+ sn = self.id2node.get(s, {})
75
+ if self._skipped(sn.get("source_file", "")):
76
+ continue
77
+ callers.append(f"{sn.get('label', '?')} ({self._loc(s)}) --{rel}-->")
78
+ for t, rel in self.out_edges.get(best, []):
79
+ tn = self.id2node.get(t, {})
80
+ if self._skipped(tn.get("source_file", "")):
81
+ continue
82
+ callees.append(f"--{rel}--> {tn.get('label', '?')} ({self._loc(t)})")
83
+ return {
84
+ "symbol": self.id2node[best].get("label"),
85
+ "defined_at": self._loc(best),
86
+ "callers": callers[:k],
87
+ "callees": callees[:k],
88
+ }
89
+
90
+ def references_to(self, files: list[str], limit: int = 12) -> list[str]:
91
+ target_ids = set()
92
+ for sf in files:
93
+ target_ids.update(self.file2ids.get(sf, []))
94
+ seen, rows = set(), []
95
+ file_set = set(files)
96
+ for tid in target_ids:
97
+ tnode = self.id2node.get(tid, {})
98
+ for s, rel in self.in_edges.get(tid, []):
99
+ sn = self.id2node.get(s, {})
100
+ sf = sn.get("source_file", "")
101
+ if self._skipped(sf) or sf in file_set:
102
+ continue
103
+ key = (sf, sn.get("source_location", ""), tnode.get("label", ""))
104
+ if key in seen:
105
+ continue
106
+ seen.add(key)
107
+ loc = sn.get("source_location", "")
108
+ rows.append(
109
+ f"{sf}:{loc} {sn.get('label', '?')} --{rel}--> {tnode.get('label', '?')}"
110
+ )
111
+ if len(rows) >= limit:
112
+ return rows
113
+ return rows
@@ -0,0 +1,187 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import math
5
+ import os
6
+ import re
7
+ from collections import Counter, defaultdict
8
+ from dataclasses import dataclass
9
+
10
+ from codex_graph import GraphNotFoundError
11
+
12
+
13
@dataclass
class RankedFile:
    """One ranking result: a source file together with its relevance score."""

    source_file: str  # path as recorded in the graph
    score: float  # higher means more relevant to the prompt
17
+
18
+
19
+ ALLOWED_EXTENSIONS = {
20
+ ".py", ".pyi", ".js", ".jsx", ".ts", ".tsx", ".mjs", ".cjs",
21
+ ".go", ".java", ".kt", ".kts", ".cs", ".rb", ".rs", ".php",
22
+ ".swift", ".scala", ".c", ".cc", ".cpp", ".cxx", ".h", ".hpp",
23
+ ".hh", ".m", ".mm", ".lua", ".dart", ".ex", ".exs", ".clj",
24
+ ".cljs", ".groovy", ".vue", ".svelte", ".sh", ".bash", ".zsh",
25
+ ".pl", ".r", ".sql", ".proto", ".thrift", ".graphql", ".gql",
26
+ ".md", ".mdx", ".markdown", ".rst", ".txt", ".adoc",
27
+ }
28
+
29
+ GENERATED_PATTERNS = (
30
+ ".pb.go", ".pb.cc", ".pb.h", "_pb2.py", "_pb2.pyi", "_pb2_grpc.py",
31
+ "pb2_grpc", "_grpc.pb.", "genproto/", "/generated/", ".generated.",
32
+ ".g.dart", "_pb.dart", "/migrations/",
33
+ )
34
+
35
+
36
+ def _is_rankable(source_file: str) -> bool:
37
+ lower = source_file.lower()
38
+ if any(p in lower for p in GENERATED_PATTERNS):
39
+ return False
40
+ return os.path.splitext(lower)[1] in ALLOWED_EXTENSIONS
41
+
42
+
43
+ _IDENT_RE = re.compile(r"[A-Z]+(?=[A-Z][a-z])|[A-Z][a-z]+|[a-z]+|[A-Z]+|[0-9]+")
44
+
45
+
46
+ def _stem(t: str) -> str:
47
+ if len(t) <= 4 or t.endswith("ss"):
48
+ return t
49
+ if t.endswith("ies"):
50
+ return t[:-3] + "y"
51
+ if t.endswith("es"):
52
+ return t[:-2]
53
+ if t.endswith("s"):
54
+ return t[:-1]
55
+ return t
56
+
57
+
58
+ def _tokenize(s: str) -> list[str]:
59
+ toks: list[str] = []
60
+ for word in re.split(r"[^A-Za-z0-9]+", s):
61
+ if not word:
62
+ continue
63
+ for sub in (_IDENT_RE.findall(word) or [word]):
64
+ t = sub.lower()
65
+ if len(t) >= 2:
66
+ toks.append(_stem(t))
67
+ return toks
68
+
69
+
70
class GraphIndex:
    """File-level search index built from the nodes of a graph JSON file.

    Every rankable source file becomes a bag of tokens taken from the labels
    of its nodes (repeated according to the node's ``file_type`` weight) plus
    tokens from its file name (x6) and directory path (x2). ``rank`` scores
    files with BM25 and adds a boost for files that belong to a community
    whose tokens overlap the query.
    """

    # How many times a node's label tokens are repeated, by node file_type.
    _TYPE_WEIGHT = {"rationale": 3, "document": 2, "concept": 2, "code": 1}

    def __init__(self, graph_path: str, skip_patterns: list[str]):
        """Load the graph and pre-compute the token statistics.

        Args:
            graph_path: Path of the graph JSON file.
            skip_patterns: Substrings; files whose path contains any of them
                are left out of the index.
        """
        # JSON is exchanged as UTF-8; do not depend on the locale encoding.
        with open(graph_path, encoding="utf-8") as f:
            graph = json.load(f)

        nodes = graph.get("nodes", [])

        self.file_tokens: dict[str, list[str]] = defaultdict(list)
        self.file_communities: dict[str, set[int]] = defaultdict(set)
        self.community_tokens: dict[int, set[str]] = defaultdict(set)

        for n in nodes:
            sf = n.get("source_file", "")
            label = n.get("norm_label") or n.get("label") or ""
            cid = n.get("community")
            tokens = _tokenize(label)

            # Community vocabulary is collected from every node, including
            # nodes whose file is later excluded from ranking.
            if cid is not None:
                self.community_tokens[cid].update(tokens)

            if not sf or any(p in sf for p in skip_patterns) or not _is_rankable(sf):
                continue

            weight = self._TYPE_WEIGHT.get(n.get("file_type", "code"), 1)
            self.file_tokens[sf].extend(tokens * weight)
            if cid is not None:
                self.file_communities[sf].add(cid)

        # Path tokens are repeated so a file-name match outweighs a label match.
        for sf in list(self.file_tokens.keys()):
            stem_path = os.path.splitext(sf)[0]
            base_tokens = _tokenize(os.path.basename(stem_path))
            dir_tokens = _tokenize(os.path.dirname(stem_path))
            self.file_tokens[sf].extend(base_tokens * 6)
            self.file_tokens[sf].extend(dir_tokens * 2)

        # Corpus statistics for BM25: document count, mean length, document frequency.
        self._N = len(self.file_tokens)
        self._avgdl = (
            sum(len(t) for t in self.file_tokens.values()) / max(self._N, 1)
        )
        self._df: dict[str, int] = defaultdict(int)
        for tokens in self.file_tokens.values():
            for t in set(tokens):
                self._df[t] += 1

    def _bm25(self, query_tokens: list[str], sf: str, k1: float, b: float) -> float:
        """Return the BM25 score of file ``sf`` for ``query_tokens``.

        Query tokens that occur in no indexed file contribute nothing.
        """
        doc = self.file_tokens.get(sf, [])
        dl = len(doc)
        tf_counts = Counter(doc)
        score = 0.0
        for t in query_tokens:
            df = self._df.get(t)
            if not df:
                continue
            idf = math.log((self._N - df + 0.5) / (df + 0.5) + 1)
            tf = tf_counts[t]
            tf_norm = (tf * (k1 + 1)) / (tf + k1 * (1 - b + b * dl / self._avgdl))
            score += idf * tf_norm
        return score

    def _community_boost(self, query_tokens: list[str], sf: str, boost_weight: float) -> float:
        """Return ``boost_weight`` times the best community overlap for ``sf``.

        Overlap for a community is the number of distinct query tokens found
        in its vocabulary divided by (distinct query tokens + 1).
        """
        qset = set(query_tokens)
        best = 0.0
        for cid in self.file_communities.get(sf, set()):
            ctokens = self.community_tokens[cid]
            overlap = len(qset & ctokens) / (len(qset) + 1)
            if overlap > best:
                best = overlap
        return best * boost_weight

    def rank(
        self,
        prompt: str,
        top_k: int,
        community_boost_weight: float,
        bm25_k1: float,
        bm25_b: float,
        keep_ratio: float = 0.3,
    ) -> list[RankedFile]:
        """Return the best-matching files for ``prompt``, highest score first.

        Args:
            prompt: Free-text query.
            top_k: Maximum number of files to return.
            community_boost_weight: Multiplier for the community boost.
            bm25_k1: BM25 k1 parameter.
            bm25_b: BM25 b parameter.
            keep_ratio: Files scoring below this fraction of the top score
                are dropped.

        Returns:
            At most ``top_k`` results; empty when the prompt yields no tokens
            or no file scores above zero.
        """
        qtoks = _tokenize(prompt)
        if not qtoks:
            return []
        scores = {
            sf: self._bm25(qtoks, sf, bm25_k1, bm25_b)
            + self._community_boost(qtoks, sf, community_boost_weight)
            for sf in self.file_tokens
        }
        ranked = sorted(scores.items(), key=lambda x: -x[1])
        if not ranked or ranked[0][1] <= 0:
            return []
        floor = ranked[0][1] * keep_ratio
        return [
            RankedFile(source_file=sf, score=sc)
            for sf, sc in ranked[:top_k]
            if sc > 0 and sc >= floor
        ]
167
+
168
+
169
def load_index(graph_path: str, skip_patterns: list[str]) -> GraphIndex:
    """Build a ``GraphIndex`` from the graph file at ``graph_path``.

    Args:
        graph_path: Path of the graph JSON file.
        skip_patterns: Substrings of file paths to leave out of the index.

    Raises:
        GraphNotFoundError: If ``graph_path`` does not exist.
    """
    if os.path.exists(graph_path):
        return GraphIndex(graph_path, skip_patterns)

    message = (
        f"graph.json not found: {graph_path}\n"
        "Run Graphify on the repo first, or set [graph] path in config.toml"
    )
    raise GraphNotFoundError(message)
176
+
177
+
178
def query_files(
    prompt: str,
    index: GraphIndex,
    top_k: int,
    community_boost_weight: float = 2.0,
    bm25_k1: float = 1.5,
    bm25_b: float = 0.75,
    keep_ratio: float = 0.3,
) -> list[RankedFile]:
    """Rank the files of ``index`` against ``prompt``.

    Convenience wrapper that forwards every argument, positionally and in
    declaration order, to ``index.rank`` and returns its result.
    """
    rank_args = (prompt, top_k, community_boost_weight, bm25_k1, bm25_b, keep_ratio)
    return index.rank(*rank_args)