graphnav 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- codex_graph/__init__.py +10 -0
- codex_graph/cli.py +238 -0
- codex_graph/config.py +127 -0
- codex_graph/graph_nav.py +113 -0
- codex_graph/graph_query.py +187 -0
- codex_graph/multirepo.py +793 -0
- codex_graph/runner.py +123 -0
- graphnav-0.1.0.dist-info/METADATA +9 -0
- graphnav-0.1.0.dist-info/RECORD +12 -0
- graphnav-0.1.0.dist-info/WHEEL +5 -0
- graphnav-0.1.0.dist-info/entry_points.txt +2 -0
- graphnav-0.1.0.dist-info/top_level.txt +1 -0
codex_graph/__init__.py
ADDED
codex_graph/cli.py
ADDED
|
@@ -0,0 +1,238 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
import os
|
|
5
|
+
import sys
|
|
6
|
+
|
|
7
|
+
from codex_graph import CodexNotFoundError, CodexTimeoutError, GraphNotFoundError
|
|
8
|
+
from codex_graph.config import load_config
|
|
9
|
+
from codex_graph.graph_query import load_index, query_files
|
|
10
|
+
from codex_graph import runner
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def _run_mono_command(cmd: str, argv: list[str]) -> None:
    """Parse and execute the monorepo subcommands ``map`` and ``watch``.

    Args:
        cmd: Either ``"map"`` or ``"watch"``; any other value raises
            ``KeyError`` when the description is looked up.
        argv: Command-line arguments that follow the subcommand name.

    The function never returns normally: it ends with ``sys.exit`` using the
    value returned by ``multirepo.run_map`` / ``multirepo.run_watch``.
    """
    from codex_graph import multirepo

    summaries = {
        "map": "Build per-service graphs and cross-service bridge notes for a monorepo",
        "watch": "Watch for file changes and keep per-service graphs and bridge notes up-to-date",
    }
    building_map = cmd == "map"

    parser = argparse.ArgumentParser(prog=f"codex-graph {cmd}", description=summaries[cmd])
    parser.add_argument(
        "--root",
        default=".",
        metavar="PATH",
        help="Monorepo root directory (default: current directory)",
    )
    parser.add_argument(
        "--backend",
        default=None,
        metavar="BACKEND",
        help="graphify LLM backend (claude|openai|gemini|deepseek|ollama)",
    )
    parser.add_argument("--config", default=None, metavar="PATH", help="Path to config.toml")
    if building_map:
        # Only ``map`` supports planning without invoking graphify.
        parser.add_argument(
            "--dry-run",
            action="store_true",
            help="Detect services and print the plan without invoking graphify",
        )

    options = parser.parse_args(argv)
    config = load_config(options.config)

    # Keyword arguments common to both entry points.
    shared = {
        "root": options.root,
        "mono_cfg": config.mono,
        "backend_override": options.backend,
    }
    if building_map:
        status = multirepo.run_map(**shared, dry_run=options.dry_run)
    else:
        status = multirepo.run_watch(**shared)
    sys.exit(status)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _auto_map_if_needed(cfg_path: str | None) -> None:
    """Build graphs automatically when the current directory contains services.

    Loads the configuration, detects services under the current working
    directory using the configured marker files and, when at least one is
    found, runs ``multirepo.run_map`` and exits the process with its return
    value. When no services are detected the function simply returns.

    Args:
        cfg_path: Optional explicit path to ``config.toml``.
    """
    from codex_graph import multirepo
    from codex_graph.config import load_config

    cfg = load_config(cfg_path)
    root = os.path.abspath(".")
    services = multirepo.detect_services(root, cfg.mono.marker_files)
    if not services:
        return

    names = ", ".join(s.name for s in services)
    # Both progress lines are diagnostics, so both go to stderr; stdout stays
    # clean for anything a caller may want to capture.
    print(f"[codex-graph] Detected {len(services)} service(s): {names}", file=sys.stderr)
    print("[codex-graph] Running 'codex-graph map' to build knowledge graphs ...", file=sys.stderr)
    rc = multirepo.run_map(root=root, mono_cfg=cfg.mono)
    sys.exit(rc)
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def _run_context_command(argv: list[str]) -> None:
    """Handle ``codex-graph context``: print a context pack for a task.

    The task text comes from the positional argument or, when that is absent
    and stdin is not a terminal, from stdin. Without a task the help text is
    printed and the process exits with status 1; otherwise the pack is printed
    and the process exits with status 0.

    Args:
        argv: Command-line arguments that follow the ``context`` subcommand.
    """
    from codex_graph import multirepo

    parser = argparse.ArgumentParser(
        prog="codex-graph context",
        description="Print a token-budgeted context pack (files + symbol locations + cross-service impact) for a coding task",
    )
    parser.add_argument("task", nargs="?", help="The coding task, in natural language")
    parser.add_argument(
        "--root", default=".", metavar="PATH", help="Repo root (default: current directory)"
    )
    parser.add_argument(
        "--budget", type=int, default=None, metavar="N", help="Approx token budget for the pack"
    )
    parser.add_argument(
        "--files", type=int, default=None, metavar="N", help="Max number of files to include"
    )
    parser.add_argument("--config", default=None, metavar="PATH", help="Path to config.toml")
    opts = parser.parse_args(argv)

    # Fall back to piped stdin only when no task was given on the command line.
    task = opts.task or ("" if sys.stdin.isatty() else sys.stdin.read().strip())
    if not task:
        parser.print_help()
        sys.exit(1)

    cfg = load_config(opts.config)
    # Explicit flags win over the configured defaults.
    file_limit = cfg.mono.context_top_files if opts.files is None else opts.files
    token_budget = cfg.mono.context_budget_tokens if opts.budget is None else opts.budget

    pack = multirepo.build_context_pack(
        root=opts.root,
        task=task,
        top_files=file_limit,
        budget_tokens=token_budget,
        skip_patterns=cfg.graph.skip_patterns,
    )
    print(pack)
    sys.exit(0)
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def _run_graph_query_command(kind: str, argv: list[str]) -> None:
    """Handle the ``find`` and ``neighbors`` graph query subcommands.

    Args:
        kind: ``"find"`` to search symbols; any other value is treated as
            ``neighbors``.
        argv: Command-line arguments that follow the subcommand name.

    Exit status: 1 when no term is given (help is printed), 2 when the graph
    file does not exist, 0 otherwise.
    """
    from codex_graph import multirepo
    from codex_graph.graph_nav import GraphNav

    parser = argparse.ArgumentParser(prog=f"codex-graph {kind}")
    parser.add_argument("term", nargs="?", help="query (find) or symbol (neighbors)")
    parser.add_argument("--root", default=".", metavar="PATH")
    parser.add_argument("--config", default=None, metavar="PATH")
    args = parser.parse_args(argv)
    if not args.term:
        parser.print_help()
        sys.exit(1)

    cfg = load_config(args.config)
    graph_path = multirepo._overarching_graph_path(os.path.abspath(args.root))
    if not os.path.exists(graph_path):
        print(f"Error: no knowledge graph at {graph_path}. Run `codex-graph map` first.", file=sys.stderr)
        sys.exit(2)
    nav = GraphNav(graph_path, cfg.graph.skip_patterns)

    if kind == "find":
        hits = nav.find_symbols(args.term, k=10)
        if not hits:
            print("(no matches)")
        for h in hits:
            # A node may carry no source location; omit the separator then,
            # the same way GraphNav._loc does, instead of printing "file:".
            where = f"{h['file']}:{h['loc']}" if h["loc"] else h["file"]
            print(f"{h['symbol']} — {where}")
    else:
        r = nav.neighbors(args.term)
        # A successful lookup carries no "found" key, hence the True default.
        if not r.get("found", True):
            print("(symbol not found)")
            sys.exit(0)
        print(f"{r['symbol']} defined at {r['defined_at']}")
        if r.get("callers"):
            print("callers:")
            for c in r["callers"]:
                print(" " + c)
        if r.get("callees"):
            print("calls:")
            for c in r["callees"]:
                print(" " + c)
    sys.exit(0)
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def main() -> None:
    """Entry point of the ``codex-graph`` command.

    If the first argument is a known subcommand (``map``, ``watch``,
    ``context``, ``find``, ``neighbors``) it is dispatched to its handler.
    Otherwise the arguments are parsed as a prompt run: the prompt is taken
    from the positional argument or stdin, files are ranked against the
    knowledge graph (unless ``--no-context``), and the prompt is handed to
    ``runner``.

    Exit status: 1 when no prompt is available, 2 when the graph file is
    missing, 127 when codex is not found, 124 on timeout, otherwise the value
    returned by ``runner.run``.
    """
    subcommand = sys.argv[1] if len(sys.argv) > 1 else None
    remaining = sys.argv[2:]
    if subcommand in ("map", "watch"):
        _run_mono_command(subcommand, remaining)
        return
    if subcommand == "context":
        _run_context_command(remaining)
        return
    if subcommand in ("find", "neighbors"):
        _run_graph_query_command(subcommand, remaining)
        return

    parser = argparse.ArgumentParser(
        prog="codex-graph",
        description=(
            "Codex CLI with knowledge-graph context injection for monorepos.\n\n"
            "First-run (after pip install): just run 'codex-graph' or 'codex-graph map'\n"
            "in your monorepo root — services are auto-detected and graphs are built.\n\n"
            "Subcommands:\n"
            " map Build per-service graphs and cross-service bridge notes\n"
            " watch Keep graphs and bridge notes up-to-date as files change\n"
            # The three subcommands below are dispatched above and therefore
            # belong in the help text as well.
            " context Print a token-budgeted context pack for a coding task\n"
            " find Search the knowledge graph for symbols matching a query\n"
            " neighbors Show where a symbol is defined plus its callers and callees"
        ),
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("prompt", nargs="?", help="Natural language task prompt")
    parser.add_argument("--config", default=None, metavar="PATH", help="Path to config.toml")
    parser.add_argument("--top-k", type=int, default=None, metavar="N", help="Number of files to inject as context")
    parser.add_argument("--graph", default=None, metavar="PATH", help="Path to graph.json")
    parser.add_argument("--dry-run", action="store_true", help="Print enriched prompt without calling codex")
    parser.add_argument("--list-files", action="store_true", help="Print ranked files and scores, then exit")
    parser.add_argument("--no-context", action="store_true", help="Pass prompt to codex with no graph context")

    args = parser.parse_args()

    prompt = args.prompt
    if not prompt:
        if sys.stdin.isatty():
            # Interactive invocation without a prompt: try the first-run
            # auto-map (which exits on its own if it runs), else show help.
            _auto_map_if_needed(args.config)
            parser.print_help()
            sys.exit(1)
        prompt = sys.stdin.read().strip()
    if not prompt:
        parser.print_help()
        sys.exit(1)

    cfg = load_config(args.config)

    # Command-line flags override the loaded configuration.
    if args.top_k is not None:
        cfg.query.top_k = args.top_k
    if args.graph is not None:
        cfg.graph.path = args.graph

    project_root = os.path.abspath(cfg.graph.project_root)
    # A relative graph path is resolved against the current directory.
    graph_path = (
        cfg.graph.path
        if os.path.isabs(cfg.graph.path)
        else os.path.join(os.getcwd(), cfg.graph.path)
    )

    if args.no_context:
        ranked = []
    else:
        try:
            index = load_index(graph_path, cfg.graph.skip_patterns)
        except GraphNotFoundError as e:
            print(f"Error: {e}", file=sys.stderr)
            sys.exit(2)

        ranked = query_files(
            prompt,
            index,
            cfg.query.top_k,
            cfg.query.community_boost_weight,
            cfg.query.bm25_k1,
            cfg.query.bm25_b,
        )

    if args.list_files:
        for rf in ranked:
            print(f"{rf.score:.3f} {rf.source_file}")
        sys.exit(0)

    if args.dry_run:
        print(runner.build_prompt(prompt, ranked, cfg, project_root))
        sys.exit(0)

    # Keep only the call that can raise inside the try body.
    try:
        exit_code = runner.run(prompt, ranked, cfg, project_root)
    except CodexNotFoundError as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(127)
    except CodexTimeoutError as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(124)
    sys.exit(exit_code)


if __name__ == "__main__":
    main()
|
codex_graph/config.py
ADDED
|
@@ -0,0 +1,127 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
import tomllib
|
|
5
|
+
from dataclasses import dataclass, field
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
@dataclass
class GraphConfig:
    """Settings from the ``[graph]`` table: where the graph lives and what to skip."""

    # Path to the graph JSON file.
    path: str = "graphify-out/graph.json"
    # Root directory of the project the graph describes.
    project_root: str = "."
    # Substrings; a node whose source file contains any of them is ignored.
    skip_patterns: list[str] = field(
        default_factory=lambda: [
            "playwright-report",
            "node_modules",
            ".git",
        ]
    )
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@dataclass
class QueryConfig:
    """Settings from the ``[query]`` table: file-ranking parameters."""

    # Maximum number of ranked files to return.
    top_k: int = 5
    # Multiplier applied to the community-overlap boost.
    community_boost_weight: float = 2.0
    # BM25 term-frequency saturation parameter.
    bm25_k1: float = 1.5
    # BM25 document-length normalisation parameter.
    bm25_b: float = 0.75
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@dataclass
class ContextConfig:
    """Settings from the ``[context]`` table."""

    # NOTE(review): presumably the per-file character cap applied when file
    # content is injected into the prompt; the consumer is not visible here.
    max_file_chars: int = 8000
    # NOTE(review): presumably toggles printing ranking scores alongside the
    # injected files; confirm against the runner.
    show_scores: bool = False
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@dataclass
class CodexConfig:
    """Settings from the ``[codex]`` table: how the codex CLI is invoked.

    NOTE(review): the runner that consumes these values is not visible here;
    the per-field notes below are assumptions to confirm against it.
    """

    # Executable name (presumably resolved on PATH).
    command: str = "codex"
    # Subcommand passed to the executable.
    subcommand: str = "exec"
    # Additional arguments for the invocation; a fresh list per instance.
    extra_args: list[str] = field(default_factory=list)
    # How the prompt is delivered to codex.
    inject_via: str = "stdin"
    # Presumably the time limit, in seconds, for one codex run.
    timeout_seconds: int = 300
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
@dataclass
class MonoConfig:
    """Settings from the ``[mono]`` table: monorepo mapping and context packs."""

    # File names handed to service detection as markers of a service directory.
    marker_files: list[str] = field(
        default_factory=lambda: [
            "package.json",
            "pyproject.toml",
            "go.mod",
            "Cargo.toml",
            "pom.xml",
            "build.gradle",
            "setup.py",
            "setup.cfg",
            "requirements.txt",
            "Gemfile",
            "composer.json",
            "tsconfig.json",
        ]
    )
    # Default graphify LLM backend.
    graphify_backend: str = "claude"
    # NOTE(review): presumably the delay in seconds between polls in watch mode.
    watch_poll_interval: float = 3.0
    # Default approximate token budget for a context pack.
    context_budget_tokens: int = 2000
    # Default maximum number of files in a context pack.
    context_top_files: int = 8
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
@dataclass
class Config:
    """Top-level configuration: one attribute per ``config.toml`` table.

    Every section is created from its own defaults, so ``Config()`` is a
    complete, usable configuration.
    """

    # [graph]
    graph: GraphConfig = field(default_factory=GraphConfig)
    # [query]
    query: QueryConfig = field(default_factory=QueryConfig)
    # [context]
    context: ContextConfig = field(default_factory=ContextConfig)
    # [codex]
    codex: CodexConfig = field(default_factory=CodexConfig)
    # [mono]
    mono: MonoConfig = field(default_factory=MonoConfig)
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def _apply_toml(cfg: Config, data: dict) -> Config:
    """Overlay parsed TOML tables onto ``cfg`` and return it.

    For each known table present in ``data`` the matching section of ``cfg``
    is replaced by a new section object. Keys missing from the table keep the
    value the section had before; keys the section does not know are ignored.

    Args:
        cfg: Configuration to update in place.
        data: Mapping from table name to a mapping of that table's keys.

    Returns:
        The same ``cfg`` object.
    """
    # (table name, section class, keys read from the table)
    layout = (
        ("graph", GraphConfig, ("path", "project_root", "skip_patterns")),
        ("query", QueryConfig, ("top_k", "community_boost_weight", "bm25_k1", "bm25_b")),
        ("context", ContextConfig, ("max_file_chars", "show_scores")),
        (
            "codex",
            CodexConfig,
            ("command", "subcommand", "extra_args", "inject_via", "timeout_seconds"),
        ),
        (
            "mono",
            MonoConfig,
            (
                "marker_files",
                "graphify_backend",
                "watch_poll_interval",
                "context_budget_tokens",
                "context_top_files",
            ),
        ),
    )
    for section, factory, keys in layout:
        if section not in data:
            continue
        table = data[section]
        current = getattr(cfg, section)
        merged = {key: table.get(key, getattr(current, key)) for key in keys}
        setattr(cfg, section, factory(**merged))
    return cfg
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def load_config(explicit_path: str | None = None) -> Config:
    """Load configuration from the first config file that exists.

    Lookup order when ``explicit_path`` is not given: the path in the
    ``CODEX_GRAPH_CONFIG`` environment variable (if set), ``config.toml`` in
    the current directory, then ``~/.codex-graph/config.toml``. When
    ``explicit_path`` is given it is the only candidate.

    Args:
        explicit_path: Optional path to a specific config file.

    Returns:
        A ``Config`` holding defaults overlaid with the first file found, or
        pure defaults when no candidate is a file. A missing explicit path
        only produces a warning on stderr.

    Raises:
        tomllib.TOMLDecodeError: If the selected file is not valid TOML.
    """
    cfg = Config()

    candidates: list[str] = []
    if explicit_path:
        candidates = [explicit_path]
    else:
        env_path = os.environ.get("CODEX_GRAPH_CONFIG")
        if env_path:
            candidates.append(env_path)
        candidates.append(os.path.join(os.getcwd(), "config.toml"))
        candidates.append(os.path.expanduser("~/.codex-graph/config.toml"))

    for path in candidates:
        # isfile rather than exists: a directory that happens to carry the
        # candidate name must be skipped, not opened (IsADirectoryError).
        if os.path.isfile(path):
            with open(path, "rb") as f:
                data = tomllib.load(f)
            cfg = _apply_toml(cfg, data)
            break
    else:
        # Reached only when no candidate was loaded.
        if explicit_path:
            import sys
            print(f"Warning: config file not found: {explicit_path}", file=sys.stderr)

    return cfg
|
codex_graph/graph_nav.py
ADDED
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import os
|
|
5
|
+
from collections import defaultdict
|
|
6
|
+
|
|
7
|
+
from codex_graph.graph_query import _tokenize
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class GraphNav:
    """Symbol-level navigation over a knowledge-graph JSON file.

    The file is read as a JSON object with a ``nodes`` list (each node may
    carry ``id``, ``label``, ``source_file``, ``source_location`` and
    ``file_type``) and a ``links`` list (each link may carry ``source``,
    ``target`` and ``relation``). Nodes without an ``id`` and links without
    both endpoints are ignored.
    """

    def __init__(self, graph_path: str, skip_patterns: list[str] | None = None):
        """Load the graph and build node, file and edge indexes.

        Args:
            graph_path: Path to the graph JSON file.
            skip_patterns: Substrings; nodes whose source file contains any
                of them are excluded from query results.
        """
        # JSON is exchanged as UTF-8; do not rely on the platform's locale
        # encoding, which would mis-decode non-ASCII labels on some systems.
        with open(graph_path, encoding="utf-8") as f:
            graph = json.load(f)
        self.skip = skip_patterns or []
        self.id2node: dict = {}
        self.file2ids: dict[str, list] = defaultdict(list)
        for n in graph.get("nodes", []):
            nid = n.get("id")
            if nid is None:
                continue
            self.id2node[nid] = n
            sf = n.get("source_file", "")
            if sf:
                self.file2ids[sf].append(nid)
        # Adjacency lists of (other node id, relation) pairs.
        self.in_edges: dict[object, list] = defaultdict(list)
        self.out_edges: dict[object, list] = defaultdict(list)
        for e in graph.get("links", []):
            s, t = e.get("source"), e.get("target")
            if s is None or t is None:
                continue
            rel = e.get("relation", "")
            self.out_edges[s].append((t, rel))
            self.in_edges[t].append((s, rel))

    def _skipped(self, sf: str) -> bool:
        """Return True for an empty source file or one matching a skip pattern."""
        return (not sf) or any(p in sf for p in self.skip)

    def _loc(self, nid) -> str:
        """Format a node as ``file:location``, or just ``file`` without a location.

        Unknown node ids yield ``"?"``.
        """
        n = self.id2node.get(nid, {})
        sf = n.get("source_file", "?")
        loc = n.get("source_location", "")
        return f"{sf}:{loc}" if loc else sf

    def find_symbols(self, query: str, k: int = 8) -> list[dict]:
        """Return up to ``k`` code symbols sharing tokens with ``query``.

        Only nodes whose ``file_type`` is ``"code"`` are considered. A node
        is scored by how many query tokens occur in its label or in its file
        name (without extension). The sort is stable, so ties keep node order.

        Returns:
            Dicts with the keys ``symbol``, ``file`` and ``loc``.
        """
        q = set(_tokenize(query))
        if not q:
            return []
        scored = []
        for n in self.id2node.values():
            if n.get("file_type") != "code":
                continue
            sf = n.get("source_file", "")
            label = n.get("label", "")
            if self._skipped(sf) or not label:
                continue
            toks = set(_tokenize(label)) | set(_tokenize(os.path.basename(os.path.splitext(sf)[0])))
            overlap = len(q & toks)
            if overlap:
                scored.append((overlap, label, sf, n.get("source_location", "")))
        scored.sort(key=lambda x: -x[0])
        return [{"symbol": label, "file": sf, "loc": loc} for _, label, sf, loc in scored[:k]]

    def neighbors(self, symbol: str, k: int = 12) -> dict:
        """Describe the node best matching ``symbol`` and its direct edges.

        The best match is the first node with the highest label-token overlap.

        Returns:
            ``{"symbol": symbol, "found": False}`` when no label shares a
            token with ``symbol``; otherwise a dict with ``symbol``,
            ``defined_at``, ``callers`` and ``callees`` (each list capped at
            ``k`` entries, skipped files omitted).
        """
        q = set(_tokenize(symbol))
        best, best_ov = None, 0
        for nid, n in self.id2node.items():
            ov = len(q & set(_tokenize(n.get("label", ""))))
            if ov > best_ov:
                best, best_ov = nid, ov
        if best is None:
            return {"symbol": symbol, "found": False}
        callers, callees = [], []
        for s, rel in self.in_edges.get(best, []):
            sn = self.id2node.get(s, {})
            if self._skipped(sn.get("source_file", "")):
                continue
            callers.append(f"{sn.get('label', '?')} ({self._loc(s)}) --{rel}-->")
        for t, rel in self.out_edges.get(best, []):
            tn = self.id2node.get(t, {})
            if self._skipped(tn.get("source_file", "")):
                continue
            callees.append(f"--{rel}--> {tn.get('label', '?')} ({self._loc(t)})")
        return {
            "symbol": self.id2node[best].get("label"),
            "defined_at": self._loc(best),
            "callers": callers[:k],
            "callees": callees[:k],
        }

    def references_to(self, files: list[str], limit: int = 12) -> list[str]:
        """List incoming references to ``files`` from other files.

        Args:
            files: Source files whose nodes are the reference targets.
            limit: Maximum number of rows to return.

        Returns:
            Rows of the form ``<location> <source label> --<relation>--> <target label>``,
            de-duplicated on (source file, source location, target label).
            Sources in skipped files or in ``files`` themselves are left out.
        """
        target_ids = set()
        for sf in files:
            target_ids.update(self.file2ids.get(sf, []))
        seen, rows = set(), []
        file_set = set(files)
        for tid in target_ids:
            tnode = self.id2node.get(tid, {})
            for s, rel in self.in_edges.get(tid, []):
                sn = self.id2node.get(s, {})
                sf = sn.get("source_file", "")
                if self._skipped(sf) or sf in file_set:
                    continue
                key = (sf, sn.get("source_location", ""), tnode.get("label", ""))
                if key in seen:
                    continue
                seen.add(key)
                # _loc leaves out the ":" when the node has no location, so a
                # row never starts with a dangling "file:".
                rows.append(
                    f"{self._loc(s)} {sn.get('label', '?')} --{rel}--> {tnode.get('label', '?')}"
                )
                if len(rows) >= limit:
                    return rows
        return rows
|
|
@@ -0,0 +1,187 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import math
|
|
5
|
+
import os
|
|
6
|
+
import re
|
|
7
|
+
from collections import Counter, defaultdict
|
|
8
|
+
from dataclasses import dataclass
|
|
9
|
+
|
|
10
|
+
from codex_graph import GraphNotFoundError
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@dataclass
class RankedFile:
    """One ranking result: a source file and the score it received."""

    # Path of the file as it appears in the graph's ``source_file`` field.
    source_file: str
    # Ranking score; higher means more relevant to the prompt.
    score: float
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
ALLOWED_EXTENSIONS = {
|
|
20
|
+
".py", ".pyi", ".js", ".jsx", ".ts", ".tsx", ".mjs", ".cjs",
|
|
21
|
+
".go", ".java", ".kt", ".kts", ".cs", ".rb", ".rs", ".php",
|
|
22
|
+
".swift", ".scala", ".c", ".cc", ".cpp", ".cxx", ".h", ".hpp",
|
|
23
|
+
".hh", ".m", ".mm", ".lua", ".dart", ".ex", ".exs", ".clj",
|
|
24
|
+
".cljs", ".groovy", ".vue", ".svelte", ".sh", ".bash", ".zsh",
|
|
25
|
+
".pl", ".r", ".sql", ".proto", ".thrift", ".graphql", ".gql",
|
|
26
|
+
".md", ".mdx", ".markdown", ".rst", ".txt", ".adoc",
|
|
27
|
+
}
|
|
28
|
+
|
|
29
|
+
GENERATED_PATTERNS = (
|
|
30
|
+
".pb.go", ".pb.cc", ".pb.h", "_pb2.py", "_pb2.pyi", "_pb2_grpc.py",
|
|
31
|
+
"pb2_grpc", "_grpc.pb.", "genproto/", "/generated/", ".generated.",
|
|
32
|
+
".g.dart", "_pb.dart", "/migrations/",
|
|
33
|
+
)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def _is_rankable(source_file: str) -> bool:
|
|
37
|
+
lower = source_file.lower()
|
|
38
|
+
if any(p in lower for p in GENERATED_PATTERNS):
|
|
39
|
+
return False
|
|
40
|
+
return os.path.splitext(lower)[1] in ALLOWED_EXTENSIONS
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
_IDENT_RE = re.compile(r"[A-Z]+(?=[A-Z][a-z])|[A-Z][a-z]+|[a-z]+|[A-Z]+|[0-9]+")
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _stem(t: str) -> str:
|
|
47
|
+
if len(t) <= 4 or t.endswith("ss"):
|
|
48
|
+
return t
|
|
49
|
+
if t.endswith("ies"):
|
|
50
|
+
return t[:-3] + "y"
|
|
51
|
+
if t.endswith("es"):
|
|
52
|
+
return t[:-2]
|
|
53
|
+
if t.endswith("s"):
|
|
54
|
+
return t[:-1]
|
|
55
|
+
return t
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def _tokenize(s: str) -> list[str]:
|
|
59
|
+
toks: list[str] = []
|
|
60
|
+
for word in re.split(r"[^A-Za-z0-9]+", s):
|
|
61
|
+
if not word:
|
|
62
|
+
continue
|
|
63
|
+
for sub in (_IDENT_RE.findall(word) or [word]):
|
|
64
|
+
t = sub.lower()
|
|
65
|
+
if len(t) >= 2:
|
|
66
|
+
toks.append(_stem(t))
|
|
67
|
+
return toks
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
class GraphIndex:
    """BM25 index over the source files named in a knowledge graph.

    Each rankable source file becomes one "document" whose tokens come from
    the labels of its nodes (repeated by node type weight), from its file name
    (x6) and from its directory path (x2). Nodes may also belong to a
    community; the tokens of every community are collected for an extra
    overlap boost during ranking.
    """

    # How many times a node's label tokens are repeated, by ``file_type``.
    _TYPE_WEIGHT = {"rationale": 3, "document": 2, "concept": 2, "code": 1}

    def __init__(self, graph_path: str, skip_patterns: list[str]):
        """Load the graph and build token, community and frequency tables.

        Args:
            graph_path: Path to the graph JSON file.
            skip_patterns: Substrings; source files containing any of them
                are left out of the index.
        """
        # JSON is exchanged as UTF-8; do not rely on the platform's locale
        # encoding, which would mis-decode non-ASCII labels on some systems.
        with open(graph_path, encoding="utf-8") as f:
            graph = json.load(f)

        nodes = graph.get("nodes", [])

        self.file_tokens: dict[str, list[str]] = defaultdict(list)
        self.file_communities: dict[str, set[int]] = defaultdict(set)
        self.community_tokens: dict[int, set[str]] = defaultdict(set)

        for n in nodes:
            sf = n.get("source_file", "")
            label = n.get("norm_label") or n.get("label") or ""
            cid = n.get("community")
            tokens = _tokenize(label)

            # Community vocabulary includes nodes from files that are
            # themselves excluded from ranking below.
            if cid is not None:
                self.community_tokens[cid].update(tokens)

            if not sf or any(p in sf for p in skip_patterns) or not _is_rankable(sf):
                continue

            weight = self._TYPE_WEIGHT.get(n.get("file_type", "code"), 1)
            self.file_tokens[sf].extend(tokens * weight)
            if cid is not None:
                self.file_communities[sf].add(cid)

        # Path tokens count more than label tokens: file name x6, directory x2.
        for sf in list(self.file_tokens.keys()):
            stem_path = os.path.splitext(sf)[0]
            base_tokens = _tokenize(os.path.basename(stem_path))
            dir_tokens = _tokenize(os.path.dirname(stem_path))
            self.file_tokens[sf].extend(base_tokens * 6)
            self.file_tokens[sf].extend(dir_tokens * 2)

        # Corpus statistics for BM25.
        self._N = len(self.file_tokens)
        self._avgdl = (
            sum(len(t) for t in self.file_tokens.values()) / max(self._N, 1)
        )
        self._df: dict[str, int] = defaultdict(int)
        for tokens in self.file_tokens.values():
            for t in set(tokens):
                self._df[t] += 1

    def _bm25(self, query_tokens: list[str], sf: str, k1: float, b: float) -> float:
        """Return the BM25 score of file ``sf`` for ``query_tokens``.

        Query tokens that occur in no indexed file contribute nothing.
        """
        doc = self.file_tokens.get(sf, [])
        dl = len(doc)
        tf_counts = Counter(doc)
        score = 0.0
        for t in query_tokens:
            df = self._df.get(t)
            if not df:
                continue
            idf = math.log((self._N - df + 0.5) / (df + 0.5) + 1)
            tf = tf_counts[t]
            # A non-zero df implies at least one non-empty document, so
            # _avgdl is positive here.
            tf_norm = (tf * (k1 + 1)) / (tf + k1 * (1 - b + b * dl / self._avgdl))
            score += idf * tf_norm
        return score

    def _community_boost(self, query_tokens: list[str], sf: str, boost_weight: float) -> float:
        """Return ``boost_weight`` times the best community overlap for ``sf``.

        Overlap is the number of distinct query tokens found in a community's
        vocabulary divided by (distinct query tokens + 1); the maximum over
        the file's communities is used.
        """
        qset = set(query_tokens)
        best = 0.0
        for cid in self.file_communities.get(sf, set()):
            ctokens = self.community_tokens[cid]
            overlap = len(qset & ctokens) / (len(qset) + 1)
            if overlap > best:
                best = overlap
        return best * boost_weight

    def rank(
        self,
        prompt: str,
        top_k: int,
        community_boost_weight: float,
        bm25_k1: float,
        bm25_b: float,
        keep_ratio: float = 0.3,
    ) -> list[RankedFile]:
        """Rank indexed files against ``prompt``.

        Args:
            prompt: Free-text query.
            top_k: Maximum number of results; zero or negative yields none.
            community_boost_weight: Multiplier for the community boost.
            bm25_k1: BM25 ``k1`` parameter.
            bm25_b: BM25 ``b`` parameter.
            keep_ratio: Results scoring below ``keep_ratio`` times the best
                score are dropped.

        Returns:
            Up to ``top_k`` files with positive scores, best first; empty when
            the prompt has no tokens or nothing scores above zero.
        """
        qtoks = _tokenize(prompt)
        if not qtoks:
            return []
        scores = {
            sf: self._bm25(qtoks, sf, bm25_k1, bm25_b)
            + self._community_boost(qtoks, sf, community_boost_weight)
            for sf in self.file_tokens
        }
        ranked = sorted(scores.items(), key=lambda x: -x[1])
        if not ranked or ranked[0][1] <= 0:
            return []
        floor = ranked[0][1] * keep_ratio
        # A negative slice bound would drop entries from the end instead of
        # returning nothing, so clamp it.
        limit = max(top_k, 0)
        return [
            RankedFile(source_file=sf, score=sc)
            for sf, sc in ranked[:limit]
            if sc > 0 and sc >= floor
        ]
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def load_index(graph_path: str, skip_patterns: list[str]) -> GraphIndex:
    """Build a ``GraphIndex`` from the graph file at ``graph_path``.

    Args:
        graph_path: Path to the graph JSON file.
        skip_patterns: Substrings of source-file paths to leave out.

    Raises:
        GraphNotFoundError: If nothing exists at ``graph_path``.
    """
    if os.path.exists(graph_path):
        return GraphIndex(graph_path, skip_patterns)

    message = (
        f"graph.json not found: {graph_path}\n"
        "Run Graphify on the repo first, or set [graph] path in config.toml"
    )
    raise GraphNotFoundError(message)
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
def query_files(
    prompt: str,
    index: GraphIndex,
    top_k: int,
    community_boost_weight: float = 2.0,
    bm25_k1: float = 1.5,
    bm25_b: float = 0.75,
    keep_ratio: float = 0.3,
) -> list[RankedFile]:
    """Rank the files of ``index`` against ``prompt``.

    Thin functional wrapper that forwards every argument, positionally and in
    order, to ``index.rank`` and returns its result unchanged.
    """
    ranking_args = (
        prompt,
        top_k,
        community_boost_weight,
        bm25_k1,
        bm25_b,
        keep_ratio,
    )
    return index.rank(*ranking_args)
|