codebase-index 1.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- codebase_index/__init__.py +7 -0
- codebase_index/__main__.py +3 -0
- codebase_index/cli.py +916 -0
- codebase_index/config.py +110 -0
- codebase_index/discovery/__init__.py +10 -0
- codebase_index/discovery/classify.py +151 -0
- codebase_index/discovery/ignore.py +58 -0
- codebase_index/discovery/walker.py +75 -0
- codebase_index/doctor.py +138 -0
- codebase_index/embeddings/__init__.py +2 -0
- codebase_index/embeddings/backend.py +67 -0
- codebase_index/embeddings/external.py +56 -0
- codebase_index/embeddings/local.py +41 -0
- codebase_index/embeddings/noop.py +15 -0
- codebase_index/graph/__init__.py +8 -0
- codebase_index/graph/analysis.py +468 -0
- codebase_index/graph/builder.py +160 -0
- codebase_index/graph/expand.py +136 -0
- codebase_index/graph/export.py +381 -0
- codebase_index/graph/navigate.py +201 -0
- codebase_index/indexer/__init__.py +8 -0
- codebase_index/indexer/doc_chunks.py +202 -0
- codebase_index/indexer/freshness.py +109 -0
- codebase_index/indexer/pipeline.py +423 -0
- codebase_index/mcp/__init__.py +2 -0
- codebase_index/mcp/server.py +354 -0
- codebase_index/models.py +145 -0
- codebase_index/output/__init__.py +6 -0
- codebase_index/output/json.py +13 -0
- codebase_index/output/markdown.py +316 -0
- codebase_index/output/redact.py +31 -0
- codebase_index/parsers/__init__.py +9 -0
- codebase_index/parsers/base.py +47 -0
- codebase_index/parsers/languages.py +290 -0
- codebase_index/parsers/line_chunker.py +39 -0
- codebase_index/parsers/symbol_chunks.py +62 -0
- codebase_index/parsers/treesitter.py +439 -0
- codebase_index/retrieval/__init__.py +9 -0
- codebase_index/retrieval/budget.py +82 -0
- codebase_index/retrieval/fusion.py +62 -0
- codebase_index/retrieval/intent.py +56 -0
- codebase_index/retrieval/pipeline.py +207 -0
- codebase_index/retrieval/rerank.py +69 -0
- codebase_index/retrieval/searchers.py +291 -0
- codebase_index/retrieval/skeleton.py +251 -0
- codebase_index/retrieval/types.py +79 -0
- codebase_index/scaffold.py +399 -0
- codebase_index/service.py +158 -0
- codebase_index/skill_template/SKILL.md +198 -0
- codebase_index/skill_template/examples/hooks/settings.json +16 -0
- codebase_index/skill_template/scripts/cbx +25 -0
- codebase_index/skill_template/scripts/cbx.ps1 +25 -0
- codebase_index/skill_update.py +150 -0
- codebase_index/storage/__init__.py +8 -0
- codebase_index/storage/db.py +116 -0
- codebase_index/storage/repo.py +701 -0
- codebase_index/storage/schema.sql +125 -0
- codebase_index/watch/__init__.py +5 -0
- codebase_index/watch/watcher.py +93 -0
- codebase_index-1.6.0.dist-info/METADATA +748 -0
- codebase_index-1.6.0.dist-info/RECORD +64 -0
- codebase_index-1.6.0.dist-info/WHEEL +4 -0
- codebase_index-1.6.0.dist-info/entry_points.txt +4 -0
- codebase_index-1.6.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,316 @@
|
|
|
1
|
+
"""Compact Markdown renderer for SearchResponse and dict payloads."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Optional
|
|
6
|
+
|
|
7
|
+
from ..models import ImpactResponse, RefsResponse, SearchResponse, SymbolResponse
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def render(resp: SearchResponse | dict) -> str:
    """Render a search result as compact Markdown.

    A plain ``dict`` payload is handed to ``_render_dict``; any other value is
    treated as a ``SearchResponse`` and handed to ``_render_search_response``.
    """
    renderer = _render_dict if isinstance(resp, dict) else _render_search_response
    return renderer(resp)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def _render_dict(payload: dict) -> str:
|
|
17
|
+
lines: list[str] = []
|
|
18
|
+
lines.append(f"**Query:** {payload['query']} ")
|
|
19
|
+
lines.append(
|
|
20
|
+
f"**Intent:** `{payload['intent']}` · **Confidence:** {payload['confidence']}\n"
|
|
21
|
+
)
|
|
22
|
+
|
|
23
|
+
if payload["results"]:
|
|
24
|
+
lines.append("| # | Path | Lines | Reason |")
|
|
25
|
+
lines.append("|---|------|-------|--------|")
|
|
26
|
+
for r in payload["results"]:
|
|
27
|
+
lines.append(
|
|
28
|
+
f"| {r['rank']} | `{r['path']}` | {r['line_start']}-{r['line_end']} "
|
|
29
|
+
f"| {r.get('reason', '')} |"
|
|
30
|
+
)
|
|
31
|
+
lines.append("")
|
|
32
|
+
for r in payload["results"]:
|
|
33
|
+
if r.get("snippet"):
|
|
34
|
+
lines.append(f"`{r['path']}:{r['line_start']}-{r['line_end']}`")
|
|
35
|
+
lines.append("```")
|
|
36
|
+
lines.append(r["snippet"])
|
|
37
|
+
lines.append("```")
|
|
38
|
+
|
|
39
|
+
if payload["recommended_reads"]:
|
|
40
|
+
lines.append("\n**Recommended reads:**")
|
|
41
|
+
for rr in payload["recommended_reads"]:
|
|
42
|
+
lines.append(f"- `{rr['path']}:{rr['line_start']}-{rr['line_end']}`")
|
|
43
|
+
|
|
44
|
+
fb = payload.get("fallback_suggestions", {}).get("ripgrep")
|
|
45
|
+
if fb:
|
|
46
|
+
lines.append("\n**Fallback (low confidence) — try:**")
|
|
47
|
+
for cmd in fb:
|
|
48
|
+
lines.append(f"- `{cmd}`")
|
|
49
|
+
|
|
50
|
+
pg = payload.get("pagination")
|
|
51
|
+
if pg:
|
|
52
|
+
shown = f"results {pg['offset'] + 1}–{pg['offset'] + len(payload['results'])}"
|
|
53
|
+
if pg.get("has_more"):
|
|
54
|
+
lines.append(f"\n_Showing {shown}; more available — `--offset {pg['next_offset']}`._")
|
|
55
|
+
else:
|
|
56
|
+
lines.append(f"\n_Showing {shown} (end of results)._")
|
|
57
|
+
|
|
58
|
+
return "\n".join(lines)
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def _render_search_response(resp: SearchResponse) -> str:
|
|
62
|
+
lines: list[str] = []
|
|
63
|
+
freshness = "fresh" if not resp.index.stale else "STALE"
|
|
64
|
+
if not resp.index.exists:
|
|
65
|
+
freshness = "NO INDEX"
|
|
66
|
+
lines.append(
|
|
67
|
+
f"**query:** {resp.query} | **intent:** {resp.intent} | "
|
|
68
|
+
f"**confidence:** {resp.confidence} | **index:** {freshness}"
|
|
69
|
+
)
|
|
70
|
+
lines.append("")
|
|
71
|
+
|
|
72
|
+
if resp.results:
|
|
73
|
+
lines.append("| # | path | lines | reason |")
|
|
74
|
+
lines.append("|---|------|-------|--------|")
|
|
75
|
+
for result in resp.results:
|
|
76
|
+
symbols = f" `{','.join(result.symbols)}`" if result.symbols else ""
|
|
77
|
+
lines.append(
|
|
78
|
+
f"| {result.rank} | `{result.path}`{symbols} | "
|
|
79
|
+
f"{result.line_start}-{result.line_end} | {result.reason} |"
|
|
80
|
+
)
|
|
81
|
+
lines.append("")
|
|
82
|
+
for result in resp.results:
|
|
83
|
+
if result.snippet:
|
|
84
|
+
lines.append(f"`{result.path}:{result.line_start}-{result.line_end}`")
|
|
85
|
+
lines.append("```")
|
|
86
|
+
lines.append(result.snippet)
|
|
87
|
+
lines.append("```")
|
|
88
|
+
lines.append("")
|
|
89
|
+
else:
|
|
90
|
+
lines.append("_No index matches._")
|
|
91
|
+
lines.append("")
|
|
92
|
+
|
|
93
|
+
if resp.recommended_reads:
|
|
94
|
+
lines.append("**recommended reads:**")
|
|
95
|
+
for read in resp.recommended_reads:
|
|
96
|
+
lines.append(f"- `{read.path}:{read.line_start}-{read.line_end}`")
|
|
97
|
+
lines.append("")
|
|
98
|
+
|
|
99
|
+
if resp.fallback_suggestions:
|
|
100
|
+
lines.append("**fallback:**")
|
|
101
|
+
for commands in resp.fallback_suggestions.values():
|
|
102
|
+
for command in commands:
|
|
103
|
+
lines.append(f"- `{command}`")
|
|
104
|
+
|
|
105
|
+
return "\n".join(lines).rstrip() + "\n"
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def render_symbols(resp: SymbolResponse) -> str:
    """Render symbol definitions as a Markdown table under the query header.

    Each row shows the qualified name (or the plain name when there is no
    qualified one), kind, path, line range and signature. With no symbols a
    "no definitions" notice replaces the table.
    """
    out = [_header(resp.query, resp.index.exists, resp.index.stale), ""]
    if resp.symbols:
        out.append("| name | kind | path | lines | signature |")
        out.append("|------|------|------|-------|-----------|")
        out.extend(
            f"| `{sym.qualified or sym.name}` | {sym.kind} | `{sym.path}` | "
            f"{sym.line_start}-{sym.line_end} | `{sym.signature or ''}` |"
            for sym in resp.symbols
        )
    else:
        out.append("_No symbol definitions found._")
    return "\n".join(out).rstrip() + "\n"
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def _coverage_line(coverage) -> Optional[str]:
|
|
128
|
+
if coverage is not None and getattr(coverage, "partial", False):
|
|
129
|
+
return f"\n> ⚠️ Partial graph coverage: {coverage.reason}"
|
|
130
|
+
return None
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
# Audit-trail glyphs: an exact edge needs no annotation; inferred/ambiguous ones
|
|
134
|
+
# warn the reader that the link is a heuristic or could not be pinned down.
|
|
135
|
+
_CONF_MARK = {"extracted": "", "inferred": "~ inferred", "ambiguous": "? ambiguous"}
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
def _conf_mark(confidence: Optional[str]) -> str:
|
|
139
|
+
return _CONF_MARK.get(confidence or "extracted", confidence or "")
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def render_refs(resp: RefsResponse) -> str:
    """Render reference sites as a Markdown table under the query header.

    Each row lists kind, path, line and a confidence column; an edge with no
    annotation is shown as ``exact``. A partial-coverage warning, when there
    is one, is appended after the table or after the "no references" notice.
    """
    out = [_header(resp.query, resp.index.exists, resp.index.stale), ""]
    warning = _coverage_line(resp.coverage)

    if resp.sites:
        out.append("| kind | path | line | confidence |")
        out.append("|------|------|------|------------|")
        out.extend(
            f"| {site.kind} | `{site.path}` | {site.line} | {_conf_mark(site.confidence) or 'exact'} |"
            for site in resp.sites
        )
    else:
        out.append("_No references found._")

    if warning:
        out.append(warning)
    return "\n".join(out).rstrip() + "\n"
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
def _node_label(ref: dict) -> str:
|
|
164
|
+
name = ref.get("name")
|
|
165
|
+
path = ref.get("path") or ""
|
|
166
|
+
return f"`{name}` ({path})" if name and ref.get("kind") == "symbol" else f"`{path}`"
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def render_path(payload: dict) -> str:
    """Render a path between two nodes as an arrow chain annotated with edge types.

    When ``found`` is falsy only the heading and the reason are returned.
    Otherwise the first node is followed, for every step, by an edge line and
    the next node. Arrows point right for ``direction == "out"`` and left for
    anything else.
    """
    title = f"**path:** `{payload['src']}` → `{payload['dst']}`"
    if not payload.get("found"):
        why = payload.get("reason", "No path found.")
        return f"{title}\n\n_{why}_\n"

    out = [f"{title} · **{payload.get('hops', 0)} hop(s)**", ""]
    nodes = payload.get("nodes", [])
    steps = payload.get("steps", [])

    # Layout: A --edge(conf)--> B --edge--> C, one element per line.
    if nodes:
        out.append(_node_label(nodes[0]))
        for step, following in zip(steps, nodes[1:]):
            mark = _conf_mark(step.get("confidence"))
            edge = f"{step['edge_type']} {mark}" if mark else f"{step['edge_type']}"
            arrow = "←" if step.get("direction") != "out" else "→"
            out.append(f" {arrow} _{edge}_ {arrow}")
            out.append(_node_label(following))

    return "\n".join(out).rstrip() + "\n"
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
def render_describe(payload: dict) -> str:
    """Render a symbol node card: definition, centrality, callers, callees.

    ``query`` is required. When ``found`` is falsy only the heading and the
    reason are returned. ``primary``, ``definitions``, ``callers`` and
    ``callees`` are optional; a missing key or a ``None`` value is treated as
    empty. At most 20 callers and 20 callees are listed; when more exist, a
    closing bullet states how many were left out so the list agrees with the
    count in the section heading.
    """
    head = f"**describe:** `{payload['query']}`"
    if not payload.get("found"):
        return f"{head}\n\n_{payload.get('reason', 'Not found.')}_\n"

    shown = 20  # cap on listed callers / callees

    # `or {}` covers a key that is present but set to None.
    p = payload.get("primary") or {}
    god = f" · god node #{p['god_rank']}" if p.get("god_rank") else ""
    lines = [
        f"{head} · module `{p.get('module', '?')}` · "
        f"in {p.get('in_degree', 0)} / out {p.get('out_degree', 0)}{god}",
        "",
    ]

    defs = payload.get("definitions") or []
    if defs:
        lines.append("**definition(s):**")
        for d in defs:
            sig = f" — `{d['signature']}`" if d.get("signature") else ""
            lines.append(f"- {d['kind']} `{d.get('qualified') or d['name']}` "
                         f"at `{d['path']}:{d['line_start']}`{sig}")
        lines.append("")

    callers = payload.get("callers") or []
    if callers:
        lines.append(f"**callers ({len(callers)}):**")
        for c in callers[:shown]:
            mark = _conf_mark(c.get("confidence"))
            lines.append(f"- `{c['path']}:{c['line']}`{' · ' + mark if mark else ''}")
        if len(callers) > shown:
            lines.append(f"- … {len(callers) - shown} more not shown")
        lines.append("")

    callees = payload.get("callees") or []
    if callees:
        lines.append(f"**callees ({len(callees)}):**")
        for c in callees[:shown]:
            mark = _conf_mark(c.get("confidence"))
            lines.append(f"- {_node_label(c)} _{c.get('edge_type', '')}_"
                         f"{' · ' + mark if mark else ''}")
        if len(callees) > shown:
            lines.append(f"- … {len(callees) - shown} more not shown")
        lines.append("")

    return "\n".join(lines).rstrip() + "\n"
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
def render_architecture(payload: dict) -> str:
    """Render the architecture overview: modules, god nodes, surprising links, questions.

    When ``available`` is falsy only the italicised reason is returned.
    Otherwise a summary line is followed by one section per non-empty list in
    the payload. The text always ends with exactly one newline.
    """
    if not payload.get("available", False):
        return f"_{payload.get('reason', 'No architecture analysis available.')}_\n"

    index_info = payload.get("index", {})
    state = "STALE" if index_info.get("stale") else "fresh"
    out = [
        f"**Architecture overview** | **index:** {state} | "
        f"{payload.get('node_count', 0)} nodes · {payload.get('edge_count', 0)} edges · "
        f"{payload.get('community_count', 0)} modules · modularity {payload.get('modularity', 0)}",
        "",
    ]

    modules = payload.get("communities", [])
    if modules:
        out += ["### Modules", "| # | module | size | key nodes |", "|---|--------|------|-----------|"]
        for module in modules:
            # Only the first four key nodes are listed per module.
            key_nodes = [f"`{node['name']}`" for node in module.get("top_nodes", [])[:4]]
            out.append(
                f"| {module['id']} | {module['label']} | {module['size']} | {', '.join(key_nodes)} |"
            )
        out.append("")

    hubs = payload.get("god_nodes", [])
    if hubs:
        out += [
            "### God nodes (most-connected)",
            "| node | kind | degree | location |",
            "|------|------|--------|----------|",
        ]
        for hub in hubs:
            where = hub.get("path") or ""
            out.append(f"| `{hub['name']}` | {hub['kind']} | {hub['degree']} | `{where}` |")
        out.append("")

    bridges = payload.get("surprising", [])
    if bridges:
        out.append("### Surprising connections (cross-module bridges)")
        for bridge in bridges:
            left, right = bridge["from"], bridge["to"]
            out.append(
                f"- `{left['name']}` ({left.get('path') or '?'}) ↔ "
                f"`{right['name']}` ({right.get('path') or '?'}) — {bridge['edge_count']} edge(s)"
            )
        out.append("")

    prompts = payload.get("questions", [])
    if prompts:
        out.append("### Suggested questions")
        out.extend(f"- {prompt}" for prompt in prompts)
        out.append("")

    return "\n".join(out).rstrip() + "\n"
|
|
287
|
+
|
|
288
|
+
|
|
289
|
+
def _header(query: str, exists: bool, stale: bool) -> str:
|
|
290
|
+
freshness = "fresh" if not stale else "STALE"
|
|
291
|
+
if not exists:
|
|
292
|
+
freshness = "NO INDEX"
|
|
293
|
+
return f"**query:** {query} | **index:** {freshness}"
|
|
294
|
+
|
|
295
|
+
|
|
296
|
+
def render_impact(resp: ImpactResponse) -> str:
    """Render an impact analysis as a Markdown table.

    The heading gives target, direction, depth and the number of affected
    files. Nodes are listed by distance, then path, then start line. A
    partial-coverage warning, when there is one, closes the output.
    """
    out = [
        f"**impact:** `{resp.target}` · **direction:** {resp.direction} · "
        f"**depth:** {resp.depth} · **affected files:** {len(resp.files)}",
        "",
    ]
    warning = _coverage_line(resp.coverage)

    if resp.nodes:
        out.append("| dist | via | kind | node | location |")
        out.append("|------|-----|------|------|----------|")

        def order(node):
            return (node.distance, node.path, node.line_start or 0)

        for node in sorted(resp.nodes, key=order):
            where = node.path if not node.line_start else f"{node.path}:{node.line_start}"
            label = "—" if not node.name else f"`{node.name}`"
            mark = _conf_mark(node.via_confidence)
            # strip() drops the separator when either the edge or the mark is empty.
            via = f"{node.via_edge or ''} {mark}".strip()
            out.append(f"| {node.distance} | {via} | {node.kind} | {label} | `{where}` |")
    else:
        out.append("_No impact found (target unknown or no edges)._")

    if warning:
        out.append(warning)
    return "\n".join(out).rstrip() + "\n"
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
"""Conservative output-time secret redaction."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
|
|
7
|
+
# PEM private-key block; group 1 is the label shared by the BEGIN/END markers.
_PRIVATE_KEY_RE = re.compile(
    r"-----BEGIN ([A-Z ]*PRIVATE KEY)-----.*?-----END \1-----",
    re.DOTALL,
)
_AWS_ACCESS_KEY_RE = re.compile(r"\bAKIA[0-9A-Z]{16}\b")
# keyword, separator, optionally quoted value of 16 or more characters.
_ASSIGNED_SECRET_RE = re.compile(
    r"(?i)\b(api[_-]?key|access[_-]?token|secret|token|password)\b"
    r"(\s*[:=]\s*)"
    r"([\"']?)[A-Za-z0-9_./+=:-]{16,}\3"
)
_PRIVATE_KEY_MARK = "<<redacted:private_key>>"
_LINE_BREAK_RE = re.compile(r"\r?\n")


def redact_snippet(text: str) -> str:
    """Return ``text`` with recognisable secrets replaced by placeholders.

    Three passes run in order: PEM private-key blocks, AWS access key ids,
    then ``keyword = value`` style assignments. For assignments the keyword
    and separator are kept and only the value (with its quotes) is replaced.
    """
    text = _PRIVATE_KEY_RE.sub(_redact_private_key, text)
    text = _AWS_ACCESS_KEY_RE.sub("<<redacted:aws_access_key>>", text)
    return _ASSIGNED_SECRET_RE.sub(
        lambda m: f"{m.group(1)}{m.group(2)}<<redacted:secret>>", text
    )


def _redact_private_key(match: re.Match[str]) -> str:
    """Replace the key material of one PEM block, keeping its BEGIN/END markers.

    The body is taken from between the two markers instead of by inspecting
    whole lines. This way a key written on a single line (for example a JSON
    string with literal ``\\n`` escapes) is still redacted, and an indented
    END marker is not overwritten. Every body line with content becomes one
    placeholder; blank or whitespace-only lines are kept, and line breaks are
    emitted as ``\\n``.
    """
    label = match.group(1)
    header = f"-----BEGIN {label}-----"
    footer = f"-----END {label}-----"
    block = match.group(0)
    body = block[len(header):len(block) - len(footer)]
    masked = [
        _PRIVATE_KEY_MARK if line.strip() else line
        for line in _LINE_BREAK_RE.split(body)
    ]
    return header + "\n".join(masked) + footer
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
"""Parsers turn an eligible file into chunks + symbols.
|
|
2
|
+
|
|
3
|
+
base.py : Parser protocol -> parse(text) -> ParseResult (chunks, symbols, edges).
|
|
4
|
+
treesitter.py : AST-based symbol extraction using tree-sitter grammars.
|
|
5
|
+
line_chunker.py : fallback line-window chunking for unsupported / unparseable files.
|
|
6
|
+
languages.py : grammar registry + per-language node->symbol-kind maps + import/call queries.
|
|
7
|
+
|
|
8
|
+
Selection: treesitter when a grammar exists for the detected language, else line_chunker.
|
|
9
|
+
"""
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
"""Shared parser types."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass, field
|
|
6
|
+
from typing import Optional, Protocol
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@dataclass
class Chunk:
    """A piece of file content together with the line range it came from."""

    # NOTE(review): presumably 1-based, inclusive line numbers — confirm against the chunkers.
    line_start: int
    line_end: int
    content: str  # text of the chunk
    token_est: int  # token estimate for `content`; how it is computed is not visible here
    kind: str = "window"  # defaults to "window"; other values are set by producers elsewhere
    symbol_index: Optional[int] = None  # presumably an index into ParseResult.symbols — TODO confirm
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@dataclass
class Symbol:
    """A named definition found in a file, with its line range."""

    name: str
    kind: str  # e.g. "class" or "function"; NOTE(review): full set of kinds lives in the parsers — confirm
    # NOTE(review): presumably 1-based, inclusive line numbers — confirm against the parsers.
    line_start: int
    line_end: int
    qualified: Optional[str] = None  # qualified name, when one is known
    signature: Optional[str] = None  # declaration signature text, when one is known
    parent_index: Optional[int] = None  # presumably the index of the enclosing symbol in the same list — TODO confirm
    docstring: Optional[str] = None
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
@dataclass
class Edge:
    """A reference from a file (or one of its symbols) to a named target."""

    edge_type: str  # relation label; NOTE(review): presumably call/import/extends/implements — confirm
    callee_name: str  # name of the referenced target
    line: int  # line on which the reference occurs
    src_symbol_index: Optional[int] = None  # presumably the index of the originating symbol — TODO confirm
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
@dataclass
class ParseResult:
    """Everything a parser produced for one file: chunks, symbols and edges.

    Each list defaults to a fresh empty list per instance.
    """

    chunks: list[Chunk] = field(default_factory=list)
    symbols: list[Symbol] = field(default_factory=list)
    edges: list[Edge] = field(default_factory=list)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class Parser(Protocol):
    """Structural interface for parsers: any object with a matching ``parse`` method."""

    # Takes the file's text and returns the chunks, symbols and edges found in it.
    def parse(self, text: str) -> ParseResult: ...
|
|
@@ -0,0 +1,290 @@
|
|
|
1
|
+
"""Per-language tree-sitter specs."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from typing import Optional
|
|
7
|
+
|
|
8
|
+
CONTAINER_KINDS = {"class", "interface", "enum", "struct", "trait", "impl", "record"}
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
_PY_IMPORTS = """
|
|
12
|
+
(import_from_statement module_name: (dotted_name) @import.module)
|
|
13
|
+
(import_statement name: (dotted_name) @import.module)
|
|
14
|
+
(class_definition superclasses: (argument_list (identifier) @extends.base))
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
_JS_IMPORTS = """
|
|
18
|
+
(import_statement source: (string (string_fragment) @import.module))
|
|
19
|
+
(class_declaration (class_heritage (identifier) @extends.base))
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
_TS_IMPORTS = """
|
|
23
|
+
(import_statement source: (string (string_fragment) @import.module))
|
|
24
|
+
(class_declaration (class_heritage
|
|
25
|
+
(extends_clause value: (identifier) @extends.base)))
|
|
26
|
+
(class_declaration (class_heritage
|
|
27
|
+
(implements_clause (type_identifier) @implements.iface)))
|
|
28
|
+
"""
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
@dataclass(frozen=True)
class LangSpec:
    """Immutable bundle of tree-sitter query strings for one language."""

    name: str  # key under which the spec is registered in LANGS
    ts_name: str  # NOTE(review): presumably the grammar name passed to tree-sitter — confirm
    defs_query: str  # query capturing definitions (@name / @def.*)
    calls_query: str  # query capturing call targets (@callee)
    imports_query: str = ""  # query capturing imports and base types; empty when not provided
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
_PYTHON = LangSpec(
|
|
41
|
+
name="python",
|
|
42
|
+
ts_name="python",
|
|
43
|
+
defs_query="""
|
|
44
|
+
(function_definition name: (identifier) @name) @def.function
|
|
45
|
+
(class_definition name: (identifier) @name) @def.class
|
|
46
|
+
""",
|
|
47
|
+
calls_query="""
|
|
48
|
+
(call function: (identifier) @callee)
|
|
49
|
+
(call function: (attribute attribute: (identifier) @callee))
|
|
50
|
+
""",
|
|
51
|
+
imports_query=_PY_IMPORTS,
|
|
52
|
+
)
|
|
53
|
+
|
|
54
|
+
_JS_DEFS = """
|
|
55
|
+
(function_declaration name: (identifier) @name) @def.function
|
|
56
|
+
(class_declaration name: (identifier) @name) @def.class
|
|
57
|
+
(method_definition name: (property_identifier) @name) @def.method
|
|
58
|
+
(variable_declarator name: (identifier) @name value: (arrow_function)) @def.function
|
|
59
|
+
(variable_declarator name: (identifier) @name value: (function_expression)) @def.function
|
|
60
|
+
"""
|
|
61
|
+
_JS_CALLS = """
|
|
62
|
+
(call_expression function: (identifier) @callee)
|
|
63
|
+
(call_expression function: (member_expression property: (property_identifier) @callee))
|
|
64
|
+
"""
|
|
65
|
+
|
|
66
|
+
_JAVASCRIPT = LangSpec(
|
|
67
|
+
name="javascript",
|
|
68
|
+
ts_name="javascript",
|
|
69
|
+
defs_query=_JS_DEFS,
|
|
70
|
+
calls_query=_JS_CALLS,
|
|
71
|
+
imports_query=_JS_IMPORTS,
|
|
72
|
+
)
|
|
73
|
+
|
|
74
|
+
_TS_DEFS = """
|
|
75
|
+
(function_declaration name: (identifier) @name) @def.function
|
|
76
|
+
(class_declaration name: (type_identifier) @name) @def.class
|
|
77
|
+
(method_definition name: (property_identifier) @name) @def.method
|
|
78
|
+
(variable_declarator name: (identifier) @name value: (arrow_function)) @def.function
|
|
79
|
+
(interface_declaration name: (type_identifier) @name) @def.interface
|
|
80
|
+
(enum_declaration name: (identifier) @name) @def.enum
|
|
81
|
+
(type_alias_declaration name: (type_identifier) @name) @def.type
|
|
82
|
+
"""
|
|
83
|
+
|
|
84
|
+
_TYPESCRIPT = LangSpec(
|
|
85
|
+
name="typescript",
|
|
86
|
+
ts_name="typescript",
|
|
87
|
+
defs_query=_TS_DEFS,
|
|
88
|
+
calls_query=_JS_CALLS,
|
|
89
|
+
imports_query=_TS_IMPORTS,
|
|
90
|
+
)
|
|
91
|
+
|
|
92
|
+
# --- Tier A: compiled / back-end languages ------------------------------------------------------
|
|
93
|
+
# NOTE: symbol extraction is driven by treesitter._definition_kind (node-type mapping), not by
|
|
94
|
+
# defs_query. These queries are kept as compile-checked documentation of the relevant node types
|
|
95
|
+
# and to power graph edges (imports_query). See tests/test_languages.py.
|
|
96
|
+
|
|
97
|
+
_JAVA = LangSpec(
|
|
98
|
+
name="java",
|
|
99
|
+
ts_name="java",
|
|
100
|
+
defs_query="""
|
|
101
|
+
(class_declaration name: (identifier) @name) @def.class
|
|
102
|
+
(interface_declaration name: (identifier) @name) @def.interface
|
|
103
|
+
(enum_declaration name: (identifier) @name) @def.enum
|
|
104
|
+
(record_declaration name: (identifier) @name) @def.record
|
|
105
|
+
(method_declaration name: (identifier) @name) @def.method
|
|
106
|
+
(constructor_declaration name: (identifier) @name) @def.method
|
|
107
|
+
""",
|
|
108
|
+
calls_query="(method_invocation name: (identifier) @callee)",
|
|
109
|
+
imports_query="""
|
|
110
|
+
(import_declaration (scoped_identifier) @import.module)
|
|
111
|
+
(superclass (type_identifier) @extends.base)
|
|
112
|
+
(super_interfaces (type_list (type_identifier) @implements.iface))
|
|
113
|
+
""",
|
|
114
|
+
)
|
|
115
|
+
|
|
116
|
+
_GO = LangSpec(
|
|
117
|
+
name="go",
|
|
118
|
+
ts_name="go",
|
|
119
|
+
defs_query="""
|
|
120
|
+
(function_declaration name: (identifier) @name) @def.function
|
|
121
|
+
(method_declaration name: (field_identifier) @name) @def.method
|
|
122
|
+
(type_spec name: (type_identifier) @name) @def.type
|
|
123
|
+
""",
|
|
124
|
+
calls_query="(call_expression function: (identifier) @callee)",
|
|
125
|
+
imports_query="(import_spec (interpreted_string_literal) @import.module)",
|
|
126
|
+
)
|
|
127
|
+
|
|
128
|
+
_RUST = LangSpec(
|
|
129
|
+
name="rust",
|
|
130
|
+
ts_name="rust",
|
|
131
|
+
defs_query="""
|
|
132
|
+
(function_item name: (identifier) @name) @def.function
|
|
133
|
+
(struct_item name: (type_identifier) @name) @def.struct
|
|
134
|
+
(enum_item name: (type_identifier) @name) @def.enum
|
|
135
|
+
(trait_item name: (type_identifier) @name) @def.trait
|
|
136
|
+
(impl_item type: (type_identifier) @name) @def.impl
|
|
137
|
+
(mod_item name: (identifier) @name) @def.module
|
|
138
|
+
""",
|
|
139
|
+
calls_query="(call_expression function: (identifier) @callee)",
|
|
140
|
+
imports_query="""
|
|
141
|
+
(use_declaration (scoped_identifier) @import.module)
|
|
142
|
+
(use_declaration (identifier) @import.module)
|
|
143
|
+
(use_declaration (use_as_clause) @import.module)
|
|
144
|
+
(use_declaration (scoped_use_list) @import.module)
|
|
145
|
+
""",
|
|
146
|
+
)
|
|
147
|
+
|
|
148
|
+
_C = LangSpec(
|
|
149
|
+
name="c",
|
|
150
|
+
ts_name="c",
|
|
151
|
+
defs_query="""
|
|
152
|
+
(function_definition
|
|
153
|
+
declarator: (function_declarator declarator: (identifier) @name)) @def.function
|
|
154
|
+
(struct_specifier name: (type_identifier) @name) @def.struct
|
|
155
|
+
""",
|
|
156
|
+
calls_query="(call_expression function: (identifier) @callee)",
|
|
157
|
+
imports_query="""
|
|
158
|
+
(preproc_include path: (system_lib_string) @import.module)
|
|
159
|
+
(preproc_include path: (string_literal) @import.module)
|
|
160
|
+
""",
|
|
161
|
+
)
|
|
162
|
+
|
|
163
|
+
_CPP = LangSpec(
|
|
164
|
+
name="cpp",
|
|
165
|
+
ts_name="cpp",
|
|
166
|
+
defs_query="""
|
|
167
|
+
(function_definition
|
|
168
|
+
declarator: (function_declarator declarator: (identifier) @name)) @def.function
|
|
169
|
+
(class_specifier name: (type_identifier) @name) @def.class
|
|
170
|
+
(struct_specifier name: (type_identifier) @name) @def.struct
|
|
171
|
+
(namespace_definition name: (namespace_identifier) @name) @def.module
|
|
172
|
+
""",
|
|
173
|
+
calls_query="(call_expression function: (identifier) @callee)",
|
|
174
|
+
imports_query="""
|
|
175
|
+
(preproc_include path: (system_lib_string) @import.module)
|
|
176
|
+
(preproc_include path: (string_literal) @import.module)
|
|
177
|
+
(base_class_clause (type_identifier) @extends.base)
|
|
178
|
+
""",
|
|
179
|
+
)
|
|
180
|
+
|
|
181
|
+
_CSHARP = LangSpec(
|
|
182
|
+
name="csharp",
|
|
183
|
+
ts_name="csharp",
|
|
184
|
+
defs_query="""
|
|
185
|
+
(class_declaration name: (identifier) @name) @def.class
|
|
186
|
+
(interface_declaration name: (identifier) @name) @def.interface
|
|
187
|
+
(struct_declaration name: (identifier) @name) @def.struct
|
|
188
|
+
(enum_declaration name: (identifier) @name) @def.enum
|
|
189
|
+
(method_declaration name: (identifier) @name) @def.method
|
|
190
|
+
(constructor_declaration name: (identifier) @name) @def.method
|
|
191
|
+
""",
|
|
192
|
+
calls_query="(invocation_expression function: (identifier) @callee)",
|
|
193
|
+
imports_query="""
|
|
194
|
+
(using_directive (identifier) @import.module)
|
|
195
|
+
(using_directive (qualified_name) @import.module)
|
|
196
|
+
(base_list (identifier) @extends.base)
|
|
197
|
+
""",
|
|
198
|
+
)
|
|
199
|
+
|
|
200
|
+
_RUBY = LangSpec(
|
|
201
|
+
name="ruby",
|
|
202
|
+
ts_name="ruby",
|
|
203
|
+
defs_query="""
|
|
204
|
+
(class name: (constant) @name) @def.class
|
|
205
|
+
(module name: (constant) @name) @def.module
|
|
206
|
+
(method name: (identifier) @name) @def.method
|
|
207
|
+
""",
|
|
208
|
+
calls_query="(call method: (identifier) @callee)",
|
|
209
|
+
imports_query="(superclass (constant) @extends.base)",
|
|
210
|
+
)
|
|
211
|
+
|
|
212
|
+
_PHP = LangSpec(
|
|
213
|
+
name="php",
|
|
214
|
+
ts_name="php",
|
|
215
|
+
defs_query="""
|
|
216
|
+
(class_declaration name: (name) @name) @def.class
|
|
217
|
+
(interface_declaration name: (name) @name) @def.interface
|
|
218
|
+
(trait_declaration name: (name) @name) @def.trait
|
|
219
|
+
(method_declaration name: (name) @name) @def.method
|
|
220
|
+
(function_definition name: (name) @name) @def.function
|
|
221
|
+
""",
|
|
222
|
+
calls_query="(function_call_expression function: (name) @callee)",
|
|
223
|
+
imports_query="""
|
|
224
|
+
(namespace_use_declaration (namespace_use_clause (qualified_name) @import.module))
|
|
225
|
+
(base_clause (name) @extends.base)
|
|
226
|
+
(class_interface_clause (name) @implements.iface)
|
|
227
|
+
""",
|
|
228
|
+
)
|
|
229
|
+
|
|
230
|
+
_KOTLIN = LangSpec(
|
|
231
|
+
name="kotlin",
|
|
232
|
+
ts_name="kotlin",
|
|
233
|
+
defs_query="""
|
|
234
|
+
(class_declaration (type_identifier) @name) @def.class
|
|
235
|
+
(object_declaration (type_identifier) @name) @def.class
|
|
236
|
+
(function_declaration (simple_identifier) @name) @def.function
|
|
237
|
+
""",
|
|
238
|
+
calls_query="(call_expression (simple_identifier) @callee)",
|
|
239
|
+
imports_query="(import_header (identifier) @import.module)",
|
|
240
|
+
)
|
|
241
|
+
|
|
242
|
+
# Registry of the hand-written specs, keyed by each spec's `name`.
LANGS: dict[str, LangSpec] = dict(
    (spec.name, spec)
    for spec in [
        _PYTHON,
        _JAVASCRIPT,
        _TYPESCRIPT,
        _JAVA,
        _GO,
        _RUST,
        _C,
        _CPP,
        _CSHARP,
        _RUBY,
        _PHP,
        _KOTLIN,
    ]
)
|
|
259
|
+
|
|
260
|
+
|
|
261
|
+
def has_grammar(lang: Optional[str]) -> bool:
    """Report whether a tree-sitter grammar can be loaded for `lang` (Tier B eligibility).

    Returns ``False`` for ``None`` or an empty name, and also when the grammar
    package cannot be imported or the lookup raises for any reason.
    """
    if not lang:
        return False
    try:
        from tree_sitter_language_pack import get_language

        grammar = get_language(lang)
    except Exception:
        # Deliberate best effort: any failure just means "no grammar".
        return False
    return grammar is not None
|
|
271
|
+
|
|
272
|
+
|
|
273
|
+
def is_supported(lang: Optional[str]) -> bool:
    """Report whether `lang` has a Tier-A spec or, failing that, a loadable Tier-B grammar."""
    # The registry check comes first so the grammar lookup is skipped for Tier-A languages.
    return lang in LANGS or has_grammar(lang)
|
|
278
|
+
|
|
279
|
+
|
|
280
|
+
def spec_for(lang: Optional[str]) -> Optional[LangSpec]:
    """Return the registered spec for `lang`, or ``None`` when the name is empty or unregistered."""
    if not lang:
        return None
    return LANGS.get(lang)
|
|
282
|
+
|
|
283
|
+
|
|
284
|
+
def has_full_graph(lang: Optional[str]) -> bool:
    """Report whether `lang` has a Tier-A spec (full import/inheritance edges for refs/impact).

    A Tier-B language has a loadable grammar but no hand-tuned spec. It yields
    symbols and best-effort call sites only, so its dependency graph is
    partial and this returns ``False``.
    """
    spec = spec_for(lang)
    return spec is not None
|