projectmind-mcp 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- annotations.py +315 -0
- ast_splitter.py +382 -0
- background_indexer.py +523 -0
- bm25_index.py +221 -0
- cache_manager.py +286 -0
- code_intelligence.py +1551 -0
- codebase_indexer.py +440 -0
- config.py +421 -0
- context.py +147 -0
- exceptions.py +49 -0
- git_utils.py +161 -0
- incremental_indexing.py +158 -0
- logger.py +119 -0
- maintenance.py +560 -0
- manifest.py +495 -0
- mcp_server.py +3477 -0
- memory_limited_indexer.py +112 -0
- memory_manager.py +413 -0
- projectmind_mcp-0.9.0.dist-info/METADATA +399 -0
- projectmind_mcp-0.9.0.dist-info/RECORD +26 -0
- projectmind_mcp-0.9.0.dist-info/WHEEL +5 -0
- projectmind_mcp-0.9.0.dist-info/entry_points.txt +3 -0
- projectmind_mcp-0.9.0.dist-info/top_level.txt +21 -0
- query_router.py +417 -0
- symbol_graph.py +1345 -0
- vector_store_manager.py +528 -0
annotations.py
ADDED
|
@@ -0,0 +1,315 @@
|
|
|
1
|
+
"""
|
|
2
|
+
File annotations — AI-authored summaries that make keyword search semantic.
|
|
3
|
+
|
|
4
|
+
The design inverts the usual RAG architecture: instead of a small embedding
|
|
5
|
+
model guessing what code means, the *client* LLM (which actually understands
|
|
6
|
+
the code) writes a short natural-language summary + keywords per file. Those
|
|
7
|
+
annotations are indexed into the BM25 corpus and served as a cheap query tier,
|
|
8
|
+
so "where is authentication handled?" matches the summary text with plain
|
|
9
|
+
keyword search — no vector database required.
|
|
10
|
+
|
|
11
|
+
Storage: `.ai/annotations.json` (atomic writes, UTF-8, project-relative posix
|
|
12
|
+
keys). Each entry remembers the file's mtime at annotation time so stale
|
|
13
|
+
annotations (file changed since) can be surfaced for re-annotation.
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
import json
|
|
19
|
+
import os
|
|
20
|
+
import re
|
|
21
|
+
import threading
|
|
22
|
+
from dataclasses import dataclass, field
|
|
23
|
+
from datetime import datetime
|
|
24
|
+
from pathlib import Path
|
|
25
|
+
from typing import Any
|
|
26
|
+
|
|
27
|
+
import config
|
|
28
|
+
from incremental_indexing import atomic_write
|
|
29
|
+
from logger import get_logger
|
|
30
|
+
|
|
31
|
+
logger = get_logger()
|
|
32
|
+
|
|
33
|
+
# Name of the annotations file; it is joined onto config.AI_DIR when resolved.
ANNOTATIONS_FILENAME = "annotations.json"
# Written into the "version" field of the saved JSON document.
ANNOTATIONS_VERSION = 1

# Upper bound on the number of code files a coverage scan examines
# (safety on huge repos).
_SCAN_CAP = 20_000

# A token is a run of lowercase ASCII letters, digits, or underscores.
_TOKEN_RE = re.compile(r"[a-z0-9_]+")
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def _annotations_path() -> Path:
    """Return the annotations file location under ``config.AI_DIR``.

    ``config.AI_DIR`` is read on every call, so the result follows any change
    to that setting.
    """
    ai_dir = config.AI_DIR
    return ai_dir / ANNOTATIONS_FILENAME
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _rel_posix(path: str | Path) -> str:
    """Normalize any path to a project-relative posix string.

    Relative paths are returned as-is in posix form. Absolute paths are
    resolved and expressed relative to the resolved ``config.PROJECT_ROOT``.
    If that fails for any reason (for example the path lies outside the
    project root), the input is returned as a string with backslashes
    replaced by forward slashes.
    """
    candidate = Path(path)
    try:
        if not candidate.is_absolute():
            return candidate.as_posix()
        resolved = candidate.resolve()
        project_root = config.PROJECT_ROOT.resolve()
        return resolved.relative_to(project_root).as_posix()
    except Exception:
        # Best-effort fallback: keep the caller's spelling, only unify separators.
        return str(path).replace("\\", "/")
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def _tokenize(text: str) -> set[str]:
    """Return the distinct ``_TOKEN_RE`` matches in lowercased *text*.

    Matches shorter than two characters are discarded.
    """
    tokens: set[str] = set()
    for match in _TOKEN_RE.finditer(text.lower()):
        word = match.group(0)
        if len(word) >= 2:
            tokens.add(word)
    return tokens
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
@dataclass
class Annotation:
    """One file's annotation: a summary plus keywords, stamped with file mtime."""

    # Project-relative posix path; also used as the key in the store.
    path: str
    # Natural-language description of the file.
    summary: str
    # Extra search terms supplied alongside the summary.
    keywords: list[str] = field(default_factory=list)
    # mtime of the annotated file at the moment the annotation was written.
    annotated_mtime: float = 0.0
    # Timestamp string recorded when the annotation was saved.
    updated_at: str = ""

    def to_dict(self) -> dict[str, Any]:
        """Return the serialized form; ``path`` is omitted (it is the key)."""
        return dict(
            summary=self.summary,
            keywords=self.keywords,
            annotated_mtime=self.annotated_mtime,
            updated_at=self.updated_at,
        )

    def search_text(self) -> str:
        """Text indexed into BM25 for this annotation.

        Space-joined: full path, file stem, summary, then the keywords.
        """
        file_stem = Path(self.path).stem
        keyword_text = " ".join(self.keywords)
        return "{} {} {} {}".format(self.path, file_stem, self.summary, keyword_text)

    @property
    def doc_id(self) -> str:
        """Corpus document id: the path prefixed with ``annot::``."""
        return "annot::{}".format(self.path)
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
class AnnotationStore:
    """Thread-safe store over `.ai/annotations.json`.

    Annotations live in an in-memory map keyed by project-relative posix path
    and are mirrored to a JSON file through ``atomic_write``. The file's mtime
    at the last load/save is remembered so that edits made by another
    process/session are noticed by ``refresh_if_changed``. Every read and
    write of the in-memory map happens under a re-entrant lock.
    """

    def __init__(self, path: Path | None = None):
        """Open the store at *path* (default: ``_annotations_path()``) and load it."""
        self._path = path if path is not None else _annotations_path()
        self._lock = threading.RLock()
        self._files: dict[str, Annotation] = {}
        # mtime of the backing file when it was last read or written;
        # -1.0 means "no file".
        self._loaded_mtime: float = -1.0
        self._load()

    # -- persistence --------------------------------------------------------

    def _load(self) -> None:
        """Replace the in-memory map with the file's contents.

        A missing file yields an empty store. A file that cannot be read or
        parsed is logged and also yields an empty store. Entries that are not
        JSON objects are skipped individually instead of failing the load.
        """
        with self._lock:
            if not self._path.exists():
                self._files = {}
                self._loaded_mtime = -1.0
                return

            disk_mtime: float | None = None
            try:
                # Stat BEFORE reading: if the file is rewritten while we read,
                # its mtime will differ from the one recorded here and the
                # next refresh_if_changed() reloads it.
                disk_mtime = self._path.stat().st_mtime
                data = json.loads(self._path.read_text(encoding="utf-8"))
                loaded: dict[str, Annotation] = {}
                for rel, entry in (data.get("files") or {}).items():
                    if not isinstance(entry, dict):
                        logger.warning(f"Skipping malformed annotation entry: {rel}")
                        continue
                    loaded[rel] = Annotation(
                        path=rel,
                        summary=str(entry.get("summary", "")),
                        keywords=[str(k) for k in (entry.get("keywords") or [])],
                        annotated_mtime=float(entry.get("annotated_mtime", 0.0) or 0.0),
                        updated_at=str(entry.get("updated_at", "")),
                    )
                self._files = loaded
            except Exception as e:
                logger.warning(f"Could not load annotations: {e}")
                self._files = {}

            # Record the mtime even after a failed parse so a corrupt file is
            # not re-read (and re-logged) on every access until it changes.
            if disk_mtime is not None:
                self._loaded_mtime = disk_mtime

    def refresh_if_changed(self) -> None:
        """Re-read from disk when another process/session updated the file."""
        with self._lock:
            try:
                current = self._path.stat().st_mtime if self._path.exists() else -1.0
            except OSError:
                return
            if current != self._loaded_mtime:
                self._load()

    def _save(self) -> None:
        """Write the whole map to disk (entries sorted by path) and record its mtime."""
        with self._lock:
            data = {
                "version": ANNOTATIONS_VERSION,
                "files": {rel: a.to_dict() for rel, a in sorted(self._files.items())},
            }
            atomic_write(self._path, json.dumps(data, indent=2, ensure_ascii=False))
            try:
                self._loaded_mtime = self._path.stat().st_mtime
            except OSError:
                # Keep the previous mtime; the next refresh will simply reload.
                pass

    # -- API ----------------------------------------------------------------

    def set(self, path: str, summary: str, keywords: list[str] | None = None) -> Annotation:
        """Save/replace one file's annotation. Path must exist inside the project.

        Args:
            path: File to annotate; passed through ``config.validate_path``.
            summary: Summary text (surrounding whitespace is stripped).
            keywords: Optional keywords; each is stripped and blank ones dropped.

        Returns:
            The stored ``Annotation``.

        Raises:
            ValueError: If the validated path is not a regular file.
            (``config.validate_path`` may raise as well; its behavior is
            defined outside this module.)
        """
        target = config.validate_path(path)
        if not target.is_file():
            raise ValueError(f"Not a file: {path}")
        rel = _rel_posix(target)
        try:
            mtime = target.stat().st_mtime
        except OSError:
            mtime = 0.0
        ann = Annotation(
            path=rel,
            summary=summary.strip(),
            keywords=[k.strip() for k in (keywords or []) if k.strip()],
            annotated_mtime=mtime,
            updated_at=datetime.now().isoformat(timespec="seconds"),
        )
        with self._lock:
            # Merge on top of what is on disk now; otherwise this save would
            # drop annotations written by another process since our last load.
            self.refresh_if_changed()
            self._files[rel] = ann
            self._save()
        return ann

    def remove(self, path: str) -> bool:
        """Delete the annotation for *path*; return True if one existed."""
        rel = _rel_posix(path)
        with self._lock:
            # Same reasoning as in set(): never save from a stale snapshot.
            self.refresh_if_changed()
            if rel in self._files:
                del self._files[rel]
                self._save()
                return True
        return False

    def get(self, path: str) -> Annotation | None:
        """Return the annotation stored for *path*, or None."""
        rel = _rel_posix(path)
        with self._lock:
            self.refresh_if_changed()
            return self._files.get(rel)

    def all(self) -> dict[str, Annotation]:
        """Return a shallow copy of the path -> annotation map."""
        with self._lock:
            self.refresh_if_changed()
            return dict(self._files)

    def count(self) -> int:
        """Return the number of stored annotations."""
        with self._lock:
            self.refresh_if_changed()
            return len(self._files)

    def is_stale(self, ann: Annotation) -> bool:
        """True when the file changed after it was annotated (or vanished)."""
        target = config.PROJECT_ROOT / ann.path
        try:
            return target.stat().st_mtime != ann.annotated_mtime
        except OSError:
            return True

    def unannotated(self, limit: int = 20) -> tuple[list[str], list[str], int]:
        """
        Scan indexable code files and report annotation coverage.

        Walks ``config.PROJECT_ROOT``, pruning directories rejected by
        ``config.is_dir_ignored`` and considering only files whose lowercased
        suffix is in ``config.CODE_EXTENSIONS``. The walk stops once
        ``_SCAN_CAP`` code files have been examined.

        Args:
            limit: Maximum number of paths collected in each returned list.

        Returns:
            (missing, stale, total_scanned): relative posix paths of files with
            no annotation, files whose annotation is stale, and the number of
            code files scanned.
        """
        with self._lock:
            self.refresh_if_changed()
            # Snapshot so the (potentially long) walk runs without the lock.
            known = dict(self._files)

        missing: list[str] = []
        stale: list[str] = []
        scanned = 0

        for dirpath, dirnames, filenames in os.walk(config.PROJECT_ROOT):
            # In-place assignment prunes the directories os.walk descends into.
            dirnames[:] = [d for d in dirnames if not config.is_dir_ignored(d)]
            for fname in filenames:
                fp = Path(dirpath) / fname
                # Annotations target source code, not configs/docs
                if fp.suffix.lower() not in config.CODE_EXTENSIONS:
                    continue
                # Check the cap before counting so the reported total equals
                # the number of files actually examined.
                if scanned >= _SCAN_CAP:
                    return missing, stale, scanned
                scanned += 1
                rel = _rel_posix(fp)
                ann = known.get(rel)
                if ann is None:
                    if len(missing) < limit:
                        missing.append(rel)
                elif self.is_stale(ann):
                    if len(stale) < limit:
                        stale.append(rel)
        return missing, stale, scanned

    # -- search integration --------------------------------------------------

    def as_documents(self) -> tuple[list[str], list[str], list[dict[str, Any]]]:
        """Annotation docs for the BM25 corpus: (ids, texts, metadatas).

        Annotations with an empty summary are left out.
        """
        ids: list[str] = []
        texts: list[str] = []
        metas: list[dict[str, Any]] = []
        with self._lock:
            self.refresh_if_changed()
            for ann in self._files.values():
                if not ann.summary:
                    continue
                ids.append(ann.doc_id)
                texts.append(ann.search_text())
                metas.append(
                    {
                        "source": ann.path,
                        "symbol_type": "annotation",
                        "symbol_name": Path(ann.path).stem,
                    }
                )
        return ids, texts, metas

    def search(self, query: str, n: int = 8) -> list[dict[str, Any]]:
        """
        Cheap keyword-overlap scoring over annotations (query-tier friendly:
        no BM25 matrix or vector store needed).

        Each query token adds 3.0 when it is a keyword token, 2.0 when it is a
        path token and 1.0 when it is a summary token; a coverage bonus of
        ``2.0 * matched / len(query_tokens)`` is added on top. Annotations
        matching no token are dropped. Reported scores are rescaled into
        (0.5, 1.0] relative to the best hit.

        Args:
            query: Free-text query.
            n: Maximum number of results; values <= 0 yield no results.

        Returns:
            Result dicts with ``source``, ``score``, ``tier``, ``snippet``
            (first 300 characters of the summary) and ``extra``.
        """
        if n <= 0:
            return []
        q_tokens = _tokenize(query)
        if not q_tokens:
            return []

        scored: list[tuple[float, Annotation]] = []
        with self._lock:
            self.refresh_if_changed()
            for ann in self._files.values():
                path_tokens = _tokenize(ann.path)
                kw_tokens = _tokenize(" ".join(ann.keywords))
                sum_tokens = _tokenize(ann.summary)
                score = 0.0
                matched = 0
                for tok in q_tokens:
                    in_kw = tok in kw_tokens
                    in_path = tok in path_tokens
                    in_sum = tok in sum_tokens
                    if in_kw:
                        score += 3.0
                    if in_path:
                        score += 2.0
                    if in_sum:
                        score += 1.0
                    if in_kw or in_path or in_sum:
                        matched += 1
                if matched:
                    score += 2.0 * (matched / len(q_tokens))
                    scored.append((score, ann))

        # list.sort is stable, so equal scores keep the store's insertion order.
        scored.sort(key=lambda x: x[0], reverse=True)
        top = scored[:n]
        if not top:
            return []
        # Every retained score is > 0 (at least one token matched).
        max_score = top[0][0]
        return [
            {
                "source": ann.path,
                "score": round(0.5 + 0.5 * (s / max_score), 4),
                "tier": "L0_annot",
                "snippet": ann.summary[:300],
                "extra": {"keywords": ann.keywords, "updated_at": ann.updated_at},
            }
            for s, ann in top
        ]
|
|
299
|
+
|
|
300
|
+
|
|
301
|
+
# Process-wide cached store plus the annotations path it was built for.
# get_store() rebuilds it whenever _annotations_path() yields a different path.
_store: AnnotationStore | None = None
_store_path: Path | None = None
_store_lock = threading.Lock()


def get_store() -> AnnotationStore:
    """Returns the AnnotationStore for the current project root.

    The cached instance is reused while ``_annotations_path()`` keeps returning
    the same path; otherwise a new store is created and cached in its place.
    """
    global _store, _store_path
    with _store_lock:
        wanted = _annotations_path()
        needs_rebuild = _store is None or _store_path != wanted
        if needs_rebuild:
            _store, _store_path = AnnotationStore(wanted), wanted
        return _store
|
ast_splitter.py
ADDED
|
@@ -0,0 +1,382 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import threading
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
|
8
|
+
|
|
9
|
+
from config import CHUNK_OVERLAP, CHUNK_SIZE
|
|
10
|
+
from logger import get_logger
|
|
11
|
+
|
|
12
|
+
logger = get_logger()
|
|
13
|
+
|
|
14
|
+
# File suffix (lowercase, with dot) -> language key used by the tables below.
LANGUAGE_MAP: dict[str, str] = {
    ".py": "python",
    ".js": "javascript",
    ".jsx": "javascript",
    ".ts": "typescript",
    ".tsx": "tsx",
    ".java": "java",
    ".go": "go",
    ".rs": "rust",
    ".rb": "ruby",
}

# Node types that become their own chunk(s) when they appear directly under
# the root of a parsed file.
TOP_LEVEL_NODES: dict[str, list[str]] = {
    "python": [
        "function_definition",
        "class_definition",
        "decorated_definition",
    ],
    "javascript": [
        "function_declaration",
        "class_declaration",
        "export_statement",
        "lexical_declaration",
    ],
    "typescript": [
        "function_declaration",
        "class_declaration",
        "export_statement",
        "interface_declaration",
        "type_alias_declaration",
    ],
    "tsx": [
        "function_declaration",
        "class_declaration",
        "export_statement",
        "interface_declaration",
    ],
    "java": [
        "class_declaration",
        "interface_declaration",
        "enum_declaration",
    ],
    "go": [
        "function_declaration",
        "method_declaration",
        "type_declaration",
    ],
    "rust": [
        "function_item",
        "impl_item",
        "struct_item",
        "enum_item",
        "trait_item",
    ],
    "ruby": [
        "method",
        "class",
        "module",
    ],
}

# Node types treated as methods when found directly inside a class body.
METHOD_NODES: dict[str, list[str]] = {
    "python": ["function_definition"],
    "javascript": ["method_definition", "function_declaration", "arrow_function"],
    "typescript": ["method_definition", "method_signature", "function_declaration"],
    "tsx": ["method_definition", "method_signature", "function_declaration"],
    "java": ["method_declaration", "constructor_declaration"],
    "go": ["method_declaration", "function_declaration"],
    "rust": ["function_item"],
    "ruby": ["method"],
}

# Per-language name of the field that carries a definition's identifier.
NAME_FIELD: dict[str, str] = {
    language: "name"
    for language in (
        "python",
        "javascript",
        "typescript",
        "tsx",
        "java",
        "go",
        "rust",
        "ruby",
    )
}

# Parser cache keyed by language; a None value records a failed load.
_parsers: dict[str, Any] = {}

# tree-sitter Parser objects are NOT thread-safe: concurrent parse() calls on
# a shared parser can crash the interpreter. Parser creation and parsing are
# therefore serialized through this lock (parsing is cheap next to embedding).
_parse_lock = threading.RLock()
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def _get_parser(language: str) -> Any | None:
    """Return the parser for *language* (or None), holding ``_parse_lock``.

    The lookup/creation itself is delegated to ``_get_parser_locked``.
    """
    with _parse_lock:
        parser = _get_parser_locked(language)
    return parser
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def _get_parser_locked(language: str) -> Any | None:
    """Return the cached parser for *language*, creating it on first use.

    The grammar package for the language is imported lazily. Any failure while
    importing or constructing the parser is logged and cached as None, so the
    load is not retried. A language with no branch below returns None without
    being cached.
    """
    try:
        return _parsers[language]
    except KeyError:
        pass

    try:
        from tree_sitter import Language, Parser

        # Each branch only selects the grammar factory; the parser itself is
        # built once, after the chain.
        if language == "python":
            import tree_sitter_python as grammar

            load_grammar = grammar.language
        elif language == "javascript":
            import tree_sitter_javascript as grammar

            load_grammar = grammar.language
        elif language == "typescript":
            import tree_sitter_typescript as grammar

            # This package exposes one factory per dialect.
            load_grammar = grammar.language_typescript
        elif language == "tsx":
            import tree_sitter_typescript as grammar

            load_grammar = grammar.language_tsx
        elif language == "java":
            import tree_sitter_java as grammar

            load_grammar = grammar.language
        elif language == "go":
            import tree_sitter_go as grammar

            load_grammar = grammar.language
        elif language == "rust":
            import tree_sitter_rust as grammar

            load_grammar = grammar.language
        elif language == "ruby":
            import tree_sitter_ruby as grammar

            load_grammar = grammar.language
        else:
            return None

        built = Parser(Language(load_grammar()))
        _parsers[language] = built
        return built

    except Exception as e:
        logger.warning(f"Could not load tree-sitter parser for {language}: {e}")
        _parsers[language] = None
        return None
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def _get_node_name(node: Any, source: bytes) -> str:
    """Return a display name for a syntax-tree *node*.

    Resolution order:
      1. The node's ``name`` field, when the node supports field lookup. This
         covers names that are not typed ``identifier`` (for example type or
         field identifiers).
      2. The first direct child whose type is ``identifier`` or ``name``.
      3. The first line of the node's source text, stripped and cut to 60
         characters (taken from at most 80 bytes).
      4. ``"unknown"`` when the node starts at or beyond the end of *source*.

    Args:
        node: Object exposing ``children`` and ``start_byte``; children expose
            ``type`` and ``text`` (bytes or None).
        source: The encoded source the node was parsed from.
    """
    # Preferred: ask the grammar for the field that holds the identifier.
    field_lookup = getattr(node, "child_by_field_name", None)
    if callable(field_lookup):
        named = field_lookup("name")
        if named is not None and named.text:
            return named.text.decode("utf-8", errors="replace")

    for child in node.children:
        # Skip children without text instead of failing on None.decode().
        if child.type in ("identifier", "name") and child.text is not None:
            return child.text.decode("utf-8", errors="replace")

    if node.start_byte < len(source):
        # A byte slice may cut a multi-byte character; errors="replace"
        # keeps the decode from raising in that case.
        head = source[node.start_byte : node.start_byte + 80].decode(
            "utf-8", errors="replace"
        )
        return head.split("\n")[0].strip()[:60]
    return "unknown"
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def _extract_class_chunks(
    class_node: Any,
    source: bytes,
    language: str,
    file_path: Path,
    text_splitter: RecursiveCharacterTextSplitter,
) -> list[dict[str, Any]]:
    """Split a class node into one chunk group per method.

    Each method found directly in the class body is emitted as a ``"method"``
    chunk whose text is prefixed with ``# Class: <name>``. A method wrapped in
    a ``decorated_definition`` is emitted too, with its decorators included in
    the text and line range. When the class has no recognizable body, or the
    body contains no methods, the whole class text is emitted as a single
    ``"class"`` chunk group instead.

    Args:
        class_node: The class definition/declaration node.
        source: Encoded file content the tree was parsed from.
        language: Language key used to look up ``METHOD_NODES``.
        file_path: Path recorded (as a string) in each chunk's metadata.
        text_splitter: Splitter applied to texts longer than the chunk size.

    Returns:
        Chunk dicts as produced by ``_make_text_chunks``.
    """
    chunks: list[dict[str, Any]] = []
    class_name = _get_node_name(class_node, source)
    method_types = METHOD_NODES.get(language, [])
    source_path = str(file_path)

    body_node = next(
        (
            child
            for child in class_node.children
            if child.type in ("block", "class_body", "declaration_list")
        ),
        None,
    )

    has_methods = False
    for member in body_node.children if body_node is not None else ():
        definition = member
        if member.type == "decorated_definition":
            # Decorated methods (@property, @staticmethod, ...) are wrapped;
            # without unwrapping they would be dropped from the index, since
            # the caller marks the entire class range as covered.
            definition = next(
                (inner for inner in member.children if inner.type in method_types),
                None,
            )
            if definition is None:
                continue
        elif member.type not in method_types:
            continue

        has_methods = True
        # Name comes from the definition itself; text and line range come from
        # the outer member so decorators stay attached to the method.
        method_name = _get_node_name(definition, source)
        method_text = member.text.decode("utf-8", errors="replace")
        chunks.extend(
            _make_text_chunks(
                f"# Class: {class_name}\n" + method_text,
                source_path,
                "method",
                method_name,
                class_name,
                member.start_point[0] + 1,
                member.end_point[0] + 1,
                text_splitter,
            )
        )

    if not has_methods:
        # No body or no methods: index the class as one unit.
        chunks.extend(
            _make_text_chunks(
                class_node.text.decode("utf-8", errors="replace"),
                source_path,
                "class",
                class_name,
                None,
                class_node.start_point[0] + 1,
                class_node.end_point[0] + 1,
                text_splitter,
            )
        )

    return chunks
|
|
216
|
+
|
|
217
|
+
|
|
218
|
+
def _make_text_chunks(
    text: str,
    source_path: str,
    symbol_type: str,
    symbol_name: str,
    class_name: str | None,
    line_start: int,
    line_end: int,
    text_splitter: RecursiveCharacterTextSplitter,
) -> list[dict[str, Any]]:
    """Wrap *text* into chunk dicts, splitting it when it exceeds ``CHUNK_SIZE``.

    Text of at most ``CHUNK_SIZE`` characters becomes a single chunk; longer
    text is passed through ``text_splitter.split_text``. Every chunk carries
    the same symbol metadata and line range and differs only in its text and
    ``chunk_index``. A ``class_name`` of None is stored as an empty string.
    """
    if len(text) <= CHUNK_SIZE:
        pieces = [text]
    else:
        pieces = text_splitter.split_text(text)

    owner = class_name or ""
    return [
        {
            "text": piece,
            "metadata": {
                "source": source_path,
                "symbol_type": symbol_type,
                "symbol_name": symbol_name,
                "class_name": owner,
                "line_start": line_start,
                "line_end": line_end,
                "chunk_index": index,
            },
        }
        for index, piece in enumerate(pieces)
    ]
|
|
262
|
+
|
|
263
|
+
|
|
264
|
+
class ASTSplitter:
    """Split source files into chunks along syntax-tree boundaries.

    Files whose suffix maps to a language with an available parser are split
    per top-level definition; everything else, and any file whose AST split
    fails, goes through the plain character splitter.
    """

    def __init__(self) -> None:
        """Create the fallback character splitter from the configured sizes."""
        self._text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=CHUNK_SIZE,
            chunk_overlap=CHUNK_OVERLAP,
        )

    def split(self, content: str, file_path: Path) -> list[dict[str, Any]]:
        """Return chunk dicts for *content*, preferring the AST-based split."""
        language = LANGUAGE_MAP.get(file_path.suffix.lower())
        parser = _get_parser(language) if language else None
        if parser:
            try:
                return self._split_by_ast(content, language, parser, file_path)
            except Exception as e:
                logger.warning(f"AST split failed for {file_path}, falling back: {e}")
        return self._split_by_text(content, file_path)

    @staticmethod
    def _unwrap(node: Any, top_types: list[str]) -> Any:
        """Return the definition wrapped by a decorator/export node, else *node*."""
        if node.type == "decorated_definition":
            wanted = ("function_definition", "class_definition")
        elif node.type == "export_statement":
            wanted = top_types
        else:
            return node
        for child in node.children:
            if child.type in wanted:
                return child
        return node

    def _split_by_ast(
        self, content: str, language: str, parser: Any, file_path: Path
    ) -> list[dict[str, Any]]:
        """Chunk *content* per top-level definition plus one module-level rest.

        Classes are delegated to ``_extract_class_chunks``; every other
        top-level definition becomes a ``"function"`` chunk group. Root
        children not covered by any of those are joined into a ``"module"``
        chunk group. If no definition is found at all, the text splitter is
        used instead.
        """
        source = content.encode("utf-8")
        # Parsing goes through the shared lock; see the note on _parse_lock.
        with _parse_lock:
            tree = parser.parse(source)
        root = tree.root_node

        top_types = TOP_LEVEL_NODES.get(language, [])
        chunks: list[dict[str, Any]] = []
        covered_ranges: list[tuple[int, int]] = []

        for node in root.children:
            target = self._unwrap(node, top_types)
            if not (target.type in top_types or node.type in top_types):
                continue

            # Text comes from the outer node so decorators / `export` are kept.
            node_text = node.text.decode("utf-8", errors="replace") if node.text else ""
            if not node_text.strip():
                continue

            covered_ranges.append((node.start_byte, node.end_byte))

            if target.type in ("class_definition", "class_declaration"):
                produced = _extract_class_chunks(
                    target, source, language, file_path, self._text_splitter
                )
            else:
                symbol_name = _get_node_name(target, source)
                produced = _make_text_chunks(
                    node_text,
                    str(file_path),
                    "function",
                    symbol_name,
                    None,
                    node.start_point[0] + 1,
                    node.end_point[0] + 1,
                    self._text_splitter,
                )
            chunks.extend(produced)

        if not chunks:
            return self._split_by_text(content, file_path)

        # Collect whatever sits at module level outside the covered ranges.
        module_lines = []
        for node in root.children:
            if any(start <= node.start_byte < end for start, end in covered_ranges):
                continue
            if not node.text:
                continue
            stripped = node.text.decode("utf-8", errors="replace").strip()
            if stripped:
                module_lines.append(stripped)

        if module_lines:
            chunks.extend(
                _make_text_chunks(
                    "\n".join(module_lines),
                    str(file_path),
                    "module",
                    "module_level",
                    None,
                    1,
                    root.end_point[0] + 1,
                    self._text_splitter,
                )
            )

        return chunks

    def _split_by_text(self, content: str, file_path: Path) -> list[dict[str, Any]]:
        """Chunk *content* with the character splitter; no symbol info is set."""
        source_path = str(file_path)
        result: list[dict[str, Any]] = []
        for index, piece in enumerate(self._text_splitter.split_text(content)):
            result.append(
                {
                    "text": piece,
                    "metadata": {
                        "source": source_path,
                        "symbol_type": "text",
                        "symbol_name": "",
                        "class_name": "",
                        "line_start": 0,
                        "line_end": 0,
                        "chunk_index": index,
                    },
                }
            )
        return result
|