projectmind-mcp 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
annotations.py ADDED
@@ -0,0 +1,315 @@
1
+ """
2
+ File annotations — AI-authored summaries that make keyword search semantic.
3
+
4
+ The design inverts the usual RAG architecture: instead of a small embedding
5
+ model guessing what code means, the *client* LLM (which actually understands
6
+ the code) writes a short natural-language summary + keywords per file. Those
7
+ annotations are indexed into the BM25 corpus and served as a cheap query tier,
8
+ so "where is authentication handled?" matches the summary text with plain
9
+ keyword search — no vector database required.
10
+
11
+ Storage: `.ai/annotations.json` (atomic writes, UTF-8, project-relative posix
12
+ keys). Each entry remembers the file's mtime at annotation time so stale
13
+ annotations (file changed since) can be surfaced for re-annotation.
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ import json
19
+ import os
20
+ import re
21
+ import threading
22
+ from dataclasses import dataclass, field
23
+ from datetime import datetime
24
+ from pathlib import Path
25
+ from typing import Any
26
+
27
+ import config
28
+ from incremental_indexing import atomic_write
29
+ from logger import get_logger
30
+
31
+ logger = get_logger()
32
+
33
# Schema version written into the JSON payload, and the file name used for
# the annotations store inside the project's AI directory.
ANNOTATIONS_VERSION = 1
ANNOTATIONS_FILENAME = "annotations.json"

# Upper bound on code files visited by one coverage scan (safety on huge repos).
_SCAN_CAP = 20_000

# A token is a run of lowercase ASCII letters, digits and underscores.
_TOKEN_RE = re.compile(r"[a-z0-9_]+")
40
+
41
+
42
def _annotations_path() -> Path:
    """Return the annotations file location: ``config.AI_DIR / ANNOTATIONS_FILENAME``."""
    ai_dir = config.AI_DIR
    return ai_dir / ANNOTATIONS_FILENAME
44
+
45
+
46
def _rel_posix(path: str | Path) -> str:
    """Turn *path* into a forward-slash string relative to the project root.

    Relative inputs are returned as given (in posix form). Absolute inputs are
    resolved and expressed relative to the resolved ``config.PROJECT_ROOT``.
    If anything in that process raises (for example the path lies outside the
    root), the raw input is returned with backslashes replaced by slashes.
    """
    candidate = Path(path)
    try:
        if not candidate.is_absolute():
            return candidate.as_posix()
        resolved = candidate.resolve()
        project_root = config.PROJECT_ROOT.resolve()
        return resolved.relative_to(project_root).as_posix()
    except Exception:
        return str(path).replace("\\", "/")
55
+
56
+
57
def _tokenize(text: str) -> set[str]:
    """Return the distinct lowercase tokens of *text* that are at least two characters long."""
    words = _TOKEN_RE.findall(text.lower())
    return set(filter(lambda word: len(word) >= 2, words))
59
+
60
+
61
@dataclass
class Annotation:
    """One file's annotation: a summary plus keywords, keyed by its path."""

    path: str  # project-relative posix
    summary: str
    keywords: list[str] = field(default_factory=list)
    annotated_mtime: float = 0.0  # file mtime recorded when the annotation was written
    updated_at: str = ""

    def to_dict(self) -> dict[str, Any]:
        """Return the JSON-serialisable payload (the path is the key, so it is omitted)."""
        return dict(
            summary=self.summary,
            keywords=self.keywords,
            annotated_mtime=self.annotated_mtime,
            updated_at=self.updated_at,
        )

    def search_text(self) -> str:
        """Text indexed into BM25 for this annotation."""
        parts = [
            self.path,
            Path(self.path).stem,
            self.summary,
            " ".join(self.keywords),
        ]
        return " ".join(parts)

    @property
    def doc_id(self) -> str:
        """Corpus document id: the path prefixed with ``annot::``."""
        return "annot::" + self.path
85
+
86
+
87
class AnnotationStore:
    """Thread-safe store over `.ai/annotations.json`.

    The in-memory map is keyed by project-relative posix path. Reads re-load
    the file when its mtime differs from the one last seen, so updates written
    by another process or session become visible. Writes also re-load first,
    so they do not overwrite entries another writer added in the meantime.
    """

    def __init__(self, path: Path | None = None):
        """Open the store at *path* (default: the project's annotations file) and load it."""
        self._path = path if path is not None else _annotations_path()
        self._lock = threading.RLock()
        self._files: dict[str, Annotation] = {}
        self._loaded_mtime: float = -1.0
        self._load()

    # -- persistence --------------------------------------------------------

    def _load(self) -> None:
        """Replace the in-memory map with the file's contents.

        A missing file yields an empty map. A file that cannot be read or
        parsed also yields an empty map (with a warning); its mtime is still
        recorded so the broken file is not re-parsed on every read, only when
        it changes again. Entries that are not JSON objects are skipped
        individually instead of discarding the whole file.
        """
        with self._lock:
            if not self._path.exists():
                self._files = {}
                self._loaded_mtime = -1.0
                return
            mtime = -1.0
            try:
                # Stat BEFORE reading: if the file is rewritten in between, the
                # recorded mtime is older than the file and the next refresh
                # picks the new content up.
                mtime = self._path.stat().st_mtime
                data = json.loads(self._path.read_text(encoding="utf-8"))
                files: dict[str, Annotation] = {}
                for rel, entry in (data.get("files") or {}).items():
                    if not isinstance(entry, dict):
                        logger.warning(f"Skipping malformed annotation entry: {rel}")
                        continue
                    files[rel] = Annotation(
                        path=rel,
                        summary=str(entry.get("summary", "")),
                        # `or []` tolerates an explicit null in the JSON
                        keywords=[str(k) for k in entry.get("keywords") or []],
                        annotated_mtime=float(entry.get("annotated_mtime", 0.0) or 0.0),
                        updated_at=str(entry.get("updated_at", "")),
                    )
                self._files = files
            except Exception as e:
                logger.warning(f"Could not load annotations: {e}")
                self._files = {}
            self._loaded_mtime = mtime

    def refresh_if_changed(self) -> None:
        """Re-read from disk when another process/session updated the file."""
        with self._lock:
            try:
                current = self._path.stat().st_mtime if self._path.exists() else -1.0
            except OSError:
                return
            if current != self._loaded_mtime:
                self._load()

    def _save(self) -> None:
        """Write the whole map to disk via ``atomic_write`` and remember the new mtime."""
        with self._lock:
            data = {
                "version": ANNOTATIONS_VERSION,
                "files": {rel: a.to_dict() for rel, a in sorted(self._files.items())},
            }
            atomic_write(self._path, json.dumps(data, indent=2, ensure_ascii=False))
            try:
                self._loaded_mtime = self._path.stat().st_mtime
            except OSError:
                pass

    # -- API ----------------------------------------------------------------

    def set(self, path: str, summary: str, keywords: list[str] | None = None) -> Annotation:
        """Save/replace one file's annotation.

        Args:
            path: File to annotate; passed through ``config.validate_path``
                (presumably rejects paths outside the project — confirm there).
            summary: Natural-language summary; surrounding whitespace is stripped.
            keywords: Optional keywords; each is stripped and blanks are dropped.

        Returns:
            The stored ``Annotation``.

        Raises:
            ValueError: If the validated path is not an existing regular file.
        """
        target = config.validate_path(path)
        if not target.is_file():
            raise ValueError(f"Not a file: {path}")
        rel = _rel_posix(target)
        try:
            mtime = target.stat().st_mtime
        except OSError:
            mtime = 0.0
        ann = Annotation(
            path=rel,
            summary=summary.strip(),
            keywords=[k.strip() for k in (keywords or []) if k.strip()],
            annotated_mtime=mtime,
            updated_at=datetime.now().isoformat(timespec="seconds"),
        )
        with self._lock:
            # Pick up entries written by other processes before saving, so the
            # full-file rewrite below does not erase them.
            self.refresh_if_changed()
            self._files[rel] = ann
            self._save()
        return ann

    def remove(self, path: str) -> bool:
        """Delete the annotation for *path*; return True if one existed."""
        rel = _rel_posix(path)
        with self._lock:
            # Same reasoning as in set(): merge external changes first.
            self.refresh_if_changed()
            if rel in self._files:
                del self._files[rel]
                self._save()
                return True
        return False

    def get(self, path: str) -> Annotation | None:
        """Return the annotation stored for *path*, or None."""
        rel = _rel_posix(path)
        with self._lock:
            self.refresh_if_changed()
            return self._files.get(rel)

    def all(self) -> dict[str, Annotation]:
        """Return a shallow copy of the path -> annotation map."""
        with self._lock:
            self.refresh_if_changed()
            return dict(self._files)

    def count(self) -> int:
        """Return the number of stored annotations."""
        with self._lock:
            self.refresh_if_changed()
            return len(self._files)

    def is_stale(self, ann: Annotation) -> bool:
        """True when the file changed after it was annotated (or vanished)."""
        target = config.PROJECT_ROOT / ann.path
        try:
            return target.stat().st_mtime != ann.annotated_mtime
        except OSError:
            return True

    def unannotated(self, limit: int = 20) -> tuple[list[str], list[str], int]:
        """
        Scan indexable code files and report annotation coverage.

        Walks ``config.PROJECT_ROOT``, pruning directories for which
        ``config.is_dir_ignored`` is true, and considers only files whose
        lowercase suffix is in ``config.CODE_EXTENSIONS``. The walk stops once
        more than ``_SCAN_CAP`` code files have been counted.

        Args:
            limit: Maximum number of paths collected in each returned list.

        Returns:
            (missing, stale, total_scanned): relative posix paths of files with
            no annotation, files whose annotation is stale, and the number of
            code files counted.
        """
        with self._lock:
            self.refresh_if_changed()
            # Snapshot so the (potentially long) walk runs without the lock.
            known = dict(self._files)

        missing: list[str] = []
        stale: list[str] = []
        scanned = 0
        root = config.PROJECT_ROOT

        for dirpath, dirnames, filenames in os.walk(root):
            # In-place slice assignment prunes the walk itself.
            dirnames[:] = [d for d in dirnames if not config.is_dir_ignored(d)]
            for fname in filenames:
                fp = Path(dirpath) / fname
                suffix = fp.suffix.lower()
                # Annotations target source code, not configs/docs
                if suffix not in config.CODE_EXTENSIONS:
                    continue
                scanned += 1
                if scanned > _SCAN_CAP:
                    return missing, stale, scanned
                rel = _rel_posix(fp)
                ann = known.get(rel)
                if ann is None:
                    if len(missing) < limit:
                        missing.append(rel)
                elif self.is_stale(ann):
                    if len(stale) < limit:
                        stale.append(rel)
        return missing, stale, scanned

    # -- search integration --------------------------------------------------

    def as_documents(self) -> tuple[list[str], list[str], list[dict[str, Any]]]:
        """Annotation docs for the BM25 corpus: (ids, texts, metadatas).

        Annotations with an empty summary are left out.
        """
        ids: list[str] = []
        texts: list[str] = []
        metas: list[dict[str, Any]] = []
        with self._lock:
            self.refresh_if_changed()
            for ann in self._files.values():
                if not ann.summary:
                    continue
                ids.append(ann.doc_id)
                texts.append(ann.search_text())
                metas.append(
                    {
                        "source": ann.path,
                        "symbol_type": "annotation",
                        "symbol_name": Path(ann.path).stem,
                    }
                )
        return ids, texts, metas

    def search(self, query: str, n: int = 8) -> list[dict[str, Any]]:
        """
        Cheap keyword-overlap scoring over annotations (query-tier friendly:
        no BM25 matrix or vector store needed).

        Each query token adds 3 for a keyword hit, 2 for a path hit and 1 for a
        summary hit; the fraction of query tokens matched anywhere adds up to 2
        more. Reported scores are rescaled into (0.5, 1.0] relative to the best
        hit.

        Args:
            query: Free-text query.
            n: Maximum number of results; values <= 0 yield an empty list.

        Returns:
            Result dicts with ``source``, ``score``, ``tier``, ``snippet`` and
            ``extra`` keys, best match first.
        """
        if n <= 0:
            # A negative n would otherwise slice from the wrong end.
            return []
        q_tokens = _tokenize(query)
        if not q_tokens:
            return []
        scored: list[tuple[float, Annotation]] = []
        with self._lock:
            self.refresh_if_changed()
            for ann in self._files.values():
                path_tokens = _tokenize(ann.path)
                kw_tokens = _tokenize(" ".join(ann.keywords))
                sum_tokens = _tokenize(ann.summary)
                score = 0.0
                for tok in q_tokens:
                    if tok in kw_tokens:
                        score += 3.0
                    if tok in path_tokens:
                        score += 2.0
                    if tok in sum_tokens:
                        score += 1.0
                matched = sum(
                    1 for t in q_tokens if t in kw_tokens or t in path_tokens or t in sum_tokens
                )
                score += 2.0 * (matched / len(q_tokens))
                if matched:
                    scored.append((score, ann))
        scored.sort(key=lambda x: x[0], reverse=True)
        top = scored[:n]
        max_score = top[0][0] if top else 1.0
        return [
            {
                "source": ann.path,
                "score": round(0.5 + 0.5 * (s / max_score), 4) if max_score else 0.5,
                "tier": "L0_annot",
                "snippet": ann.summary[:300],
                "extra": {"keywords": ann.keywords, "updated_at": ann.updated_at},
            }
            for s, ann in top
        ]
299
+
300
+
301
# Process-wide cached store. `get_store` rebuilds it whenever the annotations
# path no longer equals `_store_path`; `_store_lock` guards both variables.
_store_lock = threading.Lock()
_store_path: Path | None = None
_store: AnnotationStore | None = None
305
+
306
+
307
def get_store() -> AnnotationStore:
    """Return the cached AnnotationStore, rebuilding it if the annotations path changed."""
    global _store, _store_path
    with _store_lock:
        wanted = _annotations_path()
        cached = _store
        if cached is not None and _store_path == wanted:
            return cached
        # First use, or the annotations path moved: build a fresh store.
        _store = AnnotationStore(wanted)
        _store_path = wanted
        return _store
ast_splitter.py ADDED
@@ -0,0 +1,382 @@
1
+ from __future__ import annotations
2
+
3
+ import threading
4
+ from pathlib import Path
5
+ from typing import Any
6
+
7
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
8
+
9
+ from config import CHUNK_OVERLAP, CHUNK_SIZE
10
+ from logger import get_logger
11
+
12
+ logger = get_logger()
13
+
14
# File suffix -> language key used by the tables below.
LANGUAGE_MAP: dict[str, str] = dict(
    [
        (".py", "python"),
        (".js", "javascript"),
        (".jsx", "javascript"),
        (".ts", "typescript"),
        (".tsx", "tsx"),
        (".java", "java"),
        (".go", "go"),
        (".rs", "rust"),
        (".rb", "ruby"),
    ]
)

# Node types that become their own chunk when found directly under the root.
TOP_LEVEL_NODES: dict[str, list[str]] = {
    "python": ["function_definition", "class_definition", "decorated_definition"],
    "javascript": [
        "function_declaration",
        "class_declaration",
        "export_statement",
        "lexical_declaration",
    ],
    "typescript": [
        "function_declaration",
        "class_declaration",
        "export_statement",
        "interface_declaration",
        "type_alias_declaration",
    ],
    "tsx": [
        "function_declaration",
        "class_declaration",
        "export_statement",
        "interface_declaration",
    ],
    "java": ["class_declaration", "interface_declaration", "enum_declaration"],
    "go": ["function_declaration", "method_declaration", "type_declaration"],
    "rust": ["function_item", "impl_item", "struct_item", "enum_item", "trait_item"],
    "ruby": ["method", "class", "module"],
}

# Node types treated as methods when found inside a class body.
METHOD_NODES: dict[str, list[str]] = {
    "python": ["function_definition"],
    "javascript": ["method_definition", "function_declaration", "arrow_function"],
    "typescript": ["method_definition", "method_signature", "function_declaration"],
    "tsx": ["method_definition", "method_signature", "function_declaration"],
    "java": ["method_declaration", "constructor_declaration"],
    "go": ["method_declaration", "function_declaration"],
    "rust": ["function_item"],
    "ruby": ["method"],
}

# Per-language field name for a definition's name; every entry is "name".
NAME_FIELD: dict[str, str] = dict.fromkeys(
    ("python", "javascript", "typescript", "tsx", "java", "go", "rust", "ruby"),
    "name",
)
74
+
75
# tree-sitter Parser objects are NOT thread-safe: concurrent parse() calls on
# a shared parser can crash the interpreter. Parser creation and parsing are
# therefore serialized through this re-entrant lock (parsing is cheap next to
# embedding).
_parse_lock = threading.RLock()

# Language key -> parser instance, or None when loading that grammar failed.
_parsers: dict[str, Any] = dict()
81
+
82
+
83
def _get_parser(language: str) -> Any | None:
    """Return the parser for *language* (or None), holding the parse lock during lookup/creation."""
    _parse_lock.acquire()
    try:
        return _get_parser_locked(language)
    finally:
        _parse_lock.release()
86
+
87
+
88
def _get_parser_locked(language: str) -> Any | None:
    """Look up or build the tree-sitter parser for *language*.

    Results are cached in ``_parsers``. An unrecognized language key returns
    None without caching. Any exception while importing or constructing is
    logged as a warning and caches None for that language, so the failing
    import is not retried.
    """
    try:
        return _parsers[language]
    except KeyError:
        pass

    try:
        from tree_sitter import Language, Parser

        # Most grammar packages expose `language()`; the TypeScript package
        # ships two grammars behind differently named factories.
        factory_name = "language"
        if language == "python":
            import tree_sitter_python as grammar
        elif language == "javascript":
            import tree_sitter_javascript as grammar
        elif language == "typescript":
            import tree_sitter_typescript as grammar

            factory_name = "language_typescript"
        elif language == "tsx":
            import tree_sitter_typescript as grammar

            factory_name = "language_tsx"
        elif language == "java":
            import tree_sitter_java as grammar
        elif language == "go":
            import tree_sitter_go as grammar
        elif language == "rust":
            import tree_sitter_rust as grammar
        elif language == "ruby":
            import tree_sitter_ruby as grammar
        else:
            return None

        make_language = getattr(grammar, factory_name)
        built = Parser(Language(make_language()))
        _parsers[language] = built
        return built

    except Exception as e:
        logger.warning(f"Could not load tree-sitter parser for {language}: {e}")
        _parsers[language] = None
        return None
133
+
134
+
135
def _get_node_name(node: Any, source: bytes) -> str:
    """Return a display name for a definition node.

    Lookup order:
      1. The child stored under the node's ``name`` field, when the node
         offers ``child_by_field_name``. This finds names whose node type is
         not ``identifier`` (e.g. type or property identifiers).
      2. The first direct child whose type is ``identifier`` or ``name``.
      3. The first line of the node's source text, trimmed to 60 characters.
      4. ``"unknown"`` when the node starts beyond the end of *source*.

    Args:
        node: A tree-sitter-like node (``children``, ``start_byte``, and
            optionally ``child_by_field_name``).
        source: The encoded source the node was parsed from.
    """
    lookup = getattr(node, "child_by_field_name", None)
    if callable(lookup):
        named = lookup("name")
        if named is not None and named.text:
            return named.text.decode("utf-8", errors="replace")

    for child in node.children:
        if child.type in ("identifier", "name") and child.text:
            return child.text.decode("utf-8", errors="replace")

    start = node.start_byte
    if start < len(source):
        # Only a short window is decoded; a name never needs more than a line.
        head = source[start : start + 80].decode("utf-8", errors="replace")
        return head.split("\n")[0].strip()[:60]
    return "unknown"
145
+
146
+
147
def _extract_class_chunks(
    class_node: Any,
    source: bytes,
    language: str,
    file_path: Path,
    text_splitter: RecursiveCharacterTextSplitter,
) -> list[dict[str, Any]]:
    """Split a class node into one chunk group per method.

    Each method's text is prefixed with a ``# Class: <name>`` line so the chunk
    carries its context. Members wrapped in a ``decorated_definition`` are
    unwrapped to find the method inside; the emitted text and line range are
    those of the wrapper, so decorators stay with their method. When the class
    has no recognizable body or no methods, the whole class is emitted as a
    single ``class`` chunk group.

    Args:
        class_node: The class definition/declaration node.
        source: Encoded file contents the node was parsed from.
        language: Language key used to look up ``METHOD_NODES``.
        file_path: Path recorded as each chunk's ``source``.
        text_splitter: Splitter used for text longer than the chunk size.

    Returns:
        Chunk dicts as produced by ``_make_text_chunks``.
    """
    class_name = _get_node_name(class_node, source)
    method_types = METHOD_NODES.get(language, [])
    class_text = class_node.text.decode("utf-8", errors="replace")

    def whole_class() -> list[dict[str, Any]]:
        # Fallback: the entire class as one "class" symbol.
        return _make_text_chunks(
            class_text,
            str(file_path),
            "class",
            class_name,
            None,
            class_node.start_point[0] + 1,
            class_node.end_point[0] + 1,
            text_splitter,
        )

    body_node = next(
        (c for c in class_node.children if c.type in ("block", "class_body", "declaration_list")),
        None,
    )
    if body_node is None:
        return whole_class()

    chunks: list[dict[str, Any]] = []
    has_methods = False
    for member in body_node.children:
        method_node = member
        if member.type == "decorated_definition":
            # Decorated methods are wrapped; the real definition is a child
            # of the wrapper. Without this they would be silently dropped.
            method_node = next((c for c in member.children if c.type in method_types), None)
            if method_node is None:
                continue
        elif member.type not in method_types:
            continue

        has_methods = True
        method_name = _get_node_name(method_node, source)
        method_text = member.text.decode("utf-8", errors="replace")
        chunks.extend(
            _make_text_chunks(
                f"# Class: {class_name}\n" + method_text,
                str(file_path),
                "method",
                method_name,
                class_name,
                member.start_point[0] + 1,
                member.end_point[0] + 1,
                text_splitter,
            )
        )

    if not has_methods:
        chunks.extend(whole_class())

    return chunks
216
+
217
+
218
def _make_text_chunks(
    text: str,
    source_path: str,
    symbol_type: str,
    symbol_name: str,
    class_name: str | None,
    line_start: int,
    line_end: int,
    text_splitter: RecursiveCharacterTextSplitter,
) -> list[dict[str, Any]]:
    """Wrap *text* into chunk dicts, splitting it only when it exceeds ``CHUNK_SIZE``.

    Text of at most ``CHUNK_SIZE`` characters becomes a single chunk, kept
    verbatim. Longer text is handed to ``text_splitter.split_text``. Every
    piece carries the same symbol metadata and line range; only
    ``chunk_index`` differs.
    """
    if len(text) > CHUNK_SIZE:
        pieces = text_splitter.split_text(text)
    else:
        pieces = [text]

    owner = class_name or ""
    return [
        {
            "text": piece,
            "metadata": {
                "source": source_path,
                "symbol_type": symbol_type,
                "symbol_name": symbol_name,
                "class_name": owner,
                "line_start": line_start,
                "line_end": line_end,
                "chunk_index": index,
            },
        }
        for index, piece in enumerate(pieces)
    ]
262
+
263
+
264
class ASTSplitter:
    """Splits source files into chunks along AST boundaries, with a plain-text fallback."""

    def __init__(self) -> None:
        """Create the character splitter used for oversized symbols and for fallback splitting."""
        self._text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP
        )

    def split(self, content: str, file_path: Path) -> list[dict[str, Any]]:
        """Chunk *content*, using the AST when a parser exists for the file's suffix.

        Falls back to plain-text splitting when the suffix is unknown, no
        parser is available, or AST splitting raises.
        """
        language = LANGUAGE_MAP.get(file_path.suffix.lower())
        parser = _get_parser(language) if language else None
        if parser:
            try:
                return self._split_by_ast(content, language, parser, file_path)
            except Exception as e:
                logger.warning(f"AST split failed for {file_path}, falling back: {e}")
        return self._split_by_text(content, file_path)

    @staticmethod
    def _unwrap_definition(node: Any, top_types: list[str]) -> Any:
        """Return the definition wrapped by a decorator or export node, else *node* itself."""
        inner = node
        if node.type == "decorated_definition":
            for child in node.children:
                if child.type in ("function_definition", "class_definition"):
                    inner = child
                    break
        if node.type == "export_statement":
            for child in node.children:
                if child.type in top_types:
                    inner = child
                    break
        return inner

    def _split_by_ast(
        self, content: str, language: str, parser: Any, file_path: Path
    ) -> list[dict[str, Any]]:
        """Emit one chunk group per top-level definition, plus a module-level remainder.

        Classes are broken up via ``_extract_class_chunks``; every other
        recognized top-level node is emitted with symbol type ``function``.
        Root children not covered by any emitted node are joined into a single
        ``module_level`` chunk group. If nothing is recognized, the content is
        split as plain text.
        """
        source = content.encode("utf-8")
        # Only the parse itself is serialized.
        with _parse_lock:
            tree = parser.parse(source)
        root = tree.root_node

        top_types = TOP_LEVEL_NODES.get(language, [])
        path_str = str(file_path)
        chunks: list[dict[str, Any]] = []
        covered_ranges: list[tuple[int, int]] = []

        for node in root.children:
            actual_node = self._unwrap_definition(node, top_types)
            if actual_node.type not in top_types and node.type not in top_types:
                continue

            node_text = node.text.decode("utf-8", errors="replace") if node.text else ""
            if not node_text.strip():
                continue

            covered_ranges.append((node.start_byte, node.end_byte))

            if actual_node.type in ("class_definition", "class_declaration"):
                produced = _extract_class_chunks(
                    actual_node, source, language, file_path, self._text_splitter
                )
            else:
                symbol_name = _get_node_name(actual_node, source)
                produced = _make_text_chunks(
                    node_text,
                    path_str,
                    "function",
                    symbol_name,
                    None,
                    node.start_point[0] + 1,
                    node.end_point[0] + 1,
                    self._text_splitter,
                )
            chunks.extend(produced)

        if not chunks:
            return self._split_by_text(content, file_path)

        # Collect everything at the root that no emitted definition covers.
        leftovers: list[str] = []
        for node in root.children:
            if not node.text:
                continue
            if any(s <= node.start_byte < e for s, e in covered_ranges):
                continue
            stripped = node.text.decode("utf-8", errors="replace").strip()
            if stripped:
                leftovers.append(stripped)

        if leftovers:
            chunks.extend(
                _make_text_chunks(
                    "\n".join(leftovers),
                    path_str,
                    "module",
                    "module_level",
                    None,
                    1,
                    root.end_point[0] + 1,
                    self._text_splitter,
                )
            )

        return chunks

    def _split_by_text(self, content: str, file_path: Path) -> list[dict[str, Any]]:
        """Split *content* purely by size; line numbers are reported as 0."""
        path_str = str(file_path)
        result: list[dict[str, Any]] = []
        for index, piece in enumerate(self._text_splitter.split_text(content)):
            metadata = {
                "source": path_str,
                "symbol_type": "text",
                "symbol_name": "",
                "class_name": "",
                "line_start": 0,
                "line_end": 0,
                "chunk_index": index,
            }
            result.append({"text": piece, "metadata": metadata})
        return result