sari-0.0.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. app/__init__.py +1 -0
  2. app/config.py +240 -0
  3. app/db.py +932 -0
  4. app/dedup_queue.py +77 -0
  5. app/engine_registry.py +56 -0
  6. app/engine_runtime.py +472 -0
  7. app/http_server.py +204 -0
  8. app/indexer.py +1532 -0
  9. app/main.py +147 -0
  10. app/models.py +39 -0
  11. app/queue_pipeline.py +65 -0
  12. app/ranking.py +144 -0
  13. app/registry.py +172 -0
  14. app/search_engine.py +572 -0
  15. app/watcher.py +124 -0
  16. app/workspace.py +286 -0
  17. deckard/__init__.py +3 -0
  18. deckard/__main__.py +4 -0
  19. deckard/main.py +345 -0
  20. deckard/version.py +1 -0
  21. mcp/__init__.py +1 -0
  22. mcp/__main__.py +19 -0
  23. mcp/cli.py +485 -0
  24. mcp/daemon.py +149 -0
  25. mcp/proxy.py +304 -0
  26. mcp/registry.py +218 -0
  27. mcp/server.py +519 -0
  28. mcp/session.py +234 -0
  29. mcp/telemetry.py +112 -0
  30. mcp/test_cli.py +89 -0
  31. mcp/test_daemon.py +124 -0
  32. mcp/test_server.py +197 -0
  33. mcp/tools/__init__.py +14 -0
  34. mcp/tools/_util.py +244 -0
  35. mcp/tools/deckard_guide.py +32 -0
  36. mcp/tools/doctor.py +208 -0
  37. mcp/tools/get_callers.py +60 -0
  38. mcp/tools/get_implementations.py +60 -0
  39. mcp/tools/index_file.py +75 -0
  40. mcp/tools/list_files.py +138 -0
  41. mcp/tools/read_file.py +48 -0
  42. mcp/tools/read_symbol.py +99 -0
  43. mcp/tools/registry.py +212 -0
  44. mcp/tools/repo_candidates.py +89 -0
  45. mcp/tools/rescan.py +46 -0
  46. mcp/tools/scan_once.py +54 -0
  47. mcp/tools/search.py +208 -0
  48. mcp/tools/search_api_endpoints.py +72 -0
  49. mcp/tools/search_symbols.py +63 -0
  50. mcp/tools/status.py +135 -0
  51. sari/__init__.py +1 -0
  52. sari/__main__.py +4 -0
  53. sari-0.0.1.dist-info/METADATA +521 -0
  54. sari-0.0.1.dist-info/RECORD +58 -0
  55. sari-0.0.1.dist-info/WHEEL +5 -0
  56. sari-0.0.1.dist-info/entry_points.txt +2 -0
  57. sari-0.0.1.dist-info/licenses/LICENSE +21 -0
  58. sari-0.0.1.dist-info/top_level.txt +4 -0
app/dedup_queue.py ADDED
@@ -0,0 +1,77 @@
+ import queue
+ import threading
+ from typing import Any, List, Optional, Set
+
+ class DedupQueue:
+     """
+     A thread-safe queue that drops puts for items that are already pending
+     (enqueued but not yet handed to a consumer).
+     """
+     def __init__(self):
+         self.q: queue.Queue = queue.Queue()
+         self.pending: Set[Any] = set()
+         self.lock = threading.Lock()
+
+     def put(self, item: Any) -> bool:
+         """
+         Put item into the queue. Returns True if added, False if already pending.
+         """
+         with self.lock:
+             if item in self.pending:
+                 return False
+             self.pending.add(item)
+             self.q.put(item)
+             return True
+
+     def get(self, block: bool = True, timeout: Optional[float] = None) -> Any:
+         return self.q.get(block=block, timeout=timeout)
+
+     def task_done(self, item: Any) -> None:
+         """
+         Mark item as processed, removing it from the pending set.
+         """
+         with self.lock:
+             self.pending.discard(item)
+         self.q.task_done()
+
+     def get_batch(self, max_size: int = 50, timeout: float = 0.1) -> List[Any]:
+         """
+         Get up to max_size items; the caller takes ownership of them.
+         Items are removed from the pending set as soon as they are dequeued,
+         so an item that changes again while it is being processed (e.g. a file
+         that is modified mid-reindex) is queued again rather than dropped as
+         a duplicate.
+         """
+         items = []
+         try:
+             # Blocking get for the first item
+             item = self.q.get(block=True, timeout=timeout)
+             items.append(item)
+             # Remove from pending immediately so new events can be queued
+             with self.lock:
+                 self.pending.discard(item)
+             self.q.task_done()
+
+             # Non-blocking for the rest
+             while len(items) < max_size:
+                 try:
+                     item = self.q.get_nowait()
+                     items.append(item)
+                     with self.lock:
+                         self.pending.discard(item)
+                     self.q.task_done()
+                 except queue.Empty:
+                     break
+         except queue.Empty:
+             pass
+
+         return items
+
+     def qsize(self) -> int:
+         return self.q.qsize()
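
A minimal usage sketch of DedupQueue (the watcher/indexer wiring around it is assumed, not part of this file): duplicate puts collapse while an item is pending, and because get_batch clears the pending set on dequeue, an item that changes again mid-processing is re-queued rather than lost.

from app.dedup_queue import DedupQueue

dq = DedupQueue()
dq.put("src/main.py")   # True: queued
dq.put("src/main.py")   # False: already pending, dropped
dq.put("src/util.py")   # True: queued

batch = dq.get_batch(max_size=10, timeout=0.1)  # ["src/main.py", "src/util.py"]
for path in batch:
    # reindex(path) would go here (hypothetical); a concurrent
    # dq.put(path) is accepted again at this point.
    pass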
app/engine_registry.py ADDED
@@ -0,0 +1,56 @@
+ from __future__ import annotations
+
+ import os
+ from typing import Any, Callable, Dict, List, Optional, Protocol, Tuple
+
+ try:
+     from .models import SearchHit, SearchOptions
+     from .search_engine import SqliteSearchEngineAdapter
+     from .engine_runtime import EmbeddedEngine
+ except ImportError:
+     from models import SearchHit, SearchOptions
+     from search_engine import SqliteSearchEngineAdapter
+     from engine_runtime import EmbeddedEngine
+
+
+ class SearchEngineInterface(Protocol):
+     def search_v2(self, opts: SearchOptions) -> Tuple[List[SearchHit], Dict[str, Any]]:
+         ...
+
+     def repo_candidates(self, q: str, limit: int = 3, root_ids: Optional[List[str]] = None) -> List[Dict[str, Any]]:
+         ...
+
+
+ class EngineRegistry:
+     def __init__(self) -> None:
+         self._factories: Dict[str, Callable[[Any, Any, Any], SearchEngineInterface]] = {}
+
+     def register(self, name: str, factory: Callable[[Any, Any, Any], SearchEngineInterface]) -> None:
+         self._factories[name] = factory
+
+     def create(self, name: str, db: Any, cfg: Any = None, roots: Any = None) -> SearchEngineInterface:
+         if name not in self._factories:
+             raise KeyError(f"engine not registered: {name}")
+         return self._factories[name](db, cfg, roots)
+
+     def default(self, db: Any, cfg: Any = None, roots: Any = None) -> SearchEngineInterface:
+         name = default_engine_name()
+         return self.create(name, db, cfg, roots)
+
+
+ _REGISTRY = EngineRegistry()
+ _REGISTRY.register("sqlite", lambda db, _cfg, _roots: SqliteSearchEngineAdapter(db))
+ _REGISTRY.register("embedded", lambda db, cfg, roots: EmbeddedEngine(db, cfg, roots or []))
+
+
+ def get_registry() -> EngineRegistry:
+     return _REGISTRY
+
+
+ def default_engine_name() -> str:
+     mode = (os.environ.get("DECKARD_ENGINE_MODE") or "sqlite").strip().lower()
+     return "embedded" if mode == "embedded" else "sqlite"
+
+
+ def get_default_engine(db: Any, cfg: Any = None, roots: Any = None) -> SearchEngineInterface:
+     return _REGISTRY.default(db, cfg, roots)
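
Engine selection is driven by the DECKARD_ENGINE_MODE environment variable; any value other than "embedded" falls back to "sqlite". A sketch of how a caller might resolve and extend the registry, assuming db, cfg, opts, and MyEngine as placeholders (they are not names defined in this file):

import os
from app.engine_registry import get_registry, get_default_engine

os.environ["DECKARD_ENGINE_MODE"] = "embedded"
engine = get_default_engine(db, cfg, roots=["/path/to/repo"])  # db/cfg: placeholders
hits, meta = engine.search_v2(opts)  # opts: a SearchOptions instance

# Additional engines can be registered under a new name:
get_registry().register("custom", lambda db, cfg, roots: MyEngine(db))  # MyEngine: hypothetical
engine = get_registry().create("custom", db)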
app/engine_runtime.py ADDED
@@ -0,0 +1,472 @@
+ import hashlib
+ import json
+ import os
+ import re
+ import shutil
+ import subprocess
+ import sys
+ import time
+ import unicodedata
+ from dataclasses import dataclass
+ from pathlib import Path
+ from typing import Any, Dict, Iterable, List, Optional, Tuple
+
+ try:
+     from .models import SearchHit, SearchOptions
+     from .ranking import get_file_extension, snippet_around
+     from .workspace import WorkspaceManager
+ except ImportError:
+     from models import SearchHit, SearchOptions
+     from ranking import get_file_extension, snippet_around
+     from workspace import WorkspaceManager
+
+
+ ENGINE_PACKAGE = os.environ.get("DECKARD_ENGINE_PACKAGE", "tantivy==0.22.0")
+ _DEFAULT_ENGINE_MEM_MB = 512
+ _DEFAULT_ENGINE_INDEX_MEM_MB = 256
+ _DEFAULT_ENGINE_THREADS = 2
+
+
+ class EngineError(RuntimeError):
+     def __init__(self, code: str, message: str, hint: Optional[str] = None):
+         super().__init__(message)
+         self.code = code
+         self.message = message
+         self.hint = hint or ""
+
+
+ def _normalize_text(text: str) -> str:
+     if not text:
+         return ""
+     norm = unicodedata.normalize("NFKC", text)
+     norm = norm.lower()
+     norm = " ".join(norm.split())
+     return norm
+
+
+ def _env_int(name: str, default: int) -> int:
+     try:
+         return int(os.environ.get(name, default))
+     except (TypeError, ValueError):
+         return default
+
+
+ def _query_parts(q: str) -> Tuple[List[str], List[str]]:
+     # Split on double-quoted spans: odd indices of the split are quoted phrases.
+     parts = re.split(r"\"([^\"]+)\"", q)
+     tokens: List[str] = []
+     phrases: List[str] = []
+     for idx, part in enumerate(parts):
+         if idx % 2 == 1:
+             if part.strip():
+                 phrases.append(part.strip())
+         else:
+             tokens.extend([p for p in part.strip().split() if p])
+     return tokens, phrases
+
+
+ def _has_cjk(text: str) -> bool:
+     for ch in text:
+         code = ord(ch)
+         # CJK Unified Ideographs, CJK Extension A, Hiragana/Katakana
+         if 0x4E00 <= code <= 0x9FFF or 0x3400 <= code <= 0x4DBF or 0x3040 <= code <= 0x30FF:
+             return True
+     return False
+
+
+ def _venv_python(venv_dir: Path) -> Path:
+     if os.name == "nt":
+         return venv_dir / "Scripts" / "python.exe"
+     return venv_dir / "bin" / "python"
+
+
+ def _inject_venv_site_packages(venv_dir: Path) -> None:
+     major = sys.version_info.major
+     minor = sys.version_info.minor
+     if os.name == "nt":
+         sp = venv_dir / "Lib" / "site-packages"
+     else:
+         sp = venv_dir / "lib" / f"python{major}.{minor}" / "site-packages"
+     if sp.exists():
+         sys.path.insert(0, str(sp))
+
+
+ def _ensure_venv(venv_dir: Path) -> None:
+     if venv_dir.exists():
+         return
+     import venv
+     venv_dir.parent.mkdir(parents=True, exist_ok=True)
+     venv.EnvBuilder(with_pip=True).create(str(venv_dir))
+
+
+ def _install_engine_package(venv_dir: Path) -> None:
+     _ensure_venv(venv_dir)
+     py = _venv_python(venv_dir)
+     subprocess.check_call([str(py), "-m", "pip", "install", ENGINE_PACKAGE])
+
+
+ def _load_tantivy(venv_dir: Path, auto_install: bool) -> Any:
+     try:
+         import tantivy  # type: ignore
+         return tantivy
+     except Exception:
+         if not auto_install:
+             raise EngineError("ERR_ENGINE_NOT_INSTALLED", "Engine not installed", "sari --cmd engine install")
+         _install_engine_package(venv_dir)
+         _inject_venv_site_packages(venv_dir)
+         try:
+             import tantivy  # type: ignore
+             return tantivy
+         except Exception as exc:
+             raise EngineError("ERR_ENGINE_NOT_INSTALLED", f"Engine install failed: {exc}", "sari --cmd engine install")
+
+ @dataclass
+ class EngineMeta:
+     engine_mode: str
+     engine_ready: bool
+     engine_version: str
+     index_version: str
+     reason: str = ""
+     hint: str = ""
+     doc_count: int = 0
+     index_size_bytes: int = 0
+     last_build_ts: int = 0
+     engine_mem_mb: int = 0
+     index_mem_mb: int = 0
+     engine_threads: int = 0
+
+
+ class EmbeddedEngine:
+     def __init__(self, db: Any, cfg: Any, roots: List[str]):
+         self._db = db
+         self._cfg = cfg
+         self._roots = roots
+         self._root_ids = [WorkspaceManager.root_id(r) for r in roots]
+         self._roots_hash = WorkspaceManager.roots_hash(self._root_ids)
+         self._index_dir = WorkspaceManager.get_engine_index_dir(self._roots_hash)
+         self._cache_dir = WorkspaceManager.get_engine_cache_dir()
+         self._venv_dir = WorkspaceManager.get_engine_venv_dir()
+         self._index_version_path = self._index_dir / "index_version.json"
+         self._auto_install = (os.environ.get("DECKARD_ENGINE_AUTO_INSTALL", "1").strip().lower() not in {"0", "false", "no", "off"})
+         self._tantivy = None
+         self._index = None
+         self._schema = None
+         self._fields: Dict[str, Any] = {}
+
+     def _engine_limits(self) -> Tuple[int, int, int]:
+         mem_mb = _env_int("DECKARD_ENGINE_MEM_MB", _DEFAULT_ENGINE_MEM_MB)
+         index_mem_mb = _env_int("DECKARD_ENGINE_INDEX_MEM_MB", _DEFAULT_ENGINE_INDEX_MEM_MB)
+         threads = _env_int("DECKARD_ENGINE_THREADS", _DEFAULT_ENGINE_THREADS)
+         mem_mb = max(64, mem_mb)
+         index_mem_mb = max(64, index_mem_mb)
+         if index_mem_mb > mem_mb:
+             index_mem_mb = mem_mb
+         max_threads = max(1, os.cpu_count() or 1)
+         if threads < 1:
+             threads = 1
+         if threads > max_threads:
+             threads = max_threads
+         return mem_mb, index_mem_mb, threads
+
+     def _index_writer(self, index: Any) -> Any:
+         _mem_mb, index_mem_mb, threads = self._engine_limits()
+         budget = int(index_mem_mb) * 1024 * 1024
+         # Older bindings accept fewer writer() arguments; fall back gracefully.
+         try:
+             return index.writer(budget, threads)
+         except TypeError:
+             try:
+                 return index.writer(budget)
+             except TypeError:
+                 return index.writer()
+
+     def _engine_version(self) -> str:
+         if not self._tantivy:
+             return "unknown"
+         return getattr(self._tantivy, "__version__", "unknown")
+
+     def _config_hash(self) -> str:
+         payload = {
+             "root_ids": sorted(self._root_ids),
+             "include_ext": list(getattr(self._cfg, "include_ext", [])),
+             "include_files": list(getattr(self._cfg, "include_files", [])),
+             "exclude_dirs": list(getattr(self._cfg, "exclude_dirs", [])),
+             "exclude_globs": list(getattr(self._cfg, "exclude_globs", [])),
+             "max_file_bytes": int(getattr(self._cfg, "max_file_bytes", 0) or 0),
+             "size_profile": (os.environ.get("DECKARD_SIZE_PROFILE") or "default").strip().lower(),
+             "max_parse_bytes": int(os.environ.get("DECKARD_MAX_PARSE_BYTES", "0") or 0),
+             "max_ast_bytes": int(os.environ.get("DECKARD_MAX_AST_BYTES", "0") or 0),
+             "follow_symlinks": (os.environ.get("DECKARD_FOLLOW_SYMLINKS", "0").strip().lower() in ("1", "true", "yes", "on")),
+             "engine_version": self._engine_version(),
+             "max_doc_bytes": int(os.environ.get("DECKARD_ENGINE_MAX_DOC_BYTES", "4194304") or 4194304),
+             "preview_bytes": int(os.environ.get("DECKARD_ENGINE_PREVIEW_BYTES", "8192") or 8192),
+         }
+         raw = json.dumps(payload, sort_keys=True, ensure_ascii=False)
+         return hashlib.sha1(raw.encode("utf-8")).hexdigest()
+
+     def _load_index_version(self) -> Dict[str, Any]:
+         if not self._index_version_path.exists():
+             return {}
+         try:
+             return json.loads(self._index_version_path.read_text(encoding="utf-8"))
+         except Exception:
+             return {}
+
+     def _write_index_version(self, doc_count: int) -> None:
+         meta = {
+             "version": 1,
+             "build_ts": int(time.time()),
+             "doc_count": int(doc_count),
+             "engine_version": self._engine_version(),
+             "config_hash": self._config_hash(),
+         }
+         self._index_dir.mkdir(parents=True, exist_ok=True)
+         self._index_version_path.write_text(json.dumps(meta, ensure_ascii=False, indent=2) + "\n", encoding="utf-8")
+
+     def _ensure_index(self) -> None:
+         self._tantivy = _load_tantivy(self._venv_dir, self._auto_install)
+         if self._schema and self._index:
+             return
+         schema_builder = self._tantivy.SchemaBuilder()
+         self._fields = {
+             "doc_id": schema_builder.add_text_field("doc_id", stored=True),
+             "path": schema_builder.add_text_field("path", stored=True),
+             "repo": schema_builder.add_text_field("repo", stored=True),
+             "root_id": schema_builder.add_text_field("root_id", stored=True),
+             "rel_path": schema_builder.add_text_field("rel_path", stored=True),
+             "path_text": schema_builder.add_text_field("path_text"),
+             "body_text": schema_builder.add_text_field("body_text"),
+             "preview": schema_builder.add_text_field("preview", stored=True),
+             "mtime": schema_builder.add_i64_field("mtime", stored=True),
+             "size": schema_builder.add_i64_field("size", stored=True),
+         }
+         self._schema = schema_builder.build()
+         if self._index_dir.exists() and (self._index_dir / "meta.json").exists():
+             self._index = self._tantivy.Index(self._index_dir.as_posix())
+         else:
+             self._index_dir.mkdir(parents=True, exist_ok=True)
+             self._index = self._tantivy.Index(self._schema, self._index_dir.as_posix())
+
+     def status(self) -> EngineMeta:
+         mode = "embedded"
+         mem_mb, index_mem_mb, threads = self._engine_limits()
+         try:
+             if not self._tantivy:
+                 self._tantivy = _load_tantivy(self._venv_dir, auto_install=False)
+         except EngineError:
+             return EngineMeta(
+                 engine_mode=mode,
+                 engine_ready=False,
+                 engine_version="unknown",
+                 index_version="",
+                 reason="NOT_INSTALLED",
+                 hint="sari --cmd engine install",
+                 engine_mem_mb=mem_mb,
+                 index_mem_mb=index_mem_mb,
+                 engine_threads=threads,
+             )
+         index_meta = self._load_index_version()
+         engine_version = index_meta.get("engine_version", "")
+         cfg_hash = index_meta.get("config_hash", "")
+         ready = bool(index_meta and cfg_hash == self._config_hash() and engine_version)
+         reason = ""
+         hint = ""
+         if not index_meta:
+             ready = False
+             reason = "INDEX_MISSING"
+             hint = "sari --cmd engine rebuild"
+         elif cfg_hash != self._config_hash():
+             ready = False
+             reason = "CONFIG_MISMATCH"
+             hint = "sari --cmd engine rebuild"
+         if not engine_version:
+             ready = False
+             reason = "ENGINE_MISMATCH"
+             hint = "sari --cmd engine rebuild"
+         idx_size = 0
+         if self._index_dir.exists():
+             try:
+                 idx_size = sum(p.stat().st_size for p in self._index_dir.rglob("*") if p.is_file())
+             except Exception:
+                 idx_size = 0
+         return EngineMeta(
+             engine_mode=mode,
+             engine_ready=ready,
+             engine_version=engine_version or "unknown",
+             index_version=cfg_hash or "",
+             reason=reason,
+             hint=hint,
+             doc_count=int(index_meta.get("doc_count", 0) or 0),
+             index_size_bytes=idx_size,
+             last_build_ts=int(index_meta.get("build_ts", 0) or 0),
+             engine_mem_mb=mem_mb,
+             index_mem_mb=index_mem_mb,
+             engine_threads=threads,
+         )
+
+     def install(self) -> None:
+         _load_tantivy(self._venv_dir, auto_install=True)
+         self._ensure_index()
+
+     def rebuild(self) -> None:
+         self._ensure_index()
+         tmp_dir = self._index_dir.parent / f"{self._index_dir.name}.build"
+         # Build into a scratch directory, then swap it into place.
+         shutil.rmtree(tmp_dir, ignore_errors=True)
+         tmp_dir.mkdir(parents=True, exist_ok=True)
+         idx = self._tantivy.Index(self._schema, tmp_dir.as_posix())
+         writer = self._index_writer(idx)
+         count = 0
+         for doc in self._db.iter_engine_documents(self._root_ids):
+             writer.add_document(self._tantivy.Document(**doc))
+             count += 1
+         writer.commit()
+         idx.reload()
+         shutil.rmtree(self._index_dir, ignore_errors=True)
+         tmp_dir.replace(self._index_dir)
+         self._index = idx
+         self._write_index_version(count)
+
+     def upsert_documents(self, docs: Iterable[Dict[str, Any]]) -> None:
+         self._ensure_index()
+         writer = self._index_writer(self._index)
+         count = 0
+         for doc in docs:
+             doc_id = doc.get("doc_id")
+             if doc_id:
+                 # Delete any previous revision of this document before re-adding.
+                 term = self._tantivy.Term.from_field_text(self._fields["doc_id"], doc_id)
+                 writer.delete_term(term)
+             writer.add_document(self._tantivy.Document(**doc))
+             count += 1
+         writer.commit()
+         if count:
+             self._write_index_version(int(self._load_index_version().get("doc_count", 0) or 0) + count)
+
+     def delete_documents(self, doc_ids: Iterable[str]) -> None:
+         self._ensure_index()
+         writer = self._index_writer(self._index)
+         deleted = 0
+         for doc_id in doc_ids:
+             term = self._tantivy.Term.from_field_text(self._fields["doc_id"], doc_id)
+             writer.delete_term(term)
+             deleted += 1
+         if deleted:
+             writer.commit()
+             meta = self._load_index_version()
+             doc_count = int(meta.get("doc_count", 0) or 0)
+             doc_count = max(0, doc_count - deleted)
+             self._write_index_version(doc_count)
+
+     def search_v2(self, opts: SearchOptions) -> Tuple[List[SearchHit], Dict[str, Any]]:
+         self._ensure_index()
+         meta = {"total_mode": "approx", "total": -1}
+         norm_q = _normalize_text(opts.query or "")
+         if not norm_q:
+             return [], meta
+         tokens, phrases = _query_parts(norm_q)
+         pieces = []
+         for p in phrases:
+             pieces.append(f"\"{p}\"")
+         for t in tokens:
+             pieces.append(t)
+         qstr = " AND ".join(pieces) if pieces else ""
+         if not qstr:
+             return [], meta
+         qp = self._tantivy.QueryParser.for_index(self._index, [self._fields["body_text"], self._fields["path_text"]])
+         try:
+             qp.set_conjunction_by_default()
+         except Exception:
+             pass
+         query = qp.parse_query(qstr)
+         searcher = self._index.searcher()
+         limit = max(1, min(int(opts.limit), 50))
+         top_docs = searcher.search(query, self._tantivy.TopDocs(limit=limit + int(opts.offset)))
+         hits: List[SearchHit] = []
+         for score, doc_address in top_docs:
+             doc = searcher.doc(doc_address)
+             path = doc.get_first(self._fields["path"])
+             repo = doc.get_first(self._fields["repo"]) or "__root__"
+             mtime = int(doc.get_first(self._fields["mtime"]) or 0)
+             size = int(doc.get_first(self._fields["size"]) or 0)
+             preview = doc.get_first(self._fields["preview"]) or ""
+             path_str = str(path) if path else ""
+             if opts.root_ids:
+                 rid = doc.get_first(self._fields["root_id"]) or ""
+                 if rid not in opts.root_ids:
+                     continue
+             if opts.repo and repo != opts.repo:
+                 continue
+             if opts.file_types and get_file_extension(path_str) not in [ft.lower().lstrip(".") for ft in opts.file_types]:
+                 continue
+             if opts.path_pattern and not _path_pattern_match(path_str, opts.path_pattern):
+                 continue
+             if opts.exclude_patterns and _exclude_pattern_match(path_str, opts.exclude_patterns):
+                 continue
+             snippet = snippet_around(preview, tokens, opts.snippet_lines, highlight=True) if preview else ""
+             hits.append(SearchHit(
+                 repo=repo,
+                 path=path_str,
+                 score=float(score),
+                 snippet=snippet,
+                 mtime=mtime,
+                 size=size,
+                 match_count=0,
+                 file_type=get_file_extension(path_str),
+                 hit_reason="Engine match",
+             ))
+         hits.sort(key=lambda h: (-h.score, -h.mtime, h.path))
+         start = int(opts.offset)
+         end = start + limit
+         return hits[start:end], meta
+
+     def repo_candidates(self, q: str, limit: int = 3, root_ids: Optional[List[str]] = None) -> List[Dict[str, Any]]:
+         q = (q or "").strip()
+         if not q:
+             return []
+         sql = "SELECT repo, COUNT(1) AS c FROM files WHERE content LIKE ? ESCAPE '^' GROUP BY repo ORDER BY c DESC LIMIT ?;"
+         # Escape LIKE wildcards so the user query is matched literally.
+         like_q = q.replace("^", "^^").replace("%", "^%").replace("_", "^_")
+         with self._db._read_lock:
+             rows = self._db._read.execute(sql, (f"%{like_q}%", limit)).fetchall()
+         out = []
+         for r in rows:
+             repo, c = str(r["repo"]), int(r["c"])
+             out.append({"repo": repo, "score": c, "evidence": ""})
+         return out
+
+
+ def _path_pattern_match(path: str, pattern: str) -> bool:
+     import fnmatch
+     p = path.replace("\\", "/")
+     pat = pattern.replace("\\", "/")
+     if pat.startswith("/"):
+         if p.startswith(pat):
+             return True
+     if p.endswith("/" + pat) or p == pat:
+         return True
+     return fnmatch.fnmatch(p, pat) or fnmatch.fnmatch(p, f"*/{pat}") or fnmatch.fnmatch(p, f"*/{pat}/*")
+
+
+ def _exclude_pattern_match(path: str, patterns: List[str]) -> bool:
+     import fnmatch
+     for p in patterns:
+         if p in path or fnmatch.fnmatch(path, f"*{p}*"):
+             return True
+     return False
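
Taken together, the lifecycle in this file is: load (or pip-install) tantivy into a managed venv, gate searches on an index_version.json whose config_hash must match the current configuration, and rebuild into a sibling .build directory that is swapped into place. A minimal sketch of driving it directly, assuming db and cfg as placeholders for the package's DB handle and config object (neither is shown in this diff):

import os
from app.engine_runtime import EmbeddedEngine, EngineError

os.environ["DECKARD_ENGINE_INDEX_MEM_MB"] = "128"  # writer budget, clamped to >= 64
os.environ["DECKARD_ENGINE_THREADS"] = "2"         # clamped to [1, cpu_count]

engine = EmbeddedEngine(db, cfg, roots=["/path/to/repo"])  # db/cfg: placeholders
meta = engine.status()
if not meta.engine_ready:
    if meta.reason == "NOT_INSTALLED":
        engine.install()  # installs ENGINE_PACKAGE (tantivy==0.22.0) into the venv
    engine.rebuild()      # indexes db.iter_engine_documents(...) and swaps the index in
hits, search_meta = engine.search_v2(opts)  # opts: a SearchOptions instance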