sari-0.0.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- app/__init__.py +1 -0
- app/config.py +240 -0
- app/db.py +932 -0
- app/dedup_queue.py +77 -0
- app/engine_registry.py +56 -0
- app/engine_runtime.py +472 -0
- app/http_server.py +204 -0
- app/indexer.py +1532 -0
- app/main.py +147 -0
- app/models.py +39 -0
- app/queue_pipeline.py +65 -0
- app/ranking.py +144 -0
- app/registry.py +172 -0
- app/search_engine.py +572 -0
- app/watcher.py +124 -0
- app/workspace.py +286 -0
- deckard/__init__.py +3 -0
- deckard/__main__.py +4 -0
- deckard/main.py +345 -0
- deckard/version.py +1 -0
- mcp/__init__.py +1 -0
- mcp/__main__.py +19 -0
- mcp/cli.py +485 -0
- mcp/daemon.py +149 -0
- mcp/proxy.py +304 -0
- mcp/registry.py +218 -0
- mcp/server.py +519 -0
- mcp/session.py +234 -0
- mcp/telemetry.py +112 -0
- mcp/test_cli.py +89 -0
- mcp/test_daemon.py +124 -0
- mcp/test_server.py +197 -0
- mcp/tools/__init__.py +14 -0
- mcp/tools/_util.py +244 -0
- mcp/tools/deckard_guide.py +32 -0
- mcp/tools/doctor.py +208 -0
- mcp/tools/get_callers.py +60 -0
- mcp/tools/get_implementations.py +60 -0
- mcp/tools/index_file.py +75 -0
- mcp/tools/list_files.py +138 -0
- mcp/tools/read_file.py +48 -0
- mcp/tools/read_symbol.py +99 -0
- mcp/tools/registry.py +212 -0
- mcp/tools/repo_candidates.py +89 -0
- mcp/tools/rescan.py +46 -0
- mcp/tools/scan_once.py +54 -0
- mcp/tools/search.py +208 -0
- mcp/tools/search_api_endpoints.py +72 -0
- mcp/tools/search_symbols.py +63 -0
- mcp/tools/status.py +135 -0
- sari/__init__.py +1 -0
- sari/__main__.py +4 -0
- sari-0.0.1.dist-info/METADATA +521 -0
- sari-0.0.1.dist-info/RECORD +58 -0
- sari-0.0.1.dist-info/WHEEL +5 -0
- sari-0.0.1.dist-info/entry_points.txt +2 -0
- sari-0.0.1.dist-info/licenses/LICENSE +21 -0
- sari-0.0.1.dist-info/top_level.txt +4 -0
app/dedup_queue.py
ADDED
@@ -0,0 +1,77 @@
import queue
import threading
from typing import Any, List, Optional, Set


class DedupQueue:
    """
    A thread-safe queue that ignores items that are already pending.
    """

    def __init__(self):
        self.q: queue.Queue = queue.Queue()
        self.pending: Set[Any] = set()
        self.lock = threading.Lock()

    def put(self, item: Any) -> bool:
        """
        Put item into queue. Returns True if added, False if already pending.
        """
        with self.lock:
            if item in self.pending:
                return False
            self.pending.add(item)
            self.q.put(item)
            return True

    def get(self, block: bool = True, timeout: Optional[float] = None) -> Any:
        # The caller is responsible for calling task_done(item) afterwards
        # so the item is cleared from the pending set.
        return self.q.get(block=block, timeout=timeout)

    def task_done(self, item: Any) -> None:
        """
        Mark item as processed, removing it from the pending set.
        """
        with self.lock:
            self.pending.discard(item)
        self.q.task_done()

    def get_batch(self, max_size: int = 50, timeout: float = 0.1) -> List[Any]:
        """
        Get up to max_size items; the caller takes ownership of them.

        Items are removed from the pending set as soon as they are
        dequeued, not when processing finishes, so an item that changes
        again while it is still being processed (e.g. a file modified
        during indexing) can be queued again.
        """
        items = []
        try:
            # Blocking get for the first item
            item = self.q.get(block=True, timeout=timeout)
            items.append(item)
            # Remove from pending immediately so new events can be queued
            with self.lock:
                self.pending.discard(item)
            self.q.task_done()  # keep queue.Queue's unfinished-task counter balanced

            # Non-blocking for the rest
            while len(items) < max_size:
                try:
                    item = self.q.get_nowait()
                    items.append(item)
                    with self.lock:
                        self.pending.discard(item)
                    self.q.task_done()
                except queue.Empty:
                    break
        except queue.Empty:
            pass

        return items

    def qsize(self) -> int:
        return self.q.qsize()
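
Editor's note: the coalescing semantics above are easiest to see in use. A minimal sketch (not part of the package; the paths are placeholders) showing that duplicate puts collapse while an item is pending, and that get_batch() releases items for re-queueing the moment they are handed to the consumer:

# Sketch only: exercising DedupQueue's coalescing semantics.
from app.dedup_queue import DedupQueue  # import path per this wheel's layout

dq = DedupQueue()
print(dq.put("src/app.py"))    # True  -> queued
print(dq.put("src/app.py"))    # False -> coalesced with the pending entry
print(dq.put("src/util.py"))   # True  -> queued

batch = dq.get_batch(max_size=50, timeout=0.1)
print(batch)                   # ["src/app.py", "src/util.py"]

# Both paths left the pending set when the batch was taken, so a new
# change event for "src/app.py" queues it again even while the batch
# is still being processed.
print(dq.put("src/app.py"))    # True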
app/engine_registry.py
ADDED
@@ -0,0 +1,56 @@
from __future__ import annotations

import os
from typing import Any, Callable, Dict, List, Optional, Protocol, Tuple

try:
    from .models import SearchHit, SearchOptions
    from .search_engine import SqliteSearchEngineAdapter
    from .engine_runtime import EmbeddedEngine
except ImportError:
    from models import SearchHit, SearchOptions
    from search_engine import SqliteSearchEngineAdapter
    from engine_runtime import EmbeddedEngine


class SearchEngineInterface(Protocol):
    def search_v2(self, opts: SearchOptions) -> Tuple[List[SearchHit], Dict[str, Any]]:
        ...

    def repo_candidates(self, q: str, limit: int = 3, root_ids: Optional[List[str]] = None) -> List[Dict[str, Any]]:
        ...


class EngineRegistry:
    def __init__(self) -> None:
        self._factories: Dict[str, Callable[[Any, Any, Any], SearchEngineInterface]] = {}

    def register(self, name: str, factory: Callable[[Any, Any, Any], SearchEngineInterface]) -> None:
        self._factories[name] = factory

    def create(self, name: str, db: Any, cfg: Any = None, roots: Any = None) -> SearchEngineInterface:
        if name not in self._factories:
            raise KeyError(f"engine not registered: {name}")
        return self._factories[name](db, cfg, roots)

    def default(self, db: Any, cfg: Any = None, roots: Any = None) -> SearchEngineInterface:
        name = default_engine_name()
        return self.create(name, db, cfg, roots)


_REGISTRY = EngineRegistry()
_REGISTRY.register("sqlite", lambda db, _cfg, _roots: SqliteSearchEngineAdapter(db))
_REGISTRY.register("embedded", lambda db, cfg, roots: EmbeddedEngine(db, cfg, roots or []))


def get_registry() -> EngineRegistry:
    return _REGISTRY


def default_engine_name() -> str:
    mode = (os.environ.get("DECKARD_ENGINE_MODE") or "sqlite").strip().lower()
    return "embedded" if mode == "embedded" else "sqlite"


def get_default_engine(db: Any, cfg: Any = None, roots: Any = None) -> SearchEngineInterface:
    return _REGISTRY.default(db, cfg, roots)
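
Editor's note: the module registers "sqlite" and "embedded" factories at import time and picks between them via the DECKARD_ENGINE_MODE environment variable. A self-contained sketch (not from the package) of the registry mechanics, using a stub engine in place of the real adapters:

# Sketch only: registry mechanics with a stub engine. The wheel's own
# factories wire up SqliteSearchEngineAdapter and EmbeddedEngine instead.
from app.engine_registry import EngineRegistry

class StubEngine:
    def search_v2(self, opts):
        return [], {"total_mode": "approx", "total": -1}

    def repo_candidates(self, q, limit=3, root_ids=None):
        return []

reg = EngineRegistry()
reg.register("stub", lambda db, cfg, roots: StubEngine())
engine = reg.create("stub", db=None)           # KeyError for unknown names
print(engine.search_v2(None))                  # ([], {'total_mode': 'approx', 'total': -1})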
app/engine_runtime.py
ADDED
@@ -0,0 +1,472 @@
import hashlib
import json
import os
import re
import subprocess
import sys
import time
import unicodedata
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, Tuple

try:
    from .models import SearchHit, SearchOptions
    from .ranking import get_file_extension, snippet_around
    from .workspace import WorkspaceManager
except ImportError:
    from models import SearchHit, SearchOptions
    from ranking import get_file_extension, snippet_around
    from workspace import WorkspaceManager


ENGINE_PACKAGE = os.environ.get("DECKARD_ENGINE_PACKAGE", "tantivy==0.22.0")
_DEFAULT_ENGINE_MEM_MB = 512
_DEFAULT_ENGINE_INDEX_MEM_MB = 256
_DEFAULT_ENGINE_THREADS = 2


class EngineError(RuntimeError):
    def __init__(self, code: str, message: str, hint: Optional[str] = None):
        super().__init__(message)
        self.code = code
        self.message = message
        self.hint = hint or ""


def _normalize_text(text: str) -> str:
    if not text:
        return ""
    norm = unicodedata.normalize("NFKC", text)
    norm = norm.lower()
    norm = " ".join(norm.split())
    return norm


def _env_int(name: str, default: int) -> int:
    try:
        return int(os.environ.get(name, default))
    except (TypeError, ValueError):
        return default


def _query_parts(q: str) -> Tuple[List[str], List[str]]:
    # Split a query into bare tokens and double-quoted phrases.
    parts = re.split(r"\"([^\"]+)\"", q)
    tokens: List[str] = []
    phrases: List[str] = []
    for idx, part in enumerate(parts):
        if idx % 2 == 1:
            if part.strip():
                phrases.append(part.strip())
        else:
            tokens.extend([p for p in part.strip().split() if p])
    return tokens, phrases


def _has_cjk(text: str) -> bool:
    for ch in text:
        code = ord(ch)
        if 0x4E00 <= code <= 0x9FFF or 0x3400 <= code <= 0x4DBF or 0x3040 <= code <= 0x30FF:
            return True
    return False


def _venv_python(venv_dir: Path) -> Path:
    if os.name == "nt":
        return venv_dir / "Scripts" / "python.exe"
    return venv_dir / "bin" / "python"


def _inject_venv_site_packages(venv_dir: Path) -> None:
    major = sys.version_info.major
    minor = sys.version_info.minor
    if os.name == "nt":
        sp = venv_dir / "Lib" / "site-packages"
    else:
        sp = venv_dir / "lib" / f"python{major}.{minor}" / "site-packages"
    if sp.exists():
        sys.path.insert(0, str(sp))


def _ensure_venv(venv_dir: Path) -> None:
    if venv_dir.exists():
        return
    import venv
    venv_dir.parent.mkdir(parents=True, exist_ok=True)
    venv.EnvBuilder(with_pip=True).create(str(venv_dir))


def _install_engine_package(venv_dir: Path) -> None:
    _ensure_venv(venv_dir)
    py = _venv_python(venv_dir)
    subprocess.check_call([str(py), "-m", "pip", "install", ENGINE_PACKAGE])


def _load_tantivy(venv_dir: Path, auto_install: bool) -> Any:
    try:
        import tantivy  # type: ignore
        return tantivy
    except Exception:
        if not auto_install:
            raise EngineError("ERR_ENGINE_NOT_INSTALLED", "Engine not installed", "sari --cmd engine install")
        _install_engine_package(venv_dir)
        _inject_venv_site_packages(venv_dir)
        try:
            import tantivy  # type: ignore
            return tantivy
        except Exception as exc:
            raise EngineError("ERR_ENGINE_NOT_INSTALLED", f"Engine install failed: {exc}", "sari --cmd engine install")


@dataclass
class EngineMeta:
    engine_mode: str
    engine_ready: bool
    engine_version: str
    index_version: str
    reason: str = ""
    hint: str = ""
    doc_count: int = 0
    index_size_bytes: int = 0
    last_build_ts: int = 0
    engine_mem_mb: int = 0
    index_mem_mb: int = 0
    engine_threads: int = 0


class EmbeddedEngine:
    def __init__(self, db: Any, cfg: Any, roots: List[str]):
        self._db = db
        self._cfg = cfg
        self._roots = roots
        self._root_ids = [WorkspaceManager.root_id(r) for r in roots]
        self._roots_hash = WorkspaceManager.roots_hash(self._root_ids)
        self._index_dir = WorkspaceManager.get_engine_index_dir(self._roots_hash)
        self._cache_dir = WorkspaceManager.get_engine_cache_dir()
        self._venv_dir = WorkspaceManager.get_engine_venv_dir()
        self._index_version_path = self._index_dir / "index_version.json"
        self._auto_install = (os.environ.get("DECKARD_ENGINE_AUTO_INSTALL", "1").strip().lower() not in {"0", "false", "no", "off"})
        self._tantivy = None
        self._index = None
        self._schema = None
        self._fields: Dict[str, Any] = {}

    def _engine_limits(self) -> Tuple[int, int, int]:
        # Clamp env-configured limits to sane floors and the host's CPU count.
        mem_mb = _env_int("DECKARD_ENGINE_MEM_MB", _DEFAULT_ENGINE_MEM_MB)
        index_mem_mb = _env_int("DECKARD_ENGINE_INDEX_MEM_MB", _DEFAULT_ENGINE_INDEX_MEM_MB)
        threads = _env_int("DECKARD_ENGINE_THREADS", _DEFAULT_ENGINE_THREADS)
        mem_mb = max(64, mem_mb)
        index_mem_mb = max(64, index_mem_mb)
        if index_mem_mb > mem_mb:
            index_mem_mb = mem_mb
        max_threads = max(1, os.cpu_count() or 1)
        if threads < 1:
            threads = 1
        if threads > max_threads:
            threads = max_threads
        return mem_mb, index_mem_mb, threads

    def _index_writer(self, index: Any) -> Any:
        # tantivy's writer() signature varies across versions; fall back
        # from (budget, threads) to (budget) to no arguments.
        _mem_mb, index_mem_mb, threads = self._engine_limits()
        budget = int(index_mem_mb) * 1024 * 1024
        try:
            return index.writer(budget, threads)
        except TypeError:
            try:
                return index.writer(budget)
            except TypeError:
                return index.writer()

    def _engine_version(self) -> str:
        if not self._tantivy:
            return "unknown"
        return getattr(self._tantivy, "__version__", "unknown")

    def _config_hash(self) -> str:
        # Any change to this payload invalidates the on-disk index
        # (status() reports CONFIG_MISMATCH until a rebuild).
        payload = {
            "root_ids": sorted(self._root_ids),
            "include_ext": list(getattr(self._cfg, "include_ext", [])),
            "include_files": list(getattr(self._cfg, "include_files", [])),
            "exclude_dirs": list(getattr(self._cfg, "exclude_dirs", [])),
            "exclude_globs": list(getattr(self._cfg, "exclude_globs", [])),
            "max_file_bytes": int(getattr(self._cfg, "max_file_bytes", 0) or 0),
            "size_profile": (os.environ.get("DECKARD_SIZE_PROFILE") or "default").strip().lower(),
            "max_parse_bytes": int(os.environ.get("DECKARD_MAX_PARSE_BYTES", "0") or 0),
            "max_ast_bytes": int(os.environ.get("DECKARD_MAX_AST_BYTES", "0") or 0),
            "follow_symlinks": (os.environ.get("DECKARD_FOLLOW_SYMLINKS", "0").strip().lower() in ("1", "true", "yes", "on")),
            "engine_version": self._engine_version(),
            "max_doc_bytes": int(os.environ.get("DECKARD_ENGINE_MAX_DOC_BYTES", "4194304") or 4194304),
            "preview_bytes": int(os.environ.get("DECKARD_ENGINE_PREVIEW_BYTES", "8192") or 8192),
        }
        raw = json.dumps(payload, sort_keys=True, ensure_ascii=False)
        return hashlib.sha1(raw.encode("utf-8")).hexdigest()

    def _load_index_version(self) -> Dict[str, Any]:
        if not self._index_version_path.exists():
            return {}
        try:
            return json.loads(self._index_version_path.read_text(encoding="utf-8"))
        except Exception:
            return {}

    def _write_index_version(self, doc_count: int) -> None:
        meta = {
            "version": 1,
            "build_ts": int(time.time()),
            "doc_count": int(doc_count),
            "engine_version": self._engine_version(),
            "config_hash": self._config_hash(),
        }
        self._index_dir.mkdir(parents=True, exist_ok=True)
        self._index_version_path.write_text(json.dumps(meta, ensure_ascii=False, indent=2) + "\n", encoding="utf-8")

    def _ensure_index(self) -> None:
        self._tantivy = _load_tantivy(self._venv_dir, self._auto_install)
        if self._schema and self._index:
            return
        schema_builder = self._tantivy.SchemaBuilder()
        self._fields = {
            "doc_id": schema_builder.add_text_field("doc_id", stored=True),
            "path": schema_builder.add_text_field("path", stored=True),
            "repo": schema_builder.add_text_field("repo", stored=True),
            "root_id": schema_builder.add_text_field("root_id", stored=True),
            "rel_path": schema_builder.add_text_field("rel_path", stored=True),
            "path_text": schema_builder.add_text_field("path_text"),
            "body_text": schema_builder.add_text_field("body_text"),
            "preview": schema_builder.add_text_field("preview", stored=True),
            "mtime": schema_builder.add_i64_field("mtime", stored=True),
            "size": schema_builder.add_i64_field("size", stored=True),
        }
        self._schema = schema_builder.build()
        if self._index_dir.exists() and (self._index_dir / "meta.json").exists():
            self._index = self._tantivy.Index(self._index_dir.as_posix())
        else:
            self._index_dir.mkdir(parents=True, exist_ok=True)
            self._index = self._tantivy.Index(self._schema, self._index_dir.as_posix())

    def status(self) -> EngineMeta:
        mode = "embedded"
        mem_mb, index_mem_mb, threads = self._engine_limits()
        try:
            if not self._tantivy:
                self._tantivy = _load_tantivy(self._venv_dir, auto_install=False)
        except EngineError:
            return EngineMeta(
                engine_mode=mode,
                engine_ready=False,
                engine_version="unknown",
                index_version="",
                reason="NOT_INSTALLED",
                hint="sari --cmd engine install",
                engine_mem_mb=mem_mb,
                index_mem_mb=index_mem_mb,
                engine_threads=threads,
            )
        index_meta = self._load_index_version()
        engine_version = index_meta.get("engine_version", "")
        cfg_hash = index_meta.get("config_hash", "")
        ready = bool(index_meta) and cfg_hash == self._config_hash() and bool(engine_version)
        reason = ""
        hint = ""
        if not index_meta:
            ready = False
            reason = "INDEX_MISSING"
            hint = "sari --cmd engine rebuild"
        elif cfg_hash != self._config_hash():
            ready = False
            reason = "CONFIG_MISMATCH"
            hint = "sari --cmd engine rebuild"
        if not engine_version:
            ready = False
            reason = "ENGINE_MISMATCH"
            hint = "sari --cmd engine rebuild"
        idx_size = 0
        if self._index_dir.exists():
            try:
                idx_size = sum(p.stat().st_size for p in self._index_dir.rglob("*") if p.is_file())
            except Exception:
                idx_size = 0
        return EngineMeta(
            engine_mode=mode,
            engine_ready=ready,
            engine_version=engine_version or "unknown",
            index_version=cfg_hash or "",
            reason=reason,
            hint=hint,
            doc_count=int(index_meta.get("doc_count", 0) or 0),
            index_size_bytes=idx_size,
            last_build_ts=int(index_meta.get("build_ts", 0) or 0),
            engine_mem_mb=mem_mb,
            index_mem_mb=index_mem_mb,
            engine_threads=threads,
        )

    def install(self) -> None:
        _load_tantivy(self._venv_dir, auto_install=True)
        self._ensure_index()

    def rebuild(self) -> None:
        # Build into a sibling "<name>.build" directory, then swap it
        # into place so a partial build never clobbers the live index.
        self._ensure_index()
        tmp_dir = self._index_dir.parent / f"{self._index_dir.name}.build"
        if tmp_dir.exists():
            for p in tmp_dir.rglob("*"):
                if p.is_file():
                    try:
                        p.unlink()
                    except Exception:
                        pass
        if tmp_dir.exists():
            try:
                tmp_dir.rmdir()
            except Exception:
                pass
        tmp_dir.mkdir(parents=True, exist_ok=True)
        idx = self._tantivy.Index(self._schema, tmp_dir.as_posix())
        writer = self._index_writer(idx)
        count = 0
        for doc in self._db.iter_engine_documents(self._root_ids):
            writer.add_document(self._tantivy.Document(**doc))
            count += 1
        writer.commit()
        idx.reload()
        if self._index_dir.exists():
            for p in self._index_dir.rglob("*"):
                if p.is_file():
                    try:
                        p.unlink()
                    except Exception:
                        pass
        if self._index_dir.exists():
            try:
                self._index_dir.rmdir()
            except Exception:
                pass
        tmp_dir.replace(self._index_dir)
        self._index = idx
        self._write_index_version(count)

    def upsert_documents(self, docs: Iterable[Dict[str, Any]]) -> None:
        self._ensure_index()
        writer = self._index_writer(self._index)
        count = 0
        for doc in docs:
            doc_id = doc.get("doc_id")
            if doc_id:
                # Delete any existing document with the same doc_id first.
                term = self._tantivy.Term.from_field_text(self._fields["doc_id"], doc_id)
                writer.delete_term(term)
            writer.add_document(self._tantivy.Document(**doc))
            count += 1
        writer.commit()
        if count:
            self._write_index_version(self._load_index_version().get("doc_count", 0) + count)

    def delete_documents(self, doc_ids: Iterable[str]) -> None:
        self._ensure_index()
        writer = self._index_writer(self._index)
        deleted = 0
        for doc_id in doc_ids:
            term = self._tantivy.Term.from_field_text(self._fields["doc_id"], doc_id)
            writer.delete_term(term)
            deleted += 1
        if deleted:
            writer.commit()
            meta = self._load_index_version()
            doc_count = int(meta.get("doc_count", 0) or 0)
            doc_count = max(0, doc_count - deleted)
            self._write_index_version(doc_count)

    def search_v2(self, opts: SearchOptions) -> Tuple[List[SearchHit], Dict[str, Any]]:
        self._ensure_index()
        meta = {"total_mode": "approx", "total": -1}
        norm_q = _normalize_text(opts.query or "")
        if not norm_q:
            return [], meta
        tokens, phrases = _query_parts(norm_q)
        pieces = []
        for p in phrases:
            pieces.append(f"\"{p}\"")
        for t in tokens:
            pieces.append(t)
        qstr = " AND ".join(pieces) if pieces else ""
        if not qstr:
            return [], meta
        qp = self._tantivy.QueryParser.for_index(self._index, [self._fields["body_text"], self._fields["path_text"]])
        try:
            qp.set_conjunction_by_default()
        except Exception:
            pass
        query = qp.parse_query(qstr)
        searcher = self._index.searcher()
        limit = max(1, min(int(opts.limit), 50))
        top_docs = searcher.search(query, self._tantivy.TopDocs(limit=limit + int(opts.offset)))
        hits: List[SearchHit] = []
        for score, doc_address in top_docs:
            doc = searcher.doc(doc_address)
            path = doc.get_first(self._fields["path"])
            repo = doc.get_first(self._fields["repo"]) or "__root__"
            mtime = int(doc.get_first(self._fields["mtime"]) or 0)
            size = int(doc.get_first(self._fields["size"]) or 0)
            preview = doc.get_first(self._fields["preview"]) or ""
            path_str = str(path) if path else ""
            # Post-filter engine hits by root, repo, file type, and path patterns.
            if opts.root_ids:
                rid = doc.get_first(self._fields["root_id"]) or ""
                if rid not in opts.root_ids:
                    continue
            if opts.repo and repo != opts.repo:
                continue
            if opts.file_types and get_file_extension(path_str) not in [ft.lower().lstrip(".") for ft in opts.file_types]:
                continue
            if opts.path_pattern and not _path_pattern_match(path_str, opts.path_pattern):
                continue
            if opts.exclude_patterns and _exclude_pattern_match(path_str, opts.exclude_patterns):
                continue
            snippet = snippet_around(preview, tokens, opts.snippet_lines, highlight=True) if preview else ""
            hits.append(SearchHit(
                repo=repo,
                path=path_str,
                score=float(score),
                snippet=snippet,
                mtime=mtime,
                size=size,
                match_count=0,
                file_type=get_file_extension(path_str),
                hit_reason="Engine match",
            ))
        hits.sort(key=lambda h: (-h.score, -h.mtime, h.path))
        start = int(opts.offset)
        end = start + limit
        return hits[start:end], meta

    def repo_candidates(self, q: str, limit: int = 3, root_ids: Optional[List[str]] = None) -> List[Dict[str, Any]]:
        # SQLite fallback: count LIKE matches per repo ('^' escapes the
        # LIKE wildcards in the user's query).
        q = (q or "").strip()
        if not q:
            return []
        sql = "SELECT repo, COUNT(1) AS c FROM files WHERE content LIKE ? ESCAPE '^' GROUP BY repo ORDER BY c DESC LIMIT ?;"
        like_q = q.replace("^", "^^").replace("%", "^%").replace("_", "^_")
        with self._db._read_lock:
            rows = self._db._read.execute(sql, (f"%{like_q}%", limit)).fetchall()
        out = []
        for r in rows:
            repo, c = str(r["repo"]), int(r["c"])
            out.append({"repo": repo, "score": c, "evidence": ""})
        return out


def _path_pattern_match(path: str, pattern: str) -> bool:
    import fnmatch
    p = path.replace("\\", "/")
    pat = pattern.replace("\\", "/")
    if pat.startswith("/"):
        if p.startswith(pat):
            return True
    if p.endswith("/" + pat) or p == pat:
        return True
    return fnmatch.fnmatch(p, pat) or fnmatch.fnmatch(p, f"*/{pat}") or fnmatch.fnmatch(p, f"*/{pat}/*")


def _exclude_pattern_match(path: str, patterns: List[str]) -> bool:
    import fnmatch
    for p in patterns:
        if p in path or fnmatch.fnmatch(path, f"*{p}*"):
            return True
    return False
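
Editor's note: the CLI hints embedded in the errors above ("sari --cmd engine install" / "sari --cmd engine rebuild") suggest the intended lifecycle. A sketch of driving it programmatically (not from the package; db and cfg stand in for the app's database handle and config object):

# Sketch only: install -> rebuild -> status lifecycle for the embedded engine.
from app.engine_runtime import EmbeddedEngine, EngineError

engine = EmbeddedEngine(db, cfg, roots=["/path/to/workspace"])  # placeholders
meta = engine.status()
if not meta.engine_ready:
    try:
        if meta.reason == "NOT_INSTALLED":
            engine.install()   # creates the private venv, pip-installs tantivy
        engine.rebuild()       # builds into "<index>.build", then swaps it in
    except EngineError as exc:
        print(exc.code, exc.message, exc.hint)
meta = engine.status()
print(meta.engine_ready, meta.doc_count, meta.index_size_bytes)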