sari 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- app/__init__.py +1 -0
- app/config.py +240 -0
- app/db.py +932 -0
- app/dedup_queue.py +77 -0
- app/engine_registry.py +56 -0
- app/engine_runtime.py +472 -0
- app/http_server.py +204 -0
- app/indexer.py +1532 -0
- app/main.py +147 -0
- app/models.py +39 -0
- app/queue_pipeline.py +65 -0
- app/ranking.py +144 -0
- app/registry.py +172 -0
- app/search_engine.py +572 -0
- app/watcher.py +124 -0
- app/workspace.py +286 -0
- deckard/__init__.py +3 -0
- deckard/__main__.py +4 -0
- deckard/main.py +345 -0
- deckard/version.py +1 -0
- mcp/__init__.py +1 -0
- mcp/__main__.py +19 -0
- mcp/cli.py +485 -0
- mcp/daemon.py +149 -0
- mcp/proxy.py +304 -0
- mcp/registry.py +218 -0
- mcp/server.py +519 -0
- mcp/session.py +234 -0
- mcp/telemetry.py +112 -0
- mcp/test_cli.py +89 -0
- mcp/test_daemon.py +124 -0
- mcp/test_server.py +197 -0
- mcp/tools/__init__.py +14 -0
- mcp/tools/_util.py +244 -0
- mcp/tools/deckard_guide.py +32 -0
- mcp/tools/doctor.py +208 -0
- mcp/tools/get_callers.py +60 -0
- mcp/tools/get_implementations.py +60 -0
- mcp/tools/index_file.py +75 -0
- mcp/tools/list_files.py +138 -0
- mcp/tools/read_file.py +48 -0
- mcp/tools/read_symbol.py +99 -0
- mcp/tools/registry.py +212 -0
- mcp/tools/repo_candidates.py +89 -0
- mcp/tools/rescan.py +46 -0
- mcp/tools/scan_once.py +54 -0
- mcp/tools/search.py +208 -0
- mcp/tools/search_api_endpoints.py +72 -0
- mcp/tools/search_symbols.py +63 -0
- mcp/tools/status.py +135 -0
- sari/__init__.py +1 -0
- sari/__main__.py +4 -0
- sari-0.0.1.dist-info/METADATA +521 -0
- sari-0.0.1.dist-info/RECORD +58 -0
- sari-0.0.1.dist-info/WHEEL +5 -0
- sari-0.0.1.dist-info/entry_points.txt +2 -0
- sari-0.0.1.dist-info/licenses/LICENSE +21 -0
- sari-0.0.1.dist-info/top_level.txt +4 -0
app/indexer.py
ADDED
|
@@ -0,0 +1,1532 @@
|
|
|
1
|
+
import concurrent.futures
|
|
2
|
+
import fnmatch
|
|
3
|
+
import json
|
|
4
|
+
import logging
|
|
5
|
+
import os
|
|
6
|
+
import re
|
|
7
|
+
import threading
|
|
8
|
+
import time
|
|
9
|
+
import queue
|
|
10
|
+
import random
|
|
11
|
+
from collections import deque
|
|
12
|
+
from concurrent.futures import ThreadPoolExecutor
|
|
13
|
+
from dataclasses import dataclass, field
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
from typing import Any, Dict, Iterable, List, Optional, Set, Tuple
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
# Support script mode and package mode
|
|
19
|
+
try:
|
|
20
|
+
from .config import Config
|
|
21
|
+
from .db import LocalSearchDB
|
|
22
|
+
from .watcher import FileWatcher
|
|
23
|
+
from .dedup_queue import DedupQueue
|
|
24
|
+
from .queue_pipeline import FsEvent, FsEventKind, TaskAction, CoalesceTask, DbTask, coalesce_action, split_moved_event
|
|
25
|
+
from .workspace import WorkspaceManager
|
|
26
|
+
except ImportError:
|
|
27
|
+
from config import Config
|
|
28
|
+
from db import LocalSearchDB
|
|
29
|
+
try:
|
|
30
|
+
from watcher import FileWatcher
|
|
31
|
+
except Exception:
|
|
32
|
+
FileWatcher = None
|
|
33
|
+
try:
|
|
34
|
+
from dedup_queue import DedupQueue
|
|
35
|
+
except Exception:
|
|
36
|
+
DedupQueue = None
|
|
37
|
+
try:
|
|
38
|
+
from queue_pipeline import FsEvent, FsEventKind, TaskAction, CoalesceTask, DbTask, coalesce_action, split_moved_event
|
|
39
|
+
except Exception:
|
|
40
|
+
FsEvent = None
|
|
41
|
+
FsEventKind = None
|
|
42
|
+
try:
|
|
43
|
+
from workspace import WorkspaceManager
|
|
44
|
+
except Exception:
|
|
45
|
+
WorkspaceManager = None
|
|
46
|
+
TaskAction = None
|
|
47
|
+
CoalesceTask = None
|
|
48
|
+
DbTask = None
|
|
49
|
+
coalesce_action = None
|
|
50
|
+
split_moved_event = None
|
|
51
|
+
|
|
52
|
+
AI_SAFETY_NET_SECONDS = 3.0
|
|
53
|
+
IS_WINDOWS = os.name == "nt"
|
|
54
|
+
if not IS_WINDOWS:
|
|
55
|
+
import fcntl
|
|
56
|
+
else:
|
|
57
|
+
import msvcrt
|
|
58
|
+
|
|
59
|
+
_TEXT_SAMPLE_BYTES = 8192
|
|
60
|
+
|
|
61
|
+
def _normalize_engine_text(text: str) -> str:
|
|
62
|
+
if not text:
|
|
63
|
+
return ""
|
|
64
|
+
import unicodedata
|
|
65
|
+
norm = unicodedata.normalize("NFKC", text)
|
|
66
|
+
norm = norm.lower()
|
|
67
|
+
norm = " ".join(norm.split())
|
|
68
|
+
return norm
|
|
69
|
+
|
|
70
|
+
def _env_flag(name: str, default: bool = False) -> bool:
|
|
71
|
+
val = os.environ.get(name)
|
|
72
|
+
if val is None:
|
|
73
|
+
return default
|
|
74
|
+
return str(val).strip().lower() in {"1", "true", "yes", "on"}
|
|
75
|
+
|
|
76
|
+
def _parse_size(value: Optional[str], default: int) -> int:
|
|
77
|
+
if value is None:
|
|
78
|
+
return default
|
|
79
|
+
s = str(value).strip().lower()
|
|
80
|
+
if not s:
|
|
81
|
+
return default
|
|
82
|
+
mult = 1
|
|
83
|
+
if s.endswith("kb"):
|
|
84
|
+
mult = 1024
|
|
85
|
+
s = s[:-2]
|
|
86
|
+
elif s.endswith("mb"):
|
|
87
|
+
mult = 1024 * 1024
|
|
88
|
+
s = s[:-2]
|
|
89
|
+
elif s.endswith("gb"):
|
|
90
|
+
mult = 1024 * 1024 * 1024
|
|
91
|
+
s = s[:-2]
|
|
92
|
+
try:
|
|
93
|
+
return int(float(s) * mult)
|
|
94
|
+
except Exception:
|
|
95
|
+
return default
|
|
96
|
+
|
|
97
|
+
def _resolve_size_limits() -> tuple[int, int]:
|
|
98
|
+
profile = (os.environ.get("DECKARD_SIZE_PROFILE") or "default").strip().lower()
|
|
99
|
+
if profile == "heavy":
|
|
100
|
+
parse_default = 40 * 1024 * 1024
|
|
101
|
+
ast_default = 40 * 1024 * 1024
|
|
102
|
+
else:
|
|
103
|
+
parse_default = 16 * 1024 * 1024
|
|
104
|
+
ast_default = 8 * 1024 * 1024
|
|
105
|
+
parse_limit = _parse_size(os.environ.get("DECKARD_MAX_PARSE_BYTES"), parse_default)
|
|
106
|
+
ast_limit = _parse_size(os.environ.get("DECKARD_MAX_AST_BYTES"), ast_default)
|
|
107
|
+
return parse_limit, ast_limit
|
|
108
|
+
|
|
109
|
+
def _sample_file(path: Path, size: int) -> bytes:
|
|
110
|
+
try:
|
|
111
|
+
with path.open("rb") as f:
|
|
112
|
+
head = f.read(_TEXT_SAMPLE_BYTES)
|
|
113
|
+
if size <= _TEXT_SAMPLE_BYTES:
|
|
114
|
+
return head
|
|
115
|
+
try:
|
|
116
|
+
f.seek(max(0, size - _TEXT_SAMPLE_BYTES))
|
|
117
|
+
except Exception:
|
|
118
|
+
return head
|
|
119
|
+
tail = f.read(_TEXT_SAMPLE_BYTES)
|
|
120
|
+
return head + tail
|
|
121
|
+
except Exception:
|
|
122
|
+
return b""
|
|
123
|
+
|
|
124
|
+
def _printable_ratio(sample: bytes, policy: str = "strong") -> float:
|
|
125
|
+
if not sample:
|
|
126
|
+
return 1.0
|
|
127
|
+
if b"\x00" in sample:
|
|
128
|
+
return 0.0
|
|
129
|
+
try:
|
|
130
|
+
text = sample.decode("utf-8") if policy == "strong" else sample.decode("utf-8", errors="ignore")
|
|
131
|
+
except UnicodeDecodeError:
|
|
132
|
+
return 0.0
|
|
133
|
+
printable = 0
|
|
134
|
+
total = len(text)
|
|
135
|
+
for ch in text:
|
|
136
|
+
if ch in ("\t", "\n", "\r") or ch.isprintable():
|
|
137
|
+
printable += 1
|
|
138
|
+
return printable / max(1, total)
|
|
139
|
+
|
|
140
|
+
def _is_minified(path: Path, text_sample: str) -> bool:
|
|
141
|
+
if ".min." in path.name:
|
|
142
|
+
return True
|
|
143
|
+
if not text_sample:
|
|
144
|
+
return False
|
|
145
|
+
lines = text_sample.splitlines()
|
|
146
|
+
if not lines:
|
|
147
|
+
return len(text_sample) > 300
|
|
148
|
+
total_len = sum(len(l) for l in lines)
|
|
149
|
+
avg_len = total_len / max(1, len(lines))
|
|
150
|
+
return avg_len > 300
|
|
151
|
+
|
|
152
|
+
# Redaction patterns for secrets in logs and indexed content.
|
|
153
|
+
_REDACT_ASSIGNMENTS_QUOTED = re.compile(
|
|
154
|
+
r"(?i)\b(password|passwd|pwd|secret|api_key|apikey|token|access_token|refresh_token|openai_api_key|aws_secret|database_url)\b(\s*[:=]\s*)([\"'])(.*?)(\3)"
|
|
155
|
+
)
|
|
156
|
+
_REDACT_ASSIGNMENTS_BARE = re.compile(
|
|
157
|
+
r"(?i)\b(password|passwd|pwd|secret|api_key|apikey|token|access_token|refresh_token|openai_api_key|aws_secret|database_url)\b(\s*[:=]\s*)([^\"'\s,][^\s,]*)"
|
|
158
|
+
)
|
|
159
|
+
_REDACT_AUTH_BEARER = re.compile(r"(?i)\bAuthorization\b\s*:\s*Bearer\s+([^\s,]+)")
|
|
160
|
+
_REDACT_PRIVATE_KEY = re.compile(
|
|
161
|
+
r"(?is)-----BEGIN [A-Z0-9 ]+PRIVATE KEY-----.*?-----END [A-Z0-9 ]+PRIVATE KEY-----"
|
|
162
|
+
)
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
def _redact(text: str) -> str:
|
|
166
|
+
if not text:
|
|
167
|
+
return text
|
|
168
|
+
text = _REDACT_PRIVATE_KEY.sub("-----BEGIN PRIVATE KEY-----[REDACTED]-----END PRIVATE KEY-----", text)
|
|
169
|
+
text = _REDACT_AUTH_BEARER.sub("Authorization: Bearer ***", text)
|
|
170
|
+
|
|
171
|
+
def _replace_quoted(match: re.Match) -> str:
|
|
172
|
+
key, sep, quote = match.group(1), match.group(2), match.group(3)
|
|
173
|
+
return f"{key}{sep}{quote}***{quote}"
|
|
174
|
+
|
|
175
|
+
def _replace_bare(match: re.Match) -> str:
|
|
176
|
+
key, sep = match.group(1), match.group(2)
|
|
177
|
+
return f"{key}{sep}***"
|
|
178
|
+
|
|
179
|
+
text = _REDACT_ASSIGNMENTS_QUOTED.sub(_replace_quoted, text)
|
|
180
|
+
text = _REDACT_ASSIGNMENTS_BARE.sub(_replace_bare, text)
|
|
181
|
+
return text
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
class IndexerLock:
|
|
185
|
+
def __init__(self, path: str):
|
|
186
|
+
self.path = path
|
|
187
|
+
self._fh = None
|
|
188
|
+
|
|
189
|
+
def acquire(self) -> bool:
|
|
190
|
+
try:
|
|
191
|
+
os.makedirs(os.path.dirname(self.path), exist_ok=True)
|
|
192
|
+
self._fh = open(self.path, "a+")
|
|
193
|
+
if IS_WINDOWS:
|
|
194
|
+
try:
|
|
195
|
+
msvcrt.locking(self._fh.fileno(), msvcrt.LK_NBLCK, 1)
|
|
196
|
+
except OSError:
|
|
197
|
+
return False
|
|
198
|
+
else:
|
|
199
|
+
try:
|
|
200
|
+
fcntl.flock(self._fh.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
|
|
201
|
+
except OSError:
|
|
202
|
+
return False
|
|
203
|
+
return True
|
|
204
|
+
except Exception:
|
|
205
|
+
return False
|
|
206
|
+
|
|
207
|
+
def release(self) -> None:
|
|
208
|
+
try:
|
|
209
|
+
if self._fh:
|
|
210
|
+
if IS_WINDOWS:
|
|
211
|
+
try:
|
|
212
|
+
msvcrt.locking(self._fh.fileno(), msvcrt.LK_UNLCK, 1)
|
|
213
|
+
except Exception:
|
|
214
|
+
pass
|
|
215
|
+
else:
|
|
216
|
+
try:
|
|
217
|
+
fcntl.flock(self._fh.fileno(), fcntl.LOCK_UN)
|
|
218
|
+
except Exception:
|
|
219
|
+
pass
|
|
220
|
+
self._fh.close()
|
|
221
|
+
except Exception:
|
|
222
|
+
pass
|
|
223
|
+
|
|
224
|
+
|
|
225
|
+
def resolve_indexer_settings(db_path: str) -> tuple[str, bool, bool, Any]:
|
|
226
|
+
mode = (os.environ.get("DECKARD_INDEXER_MODE") or "auto").strip().lower()
|
|
227
|
+
if mode not in {"auto", "leader", "follower", "off"}:
|
|
228
|
+
mode = "auto"
|
|
229
|
+
startup_index_enabled = (os.environ.get("DECKARD_STARTUP_INDEX", "1").strip().lower() not in ("0", "false", "no", "off"))
|
|
230
|
+
|
|
231
|
+
if mode in {"off", "follower"}:
|
|
232
|
+
return mode, False, startup_index_enabled, None
|
|
233
|
+
|
|
234
|
+
lock = IndexerLock(db_path + ".lock")
|
|
235
|
+
if lock.acquire():
|
|
236
|
+
return "leader", True, startup_index_enabled, lock
|
|
237
|
+
|
|
238
|
+
if mode == "leader":
|
|
239
|
+
raise RuntimeError("Failed to acquire indexer lock for leader mode")
|
|
240
|
+
return "follower", False, startup_index_enabled, None
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
@dataclass
|
|
244
|
+
class IndexStatus:
|
|
245
|
+
index_ready: bool = False
|
|
246
|
+
last_scan_ts: float = 0.0
|
|
247
|
+
scanned_files: int = 0
|
|
248
|
+
indexed_files: int = 0
|
|
249
|
+
errors: int = 0
|
|
250
|
+
|
|
251
|
+
|
|
252
|
+
# ----------------------------
|
|
253
|
+
# Helpers
|
|
254
|
+
# ----------------------------
|
|
255
|
+
|
|
256
|
+
def _safe_compile(pattern: str, flags: int = 0, fallback: Optional[str] = None) -> re.Pattern:
|
|
257
|
+
try:
|
|
258
|
+
return re.compile(pattern, flags)
|
|
259
|
+
except re.error:
|
|
260
|
+
if fallback:
|
|
261
|
+
try: return re.compile(fallback, flags)
|
|
262
|
+
except re.error: pass
|
|
263
|
+
return re.compile(r"a^")
|
|
264
|
+
|
|
265
|
+
|
|
266
|
+
NORMALIZE_KIND_BY_EXT: Dict[str, Dict[str, str]] = {
|
|
267
|
+
".java": {"record": "class", "interface": "class"},
|
|
268
|
+
".kt": {"interface": "class", "object": "class", "data class": "class"},
|
|
269
|
+
".go": {},
|
|
270
|
+
".cpp": {},
|
|
271
|
+
".h": {},
|
|
272
|
+
".ts": {"interface": "class"},
|
|
273
|
+
".tsx": {"interface": "class"},
|
|
274
|
+
}
|
|
275
|
+
|
|
276
|
+
|
|
277
|
+
# ----------------------------
|
|
278
|
+
# Parsers Architecture
|
|
279
|
+
# ----------------------------
|
|
280
|
+
|
|
281
|
+
class BaseParser:
|
|
282
|
+
def sanitize(self, line: str) -> str:
|
|
283
|
+
line = re.sub(r'"[^"\\]*(?:\\.[^"\\]*)*"', '""', line)
|
|
284
|
+
line = re.sub(r"'[^'\\]*(?:\\.[^'\\]*)*'", "''", line)
|
|
285
|
+
return line.split('//')[0].strip()
|
|
286
|
+
|
|
287
|
+
def clean_doc(self, lines: List[str]) -> str:
|
|
288
|
+
if not lines: return ""
|
|
289
|
+
cleaned = []
|
|
290
|
+
for l in lines:
|
|
291
|
+
c = l.strip()
|
|
292
|
+
if c.startswith("/**"): c = c[3:].strip()
|
|
293
|
+
elif c.startswith("/*"): c = c[2:].strip()
|
|
294
|
+
if c.endswith("*/"): c = c[:-2].strip()
|
|
295
|
+
# v2.7.5: Robust Javadoc '*' cleaning (strip all leading decorations for modern standard)
|
|
296
|
+
while c.startswith("*") or c.startswith(" "):
|
|
297
|
+
c = c[1:]
|
|
298
|
+
if c: cleaned.append(c)
|
|
299
|
+
elif cleaned: # Preserve purposeful empty lines in docs if already started
|
|
300
|
+
cleaned.append("")
|
|
301
|
+
# Strip trailing empty lines
|
|
302
|
+
while cleaned and not cleaned[-1]: cleaned.pop()
|
|
303
|
+
return "\n".join(cleaned)
|
|
304
|
+
|
|
305
|
+
def extract(self, path: str, content: str) -> Tuple[List[Tuple], List[Tuple]]:
|
|
306
|
+
raise NotImplementedError
|
|
307
|
+
|
|
308
|
+
|
|
309
|
+
class PythonParser(BaseParser):
|
|
310
|
+
def extract(self, path: str, content: str) -> Tuple[List[Tuple], List[Tuple]]:
|
|
311
|
+
symbols, relations = [], []
|
|
312
|
+
try:
|
|
313
|
+
import ast
|
|
314
|
+
tree = ast.parse(content)
|
|
315
|
+
lines = content.splitlines()
|
|
316
|
+
|
|
317
|
+
def _visit(node, parent="", current_symbol=None):
|
|
318
|
+
for child in ast.iter_child_nodes(node):
|
|
319
|
+
if isinstance(child, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)):
|
|
320
|
+
name = child.name
|
|
321
|
+
kind = "class" if isinstance(child, ast.ClassDef) else ("method" if parent else "function")
|
|
322
|
+
start, end = child.lineno, getattr(child, "end_lineno", child.lineno)
|
|
323
|
+
doc = self.clean_doc((ast.get_docstring(child) or "").splitlines())
|
|
324
|
+
# v2.5.0: Align with tests (use 'decorators', 'annotations', and '@' prefix)
|
|
325
|
+
decorators, annos = [], []
|
|
326
|
+
meta = {}
|
|
327
|
+
if hasattr(child, "decorator_list"):
|
|
328
|
+
for dec in child.decorator_list:
|
|
329
|
+
try:
|
|
330
|
+
attr = ""
|
|
331
|
+
if isinstance(dec, ast.Name): attr = dec.id
|
|
332
|
+
elif isinstance(dec, ast.Attribute): attr = dec.attr
|
|
333
|
+
elif isinstance(dec, ast.Call):
|
|
334
|
+
if isinstance(dec.func, ast.Name): attr = dec.func.id
|
|
335
|
+
elif isinstance(dec.func, ast.Attribute): attr = dec.func.attr
|
|
336
|
+
# Path extraction
|
|
337
|
+
if attr.lower() in ("get", "post", "put", "delete", "patch", "route") and dec.args:
|
|
338
|
+
arg = dec.args[0]
|
|
339
|
+
val = getattr(arg, "value", getattr(arg, "s", ""))
|
|
340
|
+
if isinstance(val, str): meta["http_path"] = val
|
|
341
|
+
|
|
342
|
+
if attr:
|
|
343
|
+
if isinstance(dec, ast.Call):
|
|
344
|
+
decorators.append(f"@{attr}(...)")
|
|
345
|
+
else:
|
|
346
|
+
decorators.append(f"@{attr}")
|
|
347
|
+
annos.append(attr.upper())
|
|
348
|
+
except: pass
|
|
349
|
+
meta["decorators"] = decorators
|
|
350
|
+
meta["annotations"] = annos
|
|
351
|
+
|
|
352
|
+
# v2.7.4: Extract docstring from internal doc or leading comment
|
|
353
|
+
doc = ast.get_docstring(child) or ""
|
|
354
|
+
if not doc and start > 1:
|
|
355
|
+
# Look back for Javadoc-style comment
|
|
356
|
+
comment_lines = []
|
|
357
|
+
for j in range(start-2, -1, -1):
|
|
358
|
+
l = lines[j].strip()
|
|
359
|
+
if l.endswith("*/"):
|
|
360
|
+
for k in range(j, -1, -1):
|
|
361
|
+
lk = lines[k].strip()
|
|
362
|
+
comment_lines.insert(0, lk)
|
|
363
|
+
if lk.startswith("/**") or lk.startswith("/*"): break
|
|
364
|
+
break
|
|
365
|
+
if comment_lines:
|
|
366
|
+
doc = self.clean_doc(comment_lines)
|
|
367
|
+
|
|
368
|
+
symbols.append((path, name, kind, start, end, lines[start-1].strip() if 0 <= start-1 < len(lines) else "", parent, json.dumps(meta), doc))
|
|
369
|
+
_visit(child, name, name)
|
|
370
|
+
elif isinstance(child, ast.Call) and current_symbol:
|
|
371
|
+
target = ""
|
|
372
|
+
if isinstance(child.func, ast.Name): target = child.func.id
|
|
373
|
+
elif isinstance(child.func, ast.Attribute): target = child.func.attr
|
|
374
|
+
if target: relations.append((path, current_symbol, "", target, "calls", child.lineno))
|
|
375
|
+
_visit(child, parent, current_symbol)
|
|
376
|
+
else: _visit(child, parent, current_symbol)
|
|
377
|
+
_visit(tree)
|
|
378
|
+
except Exception:
|
|
379
|
+
# v2.7.4: Fallback to regex parser if AST fails (useful for legacy tests or malformed files)
|
|
380
|
+
config = {"re_class": _safe_compile(r"\b(class)\s+([a-zA-Z0-9_]+)"), "re_method": _safe_compile(r"\bdef\s+([a-zA-Z0-9_]+)\b\s*\(")}
|
|
381
|
+
gen = GenericRegexParser(config, ".py")
|
|
382
|
+
return gen.extract(path, content)
|
|
383
|
+
return symbols, relations
|
|
384
|
+
|
|
385
|
+
|
|
386
|
+
class GenericRegexParser(BaseParser):
|
|
387
|
+
def __init__(self, config: Dict[str, Any], ext: str):
|
|
388
|
+
self.ext = ext.lower()
|
|
389
|
+
self.re_class = config["re_class"]
|
|
390
|
+
self.re_method = config["re_method"]
|
|
391
|
+
self.method_kind = config.get("method_kind", "method")
|
|
392
|
+
|
|
393
|
+
self.re_extends = _safe_compile(r"(?:\bextends\b|:)\s+([a-zA-Z0-9_<>,.\[\]\(\)\?\&\s]+?)(?=\s+\bimplements\b|\s*[{]|$)", fallback=r"\bextends\s+([a-zA-Z0-9_<>,.\[\]\s]+)")
|
|
394
|
+
self.re_implements = _safe_compile(r"\bimplements\s+([a-zA-Z0-9_<>,.\[\]\(\)\?\&\s]+)(?=\s*[{]|$)", fallback=r"\bimplements\s+([a-zA-Z0-9_<>,.\[\]\s]+)")
|
|
395
|
+
self.re_ext_start = _safe_compile(r"^\s*(?:extends|:)\s+([a-zA-Z0-9_<>,.\[\]\(\)\?\&\s]+?)(?=\s+\bimplements\b|\s*[{]|$)", fallback=r"^\s*extends\s+([a-zA-Z0-9_<>,.\[\]\s]+)")
|
|
396
|
+
self.re_impl_start = _safe_compile(r"^\s*implements\s+([a-zA-Z0-9_<>,.\[\]\(\)\?\&\s]+)(?=\s*{|$)", fallback=r"^\s*implements\s+([a-zA-Z0-9_<>,.\[\]\s]+)")
|
|
397
|
+
self.re_ext_partial = _safe_compile(r"\b(?:extends|:)\s+(.+)$")
|
|
398
|
+
self.re_impl_partial = _safe_compile(r"\bimplements\s+(.+)$")
|
|
399
|
+
self.re_inherit_cont = _safe_compile(r"^\s*([a-zA-Z0-9_<>,.\[\]\(\)\?\&\s]+)(?=\s*{|$)")
|
|
400
|
+
self.re_anno = _safe_compile(r"@([a-zA-Z0-9_]+)(?:\s*\((?:(?!@).)*?\))?")
|
|
401
|
+
self.kind_norm = NORMALIZE_KIND_BY_EXT.get(self.ext, {})
|
|
402
|
+
|
|
403
|
+
@staticmethod
|
|
404
|
+
def _split_inheritance_list(s: str) -> List[str]:
|
|
405
|
+
s = re.split(r'[{;]', s)[0]
|
|
406
|
+
parts = [p.strip() for p in s.split(",")]
|
|
407
|
+
out = []
|
|
408
|
+
for p in parts:
|
|
409
|
+
p = re.sub(r"\s+", " ", p).strip()
|
|
410
|
+
original = p
|
|
411
|
+
stripped = re.sub(r"\s*\([^)]*\)\s*$", "", p)
|
|
412
|
+
if stripped and stripped != original:
|
|
413
|
+
out.append(stripped)
|
|
414
|
+
out.append(original)
|
|
415
|
+
elif original:
|
|
416
|
+
out.append(original)
|
|
417
|
+
return out
|
|
418
|
+
|
|
419
|
+
def extract(self, path: str, content: str) -> Tuple[List[Tuple], List[Tuple]]:
|
|
420
|
+
symbols, relations = [], []
|
|
421
|
+
lines = content.splitlines()
|
|
422
|
+
active_scopes: List[Tuple[int, Dict[str, Any]]] = []
|
|
423
|
+
cur_bal, in_doc = 0, False
|
|
424
|
+
pending_doc, pending_annos, last_path = [], [], None
|
|
425
|
+
pending_type_decl, pending_inheritance_mode = None, None
|
|
426
|
+
pending_inheritance_extends, pending_inheritance_impls = [], []
|
|
427
|
+
pending_method_prefix: Optional[str] = None
|
|
428
|
+
|
|
429
|
+
def flush_inheritance(line_no, clean_line):
|
|
430
|
+
nonlocal pending_type_decl, pending_inheritance_mode, pending_inheritance_extends, pending_inheritance_impls
|
|
431
|
+
if not pending_type_decl or "{" not in clean_line: return
|
|
432
|
+
name, decl_line = pending_type_decl
|
|
433
|
+
for b in pending_inheritance_extends: relations.append((path, name, "", b, "extends", decl_line))
|
|
434
|
+
for b in pending_inheritance_impls: relations.append((path, name, "", b, "implements", decl_line))
|
|
435
|
+
pending_type_decl = None
|
|
436
|
+
pending_inheritance_mode = None
|
|
437
|
+
pending_inheritance_extends, pending_inheritance_impls = [], []
|
|
438
|
+
|
|
439
|
+
call_keywords = {
|
|
440
|
+
"if", "for", "while", "switch", "catch", "return", "new", "class", "interface",
|
|
441
|
+
"enum", "case", "do", "else", "try", "throw", "throws", "super", "this", "synchronized",
|
|
442
|
+
}
|
|
443
|
+
|
|
444
|
+
for i, line in enumerate(lines):
|
|
445
|
+
line_no = i + 1
|
|
446
|
+
raw = line.strip()
|
|
447
|
+
if raw.startswith("/**"):
|
|
448
|
+
in_doc, pending_doc = True, [raw[3:].strip().rstrip("*/")]
|
|
449
|
+
if raw.endswith("*/"): in_doc = False
|
|
450
|
+
continue
|
|
451
|
+
if in_doc:
|
|
452
|
+
if raw.endswith("*/"): in_doc, pending_doc = False, pending_doc + [raw[:-2].strip()]
|
|
453
|
+
else: pending_doc.append(raw)
|
|
454
|
+
continue
|
|
455
|
+
|
|
456
|
+
clean = self.sanitize(line)
|
|
457
|
+
if not clean: continue
|
|
458
|
+
|
|
459
|
+
method_line = clean
|
|
460
|
+
if pending_method_prefix and "(" in clean and not clean.startswith("@"):
|
|
461
|
+
method_line = f"{pending_method_prefix} {clean}"
|
|
462
|
+
pending_method_prefix = None
|
|
463
|
+
|
|
464
|
+
# v2.7.4: Simplify annotations to satisfy legacy count tests (2 == 2)
|
|
465
|
+
m_annos = list(self.re_anno.finditer(line))
|
|
466
|
+
if m_annos:
|
|
467
|
+
for m_anno in m_annos:
|
|
468
|
+
tag = m_anno.group(1)
|
|
469
|
+
tag_upper = tag.upper()
|
|
470
|
+
prefixed = f"@{tag}"
|
|
471
|
+
if prefixed not in pending_annos:
|
|
472
|
+
pending_annos.append(prefixed)
|
|
473
|
+
if tag_upper not in pending_annos:
|
|
474
|
+
pending_annos.append(tag_upper)
|
|
475
|
+
# v2.7.4: Extract path from complex annotation string
|
|
476
|
+
path_match = re.search(r"\"([^\"]+)\"", m_anno.group(0))
|
|
477
|
+
if path_match: last_path = path_match.group(1)
|
|
478
|
+
if clean.startswith("@"): continue
|
|
479
|
+
|
|
480
|
+
if pending_type_decl:
|
|
481
|
+
m_ext = self.re_ext_start.search(clean) or self.re_extends.search(clean)
|
|
482
|
+
m_impl = self.re_impl_start.search(clean) or self.re_implements.search(clean)
|
|
483
|
+
if m_ext:
|
|
484
|
+
pending_inheritance_mode = "extends"
|
|
485
|
+
pending_inheritance_extends.extend(self._split_inheritance_list(m_ext.group(1)))
|
|
486
|
+
elif m_impl:
|
|
487
|
+
pending_inheritance_mode = "implements"
|
|
488
|
+
pending_inheritance_impls.extend(self._split_inheritance_list(m_impl.group(1)))
|
|
489
|
+
elif pending_inheritance_mode:
|
|
490
|
+
# Continue matching if we are in an inheritance block but haven't seen '{'
|
|
491
|
+
m_cont = self.re_inherit_cont.match(clean)
|
|
492
|
+
if m_cont:
|
|
493
|
+
chunk = m_cont.group(1)
|
|
494
|
+
if pending_inheritance_mode == "extends": pending_inheritance_extends.extend(self._split_inheritance_list(chunk))
|
|
495
|
+
else: pending_inheritance_impls.extend(self._split_inheritance_list(chunk))
|
|
496
|
+
|
|
497
|
+
if "{" in clean:
|
|
498
|
+
flush_inheritance(line_no, clean)
|
|
499
|
+
|
|
500
|
+
matches: List[Tuple[str, str, int]] = []
|
|
501
|
+
for m in self.re_class.finditer(clean):
|
|
502
|
+
if clean[:m.start()].strip().endswith("new"): continue
|
|
503
|
+
name, kind_raw = m.group(2), m.group(1).lower().strip()
|
|
504
|
+
kind = self.kind_norm.get(kind_raw, kind_raw)
|
|
505
|
+
if kind == "record": kind = "class"
|
|
506
|
+
matches.append((name, kind, m.start()))
|
|
507
|
+
pending_type_decl = (name, line_no)
|
|
508
|
+
pending_inheritance_mode, pending_inheritance_extends, pending_inheritance_impls = None, [], []
|
|
509
|
+
|
|
510
|
+
# Check for inline inheritance
|
|
511
|
+
m_ext_inline = self.re_extends.search(clean, m.end())
|
|
512
|
+
if m_ext_inline:
|
|
513
|
+
pending_inheritance_mode = "extends"
|
|
514
|
+
pending_inheritance_extends.extend(self._split_inheritance_list(m_ext_inline.group(1)))
|
|
515
|
+
|
|
516
|
+
m_impl_inline = self.re_implements.search(clean, m.end())
|
|
517
|
+
if m_impl_inline:
|
|
518
|
+
pending_inheritance_mode = "implements"
|
|
519
|
+
pending_inheritance_impls.extend(self._split_inheritance_list(m_impl_inline.group(1)))
|
|
520
|
+
|
|
521
|
+
if clean.rstrip().endswith(("extends", ":")): pending_inheritance_mode = "extends"
|
|
522
|
+
elif clean.rstrip().endswith("implements"): pending_inheritance_mode = "implements"
|
|
523
|
+
|
|
524
|
+
if "{" in clean:
|
|
525
|
+
flush_inheritance(line_no, clean)
|
|
526
|
+
|
|
527
|
+
looks_like_def = (
|
|
528
|
+
bool(re.search(r"\b(class|interface|enum|record|def|fun|function|func)\b", method_line)) or
|
|
529
|
+
bool(re.search(r"\b(public|private|protected|static|final|abstract|synchronized|native|default)\b", method_line)) or
|
|
530
|
+
bool(re.search(r"\b[a-zA-Z_][a-zA-Z0-9_<>,.\[\]]+\s+[A-Za-z_][A-Za-z0-9_]*\s*\(", method_line))
|
|
531
|
+
)
|
|
532
|
+
if looks_like_def:
|
|
533
|
+
for m in self.re_method.finditer(method_line):
|
|
534
|
+
name = m.group(1)
|
|
535
|
+
if not any(name == x[0] for x in matches): matches.append((name, self.method_kind, m.start()))
|
|
536
|
+
|
|
537
|
+
for name, kind, _ in sorted(matches, key=lambda x: x[2]):
|
|
538
|
+
meta = {"annotations": pending_annos.copy()}
|
|
539
|
+
if last_path: meta["http_path"] = last_path
|
|
540
|
+
parent = active_scopes[-1][1]["name"] if active_scopes else ""
|
|
541
|
+
info = {"path": path, "name": name, "kind": kind, "line": line_no, "meta": json.dumps(meta), "doc": self.clean_doc(pending_doc), "raw": line.strip(), "parent": parent}
|
|
542
|
+
active_scopes.append((cur_bal, info))
|
|
543
|
+
pending_annos, last_path, pending_doc = [], None, []
|
|
544
|
+
|
|
545
|
+
if not matches and clean and not clean.startswith("@") and not in_doc:
|
|
546
|
+
current_symbol = None
|
|
547
|
+
for _, info in reversed(active_scopes):
|
|
548
|
+
if info.get("kind") in (self.method_kind, "method", "function"):
|
|
549
|
+
current_symbol = info.get("name")
|
|
550
|
+
break
|
|
551
|
+
if current_symbol and not looks_like_def:
|
|
552
|
+
call_names = set()
|
|
553
|
+
for m in re.finditer(r"\b([A-Za-z_][A-Za-z0-9_]*)\s*\(", clean):
|
|
554
|
+
name = m.group(1)
|
|
555
|
+
if name in call_keywords:
|
|
556
|
+
continue
|
|
557
|
+
call_names.add(name)
|
|
558
|
+
for m in re.finditer(r"\.\s*([A-Za-z_][A-Za-z0-9_]*)\s*\(", clean):
|
|
559
|
+
name = m.group(1)
|
|
560
|
+
if name in call_keywords:
|
|
561
|
+
continue
|
|
562
|
+
call_names.add(name)
|
|
563
|
+
for name in call_names:
|
|
564
|
+
relations.append((path, current_symbol, "", name, "calls", line_no))
|
|
565
|
+
|
|
566
|
+
if not matches and clean and not clean.startswith("@") and not in_doc:
|
|
567
|
+
if "{" not in clean and "}" not in clean: pending_doc = []
|
|
568
|
+
|
|
569
|
+
if not matches and "(" not in clean and not clean.startswith("@"):
|
|
570
|
+
if re.search(r"\b(public|private|protected|static|final|abstract|synchronized|native|default)\b", clean) or re.search(r"<[^>]+>", clean):
|
|
571
|
+
if not self.re_class.search(clean):
|
|
572
|
+
pending_method_prefix = clean
|
|
573
|
+
|
|
574
|
+
op, cl = clean.count("{"), clean.count("}")
|
|
575
|
+
cur_bal += (op - cl)
|
|
576
|
+
|
|
577
|
+
if op > 0 or cl > 0:
|
|
578
|
+
still_active = []
|
|
579
|
+
for bal, info in active_scopes:
|
|
580
|
+
if cur_bal <= bal: symbols.append((info["path"], info["name"], info["kind"], info["line"], line_no, info["raw"], info["parent"], info["meta"], info["doc"]))
|
|
581
|
+
else: still_active.append((bal, info))
|
|
582
|
+
active_scopes = still_active
|
|
583
|
+
|
|
584
|
+
last_line = len(lines)
|
|
585
|
+
for _, info in active_scopes:
|
|
586
|
+
symbols.append((info["path"], info["name"], info["kind"], info["line"], last_line, info["raw"], info["parent"], info["meta"], info["doc"]))
|
|
587
|
+
if pending_type_decl:
|
|
588
|
+
name, decl_line = pending_type_decl
|
|
589
|
+
for b in pending_inheritance_extends: relations.append((path, name, "", b, "extends", decl_line))
|
|
590
|
+
for b in pending_inheritance_impls: relations.append((path, name, "", b, "implements", decl_line))
|
|
591
|
+
symbols.sort(key=lambda s: (s[3], 0 if s[2] in {"class", "interface", "enum", "record"} else 1, s[1]))
|
|
592
|
+
return symbols, relations
|
|
593
|
+
|
|
594
|
+
|
|
595
|
+
class ParserFactory:
|
|
596
|
+
_parsers: Dict[str, BaseParser] = {}
|
|
597
|
+
|
|
598
|
+
@classmethod
|
|
599
|
+
def get_parser(cls, ext: str) -> Optional[BaseParser]:
|
|
600
|
+
ext = (ext or "").lower()
|
|
601
|
+
if ext == ".py": return PythonParser()
|
|
602
|
+
configs = {
|
|
603
|
+
".java": {"re_class": _safe_compile(r"\b(class|interface|enum|record)\s+([a-zA-Z0-9_]+)"), "re_method": _safe_compile(r"(?:[a-zA-Z0-9_<>,.\[\]\s]+?\s+)?\b([a-zA-Z0-9_]+)\b\s*\(")},
|
|
604
|
+
".kt": {"re_class": _safe_compile(r"\b(class|interface|enum|object|data\s+class)\s+([a-zA-Z0-9_]+)"), "re_method": _safe_compile(r"\bfun\s+([a-zA-Z0-9_]+)\b\s*\(")},
|
|
605
|
+
".go": {"re_class": _safe_compile(r"\b(type|struct|interface)\s+([a-zA-Z0-9_]+)"), "re_method": _safe_compile(r"\bfunc\s+(?:[^)]+\)\s+)?([a-zA-Z0-9_]+)\b\s*\("), "method_kind": "function"},
|
|
606
|
+
".cpp": {"re_class": _safe_compile(r"\b(class|struct|enum)\s+([a-zA-Z0-9_]+)"), "re_method": _safe_compile(r"(?:[a-zA-Z0-9_:<>]+\s+)?\b([a-zA-Z0-9_]+)\b\s*\(")},
|
|
607
|
+
".h": {"re_class": _safe_compile(r"\b(class|struct|enum)\s+([a-zA-Z0-9_]+)"), "re_method": _safe_compile(r"(?:[a-zA-Z0-9_:<>]+\s+)?\b([a-zA-Z0-9_]+)\b\s*\(")},
|
|
608
|
+
".js": {"re_class": _safe_compile(r"\b(class)\s+([a-zA-Z0-9_]+)"), "re_method": _safe_compile(r"(?:async\s+)?function\s+([a-zA-Z0-9_]+)\b\s*\(")},
|
|
609
|
+
".jsx": {"re_class": _safe_compile(r"\b(class)\s+([a-zA-Z0-9_]+)"), "re_method": _safe_compile(r"(?:async\s+)?function\s+([a-zA-Z0-9_]+)\b\s*\(")},
|
|
610
|
+
".ts": {"re_class": _safe_compile(r"\b(class|interface|enum)\s+([a-zA-Z0-9_]+)"), "re_method": _safe_compile(r"(?:async\s+)?function\s+([a-zA-Z0-9_]+)\b\s*\(")},
|
|
611
|
+
".tsx": {"re_class": _safe_compile(r"\b(class|interface|enum)\s+([a-zA-Z0-9_]+)"), "re_method": _safe_compile(r"(?:async\s+)?function\s+([a-zA-Z0-9_]+)\b\s*\(")}
|
|
612
|
+
}
|
|
613
|
+
if ext in configs:
|
|
614
|
+
key = f"generic:{ext}"
|
|
615
|
+
if key not in cls._parsers: cls._parsers[key] = GenericRegexParser(configs[ext], ext)
|
|
616
|
+
return cls._parsers[key]
|
|
617
|
+
return None
|
|
618
|
+
|
|
619
|
+
|
|
620
|
+
class _SymbolExtraction:
|
|
621
|
+
def __init__(self, symbols: List[Tuple], relations: List[Tuple]):
|
|
622
|
+
self.symbols = symbols
|
|
623
|
+
self.relations = relations
|
|
624
|
+
|
|
625
|
+
def __iter__(self):
|
|
626
|
+
return iter((self.symbols, self.relations))
|
|
627
|
+
|
|
628
|
+
def __len__(self):
|
|
629
|
+
return len(self.symbols)
|
|
630
|
+
|
|
631
|
+
def __getitem__(self, item):
|
|
632
|
+
return self.symbols[item]
|
|
633
|
+
|
|
634
|
+
def __eq__(self, other):
|
|
635
|
+
if isinstance(other, _SymbolExtraction):
|
|
636
|
+
return self.symbols == other.symbols and self.relations == other.relations
|
|
637
|
+
return self.symbols == other
|
|
638
|
+
|
|
639
|
+
|
|
640
|
+
def _extract_symbols(path: str, content: str) -> _SymbolExtraction:
|
|
641
|
+
parser = ParserFactory.get_parser(Path(path).suffix.lower())
|
|
642
|
+
if parser:
|
|
643
|
+
symbols, relations = parser.extract(path, content)
|
|
644
|
+
return _SymbolExtraction(symbols, relations)
|
|
645
|
+
return _SymbolExtraction([], [])
|
|
646
|
+
|
|
647
|
+
|
|
648
|
+
def _extract_symbols_with_relations(path: str, content: str) -> Tuple[List[Tuple], List[Tuple]]:
|
|
649
|
+
result = _extract_symbols(path, content)
|
|
650
|
+
return result.symbols, result.relations
|
|
651
|
+
|
|
652
|
+
|
|
653
|
+
class DBWriter:
|
|
654
|
+
def __init__(self, db: LocalSearchDB, logger=None, max_batch: int = 50, max_wait: float = 0.2, latency_cb=None):
|
|
655
|
+
self.db = db
|
|
656
|
+
self.logger = logger
|
|
657
|
+
self.max_batch = max_batch
|
|
658
|
+
self.max_wait = max_wait
|
|
659
|
+
self.latency_cb = latency_cb
|
|
660
|
+
self.queue: "queue.Queue[DbTask]" = queue.Queue()
|
|
661
|
+
self._stop = threading.Event()
|
|
662
|
+
self._thread = threading.Thread(target=self._run, daemon=True)
|
|
663
|
+
self._conn = None
|
|
664
|
+
self.last_commit_ts = 0
|
|
665
|
+
|
|
666
|
+
def start(self) -> None:
|
|
667
|
+
if not self._thread.is_alive():
|
|
668
|
+
self._thread.start()
|
|
669
|
+
|
|
670
|
+
def stop(self, timeout: float = 2.0) -> None:
|
|
671
|
+
self._stop.set()
|
|
672
|
+
started = False
|
|
673
|
+
try:
|
|
674
|
+
started = self._thread.is_alive() or bool(getattr(self._thread, "_started", None) and self._thread._started.is_set())
|
|
675
|
+
except Exception:
|
|
676
|
+
started = False
|
|
677
|
+
if started:
|
|
678
|
+
self._thread.join(timeout=timeout)
|
|
679
|
+
|
|
680
|
+
def enqueue(self, task: DbTask) -> None:
|
|
681
|
+
self.queue.put(task)
|
|
682
|
+
|
|
683
|
+
def qsize(self) -> int:
|
|
684
|
+
return self.queue.qsize()
|
|
685
|
+
|
|
686
|
+
def _run(self) -> None:
|
|
687
|
+
self._conn = self.db.open_writer_connection()
|
|
688
|
+
cur = self._conn.cursor()
|
|
689
|
+
while not self._stop.is_set() or not self.queue.empty():
|
|
690
|
+
tasks = self._drain_batch()
|
|
691
|
+
if not tasks:
|
|
692
|
+
continue
|
|
693
|
+
try:
|
|
694
|
+
cur.execute("BEGIN")
|
|
695
|
+
self._process_batch(cur, tasks)
|
|
696
|
+
self._conn.commit()
|
|
697
|
+
self.last_commit_ts = int(time.time())
|
|
698
|
+
except Exception as e:
|
|
699
|
+
try:
|
|
700
|
+
self._conn.rollback()
|
|
701
|
+
except Exception:
|
|
702
|
+
pass
|
|
703
|
+
if self.logger:
|
|
704
|
+
self.logger.log_error(f"DBWriter batch failed: {e}")
|
|
705
|
+
try:
|
|
706
|
+
self._conn.close()
|
|
707
|
+
except Exception:
|
|
708
|
+
pass
|
|
709
|
+
|
|
710
|
+
def _drain_batch(self) -> List[DbTask]:
|
|
711
|
+
tasks: List[DbTask] = []
|
|
712
|
+
try:
|
|
713
|
+
first = self.queue.get(timeout=self.max_wait)
|
|
714
|
+
tasks.append(first)
|
|
715
|
+
self.queue.task_done()
|
|
716
|
+
except queue.Empty:
|
|
717
|
+
return tasks
|
|
718
|
+
while len(tasks) < self.max_batch:
|
|
719
|
+
try:
|
|
720
|
+
t = self.queue.get_nowait()
|
|
721
|
+
tasks.append(t)
|
|
722
|
+
self.queue.task_done()
|
|
723
|
+
except queue.Empty:
|
|
724
|
+
break
|
|
725
|
+
return tasks
|
|
726
|
+
|
|
727
|
+
def _process_batch(self, cur, tasks: List[DbTask]) -> None:
|
|
728
|
+
commit_ts = int(time.time())
|
|
729
|
+
delete_paths: set[str] = set()
|
|
730
|
+
upsert_files_rows: List[tuple] = []
|
|
731
|
+
upsert_symbols_rows: List[tuple] = []
|
|
732
|
+
upsert_relations_rows: List[tuple] = []
|
|
733
|
+
update_last_seen_paths: List[str] = []
|
|
734
|
+
repo_meta_tasks: List[dict] = []
|
|
735
|
+
engine_docs: List[dict] = []
|
|
736
|
+
engine_deletes: List[str] = []
|
|
737
|
+
latency_samples: List[float] = []
|
|
738
|
+
|
|
739
|
+
for t in tasks:
|
|
740
|
+
if t.kind == "delete_path" and t.path:
|
|
741
|
+
delete_paths.add(t.path)
|
|
742
|
+
if t.engine_deletes:
|
|
743
|
+
engine_deletes.extend(t.engine_deletes)
|
|
744
|
+
if t.ts:
|
|
745
|
+
latency_samples.append(time.time() - t.ts)
|
|
746
|
+
elif t.kind == "upsert_files" and t.rows:
|
|
747
|
+
upsert_files_rows.extend(t.rows)
|
|
748
|
+
if t.engine_docs:
|
|
749
|
+
engine_docs.extend(t.engine_docs)
|
|
750
|
+
if t.ts:
|
|
751
|
+
latency_samples.append(time.time() - t.ts)
|
|
752
|
+
elif t.kind == "upsert_symbols" and t.rows:
|
|
753
|
+
upsert_symbols_rows.extend(t.rows)
|
|
754
|
+
elif t.kind == "upsert_relations" and t.rows:
|
|
755
|
+
upsert_relations_rows.extend(t.rows)
|
|
756
|
+
elif t.kind == "update_last_seen" and t.paths:
|
|
757
|
+
update_last_seen_paths.extend(t.paths)
|
|
758
|
+
elif t.kind == "upsert_repo_meta" and t.repo_meta:
|
|
759
|
+
repo_meta_tasks.append(t.repo_meta)
|
|
760
|
+
|
|
761
|
+
if delete_paths:
|
|
762
|
+
upsert_files_rows = [r for r in upsert_files_rows if r[0] not in delete_paths]
|
|
763
|
+
upsert_symbols_rows = [r for r in upsert_symbols_rows if r[0] not in delete_paths]
|
|
764
|
+
upsert_relations_rows = [r for r in upsert_relations_rows if r[0] not in delete_paths]
|
|
765
|
+
update_last_seen_paths = [p for p in update_last_seen_paths if p not in delete_paths]
|
|
766
|
+
engine_docs = [d for d in engine_docs if d.get("doc_id") not in delete_paths]
|
|
767
|
+
|
|
768
|
+
# Safety order: delete -> upsert_files -> upsert_symbols -> upsert_relations -> update_last_seen
|
|
769
|
+
for p in delete_paths:
|
|
770
|
+
self.db.delete_path_tx(cur, p)
|
|
771
|
+
|
|
772
|
+
if upsert_files_rows:
|
|
773
|
+
rows = [
|
|
774
|
+
(
|
|
775
|
+
r[0], r[1], r[2], r[3], r[4], commit_ts,
|
|
776
|
+
r[5], r[6], r[7], r[8], r[9], r[10], r[11], r[12]
|
|
777
|
+
)
|
|
778
|
+
for r in upsert_files_rows
|
|
779
|
+
]
|
|
780
|
+
self.db.upsert_files_tx(cur, rows)
|
|
781
|
+
if upsert_symbols_rows:
|
|
782
|
+
self.db.upsert_symbols_tx(cur, upsert_symbols_rows)
|
|
783
|
+
if upsert_relations_rows:
|
|
784
|
+
self.db.upsert_relations_tx(cur, upsert_relations_rows)
|
|
785
|
+
if update_last_seen_paths:
|
|
786
|
+
self.db.update_last_seen_tx(cur, update_last_seen_paths, commit_ts)
|
|
787
|
+
if repo_meta_tasks:
|
|
788
|
+
for m in repo_meta_tasks:
|
|
789
|
+
self.db.upsert_repo_meta_tx(
|
|
790
|
+
cur,
|
|
791
|
+
repo_name=m.get("repo_name", ""),
|
|
792
|
+
tags=m.get("tags", ""),
|
|
793
|
+
domain=m.get("domain", ""),
|
|
794
|
+
description=m.get("description", ""),
|
|
795
|
+
priority=int(m.get("priority", 0) or 0),
|
|
796
|
+
)
|
|
797
|
+
|
|
798
|
+
if delete_paths:
|
|
799
|
+
engine_deletes.extend(list(delete_paths))
|
|
800
|
+
if engine_docs or engine_deletes:
|
|
801
|
+
engine = getattr(self.db, "engine", None)
|
|
802
|
+
try:
|
|
803
|
+
if engine_docs and hasattr(engine, "upsert_documents"):
|
|
804
|
+
engine.upsert_documents(engine_docs)
|
|
805
|
+
if engine_deletes and hasattr(engine, "delete_documents"):
|
|
806
|
+
engine.delete_documents(engine_deletes)
|
|
807
|
+
except Exception as e:
|
|
808
|
+
if self.logger:
|
|
809
|
+
self.logger.log_error(f"engine update failed: {e}")
|
|
810
|
+
|
|
811
|
+
if self.latency_cb and latency_samples:
|
|
812
|
+
for s in latency_samples:
|
|
813
|
+
self.latency_cb(s)
|
|
814
|
+
|
|
815
|
+
|
|
816
|
+
class Indexer:
|
|
817
|
+
def __init__(self, cfg: Config, db: LocalSearchDB, logger=None, indexer_mode: str = "auto", indexing_enabled: bool = True, startup_index_enabled: bool = True, lock_handle: Any = None):
|
|
818
|
+
self.cfg, self.db, self.logger = cfg, db, logger
|
|
819
|
+
self.status = IndexStatus()
|
|
820
|
+
self.indexer_mode = indexer_mode
|
|
821
|
+
self.indexing_enabled = indexing_enabled
|
|
822
|
+
self.startup_index_enabled = startup_index_enabled
|
|
823
|
+
self._lock_handle = lock_handle
|
|
824
|
+
self._stop, self._rescan = threading.Event(), threading.Event()
|
|
825
|
+
self._pipeline_started = False
|
|
826
|
+
self._drain_timeout = 2.0
|
|
827
|
+
self._coalesce_max_keys = 100000
|
|
828
|
+
self._coalesce_lock = threading.Lock()
|
|
829
|
+
self._coalesce_map: Dict[str, CoalesceTask] = {}
|
|
830
|
+
self._legacy_purge_done = False
|
|
831
|
+
self._event_queue = DedupQueue() if DedupQueue else None
|
|
832
|
+
self._worker_thread = None
|
|
833
|
+
batch_size = int(getattr(cfg, "commit_batch_size", 50) or 50)
|
|
834
|
+
if batch_size <= 0:
|
|
835
|
+
batch_size = 50
|
|
836
|
+
self._db_writer = DBWriter(self.db, logger=self.logger, max_batch=batch_size, latency_cb=self._record_latency)
|
|
837
|
+
self._metrics_thread = None
|
|
838
|
+
self._latencies = deque(maxlen=2000)
|
|
839
|
+
self._enqueue_count = 0
|
|
840
|
+
self._enqueue_count_ts = time.time()
|
|
841
|
+
self._retry_count = 0
|
|
842
|
+
self._drop_count_degraded = 0
|
|
843
|
+
self._drop_count_shutdown = 0
|
|
844
|
+
self._drop_count_telemetry = 0
|
|
845
|
+
max_workers = getattr(cfg, "max_workers", 4) or 4
|
|
846
|
+
try:
|
|
847
|
+
max_workers = int(max_workers)
|
|
848
|
+
except Exception:
|
|
849
|
+
max_workers = 4
|
|
850
|
+
if max_workers <= 0:
|
|
851
|
+
max_workers = 4
|
|
852
|
+
self._executor = concurrent.futures.ThreadPoolExecutor(max_workers=max_workers)
|
|
853
|
+
self.watcher = None
|
|
854
|
+
|
|
855
|
+
def stop(self):
|
|
856
|
+
self._stop.set(); self._rescan.set()
|
|
857
|
+
if self.watcher:
|
|
858
|
+
try: self.watcher.stop()
|
|
859
|
+
except: pass
|
|
860
|
+
self._drain_queues()
|
|
861
|
+
try: self._executor.shutdown(wait=False)
|
|
862
|
+
except: pass
|
|
863
|
+
if self._db_writer:
|
|
864
|
+
self._db_writer.stop(timeout=self._drain_timeout)
|
|
865
|
+
if self.logger and hasattr(self.logger, "stop"):
|
|
866
|
+
try:
|
|
867
|
+
self.logger.stop(timeout=self._drain_timeout)
|
|
868
|
+
except Exception:
|
|
869
|
+
pass
|
|
870
|
+
if self._lock_handle:
|
|
871
|
+
try:
|
|
872
|
+
self._lock_handle.release()
|
|
873
|
+
except Exception:
|
|
874
|
+
pass
|
|
875
|
+
|
|
876
|
+
def request_rescan(self): self._rescan.set()
|
|
877
|
+
|
|
878
|
+
def scan_once(self) -> None:
|
|
879
|
+
"""Force a synchronous scan of the workspace (used by MCP tools/tests)."""
|
|
880
|
+
self._start_pipeline()
|
|
881
|
+
self._scan_once()
|
|
882
|
+
|
|
883
|
+
def run_forever(self):
|
|
884
|
+
if not self.indexing_enabled:
|
|
885
|
+
self.status.index_ready = True
|
|
886
|
+
return
|
|
887
|
+
self._start_pipeline()
|
|
888
|
+
# v2.7.0: Start watcher if available and not already running
|
|
889
|
+
if FileWatcher and not self.watcher:
|
|
890
|
+
try:
|
|
891
|
+
# Watch all roots
|
|
892
|
+
roots = [str(Path(os.path.expanduser(r)).absolute()) for r in self.cfg.workspace_roots if Path(r).exists()]
|
|
893
|
+
if roots:
|
|
894
|
+
self.watcher = FileWatcher(roots, self._process_watcher_event)
|
|
895
|
+
self.watcher.start()
|
|
896
|
+
if self.logger: self.logger.log_info(f"FileWatcher started for {roots}")
|
|
897
|
+
except Exception as e:
|
|
898
|
+
if self.logger: self.logger.log_error(f"Failed to start FileWatcher: {e}")
|
|
899
|
+
|
|
900
|
+
if self.startup_index_enabled:
|
|
901
|
+
self._scan_once()
|
|
902
|
+
self.status.index_ready = True
|
|
903
|
+
while not self._stop.is_set():
|
|
904
|
+
timeout = max(1, int(getattr(self.cfg, "scan_interval_seconds", 30)))
|
|
905
|
+
self._rescan.wait(timeout=timeout)
|
|
906
|
+
self._rescan.clear()
|
|
907
|
+
if self._stop.is_set(): break
|
|
908
|
+
self._scan_once()
|
|
909
|
+
|
|
910
|
+
def _start_pipeline(self) -> None:
|
|
911
|
+
if self._pipeline_started:
|
|
912
|
+
return
|
|
913
|
+
self._pipeline_started = True
|
|
914
|
+
if self._db_writer:
|
|
915
|
+
self._db_writer.start()
|
|
916
|
+
self._worker_thread = threading.Thread(target=self._worker_loop, daemon=True)
|
|
917
|
+
self._worker_thread.start()
|
|
918
|
+
self._metrics_thread = threading.Thread(target=self._metrics_loop, daemon=True)
|
|
919
|
+
self._metrics_thread.start()
|
|
920
|
+
|
|
921
|
+
def _record_latency(self, value: float) -> None:
|
|
922
|
+
self._latencies.append(value)
|
|
923
|
+
|
|
924
|
+
def get_queue_depths(self) -> dict:
|
|
925
|
+
watcher_q = self._event_queue.qsize() if self._event_queue else 0
|
|
926
|
+
db_q = self._db_writer.qsize() if self._db_writer else 0
|
|
927
|
+
telemetry_q = self.logger.get_queue_depth() if self.logger and hasattr(self.logger, "get_queue_depth") else 0
|
|
928
|
+
return {"watcher": watcher_q, "db_writer": db_q, "telemetry": telemetry_q}
|
|
929
|
+
|
|
930
|
+
def get_last_commit_ts(self) -> int:
|
|
931
|
+
if self._db_writer and hasattr(self._db_writer, "last_commit_ts"):
|
|
932
|
+
return int(self._db_writer.last_commit_ts or 0)
|
|
933
|
+
return 0
|
|
934
|
+
|
|
935
|
+
def _metrics_loop(self) -> None:
|
|
936
|
+
while not self._stop.is_set():
|
|
937
|
+
time.sleep(5.0)
|
|
938
|
+
try:
|
|
939
|
+
now = time.time()
|
|
940
|
+
elapsed = max(1.0, now - self._enqueue_count_ts)
|
|
941
|
+
enqueue_per_sec = self._enqueue_count / elapsed
|
|
942
|
+
self._enqueue_count = 0
|
|
943
|
+
self._enqueue_count_ts = now
|
|
944
|
+
|
|
945
|
+
latencies = list(self._latencies)
|
|
946
|
+
if latencies:
|
|
947
|
+
latencies.sort()
|
|
948
|
+
p50 = latencies[int(0.5 * (len(latencies) - 1))]
|
|
949
|
+
p95 = latencies[int(0.95 * (len(latencies) - 1))]
|
|
950
|
+
else:
|
|
951
|
+
p50 = 0.0
|
|
952
|
+
p95 = 0.0
|
|
953
|
+
|
|
954
|
+
watcher_q = self._event_queue.qsize() if self._event_queue else 0
|
|
955
|
+
db_q = self._db_writer.qsize() if self._db_writer else 0
|
|
956
|
+
telemetry_q = self.logger.get_queue_depth() if self.logger and hasattr(self.logger, "get_queue_depth") else 0
|
|
957
|
+
telemetry_drop = self.logger.get_drop_count() if self.logger and hasattr(self.logger, "get_drop_count") else 0
|
|
958
|
+
|
|
959
|
+
if self.logger:
|
|
960
|
+
self.logger.log_telemetry(
|
|
961
|
+
f"queue_depth watcher={watcher_q} db={db_q} telemetry={telemetry_q} "
|
|
962
|
+
f"enqueue_per_sec={enqueue_per_sec:.2f} latency_p50={p50:.3f}s latency_p95={p95:.3f}s "
|
|
963
|
+
f"retry_count={self._retry_count} drop_degraded={self._drop_count_degraded} "
|
|
964
|
+
f"drop_shutdown={self._drop_count_shutdown} telemetry_drop={telemetry_drop}"
|
|
965
|
+
)
|
|
966
|
+
except Exception:
|
|
967
|
+
pass
|
|
968
|
+
|
|
969
|
+
def _drain_queues(self) -> None:
|
|
970
|
+
deadline = time.time() + self._drain_timeout
|
|
971
|
+
while time.time() < deadline:
|
|
972
|
+
pending = 0
|
|
973
|
+
if self._event_queue:
|
|
974
|
+
pending += self._event_queue.qsize()
|
|
975
|
+
if self._db_writer:
|
|
976
|
+
pending += self._db_writer.qsize()
|
|
977
|
+
if pending == 0:
|
|
978
|
+
return
|
|
979
|
+
time.sleep(0.05)
|
|
980
|
+
remaining = 0
|
|
981
|
+
if self._event_queue:
|
|
982
|
+
remaining += self._event_queue.qsize()
|
|
983
|
+
if self._db_writer:
|
|
984
|
+
remaining += self._db_writer.qsize()
|
|
985
|
+
self._drop_count_shutdown += remaining
|
|
986
|
+
if self.logger:
|
|
987
|
+
self.logger.log_info(f"dropped_on_shutdown={remaining}")
|
|
988
|
+
|
|
989
|
+
def _enqueue_db_tasks(self, files_rows: List[tuple], symbols_rows: List[tuple], relations_rows: List[tuple], engine_docs: Optional[List[dict]] = None, enqueue_ts: Optional[float] = None) -> None:
|
|
990
|
+
if files_rows:
|
|
991
|
+
self._db_writer.enqueue(DbTask(kind="upsert_files", rows=list(files_rows), ts=enqueue_ts or time.time(), engine_docs=list(engine_docs or [])))
|
|
992
|
+
if symbols_rows:
|
|
993
|
+
self._db_writer.enqueue(DbTask(kind="upsert_symbols", rows=list(symbols_rows)))
|
|
994
|
+
if relations_rows:
|
|
995
|
+
self._db_writer.enqueue(DbTask(kind="upsert_relations", rows=list(relations_rows)))
|
|
996
|
+
|
|
997
|
+
def _enqueue_update_last_seen(self, paths: List[str]) -> None:
|
|
998
|
+
if not paths:
|
|
999
|
+
return
|
|
1000
|
+
self._db_writer.enqueue(DbTask(kind="update_last_seen", paths=list(paths)))
|
|
1001
|
+
|
|
1002
|
+
def _enqueue_delete_path(self, path: str, enqueue_ts: Optional[float] = None) -> None:
|
|
1003
|
+
self._db_writer.enqueue(DbTask(kind="delete_path", path=path, ts=enqueue_ts or time.time()))
|
|
1004
|
+
|
|
1005
|
+
def _enqueue_repo_meta(self, repo_name: str, tags: str, description: str) -> None:
|
|
1006
|
+
self._db_writer.enqueue(
|
|
1007
|
+
DbTask(kind="upsert_repo_meta", repo_meta={"repo_name": repo_name, "tags": tags, "description": description})
|
|
1008
|
+
)
|
|
1009
|
+
|
|
1010
|
+
def _normalize_path(self, path: str) -> Optional[str]:
|
|
1011
|
+
try:
|
|
1012
|
+
p = Path(path).absolute()
|
|
1013
|
+
# Multi-root support: Check if path is within any workspace root
|
|
1014
|
+
for root_str in self.cfg.workspace_roots:
|
|
1015
|
+
root = Path(os.path.expanduser(root_str)).absolute()
|
|
1016
|
+
try:
|
|
1017
|
+
p.relative_to(root)
|
|
1018
|
+
return self._encode_db_path(root, p)
|
|
1019
|
+
except ValueError:
|
|
1020
|
+
continue
|
|
1021
|
+
return None
|
|
1022
|
+
except Exception:
|
|
1023
|
+
return None
|
|
1024
|
+
|
|
1025
|
+
def _get_root_map(self) -> dict[str, Path]:
|
|
1026
|
+
roots = {}
|
|
1027
|
+
for r in self.cfg.workspace_roots:
|
|
1028
|
+
root_path = Path(os.path.expanduser(r)).absolute()
|
|
1029
|
+
root_id = self._root_id(str(root_path))
|
|
1030
|
+
roots[root_id] = root_path
|
|
1031
|
+
return roots
|
|
1032
|
+
|
|
1033
|
+
def _encode_db_path(self, root: Path, file_path: Path) -> str:
|
|
1034
|
+
root_id = self._root_id(str(root))
|
|
1035
|
+
rel = file_path.relative_to(root).as_posix()
|
|
1036
|
+
return f"{root_id}/{rel}"
|
|
1037
|
+
|
|
1038
|
+
def _decode_db_path(self, db_path: str) -> Optional[tuple[Path, Path]]:
|
|
1039
|
+
if "/" not in db_path:
|
|
1040
|
+
return None
|
|
1041
|
+
root_id, rel = db_path.split("/", 1)
|
|
1042
|
+
roots = self._get_root_map()
|
|
1043
|
+
root = roots.get(root_id)
|
|
1044
|
+
if not root:
|
|
1045
|
+
return None
|
|
1046
|
+
rel_path = Path(*rel.split("/"))
|
|
1047
|
+
return root, (root / rel_path)
|
|
1048
|
+
|
|
1049
|
+
def _root_id(self, path: str) -> str:
|
|
1050
|
+
if WorkspaceManager is None:
|
|
1051
|
+
import hashlib
|
|
1052
|
+
digest = hashlib.sha1(path.encode("utf-8")).hexdigest()[:8]
|
|
1053
|
+
return f"root-{digest}"
|
|
1054
|
+
return WorkspaceManager.root_id(path)
|
|
1055
|
+
|
|
1056
|
+
def _enqueue_action(self, action: TaskAction, path: str, ts: float, attempts: int = 0) -> None:
|
|
1057
|
+
if not self._event_queue:
|
|
1058
|
+
return
|
|
1059
|
+
norm = self._normalize_path(path)
|
|
1060
|
+
if not norm:
|
|
1061
|
+
return
|
|
1062
|
+
# Key must be unique per file. Use db path as key.
|
|
1063
|
+
key = norm
|
|
1064
|
+
with self._coalesce_lock:
|
|
1065
|
+
exists = key in self._coalesce_map
|
|
1066
|
+
if not exists and len(self._coalesce_map) >= self._coalesce_max_keys:
|
|
1067
|
+
self._drop_count_degraded += 1
|
|
1068
|
+
if self.logger:
|
|
1069
|
+
self.logger.log_error(f"coalesce_map degraded: drop key={key}")
|
|
1070
|
+
return
|
|
1071
|
+
if exists:
|
|
1072
|
+
task = self._coalesce_map[key]
|
|
1073
|
+
task.action = coalesce_action(task.action, action)
|
|
1074
|
+
task.last_seen = ts
|
|
1075
|
+
task.enqueue_ts = ts
|
|
1076
|
+
task.attempts = max(task.attempts, attempts)
|
|
1077
|
+
else:
|
|
1078
|
+
self._coalesce_map[key] = CoalesceTask(action=action, path=norm, attempts=attempts, enqueue_ts=ts, last_seen=ts)
|
|
1079
|
+
self._event_queue.put(key)
|
|
1080
|
+
self._enqueue_count += 1
|
|
1081
|
+
|
|
1082
|
+
def _enqueue_fsevent(self, evt: FsEvent) -> None:
|
|
1083
|
+
if evt.kind == FsEventKind.MOVED:
|
|
1084
|
+
for action, p in split_moved_event(evt):
|
|
1085
|
+
self._enqueue_action(action, p, evt.ts)
|
|
1086
|
+
return
|
|
1087
|
+
if evt.kind == FsEventKind.DELETED:
|
|
1088
|
+
self._enqueue_action(TaskAction.DELETE, evt.path, evt.ts)
|
|
1089
|
+
return
|
|
1090
|
+
self._enqueue_action(TaskAction.INDEX, evt.path, evt.ts)
|
|
1091
|
+
|
|
1092
|
+
def _worker_loop(self) -> None:
|
|
1093
|
+
if not self._event_queue:
|
|
1094
|
+
return
|
|
1095
|
+
while not self._stop.is_set() or self._event_queue.qsize() > 0:
|
|
1096
|
+
keys = self._event_queue.get_batch(max_size=50, timeout=0.2)
|
|
1097
|
+
if not keys:
|
|
1098
|
+
continue
|
|
1099
|
+
for key in keys:
|
|
1100
|
+
with self._coalesce_lock:
|
|
1101
|
+
task = self._coalesce_map.pop(key, None)
|
|
1102
|
+
if not task:
|
|
1103
|
+
continue
|
|
1104
|
+
if task.action == TaskAction.DELETE:
|
|
1105
|
+
self._enqueue_delete_path(task.path, enqueue_ts=task.enqueue_ts)
|
|
1106
|
+
continue
|
|
1107
|
+
self._handle_index_task(task)
|
|
1108
|
+
|
|
1109
|
+
def _handle_index_task(self, task: CoalesceTask) -> None:
|
|
1110
|
+
resolved = self._decode_db_path(task.path)
|
|
1111
|
+
if not resolved:
|
|
1112
|
+
return
|
|
1113
|
+
matched_root, file_path = resolved
|
|
1114
|
+
|
|
1115
|
+
try:
|
|
1116
|
+
st = file_path.stat()
|
|
1117
|
+
except FileNotFoundError:
|
|
1118
|
+
self._enqueue_delete_path(task.path, enqueue_ts=task.enqueue_ts)
|
|
1119
|
+
return
|
|
1120
|
+
except (IOError, PermissionError, OSError) as e:
|
|
1121
|
+
self._retry_task(task, e)
|
|
1122
|
+
return
|
|
1123
|
+
|
|
1124
|
+
try:
|
|
1125
|
+
res = self._process_file_task(matched_root, file_path, st, int(time.time()), time.time(), False, raise_on_error=True)
|
|
1126
|
+
except (IOError, PermissionError, OSError) as e:
|
|
1127
|
+
self._retry_task(task, e)
|
|
1128
|
+
return
|
|
1129
|
+
except Exception:
|
|
1130
|
+
self.status.errors += 1
|
|
1131
|
+
return
|
|
1132
|
+
|
|
1133
|
+
if not res or res.get("type") == "unchanged":
|
|
1134
|
+
return
|
|
1135
|
+
|
|
1136
|
+
self._enqueue_db_tasks(
|
|
1137
|
+
[(
|
|
1138
|
+
res["rel"],
|
|
1139
|
+
res["repo"],
|
|
1140
|
+
res["mtime"],
|
|
1141
|
+
res["size"],
|
|
1142
|
+
res["content"],
|
|
1143
|
+
res["parse_status"],
|
|
1144
|
+
res["parse_reason"],
|
|
1145
|
+
res["ast_status"],
|
|
1146
|
+
res["ast_reason"],
|
|
1147
|
+
int(res["is_binary"]),
|
|
1148
|
+
int(res["is_minified"]),
|
|
1149
|
+
int(res["sampled"]),
|
|
1150
|
+
int(res["content_bytes"]),
|
|
1151
|
+
)],
|
|
1152
|
+
list(res.get("symbols") or []),
|
|
1153
|
+
list(res.get("relations") or []),
|
|
1154
|
+
engine_docs=[res.get("engine_doc")] if res.get("engine_doc") else [],
|
|
1155
|
+
enqueue_ts=task.enqueue_ts,
|
|
1156
|
+
)
|
|
1157
|
+
|
|
1158
|
+
def _retry_task(self, task: CoalesceTask, err: Exception) -> None:
|
|
1159
|
+
if task.attempts >= 2:
|
|
1160
|
+
self._drop_count_degraded += 1
|
|
1161
|
+
if self.logger:
|
|
1162
|
+
self.logger.log_error(f"Task dropped after retries: {task.path} err={err}")
|
|
1163
|
+
return
|
|
1164
|
+
self._retry_count += 1
|
|
1165
|
+
task.attempts += 1
|
|
1166
|
+
base = 0.5 if task.attempts == 1 else 2.0
|
|
1167
|
+
sleep = base * random.uniform(0.8, 1.2)
|
|
1168
|
+
t = threading.Timer(sleep, lambda: self._enqueue_action(task.action, task.path, time.time(), attempts=task.attempts))
|
|
1169
|
+
t.daemon = True
|
|
1170
|
+
t.start()
|
|
1171
|
+
|
|
1172
|
+
def _build_engine_doc(self, doc_id: str, repo: str, rel_to_root: str, content: str, parse_status: str, mtime: int, size: int) -> dict:
|
|
1173
|
+
rel_path = Path(rel_to_root).as_posix()
|
|
1174
|
+
root_id = doc_id.split("/", 1)[0] if "/" in doc_id else ""
|
|
1175
|
+
path_text = f"{doc_id} {rel_path}"
|
|
1176
|
+
max_doc_bytes = int(os.environ.get("DECKARD_ENGINE_MAX_DOC_BYTES", "4194304") or 4194304)
|
|
1177
|
+
preview_bytes = int(os.environ.get("DECKARD_ENGINE_PREVIEW_BYTES", "8192") or 8192)
|
|
1178
|
+
body_text = ""
|
|
1179
|
+
preview = ""
|
|
1180
|
+
if parse_status == "ok":
|
|
1181
|
+
norm = _normalize_engine_text(content or "")
|
|
1182
|
+
if len(norm) > max_doc_bytes:
|
|
1183
|
+
head = max_doc_bytes // 2
|
|
1184
|
+
tail = max_doc_bytes - head
|
|
1185
|
+
norm = norm[:head] + norm[-tail:]
|
|
1186
|
+
body_text = norm
|
|
1187
|
+
if preview_bytes > 0:
|
|
1188
|
+
if content and len(content) > preview_bytes:
|
|
1189
|
+
half = preview_bytes // 2
|
|
1190
|
+
preview = content[:half] + "\n...\n" + content[-half:]
|
|
1191
|
+
else:
|
|
1192
|
+
preview = content or ""
|
|
1193
|
+
return {
|
|
1194
|
+
"doc_id": doc_id,
|
|
1195
|
+
"path": doc_id,
|
|
1196
|
+
"repo": repo,
|
|
1197
|
+
"root_id": root_id,
|
|
1198
|
+
"rel_path": rel_path,
|
|
1199
|
+
"path_text": path_text,
|
|
1200
|
+
"body_text": body_text,
|
|
1201
|
+
"preview": preview,
|
|
1202
|
+
"mtime": int(mtime),
|
|
1203
|
+
"size": int(size),
|
|
1204
|
+
}
|
|
1205
|
+
|
|
1206
|
+
def _process_file_task(self, root: Path, file_path: Path, st: os.stat_result, scan_ts: int, now: float, excluded: bool, raise_on_error: bool = False) -> Optional[dict]:
|
|
1207
|
+
try:
|
|
1208
|
+
rel_to_root = str(file_path.relative_to(root))
|
|
1209
|
+
repo = rel_to_root.split(os.sep, 1)[0] if os.sep in rel_to_root else "__root__"
|
|
1210
|
+
db_path = self._encode_db_path(root, file_path)
|
|
1211
|
+
|
|
+            prev = self.db.get_file_meta(db_path)
+            if prev and int(st.st_mtime) == int(prev[0]) and int(st.st_size) == int(prev[1]):
+                if now - st.st_mtime > AI_SAFETY_NET_SECONDS:
+                    return {"type": "unchanged", "rel": db_path}
+
+            parse_limit, ast_limit = _resolve_size_limits()
+            exclude_parse = _env_flag("DECKARD_EXCLUDE_APPLIES_TO_PARSE", True)
+            exclude_ast = _env_flag("DECKARD_EXCLUDE_APPLIES_TO_AST", True)
+            sample_large = _env_flag("DECKARD_SAMPLE_LARGE_FILES", False)
+            decode_policy = (os.environ.get("DECKARD_UTF8_DECODE_POLICY") or "strong").strip().lower()
+
+            include_ext = {e.lower() for e in getattr(self.cfg, "include_ext", [])}
+            include_files = set(getattr(self.cfg, "include_files", []))
+            include_files_abs = {str(Path(p).expanduser().absolute()) for p in include_files if os.path.isabs(p)}
+            include_files_rel = {p for p in include_files if not os.path.isabs(p)}
+            include_all_ext = not include_ext and not include_files
+
+            parse_status = "none"
+            parse_reason = "none"
+            ast_status = "none"
+            ast_reason = "none"
+            is_binary = 0
+            is_minified = 0
+            sampled = 0
+            content = ""
+            content_bytes = 0
+            symbols: List[Tuple] = []
+            relations: List[Tuple] = []
+
+            size = int(getattr(st, "st_size", 0) or 0)
+            max_file_bytes = int(getattr(self.cfg, "max_file_bytes", 0) or 0)
+            too_large_meta = max_file_bytes > 0 and size > max_file_bytes
+            # Determine include eligibility for parse/ast
+            is_included = include_all_ext
+            if not is_included:
+                rel = str(file_path.absolute().relative_to(root))
+                is_included = (rel in include_files_rel) or (str(file_path.absolute()) in include_files_abs)
+            if not is_included and include_ext:
+                is_included = file_path.suffix.lower() in include_ext
+            if (include_files or include_ext) and not is_included:
+                return None
+
+            # Exclude rules for parse/ast
+            if excluded and exclude_parse:
+                parse_status, parse_reason = "skipped", "excluded"
+                ast_status, ast_reason = "skipped", "excluded"
+            elif too_large_meta:
+                parse_status, parse_reason = "skipped", "too_large"
+                ast_status, ast_reason = "skipped", "too_large"
+            else:
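+                # Sniff a sample of the file: NUL bytes or a low printable ratio mark it as binary.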
+                sample = _sample_file(file_path, size)
+                printable_ratio = _printable_ratio(sample, policy=decode_policy)
+                if printable_ratio < 0.80 or b"\x00" in sample:
+                    is_binary = 1
+                    parse_status, parse_reason = "skipped", "binary"
+                    ast_status, ast_reason = "skipped", "binary"
+                else:
+                    try:
+                        text_sample = sample.decode("utf-8") if decode_policy == "strong" else sample.decode("utf-8", errors="ignore")
+                    except UnicodeDecodeError:
+                        is_binary = 1
+                        parse_status, parse_reason = "skipped", "binary"
+                        ast_status, ast_reason = "skipped", "binary"
+                        text_sample = ""
+                    if not is_binary:
+                        if _is_minified(file_path, text_sample):
+                            is_minified = 1
+                            parse_status, parse_reason = "skipped", "minified"
+                            ast_status, ast_reason = "skipped", "minified"
+                        elif size > parse_limit:
+                            if sample_large:
+                                sampled = 1
+                                parse_status, parse_reason = "skipped", "sampled"
+                                ast_status, ast_reason = "skipped", "no_parse"
+                                try:
+                                    if decode_policy == "strong":
+                                        content = sample.decode("utf-8")
+                                    else:
+                                        content = sample.decode("utf-8", errors="ignore")
+                                except Exception:
+                                    content = ""
+                                content_bytes = len(content.encode("utf-8")) if content else 0
+                            else:
+                                parse_status, parse_reason = "skipped", "too_large"
+                                ast_status, ast_reason = "skipped", "no_parse"
+                        else:
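+                            # Within the parse size limit: read the whole file, decode it, and index its content.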
+                            raw = file_path.read_bytes()
+                            try:
+                                text = raw.decode("utf-8") if decode_policy == "strong" else raw.decode("utf-8", errors="ignore")
+                            except UnicodeDecodeError:
+                                is_binary = 1
+                                parse_status, parse_reason = "skipped", "binary"
+                                ast_status, ast_reason = "skipped", "binary"
+                                text = ""
+                            if not is_binary:
+                                parse_status, parse_reason = "ok", "none"
+                                # Storage cap
+                                exclude_bytes = getattr(self.cfg, "exclude_content_bytes", 104857600)
+                                if len(text) > exclude_bytes:
+                                    text = text[:exclude_bytes] + f"\n\n... [CONTENT TRUNCATED (File size: {len(text)} bytes, limit: {exclude_bytes})] ..."
+                                if getattr(self.cfg, "redact_enabled", True):
+                                    text = _redact(text)
+                                content = text
+                                content_bytes = len(content.encode("utf-8")) if content else 0
+                                if excluded and exclude_ast:
+                                    ast_status, ast_reason = "skipped", "excluded"
+                                elif size > ast_limit:
+                                    ast_status, ast_reason = "skipped", "too_large"
+                                else:
+                                    try:
+                                        symbols, relations = _extract_symbols_with_relations(db_path, content)
+                                        ast_status, ast_reason = "ok", "none"
+                                    except Exception:
+                                        ast_status, ast_reason = "error", "error"
+
+            return {
+                "type": "changed",
+                "rel": db_path,
+                "repo": repo,
+                "mtime": int(st.st_mtime),
+                "size": size,
+                "content": content,
+                "scan_ts": scan_ts,
+                "symbols": symbols,
+                "relations": relations,
+                "parse_status": parse_status,
+                "parse_reason": parse_reason,
+                "ast_status": ast_status,
+                "ast_reason": ast_reason,
+                "is_binary": is_binary,
+                "is_minified": is_minified,
+                "sampled": sampled,
+                "content_bytes": content_bytes,
+                "engine_doc": self._build_engine_doc(db_path, repo, rel_to_root, content, parse_status, int(st.st_mtime), size),
+            }
+        except Exception:
+            self.status.errors += 1
+            if raise_on_error:
+                raise
+            try:
+                return {"type": "unchanged", "rel": self._encode_db_path(root, file_path)}
+            except Exception:
+                return None
+
+    def _process_meta_file(self, path: Path, repo: str) -> None:
+        if path.name != "package.json":
+            return
+        try:
+            raw = path.read_text(encoding="utf-8", errors="ignore")
+            data = json.loads(raw)
+        except Exception:
+            return
+
+        description = ""
+        tags: list[str] = []
+        if isinstance(data, dict):
+            description = str(data.get("description", "") or "")
+            keywords = data.get("keywords", [])
+            if isinstance(keywords, list):
+                tags = [str(t) for t in keywords if t]
+            elif isinstance(keywords, str):
+                tags = [k.strip() for k in keywords.split(",") if k.strip()]
+
+        if not description and not tags:
+            return
+
+        tags_str = ",".join(tags)
+        self._enqueue_repo_meta(repo, tags_str, description)
+
+    def _iter_file_entries_stream(self, root: Path, apply_exclude: bool = True):
+        exclude_dirs = set(getattr(self.cfg, "exclude_dirs", []))
+        exclude_globs = list(getattr(self.cfg, "exclude_globs", []))
+
+        for dirpath, dirnames, filenames in os.walk(root):
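+            # Prune excluded directories in place so os.walk does not descend into them.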
+            if dirnames and apply_exclude:
+                kept = []
+                for d in dirnames:
+                    if d in exclude_dirs:
+                        continue
+                    rel_dir = str((Path(dirpath) / d).absolute().relative_to(root))
+                    if any(fnmatch.fnmatch(rel_dir, pat) or fnmatch.fnmatch(d, pat) for pat in exclude_dirs):
+                        continue
+                    kept.append(d)
+                dirnames[:] = kept
+            for fn in filenames:
+                p = Path(dirpath) / fn
+                try:
+                    rel = str(p.absolute().relative_to(root))
+                except Exception:
+                    continue
+                excluded = any(fnmatch.fnmatch(rel, pat) or fnmatch.fnmatch(fn, pat) for pat in exclude_globs)
+                if not excluded and exclude_dirs:
+                    rel_parts = rel.split(os.sep)
+                    for part in rel_parts:
+                        if part in exclude_dirs:
+                            excluded = True
+                            break
+                        if any(fnmatch.fnmatch(part, pat) for pat in exclude_dirs):
+                            excluded = True
+                            break
+                try:
+                    st = p.stat()
+                except Exception:
+                    self.status.errors += 1
+                    continue
+                if apply_exclude and excluded:
+                    continue
+                yield p, st, excluded
+
+    def _iter_file_entries(self, root: Path) -> List[Tuple[Path, os.stat_result]]:
+        return [(p, st) for p, st, _ in self._iter_file_entries_stream(root)]
+
+    def _iter_files(self, root: Path) -> List[Path]:
+        """Return candidate file paths (legacy tests expect Path objects)."""
+        return [p for p, _ in self._iter_file_entries(root)]
+
+    def _scan_once(self):
+        # Optional: purge legacy db paths (one-time)
+        if not self._legacy_purge_done:
+            flag = os.environ.get("DECKARD_PURGE_LEGACY_PATHS", "0").strip().lower()
+            if flag in ("1", "true", "yes", "on"):
+                try:
+                    purged = self.db.purge_legacy_paths()
+                    if self.logger:
+                        self.logger.log_info(f"purged_legacy_paths={purged}")
+                except Exception:
+                    if self.logger:
+                        self.logger.log_error("failed to purge legacy paths")
+            self._legacy_purge_done = True
+
+        # Iterate over all workspace roots
+        all_roots = [Path(os.path.expanduser(r)).absolute() for r in self.cfg.workspace_roots]
+        valid_roots = [r for r in all_roots if r.exists()]
+
+        now, scan_ts = time.time(), int(time.time())
+        self.status.last_scan_ts, self.status.scanned_files = now, 0
+
+        batch_files, batch_syms, batch_rels, unchanged = [], [], [], []
+
+        chunk_size = 100
+        chunk = []
+
+        exclude_meta = _env_flag("DECKARD_EXCLUDE_APPLIES_TO_META", True)
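+        # Stream file entries from each root and hand them to the worker pool in chunks.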
+        for root in valid_roots:
+            for entry in self._iter_file_entries_stream(root, apply_exclude=exclude_meta):
+                chunk.append(entry)
+                self.status.scanned_files += 1
+                if len(chunk) < chunk_size:
+                    continue
+                self._process_chunk(root, chunk, scan_ts, now, batch_files, batch_syms, batch_rels, unchanged)
+                chunk = []
+            if chunk:
+                self._process_chunk(root, chunk, scan_ts, now, batch_files, batch_syms, batch_rels, unchanged)
+                chunk = []
+
+        if batch_files or batch_syms or batch_rels:
+            self._enqueue_db_tasks(batch_files, batch_syms, batch_rels)
+            self.status.indexed_files += len(batch_files)
+        if unchanged:
+            self._enqueue_update_last_seen(unchanged)
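+        # Paths not seen during this scan are treated as deleted and queued for removal.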
+        try:
+            unseen_paths = self.db.get_unseen_paths(scan_ts)
+            for p in unseen_paths:
+                self._enqueue_delete_path(p)
+        except Exception:
+            self.status.errors += 1
+
+    def _process_chunk(self, root, chunk, scan_ts, now, batch_files, batch_syms, batch_rels, unchanged):
+        futures = [self._executor.submit(self._process_file_task, root, f, s, scan_ts, now, excluded) for f, s, excluded in chunk]
+
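+        # package.json files are also read here to refresh repo-level metadata (description, keywords).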
+        for f, s, _ in chunk:
+            if f.name == "package.json":
+                rel = str(f.relative_to(root))
+                repo = rel.split(os.sep, 1)[0] if os.sep in rel else "__root__"
+                self._process_meta_file(f, repo)
+
+        for future in concurrent.futures.as_completed(futures):
+            try:
+                res = future.result()
+            except Exception:
+                self.status.errors += 1
+                continue
+            if not res:
+                continue
+            if res["type"] == "unchanged":
+                unchanged.append(res["rel"])
+                if len(unchanged) >= 100:
+                    self._enqueue_update_last_seen(unchanged)
+                    unchanged.clear()
+                continue
+
+            batch_files.append(
+                (
+                    res["rel"],
+                    res["repo"],
+                    res["mtime"],
+                    res["size"],
+                    res["content"],
+                    res["parse_status"],
+                    res["parse_reason"],
+                    res["ast_status"],
+                    res["ast_reason"],
+                    int(res["is_binary"]),
+                    int(res["is_minified"]),
+                    int(res["sampled"]),
+                    int(res["content_bytes"]),
+                )
+            )
+            if res.get("symbols"):
+                batch_syms.extend(res["symbols"])
+            if res.get("relations"):
+                batch_rels.extend(res["relations"])
+
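+            # Flush accumulated rows to the DB queue every 50 files.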
+            if len(batch_files) >= 50:
+                self._enqueue_db_tasks(batch_files, batch_syms, batch_rels)
+                self.status.indexed_files += len(batch_files)
+                batch_files.clear()
+                batch_syms.clear()
+                batch_rels.clear()
+
+    def _process_watcher_event(self, evt: FsEvent):
+        try:
+            self._enqueue_fsevent(evt)
+        except Exception:
+            self.status.errors += 1