sari 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- app/__init__.py +1 -0
- app/config.py +240 -0
- app/db.py +932 -0
- app/dedup_queue.py +77 -0
- app/engine_registry.py +56 -0
- app/engine_runtime.py +472 -0
- app/http_server.py +204 -0
- app/indexer.py +1532 -0
- app/main.py +147 -0
- app/models.py +39 -0
- app/queue_pipeline.py +65 -0
- app/ranking.py +144 -0
- app/registry.py +172 -0
- app/search_engine.py +572 -0
- app/watcher.py +124 -0
- app/workspace.py +286 -0
- deckard/__init__.py +3 -0
- deckard/__main__.py +4 -0
- deckard/main.py +345 -0
- deckard/version.py +1 -0
- mcp/__init__.py +1 -0
- mcp/__main__.py +19 -0
- mcp/cli.py +485 -0
- mcp/daemon.py +149 -0
- mcp/proxy.py +304 -0
- mcp/registry.py +218 -0
- mcp/server.py +519 -0
- mcp/session.py +234 -0
- mcp/telemetry.py +112 -0
- mcp/test_cli.py +89 -0
- mcp/test_daemon.py +124 -0
- mcp/test_server.py +197 -0
- mcp/tools/__init__.py +14 -0
- mcp/tools/_util.py +244 -0
- mcp/tools/deckard_guide.py +32 -0
- mcp/tools/doctor.py +208 -0
- mcp/tools/get_callers.py +60 -0
- mcp/tools/get_implementations.py +60 -0
- mcp/tools/index_file.py +75 -0
- mcp/tools/list_files.py +138 -0
- mcp/tools/read_file.py +48 -0
- mcp/tools/read_symbol.py +99 -0
- mcp/tools/registry.py +212 -0
- mcp/tools/repo_candidates.py +89 -0
- mcp/tools/rescan.py +46 -0
- mcp/tools/scan_once.py +54 -0
- mcp/tools/search.py +208 -0
- mcp/tools/search_api_endpoints.py +72 -0
- mcp/tools/search_symbols.py +63 -0
- mcp/tools/status.py +135 -0
- sari/__init__.py +1 -0
- sari/__main__.py +4 -0
- sari-0.0.1.dist-info/METADATA +521 -0
- sari-0.0.1.dist-info/RECORD +58 -0
- sari-0.0.1.dist-info/WHEEL +5 -0
- sari-0.0.1.dist-info/entry_points.txt +2 -0
- sari-0.0.1.dist-info/licenses/LICENSE +21 -0
- sari-0.0.1.dist-info/top_level.txt +4 -0
app/search_engine.py
ADDED
@@ -0,0 +1,572 @@
import sqlite3
import re
import time
import unicodedata
from pathlib import Path
from typing import List, Tuple, Optional, Any, Dict

from .models import SearchHit, SearchOptions
from .ranking import (
    extract_terms, count_matches, calculate_recency_score,
    snippet_around, get_file_extension, glob_to_like
)

class SearchEngine:
    def __init__(self, db):
        self.db = db

    def search_v2(self, opts: SearchOptions) -> Tuple[List[SearchHit], Dict[str, Any]]:
        """Enhanced search with Hybrid (Symbol + FTS) strategy."""
        q = (opts.query or "").strip()
        q = unicodedata.normalize("NFKC", q).lower()
        q = " ".join(q.split())
        if not q:
            return [], {"fallback_used": False, "total_scanned": 0, "total": 0}

        terms = extract_terms(q)
        meta: Dict[str, Any] = {"fallback_used": False, "total_scanned": 0}

        # Regex mode bypasses hybrid logic
        if opts.use_regex:
            return self._search_regex(opts, terms, meta)

        # 1. Symbol Search (Priority Layer)
        symbol_hits_data = []
        if opts.total_mode != "approx":
            symbol_hits_data = self.db.search_symbols(q, repo=opts.repo, limit=50, root_ids=list(opts.root_ids or []))

        # Convert symbol hits to SearchHit objects
        symbol_hits = []
        for s in symbol_hits_data:
            hit = SearchHit(
                repo=s["repo"],
                path=s["path"],
                score=1000.0,  # Massive starting score for symbol match
                snippet=s["snippet"],
                mtime=s["mtime"],
                size=s["size"],
                match_count=1,
                file_type=get_file_extension(s["path"]),
                hit_reason=f"Symbol: {s['kind']} {s['name']}",
                context_symbol=f"{s['kind']}: {s['name']}",
                docstring=s.get("docstring", ""),
                metadata=s.get("metadata", "{}")
            )
            # Recency boost if enabled
            if opts.recency_boost:
                hit.score = calculate_recency_score(hit.mtime, hit.score)
            symbol_hits.append(hit)

        # 2. FTS Search
        fts_hits = []
        # v2.7.0: Allow unicode in FTS, but fallback if non-ASCII character present
        # as FTS tokenizers often skip emojis and special symbols.
        has_unicode = any(ord(c) > 127 for c in q)
        is_too_short = len(q) < 3

        use_fts = self.db.fts_enabled and not is_too_short and not has_unicode
        fts_success = False

        if use_fts:
            try:
                res = self.db._search_fts(opts, terms, meta, no_slice=True)
                if res:
                    fts_hits, fts_meta = res
                    meta.update(fts_meta)
                    fts_success = True
            except sqlite3.OperationalError:
                pass

        if not fts_success:
            # Fallback to LIKE
            res, like_meta = self.db._search_like(opts, terms, meta, no_slice=True)
            fts_hits = res
            meta.update(like_meta)
            meta["fallback_used"] = True
        elif not fts_hits and terms:
            # v2.7.5: Force fallback if FTS results are suspiciously empty for non-trivial query
            res, like_meta = self.db._search_like(opts, terms, meta, no_slice=True)
            fts_hits = res
            meta.update(like_meta)
            meta["fallback_used"] = True

        # 3. Merge Strategies
        merged_map: Dict[str, SearchHit] = {}
        for h in fts_hits:
            merged_map[h.path] = h

        for sh in symbol_hits:
            if sh.path in merged_map:
                existing = merged_map[sh.path]
                existing.score += 1200.0
                existing.hit_reason = f"{sh.hit_reason}, {existing.hit_reason}"
                if sh.snippet.strip() not in existing.snippet:
                    existing.snippet = f"{sh.snippet}\n...\n{existing.snippet}"
                if sh.docstring:
                    existing.docstring = sh.docstring
                if sh.metadata and sh.metadata != "{}":
                    existing.metadata = sh.metadata
            else:
                merged_map[sh.path] = sh

        final_hits = list(merged_map.values())
        final_hits.sort(key=lambda h: (-h.score, -h.mtime, h.path))

        start = int(opts.offset)
        end = start + int(opts.limit)

        # Adjust Total Count
        if opts.total_mode == "approx":
            meta["total"] = -1
        elif meta.get("total", 0) > 0:
            meta["total"] = max(meta["total"], len(final_hits))
        else:
            meta["total"] = len(final_hits)

        return final_hits[start:end], meta

    def _search_like(self, opts: SearchOptions, terms: List[str],
                     meta: Dict[str, Any], no_slice: bool = False) -> Tuple[List[SearchHit], Dict[str, Any]]:
        meta["fallback_used"] = True
        like_q = opts.query.replace("^", "^^").replace("%", "^%").replace("_", "^_")
        # v2.7.0: Use files_view (fv) instead of files (f) for content match as it's decompressed
        where_clauses = ["(fv.content LIKE ? ESCAPE '^' OR f.path LIKE ? ESCAPE '^' OR f.repo LIKE ? ESCAPE '^')"]
        params: List[Any] = [f"%{like_q}%", f"%{like_q}%", f"%{like_q}%"]

        filter_clauses, filter_params = self._build_filter_clauses(opts)
        where_clauses.extend(filter_clauses)
        params.extend(filter_params)

        where = " AND ".join(where_clauses)
        fetch_limit = (opts.offset + opts.limit) * 2
        if fetch_limit < 100: fetch_limit = 100

        sql = f"""
            SELECT f.repo AS repo,
                   f.path AS path,
                   f.mtime AS mtime,
                   f.size AS size,
                   1.0 AS score,
                   fv.content AS content
            FROM files f
            JOIN files_view fv ON f.rowid = fv.rowid
            WHERE {where}
            ORDER BY {"f.mtime DESC" if opts.recency_boost else "f.path"}, f.path ASC
            LIMIT ?;
        """
        params.append(int(fetch_limit))

        with self.db._read_lock:
            if opts.total_mode == "exact":
                count_sql = f"SELECT COUNT(*) as c FROM files f JOIN files_view fv ON f.rowid = fv.rowid WHERE {where}"
                count_row = self.db._read.execute(count_sql, params[:-1]).fetchone()
                meta["total"] = int(count_row["c"]) if count_row else 0
            else:
                meta["total"] = -1

            rows = self.db._read.execute(sql, params).fetchall()

        meta["total_mode"] = opts.total_mode
        hits = self._process_rows(rows, opts, terms)
        meta["total_scanned"] = len(rows)

        if no_slice:
            return hits, meta

        start = opts.offset
        end = opts.offset + opts.limit
        return hits[start:end], meta

    def _search_fts(self, opts: SearchOptions, terms: List[str],
                    meta: Dict[str, Any], no_slice: bool = False) -> Optional[Tuple[List[SearchHit], Dict[str, Any]]]:
        # v2.7.0: Safe FTS query escaping
        # Wrap terms in double quotes and escape existing quotes to prevent FTS5 syntax errors
        safe_terms = []
        for t in terms:
            clean_t = t.replace('"', '""')
            if clean_t:
                safe_terms.append(f'"{clean_t}"')

        fts_query = " ".join(safe_terms)
        if not fts_query:
            return [], meta

        where_clauses = ["files_fts MATCH ?"]
        params: List[Any] = [fts_query]

        filter_clauses, filter_params = self._build_filter_clauses(opts)
        where_clauses.extend(filter_clauses)
        params.extend(filter_params)

        where = " AND ".join(where_clauses)
        total_hits = 0
        if opts.total_mode == "exact":
            try:
                count_sql = f"SELECT COUNT(*) as c FROM files_fts JOIN files f ON f.rowid = files_fts.rowid WHERE {where}"
                with self.db._read_lock:
                    count_row = self.db._read.execute(count_sql, params).fetchone()
                total_hits = int(count_row["c"]) if count_row else 0
            except sqlite3.OperationalError:
                return None
        else:
            total_hits = -1

        meta["total"] = total_hits
        meta["total_mode"] = opts.total_mode
        fetch_limit = 50

        path_prior_sql = """
            CASE
                WHEN f.path LIKE 'src/%' OR f.path LIKE '%/src/%' OR f.path LIKE 'app/%' OR f.path LIKE '%/app/%' OR f.path LIKE 'core/%' OR f.path LIKE '%/core/%' THEN 0.6
                WHEN f.path LIKE 'config/%' OR f.path LIKE '%/config/%' OR f.path LIKE 'domain/%' OR f.path LIKE '%/domain/%' OR f.path LIKE 'service/%' OR f.path LIKE '%/service/%' THEN 0.4
                WHEN f.path LIKE 'test/%' OR f.path LIKE '%/test/%' OR f.path LIKE 'tests/%' OR f.path LIKE '%/tests/%' OR f.path LIKE 'example/%' OR f.path LIKE '%/example/%' OR f.path LIKE 'dist/%' OR f.path LIKE '%/dist/%' OR f.path LIKE 'build/%' OR f.path LIKE '%/build/%' THEN -0.7
                ELSE 0.0
            END
        """

        filetype_prior_sql = """
            CASE
                WHEN f.path LIKE '%.py' OR f.path LIKE '%.ts' OR f.path LIKE '%.go' OR f.path LIKE '%.java' OR f.path LIKE '%.kt' THEN 0.3
                WHEN f.path LIKE '%.yaml' OR f.path LIKE '%.yml' OR f.path LIKE '%.json' THEN 0.15
                WHEN f.path LIKE '%.lock' OR f.path LIKE '%.min.js' OR f.path LIKE '%.map' THEN -0.8
                ELSE 0.0
            END
        """

        sql = f"""
            SELECT f.repo AS repo,
                   f.path AS path,
                   f.mtime AS mtime,
                   f.size AS size,
                   ( -1.0 * bm25(files_fts) + {path_prior_sql} + {filetype_prior_sql} ) AS score,
                   f.content AS content
            FROM files_fts
            JOIN files f ON f.rowid = files_fts.rowid
            WHERE {where}
            ORDER BY score DESC
            LIMIT ?;
        """
        params.append(int(fetch_limit))

        with self.db._read_lock:
            rows = self.db._read.execute(sql, params).fetchall()

        hits = self._process_rows(rows, opts, terms, is_rerank=True)
        meta["total_scanned"] = len(rows)

        if no_slice:
            return hits, meta

        start = opts.offset
        end = opts.offset + opts.limit
        return hits[start:end], meta

    def _search_regex(self, opts: SearchOptions, terms: List[str],
                      meta: Dict[str, Any]) -> Tuple[List[SearchHit], Dict[str, Any]]:
        meta["regex_mode"] = True
        flags = 0 if opts.case_sensitive else re.IGNORECASE
        try:
            pattern = re.compile(opts.query, flags)
        except re.error as e:
            meta["regex_error"] = str(e)
            return [], meta

        where_clauses = ["1=1"]
        params: List[Any] = []
        if opts.repo:
            where_clauses.append("f.repo = ?")
            params.append(opts.repo)

        filter_clauses, filter_params = self._build_filter_clauses(opts)
        where_clauses.extend(filter_clauses)
        params.extend(filter_params)

        where = " AND ".join(where_clauses)

        sql = f"""
            SELECT f.repo AS repo,
                   f.path AS path,
                   f.mtime AS mtime,
                   f.size AS size,
                   fv.content AS content
            FROM files f
            JOIN files_view fv ON f.rowid = fv.rowid
            WHERE {where}
            ORDER BY {"f.mtime DESC" if opts.recency_boost else "f.path"}
            LIMIT 5000;
        """
        with self.db._read_lock:
            rows = self.db._read.execute(sql, params).fetchall()
        meta["total_scanned"] = len(rows)

        # No more manual _decompress(r["content"]) needed here as it comes from fv.content
        hits: List[SearchHit] = []
        for r in rows:
            path = r["path"]
            content = r["content"] or ""

            if not self._matches_file_types(path, opts.file_types): continue
            if not self._matches_path_pattern(path, opts.path_pattern): continue
            if self._matches_exclude_patterns(path, opts.exclude_patterns): continue

            matches = pattern.findall(content)
            if not matches: continue

            match_count = len(matches)
            score = float(match_count)
            if opts.recency_boost:
                score = calculate_recency_score(int(r["mtime"]), score)

            snippet = snippet_around(content, [opts.query], opts.snippet_lines, highlight=True)
            hits.append(SearchHit(
                repo=r["repo"], path=path, score=score, snippet=snippet,
                mtime=int(r["mtime"]), size=int(r["size"]), match_count=match_count,
                file_type=get_file_extension(path)
            ))

        hits.sort(key=lambda h: (-h.score, -h.mtime, h.path))
        meta["total"] = len(hits)
        meta["total_mode"] = "approx"
        start = opts.offset
        end = opts.offset + opts.limit
        return hits[start:end], meta

    def _process_rows(self, rows: list, opts: SearchOptions,
                      terms: List[str], is_rerank: bool = False) -> List[SearchHit]:
        hits: List[SearchHit] = []
        all_meta = self.db.get_all_repo_meta()
        query_terms = [t.lower() for t in terms]
        query_raw_lower = opts.query.lower()

        # v2.7.0: Local import of _decompress is no longer strictly needed if content comes from VIEW,
        # but let's keep it as a fallback in case raw rows are passed.
        from .db import _decompress

        def_patterns = []
        for term in query_terms:
            if len(term) < 3: continue
            p = re.compile(rf"(class|def|function|struct|pub\s+fn|async\s+def|interface|type)\s+{re.escape(term)}\b", re.IGNORECASE)
            def_patterns.append(p)

        for r in rows:
            path = r["path"]
            repo_name = r["repo"]
            # Try to use 'content' as is (from view), fallback to decompress if it's BLOB
            content = r["content"]
            if isinstance(content, (bytes, bytearray)):
                content = _decompress(content)
            elif content is None:
                content = ""

            mtime = int(r["mtime"])
            size = int(r["size"])

            if not self._matches_file_types(path, opts.file_types): continue
            if not self._matches_path_pattern(path, opts.path_pattern): continue
            if self._matches_exclude_patterns(path, opts.exclude_patterns): continue

            score = float(r["score"]) if r["score"] is not None else 0.0
            reasons = []
            path_lower = path.lower()
            filename = path_lower.split("/")[-1]
            file_stem = Path(filename).stem.lower()

            if filename == query_raw_lower or file_stem == query_raw_lower:
                score += 2.0
                reasons.append("Exact filename match")
            elif query_raw_lower in file_stem:
                score += 1.2
                reasons.append("Filename stem match")
            elif path_lower.endswith(query_raw_lower):
                score += 1.0
                reasons.append("Path suffix match")

            for pat in def_patterns:
                if pat.search(content):
                    score += 1.5
                    reasons.append("Definition found")
                    break

            if len(query_terms) > 1:
                content_lower = content.lower()
                term_indices = []
                all_found = True
                for t in query_terms:
                    idx = content_lower.find(t)
                    if idx == -1:
                        all_found = False
                        break
                    term_indices.append(idx)
                if all_found:
                    span = max(term_indices) - min(term_indices)
                    if span < 100:
                        score += 0.5
                        reasons.append("Proximity boost")

            meta_obj = all_meta.get(repo_name)
            if meta_obj:
                if meta_obj["priority"] > 0:
                    score += meta_obj["priority"]
                    reasons.append("High priority")
                tags = meta_obj["tags"].lower().split(",")
                domain = meta_obj["domain"].lower()
                for term in query_terms:
                    if term in tags or term == domain:
                        score += 0.5
                        reasons.append(f"Tag match ({term})")
                        break

            if any(p in path_lower for p in [".codex/", "agents.md", "gemini.md", "readme.md"]):
                score += 0.2
                reasons.append("Core file")

            if opts.recency_boost:
                score = calculate_recency_score(mtime, score)

            match_count = count_matches(content, opts.query, False, opts.case_sensitive)
            if opts.case_sensitive and match_count == 0: continue

            # v2.7.0: Debugging fallback logic - if no matches found via count_matches, log why
            if match_count == 0 and not opts.case_sensitive:
                # We expect non-case-sensitive to find things if LIKE found them
                pass

            snippet = snippet_around(content, terms, opts.snippet_lines, highlight=True)
            context_symbol = ""
            first_line_match = re.search(r"L(\d+):", snippet)
            if first_line_match:
                start_line = int(first_line_match.group(1))
                ctx = self.db._get_enclosing_symbol(path, start_line)
                if ctx:
                    context_symbol = ctx
                    score += 0.2

            hits.append(SearchHit(
                repo=repo_name, path=path, score=round(score, 3), snippet=snippet,
                mtime=mtime, size=size, match_count=match_count,
                file_type=get_file_extension(path),
                hit_reason=", ".join(reasons) if reasons else "Content match",
                context_symbol=context_symbol
            ))

        hits.sort(key=lambda h: (-h.score, -h.mtime, h.path))
        return hits

    def repo_candidates(self, q: str, limit: int = 3, root_ids: Optional[List[str]] = None) -> List[Dict[str, Any]]:
        q = (q or "").strip()
        if not q: return []
        limit = max(1, min(int(limit), 5))

        if self.db.fts_enabled:
            sql = """
                SELECT f.repo AS repo, COUNT(1) AS c
                FROM files_fts JOIN files f ON f.rowid = files_fts.rowid
                WHERE files_fts MATCH ? GROUP BY f.repo ORDER BY c DESC LIMIT ?;
            """
            try:
                with self.db._read_lock:
                    rows = self.db._read.execute(sql, (q, limit)).fetchall()
                out: List[Dict[str, Any]] = []
                for r in rows:
                    repo = str(r["repo"])
                    c = int(r["c"])
                    hits, _ = self.search_v2(SearchOptions(query=q, repo=repo, limit=1, root_ids=list(root_ids or [])))
                    evidence = hits[0].snippet.replace("\n", " ")[:200] if hits else ""
                    out.append({"repo": repo, "score": c, "evidence": evidence})
                return out
            except sqlite3.OperationalError: pass

        like_q = q.replace("^", "^^").replace("%", "^%").replace("_", "^_")
        sql = "SELECT repo, COUNT(1) AS c FROM files WHERE content LIKE ? ESCAPE '^' GROUP BY repo ORDER BY c DESC LIMIT ?;"
        with self.db._read_lock:
            rows = self.db._read.execute(sql, (f"%{like_q}%", limit)).fetchall()
        out = []
        for r in rows:
            repo, c = str(r["repo"]), int(r["c"])
            hits, _ = self.search_v2(SearchOptions(query=q, repo=repo, limit=1, root_ids=list(root_ids or [])))
            evidence = hits[0].snippet.replace("\n", " ")[:200] if hits else ""
            out.append({"repo": repo, "score": c, "evidence": evidence})
        return out

    def _build_filter_clauses(self, opts: SearchOptions) -> Tuple[List[str], List[Any]]:
        clauses, params = [], []
        if opts.root_ids:
            root_clauses = []
            for rid in opts.root_ids:
                root_clauses.append("f.path LIKE ?")
                params.append(f"{rid}/%")
            if root_clauses:
                clauses.append("(" + " OR ".join(root_clauses) + ")")
        if opts.repo:
            clauses.append("f.repo = ?")
            params.append(opts.repo)
        if opts.file_types:
            type_clauses = []
            for ft in opts.file_types:
                ext = ft.lower().lstrip(".")
                type_clauses.append("f.path LIKE ?")
                params.append(f"%.{ext}")
            if type_clauses: clauses.append("(" + " OR ".join(type_clauses) + ")")
        if opts.path_pattern:
            clauses.append("f.path LIKE ?")
            params.append(glob_to_like(opts.path_pattern))
        return clauses, params

    def _matches_file_types(self, path: str, file_types: List[str]) -> bool:
        if not file_types: return True
        return get_file_extension(path) in [ft.lower().lstrip('.') for ft in file_types]

    def _matches_path_pattern(self, path: str, pattern: Optional[str]) -> bool:
        if not pattern: return True
        import fnmatch

        # Normalize slashes for consistency
        path = path.replace("\\", "/")
        pattern = pattern.replace("\\", "/")

        # If pattern is absolute, match exactly or prefix
        if pattern.startswith("/"):
            if path.startswith(pattern): return True
            return fnmatch.fnmatch(path, pattern)

        # Relative pattern: match end of path or segment
        # e.g. "src/main.py" should match "/users/.../src/main.py"

        if path.endswith("/" + pattern): return True
        if path == pattern: return True

        # Check glob
        if fnmatch.fnmatch(path, pattern): return True
        if fnmatch.fnmatch(path, f"*/{pattern}"): return True
        if fnmatch.fnmatch(path, f"*/{pattern}/*"): return True

        # Fallback to existing loose match
        return (fnmatch.fnmatch(path, f"**/{pattern}") or
                fnmatch.fnmatch(path, f"{pattern}*"))

    def _matches_exclude_patterns(self, path: str, patterns: List[str]) -> bool:
        if not patterns: return False
        import fnmatch
        for p in patterns:
            if p in path or fnmatch.fnmatch(path, f"*{p}*"): return True
        return False


class SqliteSearchEngineAdapter:
    """Adapter for the legacy SQLite-backed SearchEngine implementation."""

    def __init__(self, db):
        self._impl = SearchEngine(db)

    def search_v2(self, opts: SearchOptions):
        return self._impl.search_v2(opts)

    def repo_candidates(self, q: str, limit: int = 3, root_ids: Optional[List[str]] = None):
        return self._impl.repo_candidates(q, limit, root_ids=root_ids)

    def _search_like(self, opts: SearchOptions, terms: List[str], meta: Dict[str, Any], no_slice: bool = False):
        return self._impl._search_like(opts, terms, meta, no_slice=no_slice)

    def _search_fts(self, opts: SearchOptions, terms: List[str], meta: Dict[str, Any], no_slice: bool = False):
        return self._impl._search_fts(opts, terms, meta, no_slice=no_slice)
app/watcher.py
ADDED
@@ -0,0 +1,124 @@

import os
import time
import threading
from dataclasses import dataclass
from typing import Callable, Dict, List, Optional
from threading import Timer

try:
    from watchdog.observers import Observer
    from watchdog.events import FileSystemEventHandler
    HAS_WATCHDOG = True
except ImportError:
    HAS_WATCHDOG = False
    # Dummy classes for safe definition
    class FileSystemEventHandler: pass
    class Observer: pass

try:
    from .queue_pipeline import FsEvent, FsEventKind
except Exception:
    from queue_pipeline import FsEvent, FsEventKind

class DebouncedEventHandler(FileSystemEventHandler):
    """Handles events with debounce to prevent duplicate indexing on save."""
    def __init__(self, callback: Callable[[FsEvent], None], debounce_seconds: float = 1.0, logger=None):
        self.callback = callback
        self.debounce_seconds = debounce_seconds
        self.logger = logger
        self._timers = {}
        self._lock = threading.Lock()
        self._pending_events: Dict[str, FsEvent] = {}

    def on_any_event(self, event):
        if event.is_directory:
            return

        # We care about Created, Modified, Moved, Deleted
        # watchdog event types: 'created', 'deleted', 'modified', 'moved'

        evt_kind = None
        if event.event_type == 'created':
            evt_kind = FsEventKind.CREATED
        elif event.event_type == 'modified':
            evt_kind = FsEventKind.MODIFIED
        elif event.event_type == 'deleted':
            evt_kind = FsEventKind.DELETED
        elif event.event_type == 'moved':
            evt_kind = FsEventKind.MOVED

        if not evt_kind:
            return

        key = event.src_path
        fs_event = FsEvent(kind=evt_kind, path=event.src_path,
                           dest_path=getattr(event, 'dest_path', None),
                           ts=time.time())

        with self._lock:
            if key in self._timers:
                self._timers[key].cancel()
            self._pending_events[key] = fs_event
            t = Timer(self.debounce_seconds, self._trigger, args=[key])
            self._timers[key] = t
            t.start()

    def _trigger(self, path: str):
        with self._lock:
            if path in self._timers:
                del self._timers[path]
            fs_event = self._pending_events.pop(path, None)
        if not fs_event:
            return
        try:
            self.callback(fs_event)
        except Exception as e:
            if self.logger:
                self.logger.log_error(f"Watcher callback failed for {path}: {e}")

class FileWatcher:
    def __init__(self, paths: List[str], on_change_callback: Callable[[FsEvent], None], logger=None):
        self.paths = paths
        self.callback = on_change_callback
        self.logger = logger
        self.observer = None
        self._running = False

    def start(self):
        if not HAS_WATCHDOG:
            if self.logger:
                self.logger.log_info("Watchdog not installed. Skipping real-time monitoring.")
            return

        if self._running:
            return

        self.observer = Observer()
        handler = DebouncedEventHandler(self.callback, logger=self.logger)

        started_any = False
        for p in self.paths:
            if os.path.exists(p):
                try:
                    self.observer.schedule(handler, p, recursive=True)
                    started_any = True
                except Exception as e:
                    if self.logger:
                        self.logger.log_error(f"Failed to watch path {p}: {e}")

        if started_any:
            try:
                self.observer.start()
                self._running = True
                if self.logger:
                    self.logger.log_info(f"Watcher started on: {self.paths}")
            except Exception as e:
                if self.logger:
                    self.logger.log_error(f"Failed to start observer: {e}")

    def stop(self):
        if self.observer and self._running:
            self.observer.stop()
            self.observer.join()
            self._running = False
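And a matching sketch for the watcher (again an editor's illustration, not package code). It assumes the optional watchdog dependency is installed and that FsEvent, defined in app/queue_pipeline.py (not shown here), carries at least the kind and path fields used above; the demo directory path is arbitrary.

# Editor's sketch -- not from the sari package. If watchdog is missing,
# FileWatcher.start() logs a message and returns without monitoring.
import time
from app.watcher import FileWatcher

def on_change(event):
    # The debounce timer resets on every event for a path, so this fires once,
    # debounce_seconds (default 1.0 s) after the last event in a save burst.
    print(f"{event.kind}: {event.path}")

watcher = FileWatcher(paths=["/tmp/demo-repo"], on_change_callback=on_change)
watcher.start()
try:
    time.sleep(30)   # touch files under /tmp/demo-repo to see debounced events
finally:
    watcher.stop()   # stops and joins the observer thread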