loki-mode 7.21.0 → 7.23.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,451 @@
1
+ #!/opt/homebrew/bin/python3.12
2
+ """
3
+ Loki Mode Hybrid Codebase Search
4
+
5
+ Combines lexical (ripgrep / grep) and semantic (ChromaDB) retrieval over the
6
+ same files the indexer covers, then fuses the two ranked lists with reciprocal
7
+ rank fusion (RRF). Results are deduped by file:line and truncated to a token
8
+ budget (greedy, highest fused score first).
9
+
10
+ Design notes:
11
+ - Pure logic (RRF, dedup, budget truncation) is separated from I/O so it can
12
+ be unit tested without a live ChromaDB or ripgrep.
13
+ - chromadb is imported lazily inside the semantic path so this module loads
14
+ and the grep-only fallback works even when chromadb is not installed.
15
+ - When ChromaDB / docker is unreachable, search degrades to grep-only so it
16
+ still returns results instead of erroring.
17
+
18
+ Usage:
19
+ python tools/hybrid_search.py "rate limit detection"
20
+ python tools/hybrid_search.py "council vote" --grep-only
21
+ python tools/hybrid_search.py "memory retrieval" --semantic-only
22
+ python tools/hybrid_search.py "build prompt" --budget 4000 --top 15
23
+ """
24
+
25
+ import argparse
26
+ import importlib.util
27
+ import json
28
+ import os
29
+ import shutil
30
+ import subprocess
31
+ import sys
32
+ from pathlib import Path
33
+
34
# Project root (tools/hybrid_search.py -> repo root).
PROJECT_ROOT = Path(__file__).parent.parent.resolve()

# ChromaDB connection settings; each one can be overridden via the environment.
CHROMA_HOST = os.environ.get("LOKI_CHROMA_HOST", "localhost")
try:
    CHROMA_PORT = int(os.environ.get("LOKI_CHROMA_PORT", "8100"))
except ValueError:
    # A malformed port must not prevent the module from importing: the
    # grep-only path never talks to ChromaDB, so use the default port instead
    # of raising at import time.
    CHROMA_PORT = 8100
COLLECTION_NAME = os.environ.get("LOKI_CHROMA_COLLECTION", "loki-codebase")

# Conservative default token budget for the merged result set. Embeddings and
# large dumps cost context, so we keep this small and let callers override.
DEFAULT_TOKEN_BUDGET = 3000
# Constant k in the reciprocal rank fusion term 1 / (k + rank).
RRF_K = 60
45
+
46
+
47
+ # -----------------------------------------------------------------------------
48
+ # Token estimation (reuse memory.token_economics; fall back if unavailable)
49
+ # -----------------------------------------------------------------------------
50
+
51
def _load_estimate_tokens():
    """Pick the token estimator used for budget accounting.

    Puts the project root on ``sys.path`` (if it is not there yet) and tries
    to import ``estimate_tokens`` from ``memory.token_economics``. If anything
    in that attempt raises, a local heuristic is returned instead: 0 tokens
    for empty text, otherwise one token per four characters with a minimum
    of 1.
    """
    def _approximate(text: str) -> int:
        # Rough character-count heuristic used when the shared estimator
        # cannot be loaded.
        return max(1, len(text) // 4) if text else 0

    try:
        root = str(PROJECT_ROOT)
        if root not in sys.path:
            sys.path.insert(0, root)
        from memory.token_economics import estimate_tokens as shared_estimator
    except Exception:
        return _approximate
    return shared_estimator


# Resolved once at import time; the rest of the module calls this name.
estimate_tokens = _load_estimate_tokens()
66
+
67
+
68
+ # -----------------------------------------------------------------------------
69
+ # Pure logic: reciprocal rank fusion, dedup, budget truncation
70
+ # -----------------------------------------------------------------------------
71
+
72
def _result_key(item: dict) -> str:
    """Build the dedup key for a result: its ``file:line`` location.

    A missing "file" is rendered as an empty string and a missing "line" as 0.
    """
    path = item.get("file", "")
    line_no = item.get("line", 0)
    return "{}:{}".format(path, line_no)
75
+
76
+
77
def reciprocal_rank_fusion(grep_ranked: list, semantic_ranked: list,
                           k: int = RRF_K) -> list:
    """Merge a lexical and a semantic ranking with reciprocal rank fusion.

    Every appearance of a location in a list contributes 1 / (k + rank) to
    that location's score, where rank is the 1-based position in the list.
    Locations are identified by file:line, so a location present in both
    lists receives both contributions and ends up ranked higher.

    Args:
        grep_ranked: lexical results, best first. Each is a dict with at
            least "file" and "line"; other keys ("snippet", "name", ...) are
            carried over from the first occurrence.
        semantic_ranked: semantic results, best first, same shape.
        k: RRF constant (defaults to RRF_K).

    Returns:
        A list of merged dicts sorted by descending fused score, ties broken
        by (file, line). Each dict carries "_fused_score" and "_sources"
        (sorted list of the retrievers that found it).
    """
    by_location: dict = {}

    # The lexical list is absorbed first, so for a shared location its fields
    # form the base record.
    for label, ranking in (("grep", grep_ranked), ("semantic", semantic_ranked)):
        for position, hit in enumerate(ranking, start=1):
            location = _result_key(hit)
            share = 1.0 / (k + position)
            record = by_location.get(location)
            if record is None:
                record = dict(hit)
                record["_fused_score"] = 0.0
                record["_sources"] = set()
                by_location[location] = record
            record["_fused_score"] += share
            record["_sources"].add(label)
            # Fill in a snippet only when the record still lacks one.
            if hit.get("snippet") and not record.get("snippet"):
                record["snippet"] = hit["snippet"]

    fused = []
    for record in by_location.values():
        record["_sources"] = sorted(record["_sources"])
        fused.append(record)

    def _order(record: dict) -> tuple:
        # Highest fused score first, then file, then line, for determinism.
        return (-record["_fused_score"],
                str(record.get("file", "")),
                int(record.get("line", 0)))

    return sorted(fused, key=_order)
125
+
126
+
127
def truncate_to_budget(results: list, budget: int) -> list:
    """Select results, in input order, whose combined cost fits the budget.

    A result's cost is the estimated token count of its ``file:line`` header
    plus that of its snippet. A result that would push the running total over
    the budget is skipped and later (smaller) results are still considered, so
    the total never exceeds the budget. Callers are expected to pass results
    already ordered by priority.

    Returns a new list containing the kept results; a budget of zero or less
    yields an empty list.
    """
    selected: list = []
    if budget <= 0:
        return selected

    spent = 0
    for hit in results:
        location = "{}:{} ".format(hit.get("file", ""), hit.get("line", 0))
        body = hit.get("snippet", "") or ""
        cost = estimate_tokens(location) + estimate_tokens(body)
        if spent + cost <= budget:
            selected.append(hit)
            spent += cost
        # Otherwise skip this hit; a smaller later one may still fit.
    return selected
153
+
154
+
155
+ # -----------------------------------------------------------------------------
156
+ # I/O: file scope, lexical search, semantic search
157
+ # -----------------------------------------------------------------------------
158
+
159
def _indexer_files() -> list:
    """List the files to search, as path strings.

    First choice is the indexer's own ``collect_files()``, loaded from
    ``tools/index-codebase.py`` by file location (the file name is not a valid
    module name, so it cannot be imported normally). If loading or running the
    indexer raises for any reason, fall back to globbing a fixed set of code
    directories for ``*.sh`` / ``*.py`` / ``*.md`` plus a few top-level files,
    which keeps grep-only search working.
    """
    try:
        import importlib.util
        tools_dir = PROJECT_ROOT / "tools"
        if str(tools_dir) not in sys.path:
            sys.path.insert(0, str(tools_dir))
        indexer_spec = importlib.util.spec_from_file_location(
            "loki_index_codebase", str(tools_dir / "index-codebase.py"))
        indexer = importlib.util.module_from_spec(indexer_spec)
        indexer_spec.loader.exec_module(indexer)
        return [str(path) for path, _unused in indexer.collect_files()]
    except Exception:
        # Fallback: scan common code dirs without importing the indexer.
        found = set()
        code_dirs = ("autonomy", "providers", "memory", "dashboard", "mcp",
                     "swarm", "learning", "events", "state", "skills", "tests")
        patterns = ("*.sh", "*.py", "*.md")
        for dir_name in code_dirs:
            directory = PROJECT_ROOT / dir_name
            if directory.is_dir():
                for pattern in patterns:
                    found.update(str(match) for match in directory.rglob(pattern))
        for extra in ("SKILL.md", "CLAUDE.md", "autonomy/loki"):
            candidate = PROJECT_ROOT / extra
            if candidate.exists():
                found.add(str(candidate))
        return sorted(found)
190
+
191
+
192
def _have_ripgrep() -> bool:
    """Report whether an ``rg`` executable can be found on PATH."""
    rg_path = shutil.which("rg")
    return rg_path is not None
194
+
195
+
196
def _parse_match_lines(stdout: str) -> list:
    """Parse ``path:line:content`` tool output into (path, line_no, text) tuples."""
    parsed = []
    for raw in stdout.splitlines():
        parts = raw.split(":", 2)
        # Skip anything that is not path:line:content (e.g. tool diagnostics).
        if len(parts) < 3 or not parts[1].isdigit():
            continue
        parsed.append((parts[0], int(parts[1]), parts[2]))
    return parsed


def _run_match_tool(cmd: list):
    """Run a grep-like command; return parsed matches, or None if it could not run."""
    try:
        # errors="replace" keeps a stray non-UTF-8 byte in a matched line from
        # raising during decoding and discarding the whole tool run.
        proc = subprocess.run(cmd, capture_output=True, encoding="utf-8",
                              errors="replace", timeout=30)
    except (OSError, ValueError, subprocess.SubprocessError):
        return None
    return _parse_match_lines(proc.stdout)


def grep_search(query: str, files: list, top: int = 30) -> tuple:
    """Lexical search over the given files. Returns (results, tool_used).

    Prefers ripgrep; falls back to grep; falls back to a pure-python scan when
    neither tool is present or could be run. The query is always matched as a
    fixed string, never as a regex.

    Args:
        query: text to look for. A blank query returns ([], "none").
        files: paths to search. An empty list returns ([], "none").
        top: maximum number of results to return.

    Returns:
        (results, tool_used) where tool_used is one of "ripgrep", "grep",
        "python-scan" or "none". Results are ranked by per-file match count
        (more matches first), then line number, then file path. Each result is
        a dict with file (relative to the project root when possible), line,
        name (always ""), snippet (stripped, at most 300 chars) and
        source="grep".
    """
    if not query.strip() or not files:
        return [], "none"

    matches = None  # list of (file, line_no, text); None until a tool has run
    tool_used = "none"

    if _have_ripgrep():
        # --with-filename: rg omits the path prefix when given exactly one
        # file, which would make every output line unparseable.
        matches = _run_match_tool(
            ["rg", "--no-heading", "--with-filename", "--line-number",
             "--color", "never", "--fixed-strings", "-e", query, "--"] + files)
        if matches is not None:
            tool_used = "ripgrep"

    if matches is None and shutil.which("grep"):
        # -H forces the path prefix for the same single-file reason as above.
        matches = _run_match_tool(["grep", "-rnHF", "--", query] + files)
        if matches is not None:
            tool_used = "grep"

    if matches is None:
        # Pure-python scan as a last resort (no usable external tool).
        tool_used = "python-scan"
        matches = []
        for fpath in files:
            try:
                with open(fpath, "r", encoding="utf-8", errors="replace") as fh:
                    for line_no, text in enumerate(fh, start=1):
                        if query in text:
                            matches.append((fpath, line_no, text.rstrip("\n")))
            except OSError:
                # Unreadable or vanished file: skip it, keep scanning the rest.
                continue

    # Rank by per-file match count desc, then line asc, then path.
    counts: dict = {}
    for fpath, _, _ in matches:
        counts[fpath] = counts.get(fpath, 0) + 1
    matches.sort(key=lambda m: (-counts[m[0]], m[1], m[0]))

    def _rel(p: str) -> str:
        # Best effort: paths outside the project root are reported unchanged.
        try:
            return str(Path(p).resolve().relative_to(PROJECT_ROOT))
        except (ValueError, OSError, RuntimeError):
            return p

    return [
        {
            "file": _rel(fpath),
            "line": lno,
            "name": "",
            "snippet": text.strip()[:300],
            "source": "grep",
        }
        for fpath, lno, text in matches[:top]
    ], tool_used
282
+
283
+
284
def semantic_search(query: str, top: int = 30) -> tuple:
    """Semantic search via ChromaDB. Returns (results, available: bool).

    chromadb is imported lazily so a missing dependency or stopped container
    does not break the module. Any failure while connecting or querying
    returns ([], False) so callers can fall back to grep-only. A successful
    query with no hits returns ([], True).

    Args:
        query: natural-language query text.
        top: number of nearest results to request from the collection.

    Returns:
        (results, available). Each result is a dict with file, line, name,
        snippet (first 300 chars of the stored document), relevance
        (1 - distance / 2, clamped at 0 and rounded to 4 places; 0.0 when the
        distance is missing) and source="semantic".
    """
    try:
        import chromadb
        client = chromadb.HttpClient(host=CHROMA_HOST, port=CHROMA_PORT)
        client.heartbeat()
        collection = client.get_collection(name=COLLECTION_NAME)
        res = collection.query(
            query_texts=[query],
            n_results=top,
            include=["documents", "metadatas", "distances"],
        )
    except Exception:
        return [], False

    # Only one query text is sent, so only the first row of each field is used.
    ids = (res.get("ids") or [[]])[0]
    if not ids:
        return [], True

    def _column(name: str) -> list:
        # First row of a result field, padded with None so a short or absent
        # field cannot raise while assembling results.
        rows = res.get(name) or [[]]
        values = list(rows[0] or [])
        values += [None] * (len(ids) - len(values))
        return values

    results = []
    for meta, doc, dist in zip(_column("metadatas"), _column("documents"),
                               _column("distances")):
        meta = meta or {}  # defensive: tolerate an entry with no metadata
        relevance = 0.0 if dist is None else round(max(0.0, 1.0 - dist / 2.0), 4)
        results.append({
            "file": meta.get("file", ""),
            "line": meta.get("line", 0),
            "name": meta.get("name", ""),
            "snippet": (doc[:300] if doc else ""),
            "relevance": relevance,
            "source": "semantic",
        })
    return results, True
321
+
322
+
323
+ # -----------------------------------------------------------------------------
324
+ # Orchestration
325
+ # -----------------------------------------------------------------------------
326
+
327
def hybrid_search(query: str, top: int = 10, budget: int = DEFAULT_TOKEN_BUDGET,
                  grep_only: bool = False, semantic_only: bool = False) -> dict:
    """Run hybrid (or single-mode) search and return a structured result dict.

    Lexical and semantic retrieval each fetch at least 30 candidates, the two
    rankings are fused with reciprocal rank fusion, the fused list is cut to
    ``top`` entries and then trimmed to the token budget.

    Args:
        query: search text.
        top: maximum number of results; values below zero are treated as 0.
        budget: token budget for the returned result set.
        grep_only: skip the semantic retriever.
        semantic_only: skip the lexical retriever. If both flags are set,
            neither retriever runs and the mode is reported as
            "semantic-only".

    Returns:
        A dict with keys:
            "query": the query string.
            "results": list of {file, line, name, snippet, sources,
                fused_score} dicts.
            "grep_tool": ripgrep | grep | python-scan | none.
            "semantic_available": whether the semantic query succeeded.
            "mode": hybrid | grep-only | semantic-only.
            "fallback": True if semantic was part of a hybrid request but
                was unavailable.
            "budget": the budget that was applied.
    """
    use_grep = not semantic_only
    use_semantic = not grep_only
    candidates = max(top, 30)

    grep_results: list = []
    grep_tool = "none"
    if use_grep:
        # Collect the file list only when it is needed: building it may load
        # and execute the indexer module, which is wasted work in
        # semantic-only mode.
        grep_results, grep_tool = grep_search(
            query, _indexer_files(), top=candidates)

    semantic_results: list = []
    semantic_available = False
    if use_semantic:
        semantic_results, semantic_available = semantic_search(
            query, top=candidates)

    # Semantic requested as part of hybrid but unavailable -> grep-only.
    fallback = use_grep and use_semantic and not semantic_available

    if semantic_only:
        mode = "semantic-only"
    elif grep_only:
        mode = "grep-only"
    else:
        mode = "hybrid"

    # A retriever that did not run contributes an empty list, so one fusion
    # call covers all three modes.
    fused = reciprocal_rank_fusion(grep_results, semantic_results)
    # Clamp so a negative top cannot turn into a "drop the last N" slice.
    kept = truncate_to_budget(fused[:max(top, 0)], budget)

    out = [
        {
            "file": item.get("file", ""),
            "line": item.get("line", 0),
            "name": item.get("name", ""),
            "snippet": item.get("snippet", ""),
            "sources": item.get("_sources", []),
            "fused_score": round(item.get("_fused_score", 0.0), 6),
        }
        for item in kept
    ]

    return {
        "query": query,
        "results": out,
        "grep_tool": grep_tool,
        "semantic_available": semantic_available,
        "mode": mode,
        "fallback": fallback,
        "budget": budget,
    }
391
+
392
+
393
def _render_text(payload: dict) -> str:
    """Format a hybrid_search payload as plain text (no emojis, no dashes).

    The output is a header line (query, mode and any fallback notes), a line
    with the budget and result count, a blank line, and then either
    "no matches." or one entry per result: a numbered location line followed
    by the snippet line when the snippet is non-empty.
    """
    mode = payload["mode"]

    suffix = ""
    if payload.get("fallback"):
        suffix = " (semantic index unavailable, grep-only fallback)"
    tool = payload.get("grep_tool", "none")
    if mode != "semantic-only" and tool in ("grep", "python-scan"):
        suffix += f" (ripgrep not found, using {tool})"

    out = [f"hybrid search: {payload['query']!r} [{mode}]{suffix}"]
    budget = payload["budget"]
    hits = payload["results"]
    out.append(f"budget: {budget} tokens, {len(hits)} result(s)")
    out.append("")

    if not hits:
        out.append("no matches.")
        return "\n".join(out)

    for number, hit in enumerate(hits, start=1):
        found_by = ",".join(hit.get("sources", [])) or "?"
        location = f"[{number}] {hit['file']}:{hit['line']} "
        out.append(location + f"(match: {found_by}, score: {hit['fused_score']})")
        excerpt = (hit.get("snippet") or "").strip()
        if excerpt:
            out.append(f" {excerpt}")
    return "\n".join(out)
418
+
419
+
420
def main():
    """Command-line entry point: parse arguments, run the search, print it.

    Returns the process exit status: 2 when --grep-only and --semantic-only
    are both given, 0 otherwise. Output is JSON with --json, plain text
    otherwise.
    """
    parser = argparse.ArgumentParser(
        description="Hybrid (grep + semantic) codebase search")
    parser.add_argument("query", help="Search query")
    parser.add_argument("--top", type=int, default=10, help="Max results")
    parser.add_argument("--budget", type=int, default=DEFAULT_TOKEN_BUDGET,
                        help="Token budget for the merged result set")
    # The three boolean switches share the same shape, so register them in
    # one pass (order is kept so --help output is unchanged).
    for flag, description in (
        ("--grep-only", "Lexical search only (skip semantic)"),
        ("--semantic-only", "Semantic search only (skip grep)"),
        ("--json", "Output JSON"),
    ):
        parser.add_argument(flag, action="store_true", help=description)
    opts = parser.parse_args()

    if opts.grep_only and opts.semantic_only:
        print("error: --grep-only and --semantic-only are mutually exclusive",
              file=sys.stderr)
        return 2

    payload = hybrid_search(
        opts.query,
        top=opts.top,
        budget=opts.budget,
        grep_only=opts.grep_only,
        semantic_only=opts.semantic_only,
    )

    rendered = json.dumps(payload, indent=2) if opts.json else _render_text(payload)
    print(rendered)
    return 0


if __name__ == "__main__":
    sys.exit(main())