loki-mode 7.7.24 → 7.7.25

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,474 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Loki Mode Codebase Indexer
4
+
5
+ Indexes the loki-mode codebase into ChromaDB for semantic code search.
6
+ Chunks code at function-level for shell/Python, and stores metadata
7
+ (file path, line number, function name, language, type).
8
+
9
+ Usage:
10
+ python tools/index-codebase.py # Index everything
11
+ python tools/index-codebase.py --collection loki # Custom collection name
12
+ python tools/index-codebase.py --reset # Clear and re-index
13
+ python tools/index-codebase.py --stats # Show index stats
14
+
15
+ Requires:
16
+ - ChromaDB running on localhost:8100 (docker)
17
+ - pip install chromadb
18
+ """
19
+
20
+ import argparse
21
+ import os
22
+ import re
23
+ import sys
24
+ import time
25
+ from pathlib import Path
26
+ from typing import Optional
27
+
28
+ import chromadb
29
+
30
# Repository root: this script lives one directory below it, so the root is
# the parent of the script's own directory.
PROJECT_ROOT = Path(__file__).parent.parent.resolve()

# ChromaDB connection settings. Each value can be overridden through the
# corresponding LOKI_CHROMA_* environment variable.
CHROMA_HOST = os.environ.get("LOKI_CHROMA_HOST", "localhost")
CHROMA_PORT = int(os.environ.get("LOKI_CHROMA_PORT", "8100"))
COLLECTION_NAME = os.environ.get("LOKI_CHROMA_COLLECTION", "loki-codebase")

# Shell files to index, given as explicit paths relative to PROJECT_ROOT.
SHELL_PATTERNS = [
    "autonomy/loki",
    "autonomy/run.sh",
    "autonomy/completion-council.sh",
    "autonomy/issue-providers.sh",
    "autonomy/issue-parser.sh",
    "autonomy/prd-checklist.sh",
    "autonomy/app-runner.sh",
    "autonomy/playwright-verify.sh",
    "autonomy/sandbox.sh",
    "autonomy/migration-agents.sh",
    "autonomy/notify.sh",
    "autonomy/serve.sh",
    "autonomy/telemetry.sh",
    "autonomy/voice.sh",
    "autonomy/council-v2.sh",
    "providers/claude.sh",
    "providers/codex.sh",
    "providers/gemini.sh",
    "providers/loader.sh",
    "events/emit.sh",
    "learning/aggregate.sh",
    "learning/emit.sh",
    "learning/suggest.sh",
]

# Glob patterns (relative to PROJECT_ROOT) selecting the Python files to index.
PYTHON_GLOBS = [
    "memory/*.py",
    "dashboard/*.py",
    "mcp/*.py",
    "swarm/*.py",
    "learning/*.py",
    "events/*.py",
    "state/*.py",
]

# Glob patterns (relative to PROJECT_ROOT) for files indexed as markdown.
OTHER_GLOBS = [
    "SKILL.md",
    "skills/*.md",
    "CLAUDE.md",
]

# Directory names that exclude a Python file from indexing when they occur in
# its path.
SKIP_DIRS = {
    "node_modules", ".git", ".loki", "__pycache__", "dist",
    "dashboard-ui", "vscode-extension", ".claude",
}
86
+
87
+
88
def get_client() -> chromadb.HttpClient:
    """Build an HTTP client for the ChromaDB server.

    The host and port are taken from the module-level ``CHROMA_HOST`` and
    ``CHROMA_PORT`` settings.
    """
    client = chromadb.HttpClient(host=CHROMA_HOST, port=CHROMA_PORT)
    return client
91
+
92
+
93
def chunk_shell_file(filepath: Path) -> list[dict]:
    """Parse a shell file into function-level chunks.

    A function definition is a line starting in column 0 of the form
    ``name()`` or ``name() {``. Each function's chunk runs from its
    definition line up to (not including) the next definition, or to the
    end of the file for the last one.

    Args:
        filepath: Shell file to read. It must live under ``PROJECT_ROOT``
            because chunk IDs and metadata use the path relative to it.

    Returns:
        A list of dicts with ``id``, ``content`` and ``metadata`` keys:

        * no function found: one ``whole-file`` chunk holding the first
          8000 characters of the file;
        * otherwise: one chunk per function (content capped at 4000
          characters) plus a ``header`` chunk for the text preceding the
          first function, when that text spans more than 5 lines and more
          than 200 characters.

    Raises:
        ValueError: if ``filepath`` is not located under ``PROJECT_ROOT``.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale encoding; undecodable bytes are replaced, not fatal.
    content = filepath.read_text(encoding="utf-8", errors="replace")
    lines = content.split("\n")
    rel_path = str(filepath.relative_to(PROJECT_ROOT))

    # Find all function definitions as (name, zero-based line index).
    func_pattern = re.compile(r"^([a-zA-Z_][a-zA-Z0-9_]*)\s*\(\)\s*\{?\s*$")
    functions = []
    for i, line in enumerate(lines):
        m = func_pattern.match(line)
        if m:
            functions.append((m.group(1), i))

    if not functions:
        # No functions found - index the file as a single chunk.
        return [{
            "id": f"{rel_path}::whole-file",
            "content": content[:8000],  # Limit chunk size
            "metadata": {
                "file": rel_path,
                "line": 1,
                "type": "file",
                "language": "shell",
                "name": filepath.name,
                "lines_total": len(lines),
            },
        }]

    chunks = []
    seen_names = set()
    for idx, (func_name, start_line) in enumerate(functions):
        # A function ends where the next one starts, or at EOF.
        if idx + 1 < len(functions):
            end_line = functions[idx + 1][1]
        else:
            end_line = len(lines)

        func_content = "\n".join(lines[start_line:end_line])
        # Limit chunk size to ~4000 chars for embedding quality.
        if len(func_content) > 4000:
            func_content = func_content[:4000] + "\n# ... (truncated)"

        # The first definition of a name gets the plain ID; redefinitions are
        # disambiguated with their 1-based line number.
        if func_name in seen_names:
            chunk_id = f"{rel_path}::{func_name}_L{start_line + 1}"
        else:
            chunk_id = f"{rel_path}::{func_name}"
            seen_names.add(func_name)

        chunks.append({
            "id": chunk_id,
            "content": func_content,
            "metadata": {
                "file": rel_path,
                "line": start_line + 1,
                "type": "function",
                "language": "shell",
                "name": func_name,
                # The reported length is capped at 200 lines.
                "lines": min(end_line - start_line, 200),
            },
        })

    # Also index the file header (before the first function) for
    # config/globals, but only when it is long enough to be meaningful.
    first_start = functions[0][1]
    if first_start > 5:
        header = "\n".join(lines[:first_start])
        if len(header) > 200:
            # A function literally named "header" already owns the plain
            # "::header" ID; duplicate IDs would make the batch upsert fail,
            # so fall back to the line-suffixed form in that case.
            header_id = f"{rel_path}::header"
            if "header" in seen_names:
                header_id = f"{rel_path}::header_L1"
            chunks.append({
                "id": header_id,
                "content": header[:4000],
                "metadata": {
                    "file": rel_path,
                    "line": 1,
                    "type": "header",
                    "language": "shell",
                    "name": f"{filepath.name} globals/config",
                    "lines": first_start,
                },
            })

    return chunks
178
+
179
+
180
def chunk_python_file(filepath: Path) -> list[dict]:
    """Parse a Python file into class/function-level chunks.

    Only definitions starting in column 0 are recognised (``class X``,
    ``def f`` and ``async def f``), so a class and everything nested in it
    form one chunk. Each chunk runs from its definition line up to (not
    including) the next top-level definition, or to the end of the file.

    Args:
        filepath: Python file to read. It must live under ``PROJECT_ROOT``
            because chunk IDs and metadata use the path relative to it.

    Returns:
        A list of dicts with ``id``, ``content`` and ``metadata`` keys:

        * no definition found: one ``whole-file`` chunk holding the first
          8000 characters of the file;
        * otherwise: one chunk per class/function (content capped at 4000
          characters) plus a ``header`` chunk for the text preceding the
          first definition, when that text spans more than 5 lines and more
          than 200 characters.

    Raises:
        ValueError: if ``filepath`` is not located under ``PROJECT_ROOT``.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale encoding; undecodable bytes are replaced, not fatal.
    content = filepath.read_text(encoding="utf-8", errors="replace")
    lines = content.split("\n")
    rel_path = str(filepath.relative_to(PROJECT_ROOT))

    # Find classes and top-level functions as (kind, name, zero-based line).
    class_pattern = re.compile(r"^class\s+(\w+)")
    func_pattern = re.compile(r"^(?:async\s+)?def\s+(\w+)")
    items = []
    for i, line in enumerate(lines):
        mc = class_pattern.match(line)
        mf = func_pattern.match(line)
        if mc:
            items.append(("class", mc.group(1), i))
        elif mf:
            items.append(("function", mf.group(1), i))

    if not items:
        # Nothing to split on - index the whole file as one chunk.
        return [{
            "id": f"{rel_path}::whole-file",
            "content": content[:8000],
            "metadata": {
                "file": rel_path,
                "line": 1,
                "type": "file",
                "language": "python",
                "name": filepath.name,
                "lines_total": len(lines),
            },
        }]

    chunks = []
    seen_names = set()
    for idx, (item_type, name, start_line) in enumerate(items):
        # An item ends where the next one starts, or at EOF.
        if idx + 1 < len(items):
            end_line = items[idx + 1][2]
        else:
            end_line = len(lines)

        item_content = "\n".join(lines[start_line:end_line])
        if len(item_content) > 4000:
            item_content = item_content[:4000] + "\n# ... (truncated)"

        # The first definition of a name gets the plain ID; redefinitions are
        # disambiguated with their 1-based line number.
        if name in seen_names:
            chunk_id = f"{rel_path}::{name}_L{start_line + 1}"
        else:
            chunk_id = f"{rel_path}::{name}"
            seen_names.add(name)

        chunks.append({
            "id": chunk_id,
            "content": item_content,
            "metadata": {
                "file": rel_path,
                "line": start_line + 1,
                "type": item_type,
                "language": "python",
                "name": name,
                # The reported length is capped at 200 lines.
                "lines": min(end_line - start_line, 200),
            },
        })

    # Index the module docstring / imports preceding the first definition,
    # but only when that preamble is long enough to be meaningful.
    first_start = items[0][2]
    if first_start > 5:
        header = "\n".join(lines[:first_start])
        if len(header) > 200:
            # A class or function literally named "header" already owns the
            # plain "::header" ID; duplicate IDs would make the batch upsert
            # fail, so fall back to the line-suffixed form in that case.
            header_id = f"{rel_path}::header"
            if "header" in seen_names:
                header_id = f"{rel_path}::header_L1"
            chunks.append({
                "id": header_id,
                "content": header[:4000],
                "metadata": {
                    "file": rel_path,
                    "line": 1,
                    "type": "header",
                    "language": "python",
                    "name": f"{filepath.name} imports/config",
                    "lines": first_start,
                },
            })

    return chunks
264
+
265
+
266
def chunk_markdown_file(filepath: Path) -> list[dict]:
    """Parse a markdown file into section-level chunks.

    The text is split in front of every line that starts with ``## ``; any
    text before the first such heading forms a section of its own. Sections
    shorter than 50 characters (after stripping surrounding whitespace) are
    dropped, and section text is capped at 4000 characters.

    Args:
        filepath: Markdown file to read. It must live under ``PROJECT_ROOT``
            because chunk IDs and metadata use the path relative to it.

    Returns:
        A list of dicts with ``id``, ``content`` and ``metadata`` keys. The
        metadata ``line`` is the 1-based line on which the section's first
        non-blank character sits, and ``name`` is the heading text (or
        ``section-<index>`` when the section has no ``## `` heading).

    Raises:
        ValueError: if ``filepath`` is not located under ``PROJECT_ROOT``.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale encoding; undecodable bytes are replaced, not fatal.
    content = filepath.read_text(encoding="utf-8", errors="replace")
    rel_path = str(filepath.relative_to(PROJECT_ROOT))

    # Zero-width split in front of each "## " heading: the pieces concatenate
    # back to the original text, which lets us track character offsets.
    sections = re.split(r"(?=^## )", content, flags=re.MULTILINE)

    chunks = []
    offset = 0
    for i, raw_section in enumerate(sections):
        section_start = offset
        offset += len(raw_section)

        section = raw_section.strip()
        if len(section) < 50:
            continue

        # Real start line of the section: count the newlines that precede its
        # first non-whitespace character in the full document.
        leading_ws = len(raw_section) - len(raw_section.lstrip())
        line_number = content.count("\n", 0, section_start + leading_ws) + 1

        # Extract title
        title_match = re.match(r"^##\s+(.+)", section)
        title = title_match.group(1) if title_match else f"section-{i}"

        if len(section) > 4000:
            section = section[:4000] + "\n... (truncated)"

        # Sanitize the title for use in the ID; the section index keeps IDs
        # unique even when two headings sanitize to the same text.
        safe_title = re.sub(r"[^a-zA-Z0-9_\-. ]", "", title)[:80]
        chunks.append({
            "id": f"{rel_path}::{safe_title}_{i}",
            "content": section,
            "metadata": {
                "file": rel_path,
                "line": line_number,
                "type": "section",
                "language": "markdown",
                "name": title,
            },
        })

    return chunks
303
+
304
+
305
def collect_files() -> list[tuple[Path, str]]:
    """Collect all files to index with their type.

    Returns:
        ``(path, kind)`` pairs where ``kind`` is ``"shell"``, ``"python"`` or
        ``"markdown"``, in this order: the explicit ``SHELL_PATTERNS`` entries
        that exist as files, the ``PYTHON_GLOBS`` matches, the
        ``OTHER_GLOBS`` matches, and finally ``tests/test-*.sh``. Glob
        matches are sorted per pattern.
    """
    files: list[tuple[Path, str]] = []

    # Shell files (explicit list); entries that are missing or are not
    # regular files are skipped.
    for pattern in SHELL_PATTERNS:
        p = PROJECT_ROOT / pattern
        if p.is_file():
            files.append((p, "shell"))

    # Python files (glob)
    for glob_pattern in PYTHON_GLOBS:
        for p in sorted(PROJECT_ROOT.glob(glob_pattern)):
            if p.name.startswith("__"):
                continue
            # Compare whole directory names *below* the project root. A
            # substring test on the absolute path would drop every file when
            # the checkout itself sits under e.g. "~/.claude/..." or any
            # directory whose name merely contains "dist".
            rel_parts = p.relative_to(PROJECT_ROOT).parts
            if any(part in SKIP_DIRS for part in rel_parts):
                continue
            files.append((p, "python"))

    # Markdown files
    for glob_pattern in OTHER_GLOBS:
        for p in sorted(PROJECT_ROOT.glob(glob_pattern)):
            files.append((p, "markdown"))

    # Test files (shell)
    for p in sorted((PROJECT_ROOT / "tests").glob("test-*.sh")):
        files.append((p, "shell"))

    return files
334
+
335
+
336
def index_all(collection, reset: bool = False):
    """Chunk every collected file and upsert the chunks into ``collection``.

    Args:
        collection: Object exposing ``name`` and ``upsert(ids=, documents=,
            metadatas=)``.
        reset: Accepted as part of the signature; this function does not
            read it.

    Returns:
        ``(file_count, total_chunks)`` counting only the files whose chunks
        were upserted. A failure on one file is reported on stderr and does
        not stop the run.
    """
    targets = collect_files()
    chunkers = {
        "shell": chunk_shell_file,
        "python": chunk_python_file,
        "markdown": chunk_markdown_file,
    }
    indexed_files = 0
    indexed_chunks = 0

    print(f"Indexing {len(targets)} files into collection '{collection.name}'...")

    for path, kind in targets:
        try:
            chunker = chunkers.get(kind)
            if chunker is None:
                # Unknown file type: nothing to do for this entry.
                continue

            pieces = chunker(path)
            if not pieces:
                continue

            # One batched upsert per file.
            collection.upsert(
                ids=[piece["id"] for piece in pieces],
                documents=[piece["content"] for piece in pieces],
                metadatas=[piece["metadata"] for piece in pieces],
            )

            indexed_files += 1
            indexed_chunks += len(pieces)
            rel = path.relative_to(PROJECT_ROOT)
            print(f" [{indexed_files}/{len(targets)}] {rel}: {len(pieces)} chunks")
        except Exception as e:
            print(f" ERROR indexing {path}: {e}", file=sys.stderr)

    return indexed_files, indexed_chunks
378
+
379
+
380
def show_stats(collection):
    """Print chunk totals and per-language / per-type breakdowns.

    Args:
        collection: Object exposing ``name``, ``count()`` and
            ``get(limit=, include=)`` returning a mapping with a
            ``"metadatas"`` list of dicts.

    Nothing beyond the two summary lines is printed for an empty collection.
    """
    total = collection.count()
    print(f"\nCollection: {collection.name}")
    print(f"Total chunks: {total}")

    if not total:
        return

    def _print_breakdown(heading, tally):
        # Largest bucket first; ties keep their first-seen order.
        print(heading)
        ranked = sorted(tally.items(), key=lambda pair: pair[1], reverse=True)
        for label, hits in ranked:
            print(f" {label}: {hits}")

    # Pull every metadata record and tally it by language and by type.
    fetched = collection.get(limit=total, include=["metadatas"])
    per_language = {}
    per_type = {}
    unique_files = set()
    for entry in fetched["metadatas"]:
        language = entry.get("language", "unknown")
        kind = entry.get("type", "unknown")
        per_language[language] = per_language.get(language, 0) + 1
        per_type[kind] = per_type.get(kind, 0) + 1
        unique_files.add(entry.get("file", ""))

    print(f"Unique files: {len(unique_files)}")
    _print_breakdown("\nBy language:", per_language)
    _print_breakdown("\nBy type:", per_type)
408
+
409
+
410
def test_search(collection, query: str, n: int = 5):
    """Query ``collection`` for ``query`` and print the ranked hits.

    Args:
        collection: Object exposing ``query(query_texts=, n_results=,
            include=)`` whose result holds ``ids``, ``metadatas`` and
            ``distances`` lists, one inner list per query text.
        query: Text to search for.
        n: Maximum number of results requested.
    """
    response = collection.query(
        query_texts=[query],
        n_results=n,
        include=["documents", "metadatas", "distances"],
    )

    print(f"\nSearch: '{query}' (top {n})")
    print("-" * 60)

    # Only one query text is sent, so the first inner list holds all hits.
    hit_ids = response["ids"][0]
    hit_rows = zip(hit_ids, response["metadatas"][0], response["distances"][0])
    for rank, (_, meta, dist) in enumerate(hit_rows, start=1):
        location = f"{meta['file']}:{meta.get('line', '?')}"
        label = f"({meta['name']}) [{meta['type']}/{meta['language']}]"
        print(f" [{rank}] {location} {label} distance={dist:.4f}")
426
+
427
+
428
def main():
    """Command-line entry point for the codebase indexer.

    Connects to ChromaDB at ``--host``/``--port``, optionally deletes the
    target collection (``--reset``), then does exactly one of:

    * ``--stats``: print collection statistics and exit;
    * ``--search QUERY``: run a single test search and exit;
    * default: index the codebase, print statistics and run three sample
      searches.
    """
    parser = argparse.ArgumentParser(description="Index loki-mode codebase into ChromaDB")
    parser.add_argument("--collection", default=COLLECTION_NAME, help="Collection name")
    parser.add_argument("--reset", action="store_true", help="Clear and re-index")
    parser.add_argument("--stats", action="store_true", help="Show index stats")
    parser.add_argument("--search", type=str, help="Run a test search query")
    parser.add_argument("--host", default=CHROMA_HOST, help="ChromaDB host")
    parser.add_argument("--port", type=int, default=CHROMA_PORT, help="ChromaDB port")
    args = parser.parse_args()

    client = chromadb.HttpClient(host=args.host, port=args.port)

    if args.reset:
        try:
            client.delete_collection(args.collection)
        except Exception as exc:
            # Deletion stays best-effort (the collection may simply not exist
            # yet), but the reason is reported instead of silently discarded
            # so real failures are visible.
            print(
                f"Could not delete collection '{args.collection}': {exc}",
                file=sys.stderr,
            )
        else:
            print(f"Deleted collection '{args.collection}'")

    collection = client.get_or_create_collection(
        name=args.collection,
        metadata={"description": "Loki Mode codebase index for semantic code search"},
    )

    if args.stats:
        show_stats(collection)
        return

    if args.search:
        test_search(collection, args.search)
        return

    start = time.time()
    # Forward the flag so index_all receives the same option the user gave.
    file_count, total_chunks = index_all(collection, reset=args.reset)
    elapsed = time.time() - start

    print(f"\nDone: {total_chunks} chunks from {file_count} files in {elapsed:.1f}s")
    show_stats(collection)

    # Run a few test searches
    test_search(collection, "rate limit detection and backoff")
    test_search(collection, "model selection for RARV tier")
    test_search(collection, "completion council voting")


if __name__ == "__main__":
    main()
@@ -0,0 +1,159 @@
1
+ #!/usr/bin/env python3
2
+ """Probe provider documentation pages and report new models.
3
+
4
+ Approach kept deliberately conservative: we fetch known docs URLs, look for
5
+ model IDs that match well-defined regex patterns, and compare against the
6
+ current `providers/model_catalog.json`. We do NOT auto-rewrite the catalog;
7
+ we emit a report and a unified diff so a maintainer (or the cron-driven PR)
8
+ can review.
9
+
10
+ Run locally:
11
+ python3 tools/probe-model-catalog.py # report only
12
+ python3 tools/probe-model-catalog.py --json # machine-readable
13
+
14
+ In CI: see .github/workflows/model-catalog-probe.yml
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ import argparse
20
+ import json
21
+ import re
22
+ import sys
23
+ import urllib.error
24
+ import urllib.request
25
+ from pathlib import Path
26
+
27
# Repository root (this script lives one directory below it) and the catalog
# file the probe results are compared against.
REPO_ROOT = Path(__file__).resolve().parent.parent
CATALOG_PATH = REPO_ROOT / "providers" / "model_catalog.json"

# Patterns that look like provider model IDs. Conservative -- only the well
# defined Claude/Codex/Gemini families today. Every group is non-capturing,
# so `findall` yields the full matched ID.
PATTERNS: dict[str, list[re.Pattern[str]]] = {
    "claude": [
        re.compile(r"\bclaude-(?:opus|sonnet|haiku)-\d+(?:-\d+)?(?:-\d{8})?\b"),
    ],
    "codex": [
        re.compile(r"\bgpt-\d+(?:\.\d+)?-codex\b"),
    ],
    "gemini": [
        re.compile(r"\bgemini-\d+(?:\.\d+)?-(?:pro|flash)(?:-(?:preview|exp|latest))?\b"),
    ],
}

# Pages we read, keyed by provider. These should be public documentation.
# Failure to fetch any single one is non-fatal -- we report what we got.
SOURCES: dict[str, list[str]] = {
    "claude": [
        "https://docs.claude.com/en/about-claude/models/overview",
    ],
    "codex": [
        "https://platform.openai.com/docs/models",
    ],
    "gemini": [
        "https://ai.google.dev/gemini-api/docs/models",
    ],
}

# User-Agent header sent with every documentation request.
USER_AGENT = (
    "loki-mode-model-probe/1.0 "
    "(+https://github.com/asklokesh/loki-mode)"
)
62
+
63
+
64
def fetch(url: str, timeout: int = 30) -> str:
    """Download ``url`` and return its body as text.

    The request carries the module's ``USER_AGENT`` header. The body is
    decoded as UTF-8 and bytes that cannot be decoded are dropped.

    Args:
        url: Address to fetch.
        timeout: Timeout in seconds passed to ``urlopen``.
    """
    request = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
    with urllib.request.urlopen(request, timeout=timeout) as response:
        raw = response.read()
    return raw.decode("utf-8", errors="ignore")
69
+
70
+
71
def load_catalog() -> dict:
    """Read ``CATALOG_PATH`` as UTF-8 and return the parsed JSON document."""
    raw_text = CATALOG_PATH.read_text(encoding="utf-8")
    return json.loads(raw_text)
74
+
75
+
76
def known_ids(catalog: dict, provider: str) -> set[str]:
    """Collect every model ID the catalog already lists for ``provider``.

    IDs are gathered from three places in the provider's entry: the ``id`` of
    each dict in ``models``, the ``latest_planning`` / ``latest_development``
    / ``latest_fast`` values, and the string values of ``cli_aliases``.
    Missing or empty fields contribute nothing.

    Args:
        catalog: Parsed catalog document.
        provider: Key to look up under the catalog's ``providers`` mapping.

    Returns:
        The set of collected IDs; empty when the provider is unknown.
    """
    entry = catalog.get("providers", {}).get(provider, {})

    # IDs declared on the individual model records.
    collected: set[str] = {
        model["id"]
        for model in entry.get("models", [])
        if isinstance(model, dict) and model.get("id")
    }

    # IDs referenced by the per-tier "latest" pointers.
    tier_keys = ("latest_planning", "latest_development", "latest_fast")
    collected.update(entry[tier] for tier in tier_keys if entry.get(tier))

    # Alias targets count as known too; non-string targets are ignored.
    alias_map = entry.get("cli_aliases", {})
    if isinstance(alias_map, dict):
        collected.update(
            target for target in alias_map.values() if isinstance(target, str)
        )

    return collected
91
+
92
+
93
def probe_provider(provider: str) -> tuple[set[str], list[str]]:
    """Scan the provider's documentation pages for model IDs.

    Each URL in ``SOURCES[provider]`` is fetched and searched with every
    pattern in ``PATTERNS[provider]``. A failed fetch is recorded as an error
    message and the remaining URLs are still processed.

    Returns:
        ``(found_ids, errors)``: the set of matched IDs and one message per
        URL that could not be fetched.
    """
    found: set[str] = set()
    problems: list[str] = []
    patterns = PATTERNS.get(provider, [])

    for url in SOURCES.get(provider, []):
        try:
            page = fetch(url)
        except urllib.error.URLError as exc:
            problems.append(f"{url}: {exc}")
        except Exception as exc:  # network / parse / etc.
            problems.append(f"{url}: {type(exc).__name__}: {exc}")
        else:
            for pattern in patterns:
                found.update(pattern.findall(page))

    return found, problems
110
+
111
+
112
def main() -> int:
    """Probe every provider, print the report, and return the exit status.

    The report is written as JSON when ``--json`` is given, otherwise as a
    plain-text summary per provider. The return value is 1 only when
    ``--strict`` is set and at least one new candidate was found; otherwise 0.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--json", action="store_true", help="emit machine-readable JSON")
    parser.add_argument("--strict", action="store_true", help="exit nonzero if new models are found")
    opts = parser.parse_args()

    catalog = load_catalog()

    # Build one report entry per provider, in PATTERNS order.
    report: dict[str, dict] = {}
    for provider in PATTERNS:
        found, errors = probe_provider(provider)
        known = known_ids(catalog, provider)
        report[provider] = {
            "known_count": len(known),
            "found_count": len(found),
            "new_candidates": sorted(found - known),
            "errors": errors,
        }
    any_new = any(info["new_candidates"] for info in report.values())

    if opts.json:
        json.dump(report, sys.stdout, indent=2, sort_keys=True)
        sys.stdout.write("\n")
    else:
        for provider, info in report.items():
            candidates = info["new_candidates"]
            shown = ", ".join(candidates) if candidates else "(none)"
            print(f"== {provider} ==")
            print(f" known in catalog: {info['known_count']}")
            print(f" found in docs: {info['found_count']}")
            print(f" NEW CANDIDATES: {shown}")
            for message in info["errors"]:
                print(f" ERROR: {message}")
            print()
        if any_new:
            print("To adopt a new model: edit providers/model_catalog.json -> bump latest_<tier>")
            print("and add to models[]. Then re-run this script to confirm it disappears from new_candidates.")

    if opts.strict and any_new:
        return 1
    return 0


if __name__ == "__main__":
    sys.exit(main())