loki-mode 7.12.0 → 7.14.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/SKILL.md +4 -2
- package/VERSION +1 -1
- package/autonomy/lib/wiki-ask.py +137 -0
- package/autonomy/lib/wiki-generator.py +322 -0
- package/autonomy/lib/wiki_index.py +258 -0
- package/autonomy/lib/wiki_llm.py +140 -0
- package/autonomy/loki +304 -11
- package/autonomy/run.sh +62 -12
- package/bin/loki +1 -1
- package/dashboard/__init__.py +1 -1
- package/dashboard/server.py +202 -0
- package/dashboard/static/index.html +405 -329
- package/docs/INSTALLATION.md +1 -1
- package/docs/R5-AUTO-WIKI-DESIGN.md +137 -0
- package/docs/R6-ROLLBACK-CHECKPOINT-PLAN.md +107 -0
- package/loki-ts/dist/loki.js +245 -206
- package/mcp/__init__.py +1 -1
- package/package.json +1 -1
|
@@ -0,0 +1,258 @@
|
|
|
1
|
+
"""wiki_index.py -- dependency-free codebase index for the R5 auto-wiki.
|
|
2
|
+
|
|
3
|
+
Builds a line-anchored chunk index over a project's source files and provides
|
|
4
|
+
deterministic token-overlap retrieval. This is the grounding substrate for
|
|
5
|
+
cited answers: every chunk carries the REAL repo-relative file path and the
|
|
6
|
+
REAL start/end line numbers it came from, so a citation can always be checked
|
|
7
|
+
against the filesystem.
|
|
8
|
+
|
|
9
|
+
Reuse note: the token-overlap scoring (`_tokenize` + overlap weighting) is
|
|
10
|
+
ported from memory/knowledge_graph.py (OrganizationKnowledgeGraph), which scores
|
|
11
|
+
memory patterns the same way. knowledge_graph.py is NOT a code index (it
|
|
12
|
+
aggregates .loki/memory/semantic patterns), so the code scanning/chunking here
|
|
13
|
+
is new. retrieval.py is a memory retriever, not a code indexer, so it is not
|
|
14
|
+
reused for code retrieval.
|
|
15
|
+
|
|
16
|
+
No third-party dependencies. CI-safe (no Docker, no network, no provider).
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
from __future__ import annotations
|
|
20
|
+
|
|
21
|
+
import hashlib
|
|
22
|
+
import json
|
|
23
|
+
import os
|
|
24
|
+
import re
|
|
25
|
+
import subprocess
|
|
26
|
+
from pathlib import Path
|
|
27
|
+
|
|
28
|
+
# File suffixes (lower-case, with the leading dot) that count as indexable
# source code. Deliberately broad; lockfiles and binaries are simply absent.
SOURCE_EXTS = {
    # Python / JavaScript / TypeScript
    ".py",
    ".js", ".mjs", ".cjs", ".jsx",
    ".ts", ".tsx",
    # Systems and JVM languages
    ".rs", ".go", ".c", ".cc", ".cpp", ".h", ".hpp",
    ".java", ".kt", ".kts", ".scala", ".cs", ".swift",
    # Scripting, shell and query languages
    ".rb", ".php", ".lua", ".r", ".sh", ".bash", ".sql",
    # Front-end component formats
    ".vue", ".svelte",
    # Other
    ".clj", ".ex", ".exs",
}

# Directory names that are never indexed (dependencies, VCS metadata, build
# output, tool caches).
SKIP_DIRS = {
    # Version control and tool state
    ".git", ".loki", ".cache", ".turbo", ".pytest_cache", ".mypy_cache",
    # Third-party dependencies and virtual environments
    "node_modules", "vendor", ".venv", "venv", "__pycache__",
    # Build and report output
    "dist", "build", "out", "target", ".next", "coverage",
}

# Number of lines per chunk; chunks are line-anchored and do not overlap.
CHUNK_LINES = 60

# Upper bound on indexed files, so very large repos stay cheap to process.
MAX_FILES = 800

# Files bigger than this many bytes are skipped (generated/minified output).
MAX_FILE_BYTES = 400_000
|
|
46
|
+
|
|
47
|
+
# Words the tokenizer discards. Per the original note, this list and the
# tokenizer below are ported from memory/knowledge_graph.py
# (_tokenize / _STOPWORDS).
_STOPWORDS = {
    # Common English function words
    "the", "a", "an", "to", "for", "of", "and", "or", "with", "without",
    "is", "are", "be", "up", "on", "in", "by", "not", "this", "that",
    "from", "as", "at", "it", "if", "do", "we", "my", "our", "how",
    # Programming-language keywords that carry no retrieval signal
    "def", "self", "return", "import", "const", "let", "var", "function",
}


def _tokenize(text):
    """Turn arbitrary text into a set of lower-case search tokens.

    The input is stringified (falsy values become the empty string),
    lower-cased and split on every run of characters outside ``a-z``,
    ``0-9`` and ``_``. Tokens of one or two characters and tokens listed in
    ``_STOPWORDS`` are discarded.

    Returns:
        A ``set`` of the remaining tokens (possibly empty).
    """
    lowered = str(text or "").lower()
    pieces = re.split(r"[^a-z0-9_]+", lowered)
    long_enough = {piece for piece in pieces if len(piece) >= 3}
    return long_enough - _STOPWORDS
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def _git_tracked_files(root):
    """Return git-tracked file paths relative to ``root``, or None.

    Runs ``git -C <root> ls-files -z``. The ``-z`` flag makes git emit each
    path verbatim, terminated by a NUL byte. Without it git wraps names that
    contain non-ASCII bytes, quotes, backslashes or control characters in
    double quotes with C-style escapes, and such quoted names never match a
    real file on disk.

    Args:
        root: Directory to run git in (anything accepted by ``str()``).

    Returns:
        A non-empty list of path strings, or ``None`` when git cannot be run,
        exits non-zero (e.g. not a repository), times out, or lists no files.
    """
    try:
        out = subprocess.run(
            ["git", "-C", str(root), "ls-files", "-z"],
            capture_output=True, timeout=30,
        )
    except (OSError, subprocess.SubprocessError):
        return None
    if out.returncode != 0:
        return None
    # Decode leniently: a name that is not valid UTF-8 becomes a string with
    # U+FFFD in it instead of raising here.
    listing = out.stdout.decode("utf-8", errors="replace")
    # NUL-terminated records: names keep any leading/trailing whitespace.
    files = [name for name in listing.split("\0") if name]
    return files or None
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def _walk_files(root):
    """List every file under ``root`` by walking the filesystem.

    Fallback used when git cannot provide a file list. Directories whose name
    is in ``SKIP_DIRS`` are pruned and never descended into.

    Returns:
        A list of paths relative to ``root``, in ``os.walk`` order.
    """
    base = Path(root)
    found = []
    for current_dir, subdirs, names in os.walk(base):
        # Prune in place so os.walk does not descend into skipped dirs.
        subdirs[:] = [name for name in subdirs if name not in SKIP_DIRS]
        found.extend(
            os.path.relpath(os.path.join(current_dir, name), base)
            for name in names
        )
    return found
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def list_source_files(root):
    """Return the sorted, capped list of indexable source files under ``root``.

    Candidates come from ``_git_tracked_files`` when it returns a list,
    otherwise from ``_walk_files``. A candidate is kept when:

    - none of its path components is in ``SKIP_DIRS`` (tracked files can live
      in such directories if they were committed),
    - its lower-cased extension is in ``SOURCE_EXTS``,
    - it is a regular file no larger than ``MAX_FILE_BYTES``.

    Candidates that raise ``OSError`` while being inspected are skipped.

    Returns:
        At most ``MAX_FILES`` relative paths, sorted ascending.
    """
    base = Path(root)
    tracked = _git_tracked_files(base)
    pool = tracked if tracked is not None else _walk_files(base)

    def _eligible(rel):
        if SKIP_DIRS.intersection(Path(rel).parts):
            return False
        suffix = os.path.splitext(rel)[1].lower()
        if suffix not in SOURCE_EXTS:
            return False
        target = base / rel
        try:
            return target.is_file() and target.stat().st_size <= MAX_FILE_BYTES
        except OSError:
            return False

    return sorted(filter(_eligible, pool))[:MAX_FILES]
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def _read_lines(abs_path):
    """Read a text file and return its lines without line terminators.

    The file is decoded as UTF-8 with undecodable bytes replaced. Lines are
    delimited only by real newlines (``\\n``, ``\\r\\n`` and ``\\r``, which
    text mode normalises to ``\\n``). ``str.splitlines()`` is deliberately not
    used: it also breaks on form feed, vertical tab, U+0085, U+2028 and
    U+2029, which would shift every later line number away from what editors
    and newline-based line counts report.

    Args:
        abs_path: Path of the file to read.

    Returns:
        A list of lines, or an empty list when the file cannot be opened or
        read (``OSError``).
    """
    try:
        with open(abs_path, "r", encoding="utf-8", errors="replace") as f:
            # Each iterated line ends with at most one "\n".
            return [line.rstrip("\n") for line in f]
    except OSError:
        return []
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def build_index(root):
    """Index the project's source files as fixed-size, line-anchored chunks.

    Every file returned by ``list_source_files`` is read and cut into
    consecutive windows of ``CHUNK_LINES`` lines. Windows that contain only
    blank lines are dropped.

    Returns:
        A dict of the form::

            {
              "root": <resolved absolute root, as str>,
              "files": [<relative path>, ...],
              "chunks": [
                {"id": int, "file": <relative path>,
                 "start_line": int, "end_line": int, "text": <chunk text>},
                ...
              ],
            }

        Chunk ids count up from 0 in emission order. ``start_line`` and
        ``end_line`` are 1-based and inclusive. Chunk ``file`` values are the
        relative paths from ``list_source_files``, not absolute paths.
    """
    base = Path(root).resolve()
    rel_paths = list_source_files(base)
    chunks = []
    for rel in rel_paths:
        content = _read_lines(base / rel)
        # An unreadable or empty file yields no offsets at all.
        for offset in range(0, len(content), CHUNK_LINES):
            window = content[offset:offset + CHUNK_LINES]
            if all(not entry.strip() for entry in window):
                continue  # nothing but blank lines
            chunks.append({
                "id": len(chunks),
                "file": rel,
                "start_line": offset + 1,
                "end_line": offset + len(window),
                "text": "\n".join(window),
            })
    return {"root": str(base), "files": rel_paths, "chunks": chunks}
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
def retrieve(index, query, k=6):
    """Return the ``k`` best-matching chunks for ``query``, deterministically.

    Each chunk in ``index["chunks"]`` is scored as:

    - 3 points per query token that also occurs in the chunk text,
    - 2 points per query token that also occurs in the chunk's file path,
    - 4 points when the lower-cased query (longer than 3 characters) appears
      verbatim in the lower-cased chunk text.

    Chunks scoring zero are left out. Results are ordered by descending
    score, with the chunk id as tiebreak, so the output is reproducible.
    No LLM or network access is involved.

    Args:
        index: Mapping with a ``"chunks"`` list, as produced by ``build_index``.
        query: Free-text question.
        k: Maximum number of chunks to return.

    Returns:
        A list of chunk dicts, best first.
    """
    wanted = _tokenize(query)
    phrase = str(query or "").lower()
    phrase_counts = len(phrase) > 3
    ranked = []
    for chunk in index.get("chunks", []):
        body = chunk.get("text", "")
        points = 3 * len(wanted & _tokenize(body))
        # File and directory names are strong relevance signals.
        points += 2 * len(wanted & _tokenize(chunk.get("file", "")))
        if phrase_counts and phrase in body.lower():
            points += 4
        if points > 0:
            # Negated score, so an ascending sort puts the best chunk first.
            ranked.append((-points, chunk["id"], chunk))
    ranked.sort(key=lambda entry: entry[:2])
    return [chunk for _, _, chunk in ranked[:k]]
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
def compute_signature(root):
    """Compute a SHA-256 signature of the indexed source set.

    The digest covers, in order:

    - ``b"head:"`` plus the output of ``git rev-parse HEAD`` when that command
      succeeds (skipped when git is unavailable or fails),
    - for each path from ``list_source_files``, the UTF-8 path followed by
      the SHA-256 digest of the file's bytes. Unreadable files are skipped.

    Callers can compare signatures to skip regeneration when nothing changed.

    Returns:
        The hex digest string.
    """
    base = Path(root).resolve()
    digest = hashlib.sha256()

    try:
        rev = subprocess.run(
            ["git", "-C", str(base), "rev-parse", "HEAD"],
            capture_output=True, text=True, timeout=15,
        )
    except (OSError, subprocess.SubprocessError):
        rev = None
    if rev is not None and rev.returncode == 0:
        # Including HEAD makes any new commit invalidate the signature.
        digest.update(b"head:" + rev.stdout.strip().encode())

    for rel in list_source_files(base):
        try:
            payload = (base / rel).read_bytes()
        except OSError:
            continue
        digest.update(rel.encode("utf-8") + hashlib.sha256(payload).digest())
    return digest.hexdigest()
|
|
230
|
+
|
|
231
|
+
|
|
232
|
+
def extract_definitions(root, rel, limit=12):
    """Find definition lines in one file, for code-derived citations.

    Each line of ``root / rel`` is tested against a fixed list of regexes
    (``def``/``class``, JS ``function`` and arrow ``const``, ``fn``, ``func``
    and shell-style ``name() {``). The first regex that matches a line wins.

    Args:
        root: Project root directory.
        rel: File path relative to ``root``.
        limit: Scanning stops after a line once this many definitions have
            been collected.

    Returns:
        A list of ``{"name": str, "line": int}`` dicts in file order, where
        ``line`` is the 1-based line the match was found on. Empty when the
        file cannot be read.
    """
    matchers = tuple(
        re.compile(source)
        for source in (
            r"^\s*def\s+([A-Za-z_][A-Za-z0-9_]*)",
            r"^\s*class\s+([A-Za-z_][A-Za-z0-9_]*)",
            r"^\s*(?:export\s+)?(?:async\s+)?function\s+([A-Za-z_$][\w$]*)",
            r"^\s*(?:export\s+)?const\s+([A-Za-z_$][\w$]*)\s*=\s*(?:async\s*)?\(",
            r"^\s*(?:pub\s+)?fn\s+([A-Za-z_][A-Za-z0-9_]*)",
            r"^\s*func\s+(?:\([^)]*\)\s*)?([A-Za-z_][A-Za-z0-9_]*)",
            r"^\s*([A-Za-z_][A-Za-z0-9_]*)\s*\(\)\s*\{",  # bash funcs
        )
    )
    found = []
    for lineno, content in enumerate(_read_lines(Path(root) / rel), start=1):
        attempts = (matcher.match(content) for matcher in matchers)
        hit = next((m for m in attempts if m), None)
        if hit is not None:
            found.append({"name": hit.group(1), "line": lineno})
        if len(found) >= limit:
            break
    return found
|
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
"""wiki_llm.py -- stub-aware LLM invocation + citation validation for R5 wiki.
|
|
2
|
+
|
|
3
|
+
Keeps every paid-call and grounding-guarantee concern in one place so the
|
|
4
|
+
generator and the ask script behave identically.
|
|
5
|
+
|
|
6
|
+
LLM stub contract (CI-safe, zero paid calls in tests):
|
|
7
|
+
LOKI_WIKI_LLM_STUB unset -> call the real provider (claude -p / codex / ...)
|
|
8
|
+
via the same mechanism loki docs uses; if no
|
|
9
|
+
provider is on PATH, return None (callers then
|
|
10
|
+
fall back to extractive/template output).
|
|
11
|
+
LOKI_WIKI_LLM_STUB=<file path> -> read the completion from that file.
|
|
12
|
+
LOKI_WIKI_LLM_STUB=<other> -> use the value literally as the completion.
|
|
13
|
+
|
|
14
|
+
No third-party dependencies.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
import os
|
|
20
|
+
import re
|
|
21
|
+
import subprocess
|
|
22
|
+
from pathlib import Path
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def invoke_llm(prompt, timeout=120):
    """Run the configured LLM provider on ``prompt`` and return its output.

    Stub mode: when ``LOKI_WIKI_LLM_STUB`` is set, no provider is invoked.
    If the value contains a path separator or ends in ``.txt`` and names an
    existing file, that file's text is returned (``""`` if reading fails);
    otherwise the value itself is returned.

    Provider mode: the provider name comes from ``LOKI_PROVIDER`` (default
    ``claude``), overridden by the non-empty contents of
    ``.loki/state/provider`` if that file exists. The provider CLI is run
    with stdin closed, wrapped in ``timeout``/``gtimeout`` when one is on
    PATH.

    Args:
        prompt: Prompt text passed to the provider as a CLI argument.
        timeout: Seconds allowed for the provider; the subprocess itself is
            given ten extra seconds.

    Returns:
        The provider's stdout (or the stub text), or ``None`` when the
        provider is unknown, not on PATH, fails to start, times out, or exits
        non-zero without producing any output.
    """
    canned = os.environ.get("LOKI_WIKI_LLM_STUB")
    if canned is not None:
        looks_like_path = canned.endswith(".txt") or os.path.sep in canned
        stub_file = Path(canned)
        if looks_like_path and stub_file.is_file():
            try:
                return stub_file.read_text(encoding="utf-8", errors="replace")
            except OSError:
                return ""
        return canned

    provider = os.environ.get("LOKI_PROVIDER", "claude")
    override = Path(".loki/state/provider")
    if override.is_file():
        try:
            provider = override.read_text().strip() or provider
        except OSError:
            pass

    argv = {
        "claude": ["claude", "-p", prompt],
        "codex": ["codex", "exec", "--full-auto", prompt],
        "cline": ["cline", "-y", prompt],
        "aider": ["aider", "--message", prompt, "--yes-always", "--no-auto-commits"],
    }.get(provider)
    if argv is None or not _which(argv[0]):
        return None

    # Prefer an external timeout wrapper when one is installed.
    wrapper = next((tool for tool in ("timeout", "gtimeout") if _which(tool)), None)
    if wrapper:
        argv = [wrapper, str(timeout)] + argv

    try:
        done = subprocess.run(
            argv,
            capture_output=True,
            text=True,
            timeout=timeout + 10,
            stdin=subprocess.DEVNULL,
        )
    except (OSError, subprocess.SubprocessError):
        return None

    failed_without_output = done.returncode != 0 and not done.stdout.strip()
    return None if failed_without_output else done.stdout
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def _which(name):
    """Locate an executable called ``name`` on ``PATH``.

    Each ``PATH`` entry is joined with ``name``; the first result that is a
    regular file with execute permission is returned.

    Returns:
        The joined path of the first match, or ``None`` if there is none.
    """
    search_dirs = os.environ.get("PATH", "").split(os.pathsep)
    candidates = (os.path.join(folder, name) for folder in search_dirs)
    return next(
        (path for path in candidates
         if os.path.isfile(path) and os.access(path, os.X_OK)),
        None,
    )
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
# Numeric citation marker as shown to the LLM, e.g. "[3]".
_CITE_RE = re.compile(r"\[(\d+)\]")


def map_and_validate_citations(answer_text, chunks, root):
    """Rewrite ``[n]`` markers in an answer as validated ``[file:line]`` citations.

    ``chunks`` is the numbered chunk list that was shown to the LLM, so
    ``[n]`` refers to ``chunks[n-1]``. A marker is kept only when:

    - ``n`` is within range of ``chunks``,
    - the chunk names a file that exists under ``root``, and
    - the chunk's ``start_line`` is an integer between 1 and the file's line
      count (an empty file still accepts line 1).

    Any other marker is removed. Only whitespace directly next to a removed
    marker is tidied (a run of two or more spaces/tabs becomes one space);
    all other whitespace in the answer, such as code-block indentation, is
    left untouched.

    Args:
        answer_text: LLM answer containing ``[n]`` markers; ``None`` is
            treated as the empty string.
        chunks: Sequence of chunk dicts with ``file`` and ``start_line`` keys.
        root: Directory that chunk ``file`` paths are relative to.

    Returns:
        ``(clean_text, citations)`` where ``citations`` is a de-duplicated
        list of ``{"file": rel, "line": int}`` in first-appearance order.
    """
    root = Path(root)
    citations = []
    seen = set()
    resolved = {}  # marker text -> citation or None, so each file is read once

    def _validate(ch):
        rel = ch.get("file")
        if not rel:
            return None  # a chunk without a path cannot be cited
        try:
            line = int(ch.get("start_line", 1))
        except (TypeError, ValueError):
            return None
        abs_path = root / rel
        try:
            if not abs_path.is_file():
                return None
            with open(abs_path, "r", encoding="utf-8", errors="replace") as f:
                nlines = sum(1 for _ in f)
        except OSError:
            return None
        if line < 1 or line > max(nlines, 1):
            return None
        return {"file": rel, "line": line}

    def _resolve(idx_text):
        if idx_text not in resolved:
            cite = None
            try:
                idx = int(idx_text)
            except ValueError:
                idx = 0  # absurdly long digit runs cannot be a real index
            if 1 <= idx <= len(chunks):
                cite = _validate(chunks[idx - 1])
            resolved[idx_text] = cite
        return resolved[idx_text]

    # With one capture group, split() alternates literal text and marker
    # digits: [text, digits, text, digits, ..., text].
    pieces = _CITE_RE.split(answer_text or "")
    out = [pieces[0]]
    for idx_text, literal in zip(pieces[1::2], pieces[2::2]):
        cite = _resolve(idx_text)
        if cite is not None:
            key = (cite["file"], cite["line"])
            if key not in seen:
                seen.add(key)
                citations.append(cite)
            out.append("[%s:%d]" % (cite["file"], cite["line"]))
            out.append(literal)
            continue
        # Dropped marker: splice the text on both sides together and collapse
        # only the whitespace that met across the gap. out[-1] is always the
        # most recent literal segment.
        before = out[-1]
        head = before.rstrip(" \t")
        tail = literal.lstrip(" \t")
        gap = before[len(head):] + literal[:len(literal) - len(tail)]
        out[-1] = head + (" " if len(gap) >= 2 else gap) + tail
    return "".join(out), citations
|