loom-code 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- loom_code/__init__.py +22 -0
- loom_code/_post_commit.py +119 -0
- loom_code/agent.py +544 -0
- loom_code/approval.py +616 -0
- loom_code/browse/__init__.py +291 -0
- loom_code/browse/act.py +467 -0
- loom_code/browse/observe.py +249 -0
- loom_code/browse/session.py +96 -0
- loom_code/browse/verify.py +194 -0
- loom_code/checkpoint.py +283 -0
- loom_code/cli.py +495 -0
- loom_code/code_index.py +703 -0
- loom_code/compact.py +143 -0
- loom_code/consent.py +47 -0
- loom_code/credentials.py +527 -0
- loom_code/edit_tool.py +635 -0
- loom_code/extensions.py +522 -0
- loom_code/file_history.py +322 -0
- loom_code/file_tools.py +93 -0
- loom_code/git_hook.py +200 -0
- loom_code/grep_tool.py +430 -0
- loom_code/hooks.py +297 -0
- loom_code/loominit/__init__.py +23 -0
- loom_code/loominit/_ast_walk.py +429 -0
- loom_code/loominit/_files.py +284 -0
- loom_code/loominit/_graph.py +141 -0
- loom_code/loominit/_resolve.py +392 -0
- loom_code/loominit/_tests_map.py +108 -0
- loom_code/loominit/extractor.py +332 -0
- loom_code/loominit/repomap.py +225 -0
- loom_code/loominit/schema.py +242 -0
- loom_code/lsp_tools.py +396 -0
- loom_code/mcp_host.py +79 -0
- loom_code/operator.py +449 -0
- loom_code/paste.py +97 -0
- loom_code/paths.py +52 -0
- loom_code/permissions.py +177 -0
- loom_code/project.py +104 -0
- loom_code/prompts.py +451 -0
- loom_code/render.py +783 -0
- loom_code/repl.py +4080 -0
- loom_code/rules.py +267 -0
- loom_code/sandboxed_bash.py +176 -0
- loom_code/scribe.py +88 -0
- loom_code/skills/__init__.py +16 -0
- loom_code/skills/graphify/SKILL.md +97 -0
- loom_code/skills/graphify/tools.py +570 -0
- loom_code/trust.py +216 -0
- loom_code/turn.py +169 -0
- loom_code/web_fetch.py +370 -0
- loom_code/workers.py +758 -0
- loom_code/worktree.py +134 -0
- loom_code-0.1.1.dist-info/METADATA +224 -0
- loom_code-0.1.1.dist-info/RECORD +58 -0
- loom_code-0.1.1.dist-info/WHEEL +5 -0
- loom_code-0.1.1.dist-info/entry_points.txt +2 -0
- loom_code-0.1.1.dist-info/licenses/LICENSE +21 -0
- loom_code-0.1.1.dist-info/top_level.txt +1 -0
loom_code/code_index.py
ADDED
|
@@ -0,0 +1,703 @@
|
|
|
1
|
+
"""Semantic codebase index — embed source symbols, search by meaning.
|
|
2
|
+
|
|
3
|
+
The differentiator loom-code ships over grep: ``grep`` finds the *string*
|
|
4
|
+
``authenticate``; ``codebase_search`` finds the code that *handles auth*
|
|
5
|
+
even when the word never appears. It mirrors Cursor's ``@Codebase`` —
|
|
6
|
+
but local, in the same ``.loom`` partition as memory, and (Phase 1b)
|
|
7
|
+
fusible with what the agent has *learned* about that code across runs.
|
|
8
|
+
|
|
9
|
+
How it works, end to end:
|
|
10
|
+
|
|
11
|
+
* **Chunk** — reuse the structural AST walk (:func:`walk_python_file`)
|
|
12
|
+
to split each ``.py`` file into class/function/method chunks. Bare
|
|
13
|
+
module constants are skipped (same call repomap's ``_score`` makes:
|
|
14
|
+
they're noise in a semantic overview). Each chunk's embeddable text
|
|
15
|
+
is ``path + qualified_name + signature + docstring + body`` — the
|
|
16
|
+
body is sliced ``line:end_line`` from the file we already read.
|
|
17
|
+
* **Embed** — via the SAME embedder loom-code picks for memory
|
|
18
|
+
(:class:`OpenAIEmbedder` for OpenAI chat models, :class:`HashEmbedder`
|
|
19
|
+
otherwise — zero-key, offline, lower quality but never a cross-
|
|
20
|
+
provider call). The caller passes the resolved name so the index and
|
|
21
|
+
memory always embed in the same space (Phase 1b fuses them). Both
|
|
22
|
+
embedders expose ``async embed_batch(texts) -> list[list[float]]``.
|
|
23
|
+
* **Store** — a SEPARATE sqlite db ``<root>/.loom/code_index.db``.
|
|
24
|
+
NOT ``memory.db``: loomflow's memory schema is locked to Episodes /
|
|
25
|
+
Facts, so a fourth data model (code chunks) gets its own file. Per-
|
|
26
|
+
file ``sha256`` gates re-embedding — only changed files re-embed,
|
|
27
|
+
which matters because OpenAI embedding costs real money per token.
|
|
28
|
+
* **Search** — cosine over the stored vectors, grouped + file:line
|
|
29
|
+
cited like ``grep`` so the agent can ``read`` the exact range next.
|
|
30
|
+
|
|
31
|
+
Python-only today (the AST walk is stdlib ``ast``). The walk already
|
|
32
|
+
routes by language, so a future tree-sitter backend drops in here
|
|
33
|
+
without touching the tool or the store — the seam is the chunker, not
|
|
34
|
+
the index.
|
|
35
|
+
|
|
36
|
+
Failure is always graceful: a broken build, a missing embedder key, an
|
|
37
|
+
empty index — the tool returns a one-line explanation, never raises.
|
|
38
|
+
A semantic-search outage must not abort a turn.
|
|
39
|
+
"""
|
|
40
|
+
|
|
41
|
+
from __future__ import annotations
|
|
42
|
+
|
|
43
|
+
import hashlib
|
|
44
|
+
import math
|
|
45
|
+
import sqlite3
|
|
46
|
+
import struct
|
|
47
|
+
from collections.abc import Sequence
|
|
48
|
+
from dataclasses import dataclass
|
|
49
|
+
from pathlib import Path
|
|
50
|
+
from typing import Any
|
|
51
|
+
|
|
52
|
+
from loomflow import tool
|
|
53
|
+
from loomflow.tools.registry import Tool
|
|
54
|
+
|
|
55
|
+
from .loominit._ast_walk import walk_python_file
|
|
56
|
+
|
|
57
|
+
# Directory names that are never descended into when indexing: version
# control, virtualenvs, vendored packages, caches and build output. This
# mirrors repomap's skip set in spirit but is deliberately a separate
# constant so the two can diverge (the code index may later want to
# include tests, which the overview map collapses).
_SKIP_DIRS: frozenset[str] = frozenset(
    (
        # VCS and loom's own state
        ".git",
        ".loom",
        # environments and vendored dependencies
        ".venv",
        "venv",
        "node_modules",
        "site-packages",
        # tool caches
        "__pycache__",
        ".mypy_cache",
        ".ruff_cache",
        ".pytest_cache",
        ".tox",
        # build artefacts
        "dist",
        "build",
    )
)
|
|
78
|
+
|
|
79
|
+
# Upper bound on how many source lines of a symbol's body are placed in
# its embeddable text. A very long function would blow the embedder's
# token limit and dilute its own signal (the first ~120 lines carry the
# intent; the tail is detail). Only the embedded text is truncated: the
# stored line range still spans the whole symbol, so the agent can read
# all of it from the file:line citation.
_MAX_CHUNK_LINES = 120

# Default number of results per search: enough to surface the relevant
# cluster, few enough not to flood the model's context. Matches grep's
# file-cap feel.
_DEFAULT_LIMIT = 8
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
@dataclass(frozen=True)
class _Chunk:
    """An immutable record for one indexable symbol (class / function / method)."""

    # --- where the symbol lives -------------------------------------------
    path: str  # POSIX-style path relative to the repo root
    qualified_name: str
    kind: str
    # --- the symbol's line span and signature --------------------------------
    start_line: int
    end_line: int
    signature: str
    # --- what gets embedded -----------------------------------------------
    text: str  # embeddable document: header lines followed by the body
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
@dataclass(frozen=True)
class CodeHit:
    """One ranked semantic-search result.

    Carries everything needed to cite the symbol (``path`` plus its line
    range) and to show what it is (``kind``, ``qualified_name``,
    ``signature``), together with the similarity ``score`` it ranked at.
    """

    path: str
    qualified_name: str
    kind: str
    start_line: int
    end_line: int
    signature: str
    score: float  # similarity to the query; higher ranks first
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
# ---------------------------------------------------------------------------
|
|
118
|
+
# Embedder resolution (shared with memory — see agent._is_openai_model)
|
|
119
|
+
# ---------------------------------------------------------------------------
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def resolve_embedder(name: str) -> Any:
    """Instantiate the embedding backend selected by ``name``.

    ``"openai"`` yields an ``OpenAIEmbedder``; every other value —
    including ``"hash"`` and anything unrecognised — yields a
    ``HashEmbedder``, so an unknown name degrades to "works, lower
    quality" instead of failing.

    Callers are expected to pass the same name loom-code resolved for
    memory so the code index and the note store embed in one vector
    space. Both backends are used through
    ``async embed_batch(texts) -> list[list[float]]``.

    The loomflow import is deferred to call time, so an import problem
    surfaces when the embedder is requested rather than at module import.
    """
    from loomflow.memory import HashEmbedder, OpenAIEmbedder

    # Per the original note: OpenAIEmbedder uses text-embedding-3-small and
    # reads OPENAI_API_KEY; HashEmbedder needs no key and works offline.
    backend_cls = OpenAIEmbedder if name == "openai" else HashEmbedder
    return backend_cls()
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
# ---------------------------------------------------------------------------
|
|
141
|
+
# Chunking — AST walk -> embeddable units
|
|
142
|
+
# ---------------------------------------------------------------------------
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def _iter_py_files(root: Path) -> list[Path]:
    """Return every ``*.py`` path under ``root`` outside a skipped directory.

    A file is excluded when any directory component *between ``root`` and
    the file* is named in ``_SKIP_DIRS``. Components of ``root`` itself are
    deliberately ignored: a project checked out at, say,
    ``~/build/myproject`` must not have all of its files discarded just
    because an ancestor directory happens to be called ``build``.

    Args:
        root: Directory to scan recursively.

    Returns:
        Matching paths in ``Path.rglob`` order, each still prefixed with
        ``root``.
    """
    return [
        candidate
        for candidate in root.rglob("*.py")
        # ``.parent.parts`` drops the filename, leaving only the directory
        # components below root.
        if _SKIP_DIRS.isdisjoint(candidate.relative_to(root).parent.parts)
    ]
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def _file_sha256(text: str) -> str:
    """Return the hex SHA-256 digest of ``text`` encoded as UTF-8.

    Characters that cannot be encoded (e.g. lone surrogates) are replaced
    instead of raising, so any ``str`` can be hashed.
    """
    payload = text.encode("utf-8", errors="replace")
    hasher = hashlib.sha256()
    hasher.update(payload)
    return hasher.hexdigest()
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
def _chunks_for_file(rel_path: str, source: str) -> list[_Chunk]:
    """Turn one file's source text into embeddable chunks.

    Symbols come from ``walk_python_file``; those whose ``kind`` is
    ``"constant"`` are dropped (a single line of value with no behaviour
    to search for). Per the original note, the walk is expected to return
    empty lists on a syntax error rather than raise, so a broken file
    simply yields no chunks — confirm against ``walk_python_file``.

    Args:
        rel_path: Path recorded on every chunk and used in its header.
        source: Full text of the file.

    Returns:
        One ``_Chunk`` per non-constant symbol, in walk order.
    """
    symbols, _imports, _decorators = walk_python_file(source, rel_path)
    source_lines = source.splitlines()
    result: list[_Chunk] = []
    for symbol in symbols:
        if symbol.kind != "constant":
            # Symbol line numbers are treated as 1-based inclusive. Only
            # the embedded excerpt is capped at _MAX_CHUNK_LINES; the chunk
            # keeps the symbol's true range for citation.
            first = max(symbol.line, 1)
            last = min(symbol.end_line, first + _MAX_CHUNK_LINES - 1)
            excerpt = "\n".join(source_lines[first - 1 : last])

            # Embeddable document: location + identity, signature, the
            # docstring's first line when there is one, then the body.
            pieces = [f"{rel_path} :: {symbol.qualified_name}", symbol.signature]
            summary_line = symbol.docstring_first_line
            if summary_line:
                pieces.append(summary_line)
            pieces.append(excerpt)

            result.append(
                _Chunk(
                    path=rel_path,
                    qualified_name=symbol.qualified_name,
                    kind=symbol.kind,
                    start_line=symbol.line,
                    end_line=symbol.end_line,
                    signature=symbol.signature,
                    text="\n".join(pieces),
                )
            )
    return result
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
# ---------------------------------------------------------------------------
|
|
202
|
+
# Store — sqlite, separate from memory.db
|
|
203
|
+
# ---------------------------------------------------------------------------
|
|
204
|
+
|
|
205
|
+
# Vectors persist as packed little-endian float32 blobs — compact and
|
|
206
|
+
# numpy-free (loom-code has no numpy dep; cosine is a plain loop). The
|
|
207
|
+
# dimension is implied by the blob length, so the store is agnostic to
|
|
208
|
+
# whether hash (384) or openai (1536) wrote it; switching embedders
|
|
209
|
+
# invalidates every file via the staleness gate, forcing a clean
|
|
210
|
+
# re-embed in the new dimension.
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
def _pack(vec: Sequence[float]) -> bytes:
    """Serialise ``vec`` as consecutive little-endian float32 values."""
    layout = f"<{len(vec)}f"
    return struct.pack(layout, *vec)
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
def _unpack(blob: bytes) -> list[float]:
    """Decode a blob of little-endian float32 values into a list.

    The element count is taken from the blob length (4 bytes per value).
    """
    count = len(blob) // 4
    values = struct.unpack(f"<{count}f", blob)
    return list(values)
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
class CodeIndexStore:
|
|
222
|
+
"""The sqlite-backed code-chunk index for one project.
|
|
223
|
+
|
|
224
|
+
Async note: sqlite calls here are synchronous and fast (local
|
|
225
|
+
file). They run inside the tool's async function but are not
|
|
226
|
+
offloaded to a thread — acceptable because a query is a single
|
|
227
|
+
indexed read + an in-process cosine loop, well under the latency
|
|
228
|
+
that would justify a thread pool. Embedding (the slow part) IS
|
|
229
|
+
async and awaited.
|
|
230
|
+
"""
|
|
231
|
+
|
|
232
|
+
def __init__(self, db_path: Path, embedder_name: str) -> None:
|
|
233
|
+
self._db_path = db_path
|
|
234
|
+
self._embedder_name = embedder_name
|
|
235
|
+
self._conn = sqlite3.connect(str(db_path))
|
|
236
|
+
self._conn.execute(
|
|
237
|
+
"""
|
|
238
|
+
CREATE TABLE IF NOT EXISTS chunks (
|
|
239
|
+
id TEXT PRIMARY KEY, -- path::qualname
|
|
240
|
+
path TEXT NOT NULL,
|
|
241
|
+
qualname TEXT NOT NULL,
|
|
242
|
+
kind TEXT NOT NULL,
|
|
243
|
+
start_line INTEGER NOT NULL,
|
|
244
|
+
end_line INTEGER NOT NULL,
|
|
245
|
+
signature TEXT NOT NULL,
|
|
246
|
+
embedding BLOB NOT NULL
|
|
247
|
+
)
|
|
248
|
+
"""
|
|
249
|
+
)
|
|
250
|
+
# Per-file content hash — re-embed only what changed. Stores the
|
|
251
|
+
# embedder name too, so switching providers (hash -> openai)
|
|
252
|
+
# invalidates every file (different vector space).
|
|
253
|
+
self._conn.execute(
|
|
254
|
+
"""
|
|
255
|
+
CREATE TABLE IF NOT EXISTS files (
|
|
256
|
+
path TEXT PRIMARY KEY,
|
|
257
|
+
sha256 TEXT NOT NULL,
|
|
258
|
+
embedder TEXT NOT NULL
|
|
259
|
+
)
|
|
260
|
+
"""
|
|
261
|
+
)
|
|
262
|
+
self._conn.execute(
|
|
263
|
+
"CREATE INDEX IF NOT EXISTS idx_chunks_path ON chunks(path)"
|
|
264
|
+
)
|
|
265
|
+
self._conn.commit()
|
|
266
|
+
|
|
267
|
+
def close(self) -> None:
|
|
268
|
+
self._conn.close()
|
|
269
|
+
|
|
270
|
+
def file_is_fresh(self, rel_path: str, sha: str) -> bool:
|
|
271
|
+
"""True when ``rel_path`` is already indexed at ``sha`` with the
|
|
272
|
+
current embedder — i.e. nothing to re-embed."""
|
|
273
|
+
row = self._conn.execute(
|
|
274
|
+
"SELECT sha256, embedder FROM files WHERE path = ?", (rel_path,)
|
|
275
|
+
).fetchone()
|
|
276
|
+
return (
|
|
277
|
+
row is not None
|
|
278
|
+
and row[0] == sha
|
|
279
|
+
and row[1] == self._embedder_name
|
|
280
|
+
)
|
|
281
|
+
|
|
282
|
+
def replace_file_chunks(
|
|
283
|
+
self,
|
|
284
|
+
rel_path: str,
|
|
285
|
+
sha: str,
|
|
286
|
+
chunks: list[_Chunk],
|
|
287
|
+
vectors: list[Sequence[float]],
|
|
288
|
+
) -> None:
|
|
289
|
+
"""Atomically swap one file's chunks (delete-then-insert in a
|
|
290
|
+
single transaction) so a crash mid-reindex never leaves a file
|
|
291
|
+
half-indexed."""
|
|
292
|
+
cur = self._conn
|
|
293
|
+
cur.execute("DELETE FROM chunks WHERE path = ?", (rel_path,))
|
|
294
|
+
for chunk, vec in zip(chunks, vectors, strict=True):
|
|
295
|
+
cur.execute(
|
|
296
|
+
"INSERT OR REPLACE INTO chunks "
|
|
297
|
+
"(id, path, qualname, kind, start_line, end_line, "
|
|
298
|
+
"signature, embedding) "
|
|
299
|
+
"VALUES (?, ?, ?, ?, ?, ?, ?, ?)",
|
|
300
|
+
(
|
|
301
|
+
f"{chunk.path}::{chunk.qualified_name}",
|
|
302
|
+
chunk.path,
|
|
303
|
+
chunk.qualified_name,
|
|
304
|
+
chunk.kind,
|
|
305
|
+
chunk.start_line,
|
|
306
|
+
chunk.end_line,
|
|
307
|
+
chunk.signature,
|
|
308
|
+
_pack(vec),
|
|
309
|
+
),
|
|
310
|
+
)
|
|
311
|
+
cur.execute(
|
|
312
|
+
"INSERT OR REPLACE INTO files (path, sha256, embedder) "
|
|
313
|
+
"VALUES (?, ?, ?)",
|
|
314
|
+
(rel_path, sha, self._embedder_name),
|
|
315
|
+
)
|
|
316
|
+
cur.commit()
|
|
317
|
+
|
|
318
|
+
def prune_missing(self, live_paths: set[str]) -> None:
|
|
319
|
+
"""Drop chunks/files for source files that no longer exist (the
|
|
320
|
+
delete half of incremental indexing)."""
|
|
321
|
+
rows = self._conn.execute("SELECT path FROM files").fetchall()
|
|
322
|
+
stale = [r[0] for r in rows if r[0] not in live_paths]
|
|
323
|
+
for path in stale:
|
|
324
|
+
self._conn.execute("DELETE FROM chunks WHERE path = ?", (path,))
|
|
325
|
+
self._conn.execute("DELETE FROM files WHERE path = ?", (path,))
|
|
326
|
+
if stale:
|
|
327
|
+
self._conn.commit()
|
|
328
|
+
|
|
329
|
+
def search(self, query_vec: Sequence[float], limit: int) -> list[CodeHit]:
|
|
330
|
+
"""Cosine-rank every stored chunk against ``query_vec``.
|
|
331
|
+
|
|
332
|
+
A linear scan: fine for the tens-of-thousands of symbols a
|
|
333
|
+
normal repo has (each cosine is a 384–1536-float dot product).
|
|
334
|
+
If a monorepo ever makes this slow, the swap-in is a vector
|
|
335
|
+
index (sqlite-vec / faiss) behind this same method — callers
|
|
336
|
+
don't change.
|
|
337
|
+
"""
|
|
338
|
+
rows = self._conn.execute(
|
|
339
|
+
"SELECT path, qualname, kind, start_line, end_line, signature, "
|
|
340
|
+
"embedding FROM chunks"
|
|
341
|
+
).fetchall()
|
|
342
|
+
qn = _norm(query_vec)
|
|
343
|
+
if qn == 0.0:
|
|
344
|
+
return []
|
|
345
|
+
scored: list[CodeHit] = []
|
|
346
|
+
for path, qual, kind, start, end, sig, blob in rows:
|
|
347
|
+
vec = _unpack(blob)
|
|
348
|
+
score = _cosine(query_vec, vec, qn)
|
|
349
|
+
scored.append(
|
|
350
|
+
CodeHit(
|
|
351
|
+
path=path,
|
|
352
|
+
qualified_name=qual,
|
|
353
|
+
kind=kind,
|
|
354
|
+
start_line=start,
|
|
355
|
+
end_line=end,
|
|
356
|
+
signature=sig,
|
|
357
|
+
score=score,
|
|
358
|
+
)
|
|
359
|
+
)
|
|
360
|
+
scored.sort(key=lambda h: h.score, reverse=True)
|
|
361
|
+
return scored[:limit]
|
|
362
|
+
|
|
363
|
+
def is_empty(self) -> bool:
|
|
364
|
+
row = self._conn.execute("SELECT 1 FROM chunks LIMIT 1").fetchone()
|
|
365
|
+
return row is None
|
|
366
|
+
|
|
367
|
+
|
|
368
|
+
def _norm(vec: Sequence[float]) -> float:
    """Return the Euclidean (L2) length of ``vec``; ``0.0`` when empty."""
    squared_total = sum(component * component for component in vec)
    return math.sqrt(squared_total)
|
|
370
|
+
|
|
371
|
+
|
|
372
|
+
def _cosine(a: Sequence[float], b: Sequence[float], a_norm: float) -> float:
    """Cosine similarity of ``a`` and ``b``.

    Args:
        a: First vector (the query, in the scan loop).
        b: Second vector.
        a_norm: Precomputed norm of ``a``. It is constant across a scan,
            so the caller computes it once instead of once per chunk.

    Returns:
        ``dot(a, b) / (a_norm * norm(b))``, or ``0.0`` when the vectors
        differ in length or either norm is zero — degenerate input scores
        as "no similarity" instead of raising.
    """
    if len(a) != len(b):
        return 0.0
    if a_norm == 0.0:
        # Guard the division below: a zero-norm query has no direction.
        return 0.0
    bn = _norm(b)
    if bn == 0.0:
        return 0.0
    # strict=False: the length mismatch is already handled above, so zip
    # never needs to raise here.
    dot = sum(x * y for x, y in zip(a, b, strict=False))
    return dot / (a_norm * bn)
|
|
386
|
+
|
|
387
|
+
|
|
388
|
+
# ---------------------------------------------------------------------------
|
|
389
|
+
# Build — incremental index over the tree
|
|
390
|
+
# ---------------------------------------------------------------------------
|
|
391
|
+
|
|
392
|
+
|
|
393
|
+
async def build_index(
    root: Path, store: CodeIndexStore, embedder: Any
) -> tuple[int, int]:
    """Bring ``store`` up to date with the Python files under ``root``.

    For every file found: read it, hash it, and skip it when the store
    already holds that hash for the current embedder. Otherwise chunk it
    and embed all of its chunks with a single ``embed_batch()`` call,
    then replace the file's rows. Files that yield no chunks still get
    their hash recorded (with no chunk rows). Unreadable files are passed
    over but keep their existing rows. Finally, rows for files no longer
    present are pruned.

    Returns:
        ``(files_embedded, files_skipped)``. Files recorded with no
        chunks and unreadable files are counted in neither.
    """
    seen: set[str] = set()
    n_embedded = 0
    n_skipped = 0
    for source_path in _iter_py_files(root):
        rel_path = source_path.relative_to(root).as_posix()
        # Registered before the read so a transient read error does not
        # cause the file's rows to be pruned.
        seen.add(rel_path)
        try:
            text = source_path.read_text(encoding="utf-8", errors="replace")
        except OSError:
            continue

        digest = _file_sha256(text)
        if store.file_is_fresh(rel_path, digest):
            n_skipped += 1
            continue

        file_chunks = _chunks_for_file(rel_path, text)
        if file_chunks:
            vectors = await embedder.embed_batch(
                [chunk.text for chunk in file_chunks]
            )
            store.replace_file_chunks(rel_path, digest, file_chunks, vectors)
            n_embedded += 1
        else:
            # Empty / constant-only file: remember the hash so it is not
            # re-walked on every build, but store no chunks.
            store.replace_file_chunks(rel_path, digest, [], [])

    store.prune_missing(seen)
    return n_embedded, n_skipped
|
|
430
|
+
|
|
431
|
+
|
|
432
|
+
async def search_code(
    root: Path | str, embedder_name: str, query: str, *, limit: int = 8
) -> list[CodeHit]:
    """Refresh the index for ``root`` and return ranked hits for ``query``.

    This is the structured counterpart of the ``codebase_search`` tool:
    it returns :class:`CodeHit` objects instead of rendered text. The
    index lives at ``<root>/.loom/code_index.db`` and is refreshed
    incrementally on every call. The store opened here is always closed
    before returning.

    Args:
        root: Project directory to index and search.
        embedder_name: Embedder to use (see ``resolve_embedder``).
        query: Natural-language query text.
        limit: Maximum number of hits.

    Returns:
        Ranked hits, or ``[]`` when the index holds no chunks.
    """
    # Synchronous filesystem calls inside an async function are
    # intentional: a local resolve plus one mkdir, in a module that uses
    # sqlite and pathlib synchronously throughout.
    project_root = Path(root).resolve()  # noqa: ASYNC240
    index_file = project_root / ".loom" / "code_index.db"
    index_file.parent.mkdir(exist_ok=True)

    embedder = resolve_embedder(embedder_name)
    store = CodeIndexStore(index_file, embedder_name)
    try:
        await build_index(project_root, store, embedder)
        if not store.is_empty():
            query_vectors = await embedder.embed_batch([query])
            return store.search(query_vectors[0], limit)
        return []
    finally:
        store.close()
|
|
462
|
+
|
|
463
|
+
|
|
464
|
+
# ---------------------------------------------------------------------------
|
|
465
|
+
# Tool — codebase_search
|
|
466
|
+
# ---------------------------------------------------------------------------
|
|
467
|
+
|
|
468
|
+
|
|
469
|
+
def _render_hits(hits: list[CodeHit]) -> str:
    """Format hits as text, two lines per hit.

    The first line carries the score, kind, qualified name and a
    ``path:start-end`` citation; the second carries the stripped
    signature. An empty list renders as ``"no semantic matches"``.
    """
    if not hits:
        return "no semantic matches"
    rendered: list[str] = []
    for hit in hits:
        location = f"{hit.path}:{hit.start_line}-{hit.end_line}"
        rendered.extend(
            (
                f" [{hit.score:.2f}] {hit.kind} {hit.qualified_name} ({location})",
                f" {hit.signature.strip()}",
            )
        )
    return "\n".join(rendered)
|
|
480
|
+
|
|
481
|
+
|
|
482
|
+
# ---------------------------------------------------------------------------
|
|
483
|
+
# Phase 1b — blend code hits with learned notes (the differentiator)
|
|
484
|
+
# ---------------------------------------------------------------------------
|
|
485
|
+
|
|
486
|
+
# Reciprocal Rank Fusion (RRF) constant. An item at 0-based rank r in a
# result list contributes 1 / (k + r + 1); contributions are summed
# across lists. k = 60 is the canonical value (Cormack et al.): it damps
# top-rank dominance, so a strong #2 in both lists beats an item that is
# #1 in one list and absent from the other. Fusion is by RANK, not raw
# score, because the two stores score in different spaces (cosine here
# vs. the notebook's BM25/hybrid RRF) — ranks are comparable, raw scores
# are not. This is what lets "the code that does X" and "what we learned
# about X" share one list.
_RRF_K = 60
|
|
495
|
+
|
|
496
|
+
|
|
497
|
+
@dataclass(frozen=True)
class _BlendRow:
    """One entry of the fused result list: a code symbol or a learned note."""

    kind: str  # source of the row: "code" or "note"
    label: str  # text rendered for the row (may span more than one line)
    rrf: float  # reciprocal-rank-fusion score; rows are sorted by this
|
|
504
|
+
|
|
505
|
+
|
|
506
|
+
def _fuse(
    code_hits: list[CodeHit], note_matches: list[Any]
) -> list[_BlendRow]:
    """Merge code hits and note matches into one list ordered by RRF score.

    Each item scores ``1 / (_RRF_K + position)`` where ``position`` is its
    1-based rank within its own source list. No item appears in both
    sources, so this amounts to interleaving the two lists by
    within-source rank: the top code hit and the top note land next to
    each other, and a low-ranked note sinks below a high-ranked code hit.
    Equal scores keep code rows ahead of note rows (the sort is stable
    and code rows are added first).

    Args:
        code_hits: Code results, best first.
        note_matches: Note results, best first. Each is read through
            ``.summary`` (with ``.title`` / ``.slug``) and ``.snippet``;
            missing attributes fall back to defaults.

    Returns:
        All rows from both sources, highest score first.
    """
    blended: list[_BlendRow] = []

    for position, hit in enumerate(code_hits, start=1):
        location = f"{hit.path}:{hit.start_line}-{hit.end_line}"
        blended.append(
            _BlendRow(
                kind="code",
                label=(
                    f" code {hit.kind} {hit.qualified_name} ({location})\n"
                    f" {hit.signature.strip()}"
                ),
                rrf=1.0 / (_RRF_K + position),
            )
        )

    for position, match in enumerate(note_matches, start=1):
        # ``summary`` is structured note metadata, not a string: the title
        # and slug are read off it. ``snippet`` is the query-relevant
        # excerpt. The slug is shown so the note can be opened afterwards.
        meta = getattr(match, "summary", None)
        title = (getattr(meta, "title", None) or "learned note").strip()
        slug = getattr(meta, "slug", "") or ""
        excerpt = (getattr(match, "snippet", "") or "").strip().replace("\n", " ")
        if len(excerpt) > 160:
            # Keep the row short: 157 characters plus an ellipsis.
            excerpt = excerpt[:157] + "..."

        label = f" learned {title}"
        if slug:
            label += f" (note:{slug})"
        if excerpt:
            label += f"\n {excerpt}"

        blended.append(
            _BlendRow(
                kind="note",
                label=label,
                rrf=1.0 / (_RRF_K + position),
            )
        )

    return sorted(blended, key=lambda row: row.rrf, reverse=True)
|
|
559
|
+
|
|
560
|
+
|
|
561
|
+
def _render_blend(rows: list[_BlendRow], limit: int) -> str:
    """Join the labels of the first ``limit`` rows with newlines.

    An empty ``rows`` list renders as ``"no semantic matches"``.
    """
    if rows:
        visible = rows[:limit]
        return "\n".join(row.label for row in visible)
    return "no semantic matches"
|
|
565
|
+
|
|
566
|
+
|
|
567
|
+
def _current_user_id() -> str | None:
    """Return the ``user_id`` of the active run context, if there is one.

    Yields ``None`` when no run context is active, when the context has
    no ``user_id`` attribute, or when looking it up fails for any reason
    (including the context module not being importable). Used to keep
    note recall partitioned per user.
    """
    try:
        from loomflow.core.context import get_run_context

        run_ctx = get_run_context()
        if run_ctx is None:
            return None
        return getattr(run_ctx, "user_id", None)
    except Exception:
        # Best effort by design: a missing tenant is not an error here.
        return None
|
|
578
|
+
|
|
579
|
+
|
|
580
|
+
def codebase_search_tool(
    workdir: Path | str,
    embedder_name: str,
    *,
    default_limit: int = _DEFAULT_LIMIT,
    workspace: Any | None = None,
) -> Tool:
    """Build the ``codebase_search`` tool for ``workdir``.

    The model sees::

        codebase_search(query, limit=8)

    ``query`` is a natural-language description of behaviour ("where do
    we validate JWTs", "the retry/backoff logic"). The tool returns the
    most semantically similar code symbols with file:line citations to
    ``read`` next.

    Args:
        workdir: Project root. The index is kept at
            ``<workdir>/.loom/code_index.db``.
        embedder_name: ``"openai"`` / ``"hash"``. Should match the name
            loom-code resolved for memory, so the index embeds in the
            same space the notes do.
        default_limit: Result count used when the model passes no usable
            ``limit``.
        workspace: Optional notebook. When given, every search also calls
            its ``search_notes`` and reciprocal-rank-fuses the learned
            notes into the code results. ``None`` means code-only
            results.

    Returns:
        The wrapped tool. Its function never raises: any failure is
        reported as a one-line message telling the model to fall back to
        grep.
    """
    root = Path(workdir).resolve()
    db_path = root / ".loom" / "code_index.db"

    # Lazily constructed on first call so building the agent stays cheap
    # (no disk/embedder touch until the tool actually runs) and so an
    # embedder import error surfaces as a tool message, not a build
    # crash.
    state: dict[str, Any] = {"store": None, "embedder": None}

    async def codebase_search(query: str, limit: int = default_limit) -> str:
        """Semantic code search — find symbols by meaning, not string,
        blended with what we've learned about this code. Args: query
        (natural language), limit (max results, default 8). Returns a
        ranked list of code symbols (file:line to read next) and any
        relevant learned notes. Prefer grep for literal strings; use
        this for conceptual lookups."""
        try:
            limit = int(limit)
        except (TypeError, ValueError, OverflowError):
            # OverflowError: int(float("inf")) — this conversion sits
            # outside the catch-all below, so it must not escape here.
            limit = default_limit
        limit = max(1, min(limit, 50))

        try:
            db_path.parent.mkdir(exist_ok=True)
            if state["store"] is None:
                state["embedder"] = resolve_embedder(embedder_name)
                state["store"] = CodeIndexStore(db_path, embedder_name)
            store: CodeIndexStore = state["store"]
            embedder = state["embedder"]

            # Incremental refresh — cheap when nothing changed (all
            # files skip on sha match). First call on a fresh repo does
            # the full embed.
            await build_index(root, store, embedder)

            code_hits: list[CodeHit] = []
            if not store.is_empty():
                qvecs = await embedder.embed_batch([query])
                # Over-fetch so the fusion has depth to rank against the
                # notes; the blend trims to ``limit``.
                code_hits = store.search(qvecs[0], limit * 2)

            # Pull learned notes from the shared notebook and fuse. A
            # notebook failure (or no workspace) degrades to code-only —
            # never an error.
            note_matches: list[Any] = []
            if workspace is not None:
                try:
                    note_matches = await workspace.search_notes(
                        query,
                        user_id=_current_user_id(),
                        mode="hybrid",
                        boost_relevance=True,
                        limit=limit,
                    )
                except Exception:
                    note_matches = []

            if not code_hits and not note_matches:
                if store.is_empty():
                    return (
                        "codebase_search: index is empty (no Python "
                        "symbols found under this project) and no learned "
                        "notes match. Use grep for non-Python files."
                    )
                return "no semantic matches"

            if note_matches:
                return _render_blend(_fuse(code_hits, note_matches), limit)
            return _render_hits(code_hits[:limit])
        except Exception as exc:  # never abort a turn on a search failure
            return (
                f"codebase_search unavailable ({type(exc).__name__}: {exc}). "
                "Fall back to grep."
            )

    return tool(
        name="codebase_search",
        description=(
            "Semantic code search: find code by MEANING, not literal "
            "string — blended with what we've learned about this code "
            "across past runs. Args: query (natural-language description "
            "of the behaviour, e.g. 'where JWTs are validated'), limit=8. "
            "Returns ranked code symbols (file:line to read next) plus "
            "any relevant learned notes. Use this when grep would miss "
            "the code because the words don't match the concept; use "
            "grep when you know the exact string."
        ),
    )(codebase_search)
|