loom-code 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. loom_code/__init__.py +22 -0
  2. loom_code/_post_commit.py +119 -0
  3. loom_code/agent.py +544 -0
  4. loom_code/approval.py +616 -0
  5. loom_code/browse/__init__.py +291 -0
  6. loom_code/browse/act.py +467 -0
  7. loom_code/browse/observe.py +249 -0
  8. loom_code/browse/session.py +96 -0
  9. loom_code/browse/verify.py +194 -0
  10. loom_code/checkpoint.py +283 -0
  11. loom_code/cli.py +495 -0
  12. loom_code/code_index.py +703 -0
  13. loom_code/compact.py +143 -0
  14. loom_code/consent.py +47 -0
  15. loom_code/credentials.py +527 -0
  16. loom_code/edit_tool.py +635 -0
  17. loom_code/extensions.py +522 -0
  18. loom_code/file_history.py +322 -0
  19. loom_code/file_tools.py +93 -0
  20. loom_code/git_hook.py +200 -0
  21. loom_code/grep_tool.py +430 -0
  22. loom_code/hooks.py +297 -0
  23. loom_code/loominit/__init__.py +23 -0
  24. loom_code/loominit/_ast_walk.py +429 -0
  25. loom_code/loominit/_files.py +284 -0
  26. loom_code/loominit/_graph.py +141 -0
  27. loom_code/loominit/_resolve.py +392 -0
  28. loom_code/loominit/_tests_map.py +108 -0
  29. loom_code/loominit/extractor.py +332 -0
  30. loom_code/loominit/repomap.py +225 -0
  31. loom_code/loominit/schema.py +242 -0
  32. loom_code/lsp_tools.py +396 -0
  33. loom_code/mcp_host.py +79 -0
  34. loom_code/operator.py +449 -0
  35. loom_code/paste.py +97 -0
  36. loom_code/paths.py +52 -0
  37. loom_code/permissions.py +177 -0
  38. loom_code/project.py +104 -0
  39. loom_code/prompts.py +451 -0
  40. loom_code/render.py +783 -0
  41. loom_code/repl.py +4080 -0
  42. loom_code/rules.py +267 -0
  43. loom_code/sandboxed_bash.py +176 -0
  44. loom_code/scribe.py +88 -0
  45. loom_code/skills/__init__.py +16 -0
  46. loom_code/skills/graphify/SKILL.md +97 -0
  47. loom_code/skills/graphify/tools.py +570 -0
  48. loom_code/trust.py +216 -0
  49. loom_code/turn.py +169 -0
  50. loom_code/web_fetch.py +370 -0
  51. loom_code/workers.py +758 -0
  52. loom_code/worktree.py +134 -0
  53. loom_code-0.1.1.dist-info/METADATA +224 -0
  54. loom_code-0.1.1.dist-info/RECORD +58 -0
  55. loom_code-0.1.1.dist-info/WHEEL +5 -0
  56. loom_code-0.1.1.dist-info/entry_points.txt +2 -0
  57. loom_code-0.1.1.dist-info/licenses/LICENSE +21 -0
  58. loom_code-0.1.1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,703 @@
1
+ """Semantic codebase index — embed source symbols, search by meaning.
2
+
3
+ The differentiator loom-code ships over grep: ``grep`` finds the *string*
4
+ ``authenticate``; ``codebase_search`` finds the code that *handles auth*
5
+ even when the word never appears. It mirrors Cursor's ``@Codebase`` —
6
+ but local, in the same ``.loom`` partition as memory, and (Phase 1b)
7
+ fusible with what the agent has *learned* about that code across runs.
8
+
9
+ How it works, end to end:
10
+
11
+ * **Chunk** — reuse the structural AST walk (:func:`walk_python_file`)
12
+ to split each ``.py`` file into class/function/method chunks. Bare
13
+ module constants are skipped (same call repomap's ``_score`` makes:
14
+ they're noise in a semantic overview). Each chunk's embeddable text
15
+ is ``path + qualified_name + signature + docstring + body`` — the
16
+ body is sliced ``line:end_line`` from the file we already read.
17
+ * **Embed** — via the SAME embedder loom-code picks for memory
18
+ (:class:`OpenAIEmbedder` for OpenAI chat models, :class:`HashEmbedder`
19
+ otherwise — zero-key, offline, lower quality but never a cross-
20
+ provider call). The caller passes the resolved name so the index and
21
+ memory always embed in the same space (Phase 1b fuses them). Both
22
+ embedders expose ``async embed_batch(texts) -> list[list[float]]``.
23
+ * **Store** — a SEPARATE sqlite db ``<root>/.loom/code_index.db``.
24
+ NOT ``memory.db``: loomflow's memory schema is locked to Episodes /
25
+ Facts, so a fourth data model (code chunks) gets its own file. Per-
26
+ file ``sha256`` gates re-embedding — only changed files re-embed,
27
+ which matters because OpenAI embedding costs real money per token.
28
+ * **Search** — cosine over the stored vectors, grouped + file:line
29
+ cited like ``grep`` so the agent can ``read`` the exact range next.
30
+
31
+ Python-only today (the AST walk is stdlib ``ast``). The walk already
32
+ routes by language, so a future tree-sitter backend drops in here
33
+ without touching the tool or the store — the seam is the chunker, not
34
+ the index.
35
+
36
+ Failure is always graceful: a broken build, a missing embedder key, an
37
+ empty index — the tool returns a one-line explanation, never raises.
38
+ A semantic-search outage must not abort a turn.
39
+ """
40
+
41
+ from __future__ import annotations
42
+
43
+ import hashlib
44
+ import math
45
+ import sqlite3
46
+ import struct
47
+ from collections.abc import Sequence
48
+ from dataclasses import dataclass
49
+ from pathlib import Path
50
+ from typing import Any
51
+
52
+ from loomflow import tool
53
+ from loomflow.tools.registry import Tool
54
+
55
+ from .loominit._ast_walk import walk_python_file
56
+
57
+ # Directories we never index — vendored / generated / VCS noise. Same
58
+ # spirit as repomap's skip set; kept local so the two can diverge (the
59
+ # code index may later want to include tests, which the overview map
60
+ # collapses).
61
+ _SKIP_DIRS: frozenset[str] = frozenset(
62
+ {
63
+ ".git",
64
+ ".loom",
65
+ ".venv",
66
+ "venv",
67
+ "node_modules",
68
+ "__pycache__",
69
+ ".mypy_cache",
70
+ ".ruff_cache",
71
+ ".pytest_cache",
72
+ "dist",
73
+ "build",
74
+ ".tox",
75
+ "site-packages",
76
+ }
77
+ )
78
+
79
# Maximum number of source lines of a symbol's body that are included in
# its embeddable text. A very long function would otherwise exceed the
# embedder's input budget and dilute its own signal (the opening lines
# carry the intent; the tail is detail). Only the embedded text is
# truncated: the stored start/end lines still cover the whole symbol, so
# the file:line citation lets the agent read the full range.
_MAX_CHUNK_LINES = 120

# Default number of results returned by a search — enough to surface the
# relevant cluster of symbols without flooding the model's context.
_DEFAULT_LIMIT = 8
89
+
90
+
91
+ @dataclass(frozen=True)
92
+ class _Chunk:
93
+ """One indexable code unit (a class / function / method)."""
94
+
95
+ path: str # repo-relative POSIX
96
+ qualified_name: str
97
+ kind: str
98
+ start_line: int
99
+ end_line: int
100
+ signature: str
101
+ text: str # the embeddable doc (header + body)
102
+
103
+
104
@dataclass(frozen=True)
class CodeHit:
    """One ranked result of a semantic code search.

    Carries everything needed to cite the symbol as ``path:start-end``
    and to show its signature, plus the similarity score it was ranked by.
    Instances are immutable.
    """

    # Repo-relative path of the file that defines the symbol.
    path: str
    # Dotted name of the symbol within its module.
    qualified_name: str
    # Symbol category as stored in the index.
    kind: str
    # First line of the symbol (1-based).
    start_line: int
    # Last line of the symbol (1-based, inclusive).
    end_line: int
    # Declaration line(s) of the symbol.
    signature: str
    # Cosine similarity between the query vector and the symbol's vector.
    score: float
115
+
116
+
117
+ # ---------------------------------------------------------------------------
118
+ # Embedder resolution (shared with memory — see agent._is_openai_model)
119
+ # ---------------------------------------------------------------------------
120
+
121
+
122
def resolve_embedder(name: str) -> Any:
    """Instantiate the embedding backend selected by ``name``.

    Only the exact string ``"openai"`` selects ``OpenAIEmbedder``; every
    other value — including unrecognised names — yields ``HashEmbedder``,
    so an unexpected name degrades to the hash backend instead of raising.
    Callers in this module await ``embed_batch(texts)`` on the returned
    object.

    The ``loomflow.memory`` import is deferred to call time, so merely
    importing this module does not require that package to be importable.
    """
    from loomflow.memory import HashEmbedder, OpenAIEmbedder

    # NOTE(review): the original comment states the OpenAI backend uses
    # text-embedding-3-small and reads OPENAI_API_KEY — confirm against
    # loomflow.memory.
    backend_cls = OpenAIEmbedder if name == "openai" else HashEmbedder
    return backend_cls()
138
+
139
+
140
+ # ---------------------------------------------------------------------------
141
+ # Chunking — AST walk -> embeddable units
142
+ # ---------------------------------------------------------------------------
143
+
144
+
145
def _iter_py_files(root: Path) -> list[Path]:
    """Return every ``*.py`` path under ``root`` that is not in a skipped dir.

    A path is excluded when any component *below* ``root`` is listed in
    ``_SKIP_DIRS``. Components of ``root`` itself are deliberately not
    inspected: a project checked out at e.g. ``/home/me/build/app`` must
    not have all of its files dropped just because an ancestor directory
    happens to be called ``build``.

    Args:
        root: Directory to scan recursively.

    Returns:
        The matching paths, each prefixed by ``root`` (as yielded by
        ``Path.rglob``), in the order ``rglob`` produced them.
    """
    return [
        candidate
        for candidate in root.rglob("*.py")
        # rglob results always start with ``root``, so relative_to cannot
        # fail here; only the in-project components are tested.
        if _SKIP_DIRS.isdisjoint(candidate.relative_to(root).parts)
    ]
152
+
153
+
154
+ def _file_sha256(text: str) -> str:
155
+ return hashlib.sha256(text.encode("utf-8", errors="replace")).hexdigest()
156
+
157
+
158
def _chunks_for_file(rel_path: str, source: str) -> list[_Chunk]:
    """Turn one file's source text into a list of embeddable chunks.

    The symbols come from ``walk_python_file``; symbols whose ``kind`` is
    ``"constant"`` are dropped. For every remaining symbol the embeddable
    text is, joined by newlines: ``"<rel_path> :: <qualified_name>"``, the
    signature, the first docstring line (when there is one) and at most
    ``_MAX_CHUNK_LINES`` lines of the body taken from ``source``.

    The chunk's ``start_line`` / ``end_line`` are the symbol's own values
    and are never truncated — only the embedded body is capped.

    NOTE(review): the original docstring says a syntax error makes the walk
    return empty lists rather than raise; that is a property of
    ``walk_python_file`` and is not visible here.
    """
    symbols, _imports, _decorators = walk_python_file(source, rel_path)
    source_lines = source.splitlines()
    result: list[_Chunk] = []
    for symbol in symbols:
        if symbol.kind == "constant":
            continue

        # Line numbers are 1-based and inclusive. Clamp the first line to
        # at least 1 and the last line to the per-chunk cap.
        first = max(symbol.line, 1)
        last = min(symbol.end_line, first + _MAX_CHUNK_LINES - 1)

        # Header lines first (location/identity, signature, optional
        # docstring summary), then the body slice.
        pieces = [f"{rel_path} :: {symbol.qualified_name}", symbol.signature]
        if symbol.docstring_first_line:
            pieces.append(symbol.docstring_first_line)
        pieces.append("\n".join(source_lines[first - 1 : last]))

        result.append(
            _Chunk(
                path=rel_path,
                qualified_name=symbol.qualified_name,
                kind=symbol.kind,
                start_line=symbol.line,
                end_line=symbol.end_line,
                signature=symbol.signature,
                text="\n".join(pieces),
            )
        )
    return result
199
+
200
+
201
+ # ---------------------------------------------------------------------------
202
+ # Store — sqlite, separate from memory.db
203
+ # ---------------------------------------------------------------------------
204
+
205
+ # Vectors persist as packed little-endian float32 blobs — compact and
206
+ # numpy-free (loom-code has no numpy dep; cosine is a plain loop). The
207
+ # dimension is implied by the blob length, so the store is agnostic to
208
+ # whether hash (384) or openai (1536) wrote it; switching embedders
209
+ # invalidates every file via the staleness gate, forcing a clean
210
+ # re-embed in the new dimension.
211
+
212
+
213
+ def _pack(vec: Sequence[float]) -> bytes:
214
+ return struct.pack(f"<{len(vec)}f", *vec)
215
+
216
+
217
+ def _unpack(blob: bytes) -> list[float]:
218
+ return list(struct.unpack(f"<{len(blob) // 4}f", blob))
219
+
220
+
221
+ class CodeIndexStore:
222
+ """The sqlite-backed code-chunk index for one project.
223
+
224
+ Async note: sqlite calls here are synchronous and fast (local
225
+ file). They run inside the tool's async function but are not
226
+ offloaded to a thread — acceptable because a query is a single
227
+ indexed read + an in-process cosine loop, well under the latency
228
+ that would justify a thread pool. Embedding (the slow part) IS
229
+ async and awaited.
230
+ """
231
+
232
+ def __init__(self, db_path: Path, embedder_name: str) -> None:
233
+ self._db_path = db_path
234
+ self._embedder_name = embedder_name
235
+ self._conn = sqlite3.connect(str(db_path))
236
+ self._conn.execute(
237
+ """
238
+ CREATE TABLE IF NOT EXISTS chunks (
239
+ id TEXT PRIMARY KEY, -- path::qualname
240
+ path TEXT NOT NULL,
241
+ qualname TEXT NOT NULL,
242
+ kind TEXT NOT NULL,
243
+ start_line INTEGER NOT NULL,
244
+ end_line INTEGER NOT NULL,
245
+ signature TEXT NOT NULL,
246
+ embedding BLOB NOT NULL
247
+ )
248
+ """
249
+ )
250
+ # Per-file content hash — re-embed only what changed. Stores the
251
+ # embedder name too, so switching providers (hash -> openai)
252
+ # invalidates every file (different vector space).
253
+ self._conn.execute(
254
+ """
255
+ CREATE TABLE IF NOT EXISTS files (
256
+ path TEXT PRIMARY KEY,
257
+ sha256 TEXT NOT NULL,
258
+ embedder TEXT NOT NULL
259
+ )
260
+ """
261
+ )
262
+ self._conn.execute(
263
+ "CREATE INDEX IF NOT EXISTS idx_chunks_path ON chunks(path)"
264
+ )
265
+ self._conn.commit()
266
+
267
+ def close(self) -> None:
268
+ self._conn.close()
269
+
270
+ def file_is_fresh(self, rel_path: str, sha: str) -> bool:
271
+ """True when ``rel_path`` is already indexed at ``sha`` with the
272
+ current embedder — i.e. nothing to re-embed."""
273
+ row = self._conn.execute(
274
+ "SELECT sha256, embedder FROM files WHERE path = ?", (rel_path,)
275
+ ).fetchone()
276
+ return (
277
+ row is not None
278
+ and row[0] == sha
279
+ and row[1] == self._embedder_name
280
+ )
281
+
282
+ def replace_file_chunks(
283
+ self,
284
+ rel_path: str,
285
+ sha: str,
286
+ chunks: list[_Chunk],
287
+ vectors: list[Sequence[float]],
288
+ ) -> None:
289
+ """Atomically swap one file's chunks (delete-then-insert in a
290
+ single transaction) so a crash mid-reindex never leaves a file
291
+ half-indexed."""
292
+ cur = self._conn
293
+ cur.execute("DELETE FROM chunks WHERE path = ?", (rel_path,))
294
+ for chunk, vec in zip(chunks, vectors, strict=True):
295
+ cur.execute(
296
+ "INSERT OR REPLACE INTO chunks "
297
+ "(id, path, qualname, kind, start_line, end_line, "
298
+ "signature, embedding) "
299
+ "VALUES (?, ?, ?, ?, ?, ?, ?, ?)",
300
+ (
301
+ f"{chunk.path}::{chunk.qualified_name}",
302
+ chunk.path,
303
+ chunk.qualified_name,
304
+ chunk.kind,
305
+ chunk.start_line,
306
+ chunk.end_line,
307
+ chunk.signature,
308
+ _pack(vec),
309
+ ),
310
+ )
311
+ cur.execute(
312
+ "INSERT OR REPLACE INTO files (path, sha256, embedder) "
313
+ "VALUES (?, ?, ?)",
314
+ (rel_path, sha, self._embedder_name),
315
+ )
316
+ cur.commit()
317
+
318
+ def prune_missing(self, live_paths: set[str]) -> None:
319
+ """Drop chunks/files for source files that no longer exist (the
320
+ delete half of incremental indexing)."""
321
+ rows = self._conn.execute("SELECT path FROM files").fetchall()
322
+ stale = [r[0] for r in rows if r[0] not in live_paths]
323
+ for path in stale:
324
+ self._conn.execute("DELETE FROM chunks WHERE path = ?", (path,))
325
+ self._conn.execute("DELETE FROM files WHERE path = ?", (path,))
326
+ if stale:
327
+ self._conn.commit()
328
+
329
+ def search(self, query_vec: Sequence[float], limit: int) -> list[CodeHit]:
330
+ """Cosine-rank every stored chunk against ``query_vec``.
331
+
332
+ A linear scan: fine for the tens-of-thousands of symbols a
333
+ normal repo has (each cosine is a 384–1536-float dot product).
334
+ If a monorepo ever makes this slow, the swap-in is a vector
335
+ index (sqlite-vec / faiss) behind this same method — callers
336
+ don't change.
337
+ """
338
+ rows = self._conn.execute(
339
+ "SELECT path, qualname, kind, start_line, end_line, signature, "
340
+ "embedding FROM chunks"
341
+ ).fetchall()
342
+ qn = _norm(query_vec)
343
+ if qn == 0.0:
344
+ return []
345
+ scored: list[CodeHit] = []
346
+ for path, qual, kind, start, end, sig, blob in rows:
347
+ vec = _unpack(blob)
348
+ score = _cosine(query_vec, vec, qn)
349
+ scored.append(
350
+ CodeHit(
351
+ path=path,
352
+ qualified_name=qual,
353
+ kind=kind,
354
+ start_line=start,
355
+ end_line=end,
356
+ signature=sig,
357
+ score=score,
358
+ )
359
+ )
360
+ scored.sort(key=lambda h: h.score, reverse=True)
361
+ return scored[:limit]
362
+
363
+ def is_empty(self) -> bool:
364
+ row = self._conn.execute("SELECT 1 FROM chunks LIMIT 1").fetchone()
365
+ return row is None
366
+
367
+
368
+ def _norm(vec: Sequence[float]) -> float:
369
+ return math.sqrt(sum(x * x for x in vec))
370
+
371
+
372
+ def _cosine(a: Sequence[float], b: Sequence[float], a_norm: float) -> float:
373
+ """Cosine similarity; ``a_norm`` is precomputed (the query norm is
374
+ constant across all chunks, so we hoist it out of the scan loop).
375
+ Mismatched dims (shouldn't happen — the staleness gate forces a
376
+ uniform embedder) score 0 rather than raising."""
377
+ if len(a) != len(b):
378
+ return 0.0
379
+ bn = _norm(b)
380
+ if bn == 0.0:
381
+ return 0.0
382
+ # strict=False: a dim mismatch is already guarded above (returns
383
+ # 0.0), so don't raise here — just stop at the shorter vector.
384
+ dot = sum(x * y for x, y in zip(a, b, strict=False))
385
+ return dot / (a_norm * bn)
386
+
387
+
388
+ # ---------------------------------------------------------------------------
389
+ # Build — incremental index over the tree
390
+ # ---------------------------------------------------------------------------
391
+
392
+
393
async def build_index(
    root: Path, store: CodeIndexStore, embedder: Any
) -> tuple[int, int]:
    """Bring ``store`` up to date with the Python files under ``root``.

    For each file found by ``_iter_py_files``:

    * unreadable (``OSError``) — left untouched, but still counted as
      live so its existing rows are not pruned;
    * unchanged (``store.file_is_fresh``) — skipped;
    * no chunks — its hash is recorded with zero chunks, so the file is
      not re-walked on the next build; it counts as neither embedded nor
      skipped;
    * otherwise — all of its chunks are embedded with one
      ``embed_batch`` call and swapped into the store.

    Finally, rows for files that no longer exist are pruned.

    Returns:
        ``(files_embedded, files_skipped)``.
    """
    seen: set[str] = set()
    n_embedded = 0
    n_skipped = 0
    for source_path in _iter_py_files(root):
        rel_path = source_path.relative_to(root).as_posix()
        seen.add(rel_path)
        try:
            text = source_path.read_text(encoding="utf-8", errors="replace")
        except OSError:
            continue

        digest = _file_sha256(text)
        if store.file_is_fresh(rel_path, digest):
            n_skipped += 1
            continue

        file_chunks = _chunks_for_file(rel_path, text)
        if file_chunks:
            # One batch per file amortises the embedder round-trip.
            vectors = await embedder.embed_batch(
                [chunk.text for chunk in file_chunks]
            )
            store.replace_file_chunks(rel_path, digest, file_chunks, vectors)
            n_embedded += 1
        else:
            # Nothing to embed: remember the hash only.
            store.replace_file_chunks(rel_path, digest, [], [])

    store.prune_missing(seen)
    return n_embedded, n_skipped
430
+
431
+
432
async def search_code(
    root: Path | str, embedder_name: str, query: str, *, limit: int = 8
) -> list[CodeHit]:
    """Refresh the index for ``root`` and return ranked hits for ``query``.

    This is the structured counterpart of the ``codebase_search`` tool: it
    returns :class:`CodeHit` objects instead of rendered text. The index
    at ``<root>/.loom/code_index.db`` is built or incrementally refreshed
    on every call, and the store is opened and closed within the call.

    Args:
        root: Project directory; resolved to an absolute path.
        embedder_name: Backend name passed to ``resolve_embedder`` and
            recorded in the store.
        query: Text to embed and search for.
        limit: Maximum number of hits.

    Returns:
        Up to ``limit`` hits, best first; ``[]`` when the index holds no
        chunks after the refresh. Errors raised while building the index
        or embedding are not caught here and propagate to the caller.
    """
    # The synchronous path resolve / mkdir inside this coroutine is
    # deliberate: both are cheap local operations, and the rest of the
    # module uses sqlite/pathlib synchronously as well.
    project_root = Path(root).resolve()  # noqa: ASYNC240
    index_path = project_root / ".loom" / "code_index.db"
    index_path.parent.mkdir(exist_ok=True)

    embedder = resolve_embedder(embedder_name)
    store = CodeIndexStore(index_path, embedder_name)
    try:
        await build_index(project_root, store, embedder)
        if not store.is_empty():
            query_vectors = await embedder.embed_batch([query])
            return store.search(query_vectors[0], limit)
        return []
    finally:
        store.close()
462
+
463
+
464
+ # ---------------------------------------------------------------------------
465
+ # Tool — codebase_search
466
+ # ---------------------------------------------------------------------------
467
+
468
+
469
+ def _render_hits(hits: list[CodeHit]) -> str:
470
+ """Cite file:line so the agent can ``read`` the exact range next —
471
+ same citation shape as grep."""
472
+ if not hits:
473
+ return "no semantic matches"
474
+ out: list[str] = []
475
+ for h in hits:
476
+ loc = f"{h.path}:{h.start_line}-{h.end_line}"
477
+ out.append(f" [{h.score:.2f}] {h.kind} {h.qualified_name} ({loc})")
478
+ out.append(f" {h.signature.strip()}")
479
+ return "\n".join(out)
480
+
481
+
482
+ # ---------------------------------------------------------------------------
483
+ # Phase 1b — blend code hits with learned notes (the differentiator)
484
+ # ---------------------------------------------------------------------------
485
+
486
# Reciprocal Rank Fusion (RRF) damping constant. An item at 0-based rank r
# in a result list contributes 1 / (k + r + 1). k = 60 is the value
# commonly used for RRF (Cormack et al.); a larger k flattens the
# advantage of the very top ranks. Fusion is done on RANK rather than on
# raw score because code hits and note matches are scored in different
# units (cosine similarity versus the notebook's own ranking score), so
# only their ranks are comparable.
_RRF_K = 60
495
+
496
+
497
+ @dataclass(frozen=True)
498
+ class _BlendRow:
499
+ """A unified result — either a code symbol or a learned note."""
500
+
501
+ kind: str # "code" or "note"
502
+ label: str # rendered one-liner
503
+ rrf: float
504
+
505
+
506
def _fuse(
    code_hits: list[CodeHit], note_matches: list[Any]
) -> list[_BlendRow]:
    """Merge code hits and note matches into one list ordered by RRF score.

    Every item is scored ``1 / (_RRF_K + rank + 1)`` with ``rank`` being
    its 0-based position in its own input list. No item is looked up in
    the other list, so the result is an interleave by within-source rank.
    Code rows are added before note rows and the sort is stable, so at
    equal rank the code row comes first.

    Note matches are read defensively with ``getattr``: ``summary.title``
    (falling back to ``"learned note"``), ``summary.slug`` and ``snippet``.
    The snippet is flattened to one line and cut to 160 characters
    (157 + ``"..."``).

    Returns:
        All rows from both inputs, highest score first; trimming to a
        result limit is left to the caller.
    """
    fused: list[_BlendRow] = [
        _BlendRow(
            kind="code",
            label=(
                f" code {hit.kind} {hit.qualified_name} "
                f"({hit.path}:{hit.start_line}-{hit.end_line})\n"
                f" {hit.signature.strip()}"
            ),
            rrf=1.0 / (_RRF_K + position + 1),
        )
        for position, hit in enumerate(code_hits)
    ]

    for position, match in enumerate(note_matches):
        # ``summary`` is expected to be a metadata object (not a string);
        # ``snippet`` is the excerpt of the note that matched.
        meta = getattr(match, "summary", None)
        title = (getattr(meta, "title", None) or "learned note").strip()
        slug = getattr(meta, "slug", "") or ""
        excerpt = (getattr(match, "snippet", "") or "").strip()
        excerpt = excerpt.replace("\n", " ")
        if len(excerpt) > 160:
            excerpt = excerpt[:157] + "..."

        # First line: title plus, when known, the slug to open the note.
        label_lines = [f" learned {title}" + (f" (note:{slug})" if slug else "")]
        if excerpt:
            label_lines.append(f" {excerpt}")

        fused.append(
            _BlendRow(
                kind="note",
                label="\n".join(label_lines),
                rrf=1.0 / (_RRF_K + position + 1),
            )
        )

    return sorted(fused, key=lambda row: row.rrf, reverse=True)
559
+
560
+
561
+ def _render_blend(rows: list[_BlendRow], limit: int) -> str:
562
+ if not rows:
563
+ return "no semantic matches"
564
+ return "\n".join(r.label for r in rows[:limit])
565
+
566
+
567
+ def _current_user_id() -> str | None:
568
+ """The live tenant from the run context (set by the agent loop), or
569
+ None outside a run. Keeps note recall partitioned per user in the
570
+ multi-tenant desktop; harmless (None) in the single-tenant CLI."""
571
+ try:
572
+ from loomflow.core.context import get_run_context
573
+
574
+ ctx = get_run_context()
575
+ return getattr(ctx, "user_id", None) if ctx is not None else None
576
+ except Exception:
577
+ return None
578
+
579
+
580
def codebase_search_tool(
    workdir: Path | str,
    embedder_name: str,
    *,
    default_limit: int = _DEFAULT_LIMIT,
    workspace: Any | None = None,
) -> Tool:
    """Build the ``codebase_search`` tool for ``workdir``.

    The model sees::

        codebase_search(query, limit=8)

    ``query`` is a natural-language description of behaviour ("where do
    we validate JWTs", "the retry/backoff logic"). The tool returns the
    most semantically similar code symbols with file:line citations to
    ``read`` next.

    Args:
        workdir: Project directory; resolved once when the tool is built.
            The index lives at ``<workdir>/.loom/code_index.db``.
        embedder_name: Backend name handed to ``resolve_embedder`` and
            recorded in the store (``"openai"`` / ``"hash"``). It should
            match the name used for memory so both embed in one space.
        default_limit: Result count used when the model omits ``limit`` or
            passes something that cannot be converted to ``int``.
        workspace: Optional object exposing an awaitable
            ``search_notes(query, user_id=..., mode=..., boost_relevance=...,
            limit=...)``. When given, its matches are rank-fused into the
            code results; ``None`` yields code-only results.

    Returns:
        The decorated tool. Calling it never raises: every failure is
        turned into a one-line "unavailable" message.

    The store and embedder are created on the first call and then reused
    for every later call of this tool instance; nothing in this function
    closes the store.
    """
    root = Path(workdir).resolve()
    db_path = root / ".loom" / "code_index.db"

    # Lazily constructed on first call so building the agent stays cheap
    # (no disk/embedder touch until the tool actually runs) and so an
    # embedder import error surfaces as a tool message, not a build
    # crash.
    state: dict[str, Any] = {"store": None, "embedder": None}

    async def codebase_search(query: str, limit: int = default_limit) -> str:
        """Semantic code search — find symbols by meaning, not string,
        blended with what we've learned about this code. Args: query
        (natural language), limit (max results, default 8). Returns a
        ranked list of code symbols (file:line to read next) and any
        relevant learned notes. Prefer grep for literal strings; use
        this for conceptual lookups."""
        # NOTE(review): the docstring above says "default 8", but the real
        # default is ``default_limit`` — confirm whether the tool
        # framework surfaces this text to the model.
        # Coerce whatever the model sent into an int, then clamp to 1..50.
        try:
            limit = int(limit)
        except (TypeError, ValueError):
            limit = default_limit
        limit = max(1, min(limit, 50))

        try:
            # Creates only the ``.loom`` directory itself (no parents).
            db_path.parent.mkdir(exist_ok=True)
            if state["store"] is None:
                # Embedder first: if it fails, no store has been opened
                # and the next call retries both.
                state["embedder"] = resolve_embedder(embedder_name)
                state["store"] = CodeIndexStore(db_path, embedder_name)
            store: CodeIndexStore = state["store"]
            embedder = state["embedder"]

            # Incremental refresh — cheap when nothing changed (all
            # files skip on sha match). First call on a fresh repo does
            # the full embed.
            await build_index(root, store, embedder)

            code_hits: list[CodeHit] = []
            if not store.is_empty():
                qvecs = await embedder.embed_batch([query])
                # Over-fetch so the fusion has depth to rank against the
                # notes; the blend trims to ``limit``.
                code_hits = store.search(qvecs[0], limit * 2)

            # Pull learned notes from the shared notebook and fuse. A
            # notebook failure (or no workspace) degrades to code-only —
            # never an error.
            note_matches: list[Any] = []
            if workspace is not None:
                try:
                    note_matches = await workspace.search_notes(
                        query,
                        user_id=_current_user_id(),
                        mode="hybrid",
                        boost_relevance=True,
                        limit=limit,
                    )
                except Exception:
                    note_matches = []

            # Nothing from either source: say whether that is because the
            # index has no chunks at all or because nothing matched.
            if not code_hits and not note_matches:
                if store.is_empty():
                    return (
                        "codebase_search: index is empty (no Python "
                        "symbols found under this project) and no learned "
                        "notes match. Use grep for non-Python files."
                    )
                return "no semantic matches"

            # With notes present the blended rendering is used (no scores
            # shown); otherwise the code-only rendering, trimmed back from
            # the over-fetched list to ``limit``.
            if note_matches:
                return _render_blend(_fuse(code_hits, note_matches), limit)
            return _render_hits(code_hits[:limit])
        except Exception as exc:  # never abort a turn on a search failure
            return (
                f"codebase_search unavailable ({type(exc).__name__}: {exc}). "
                "Fall back to grep."
            )

    return tool(
        name="codebase_search",
        description=(
            "Semantic code search: find code by MEANING, not literal "
            "string — blended with what we've learned about this code "
            "across past runs. Args: query (natural-language description "
            "of the behaviour, e.g. 'where JWTs are validated'), limit=8. "
            "Returns ranked code symbols (file:line to read next) plus "
            "any relevant learned notes. Use this when grep would miss "
            "the code because the words don't match the concept; use "
            "grep when you know the exact string."
        ),
    )(codebase_search)