npm - xindex - Versions diffs - 1.0.0 - Mend

xindex 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (160) hide show

package/.ai/research/2026-04-10-file-watching.md +79 -0
package/.ai/research/2026-04-10-mcp-output-format.md +129 -0
package/.ai/task/INDEX.md +12 -0
package/.ai/task/done/INDEX.md +3 -0
package/.ai/task/done/task.2026-04-09-local-ai-research-protos.log.md +98 -0
package/.ai/task/done/task.2026-04-09-local-ai-research-protos.md +102 -0
package/.ai/task/task.2026-04-10-cluster-config.log.md +19 -0
package/.ai/task/task.2026-04-10-cluster-config.md +118 -0
package/.ai/task/task.2026-04-10-dir-indexing.log.md +8 -0
package/.ai/task/task.2026-04-10-dir-indexing.md +92 -0
package/.ai/task/task.2026-04-10-line-clustering.log.md +50 -0
package/.ai/task/task.2026-04-10-line-clustering.md +176 -0
package/.ai/task/task.2026-04-10-object-store.log.md +7 -0
package/.ai/task/task.2026-04-10-object-store.md +81 -0
package/.ai/task/task.2026-04-10-search-config.log.md +46 -0
package/.ai/task/task.2026-04-10-search-config.md +274 -0
package/.ai/task/task.2026-04-10-watch-indexing.log.md +32 -0
package/.ai/task/task.2026-04-10-watch-indexing.md +101 -0
package/.ai/task/task.2026-04-10-xindex-mcp.log.md +5 -0
package/.ai/task/task.2026-04-10-xindex-mcp.md +92 -0
package/.ai/task/task.2026-04-10-xindex-mcp.report.md +113 -0
package/.claude/settings.local.json +73 -0
package/.claude/skills/make-hof/SKILL.md +8 -0
package/.claude/skills/make-hof/playbook.md +38 -0
package/.cursor/mcp.json +8 -0
package/.mcp.json +8 -0
package/.xindex.json +22 -0
package/CLAUDE.md +54 -0
package/README.md +206 -0
package/apps/indexApp.ts +31 -0
package/apps/mcpApp.ts +119 -0
package/apps/run.index.ts +19 -0
package/apps/run.mcp.ts +49 -0
package/apps/run.reset.ts +10 -0
package/apps/run.search.ts +21 -0
package/apps/run.watch.ts +44 -0
package/apps/searchApp.ts +9 -0
package/apps/watchApp.ts +53 -0
package/apps/watchFileEventsApp.ts +39 -0
package/bin/xindex-index +2 -0
package/bin/xindex-mcp +2 -0
package/bin/xindex-reset +2 -0
package/bin/xindex-search +2 -0
package/bin/xindex-watch +2 -0
package/componets/IType.ts +1 -0
package/componets/appId.ts +3 -0
package/componets/buildComponents.ts +27 -0
package/componets/config/loadConfig.ts +43 -0
package/componets/config/xindexConfig.ts +4 -0
package/componets/index/contentIndexDriver.ts +39 -0
package/componets/index/formatSearchResults.ts +18 -0
package/componets/index/getIndexStats.ts +11 -0
package/componets/index/handleFileEvent.ts +25 -0
package/componets/index/indexApi.ts +45 -0
package/componets/index/vectraIndex.ts +11 -0
package/componets/index/watcherLock.ts +107 -0
package/componets/keywords/cleanUpKeywords.ts +38 -0
package/componets/keywords/extractKeywords.ts +14 -0
package/componets/keywords/refineKeywords.ts +16 -0
package/componets/llm/embed.ts +18 -0
package/componets/llm/queryLLM.ts +20 -0
package/componets/logger.ts +34 -0
package/componets/walkFiles.ts +51 -0
package/componets/watchFiles.ts +106 -0
package/features/indexContent.ts +16 -0
package/features/removeContent.ts +9 -0
package/features/resetIndex.ts +9 -0
package/features/searchIndex.ts +33 -0
package/package.json +32 -0
package/packages/fun/src/IType.ts +5 -0
package/packages/fun/src/array-finder.ts +55 -0
package/packages/fun/src/array-index.ts +35 -0
package/packages/fun/src/array.ts +112 -0
package/packages/fun/src/assert.ts +5 -0
package/packages/fun/src/asyncRequest.ts +35 -0
package/packages/fun/src/callsites.ts +18 -0
package/packages/fun/src/case-never.ts +9 -0
package/packages/fun/src/casting.ts +41 -0
package/packages/fun/src/collect.ts +13 -0
package/packages/fun/src/concurrency.ts +186 -0
package/packages/fun/src/container.ts +86 -0
package/packages/fun/src/counter.ts +45 -0
package/packages/fun/src/create-map.ts +2 -0
package/packages/fun/src/dedupe.ts +2 -0
package/packages/fun/src/defer.ts +55 -0
package/packages/fun/src/delay.ts +5 -0
package/packages/fun/src/discriminate.ts +34 -0
package/packages/fun/src/enum-values.ts +12 -0
package/packages/fun/src/exponential-backoff.ts +20 -0
package/packages/fun/src/flatten.ts +11 -0
package/packages/fun/src/hash.ts +67 -0
package/packages/fun/src/hash128.ts +6 -0
package/packages/fun/src/hash256.ts +6 -0
package/packages/fun/src/hub.ts +53 -0
package/packages/fun/src/id.ts +10 -0
package/packages/fun/src/interval.ts +76 -0
package/packages/fun/src/is-non-nullable.ts +2 -0
package/packages/fun/src/isIterable.ts +3 -0
package/packages/fun/src/mailbox.ts +13 -0
package/packages/fun/src/map-record.ts +19 -0
package/packages/fun/src/match-collections.ts +57 -0
package/packages/fun/src/match-left-and-right-arrays.ts +78 -0
package/packages/fun/src/mem.ts +26 -0
package/packages/fun/src/memos.ts +28 -0
package/packages/fun/src/normalizeError.ts +25 -0
package/packages/fun/src/nothing.ts +3 -0
package/packages/fun/src/pipe.ts +18 -0
package/packages/fun/src/prettyJson.ts +3 -0
package/packages/fun/src/project.ts +8 -0
package/packages/fun/src/promise.ts +27 -0
package/packages/fun/src/pubsub.ts +128 -0
package/packages/fun/src/randomId.ts +14 -0
package/packages/fun/src/regexp-escape.ts +13 -0
package/packages/fun/src/retry.ts +15 -0
package/packages/fun/src/serial.test.ts +107 -0
package/packages/fun/src/serial.ts +17 -0
package/packages/fun/src/sleep.ts +3 -0
package/packages/fun/src/sort-object.ts +46 -0
package/packages/fun/src/speed-test.ts +56 -0
package/packages/fun/src/tick.ts +37 -0
package/packages/fun/src/time-behavior.ts +50 -0
package/packages/fun/src/time.ts +22 -0
package/packages/fun/src/timedFallback.ts +37 -0
package/packages/fun/src/timer.ts +30 -0
package/packages/fun/src/value.ts +33 -0
package/packages/fun/src/waitForCounter.ts +15 -0
package/packages/streamx/src/batch.ts +23 -0
package/packages/streamx/src/batchTimed.ts +113 -0
package/packages/streamx/src/buffer.ts +72 -0
package/packages/streamx/src/concatenate.ts +33 -0
package/packages/streamx/src/filter.ts +14 -0
package/packages/streamx/src/flat.ts +19 -0
package/packages/streamx/src/flatMap.ts +9 -0
package/packages/streamx/src/from.ts +30 -0
package/packages/streamx/src/index.ts +49 -0
package/packages/streamx/src/interval.ts +58 -0
package/packages/streamx/src/loop.ts +8 -0
package/packages/streamx/src/map.ts +12 -0
package/packages/streamx/src/merge.ts +89 -0
package/packages/streamx/src/nodeReadable.ts +6 -0
package/packages/streamx/src/nodeTransform.ts +9 -0
package/packages/streamx/src/nodeWritable.ts +38 -0
package/packages/streamx/src/objectReader.ts +16 -0
package/packages/streamx/src/polyfill.ts +20 -0
package/packages/streamx/src/reader.ts +38 -0
package/packages/streamx/src/reduce.ts +15 -0
package/packages/streamx/src/scale.ts +93 -0
package/packages/streamx/src/scaleSync.ts +13 -0
package/packages/streamx/src/sequence.ts +7 -0
package/packages/streamx/src/tap.ts +9 -0
package/packages/streamx/src/toArray.ts +9 -0
package/packages/streamx/src/writer.ts +96 -0
package/rnd/hf.ts +14 -0
package/rnd/keywords-compromise.ts +18 -0
package/rnd/keywords-pipeline.ts +79 -0
package/rnd/keywords.ts +38 -0
package/rnd/test-vectra-memory.ts +63 -0
package/rnd/vectra-keywords.ts +95 -0
package/rnd/vectra.ts +50 -0
package/tsconfig.json +14 -0

package/.ai/task/task.2026-04-10-dir-indexing.md ADDED Viewed

@@ -0,0 +1,92 @@
+# Task: Directory-based Indexing with Async Streams
+## Context
+Current `IndexApp` takes an explicit file list — no directory scanning. User wants to pass files **or** dirs, recursively scan dirs, and index everything as a stream.
+**Current pipeline** (`apps/indexApp.ts`):
+```
+files[] → for each → readFile → extractKeywords → cleanUp → indexContent
+```
+**streamx available** (`packages/streamx/`):
+- `from(iterable)` — wraps async/sync iterable into StreamX
+- `of()` → `.pipe()` for chaining
+- Operators: `map`, `filter`, `flat`, `flatMap`, `batch`, `buffer`, `merge`, `scale`, `reduce`, `tap`
+- `run()` — consumes stream, returns last value
+**Gitignore: `ignore` npm package** (used by ESLint, Prettier):
+```ts
+import ignore from "ignore";
+const ig = ignore();
+ig.add(await readFile(".gitignore", "utf8"));  // load rules
+ig.ignores("node_modules/foo.js");              // true
+ig.filter(["src/index.ts", "dist/out.js"]);     // ["src/index.ts"]
+```
+- Paths must be **relative** to .gitignore location
+- `.add()` stackable — call per nested .gitignore
+- Handles negation (`!`), globs, `**/`, comments
+**Decisions:**
+- Paths are **relative to working directory** (children of cwd)
+- Sequential indexing now; `scale()` for parallelism later
+- Use `ignore` npm package for .gitignore parsing
+- Default: if no .gitignore in a folder, skip `.*` dirs (`.git`, `.idea`, etc.)
+## Goal
+Accept files or directories as input, recursively walk directories (respecting .gitignore), and index all discovered files as an async stream using streamx.
+## Diagram
+```
+INPUT:  ["file.ts", "src/", "lib/"]
+         │
+         ├── file? ──→ yield relative path
+         │
+         └── dir? ──→ walk recursively
+                       │
+                       ├── load .gitignore (if exists)
+                       │   (else: default ignore .* dirs)
+                       │
+                       ├── skip ignored paths
+                       │
+                       └── yield each file (relative)
+                            │
+                            ▼
+              from(walkFiles) ──→ streamx pipeline
+                            │
+                  ├── map: readFile
+                  ├── map: extractKeywords + cleanUp
+                  └── tap: indexContent
+                            │
+                            ▼
+                      {indexed count}
+```
+## Steps
+### 1. Directory Walker
+- Create `componets/walkFiles.ts` — HOF `WalkFiles()` returning async generator that yields relative file paths
+- Detect file vs dir via `fs.stat`, yield files directly, recurse into dirs
+- Use `node:fs/promises` `opendir` for streaming directory reads
+### 2. Gitignore Filtering
+- Install `ignore` package — `npm install ignore`
+- Load `.gitignore` per directory during walk; stack rules with parent via `ig.add()`
+- Default rule when no `.gitignore`: skip `.*` dirs (`.git`, `.idea`, `.DS_Store` etc.)
+- Check `ig.ignores(relativePath)` before yielding or descending into subdirs
+### 3. Stream Pipeline
+- Wire walker into streamx: `from(walkFiles(inputs))` → `pipe(map(indexFile))` → `run()`
+- Update `IndexApp` HOF to accept `string[]` (mix of files and dirs)
+- `run.index.ts` passes argv as-is — no change needed
+## Dependencies
+- `ignore` — gitignore pattern matching (new dep)
+- `packages/streamx` — async stream operators (already in repo)
+Sources:
+- [ignore npm package](https://www.npmjs.com/package/ignore)
+- [node-ignore GitHub](https://github.com/kaelzhang/node-ignore)

package/.ai/task/task.2026-04-10-line-clustering.log.md ADDED Viewed

@@ -0,0 +1,50 @@
+# Log: Line-level clustering
+### 2026-04-10
+- Task created from user notes about recursive bisection clustering
+- Scouted codebase: current granularity is 1 vector per file, `id=filePath`, metadata in separate object store (MD5-keyed JSON)
+- Confirmed in-memory Vectra works via `VirtualFileStorage` (test-vectra-memory.ts) — no disk I/O, cosine similarity queries work with small dims
+- Key insight: Vectra `metadata: {}` is always empty in current code — all real metadata lives in object store. This pattern can extend to line-level clusters
+- Identified integration points: indexContent.ts (upsert), searchContentIndex.ts (query), indexMeta.ts (type), removeContent.ts (delete)
+- Open: similarity threshold calibration, keyword quality for code, cluster deletion strategy on re-index
+- Clarification round: user confirmed embedding cosine (not Jaccard) — meaning matters more than keyword overlap
+- Threshold: user says 0.55–0.70 range, will start at 0.6
+- Min cluster: 3–5 lines, default 5
+- ID format: `file.ts:12-45` works as-is for both Vectra and object store
+- Cleanup strategy: object store entry per file tracks all cluster IDs, delete all on re-index
+- Researched NPM packages: semantic-chunking (jparkerweb), semantic-chunker (johnhenry, BYOE), LangChain RecursiveCharacterTextSplitter. All target prose/sentences, not code lines. Custom bisection with our embed pipeline is better fit.
+- NAACL 2025: fixed-size chunks match semantic chunking for prose RAG, but code has mixed concerns per file where semantic splitting should win
+- Consistency check: expanded 3x3 → 6x3 steps with concrete file paths and implementation details
+- Fixed: diagram now shows full flow from handleFileEvent through clusterLines to persistent store + manifest
+- Fixed: IIndexMeta type is `{keywords, id}` not `{keywords, file}` — corrected in Context
+- Found: main change site is `indexFileContent.ts` (not `indexContent.ts`), and `handleFileEvent.ts` for cleanup
+- Found: cosine similarity doesn't need in-memory Vectra — embed returns normalized vectors, direct dot product suffices
+- Found: object store dual-use issue — need to store both cluster metas and file manifests with different shapes. Added to Open Questions.
+- Added Edge Cases section: small files, empty files, uniform content, legacy data
+- Consistency check #2: fixed Goal (removed "in-memory Vectra" — cosine is direct dot product), removed stale whole-file keyword step from diagram, added tagged union pattern for manifest, clarified RemoveFileContent as separate HOF, added buildComponents wiring step
+- Decision: cosine via direct dot product (Option A). 3-line helper, no Vectra needed for bisection — comparing exactly 2 vectors, not searching N. Fallback to in-memory Vectra (Option B) if needed.
+- **Key architecture correction**: existing file-level indexing must stay intact. Clustering is an EXTENSION, not a replacement. Both file-level and cluster-level entries coexist.
+- If file is cohesive (1 cluster = whole file) → skip clustering entirely, file-level entry is enough
+- Resolved object store dual-use: three separate keys — `filePath` (file meta), `filePath:1-10` (cluster meta), `filePath::manifest` (cluster ID list). Widen `IObjectStore` types to `IStoreEntry = IIndexMeta | IFileManifest`.
+- Traced full dependency wiring: `IndexFileContent` is constructed in run.*.ts (NOT inside contentIndexDriver). Plan: move construction inside driver since it has all deps. Simplifies callers.
+- `handleFileEvent` flow on re-index: `removeFileContent(path)` first (cleans file entry + clusters + manifest), then `indexFileContent(path, text)` (creates file entry + clusters + manifest)
+- Expanded steps from 6x3 → 7x(2-5) with concrete file paths, signatures, and implementation notes
+- Updated diagram to show full bidirectional flow: removal path + indexing path + all three store key types
+- Consistency check #3:
+  - CRITICAL: fixed file paths — `indexFileContent.ts` and `handleFileEvent.ts` are in `componets/index/`, not `componets/`. Same for `removeFileContent.ts` (new file).
+  - CRITICAL: found `indexApp.ts` gap — bulk indexer calls `indexFileContent` directly via stream (no `HandleFileEvent`). Old clusters would linger on re-index. Added `removeFileContent` dep to `IndexApp`, call cleanup before indexFileContent in stream callback.
+  - Fixed edge case: empty files still get file-level entry (existing pipeline), clustering just returns `[]`.
+  - Added `indexApp.ts`, `buildComponents.ts`, `run.*.ts` to Context key files.
+  - Added step 6.3 for indexApp.ts and step 6.4 for import path cleanup.
+  - Synced plan file with same fixes.
+- **Implementation complete** — all 7 steps done:
+  - Step 1: created `componets/index/clusterLines.ts` — cosine helper, ILineCluster, ClusterLines HOF
+  - Step 2: updated `indexMeta.ts` (IType tagged union: IIndexMeta, IClusterMeta, IFileManifest, IStoreEntry), `objectStore.ts` (widened types)
+  - Step 3: created `componets/index/removeFileContent.ts` — manifest-aware cleanup
+  - Step 4: updated `indexFileContent.ts` — kept file-level index, added clustering extension
+  - Step 5: updated `contentIndexDriver.ts` (wires ClusterLines, IndexFileContent, RemoveFileContent inside), `buildComponents.ts` (returns new components)
+  - Step 6: simplified `run.index.ts`, `run.watch.ts`, `run.mcp.ts` (removed manual IndexFileContent construction), updated `handleFileEvent.ts` (removeFileContent), updated `indexApp.ts` (added removeFileContent dep)
+  - Also updated `indexContent.ts` (widened meta param) and `searchContentIndex.ts` (narrow by type)
+- Reset + re-index: 113 files → 211 indexed items (98 cluster entries created)
+- Search verified: cluster hits showing as `file.ts:fromLine-toLine` (e.g. `rnd/test-vectra-memory.ts:9-12`)

package/.ai/task/task.2026-04-10-line-clustering.md ADDED Viewed

@@ -0,0 +1,176 @@
+# Task: Line-level clustering for block-granular search
+## Context
+**Current state**: xindex indexes one vector per file. The `id` is the file path, keywords are extracted from the entire file content, and search returns file-level matches. This is too coarse — a 500-line file with mixed concerns returns as a single hit with no indication of *where* in the file the match is.
+**User's idea**: split files into semantically coherent blocks (clusters of lines), then index each block separately so search returns `file:fromLine-toLine` references.
+**Approach — extend existing pipeline with recursive bisection**:
+1. Keep existing file-level indexing intact — `indexContent(filePath, keywords, meta)` runs first, unchanged
+2. After file-level index: split file content into lines
+3. Bisect into 2 halves → extract keywords for each → embed → compute cosine similarity (dot product of normalized vectors)
+4. If similarity is high (≥ 0.6) → cohesive, no clustering needed. If low → 2 separate clusters.
+5. Recurse: split each cluster into 2 again, test overlap, stop when clusters are cohesive or hit limits (max depth 4 → up to 16 clusters, min 5 lines per cluster)
+6. If only 1 cluster (whole file is cohesive) → skip clustering, file-level entry is enough
+7. If 2+ clusters → index each in persistent Vectra as `<file>:<fromLine>-<toLine>` alongside the file-level entry
+8. Write a manifest at `<file>::manifest` tracking cluster IDs for cleanup on re-index
+9. Both file-level and cluster-level entries coexist — search may return both
+**Key files (change targets)**:
+- `componets/index/indexFileContent.ts` — **main change site**: currently calls `indexContent(id, keywords, meta)` once per file. Will call `clusterLines` then loop over clusters.
+- `componets/index/handleFileEvent.ts` — calls `removeContent(path)` on file change. Must delete all clusters for a file, not just one ID.
+- `componets/index/indexContent.ts` — low-level: embeds + upserts one item. No logic change — called once per cluster; only the `meta` param type widens to `IIndexMeta | IClusterMeta` (Step 2.6).
+- `componets/index/removeContent.ts` — low-level: deletes one item. No change needed — called per cluster ID.
+- `componets/index/searchContentIndex.ts` — returns `IIndexRecord{score, id, meta}`. No logic change — `id` becomes `file:1-27` naturally; it only needs to narrow `IStoreEntry` by `type` when reading meta from the object store (Step 2.7).
+- `componets/index/indexMeta.ts` — `IIndexMeta{keywords, id}`. Add `type` tag, add `IClusterMeta`, `IFileManifest` using `IType<>` tagged union.
+- `componets/index/objectStore.ts` — stores `IIndexMeta` as JSON, keyed by MD5(id). Needs a manifest entry per file to track cluster IDs.
+- `componets/index/contentIndexDriver.ts` — wires components together. Must construct `ClusterLines`, `IndexFileContent`, `RemoveFileContent` inside. Currently `IndexFileContent` is constructed by callers.
+- `componets/buildComponents.ts` — top-level builder. Must return `indexFileContent` + `removeFileContent` from driver.
+- `apps/indexApp.ts` — bulk indexer. Calls `indexFileContent` directly via stream (no `HandleFileEvent`). Needs `removeFileContent` for cleanup.
+- `apps/run.index.ts`, `apps/run.watch.ts`, `apps/run.mcp.ts` — entry points. Currently construct `IndexFileContent` manually. Will use driver-provided version.
+- `componets/index/vectraIndex.ts` — creates `LocalIndex(path)`. No change.
+- `componets/llm/embed.ts` — MiniLM-L6 embeddings, returns `number[]`. No change.
+- `rnd/test-vectra-memory.ts` — proved `VirtualFileStorage` works for in-memory cosine queries.
+**Raw notes**: recursive split → 2 → 4 → 8 → 16 hard stop. Overlap by keywords via embedding cosine similarity. Final clusters get indexed in persistent store with line references. MCP query returns lines.
+## Goal
+Extend the existing indexing pipeline with a `ClusterLines` component (HOF pattern) that takes file content, splits it into semantically coherent line clusters using recursive bisection with embedding cosine similarity, and returns cluster descriptors `{fromLine, toLine, content, keywords}[]`. The existing file-level index stays intact — clustering adds block-level entries alongside it.
+## Diagram
+```
+handleFileEvent (file change/add)
+        │
+        ├── removeFileContent(path)              ◄── clean ALL old data first
+        │       ├── removeContent(path)               delete file-level vectra + meta
+        │       └── read manifest(path::manifest)     if exists:
+        │           ├── removeContent(path:1-10)        delete each cluster
+        │           ├── removeContent(path:11-25)
+        │           └── objectStore.remove(manifest)
+        │
+        └── indexFileContent(path, text)         ◄── create ALL new data
+                │
+                ├── EXISTING: file-level index (unchanged)
+                │   extractKeywords + cleanUpKeywords(text)
+                │   indexContent(path, keywords, {keywords, id: path})
+                │       ├── embed(keywords) → vector
+                │       ├── vectra.upsertItem({id: path, vector})
+                │       └── objectStore.write(path, meta)
+                │
+                ├── NEW: cluster-level index (extension)
+                │   clusterLines(lines, path)
+                │       │
+                │       ▼
+                │   ┌─────────────────┐
+                │   │  Split in half   │
+                │   │  lines[0..n/2]   │
+                │   │  lines[n/2..n]   │
+                │   └────────┬────────┘
+                │            │
+                │            ▼
+                │   ┌──────────────────────────┐
+                │   │  Extract keywords each    │
+                │   │  Embed keywords → vec     │
+                │   │  cosine(vecA, vecB)       │
+                │   └────────┬─────────────────┘
+                │            │
+                │       sim ≥ 0.6? ──yes──► 1 cluster (leaf)
+                │            │
+                │           no → recurse each half  ◄── depth ≤ 4, min 5 lines
+                │            │
+                │            ▼
+                │   clusters[] = {fromLine, toLine, content, keywords}[]
+                │
+                │   clusters.length ≤ 1? → SKIP (file entry is enough)
+                │
+                │   clusters.length > 1? → for each cluster:
+                │       indexContent(id="path:12-45", cluster.keywords, clusterMeta)
+                │
+                └── objectStore.write(path::manifest, {clusterIds})   ◄── only when clusters.length > 1
+Three key types in store:
+  path           → file-level entry (vectra + objectStore)
+  path:1-10      → cluster entry (vectra + objectStore)
+  path::manifest → {type:"manifest", clusterIds} (objectStore only)
+```
+## Steps
+### 1. ClusterLines component — NEW `componets/index/clusterLines.ts`
+1. **Cosine helper** — `cosine(a: number[], b: number[]): number` — dot product of two normalized vectors. Pure function, no deps.
+2. **HOF factory** — `ClusterLines({embed, extractKeywords, cleanUpKeywords, threshold, minLines, maxDepth})` returns `IClusterLines(lines: string[], file: string) → Promise<ILineCluster[]>`. Defaults: threshold=0.6, minLines=5, maxDepth=4.
+3. **ILineCluster type** — `{fromLine: number, toLine: number, content: string, keywords: string}`. `fromLine`/`toLine` are 1-based line numbers.
+4. **Recursive bisection** — split lines at midpoint → join each half → `extractKeywords` + `cleanUpKeywords` → `embed` each → `cosine(vecA, vecB)`. If sim ≥ threshold → leaf cluster. If sim < threshold → recurse on each half.
+5. **Guards** — `lines.length ≤ minLines` or `depth ≥ maxDepth` → leaf. Empty input (no lines) → return `[]`. Either half has no keywords → leaf.
+### 2. Extend metadata — MODIFY `componets/index/indexMeta.ts` + `objectStore.ts`
+1. **Tag IIndexMeta** — add `type: "meta"` field using `IType<>` pattern: `IType<{type: "meta", keywords: string, id: string}>`. Breaking change — all constructors must add `type: "meta"`.
+2. **Add IClusterMeta type** — `IType<{type: "cluster", keywords: string, id: string, fromLine: number, toLine: number}>`. Cluster-level entries with line ranges.
+3. **Add IFileManifest type** — `IType<{type: "manifest", id: string, clusterIds: string[]}>`. Stored at key `filePath::manifest` in object store.
+4. **IStoreEntry union** — `IIndexMeta | IClusterMeta | IFileManifest`. Discriminated by `type` field.
+5. **Widen objectStore types** — `IObjectStore.write`/`read` accept/return `IStoreEntry`.
+6. **Update indexContent.ts** — widen `meta` param from `IIndexMeta` to `IIndexMeta | IClusterMeta`.
+7. **Update searchContentIndex.ts** — narrow `IStoreEntry` by `type` when reading results from object store.
+### 3. RemoveFileContent — NEW `componets/index/removeFileContent.ts`
+1. **HOF factory** — `RemoveFileContent({removeContent, objectStore})` returns `IRemoveFileContent(filePath: string) => Promise<void>`.
+2. **Deletes all layers** — (a) `removeContent(filePath)` to delete file-level vectra item + meta. (b) Read manifest at `filePath::manifest` → if exists, `removeContent(clusterId)` for each → `objectStore.remove(manifestKey)`. (c) All deletes wrapped in try/catch — missing entries are fine (first-time index, no clusters).
+### 4. Update indexFileContent — MODIFY `componets/index/indexFileContent.ts`
+1. **Add deps** — `{extractKeywords, cleanUpKeywords, indexContent, clusterLines, objectStore}`. Existing deps stay — file-level index needs `extractKeywords`/`cleanUpKeywords`.
+2. **File-level index (EXISTING, now tagged)** — `extractKeywords(content)` → `cleanUpKeywords` → `indexContent(id, keywords, {type: "meta", keywords, id})`. Runs first, always.
+3. **Cluster-level index (NEW, extension)** — `content.split("\n")` → `clusterLines(lines, id)` → if `clusters.length ≤ 1` → skip (file is cohesive). If `clusters.length > 1` → for each cluster: ``indexContent(`${id}:${fromLine}-${toLine}`, cluster.keywords, {type: "cluster", ...})``.
+4. **Write manifest** — after all clusters indexed, `objectStore.write(id + "::manifest", {type: "manifest", id, clusterIds})`.
+### 5. Wire through driver + builder — MODIFY `contentIndexDriver.ts` + `buildComponents.ts`
+1. **contentIndexDriver.ts** — instantiate `ClusterLines({embed, extractKeywords, cleanUpKeywords})`. Construct `IndexFileContent({extractKeywords, cleanUpKeywords, indexContent, clusterLines, objectStore})` inside driver (currently constructed by callers). Construct `RemoveFileContent({removeContent, objectStore})`. Add `indexFileContent` + `removeFileContent` to `IContentIndexDriver`.
+2. **buildComponents.ts** — destructure `indexFileContent` + `removeFileContent` from `ContentIndexDriver`. Return them. Callers no longer construct `IndexFileContent` themselves.
+### 6. Update callers — MODIFY `run.*.ts` + `handleFileEvent.ts` + `indexApp.ts`
+1. **run.index.ts, run.watch.ts, run.mcp.ts** — remove `IndexFileContent(...)` construction. Get `indexFileContent` + `removeFileContent` from `BuildComponents()`.
+2. **handleFileEvent.ts** — replace `removeContent` dep with `removeFileContent`. On `FileEventType.index`: `removeFileContent(path)` first (clean old data), then `indexFileContent(path, text)` (creates file entry + cluster entries). On `FileEventType.remove`: `removeFileContent(path)`.
+3. **indexApp.ts** — currently calls `indexFileContent(id, text)` directly via stream pipeline (no `HandleFileEvent`). Add `removeFileContent` dep. Call `removeFileContent(id)` before `indexFileContent(id, text)` in the `map` callback — otherwise old clusters linger when cluster boundaries change on re-index. Update `IndexApp({walkFiles, indexFileContent, removeFileContent, log})`.
+4. **Import paths** — `run.index.ts` imports `IndexFileContent` from `componets/index/indexFileContent.js`. After moving construction inside driver, remove this import. Same for `run.watch.ts` and `run.mcp.ts`.
+### 7. Test end-to-end
+1. **Unit test clusterLines** — feed a file with 2 distinct sections (imports+types vs. implementation), verify ≥2 clusters with correct 1-based line ranges.
+2. **Integration test** — index a multi-concern file, query for a specific concept, verify search returns both `file.ts` (file-level) and `file.ts:12-45` (cluster-level).
+3. **Re-index test** — modify file, re-index, verify old clusters deleted + new ones created.
+4. **Cohesive file test** — index a small/uniform file, verify only file-level entry exists (no clusters, no manifest).
+## Decisions
+- **Extend, don't replace** — existing file-level indexing stays intact. Clustering is an additional step that runs after. Both levels coexist in the index.
+- **1 cluster = skip** — if the file is cohesive (clustering returns 1 cluster = whole file), no cluster entries are created. File-level entry is enough.
+- **Embedding cosine similarity** for bisection (not Jaccard). Jaccard only matches exact keyword strings — `fetchUser` and `getUser` would score 0% overlap despite being the same concern. Embeddings capture meaning. Cost is acceptable: MiniLM-L6 is local, ~30 embed calls per file at max depth, ~50-100ms total.
+- **Cosine computation**: Option A — direct dot product (3-line helper, vectors already normalized). Fallback to Option B (in-memory Vectra via `VirtualFileStorage`) if direct cosine proves insufficient.
+- **Similarity threshold**: expected useful range is 0.55–0.70; start with 0.6 as the default and tune empirically.
+- **Min cluster size**: 3–5 lines. Use 5 as default, configurable.
+- **Three tagged types in store** (using `IType<>` pattern): `IIndexMeta{type:"meta"}` at `filePath`, `IClusterMeta{type:"cluster"}` at `filePath:fromLine-toLine`, `IFileManifest{type:"manifest"}` at `filePath::manifest`. All separate keys, discriminated by `type`.
+- **Cleanup on re-index**: `removeFileContent` deletes file-level entry, then reads manifest to delete all cluster entries, then deletes manifest itself. Graceful on missing data.
+- **Move IndexFileContent inside driver** — currently constructed by callers in `run.*.ts`. Moving inside `contentIndexDriver.ts` consolidates wiring since the driver already has all deps.
+## Research: existing NPM packages
+- **semantic-chunking** (jparkerweb, v2.4.4) — splits text into sentences, embeds each with ONNX model, groups by cosine similarity. Sentence-level, not line-level. Uses its own ONNX pipeline, not BYOE.
+- **semantic-chunker** (johnhenry) — BYOE approach, bring your own embedding function. More flexible. Could plug in our MiniLM-L6 embed.
+- **LangChain RecursiveCharacterTextSplitter** — recursive splitting by character/token boundaries, not semantic. 2026 benchmarks show 512-token recursive splitting at 69% accuracy — good baseline but not meaning-aware.
+- **NAACL 2025 finding**: fixed 200-word chunks match or beat semantic chunking for general RAG. But for *code* with mixed concerns in one file, semantic splitting should outperform fixed-size.
+- **Verdict**: existing packages target prose (sentence-level). Our use case is code (line-level, preserve line boundaries for references). Custom recursive bisection with our existing embed pipeline is the right call — simpler than adapting a prose chunker to respect line boundaries.
+## Edge Cases
+- **Small files (≤ 5 lines)** — return as single cluster, no splitting attempted.
+- **Empty files** — file-level entry is still indexed (existing pipeline runs first). Clustering returns `[]`, no cluster entries created.
+- **Files with uniform content** (e.g., all imports) — cosine similarity stays high at every split, returns 1 cluster. Expected behavior.
+- **Binary/non-text files** — already filtered upstream by the file walker. Not a concern here.
+- **Legacy index data** — files indexed before this change won't have manifests. On re-index, no old clusters to delete — just index fresh.
+## Open Questions
+- **Keyword extraction quality**: current keywords come from compromise NLP + keyword-extractor. May need tuning for code (variable names, imports, function signatures).
+- **Threshold tuning**: need to test 0.55 vs 0.60 vs 0.70 on real project files to find the sweet spot.
+- ~~**Object store dual use**~~ — resolved: three tagged types (`IIndexMeta`, `IClusterMeta`, `IFileManifest`) discriminated by `type` field, stored at separate keys. Union `IStoreEntry = IIndexMeta | IClusterMeta | IFileManifest`.

package/.ai/task/task.2026-04-10-object-store.log.md ADDED Viewed

@@ -0,0 +1,7 @@
+### 2026-04-10 — Task created
+- Scouted: vectra currently stores vector + IIndexMeta (keywords, file) together
+- User wants to separate: vectra for vectors only, .xindex/objects/ for meta JSON
+- Hash-based path: md5(id) → xx/yy/xxyyzz.json
+- Need to update indexContent, searchContentIndex, resetIndex, contentIndexDriver
+- New components: objectStore (read/write/clear), indexStructure (manage .xindex/ dirs)

package/.ai/task/task.2026-04-10-object-store.md ADDED Viewed

@@ -0,0 +1,81 @@
+# Task: Object Store — Separate Meta Storage from Vectra
+## Context
+Currently vectra stores both vectors AND metadata (`{keywords, file}`) in the same index. Vectra is good for semantic search, not for storage. Goal: split storage into two layers:
+- **`.xindex/semantic/`** — vectra stores only vectors + id (for search)
+- **`.xindex/objects/`** — file-based JSON store for meta objects (for storage/retrieval)
+**Current state:**
+- `indexContent.ts` — embeds content, upserts `{id, vector, metadata: IIndexMeta}` into vectra
+- `searchContentIndex.ts` — queries vectra, reads `r.item.metadata as IIndexMeta`
+- `resetIndex.ts` — `deleteIndex()` + `createIndex()` on vectra only
+- `IIndexMeta = {keywords: string, file: string}`
+- Index path: `.xindex` (single vectra folder)
+**New structure:**
+```
+.xindex/
+├── semantic/     ← vectra (vectors + id only, no metadata)
+└── objects/      ← JSON files keyed by hash of id
+    └── xx/
+        └── yy/
+            └── xxyyzz.json  ← {keywords, file, ...}
+```
+## Goal
+Introduce an object store layer that writes `IIndexMeta` as JSON files in `.xindex/objects/`, remove metadata from vectra (keep only vector + id), and decorate `indexContent` and `searchContentIndex` to read/write both layers.
+## Diagram
+```
+INDEX PIPELINE:
+  file → extractKeywords → cleanUp → keywords
+    │
+    ├── [1] embed(keywords) → vector
+    │     └── vectra.upsert({id, vector})     → .xindex/semantic/
+    │
+    └── [2] objectStore.write(id, meta)       → .xindex/objects/xx/yy/xxyyzz.json
+                                                 {keywords, file}
+SEARCH PIPELINE:
+  query → extractKeywords → cleanUp → embed → vector
+    │
+    ├── [1] vectra.query(vector, limit)       → [{score, id}]
+    │
+    └── [2] objectStore.read(id)              → IIndexMeta
+                                                 ↓
+                                          [{score, id, meta}]
+RESET:
+  [1] vectra.deleteIndex + createIndex        → .xindex/semantic/ wiped
+  [2] rm -rf .xindex/objects/                 → objects wiped
+```
+## Steps
+### 1. Object Store HOF
+- Create `componets/index/objectStore.ts` — `ObjectStore({basePath}): IObjectStore`
+- `write(id, meta)` — hash id (md5 → hex), split into `xx/yy/xxyyzz`, `mkdir -p`, write JSON
+- `read(id)` — hash id, read JSON, parse as `IIndexMeta`
+- `clear()` — rm -rf basePath, recreate empty dir
+### 2. Update Index Structure
+- Create `componets/index/indexStructure.ts` — `IndexStructure({basePath}): IIndexStructure`
+- Manages `.xindex/` top-level: ensures `semantic/` and `objects/` dirs exist
+- Returns paths: `{semanticPath, objectsPath}`
+- Used by `contentIndexDriver` at init
+### 3. Decorate Index/Search
+- Update `IndexContent` — upsert vector+id to vectra (no meta), write meta to objectStore
+- Update `SearchContentIndex` — query vectra for `{score, id}[]`, then `objectStore.read(id)` for each result to attach meta
+- Update `ResetIndex` — call both `vectra.deleteIndex/createIndex` and `objectStore.clear()`
+- Update `ContentIndexDriver` — pass `semanticPath` to `VectraIndex`, create `ObjectStore({basePath: objectsPath})`
+## Open Questions
+- Hash function: `crypto.createHash('md5')` from Node's built-in `crypto` module — fast enough, no extra deps. Or use a simpler hash?
+- Should objectStore support partial updates (upsert) or always overwrite?
+- Should search batch-read objects or read one by one per result?

package/.ai/task/task.2026-04-10-search-config.log.md ADDED Viewed

@@ -0,0 +1,46 @@
+### 2026-04-10
+- Task created from user notes
+- Scouted codebase via xindex search (indexed 167 files)
+- `.xindex.json` exists but is empty (`{}`); no config loading anywhere
+- `CleanUpKeywords` at `componets/keywords/cleanUpKeywords.ts:8` — HOF takes `{maxNgrams, minLength}`. Add `ignoreKeywords` here.
+- `SearchContentIndex` at `componets/index/searchContentIndex.ts:12` — search pipeline, uses `cleanUpKeywords` on query. Ignore list propagates automatically.
+- `IClusterMeta` at `componets/index/indexMeta.ts:11` — has `fromLine`/`toLine` for reading snippet lines
+- MCP tool at `apps/mcpApp.ts:34` — `xindex_search` schema has `{query, limit}` only. Add snippet params.
+- CLI at `apps/run.search.ts:23-31` — formats results with score + keywords, no snippets
+- `BuildComponents` at `componets/buildComponents.ts:6` — wires everything, no config loading. Config loads here.
+- `ContentIndexDriver` at `componets/index/contentIndexDriver.ts:27` — passes `cleanUpKeywords` to `ClusterLines` and `SearchContentIndex`
+- Entry points: `apps/run.mcp.ts:19` (MCP), `apps/run.search.ts:8` (CLI) — both call `BuildComponents()`
+- User wants explicit config names: `ignoreKeywords`, `snippetLines`, `snippetResults`
+**Clarification round — decisions:**
+- Defaults confirmed: `snippetResults: 3`, `snippetLines: 7`
+- `ignoreKeywords`: exact strings, case-insensitive. No globs/patterns.
+- Ignore at **index time** — re-index + MCP restart after config change is acceptable. One-time setup, review in 3 months.
+- File-level results (whole file, no cluster) also get snippets if file total lines ≤ `snippetLines`
+- Task finalized
+**Round 2 — user feedback during detail expansion:**
+- Renamed `snippetLines` → `maxSnippetLines`, `snippetResults` → `maxSnippetResults` (user preference for explicit names)
+- Added `ignoreFiles` feature: gitignore-style glob patterns in `.xindex.json` to exclude files from indexing. Reuses existing `ignore` package already in `walkFiles.ts:3` and `watchFiles.ts`
+- Expanded task from 3x3 to 4x3 to accommodate file ignore list as separate step
+- Traced all WalkFiles/WatchFiles consumers: `run.mcp.ts`, `run.index.ts`, `run.watch.ts` — all need `ignoreFiles` plumbed
+- Task ready for implementation
+**Round 3 — consistency check (7 findings, all fixed):**
+- [Missing] `.xindex.json` is optional — added to Decisions + diagram label
+- [Drift] Diagram only showed WalkFiles — added WatchFiles
+- [Mismatch] Step 2.3 duplicated validation from 1.2 — removed 2.3, kept in 1.2 only
+- [Mismatch] `console.warn` in LoadConfig violates project `ILogger` pattern — added `log: ILogger` dep to LoadConfig and BuildComponents
+- [Drift] Files Changed table had tentative "(if it creates its own)" for run.index.ts — made definitive
+- [Inconsistency] Step 4.2 parsed fromLine/toLine from ID string — uses `meta.fromLine`/`meta.toLine` directly now
+- [Missing] Step 1.3 vague "WalkFiles consumers" — listed all 5 specific construction sites (run.mcp.ts:18,30, run.index.ts:10, run.watch.ts:13,14)
+**Round 4 — implementation:**
+- Implemented all 12 files (3 new, 9 modified) + run.reset.ts (missed in plan, also calls BuildComponents)
+- Phase 1: config type + loadConfig HOF
+- Phase 2: cleanUpKeywords ignoreSet, walkFiles + watchFiles ignoreFiles
+- Phase 3: readSnippet HOF
+- Phase 4: buildComponents wiring ({log} param, config loading, return config)
+- Phase 5: all entry points updated (run.mcp, run.search, run.index, run.watch, run.reset)
+- Verified: keyword ignore filters noisy words, file ignore excludes rnd/**, snippets show for small results (top 3, ≤7 lines)