npm - xindex - Versions diffs - 1.0.0 - Mend

xindex 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (160) hide show

package/.ai/research/2026-04-10-file-watching.md +79 -0
package/.ai/research/2026-04-10-mcp-output-format.md +129 -0
package/.ai/task/INDEX.md +12 -0
package/.ai/task/done/INDEX.md +3 -0
package/.ai/task/done/task.2026-04-09-local-ai-research-protos.log.md +98 -0
package/.ai/task/done/task.2026-04-09-local-ai-research-protos.md +102 -0
package/.ai/task/task.2026-04-10-cluster-config.log.md +19 -0
package/.ai/task/task.2026-04-10-cluster-config.md +118 -0
package/.ai/task/task.2026-04-10-dir-indexing.log.md +8 -0
package/.ai/task/task.2026-04-10-dir-indexing.md +92 -0
package/.ai/task/task.2026-04-10-line-clustering.log.md +50 -0
package/.ai/task/task.2026-04-10-line-clustering.md +176 -0
package/.ai/task/task.2026-04-10-object-store.log.md +7 -0
package/.ai/task/task.2026-04-10-object-store.md +81 -0
package/.ai/task/task.2026-04-10-search-config.log.md +46 -0
package/.ai/task/task.2026-04-10-search-config.md +274 -0
package/.ai/task/task.2026-04-10-watch-indexing.log.md +32 -0
package/.ai/task/task.2026-04-10-watch-indexing.md +101 -0
package/.ai/task/task.2026-04-10-xindex-mcp.log.md +5 -0
package/.ai/task/task.2026-04-10-xindex-mcp.md +92 -0
package/.ai/task/task.2026-04-10-xindex-mcp.report.md +113 -0
package/.claude/settings.local.json +73 -0
package/.claude/skills/make-hof/SKILL.md +8 -0
package/.claude/skills/make-hof/playbook.md +38 -0
package/.cursor/mcp.json +8 -0
package/.mcp.json +8 -0
package/.xindex.json +22 -0
package/CLAUDE.md +54 -0
package/README.md +206 -0
package/apps/indexApp.ts +31 -0
package/apps/mcpApp.ts +119 -0
package/apps/run.index.ts +19 -0
package/apps/run.mcp.ts +49 -0
package/apps/run.reset.ts +10 -0
package/apps/run.search.ts +21 -0
package/apps/run.watch.ts +44 -0
package/apps/searchApp.ts +9 -0
package/apps/watchApp.ts +53 -0
package/apps/watchFileEventsApp.ts +39 -0
package/bin/xindex-index +2 -0
package/bin/xindex-mcp +2 -0
package/bin/xindex-reset +2 -0
package/bin/xindex-search +2 -0
package/bin/xindex-watch +2 -0
package/componets/IType.ts +1 -0
package/componets/appId.ts +3 -0
package/componets/buildComponents.ts +27 -0
package/componets/config/loadConfig.ts +43 -0
package/componets/config/xindexConfig.ts +4 -0
package/componets/index/contentIndexDriver.ts +39 -0
package/componets/index/formatSearchResults.ts +18 -0
package/componets/index/getIndexStats.ts +11 -0
package/componets/index/handleFileEvent.ts +25 -0
package/componets/index/indexApi.ts +45 -0
package/componets/index/vectraIndex.ts +11 -0
package/componets/index/watcherLock.ts +107 -0
package/componets/keywords/cleanUpKeywords.ts +38 -0
package/componets/keywords/extractKeywords.ts +14 -0
package/componets/keywords/refineKeywords.ts +16 -0
package/componets/llm/embed.ts +18 -0
package/componets/llm/queryLLM.ts +20 -0
package/componets/logger.ts +34 -0
package/componets/walkFiles.ts +51 -0
package/componets/watchFiles.ts +106 -0
package/features/indexContent.ts +16 -0
package/features/removeContent.ts +9 -0
package/features/resetIndex.ts +9 -0
package/features/searchIndex.ts +33 -0
package/package.json +32 -0
package/packages/fun/src/IType.ts +5 -0
package/packages/fun/src/array-finder.ts +55 -0
package/packages/fun/src/array-index.ts +35 -0
package/packages/fun/src/array.ts +112 -0
package/packages/fun/src/assert.ts +5 -0
package/packages/fun/src/asyncRequest.ts +35 -0
package/packages/fun/src/callsites.ts +18 -0
package/packages/fun/src/case-never.ts +9 -0
package/packages/fun/src/casting.ts +41 -0
package/packages/fun/src/collect.ts +13 -0
package/packages/fun/src/concurrency.ts +186 -0
package/packages/fun/src/container.ts +86 -0
package/packages/fun/src/counter.ts +45 -0
package/packages/fun/src/create-map.ts +2 -0
package/packages/fun/src/dedupe.ts +2 -0
package/packages/fun/src/defer.ts +55 -0
package/packages/fun/src/delay.ts +5 -0
package/packages/fun/src/discriminate.ts +34 -0
package/packages/fun/src/enum-values.ts +12 -0
package/packages/fun/src/exponential-backoff.ts +20 -0
package/packages/fun/src/flatten.ts +11 -0
package/packages/fun/src/hash.ts +67 -0
package/packages/fun/src/hash128.ts +6 -0
package/packages/fun/src/hash256.ts +6 -0
package/packages/fun/src/hub.ts +53 -0
package/packages/fun/src/id.ts +10 -0
package/packages/fun/src/interval.ts +76 -0
package/packages/fun/src/is-non-nullable.ts +2 -0
package/packages/fun/src/isIterable.ts +3 -0
package/packages/fun/src/mailbox.ts +13 -0
package/packages/fun/src/map-record.ts +19 -0
package/packages/fun/src/match-collections.ts +57 -0
package/packages/fun/src/match-left-and-right-arrays.ts +78 -0
package/packages/fun/src/mem.ts +26 -0
package/packages/fun/src/memos.ts +28 -0
package/packages/fun/src/normalizeError.ts +25 -0
package/packages/fun/src/nothing.ts +3 -0
package/packages/fun/src/pipe.ts +18 -0
package/packages/fun/src/prettyJson.ts +3 -0
package/packages/fun/src/project.ts +8 -0
package/packages/fun/src/promise.ts +27 -0
package/packages/fun/src/pubsub.ts +128 -0
package/packages/fun/src/randomId.ts +14 -0
package/packages/fun/src/regexp-escape.ts +13 -0
package/packages/fun/src/retry.ts +15 -0
package/packages/fun/src/serial.test.ts +107 -0
package/packages/fun/src/serial.ts +17 -0
package/packages/fun/src/sleep.ts +3 -0
package/packages/fun/src/sort-object.ts +46 -0
package/packages/fun/src/speed-test.ts +56 -0
package/packages/fun/src/tick.ts +37 -0
package/packages/fun/src/time-behavior.ts +50 -0
package/packages/fun/src/time.ts +22 -0
package/packages/fun/src/timedFallback.ts +37 -0
package/packages/fun/src/timer.ts +30 -0
package/packages/fun/src/value.ts +33 -0
package/packages/fun/src/waitForCounter.ts +15 -0
package/packages/streamx/src/batch.ts +23 -0
package/packages/streamx/src/batchTimed.ts +113 -0
package/packages/streamx/src/buffer.ts +72 -0
package/packages/streamx/src/concatenate.ts +33 -0
package/packages/streamx/src/filter.ts +14 -0
package/packages/streamx/src/flat.ts +19 -0
package/packages/streamx/src/flatMap.ts +9 -0
package/packages/streamx/src/from.ts +30 -0
package/packages/streamx/src/index.ts +49 -0
package/packages/streamx/src/interval.ts +58 -0
package/packages/streamx/src/loop.ts +8 -0
package/packages/streamx/src/map.ts +12 -0
package/packages/streamx/src/merge.ts +89 -0
package/packages/streamx/src/nodeReadable.ts +6 -0
package/packages/streamx/src/nodeTransform.ts +9 -0
package/packages/streamx/src/nodeWritable.ts +38 -0
package/packages/streamx/src/objectReader.ts +16 -0
package/packages/streamx/src/polyfill.ts +20 -0
package/packages/streamx/src/reader.ts +38 -0
package/packages/streamx/src/reduce.ts +15 -0
package/packages/streamx/src/scale.ts +93 -0
package/packages/streamx/src/scaleSync.ts +13 -0
package/packages/streamx/src/sequence.ts +7 -0
package/packages/streamx/src/tap.ts +9 -0
package/packages/streamx/src/toArray.ts +9 -0
package/packages/streamx/src/writer.ts +96 -0
package/rnd/hf.ts +14 -0
package/rnd/keywords-compromise.ts +18 -0
package/rnd/keywords-pipeline.ts +79 -0
package/rnd/keywords.ts +38 -0
package/rnd/test-vectra-memory.ts +63 -0
package/rnd/vectra-keywords.ts +95 -0
package/rnd/vectra.ts +50 -0
package/tsconfig.json +14 -0

package/.ai/research/2026-04-10-file-watching.md ADDED Viewed

@@ -0,0 +1,79 @@
+# Research: File Watching in Node.js (2026)
+## Question 1: fs.watch recursive — platform support?
+**macOS** — native FSEvents backend, recursive works perfectly since early Node versions.
+**Windows** — native ReadDirectoryChangesW, recursive works since early Node versions.
+**Linux** — added in Node ~19 via [PR #45098](https://github.com/nodejs/node/pull/45098) (Oct 2022). Uses inotify (opens one fd per directory, not native recursive). Had race condition bug in Node 20.3.0 ([#48437](https://github.com/nodejs/node/issues/48437)), fixed in [PR #51406](https://github.com/nodejs/node/pull/51406). Also had crash-on-delete bug, fixed in [commit e7d0d80](https://github.com/nodejs/node/commit/e7d0d804b2).
+**Status:** Recursive fs.watch works on all three platforms in Node 22+. Linux implementation is stable after fixes.
+## Question 2: chokidar vs fs.watch — still needed?
+**chokidar v5** (Nov 2025):
+- ESM-only, min Node 20, TypeScript rewrite
+- Deps reduced from 13 → 1
+- Still uses fs.watch as primary backend, normalizes events
+- Events: `add`, `addDir`, `change`, `unlink`, `unlinkDir`, `ready`
+- ~30M repos, de facto standard
+- API: event emitter pattern (`watcher.on("add", path => ...)`)
+**When chokidar adds value:**
+- Cross-platform consistency (normalizes all platform quirks)
+- Glob pattern matching (no longer applies — glob support was removed in chokidar v4, so v5 does not offer it)
+- Handles edge cases: atomic writes, duplicate events, initial scan
+- `ready` event (know when initial scan is done)
+**When fs.watch is sufficient:**
+- Single platform or modern Node (22+)
+- Simple needs (just file paths + change type)
+- Already have debouncing infrastructure
+- Prefer async iterable over event emitter
+## Question 3: @parcel/watcher and alternatives?
+**@parcel/watcher** — native C++ addon. Backends: FSEvents (macOS), inotify (Linux), ReadDirectoryChangesW (Windows). Most performant for large codebases. Heavy dep (native addon build). Vite considered switching to it from chokidar ([#12495](https://github.com/vitejs/vite/issues/12495)).
+**node-watch** — thin wrapper over fs.watch, adds recursive support for Linux. Lighter than chokidar.
+**watchpack** — webpack's watcher. Uses chokidar under the hood.
+None of these add significant value over chokidar or native fs.watch for our use case.
+## Question 4: fs.watch known issues + best practices?
+**Issues:**
+- Duplicate events per single file save (editor writes temp → rename → delete)
+- Null filenames on some platforms/scenarios
+- "rename" event is ambiguous (create, delete, or rename)
+- No built-in debouncing
+**Best practices:**
+- Debounce: 50-200ms window to batch rapid events
+- Stat validation: after event, `stat()` to check if file exists and get mtime
+- Resource cleanup: always `watcher.close()` on shutdown
+- Path handling: `fs.watch` gives filename relative to watched dir, need `path.join`
+## Decision: fs.watch for xindex
+**Recommendation: native `fs.watch`** with our own debouncing via streamx `batchTimed`.
+**Why:**
+1. Zero new deps — project is private, macOS primary, Node 22+ assumed
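+2. Async iterable — the promise-based `watch()` from `node:fs/promises` returns `AsyncIterable<FileChangeInfo>` (the callback-style `fs.watch` returns an `FSWatcher` event emitter instead), which fits the streamx architecture naturally (no adapter needed)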
+3. Debouncing covered — `batchTimed(20, 150)` already in streamx handles the duplicate event problem
+4. Stat validation — simple: `stat()` after event, exists → index, throws → remove
+5. Simpler shutdown — close watcher handle vs chokidar's async `.close()`
+**Tradeoff accepted:** more manual edge case handling (null filenames, dedup). Acceptable for a private tool with batchTimed already available.
+**If issues arise:** chokidar v5 is 1 dep away, same ESM/Node 20+ requirements, drop-in upgrade path. Not worth adding preemptively.
+## Sources
+- [Node.js fs.watch recursive Linux PR #45098](https://github.com/nodejs/node/pull/45098)
+- [Node 20 recursive bug #48437](https://github.com/nodejs/node/issues/48437)
+- [Chokidar v5 README](https://github.com/paulmillr/chokidar/blob/main/README.md)
+- [Vite fs.watch discussion #12495](https://github.com/vitejs/vite/issues/12495)
+- [@parcel/watcher](https://github.com/parcel-bundler/watcher)
+- [fs.watch best practices](https://www.w3tutorials.net/blog/nodejs-fs-watch/)

package/.ai/research/2026-04-10-mcp-output-format.md ADDED Viewed

@@ -0,0 +1,129 @@
+# MCP Tool Output Format for LLM Consumption
+**Question**: What output format should our xindex_search MCP tool use to return search results to an LLM?
+**Current state**: `JSON.stringify(results, null, 2)` — pretty-printed JSON with score, id, meta.keywords, meta.file (id and meta.file are redundant).
+---
+## Findings
+### 1. Token efficiency benchmarks (ImprovingAgents, Oct 2025)
+**Nested data** — 1,000 questions, 3 models, 4 formats:
+| Format   | Tokens  | GPT-5 Nano | Gemini 2.5 Flash Lite |
+|----------|---------|------------|----------------------|
+| Markdown | 38,357  | 54.3%      | 48.2%                |
+| YAML     | 42,477  | 62.1%      | 51.9%                |
+| JSON     | 57,933  | 50.3%      | 43.1%                |
+| XML      | 68,804  | 44.4%      | 33.8%                |
+Markdown uses **34% fewer tokens** than JSON. YAML has the best accuracy on both models but uses about 11% more tokens than Markdown.
+**Flat/tabular data** — 11 formats, 1,000 queries, GPT-4.1-nano:
+| Format         | Accuracy | Tokens  | Efficiency |
+|----------------|----------|---------|------------|
+| Markdown-KV    | 60.7%    | 52,104  | Best accuracy |
+| Markdown Table | 51.9%    | 25,140  | Best ratio |
+| JSON           | 52.3%    | 66,396  | Mediocre |
+| CSV            | 44.3%    | 19,524  | Cheapest but worst |
+For flat data (which our search results are), **Markdown-KV** gives the best LLM comprehension. A numbered list with `key: value` pairs is effectively Markdown-KV.
+Sources: [Nested formats](https://www.improvingagents.com/blog/best-nested-data-format/), [Table formats](https://www.improvingagents.com/blog/best-input-data-format-for-llms/)
+### 2. MCP spec guidance (June 2025)
+- `content` (TextContent) = what the LLM reads
+- `structuredContent` = machine-to-machine, optional
+- Spec's own example uses **plain text**: `"Current weather in New York:\nTemperature: 72°F\nConditions: Partly cloudy"`
+- If `outputSchema` is defined, SHOULD return both `structuredContent` AND serialized JSON in TextContent for backwards compat
+The spec explicitly shows plain text as the standard tool result format for LLM consumption.
+Source: [MCP Tools Spec](https://modelcontextprotocol.io/specification/2025-06-18/server/tools)
+### 3. What popular MCP servers do
+| Server       | Output format |
+|-------------|--------------|
+| Perplexity  | AI-synthesized text + citation URLs |
+| Context7    | Plain text documentation snippets |
+| markdownify | Markdown (entire category exists for this) |
+| Elasticsearch | JSON (machine-oriented) |
+LLM-facing servers use text/markdown. Only machine-oriented servers use JSON.
+### 4. JSON specifically degrades LLM reasoning
+- Aider benchmarks: JSON wrapping reduces code reasoning quality by 10-15% ([source](https://aider.chat/2024/08/14/code-in-json.html))
+- arxiv paper: frontier models top out at ~77% accuracy on JSON processing tasks ([source](https://arxiv.org/html/2510.15955v1))
+- OpenAI community: Markdown is 15% more token-efficient than JSON ([source](https://community.openai.com/t/markdown-is-15-more-token-efficient-than-json/841742))
+### 5. TOON format (Nov 2025) — not recommended
+New token-optimized format. Mixed results: 73.9% on flat retrieval but **last place** (43.1%) on nested data. Immature ecosystem, no MCP support. Not applicable here.
+Source: [TOON benchmarks](https://www.improvingagents.com/blog/toon-benchmarks/)
+### 6. Workato design guidelines
+- Return only necessary fields — avoid sending 200+ fields when 3 suffice
+- Preprocess/summarize large content before returning to LLM
+- Consider token efficiency — "excessive data can overwhelm the AI agent"
+Source: [Workato MCP Tool Design](https://docs.workato.com/en/mcp/mcp-server-tool-design.html)
+---
+## Analysis
+Our search results are **flat data** with 3 fields per result (score, file path, keywords). This is the simplest case:
+| Approach | Tokens/result | LLM quality | Fit |
+|----------|--------------|-------------|-----|
+| Pretty JSON (current) | ~55 | Worst — syntax overhead | Bad |
+| Compact JSON | ~22 | OK but cryptic keys | Meh |
+| Markdown numbered list | ~12 | Best — Markdown-KV pattern | Best |
+| TSV | ~15 | OK but less natural | OK |
+The markdown numbered list matches the **Markdown-KV** pattern that scored highest (60.7%) in flat data benchmarks. It's also **77% fewer tokens** than current JSON.
+Additional advantages:
+- File path is visually prominent (it's what the LLM acts on next)
+- Score at 2 decimals is sufficient ranking signal
+- Keywords give semantic context without opening the file
+- Zero structural noise (no braces, brackets, quotes, commas)
+- Matches how Perplexity/Context7 format their responses
+No significant trade-offs: we don't need machine-parseability (the consumer is always an LLM), and there's no nested data to worry about.
+---
+## Recommendation
+**Switch to markdown numbered list.**
+```
+Search: "authentication flow" — 3 result(s)
+1. src/components/auth.ts (0.87) — authentication, login, session, token
+2. src/middleware/jwt.ts (0.81) — jwt, token, verify, middleware
+3. src/routes/login.ts (0.74) — login, form, credentials, redirect
+```
+Implementation in `mcpApp.ts`:
+```ts
+const header = `Search: "${query}" — ${results.length} result(s)\n\n`;
+const lines = results.map((r, i) =>
+    `${i + 1}. ${r.id} (${r.score.toFixed(2)}) — ${r.meta.keywords ?? ""}`
+);
+const text = header + lines.join("\n");
+return {content: [{type: "text" as const, text}]};
+```
+Empty case: `No results for "${query}"` — avoids confusing the model with an empty list.
+**Future consideration**: Add `outputSchema` + `structuredContent` when clients start using it, but keep TextContent as the primary format for LLM consumption.

package/.ai/task/INDEX.md ADDED Viewed

@@ -0,0 +1,12 @@
+# Tasks
+- [xindex-mcp — MCP Server for Semantic Code Search](task.2026-04-10-xindex-mcp.md) — wrap xindex as MCP server so Claude Code can search codebase
+- [Directory-based Indexing with Async Streams](task.2026-04-10-dir-indexing.md) — accept files/dirs, recursive walk with .gitignore, index via streamx pipeline
+- [xindex-watch — Continuous Indexing](task.2026-04-10-watch-indexing.md) — new entry point: index all + watch for changes continuously via merged stream
+- [Object Store — Separate Meta from Vectra](task.2026-04-10-object-store.md) — store meta as JSON files in .xindex/objects/, vectra keeps only vectors
+- [Line-level Clustering](task.2026-04-10-line-clustering.md) — recursive bisection to split files into semantic blocks, index as file:fromLine-toLine
+- [Search Config — Keyword Ignore & Inline Snippets](task.2026-04-10-search-config.md) — `.xindex.json` config for ignoring noisy keywords + inlining small code clusters in results
+- [Cluster Config — Move ClusterLines defaults to .xindex.json](task.2026-04-10-cluster-config.md) — repo-level clustering params (`threshold`, `minLines`, `maxDepth`) instead of hardcoded defaults
+See [done/INDEX.md](done/INDEX.md) for completed tasks.

package/.ai/task/done/INDEX.md ADDED Viewed

@@ -0,0 +1,3 @@
+# Done Tasks
+- [xindex — Local Semantic Code Search](task.2026-04-09-local-ai-research-protos.md) — R&D prototyping → HOF refactoring → working semantic search tool (completed 2026-04-10)

package/.ai/task/done/task.2026-04-09-local-ai-research-protos.log.md ADDED Viewed

@@ -0,0 +1,98 @@
+### 2026-04-09 — Session log
+#### 1. hf.ts — Local text generation
+- Installed `@huggingface/transformers` (v4.0.1)
+- First run failed: top-level `await` not supported in CJS → fixed by adding `"type": "module"` to package.json
+- Model: `HuggingFaceTB/SmolLM2-135M-Instruct` — downloads ONNX weights on first run, cached after
+- API: `pipeline("text-generation", model)` → pass chat messages array → `output[0].generated_text.at(-1).content`
+- Output quality: basic — responded "Hello, my name is [Your Name]." to "Write a one-line hello"
+#### 2. vectra.ts — Local vector search
+- Installed `vectra` (v0.14.0)
+- **API gotchas** — online examples use outdated API:
+  - `VectraIndex` → actually `LocalIndex`
+  - Constructor: `new LocalIndex(folderPath)` — no options object, no `dimension` param
+  - Must call `createIndex()` before first use, check with `isIndexCreated()`
+  - `queryItems(vector, query, topK, filter)` — 4 positional args, not an options object
+  - Filter format: `{ category: { $eq: "fruit" } }` — nested operator syntax
+- Embeddings: `pipeline("feature-extraction", "sentence-transformers/all-MiniLM-L6-v2")` → 384-dim vectors
+- Embedding helper: `embedder(text, { pooling: "mean", normalize: true })` → `Array.from(result.data)`
+- Tested: 3 items indexed, query "red fruit" with category filter → correctly returned 2 fruit items, filtered out "Cars are vehicles"
+- Scores: Apples=0.7830, Bananas=0.5188
+#### 3. keywords.ts — Keyword extraction from code files
+- Installed `keyword-extractor` (v0.0.28) — CJS module, needed `createRequire(import.meta.url)` for ESM import
+- **Iteration 1**: `return_chained_words: true` → way too aggressive, merged entire code lines into single "keywords"
+- **Iteration 2**: `return_max_ngrams: 3` instead → still noisy, code syntax tokens (`{`, `}`, `const`, `await`) dominated results
+- **Iteration 3**: Added code-aware preprocessing before extraction:
+  - Strip `//` from comments but keep comment text
+  - Remove code punctuation: `{}()[];=<>|&!+*/$@` etc.
+  - Remove JS keywords: `const`, `let`, `var`, `import`, `export`, `from`, `await`, `async`, `function`, `return`, `for`, `of`, `if`, `new`, `typeof`, `as`
+  - Collapse whitespace
+  - Post-filter: skip keywords < 3 chars or non-alphabetic, word-boundary regex for frequency count
+- Final output on vectra.ts: clean results — `metadata(8x)`, `index(7x)`, `text(7x)`, `fruit(5x)`, `pipeline(3x)`, `huggingface(1x)`
+- Tested without filters: confirmed `const(10x)`, `await(8x)`, `}(19x)` dominate — filters are necessary for code
+#### 4. keywords-compromise.ts — Compromise NLP extraction
+- Installed `compromise` (v14.15.0) — ESM native, no createRequire needed
+- Tested all extractors: `.topics()`, `.nouns()`, `.verbs()`, `.people()`, `.organizations()`
+- **Result on code files**: poor. Nouns are code fragments (`'const index = new LocalIndex("./vectra-index");'`), zero topics, zero people
+- On hf.ts: caught "Microsoft" as topic + organization from prompt string — works on embedded natural language
+- **Conclusion**: compromise is designed for prose (articles, emails, chat), not source code
+#### 5. keywords-pipeline.ts — Full extraction pipeline
+- Combined: read file → compromise (nouns/verbs/topics) → regex `\W+` → space → keyword-extractor → show
+- Added LLM step (SmolLM2-135M): asked to extract/refine keywords
+- **LLM result**: echoed input then looped on `await transformer.get(index)` — too small to understand the instruction
+- LLM fallback logic: if output has <3 unique terms or contains repetition pattern, fall back to raw keywords
+- Pipeline works end-to-end but LLM step is effectively a passthrough
+#### 6. vectra-keywords.ts — Combined indexing + synonym search
+- Merged keyword extraction + vectra indexing into one script
+- Indexed 5 files, tested synonym search:
+  - "fruit" → hf.ts (0.18), vectra.ts (0.17)
+  - "automobile vehicle transportation" → vectra.ts (0.18) — synonym for "cars/vehicles"
+  - "embedding model neural network" → vectra.ts (0.27) — semantic match
+- Scores are low because code noise (`const`, `await`) dilutes the keyword embeddings
+#### 7. xindex.ts — Final combined solution
+- Created unified CLI: `xindex.ts index <files>` and `xindex.ts search <query>`
+- Full index pipeline: compromise → regex → keyword-extractor → LLM refine → MiniLM embed → vectra store
+- Query pipeline: input → keyword-extractor → embed → vectra search
+- Added full payload logging at each step: [1] keywords, [2] LLM refined, [3] vector preview, [4] metadata
+- Test results:
+  - "natural language processing" → keywords-pipeline.ts (0.56) — strongest match
+  - "automobile transportation" → vectra.ts (0.21) — synonym works
+  - "neural network deep learning" → vectra.ts (0.28)
+#### 8. Project setup — ~/project/xindex
+- Created standalone TypeScript project at `/Users/slava/project/xindex`
+- Moved all RnD files: xindex.ts, hf.ts, vectra.ts, keywords.ts, keywords-compromise.ts, keywords-pipeline.ts, vectra-keywords.ts
+- package.json, tsconfig.json, bin/xindex entry point
+- Git initialized
+#### Alternative keyword extraction libs (from user research)
+- `textlens` — TF-IDF, 1-line API, fastest, 10k+ weekly downloads
+- `node-keyword-extractor` — RAKE-like, 1-line API, very fast
+- `compromise` — full NLP, 3 lines, fast (tested — poor on code)
+- `natural` — TF-IDF tokenizer, 5 lines, fast
+#### Decisions & findings
+- `"type": "module"` in package.json is required for all prototypes (top-level await)
+- HuggingFace transformers JS works well for embeddings; text generation quality limited by model size
+- Vectra API docs/examples online are outdated — always check the actual `.d.ts` types
+- Code keyword extraction needs domain-specific preprocessing regardless of library choice
+- Compromise NLP is not suitable for code — only for natural language text
+- SmolLM2-135M is too small for keyword refinement — needs 360M+ or external API
+- MiniLM-L6-v2 embeddings understand synonyms well enough for semantic code search
+- The concept works: index codebase → query with natural language → get relevant files

package/.ai/task/done/task.2026-04-09-local-ai-research-protos.md ADDED Viewed

@@ -0,0 +1,102 @@
+# Task: xindex — Local Semantic Code Search [COMPLETED 2026-04-10]
+## Context
+Built a local semantic code search tool — index a codebase, query by keyword or meaning, get relevant files back. No cloud APIs, everything runs on-device.
+**Project**: `~/project/xindex`
+**Dependencies:** `@huggingface/transformers`, `vectra`, `compromise`, `keyword-extractor`, `tsx`
+## Goal
+Index a codebase so that minimally meaningful text queries return relevant files/info about the project. Local-first, no cloud APIs.
+## Final Architecture
+```
+componets/
+├── llm/
+│   ├── embed.ts            — Embed({pooling, normalize}) → MiniLM-L6 384-dim
+│   └── queryLLM.ts         — QueryLLM({maxTokens}) → SmolLM2-135M (kept aside, unused)
+├── keywords/
+│   ├── extractKeywords.ts  — ExtractKeywords() → compromise NLP (nouns/verbs/topics)
+│   ├── cleanUpKeywords.ts  — CleanUpKeywords({maxNgrams, minLength}) → keyword-extractor + dedup
+│   └── refineKeywords.ts   — RefineKeywords({queryLLM, cleanUpKeywords, prompt}) (kept aside, unused)
+├── index/
+│   ├── vectraIndex.ts      — VectraIndex(path) → LocalIndex init
+│   ├── indexContent.ts     — IndexContent({embed, index}) → embed + upsert
+│   ├── getIndexStats.ts    — GetIndexStats({index}) → {indexedAmount}
+│   ├── searchContentIndex.ts — SearchContentIndex({extractKeywords, cleanUpKeywords, embed, index})
+│   └── contentIndexDriver.ts — ContentIndexDriver({path, embed, extractKeywords, cleanUpKeywords})
+└── buildComponents.ts      — wires everything, returns ready-to-use functions
+apps/
+├── indexApp.ts             — IndexApp({extractKeywords, cleanUpKeywords, indexContent})
+├── searchApp.ts            — SearchApp({searchContentIndex})
+├── run.index.ts            — CLI entry point for indexing
+└── run.search.ts           — CLI entry point for search
+bin/
+├── xindex-index            — #!/usr/bin/env tsx → apps/run.index.ts
+└── xindex-search           — #!/usr/bin/env tsx → apps/run.search.ts
+```
+## Final Pipeline
+```
+INDEX (per file):
+  file + filename
+   │
+   ├─[1] extractKeywords ──→ compromise NLP (nouns, verbs, topics)
+   │
+   ├─[2] cleanUpKeywords ──→ keyword-extractor + dedup + filter
+   │
+   ├─[3] MiniLM-L6 ────────→ 384-dim embedding
+   │
+   └─[4] vectra ────────────→ upsert { id, vector, metadata: { keywords, file } }
+SEARCH:
+  user input
+   │
+   ├─[1] extractKeywords ──→ compromise NLP
+   │
+   ├─[2] cleanUpKeywords ──→ keyword-extractor + dedup + filter
+   │
+   ├─[3] MiniLM-L6 ────────→ 384-dim embedding
+   │
+   └─[4] vectra ────────────→ queryItems → ranked by cosine similarity
+```
+## Key Decisions
+- **LLM refine step removed** — SmolLM2-135M was too small, acted as passthrough or generated garbage. Without it: 10x faster indexing, better accuracy (11/14 → 16/20 correct #1)
+- **HOF component pattern** — all components are factory functions: `DoThing({deps}): IDoThing`. Export factory + type only, no default instances
+- **Dependencies as destructured objects** — `DoThing({embed, index}: {embed: IEmbed, index: LocalIndex})`
+- **Separate entry points** — `bin/xindex-index` and `bin/xindex-search` instead of one CLI with subcommands
+- **ContentIndexDriver** — bundles index layer (indexContent, getIndexStats, searchContentIndex) behind one factory
+## Final Test Results (41 files, 20 queries)
+| Metric | Value |
+|--------|-------|
+| Index time (41 files) | 1.07s (26ms/file) |
+| Search time | 0.71s (constant) |
+| Correct #1 | 16/20 (80%) |
+| Correct in top 3 | 19/20 (95%) |
+| Cross-domain isolation | Perfect |
+## Review Items (resolved and still open)
+- [x] Search loads unused LLM → removed LLM from pipeline entirely
+- [x] `// --- Init ---` comment artifact → removed
+- [ ] `BuildComponents` hardcodes index path → still hardcoded
+- [ ] Stale `main` in package.json → still points to deleted xindex.ts
+- [ ] `componets/` typo → deferred
+## Open Questions (carried forward)
+- What's the practical index size limit for vectra LocalIndex before it slows down?
+- Is hybrid search (vector + BM25) in vectra good enough to skip a separate keyword index?
+- Directory walking instead of explicit file list
+- Chunking for large files — one embedding per file loses detail

package/.ai/task/task.2026-04-10-cluster-config.log.md ADDED Viewed

@@ -0,0 +1,19 @@
+### 2026-04-10
+- Created task from user note: move line clustering defaults (`threshold`, `minLines`, `maxDepth`) into `.xindex.json`.
+- Ran scout via code search and xindex MCP search:
+  - confirmed hardcoded defaults in `componets/index/clusterLines.ts`
+  - confirmed config infra exists in `componets/config/xindexConfig.ts` + `componets/config/loadConfig.ts`
+  - confirmed `.xindex.json` already used for other settings
+- Drafted initial plan with diagram + 3x3 steps.
+- Updated from user clarifications:
+  - default threshold when missing is `0.7`
+  - threshold validation is strict `[0,1]` with fallback to default
+  - apply through shared wiring in this repo folder
+  - do not update `.xindex.json` file content now
+  - config shape finalized as flat keys: `clusterThreshold`, `clusterMinLines`, `clusterMaxDepth`
+- Ran consistency pass and fixed task mismatches:
+  - removed old `0.75` default references
+  - replaced clamp/range language with strict validation wording
+  - removed docs/example update requirement that conflicted with user instruction
+- Expanded task with Detailed Change Map + Acceptance Criteria for implementation readiness.

package/.ai/task/task.2026-04-10-cluster-config.md ADDED Viewed

@@ -0,0 +1,118 @@
+# Task: Move ClusterLines defaults into .xindex.json
+## Context
+User goal: move line-clustering configuration from hardcoded defaults to repo-level config in `.xindex.json`:
+- `threshold = 0.70` (new default when missing)
+- `minLines = 5`
+- `maxDepth = 5`
+- config key shape is flat:
+  - `clusterThreshold`
+  - `clusterMinLines`
+  - `clusterMaxDepth`
+Why: different repositories need different clustering behavior, so these values should be configurable per repo.
+Scout findings (`@xi` + code scan):
+- `componets/index/clusterLines.ts` currently hardcodes defaults in the HOF signature:
+  - `threshold = 0.75`
+  - `minLines = 5`
+  - `maxDepth = 5`
+- `.xindex.json` already exists and currently contains search/index config (`ignoreKeywords`, `ignoreFiles`, `maxSnippetLines`, `maxSnippetResults`).
+- `componets/config/xindexConfig.ts` and `componets/config/loadConfig.ts` already provide optional config loading with defaults.
+- `componets/buildComponents.ts` already loads `.xindex.json` and wires config into keyword cleanup, but does not yet pass clustering params to driver/components.
+Related active tasks:
+- `task.2026-04-10-line-clustering.md`
+- `task.2026-04-10-search-config.md`
+## Goal
+Extend `.xindex.json` + config loading so line-clustering params are configurable per repository, and wire them into `ClusterLines` construction with strict threshold validation and the new default threshold `0.7` when config keys are absent.
+## Diagram
+```
+.xindex.json (optional)
+┌──────────────────────────────────────────────┐
+│ existing: ignoreKeywords, ignoreFiles, ...   │
+│ new:                                         │
+│   clusterThreshold: number (default 0.7)     │
+│   clusterMinLines: number (default 5)        │
+│   clusterMaxDepth: number (default 5)        │
+└───────────────────┬──────────────────────────┘
+                    │
+                    ▼
+LoadConfig -> IXindexConfig (defaults applied)
+                    │
+                    ▼
+BuildComponents
+                    │
+                    ▼
+ContentIndexDriver / ClusterLines factory
+                    │
+                    ▼
+clusterLines() uses repo-specific values
+```
+## Steps
+### 1. Extend config schema (3x3)
+- **1.1 Add fields to `IXindexConfig`** — add three clustering fields with explicit names and numeric types.
+- **1.2 Parse + default in `LoadConfig`** — map new JSON keys to validated numbers with defaults `{clusterThreshold: 0.7, clusterMinLines: 5, clusterMaxDepth: 5}`.
+- **1.3 Validate bounds** — `clusterThreshold` must be in `[0,1]` (otherwise fall back to the default), and line/depth values must be finite integers `>= 1` (otherwise fall back to the default).
+### 2. Wire config into clustering (3x3)
+- **2.1 Thread config through builder/driver** — ensure the clustering factory gets config values from `BuildComponents` path.
+- **2.2 Update `ClusterLines` construction** — pass config values from driver wiring instead of relying on hardcoded constructor defaults.
+- **2.3 Preserve backward compatibility** — missing `.xindex.json` or missing keys should still produce stable clustering behavior via loader defaults.
+### 3. Validate behavior and tests (3x3)
+- **3.1 Runtime sanity checks** — run index/search flow to confirm no regressions and that loaded config values are honored.
+- **3.2 Surface scope in repo entry points** — ensure clustering config is available from common build path used by apps in this folder.
+- **3.3 Add/update tests** — cover default path (no config keys), invalid threshold path (fallback to 0.7), and override path (custom values).
+## Detailed Change Map
+- `componets/config/xindexConfig.ts`
+  - Add:
+    - `clusterThreshold: number`
+    - `clusterMinLines: number`
+    - `clusterMaxDepth: number`
+- `componets/config/loadConfig.ts`
+  - Extend `DEFAULTS` with clustering keys (`0.7`, `5`, `5`).
+  - Parse new keys from `.xindex.json`.
+  - Apply strict validation:
+    - threshold valid only if finite number and `0 <= v <= 1`, otherwise default
+    - min/depth valid only if finite number, integerized, and `>= 1`, otherwise default
+- `componets/index/contentIndexDriver.ts`
+  - Accept `config` in factory deps.
+  - Construct `ClusterLines({... , threshold: config.clusterThreshold, minLines: config.clusterMinLines, maxDepth: config.clusterMaxDepth})`.
+- `componets/buildComponents.ts`
+  - Pass loaded `config` into `ContentIndexDriver`.
+  - Keep returning `config` so consumers in this folder can use consistent runtime config.
+- `apps/run.*.ts` and `apps/mcpApp.ts` (as needed)
+  - No direct clustering logic, but rely on `BuildComponents` so new config applies everywhere in this folder through shared wiring.
+## Acceptance Criteria
+- `.xindex.json` may define clustering keys and they influence `ClusterLines` without code changes.
+- Missing clustering keys use defaults: `clusterThreshold=0.7`, `clusterMinLines=5`, `clusterMaxDepth=5`.
+- Invalid threshold values (e.g., `-0.1`, `1.2`, `"0.7"`, `null`) fall back to `0.7`.
+- Indexing pipeline compiles and runs with unchanged public entry points.
+- No update to `.xindex.json` file contents is required in this task.
+## Decisions
+- Default threshold changed from the old hardcoded value (`0.75` in `componets/index/clusterLines.ts`) to `0.7`.
+- Threshold validation is strict `[0,1]` with fallback to default (no clamping).
+- Scope is this repo folder via shared component wiring, not only one direct caller.
+- Do not modify current `.xindex.json` as part of this task.
+- Config shape is flat keys in `.xindex.json`:
+  - `clusterThreshold`
+  - `clusterMinLines`
+  - `clusterMaxDepth`
+## Open Questions
+- Compatibility strategy: accept legacy names (if any) or only new canonical names?

package/.ai/task/task.2026-04-10-dir-indexing.log.md ADDED Viewed

@@ -0,0 +1,8 @@
+### 2026-04-10 — Task created
+- Scouted: streamx has `from()`, `map`, `flat`, `pipe`, `run` — full async stream toolkit
+- Current IndexApp is a simple for-loop over explicit file paths
+- Node `fs.readdir(path, {recursive: true})` exists but has no gitignore support
+- `.gitignore` already has sensible rules for the project
+- User wants: files or dirs as input → recursive walk → stream → index
+- User preference: use streamx from packages/ for the stream pipeline