xindex 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (160) hide show
  1. package/.ai/research/2026-04-10-file-watching.md +79 -0
  2. package/.ai/research/2026-04-10-mcp-output-format.md +129 -0
  3. package/.ai/task/INDEX.md +12 -0
  4. package/.ai/task/done/INDEX.md +3 -0
  5. package/.ai/task/done/task.2026-04-09-local-ai-research-protos.log.md +98 -0
  6. package/.ai/task/done/task.2026-04-09-local-ai-research-protos.md +102 -0
  7. package/.ai/task/task.2026-04-10-cluster-config.log.md +19 -0
  8. package/.ai/task/task.2026-04-10-cluster-config.md +118 -0
  9. package/.ai/task/task.2026-04-10-dir-indexing.log.md +8 -0
  10. package/.ai/task/task.2026-04-10-dir-indexing.md +92 -0
  11. package/.ai/task/task.2026-04-10-line-clustering.log.md +50 -0
  12. package/.ai/task/task.2026-04-10-line-clustering.md +176 -0
  13. package/.ai/task/task.2026-04-10-object-store.log.md +7 -0
  14. package/.ai/task/task.2026-04-10-object-store.md +81 -0
  15. package/.ai/task/task.2026-04-10-search-config.log.md +46 -0
  16. package/.ai/task/task.2026-04-10-search-config.md +274 -0
  17. package/.ai/task/task.2026-04-10-watch-indexing.log.md +32 -0
  18. package/.ai/task/task.2026-04-10-watch-indexing.md +101 -0
  19. package/.ai/task/task.2026-04-10-xindex-mcp.log.md +5 -0
  20. package/.ai/task/task.2026-04-10-xindex-mcp.md +92 -0
  21. package/.ai/task/task.2026-04-10-xindex-mcp.report.md +113 -0
  22. package/.claude/settings.local.json +73 -0
  23. package/.claude/skills/make-hof/SKILL.md +8 -0
  24. package/.claude/skills/make-hof/playbook.md +38 -0
  25. package/.cursor/mcp.json +8 -0
  26. package/.mcp.json +8 -0
  27. package/.xindex.json +22 -0
  28. package/CLAUDE.md +54 -0
  29. package/README.md +206 -0
  30. package/apps/indexApp.ts +31 -0
  31. package/apps/mcpApp.ts +119 -0
  32. package/apps/run.index.ts +19 -0
  33. package/apps/run.mcp.ts +49 -0
  34. package/apps/run.reset.ts +10 -0
  35. package/apps/run.search.ts +21 -0
  36. package/apps/run.watch.ts +44 -0
  37. package/apps/searchApp.ts +9 -0
  38. package/apps/watchApp.ts +53 -0
  39. package/apps/watchFileEventsApp.ts +39 -0
  40. package/bin/xindex-index +2 -0
  41. package/bin/xindex-mcp +2 -0
  42. package/bin/xindex-reset +2 -0
  43. package/bin/xindex-search +2 -0
  44. package/bin/xindex-watch +2 -0
  45. package/componets/IType.ts +1 -0
  46. package/componets/appId.ts +3 -0
  47. package/componets/buildComponents.ts +27 -0
  48. package/componets/config/loadConfig.ts +43 -0
  49. package/componets/config/xindexConfig.ts +4 -0
  50. package/componets/index/contentIndexDriver.ts +39 -0
  51. package/componets/index/formatSearchResults.ts +18 -0
  52. package/componets/index/getIndexStats.ts +11 -0
  53. package/componets/index/handleFileEvent.ts +25 -0
  54. package/componets/index/indexApi.ts +45 -0
  55. package/componets/index/vectraIndex.ts +11 -0
  56. package/componets/index/watcherLock.ts +107 -0
  57. package/componets/keywords/cleanUpKeywords.ts +38 -0
  58. package/componets/keywords/extractKeywords.ts +14 -0
  59. package/componets/keywords/refineKeywords.ts +16 -0
  60. package/componets/llm/embed.ts +18 -0
  61. package/componets/llm/queryLLM.ts +20 -0
  62. package/componets/logger.ts +34 -0
  63. package/componets/walkFiles.ts +51 -0
  64. package/componets/watchFiles.ts +106 -0
  65. package/features/indexContent.ts +16 -0
  66. package/features/removeContent.ts +9 -0
  67. package/features/resetIndex.ts +9 -0
  68. package/features/searchIndex.ts +33 -0
  69. package/package.json +32 -0
  70. package/packages/fun/src/IType.ts +5 -0
  71. package/packages/fun/src/array-finder.ts +55 -0
  72. package/packages/fun/src/array-index.ts +35 -0
  73. package/packages/fun/src/array.ts +112 -0
  74. package/packages/fun/src/assert.ts +5 -0
  75. package/packages/fun/src/asyncRequest.ts +35 -0
  76. package/packages/fun/src/callsites.ts +18 -0
  77. package/packages/fun/src/case-never.ts +9 -0
  78. package/packages/fun/src/casting.ts +41 -0
  79. package/packages/fun/src/collect.ts +13 -0
  80. package/packages/fun/src/concurrency.ts +186 -0
  81. package/packages/fun/src/container.ts +86 -0
  82. package/packages/fun/src/counter.ts +45 -0
  83. package/packages/fun/src/create-map.ts +2 -0
  84. package/packages/fun/src/dedupe.ts +2 -0
  85. package/packages/fun/src/defer.ts +55 -0
  86. package/packages/fun/src/delay.ts +5 -0
  87. package/packages/fun/src/discriminate.ts +34 -0
  88. package/packages/fun/src/enum-values.ts +12 -0
  89. package/packages/fun/src/exponential-backoff.ts +20 -0
  90. package/packages/fun/src/flatten.ts +11 -0
  91. package/packages/fun/src/hash.ts +67 -0
  92. package/packages/fun/src/hash128.ts +6 -0
  93. package/packages/fun/src/hash256.ts +6 -0
  94. package/packages/fun/src/hub.ts +53 -0
  95. package/packages/fun/src/id.ts +10 -0
  96. package/packages/fun/src/interval.ts +76 -0
  97. package/packages/fun/src/is-non-nullable.ts +2 -0
  98. package/packages/fun/src/isIterable.ts +3 -0
  99. package/packages/fun/src/mailbox.ts +13 -0
  100. package/packages/fun/src/map-record.ts +19 -0
  101. package/packages/fun/src/match-collections.ts +57 -0
  102. package/packages/fun/src/match-left-and-right-arrays.ts +78 -0
  103. package/packages/fun/src/mem.ts +26 -0
  104. package/packages/fun/src/memos.ts +28 -0
  105. package/packages/fun/src/normalizeError.ts +25 -0
  106. package/packages/fun/src/nothing.ts +3 -0
  107. package/packages/fun/src/pipe.ts +18 -0
  108. package/packages/fun/src/prettyJson.ts +3 -0
  109. package/packages/fun/src/project.ts +8 -0
  110. package/packages/fun/src/promise.ts +27 -0
  111. package/packages/fun/src/pubsub.ts +128 -0
  112. package/packages/fun/src/randomId.ts +14 -0
  113. package/packages/fun/src/regexp-escape.ts +13 -0
  114. package/packages/fun/src/retry.ts +15 -0
  115. package/packages/fun/src/serial.test.ts +107 -0
  116. package/packages/fun/src/serial.ts +17 -0
  117. package/packages/fun/src/sleep.ts +3 -0
  118. package/packages/fun/src/sort-object.ts +46 -0
  119. package/packages/fun/src/speed-test.ts +56 -0
  120. package/packages/fun/src/tick.ts +37 -0
  121. package/packages/fun/src/time-behavior.ts +50 -0
  122. package/packages/fun/src/time.ts +22 -0
  123. package/packages/fun/src/timedFallback.ts +37 -0
  124. package/packages/fun/src/timer.ts +30 -0
  125. package/packages/fun/src/value.ts +33 -0
  126. package/packages/fun/src/waitForCounter.ts +15 -0
  127. package/packages/streamx/src/batch.ts +23 -0
  128. package/packages/streamx/src/batchTimed.ts +113 -0
  129. package/packages/streamx/src/buffer.ts +72 -0
  130. package/packages/streamx/src/concatenate.ts +33 -0
  131. package/packages/streamx/src/filter.ts +14 -0
  132. package/packages/streamx/src/flat.ts +19 -0
  133. package/packages/streamx/src/flatMap.ts +9 -0
  134. package/packages/streamx/src/from.ts +30 -0
  135. package/packages/streamx/src/index.ts +49 -0
  136. package/packages/streamx/src/interval.ts +58 -0
  137. package/packages/streamx/src/loop.ts +8 -0
  138. package/packages/streamx/src/map.ts +12 -0
  139. package/packages/streamx/src/merge.ts +89 -0
  140. package/packages/streamx/src/nodeReadable.ts +6 -0
  141. package/packages/streamx/src/nodeTransform.ts +9 -0
  142. package/packages/streamx/src/nodeWritable.ts +38 -0
  143. package/packages/streamx/src/objectReader.ts +16 -0
  144. package/packages/streamx/src/polyfill.ts +20 -0
  145. package/packages/streamx/src/reader.ts +38 -0
  146. package/packages/streamx/src/reduce.ts +15 -0
  147. package/packages/streamx/src/scale.ts +93 -0
  148. package/packages/streamx/src/scaleSync.ts +13 -0
  149. package/packages/streamx/src/sequence.ts +7 -0
  150. package/packages/streamx/src/tap.ts +9 -0
  151. package/packages/streamx/src/toArray.ts +9 -0
  152. package/packages/streamx/src/writer.ts +96 -0
  153. package/rnd/hf.ts +14 -0
  154. package/rnd/keywords-compromise.ts +18 -0
  155. package/rnd/keywords-pipeline.ts +79 -0
  156. package/rnd/keywords.ts +38 -0
  157. package/rnd/test-vectra-memory.ts +63 -0
  158. package/rnd/vectra-keywords.ts +95 -0
  159. package/rnd/vectra.ts +50 -0
  160. package/tsconfig.json +14 -0
@@ -0,0 +1,79 @@
1
+ # Research: File Watching in Node.js (2026)
2
+
3
+ ## Question 1: fs.watch recursive — platform support?
4
+
5
+ **macOS** — native FSEvents backend, recursive works perfectly since early Node versions.
6
+ **Windows** — native ReadDirectoryChangesW, recursive works since early Node versions.
7
+ **Linux** — added in Node ~19 via [PR #45098](https://github.com/nodejs/node/pull/45098) (Oct 2022). Uses inotify (opens one fd per directory, not native recursive). Had race condition bug in Node 20.3.0 ([#48437](https://github.com/nodejs/node/issues/48437)), fixed in [PR #51406](https://github.com/nodejs/node/pull/51406). Also had crash-on-delete bug, fixed in [commit e7d0d80](https://github.com/nodejs/node/commit/e7d0d804b2).
8
+
9
+ **Status:** Recursive fs.watch works on all three platforms in Node 22+. Linux implementation is stable after fixes.
10
+
11
+ ## Question 2: chokidar vs fs.watch — still needed?
12
+
13
+ **chokidar v5** (Nov 2025):
14
+ - ESM-only, min Node 20, TypeScript rewrite
15
+ - Deps reduced from 13 → 1
16
+ - Still uses fs.watch as primary backend, normalizes events
17
+ - Events: `add`, `addDir`, `change`, `unlink`, `unlinkDir`, `ready`
18
+ - ~30M repos, de facto standard
19
+ - API: event emitter pattern (`watcher.on("add", path => ...)`)
20
+
21
+ **When chokidar adds value:**
22
+ - Cross-platform consistency (normalizes all platform quirks)
23
+ - Glob pattern matching — no longer a benefit: glob support was removed in chokidar v4, so v5 does not provide it
24
+ - Handles edge cases: atomic writes, duplicate events, initial scan
25
+ - `ready` event (know when initial scan is done)
26
+
27
+ **When fs.watch is sufficient:**
28
+ - Single platform or modern Node (22+)
29
+ - Simple needs (just file paths + change type)
30
+ - Already have debouncing infrastructure
31
+ - Prefer async iterable over event emitter
32
+
33
+ ## Question 3: @parcel/watcher and alternatives?
34
+
35
+ **@parcel/watcher** — native C++ addon. Backends: FSEvents (macOS), inotify (Linux), ReadDirectoryChangesW (Windows). Most performant for large codebases. Heavy dep (native addon build). Vite considered switching to it from chokidar ([#12495](https://github.com/vitejs/vite/issues/12495)).
36
+
37
+ **node-watch** — thin wrapper over fs.watch, adds recursive support for Linux. Lighter than chokidar.
38
+
39
+ **watchpack** — webpack's watcher. Used chokidar under the hood in v1; v2 watches via native `fs.watch` directly.
40
+
41
+ None of these add significant value over chokidar or native fs.watch for our use case.
42
+
43
+ ## Question 4: fs.watch known issues + best practices?
44
+
45
+ **Issues:**
46
+ - Duplicate events per single file save (editor writes temp → rename → delete)
47
+ - Null filenames on some platforms/scenarios
48
+ - "rename" event is ambiguous (create, delete, or rename)
49
+ - No built-in debouncing
50
+
51
+ **Best practices:**
52
+ - Debounce: 50-200ms window to batch rapid events
53
+ - Stat validation: after event, `stat()` to check if file exists and get mtime
54
+ - Resource cleanup: always `watcher.close()` on shutdown
55
+ - Path handling: `fs.watch` gives filename relative to watched dir, need `path.join`
56
+
57
+ ## Decision: fs.watch for xindex
58
+
59
+ **Recommendation: native `fs.watch`** with our own debouncing via streamx `batchTimed`.
60
+
61
+ **Why:**
62
+ 1. Zero new deps — project is private, macOS primary, Node 22+ assumed
63
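+ 2. Async iterable — the promise-based `watch()` from `node:fs/promises` returns `AsyncIterable<FileChangeInfo>` (the callback-style `fs.watch` returns an `FSWatcher` event emitter instead), so it fits the streamx architecture naturally (no adapter needed)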
64
+ 3. Debouncing covered — `batchTimed(20, 150)` already in streamx handles the duplicate event problem
65
+ 4. Stat validation — simple: `stat()` after event, exists → index, throws → remove
66
+ 5. Simpler shutdown — close watcher handle vs chokidar's async `.close()`
67
+
68
+ **Tradeoff accepted:** more manual edge case handling (null filenames, dedup). Acceptable for a private tool with batchTimed already available.
69
+
70
+ **If issues arise:** chokidar v5 is 1 dep away, same ESM/Node 20+ requirements, drop-in upgrade path. Not worth adding preemptively.
71
+
72
+ ## Sources
73
+
74
+ - [Node.js fs.watch recursive Linux PR #45098](https://github.com/nodejs/node/pull/45098)
75
+ - [Node 20 recursive bug #48437](https://github.com/nodejs/node/issues/48437)
76
+ - [Chokidar v5 README](https://github.com/paulmillr/chokidar/blob/main/README.md)
77
+ - [Vite fs.watch discussion #12495](https://github.com/vitejs/vite/issues/12495)
78
+ - [@parcel/watcher](https://github.com/parcel-bundler/watcher)
79
+ - [fs.watch best practices](https://www.w3tutorials.net/blog/nodejs-fs-watch/)
@@ -0,0 +1,129 @@
1
+ # MCP Tool Output Format for LLM Consumption
2
+
3
+ **Question**: What output format should our xindex_search MCP tool use to return search results to an LLM?
4
+
5
+ **Current state**: `JSON.stringify(results, null, 2)` — pretty-printed JSON with score, id, meta.keywords, meta.file (id and meta.file are redundant).
6
+
7
+ ---
8
+
9
+ ## Findings
10
+
11
+ ### 1. Token efficiency benchmarks (ImprovingAgents, Oct 2025)
12
+
13
+ **Nested data** — 1,000 questions, 3 models, 4 formats:
14
+
15
+ | Format | Tokens | GPT-5 Nano | Gemini 2.5 Flash Lite |
16
+ |----------|---------|------------|----------------------|
17
+ | Markdown | 38,357 | 54.3% | 48.2% |
18
+ | YAML | 42,477 | 62.1% | 51.9% |
19
+ | JSON | 57,933 | 50.3% | 43.1% |
20
+ | XML | 68,804 | 44.4% | 33.8% |
21
+
22
+ Markdown uses **34% fewer tokens** than JSON. YAML has better accuracy but more tokens.
23
+
24
+ **Flat/tabular data** — 11 formats, 1,000 queries, GPT-4.1-nano:
25
+
26
+ | Format | Accuracy | Tokens | Efficiency |
27
+ |----------------|----------|---------|------------|
28
+ | Markdown-KV | 60.7% | 52,104 | Best accuracy |
29
+ | Markdown Table | 51.9% | 25,140 | Best ratio |
30
+ | JSON | 52.3% | 66,396 | Mediocre |
31
+ | CSV | 44.3% | 19,524 | Cheapest but worst |
32
+
33
+ For flat data (which our search results are), **Markdown-KV** gives best LLM comprehension. A numbered list with `key: value` pairs is effectively Markdown-KV.
34
+
35
+ Sources: [Nested formats](https://www.improvingagents.com/blog/best-nested-data-format/), [Table formats](https://www.improvingagents.com/blog/best-input-data-format-for-llms/)
36
+
37
+ ### 2. MCP spec guidance (June 2025)
38
+
39
+ - `content` (TextContent) = what the LLM reads
40
+ - `structuredContent` = machine-to-machine, optional
41
+ - Spec's own example uses **plain text**: `"Current weather in New York:\nTemperature: 72°F\nConditions: Partly cloudy"`
42
+ - If `outputSchema` is defined, SHOULD return both `structuredContent` AND serialized JSON in TextContent for backwards compat
43
+
44
+ The spec explicitly shows plain text as the standard tool result format for LLM consumption.
45
+
46
+ Source: [MCP Tools Spec](https://modelcontextprotocol.io/specification/2025-06-18/server/tools)
47
+
48
+ ### 3. What popular MCP servers do
49
+
50
+ | Server | Output format |
51
+ |-------------|--------------|
52
+ | Perplexity | AI-synthesized text + citation URLs |
53
+ | Context7 | Plain text documentation snippets |
54
+ | markdownify | Markdown (entire category exists for this) |
55
+ | Elasticsearch | JSON (machine-oriented) |
56
+
57
+ LLM-facing servers use text/markdown. Only machine-oriented servers use JSON.
58
+
59
+ ### 4. JSON specifically degrades LLM reasoning
60
+
61
+ - Aider benchmarks: JSON wrapping reduces code reasoning quality by 10-15% ([source](https://aider.chat/2024/08/14/code-in-json.html))
62
+ - arxiv paper: frontier models top out at ~77% accuracy on JSON processing tasks ([source](https://arxiv.org/html/2510.15955v1))
63
+ - OpenAI community: Markdown is 15% more token-efficient than JSON ([source](https://community.openai.com/t/markdown-is-15-more-token-efficient-than-json/841742))
64
+
65
+ ### 5. TOON format (Nov 2025) — not recommended
66
+
67
+ New token-optimized format. Mixed results: 73.9% on flat retrieval but **last place** (43.1%) on nested data. Immature ecosystem, no MCP support. Not applicable here.
68
+
69
+ Source: [TOON benchmarks](https://www.improvingagents.com/blog/toon-benchmarks/)
70
+
71
+ ### 6. Workato design guidelines
72
+
73
+ - Return only necessary fields — avoid sending 200+ fields when 3 suffice
74
+ - Preprocess/summarize large content before returning to LLM
75
+ - Consider token efficiency — "excessive data can overwhelm the AI agent"
76
+
77
+ Source: [Workato MCP Tool Design](https://docs.workato.com/en/mcp/mcp-server-tool-design.html)
78
+
79
+ ---
80
+
81
+ ## Analysis
82
+
83
+ Our search results are **flat data** with 3 fields per result (score, file path, keywords). This is the simplest case:
84
+
85
+ | Approach | Tokens/result | LLM quality | Fit |
86
+ |----------|--------------|-------------|-----|
87
+ | Pretty JSON (current) | ~55 | Worst — syntax overhead | Bad |
88
+ | Compact JSON | ~22 | OK but cryptic keys | Meh |
89
+ | Markdown numbered list | ~12 | Best — Markdown-KV pattern | Best |
90
+ | TSV | ~15 | OK but less natural | OK |
91
+
92
+ The markdown numbered list matches the **Markdown-KV** pattern that scored highest (60.7%) in flat data benchmarks. It also uses roughly **78% fewer tokens** than the current pretty-printed JSON (~12 vs ~55 tokens per result).
93
+
94
+ Additional advantages:
95
+ - File path is visually prominent (it's what the LLM acts on next)
96
+ - Score at 2 decimals is sufficient ranking signal
97
+ - Keywords give semantic context without opening the file
98
+ - Zero structural noise (no braces, brackets, quotes, commas)
99
+ - Matches how Perplexity/Context7 format their responses
100
+
101
+ No significant trade-offs: we don't need machine-parseability (the consumer is always an LLM), and there's no nested data to worry about.
102
+
103
+ ---
104
+
105
+ ## Recommendation
106
+
107
+ **Switch to markdown numbered list.**
108
+
109
+ ```
110
+ Search: "authentication flow" — 3 result(s)
111
+
112
+ 1. src/components/auth.ts (0.87) — authentication, login, session, token
113
+ 2. src/middleware/jwt.ts (0.81) — jwt, token, verify, middleware
114
+ 3. src/routes/login.ts (0.74) — login, form, credentials, redirect
115
+ ```
116
+
117
+ Implementation in `mcpApp.ts`:
118
+ ```ts
119
+ const header = `Search: "${query}" — ${results.length} result(s)\n\n`;
120
+ const lines = results.map((r, i) =>
121
+ `${i + 1}. ${r.id} (${r.score.toFixed(2)}) — ${r.meta.keywords ?? ""}`
122
+ );
123
+ const text = header + lines.join("\n");
124
+ return {content: [{type: "text" as const, text}]};
125
+ ```
126
+
127
+ Empty case: `No results for "${query}"` — avoids confusing the model with an empty list.
128
+
129
+ **Future consideration**: Add `outputSchema` + `structuredContent` when clients start using it, but keep TextContent as the primary format for LLM consumption.
@@ -0,0 +1,12 @@
1
+ # Tasks
2
+
3
+ - [xindex-mcp — MCP Server for Semantic Code Search](task.2026-04-10-xindex-mcp.md) — wrap xindex as MCP server so Claude Code can search codebase
4
+ - [Directory-based Indexing with Async Streams](task.2026-04-10-dir-indexing.md) — accept files/dirs, recursive walk with .gitignore, index via streamx pipeline
5
+ - [xindex-watch — Continuous Indexing](task.2026-04-10-watch-indexing.md) — new entry point: index all + watch for changes continuously via merged stream
6
+ - [Object Store — Separate Meta from Vectra](task.2026-04-10-object-store.md) — store meta as JSON files in .xindex/objects/, vectra keeps only vectors
7
+ - [Line-level Clustering](task.2026-04-10-line-clustering.md) — recursive bisection to split files into semantic blocks, index as file:fromLine-toLine
8
+
9
+ - [Search Config — Keyword Ignore & Inline Snippets](task.2026-04-10-search-config.md) — `.xindex.json` config for ignoring noisy keywords + inlining small code clusters in results
10
+ - [Cluster Config — Move ClusterLines defaults to .xindex.json](task.2026-04-10-cluster-config.md) — repo-level clustering params (`threshold`, `minLines`, `maxDepth`) instead of hardcoded defaults
11
+
12
+ See [done/INDEX.md](done/INDEX.md) for completed tasks.
@@ -0,0 +1,3 @@
1
+ # Done Tasks
2
+
3
+ - [xindex — Local Semantic Code Search](task.2026-04-09-local-ai-research-protos.md) — R&D prototyping → HOF refactoring → working semantic search tool (completed 2026-04-10)
@@ -0,0 +1,98 @@
1
+ ### 2026-04-09 — Session log
2
+
3
+ #### 1. hf.ts — Local text generation
4
+
5
+ - Installed `@huggingface/transformers` (v4.0.1)
6
+ - First run failed: top-level `await` not supported in CJS → fixed by adding `"type": "module"` to package.json
7
+ - Model: `HuggingFaceTB/SmolLM2-135M-Instruct` — downloads ONNX weights on first run, cached after
8
+ - API: `pipeline("text-generation", model)` → pass chat messages array → `output[0].generated_text.at(-1).content`
9
+ - Output quality: basic — responded "Hello, my name is [Your Name]." to "Write a one-line hello"
10
+
11
+ #### 2. vectra.ts — Local vector search
12
+
13
+ - Installed `vectra` (v0.14.0)
14
+ - **API gotchas** — online examples use outdated API:
15
+ - `VectraIndex` → actually `LocalIndex`
16
+ - Constructor: `new LocalIndex(folderPath)` — no options object, no `dimension` param
17
+ - Must call `createIndex()` before first use, check with `isIndexCreated()`
18
+ - `queryItems(vector, query, topK, filter)` — 4 positional args, not an options object
19
+ - Filter format: `{ category: { $eq: "fruit" } }` — nested operator syntax
20
+ - Embeddings: `pipeline("feature-extraction", "sentence-transformers/all-MiniLM-L6-v2")` → 384-dim vectors
21
+ - Embedding helper: `embedder(text, { pooling: "mean", normalize: true })` → `Array.from(result.data)`
22
+ - Tested: 3 items indexed, query "red fruit" with category filter → correctly returned 2 fruit items, filtered out "Cars are vehicles"
23
+ - Scores: Apples=0.7830, Bananas=0.5188
24
+
25
+ #### 3. keywords.ts — Keyword extraction from code files
26
+
27
+ - Installed `keyword-extractor` (v0.0.28) — CJS module, needed `createRequire(import.meta.url)` for ESM import
28
+ - **Iteration 1**: `return_chained_words: true` → way too aggressive, merged entire code lines into single "keywords"
29
+ - **Iteration 2**: `return_max_ngrams: 3` instead → still noisy, code syntax tokens (`{`, `}`, `const`, `await`) dominated results
30
+ - **Iteration 3**: Added code-aware preprocessing before extraction:
31
+ - Strip `//` from comments but keep comment text
32
+ - Remove code punctuation: `{}()[];=<>|&!+*/$@` etc.
33
+ - Remove JS keywords: `const`, `let`, `var`, `import`, `export`, `from`, `await`, `async`, `function`, `return`, `for`, `of`, `if`, `new`, `typeof`, `as`
34
+ - Collapse whitespace
35
+ - Post-filter: skip keywords < 3 chars or non-alphabetic, word-boundary regex for frequency count
36
+ - Final output on vectra.ts: clean results — `metadata(8x)`, `index(7x)`, `text(7x)`, `fruit(5x)`, `pipeline(3x)`, `huggingface(1x)`
37
+ - Tested without filters: confirmed `const(10x)`, `await(8x)`, `}(19x)` dominate — filters are necessary for code
38
+
39
+ #### 4. keywords-compromise.ts — Compromise NLP extraction
40
+
41
+ - Installed `compromise` (v14.15.0) — ESM native, no createRequire needed
42
+ - Tested all extractors: `.topics()`, `.nouns()`, `.verbs()`, `.people()`, `.organizations()`
43
+ - **Result on code files**: poor. Nouns are code fragments (`'const index = new LocalIndex("./vectra-index");'`), zero topics, zero people
44
+ - On hf.ts: caught "Microsoft" as topic + organization from prompt string — works on embedded natural language
45
+ - **Conclusion**: compromise is designed for prose (articles, emails, chat), not source code
46
+
47
+ #### 5. keywords-pipeline.ts — Full extraction pipeline
48
+
49
+ - Combined: read file → compromise (nouns/verbs/topics) → regex `\W+` → space → keyword-extractor → show
50
+ - Added LLM step (SmolLM2-135M): asked to extract/refine keywords
51
+ - **LLM result**: echoed input then looped on `await transformer.get(index)` — too small to understand the instruction
52
+ - LLM fallback logic: if output has <3 unique terms or contains repetition pattern, fall back to raw keywords
53
+ - Pipeline works end-to-end but LLM step is effectively a passthrough
54
+
55
+ #### 6. vectra-keywords.ts — Combined indexing + synonym search
56
+
57
+ - Merged keyword extraction + vectra indexing into one script
58
+ - Indexed 5 files, tested synonym search:
59
+ - "fruit" → hf.ts (0.18), vectra.ts (0.17)
60
+ - "automobile vehicle transportation" → vectra.ts (0.18) — synonym for "cars/vehicles"
61
+ - "embedding model neural network" → vectra.ts (0.27) — semantic match
62
+ - Scores low because code noise (`const`, `await`) dilutes the keyword embeddings
63
+
64
+ #### 7. xindex.ts — Final combined solution
65
+
66
+ - Created unified CLI: `xindex.ts index <files>` and `xindex.ts search <query>`
67
+ - Full index pipeline: compromise → regex → keyword-extractor → LLM refine → MiniLM embed → vectra store
68
+ - Query pipeline: input → keyword-extractor → embed → vectra search
69
+ - Added full payload logging at each step: [1] keywords, [2] LLM refined, [3] vector preview, [4] metadata
70
+ - Test results:
71
+ - "natural language processing" → keywords-pipeline.ts (0.56) — strongest match
72
+ - "automobile transportation" → vectra.ts (0.21) — synonym works
73
+ - "neural network deep learning" → vectra.ts (0.28)
74
+
75
+ #### 8. Project setup — ~/project/xindex
76
+
77
+ - Created standalone TypeScript project at `/Users/slava/project/xindex`
78
+ - Moved all RnD files: xindex.ts, hf.ts, vectra.ts, keywords.ts, keywords-compromise.ts, keywords-pipeline.ts, vectra-keywords.ts
79
+ - package.json, tsconfig.json, bin/xindex entry point
80
+ - Git initialized
81
+
82
+ #### Alternative keyword extraction libs (from user research)
83
+
84
+ - `textlens` — TF-IDF, 1-line API, fastest, 10k+ weekly downloads
85
+ - `node-keyword-extractor` — RAKE-like, 1-line API, very fast
86
+ - `compromise` — full NLP, 3 lines, fast (tested — poor on code)
87
+ - `natural` — TF-IDF tokenizer, 5 lines, fast
88
+
89
+ #### Decisions & findings
90
+
91
+ - `"type": "module"` in package.json is required for all prototypes (top-level await)
92
+ - HuggingFace transformers JS works well for embeddings; text generation quality limited by model size
93
+ - Vectra API docs/examples online are outdated — always check the actual `.d.ts` types
94
+ - Code keyword extraction needs domain-specific preprocessing regardless of library choice
95
+ - Compromise NLP is not suitable for code — only for natural language text
96
+ - SmolLM2-135M is too small for keyword refinement — needs 360M+ or external API
97
+ - MiniLM-L6-v2 embeddings understand synonyms well enough for semantic code search
98
+ - The concept works: index codebase → query with natural language → get relevant files
@@ -0,0 +1,102 @@
1
+ # Task: xindex — Local Semantic Code Search [COMPLETED 2026-04-10]
2
+
3
+ ## Context
4
+
5
+ Built a local semantic code search tool — index a codebase, query by keyword or meaning, get relevant files back. No cloud APIs, everything runs on-device.
6
+
7
+ **Project**: `~/project/xindex`
8
+
9
+ **Dependencies:** `@huggingface/transformers`, `vectra`, `compromise`, `keyword-extractor`, `tsx`
10
+
11
+ ## Goal
12
+
13
+ Index a codebase so that minimally meaningful text queries return relevant files/info about the project. Local-first, no cloud APIs.
14
+
15
+ ## Final Architecture
16
+
17
+ ```
18
+ componets/
19
+ ├── llm/
20
+ │ ├── embed.ts — Embed({pooling, normalize}) → MiniLM-L6 384-dim
21
+ │ └── queryLLM.ts — QueryLLM({maxTokens}) → SmolLM2-135M (kept aside, unused)
22
+ ├── keywords/
23
+ │ ├── extractKeywords.ts — ExtractKeywords() → compromise NLP (nouns/verbs/topics)
24
+ │ ├── cleanUpKeywords.ts — CleanUpKeywords({maxNgrams, minLength}) → keyword-extractor + dedup
25
+ │ └── refineKeywords.ts — RefineKeywords({queryLLM, cleanUpKeywords, prompt}) (kept aside, unused)
26
+ ├── index/
27
+ │ ├── vectraIndex.ts — VectraIndex(path) → LocalIndex init
28
+ │ ├── indexContent.ts — IndexContent({embed, index}) → embed + upsert
29
+ │ ├── getIndexStats.ts — GetIndexStats({index}) → {indexedAmount}
30
+ │ ├── searchContentIndex.ts — SearchContentIndex({extractKeywords, cleanUpKeywords, embed, index})
31
+ │ └── contentIndexDriver.ts — ContentIndexDriver({path, embed, extractKeywords, cleanUpKeywords})
32
+ └── buildComponents.ts — wires everything, returns ready-to-use functions
33
+
34
+ apps/
35
+ ├── indexApp.ts — IndexApp({extractKeywords, cleanUpKeywords, indexContent})
36
+ ├── searchApp.ts — SearchApp({searchContentIndex})
37
+ ├── run.index.ts — CLI entry point for indexing
38
+ └── run.search.ts — CLI entry point for search
39
+
40
+ bin/
41
+ ├── xindex-index — #!/usr/bin/env tsx → apps/run.index.ts
42
+ └── xindex-search — #!/usr/bin/env tsx → apps/run.search.ts
43
+ ```
44
+
45
+ ## Final Pipeline
46
+
47
+ ```
48
+ INDEX (per file):
49
+ file + filename
50
+
51
+ ├─[1] extractKeywords ──→ compromise NLP (nouns, verbs, topics)
52
+
53
+ ├─[2] cleanUpKeywords ──→ keyword-extractor + dedup + filter
54
+
55
+ ├─[3] MiniLM-L6 ────────→ 384-dim embedding
56
+
57
+ └─[4] vectra ────────────→ upsert { id, vector, metadata: { keywords, file } }
58
+
59
+ SEARCH:
60
+ user input
61
+
62
+ ├─[1] extractKeywords ──→ compromise NLP
63
+
64
+ ├─[2] cleanUpKeywords ──→ keyword-extractor + dedup + filter
65
+
66
+ ├─[3] MiniLM-L6 ────────→ 384-dim embedding
67
+
68
+ └─[4] vectra ────────────→ queryItems → ranked by cosine similarity
69
+ ```
70
+
71
+ ## Key Decisions
72
+
73
+ - **LLM refine step removed** — SmolLM2-135M was too small, acted as passthrough or generated garbage. Without it: 10x faster indexing, better accuracy (11/14 → 16/20 correct #1)
74
+ - **HOF component pattern** — all components are factory functions: `DoThing({deps}): IDoThing`. Export factory + type only, no default instances
75
+ - **Dependencies as destructured objects** — `DoThing({embed, index}: {embed: IEmbed, index: LocalIndex})`
76
+ - **Separate entry points** — `bin/xindex-index` and `bin/xindex-search` instead of one CLI with subcommands
77
+ - **ContentIndexDriver** — bundles index layer (indexContent, getIndexStats, searchContentIndex) behind one factory
78
+
79
+ ## Final Test Results (41 files, 20 queries)
80
+
81
+ | Metric | Value |
82
+ |--------|-------|
83
+ | Index time (41 files) | 1.07s (26ms/file) |
84
+ | Search time | 0.71s (constant) |
85
+ | Correct #1 | 16/20 (80%) |
86
+ | Correct in top 3 | 19/20 (95%) |
87
+ | Cross-domain isolation | Perfect |
88
+
89
+ ## Review Items (checked = resolved, unchecked = still open)
90
+
91
+ - [x] Search loads unused LLM → removed LLM from pipeline entirely
92
+ - [x] `// --- Init ---` comment artifact → removed
93
+ - [ ] `BuildComponents` hardcodes index path → still hardcoded
94
+ - [ ] Stale `main` in package.json → still points to deleted xindex.ts
95
+ - [ ] `componets/` typo → deferred
96
+
97
+ ## Open Questions (carried forward)
98
+
99
+ - What's the practical index size limit for vectra LocalIndex before it slows down?
100
+ - Is hybrid search (vector + BM25) in vectra good enough to skip a separate keyword index?
101
+ - Directory walking instead of explicit file list
102
+ - Chunking for large files — one embedding per file loses detail
@@ -0,0 +1,19 @@
1
+ ### 2026-04-10
2
+
3
+ - Created task from user note: move line clustering defaults (`threshold`, `minLines`, `maxDepth`) into `.xindex.json`.
4
+ - Ran scout via code search and xindex MCP search:
5
+ - confirmed hardcoded defaults in `componets/index/clusterLines.ts`
6
+ - confirmed config infra exists in `componets/config/xindexConfig.ts` + `componets/config/loadConfig.ts`
7
+ - confirmed `.xindex.json` already used for other settings
8
+ - Drafted initial plan with diagram + 3x3 steps.
9
+ - Updated from user clarifications:
10
+ - default threshold when missing is `0.7`
11
+ - threshold validation is strict `[0,1]` with fallback to default
12
+ - apply through shared wiring in this repo folder
13
+ - do not update `.xindex.json` file content now
14
+ - config shape finalized as flat keys: `clusterThreshold`, `clusterMinLines`, `clusterMaxDepth`
15
+ - Ran consistency pass and fixed task mismatches:
16
+ - removed old `0.75` default references
17
+ - replaced clamp/range language with strict validation wording
18
+ - removed docs/example update requirement that conflicted with user instruction
19
+ - Expanded task with Detailed Change Map + Acceptance Criteria for implementation readiness.
@@ -0,0 +1,118 @@
1
+ # Task: Move ClusterLines defaults into .xindex.json
2
+
3
+ ## Context
4
+
5
+ User goal: move line-clustering configuration from hardcoded defaults to repo-level config in `.xindex.json`:
6
+ - `threshold = 0.70` (new default when missing)
7
+ - `minLines = 5`
8
+ - `maxDepth = 5`
9
+ - config key shape is flat:
10
+ - `clusterThreshold`
11
+ - `clusterMinLines`
12
+ - `clusterMaxDepth`
13
+
14
+ Why: different repositories need different clustering behavior, so these values should be configurable per repo.
15
+
16
+ Scout findings (`@xi` + code scan):
17
+ - `componets/index/clusterLines.ts` currently hardcodes defaults in the HOF signature:
18
+ - `threshold = 0.75`
19
+ - `minLines = 5`
20
+ - `maxDepth = 5`
21
+ - `.xindex.json` already exists and currently contains search/index config (`ignoreKeywords`, `ignoreFiles`, `maxSnippetLines`, `maxSnippetResults`).
22
+ - `componets/config/xindexConfig.ts` and `componets/config/loadConfig.ts` already provide optional config loading with defaults.
23
+ - `componets/buildComponents.ts` already loads `.xindex.json` and wires config into keyword cleanup, but does not yet pass clustering params to driver/components.
24
+
25
+ Related active tasks:
26
+ - `task.2026-04-10-line-clustering.md`
27
+ - `task.2026-04-10-search-config.md`
28
+
29
+ ## Goal
30
+
31
+ Extend `.xindex.json` + config loading so line-clustering params are configurable per repository, and wire them into `ClusterLines` construction with strict threshold validation and the new default threshold `0.7` when config keys are absent.
32
+
33
+ ## Diagram
34
+
35
+ ```
36
+ .xindex.json (optional)
37
+ ┌──────────────────────────────────────────────┐
38
+ │ existing: ignoreKeywords, ignoreFiles, ... │
39
+ │ new: │
40
+ │ clusterThreshold: number (default 0.7) │
41
+ │ clusterMinLines: number (default 5) │
42
+ │ clusterMaxDepth: number (default 5) │
43
+ └───────────────────┬──────────────────────────┘
44
+
45
+
46
+ LoadConfig -> IXindexConfig (defaults applied)
47
+
48
+
49
+ BuildComponents
50
+
51
+
52
+ ContentIndexDriver / ClusterLines factory
53
+
54
+
55
+ clusterLines() uses repo-specific values
56
+ ```
57
+
58
+ ## Steps
59
+
60
+ ### 1. Extend config schema (3x3)
61
+ - **1.1 Add fields to `IXindexConfig`** — add three clustering fields with explicit names and numeric types.
62
+ - **1.2 Parse + default in `LoadConfig`** — map new JSON keys to validated numbers with defaults `{clusterThreshold: 0.7, clusterMinLines: 5, clusterMaxDepth: 5}`.
63
+ - **1.3 Validate bounds** — `clusterThreshold` must be a finite number in `[0,1]` (otherwise fall back to the default), and `clusterMinLines`/`clusterMaxDepth` must be finite integers `>= 1` (otherwise fall back to their defaults).
64
+
65
+ ### 2. Wire config into clustering (3x3)
66
+ - **2.1 Thread config through builder/driver** — ensure the clustering factory gets config values from `BuildComponents` path.
67
+ - **2.2 Update `ClusterLines` construction** — pass config values from driver wiring instead of relying on hardcoded constructor defaults.
68
+ - **2.3 Preserve backward compatibility** — missing `.xindex.json` or missing keys should still produce stable clustering behavior via loader defaults.
69
+
70
+ ### 3. Validate behavior and docs (3x3)
71
+ - **3.1 Runtime sanity checks** — run index/search flow to confirm no regressions and that loaded config values are honored.
72
+ - **3.2 Surface scope in repo entry points** — ensure clustering config is available from common build path used by apps in this folder.
73
+ - **3.3 Add/update tests** — cover default path (no config keys), invalid threshold path (fallback to 0.7), and override path (custom values).
74
+
75
+ ## Detailed Change Map
76
+
77
+ - `componets/config/xindexConfig.ts`
78
+ - Add:
79
+ - `clusterThreshold: number`
80
+ - `clusterMinLines: number`
81
+ - `clusterMaxDepth: number`
82
+ - `componets/config/loadConfig.ts`
83
+ - Extend `DEFAULTS` with clustering keys (`0.7`, `5`, `5`).
84
+ - Parse new keys from `.xindex.json`.
85
+ - Apply strict validation:
86
+ - threshold valid only if finite number and `0 <= v <= 1`, otherwise default
87
+ - min/depth valid only if finite number, integerized, and `>= 1`, otherwise default
88
+ - `componets/index/contentIndexDriver.ts`
89
+ - Accept `config` in factory deps.
90
+ - Construct `ClusterLines({... , threshold: config.clusterThreshold, minLines: config.clusterMinLines, maxDepth: config.clusterMaxDepth})`.
91
+ - `componets/buildComponents.ts`
92
+ - Pass loaded `config` into `ContentIndexDriver`.
93
+ - Keep returning `config` so consumers in this folder can use consistent runtime config.
94
+ - `apps/run.*.ts` and `apps/mcpApp.ts` (as needed)
95
+ - No direct clustering logic, but rely on `BuildComponents` so new config applies everywhere in this folder through shared wiring.
96
+
97
+ ## Acceptance Criteria
98
+
99
+ - `.xindex.json` may define clustering keys and they influence `ClusterLines` without code changes.
100
+ - Missing clustering keys use defaults: `clusterThreshold=0.7`, `clusterMinLines=5`, `clusterMaxDepth=5`.
101
+ - Invalid threshold values (e.g., `-0.1`, `1.2`, `"0.7"`, `null`) fall back to `0.7`.
102
+ - Indexing pipeline compiles and runs with unchanged public entry points.
103
+ - No update to `.xindex.json` file contents is required in this task.
104
+
105
+ ## Decisions
106
+
107
+ - Default threshold changed from the old hardcoded value (`0.75` in `componets/index/clusterLines.ts`) to `0.7`.
108
+ - Threshold validation is strict `[0,1]` with fallback to default (no clamping).
109
+ - Scope is this repo folder via shared component wiring, not only one direct caller.
110
+ - Do not modify current `.xindex.json` as part of this task.
111
+ - Config shape is flat keys in `.xindex.json`:
112
+ - `clusterThreshold`
113
+ - `clusterMinLines`
114
+ - `clusterMaxDepth`
115
+
116
+ ## Open Questions
117
+
118
+ - Compatibility strategy: accept legacy names (if any) or only new canonical names?
@@ -0,0 +1,8 @@
1
+ ### 2026-04-10 — Task created
2
+
3
+ - Scouted: streamx has `from()`, `map`, `flat`, `pipe`, `run` — full async stream toolkit
4
+ - Current IndexApp is a simple for-loop over explicit file paths
5
+ - Node `fs.readdir(path, {recursive: true})` exists but has no gitignore support
6
+ - `.gitignore` already has sensible rules for the project
7
+ - User wants: files or dirs as input → recursive walk → stream → index
8
+ - User preference: use streamx from packages/ for the stream pipeline