xindex 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (160) hide show
  1. package/.ai/research/2026-04-10-file-watching.md +79 -0
  2. package/.ai/research/2026-04-10-mcp-output-format.md +129 -0
  3. package/.ai/task/INDEX.md +12 -0
  4. package/.ai/task/done/INDEX.md +3 -0
  5. package/.ai/task/done/task.2026-04-09-local-ai-research-protos.log.md +98 -0
  6. package/.ai/task/done/task.2026-04-09-local-ai-research-protos.md +102 -0
  7. package/.ai/task/task.2026-04-10-cluster-config.log.md +19 -0
  8. package/.ai/task/task.2026-04-10-cluster-config.md +118 -0
  9. package/.ai/task/task.2026-04-10-dir-indexing.log.md +8 -0
  10. package/.ai/task/task.2026-04-10-dir-indexing.md +92 -0
  11. package/.ai/task/task.2026-04-10-line-clustering.log.md +50 -0
  12. package/.ai/task/task.2026-04-10-line-clustering.md +176 -0
  13. package/.ai/task/task.2026-04-10-object-store.log.md +7 -0
  14. package/.ai/task/task.2026-04-10-object-store.md +81 -0
  15. package/.ai/task/task.2026-04-10-search-config.log.md +46 -0
  16. package/.ai/task/task.2026-04-10-search-config.md +274 -0
  17. package/.ai/task/task.2026-04-10-watch-indexing.log.md +32 -0
  18. package/.ai/task/task.2026-04-10-watch-indexing.md +101 -0
  19. package/.ai/task/task.2026-04-10-xindex-mcp.log.md +5 -0
  20. package/.ai/task/task.2026-04-10-xindex-mcp.md +92 -0
  21. package/.ai/task/task.2026-04-10-xindex-mcp.report.md +113 -0
  22. package/.claude/settings.local.json +73 -0
  23. package/.claude/skills/make-hof/SKILL.md +8 -0
  24. package/.claude/skills/make-hof/playbook.md +38 -0
  25. package/.cursor/mcp.json +8 -0
  26. package/.mcp.json +8 -0
  27. package/.xindex.json +22 -0
  28. package/CLAUDE.md +54 -0
  29. package/README.md +206 -0
  30. package/apps/indexApp.ts +31 -0
  31. package/apps/mcpApp.ts +119 -0
  32. package/apps/run.index.ts +19 -0
  33. package/apps/run.mcp.ts +49 -0
  34. package/apps/run.reset.ts +10 -0
  35. package/apps/run.search.ts +21 -0
  36. package/apps/run.watch.ts +44 -0
  37. package/apps/searchApp.ts +9 -0
  38. package/apps/watchApp.ts +53 -0
  39. package/apps/watchFileEventsApp.ts +39 -0
  40. package/bin/xindex-index +2 -0
  41. package/bin/xindex-mcp +2 -0
  42. package/bin/xindex-reset +2 -0
  43. package/bin/xindex-search +2 -0
  44. package/bin/xindex-watch +2 -0
  45. package/componets/IType.ts +1 -0
  46. package/componets/appId.ts +3 -0
  47. package/componets/buildComponents.ts +27 -0
  48. package/componets/config/loadConfig.ts +43 -0
  49. package/componets/config/xindexConfig.ts +4 -0
  50. package/componets/index/contentIndexDriver.ts +39 -0
  51. package/componets/index/formatSearchResults.ts +18 -0
  52. package/componets/index/getIndexStats.ts +11 -0
  53. package/componets/index/handleFileEvent.ts +25 -0
  54. package/componets/index/indexApi.ts +45 -0
  55. package/componets/index/vectraIndex.ts +11 -0
  56. package/componets/index/watcherLock.ts +107 -0
  57. package/componets/keywords/cleanUpKeywords.ts +38 -0
  58. package/componets/keywords/extractKeywords.ts +14 -0
  59. package/componets/keywords/refineKeywords.ts +16 -0
  60. package/componets/llm/embed.ts +18 -0
  61. package/componets/llm/queryLLM.ts +20 -0
  62. package/componets/logger.ts +34 -0
  63. package/componets/walkFiles.ts +51 -0
  64. package/componets/watchFiles.ts +106 -0
  65. package/features/indexContent.ts +16 -0
  66. package/features/removeContent.ts +9 -0
  67. package/features/resetIndex.ts +9 -0
  68. package/features/searchIndex.ts +33 -0
  69. package/package.json +32 -0
  70. package/packages/fun/src/IType.ts +5 -0
  71. package/packages/fun/src/array-finder.ts +55 -0
  72. package/packages/fun/src/array-index.ts +35 -0
  73. package/packages/fun/src/array.ts +112 -0
  74. package/packages/fun/src/assert.ts +5 -0
  75. package/packages/fun/src/asyncRequest.ts +35 -0
  76. package/packages/fun/src/callsites.ts +18 -0
  77. package/packages/fun/src/case-never.ts +9 -0
  78. package/packages/fun/src/casting.ts +41 -0
  79. package/packages/fun/src/collect.ts +13 -0
  80. package/packages/fun/src/concurrency.ts +186 -0
  81. package/packages/fun/src/container.ts +86 -0
  82. package/packages/fun/src/counter.ts +45 -0
  83. package/packages/fun/src/create-map.ts +2 -0
  84. package/packages/fun/src/dedupe.ts +2 -0
  85. package/packages/fun/src/defer.ts +55 -0
  86. package/packages/fun/src/delay.ts +5 -0
  87. package/packages/fun/src/discriminate.ts +34 -0
  88. package/packages/fun/src/enum-values.ts +12 -0
  89. package/packages/fun/src/exponential-backoff.ts +20 -0
  90. package/packages/fun/src/flatten.ts +11 -0
  91. package/packages/fun/src/hash.ts +67 -0
  92. package/packages/fun/src/hash128.ts +6 -0
  93. package/packages/fun/src/hash256.ts +6 -0
  94. package/packages/fun/src/hub.ts +53 -0
  95. package/packages/fun/src/id.ts +10 -0
  96. package/packages/fun/src/interval.ts +76 -0
  97. package/packages/fun/src/is-non-nullable.ts +2 -0
  98. package/packages/fun/src/isIterable.ts +3 -0
  99. package/packages/fun/src/mailbox.ts +13 -0
  100. package/packages/fun/src/map-record.ts +19 -0
  101. package/packages/fun/src/match-collections.ts +57 -0
  102. package/packages/fun/src/match-left-and-right-arrays.ts +78 -0
  103. package/packages/fun/src/mem.ts +26 -0
  104. package/packages/fun/src/memos.ts +28 -0
  105. package/packages/fun/src/normalizeError.ts +25 -0
  106. package/packages/fun/src/nothing.ts +3 -0
  107. package/packages/fun/src/pipe.ts +18 -0
  108. package/packages/fun/src/prettyJson.ts +3 -0
  109. package/packages/fun/src/project.ts +8 -0
  110. package/packages/fun/src/promise.ts +27 -0
  111. package/packages/fun/src/pubsub.ts +128 -0
  112. package/packages/fun/src/randomId.ts +14 -0
  113. package/packages/fun/src/regexp-escape.ts +13 -0
  114. package/packages/fun/src/retry.ts +15 -0
  115. package/packages/fun/src/serial.test.ts +107 -0
  116. package/packages/fun/src/serial.ts +17 -0
  117. package/packages/fun/src/sleep.ts +3 -0
  118. package/packages/fun/src/sort-object.ts +46 -0
  119. package/packages/fun/src/speed-test.ts +56 -0
  120. package/packages/fun/src/tick.ts +37 -0
  121. package/packages/fun/src/time-behavior.ts +50 -0
  122. package/packages/fun/src/time.ts +22 -0
  123. package/packages/fun/src/timedFallback.ts +37 -0
  124. package/packages/fun/src/timer.ts +30 -0
  125. package/packages/fun/src/value.ts +33 -0
  126. package/packages/fun/src/waitForCounter.ts +15 -0
  127. package/packages/streamx/src/batch.ts +23 -0
  128. package/packages/streamx/src/batchTimed.ts +113 -0
  129. package/packages/streamx/src/buffer.ts +72 -0
  130. package/packages/streamx/src/concatenate.ts +33 -0
  131. package/packages/streamx/src/filter.ts +14 -0
  132. package/packages/streamx/src/flat.ts +19 -0
  133. package/packages/streamx/src/flatMap.ts +9 -0
  134. package/packages/streamx/src/from.ts +30 -0
  135. package/packages/streamx/src/index.ts +49 -0
  136. package/packages/streamx/src/interval.ts +58 -0
  137. package/packages/streamx/src/loop.ts +8 -0
  138. package/packages/streamx/src/map.ts +12 -0
  139. package/packages/streamx/src/merge.ts +89 -0
  140. package/packages/streamx/src/nodeReadable.ts +6 -0
  141. package/packages/streamx/src/nodeTransform.ts +9 -0
  142. package/packages/streamx/src/nodeWritable.ts +38 -0
  143. package/packages/streamx/src/objectReader.ts +16 -0
  144. package/packages/streamx/src/polyfill.ts +20 -0
  145. package/packages/streamx/src/reader.ts +38 -0
  146. package/packages/streamx/src/reduce.ts +15 -0
  147. package/packages/streamx/src/scale.ts +93 -0
  148. package/packages/streamx/src/scaleSync.ts +13 -0
  149. package/packages/streamx/src/sequence.ts +7 -0
  150. package/packages/streamx/src/tap.ts +9 -0
  151. package/packages/streamx/src/toArray.ts +9 -0
  152. package/packages/streamx/src/writer.ts +96 -0
  153. package/rnd/hf.ts +14 -0
  154. package/rnd/keywords-compromise.ts +18 -0
  155. package/rnd/keywords-pipeline.ts +79 -0
  156. package/rnd/keywords.ts +38 -0
  157. package/rnd/test-vectra-memory.ts +63 -0
  158. package/rnd/vectra-keywords.ts +95 -0
  159. package/rnd/vectra.ts +50 -0
  160. package/tsconfig.json +14 -0
@@ -0,0 +1,92 @@
1
+ # Task: Directory-based Indexing with Async Streams
2
+
3
+ ## Context
4
+
5
+ Current `IndexApp` takes an explicit file list — no directory scanning. User wants to pass files **or** dirs, recursively scan dirs, and index everything as a stream.
6
+
7
+ **Current pipeline** (`apps/indexApp.ts`):
8
+ ```
9
+ files[] → for each → readFile → extractKeywords → cleanUp → indexContent
10
+ ```
11
+
12
+ **streamx available** (`packages/streamx/`):
13
+ - `from(iterable)` — wraps async/sync iterable into StreamX
14
+ - `of()` → `.pipe()` for chaining
15
+ - Operators: `map`, `filter`, `flat`, `flatMap`, `batch`, `buffer`, `merge`, `scale`, `reduce`, `tap`
16
+ - `run()` — consumes stream, returns last value
17
+
18
+ **Gitignore: `ignore` npm package** (used by ESLint, Prettier):
19
+ ```ts
20
+ import ignore from "ignore";
21
+ const ig = ignore();
22
+ ig.add(await readFile(".gitignore", "utf8")); // load rules
23
+ ig.ignores("node_modules/foo.js"); // true
24
+ ig.filter(["src/index.ts", "dist/out.js"]); // ["src/index.ts"]
25
+ ```
26
+ - Paths must be **relative** to .gitignore location
27
+ - `.add()` stackable — call per nested .gitignore
28
+ - Handles negation (`!`), globs, `**/`, comments
29
+
30
+ **Decisions:**
31
+ - Paths are **relative to working directory** (children of cwd)
32
+ - Sequential indexing now; `scale()` for parallelism later
33
+ - Use `ignore` npm package for .gitignore parsing
34
+ - Default: if no .gitignore in a folder, skip `.*` dirs (`.git`, `.idea`, etc.)
35
+
36
+ ## Goal
37
+
38
+ Accept files or directories as input, recursively walk directories (respecting .gitignore), and index all discovered files as an async stream using streamx.
39
+
40
+ ## Diagram
41
+
42
+ ```
43
+ INPUT: ["file.ts", "src/", "lib/"]
44
+
45
+ ├── file? ──→ yield relative path
46
+
47
+ └── dir? ──→ walk recursively
48
+
49
+ ├── load .gitignore (if exists)
50
+ │ (else: default ignore .* dirs)
51
+
52
+ ├── skip ignored paths
53
+
54
+ └── yield each file (relative)
55
+
56
+
57
+ from(walkFiles) ──→ streamx pipeline
58
+
59
+ ├── map: readFile
60
+ ├── map: extractKeywords + cleanUp
61
+ └── tap: indexContent
62
+
63
+
64
+ {indexed count}
65
+ ```
66
+
67
+ ## Steps
68
+
69
+ ### 1. Directory Walker
70
+ - Create `componets/walkFiles.ts` — HOF `WalkFiles()` returning async generator that yields relative file paths
71
+ - Detect file vs dir via `fs.stat`, yield files directly, recurse into dirs
72
+ - Use `node:fs/promises` `opendir` for streaming directory reads
73
+
74
+ ### 2. Gitignore Filtering
75
+ - Install `ignore` package — `npm install ignore`
76
+ - Load `.gitignore` per directory during walk; stack rules with parent via `ig.add()`
77
+ - Default rule when no `.gitignore`: skip `.*` entries — dot-directories and dot-files alike (`.git`, `.idea`, `.DS_Store`, etc.)
78
+ - Check `ig.ignores(relativePath)` before yielding or descending into subdirs
79
+
80
+ ### 3. Stream Pipeline
81
+ - Wire walker into streamx: `from(walkFiles(inputs))` → `pipe(map(indexFile))` → `run()`
82
+ - Update `IndexApp` HOF to accept `string[]` (mix of files and dirs)
83
+ - `run.index.ts` passes argv as-is — no change needed
84
+
85
+ ## Dependencies
86
+
87
+ - `ignore` — gitignore pattern matching (new dep)
88
+ - `packages/streamx` — async stream operators (already in repo)
89
+
90
+ Sources:
91
+ - [ignore npm package](https://www.npmjs.com/package/ignore)
92
+ - [node-ignore GitHub](https://github.com/kaelzhang/node-ignore)
@@ -0,0 +1,50 @@
1
+ # Log: Line-level clustering
2
+
3
+ ### 2026-04-10
4
+
5
+ - Task created from user notes about recursive bisection clustering
6
+ - Scouted codebase: current granularity is 1 vector per file, `id=filePath`, metadata in separate object store (MD5-keyed JSON)
7
+ - Confirmed in-memory Vectra works via `VirtualFileStorage` (test-vectra-memory.ts) — no disk I/O, cosine similarity queries work with small dims
8
+ - Key insight: Vectra `metadata: {}` is always empty in current code — all real metadata lives in object store. This pattern can extend to line-level clusters
9
+ - Identified integration points: indexContent.ts (upsert), searchContentIndex.ts (query), indexMeta.ts (type), removeContent.ts (delete)
10
+ - Open: similarity threshold calibration, keyword quality for code, cluster deletion strategy on re-index
11
+ - Clarification round: user confirmed embedding cosine (not Jaccard) — meaning matters more than keyword overlap
12
+ - Threshold: user says 0.55–0.70 range, will start at 0.6
13
+ - Min cluster: 3–5 lines, default 5
14
+ - ID format: `file.ts:12-45` works as-is for both Vectra and object store
15
+ - Cleanup strategy: object store entry per file tracks all cluster IDs, delete all on re-index
16
+ - Researched NPM packages: semantic-chunking (jparkerweb), semantic-chunker (johnhenry, BYOE), LangChain RecursiveCharacterTextSplitter. All target prose/sentences, not code lines. Custom bisection with our embed pipeline is better fit.
17
+ - NAACL 2025: fixed-size chunks match semantic chunking for prose RAG, but code has mixed concerns per file where semantic splitting should win
18
+ - Consistency check: expanded 3x3 → 6x3 steps with concrete file paths and implementation details
19
+ - Fixed: diagram now shows full flow from handleFileEvent through clusterLines to persistent store + manifest
20
+ - Fixed: IIndexMeta type is `{keywords, id}` not `{keywords, file}` — corrected in Context
21
+ - Found: main change site is `indexFileContent.ts` (not `indexContent.ts`), and `handleFileEvent.ts` for cleanup
22
+ - Found: cosine similarity doesn't need in-memory Vectra — embed returns normalized vectors, direct dot product suffices
23
+ - Found: object store dual-use issue — need to store both cluster metas and file manifests with different shapes. Added to Open Questions.
24
+ - Added Edge Cases section: small files, empty files, uniform content, legacy data
25
+ - Consistency check #2: fixed Goal (removed "in-memory Vectra" — cosine is direct dot product), removed stale whole-file keyword step from diagram, added tagged union pattern for manifest, clarified RemoveFileContent as separate HOF, added buildComponents wiring step
26
+ - Decision: cosine via direct dot product (Option A). 3-line helper, no Vectra needed for bisection — comparing exactly 2 vectors, not searching N. Fallback to in-memory Vectra (Option B) if needed.
27
+ - **Key architecture correction**: existing file-level indexing must stay intact. Clustering is an EXTENSION, not a replacement. Both file-level and cluster-level entries coexist.
28
+ - If file is cohesive (1 cluster = whole file) → skip clustering entirely, file-level entry is enough
29
+ - Resolved object store dual-use: three separate keys — `filePath` (file meta), `filePath:1-10` (cluster meta), `filePath::manifest` (cluster ID list). Widen `IObjectStore` types to `IStoreEntry = IIndexMeta | IFileManifest`.
30
+ - Traced full dependency wiring: `IndexFileContent` is constructed in run.*.ts (NOT inside contentIndexDriver). Plan: move construction inside driver since it has all deps. Simplifies callers.
31
+ - `handleFileEvent` flow on re-index: `removeFileContent(path)` first (cleans file entry + clusters + manifest), then `indexFileContent(path, text)` (creates file entry + clusters + manifest)
32
+ - Expanded steps from 6x3 → 7x(2-5) with concrete file paths, signatures, and implementation notes
33
+ - Updated diagram to show full bidirectional flow: removal path + indexing path + all three store key types
34
+ - Consistency check #3:
35
+ - CRITICAL: fixed file paths — `indexFileContent.ts` and `handleFileEvent.ts` are in `componets/index/`, not `componets/`. Same for `removeFileContent.ts` (new file).
36
+ - CRITICAL: found `indexApp.ts` gap — bulk indexer calls `indexFileContent` directly via stream (no `HandleFileEvent`). Old clusters would linger on re-index. Added `removeFileContent` dep to `IndexApp`, call cleanup before indexFileContent in stream callback.
37
+ - Fixed edge case: empty files still get file-level entry (existing pipeline), clustering just returns `[]`.
38
+ - Added `indexApp.ts`, `buildComponents.ts`, `run.*.ts` to Context key files.
39
+ - Added step 6.3 for indexApp.ts and step 6.4 for import path cleanup.
40
+ - Synced plan file with same fixes.
41
+ - **Implementation complete** — all 7 steps done:
42
+ - Step 1: created `componets/index/clusterLines.ts` — cosine helper, ILineCluster, ClusterLines HOF
43
+ - Step 2: updated `indexMeta.ts` (IType tagged union: IIndexMeta, IClusterMeta, IFileManifest, IStoreEntry), `objectStore.ts` (widened types)
44
+ - Step 3: created `componets/index/removeFileContent.ts` — manifest-aware cleanup
45
+ - Step 4: updated `indexFileContent.ts` — kept file-level index, added clustering extension
46
+ - Step 5: updated `contentIndexDriver.ts` (wires ClusterLines, IndexFileContent, RemoveFileContent inside), `buildComponents.ts` (returns new components)
47
+ - Step 6: simplified `run.index.ts`, `run.watch.ts`, `run.mcp.ts` (removed manual IndexFileContent construction), updated `handleFileEvent.ts` (removeFileContent), updated `indexApp.ts` (added removeFileContent dep)
48
+ - Also updated `indexContent.ts` (widened meta param) and `searchContentIndex.ts` (narrow by type)
49
+ - Reset + re-index: 113 files → 211 indexed items (98 cluster entries created)
50
+ - Search verified: cluster hits showing as `file.ts:fromLine-toLine` (e.g. `rnd/test-vectra-memory.ts:9-12`)
@@ -0,0 +1,176 @@
1
+ # Task: Line-level clustering for block-granular search
2
+
3
+ ## Context
4
+
5
+ **Current state**: xindex indexes one vector per file. The `id` is the file path, keywords are extracted from the entire file content, and search returns file-level matches. This is too coarse — a 500-line file with mixed concerns returns as a single hit with no indication of *where* in the file the match is.
6
+
7
+ **User's idea**: split files into semantically coherent blocks (clusters of lines), then index each block separately so search returns `file:fromLine-toLine` references.
8
+
9
+ **Approach — extend existing pipeline with recursive bisection**:
10
+ 1. Keep existing file-level indexing intact — `indexContent(filePath, keywords, meta)` runs first, unchanged
11
+ 2. After file-level index: split file content into lines
12
+ 3. Bisect into 2 halves → extract keywords for each → embed → compute cosine similarity (dot product of normalized vectors)
13
+ 4. If similarity is high (≥ 0.6) → cohesive, no clustering needed. If low → 2 separate clusters.
14
+ 5. Recurse: split each cluster into 2 again, test overlap, stop when clusters are cohesive or hit limits (max depth 4 → up to 16 clusters, min 5 lines per cluster)
15
+ 6. If only 1 cluster (whole file is cohesive) → skip clustering, file-level entry is enough
16
+ 7. If 2+ clusters → index each in persistent Vectra as `<file>:<fromLine>-<toLine>` alongside the file-level entry
17
+ 8. Write a manifest at `<file>::manifest` tracking cluster IDs for cleanup on re-index
18
+ 9. Both file-level and cluster-level entries coexist — search may return both
19
+
20
+ **Key files (change targets)**:
21
+ - `componets/index/indexFileContent.ts` — **main change site**: currently calls `indexContent(id, keywords, meta)` once per file. Will call `clusterLines` then loop over clusters.
22
+ - `componets/index/handleFileEvent.ts` — calls `removeContent(path)` on file change. Must delete all clusters for a file, not just one ID.
23
+ - `componets/index/indexContent.ts` — low-level: embeds + upserts one item. No logic change — called once per cluster; only the `meta` param type widens from `IIndexMeta` to `IIndexMeta | IClusterMeta` (Step 2.6).
24
+ - `componets/index/removeContent.ts` — low-level: deletes one item. No change needed — called per cluster ID.
25
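+ - `componets/index/searchContentIndex.ts` — returns `IIndexRecord{score, id, meta}`. No query change needed — `id` becomes `file:1-27` naturally; it only has to narrow `IStoreEntry` by `type` when reading results from the object store (Step 2.7).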
26
+ - `componets/index/indexMeta.ts` — `IIndexMeta{keywords, id}`. Add `type` tag, add `IClusterMeta`, `IFileManifest` using `IType<>` tagged union.
27
+ - `componets/index/objectStore.ts` — stores `IIndexMeta` as JSON, keyed by MD5(id). Needs a manifest entry per file to track cluster IDs.
28
+ - `componets/index/contentIndexDriver.ts` — wires components together. Must construct `ClusterLines`, `IndexFileContent`, `RemoveFileContent` inside. Currently `IndexFileContent` is constructed by callers.
29
+ - `componets/buildComponents.ts` — top-level builder. Must return `indexFileContent` + `removeFileContent` from driver.
30
+ - `apps/indexApp.ts` — bulk indexer. Calls `indexFileContent` directly via stream (no `HandleFileEvent`). Needs `removeFileContent` for cleanup.
31
+ - `apps/run.index.ts`, `apps/run.watch.ts`, `apps/run.mcp.ts` — entry points. Currently construct `IndexFileContent` manually. Will use driver-provided version.
32
+ - `componets/index/vectraIndex.ts` — creates `LocalIndex(path)`. No change.
33
+ - `componets/llm/embed.ts` — MiniLM-L6 embeddings, returns `number[]`. No change.
34
+ - `rnd/test-vectra-memory.ts` — proved VirtualFileStorage works for in-memory cosine queries. Reference only, not a change target.
35
+
36
+ **Raw notes**: recursive split → 2 → 4 → 8 → 16 hard stop. Overlap by keywords via embedding cosine similarity. Final clusters get indexed in persistent store with line references. MCP query returns lines.
37
+
38
+ ## Goal
39
+
40
+ Extend the existing indexing pipeline with a `ClusterLines` component (HOF pattern) that takes file content, splits it into semantically coherent line clusters using recursive bisection with embedding cosine similarity, and returns cluster descriptors `{fromLine, toLine, content, keywords}[]`. The existing file-level index stays intact — clustering adds block-level entries alongside it.
41
+
42
+ ## Diagram
43
+
44
+ ```
45
+ handleFileEvent (file change/add)
46
+
47
+ ├── removeFileContent(path) ◄── clean ALL old data first
48
+ │ ├── removeContent(path) delete file-level vectra + meta
49
+ │ └── read manifest(path::manifest) if exists:
50
+ │ ├── removeContent(path:1-10) delete each cluster
51
+ │ ├── removeContent(path:11-25)
52
+ │ └── objectStore.remove(manifest)
53
+
54
+ └── indexFileContent(path, text) ◄── create ALL new data
55
+
56
+ ├── EXISTING: file-level index (unchanged)
57
+ │ extractKeywords + cleanUpKeywords(text)
58
+ │ indexContent(path, keywords, {keywords, id: path})
59
+ │ ├── embed(keywords) → vector
60
+ │ ├── vectra.upsertItem({id: path, vector})
61
+ │ └── objectStore.write(path, meta)
62
+
63
+ ├── NEW: cluster-level index (extension)
64
+ │ clusterLines(lines, path)
65
+ │ │
66
+ │ ▼
67
+ │ ┌─────────────────┐
68
+ │ │ Split in half │
69
+ │ │ lines[0..n/2] │
70
+ │ │ lines[n/2..n] │
71
+ │ └────────┬────────┘
72
+ │ │
73
+ │ ▼
74
+ │ ┌──────────────────────────┐
75
+ │ │ Extract keywords each │
76
+ │ │ Embed keywords → vec │
77
+ │ │ cosine(vecA, vecB) │
78
+ │ └────────┬─────────────────┘
79
+ │ │
80
+ │ sim ≥ 0.6? ──yes──► 1 cluster (leaf)
81
+ │ │
82
+ │ no → recurse each half ◄── depth ≤ 4, min 5 lines
83
+ │ │
84
+ │ ▼
85
+ │ clusters[] = {fromLine, toLine, content, keywords}[]
86
+
87
+ │ clusters.length ≤ 1? → SKIP (file entry is enough)
88
+
89
+ │ clusters.length > 1? → for each cluster:
90
+ │ indexContent(id="path:12-45", cluster.keywords, clusterMeta)
91
+
92
+ └── objectStore.write(path::manifest, {clusterIds})
93
+
94
+ Three key types in store:
95
+ path → file-level entry (vectra + objectStore)
96
+ path:1-10 → cluster entry (vectra + objectStore)
97
+ path::manifest → {type:"manifest", clusterIds} (objectStore only)
98
+ ```
99
+
100
+ ## Steps
101
+
102
+ ### 1. ClusterLines component — NEW `componets/index/clusterLines.ts`
103
+ 1. **Cosine helper** — `cosine(a: number[], b: number[]): number` — dot product of two normalized vectors. Pure function, no deps.
104
+ 2. **HOF factory** — `ClusterLines({embed, extractKeywords, cleanUpKeywords, threshold, minLines, maxDepth})` returns `IClusterLines(lines: string[], file: string) → Promise<ILineCluster[]>`. Defaults: threshold=0.6, minLines=5, maxDepth=4.
105
+ 3. **ILineCluster type** — `{fromLine: number, toLine: number, content: string, keywords: string}`. `fromLine`/`toLine` are 1-based line numbers.
106
+ 4. **Recursive bisection** — split lines at midpoint → join each half → `extractKeywords` + `cleanUpKeywords` → `embed` each → `cosine(vecA, vecB)`. If sim ≥ threshold → leaf cluster. If sim < threshold → recurse on each half.
107
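+ 5. **Guards** — `lines.length ≤ minLines` or `depth ≥ maxDepth` → leaf. Empty input (no lines, e.g. an empty file) → return `[]`; check this before the `minLines` guard, which an empty input would otherwise satisfy. Either half has no keywords → leaf.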
108
+
109
+ ### 2. Extend metadata — MODIFY `componets/index/indexMeta.ts` + `objectStore.ts`
110
+ 1. **Tag IIndexMeta** — add `type: "meta"` field using `IType<>` pattern: `IType<{type: "meta", keywords: string, id: string}>`. Breaking change — all constructors must add `type: "meta"`.
111
+ 2. **Add IClusterMeta type** — `IType<{type: "cluster", keywords: string, id: string, fromLine: number, toLine: number}>`. Cluster-level entries with line ranges.
112
+ 3. **Add IFileManifest type** — `IType<{type: "manifest", id: string, clusterIds: string[]}>`. Stored at key `filePath::manifest` in object store.
113
+ 4. **IStoreEntry union** — `IIndexMeta | IClusterMeta | IFileManifest`. Discriminated by `type` field.
114
+ 5. **Widen objectStore types** — `IObjectStore.write`/`read` accept/return `IStoreEntry`.
115
+ 6. **Update indexContent.ts** — widen `meta` param from `IIndexMeta` to `IIndexMeta | IClusterMeta`.
116
+ 7. **Update searchContentIndex.ts** — narrow `IStoreEntry` by `type` when reading results from object store.
117
+
118
+ ### 3. RemoveFileContent — NEW `componets/index/removeFileContent.ts`
119
+ 1. **HOF factory** — `RemoveFileContent({removeContent, objectStore})` returns `IRemoveFileContent(filePath: string) => Promise<void>`.
120
+ 2. **Deletes all layers** — (a) `removeContent(filePath)` to delete file-level vectra item + meta. (b) Read manifest at `filePath::manifest` → if exists, `removeContent(clusterId)` for each → `objectStore.remove(manifestKey)`. (c) All deletes wrapped in try/catch — missing entries are fine (first-time index, no clusters).
121
+
122
+ ### 4. Update indexFileContent — MODIFY `componets/index/indexFileContent.ts`
123
+ 1. **Add deps** — `{extractKeywords, cleanUpKeywords, indexContent, clusterLines, objectStore}`. Existing deps stay — file-level index needs `extractKeywords`/`cleanUpKeywords`.
124
+ 2. **File-level index (EXISTING, now tagged)** — `extractKeywords(content)` → `cleanUpKeywords` → `indexContent(id, keywords, {type: "meta", keywords, id})`. Runs first, always.
125
+ 3. **Cluster-level index (NEW, extension)** — `content.split("\n")` → `clusterLines(lines, id)` → if `clusters.length ≤ 1` → skip (file is cohesive). If `clusters.length > 1` → for each cluster: `indexContent(\`${id}:${fromLine}-${toLine}\`, cluster.keywords, {type: "cluster", ...})`.
126
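+ 4. **Write manifest** — only when cluster entries were created (`clusters.length > 1`): after all clusters are indexed, `objectStore.write(id + "::manifest", {type: "manifest", id, clusterIds})`. Cohesive files get no manifest.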
127
+
128
+ ### 5. Wire through driver + builder — MODIFY `contentIndexDriver.ts` + `buildComponents.ts`
129
+ 1. **contentIndexDriver.ts** — instantiate `ClusterLines({embed, extractKeywords, cleanUpKeywords})`. Construct `IndexFileContent({extractKeywords, cleanUpKeywords, indexContent, clusterLines, objectStore})` inside driver (currently constructed by callers). Construct `RemoveFileContent({removeContent, objectStore})`. Add `indexFileContent` + `removeFileContent` to `IContentIndexDriver`.
130
+ 2. **buildComponents.ts** — destructure `indexFileContent` + `removeFileContent` from `ContentIndexDriver`. Return them. Callers no longer construct `IndexFileContent` themselves.
131
+
132
+ ### 6. Update callers — MODIFY `run.*.ts` + `handleFileEvent.ts` + `indexApp.ts`
133
+ 1. **run.index.ts, run.watch.ts, run.mcp.ts** — remove `IndexFileContent(...)` construction. Get `indexFileContent` + `removeFileContent` from `BuildComponents()`.
134
+ 2. **handleFileEvent.ts** — replace `removeContent` dep with `removeFileContent`. On `FileEventType.index`: `removeFileContent(path)` first (clean old data), then `indexFileContent(path, text)` (creates file entry + cluster entries). On `FileEventType.remove`: `removeFileContent(path)`.
135
+ 3. **indexApp.ts** — currently calls `indexFileContent(id, text)` directly via stream pipeline (no `HandleFileEvent`). Add `removeFileContent` dep. Call `removeFileContent(id)` before `indexFileContent(id, text)` in the `map` callback — otherwise old clusters linger when cluster boundaries change on re-index. Update `IndexApp({walkFiles, indexFileContent, removeFileContent, log})`.
136
+ 4. **Import paths** — `run.index.ts` imports `IndexFileContent` from `componets/index/indexFileContent.js`. After moving construction inside driver, remove this import. Same for `run.watch.ts` and `run.mcp.ts`.
137
+
138
+ ### 7. Test end-to-end
139
+ 1. **Unit test clusterLines** — feed a file with 2 distinct sections (imports+types vs. implementation), verify ≥2 clusters with correct 1-based line ranges.
140
+ 2. **Integration test** — index a multi-concern file, query for a specific concept, verify search returns both `file.ts` (file-level) and `file.ts:12-45` (cluster-level).
141
+ 3. **Re-index test** — modify file, re-index, verify old clusters deleted + new ones created.
142
+ 4. **Cohesive file test** — index a small/uniform file, verify only file-level entry exists (no clusters, no manifest).
143
+
144
+ ## Decisions
145
+
146
+ - **Extend, don't replace** — existing file-level indexing stays intact. Clustering is an additional step that runs after. Both levels coexist in the index.
147
+ - **1 cluster = skip** — if the file is cohesive (clustering returns 1 cluster = whole file), no cluster entries are created. File-level entry is enough.
148
+ - **Embedding cosine similarity** for bisection (not Jaccard). Jaccard only matches exact keyword strings — `fetchUser` and `getUser` would score 0% overlap despite being the same concern. Embeddings capture meaning. Cost is acceptable: MiniLM-L6 is local, ~30 embed calls per file at max depth, ~50-100ms total.
149
+ - **Cosine computation**: Option A — direct dot product (3-line helper, vectors already normalized). Fallback to Option B (in-memory Vectra via `VirtualFileStorage`) if direct cosine proves insufficient.
150
+ - **Similarity threshold**: candidate range is 0.55–0.70; use 0.6 as the default and tune empirically.
151
+ - **Min cluster size**: 3–5 lines. Use 5 as default, configurable.
152
+ - **Three tagged types in store** (using `IType<>` pattern): `IIndexMeta{type:"meta"}` at `filePath`, `IClusterMeta{type:"cluster"}` at `filePath:fromLine-toLine`, `IFileManifest{type:"manifest"}` at `filePath::manifest`. All separate keys, discriminated by `type`.
153
+ - **Cleanup on re-index**: `removeFileContent` deletes file-level entry, then reads manifest to delete all cluster entries, then deletes manifest itself. Graceful on missing data.
154
+ - **Move IndexFileContent inside driver** — currently constructed by callers in `run.*.ts`. Moving inside `contentIndexDriver.ts` consolidates wiring since the driver already has all deps.
155
+
156
+ ## Research: existing NPM packages
157
+
158
+ - **semantic-chunking** (jparkerweb, v2.4.4) — splits text into sentences, embeds each with ONNX model, groups by cosine similarity. Sentence-level, not line-level. Uses its own ONNX pipeline, not BYOE.
159
+ - **semantic-chunker** (johnhenry) — BYOE approach, bring your own embedding function. More flexible. Could plug in our MiniLM-L6 embed.
160
+ - **LangChain RecursiveCharacterTextSplitter** — recursive splitting by character/token boundaries, not semantic. 2026 benchmarks show 512-token recursive splitting at 69% accuracy — good baseline but not meaning-aware.
161
+ - **NAACL 2025 finding**: fixed 200-word chunks match or beat semantic chunking for general RAG. But for *code* with mixed concerns in one file, semantic splitting should outperform fixed-size.
162
+ - **Verdict**: existing packages target prose (sentence-level). Our use case is code (line-level, preserve line boundaries for references). Custom recursive bisection with our existing embed pipeline is the right call — simpler than adapting a prose chunker to respect line boundaries.
163
+
164
+ ## Edge Cases
165
+
166
+ - **Small files (≤ 5 lines)** — return as single cluster, no splitting attempted.
167
+ - **Empty files** — file-level entry is still indexed (existing pipeline runs first). Clustering returns `[]`, no cluster entries created.
168
+ - **Files with uniform content** (e.g., all imports) — cosine similarity stays high at every split, returns 1 cluster. Expected behavior.
169
+ - **Binary/non-text files** — already filtered upstream by the file walker. Not a concern here.
170
+ - **Legacy index data** — files indexed before this change won't have manifests. On re-index, no old clusters to delete — just index fresh.
171
+
172
+ ## Open Questions
173
+
174
+ - **Keyword extraction quality**: current keywords come from compromise NLP + keyword-extractor. May need tuning for code (variable names, imports, function signatures).
175
+ - **Threshold tuning**: need to test 0.55 vs 0.60 vs 0.70 on real project files to find the sweet spot.
176
+ - ~~**Object store dual use**~~ — resolved: three tagged types (`IIndexMeta`, `IClusterMeta`, `IFileManifest`) discriminated by `type` field, stored at separate keys. Union `IStoreEntry = IIndexMeta | IClusterMeta | IFileManifest`.
@@ -0,0 +1,7 @@
1
+ ### 2026-04-10 — Task created
2
+
3
+ - Scouted: vectra currently stores vector + IIndexMeta (keywords, file) together
4
+ - User wants to separate: vectra for vectors only, .xindex/objects/ for meta JSON
5
+ - Hash-based path: md5(id) → xx/yy/xxyyzz.json
6
+ - Need to update indexContent, searchContentIndex, resetIndex, contentIndexDriver
7
+ - New components: objectStore (read/write/clear), indexStructure (manage .xindex/ dirs)
@@ -0,0 +1,81 @@
1
+ # Task: Object Store — Separate Meta Storage from Vectra
2
+
3
+ ## Context
4
+
5
+ Currently vectra stores both vectors AND metadata (`{keywords, file}`) in the same index. Vectra is good for semantic search, not for storage. Goal: split storage into two layers:
6
+
7
+ - **`.xindex/semantic/`** — vectra stores only vectors + id (for search)
8
+ - **`.xindex/objects/`** — file-based JSON store for meta objects (for storage/retrieval)
9
+
10
+ **Current state:**
11
+ - `indexContent.ts` — embeds content, upserts `{id, vector, metadata: IIndexMeta}` into vectra
12
+ - `searchContentIndex.ts` — queries vectra, reads `r.item.metadata as IIndexMeta`
13
+ - `resetIndex.ts` — `deleteIndex()` + `createIndex()` on vectra only
14
+ - `IIndexMeta = {keywords: string, file: string}`
15
+ - Index path: `.xindex` (single vectra folder)
16
+
17
+ **New structure:**
18
+ ```
19
+ .xindex/
20
+ ├── semantic/ ← vectra (vectors + id only, minimal meta)
21
+ └── objects/ ← JSON files keyed by hash of id
22
+ └── xx/
23
+ └── yy/
24
+ └── xxyyzz.json ← {keywords, file, ...}
25
+ ```
26
+
27
+ ## Goal
28
+
29
+ Introduce an object store layer that writes `IIndexMeta` as JSON files in `.xindex/objects/`, remove metadata from vectra (keep only vector + id), and decorate `indexContent` and `searchContentIndex` to read/write both layers.
30
+
31
+ ## Diagram
32
+
33
+ ```
34
+ INDEX PIPELINE:
35
+ file → extractKeywords → cleanUp → keywords
36
+
37
+ ├── [1] embed(keywords) → vector
38
+ │ └── vectra.upsert({id, vector}) → .xindex/semantic/
39
+
40
+ └── [2] objectStore.write(id, meta) → .xindex/objects/xx/yy/xxyyzz.json
41
+ {keywords, file}
42
+
43
+ SEARCH PIPELINE:
44
+ query → extractKeywords → cleanUp → embed → vector
45
+
46
+ ├── [1] vectra.query(vector, limit) → [{score, id}]
47
+
48
+ └── [2] objectStore.read(id) → IIndexMeta
49
+
50
+ [{score, id, meta}]
51
+
52
+ RESET:
53
+ [1] vectra.deleteIndex + createIndex → .xindex/semantic/ wiped
54
+ [2] rm -rf .xindex/objects/ → objects wiped
55
+ ```
56
+
57
+ ## Steps
58
+
59
+ ### 1. Object Store HOF
60
+ - Create `componets/index/objectStore.ts` — `ObjectStore({basePath}): IObjectStore`
61
+ - `write(id, meta)` — hash id (md5 → hex), split into `xx/yy/xxyyzz`, `mkdir -p`, write JSON
62
+ - `read(id)` — hash id, read JSON, parse as `IIndexMeta`
63
+ - `clear()` — rm -rf basePath, recreate empty dir
64
+
65
+ ### 2. Update Index Structure
66
+ - Create `componets/index/indexStructure.ts` — `IndexStructure({basePath}): IIndexStructure`
67
+ - Manages `.xindex/` top-level: ensures `semantic/` and `objects/` dirs exist
68
+ - Returns paths: `{semanticPath, objectsPath}`
69
+ - Used by `contentIndexDriver` at init
70
+
71
+ ### 3. Decorate Index/Search
72
+ - Update `IndexContent` — upsert vector+id to vectra (no meta), write meta to objectStore
73
+ - Update `SearchContentIndex` — query vectra for `{score, id}[]`, then `objectStore.read(id)` for each result to attach meta
74
+ - Update `ResetIndex` — call both `vectra.deleteIndex/createIndex` and `objectStore.clear()`
75
+ - Update `ContentIndexDriver` — pass `semanticPath` to `VectraIndex`, create `ObjectStore({basePath: objectsPath})`
76
+
77
+ ## Open Questions
78
+
79
+ - Hash function: `crypto.createHash('md5')` from Node built-in — fast enough, no deps. Or use a simpler hash?
80
+ - Should objectStore support partial updates (upsert) or always overwrite?
81
+ - Should search batch-read objects or read one by one per result?
@@ -0,0 +1,46 @@
1
+ ### 2026-04-10
2
+
3
+ - Task created from user notes
4
+ - Scouted codebase via xindex search (indexed 167 files)
5
+ - `.xindex.json` exists but is empty (`{}`); there is no config loading anywhere
6
+ - `CleanUpKeywords` at `componets/keywords/cleanUpKeywords.ts:8` — HOF takes `{maxNgrams, minLength}`. Add `ignoreKeywords` here.
7
+ - `SearchContentIndex` at `componets/index/searchContentIndex.ts:12` — search pipeline, uses `cleanUpKeywords` on query. Ignore list propagates automatically.
8
+ - `IClusterMeta` at `componets/index/indexMeta.ts:11` — has `fromLine`/`toLine` for reading snippet lines
9
+ - MCP tool at `apps/mcpApp.ts:34` — `xindex_search` schema has `{query, limit}` only. Add snippet params.
10
+ - CLI at `apps/run.search.ts:23-31` — formats results with score + keywords, no snippets
11
+ - `BuildComponents` at `componets/buildComponents.ts:6` — wires everything, no config loading. Config loads here.
12
+ - `ContentIndexDriver` at `componets/index/contentIndexDriver.ts:27` — passes `cleanUpKeywords` to `ClusterLines` and `SearchContentIndex`
13
+ - Entry points: `apps/run.mcp.ts:19` (MCP), `apps/run.search.ts:8` (CLI) — both call `BuildComponents()`
14
+ - User wants explicit config names: `ignoreKeywords`, `snippetLines`, `snippetResults`
15
+
16
+ **Clarification round — decisions:**
17
+ - Defaults confirmed: `snippetResults: 3`, `snippetLines: 7`
18
+ - `ignoreKeywords`: exact strings, case-insensitive. No globs/patterns.
19
+ - Ignore at **index time** — a re-index + MCP restart after a config change is acceptable. One-time setup; review in 3 months.
20
+ - File-level results (whole file, no cluster) also get snippets if file total lines ≤ `snippetLines`
21
+ - Task finalized
22
+
23
+ **Round 2 — user feedback during detail expansion:**
24
+ - Renamed `snippetLines` → `maxSnippetLines`, `snippetResults` → `maxSnippetResults` (user preference for explicit names)
25
+ - Added `ignoreFiles` feature: gitignore-style glob patterns in `.xindex.json` to exclude files from indexing. Reuses existing `ignore` package already in `walkFiles.ts:3` and `watchFiles.ts`
26
+ - Expanded task from 3x3 to 4x3 to accommodate file ignore list as separate step
27
+ - Traced all WalkFiles/WatchFiles consumers: `run.mcp.ts`, `run.index.ts`, `run.watch.ts` — all need `ignoreFiles` plumbed
28
+ - Task ready for implementation
29
+
30
+ **Round 3 — consistency check (7 findings, all fixed):**
31
+ - [Missing] `.xindex.json` is optional — added to Decisions + diagram label
32
+ - [Drift] Diagram only showed WalkFiles — added WatchFiles
33
+ - [Mismatch] Step 2.3 duplicated validation from 1.2 — removed 2.3, kept in 1.2 only
34
+ - [Mismatch] `console.warn` in LoadConfig violates project `ILogger` pattern — added `log: ILogger` dep to LoadConfig and BuildComponents
35
+ - [Drift] Files Changed table had tentative "(if it creates its own)" for run.index.ts — made definitive
36
+ - [Inconsistency] Step 4.2 parsed fromLine/toLine from ID string — uses `meta.fromLine`/`meta.toLine` directly now
37
+ - [Missing] Step 1.3 vague "WalkFiles consumers" — listed all 5 specific construction sites (run.mcp.ts:18,30, run.index.ts:10, run.watch.ts:13,14)
38
+
39
+ **Round 4 — implementation:**
40
+ - Implemented all 12 files (3 new, 9 modified) + run.reset.ts (missed in plan, also calls BuildComponents)
41
+ - Phase 1: config type + loadConfig HOF
42
+ - Phase 2: cleanUpKeywords ignoreSet, walkFiles + watchFiles ignoreFiles
43
+ - Phase 3: readSnippet HOF
44
+ - Phase 4: buildComponents wiring ({log} param, config loading, return config)
45
+ - Phase 5: all entry points updated (run.mcp, run.search, run.index, run.watch, run.reset)
46
+ - Verified: keyword ignore filters noisy words, file ignore excludes rnd/**, snippets show for small results (top 3, ≤7 lines)