mdcontext 0.0.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (337) hide show
  1. package/.changeset/README.md +28 -0
  2. package/.changeset/config.json +11 -0
  3. package/.claude/settings.local.json +25 -0
  4. package/.github/workflows/ci.yml +83 -0
  5. package/.github/workflows/claude-code-review.yml +44 -0
  6. package/.github/workflows/claude.yml +85 -0
  7. package/.github/workflows/release.yml +113 -0
  8. package/.tldrignore +112 -0
  9. package/BACKLOG.md +338 -0
  10. package/CONTRIBUTING.md +186 -0
  11. package/NOTES/NOTES +44 -0
  12. package/README.md +434 -11
  13. package/biome.json +36 -0
  14. package/cspell.config.yaml +14 -0
  15. package/dist/chunk-23UPXDNL.js +3044 -0
  16. package/dist/chunk-2W7MO2DL.js +1366 -0
  17. package/dist/chunk-3NUAZGMA.js +1689 -0
  18. package/dist/chunk-7TOWB2XB.js +366 -0
  19. package/dist/chunk-7XOTOADQ.js +3065 -0
  20. package/dist/chunk-AH2PDM2K.js +3042 -0
  21. package/dist/chunk-BNXWSZ63.js +3742 -0
  22. package/dist/chunk-BTL5DJVU.js +3222 -0
  23. package/dist/chunk-HDHYG7E4.js +104 -0
  24. package/dist/chunk-HLR4KZBP.js +3234 -0
  25. package/dist/chunk-IP3FRFEB.js +1045 -0
  26. package/dist/chunk-KHU56VDO.js +3042 -0
  27. package/dist/chunk-KRYIFLQR.js +88 -0
  28. package/dist/chunk-LBSDNLEM.js +287 -0
  29. package/dist/chunk-MNTQ7HCP.js +2643 -0
  30. package/dist/chunk-MUJELQQ6.js +1387 -0
  31. package/dist/chunk-MXJGMSLV.js +2199 -0
  32. package/dist/chunk-N6QJGC3Z.js +2636 -0
  33. package/dist/chunk-OBELGBPM.js +1713 -0
  34. package/dist/chunk-OT7R5XTA.js +3192 -0
  35. package/dist/chunk-P7X4RA2T.js +106 -0
  36. package/dist/chunk-PIDUQNC2.js +3185 -0
  37. package/dist/chunk-POGCDIH4.js +3187 -0
  38. package/dist/chunk-PSIEOQGZ.js +3043 -0
  39. package/dist/chunk-PVRT3IHA.js +3238 -0
  40. package/dist/chunk-QNN4TT23.js +1430 -0
  41. package/dist/chunk-RE3R45RJ.js +3042 -0
  42. package/dist/chunk-S7E6TFX6.js +803 -0
  43. package/dist/chunk-SG6GLU4U.js +1378 -0
  44. package/dist/chunk-SJCDV2ST.js +274 -0
  45. package/dist/chunk-SYE5XLF3.js +104 -0
  46. package/dist/chunk-T5VLYBZD.js +103 -0
  47. package/dist/chunk-TOQB7VWU.js +3238 -0
  48. package/dist/chunk-VFNMZ4ZQ.js +3228 -0
  49. package/dist/chunk-VVTGZNBT.js +1629 -0
  50. package/dist/chunk-W7Q4RFEV.js +104 -0
  51. package/dist/chunk-XTYYVRLO.js +3190 -0
  52. package/dist/chunk-Y6MDYVJD.js +3063 -0
  53. package/dist/cli/main.d.ts +1 -0
  54. package/dist/cli/main.js +5458 -0
  55. package/dist/index.d.ts +653 -0
  56. package/dist/index.js +79 -0
  57. package/dist/mcp/server.d.ts +1 -0
  58. package/dist/mcp/server.js +472 -0
  59. package/dist/schema-BAWSG7KY.js +22 -0
  60. package/dist/schema-E3QUPL26.js +20 -0
  61. package/dist/schema-EHL7WUT6.js +20 -0
  62. package/docs/019-USAGE.md +625 -0
  63. package/docs/020-current-implementation.md +364 -0
  64. package/docs/021-DOGFOODING-FINDINGS.md +175 -0
  65. package/docs/BACKLOG.md +80 -0
  66. package/docs/CONFIG.md +1123 -0
  67. package/docs/DESIGN.md +439 -0
  68. package/docs/ERRORS.md +383 -0
  69. package/docs/PROJECT.md +88 -0
  70. package/docs/ROADMAP.md +407 -0
  71. package/docs/summarization.md +320 -0
  72. package/docs/test-links.md +9 -0
  73. package/justfile +40 -0
  74. package/package.json +74 -9
  75. package/pnpm-workspace.yaml +5 -0
  76. package/research/INDEX.md +315 -0
  77. package/research/code-review/README.md +90 -0
  78. package/research/code-review/cli-error-handling-review.md +979 -0
  79. package/research/code-review/code-review-validation-report.md +464 -0
  80. package/research/code-review/main-ts-review.md +1128 -0
  81. package/research/config-analysis/01-current-implementation.md +470 -0
  82. package/research/config-analysis/02-strategy-recommendation.md +428 -0
  83. package/research/config-analysis/03-task-candidates.md +715 -0
  84. package/research/config-analysis/033-research-configuration-management.md +828 -0
  85. package/research/config-analysis/034-research-effect-cli-config.md +1504 -0
  86. package/research/config-analysis/04-consolidated-task-candidates.md +277 -0
  87. package/research/config-docs/SUMMARY.md +357 -0
  88. package/research/config-docs/TEST-RESULTS.md +776 -0
  89. package/research/config-docs/TODO.md +542 -0
  90. package/research/config-docs/analysis.md +744 -0
  91. package/research/config-docs/fix-validation.md +502 -0
  92. package/research/config-docs/help-audit.md +264 -0
  93. package/research/config-docs/help-system-analysis.md +890 -0
  94. package/research/dogfood/consolidated-tool-evaluation.md +373 -0
  95. package/research/dogfood/strategy-a/a-synthesis.md +184 -0
  96. package/research/dogfood/strategy-a/a1-docs.md +226 -0
  97. package/research/dogfood/strategy-a/a2-amorphic.md +156 -0
  98. package/research/dogfood/strategy-a/a3-llm.md +164 -0
  99. package/research/dogfood/strategy-b/b-synthesis.md +228 -0
  100. package/research/dogfood/strategy-b/b1-architecture.md +207 -0
  101. package/research/dogfood/strategy-b/b2-gaps.md +258 -0
  102. package/research/dogfood/strategy-b/b3-workflows.md +250 -0
  103. package/research/dogfood/strategy-c/c-synthesis.md +451 -0
  104. package/research/dogfood/strategy-c/c1-explorer.md +192 -0
  105. package/research/dogfood/strategy-c/c2-diver-memory.md +145 -0
  106. package/research/dogfood/strategy-c/c3-diver-control.md +148 -0
  107. package/research/dogfood/strategy-c/c4-diver-failure.md +151 -0
  108. package/research/dogfood/strategy-c/c5-diver-execution.md +221 -0
  109. package/research/dogfood/strategy-c/c6-diver-org.md +221 -0
  110. package/research/effect-cli-error-handling.md +845 -0
  111. package/research/effect-errors-as-values.md +943 -0
  112. package/research/errors-task-analysis/00-consolidated-tasks.md +207 -0
  113. package/research/errors-task-analysis/cli-commands-analysis.md +909 -0
  114. package/research/errors-task-analysis/embeddings-analysis.md +709 -0
  115. package/research/errors-task-analysis/index-search-analysis.md +812 -0
  116. package/research/frontmatter/COMMENTS-ARE-SKIPPED.md +149 -0
  117. package/research/frontmatter/LLM-CODE-NAVIGATION.md +276 -0
  118. package/research/issue-review.md +603 -0
  119. package/research/llm-summarization/agent-cli-tools-2026.md +1082 -0
  120. package/research/llm-summarization/alternative-providers-2026.md +1428 -0
  121. package/research/llm-summarization/anthropic-2026.md +367 -0
  122. package/research/llm-summarization/claude-cli-integration.md +1706 -0
  123. package/research/llm-summarization/cli-integration-patterns.md +3155 -0
  124. package/research/llm-summarization/openai-2026.md +473 -0
  125. package/research/llm-summarization/openai-compatible-providers-2026.md +1022 -0
  126. package/research/llm-summarization/opencode-cli-integration.md +1552 -0
  127. package/research/llm-summarization/prompt-engineering-2026.md +1426 -0
  128. package/research/llm-summarization/prototype-results.md +56 -0
  129. package/research/llm-summarization/provider-switching-patterns-2026.md +2153 -0
  130. package/research/llm-summarization/typescript-llm-libraries-2026.md +2436 -0
  131. package/research/mdcontext-error-analysis.md +521 -0
  132. package/research/mdcontext-pudding/00-EXECUTIVE-SUMMARY.md +282 -0
  133. package/research/mdcontext-pudding/01-index-embed.md +956 -0
  134. package/research/mdcontext-pudding/02-search-COMMANDS.md +142 -0
  135. package/research/mdcontext-pudding/02-search-SUMMARY.md +146 -0
  136. package/research/mdcontext-pudding/02-search.md +970 -0
  137. package/research/mdcontext-pudding/03-context.md +779 -0
  138. package/research/mdcontext-pudding/04-navigation-and-analytics.md +803 -0
  139. package/research/mdcontext-pudding/04-tree.md +704 -0
  140. package/research/mdcontext-pudding/05-config.md +1038 -0
  141. package/research/mdcontext-pudding/06-links-summary.txt +87 -0
  142. package/research/mdcontext-pudding/06-links.md +679 -0
  143. package/research/mdcontext-pudding/07-stats.md +693 -0
  144. package/research/mdcontext-pudding/BUG-FIX-PLAN.md +388 -0
  145. package/research/mdcontext-pudding/P0-BUG-VALIDATION.md +167 -0
  146. package/research/mdcontext-pudding/README.md +168 -0
  147. package/research/mdcontext-pudding/TESTING-SUMMARY.md +128 -0
  148. package/research/npm_publish/011-npm-workflow-research-agent2.md +792 -0
  149. package/research/npm_publish/012-npm-workflow-research-agent1.md +530 -0
  150. package/research/npm_publish/013-npm-workflow-research-agent3.md +722 -0
  151. package/research/npm_publish/014-npm-workflow-synthesis.md +556 -0
  152. package/research/npm_publish/031-npm-workflow-task-analysis.md +134 -0
  153. package/research/research-quality-review.md +834 -0
  154. package/research/semantic-search/002-research-embedding-models.md +490 -0
  155. package/research/semantic-search/003-research-rag-alternatives.md +523 -0
  156. package/research/semantic-search/004-research-vector-search.md +841 -0
  157. package/research/semantic-search/032-research-semantic-search.md +427 -0
  158. package/research/semantic-search/embedding-text-analysis.md +156 -0
  159. package/research/semantic-search/multi-word-failure-reproduction.md +171 -0
  160. package/research/semantic-search/query-processing-analysis.md +207 -0
  161. package/research/semantic-search/root-cause-and-solution.md +114 -0
  162. package/research/semantic-search/threshold-validation-report.md +69 -0
  163. package/research/semantic-search/vector-search-analysis.md +63 -0
  164. package/research/task-management-2026/00-synthesis-recommendations.md +295 -0
  165. package/research/task-management-2026/01-ai-workflow-tools.md +416 -0
  166. package/research/task-management-2026/02-agent-framework-patterns.md +476 -0
  167. package/research/task-management-2026/03-lightweight-file-based.md +567 -0
  168. package/research/task-management-2026/04-established-tools-ai-features.md +541 -0
  169. package/research/task-management-2026/linear/01-core-features-workflow.md +771 -0
  170. package/research/task-management-2026/linear/02-api-integrations.md +930 -0
  171. package/research/task-management-2026/linear/03-ai-features.md +368 -0
  172. package/research/task-management-2026/linear/04-pricing-setup.md +205 -0
  173. package/research/task-management-2026/linear/05-usage-patterns-best-practices.md +605 -0
  174. package/research/test-path-issues.md +276 -0
  175. package/review/ALP-76/1-error-type-design.md +962 -0
  176. package/review/ALP-76/2-error-handling-patterns.md +906 -0
  177. package/review/ALP-76/3-error-presentation.md +624 -0
  178. package/review/ALP-76/4-test-coverage.md +625 -0
  179. package/review/ALP-76/5-migration-completeness.md +440 -0
  180. package/review/ALP-76/6-effect-best-practices.md +755 -0
  181. package/scripts/apply-branch-protection.sh +47 -0
  182. package/scripts/branch-protection-templates.json +79 -0
  183. package/scripts/prototype-summarization.ts +346 -0
  184. package/scripts/rebuild-hnswlib.js +58 -0
  185. package/scripts/setup-branch-protection.sh +64 -0
  186. package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/active-provider.json +7 -0
  187. package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/bm25.json +541 -0
  188. package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/bm25.meta.json +5 -0
  189. package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/config.json +8 -0
  190. package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/embeddings/openai_text-embedding-3-small_512/vectors.bin +0 -0
  191. package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/embeddings/openai_text-embedding-3-small_512/vectors.meta.bin +0 -0
  192. package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/indexes/documents.json +60 -0
  193. package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/indexes/links.json +13 -0
  194. package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/indexes/sections.json +1197 -0
  195. package/src/__tests__/fixtures/semantic-search/multi-word-corpus/configuration-management.md +99 -0
  196. package/src/__tests__/fixtures/semantic-search/multi-word-corpus/distributed-systems.md +92 -0
  197. package/src/__tests__/fixtures/semantic-search/multi-word-corpus/error-handling.md +78 -0
  198. package/src/__tests__/fixtures/semantic-search/multi-word-corpus/failure-automation.md +55 -0
  199. package/src/__tests__/fixtures/semantic-search/multi-word-corpus/job-context.md +69 -0
  200. package/src/__tests__/fixtures/semantic-search/multi-word-corpus/process-orchestration.md +99 -0
  201. package/src/cli/argv-preprocessor.test.ts +210 -0
  202. package/src/cli/argv-preprocessor.ts +202 -0
  203. package/src/cli/cli.test.ts +627 -0
  204. package/src/cli/commands/backlinks.ts +54 -0
  205. package/src/cli/commands/config-cmd.ts +642 -0
  206. package/src/cli/commands/context.ts +285 -0
  207. package/src/cli/commands/duplicates.ts +122 -0
  208. package/src/cli/commands/embeddings.ts +529 -0
  209. package/src/cli/commands/index-cmd.ts +480 -0
  210. package/src/cli/commands/index.ts +16 -0
  211. package/src/cli/commands/links.ts +52 -0
  212. package/src/cli/commands/search.ts +1281 -0
  213. package/src/cli/commands/stats.ts +149 -0
  214. package/src/cli/commands/tree.ts +128 -0
  215. package/src/cli/config-layer.ts +176 -0
  216. package/src/cli/error-handler.test.ts +235 -0
  217. package/src/cli/error-handler.ts +655 -0
  218. package/src/cli/flag-schemas.ts +341 -0
  219. package/src/cli/help.ts +588 -0
  220. package/src/cli/index.ts +9 -0
  221. package/src/cli/main.ts +435 -0
  222. package/src/cli/options.ts +41 -0
  223. package/src/cli/shared-error-handling.ts +199 -0
  224. package/src/cli/typo-suggester.test.ts +105 -0
  225. package/src/cli/typo-suggester.ts +130 -0
  226. package/src/cli/utils.ts +259 -0
  227. package/src/config/file-provider.test.ts +320 -0
  228. package/src/config/file-provider.ts +273 -0
  229. package/src/config/index.ts +72 -0
  230. package/src/config/integration.test.ts +667 -0
  231. package/src/config/precedence.test.ts +277 -0
  232. package/src/config/precedence.ts +451 -0
  233. package/src/config/schema.test.ts +414 -0
  234. package/src/config/schema.ts +603 -0
  235. package/src/config/service.test.ts +320 -0
  236. package/src/config/service.ts +243 -0
  237. package/src/config/testing.test.ts +264 -0
  238. package/src/config/testing.ts +110 -0
  239. package/src/core/index.ts +1 -0
  240. package/src/core/types.ts +113 -0
  241. package/src/duplicates/detector.test.ts +183 -0
  242. package/src/duplicates/detector.ts +414 -0
  243. package/src/duplicates/index.ts +18 -0
  244. package/src/embeddings/embedding-namespace.test.ts +300 -0
  245. package/src/embeddings/embedding-namespace.ts +947 -0
  246. package/src/embeddings/heading-boost.test.ts +222 -0
  247. package/src/embeddings/hnsw-build-options.test.ts +198 -0
  248. package/src/embeddings/hyde.test.ts +272 -0
  249. package/src/embeddings/hyde.ts +264 -0
  250. package/src/embeddings/index.ts +10 -0
  251. package/src/embeddings/openai-provider.ts +414 -0
  252. package/src/embeddings/pricing.json +22 -0
  253. package/src/embeddings/provider-constants.ts +204 -0
  254. package/src/embeddings/provider-errors.test.ts +967 -0
  255. package/src/embeddings/provider-errors.ts +565 -0
  256. package/src/embeddings/provider-factory.test.ts +240 -0
  257. package/src/embeddings/provider-factory.ts +225 -0
  258. package/src/embeddings/provider-integration.test.ts +788 -0
  259. package/src/embeddings/query-preprocessing.test.ts +187 -0
  260. package/src/embeddings/semantic-search-threshold.test.ts +508 -0
  261. package/src/embeddings/semantic-search.ts +1270 -0
  262. package/src/embeddings/types.ts +359 -0
  263. package/src/embeddings/vector-store.ts +708 -0
  264. package/src/embeddings/voyage-provider.ts +313 -0
  265. package/src/errors/errors.test.ts +845 -0
  266. package/src/errors/index.ts +533 -0
  267. package/src/index/ignore-patterns.test.ts +354 -0
  268. package/src/index/ignore-patterns.ts +305 -0
  269. package/src/index/index.ts +4 -0
  270. package/src/index/indexer.ts +684 -0
  271. package/src/index/storage.ts +260 -0
  272. package/src/index/types.ts +147 -0
  273. package/src/index/watcher.ts +189 -0
  274. package/src/index.ts +30 -0
  275. package/src/integration/search-keyword.test.ts +678 -0
  276. package/src/mcp/server.ts +612 -0
  277. package/src/parser/index.ts +1 -0
  278. package/src/parser/parser.test.ts +291 -0
  279. package/src/parser/parser.ts +394 -0
  280. package/src/parser/section-filter.test.ts +277 -0
  281. package/src/parser/section-filter.ts +392 -0
  282. package/src/search/__tests__/hybrid-search.test.ts +650 -0
  283. package/src/search/bm25-store.ts +366 -0
  284. package/src/search/cross-encoder.test.ts +253 -0
  285. package/src/search/cross-encoder.ts +406 -0
  286. package/src/search/fuzzy-search.test.ts +419 -0
  287. package/src/search/fuzzy-search.ts +273 -0
  288. package/src/search/hybrid-search.ts +448 -0
  289. package/src/search/path-matcher.test.ts +276 -0
  290. package/src/search/path-matcher.ts +33 -0
  291. package/src/search/query-parser.test.ts +260 -0
  292. package/src/search/query-parser.ts +319 -0
  293. package/src/search/searcher.test.ts +280 -0
  294. package/src/search/searcher.ts +724 -0
  295. package/src/search/wink-bm25.d.ts +30 -0
  296. package/src/summarization/cli-providers/claude.ts +202 -0
  297. package/src/summarization/cli-providers/detection.test.ts +273 -0
  298. package/src/summarization/cli-providers/detection.ts +118 -0
  299. package/src/summarization/cli-providers/index.ts +8 -0
  300. package/src/summarization/cost.test.ts +139 -0
  301. package/src/summarization/cost.ts +102 -0
  302. package/src/summarization/error-handler.test.ts +127 -0
  303. package/src/summarization/error-handler.ts +111 -0
  304. package/src/summarization/index.ts +102 -0
  305. package/src/summarization/pipeline.test.ts +498 -0
  306. package/src/summarization/pipeline.ts +231 -0
  307. package/src/summarization/prompts.test.ts +269 -0
  308. package/src/summarization/prompts.ts +133 -0
  309. package/src/summarization/provider-factory.test.ts +396 -0
  310. package/src/summarization/provider-factory.ts +178 -0
  311. package/src/summarization/types.ts +184 -0
  312. package/src/summarize/budget-bugs.test.ts +620 -0
  313. package/src/summarize/formatters.ts +419 -0
  314. package/src/summarize/index.ts +20 -0
  315. package/src/summarize/summarizer.test.ts +275 -0
  316. package/src/summarize/summarizer.ts +597 -0
  317. package/src/summarize/verify-bugs.test.ts +238 -0
  318. package/src/types/huggingface-transformers.d.ts +66 -0
  319. package/src/utils/index.ts +1 -0
  320. package/src/utils/tokens.test.ts +142 -0
  321. package/src/utils/tokens.ts +186 -0
  322. package/tests/fixtures/cli/.mdcontext/active-provider.json +7 -0
  323. package/tests/fixtures/cli/.mdcontext/config.json +8 -0
  324. package/tests/fixtures/cli/.mdcontext/embeddings/openai_text-embedding-3-small_512/vectors.bin +0 -0
  325. package/tests/fixtures/cli/.mdcontext/embeddings/openai_text-embedding-3-small_512/vectors.meta.bin +0 -0
  326. package/tests/fixtures/cli/.mdcontext/indexes/documents.json +33 -0
  327. package/tests/fixtures/cli/.mdcontext/indexes/links.json +12 -0
  328. package/tests/fixtures/cli/.mdcontext/indexes/sections.json +247 -0
  329. package/tests/fixtures/cli/README.md +9 -0
  330. package/tests/fixtures/cli/api-reference.md +11 -0
  331. package/tests/fixtures/cli/getting-started.md +11 -0
  332. package/tests/integration/embed-index.test.ts +712 -0
  333. package/tests/integration/search-context.test.ts +469 -0
  334. package/tests/integration/search-semantic.test.ts +522 -0
  335. package/tsconfig.json +26 -0
  336. package/vitest.config.ts +16 -0
  337. package/vitest.setup.ts +12 -0
@@ -0,0 +1,1270 @@
1
+ /**
2
+ * Semantic search functionality
3
+ */
4
+
5
+ import * as fs from 'node:fs/promises'
6
+ import * as path from 'node:path'
7
+ import { Effect } from 'effect'
8
+ import {
9
+ type ApiKeyInvalidError,
10
+ type ApiKeyMissingError,
11
+ DimensionMismatchError,
12
+ EmbeddingError,
13
+ EmbeddingsNotFoundError,
14
+ type FileReadError,
15
+ type IndexCorruptedError,
16
+ IndexNotFoundError,
17
+ type VectorStoreError,
18
+ } from '../errors/index.js'
19
+ import {
20
+ createStorage,
21
+ loadDocumentIndex,
22
+ loadSectionIndex,
23
+ } from '../index/storage.js'
24
+ import type { SectionEntry } from '../index/types.js'
25
+ import {
26
+ type ActiveProvider,
27
+ generateNamespace,
28
+ getActiveNamespace,
29
+ writeActiveProvider,
30
+ } from './embedding-namespace.js'
31
+ import { generateHypotheticalDocument, type HydeResult } from './hyde.js'
32
+ import {
33
+ checkPricingFreshness,
34
+ getPricingDate,
35
+ PRICING_DATA,
36
+ wrapEmbedding,
37
+ } from './openai-provider.js'
38
+ import {
39
+ createEmbeddingProviderDirect,
40
+ type ProviderFactoryConfig,
41
+ } from './provider-factory.js'
42
+ import {
43
+ calculateFileImportanceBoost,
44
+ calculateHeadingBoost,
45
+ type EmbeddingProvider,
46
+ hasProviderMetadata,
47
+ preprocessQuery,
48
+ QUALITY_EF_SEARCH,
49
+ type SemanticSearchOptions,
50
+ type SemanticSearchResult,
51
+ type SemanticSearchResultWithStats,
52
+ type VectorEntry,
53
+ } from './types.js'
54
+ import {
55
+ createNamespacedVectorStore,
56
+ type HnswBuildOptions,
57
+ type HnswMismatchWarning,
58
+ type HnswVectorStore,
59
+ type VectorSearchResult,
60
+ type VectorStoreLoadResult,
61
+ } from './vector-store.js'
62
+
63
+ // ============================================================================
64
+ // HNSW Parameter Warning
65
+ // ============================================================================
66
+
/**
 * Report a difference between the HNSW parameters in the current config and
 * the parameters the stored index was built with.
 *
 * @param mismatch - Details of the mismatch, or `undefined` when there is none
 * @returns An effect that logs one warning, or a no-op effect when `mismatch`
 *          is falsy. The effect never fails and needs no environment.
 */
const checkHnswMismatch = (
  mismatch: HnswMismatchWarning | undefined,
): Effect.Effect<void, never, never> =>
  mismatch
    ? Effect.logWarning(
        [
          // What the stored index was built with
          `HNSW parameter mismatch: Index was built with M=${mismatch.indexParams.m}, efConstruction=${mismatch.indexParams.efConstruction}, `,
          // What the config currently asks for
          `but config specifies M=${mismatch.configParams.m}, efConstruction=${mismatch.configParams.efConstruction}. `,
          // How to reconcile the two
          `HNSW parameters only affect index construction. Run 'mdcontext index --embed --force' to rebuild with new parameters.`,
        ].join(''),
      )
    : Effect.void
85
+
86
+ // ============================================================================
87
+ // Embedding Text Generation
88
+ // ============================================================================
89
+
/**
 * Compose the text that is sent to the embedding model for one section.
 *
 * Layout (newline separated): a `# <heading>` line, an optional
 * `Parent section: ...` line, a `Document: ...` line, one blank line, then the
 * section content.
 *
 * @param section - Section whose `heading` becomes the first line
 * @param content - Section body, appended after the blank separator line
 * @param documentTitle - Title of the document that owns the section
 * @param parentHeading - Heading of the enclosing section; the parent line is
 *                        omitted when this is undefined or an empty string
 * @returns The assembled embedding text
 */
const generateEmbeddingText = (
  section: SectionEntry,
  content: string,
  documentTitle: string,
  parentHeading?: string | undefined,
): string => {
  // Zero or one line, depending on whether a (non-empty) parent heading exists
  const parentLine = parentHeading ? [`Parent section: ${parentHeading}`] : []

  return [
    `# ${section.heading}`,
    ...parentLine,
    `Document: ${documentTitle}`,
    '',
    content,
  ].join('\n')
}
108
+
109
+ // ============================================================================
110
+ // Cost Estimation
111
+ // ============================================================================
112
+
// Per-million-token price applied by the cost estimates in this module.
// Taken from the `text-embedding-3-small` entry of PRICING_DATA; falls back
// to 0.02 when that entry is null or undefined.
const EMBEDDING_PRICE_PER_MILLION =
  PRICING_DATA.prices['text-embedding-3-small'] ?? 0.02

// Expose the pricing helpers from this module as well (for CLI use)
export { checkPricingFreshness, getPricingDate }
119
+
/**
 * Embedding cost estimate for the sections that live in one directory.
 * One entry is produced per distinct directory by `estimateEmbeddingCost`.
 */
export interface DirectoryEstimate {
  /** Directory part of the sections' document path (`path.dirname`) */
  readonly directory: string
  /** Number of distinct documents in this directory that contributed sections */
  readonly fileCount: number
  /** Number of sections counted for this directory */
  readonly sectionCount: number
  /** Sum of the `tokenCount` of the counted sections */
  readonly estimatedTokens: number
  /** `estimatedTokens` priced at the module's per-million-token rate */
  readonly estimatedCost: number
}
127
+
/**
 * Overall embedding cost estimate returned by `estimateEmbeddingCost`.
 * The totals are the sums of the per-directory figures in `byDirectory`.
 */
export interface EmbeddingEstimate {
  /** Sum of `fileCount` over all directories */
  readonly totalFiles: number
  /** Sum of `sectionCount` over all directories */
  readonly totalSections: number
  /** Sum of `estimatedTokens` over all directories */
  readonly totalTokens: number
  /** `totalTokens` priced at the module's per-million-token rate */
  readonly totalCost: number
  /** Rough duration: 1.5 seconds for every started batch of 100 sections */
  readonly estimatedTimeSeconds: number
  /** Per-directory breakdown, sorted by directory name */
  readonly byDirectory: readonly DirectoryEstimate[]
}
136
+
/**
 * Turn glob-style exclude patterns into anchored regular expressions.
 *
 * `*` matches any run of characters and `?` matches exactly one character.
 * Every other character is matched literally: regex metacharacters are
 * escaped first, so a pattern such as `docs/(old)/*.md` neither throws a
 * SyntaxError nor lets `.` match arbitrary characters.
 */
const compileExcludeGlobs = (
  patterns: readonly string[] | undefined,
): RegExp[] =>
  (patterns ?? []).map((pattern) => {
    // Escape everything that is special in a regex except the two glob
    // wildcards, which are translated on the next line.
    const literal = pattern.replace(/[.+^${}()|[\]\\]/g, '\\$&')
    return new RegExp(`^${literal.replace(/\*/g, '.*').replace(/\?/g, '.')}$`)
  })

/**
 * Estimate the cost of generating embeddings for a directory.
 *
 * Sections with fewer than 10 tokens and sections whose document path matches
 * an exclude pattern are left out. The remaining sections are grouped by the
 * directory of their document and priced at the module's per-million-token
 * rate.
 *
 * NOTE(review): `buildEmbeddings` converts exclude patterns inline without
 * escaping regex metacharacters; it should adopt the same conversion so the
 * estimate and the actual build always agree.
 *
 * @param rootPath - Root directory containing indexed markdown files
 * @param options - Optional exclude patterns (`*` and `?` wildcards)
 * @returns Estimate with token counts and costs
 *
 * @throws IndexNotFoundError - Index doesn't exist at path
 * @throws FileReadError - Cannot read index files
 * @throws IndexCorruptedError - Index files are corrupted
 */
export const estimateEmbeddingCost = (
  rootPath: string,
  options: { excludePatterns?: readonly string[] | undefined } = {},
): Effect.Effect<
  EmbeddingEstimate,
  IndexNotFoundError | FileReadError | IndexCorruptedError
> =>
  Effect.gen(function* () {
    const resolvedRoot = path.resolve(rootPath)
    const storage = createStorage(resolvedRoot)

    const docIndex = yield* loadDocumentIndex(storage)
    const sectionIndex = yield* loadSectionIndex(storage)

    if (!docIndex || !sectionIndex) {
      return yield* Effect.fail(new IndexNotFoundError({ path: resolvedRoot }))
    }

    // Compile the patterns once instead of once per section and pattern
    const excludeMatchers = compileExcludeGlobs(options.excludePatterns)

    // Group by directory
    const byDir = new Map<
      string,
      { files: Set<string>; sections: number; tokens: number }
    >()

    for (const section of Object.values(sectionIndex.sections)) {
      // Skip very short sections (< 10 tokens)
      if (section.tokenCount < 10) continue

      // Check exclude patterns
      if (excludeMatchers.some((matcher) => matcher.test(section.documentPath))) {
        continue
      }

      const dir = path.dirname(section.documentPath) || '.'
      let entry = byDir.get(dir)
      if (!entry) {
        entry = { files: new Set<string>(), sections: 0, tokens: 0 }
        byDir.set(dir, entry)
      }
      entry.files.add(section.documentPath)
      entry.sections++
      entry.tokens += section.tokenCount
    }

    const directoryEstimates: DirectoryEstimate[] = []
    let totalFiles = 0
    let totalSections = 0
    let totalTokens = 0

    for (const [dir, data] of byDir) {
      directoryEstimates.push({
        directory: dir,
        fileCount: data.files.size,
        sectionCount: data.sections,
        estimatedTokens: data.tokens,
        estimatedCost: (data.tokens / 1_000_000) * EMBEDDING_PRICE_PER_MILLION,
      })
      totalFiles += data.files.size
      totalSections += data.sections
      totalTokens += data.tokens
    }

    // Sort by directory name
    directoryEstimates.sort((a, b) => a.directory.localeCompare(b.directory))

    // Estimate time: ~1.5s per 100 sections (API batch processing)
    const estimatedTimeSeconds = Math.ceil(totalSections / 100) * 1.5

    return {
      totalFiles,
      totalSections,
      totalTokens,
      totalCost: (totalTokens / 1_000_000) * EMBEDDING_PRICE_PER_MILLION,
      estimatedTimeSeconds,
      byDirectory: directoryEstimates,
    }
  })
230
+
231
+ // ============================================================================
232
+ // Build Embeddings
233
+ // ============================================================================
234
+
/**
 * Payload handed to `BuildEmbeddingsOptions.onFileProgress`.
 */
export interface FileProgress {
  /**
   * Position of the current file within the run.
   * NOTE(review): whether this is 0- or 1-based is not visible here; confirm
   * against buildEmbeddings.
   */
  readonly fileIndex: number
  /** Total number of files in the run */
  readonly totalFiles: number
  /** Path of the current file */
  readonly filePath: string
  /** Section count reported for the current file */
  readonly sectionCount: number
}
241
+
/**
 * Payload handed to `BuildEmbeddingsOptions.onBatchProgress` while embedding
 * API calls are in flight.
 */
export interface EmbeddingBatchProgress {
  /**
   * Position of the current batch.
   * NOTE(review): 0- or 1-based is not visible here; confirm against the
   * provider that invokes the callback.
   */
  readonly batchIndex: number
  /** Total number of batches */
  readonly totalBatches: number
  /** Number of sections processed so far */
  readonly processedSections: number
  /** Total number of sections to process */
  readonly totalSections: number
}
248
+
/**
 * Options accepted by `buildEmbeddings`. Every field is optional.
 */
export interface BuildEmbeddingsOptions {
  /**
   * When falsy, an existing vector store is loaded first and, if it already
   * holds vectors, the build returns early as a cache hit.
   */
  readonly force?: boolean | undefined
  /** Explicit provider instance; takes priority over `providerConfig` */
  readonly provider?: EmbeddingProvider | undefined
  /**
   * Configuration used to create a provider when `provider` is not given.
   * Defaults to `{ provider: 'openai' }`.
   */
  readonly providerConfig?: ProviderFactoryConfig | undefined
  /**
   * Patterns matched against document paths; `*` and `?` act as wildcards
   * and the whole path must match.
   */
  readonly excludePatterns?: readonly string[] | undefined
  /** Callback for per-file progress */
  readonly onFileProgress?: ((progress: FileProgress) => void) | undefined
  /** Callback for batch progress during embedding API calls */
  readonly onBatchProgress?:
    | ((progress: EmbeddingBatchProgress) => void)
    | undefined
  /** HNSW build parameters for vector index construction */
  readonly hnswOptions?: HnswBuildOptions | undefined
}
262
+
/**
 * Outcome of a `buildEmbeddings` run.
 *
 * On a cache hit (existing vectors found and `force` not set) the counters
 * `sectionsEmbedded`, `tokensUsed`, `cost` and `filesProcessed` are all 0 and
 * the three optional fields are filled in.
 */
export interface BuildEmbeddingsResult {
  /** Number of sections embedded by this run */
  readonly sectionsEmbedded: number
  /** Tokens consumed by this run */
  readonly tokensUsed: number
  /** Cost of this run */
  readonly cost: number
  /** Elapsed wall-clock time in milliseconds (difference of `Date.now()` values) */
  readonly duration: number
  /** Number of files processed by this run */
  readonly filesProcessed: number
  /** `true` when existing embeddings were reused and nothing was embedded */
  readonly cacheHit?: boolean | undefined
  /** Vector count of the existing store, reported on a cache hit */
  readonly existingVectors?: number | undefined
  /**
   * On a cache hit: the existing store's total tokens priced at the module's
   * per-million-token rate
   */
  readonly estimatedSavings?: number | undefined
}
273
+
274
+ /**
275
+ * Build embeddings for all indexed sections in a directory.
276
+ *
277
+ * @param rootPath - Root directory containing indexed markdown files
278
+ * @param options - Build options (force rebuild, progress callbacks)
279
+ * @returns Result with embedding counts, costs, and timing
280
+ *
281
+ * @throws IndexNotFoundError - Index doesn't exist at path
282
+ * @throws FileReadError - Cannot read index or source files
283
+ * @throws IndexCorruptedError - Index files are corrupted
284
+ * @throws ApiKeyMissingError - API key not set (check provider config)
285
+ * @throws ApiKeyInvalidError - API key rejected by provider
286
+ * @throws EmbeddingError - Embedding API failure (rate limit, quota, network)
287
+ * @throws VectorStoreError - Cannot save vector index
288
+ * @throws DimensionMismatchError - Existing embeddings have different dimensions
289
+ */
290
export const buildEmbeddings = (
  rootPath: string,
  options: BuildEmbeddingsOptions = {},
): Effect.Effect<
  BuildEmbeddingsResult,
  | IndexNotFoundError
  | FileReadError
  | IndexCorruptedError
  | ApiKeyMissingError
  | ApiKeyInvalidError
  | EmbeddingError
  | VectorStoreError
  | DimensionMismatchError
> =>
  Effect.gen(function* () {
    const startTime = Date.now()
    const resolvedRoot = path.resolve(rootPath)
    const storage = createStorage(resolvedRoot)

    // Load indexes - both are required before anything can be embedded
    const docIndex = yield* loadDocumentIndex(storage)
    const sectionIndex = yield* loadSectionIndex(storage)

    if (!docIndex || !sectionIndex) {
      return yield* Effect.fail(new IndexNotFoundError({ path: resolvedRoot }))
    }

    // Get or create provider - use factory for config-driven provider selection
    // Priority: explicit provider > providerConfig > default (openai)
    const providerConfig = options.providerConfig ?? { provider: 'openai' }
    const provider =
      options.provider ?? (yield* createEmbeddingProviderDirect(providerConfig))
    const dimensions = provider.dimensions

    // Extract provider info for namespacing from the actual provider instance.
    // This ensures we use the correct values even when options.provider is explicitly set.
    let providerName: string
    let providerModel: string

    if (hasProviderMetadata(provider)) {
      // provider.name has the format "provider:model"; keep only the provider part
      const nameParts = provider.name.split(':')
      providerName = nameParts[0] || 'openai'
      providerModel = provider.model
    } else {
      // Fallback to config values for providers without metadata
      providerName = providerConfig.provider ?? 'openai'
      providerModel = providerConfig.model ?? 'text-embedding-3-small'
    }

    // Create namespaced vector store for this provider/model/dimensions combination
    const vectorStore = createNamespacedVectorStore(
      resolvedRoot,
      providerName,
      providerModel,
      dimensions,
      options.hnswOptions,
    ) as HnswVectorStore

    // Set provider metadata
    if (hasProviderMetadata(provider)) {
      vectorStore.setProvider(provider.name, provider.model, provider.baseURL)
    } else {
      vectorStore.setProvider(providerName, providerModel, undefined)
    }

    // Load existing if not forcing: any stored vectors short-circuit the build
    if (!options.force) {
      const loadResult = yield* vectorStore.load()
      if (loadResult.loaded) {
        const stats = vectorStore.getStats()
        if (stats.count > 0) {
          const duration = Date.now() - startTime
          // Estimate savings based on existing tokens
          const estimatedSavings =
            (stats.totalTokens / 1_000_000) * EMBEDDING_PRICE_PER_MILLION
          return {
            sectionsEmbedded: 0,
            tokensUsed: 0,
            cost: 0,
            duration,
            filesProcessed: 0,
            cacheHit: true,
            existingVectors: stats.count,
            estimatedSavings,
          }
        }
      }
    }

    // Compile the exclude globs once instead of once per section. Regex
    // metacharacters are escaped first so that a literal "." only matches a
    // dot and characters such as "(" or "[" cannot produce an invalid regex;
    // "*" (any run of characters) and "?" (any single character) keep their
    // glob meaning.
    const excludeRegexes = (options.excludePatterns ?? []).map((pattern) => {
      const escaped = pattern.replace(/[.+^${}()|[\]\\]/g, '\\$&')
      const source = escaped.replace(/\*/g, '.*').replace(/\?/g, '.')
      return new RegExp(`^${source}$`)
    })
    const isExcluded = (docPath: string): boolean =>
      excludeRegexes.some((regex) => regex.test(docPath))

    // Group sections by document for efficient file reading
    const sectionsByDoc: Map<
      string,
      { section: SectionEntry; parentHeading: string | undefined }[]
    > = new Map()

    for (const section of Object.values(sectionIndex.sections)) {
      const document = docIndex.documents[section.documentPath]
      if (!document) continue

      // Skip very short sections (< 10 tokens)
      if (section.tokenCount < 10) continue

      // Check exclude patterns
      if (isExcluded(section.documentPath)) continue

      // Find parent heading if any: the last section in the document's list
      // that is exactly one level up and starts before this section
      let parentHeading: string | undefined
      if (section.level > 1) {
        const docSections = sectionIndex.byDocument[document.id] ?? []
        for (const sibId of docSections) {
          const sib = sectionIndex.sections[sibId]
          if (
            sib &&
            sib.level === section.level - 1 &&
            sib.startLine < section.startLine
          ) {
            parentHeading = sib.heading
          }
        }
      }

      const bucket = sectionsByDoc.get(section.documentPath)
      if (bucket) {
        bucket.push({ section, parentHeading })
      } else {
        sectionsByDoc.set(section.documentPath, [{ section, parentHeading }])
      }
    }

    if (sectionsByDoc.size === 0) {
      const duration = Date.now() - startTime
      return {
        sectionsEmbedded: 0,
        tokensUsed: 0,
        cost: 0,
        duration,
        filesProcessed: 0,
      }
    }

    // Prepare sections for embedding by reading file content
    const sectionsToEmbed: { section: SectionEntry; text: string }[] = []
    const docEntries = Array.from(sectionsByDoc.entries())
    let filesProcessed = 0

    for (const [fileIndex, [docPath, sections]] of docEntries.entries()) {
      const document = docIndex.documents[docPath]
      if (!document) continue

      // Report file progress
      if (options.onFileProgress) {
        options.onFileProgress({
          fileIndex: fileIndex + 1,
          totalFiles: docEntries.length,
          filePath: docPath,
          sectionCount: sections.length,
        })
      }

      const filePath = path.join(resolvedRoot, docPath)

      // Note: catchAll is intentional - file read failures during embedding
      // should skip the file with a warning rather than abort the entire operation.
      // Effect.tryPromise (not Effect.promise) is required here: Effect.promise
      // turns a rejected promise into a defect, which catchAll does not recover.
      const fileContentResult = yield* Effect.tryPromise(() =>
        fs.readFile(filePath, 'utf-8'),
      ).pipe(
        Effect.map((content) => ({ ok: true as const, content })),
        Effect.catchAll(() =>
          Effect.succeed({ ok: false as const, content: '' }),
        ),
      )

      if (!fileContentResult.ok) {
        yield* Effect.logWarning(`Skipping file (cannot read): ${docPath}`)
        continue
      }

      filesProcessed++
      const lines = fileContentResult.content.split('\n')

      for (const { section, parentHeading } of sections) {
        // Extract section content from file (startLine is 1-based, endLine inclusive)
        const content = lines
          .slice(section.startLine - 1, section.endLine)
          .join('\n')

        const text = generateEmbeddingText(
          section,
          content,
          document.title,
          parentHeading,
        )
        sectionsToEmbed.push({ section, text })
      }
    }

    if (sectionsToEmbed.length === 0) {
      const duration = Date.now() - startTime
      return {
        sectionsEmbedded: 0,
        tokensUsed: 0,
        cost: 0,
        duration,
        filesProcessed,
      }
    }

    // Generate embeddings
    // NOTE(review): the provider label passed to wrapEmbedding comes from
    // providerConfig, not from the resolved provider instance, so it may read
    // 'openai' when options.provider is supplied explicitly - confirm intent.
    const texts = sectionsToEmbed.map((s) => s.text)
    const result = yield* wrapEmbedding(
      provider.embed(texts, {
        onBatchProgress: options.onBatchProgress
          ? (p) =>
              options.onBatchProgress?.({
                batchIndex: p.batchIndex,
                totalBatches: p.totalBatches,
                processedSections: p.processedTexts,
                totalSections: p.totalTexts,
              })
          : undefined,
      }),
      providerConfig.provider ?? 'openai',
    )

    // Create vector entries, pairing each section with the embedding at the same index
    const entries: VectorEntry[] = []
    for (const [i, { section }] of sectionsToEmbed.entries()) {
      const embedding = result.embeddings[i]
      if (!embedding) continue

      entries.push({
        id: section.id,
        sectionId: section.id,
        documentPath: section.documentPath,
        heading: section.heading,
        embedding,
      })
    }

    // Add to vector store
    yield* vectorStore.add(entries)
    vectorStore.addCost(result.cost, result.tokensUsed)

    // Save
    yield* vectorStore.save()

    // Set this namespace as the active provider
    const namespace = generateNamespace(providerName, providerModel, dimensions)
    yield* writeActiveProvider(resolvedRoot, {
      namespace,
      provider: providerName,
      model: providerModel,
      dimensions,
      activatedAt: new Date().toISOString(),
    }).pipe(
      Effect.catchAll((e) => {
        // Don't fail the build if we can't write the active provider file
        console.warn(`Warning: Could not set active provider: ${e.message}`)
        return Effect.succeed(undefined)
      }),
    )

    const duration = Date.now() - startTime

    return {
      sectionsEmbedded: entries.length,
      tokensUsed: result.tokensUsed,
      cost: result.cost,
      duration,
      filesProcessed,
    }
  })
578
+
579
+ // ============================================================================
580
+ // Context Lines Helper
581
+ // ============================================================================
582
+
583
/**
 * Add context lines to search results by loading section content from files.
 * This helper is used by both semanticSearch and semanticSearchWithStats to avoid code duplication.
 *
 * Each source file is read and split at most once per call; files that cannot
 * be read (or are empty) are remembered so they are not retried for later
 * results. Results whose section is missing from the index, or whose file is
 * unavailable, are returned without `contextLines`.
 *
 * @param limitedResults - Vector search hits to enrich
 * @param sectionIndex - Section lookup keyed by section id
 * @param resolvedRoot - Absolute root that document paths are joined onto
 * @param options - Number of extra lines to include before/after the section (default 0)
 * @returns One result per input hit, in the same order
 */
const addContextLinesToResults = (
  limitedResults: readonly VectorSearchResult[],
  sectionIndex: { sections: Record<string, SectionEntry> },
  resolvedRoot: string,
  options: {
    contextBefore?: number | undefined
    contextAfter?: number | undefined
  },
): Effect.Effect<readonly SemanticSearchResult[], FileReadError, never> =>
  Effect.gen(function* () {
    const contextBefore = options.contextBefore ?? 0
    const contextAfter = options.contextAfter ?? 0

    const resultsWithContext: SemanticSearchResult[] = []
    // documentPath -> split lines, or null when the file is unreadable/empty
    const linesByFile = new Map<string, readonly string[] | null>()

    for (const r of limitedResults) {
      const plain: SemanticSearchResult = {
        sectionId: r.sectionId,
        documentPath: r.documentPath,
        heading: r.heading,
        similarity: r.similarity,
      }

      const section = sectionIndex.sections[r.sectionId]
      if (!section) {
        resultsWithContext.push(plain)
        continue
      }

      let lines = linesByFile.get(r.documentPath)
      if (lines === undefined) {
        const filePath = path.join(resolvedRoot, r.documentPath)
        // Effect.tryPromise is required for catchAll to see a rejected read;
        // Effect.promise would surface the rejection as an unrecoverable defect.
        const content = yield* Effect.tryPromise(() =>
          fs.readFile(filePath, 'utf-8'),
        ).pipe(Effect.catchAll(() => Effect.succeed(null as string | null)))

        lines = content ? content.split('\n') : null
        linesByFile.set(r.documentPath, lines)
      }

      if (!lines) {
        resultsWithContext.push(plain)
        continue
      }

      // Section lines are 1-based and inclusive; clamp the window to the file
      const startIdx = Math.max(0, section.startLine - 1 - contextBefore)
      const endIdx = Math.min(lines.length, section.endLine + contextAfter)

      const contextLines: {
        lineNumber: number
        line: string
        isMatch: boolean
      }[] = []
      for (let i = startIdx; i < endIdx; i++) {
        const line = lines[i]
        if (line !== undefined) {
          contextLines.push({
            lineNumber: i + 1,
            line,
            // Lines inside the section itself are flagged as the match
            isMatch: i >= section.startLine - 1 && i < section.endLine,
          })
        }
      }

      resultsWithContext.push({ ...plain, contextLines })
    }

    return resultsWithContext
  })
671
+
672
+ // ============================================================================
673
+ // Semantic Search
674
+ // ============================================================================
675
+
676
/**
 * Perform semantic search over embedded sections.
 *
 * The query (or a HyDE-generated hypothetical document) is embedded, matched
 * against the vector store of the active namespace, optionally filtered by
 * path, boosted, re-sorted and truncated to `limit`.
 *
 * @param rootPath - Root directory containing embeddings
 * @param query - Natural language search query
 * @param options - Search options (limit, threshold, path filter)
 * @returns Ranked list of matching sections by similarity
 *
 * @throws EmbeddingsNotFoundError - No embeddings exist (run index --embed first)
 * @throws FileReadError - Cannot read index files
 * @throws IndexCorruptedError - Index files are corrupted
 * @throws ApiKeyMissingError - API key not set (check provider config)
 * @throws ApiKeyInvalidError - API key rejected by provider
 * @throws EmbeddingError - Embedding API failure (rate limit, quota, network)
 * @throws VectorStoreError - Cannot load or search vector index
 * @throws DimensionMismatchError - Corpus has different dimensions than current provider
 */
export const semanticSearch = (
  rootPath: string,
  query: string,
  options: SemanticSearchOptions = {},
): Effect.Effect<
  readonly SemanticSearchResult[],
  | EmbeddingsNotFoundError
  | FileReadError
  | IndexCorruptedError
  | ApiKeyMissingError
  | ApiKeyInvalidError
  | EmbeddingError
  | VectorStoreError
  | DimensionMismatchError
> =>
  Effect.gen(function* () {
    const resolvedRoot = path.resolve(rootPath)

    // Get active namespace to determine which embedding index to use.
    // Any failure to resolve it is treated the same as "no embeddings".
    const activeProvider = yield* getActiveNamespace(resolvedRoot).pipe(
      Effect.catchAll(() => Effect.succeed(null as ActiveProvider | null)),
    )

    if (!activeProvider) {
      return yield* Effect.fail(
        new EmbeddingsNotFoundError({ path: resolvedRoot }),
      )
    }

    // Create provider for query embedding
    const provider = yield* createEmbeddingProviderDirect(
      options.providerConfig ?? { provider: 'openai' },
    )
    const dimensions = provider.dimensions

    // Get current provider name for error messages
    const currentProviderName = options.providerConfig?.provider ?? 'openai'

    // Verify dimensions match the active namespace
    if (dimensions !== activeProvider.dimensions) {
      return yield* Effect.fail(
        new DimensionMismatchError({
          corpusDimensions: activeProvider.dimensions,
          providerDimensions: dimensions,
          corpusProvider: `${activeProvider.provider}:${activeProvider.model}`,
          currentProvider: currentProviderName,
          path: resolvedRoot,
        }),
      )
    }

    // Load vector store from the active namespace
    const vectorStore = createNamespacedVectorStore(
      resolvedRoot,
      activeProvider.provider,
      activeProvider.model,
      activeProvider.dimensions,
    )
    const loadResult = yield* vectorStore.load()

    if (!loadResult.loaded) {
      return yield* Effect.fail(
        new EmbeddingsNotFoundError({ path: resolvedRoot }),
      )
    }

    // Check for HNSW parameter mismatch
    yield* checkHnswMismatch(loadResult.hnswMismatch)

    // Determine the text to embed
    // If HyDE is enabled, generate a hypothetical document first
    let textToEmbed: string
    let hydeResult: HydeResult | undefined

    if (options.hyde) {
      // Generate hypothetical document using LLM
      hydeResult = yield* generateHypotheticalDocument(query, {
        model: options.hydeOptions?.model,
        maxTokens: options.hydeOptions?.maxTokens,
        temperature: options.hydeOptions?.temperature,
      })
      textToEmbed = hydeResult.hypotheticalDocument
      yield* Effect.logDebug(
        `HyDE generated ${hydeResult.tokensUsed} tokens ($${hydeResult.cost.toFixed(6)})`,
      )
    } else {
      // Preprocess query for better recall (unless disabled)
      textToEmbed = options.skipPreprocessing ? query : preprocessQuery(query)
    }

    // Embed the query (or hypothetical document)
    const queryResult = yield* wrapEmbedding(
      provider.embed([textToEmbed]),
      currentProviderName,
    )

    const queryVector = queryResult.embeddings[0]
    if (!queryVector) {
      return yield* Effect.fail(
        new EmbeddingError({
          reason: 'Unknown',
          message: 'Failed to generate query embedding',
          provider: currentProviderName,
        }),
      )
    }

    // Search
    const limit = options.limit ?? 10
    const threshold = options.threshold ?? 0

    // Convert quality mode to efSearch value
    const efSearch = options.quality
      ? QUALITY_EF_SEARCH[options.quality]
      : undefined

    // Over-fetch (2x limit) so filtering and re-ranking still leave enough hits
    const searchResults = yield* vectorStore.search(
      queryVector,
      limit * 2,
      threshold,
      { efSearch },
    )

    // Apply path filter if specified. Every regex metacharacter except "*" is
    // escaped so that characters such as "(", "[" or "+" in a path pattern are
    // matched literally instead of throwing or changing the match; "*" is the
    // only wildcard and matches any run of characters.
    let filteredResults = searchResults
    if (options.pathPattern) {
      const pattern = options.pathPattern
        .replace(/[.+?^${}()|[\]\\]/g, '\\$&')
        .replace(/\*/g, '.*')
      const regex = new RegExp(`^${pattern}$`, 'i')
      filteredResults = searchResults.filter((r) => regex.test(r.documentPath))
    }

    // Apply ranking boost (heading + file importance, enabled by default)
    const applyBoost = options.headingBoost !== false
    const boostedResults = applyBoost
      ? filteredResults.map((r) => ({
          ...r,
          similarity: Math.min(
            1,
            r.similarity +
              calculateHeadingBoost(r.heading, query) +
              calculateFileImportanceBoost(r.documentPath),
          ),
        }))
      : filteredResults

    // Re-sort by boosted similarity. Sort a copy: with boosting and filtering
    // both off, boostedResults is the very array the vector store returned.
    const sortedResults = [...boostedResults].sort(
      (a, b) => b.similarity - a.similarity,
    )
    const limitedResults = sortedResults.slice(0, limit)

    // If context lines are requested, load section content
    const wantsContext =
      options.contextBefore !== undefined || options.contextAfter !== undefined
    if (wantsContext) {
      const storage = createStorage(resolvedRoot)
      const sectionIndex = yield* loadSectionIndex(storage)

      if (sectionIndex) {
        return yield* addContextLinesToResults(
          limitedResults,
          sectionIndex,
          resolvedRoot,
          options,
        )
      }
    }

    // No context requested (or no section index available): plain results
    const results: readonly SemanticSearchResult[] = limitedResults.map(
      (r) => ({
        sectionId: r.sectionId,
        documentPath: r.documentPath,
        heading: r.heading,
        similarity: r.similarity,
      }),
    )
    return results
  })
879
+
880
/**
 * Perform semantic search with stats about below-threshold results.
 * Use this when you want to provide feedback to users about results that
 * didn't meet the threshold.
 *
 * @param rootPath - Root directory containing embeddings
 * @param query - Natural language search query
 * @param options - Search options (limit, threshold, path filter)
 * @returns Results with optional below-threshold stats; `totalAvailable` is
 *   the number of hits left after path filtering, before truncation to `limit`
 *
 * @throws EmbeddingsNotFoundError - No embeddings exist (run index --embed first)
 * @throws FileReadError - Cannot read index files
 * @throws IndexCorruptedError - Index files are corrupted
 * @throws ApiKeyMissingError - API key not set (check provider config)
 * @throws ApiKeyInvalidError - API key rejected by provider
 * @throws EmbeddingError - Embedding API failure (rate limit, quota, network)
 * @throws VectorStoreError - Cannot load or search vector index
 * @throws DimensionMismatchError - Corpus has different dimensions than current provider
 */
export const semanticSearchWithStats = (
  rootPath: string,
  query: string,
  options: SemanticSearchOptions = {},
): Effect.Effect<
  SemanticSearchResultWithStats,
  | EmbeddingsNotFoundError
  | FileReadError
  | IndexCorruptedError
  | ApiKeyMissingError
  | ApiKeyInvalidError
  | EmbeddingError
  | VectorStoreError
  | DimensionMismatchError
> =>
  Effect.gen(function* () {
    const resolvedRoot = path.resolve(rootPath)

    // Get active namespace to determine which embedding index to use.
    // Any failure to resolve it is treated the same as "no embeddings".
    const activeProvider = yield* getActiveNamespace(resolvedRoot).pipe(
      Effect.catchAll(() => Effect.succeed(null as ActiveProvider | null)),
    )

    if (!activeProvider) {
      return yield* Effect.fail(
        new EmbeddingsNotFoundError({ path: resolvedRoot }),
      )
    }

    // Create provider for query embedding
    const provider = yield* createEmbeddingProviderDirect(
      options.providerConfig ?? { provider: 'openai' },
    )
    const dimensions = provider.dimensions

    // Get current provider name for error messages
    const currentProviderName = options.providerConfig?.provider ?? 'openai'

    // Verify dimensions match the active namespace
    if (dimensions !== activeProvider.dimensions) {
      return yield* Effect.fail(
        new DimensionMismatchError({
          corpusDimensions: activeProvider.dimensions,
          providerDimensions: dimensions,
          corpusProvider: `${activeProvider.provider}:${activeProvider.model}`,
          currentProvider: currentProviderName,
          path: resolvedRoot,
        }),
      )
    }

    // Load vector store from the active namespace
    const vectorStore = createNamespacedVectorStore(
      resolvedRoot,
      activeProvider.provider,
      activeProvider.model,
      activeProvider.dimensions,
    )
    const loadResult = yield* vectorStore.load()

    if (!loadResult.loaded) {
      return yield* Effect.fail(
        new EmbeddingsNotFoundError({ path: resolvedRoot }),
      )
    }

    // Check for HNSW parameter mismatch
    yield* checkHnswMismatch(loadResult.hnswMismatch)

    // Determine the text to embed
    // If HyDE is enabled, generate a hypothetical document first
    let textToEmbed: string
    let hydeResult: HydeResult | undefined

    if (options.hyde) {
      // Generate hypothetical document using LLM
      hydeResult = yield* generateHypotheticalDocument(query, {
        model: options.hydeOptions?.model,
        maxTokens: options.hydeOptions?.maxTokens,
        temperature: options.hydeOptions?.temperature,
      })
      textToEmbed = hydeResult.hypotheticalDocument
      yield* Effect.logDebug(
        `HyDE generated ${hydeResult.tokensUsed} tokens ($${hydeResult.cost.toFixed(6)})`,
      )
    } else {
      // Preprocess query for better recall (unless disabled)
      textToEmbed = options.skipPreprocessing ? query : preprocessQuery(query)
    }

    // Embed the query (or hypothetical document)
    const queryResult = yield* wrapEmbedding(
      provider.embed([textToEmbed]),
      currentProviderName,
    )

    const queryVector = queryResult.embeddings[0]
    if (!queryVector) {
      return yield* Effect.fail(
        new EmbeddingError({
          reason: 'Unknown',
          message: 'Failed to generate query embedding',
          provider: currentProviderName,
        }),
      )
    }

    // Search with stats
    const limit = options.limit ?? 10
    const threshold = options.threshold ?? 0

    // Convert quality mode to efSearch value
    const efSearch = options.quality
      ? QUALITY_EF_SEARCH[options.quality]
      : undefined

    // Over-fetch (2x limit) so filtering and re-ranking still leave enough hits
    const searchResultWithStats = yield* vectorStore.searchWithStats(
      queryVector,
      limit * 2,
      threshold,
      { efSearch },
    )

    // Apply path filter if specified. Every regex metacharacter except "*" is
    // escaped so that characters such as "(", "[" or "+" in a path pattern are
    // matched literally instead of throwing or changing the match; "*" is the
    // only wildcard and matches any run of characters.
    let filteredResults = searchResultWithStats.results
    if (options.pathPattern) {
      const pattern = options.pathPattern
        .replace(/[.+?^${}()|[\]\\]/g, '\\$&')
        .replace(/\*/g, '.*')
      const regex = new RegExp(`^${pattern}$`, 'i')
      filteredResults = searchResultWithStats.results.filter((r) =>
        regex.test(r.documentPath),
      )
    }

    // Apply ranking boost (heading + file importance, enabled by default)
    const applyBoost = options.headingBoost !== false
    const boostedResults = applyBoost
      ? filteredResults.map((r) => ({
          ...r,
          similarity: Math.min(
            1,
            r.similarity +
              calculateHeadingBoost(r.heading, query) +
              calculateFileImportanceBoost(r.documentPath),
          ),
        }))
      : filteredResults

    // Re-sort by boosted similarity. Sort a copy: with boosting and filtering
    // both off, boostedResults is the very array the vector store returned.
    const sortedResults = [...boostedResults].sort(
      (a, b) => b.similarity - a.similarity,
    )
    const totalAvailable = sortedResults.length
    const limitedResults = sortedResults.slice(0, limit)

    // If context lines are requested, load section content; otherwise (or
    // when no section index is available) fall back to plain results
    let results: readonly SemanticSearchResult[] | undefined
    const wantsContext =
      options.contextBefore !== undefined || options.contextAfter !== undefined
    if (wantsContext) {
      const storage = createStorage(resolvedRoot)
      const sectionIndex = yield* loadSectionIndex(storage)

      if (sectionIndex) {
        results = yield* addContextLinesToResults(
          limitedResults,
          sectionIndex,
          resolvedRoot,
          options,
        )
      }
    }

    if (results === undefined) {
      results = limitedResults.map((r) => ({
        sectionId: r.sectionId,
        documentPath: r.documentPath,
        heading: r.heading,
        similarity: r.similarity,
      }))
    }

    return {
      results,
      belowThresholdCount: searchResultWithStats.belowThresholdCount,
      belowThresholdHighest:
        searchResultWithStats.belowThresholdHighest ?? undefined,
      totalAvailable,
    }
  })
1094
+
1095
+ // ============================================================================
1096
+ // Search with Content
1097
+ // ============================================================================
1098
+
1099
/**
 * Perform semantic search and include section content in results.
 *
 * Runs `semanticSearch`, then attaches each section's text by slicing the
 * source file between the section's start and end lines. Each file is read at
 * most once per call. Results whose section is missing from the index, or
 * whose file cannot be read, are returned without `content`.
 *
 * @param rootPath - Root directory containing embeddings
 * @param query - Natural language search query
 * @param options - Search options (limit, threshold, path filter)
 * @returns Ranked list of matching sections with content
 *
 * @throws EmbeddingsNotFoundError - No embeddings exist (run index --embed first)
 * @throws FileReadError - Cannot read index files
 * @throws IndexCorruptedError - Index files are corrupted
 * @throws ApiKeyMissingError - API key not set (check provider config)
 * @throws ApiKeyInvalidError - API key rejected by provider
 * @throws EmbeddingError - Embedding API failure (rate limit, quota, network)
 * @throws VectorStoreError - Cannot load or search vector index
 * @throws DimensionMismatchError - Corpus has different dimensions than current provider
 */
export const semanticSearchWithContent = (
  rootPath: string,
  query: string,
  options: SemanticSearchOptions = {},
): Effect.Effect<
  readonly SemanticSearchResult[],
  | EmbeddingsNotFoundError
  | FileReadError
  | IndexCorruptedError
  | ApiKeyMissingError
  | ApiKeyInvalidError
  | EmbeddingError
  | VectorStoreError
  | DimensionMismatchError
> =>
  Effect.gen(function* () {
    const resolvedRoot = path.resolve(rootPath)
    const results = yield* semanticSearch(resolvedRoot, query, options)

    const storage = createStorage(resolvedRoot)
    const sectionIndex = yield* loadSectionIndex(storage)

    if (!sectionIndex) {
      return results
    }

    const resultsWithContent: SemanticSearchResult[] = []
    // documentPath -> split lines, or null when the file could not be read
    const linesByFile = new Map<string, readonly string[] | null>()

    for (const result of results) {
      const section = sectionIndex.sections[result.sectionId]
      if (!section) {
        resultsWithContent.push(result)
        continue
      }

      let lines = linesByFile.get(result.documentPath)
      if (lines === undefined) {
        const filePath = path.join(resolvedRoot, result.documentPath)

        // Note: catchAll is intentional - file read failures during search result
        // enrichment should skip content loading with a warning, not fail the search.
        // Effect.tryPromise (not Effect.promise) is required: Effect.promise turns
        // a rejected promise into a defect, which catchAll does not recover.
        const fileContentResult = yield* Effect.tryPromise(() =>
          fs.readFile(filePath, 'utf-8'),
        ).pipe(
          Effect.map((content) => ({ ok: true as const, content })),
          Effect.catchAll(() =>
            Effect.succeed({ ok: false as const, content: '' }),
          ),
        )

        if (fileContentResult.ok) {
          lines = fileContentResult.content.split('\n')
        } else {
          // Logged once per unreadable file; later hits reuse the cached null
          yield* Effect.logWarning(
            `Skipping content load (cannot read): ${result.documentPath}`,
          )
          lines = null
        }
        linesByFile.set(result.documentPath, lines)
      }

      if (lines === null) {
        resultsWithContent.push(result)
        continue
      }

      // startLine is 1-based and endLine is inclusive
      const content = lines
        .slice(section.startLine - 1, section.endLine)
        .join('\n')

      resultsWithContent.push({
        ...result,
        content,
      })
    }

    return resultsWithContent
  })
1186
+
1187
+ // ============================================================================
1188
+ // Get Embedding Stats
1189
+ // ============================================================================
1190
+
1191
/**
 * Summary of the embeddings stored for a root directory, as returned by
 * `getEmbeddingStats`.
 */
export interface EmbeddingStats {
  /** False when no active namespace exists or its vector store could not be loaded. */
  readonly hasEmbeddings: boolean
  /** Number of stored vectors (0 when there are no embeddings). */
  readonly count: number
  /** Provider recorded in the store; `'none'` when there are no embeddings. */
  readonly provider: string
  /** Model recorded in the store, if any. */
  readonly model?: string | undefined
  /** Vector dimensions (0 when there are no embeddings). */
  readonly dimensions: number
  /** Accumulated embedding cost recorded in the store. */
  readonly totalCost: number
  /** Accumulated token usage recorded in the store. */
  readonly totalTokens: number
}
1200
+
1201
/**
 * Report statistics for the embeddings stored under the active namespace.
 *
 * When the active namespace cannot be resolved, or the namespaced vector
 * store fails to load, the result reports `hasEmbeddings: false` with
 * provider `'none'` and zeroed counters instead of failing.
 *
 * @param rootPath - Root directory containing embeddings
 * @returns Embedding statistics (count, provider, costs)
 *
 * @throws VectorStoreError - Cannot load vector index metadata
 */
export const getEmbeddingStats = (
  rootPath: string,
): Effect.Effect<EmbeddingStats, VectorStoreError> =>
  Effect.gen(function* () {
    const root = path.resolve(rootPath)

    // Shared "nothing stored" answer for both early exits below
    const noEmbeddings = (): EmbeddingStats => ({
      hasEmbeddings: false,
      count: 0,
      provider: 'none',
      dimensions: 0,
      totalCost: 0,
      totalTokens: 0,
    })

    // The active namespace tells us where the embeddings live; a lookup
    // failure is folded into null
    const active = yield* getActiveNamespace(root).pipe(
      Effect.catchAll(() => Effect.succeed(null as ActiveProvider | null)),
    )
    if (!active) {
      return noEmbeddings()
    }

    // Open the namespaced store; a load failure is folded into "not loaded"
    const store = createNamespacedVectorStore(
      root,
      active.provider,
      active.model,
      active.dimensions,
    )
    const loadOutcome = yield* store
      .load()
      .pipe(
        Effect.catchAll(() =>
          Effect.succeed({ loaded: false } as VectorStoreLoadResult),
        ),
      )
    if (!loadOutcome.loaded) {
      return noEmbeddings()
    }

    const {
      count,
      provider,
      providerModel,
      dimensions,
      totalCost,
      totalTokens,
    } = store.getStats()

    return {
      hasEmbeddings: true,
      count,
      provider: provider || 'openai',
      model: providerModel,
      dimensions,
      totalCost: totalCost || 0,
      totalTokens: totalTokens || 0,
    }
  })