mdcontext 0.0.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (337) hide show
  1. package/.changeset/README.md +28 -0
  2. package/.changeset/config.json +11 -0
  3. package/.claude/settings.local.json +25 -0
  4. package/.github/workflows/ci.yml +83 -0
  5. package/.github/workflows/claude-code-review.yml +44 -0
  6. package/.github/workflows/claude.yml +85 -0
  7. package/.github/workflows/release.yml +113 -0
  8. package/.tldrignore +112 -0
  9. package/BACKLOG.md +338 -0
  10. package/CONTRIBUTING.md +186 -0
  11. package/NOTES/NOTES +44 -0
  12. package/README.md +434 -11
  13. package/biome.json +36 -0
  14. package/cspell.config.yaml +14 -0
  15. package/dist/chunk-23UPXDNL.js +3044 -0
  16. package/dist/chunk-2W7MO2DL.js +1366 -0
  17. package/dist/chunk-3NUAZGMA.js +1689 -0
  18. package/dist/chunk-7TOWB2XB.js +366 -0
  19. package/dist/chunk-7XOTOADQ.js +3065 -0
  20. package/dist/chunk-AH2PDM2K.js +3042 -0
  21. package/dist/chunk-BNXWSZ63.js +3742 -0
  22. package/dist/chunk-BTL5DJVU.js +3222 -0
  23. package/dist/chunk-HDHYG7E4.js +104 -0
  24. package/dist/chunk-HLR4KZBP.js +3234 -0
  25. package/dist/chunk-IP3FRFEB.js +1045 -0
  26. package/dist/chunk-KHU56VDO.js +3042 -0
  27. package/dist/chunk-KRYIFLQR.js +88 -0
  28. package/dist/chunk-LBSDNLEM.js +287 -0
  29. package/dist/chunk-MNTQ7HCP.js +2643 -0
  30. package/dist/chunk-MUJELQQ6.js +1387 -0
  31. package/dist/chunk-MXJGMSLV.js +2199 -0
  32. package/dist/chunk-N6QJGC3Z.js +2636 -0
  33. package/dist/chunk-OBELGBPM.js +1713 -0
  34. package/dist/chunk-OT7R5XTA.js +3192 -0
  35. package/dist/chunk-P7X4RA2T.js +106 -0
  36. package/dist/chunk-PIDUQNC2.js +3185 -0
  37. package/dist/chunk-POGCDIH4.js +3187 -0
  38. package/dist/chunk-PSIEOQGZ.js +3043 -0
  39. package/dist/chunk-PVRT3IHA.js +3238 -0
  40. package/dist/chunk-QNN4TT23.js +1430 -0
  41. package/dist/chunk-RE3R45RJ.js +3042 -0
  42. package/dist/chunk-S7E6TFX6.js +803 -0
  43. package/dist/chunk-SG6GLU4U.js +1378 -0
  44. package/dist/chunk-SJCDV2ST.js +274 -0
  45. package/dist/chunk-SYE5XLF3.js +104 -0
  46. package/dist/chunk-T5VLYBZD.js +103 -0
  47. package/dist/chunk-TOQB7VWU.js +3238 -0
  48. package/dist/chunk-VFNMZ4ZQ.js +3228 -0
  49. package/dist/chunk-VVTGZNBT.js +1629 -0
  50. package/dist/chunk-W7Q4RFEV.js +104 -0
  51. package/dist/chunk-XTYYVRLO.js +3190 -0
  52. package/dist/chunk-Y6MDYVJD.js +3063 -0
  53. package/dist/cli/main.d.ts +1 -0
  54. package/dist/cli/main.js +5458 -0
  55. package/dist/index.d.ts +653 -0
  56. package/dist/index.js +79 -0
  57. package/dist/mcp/server.d.ts +1 -0
  58. package/dist/mcp/server.js +472 -0
  59. package/dist/schema-BAWSG7KY.js +22 -0
  60. package/dist/schema-E3QUPL26.js +20 -0
  61. package/dist/schema-EHL7WUT6.js +20 -0
  62. package/docs/019-USAGE.md +625 -0
  63. package/docs/020-current-implementation.md +364 -0
  64. package/docs/021-DOGFOODING-FINDINGS.md +175 -0
  65. package/docs/BACKLOG.md +80 -0
  66. package/docs/CONFIG.md +1123 -0
  67. package/docs/DESIGN.md +439 -0
  68. package/docs/ERRORS.md +383 -0
  69. package/docs/PROJECT.md +88 -0
  70. package/docs/ROADMAP.md +407 -0
  71. package/docs/summarization.md +320 -0
  72. package/docs/test-links.md +9 -0
  73. package/justfile +40 -0
  74. package/package.json +74 -9
  75. package/pnpm-workspace.yaml +5 -0
  76. package/research/INDEX.md +315 -0
  77. package/research/code-review/README.md +90 -0
  78. package/research/code-review/cli-error-handling-review.md +979 -0
  79. package/research/code-review/code-review-validation-report.md +464 -0
  80. package/research/code-review/main-ts-review.md +1128 -0
  81. package/research/config-analysis/01-current-implementation.md +470 -0
  82. package/research/config-analysis/02-strategy-recommendation.md +428 -0
  83. package/research/config-analysis/03-task-candidates.md +715 -0
  84. package/research/config-analysis/033-research-configuration-management.md +828 -0
  85. package/research/config-analysis/034-research-effect-cli-config.md +1504 -0
  86. package/research/config-analysis/04-consolidated-task-candidates.md +277 -0
  87. package/research/config-docs/SUMMARY.md +357 -0
  88. package/research/config-docs/TEST-RESULTS.md +776 -0
  89. package/research/config-docs/TODO.md +542 -0
  90. package/research/config-docs/analysis.md +744 -0
  91. package/research/config-docs/fix-validation.md +502 -0
  92. package/research/config-docs/help-audit.md +264 -0
  93. package/research/config-docs/help-system-analysis.md +890 -0
  94. package/research/dogfood/consolidated-tool-evaluation.md +373 -0
  95. package/research/dogfood/strategy-a/a-synthesis.md +184 -0
  96. package/research/dogfood/strategy-a/a1-docs.md +226 -0
  97. package/research/dogfood/strategy-a/a2-amorphic.md +156 -0
  98. package/research/dogfood/strategy-a/a3-llm.md +164 -0
  99. package/research/dogfood/strategy-b/b-synthesis.md +228 -0
  100. package/research/dogfood/strategy-b/b1-architecture.md +207 -0
  101. package/research/dogfood/strategy-b/b2-gaps.md +258 -0
  102. package/research/dogfood/strategy-b/b3-workflows.md +250 -0
  103. package/research/dogfood/strategy-c/c-synthesis.md +451 -0
  104. package/research/dogfood/strategy-c/c1-explorer.md +192 -0
  105. package/research/dogfood/strategy-c/c2-diver-memory.md +145 -0
  106. package/research/dogfood/strategy-c/c3-diver-control.md +148 -0
  107. package/research/dogfood/strategy-c/c4-diver-failure.md +151 -0
  108. package/research/dogfood/strategy-c/c5-diver-execution.md +221 -0
  109. package/research/dogfood/strategy-c/c6-diver-org.md +221 -0
  110. package/research/effect-cli-error-handling.md +845 -0
  111. package/research/effect-errors-as-values.md +943 -0
  112. package/research/errors-task-analysis/00-consolidated-tasks.md +207 -0
  113. package/research/errors-task-analysis/cli-commands-analysis.md +909 -0
  114. package/research/errors-task-analysis/embeddings-analysis.md +709 -0
  115. package/research/errors-task-analysis/index-search-analysis.md +812 -0
  116. package/research/frontmatter/COMMENTS-ARE-SKIPPED.md +149 -0
  117. package/research/frontmatter/LLM-CODE-NAVIGATION.md +276 -0
  118. package/research/issue-review.md +603 -0
  119. package/research/llm-summarization/agent-cli-tools-2026.md +1082 -0
  120. package/research/llm-summarization/alternative-providers-2026.md +1428 -0
  121. package/research/llm-summarization/anthropic-2026.md +367 -0
  122. package/research/llm-summarization/claude-cli-integration.md +1706 -0
  123. package/research/llm-summarization/cli-integration-patterns.md +3155 -0
  124. package/research/llm-summarization/openai-2026.md +473 -0
  125. package/research/llm-summarization/openai-compatible-providers-2026.md +1022 -0
  126. package/research/llm-summarization/opencode-cli-integration.md +1552 -0
  127. package/research/llm-summarization/prompt-engineering-2026.md +1426 -0
  128. package/research/llm-summarization/prototype-results.md +56 -0
  129. package/research/llm-summarization/provider-switching-patterns-2026.md +2153 -0
  130. package/research/llm-summarization/typescript-llm-libraries-2026.md +2436 -0
  131. package/research/mdcontext-error-analysis.md +521 -0
  132. package/research/mdcontext-pudding/00-EXECUTIVE-SUMMARY.md +282 -0
  133. package/research/mdcontext-pudding/01-index-embed.md +956 -0
  134. package/research/mdcontext-pudding/02-search-COMMANDS.md +142 -0
  135. package/research/mdcontext-pudding/02-search-SUMMARY.md +146 -0
  136. package/research/mdcontext-pudding/02-search.md +970 -0
  137. package/research/mdcontext-pudding/03-context.md +779 -0
  138. package/research/mdcontext-pudding/04-navigation-and-analytics.md +803 -0
  139. package/research/mdcontext-pudding/04-tree.md +704 -0
  140. package/research/mdcontext-pudding/05-config.md +1038 -0
  141. package/research/mdcontext-pudding/06-links-summary.txt +87 -0
  142. package/research/mdcontext-pudding/06-links.md +679 -0
  143. package/research/mdcontext-pudding/07-stats.md +693 -0
  144. package/research/mdcontext-pudding/BUG-FIX-PLAN.md +388 -0
  145. package/research/mdcontext-pudding/P0-BUG-VALIDATION.md +167 -0
  146. package/research/mdcontext-pudding/README.md +168 -0
  147. package/research/mdcontext-pudding/TESTING-SUMMARY.md +128 -0
  148. package/research/npm_publish/011-npm-workflow-research-agent2.md +792 -0
  149. package/research/npm_publish/012-npm-workflow-research-agent1.md +530 -0
  150. package/research/npm_publish/013-npm-workflow-research-agent3.md +722 -0
  151. package/research/npm_publish/014-npm-workflow-synthesis.md +556 -0
  152. package/research/npm_publish/031-npm-workflow-task-analysis.md +134 -0
  153. package/research/research-quality-review.md +834 -0
  154. package/research/semantic-search/002-research-embedding-models.md +490 -0
  155. package/research/semantic-search/003-research-rag-alternatives.md +523 -0
  156. package/research/semantic-search/004-research-vector-search.md +841 -0
  157. package/research/semantic-search/032-research-semantic-search.md +427 -0
  158. package/research/semantic-search/embedding-text-analysis.md +156 -0
  159. package/research/semantic-search/multi-word-failure-reproduction.md +171 -0
  160. package/research/semantic-search/query-processing-analysis.md +207 -0
  161. package/research/semantic-search/root-cause-and-solution.md +114 -0
  162. package/research/semantic-search/threshold-validation-report.md +69 -0
  163. package/research/semantic-search/vector-search-analysis.md +63 -0
  164. package/research/task-management-2026/00-synthesis-recommendations.md +295 -0
  165. package/research/task-management-2026/01-ai-workflow-tools.md +416 -0
  166. package/research/task-management-2026/02-agent-framework-patterns.md +476 -0
  167. package/research/task-management-2026/03-lightweight-file-based.md +567 -0
  168. package/research/task-management-2026/04-established-tools-ai-features.md +541 -0
  169. package/research/task-management-2026/linear/01-core-features-workflow.md +771 -0
  170. package/research/task-management-2026/linear/02-api-integrations.md +930 -0
  171. package/research/task-management-2026/linear/03-ai-features.md +368 -0
  172. package/research/task-management-2026/linear/04-pricing-setup.md +205 -0
  173. package/research/task-management-2026/linear/05-usage-patterns-best-practices.md +605 -0
  174. package/research/test-path-issues.md +276 -0
  175. package/review/ALP-76/1-error-type-design.md +962 -0
  176. package/review/ALP-76/2-error-handling-patterns.md +906 -0
  177. package/review/ALP-76/3-error-presentation.md +624 -0
  178. package/review/ALP-76/4-test-coverage.md +625 -0
  179. package/review/ALP-76/5-migration-completeness.md +440 -0
  180. package/review/ALP-76/6-effect-best-practices.md +755 -0
  181. package/scripts/apply-branch-protection.sh +47 -0
  182. package/scripts/branch-protection-templates.json +79 -0
  183. package/scripts/prototype-summarization.ts +346 -0
  184. package/scripts/rebuild-hnswlib.js +58 -0
  185. package/scripts/setup-branch-protection.sh +64 -0
  186. package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/active-provider.json +7 -0
  187. package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/bm25.json +541 -0
  188. package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/bm25.meta.json +5 -0
  189. package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/config.json +8 -0
  190. package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/embeddings/openai_text-embedding-3-small_512/vectors.bin +0 -0
  191. package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/embeddings/openai_text-embedding-3-small_512/vectors.meta.bin +0 -0
  192. package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/indexes/documents.json +60 -0
  193. package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/indexes/links.json +13 -0
  194. package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/indexes/sections.json +1197 -0
  195. package/src/__tests__/fixtures/semantic-search/multi-word-corpus/configuration-management.md +99 -0
  196. package/src/__tests__/fixtures/semantic-search/multi-word-corpus/distributed-systems.md +92 -0
  197. package/src/__tests__/fixtures/semantic-search/multi-word-corpus/error-handling.md +78 -0
  198. package/src/__tests__/fixtures/semantic-search/multi-word-corpus/failure-automation.md +55 -0
  199. package/src/__tests__/fixtures/semantic-search/multi-word-corpus/job-context.md +69 -0
  200. package/src/__tests__/fixtures/semantic-search/multi-word-corpus/process-orchestration.md +99 -0
  201. package/src/cli/argv-preprocessor.test.ts +210 -0
  202. package/src/cli/argv-preprocessor.ts +202 -0
  203. package/src/cli/cli.test.ts +627 -0
  204. package/src/cli/commands/backlinks.ts +54 -0
  205. package/src/cli/commands/config-cmd.ts +642 -0
  206. package/src/cli/commands/context.ts +285 -0
  207. package/src/cli/commands/duplicates.ts +122 -0
  208. package/src/cli/commands/embeddings.ts +529 -0
  209. package/src/cli/commands/index-cmd.ts +480 -0
  210. package/src/cli/commands/index.ts +16 -0
  211. package/src/cli/commands/links.ts +52 -0
  212. package/src/cli/commands/search.ts +1281 -0
  213. package/src/cli/commands/stats.ts +149 -0
  214. package/src/cli/commands/tree.ts +128 -0
  215. package/src/cli/config-layer.ts +176 -0
  216. package/src/cli/error-handler.test.ts +235 -0
  217. package/src/cli/error-handler.ts +655 -0
  218. package/src/cli/flag-schemas.ts +341 -0
  219. package/src/cli/help.ts +588 -0
  220. package/src/cli/index.ts +9 -0
  221. package/src/cli/main.ts +435 -0
  222. package/src/cli/options.ts +41 -0
  223. package/src/cli/shared-error-handling.ts +199 -0
  224. package/src/cli/typo-suggester.test.ts +105 -0
  225. package/src/cli/typo-suggester.ts +130 -0
  226. package/src/cli/utils.ts +259 -0
  227. package/src/config/file-provider.test.ts +320 -0
  228. package/src/config/file-provider.ts +273 -0
  229. package/src/config/index.ts +72 -0
  230. package/src/config/integration.test.ts +667 -0
  231. package/src/config/precedence.test.ts +277 -0
  232. package/src/config/precedence.ts +451 -0
  233. package/src/config/schema.test.ts +414 -0
  234. package/src/config/schema.ts +603 -0
  235. package/src/config/service.test.ts +320 -0
  236. package/src/config/service.ts +243 -0
  237. package/src/config/testing.test.ts +264 -0
  238. package/src/config/testing.ts +110 -0
  239. package/src/core/index.ts +1 -0
  240. package/src/core/types.ts +113 -0
  241. package/src/duplicates/detector.test.ts +183 -0
  242. package/src/duplicates/detector.ts +414 -0
  243. package/src/duplicates/index.ts +18 -0
  244. package/src/embeddings/embedding-namespace.test.ts +300 -0
  245. package/src/embeddings/embedding-namespace.ts +947 -0
  246. package/src/embeddings/heading-boost.test.ts +222 -0
  247. package/src/embeddings/hnsw-build-options.test.ts +198 -0
  248. package/src/embeddings/hyde.test.ts +272 -0
  249. package/src/embeddings/hyde.ts +264 -0
  250. package/src/embeddings/index.ts +10 -0
  251. package/src/embeddings/openai-provider.ts +414 -0
  252. package/src/embeddings/pricing.json +22 -0
  253. package/src/embeddings/provider-constants.ts +204 -0
  254. package/src/embeddings/provider-errors.test.ts +967 -0
  255. package/src/embeddings/provider-errors.ts +565 -0
  256. package/src/embeddings/provider-factory.test.ts +240 -0
  257. package/src/embeddings/provider-factory.ts +225 -0
  258. package/src/embeddings/provider-integration.test.ts +788 -0
  259. package/src/embeddings/query-preprocessing.test.ts +187 -0
  260. package/src/embeddings/semantic-search-threshold.test.ts +508 -0
  261. package/src/embeddings/semantic-search.ts +1270 -0
  262. package/src/embeddings/types.ts +359 -0
  263. package/src/embeddings/vector-store.ts +708 -0
  264. package/src/embeddings/voyage-provider.ts +313 -0
  265. package/src/errors/errors.test.ts +845 -0
  266. package/src/errors/index.ts +533 -0
  267. package/src/index/ignore-patterns.test.ts +354 -0
  268. package/src/index/ignore-patterns.ts +305 -0
  269. package/src/index/index.ts +4 -0
  270. package/src/index/indexer.ts +684 -0
  271. package/src/index/storage.ts +260 -0
  272. package/src/index/types.ts +147 -0
  273. package/src/index/watcher.ts +189 -0
  274. package/src/index.ts +30 -0
  275. package/src/integration/search-keyword.test.ts +678 -0
  276. package/src/mcp/server.ts +612 -0
  277. package/src/parser/index.ts +1 -0
  278. package/src/parser/parser.test.ts +291 -0
  279. package/src/parser/parser.ts +394 -0
  280. package/src/parser/section-filter.test.ts +277 -0
  281. package/src/parser/section-filter.ts +392 -0
  282. package/src/search/__tests__/hybrid-search.test.ts +650 -0
  283. package/src/search/bm25-store.ts +366 -0
  284. package/src/search/cross-encoder.test.ts +253 -0
  285. package/src/search/cross-encoder.ts +406 -0
  286. package/src/search/fuzzy-search.test.ts +419 -0
  287. package/src/search/fuzzy-search.ts +273 -0
  288. package/src/search/hybrid-search.ts +448 -0
  289. package/src/search/path-matcher.test.ts +276 -0
  290. package/src/search/path-matcher.ts +33 -0
  291. package/src/search/query-parser.test.ts +260 -0
  292. package/src/search/query-parser.ts +319 -0
  293. package/src/search/searcher.test.ts +280 -0
  294. package/src/search/searcher.ts +724 -0
  295. package/src/search/wink-bm25.d.ts +30 -0
  296. package/src/summarization/cli-providers/claude.ts +202 -0
  297. package/src/summarization/cli-providers/detection.test.ts +273 -0
  298. package/src/summarization/cli-providers/detection.ts +118 -0
  299. package/src/summarization/cli-providers/index.ts +8 -0
  300. package/src/summarization/cost.test.ts +139 -0
  301. package/src/summarization/cost.ts +102 -0
  302. package/src/summarization/error-handler.test.ts +127 -0
  303. package/src/summarization/error-handler.ts +111 -0
  304. package/src/summarization/index.ts +102 -0
  305. package/src/summarization/pipeline.test.ts +498 -0
  306. package/src/summarization/pipeline.ts +231 -0
  307. package/src/summarization/prompts.test.ts +269 -0
  308. package/src/summarization/prompts.ts +133 -0
  309. package/src/summarization/provider-factory.test.ts +396 -0
  310. package/src/summarization/provider-factory.ts +178 -0
  311. package/src/summarization/types.ts +184 -0
  312. package/src/summarize/budget-bugs.test.ts +620 -0
  313. package/src/summarize/formatters.ts +419 -0
  314. package/src/summarize/index.ts +20 -0
  315. package/src/summarize/summarizer.test.ts +275 -0
  316. package/src/summarize/summarizer.ts +597 -0
  317. package/src/summarize/verify-bugs.test.ts +238 -0
  318. package/src/types/huggingface-transformers.d.ts +66 -0
  319. package/src/utils/index.ts +1 -0
  320. package/src/utils/tokens.test.ts +142 -0
  321. package/src/utils/tokens.ts +186 -0
  322. package/tests/fixtures/cli/.mdcontext/active-provider.json +7 -0
  323. package/tests/fixtures/cli/.mdcontext/config.json +8 -0
  324. package/tests/fixtures/cli/.mdcontext/embeddings/openai_text-embedding-3-small_512/vectors.bin +0 -0
  325. package/tests/fixtures/cli/.mdcontext/embeddings/openai_text-embedding-3-small_512/vectors.meta.bin +0 -0
  326. package/tests/fixtures/cli/.mdcontext/indexes/documents.json +33 -0
  327. package/tests/fixtures/cli/.mdcontext/indexes/links.json +12 -0
  328. package/tests/fixtures/cli/.mdcontext/indexes/sections.json +247 -0
  329. package/tests/fixtures/cli/README.md +9 -0
  330. package/tests/fixtures/cli/api-reference.md +11 -0
  331. package/tests/fixtures/cli/getting-started.md +11 -0
  332. package/tests/integration/embed-index.test.ts +712 -0
  333. package/tests/integration/search-context.test.ts +469 -0
  334. package/tests/integration/search-semantic.test.ts +522 -0
  335. package/tsconfig.json +26 -0
  336. package/vitest.config.ts +16 -0
  337. package/vitest.setup.ts +12 -0
@@ -0,0 +1,364 @@
1
+ # mdcontext Semantic Search: Current Implementation
2
+
3
+ This document describes the current semantic search implementation in mdcontext, covering architecture, components, data flow, and known limitations.
4
+
5
+ ## Overview
6
+
7
+ mdcontext provides semantic search capabilities that allow users to search markdown documentation by meaning rather than exact text matching. The system uses OpenAI's text-embedding-3-small model to generate vector embeddings and HNSW (Hierarchical Navigable Small World) for approximate nearest neighbor search.
8
+
9
+ ## Architecture
10
+
11
+ ```
12
+ ┌─────────────────────────────────────────────────────────────────────────┐
13
+ │ CLI Layer │
14
+ │ src/cli/commands/search.ts │
15
+ │ - Mode detection (semantic vs keyword) │
16
+ │ - Auto-index prompt for missing embeddings │
17
+ │ - Result formatting and display │
18
+ └──────────────────────────────┬──────────────────────────────────────────┘
19
+
20
+ ┌──────────────────────────────▼──────────────────────────────────────────┐
21
+ │ Semantic Search Layer │
22
+ │ src/embeddings/semantic-search.ts │
23
+ │ - Cost estimation (estimateEmbeddingCost) │
24
+ │ - Embedding generation (buildEmbeddings) │
25
+ │ - Query execution (semanticSearch, semanticSearchWithContent) │
26
+ │ - Statistics (getEmbeddingStats) │
27
+ └─────────────┬─────────────────────────────────────┬─────────────────────┘
28
+ │ │
29
+ ┌─────────────▼───────────────┐ ┌───────────────▼─────────────────────┐
30
+ │ Embedding Provider │ │ Vector Store │
31
+ │ src/embeddings/ │ │ src/embeddings/vector-store.ts │
32
+ │ openai-provider.ts │ │ - HNSW index (hnswlib-node) │
33
+ │ - OpenAI API integration │ │ - Cosine similarity search │
34
+ │ - text-embedding-3-small │ │ - Binary index persistence │
35
+ │ - Batch processing (100) │ │ - Metadata JSON storage │
36
+ └──────────────────────────────┘ └─────────────────────────────────────┘
37
+ ```
38
+
39
+ ## Components
40
+
41
+ ### 1. Embedding Provider (`src/embeddings/openai-provider.ts`)
42
+
43
+ **Current Provider**: OpenAI `text-embedding-3-small`
44
+
45
+ | Property | Value |
46
+ | ---------- | ------------------------ |
47
+ | Model | `text-embedding-3-small` |
48
+ | Dimensions | 1536 |
49
+ | Batch Size | 100 texts per API call |
50
+ | Cost | $0.02 per 1M tokens |
51
+
52
+ **Interface**:
53
+
54
+ ```typescript
55
+ interface EmbeddingProvider {
56
+ readonly name: string; // e.g., "openai:text-embedding-3-small"
57
+ readonly dimensions: number; // 1536 for small, 3072 for large
58
+ embed(texts: string[]): Promise<EmbeddingResult>;
59
+ }
60
+
61
+ interface EmbeddingResult {
62
+ readonly embeddings: readonly number[][];
63
+ readonly tokensUsed: number;
64
+ readonly cost: number;
65
+ }
66
+ ```
67
+
68
+ **Supported Models**:
69
+
70
+ - `text-embedding-3-small` (default): 1536 dimensions, $0.02/1M tokens
71
+ - `text-embedding-3-large`: 3072 dimensions, $0.13/1M tokens
72
+ - `text-embedding-ada-002` (legacy): 1536 dimensions, $0.10/1M tokens
73
+
74
+ ### 2. Vector Store (`src/embeddings/vector-store.ts`)
75
+
76
+ **Implementation**: HNSW via `hnswlib-node`
77
+
78
+ | Parameter | Value | Description |
79
+ | ---------------- | -------- | -------------------------------------------- |
80
+ | Space | `cosine` | Cosine similarity distance metric |
81
+ | Initial Capacity | 10,000 | Auto-resizes by 2x when full |
82
+ | M | 16 | Max connections per node (default) |
83
+ | efConstruction | 200 | Construction-time search width |
84
+ | efSearch | 100 | Query-time search width (implicit from init) |
85
+
86
+ **Storage Format**:
87
+
88
+ - `vectors.bin`: Binary HNSW index file
89
+ - `vectors.meta.json`: Metadata including entries, costs, timestamps
90
+
91
+ **Vector Entry Structure**:
92
+
93
+ ```typescript
94
+ interface VectorEntry {
95
+ readonly id: string; // Section ID
96
+ readonly sectionId: string; // Same as id
97
+ readonly documentPath: string; // Relative path to document
98
+ readonly heading: string; // Section heading text
99
+ readonly embedding: readonly number[]; // 1536-dimensional vector
100
+ }
101
+ ```
102
+
103
+ **Similarity Calculation**:
104
+
105
+ - HNSW stores cosine distance (1 - similarity)
106
+ - Search returns `similarity = 1 - distance`
107
+ - Results filtered by threshold (default: 0.35)
108
+
109
+ ### 3. Semantic Search (`src/embeddings/semantic-search.ts`)
110
+
111
+ **Text Generation for Embeddings**:
112
+
113
+ Each section is embedded with contextual metadata:
114
+
115
+ ```
116
+ # {heading}
117
+ Parent section: {parentHeading} // if nested
118
+ Document: {documentTitle}
119
+
120
+ {full section content}
121
+ ```
122
+
123
+ **Filtering**:
124
+
125
+ - Sections with < 10 tokens are skipped
126
+ - Exclude patterns can filter by document path
127
+
128
+ **Search Flow**:
129
+
130
+ 1. Load vector store from disk
131
+ 2. Embed query using same provider
132
+ 3. kNN search with limit \* 2 (over-fetch for filtering)
133
+ 4. Apply path pattern filter if specified
134
+ 5. Return top `limit` results above threshold
135
+
136
+ ### 4. CLI Search Command (`src/cli/commands/search.ts`)
137
+
138
+ **Mode Detection Priority**:
139
+
140
+ 1. `--mode semantic` or `--mode keyword` (explicit)
141
+ 2. `--keyword` flag (force keyword)
142
+ 3. Boolean/phrase pattern detected (`AND`, `OR`, `NOT`, `"quoted"`)
143
+ 4. Regex pattern detected (special characters)
144
+ 5. Embeddings available → semantic
145
+ 6. No embeddings → keyword
146
+
147
+ **Auto-Index Behavior**:
148
+
149
+ - If semantic mode requested but no embeddings exist:
150
+ - Estimate time/cost
151
+ - If < 10 seconds: auto-create silently
152
+ - Otherwise: prompt user for choice
153
+
154
+ **Default Search Threshold**: 0.35 (raised from 0.3 to filter low-quality matches)
155
+
156
+ ## Data Flow
157
+
158
+ ### Building Embeddings
159
+
160
+ ```
161
+ ┌─────────────────────────────────────────────────────────────────────────┐
162
+ │ 1. Load Indexes │
163
+ │ - documents.json (document metadata) │
164
+ │ - sections.json (section index with line numbers) │
165
+ └──────────────────────────────┬──────────────────────────────────────────┘
166
+
167
+ ┌─────────────────────────────────────────────────────────────────────────┐
168
+ │ 2. Group Sections by Document │
169
+ │ - Skip sections < 10 tokens │
170
+ │ - Apply exclude patterns │
171
+ │ - Track parent headings for context │
172
+ └──────────────────────────────┬──────────────────────────────────────────┘
173
+
174
+ ┌─────────────────────────────────────────────────────────────────────────┐
175
+ │ 3. Read File Content │
176
+ │ - For each document, read file │
177
+ │ - Extract section content by line numbers │
178
+ │ - Generate embedding text with metadata │
179
+ └──────────────────────────────┬──────────────────────────────────────────┘
180
+
181
+ ┌─────────────────────────────────────────────────────────────────────────┐
182
+ │ 4. Generate Embeddings │
183
+ │ - Send all texts to OpenAI API (batched by 100) │
184
+ │ - Track token usage and cost │
185
+ └──────────────────────────────┬──────────────────────────────────────────┘
186
+
187
+ ┌─────────────────────────────────────────────────────────────────────────┐
188
+ │ 5. Build HNSW Index │
189
+ │ - Add vectors with sequential integer IDs │
190
+ │ - Map section IDs to index positions │
191
+ └──────────────────────────────┬──────────────────────────────────────────┘
192
+
193
+ ┌─────────────────────────────────────────────────────────────────────────┐
194
+ │ 6. Persist to Disk │
195
+ │ - vectors.bin (HNSW binary) │
196
+ │ - vectors.meta.json (metadata + entries) │
197
+ └─────────────────────────────────────────────────────────────────────────┘
198
+ ```
199
+
200
+ ### Query Execution
201
+
202
+ ```
203
+ ┌─────────────────────────────────────────────────────────────────────────┐
204
+ │ 1. Query Input │
205
+ │ "How do I configure authentication?" │
206
+ └──────────────────────────────┬──────────────────────────────────────────┘
207
+
208
+ ┌─────────────────────────────────────────────────────────────────────────┐
209
+ │ 2. Load Vector Store │
210
+ │ - Read vectors.bin into HNSW index │
211
+ │ - Load metadata from vectors.meta.json │
212
+ └──────────────────────────────┬──────────────────────────────────────────┘
213
+
214
+ ┌─────────────────────────────────────────────────────────────────────────┐
215
+ │ 3. Embed Query │
216
+ │ - Single API call to OpenAI │
217
+ │ - Returns 1536-dim vector │
218
+ └──────────────────────────────┬──────────────────────────────────────────┘
219
+
220
+ ┌─────────────────────────────────────────────────────────────────────────┐
221
+ │ 4. HNSW kNN Search │
222
+ │ - Find k nearest neighbors (cosine similarity) │
223
+ │ - Over-fetch: request limit * 2 │
224
+ └──────────────────────────────┬──────────────────────────────────────────┘
225
+
226
+ ┌─────────────────────────────────────────────────────────────────────────┐
227
+ │ 5. Post-Processing │
228
+ │ - Filter by similarity threshold (default: 0.35) │
229
+ │ - Filter by path pattern (if specified) │
230
+ │ - Truncate to requested limit │
231
+ └──────────────────────────────┬──────────────────────────────────────────┘
232
+
233
+ ┌─────────────────────────────────────────────────────────────────────────┐
234
+ │ 6. Return Results │
235
+ │ [{sectionId, documentPath, heading, similarity}, ...] │
236
+ └─────────────────────────────────────────────────────────────────────────┘
237
+ ```
238
+
239
+ ## Storage Files
240
+
241
+ Located in the `.mdcontext/` directory:
242
+
243
+ | File | Format | Contents |
244
+ | ------------------- | ------ | ---------------------------------------------- |
245
+ | `vectors.bin` | Binary | HNSW index (hnswlib native format) |
246
+ | `vectors.meta.json` | JSON | Entry metadata, costs, timestamps |
247
+ | `documents.json` | JSON | Document index (title, path, stats) |
248
+ | `sections.json` | JSON | Section index (headings, line numbers, tokens) |
249
+
250
+ ## Current Limitations and Gaps
251
+
252
+ ### 1. ~~Single Provider Lock-in~~ RESOLVED (ALP-215)
253
+
254
+ - **RESOLVED**: Multiple embedding providers are now supported (OpenAI, Ollama, LM Studio, OpenRouter)
255
+ - **Impact**: Users can choose local providers for offline capability and cost savings
256
+ - **Code Location**: `provider-factory.ts` handles provider selection based on config
257
+
258
+ ### 2. No Incremental Updates
259
+
260
+ - **Issue**: `buildEmbeddings` either builds all or skips entirely
261
+ - **Impact**: Adding one document requires re-embedding everything (with `--force`)
262
+ - **Workaround**: Cache-hit detection skips the build entirely if any embeddings already exist
263
+
264
+ ### 3. Fixed HNSW Parameters
265
+
266
+ - **Issue**: HNSW parameters (M=16, efConstruction=200) are hardcoded
267
+ - **Impact**: No tuning for different corpus sizes or quality/speed tradeoffs
268
+ - **Code Location**: `vector-store.ts:94`
269
+
270
+ ### 4. No Hybrid Search
271
+
272
+ - **Issue**: Semantic and keyword search are mutually exclusive
273
+ - **Impact**: Can't combine exact matches with semantic similarity
274
+ - **Workaround**: Mode auto-detection helps, but no fusion ranking
275
+
276
+ ### 5. No Re-ranking
277
+
278
+ - **Issue**: Results are pure cosine similarity, no re-ranking
279
+ - **Impact**: May miss contextually relevant results that rank lower in embedding space
280
+ - **Alternative**: Cross-encoder re-ranking could improve precision
281
+
282
+ ### 6. Section-Level Granularity Only
283
+
284
+ - **Issue**: Embeddings are per-section, no paragraph or sentence chunking
285
+ - **Impact**: Large sections may have diluted embeddings; queries may match subsections better
286
+ - **Tradeoff**: Current approach preserves document structure
287
+
288
+ ### 7. No Query Expansion
289
+
290
+ - **Issue**: Queries are embedded as-is
291
+ - **Impact**: Synonyms, abbreviations, and related terms may not match
292
+ - **Opportunity**: HyDE or query reformulation could help
293
+
294
+ ### 8. Limited Metadata Filtering
295
+
296
+ - **Issue**: Only path pattern filtering supported
297
+ - **Impact**: Can't filter by date, author, tags, or other metadata
298
+ - **Code Location**: `semanticSearch` has `pathPattern` option only
299
+
300
+ ### 9. No Batch Query Support
301
+
302
+ - **Issue**: Each search embeds query individually
303
+ - **Impact**: Multiple searches incur repeated API calls
304
+ - **Opportunity**: Query batching could reduce latency
305
+
306
+ ### 10. Memory Usage
307
+
308
+ - **Issue**: Entire HNSW index loaded into memory
309
+ - **Impact**: Large corpora may hit memory limits
310
+ - **Note**: Not a problem for typical documentation sizes
311
+
312
+ ## Cost Analysis
313
+
314
+ For a typical documentation corpus (~1000 sections, ~500K tokens):
315
+
316
+ | Operation | Tokens | Cost |
317
+ | ----------------- | ------- | ---------- |
318
+ | Initial embedding | ~500K | ~$0.01 |
319
+ | Per query | ~50-100 | ~$0.000002 |
320
+
321
+ The cost is dominated by initial embedding creation. Query costs are negligible.
322
+
323
+ ## Performance Characteristics
324
+
325
+ | Metric | Typical Value |
326
+ | --------------- | ------------------------------ |
327
+ | Embedding build | ~1.5s per 100 sections |
328
+ | Query latency | ~200-500ms (API call dominant) |
329
+ | Index load time | ~50-100ms for 1000 vectors |
330
+ | Memory usage | ~10MB per 1000 vectors |
331
+
332
+ ## Configuration
333
+
334
+ Current configuration is largely hardcoded. Key values:
335
+
336
+ ```typescript
337
+ // Embedding
338
+ model: 'text-embedding-3-small' // openai-provider.ts
339
+ batchSize: 100 // openai-provider.ts
340
+ minTokens: 10 // semantic-search.ts (skip small sections)
341
+
342
+ // Vector store
343
+ space: 'cosine' // vector-store.ts
344
+ initialCapacity: 10000 // vector-store.ts
345
+ M: 16 // vector-store.ts
346
+ efConstruction: 200 // vector-store.ts
347
+
348
+ // Search
349
+ defaultLimit: 10 // search.ts
350
+ defaultThreshold: 0.35 // search.ts
351
+ autoIndexThreshold: 10 seconds // search.ts
352
+ ```
353
+
354
+ ## Type Definitions
355
+
356
+ Full type definitions are in `src/embeddings/types.ts`:
357
+
358
+ - `EmbeddingProvider`: Provider interface
359
+ - `EmbeddingResult`: Embed response
360
+ - `VectorEntry`: Stored vector with metadata
361
+ - `VectorIndex`: Full index schema
362
+ - `SemanticSearchOptions`: Search parameters
363
+ - `SemanticSearchResult`: Search result item
364
+ - `EmbedError`: Error types (RateLimit, ApiKey, Network, Unknown)
@@ -0,0 +1,175 @@
1
+ # mdcontext Dogfooding Findings
2
+
3
+ **Date:** 2025-01-19
4
+ **Method:** 6 autonomous agents explored documentation directories using only mdcontext CLI
5
+ **Target directories:** `./docs`, `./doc.llm`, `./docs.amorphic` (in `/Users/alphab/Dev/LLM/DEV/TMP/ralph`)
6
+
7
+ ---
8
+
9
+ ## Executive Summary
10
+
11
+ **Verdict: YES - mdcontext is useful**, with the `context` command being the standout feature delivering 80-99% token reduction. However, several issues limit the tool's effectiveness for discovery workflows.
12
+
13
+ ---
14
+
15
+ ## What Works Well
16
+
17
+ ### 1. Context Command (Killer Feature)
18
+
19
+ - Consistently achieved 80-99% token reduction across all test directories
20
+ - Multi-file assembly with budget allocation works as designed
21
+ - `--brief` mode effective for quick overviews
22
+ - JSON output useful for programmatic consumption
23
+
24
+ ### 2. Tree Command
25
+
26
+ - Excellent for initial codebase discovery
27
+ - Clean hierarchical output
28
+ - Good starting point before diving into specific files
29
+
30
+ ### 3. Help System
31
+
32
+ - Well-polished with examples
33
+ - Subcommand help (`mdcontext context --help`) informative
34
+ - Agents successfully learned the tool from `--help` alone
35
+
36
+ ---
37
+
38
+ ## Issues Found
39
+
40
+ ### Critical: `index` Command Returns "0 Documents"
41
+
42
+ **Symptom:**
43
+
44
+ ```bash
45
+ mdcontext index ./docs
46
+ # Output: "Indexed: 0 documents"
47
+ ```
48
+
49
+ **Expected:** Should index markdown files in the directory.
50
+
51
+ **Impact:** Breaks the discovery workflow. Users can't find content without a working index.
52
+
53
+ **Frequency:** Inconsistent - sometimes works, sometimes doesn't.
54
+
55
+ ---
56
+
57
+ ### High: `search` Only Matches Headings
58
+
59
+ **Symptom:**
60
+
61
+ ```bash
62
+ mdcontext search "authentication" ./docs
63
+ # Returns: Only matches if "authentication" appears in a heading
64
+ ```
65
+
66
+ **Expected:** Search should find content anywhere in documents.
67
+
68
+ **Impact:** Major limitation for discovery. Users searching for concepts/keywords get no results if those words aren't in headings.
69
+
70
+ **Note:** Full semantic search requires an embedding provider (OpenAI, Ollama, LM Studio, or OpenRouter) + `--embed` flag. See [CONFIG.md](./CONFIG.md) for free local options. Structural search should at least search content.
71
+
72
+ ---
73
+
74
+ ### Medium: Token Budget Sometimes Exceeded
75
+
76
+ **Symptom:**
77
+
78
+ ```bash
79
+ mdcontext context --tokens 500 --brief file.md
80
+ # Output may exceed 500 tokens
81
+ ```
82
+
83
+ **Expected:** Output should respect token budget.
84
+
85
+ **Impact:** Unpredictable context sizes when assembling for LLM consumption.
86
+
87
+ ---
88
+
89
+ ### Low: `stats` Command Minimal Without Embeddings
90
+
91
+ **Symptom:**
92
+
93
+ ```bash
94
+ mdcontext stats ./docs
95
+ # Shows basic counts only
96
+ ```
97
+
98
+ **Impact:** Limited usefulness without embeddings enabled.
99
+
100
+ ---
101
+
102
+ ## Recommendations
103
+
104
+ ### P0 - Fix Index Command
105
+
106
+ The "0 documents" issue breaks the primary discovery workflow. Investigate:
107
+
108
+ - Directory path resolution
109
+ - File extension filtering
110
+ - Silent failures in indexing process
111
+
112
+ ### P1 - Expand Structural Search
113
+
114
+ Make structural search (`--structural` or default without embeddings) search document content, not just headings:
115
+
116
+ - Full-text regex matching
117
+ - Content snippet in results
118
+ - Line number references
119
+
120
+ ### P2 - Enforce Token Budgets
121
+
122
+ Ensure `--tokens` flag is respected:
123
+
124
+ - Truncate output if necessary
125
+ - Warn user if content exceeds budget
126
+ - Consider separate flags for hard vs soft limits
127
+
128
+ ### P3 - Improve Stats Without Embeddings
129
+
130
+ Show useful stats even without embeddings:
131
+
132
+ - Document count, total tokens, avg tokens/doc
133
+ - Section depth analysis
134
+ - File size distribution
135
+
136
+ ---
137
+
138
+ ## Test Matrix
139
+
140
+ | Agent | Directory | Commands Used | Verdict |
141
+ | ------- | --------------- | ---------------------- | ------------------------------- |
142
+ | a3caa1b | ./docs | tree, context, search | YES with caveats |
143
+ | a199309 | ./docs | tree, context, index | YES - context justifies tool |
144
+ | a7857e0 | ./docs | help, context, tree | YES - direction is good |
145
+ | a71de5c | ./doc.llm | index, search, context | Partially - search/index issues |
146
+ | a4ec1e1 | ./docs (ralph) | tree, context, stats | YES - solves real problem |
147
+ | a96ff96 | ./docs.amorphic | tree, context, index | YES - context is valuable |
148
+
149
+ ---
150
+
151
+ ## Raw Findings Summary
152
+
153
+ ### What Agents Tried That Failed
154
+
155
+ 1. `mdcontext index <dir>` → "0 documents"
156
+ 2. `mdcontext search "keyword" <dir>` → No results (keyword in content, not heading)
157
+ 3. `mdcontext stats <dir>` → Minimal output without embeddings
158
+
159
+ ### What Agents Found Valuable
160
+
161
+ 1. `mdcontext context --brief file.md` → Instant useful summary
162
+ 2. `mdcontext tree <dir>` → Quick structure overview
163
+ 3. `mdcontext context file1.md file2.md --tokens 1000` → Multi-file assembly
164
+ 4. `mdcontext --help` / `mdcontext <cmd> --help` → Self-discovery worked
165
+
166
+ ---
167
+
168
+ ## Suggested Task Scope
169
+
170
+ A follow-up task should address:
171
+
172
+ 1. **Index reliability** - Debug why index returns 0 documents
173
+ 2. **Content search** - Extend structural search beyond headings
174
+ 3. **Budget enforcement** - Ensure token limits are respected
175
+ 4. **Error messages** - Surface why operations "silently fail"
@@ -0,0 +1,80 @@
1
+ # Backlog
2
+
3
+ Ideas and improvements to revisit later.
4
+
5
+ ---
6
+
7
+ ## CLI: Schema-Based Argv Preprocessor
8
+
9
+ **Date:** 2025-01-19
10
+ **Priority:** Medium
11
+ **Context:** The current `argv-preprocessor.ts` uses hardcoded flag lists, so it breaks on unknown flags.
12
+
13
+ ### Problem
14
+
15
+ ```bash
16
+ mdcontext context --json docs/*.md --pretty -x 200
17
+ # Error: ENOENT: no such file or directory, open '.../-x'
18
+ ```
19
+
20
+ Unknown flags like `-x` get passed through as positional args (file paths) instead of being rejected with a clear error.
21
+
22
+ ### Proposed Solution
23
+
24
+ Replace the hardcoded `flagsWithValues` set with a schema-based approach:
25
+
26
+ ```typescript
27
+ interface FlagSpec {
28
+ type: "boolean" | "string";
29
+ }
30
+
31
+ const schema: Record<string, FlagSpec> = {
32
+ "--json": { type: "boolean" },
33
+ "--output": { type: "string" },
34
+ };
35
+
36
+ function parse(argv: string[], schema: Record<string, FlagSpec>) {
37
+ const options: Record<string, any> = {};
38
+ const positionals: string[] = [];
39
+
40
+ for (let i = 0; i < argv.length; i++) {
41
+ const arg = argv[i];
42
+ if (arg.startsWith("-")) {
43
+ const spec = schema[arg];
44
+ if (!spec) throw new Error(`Unknown option: ${arg}`);
45
+ if (spec.type === "boolean") {
46
+ options[arg] = true;
47
+ } else {
48
+ const value = argv[i + 1];
49
+ if (!value || value.startsWith("-")) {
50
+ throw new Error(`Missing value for option: ${arg}`);
51
+ }
52
+ options[arg] = value;
53
+ i++;
54
+ }
55
+ } else {
56
+ positionals.push(arg);
57
+ }
58
+ }
59
+ return { options, positionals };
60
+ }
61
+ ```
62
+
63
+ ### Benefits
64
+
65
+ 1. **Clear errors** - "Unknown option: -x" instead of cryptic file errors
66
+ 2. **Single source of truth** - Schema can align with Effect CLI definitions
67
+ 3. **Per-command schemas** - Each command declares its own flags
68
+ 4. **Maintainable** - Adding flags = adding to schema, not hunting through code
69
+
70
+ ### Implementation Notes
71
+
72
+ - Could extract schema from existing Effect CLI option definitions
73
+ - Or define shared schema that both preprocessor and CLI use
74
+ - Consider generating schema from CLI definitions at build time
75
+
76
+ ### Related Files
77
+
78
+ - `src/cli/argv-preprocessor.ts` - Current implementation
79
+ - `src/cli/commands/*.ts` - Effect CLI command definitions
80
+ - `src/cli/options.ts` - Shared options