mdcontext 0.0.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (337) hide show
  1. package/.changeset/README.md +28 -0
  2. package/.changeset/config.json +11 -0
  3. package/.claude/settings.local.json +25 -0
  4. package/.github/workflows/ci.yml +83 -0
  5. package/.github/workflows/claude-code-review.yml +44 -0
  6. package/.github/workflows/claude.yml +85 -0
  7. package/.github/workflows/release.yml +113 -0
  8. package/.tldrignore +112 -0
  9. package/BACKLOG.md +338 -0
  10. package/CONTRIBUTING.md +186 -0
  11. package/NOTES/NOTES +44 -0
  12. package/README.md +434 -11
  13. package/biome.json +36 -0
  14. package/cspell.config.yaml +14 -0
  15. package/dist/chunk-23UPXDNL.js +3044 -0
  16. package/dist/chunk-2W7MO2DL.js +1366 -0
  17. package/dist/chunk-3NUAZGMA.js +1689 -0
  18. package/dist/chunk-7TOWB2XB.js +366 -0
  19. package/dist/chunk-7XOTOADQ.js +3065 -0
  20. package/dist/chunk-AH2PDM2K.js +3042 -0
  21. package/dist/chunk-BNXWSZ63.js +3742 -0
  22. package/dist/chunk-BTL5DJVU.js +3222 -0
  23. package/dist/chunk-HDHYG7E4.js +104 -0
  24. package/dist/chunk-HLR4KZBP.js +3234 -0
  25. package/dist/chunk-IP3FRFEB.js +1045 -0
  26. package/dist/chunk-KHU56VDO.js +3042 -0
  27. package/dist/chunk-KRYIFLQR.js +88 -0
  28. package/dist/chunk-LBSDNLEM.js +287 -0
  29. package/dist/chunk-MNTQ7HCP.js +2643 -0
  30. package/dist/chunk-MUJELQQ6.js +1387 -0
  31. package/dist/chunk-MXJGMSLV.js +2199 -0
  32. package/dist/chunk-N6QJGC3Z.js +2636 -0
  33. package/dist/chunk-OBELGBPM.js +1713 -0
  34. package/dist/chunk-OT7R5XTA.js +3192 -0
  35. package/dist/chunk-P7X4RA2T.js +106 -0
  36. package/dist/chunk-PIDUQNC2.js +3185 -0
  37. package/dist/chunk-POGCDIH4.js +3187 -0
  38. package/dist/chunk-PSIEOQGZ.js +3043 -0
  39. package/dist/chunk-PVRT3IHA.js +3238 -0
  40. package/dist/chunk-QNN4TT23.js +1430 -0
  41. package/dist/chunk-RE3R45RJ.js +3042 -0
  42. package/dist/chunk-S7E6TFX6.js +803 -0
  43. package/dist/chunk-SG6GLU4U.js +1378 -0
  44. package/dist/chunk-SJCDV2ST.js +274 -0
  45. package/dist/chunk-SYE5XLF3.js +104 -0
  46. package/dist/chunk-T5VLYBZD.js +103 -0
  47. package/dist/chunk-TOQB7VWU.js +3238 -0
  48. package/dist/chunk-VFNMZ4ZQ.js +3228 -0
  49. package/dist/chunk-VVTGZNBT.js +1629 -0
  50. package/dist/chunk-W7Q4RFEV.js +104 -0
  51. package/dist/chunk-XTYYVRLO.js +3190 -0
  52. package/dist/chunk-Y6MDYVJD.js +3063 -0
  53. package/dist/cli/main.d.ts +1 -0
  54. package/dist/cli/main.js +5458 -0
  55. package/dist/index.d.ts +653 -0
  56. package/dist/index.js +79 -0
  57. package/dist/mcp/server.d.ts +1 -0
  58. package/dist/mcp/server.js +472 -0
  59. package/dist/schema-BAWSG7KY.js +22 -0
  60. package/dist/schema-E3QUPL26.js +20 -0
  61. package/dist/schema-EHL7WUT6.js +20 -0
  62. package/docs/019-USAGE.md +625 -0
  63. package/docs/020-current-implementation.md +364 -0
  64. package/docs/021-DOGFOODING-FINDINGS.md +175 -0
  65. package/docs/BACKLOG.md +80 -0
  66. package/docs/CONFIG.md +1123 -0
  67. package/docs/DESIGN.md +439 -0
  68. package/docs/ERRORS.md +383 -0
  69. package/docs/PROJECT.md +88 -0
  70. package/docs/ROADMAP.md +407 -0
  71. package/docs/summarization.md +320 -0
  72. package/docs/test-links.md +9 -0
  73. package/justfile +40 -0
  74. package/package.json +74 -9
  75. package/pnpm-workspace.yaml +5 -0
  76. package/research/INDEX.md +315 -0
  77. package/research/code-review/README.md +90 -0
  78. package/research/code-review/cli-error-handling-review.md +979 -0
  79. package/research/code-review/code-review-validation-report.md +464 -0
  80. package/research/code-review/main-ts-review.md +1128 -0
  81. package/research/config-analysis/01-current-implementation.md +470 -0
  82. package/research/config-analysis/02-strategy-recommendation.md +428 -0
  83. package/research/config-analysis/03-task-candidates.md +715 -0
  84. package/research/config-analysis/033-research-configuration-management.md +828 -0
  85. package/research/config-analysis/034-research-effect-cli-config.md +1504 -0
  86. package/research/config-analysis/04-consolidated-task-candidates.md +277 -0
  87. package/research/config-docs/SUMMARY.md +357 -0
  88. package/research/config-docs/TEST-RESULTS.md +776 -0
  89. package/research/config-docs/TODO.md +542 -0
  90. package/research/config-docs/analysis.md +744 -0
  91. package/research/config-docs/fix-validation.md +502 -0
  92. package/research/config-docs/help-audit.md +264 -0
  93. package/research/config-docs/help-system-analysis.md +890 -0
  94. package/research/dogfood/consolidated-tool-evaluation.md +373 -0
  95. package/research/dogfood/strategy-a/a-synthesis.md +184 -0
  96. package/research/dogfood/strategy-a/a1-docs.md +226 -0
  97. package/research/dogfood/strategy-a/a2-amorphic.md +156 -0
  98. package/research/dogfood/strategy-a/a3-llm.md +164 -0
  99. package/research/dogfood/strategy-b/b-synthesis.md +228 -0
  100. package/research/dogfood/strategy-b/b1-architecture.md +207 -0
  101. package/research/dogfood/strategy-b/b2-gaps.md +258 -0
  102. package/research/dogfood/strategy-b/b3-workflows.md +250 -0
  103. package/research/dogfood/strategy-c/c-synthesis.md +451 -0
  104. package/research/dogfood/strategy-c/c1-explorer.md +192 -0
  105. package/research/dogfood/strategy-c/c2-diver-memory.md +145 -0
  106. package/research/dogfood/strategy-c/c3-diver-control.md +148 -0
  107. package/research/dogfood/strategy-c/c4-diver-failure.md +151 -0
  108. package/research/dogfood/strategy-c/c5-diver-execution.md +221 -0
  109. package/research/dogfood/strategy-c/c6-diver-org.md +221 -0
  110. package/research/effect-cli-error-handling.md +845 -0
  111. package/research/effect-errors-as-values.md +943 -0
  112. package/research/errors-task-analysis/00-consolidated-tasks.md +207 -0
  113. package/research/errors-task-analysis/cli-commands-analysis.md +909 -0
  114. package/research/errors-task-analysis/embeddings-analysis.md +709 -0
  115. package/research/errors-task-analysis/index-search-analysis.md +812 -0
  116. package/research/frontmatter/COMMENTS-ARE-SKIPPED.md +149 -0
  117. package/research/frontmatter/LLM-CODE-NAVIGATION.md +276 -0
  118. package/research/issue-review.md +603 -0
  119. package/research/llm-summarization/agent-cli-tools-2026.md +1082 -0
  120. package/research/llm-summarization/alternative-providers-2026.md +1428 -0
  121. package/research/llm-summarization/anthropic-2026.md +367 -0
  122. package/research/llm-summarization/claude-cli-integration.md +1706 -0
  123. package/research/llm-summarization/cli-integration-patterns.md +3155 -0
  124. package/research/llm-summarization/openai-2026.md +473 -0
  125. package/research/llm-summarization/openai-compatible-providers-2026.md +1022 -0
  126. package/research/llm-summarization/opencode-cli-integration.md +1552 -0
  127. package/research/llm-summarization/prompt-engineering-2026.md +1426 -0
  128. package/research/llm-summarization/prototype-results.md +56 -0
  129. package/research/llm-summarization/provider-switching-patterns-2026.md +2153 -0
  130. package/research/llm-summarization/typescript-llm-libraries-2026.md +2436 -0
  131. package/research/mdcontext-error-analysis.md +521 -0
  132. package/research/mdcontext-pudding/00-EXECUTIVE-SUMMARY.md +282 -0
  133. package/research/mdcontext-pudding/01-index-embed.md +956 -0
  134. package/research/mdcontext-pudding/02-search-COMMANDS.md +142 -0
  135. package/research/mdcontext-pudding/02-search-SUMMARY.md +146 -0
  136. package/research/mdcontext-pudding/02-search.md +970 -0
  137. package/research/mdcontext-pudding/03-context.md +779 -0
  138. package/research/mdcontext-pudding/04-navigation-and-analytics.md +803 -0
  139. package/research/mdcontext-pudding/04-tree.md +704 -0
  140. package/research/mdcontext-pudding/05-config.md +1038 -0
  141. package/research/mdcontext-pudding/06-links-summary.txt +87 -0
  142. package/research/mdcontext-pudding/06-links.md +679 -0
  143. package/research/mdcontext-pudding/07-stats.md +693 -0
  144. package/research/mdcontext-pudding/BUG-FIX-PLAN.md +388 -0
  145. package/research/mdcontext-pudding/P0-BUG-VALIDATION.md +167 -0
  146. package/research/mdcontext-pudding/README.md +168 -0
  147. package/research/mdcontext-pudding/TESTING-SUMMARY.md +128 -0
  148. package/research/npm_publish/011-npm-workflow-research-agent2.md +792 -0
  149. package/research/npm_publish/012-npm-workflow-research-agent1.md +530 -0
  150. package/research/npm_publish/013-npm-workflow-research-agent3.md +722 -0
  151. package/research/npm_publish/014-npm-workflow-synthesis.md +556 -0
  152. package/research/npm_publish/031-npm-workflow-task-analysis.md +134 -0
  153. package/research/research-quality-review.md +834 -0
  154. package/research/semantic-search/002-research-embedding-models.md +490 -0
  155. package/research/semantic-search/003-research-rag-alternatives.md +523 -0
  156. package/research/semantic-search/004-research-vector-search.md +841 -0
  157. package/research/semantic-search/032-research-semantic-search.md +427 -0
  158. package/research/semantic-search/embedding-text-analysis.md +156 -0
  159. package/research/semantic-search/multi-word-failure-reproduction.md +171 -0
  160. package/research/semantic-search/query-processing-analysis.md +207 -0
  161. package/research/semantic-search/root-cause-and-solution.md +114 -0
  162. package/research/semantic-search/threshold-validation-report.md +69 -0
  163. package/research/semantic-search/vector-search-analysis.md +63 -0
  164. package/research/task-management-2026/00-synthesis-recommendations.md +295 -0
  165. package/research/task-management-2026/01-ai-workflow-tools.md +416 -0
  166. package/research/task-management-2026/02-agent-framework-patterns.md +476 -0
  167. package/research/task-management-2026/03-lightweight-file-based.md +567 -0
  168. package/research/task-management-2026/04-established-tools-ai-features.md +541 -0
  169. package/research/task-management-2026/linear/01-core-features-workflow.md +771 -0
  170. package/research/task-management-2026/linear/02-api-integrations.md +930 -0
  171. package/research/task-management-2026/linear/03-ai-features.md +368 -0
  172. package/research/task-management-2026/linear/04-pricing-setup.md +205 -0
  173. package/research/task-management-2026/linear/05-usage-patterns-best-practices.md +605 -0
  174. package/research/test-path-issues.md +276 -0
  175. package/review/ALP-76/1-error-type-design.md +962 -0
  176. package/review/ALP-76/2-error-handling-patterns.md +906 -0
  177. package/review/ALP-76/3-error-presentation.md +624 -0
  178. package/review/ALP-76/4-test-coverage.md +625 -0
  179. package/review/ALP-76/5-migration-completeness.md +440 -0
  180. package/review/ALP-76/6-effect-best-practices.md +755 -0
  181. package/scripts/apply-branch-protection.sh +47 -0
  182. package/scripts/branch-protection-templates.json +79 -0
  183. package/scripts/prototype-summarization.ts +346 -0
  184. package/scripts/rebuild-hnswlib.js +58 -0
  185. package/scripts/setup-branch-protection.sh +64 -0
  186. package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/active-provider.json +7 -0
  187. package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/bm25.json +541 -0
  188. package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/bm25.meta.json +5 -0
  189. package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/config.json +8 -0
  190. package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/embeddings/openai_text-embedding-3-small_512/vectors.bin +0 -0
  191. package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/embeddings/openai_text-embedding-3-small_512/vectors.meta.bin +0 -0
  192. package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/indexes/documents.json +60 -0
  193. package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/indexes/links.json +13 -0
  194. package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/indexes/sections.json +1197 -0
  195. package/src/__tests__/fixtures/semantic-search/multi-word-corpus/configuration-management.md +99 -0
  196. package/src/__tests__/fixtures/semantic-search/multi-word-corpus/distributed-systems.md +92 -0
  197. package/src/__tests__/fixtures/semantic-search/multi-word-corpus/error-handling.md +78 -0
  198. package/src/__tests__/fixtures/semantic-search/multi-word-corpus/failure-automation.md +55 -0
  199. package/src/__tests__/fixtures/semantic-search/multi-word-corpus/job-context.md +69 -0
  200. package/src/__tests__/fixtures/semantic-search/multi-word-corpus/process-orchestration.md +99 -0
  201. package/src/cli/argv-preprocessor.test.ts +210 -0
  202. package/src/cli/argv-preprocessor.ts +202 -0
  203. package/src/cli/cli.test.ts +627 -0
  204. package/src/cli/commands/backlinks.ts +54 -0
  205. package/src/cli/commands/config-cmd.ts +642 -0
  206. package/src/cli/commands/context.ts +285 -0
  207. package/src/cli/commands/duplicates.ts +122 -0
  208. package/src/cli/commands/embeddings.ts +529 -0
  209. package/src/cli/commands/index-cmd.ts +480 -0
  210. package/src/cli/commands/index.ts +16 -0
  211. package/src/cli/commands/links.ts +52 -0
  212. package/src/cli/commands/search.ts +1281 -0
  213. package/src/cli/commands/stats.ts +149 -0
  214. package/src/cli/commands/tree.ts +128 -0
  215. package/src/cli/config-layer.ts +176 -0
  216. package/src/cli/error-handler.test.ts +235 -0
  217. package/src/cli/error-handler.ts +655 -0
  218. package/src/cli/flag-schemas.ts +341 -0
  219. package/src/cli/help.ts +588 -0
  220. package/src/cli/index.ts +9 -0
  221. package/src/cli/main.ts +435 -0
  222. package/src/cli/options.ts +41 -0
  223. package/src/cli/shared-error-handling.ts +199 -0
  224. package/src/cli/typo-suggester.test.ts +105 -0
  225. package/src/cli/typo-suggester.ts +130 -0
  226. package/src/cli/utils.ts +259 -0
  227. package/src/config/file-provider.test.ts +320 -0
  228. package/src/config/file-provider.ts +273 -0
  229. package/src/config/index.ts +72 -0
  230. package/src/config/integration.test.ts +667 -0
  231. package/src/config/precedence.test.ts +277 -0
  232. package/src/config/precedence.ts +451 -0
  233. package/src/config/schema.test.ts +414 -0
  234. package/src/config/schema.ts +603 -0
  235. package/src/config/service.test.ts +320 -0
  236. package/src/config/service.ts +243 -0
  237. package/src/config/testing.test.ts +264 -0
  238. package/src/config/testing.ts +110 -0
  239. package/src/core/index.ts +1 -0
  240. package/src/core/types.ts +113 -0
  241. package/src/duplicates/detector.test.ts +183 -0
  242. package/src/duplicates/detector.ts +414 -0
  243. package/src/duplicates/index.ts +18 -0
  244. package/src/embeddings/embedding-namespace.test.ts +300 -0
  245. package/src/embeddings/embedding-namespace.ts +947 -0
  246. package/src/embeddings/heading-boost.test.ts +222 -0
  247. package/src/embeddings/hnsw-build-options.test.ts +198 -0
  248. package/src/embeddings/hyde.test.ts +272 -0
  249. package/src/embeddings/hyde.ts +264 -0
  250. package/src/embeddings/index.ts +10 -0
  251. package/src/embeddings/openai-provider.ts +414 -0
  252. package/src/embeddings/pricing.json +22 -0
  253. package/src/embeddings/provider-constants.ts +204 -0
  254. package/src/embeddings/provider-errors.test.ts +967 -0
  255. package/src/embeddings/provider-errors.ts +565 -0
  256. package/src/embeddings/provider-factory.test.ts +240 -0
  257. package/src/embeddings/provider-factory.ts +225 -0
  258. package/src/embeddings/provider-integration.test.ts +788 -0
  259. package/src/embeddings/query-preprocessing.test.ts +187 -0
  260. package/src/embeddings/semantic-search-threshold.test.ts +508 -0
  261. package/src/embeddings/semantic-search.ts +1270 -0
  262. package/src/embeddings/types.ts +359 -0
  263. package/src/embeddings/vector-store.ts +708 -0
  264. package/src/embeddings/voyage-provider.ts +313 -0
  265. package/src/errors/errors.test.ts +845 -0
  266. package/src/errors/index.ts +533 -0
  267. package/src/index/ignore-patterns.test.ts +354 -0
  268. package/src/index/ignore-patterns.ts +305 -0
  269. package/src/index/index.ts +4 -0
  270. package/src/index/indexer.ts +684 -0
  271. package/src/index/storage.ts +260 -0
  272. package/src/index/types.ts +147 -0
  273. package/src/index/watcher.ts +189 -0
  274. package/src/index.ts +30 -0
  275. package/src/integration/search-keyword.test.ts +678 -0
  276. package/src/mcp/server.ts +612 -0
  277. package/src/parser/index.ts +1 -0
  278. package/src/parser/parser.test.ts +291 -0
  279. package/src/parser/parser.ts +394 -0
  280. package/src/parser/section-filter.test.ts +277 -0
  281. package/src/parser/section-filter.ts +392 -0
  282. package/src/search/__tests__/hybrid-search.test.ts +650 -0
  283. package/src/search/bm25-store.ts +366 -0
  284. package/src/search/cross-encoder.test.ts +253 -0
  285. package/src/search/cross-encoder.ts +406 -0
  286. package/src/search/fuzzy-search.test.ts +419 -0
  287. package/src/search/fuzzy-search.ts +273 -0
  288. package/src/search/hybrid-search.ts +448 -0
  289. package/src/search/path-matcher.test.ts +276 -0
  290. package/src/search/path-matcher.ts +33 -0
  291. package/src/search/query-parser.test.ts +260 -0
  292. package/src/search/query-parser.ts +319 -0
  293. package/src/search/searcher.test.ts +280 -0
  294. package/src/search/searcher.ts +724 -0
  295. package/src/search/wink-bm25.d.ts +30 -0
  296. package/src/summarization/cli-providers/claude.ts +202 -0
  297. package/src/summarization/cli-providers/detection.test.ts +273 -0
  298. package/src/summarization/cli-providers/detection.ts +118 -0
  299. package/src/summarization/cli-providers/index.ts +8 -0
  300. package/src/summarization/cost.test.ts +139 -0
  301. package/src/summarization/cost.ts +102 -0
  302. package/src/summarization/error-handler.test.ts +127 -0
  303. package/src/summarization/error-handler.ts +111 -0
  304. package/src/summarization/index.ts +102 -0
  305. package/src/summarization/pipeline.test.ts +498 -0
  306. package/src/summarization/pipeline.ts +231 -0
  307. package/src/summarization/prompts.test.ts +269 -0
  308. package/src/summarization/prompts.ts +133 -0
  309. package/src/summarization/provider-factory.test.ts +396 -0
  310. package/src/summarization/provider-factory.ts +178 -0
  311. package/src/summarization/types.ts +184 -0
  312. package/src/summarize/budget-bugs.test.ts +620 -0
  313. package/src/summarize/formatters.ts +419 -0
  314. package/src/summarize/index.ts +20 -0
  315. package/src/summarize/summarizer.test.ts +275 -0
  316. package/src/summarize/summarizer.ts +597 -0
  317. package/src/summarize/verify-bugs.test.ts +238 -0
  318. package/src/types/huggingface-transformers.d.ts +66 -0
  319. package/src/utils/index.ts +1 -0
  320. package/src/utils/tokens.test.ts +142 -0
  321. package/src/utils/tokens.ts +186 -0
  322. package/tests/fixtures/cli/.mdcontext/active-provider.json +7 -0
  323. package/tests/fixtures/cli/.mdcontext/config.json +8 -0
  324. package/tests/fixtures/cli/.mdcontext/embeddings/openai_text-embedding-3-small_512/vectors.bin +0 -0
  325. package/tests/fixtures/cli/.mdcontext/embeddings/openai_text-embedding-3-small_512/vectors.meta.bin +0 -0
  326. package/tests/fixtures/cli/.mdcontext/indexes/documents.json +33 -0
  327. package/tests/fixtures/cli/.mdcontext/indexes/links.json +12 -0
  328. package/tests/fixtures/cli/.mdcontext/indexes/sections.json +247 -0
  329. package/tests/fixtures/cli/README.md +9 -0
  330. package/tests/fixtures/cli/api-reference.md +11 -0
  331. package/tests/fixtures/cli/getting-started.md +11 -0
  332. package/tests/integration/embed-index.test.ts +712 -0
  333. package/tests/integration/search-context.test.ts +469 -0
  334. package/tests/integration/search-semantic.test.ts +522 -0
  335. package/tsconfig.json +26 -0
  336. package/vitest.config.ts +16 -0
  337. package/vitest.setup.ts +12 -0
@@ -0,0 +1,407 @@
1
+ # Roadmap: @hw/mdcontext
2
+
3
+ ## Overview
4
+
5
+ Build a token-efficient markdown analysis tool for LLM consumption. Each phase delivers testable functionality, building toward a complete system with parsing, semantic search, summarization, and analytics.
6
+
7
+ ## Phases
8
+
9
+ - [ ] **Phase 1: Core Parsing** — Markdown AST extraction and structure
10
+ - [ ] **Phase 2: Index & Storage** — Persistent indexes, file watching, caching
11
+ - [ ] **Phase 3: Semantic Layer** — Embeddings, vector search
12
+ - [ ] **Phase 4: Summarization** — Hierarchical compression, token optimization
13
+ - [ ] **Phase 5: Analytics** — Performance metrics, query tracking
14
+ - [ ] **Phase 6: Integration** — CLI, MCP server, HumanWork skills
15
+
16
+ ---
17
+
18
+ ## Phase 1: Core Parsing
19
+
20
+ **Goal:** Extract structured data from markdown files.
21
+
22
+ ### 1.1: Project Setup
23
+
24
+ - Initialize `packages/hw_mdcontext` in monorepo
25
+ - TypeScript + Effect setup
26
+ - Test infrastructure (vitest)
27
+ - Basic CI integration
28
+
29
+ **Deliverables:**
30
+
31
+ - Package scaffolding
32
+ - Build working
33
+ - First test passing
34
+
35
+ ### 1.2: Markdown Parser
36
+
37
+ - Integrate remark/unified
38
+ - Parse to mdast (Markdown AST)
39
+ - Handle frontmatter (YAML)
40
+ - Handle GFM extensions (tables, task lists)
41
+
42
+ **Deliverables:**
43
+
44
+ - `parse(content: string): MdastRoot`
45
+ - Frontmatter extraction
46
+ - Unit tests for various markdown features
47
+
48
+ ### 1.3: Structure Extraction
49
+
50
+ - Extract heading hierarchy
51
+ - Identify sections (content between headings)
52
+ - Extract code blocks with language tags
53
+ - Extract links (internal, external, images)
54
+ - Extract lists and tables
55
+
56
+ **Deliverables:**
57
+
58
+ - `extractStructure(ast): DocumentStructure`
59
+ - Section tree with content
60
+ - Link graph per document
61
+ - Code block inventory
62
+
63
+ ### 1.4: Document Model
64
+
65
+ - Define document schema (Effect Schema)
66
+ - Section schema with metadata
67
+ - Serialize/deserialize to JSON
68
+
69
+ **Deliverables:**
70
+
71
+ - `Document` type with full structure
72
+ - `Section` type with bounds, content, metadata
73
+ - JSON round-trip tests
74
+
75
+ ---
76
+
77
+ ## Phase 2: Index & Storage
78
+
79
+ **Goal:** Persist parsed data, enable fast lookups, handle updates.
80
+
81
+ ### 2.1: Storage Interface
82
+
83
+ - Define `MdStore` interface
84
+ - In-memory implementation for testing
85
+ - File-based implementation for persistence
86
+
87
+ **Deliverables:**
88
+
89
+ - `MdStore` interface (save, load, query)
90
+ - `MemoryMdStore`
91
+ - `FileMdStore` (JSON files in `.mdcontext/`)
92
+
93
+ ### 2.2: Document Indexing
94
+
95
+ - Index documents by path
96
+ - Index sections by heading
97
+ - Index links (forward and back)
98
+ - Incremental updates (changed files only)
99
+
100
+ **Deliverables:**
101
+
102
+ - Path → Document lookup
103
+ - Heading → Section lookup
104
+ - Backlink index
105
+ - Change detection (mtime, hash)
106
+
107
+ ### 2.3: File Watching
108
+
109
+ - Watch directory for changes
110
+ - Debounce rapid changes
111
+ - Incremental re-index
112
+ - Configurable ignore patterns
113
+
114
+ **Deliverables:**
115
+
116
+ - `watch(dir, options): Effect<void>`
117
+ - `.mdcontextignore` support
118
+ - Debounce logic (default 500ms)
119
+
120
+ ### 2.4: Cache Management
121
+
122
+ - Cache parsed documents
123
+ - Cache structure indexes
124
+ - Invalidation on file change
125
+ - Size limits and eviction
126
+
127
+ **Deliverables:**
128
+
129
+ - LRU cache for documents
130
+ - Index persistence to disk
131
+ - Cache stats (hits, misses, size)
132
+
133
+ ---
134
+
135
+ ## Phase 3: Semantic Layer
136
+
137
+ **Goal:** Enable meaning-based search via embeddings.
138
+
139
+ ### 3.1: Embedding Interface
140
+
141
+ - Define `Embedder` interface
142
+ - Pluggable backends (API, local)
143
+ - Batch embedding support
144
+
145
+ **Deliverables:**
146
+
147
+ - `Embedder` interface
148
+ - `embed(texts: string[]): Effect<Vector[]>`
149
+ - Configuration for model selection
150
+
151
+ ### 3.2: OpenAI Embeddings
152
+
153
+ - Implement OpenAI text-embedding-3-small
154
+ - Rate limiting and retry logic
155
+ - Cost tracking
156
+
157
+ **Deliverables:**
158
+
159
+ - `OpenAIEmbedder`
160
+ - Automatic batching (max 8k tokens)
161
+ - Cost per query metric
162
+
163
+ ### 3.3: Local Embeddings (Optional)
164
+
165
+ - Python subprocess for sentence-transformers
166
+ - Or ONNX runtime in Node
167
+ - Fallback when API unavailable
168
+
169
+ **Deliverables:**
170
+
171
+ - `LocalEmbedder` (stretch goal)
172
+ - Model download management
173
+
174
+ ### 3.4: Vector Index
175
+
176
+ - Store embeddings with document/section IDs
177
+ - Similarity search (cosine)
178
+ - FAISS or hnswlib integration
179
+
180
+ **Deliverables:**
181
+
182
+ - `VectorIndex` interface
183
+ - `search(query: Vector, k: number): Result[]`
184
+ - Persistence to disk
185
+
186
+ ### 3.5: Semantic Search API
187
+
188
+ - Text query → embed → search
189
+ - Combine with structural filters
190
+ - Rank and return results
191
+
192
+ **Deliverables:**
193
+
194
+ - `semanticSearch(query: string, options): SearchResult[]`
195
+ - Filter by path pattern, heading level
196
+ - Result with score, snippet, location
197
+
198
+ ---
199
+
200
+ ## Phase 4: Summarization
201
+
202
+ **Goal:** Generate token-efficient summaries at multiple granularities.
203
+
204
+ ### 4.1: Token Counting
205
+
206
+ - Accurate token counting (tiktoken or similar)
207
+ - Budget management
208
+ - Truncation strategies
209
+
210
+ **Deliverables:**
211
+
212
+ - `countTokens(text: string): number`
213
+ - `truncateToTokens(text, limit): string`
214
+ - Model-specific tokenizers (GPT-4, Claude)
215
+
216
+ ### 4.2: Section Summarization
217
+
218
+ - Extract key points from section
219
+ - Preserve structure indicators
220
+ - Configurable compression ratio
221
+
222
+ **Deliverables:**
223
+
224
+ - `summarizeSection(section, options): Summary`
225
+ - Key sentence extraction
226
+ - Heading preservation
227
+
228
+ ### 4.3: Document Summarization
229
+
230
+ - Hierarchical: summarize sections, then combine
231
+ - TOC generation
232
+ - Key topics extraction
233
+
234
+ **Deliverables:**
235
+
236
+ - `summarizeDocument(doc, options): DocSummary`
237
+ - Multi-level output (100, 500, 2000 tokens)
238
+ - Topic list
239
+
240
+ ### 4.4: Context Assembly
241
+
242
+ - Build LLM-ready context from multiple sources
243
+ - Priority-based inclusion
244
+ - Token budget management
245
+
246
+ **Deliverables:**
247
+
248
+ - `assembleContext(sources, budget): string`
249
+ - Source attribution
250
+ - Overflow handling (truncate vs omit)
251
+
252
+ ---
253
+
254
+ ## Phase 5: Analytics
255
+
256
+ **Goal:** Built-in observability for performance and usage.
257
+
258
+ ### 5.1: Metrics Foundation
259
+
260
+ - Effect Metrics integration
261
+ - Counter, Gauge, Histogram types
262
+ - Metric naming conventions
263
+
264
+ **Deliverables:**
265
+
266
+ - Metrics layer setup
267
+ - Standard metric types
268
+ - Tagging (operation, status)
269
+
270
+ ### 5.2: Performance Metrics
271
+
272
+ - Query latency (p50, p95, p99)
273
+ - Index build time
274
+ - Cache hit/miss rates
275
+ - Embedding API latency
276
+
277
+ **Deliverables:**
278
+
279
+ - `mdcontext_query_duration_ms` histogram
280
+ - `mdcontext_cache_hits_total` counter
281
+ - `mdcontext_index_build_duration_ms` gauge
282
+
283
+ ### 5.3: Usage Metrics
284
+
285
+ - Queries per time period
286
+ - Token usage (input/output)
287
+ - Most queried documents/sections
288
+ - Search result click-through (if applicable)
289
+
290
+ **Deliverables:**
291
+
292
+ - `mdcontext_queries_total` counter
293
+ - `mdcontext_tokens_used` counter
294
+ - Query log with timestamps
295
+
296
+ ### 5.4: Reporting
297
+
298
+ - Metrics export (Prometheus format)
299
+ - Simple CLI report command
300
+ - Alerting thresholds (optional)
301
+
302
+ **Deliverables:**
303
+
304
+ - `mdcontext metrics` CLI command
305
+ - JSON and text output formats
306
+ - Configurable retention
307
+
308
+ ---
309
+
310
+ ## Phase 6: Integration
311
+
312
+ **Goal:** Make mdcontext usable from CLI, MCP, and HumanWork.
313
+
314
+ ### 6.1: CLI Tool
315
+
316
+ - `mdcontext index <dir>` — build index
317
+ - `mdcontext search <query>` — semantic search
318
+ - `mdcontext context <path>` — LLM-ready summary
319
+ - `mdcontext structure <path>` — show document structure
320
+
321
+ **Deliverables:**
322
+
323
+ - CLI with subcommands
324
+ - Output formats (text, JSON)
325
+ - Config file support
326
+
327
+ ### 6.2: Daemon Mode
328
+
329
+ - `mdcontext daemon` — run as background service
330
+ - HTTP/IPC API for queries
331
+ - Auto-rebuild on changes
332
+
333
+ **Deliverables:**
334
+
335
+ - Daemon process management
336
+ - Query API (REST or IPC)
337
+ - Health check endpoint
338
+
339
+ ### 6.3: MCP Server
340
+
341
+ - Expose tools for Claude integration
342
+ - `md_search` — semantic search
343
+ - `md_context` — get context for file/section
344
+ - `md_structure` — document outline
345
+
346
+ **Deliverables:**
347
+
348
+ - MCP server implementation
349
+ - Tool definitions
350
+ - Claude Desktop/Code integration docs
351
+
352
+ ### 6.4: HumanWork Skills
353
+
354
+ - `hw-md-search` — search markdown in .humanwork/
355
+ - `hw-md-context` — get context for task/session
356
+ - Integration with session-history
357
+
358
+ **Deliverables:**
359
+
360
+ - Skill definitions
361
+ - Integration with existing HumanWork skills
362
+ - Documentation
363
+
364
+ ---
365
+
366
+ ## Progress
367
+
368
+ | Phase | Status | Plans | Completed |
369
+ | ------------------ | ----------- | ----- | --------- |
370
+ | 1. Core Parsing | Not started | 4 | - |
371
+ | 2. Index & Storage | Not started | 4 | - |
372
+ | 3. Semantic Layer | Not started | 5 | - |
373
+ | 4. Summarization | Not started | 4 | - |
374
+ | 5. Analytics | Not started | 4 | - |
375
+ | 6. Integration | Not started | 4 | - |
376
+
377
+ **Total: 25 tasks across 6 phases**
378
+
379
+ ---
380
+
381
+ ## Dependencies
382
+
383
+ ```
384
+ Phase 1 ─────────────────────────────────────────┐
385
+     │                                            │
386
+     ▼                                            │
387
+ Phase 2 ──────────────┐                          │
388
+     │                 │                          │
389
+     ▼                 ▼                          │
390
+ Phase 3            Phase 4                       │
391
+     │                 │                          │
392
+     └────────┬────────┘                          │
393
+              ▼                                   │
394
+          Phase 5 ◄───────────────────────────────┘
395
+              │
396
+              ▼
397
+          Phase 6
398
+ ```
399
+
400
+ - Phase 2 depends on Phase 1 (need parser for indexing)
401
+ - Phases 3 & 4 can run in parallel after Phase 2
402
+ - Phase 5 spans all phases (analytics hooks are added throughout)
403
+ - Phase 6 integrates everything
404
+
405
+ ---
406
+
407
+ _Created: 2025-01-18_
@@ -0,0 +1,320 @@
1
+ # AI Summarization Architecture
2
+
3
+ This document covers the architecture and implementation details of mdcontext's AI-powered search result summarization feature.
4
+
5
+ ## Overview
6
+
7
+ mdcontext can generate AI-powered summaries of search results using either:
8
+
9
+ 1. **CLI tools** (Claude Code, Copilot CLI, OpenCode) - Free with your subscription
10
+ 2. **API providers** (DeepSeek, Anthropic, OpenAI, Gemini, Qwen) - Pay per query
11
+
12
+ The design prioritizes CLI providers as the primary option because they rely on subscriptions that developers already have.
13
+
14
+ ## Architecture
15
+
16
+ ```
17
+ ┌─────────────────────────────────────────────────────────────────┐
18
+ │ CLI (search.ts) │
19
+ │ --summarize flag triggers summarization pipeline │
20
+ └─────────────────────────┬───────────────────────────────────────┘
21
+
22
+
23
+ ┌─────────────────────────────────────────────────────────────────┐
24
+ │ Provider Factory │
25
+ │ getBestAvailableSummarizer() / createSummarizer() │
26
+ │ - Detects installed CLI tools │
27
+ │ - Creates appropriate provider instance │
28
+ └─────────────────────────┬───────────────────────────────────────┘
29
+
30
+ ┌───────────────┴───────────────┐
31
+ ▼ ▼
32
+ ┌─────────────────────┐ ┌─────────────────────┐
33
+ │ CLI Providers │ │ API Providers │
34
+ │ (Free) │ │ (Pay-per-use) │
35
+ │ │ │ │
36
+ │ - ClaudeCLI │ │ - DeepSeek │
37
+ │ - OpenCode │ │ - Anthropic │
38
+ │ - Copilot │ │ - OpenAI │
39
+ │ - Aider │ │ - Gemini │
40
+ │ - Cline │ │ - Qwen │
41
+ └─────────────────────┘ └─────────────────────┘
42
+ │ │
43
+ └───────────────┬───────────────┘
44
+
45
+ ┌─────────────────────────────────────────────────────────────────┐
46
+ │ Summarizer Interface │
47
+ │ summarize(input, prompt) → SummaryResult │
48
+ │ summarizeStream(input, prompt, options) → void │
49
+ │ estimateCost(inputTokens) → number │
50
+ │ isAvailable() → boolean │
51
+ └─────────────────────────────────────────────────────────────────┘
52
+ ```
53
+
54
+ ## Components
55
+
56
+ ### Provider Detection (`cli-providers/detection.ts`)
57
+
58
+ Automatically discovers installed CLI tools:
59
+
60
+ ```typescript
61
+ import { detectInstalledCLIs } from './summarization/index.js'
62
+
63
+ const installed = await detectInstalledCLIs()
64
+ // [{ name: 'claude', command: 'claude', displayName: 'Claude Code', ... }]
65
+ ```
66
+
67
+ Detection uses `which` (Unix) or `where` (Windows) via `spawn()` - never shell interpolation.
68
+
69
+ ### Provider Factory (`provider-factory.ts`)
70
+
71
+ Creates summarizer instances based on configuration:
72
+
73
+ ```typescript
74
+ import { createSummarizer, getBestAvailableSummarizer } from './summarization/index.js'
75
+
76
+ // Auto-detect best available provider
77
+ const result = await getBestAvailableSummarizer()
78
+ if (result) {
79
+ const { summarizer, config } = result
80
+ // Use summarizer...
81
+ }
82
+
83
+ // Or create from explicit config
84
+ const summarizer = await createSummarizer({
85
+ mode: 'cli',
86
+ provider: 'claude',
87
+ })
88
+ ```
89
+
90
+ ### Cost Estimation (`cost.ts`)
91
+
92
+ Estimates costs before execution:
93
+
94
+ ```typescript
95
+ import { estimateSummaryCost, formatCostDisplay } from './summarization/index.js'
96
+
97
+ const estimate = estimateSummaryCost(inputText, 'api', 'deepseek')
98
+ // {
99
+ // inputTokens: 2500,
100
+ // outputTokens: 500,
101
+ // estimatedCost: 0.0007,
102
+ // provider: 'deepseek',
103
+ // isPaid: true,
104
+ // formattedCost: '$0.0007'
105
+ // }
106
+
107
+ console.log(formatCostDisplay(estimate))
108
+ // "Estimated cost: $0.0007"
109
+ ```
110
+
111
+ CLI providers always return `isPaid: false` with `formattedCost: 'FREE (subscription)'`.
112
+
113
+ ### Prompt Templates (`prompts.ts`)
114
+
115
+ Pre-built prompts for different summarization styles:
116
+
117
+ | Template | Description |
118
+ |----------|-------------|
119
+ | `default` | Balanced summary with key findings |
120
+ | `concise` | 2-3 sentence quick summary |
121
+ | `detailed` | Comprehensive analysis |
122
+ | `actionable` | Focus on next steps |
123
+ | `technical` | Code patterns and API details |
124
+
125
+ ```typescript
126
+ import { buildPrompt } from './summarization/index.js'
127
+
128
+ const prompt = buildPrompt({
129
+ query: 'authentication',
130
+ resultCount: 10,
131
+ searchMode: 'hybrid',
132
+ }, 'actionable')
133
+ ```
134
+
135
+ ### Error Handling (`error-handler.ts`)
136
+
137
+ Graceful degradation on failures:
138
+
139
+ ```typescript
140
+ import { displaySummarizationError, isRecoverableError } from './summarization/index.js'
141
+
142
+ try {
143
+ await summarizer.summarize(input, prompt)
144
+ } catch (error) {
145
+ if (isRecoverableError(error)) {
146
+ // Retry logic
147
+ } else {
148
+ displaySummarizationError(error)
149
+ // Shows user-friendly message, search results still displayed
150
+ }
151
+ }
152
+ ```
153
+
154
+ ## Security Considerations
155
+
156
+ ### Shell Injection Prevention
157
+
158
+ All CLI invocations use `spawn()` with argument arrays - **NEVER** `exec()` with string interpolation:
159
+
160
+ ```typescript
161
+ // CORRECT - Safe from shell injection
162
+ spawn('claude', ['-p', userInput, '--output-format', 'text'])
163
+
164
+ // WRONG - Vulnerable to shell injection
165
+ exec(`claude -p "${userInput}"`) // NEVER DO THIS
166
+ ```
167
+
168
+ This is enforced throughout the codebase. User input is passed as array elements, never interpolated into shell commands.
169
+
170
+ ### API Key Handling
171
+
172
+ - API keys are sourced from environment variables only
173
+ - Never stored in config files
174
+ - Environment variable names follow provider conventions:
175
+ - `DEEPSEEK_API_KEY`
176
+ - `ANTHROPIC_API_KEY`
177
+ - `OPENAI_API_KEY`
178
+ - `GOOGLE_API_KEY` (for Gemini)
179
+ - `QWEN_API_KEY`
180
+
181
+ ### Timeout Protection
182
+
183
+ CLI processes have a default 60-second timeout to prevent hung processes.
184
+
185
+ ## Adding New Providers
186
+
187
+ ### CLI Provider
188
+
189
+ 1. Add to `KNOWN_CLIS` in `cli-providers/detection.ts`:
190
+
191
+ ```typescript
192
+ {
193
+ name: 'newcli',
194
+ command: 'newcli',
195
+ displayName: 'New CLI Tool',
196
+ args: ['--prompt'],
197
+ useStdin: false,
198
+ }
199
+ ```
200
+
201
+ 2. Create implementation in `cli-providers/newcli.ts`:
202
+
203
+ ```typescript
204
+ import { spawn } from 'node:child_process'
205
+ import type { Summarizer, SummaryResult } from '../types.js'
206
+
207
+ export class NewCLISummarizer implements Summarizer {
208
+ async summarize(input: string, prompt: string): Promise<SummaryResult> {
209
+ // SECURITY: Always use spawn() with argument arrays
210
+ const proc = spawn('newcli', ['--prompt', prompt, input])
211
+ // ... implementation
212
+ }
213
+
214
+ async isAvailable(): Promise<boolean> {
215
+ // Check if CLI is installed
216
+ }
217
+ }
218
+ ```
219
+
220
+ 3. Add to factory in `provider-factory.ts`
221
+
222
+ ### API Provider
223
+
224
+ 1. Add pricing to `cost.ts`:
225
+
226
+ ```typescript
227
+ export const API_PRICING = {
228
+ // ... existing providers
229
+ newapi: { input: 0.50, output: 1.00, displayName: 'New API' },
230
+ }
231
+ ```
232
+
233
+ 2. Create implementation using Vercel AI SDK (when implemented):
234
+
235
+ ```typescript
236
+ import { createOpenAI } from '@ai-sdk/openai'
237
+
238
+ export class NewAPISummarizer implements Summarizer {
239
+ // Use Vercel AI SDK for OpenAI-compatible APIs
240
+ }
241
+ ```
242
+
243
+ ## Performance
244
+
245
+ | Provider Type | Latency | Cost |
246
+ |--------------|---------|------|
247
+ | CLI (Claude) | 2-5s | Free |
248
+ | CLI (OpenCode) | 2-5s | Free |
249
+ | API (DeepSeek) | 1-3s | ~$0.0007/query |
250
+ | API (OpenAI) | 1-2s | ~$0.005/query |
251
+
252
+ ### Token Limits
253
+
254
+ - Input is automatically truncated at 100K characters (~25K tokens)
255
+ - Result content is truncated to 500 chars per result
256
+ - Output tokens are capped at 500 for cost estimates
257
+
258
+ ## Configuration Reference
259
+
260
+ ### Config File
261
+
262
+ ```javascript
263
+ // mdcontext.config.js
264
+ /** @type {import('mdcontext').PartialMdContextConfig} */
265
+ export default {
266
+ aiSummarization: {
267
+ mode: 'cli', // 'cli' or 'api'
268
+ provider: 'claude', // Provider name
269
+ model: 'deepseek-chat', // Model name (used only when mode is 'api')
270
+ stream: false, // Enable streaming
271
+ },
272
+ }
273
+ ```
274
+
275
+ ### Environment Variables
276
+
277
+ | Variable | Description |
278
+ |----------|-------------|
279
+ | `MDCONTEXT_AISUMMARIZATION_MODE` | 'cli' or 'api' |
280
+ | `MDCONTEXT_AISUMMARIZATION_PROVIDER` | Provider name |
281
+ | `MDCONTEXT_AISUMMARIZATION_MODEL` | Model name (API only) |
282
+ | `MDCONTEXT_AISUMMARIZATION_STREAM` | 'true' or 'false' |
283
+
284
+ ## Troubleshooting
285
+
286
+ ### "CLI tool 'claude' not found"
287
+
288
+ **Solution:** Install Claude Code from https://claude.ai/download
289
+
290
+ ### "CLI tool 'opencode' not found"
291
+
292
+ **Solution:** Install OpenCode from https://github.com/opencode-ai/opencode
293
+
294
+ ### "Authentication failed for anthropic"
295
+
296
+ **Solution:** Set API key: `export ANTHROPIC_API_KEY=sk-...`
297
+
298
+ ### "Rate limit exceeded"
299
+
300
+ **Solution:** Wait and retry. Consider switching to a CLI provider (free).
301
+
302
+ ### "Summarization failed: timeout"
303
+
304
+ **Solution:** Reduce the result set with `--limit` or increase the timeout in config.
305
+
306
+ ### "No summarization providers available"
307
+
308
+ **Solution:** Either:
309
+ 1. Install a CLI tool (Claude Code, OpenCode)
310
+ 2. Configure an API provider with a valid API key
311
+
312
+ ### OpenCode JSON format errors
313
+
314
+ **Solution:** OpenCode JSON format is undocumented. Try updating OpenCode or switch to Claude CLI.
315
+
316
+ ## Related Documentation
317
+
318
+ - [README.md](../README.md#ai-summarization) - Quick start guide
319
+ - [CONFIG.md](./CONFIG.md) - Full configuration reference
320
+ - [ERRORS.md](./ERRORS.md) - Error handling patterns