mdcontext 0.0.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (337) hide show
  1. package/.changeset/README.md +28 -0
  2. package/.changeset/config.json +11 -0
  3. package/.claude/settings.local.json +25 -0
  4. package/.github/workflows/ci.yml +83 -0
  5. package/.github/workflows/claude-code-review.yml +44 -0
  6. package/.github/workflows/claude.yml +85 -0
  7. package/.github/workflows/release.yml +113 -0
  8. package/.tldrignore +112 -0
  9. package/BACKLOG.md +338 -0
  10. package/CONTRIBUTING.md +186 -0
  11. package/NOTES/NOTES +44 -0
  12. package/README.md +434 -11
  13. package/biome.json +36 -0
  14. package/cspell.config.yaml +14 -0
  15. package/dist/chunk-23UPXDNL.js +3044 -0
  16. package/dist/chunk-2W7MO2DL.js +1366 -0
  17. package/dist/chunk-3NUAZGMA.js +1689 -0
  18. package/dist/chunk-7TOWB2XB.js +366 -0
  19. package/dist/chunk-7XOTOADQ.js +3065 -0
  20. package/dist/chunk-AH2PDM2K.js +3042 -0
  21. package/dist/chunk-BNXWSZ63.js +3742 -0
  22. package/dist/chunk-BTL5DJVU.js +3222 -0
  23. package/dist/chunk-HDHYG7E4.js +104 -0
  24. package/dist/chunk-HLR4KZBP.js +3234 -0
  25. package/dist/chunk-IP3FRFEB.js +1045 -0
  26. package/dist/chunk-KHU56VDO.js +3042 -0
  27. package/dist/chunk-KRYIFLQR.js +88 -0
  28. package/dist/chunk-LBSDNLEM.js +287 -0
  29. package/dist/chunk-MNTQ7HCP.js +2643 -0
  30. package/dist/chunk-MUJELQQ6.js +1387 -0
  31. package/dist/chunk-MXJGMSLV.js +2199 -0
  32. package/dist/chunk-N6QJGC3Z.js +2636 -0
  33. package/dist/chunk-OBELGBPM.js +1713 -0
  34. package/dist/chunk-OT7R5XTA.js +3192 -0
  35. package/dist/chunk-P7X4RA2T.js +106 -0
  36. package/dist/chunk-PIDUQNC2.js +3185 -0
  37. package/dist/chunk-POGCDIH4.js +3187 -0
  38. package/dist/chunk-PSIEOQGZ.js +3043 -0
  39. package/dist/chunk-PVRT3IHA.js +3238 -0
  40. package/dist/chunk-QNN4TT23.js +1430 -0
  41. package/dist/chunk-RE3R45RJ.js +3042 -0
  42. package/dist/chunk-S7E6TFX6.js +803 -0
  43. package/dist/chunk-SG6GLU4U.js +1378 -0
  44. package/dist/chunk-SJCDV2ST.js +274 -0
  45. package/dist/chunk-SYE5XLF3.js +104 -0
  46. package/dist/chunk-T5VLYBZD.js +103 -0
  47. package/dist/chunk-TOQB7VWU.js +3238 -0
  48. package/dist/chunk-VFNMZ4ZQ.js +3228 -0
  49. package/dist/chunk-VVTGZNBT.js +1629 -0
  50. package/dist/chunk-W7Q4RFEV.js +104 -0
  51. package/dist/chunk-XTYYVRLO.js +3190 -0
  52. package/dist/chunk-Y6MDYVJD.js +3063 -0
  53. package/dist/cli/main.d.ts +1 -0
  54. package/dist/cli/main.js +5458 -0
  55. package/dist/index.d.ts +653 -0
  56. package/dist/index.js +79 -0
  57. package/dist/mcp/server.d.ts +1 -0
  58. package/dist/mcp/server.js +472 -0
  59. package/dist/schema-BAWSG7KY.js +22 -0
  60. package/dist/schema-E3QUPL26.js +20 -0
  61. package/dist/schema-EHL7WUT6.js +20 -0
  62. package/docs/019-USAGE.md +625 -0
  63. package/docs/020-current-implementation.md +364 -0
  64. package/docs/021-DOGFOODING-FINDINGS.md +175 -0
  65. package/docs/BACKLOG.md +80 -0
  66. package/docs/CONFIG.md +1123 -0
  67. package/docs/DESIGN.md +439 -0
  68. package/docs/ERRORS.md +383 -0
  69. package/docs/PROJECT.md +88 -0
  70. package/docs/ROADMAP.md +407 -0
  71. package/docs/summarization.md +320 -0
  72. package/docs/test-links.md +9 -0
  73. package/justfile +40 -0
  74. package/package.json +74 -9
  75. package/pnpm-workspace.yaml +5 -0
  76. package/research/INDEX.md +315 -0
  77. package/research/code-review/README.md +90 -0
  78. package/research/code-review/cli-error-handling-review.md +979 -0
  79. package/research/code-review/code-review-validation-report.md +464 -0
  80. package/research/code-review/main-ts-review.md +1128 -0
  81. package/research/config-analysis/01-current-implementation.md +470 -0
  82. package/research/config-analysis/02-strategy-recommendation.md +428 -0
  83. package/research/config-analysis/03-task-candidates.md +715 -0
  84. package/research/config-analysis/033-research-configuration-management.md +828 -0
  85. package/research/config-analysis/034-research-effect-cli-config.md +1504 -0
  86. package/research/config-analysis/04-consolidated-task-candidates.md +277 -0
  87. package/research/config-docs/SUMMARY.md +357 -0
  88. package/research/config-docs/TEST-RESULTS.md +776 -0
  89. package/research/config-docs/TODO.md +542 -0
  90. package/research/config-docs/analysis.md +744 -0
  91. package/research/config-docs/fix-validation.md +502 -0
  92. package/research/config-docs/help-audit.md +264 -0
  93. package/research/config-docs/help-system-analysis.md +890 -0
  94. package/research/dogfood/consolidated-tool-evaluation.md +373 -0
  95. package/research/dogfood/strategy-a/a-synthesis.md +184 -0
  96. package/research/dogfood/strategy-a/a1-docs.md +226 -0
  97. package/research/dogfood/strategy-a/a2-amorphic.md +156 -0
  98. package/research/dogfood/strategy-a/a3-llm.md +164 -0
  99. package/research/dogfood/strategy-b/b-synthesis.md +228 -0
  100. package/research/dogfood/strategy-b/b1-architecture.md +207 -0
  101. package/research/dogfood/strategy-b/b2-gaps.md +258 -0
  102. package/research/dogfood/strategy-b/b3-workflows.md +250 -0
  103. package/research/dogfood/strategy-c/c-synthesis.md +451 -0
  104. package/research/dogfood/strategy-c/c1-explorer.md +192 -0
  105. package/research/dogfood/strategy-c/c2-diver-memory.md +145 -0
  106. package/research/dogfood/strategy-c/c3-diver-control.md +148 -0
  107. package/research/dogfood/strategy-c/c4-diver-failure.md +151 -0
  108. package/research/dogfood/strategy-c/c5-diver-execution.md +221 -0
  109. package/research/dogfood/strategy-c/c6-diver-org.md +221 -0
  110. package/research/effect-cli-error-handling.md +845 -0
  111. package/research/effect-errors-as-values.md +943 -0
  112. package/research/errors-task-analysis/00-consolidated-tasks.md +207 -0
  113. package/research/errors-task-analysis/cli-commands-analysis.md +909 -0
  114. package/research/errors-task-analysis/embeddings-analysis.md +709 -0
  115. package/research/errors-task-analysis/index-search-analysis.md +812 -0
  116. package/research/frontmatter/COMMENTS-ARE-SKIPPED.md +149 -0
  117. package/research/frontmatter/LLM-CODE-NAVIGATION.md +276 -0
  118. package/research/issue-review.md +603 -0
  119. package/research/llm-summarization/agent-cli-tools-2026.md +1082 -0
  120. package/research/llm-summarization/alternative-providers-2026.md +1428 -0
  121. package/research/llm-summarization/anthropic-2026.md +367 -0
  122. package/research/llm-summarization/claude-cli-integration.md +1706 -0
  123. package/research/llm-summarization/cli-integration-patterns.md +3155 -0
  124. package/research/llm-summarization/openai-2026.md +473 -0
  125. package/research/llm-summarization/openai-compatible-providers-2026.md +1022 -0
  126. package/research/llm-summarization/opencode-cli-integration.md +1552 -0
  127. package/research/llm-summarization/prompt-engineering-2026.md +1426 -0
  128. package/research/llm-summarization/prototype-results.md +56 -0
  129. package/research/llm-summarization/provider-switching-patterns-2026.md +2153 -0
  130. package/research/llm-summarization/typescript-llm-libraries-2026.md +2436 -0
  131. package/research/mdcontext-error-analysis.md +521 -0
  132. package/research/mdcontext-pudding/00-EXECUTIVE-SUMMARY.md +282 -0
  133. package/research/mdcontext-pudding/01-index-embed.md +956 -0
  134. package/research/mdcontext-pudding/02-search-COMMANDS.md +142 -0
  135. package/research/mdcontext-pudding/02-search-SUMMARY.md +146 -0
  136. package/research/mdcontext-pudding/02-search.md +970 -0
  137. package/research/mdcontext-pudding/03-context.md +779 -0
  138. package/research/mdcontext-pudding/04-navigation-and-analytics.md +803 -0
  139. package/research/mdcontext-pudding/04-tree.md +704 -0
  140. package/research/mdcontext-pudding/05-config.md +1038 -0
  141. package/research/mdcontext-pudding/06-links-summary.txt +87 -0
  142. package/research/mdcontext-pudding/06-links.md +679 -0
  143. package/research/mdcontext-pudding/07-stats.md +693 -0
  144. package/research/mdcontext-pudding/BUG-FIX-PLAN.md +388 -0
  145. package/research/mdcontext-pudding/P0-BUG-VALIDATION.md +167 -0
  146. package/research/mdcontext-pudding/README.md +168 -0
  147. package/research/mdcontext-pudding/TESTING-SUMMARY.md +128 -0
  148. package/research/npm_publish/011-npm-workflow-research-agent2.md +792 -0
  149. package/research/npm_publish/012-npm-workflow-research-agent1.md +530 -0
  150. package/research/npm_publish/013-npm-workflow-research-agent3.md +722 -0
  151. package/research/npm_publish/014-npm-workflow-synthesis.md +556 -0
  152. package/research/npm_publish/031-npm-workflow-task-analysis.md +134 -0
  153. package/research/research-quality-review.md +834 -0
  154. package/research/semantic-search/002-research-embedding-models.md +490 -0
  155. package/research/semantic-search/003-research-rag-alternatives.md +523 -0
  156. package/research/semantic-search/004-research-vector-search.md +841 -0
  157. package/research/semantic-search/032-research-semantic-search.md +427 -0
  158. package/research/semantic-search/embedding-text-analysis.md +156 -0
  159. package/research/semantic-search/multi-word-failure-reproduction.md +171 -0
  160. package/research/semantic-search/query-processing-analysis.md +207 -0
  161. package/research/semantic-search/root-cause-and-solution.md +114 -0
  162. package/research/semantic-search/threshold-validation-report.md +69 -0
  163. package/research/semantic-search/vector-search-analysis.md +63 -0
  164. package/research/task-management-2026/00-synthesis-recommendations.md +295 -0
  165. package/research/task-management-2026/01-ai-workflow-tools.md +416 -0
  166. package/research/task-management-2026/02-agent-framework-patterns.md +476 -0
  167. package/research/task-management-2026/03-lightweight-file-based.md +567 -0
  168. package/research/task-management-2026/04-established-tools-ai-features.md +541 -0
  169. package/research/task-management-2026/linear/01-core-features-workflow.md +771 -0
  170. package/research/task-management-2026/linear/02-api-integrations.md +930 -0
  171. package/research/task-management-2026/linear/03-ai-features.md +368 -0
  172. package/research/task-management-2026/linear/04-pricing-setup.md +205 -0
  173. package/research/task-management-2026/linear/05-usage-patterns-best-practices.md +605 -0
  174. package/research/test-path-issues.md +276 -0
  175. package/review/ALP-76/1-error-type-design.md +962 -0
  176. package/review/ALP-76/2-error-handling-patterns.md +906 -0
  177. package/review/ALP-76/3-error-presentation.md +624 -0
  178. package/review/ALP-76/4-test-coverage.md +625 -0
  179. package/review/ALP-76/5-migration-completeness.md +440 -0
  180. package/review/ALP-76/6-effect-best-practices.md +755 -0
  181. package/scripts/apply-branch-protection.sh +47 -0
  182. package/scripts/branch-protection-templates.json +79 -0
  183. package/scripts/prototype-summarization.ts +346 -0
  184. package/scripts/rebuild-hnswlib.js +58 -0
  185. package/scripts/setup-branch-protection.sh +64 -0
  186. package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/active-provider.json +7 -0
  187. package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/bm25.json +541 -0
  188. package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/bm25.meta.json +5 -0
  189. package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/config.json +8 -0
  190. package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/embeddings/openai_text-embedding-3-small_512/vectors.bin +0 -0
  191. package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/embeddings/openai_text-embedding-3-small_512/vectors.meta.bin +0 -0
  192. package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/indexes/documents.json +60 -0
  193. package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/indexes/links.json +13 -0
  194. package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/indexes/sections.json +1197 -0
  195. package/src/__tests__/fixtures/semantic-search/multi-word-corpus/configuration-management.md +99 -0
  196. package/src/__tests__/fixtures/semantic-search/multi-word-corpus/distributed-systems.md +92 -0
  197. package/src/__tests__/fixtures/semantic-search/multi-word-corpus/error-handling.md +78 -0
  198. package/src/__tests__/fixtures/semantic-search/multi-word-corpus/failure-automation.md +55 -0
  199. package/src/__tests__/fixtures/semantic-search/multi-word-corpus/job-context.md +69 -0
  200. package/src/__tests__/fixtures/semantic-search/multi-word-corpus/process-orchestration.md +99 -0
  201. package/src/cli/argv-preprocessor.test.ts +210 -0
  202. package/src/cli/argv-preprocessor.ts +202 -0
  203. package/src/cli/cli.test.ts +627 -0
  204. package/src/cli/commands/backlinks.ts +54 -0
  205. package/src/cli/commands/config-cmd.ts +642 -0
  206. package/src/cli/commands/context.ts +285 -0
  207. package/src/cli/commands/duplicates.ts +122 -0
  208. package/src/cli/commands/embeddings.ts +529 -0
  209. package/src/cli/commands/index-cmd.ts +480 -0
  210. package/src/cli/commands/index.ts +16 -0
  211. package/src/cli/commands/links.ts +52 -0
  212. package/src/cli/commands/search.ts +1281 -0
  213. package/src/cli/commands/stats.ts +149 -0
  214. package/src/cli/commands/tree.ts +128 -0
  215. package/src/cli/config-layer.ts +176 -0
  216. package/src/cli/error-handler.test.ts +235 -0
  217. package/src/cli/error-handler.ts +655 -0
  218. package/src/cli/flag-schemas.ts +341 -0
  219. package/src/cli/help.ts +588 -0
  220. package/src/cli/index.ts +9 -0
  221. package/src/cli/main.ts +435 -0
  222. package/src/cli/options.ts +41 -0
  223. package/src/cli/shared-error-handling.ts +199 -0
  224. package/src/cli/typo-suggester.test.ts +105 -0
  225. package/src/cli/typo-suggester.ts +130 -0
  226. package/src/cli/utils.ts +259 -0
  227. package/src/config/file-provider.test.ts +320 -0
  228. package/src/config/file-provider.ts +273 -0
  229. package/src/config/index.ts +72 -0
  230. package/src/config/integration.test.ts +667 -0
  231. package/src/config/precedence.test.ts +277 -0
  232. package/src/config/precedence.ts +451 -0
  233. package/src/config/schema.test.ts +414 -0
  234. package/src/config/schema.ts +603 -0
  235. package/src/config/service.test.ts +320 -0
  236. package/src/config/service.ts +243 -0
  237. package/src/config/testing.test.ts +264 -0
  238. package/src/config/testing.ts +110 -0
  239. package/src/core/index.ts +1 -0
  240. package/src/core/types.ts +113 -0
  241. package/src/duplicates/detector.test.ts +183 -0
  242. package/src/duplicates/detector.ts +414 -0
  243. package/src/duplicates/index.ts +18 -0
  244. package/src/embeddings/embedding-namespace.test.ts +300 -0
  245. package/src/embeddings/embedding-namespace.ts +947 -0
  246. package/src/embeddings/heading-boost.test.ts +222 -0
  247. package/src/embeddings/hnsw-build-options.test.ts +198 -0
  248. package/src/embeddings/hyde.test.ts +272 -0
  249. package/src/embeddings/hyde.ts +264 -0
  250. package/src/embeddings/index.ts +10 -0
  251. package/src/embeddings/openai-provider.ts +414 -0
  252. package/src/embeddings/pricing.json +22 -0
  253. package/src/embeddings/provider-constants.ts +204 -0
  254. package/src/embeddings/provider-errors.test.ts +967 -0
  255. package/src/embeddings/provider-errors.ts +565 -0
  256. package/src/embeddings/provider-factory.test.ts +240 -0
  257. package/src/embeddings/provider-factory.ts +225 -0
  258. package/src/embeddings/provider-integration.test.ts +788 -0
  259. package/src/embeddings/query-preprocessing.test.ts +187 -0
  260. package/src/embeddings/semantic-search-threshold.test.ts +508 -0
  261. package/src/embeddings/semantic-search.ts +1270 -0
  262. package/src/embeddings/types.ts +359 -0
  263. package/src/embeddings/vector-store.ts +708 -0
  264. package/src/embeddings/voyage-provider.ts +313 -0
  265. package/src/errors/errors.test.ts +845 -0
  266. package/src/errors/index.ts +533 -0
  267. package/src/index/ignore-patterns.test.ts +354 -0
  268. package/src/index/ignore-patterns.ts +305 -0
  269. package/src/index/index.ts +4 -0
  270. package/src/index/indexer.ts +684 -0
  271. package/src/index/storage.ts +260 -0
  272. package/src/index/types.ts +147 -0
  273. package/src/index/watcher.ts +189 -0
  274. package/src/index.ts +30 -0
  275. package/src/integration/search-keyword.test.ts +678 -0
  276. package/src/mcp/server.ts +612 -0
  277. package/src/parser/index.ts +1 -0
  278. package/src/parser/parser.test.ts +291 -0
  279. package/src/parser/parser.ts +394 -0
  280. package/src/parser/section-filter.test.ts +277 -0
  281. package/src/parser/section-filter.ts +392 -0
  282. package/src/search/__tests__/hybrid-search.test.ts +650 -0
  283. package/src/search/bm25-store.ts +366 -0
  284. package/src/search/cross-encoder.test.ts +253 -0
  285. package/src/search/cross-encoder.ts +406 -0
  286. package/src/search/fuzzy-search.test.ts +419 -0
  287. package/src/search/fuzzy-search.ts +273 -0
  288. package/src/search/hybrid-search.ts +448 -0
  289. package/src/search/path-matcher.test.ts +276 -0
  290. package/src/search/path-matcher.ts +33 -0
  291. package/src/search/query-parser.test.ts +260 -0
  292. package/src/search/query-parser.ts +319 -0
  293. package/src/search/searcher.test.ts +280 -0
  294. package/src/search/searcher.ts +724 -0
  295. package/src/search/wink-bm25.d.ts +30 -0
  296. package/src/summarization/cli-providers/claude.ts +202 -0
  297. package/src/summarization/cli-providers/detection.test.ts +273 -0
  298. package/src/summarization/cli-providers/detection.ts +118 -0
  299. package/src/summarization/cli-providers/index.ts +8 -0
  300. package/src/summarization/cost.test.ts +139 -0
  301. package/src/summarization/cost.ts +102 -0
  302. package/src/summarization/error-handler.test.ts +127 -0
  303. package/src/summarization/error-handler.ts +111 -0
  304. package/src/summarization/index.ts +102 -0
  305. package/src/summarization/pipeline.test.ts +498 -0
  306. package/src/summarization/pipeline.ts +231 -0
  307. package/src/summarization/prompts.test.ts +269 -0
  308. package/src/summarization/prompts.ts +133 -0
  309. package/src/summarization/provider-factory.test.ts +396 -0
  310. package/src/summarization/provider-factory.ts +178 -0
  311. package/src/summarization/types.ts +184 -0
  312. package/src/summarize/budget-bugs.test.ts +620 -0
  313. package/src/summarize/formatters.ts +419 -0
  314. package/src/summarize/index.ts +20 -0
  315. package/src/summarize/summarizer.test.ts +275 -0
  316. package/src/summarize/summarizer.ts +597 -0
  317. package/src/summarize/verify-bugs.test.ts +238 -0
  318. package/src/types/huggingface-transformers.d.ts +66 -0
  319. package/src/utils/index.ts +1 -0
  320. package/src/utils/tokens.test.ts +142 -0
  321. package/src/utils/tokens.ts +186 -0
  322. package/tests/fixtures/cli/.mdcontext/active-provider.json +7 -0
  323. package/tests/fixtures/cli/.mdcontext/config.json +8 -0
  324. package/tests/fixtures/cli/.mdcontext/embeddings/openai_text-embedding-3-small_512/vectors.bin +0 -0
  325. package/tests/fixtures/cli/.mdcontext/embeddings/openai_text-embedding-3-small_512/vectors.meta.bin +0 -0
  326. package/tests/fixtures/cli/.mdcontext/indexes/documents.json +33 -0
  327. package/tests/fixtures/cli/.mdcontext/indexes/links.json +12 -0
  328. package/tests/fixtures/cli/.mdcontext/indexes/sections.json +247 -0
  329. package/tests/fixtures/cli/README.md +9 -0
  330. package/tests/fixtures/cli/api-reference.md +11 -0
  331. package/tests/fixtures/cli/getting-started.md +11 -0
  332. package/tests/integration/embed-index.test.ts +712 -0
  333. package/tests/integration/search-context.test.ts +469 -0
  334. package/tests/integration/search-semantic.test.ts +522 -0
  335. package/tsconfig.json +26 -0
  336. package/vitest.config.ts +16 -0
  337. package/vitest.setup.ts +12 -0
@@ -0,0 +1,414 @@
1
+ /**
2
+ * Duplicate Content Detection
3
+ *
4
+ * Detects duplicate and near-duplicate content across markdown sections.
5
+ * Exact hash matching is implemented today; embedding-similarity detection is planned but not yet available.
6
+ */
7
+
8
+ import * as crypto from 'node:crypto'
9
+ import * as fs from 'node:fs/promises'
10
+ import * as path from 'node:path'
11
+ import { Effect } from 'effect'
12
+ import type { FileReadError, IndexCorruptedError } from '../errors/index.js'
13
+ import { createStorage, loadSectionIndex } from '../index/storage.js'
14
+
15
+ // ============================================================================
16
+ // Types
17
+ // ============================================================================
18
+
/**
 * One cluster of sections judged to carry the same content, split into a
 * single representative and the remaining members.
 */
export interface DuplicateGroup {
  /** Representative section (first encountered or highest-ranked). */
  readonly primary: DuplicateSectionInfo
  /** Every other section that duplicates the primary. */
  readonly duplicates: ReadonlyArray<DuplicateSectionInfo>
  /** Which detection method produced this group. */
  readonly method: 'exact' | 'similar'
  /** Similarity score: 1.0 for exact matches, below 1.0 for similar ones. */
  readonly similarity: number
}
32
+
/**
 * Identifying details of one section that belongs to a duplicate group.
 */
export interface DuplicateSectionInfo {
  /** Identifier of the section, copied from the section index entry. */
  readonly sectionId: string
  /** Path of the document that contains the section. */
  readonly documentPath: string
  /** Heading text of the section. */
  readonly heading: string
  /** First line of the section (1-based, as used when slicing file content). */
  readonly startLine: number
  /** Last line of the section (inclusive). */
  readonly endLine: number
  /** Token count recorded for the section in the index. */
  readonly tokenCount: number
}
44
+
/**
 * Tunables accepted by the duplicate detectors. Every field is optional.
 */
export interface DuplicateDetectionOptions {
  /** Sections with fewer characters than this are left out of detection. */
  readonly minContentLength?: number | undefined
  /** Cut-off for near-duplicate detection, in the range 0-1 (default: 0.85). */
  readonly similarityThreshold?: number | undefined
  /** Report exact matches only, skipping similarity detection. */
  readonly exactOnly?: boolean | undefined
  /** Only consider documents whose path matches this pattern. */
  readonly pathPattern?: string | undefined
}
58
+
/**
 * Outcome of a duplicate detection run: the groups found plus summary counts.
 */
export interface DuplicateDetectionResult {
  /** The duplicate groups that were found. */
  readonly groups: ReadonlyArray<DuplicateGroup>
  /** How many sections were analyzed in total. */
  readonly sectionsAnalyzed: number
  /** How many duplicate pairs were found in total. */
  readonly duplicatePairs: number
  /** How many sections take part in at least one duplicate relationship. */
  readonly sectionsWithDuplicates: number
}
72
+
/**
 * Settings that control how duplicate search results are collapsed.
 */
export interface CollapseOptions {
  /** Attach the locations of collapsed duplicates to each result. */
  readonly showLocations?: boolean
  /** Upper bound on the number of duplicate locations attached per result. */
  readonly maxLocations?: number
}
82
+
/**
 * A search result that stands in for itself and any duplicates folded into it.
 */
export interface CollapsedResult<T> {
  /** The result that was kept. */
  readonly result: T
  /** How many duplicates were folded into this result. */
  readonly duplicateCount: number
  /** Where the duplicates live; `undefined` unless `showLocations` is enabled. */
  readonly duplicateLocations:
    | ReadonlyArray<{
        readonly documentPath: string
        readonly heading: string
      }>
    | undefined
}
99
+
100
+ // ============================================================================
101
+ // Content Hashing
102
+ // ============================================================================
103
+
/**
 * Normalize content for comparison so that purely cosmetic whitespace
 * differences do not change the resulting hash.
 *
 * Steps, in order:
 * 1. Convert `\r\n` and bare `\r` line endings to `\n`.
 * 2. Collapse every run of spaces/tabs into a single space.
 * 3. Drop the space left at the end of a line (this also empties
 *    whitespace-only lines so step 4 can see them as blank).
 * 4. Collapse three or more consecutive newlines into exactly two.
 * 5. Trim leading and trailing whitespace.
 *
 * @param content - Raw section text
 * @returns The normalized text
 */
const normalizeContent = (content: string): string =>
  content
    .replace(/\r\n?/g, '\n')
    .replace(/[ \t]+/g, ' ')
    // After step 2 any trailing whitespace is exactly one space
    .replace(/ \n/g, '\n')
    .replace(/\n{3,}/g, '\n\n')
    .trim()
115
+
/**
 * Fingerprint used for exact duplicate detection: the hex-encoded SHA-256
 * digest of the normalized content.
 */
const computeContentHash = (content: string): string => {
  const hasher = crypto.createHash('sha256')
  hasher.update(normalizeContent(content))
  return hasher.digest('hex')
}
123
+
124
+ // ============================================================================
125
+ // Section Content Loading
126
+ // ============================================================================
127
+
/**
 * Cache of file contents, so that several sections taken from the same file
 * only trigger one read.
 */
interface FileContentCache {
  /** Backing store; a `null` value records a file that could not be read. */
  readonly cache: Map<string, string | null>
  /** Look up (and load on first use) the content of a document under a root. */
  get: (
    rootPath: string,
    documentPath: string,
  ) => Effect.Effect<string | null, never>
}
139
+
/**
 * Create a file content cache for efficient repeated lookups.
 *
 * Entries are keyed by the joined `rootPath` + `documentPath`, so the same
 * relative path under two different roots never shares an entry. A file that
 * cannot be read is stored as `null` and is not read again.
 *
 * @returns A cache whose `get` never fails; read errors surface as `null`
 */
const createFileContentCache = (): FileContentCache => {
  const cache = new Map<string, string | null>()

  return {
    cache,
    get: (rootPath: string, documentPath: string) =>
      Effect.gen(function* () {
        const filePath = path.join(rootPath, documentPath)

        // `undefined` means "never looked up"; `null` is a cached read failure
        const cached = cache.get(filePath)
        if (cached !== undefined) {
          return cached
        }

        // Best effort by design: an unreadable file is reported as `null`
        const content = yield* Effect.promise(() =>
          fs.readFile(filePath, 'utf-8').catch(() => null),
        )
        cache.set(filePath, content)
        return content
      }),
  }
}
166
+
/**
 * Extract a section's text from already-loaded file content.
 *
 * @param content - Full file content
 * @param startLine - First line of the section (1-based, inclusive)
 * @param endLine - Last line of the section (1-based, inclusive)
 * @returns The selected lines joined with `\n`; empty when the range selects
 *   nothing
 */
const extractSectionFromContent = (
  content: string,
  startLine: number,
  endLine: number,
): string => {
  const lines = content.split('\n')
  // Clamp both bounds: a negative slice index would count from the end of the
  // file and silently select the wrong lines.
  const from = Math.max(startLine, 1) - 1
  const to = Math.max(endLine, 0)
  return lines.slice(from, to).join('\n')
}
178
+
179
+ // ============================================================================
180
+ // Duplicate Detection
181
+ // ============================================================================
182
+
/**
 * Detect duplicate sections using content hashing (exact matches).
 * This is fast and doesn't require embeddings.
 *
 * Sections are read from disk, normalized, hashed, and grouped by hash. Files
 * are processed concurrently, but results are merged in the order the sections
 * were grouped by file, so the choice of `primary` and the order of groups
 * with equal size are stable between runs.
 *
 * @param rootPath - Root directory whose section index is loaded and against
 *   which document paths are resolved
 * @param options - Only `minContentLength` (default 50) and `pathPattern` are
 *   consulted here
 * @returns Duplicate groups sorted by number of duplicates (descending).
 *   `sectionsAnalyzed` counts every section that passed the path filter,
 *   including ones later skipped as too short or unreadable. `duplicatePairs`
 *   adds one per non-primary member of each group. An all-zero result is
 *   returned when no section index is available.
 */
export const detectExactDuplicates = (
  rootPath: string,
  options: DuplicateDetectionOptions = {},
): Effect.Effect<
  DuplicateDetectionResult,
  FileReadError | IndexCorruptedError
> =>
  Effect.gen(function* () {
    const minContentLength = options.minContentLength ?? 50
    const storage = createStorage(rootPath)

    // Load section index
    const sectionIndex = yield* loadSectionIndex(storage)
    if (!sectionIndex) {
      return {
        groups: [],
        sectionsAnalyzed: 0,
        duplicatePairs: 0,
        sectionsWithDuplicates: 0,
      }
    }

    const sections = Object.values(sectionIndex.sections)

    // Filter sections by path pattern if specified
    const { pathPattern } = options
    const filteredSections = pathPattern
      ? sections.filter((s) => matchPathPattern(s.documentPath, pathPattern))
      : sections

    // Group sections by file so each file is read once
    const sectionsByFile = new Map<string, typeof filteredSections>()
    for (const section of filteredSections) {
      const existing = sectionsByFile.get(section.documentPath)
      if (existing) {
        existing.push(section)
      } else {
        sectionsByFile.set(section.documentPath, [section])
      }
    }

    // Create file content cache to avoid re-reading files
    const fileCache = createFileContentCache()

    // Hash every file's sections concurrently. Each worker returns its own
    // list instead of writing to shared state, so completion order cannot
    // influence the final grouping.
    const hashedPerFile = yield* Effect.all(
      Array.from(sectionsByFile.entries()).map(
        ([documentPath, fileSections]) =>
          Effect.gen(function* () {
            const hashed: Array<readonly [string, DuplicateSectionInfo]> = []

            // Load file content once (cached); skip unreadable or empty files
            const fileContent = yield* fileCache.get(rootPath, documentPath)
            if (!fileContent) return hashed

            for (const section of fileSections) {
              const content = extractSectionFromContent(
                fileContent,
                section.startLine,
                section.endLine,
              )

              if (content.length < minContentLength) {
                continue
              }

              const info: DuplicateSectionInfo = {
                sectionId: section.id,
                documentPath: section.documentPath,
                heading: section.heading,
                startLine: section.startLine,
                endLine: section.endLine,
                tokenCount: section.tokenCount,
              }
              hashed.push([computeContentHash(content), info] as const)
            }

            return hashed
          }),
      ),
      { concurrency: 10 },
    )

    // Map: hash -> list of sections with that hash, merged in input order
    const hashGroups = new Map<string, DuplicateSectionInfo[]>()
    for (const hashed of hashedPerFile) {
      for (const [hash, info] of hashed) {
        const existing = hashGroups.get(hash)
        if (existing) {
          existing.push(info)
        } else {
          hashGroups.set(hash, [info])
        }
      }
    }

    // Convert to DuplicateGroup format
    const groups: DuplicateGroup[] = []
    let duplicatePairs = 0
    const sectionsInDuplicates = new Set<string>()

    for (const members of hashGroups.values()) {
      const [primary, ...duplicates] = members
      // A hash shared by a single section is not a duplicate
      if (primary === undefined || duplicates.length === 0) {
        continue
      }

      groups.push({
        primary,
        duplicates,
        method: 'exact',
        similarity: 1.0,
      })

      // Track stats
      duplicatePairs += duplicates.length
      for (const m of members) {
        sectionsInDuplicates.add(m.sectionId)
      }
    }

    // Sort by number of duplicates (descending)
    groups.sort((a, b) => b.duplicates.length - a.duplicates.length)

    return {
      groups,
      sectionsAnalyzed: filteredSections.length,
      duplicatePairs,
      sectionsWithDuplicates: sectionsInDuplicates.size,
    }
  })
311
+
/**
 * Simple path pattern matching (supports glob-like patterns).
 *
 * Supported wildcards:
 * - `*` matches any run of characters within one path segment (no `/`)
 * - `**` matches any run of characters, including `/`
 *
 * Every other character, including `?`, is matched literally. The pattern is
 * anchored at the start of the path only, so a pattern also matches any path
 * it is a prefix of (e.g. `docs` matches `docs/guide.md`).
 *
 * @param filePath - Path to test
 * @param pattern - Glob-like pattern
 * @returns `true` when the start of `filePath` matches `pattern`
 */
const matchPathPattern = (filePath: string, pattern: string): boolean => {
  const regexPattern = pattern
    // Escape regex metacharacters (including `?`); `*` is handled below
    .replace(/[.+?^${}()|[\]\\]/g, '\\$&')
    // Translate both wildcards in a single pass so the `*` inside the `.*`
    // produced for `**` is never rewritten by the single-star rule
    .replace(/\*\*|\*/g, (wildcard) => (wildcard === '**' ? '.*' : '[^/]*'))
  return new RegExp(`^${regexPattern}`).test(filePath)
}
324
+
325
+ // ============================================================================
326
+ // Search Result Collapsing
327
+ // ============================================================================
328
+
/**
 * Collapse duplicate search results.
 * Takes search results and duplicate groups, returns collapsed results.
 *
 * For every duplicate group only the first result encountered is kept; later
 * results from the same group are dropped. Results that belong to no group
 * pass through with a duplicate count of 0. Input order is preserved.
 *
 * The reported duplicate locations are the group's other members relative to
 * the result that was kept, so a kept result never lists itself and the
 * group's primary is listed when a non-primary member was kept.
 *
 * @param results - Search results with sectionId property
 * @param duplicateGroups - Pre-computed duplicate groups
 * @param options - Collapse options (`maxLocations` defaults to 3)
 * @returns Collapsed results with duplicate counts
 */
export const collapseDuplicates = <
  T extends { readonly sectionId: string; readonly documentPath: string },
>(
  results: readonly T[],
  duplicateGroups: readonly DuplicateGroup[],
  options: CollapseOptions = {},
): readonly CollapsedResult<T>[] => {
  const maxLocations = options.maxLocations ?? 3

  // sectionId -> sectionId of the primary of the group it belongs to
  const primaryMap = new Map<string, string>()
  // primary sectionId -> every member of that group (primary first)
  const membersByPrimary = new Map<string, readonly DuplicateSectionInfo[]>()

  for (const group of duplicateGroups) {
    const groupPrimaryId = group.primary.sectionId
    const members = [group.primary, ...group.duplicates]
    membersByPrimary.set(groupPrimaryId, members)
    for (const member of members) {
      primaryMap.set(member.sectionId, groupPrimaryId)
    }
  }

  // Track which groups have already contributed a result
  const seenPrimaries = new Set<string>()
  const collapsedResults: CollapsedResult<T>[] = []

  for (const result of results) {
    // Sections outside every group act as their own one-member group
    const primaryId = primaryMap.get(result.sectionId) ?? result.sectionId

    if (seenPrimaries.has(primaryId)) {
      // Skip - we've already added this duplicate group
      continue
    }

    seenPrimaries.add(primaryId)

    // Everything in the group except the result being shown
    const others = (membersByPrimary.get(primaryId) ?? []).filter(
      (member) => member.sectionId !== result.sectionId,
    )
    const duplicateLocations =
      options.showLocations && others.length > 0
        ? others.slice(0, maxLocations).map((d) => ({
            documentPath: d.documentPath,
            heading: d.heading,
          }))
        : undefined

    collapsedResults.push({
      result,
      duplicateCount: others.length,
      duplicateLocations,
    })
  }

  return collapsedResults
}
395
+
396
+ // ============================================================================
397
+ // Main Entry Point (exact-hash detection; reads section content from disk)
398
+ // ============================================================================
399
+
/**
 * Main entry point for duplicate detection.
 *
 * Only exact detection via content hashing is supported for now, so this
 * hands both arguments straight to `detectExactDuplicates`.
 * Future: add embedding-based similarity detection for near-duplicates.
 */
export const detectDuplicates = (
  rootPath: string,
  options: DuplicateDetectionOptions = {},
): Effect.Effect<
  DuplicateDetectionResult,
  FileReadError | IndexCorruptedError
> => detectExactDuplicates(rootPath, options)
@@ -0,0 +1,18 @@
1
+ /**
2
+ * Duplicate detection module exports
3
+ */
4
+
5
+ export type {
6
+ CollapsedResult,
7
+ CollapseOptions,
8
+ DuplicateDetectionOptions,
9
+ DuplicateDetectionResult,
10
+ DuplicateGroup,
11
+ DuplicateSectionInfo,
12
+ } from './detector.js'
13
+
14
+ export {
15
+ collapseDuplicates,
16
+ detectDuplicates,
17
+ detectExactDuplicates,
18
+ } from './detector.js'