mdcontext 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (251)
  1. package/.changeset/config.json +9 -9
  2. package/.claude/settings.local.json +25 -0
  3. package/.github/workflows/claude-code-review.yml +44 -0
  4. package/.github/workflows/claude.yml +85 -0
  5. package/CONTRIBUTING.md +186 -0
  6. package/NOTES/NOTES +44 -0
  7. package/README.md +206 -3
  8. package/biome.json +1 -1
  9. package/dist/chunk-23UPXDNL.js +3044 -0
  10. package/dist/chunk-2W7MO2DL.js +1366 -0
  11. package/dist/chunk-3NUAZGMA.js +1689 -0
  12. package/dist/chunk-7TOWB2XB.js +366 -0
  13. package/dist/chunk-7XOTOADQ.js +3065 -0
  14. package/dist/chunk-AH2PDM2K.js +3042 -0
  15. package/dist/chunk-BNXWSZ63.js +3742 -0
  16. package/dist/chunk-BTL5DJVU.js +3222 -0
  17. package/dist/chunk-HDHYG7E4.js +104 -0
  18. package/dist/chunk-HLR4KZBP.js +3234 -0
  19. package/dist/chunk-IP3FRFEB.js +1045 -0
  20. package/dist/chunk-KHU56VDO.js +3042 -0
  21. package/dist/chunk-KRYIFLQR.js +85 -89
  22. package/dist/chunk-LBSDNLEM.js +287 -0
  23. package/dist/chunk-MNTQ7HCP.js +2643 -0
  24. package/dist/chunk-MUJELQQ6.js +1387 -0
  25. package/dist/chunk-MXJGMSLV.js +2199 -0
  26. package/dist/chunk-N6QJGC3Z.js +2636 -0
  27. package/dist/chunk-OBELGBPM.js +1713 -0
  28. package/dist/chunk-OT7R5XTA.js +3192 -0
  29. package/dist/chunk-P7X4RA2T.js +106 -0
  30. package/dist/chunk-PIDUQNC2.js +3185 -0
  31. package/dist/chunk-POGCDIH4.js +3187 -0
  32. package/dist/chunk-PSIEOQGZ.js +3043 -0
  33. package/dist/chunk-PVRT3IHA.js +3238 -0
  34. package/dist/chunk-QNN4TT23.js +1430 -0
  35. package/dist/chunk-RE3R45RJ.js +3042 -0
  36. package/dist/chunk-S7E6TFX6.js +718 -657
  37. package/dist/chunk-SG6GLU4U.js +1378 -0
  38. package/dist/chunk-SJCDV2ST.js +274 -0
  39. package/dist/chunk-SYE5XLF3.js +104 -0
  40. package/dist/chunk-T5VLYBZD.js +103 -0
  41. package/dist/chunk-TOQB7VWU.js +3238 -0
  42. package/dist/chunk-VFNMZ4ZQ.js +3228 -0
  43. package/dist/chunk-VVTGZNBT.js +1533 -1423
  44. package/dist/chunk-W7Q4RFEV.js +104 -0
  45. package/dist/chunk-XTYYVRLO.js +3190 -0
  46. package/dist/chunk-Y6MDYVJD.js +3063 -0
  47. package/dist/cli/main.js +4072 -629
  48. package/dist/index.d.ts +420 -33
  49. package/dist/index.js +8 -15
  50. package/dist/mcp/server.js +103 -7
  51. package/dist/schema-BAWSG7KY.js +22 -0
  52. package/dist/schema-E3QUPL26.js +20 -0
  53. package/dist/schema-EHL7WUT6.js +20 -0
  54. package/docs/019-USAGE.md +44 -5
  55. package/docs/020-current-implementation.md +8 -8
  56. package/docs/021-DOGFOODING-FINDINGS.md +1 -1
  57. package/docs/CONFIG.md +1123 -0
  58. package/docs/ERRORS.md +383 -0
  59. package/docs/summarization.md +320 -0
  60. package/justfile +40 -0
  61. package/package.json +39 -33
  62. package/research/INDEX.md +315 -0
  63. package/research/code-review/README.md +90 -0
  64. package/research/code-review/cli-error-handling-review.md +979 -0
  65. package/research/code-review/code-review-validation-report.md +464 -0
  66. package/research/code-review/main-ts-review.md +1128 -0
  67. package/research/config-docs/SUMMARY.md +357 -0
  68. package/research/config-docs/TEST-RESULTS.md +776 -0
  69. package/research/config-docs/TODO.md +542 -0
  70. package/research/config-docs/analysis.md +744 -0
  71. package/research/config-docs/fix-validation.md +502 -0
  72. package/research/config-docs/help-audit.md +264 -0
  73. package/research/config-docs/help-system-analysis.md +890 -0
  74. package/research/frontmatter/COMMENTS-ARE-SKIPPED.md +149 -0
  75. package/research/frontmatter/LLM-CODE-NAVIGATION.md +276 -0
  76. package/research/issue-review.md +603 -0
  77. package/research/llm-summarization/agent-cli-tools-2026.md +1082 -0
  78. package/research/llm-summarization/alternative-providers-2026.md +1428 -0
  79. package/research/llm-summarization/anthropic-2026.md +367 -0
  80. package/research/llm-summarization/claude-cli-integration.md +1706 -0
  81. package/research/llm-summarization/cli-integration-patterns.md +3155 -0
  82. package/research/llm-summarization/openai-2026.md +473 -0
  83. package/research/llm-summarization/openai-compatible-providers-2026.md +1022 -0
  84. package/research/llm-summarization/opencode-cli-integration.md +1552 -0
  85. package/research/llm-summarization/prompt-engineering-2026.md +1426 -0
  86. package/research/llm-summarization/prototype-results.md +56 -0
  87. package/research/llm-summarization/provider-switching-patterns-2026.md +2153 -0
  88. package/research/llm-summarization/typescript-llm-libraries-2026.md +2436 -0
  89. package/research/mdcontext-pudding/00-EXECUTIVE-SUMMARY.md +282 -0
  90. package/research/mdcontext-pudding/01-index-embed.md +956 -0
  91. package/research/mdcontext-pudding/02-search-COMMANDS.md +142 -0
  92. package/research/mdcontext-pudding/02-search-SUMMARY.md +146 -0
  93. package/research/mdcontext-pudding/02-search.md +970 -0
  94. package/research/mdcontext-pudding/03-context.md +779 -0
  95. package/research/mdcontext-pudding/04-navigation-and-analytics.md +803 -0
  96. package/research/mdcontext-pudding/04-tree.md +704 -0
  97. package/research/mdcontext-pudding/05-config.md +1038 -0
  98. package/research/mdcontext-pudding/06-links-summary.txt +87 -0
  99. package/research/mdcontext-pudding/06-links.md +679 -0
  100. package/research/mdcontext-pudding/07-stats.md +693 -0
  101. package/research/mdcontext-pudding/BUG-FIX-PLAN.md +388 -0
  102. package/research/mdcontext-pudding/P0-BUG-VALIDATION.md +167 -0
  103. package/research/mdcontext-pudding/README.md +168 -0
  104. package/research/mdcontext-pudding/TESTING-SUMMARY.md +128 -0
  105. package/research/research-quality-review.md +834 -0
  106. package/research/semantic-search/embedding-text-analysis.md +156 -0
  107. package/research/semantic-search/multi-word-failure-reproduction.md +171 -0
  108. package/research/semantic-search/query-processing-analysis.md +207 -0
  109. package/research/semantic-search/root-cause-and-solution.md +114 -0
  110. package/research/semantic-search/threshold-validation-report.md +69 -0
  111. package/research/semantic-search/vector-search-analysis.md +63 -0
  112. package/research/test-path-issues.md +276 -0
  113. package/review/ALP-76/1-error-type-design.md +962 -0
  114. package/review/ALP-76/2-error-handling-patterns.md +906 -0
  115. package/review/ALP-76/3-error-presentation.md +624 -0
  116. package/review/ALP-76/4-test-coverage.md +625 -0
  117. package/review/ALP-76/5-migration-completeness.md +440 -0
  118. package/review/ALP-76/6-effect-best-practices.md +755 -0
  119. package/scripts/apply-branch-protection.sh +47 -0
  120. package/scripts/branch-protection-templates.json +79 -0
  121. package/scripts/prototype-summarization.ts +346 -0
  122. package/scripts/rebuild-hnswlib.js +32 -37
  123. package/scripts/setup-branch-protection.sh +64 -0
  124. package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/active-provider.json +7 -0
  125. package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/bm25.json +541 -0
  126. package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/bm25.meta.json +5 -0
  127. package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/config.json +8 -0
  128. package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/embeddings/openai_text-embedding-3-small_512/vectors.bin +0 -0
  129. package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/embeddings/openai_text-embedding-3-small_512/vectors.meta.bin +0 -0
  130. package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/indexes/documents.json +60 -0
  131. package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/indexes/links.json +13 -0
  132. package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/indexes/sections.json +1197 -0
  133. package/src/__tests__/fixtures/semantic-search/multi-word-corpus/configuration-management.md +99 -0
  134. package/src/__tests__/fixtures/semantic-search/multi-word-corpus/distributed-systems.md +92 -0
  135. package/src/__tests__/fixtures/semantic-search/multi-word-corpus/error-handling.md +78 -0
  136. package/src/__tests__/fixtures/semantic-search/multi-word-corpus/failure-automation.md +55 -0
  137. package/src/__tests__/fixtures/semantic-search/multi-word-corpus/job-context.md +69 -0
  138. package/src/__tests__/fixtures/semantic-search/multi-word-corpus/process-orchestration.md +99 -0
  139. package/src/cli/argv-preprocessor.test.ts +2 -2
  140. package/src/cli/cli.test.ts +230 -33
  141. package/src/cli/commands/config-cmd.ts +642 -0
  142. package/src/cli/commands/context.ts +97 -9
  143. package/src/cli/commands/duplicates.ts +122 -0
  144. package/src/cli/commands/embeddings.ts +529 -0
  145. package/src/cli/commands/index-cmd.ts +210 -30
  146. package/src/cli/commands/index.ts +3 -0
  147. package/src/cli/commands/search.ts +894 -64
  148. package/src/cli/commands/stats.ts +3 -0
  149. package/src/cli/commands/tree.ts +26 -5
  150. package/src/cli/config-layer.ts +176 -0
  151. package/src/cli/error-handler.test.ts +235 -0
  152. package/src/cli/error-handler.ts +655 -0
  153. package/src/cli/flag-schemas.ts +66 -0
  154. package/src/cli/help.ts +209 -7
  155. package/src/cli/main.ts +348 -58
  156. package/src/cli/options.ts +10 -0
  157. package/src/cli/shared-error-handling.ts +199 -0
  158. package/src/cli/utils.ts +150 -17
  159. package/src/config/file-provider.test.ts +320 -0
  160. package/src/config/file-provider.ts +273 -0
  161. package/src/config/index.ts +72 -0
  162. package/src/config/integration.test.ts +667 -0
  163. package/src/config/precedence.test.ts +277 -0
  164. package/src/config/precedence.ts +451 -0
  165. package/src/config/schema.test.ts +414 -0
  166. package/src/config/schema.ts +603 -0
  167. package/src/config/service.test.ts +320 -0
  168. package/src/config/service.ts +243 -0
  169. package/src/config/testing.test.ts +264 -0
  170. package/src/config/testing.ts +110 -0
  171. package/src/core/types.ts +6 -33
  172. package/src/duplicates/detector.test.ts +183 -0
  173. package/src/duplicates/detector.ts +414 -0
  174. package/src/duplicates/index.ts +18 -0
  175. package/src/embeddings/embedding-namespace.test.ts +300 -0
  176. package/src/embeddings/embedding-namespace.ts +947 -0
  177. package/src/embeddings/heading-boost.test.ts +222 -0
  178. package/src/embeddings/hnsw-build-options.test.ts +198 -0
  179. package/src/embeddings/hyde.test.ts +272 -0
  180. package/src/embeddings/hyde.ts +264 -0
  181. package/src/embeddings/index.ts +2 -0
  182. package/src/embeddings/openai-provider.ts +332 -83
  183. package/src/embeddings/pricing.json +22 -0
  184. package/src/embeddings/provider-constants.ts +204 -0
  185. package/src/embeddings/provider-errors.test.ts +967 -0
  186. package/src/embeddings/provider-errors.ts +565 -0
  187. package/src/embeddings/provider-factory.test.ts +240 -0
  188. package/src/embeddings/provider-factory.ts +225 -0
  189. package/src/embeddings/provider-integration.test.ts +788 -0
  190. package/src/embeddings/query-preprocessing.test.ts +187 -0
  191. package/src/embeddings/semantic-search-threshold.test.ts +508 -0
  192. package/src/embeddings/semantic-search.ts +780 -93
  193. package/src/embeddings/types.ts +293 -16
  194. package/src/embeddings/vector-store.ts +486 -77
  195. package/src/embeddings/voyage-provider.ts +313 -0
  196. package/src/errors/errors.test.ts +845 -0
  197. package/src/errors/index.ts +533 -0
  198. package/src/index/ignore-patterns.test.ts +354 -0
  199. package/src/index/ignore-patterns.ts +305 -0
  200. package/src/index/indexer.ts +286 -48
  201. package/src/index/storage.ts +94 -30
  202. package/src/index/types.ts +40 -2
  203. package/src/index/watcher.ts +67 -9
  204. package/src/index.ts +22 -0
  205. package/src/integration/search-keyword.test.ts +678 -0
  206. package/src/mcp/server.ts +135 -6
  207. package/src/parser/parser.ts +18 -19
  208. package/src/parser/section-filter.test.ts +277 -0
  209. package/src/parser/section-filter.ts +125 -3
  210. package/src/search/__tests__/hybrid-search.test.ts +650 -0
  211. package/src/search/bm25-store.ts +366 -0
  212. package/src/search/cross-encoder.test.ts +253 -0
  213. package/src/search/cross-encoder.ts +406 -0
  214. package/src/search/fuzzy-search.test.ts +419 -0
  215. package/src/search/fuzzy-search.ts +273 -0
  216. package/src/search/hybrid-search.ts +448 -0
  217. package/src/search/path-matcher.test.ts +276 -0
  218. package/src/search/path-matcher.ts +33 -0
  219. package/src/search/searcher.test.ts +99 -1
  220. package/src/search/searcher.ts +189 -67
  221. package/src/search/wink-bm25.d.ts +30 -0
  222. package/src/summarization/cli-providers/claude.ts +202 -0
  223. package/src/summarization/cli-providers/detection.test.ts +273 -0
  224. package/src/summarization/cli-providers/detection.ts +118 -0
  225. package/src/summarization/cli-providers/index.ts +8 -0
  226. package/src/summarization/cost.test.ts +139 -0
  227. package/src/summarization/cost.ts +102 -0
  228. package/src/summarization/error-handler.test.ts +127 -0
  229. package/src/summarization/error-handler.ts +111 -0
  230. package/src/summarization/index.ts +102 -0
  231. package/src/summarization/pipeline.test.ts +498 -0
  232. package/src/summarization/pipeline.ts +231 -0
  233. package/src/summarization/prompts.test.ts +269 -0
  234. package/src/summarization/prompts.ts +133 -0
  235. package/src/summarization/provider-factory.test.ts +396 -0
  236. package/src/summarization/provider-factory.ts +178 -0
  237. package/src/summarization/types.ts +184 -0
  238. package/src/summarize/summarizer.ts +104 -35
  239. package/src/types/huggingface-transformers.d.ts +66 -0
  240. package/tests/fixtures/cli/.mdcontext/active-provider.json +7 -0
  241. package/tests/fixtures/cli/.mdcontext/embeddings/openai_text-embedding-3-small_512/vectors.bin +0 -0
  242. package/tests/fixtures/cli/.mdcontext/embeddings/openai_text-embedding-3-small_512/vectors.meta.bin +0 -0
  243. package/tests/fixtures/cli/.mdcontext/indexes/documents.json +4 -4
  244. package/tests/fixtures/cli/.mdcontext/indexes/sections.json +14 -0
  245. package/tests/integration/embed-index.test.ts +712 -0
  246. package/tests/integration/search-context.test.ts +469 -0
  247. package/tests/integration/search-semantic.test.ts +522 -0
  248. package/vitest.config.ts +1 -6
  249. package/AGENTS.md +0 -46
  250. package/tests/fixtures/cli/.mdcontext/vectors.bin +0 -0
  251. package/tests/fixtures/cli/.mdcontext/vectors.meta.json +0 -1264
@@ -0,0 +1,187 @@
1
+ /**
2
+ * Query Preprocessing Tests
3
+ *
4
+ * Tests for query preprocessing before embedding generation.
5
+ * Preprocessing normalizes queries to improve semantic search recall.
6
+ */
7
+
8
+ import { describe, expect, it } from 'vitest'
9
+ import { preprocessQuery, type SemanticSearchOptions } from './types.js'
10
+
11
+ describe('Query Preprocessing', () => {
12
+ describe('preprocessQuery function', () => {
13
+ it('should convert query to lowercase', () => {
14
+ expect(preprocessQuery('How Does Authentication Work')).toBe(
15
+ 'how does authentication work',
16
+ )
17
+ })
18
+
19
+ it('should replace punctuation with spaces', () => {
20
+ expect(preprocessQuery('user-authentication')).toBe('user authentication')
21
+ expect(preprocessQuery('what is config.json?')).toBe(
22
+ 'what is config json',
23
+ )
24
+ expect(preprocessQuery('test@example.com')).toBe('test example com')
25
+ })
26
+
27
+ it('should collapse multiple spaces to single space', () => {
28
+ expect(preprocessQuery('how does it work')).toBe('how does it work')
29
+ expect(preprocessQuery('user - auth')).toBe('user auth')
30
+ })
31
+
32
+ it('should trim leading/trailing whitespace', () => {
33
+ expect(preprocessQuery(' query ')).toBe('query')
34
+ expect(preprocessQuery(' how does it work ')).toBe('how does it work')
35
+ })
36
+
37
+ it('should handle empty string', () => {
38
+ expect(preprocessQuery('')).toBe('')
39
+ })
40
+
41
+ it('should handle whitespace-only string', () => {
42
+ expect(preprocessQuery(' ')).toBe('')
43
+ })
44
+
45
+ it('should preserve alphanumeric content', () => {
46
+ expect(preprocessQuery('user123')).toBe('user123')
47
+ expect(preprocessQuery('v2 api')).toBe('v2 api')
48
+ })
49
+
50
+ it('should handle complex queries', () => {
51
+ expect(preprocessQuery("What's the best way to handle errors?")).toBe(
52
+ 'what s the best way to handle errors',
53
+ )
54
+ expect(preprocessQuery('API v2.0 - authentication')).toBe(
55
+ 'api v2 0 authentication',
56
+ )
57
+ })
58
+
59
+ it('should handle special characters', () => {
60
+ expect(preprocessQuery('C++ programming')).toBe('c programming')
61
+ expect(preprocessQuery('Node.js')).toBe('node js')
62
+ expect(preprocessQuery('$PATH variable')).toBe('path variable')
63
+ })
64
+
65
+ it('should handle unicode and accented characters', () => {
66
+ // Accented characters are treated like punctuation: each one becomes a space ('naïve' -> 'na ve'), and a trailing one is trimmed away ('café' -> 'caf')
67
+ // This is intentional as embeddings handle normalized ASCII better
68
+ expect(preprocessQuery('café')).toBe('caf')
69
+ expect(preprocessQuery('naïve')).toBe('na ve')
70
+ // Basic ASCII preserved
71
+ expect(preprocessQuery('cafe')).toBe('cafe')
72
+ expect(preprocessQuery('naive')).toBe('naive')
73
+ })
74
+
75
+ it('should handle quotes', () => {
76
+ expect(preprocessQuery('"exact match"')).toBe('exact match')
77
+ expect(preprocessQuery("'single quotes'")).toBe('single quotes')
78
+ })
79
+
80
+ it('should handle brackets and parentheses', () => {
81
+ expect(preprocessQuery('function(args)')).toBe('function args')
82
+ expect(preprocessQuery('[array]')).toBe('array')
83
+ expect(preprocessQuery('{object}')).toBe('object')
84
+ })
85
+ })
86
+
87
+ describe('SemanticSearchOptions skipPreprocessing', () => {
88
+ it('should accept skipPreprocessing option in interface', () => {
89
+ const options: SemanticSearchOptions = {
90
+ skipPreprocessing: true,
91
+ }
92
+ expect(options.skipPreprocessing).toBe(true)
93
+ })
94
+
95
+ it('should default to undefined (preprocessing enabled)', () => {
96
+ const options: SemanticSearchOptions = {}
97
+ expect(options.skipPreprocessing).toBeUndefined()
98
+ })
99
+
100
+ it('should accept skipPreprocessing with other options', () => {
101
+ const options: SemanticSearchOptions = {
102
+ limit: 10,
103
+ threshold: 0.35,
104
+ skipPreprocessing: false,
105
+ }
106
+ expect(options.skipPreprocessing).toBe(false)
107
+ expect(options.limit).toBe(10)
108
+ expect(options.threshold).toBe(0.35)
109
+ })
110
+ })
111
+
112
+ describe('Preprocessing benefits', () => {
113
+ it('should normalize case variations', () => {
114
+ // Same query with different case should produce same result
115
+ const query1 = 'Authentication'
116
+ const query2 = 'authentication'
117
+ const query3 = 'AUTHENTICATION'
118
+
119
+ expect(preprocessQuery(query1)).toBe(preprocessQuery(query2))
120
+ expect(preprocessQuery(query2)).toBe(preprocessQuery(query3))
121
+ })
122
+
123
+ it('should normalize punctuation variations', () => {
124
+ // Queries that differ only in punctuation should normalize to the same string
125
+ const query1 = 'user-auth'
126
+ const query2 = 'user auth'
127
+
128
+ expect(preprocessQuery(query1)).toBe(preprocessQuery(query2))
129
+ })
130
+
131
+ it('should handle file path references gracefully', () => {
132
+ // File paths in queries should be handled
133
+ expect(preprocessQuery('src/components/Button.tsx')).toBe(
134
+ 'src components button tsx',
135
+ )
136
+ })
137
+
138
+ it('should handle code references gracefully', () => {
139
+ // Code snippets in queries should be handled
140
+ expect(preprocessQuery('function handleClick()')).toBe(
141
+ 'function handleclick',
142
+ )
143
+ })
144
+ })
145
+
146
+ describe('Edge cases', () => {
147
+ it('should handle only punctuation', () => {
148
+ expect(preprocessQuery('...')).toBe('')
149
+ expect(preprocessQuery('???')).toBe('')
150
+ })
151
+
152
+ it('should handle only numbers', () => {
153
+ expect(preprocessQuery('12345')).toBe('12345')
154
+ })
155
+
156
+ it('should handle mixed numbers and punctuation', () => {
157
+ expect(preprocessQuery('123-456-789')).toBe('123 456 789')
158
+ })
159
+
160
+ it('should handle underscores (word characters)', () => {
161
+ // Underscores are word characters in regex, so they're preserved
162
+ expect(preprocessQuery('user_name')).toBe('user_name')
163
+ })
164
+
165
+ it('should handle newlines', () => {
166
+ expect(preprocessQuery('line1\nline2')).toBe('line1 line2')
167
+ })
168
+
169
+ it('should handle tabs', () => {
170
+ expect(preprocessQuery('tab\ttab')).toBe('tab tab')
171
+ })
172
+ })
173
+ })
174
+
175
+ describe('Export verification', () => {
176
+ it('should export preprocessQuery from types module', async () => {
177
+ const { preprocessQuery } = await import('./types.js')
178
+ expect(preprocessQuery).toBeDefined()
179
+ expect(typeof preprocessQuery).toBe('function')
180
+ })
181
+
182
+ it('should export preprocessQuery from main embeddings module', async () => {
183
+ const { preprocessQuery } = await import('./index.js')
184
+ expect(preprocessQuery).toBeDefined()
185
+ expect(typeof preprocessQuery).toBe('function')
186
+ })
187
+ })
@@ -0,0 +1,508 @@
1
+ /**
2
+ * Semantic Search Threshold Tests
3
+ *
4
+ * Tests for threshold-related functionality including:
5
+ * - VectorStore searchWithStats() API
6
+ * - Below-threshold feedback mechanism
7
+ * - Default threshold configuration
8
+ * - Threshold boundary conditions
9
+ *
10
+ * Uses pre-built test corpus at:
11
+ * src/__tests__/fixtures/semantic-search/multi-word-corpus/
12
+ */
13
+
14
+ import * as path from 'node:path'
15
+ import { Effect } from 'effect'
16
+ import { describe, expect, it } from 'vitest'
17
+ import {
18
+ createNamespacedVectorStore,
19
+ createVectorStore,
20
+ type VectorSearchResultWithStats,
21
+ } from './vector-store.js'
22
+
23
+ // Path to test corpus with pre-built embeddings
24
+ const TEST_CORPUS_PATH = path.join(
25
+ __dirname,
26
+ '../__tests__/fixtures/semantic-search/multi-word-corpus',
27
+ )
28
+
29
+ // Test corpus uses 512 dimensions (text-embedding-3-small with Matryoshka reduction)
30
+ const TEST_CORPUS_DIMENSIONS = 512
31
+ const TEST_CORPUS_PROVIDER = 'openai'
32
+ const TEST_CORPUS_MODEL = 'text-embedding-3-small'
33
+
34
+ // Helper to create the namespaced vector store for test corpus
35
+ const createTestVectorStore = () =>
36
+ createNamespacedVectorStore(
37
+ TEST_CORPUS_PATH,
38
+ TEST_CORPUS_PROVIDER,
39
+ TEST_CORPUS_MODEL,
40
+ TEST_CORPUS_DIMENSIONS,
41
+ )
42
+
43
+ describe('Semantic Search Threshold', () => {
44
+ describe('VectorStore searchWithStats', () => {
45
+ it('should load test corpus with embeddings', async () => {
46
+ const vectorStore = createTestVectorStore()
47
+ const loadResult = await Effect.runPromise(vectorStore.load())
48
+ expect(loadResult.loaded).toBe(true)
49
+
50
+ const stats = vectorStore.getStats()
51
+ expect(stats.count).toBeGreaterThan(0)
52
+ expect(stats.dimensions).toBe(TEST_CORPUS_DIMENSIONS)
53
+ })
54
+
55
+ it('should return results with searchWithStats', async () => {
56
+ const vectorStore = createTestVectorStore()
57
+ await Effect.runPromise(vectorStore.load())
58
+
59
+ // Use a zero threshold to get all results
60
+ const result = await Effect.runPromise(
61
+ vectorStore.searchWithStats(
62
+ new Array(TEST_CORPUS_DIMENSIONS).fill(0.1),
63
+ 10,
64
+ 0,
65
+ ),
66
+ )
67
+
68
+ expect(result.results).toBeDefined()
69
+ expect(Array.isArray(result.results)).toBe(true)
70
+ expect(result.results.length).toBeGreaterThan(0)
71
+ })
72
+
73
+ it('should track below-threshold results count', async () => {
74
+ const vectorStore = createTestVectorStore()
75
+ await Effect.runPromise(vectorStore.load())
76
+
77
+ // Use a very high threshold to push all results below it
78
+ const result = await Effect.runPromise(
79
+ vectorStore.searchWithStats(
80
+ new Array(TEST_CORPUS_DIMENSIONS).fill(0.1),
81
+ 10,
82
+ 0.99,
83
+ ),
84
+ )
85
+
86
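+ // With 0.99 threshold, most/all results should be below threshold; note that the check below only requires a non-negative count, so it does not by itself prove any result fell below the threshold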
87
+ expect(result.belowThresholdCount).toBeGreaterThanOrEqual(0)
88
+ })
89
+
90
+ it('should track highest below-threshold similarity', async () => {
91
+ const vectorStore = createTestVectorStore()
92
+ await Effect.runPromise(vectorStore.load())
93
+
94
+ // Use high threshold to force below-threshold results
95
+ const result = await Effect.runPromise(
96
+ vectorStore.searchWithStats(
97
+ new Array(TEST_CORPUS_DIMENSIONS).fill(0.1),
98
+ 10,
99
+ 0.99,
100
+ ),
101
+ )
102
+
103
+ // When there are below-threshold results, highest should be tracked
104
+ if (result.belowThresholdCount > 0) {
105
+ expect(result.belowThresholdHighest).not.toBeNull()
106
+ expect(result.belowThresholdHighest).toBeLessThan(0.99)
107
+ expect(result.belowThresholdHighest).toBeGreaterThan(0)
108
+ }
109
+ })
110
+
111
+ it('should return empty results when no embeddings exist', async () => {
112
+ const vectorStore = createVectorStore('/nonexistent/path', 1536)
113
+ const loadResult = await Effect.runPromise(vectorStore.load())
114
+ expect(loadResult.loaded).toBe(false)
115
+
116
+ const result = await Effect.runPromise(
117
+ vectorStore.searchWithStats(
118
+ new Array(TEST_CORPUS_DIMENSIONS).fill(0),
119
+ 10,
120
+ 0,
121
+ ),
122
+ )
123
+
124
+ expect(result.results).toHaveLength(0)
125
+ expect(result.belowThresholdCount).toBe(0)
126
+ expect(result.belowThresholdHighest).toBeNull()
127
+ })
128
+ })
129
+
130
+ describe('Threshold boundaries', () => {
131
+ it('should return all results with threshold of 0', async () => {
132
+ const vectorStore = createTestVectorStore()
133
+ await Effect.runPromise(vectorStore.load())
134
+
135
+ // Use 0 threshold - everything should be above
136
+ const result = await Effect.runPromise(
137
+ vectorStore.searchWithStats(
138
+ new Array(TEST_CORPUS_DIMENSIONS).fill(0.1),
139
+ 10,
140
+ 0,
141
+ ),
142
+ )
143
+
144
+ expect(result.belowThresholdCount).toBe(0)
145
+ expect(result.results.length).toBeGreaterThan(0)
146
+ })
147
+
148
+ it('should return no results with threshold of 1', async () => {
149
+ const vectorStore = createTestVectorStore()
150
+ await Effect.runPromise(vectorStore.load())
151
+
152
+ const result = await Effect.runPromise(
153
+ vectorStore.searchWithStats(
154
+ new Array(TEST_CORPUS_DIMENSIONS).fill(0.1),
155
+ 10,
156
+ 1,
157
+ ),
158
+ )
159
+
160
+ // With threshold of 1, nothing should pass (similarity is never >= 1 in practice)
161
+ // Note: if a result has exactly similarity=1, it would pass
162
+ expect(result.results.length).toBeLessThanOrEqual(1)
163
+ })
164
+
165
+ it('should respect the limit parameter', async () => {
166
+ const vectorStore = createTestVectorStore()
167
+ await Effect.runPromise(vectorStore.load())
168
+
169
+ const stats = vectorStore.getStats()
170
+ const limit = 3
171
+
172
+ const result = await Effect.runPromise(
173
+ vectorStore.searchWithStats(
174
+ new Array(TEST_CORPUS_DIMENSIONS).fill(0.1),
175
+ limit,
176
+ 0,
177
+ ),
178
+ )
179
+
180
+ // Should not return more than limit
181
+ expect(result.results.length).toBeLessThanOrEqual(limit)
182
+ // Should return results if corpus has entries
183
+ if (stats.count > 0) {
184
+ expect(result.results.length).toBeGreaterThan(0)
185
+ }
186
+ })
187
+ })
188
+
189
+ describe('Default threshold value (0.35)', () => {
190
+ it('should use 0.35 as the default threshold in config schema', async () => {
191
+ const { defaultConfig } = await import('../config/schema.js')
192
+ expect(defaultConfig.search.minSimilarity).toBe(0.35)
193
+ })
194
+
195
+ it('should document 0.35 threshold in help text', async () => {
196
+ const { helpContent } = await import('../cli/help.js')
197
+ const searchHelp = helpContent.search
198
+ expect(searchHelp).toBeDefined()
199
+ expect(searchHelp!.notes).toBeDefined()
200
+
201
+ // Verify notes mention 0.35
202
+ const notesText = searchHelp!.notes?.join(' ') ?? ''
203
+ expect(notesText).toContain('0.35')
204
+ })
205
+
206
+ it('should mention threshold in search options', async () => {
207
+ const { helpContent } = await import('../cli/help.js')
208
+ const searchHelp = helpContent.search
209
+ expect(searchHelp).toBeDefined()
210
+
211
+ // Find threshold option
212
+ const thresholdOption = searchHelp!.options.find((opt) =>
213
+ opt.name.includes('--threshold'),
214
+ )
215
+ expect(thresholdOption).toBeDefined()
216
+ expect(thresholdOption?.description).toContain('0.35')
217
+ })
218
+ })
219
+
220
+ describe('VectorSearchResultWithStats type shape', () => {
221
+ it('should have correct structure', async () => {
222
+ const vectorStore = createTestVectorStore()
223
+ await Effect.runPromise(vectorStore.load())
224
+
225
+ const result = await Effect.runPromise(
226
+ vectorStore.searchWithStats(
227
+ new Array(TEST_CORPUS_DIMENSIONS).fill(0.1),
228
+ 10,
229
+ 0.35,
230
+ ),
231
+ )
232
+
233
+ // Compile-time check: the result must be assignable to VectorSearchResultWithStats
234
+ const typed: VectorSearchResultWithStats = result
235
+
236
+ expect('results' in typed).toBe(true)
237
+ expect('belowThresholdCount' in typed).toBe(true)
238
+ expect('belowThresholdHighest' in typed).toBe(true)
239
+
240
+ // Results array should have proper shape
241
+ for (const r of typed.results) {
242
+ expect(typeof r.id).toBe('string')
243
+ expect(typeof r.sectionId).toBe('string')
244
+ expect(typeof r.documentPath).toBe('string')
245
+ expect(typeof r.heading).toBe('string')
246
+ expect(typeof r.similarity).toBe('number')
247
+ }
248
+ })
249
+ })
250
+
251
+ describe('Test corpus validation', () => {
252
+ it('should have test corpus with multiple documents', async () => {
253
+ const vectorStore = createTestVectorStore()
254
+ const loadResult = await Effect.runPromise(vectorStore.load())
255
+
256
+ expect(loadResult.loaded).toBe(true)
257
+ const stats = vectorStore.getStats()
258
+ // Test corpus has 6 documents with multiple sections each
259
+ expect(stats.count).toBeGreaterThan(10)
260
+ })
261
+
262
+ it('should have correct dimensions (512 for test corpus)', async () => {
263
+ const vectorStore = createTestVectorStore()
264
+ await Effect.runPromise(vectorStore.load())
265
+
266
+ const stats = vectorStore.getStats()
267
+ expect(stats.dimensions).toBe(TEST_CORPUS_DIMENSIONS)
268
+ })
269
+ })
270
+
271
+ describe('Similarity score validation', () => {
272
+ it('should return similarity scores between 0 and 1', async () => {
273
+ const vectorStore = createTestVectorStore()
274
+ await Effect.runPromise(vectorStore.load())
275
+
276
+ const result = await Effect.runPromise(
277
+ vectorStore.searchWithStats(
278
+ new Array(TEST_CORPUS_DIMENSIONS).fill(0.1),
279
+ 20,
280
+ 0,
281
+ ),
282
+ )
283
+
284
+ for (const r of result.results) {
285
+ expect(r.similarity).toBeGreaterThanOrEqual(0)
286
+ expect(r.similarity).toBeLessThanOrEqual(1)
287
+ }
288
+ })
289
+
290
+ it('should return results sorted by similarity (highest first)', async () => {
291
+ const vectorStore = createTestVectorStore()
292
+ await Effect.runPromise(vectorStore.load())
293
+
294
+ const result = await Effect.runPromise(
295
+ vectorStore.searchWithStats(
296
+ new Array(TEST_CORPUS_DIMENSIONS).fill(0.1),
297
+ 20,
298
+ 0,
299
+ ),
300
+ )
301
+
302
+ // Verify descending order
303
+ for (let i = 1; i < result.results.length; i++) {
304
+ expect(result.results[i]!.similarity).toBeLessThanOrEqual(
305
+ result.results[i - 1]!.similarity,
306
+ )
307
+ }
308
+ })
309
+ })
310
+
311
+ describe('Below-threshold feedback', () => { // Exercises belowThresholdCount and belowThresholdHighest on the searchWithStats result
312
+ it('should provide count when results are below threshold', async () => {
313
+ const vectorStore = createTestVectorStore()
314
+ await Effect.runPromise(vectorStore.load())
315
+
316
+ // Use very high threshold to get 0 passing results
317
+ const result = await Effect.runPromise(
318
+ vectorStore.searchWithStats(
319
+ new Array(TEST_CORPUS_DIMENSIONS).fill(0.1),
320
+ 10,
321
+ 0.95,
322
+ ),
323
+ )
324
+ // NOTE(review): the assertions below are guarded, so the test passes without checking anything if any result is returned
325
+ // When 0 results pass, we should have below-threshold stats
326
+ if (result.results.length === 0) {
327
+ expect(result.belowThresholdCount).toBeGreaterThan(0)
328
+ expect(result.belowThresholdHighest).not.toBeNull()
329
+ }
330
+ })
331
+
332
+ it('should allow calculating suggested threshold', async () => {
333
+ const vectorStore = createTestVectorStore()
334
+ await Effect.runPromise(vectorStore.load())
335
+ // The third argument 0.9 is the value the suggested threshold is compared against further down
336
+ const result = await Effect.runPromise(
337
+ vectorStore.searchWithStats(
338
+ new Array(TEST_CORPUS_DIMENSIONS).fill(0.1),
339
+ 10,
340
+ 0.9,
341
+ ),
342
+ )
343
+ // NOTE(review): nothing is asserted unless results are empty and a highest below-threshold score exists
344
+ if (
345
+ result.results.length === 0 &&
346
+ result.belowThresholdHighest !== null
347
+ ) {
348
+ // Suggested threshold formula: max(0.1, highest - 0.05)
349
+ const suggestedThreshold = Math.max(
350
+ 0.1,
351
+ result.belowThresholdHighest - 0.05,
352
+ )
353
+ expect(suggestedThreshold).toBeLessThan(0.9) // must land below the threshold that produced no results
354
+ expect(suggestedThreshold).toBeGreaterThanOrEqual(0.1) // the Math.max floor above guarantees this bound
355
+ }
356
+ })
357
+ })
358
+ })
359
+
360
+ describe('Hybrid Search Threshold', () => { // Smoke test: the hybrid-search module exposes a callable hybridSearch
361
+ it('should export hybridSearch function', async () => {
362
+ const { hybridSearch } = await import('../search/hybrid-search.js') // dynamic import, resolved when the test runs
363
+ expect(hybridSearch).toBeDefined()
364
+ expect(typeof hybridSearch).toBe('function')
365
+ })
366
+ })
367
+
368
+ describe('Search Quality Modes', () => { // Covers the QUALITY_EF_SEARCH map, the efSearch search option, and the quality option field
369
+ describe('QUALITY_EF_SEARCH constants', () => {
370
+ it('should export quality mode constants', async () => {
371
+ const { QUALITY_EF_SEARCH } = await import('./types.js')
372
+ expect(QUALITY_EF_SEARCH).toBeDefined()
373
+ expect(QUALITY_EF_SEARCH.fast).toBe(64)
374
+ expect(QUALITY_EF_SEARCH.balanced).toBe(100)
375
+ expect(QUALITY_EF_SEARCH.thorough).toBe(256)
376
+ })
377
+ // The next two tests pin the relative ordering: fast < balanced < thorough
378
+ it('should have fast mode with lowest efSearch', async () => {
379
+ const { QUALITY_EF_SEARCH } = await import('./types.js')
380
+ expect(QUALITY_EF_SEARCH.fast).toBeLessThan(QUALITY_EF_SEARCH.balanced)
381
+ })
382
+
383
+ it('should have thorough mode with highest efSearch', async () => {
384
+ const { QUALITY_EF_SEARCH } = await import('./types.js')
385
+ expect(QUALITY_EF_SEARCH.thorough).toBeGreaterThan(
386
+ QUALITY_EF_SEARCH.balanced,
387
+ )
388
+ })
389
+ })
390
+
391
+ describe('VectorStore efSearch support', () => { // search and searchWithStats called with and without an options object
392
+ it('should accept efSearch option in search method', async () => {
393
+ const vectorStore = createTestVectorStore()
394
+ await Effect.runPromise(vectorStore.load())
395
+
396
+ // Should not throw when passing efSearch
397
+ const result = await Effect.runPromise(
398
+ vectorStore.search(new Array(TEST_CORPUS_DIMENSIONS).fill(0.1), 10, 0, {
399
+ efSearch: 64,
400
+ }),
401
+ )
402
+ // Only the shape of the return value is checked, not its contents
403
+ expect(Array.isArray(result)).toBe(true)
404
+ })
405
+
406
+ it('should accept efSearch option in searchWithStats method', async () => {
407
+ const vectorStore = createTestVectorStore()
408
+ await Effect.runPromise(vectorStore.load())
409
+
410
+ // Should not throw when passing efSearch
411
+ const result = await Effect.runPromise(
412
+ vectorStore.searchWithStats(
413
+ new Array(TEST_CORPUS_DIMENSIONS).fill(0.1),
414
+ 10,
415
+ 0,
416
+ { efSearch: 256 },
417
+ ),
418
+ )
419
+ // searchWithStats wraps the hits in an object, so the array lives under result.results
420
+ expect(result.results).toBeDefined()
421
+ expect(Array.isArray(result.results)).toBe(true)
422
+ })
423
+
424
+ it('should work without efSearch option (defaults)', async () => {
425
+ const vectorStore = createTestVectorStore()
426
+ await Effect.runPromise(vectorStore.load())
427
+
428
+ // Should not throw without efSearch option
429
+ const result = await Effect.runPromise(
430
+ vectorStore.search(new Array(TEST_CORPUS_DIMENSIONS).fill(0.1), 10, 0),
431
+ )
432
+
433
+ expect(Array.isArray(result)).toBe(true)
434
+ })
435
+
436
+ it('should return consistent results for same query with different efSearch', async () => {
437
+ const vectorStore = createTestVectorStore()
438
+ await Effect.runPromise(vectorStore.load())
439
+ // One query vector shared by both searches so only efSearch differs between the calls
440
+ const queryVector = new Array(TEST_CORPUS_DIMENSIONS).fill(0.1)
441
+
442
+ const fastResult = await Effect.runPromise(
443
+ vectorStore.search(queryVector, 5, 0, { efSearch: 64 }),
444
+ )
445
+
446
+ const thoroughResult = await Effect.runPromise(
447
+ vectorStore.search(queryVector, 5, 0, { efSearch: 256 }),
448
+ )
449
+ // NOTE(review): despite the title, the two result lists are never compared with each other
450
+ // Both should return results
451
+ expect(fastResult.length).toBeGreaterThan(0)
452
+ expect(thoroughResult.length).toBeGreaterThan(0)
453
+
454
+ // Top result should likely be the same (though not guaranteed with HNSW)
455
+ // At minimum, both should return valid results
456
+ expect(fastResult[0]?.sectionId).toBeDefined()
457
+ expect(thoroughResult[0]?.sectionId).toBeDefined()
458
+ })
459
+ })
460
+
461
+ describe('SemanticSearchOptions quality field', () => { // Mostly compile-time checks; the runtime expects only echo the literals
462
+ it('should accept quality in SemanticSearchOptions type', async () => { // NOTE(review): no await in the body, async looks unnecessary
463
+ // Type check - if this compiles, the type has the quality field
464
+ const options: import('./types.js').SemanticSearchOptions = {
465
+ limit: 10,
466
+ threshold: 0.35,
467
+ quality: 'balanced',
468
+ }
469
+ expect(options.quality).toBe('balanced')
470
+ })
471
+
472
+ it('should accept all three quality modes', async () => { // NOTE(review): no await in the body, async looks unnecessary
473
+ const fastOptions: import('./types.js').SemanticSearchOptions = {
474
+ quality: 'fast',
475
+ }
476
+ const balancedOptions: import('./types.js').SemanticSearchOptions = {
477
+ quality: 'balanced',
478
+ }
479
+ const thoroughOptions: import('./types.js').SemanticSearchOptions = {
480
+ quality: 'thorough',
481
+ }
482
+ // Each literal is read back unchanged; the annotations above carry the actual type check
483
+ expect(fastOptions.quality).toBe('fast')
484
+ expect(balancedOptions.quality).toBe('balanced')
485
+ expect(thoroughOptions.quality).toBe('thorough')
486
+ })
487
+ })
488
+
489
+ describe('HybridSearchOptions quality field', () => {
490
+ it('should accept quality in HybridSearchOptions type', async () => { // NOTE(review): no await in the body, async looks unnecessary
491
+ type HybridSearchOptions =
492
+ import('../search/hybrid-search.js').HybridSearchOptions
493
+ // Local alias of the imported type; the annotated literal below must type-check against it
494
+ const options: HybridSearchOptions = {
495
+ limit: 10,
496
+ quality: 'thorough',
497
+ }
498
+ expect(options.quality).toBe('thorough')
499
+ })
500
+ })
501
+ })
502
+
503
+ describe('CLI Search Threshold', () => { // Pins the default search.minSimilarity value exported by the config schema module
504
+ it('should have 0.35 as config default threshold', async () => {
505
+ const { defaultConfig } = await import('../config/schema.js') // dynamic import, resolved when the test runs
506
+ expect(defaultConfig.search.minSimilarity).toBe(0.35)
507
+ })
508
+ })