mdcontext 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.changeset/config.json +9 -9
- package/.claude/settings.local.json +25 -0
- package/.github/workflows/claude-code-review.yml +44 -0
- package/.github/workflows/claude.yml +85 -0
- package/CONTRIBUTING.md +186 -0
- package/NOTES/NOTES +44 -0
- package/README.md +206 -3
- package/biome.json +1 -1
- package/dist/chunk-23UPXDNL.js +3044 -0
- package/dist/chunk-2W7MO2DL.js +1366 -0
- package/dist/chunk-3NUAZGMA.js +1689 -0
- package/dist/chunk-7TOWB2XB.js +366 -0
- package/dist/chunk-7XOTOADQ.js +3065 -0
- package/dist/chunk-AH2PDM2K.js +3042 -0
- package/dist/chunk-BNXWSZ63.js +3742 -0
- package/dist/chunk-BTL5DJVU.js +3222 -0
- package/dist/chunk-HDHYG7E4.js +104 -0
- package/dist/chunk-HLR4KZBP.js +3234 -0
- package/dist/chunk-IP3FRFEB.js +1045 -0
- package/dist/chunk-KHU56VDO.js +3042 -0
- package/dist/chunk-KRYIFLQR.js +85 -89
- package/dist/chunk-LBSDNLEM.js +287 -0
- package/dist/chunk-MNTQ7HCP.js +2643 -0
- package/dist/chunk-MUJELQQ6.js +1387 -0
- package/dist/chunk-MXJGMSLV.js +2199 -0
- package/dist/chunk-N6QJGC3Z.js +2636 -0
- package/dist/chunk-OBELGBPM.js +1713 -0
- package/dist/chunk-OT7R5XTA.js +3192 -0
- package/dist/chunk-P7X4RA2T.js +106 -0
- package/dist/chunk-PIDUQNC2.js +3185 -0
- package/dist/chunk-POGCDIH4.js +3187 -0
- package/dist/chunk-PSIEOQGZ.js +3043 -0
- package/dist/chunk-PVRT3IHA.js +3238 -0
- package/dist/chunk-QNN4TT23.js +1430 -0
- package/dist/chunk-RE3R45RJ.js +3042 -0
- package/dist/chunk-S7E6TFX6.js +718 -657
- package/dist/chunk-SG6GLU4U.js +1378 -0
- package/dist/chunk-SJCDV2ST.js +274 -0
- package/dist/chunk-SYE5XLF3.js +104 -0
- package/dist/chunk-T5VLYBZD.js +103 -0
- package/dist/chunk-TOQB7VWU.js +3238 -0
- package/dist/chunk-VFNMZ4ZQ.js +3228 -0
- package/dist/chunk-VVTGZNBT.js +1533 -1423
- package/dist/chunk-W7Q4RFEV.js +104 -0
- package/dist/chunk-XTYYVRLO.js +3190 -0
- package/dist/chunk-Y6MDYVJD.js +3063 -0
- package/dist/cli/main.js +4072 -629
- package/dist/index.d.ts +420 -33
- package/dist/index.js +8 -15
- package/dist/mcp/server.js +103 -7
- package/dist/schema-BAWSG7KY.js +22 -0
- package/dist/schema-E3QUPL26.js +20 -0
- package/dist/schema-EHL7WUT6.js +20 -0
- package/docs/019-USAGE.md +44 -5
- package/docs/020-current-implementation.md +8 -8
- package/docs/021-DOGFOODING-FINDINGS.md +1 -1
- package/docs/CONFIG.md +1123 -0
- package/docs/ERRORS.md +383 -0
- package/docs/summarization.md +320 -0
- package/justfile +40 -0
- package/package.json +39 -33
- package/research/INDEX.md +315 -0
- package/research/code-review/README.md +90 -0
- package/research/code-review/cli-error-handling-review.md +979 -0
- package/research/code-review/code-review-validation-report.md +464 -0
- package/research/code-review/main-ts-review.md +1128 -0
- package/research/config-docs/SUMMARY.md +357 -0
- package/research/config-docs/TEST-RESULTS.md +776 -0
- package/research/config-docs/TODO.md +542 -0
- package/research/config-docs/analysis.md +744 -0
- package/research/config-docs/fix-validation.md +502 -0
- package/research/config-docs/help-audit.md +264 -0
- package/research/config-docs/help-system-analysis.md +890 -0
- package/research/frontmatter/COMMENTS-ARE-SKIPPED.md +149 -0
- package/research/frontmatter/LLM-CODE-NAVIGATION.md +276 -0
- package/research/issue-review.md +603 -0
- package/research/llm-summarization/agent-cli-tools-2026.md +1082 -0
- package/research/llm-summarization/alternative-providers-2026.md +1428 -0
- package/research/llm-summarization/anthropic-2026.md +367 -0
- package/research/llm-summarization/claude-cli-integration.md +1706 -0
- package/research/llm-summarization/cli-integration-patterns.md +3155 -0
- package/research/llm-summarization/openai-2026.md +473 -0
- package/research/llm-summarization/openai-compatible-providers-2026.md +1022 -0
- package/research/llm-summarization/opencode-cli-integration.md +1552 -0
- package/research/llm-summarization/prompt-engineering-2026.md +1426 -0
- package/research/llm-summarization/prototype-results.md +56 -0
- package/research/llm-summarization/provider-switching-patterns-2026.md +2153 -0
- package/research/llm-summarization/typescript-llm-libraries-2026.md +2436 -0
- package/research/mdcontext-pudding/00-EXECUTIVE-SUMMARY.md +282 -0
- package/research/mdcontext-pudding/01-index-embed.md +956 -0
- package/research/mdcontext-pudding/02-search-COMMANDS.md +142 -0
- package/research/mdcontext-pudding/02-search-SUMMARY.md +146 -0
- package/research/mdcontext-pudding/02-search.md +970 -0
- package/research/mdcontext-pudding/03-context.md +779 -0
- package/research/mdcontext-pudding/04-navigation-and-analytics.md +803 -0
- package/research/mdcontext-pudding/04-tree.md +704 -0
- package/research/mdcontext-pudding/05-config.md +1038 -0
- package/research/mdcontext-pudding/06-links-summary.txt +87 -0
- package/research/mdcontext-pudding/06-links.md +679 -0
- package/research/mdcontext-pudding/07-stats.md +693 -0
- package/research/mdcontext-pudding/BUG-FIX-PLAN.md +388 -0
- package/research/mdcontext-pudding/P0-BUG-VALIDATION.md +167 -0
- package/research/mdcontext-pudding/README.md +168 -0
- package/research/mdcontext-pudding/TESTING-SUMMARY.md +128 -0
- package/research/research-quality-review.md +834 -0
- package/research/semantic-search/embedding-text-analysis.md +156 -0
- package/research/semantic-search/multi-word-failure-reproduction.md +171 -0
- package/research/semantic-search/query-processing-analysis.md +207 -0
- package/research/semantic-search/root-cause-and-solution.md +114 -0
- package/research/semantic-search/threshold-validation-report.md +69 -0
- package/research/semantic-search/vector-search-analysis.md +63 -0
- package/research/test-path-issues.md +276 -0
- package/review/ALP-76/1-error-type-design.md +962 -0
- package/review/ALP-76/2-error-handling-patterns.md +906 -0
- package/review/ALP-76/3-error-presentation.md +624 -0
- package/review/ALP-76/4-test-coverage.md +625 -0
- package/review/ALP-76/5-migration-completeness.md +440 -0
- package/review/ALP-76/6-effect-best-practices.md +755 -0
- package/scripts/apply-branch-protection.sh +47 -0
- package/scripts/branch-protection-templates.json +79 -0
- package/scripts/prototype-summarization.ts +346 -0
- package/scripts/rebuild-hnswlib.js +32 -37
- package/scripts/setup-branch-protection.sh +64 -0
- package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/active-provider.json +7 -0
- package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/bm25.json +541 -0
- package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/bm25.meta.json +5 -0
- package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/config.json +8 -0
- package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/embeddings/openai_text-embedding-3-small_512/vectors.bin +0 -0
- package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/embeddings/openai_text-embedding-3-small_512/vectors.meta.bin +0 -0
- package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/indexes/documents.json +60 -0
- package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/indexes/links.json +13 -0
- package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/indexes/sections.json +1197 -0
- package/src/__tests__/fixtures/semantic-search/multi-word-corpus/configuration-management.md +99 -0
- package/src/__tests__/fixtures/semantic-search/multi-word-corpus/distributed-systems.md +92 -0
- package/src/__tests__/fixtures/semantic-search/multi-word-corpus/error-handling.md +78 -0
- package/src/__tests__/fixtures/semantic-search/multi-word-corpus/failure-automation.md +55 -0
- package/src/__tests__/fixtures/semantic-search/multi-word-corpus/job-context.md +69 -0
- package/src/__tests__/fixtures/semantic-search/multi-word-corpus/process-orchestration.md +99 -0
- package/src/cli/argv-preprocessor.test.ts +2 -2
- package/src/cli/cli.test.ts +230 -33
- package/src/cli/commands/config-cmd.ts +642 -0
- package/src/cli/commands/context.ts +97 -9
- package/src/cli/commands/duplicates.ts +122 -0
- package/src/cli/commands/embeddings.ts +529 -0
- package/src/cli/commands/index-cmd.ts +210 -30
- package/src/cli/commands/index.ts +3 -0
- package/src/cli/commands/search.ts +894 -64
- package/src/cli/commands/stats.ts +3 -0
- package/src/cli/commands/tree.ts +26 -5
- package/src/cli/config-layer.ts +176 -0
- package/src/cli/error-handler.test.ts +235 -0
- package/src/cli/error-handler.ts +655 -0
- package/src/cli/flag-schemas.ts +66 -0
- package/src/cli/help.ts +209 -7
- package/src/cli/main.ts +348 -58
- package/src/cli/options.ts +10 -0
- package/src/cli/shared-error-handling.ts +199 -0
- package/src/cli/utils.ts +150 -17
- package/src/config/file-provider.test.ts +320 -0
- package/src/config/file-provider.ts +273 -0
- package/src/config/index.ts +72 -0
- package/src/config/integration.test.ts +667 -0
- package/src/config/precedence.test.ts +277 -0
- package/src/config/precedence.ts +451 -0
- package/src/config/schema.test.ts +414 -0
- package/src/config/schema.ts +603 -0
- package/src/config/service.test.ts +320 -0
- package/src/config/service.ts +243 -0
- package/src/config/testing.test.ts +264 -0
- package/src/config/testing.ts +110 -0
- package/src/core/types.ts +6 -33
- package/src/duplicates/detector.test.ts +183 -0
- package/src/duplicates/detector.ts +414 -0
- package/src/duplicates/index.ts +18 -0
- package/src/embeddings/embedding-namespace.test.ts +300 -0
- package/src/embeddings/embedding-namespace.ts +947 -0
- package/src/embeddings/heading-boost.test.ts +222 -0
- package/src/embeddings/hnsw-build-options.test.ts +198 -0
- package/src/embeddings/hyde.test.ts +272 -0
- package/src/embeddings/hyde.ts +264 -0
- package/src/embeddings/index.ts +2 -0
- package/src/embeddings/openai-provider.ts +332 -83
- package/src/embeddings/pricing.json +22 -0
- package/src/embeddings/provider-constants.ts +204 -0
- package/src/embeddings/provider-errors.test.ts +967 -0
- package/src/embeddings/provider-errors.ts +565 -0
- package/src/embeddings/provider-factory.test.ts +240 -0
- package/src/embeddings/provider-factory.ts +225 -0
- package/src/embeddings/provider-integration.test.ts +788 -0
- package/src/embeddings/query-preprocessing.test.ts +187 -0
- package/src/embeddings/semantic-search-threshold.test.ts +508 -0
- package/src/embeddings/semantic-search.ts +780 -93
- package/src/embeddings/types.ts +293 -16
- package/src/embeddings/vector-store.ts +486 -77
- package/src/embeddings/voyage-provider.ts +313 -0
- package/src/errors/errors.test.ts +845 -0
- package/src/errors/index.ts +533 -0
- package/src/index/ignore-patterns.test.ts +354 -0
- package/src/index/ignore-patterns.ts +305 -0
- package/src/index/indexer.ts +286 -48
- package/src/index/storage.ts +94 -30
- package/src/index/types.ts +40 -2
- package/src/index/watcher.ts +67 -9
- package/src/index.ts +22 -0
- package/src/integration/search-keyword.test.ts +678 -0
- package/src/mcp/server.ts +135 -6
- package/src/parser/parser.ts +18 -19
- package/src/parser/section-filter.test.ts +277 -0
- package/src/parser/section-filter.ts +125 -3
- package/src/search/__tests__/hybrid-search.test.ts +650 -0
- package/src/search/bm25-store.ts +366 -0
- package/src/search/cross-encoder.test.ts +253 -0
- package/src/search/cross-encoder.ts +406 -0
- package/src/search/fuzzy-search.test.ts +419 -0
- package/src/search/fuzzy-search.ts +273 -0
- package/src/search/hybrid-search.ts +448 -0
- package/src/search/path-matcher.test.ts +276 -0
- package/src/search/path-matcher.ts +33 -0
- package/src/search/searcher.test.ts +99 -1
- package/src/search/searcher.ts +189 -67
- package/src/search/wink-bm25.d.ts +30 -0
- package/src/summarization/cli-providers/claude.ts +202 -0
- package/src/summarization/cli-providers/detection.test.ts +273 -0
- package/src/summarization/cli-providers/detection.ts +118 -0
- package/src/summarization/cli-providers/index.ts +8 -0
- package/src/summarization/cost.test.ts +139 -0
- package/src/summarization/cost.ts +102 -0
- package/src/summarization/error-handler.test.ts +127 -0
- package/src/summarization/error-handler.ts +111 -0
- package/src/summarization/index.ts +102 -0
- package/src/summarization/pipeline.test.ts +498 -0
- package/src/summarization/pipeline.ts +231 -0
- package/src/summarization/prompts.test.ts +269 -0
- package/src/summarization/prompts.ts +133 -0
- package/src/summarization/provider-factory.test.ts +396 -0
- package/src/summarization/provider-factory.ts +178 -0
- package/src/summarization/types.ts +184 -0
- package/src/summarize/summarizer.ts +104 -35
- package/src/types/huggingface-transformers.d.ts +66 -0
- package/tests/fixtures/cli/.mdcontext/active-provider.json +7 -0
- package/tests/fixtures/cli/.mdcontext/embeddings/openai_text-embedding-3-small_512/vectors.bin +0 -0
- package/tests/fixtures/cli/.mdcontext/embeddings/openai_text-embedding-3-small_512/vectors.meta.bin +0 -0
- package/tests/fixtures/cli/.mdcontext/indexes/documents.json +4 -4
- package/tests/fixtures/cli/.mdcontext/indexes/sections.json +14 -0
- package/tests/integration/embed-index.test.ts +712 -0
- package/tests/integration/search-context.test.ts +469 -0
- package/tests/integration/search-semantic.test.ts +522 -0
- package/vitest.config.ts +1 -6
- package/AGENTS.md +0 -46
- package/tests/fixtures/cli/.mdcontext/vectors.bin +0 -0
- package/tests/fixtures/cli/.mdcontext/vectors.meta.json +0 -1264
|
@@ -0,0 +1,448 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Hybrid Search with Reciprocal Rank Fusion (RRF)
|
|
3
|
+
*
|
|
4
|
+
* Combines BM25 keyword search with semantic vector search for improved
|
|
5
|
+
* recall (15-30% improvement over single-method retrieval).
|
|
6
|
+
*
|
|
7
|
+
* RRF Formula: score(doc) = Σ weight / (k + rank)
|
|
8
|
+
* k = 60 (standard smoothing constant from research)
|
|
9
|
+
*/
|
|
10
|
+
|
|
11
|
+
import * as path from 'node:path'
|
|
12
|
+
import { Effect } from 'effect'
|
|
13
|
+
import { listNamespaces } from '../embeddings/embedding-namespace.js'
|
|
14
|
+
import { semanticSearch } from '../embeddings/semantic-search.js'
|
|
15
|
+
import type {
|
|
16
|
+
SearchQuality,
|
|
17
|
+
SemanticSearchResult,
|
|
18
|
+
} from '../embeddings/types.js'
|
|
19
|
+
import type {
|
|
20
|
+
ApiKeyInvalidError,
|
|
21
|
+
ApiKeyMissingError,
|
|
22
|
+
EmbeddingError,
|
|
23
|
+
FileReadError,
|
|
24
|
+
VectorStoreError,
|
|
25
|
+
} from '../errors/index.js'
|
|
26
|
+
import {
|
|
27
|
+
type BM25SearchResult,
|
|
28
|
+
bm25IndexExists,
|
|
29
|
+
bm25Search,
|
|
30
|
+
} from './bm25-store.js'
|
|
31
|
+
import {
|
|
32
|
+
isRerankerAvailable,
|
|
33
|
+
type RerankerError,
|
|
34
|
+
rerankResults,
|
|
35
|
+
} from './cross-encoder.js'
|
|
36
|
+
import { matchPath } from './path-matcher.js'
|
|
37
|
+
|
|
38
|
+
// ============================================================================
|
|
39
|
+
// Types
|
|
40
|
+
// ============================================================================
|
|
41
|
+
|
|
42
|
+
/**
 * Retrieval strategy for a search: 'hybrid' fuses the semantic and keyword
 * rankings, 'semantic' uses only the embedding-based ranking, and 'keyword'
 * uses only the BM25 ranking.
 */
export type SearchMode = 'hybrid' | 'semantic' | 'keyword'
|
|
43
|
+
|
|
44
|
+
/**
 * Options accepted by `hybridSearch`. Every field is optional; the defaults
 * listed below are the ones `hybridSearch` applies when a field is omitted.
 */
export interface HybridSearchOptions {
  /** Maximum number of results returned (default: 10) */
  readonly limit?: number
  /**
   * Minimum similarity threshold for semantic search (0-1, default: 0.35).
   * Only forwarded to the semantic search; keyword results are not thresholded.
   */
  readonly threshold?: number
  /**
   * Filter by document path pattern. Forwarded to the semantic search and
   * applied to keyword results with `matchPath`.
   */
  readonly pathPattern?: string
  /** Force a specific search mode instead of choosing one from the available indexes */
  readonly mode?: SearchMode
  /** BM25 weight for RRF (default: 1.0) */
  readonly bm25Weight?: number
  /** Semantic weight for RRF (default: 1.0) */
  readonly semanticWeight?: number
  /** RRF k constant (default: 60) */
  readonly rrfK?: number
  /** Enable cross-encoder re-ranking for improved precision */
  readonly rerank?: boolean
  /** Search quality mode: fast, balanced (default), or thorough; forwarded to the semantic search */
  readonly quality?: SearchQuality | undefined
  /** Lines of context before matches; forwarded to the semantic search */
  readonly contextBefore?: number | undefined
  /** Lines of context after matches; forwarded to the semantic search */
  readonly contextAfter?: number | undefined
}
|
|
68
|
+
|
|
69
|
+
/**
 * One ranked section returned by `hybridSearch`. Optional fields are present
 * only when the corresponding retrieval or re-ranking step produced them.
 */
export interface HybridSearchResult {
  /** Identifier of the matched section; fusion merges rankings on this key */
  readonly sectionId: string
  /** Path of the document containing the section */
  readonly documentPath: string
  /** Heading of the matched section */
  readonly heading: string
  /** Combined RRF score (higher is better) */
  readonly score: number
  /** Semantic similarity if available (0-1) */
  readonly similarity?: number
  /** BM25 score if available */
  readonly bm25Score?: number
  /** Which search methods contributed to this result */
  readonly sources: readonly ('semantic' | 'keyword')[]
  /** Cross-encoder re-ranking score (if reranking was enabled) */
  readonly rerankerScore?: number
  /**
   * Context lines with their line numbers (when context is requested).
   * During fusion these are taken from the semantic result for the section.
   */
  readonly contextLines?: readonly ContextLine[] | undefined
}
|
|
86
|
+
|
|
87
|
+
/** A single line of source text attached to a search result as context. */
export interface ContextLine {
  /** The line number (1-based) */
  readonly lineNumber: number
  /** The line text */
  readonly line: string
  /** Whether this is a matching line (for keyword search) */
  readonly isMatch: boolean
}
|
|
95
|
+
|
|
96
|
+
/** Diagnostics describing how a `hybridSearch` call was executed. */
export interface HybridSearchStats {
  /** The mode that was actually used to build the result list */
  readonly mode: SearchMode
  /** Human-readable explanation of why `mode` was chosen (e.g. 'both indexes available') */
  readonly modeReason: string
  /** Number of semantic results retrieved (before fusion and before `limit`) */
  readonly semanticResults: number
  /** Number of keyword results retrieved (after path filtering, before fusion and `limit`) */
  readonly keywordResults: number
  /** Number of results actually returned */
  readonly combinedResults: number
  /** Whether a BM25 index was found */
  readonly bm25Available: boolean
  /** Whether the semantic search attempt succeeded */
  readonly embeddingsAvailable: boolean
  /** Whether re-ranking was applied */
  readonly reranked?: boolean
  /** Total unique results available before limit was applied */
  readonly totalAvailable?: number
}
|
|
109
|
+
|
|
110
|
+
// ============================================================================
|
|
111
|
+
// RRF Fusion
|
|
112
|
+
// ============================================================================
|
|
113
|
+
|
|
114
|
+
/**
 * Reciprocal Rank Fusion (RRF) combines rankings from multiple retrieval methods.
 *
 * For each section, RRF score = Σ weight / (k + rank)
 * where k is a smoothing constant (60 by default from research).
 *
 * This approach:
 * - Doesn't require score normalization between methods
 * - Gives higher weight to sections ranked highly by both methods
 * - Naturally handles missing results from either method
 *
 * Semantic ranks are derived from array position (first element is rank 1);
 * keyword ranks are taken from each result's own `rank` field.
 *
 * If the same `sectionId` occurs more than once within one ranking, every
 * occurrence still contributes to the RRF score, but the reported
 * `similarity` / `bm25Score` is the one from the first (best-ranked)
 * occurrence rather than being overwritten by a later, lower-ranked one.
 *
 * @param semanticResults - Semantic hits, ordered best first
 * @param keywordResults - BM25 hits carrying their own `rank` and `score`
 * @param options - Fusion weights, the RRF `k` constant and the result limit
 * @returns The top `limit` fused results (sorted by descending score) and the
 *          number of unique sections seen before the limit was applied
 */
const fusionRRF = (
  semanticResults: readonly SemanticSearchResult[],
  keywordResults: readonly BM25SearchResult[],
  options: {
    bm25Weight: number
    semanticWeight: number
    rrfK: number
    limit: number
  },
): { results: HybridSearchResult[]; totalAvailable: number } => {
  const { bm25Weight, semanticWeight, rrfK, limit } = options

  // Mutable per-section accumulator, declared once so both loops share it.
  type FusedEntry = {
    documentPath: string
    heading: string
    rrfScore: number
    similarity?: number
    bm25Score?: number
    sources: Set<'semantic' | 'keyword'>
    contextLines?: readonly ContextLine[]
  }

  // Accumulated RRF state keyed by sectionId
  const scoreMap = new Map<string, FusedEntry>()

  // Add semantic results (array index is 0-based, RRF rank is 1-based)
  for (let index = 0; index < semanticResults.length; index++) {
    const result = semanticResults[index]
    if (!result) continue

    const rrfContribution = semanticWeight / (rrfK + index + 1)

    const existing = scoreMap.get(result.sectionId)
    if (existing) {
      existing.rrfScore += rrfContribution
      // Keep the similarity of the best-ranked occurrence.
      if (existing.similarity === undefined) {
        existing.similarity = result.similarity
      }
      existing.sources.add('semantic')
      if (result.contextLines && !existing.contextLines) {
        existing.contextLines = result.contextLines
      }
      continue
    }

    const entry: FusedEntry = {
      documentPath: result.documentPath,
      heading: result.heading,
      rrfScore: rrfContribution,
      similarity: result.similarity,
      sources: new Set<'semantic' | 'keyword'>(['semantic']),
    }
    if (result.contextLines) {
      entry.contextLines = result.contextLines
    }
    scoreMap.set(result.sectionId, entry)
  }

  // Add keyword (BM25) results
  for (const result of keywordResults) {
    const rrfContribution = bm25Weight / (rrfK + result.rank)

    const existing = scoreMap.get(result.sectionId)
    if (existing) {
      existing.rrfScore += rrfContribution
      // Keep the BM25 score of the best-ranked occurrence.
      if (existing.bm25Score === undefined) {
        existing.bm25Score = result.score
      }
      existing.sources.add('keyword')
      continue
    }

    scoreMap.set(result.sectionId, {
      documentPath: result.documentPath,
      heading: result.heading,
      rrfScore: rrfContribution,
      bm25Score: result.score,
      sources: new Set<'semantic' | 'keyword'>(['keyword']),
    })
  }

  // Convert to array and sort by RRF score. Optional fields are added with
  // conditional spreads so absent values stay absent (no `undefined` keys)
  // and the readonly result type never has to be cast away.
  const allResults = Array.from(scoreMap.entries())
    .map(([sectionId, data]): HybridSearchResult => ({
      sectionId,
      documentPath: data.documentPath,
      heading: data.heading,
      score: data.rrfScore,
      sources: Array.from(data.sources),
      ...(data.similarity !== undefined ? { similarity: data.similarity } : {}),
      ...(data.bm25Score !== undefined ? { bm25Score: data.bm25Score } : {}),
      ...(data.contextLines !== undefined
        ? { contextLines: data.contextLines }
        : {}),
    }))
    .sort((a, b) => b.score - a.score)

  return {
    results: allResults.slice(0, limit),
    totalAvailable: allResults.length,
  }
}
|
|
241
|
+
|
|
242
|
+
// ============================================================================
|
|
243
|
+
// Hybrid Search
|
|
244
|
+
// ============================================================================
|
|
245
|
+
|
|
246
|
+
/**
 * Perform hybrid search combining semantic and keyword (BM25) search.
 *
 * Mode selection priority:
 * 1. Explicit `options.mode`
 * 2. 'hybrid' if both indexes are available
 * 3. 'semantic' if only embeddings are available
 * 4. 'keyword' if only BM25 is available
 * 5. 'keyword' with an empty result list if neither is available
 *    (`stats.modeReason` is then 'no indexes available')
 *
 * Semantic search is attempted unless the mode is explicitly 'keyword'. Any
 * failure of that attempt is captured and treated as "embeddings unavailable"
 * instead of being propagated.
 * NOTE(review): this also applies when `mode` is explicitly 'semantic' or
 * 'hybrid', so e.g. a semantic-search failure yields an empty/keyword-only
 * result rather than an error - confirm this is intended for explicit modes.
 *
 * @param rootPath - Root directory containing indexes
 * @param query - Search query text
 * @param options - Search options
 * @returns Ranked list of results with combined scores, plus execution stats
 */
export const hybridSearch = (
  rootPath: string,
  query: string,
  options: HybridSearchOptions = {},
): Effect.Effect<
  { results: readonly HybridSearchResult[]; stats: HybridSearchStats },
  | FileReadError
  | ApiKeyMissingError
  | ApiKeyInvalidError
  | EmbeddingError
  | VectorStoreError
  | RerankerError
> =>
  Effect.gen(function* () {
    const resolvedRoot = path.resolve(rootPath)
    const limit = options.limit ?? 10
    const threshold = options.threshold ?? 0.35
    const bm25Weight = options.bm25Weight ?? 1.0
    const semanticWeight = options.semanticWeight ?? 1.0
    const rrfK = options.rrfK ?? 60
    // Captured once so the keyword filter below can narrow it without `!`.
    const pathPattern = options.pathPattern

    // Check index availability
    const hasBM25 = yield* bm25IndexExists(resolvedRoot)

    // Embedding availability is detected by actually running the semantic
    // search: success means embeddings exist, any failure means they don't.
    let hasEmbeddings = false
    let semanticResults: readonly SemanticSearchResult[] = []

    if (options.mode !== 'keyword') {
      const semanticEffect = semanticSearch(resolvedRoot, query, {
        limit: limit * 2, // Get more for better fusion
        threshold,
        pathPattern,
        quality: options.quality,
        contextBefore: options.contextBefore,
        contextAfter: options.contextAfter,
      })

      const semanticTry = yield* Effect.either(semanticEffect)
      if (semanticTry._tag === 'Right') {
        hasEmbeddings = true
        semanticResults = semanticTry.right
      }
    }

    // Get BM25 results if available
    let keywordResults: readonly BM25SearchResult[] = []
    if (hasBM25 && options.mode !== 'semantic') {
      const rawResults = yield* bm25Search(resolvedRoot, query, limit * 2)
      // Apply path pattern filter if specified. Each result keeps the `rank`
      // it was given by bm25Search; ranks are not renumbered after filtering.
      keywordResults = pathPattern
        ? rawResults.filter((r) => matchPath(r.documentPath, pathPattern))
        : rawResults
    }

    // Determine effective mode and reason
    let effectiveMode: SearchMode
    let modeReason: string

    if (options.mode) {
      effectiveMode = options.mode
      modeReason = `--mode ${options.mode}`
    } else if (hasEmbeddings && hasBM25) {
      effectiveMode = 'hybrid'
      modeReason = 'both indexes available'
    } else if (hasEmbeddings) {
      effectiveMode = 'semantic'
      modeReason = 'embeddings available, no BM25 index'
    } else if (hasBM25) {
      effectiveMode = 'keyword'
      modeReason = 'BM25 available, no embeddings'
    } else {
      // Nothing to search: falls through to the keyword branch below with an
      // empty keywordResults list, producing an empty result set.
      effectiveMode = 'keyword'
      modeReason = 'no indexes available'
    }

    // Perform fusion based on mode
    let results: HybridSearchResult[]
    let totalAvailable: number | undefined

    if (effectiveMode === 'hybrid') {
      const fusionResult = fusionRRF(semanticResults, keywordResults, {
        bm25Weight,
        semanticWeight,
        rrfK,
        limit,
      })
      results = fusionResult.results
      totalAvailable = fusionResult.totalAvailable
    } else if (effectiveMode === 'semantic') {
      // Convert semantic results to hybrid format
      totalAvailable = semanticResults.length
      results = semanticResults.slice(0, limit).map((r, idx) => ({
        sectionId: r.sectionId,
        documentPath: r.documentPath,
        heading: r.heading,
        score: semanticWeight / (rrfK + idx + 1), // RRF-style score for consistency
        similarity: r.similarity,
        sources: ['semantic'] as const,
        // Carry requested context through, matching what the hybrid
        // (fusion) path does for semantic hits.
        ...(r.contextLines ? { contextLines: r.contextLines } : {}),
      }))
    } else {
      // Convert keyword results to hybrid format
      totalAvailable = keywordResults.length
      results = keywordResults.slice(0, limit).map((r) => ({
        sectionId: r.sectionId,
        documentPath: r.documentPath,
        heading: r.heading,
        score: bm25Weight / (rrfK + r.rank),
        bm25Score: r.score,
        sources: ['keyword'] as const,
      }))
    }

    // Apply cross-encoder re-ranking if enabled
    let reranked = false
    if (options.rerank && results.length > 0) {
      // Check if reranker is available; if not, results are returned as-is
      const rerankerAvailable = yield* isRerankerAvailable()
      if (rerankerAvailable) {
        // Candidates are the already limit-truncated results; each is
        // presented to the reranker as "<heading> (<documentPath>)".
        const rerankedResults = yield* rerankResults(
          query,
          results,
          (r) => `${r.heading} (${r.documentPath})`,
          { topK: 20, returnTopN: limit },
        )

        // Update results with reranker scores
        results = rerankedResults.map((rr) => ({
          ...rr.item,
          rerankerScore: rr.rerankerScore,
        }))
        reranked = true
      }
    }

    const stats: HybridSearchStats = {
      mode: effectiveMode,
      modeReason,
      semanticResults: semanticResults.length,
      keywordResults: keywordResults.length,
      combinedResults: results.length,
      bm25Available: hasBM25,
      embeddingsAvailable: hasEmbeddings,
      reranked,
      totalAvailable,
    }

    return { results, stats }
  })
|
|
414
|
+
|
|
415
|
+
// ============================================================================
|
|
416
|
+
// Mode Detection Helper
|
|
417
|
+
// ============================================================================
|
|
418
|
+
|
|
419
|
+
/**
 * Report which search indexes exist under a directory and which mode to use.
 *
 * BM25 availability comes from `bm25IndexExists`; embeddings count as
 * available when `listNamespaces` yields at least one namespace (a failure
 * while listing is treated as "no embeddings").
 *
 * @param rootPath - Root directory containing indexes
 * @returns Availability flags and the recommended mode: 'hybrid' when both
 *          exist, 'semantic' when only embeddings exist, otherwise 'keyword'
 */
export const detectSearchModes = (
  rootPath: string,
): Effect.Effect<
  { hasBM25: boolean; hasEmbeddings: boolean; recommendedMode: SearchMode },
  never
> =>
  Effect.gen(function* () {
    const root = path.resolve(rootPath)
    const hasBM25 = yield* bm25IndexExists(root)

    // Embeddings live in namespaced vector stores; any namespace is enough.
    const hasEmbeddings = yield* listNamespaces(root).pipe(
      Effect.map((found) => found.length > 0),
      Effect.catchAll(() => Effect.succeed(false)),
    )

    // Without embeddings the answer is always 'keyword', whether or not a
    // BM25 index exists.
    const recommendedMode: SearchMode = !hasEmbeddings
      ? 'keyword'
      : hasBM25
        ? 'hybrid'
        : 'semantic'

    return { hasBM25, hasEmbeddings, recommendedMode }
  })
|