mdcontext 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.changeset/config.json +9 -9
- package/.claude/settings.local.json +25 -0
- package/.github/workflows/claude-code-review.yml +44 -0
- package/.github/workflows/claude.yml +85 -0
- package/CONTRIBUTING.md +186 -0
- package/NOTES/NOTES +44 -0
- package/README.md +206 -3
- package/biome.json +1 -1
- package/dist/chunk-23UPXDNL.js +3044 -0
- package/dist/chunk-2W7MO2DL.js +1366 -0
- package/dist/chunk-3NUAZGMA.js +1689 -0
- package/dist/chunk-7TOWB2XB.js +366 -0
- package/dist/chunk-7XOTOADQ.js +3065 -0
- package/dist/chunk-AH2PDM2K.js +3042 -0
- package/dist/chunk-BNXWSZ63.js +3742 -0
- package/dist/chunk-BTL5DJVU.js +3222 -0
- package/dist/chunk-HDHYG7E4.js +104 -0
- package/dist/chunk-HLR4KZBP.js +3234 -0
- package/dist/chunk-IP3FRFEB.js +1045 -0
- package/dist/chunk-KHU56VDO.js +3042 -0
- package/dist/chunk-KRYIFLQR.js +85 -89
- package/dist/chunk-LBSDNLEM.js +287 -0
- package/dist/chunk-MNTQ7HCP.js +2643 -0
- package/dist/chunk-MUJELQQ6.js +1387 -0
- package/dist/chunk-MXJGMSLV.js +2199 -0
- package/dist/chunk-N6QJGC3Z.js +2636 -0
- package/dist/chunk-OBELGBPM.js +1713 -0
- package/dist/chunk-OT7R5XTA.js +3192 -0
- package/dist/chunk-P7X4RA2T.js +106 -0
- package/dist/chunk-PIDUQNC2.js +3185 -0
- package/dist/chunk-POGCDIH4.js +3187 -0
- package/dist/chunk-PSIEOQGZ.js +3043 -0
- package/dist/chunk-PVRT3IHA.js +3238 -0
- package/dist/chunk-QNN4TT23.js +1430 -0
- package/dist/chunk-RE3R45RJ.js +3042 -0
- package/dist/chunk-S7E6TFX6.js +718 -657
- package/dist/chunk-SG6GLU4U.js +1378 -0
- package/dist/chunk-SJCDV2ST.js +274 -0
- package/dist/chunk-SYE5XLF3.js +104 -0
- package/dist/chunk-T5VLYBZD.js +103 -0
- package/dist/chunk-TOQB7VWU.js +3238 -0
- package/dist/chunk-VFNMZ4ZQ.js +3228 -0
- package/dist/chunk-VVTGZNBT.js +1533 -1423
- package/dist/chunk-W7Q4RFEV.js +104 -0
- package/dist/chunk-XTYYVRLO.js +3190 -0
- package/dist/chunk-Y6MDYVJD.js +3063 -0
- package/dist/cli/main.js +4072 -629
- package/dist/index.d.ts +420 -33
- package/dist/index.js +8 -15
- package/dist/mcp/server.js +103 -7
- package/dist/schema-BAWSG7KY.js +22 -0
- package/dist/schema-E3QUPL26.js +20 -0
- package/dist/schema-EHL7WUT6.js +20 -0
- package/docs/019-USAGE.md +44 -5
- package/docs/020-current-implementation.md +8 -8
- package/docs/021-DOGFOODING-FINDINGS.md +1 -1
- package/docs/CONFIG.md +1123 -0
- package/docs/ERRORS.md +383 -0
- package/docs/summarization.md +320 -0
- package/justfile +40 -0
- package/package.json +39 -33
- package/research/INDEX.md +315 -0
- package/research/code-review/README.md +90 -0
- package/research/code-review/cli-error-handling-review.md +979 -0
- package/research/code-review/code-review-validation-report.md +464 -0
- package/research/code-review/main-ts-review.md +1128 -0
- package/research/config-docs/SUMMARY.md +357 -0
- package/research/config-docs/TEST-RESULTS.md +776 -0
- package/research/config-docs/TODO.md +542 -0
- package/research/config-docs/analysis.md +744 -0
- package/research/config-docs/fix-validation.md +502 -0
- package/research/config-docs/help-audit.md +264 -0
- package/research/config-docs/help-system-analysis.md +890 -0
- package/research/frontmatter/COMMENTS-ARE-SKIPPED.md +149 -0
- package/research/frontmatter/LLM-CODE-NAVIGATION.md +276 -0
- package/research/issue-review.md +603 -0
- package/research/llm-summarization/agent-cli-tools-2026.md +1082 -0
- package/research/llm-summarization/alternative-providers-2026.md +1428 -0
- package/research/llm-summarization/anthropic-2026.md +367 -0
- package/research/llm-summarization/claude-cli-integration.md +1706 -0
- package/research/llm-summarization/cli-integration-patterns.md +3155 -0
- package/research/llm-summarization/openai-2026.md +473 -0
- package/research/llm-summarization/openai-compatible-providers-2026.md +1022 -0
- package/research/llm-summarization/opencode-cli-integration.md +1552 -0
- package/research/llm-summarization/prompt-engineering-2026.md +1426 -0
- package/research/llm-summarization/prototype-results.md +56 -0
- package/research/llm-summarization/provider-switching-patterns-2026.md +2153 -0
- package/research/llm-summarization/typescript-llm-libraries-2026.md +2436 -0
- package/research/mdcontext-pudding/00-EXECUTIVE-SUMMARY.md +282 -0
- package/research/mdcontext-pudding/01-index-embed.md +956 -0
- package/research/mdcontext-pudding/02-search-COMMANDS.md +142 -0
- package/research/mdcontext-pudding/02-search-SUMMARY.md +146 -0
- package/research/mdcontext-pudding/02-search.md +970 -0
- package/research/mdcontext-pudding/03-context.md +779 -0
- package/research/mdcontext-pudding/04-navigation-and-analytics.md +803 -0
- package/research/mdcontext-pudding/04-tree.md +704 -0
- package/research/mdcontext-pudding/05-config.md +1038 -0
- package/research/mdcontext-pudding/06-links-summary.txt +87 -0
- package/research/mdcontext-pudding/06-links.md +679 -0
- package/research/mdcontext-pudding/07-stats.md +693 -0
- package/research/mdcontext-pudding/BUG-FIX-PLAN.md +388 -0
- package/research/mdcontext-pudding/P0-BUG-VALIDATION.md +167 -0
- package/research/mdcontext-pudding/README.md +168 -0
- package/research/mdcontext-pudding/TESTING-SUMMARY.md +128 -0
- package/research/research-quality-review.md +834 -0
- package/research/semantic-search/embedding-text-analysis.md +156 -0
- package/research/semantic-search/multi-word-failure-reproduction.md +171 -0
- package/research/semantic-search/query-processing-analysis.md +207 -0
- package/research/semantic-search/root-cause-and-solution.md +114 -0
- package/research/semantic-search/threshold-validation-report.md +69 -0
- package/research/semantic-search/vector-search-analysis.md +63 -0
- package/research/test-path-issues.md +276 -0
- package/review/ALP-76/1-error-type-design.md +962 -0
- package/review/ALP-76/2-error-handling-patterns.md +906 -0
- package/review/ALP-76/3-error-presentation.md +624 -0
- package/review/ALP-76/4-test-coverage.md +625 -0
- package/review/ALP-76/5-migration-completeness.md +440 -0
- package/review/ALP-76/6-effect-best-practices.md +755 -0
- package/scripts/apply-branch-protection.sh +47 -0
- package/scripts/branch-protection-templates.json +79 -0
- package/scripts/prototype-summarization.ts +346 -0
- package/scripts/rebuild-hnswlib.js +32 -37
- package/scripts/setup-branch-protection.sh +64 -0
- package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/active-provider.json +7 -0
- package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/bm25.json +541 -0
- package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/bm25.meta.json +5 -0
- package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/config.json +8 -0
- package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/embeddings/openai_text-embedding-3-small_512/vectors.bin +0 -0
- package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/embeddings/openai_text-embedding-3-small_512/vectors.meta.bin +0 -0
- package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/indexes/documents.json +60 -0
- package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/indexes/links.json +13 -0
- package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/indexes/sections.json +1197 -0
- package/src/__tests__/fixtures/semantic-search/multi-word-corpus/configuration-management.md +99 -0
- package/src/__tests__/fixtures/semantic-search/multi-word-corpus/distributed-systems.md +92 -0
- package/src/__tests__/fixtures/semantic-search/multi-word-corpus/error-handling.md +78 -0
- package/src/__tests__/fixtures/semantic-search/multi-word-corpus/failure-automation.md +55 -0
- package/src/__tests__/fixtures/semantic-search/multi-word-corpus/job-context.md +69 -0
- package/src/__tests__/fixtures/semantic-search/multi-word-corpus/process-orchestration.md +99 -0
- package/src/cli/argv-preprocessor.test.ts +2 -2
- package/src/cli/cli.test.ts +230 -33
- package/src/cli/commands/config-cmd.ts +642 -0
- package/src/cli/commands/context.ts +97 -9
- package/src/cli/commands/duplicates.ts +122 -0
- package/src/cli/commands/embeddings.ts +529 -0
- package/src/cli/commands/index-cmd.ts +210 -30
- package/src/cli/commands/index.ts +3 -0
- package/src/cli/commands/search.ts +894 -64
- package/src/cli/commands/stats.ts +3 -0
- package/src/cli/commands/tree.ts +26 -5
- package/src/cli/config-layer.ts +176 -0
- package/src/cli/error-handler.test.ts +235 -0
- package/src/cli/error-handler.ts +655 -0
- package/src/cli/flag-schemas.ts +66 -0
- package/src/cli/help.ts +209 -7
- package/src/cli/main.ts +348 -58
- package/src/cli/options.ts +10 -0
- package/src/cli/shared-error-handling.ts +199 -0
- package/src/cli/utils.ts +150 -17
- package/src/config/file-provider.test.ts +320 -0
- package/src/config/file-provider.ts +273 -0
- package/src/config/index.ts +72 -0
- package/src/config/integration.test.ts +667 -0
- package/src/config/precedence.test.ts +277 -0
- package/src/config/precedence.ts +451 -0
- package/src/config/schema.test.ts +414 -0
- package/src/config/schema.ts +603 -0
- package/src/config/service.test.ts +320 -0
- package/src/config/service.ts +243 -0
- package/src/config/testing.test.ts +264 -0
- package/src/config/testing.ts +110 -0
- package/src/core/types.ts +6 -33
- package/src/duplicates/detector.test.ts +183 -0
- package/src/duplicates/detector.ts +414 -0
- package/src/duplicates/index.ts +18 -0
- package/src/embeddings/embedding-namespace.test.ts +300 -0
- package/src/embeddings/embedding-namespace.ts +947 -0
- package/src/embeddings/heading-boost.test.ts +222 -0
- package/src/embeddings/hnsw-build-options.test.ts +198 -0
- package/src/embeddings/hyde.test.ts +272 -0
- package/src/embeddings/hyde.ts +264 -0
- package/src/embeddings/index.ts +2 -0
- package/src/embeddings/openai-provider.ts +332 -83
- package/src/embeddings/pricing.json +22 -0
- package/src/embeddings/provider-constants.ts +204 -0
- package/src/embeddings/provider-errors.test.ts +967 -0
- package/src/embeddings/provider-errors.ts +565 -0
- package/src/embeddings/provider-factory.test.ts +240 -0
- package/src/embeddings/provider-factory.ts +225 -0
- package/src/embeddings/provider-integration.test.ts +788 -0
- package/src/embeddings/query-preprocessing.test.ts +187 -0
- package/src/embeddings/semantic-search-threshold.test.ts +508 -0
- package/src/embeddings/semantic-search.ts +780 -93
- package/src/embeddings/types.ts +293 -16
- package/src/embeddings/vector-store.ts +486 -77
- package/src/embeddings/voyage-provider.ts +313 -0
- package/src/errors/errors.test.ts +845 -0
- package/src/errors/index.ts +533 -0
- package/src/index/ignore-patterns.test.ts +354 -0
- package/src/index/ignore-patterns.ts +305 -0
- package/src/index/indexer.ts +286 -48
- package/src/index/storage.ts +94 -30
- package/src/index/types.ts +40 -2
- package/src/index/watcher.ts +67 -9
- package/src/index.ts +22 -0
- package/src/integration/search-keyword.test.ts +678 -0
- package/src/mcp/server.ts +135 -6
- package/src/parser/parser.ts +18 -19
- package/src/parser/section-filter.test.ts +277 -0
- package/src/parser/section-filter.ts +125 -3
- package/src/search/__tests__/hybrid-search.test.ts +650 -0
- package/src/search/bm25-store.ts +366 -0
- package/src/search/cross-encoder.test.ts +253 -0
- package/src/search/cross-encoder.ts +406 -0
- package/src/search/fuzzy-search.test.ts +419 -0
- package/src/search/fuzzy-search.ts +273 -0
- package/src/search/hybrid-search.ts +448 -0
- package/src/search/path-matcher.test.ts +276 -0
- package/src/search/path-matcher.ts +33 -0
- package/src/search/searcher.test.ts +99 -1
- package/src/search/searcher.ts +189 -67
- package/src/search/wink-bm25.d.ts +30 -0
- package/src/summarization/cli-providers/claude.ts +202 -0
- package/src/summarization/cli-providers/detection.test.ts +273 -0
- package/src/summarization/cli-providers/detection.ts +118 -0
- package/src/summarization/cli-providers/index.ts +8 -0
- package/src/summarization/cost.test.ts +139 -0
- package/src/summarization/cost.ts +102 -0
- package/src/summarization/error-handler.test.ts +127 -0
- package/src/summarization/error-handler.ts +111 -0
- package/src/summarization/index.ts +102 -0
- package/src/summarization/pipeline.test.ts +498 -0
- package/src/summarization/pipeline.ts +231 -0
- package/src/summarization/prompts.test.ts +269 -0
- package/src/summarization/prompts.ts +133 -0
- package/src/summarization/provider-factory.test.ts +396 -0
- package/src/summarization/provider-factory.ts +178 -0
- package/src/summarization/types.ts +184 -0
- package/src/summarize/summarizer.ts +104 -35
- package/src/types/huggingface-transformers.d.ts +66 -0
- package/tests/fixtures/cli/.mdcontext/active-provider.json +7 -0
- package/tests/fixtures/cli/.mdcontext/embeddings/openai_text-embedding-3-small_512/vectors.bin +0 -0
- package/tests/fixtures/cli/.mdcontext/embeddings/openai_text-embedding-3-small_512/vectors.meta.bin +0 -0
- package/tests/fixtures/cli/.mdcontext/indexes/documents.json +4 -4
- package/tests/fixtures/cli/.mdcontext/indexes/sections.json +14 -0
- package/tests/integration/embed-index.test.ts +712 -0
- package/tests/integration/search-context.test.ts +469 -0
- package/tests/integration/search-semantic.test.ts +522 -0
- package/vitest.config.ts +1 -6
- package/AGENTS.md +0 -46
- package/tests/fixtures/cli/.mdcontext/vectors.bin +0 -0
- package/tests/fixtures/cli/.mdcontext/vectors.meta.json +0 -1264
|
@@ -5,20 +5,83 @@
|
|
|
5
5
|
import * as fs from 'node:fs/promises'
|
|
6
6
|
import * as path from 'node:path'
|
|
7
7
|
import { Effect } from 'effect'
|
|
8
|
+
import {
|
|
9
|
+
type ApiKeyInvalidError,
|
|
10
|
+
type ApiKeyMissingError,
|
|
11
|
+
DimensionMismatchError,
|
|
12
|
+
EmbeddingError,
|
|
13
|
+
EmbeddingsNotFoundError,
|
|
14
|
+
type FileReadError,
|
|
15
|
+
type IndexCorruptedError,
|
|
16
|
+
IndexNotFoundError,
|
|
17
|
+
type VectorStoreError,
|
|
18
|
+
} from '../errors/index.js'
|
|
8
19
|
import {
|
|
9
20
|
createStorage,
|
|
10
21
|
loadDocumentIndex,
|
|
11
22
|
loadSectionIndex,
|
|
12
23
|
} from '../index/storage.js'
|
|
13
24
|
import type { SectionEntry } from '../index/types.js'
|
|
14
|
-
import {
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
25
|
+
import {
|
|
26
|
+
type ActiveProvider,
|
|
27
|
+
generateNamespace,
|
|
28
|
+
getActiveNamespace,
|
|
29
|
+
writeActiveProvider,
|
|
30
|
+
} from './embedding-namespace.js'
|
|
31
|
+
import { generateHypotheticalDocument, type HydeResult } from './hyde.js'
|
|
32
|
+
import {
|
|
33
|
+
checkPricingFreshness,
|
|
34
|
+
getPricingDate,
|
|
35
|
+
PRICING_DATA,
|
|
36
|
+
wrapEmbedding,
|
|
37
|
+
} from './openai-provider.js'
|
|
38
|
+
import {
|
|
39
|
+
createEmbeddingProviderDirect,
|
|
40
|
+
type ProviderFactoryConfig,
|
|
41
|
+
} from './provider-factory.js'
|
|
42
|
+
import {
|
|
43
|
+
calculateFileImportanceBoost,
|
|
44
|
+
calculateHeadingBoost,
|
|
45
|
+
type EmbeddingProvider,
|
|
46
|
+
hasProviderMetadata,
|
|
47
|
+
preprocessQuery,
|
|
48
|
+
QUALITY_EF_SEARCH,
|
|
49
|
+
type SemanticSearchOptions,
|
|
50
|
+
type SemanticSearchResult,
|
|
51
|
+
type SemanticSearchResultWithStats,
|
|
52
|
+
type VectorEntry,
|
|
20
53
|
} from './types.js'
|
|
21
|
-
import {
|
|
54
|
+
import {
|
|
55
|
+
createNamespacedVectorStore,
|
|
56
|
+
type HnswBuildOptions,
|
|
57
|
+
type HnswMismatchWarning,
|
|
58
|
+
type HnswVectorStore,
|
|
59
|
+
type VectorSearchResult,
|
|
60
|
+
type VectorStoreLoadResult,
|
|
61
|
+
} from './vector-store.js'
|
|
62
|
+
|
|
63
|
+
// ============================================================================
|
|
64
|
+
// HNSW Parameter Warning
|
|
65
|
+
// ============================================================================
|
|
66
|
+
|
|
67
|
+
/**
 * Report a difference between the HNSW parameters in the current config and
 * the parameters the stored index was built with.
 *
 * @param mismatch - Details of the mismatch, or `undefined` when there is none
 * @returns An effect that logs a single warning, or a no-op effect when
 *          `mismatch` is absent. The effect never fails.
 */
const checkHnswMismatch = (
  mismatch: HnswMismatchWarning | undefined,
): Effect.Effect<void, never, never> => {
  // Nothing to report: hand back the shared no-op effect.
  if (!mismatch) return Effect.void

  const { configParams: wanted, indexParams: built } = mismatch

  // The message is assembled from three segments; joined with no separator
  // because each segment already carries its own trailing space.
  const message = [
    `HNSW parameter mismatch: Index was built with M=${built.m}, efConstruction=${built.efConstruction}, `,
    `but config specifies M=${wanted.m}, efConstruction=${wanted.efConstruction}. `,
    `HNSW parameters only affect index construction. Run 'mdcontext index --embed --force' to rebuild with new parameters.`,
  ].join('')

  return Effect.logWarning(message)
}
|
|
22
85
|
|
|
23
86
|
// ============================================================================
|
|
24
87
|
// Embedding Text Generation
|
|
@@ -47,8 +110,12 @@ const generateEmbeddingText = (
|
|
|
47
110
|
// Cost Estimation
|
|
48
111
|
// ============================================================================
|
|
49
112
|
|
|
50
|
-
// Price per 1M tokens for text-embedding-3-small
|
|
51
|
-
const EMBEDDING_PRICE_PER_MILLION =
|
|
113
|
+
// Price per 1M tokens for text-embedding-3-small (from PRICING_DATA)
|
|
114
|
+
const EMBEDDING_PRICE_PER_MILLION =
|
|
115
|
+
PRICING_DATA.prices['text-embedding-3-small'] ?? 0.02
|
|
116
|
+
|
|
117
|
+
// Re-export pricing utilities for CLI use
|
|
118
|
+
export { checkPricingFreshness, getPricingDate }
|
|
52
119
|
|
|
53
120
|
export interface DirectoryEstimate {
|
|
54
121
|
readonly directory: string
|
|
@@ -67,10 +134,24 @@ export interface EmbeddingEstimate {
|
|
|
67
134
|
readonly byDirectory: readonly DirectoryEstimate[]
|
|
68
135
|
}
|
|
69
136
|
|
|
137
|
+
/**
|
|
138
|
+
* Estimate the cost of generating embeddings for a directory.
|
|
139
|
+
*
|
|
140
|
+
* @param rootPath - Root directory containing indexed markdown files
|
|
141
|
+
* @param options - Optional exclude patterns
|
|
142
|
+
* @returns Estimate with token counts and costs
|
|
143
|
+
*
|
|
144
|
+
* @throws IndexNotFoundError - Index doesn't exist at path
|
|
145
|
+
* @throws FileReadError - Cannot read index files
|
|
146
|
+
* @throws IndexCorruptedError - Index files are corrupted
|
|
147
|
+
*/
|
|
70
148
|
export const estimateEmbeddingCost = (
|
|
71
149
|
rootPath: string,
|
|
72
150
|
options: { excludePatterns?: readonly string[] | undefined } = {},
|
|
73
|
-
): Effect.Effect<
|
|
151
|
+
): Effect.Effect<
|
|
152
|
+
EmbeddingEstimate,
|
|
153
|
+
IndexNotFoundError | FileReadError | IndexCorruptedError
|
|
154
|
+
> =>
|
|
74
155
|
Effect.gen(function* () {
|
|
75
156
|
const resolvedRoot = path.resolve(rootPath)
|
|
76
157
|
const storage = createStorage(resolvedRoot)
|
|
@@ -79,9 +160,7 @@ export const estimateEmbeddingCost = (
|
|
|
79
160
|
const sectionIndex = yield* loadSectionIndex(storage)
|
|
80
161
|
|
|
81
162
|
if (!docIndex || !sectionIndex) {
|
|
82
|
-
return yield* Effect.fail(
|
|
83
|
-
new Error("Index not found. Run 'mdcontext index' first."),
|
|
84
|
-
)
|
|
163
|
+
return yield* Effect.fail(new IndexNotFoundError({ path: resolvedRoot }))
|
|
85
164
|
}
|
|
86
165
|
|
|
87
166
|
// Group by directory
|
|
@@ -160,11 +239,25 @@ export interface FileProgress {
|
|
|
160
239
|
readonly sectionCount: number
|
|
161
240
|
}
|
|
162
241
|
|
|
242
|
+
/**
 * Progress snapshot handed to `BuildEmbeddingsOptions.onBatchProgress` while
 * the embedding provider works through its batches.
 *
 * NOTE(review): whether `batchIndex` is 0- or 1-based is decided by the
 * provider and is not visible here; confirm before displaying it.
 */
export interface EmbeddingBatchProgress {
  /** Index of the batch this report refers to, as reported by the provider. */
  readonly batchIndex: number
  /** Total number of batches in the current embed call. */
  readonly totalBatches: number
  /** Sections handled so far (the provider's processed-text count). */
  readonly processedSections: number
  /** Sections in the current embed call (the provider's total-text count). */
  readonly totalSections: number
}
|
|
248
|
+
|
|
163
249
|
/**
 * Options accepted by `buildEmbeddings`. Every member is optional.
 */
export interface BuildEmbeddingsOptions {
  /**
   * When not set, `buildEmbeddings` first loads the existing vector store and
   * inspects its stats before generating anything; when `true` that load is
   * skipped.
   */
  readonly force?: boolean | undefined
  /**
   * Explicit embedding provider instance. Takes priority over
   * `providerConfig`.
   */
  readonly provider?: EmbeddingProvider | undefined
  /**
   * Configuration used to create a provider when `provider` is not supplied.
   * Falls back to `{ provider: 'openai' }`.
   */
  readonly providerConfig?: ProviderFactoryConfig | undefined
  /** Patterns to exclude (presumably matched against document paths; verify against the caller). */
  readonly excludePatterns?: readonly string[] | undefined
  /** Per-file progress callback (presumably invoked once per processed file; confirm). */
  readonly onFileProgress?: ((progress: FileProgress) => void) | undefined
  /** Callback for batch progress during embedding API calls */
  readonly onBatchProgress?: ((progress: EmbeddingBatchProgress) => void) | undefined
  /** HNSW build parameters for vector index construction */
  readonly hnswOptions?: HnswBuildOptions | undefined
}
|
|
169
262
|
|
|
170
263
|
export interface BuildEmbeddingsResult {
|
|
@@ -178,10 +271,36 @@ export interface BuildEmbeddingsResult {
|
|
|
178
271
|
readonly estimatedSavings?: number | undefined
|
|
179
272
|
}
|
|
180
273
|
|
|
274
|
+
/**
|
|
275
|
+
* Build embeddings for all indexed sections in a directory.
|
|
276
|
+
*
|
|
277
|
+
* @param rootPath - Root directory containing indexed markdown files
|
|
278
|
+
* @param options - Build options (force rebuild, progress callbacks)
|
|
279
|
+
* @returns Result with embedding counts, costs, and timing
|
|
280
|
+
*
|
|
281
|
+
* @throws IndexNotFoundError - Index doesn't exist at path
|
|
282
|
+
* @throws FileReadError - Cannot read index or source files
|
|
283
|
+
* @throws IndexCorruptedError - Index files are corrupted
|
|
284
|
+
* @throws ApiKeyMissingError - API key not set (check provider config)
|
|
285
|
+
* @throws ApiKeyInvalidError - API key rejected by provider
|
|
286
|
+
* @throws EmbeddingError - Embedding API failure (rate limit, quota, network)
|
|
287
|
+
* @throws VectorStoreError - Cannot save vector index
|
|
288
|
+
* @throws DimensionMismatchError - Existing embeddings have different dimensions
|
|
289
|
+
*/
|
|
181
290
|
export const buildEmbeddings = (
|
|
182
291
|
rootPath: string,
|
|
183
292
|
options: BuildEmbeddingsOptions = {},
|
|
184
|
-
): Effect.Effect<
|
|
293
|
+
): Effect.Effect<
|
|
294
|
+
BuildEmbeddingsResult,
|
|
295
|
+
| IndexNotFoundError
|
|
296
|
+
| FileReadError
|
|
297
|
+
| IndexCorruptedError
|
|
298
|
+
| ApiKeyMissingError
|
|
299
|
+
| ApiKeyInvalidError
|
|
300
|
+
| EmbeddingError
|
|
301
|
+
| VectorStoreError
|
|
302
|
+
| DimensionMismatchError
|
|
303
|
+
> =>
|
|
185
304
|
Effect.gen(function* () {
|
|
186
305
|
const startTime = Date.now()
|
|
187
306
|
const resolvedRoot = path.resolve(rootPath)
|
|
@@ -192,31 +311,52 @@ export const buildEmbeddings = (
|
|
|
192
311
|
const sectionIndex = yield* loadSectionIndex(storage)
|
|
193
312
|
|
|
194
313
|
if (!docIndex || !sectionIndex) {
|
|
195
|
-
return yield* Effect.fail(
|
|
196
|
-
new Error("Index not found. Run 'mdcontext index' first."),
|
|
197
|
-
)
|
|
314
|
+
return yield* Effect.fail(new IndexNotFoundError({ path: resolvedRoot }))
|
|
198
315
|
}
|
|
199
316
|
|
|
200
|
-
// Get or create provider
|
|
317
|
+
// Get or create provider - use factory for config-driven provider selection
|
|
318
|
+
// Priority: explicit provider > providerConfig > default (openai)
|
|
319
|
+
const providerConfig = options.providerConfig ?? { provider: 'openai' }
|
|
201
320
|
const provider =
|
|
202
|
-
options.provider ??
|
|
203
|
-
(yield* Effect.try({
|
|
204
|
-
try: () => createOpenAIProvider(),
|
|
205
|
-
catch: (e) => e as Error,
|
|
206
|
-
}))
|
|
321
|
+
options.provider ?? (yield* createEmbeddingProviderDirect(providerConfig))
|
|
207
322
|
const dimensions = provider.dimensions
|
|
208
323
|
|
|
209
|
-
//
|
|
210
|
-
|
|
324
|
+
// Extract provider info for namespacing from the actual provider instance
|
|
325
|
+
// This ensures we use the correct values even when options.provider is explicitly set
|
|
326
|
+
let providerName: string
|
|
327
|
+
let providerModel: string
|
|
328
|
+
|
|
329
|
+
if (hasProviderMetadata(provider)) {
|
|
330
|
+
// Provider has metadata - extract provider name from provider.name (format: "provider:model")
|
|
331
|
+
const nameParts = provider.name.split(':')
|
|
332
|
+
providerName = nameParts[0] || 'openai'
|
|
333
|
+
providerModel = provider.model
|
|
334
|
+
} else {
|
|
335
|
+
// Fallback to config values for providers without metadata
|
|
336
|
+
providerName = providerConfig.provider ?? 'openai'
|
|
337
|
+
providerModel = providerConfig.model ?? 'text-embedding-3-small'
|
|
338
|
+
}
|
|
339
|
+
|
|
340
|
+
// Create namespaced vector store for this provider/model/dimensions combination
|
|
341
|
+
const vectorStore = createNamespacedVectorStore(
|
|
211
342
|
resolvedRoot,
|
|
343
|
+
providerName,
|
|
344
|
+
providerModel,
|
|
212
345
|
dimensions,
|
|
346
|
+
options.hnswOptions,
|
|
213
347
|
) as HnswVectorStore
|
|
214
|
-
|
|
348
|
+
|
|
349
|
+
// Set provider metadata
|
|
350
|
+
if (hasProviderMetadata(provider)) {
|
|
351
|
+
vectorStore.setProvider(provider.name, provider.model, provider.baseURL)
|
|
352
|
+
} else {
|
|
353
|
+
vectorStore.setProvider(providerName, providerModel, undefined)
|
|
354
|
+
}
|
|
215
355
|
|
|
216
356
|
// Load existing if not forcing
|
|
217
357
|
if (!options.force) {
|
|
218
|
-
const
|
|
219
|
-
if (loaded) {
|
|
358
|
+
const loadResult = yield* vectorStore.load()
|
|
359
|
+
if (loadResult.loaded) {
|
|
220
360
|
const stats = vectorStore.getStats()
|
|
221
361
|
// Skip if any embeddings exist
|
|
222
362
|
if (stats.count > 0) {
|
|
@@ -321,18 +461,26 @@ export const buildEmbeddings = (
|
|
|
321
461
|
}
|
|
322
462
|
|
|
323
463
|
const filePath = path.join(resolvedRoot, docPath)
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
464
|
+
|
|
465
|
+
// Note: catchAll is intentional - file read failures during embedding
|
|
466
|
+
// should skip the file with a warning rather than abort the entire operation.
|
|
467
|
+
// A warning is logged below when the read fails.
|
|
468
|
+
const fileContentResult = yield* Effect.promise(() =>
|
|
469
|
+
fs.readFile(filePath, 'utf-8'),
|
|
470
|
+
).pipe(
|
|
471
|
+
Effect.map((content) => ({ ok: true as const, content })),
|
|
472
|
+
Effect.catchAll(() =>
|
|
473
|
+
Effect.succeed({ ok: false as const, content: '' }),
|
|
474
|
+
),
|
|
475
|
+
)
|
|
476
|
+
|
|
477
|
+
if (!fileContentResult.ok) {
|
|
478
|
+
yield* Effect.logWarning(`Skipping file (cannot read): ${docPath}`)
|
|
331
479
|
continue
|
|
332
480
|
}
|
|
333
481
|
|
|
334
482
|
filesProcessed++
|
|
335
|
-
const lines =
|
|
483
|
+
const lines = fileContentResult.content.split('\n')
|
|
336
484
|
|
|
337
485
|
for (const { section, parentHeading } of sections) {
|
|
338
486
|
// Extract section content from file
|
|
@@ -363,16 +511,20 @@ export const buildEmbeddings = (
|
|
|
363
511
|
|
|
364
512
|
// Generate embeddings
|
|
365
513
|
const texts = sectionsToEmbed.map((s) => s.text)
|
|
366
|
-
const result = yield*
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
514
|
+
const result = yield* wrapEmbedding(
|
|
515
|
+
provider.embed(texts, {
|
|
516
|
+
onBatchProgress: options.onBatchProgress
|
|
517
|
+
? (p) =>
|
|
518
|
+
options.onBatchProgress?.({
|
|
519
|
+
batchIndex: p.batchIndex,
|
|
520
|
+
totalBatches: p.totalBatches,
|
|
521
|
+
processedSections: p.processedTexts,
|
|
522
|
+
totalSections: p.totalTexts,
|
|
523
|
+
})
|
|
524
|
+
: undefined,
|
|
525
|
+
}),
|
|
526
|
+
providerConfig.provider ?? 'openai',
|
|
527
|
+
)
|
|
376
528
|
|
|
377
529
|
// Create vector entries
|
|
378
530
|
const entries: VectorEntry[] = []
|
|
@@ -397,6 +549,22 @@ export const buildEmbeddings = (
|
|
|
397
549
|
// Save
|
|
398
550
|
yield* vectorStore.save()
|
|
399
551
|
|
|
552
|
+
// Set this namespace as the active provider
|
|
553
|
+
const namespace = generateNamespace(providerName, providerModel, dimensions)
|
|
554
|
+
yield* writeActiveProvider(resolvedRoot, {
|
|
555
|
+
namespace,
|
|
556
|
+
provider: providerName,
|
|
557
|
+
model: providerModel,
|
|
558
|
+
dimensions,
|
|
559
|
+
activatedAt: new Date().toISOString(),
|
|
560
|
+
}).pipe(
|
|
561
|
+
Effect.catchAll((e) => {
|
|
562
|
+
// Don't fail the build if we can't write the active provider file
|
|
563
|
+
console.warn(`Warning: Could not set active provider: ${e.message}`)
|
|
564
|
+
return Effect.succeed(undefined)
|
|
565
|
+
}),
|
|
566
|
+
)
|
|
567
|
+
|
|
400
568
|
const duration = Date.now() - startTime
|
|
401
569
|
|
|
402
570
|
return {
|
|
@@ -408,57 +576,239 @@ export const buildEmbeddings = (
|
|
|
408
576
|
}
|
|
409
577
|
})
|
|
410
578
|
|
|
579
|
+
// ============================================================================
|
|
580
|
+
// Context Lines Helper
|
|
581
|
+
// ============================================================================
|
|
582
|
+
|
|
583
|
+
/**
 * Add context lines to search results by loading section content from files.
 * This helper is used by both semanticSearch and semanticSearchWithStats to avoid code duplication.
 *
 * For each result whose section is present in `sectionIndex`, the document at
 * `resolvedRoot/documentPath` is read and the lines of the section, widened by
 * `contextBefore` / `contextAfter`, are attached as `contextLines`. Results
 * whose section is unknown, or whose file cannot be read or is empty, are
 * returned without `contextLines` instead of failing the whole search.
 *
 * @param limitedResults - Vector search hits to enrich, in output order
 * @param sectionIndex - Section lookup keyed by section id; `startLine` and
 *                       `endLine` are treated as 1-based, inclusive
 * @param resolvedRoot - Directory the results' `documentPath` values are joined onto
 * @param options - Number of extra lines to include before/after the section
 *                  (each defaults to 0)
 * @returns One entry per input result, in the same order
 *
 * NOTE(review): `FileReadError` is kept in the signature for compatibility,
 * but read failures are recovered here and never surface on the error channel.
 */
const addContextLinesToResults = (
  limitedResults: readonly VectorSearchResult[],
  sectionIndex: { sections: Record<string, SectionEntry> },
  resolvedRoot: string,
  options: {
    contextBefore?: number | undefined
    contextAfter?: number | undefined
  },
): Effect.Effect<readonly SemanticSearchResult[], FileReadError, never> =>
  Effect.gen(function* () {
    const contextBefore = options.contextBefore ?? 0
    const contextAfter = options.contextAfter ?? 0

    const resultsWithContext: SemanticSearchResult[] = []
    // `null` records a failed read so an unreadable file is attempted only
    // once per call; `undefined` (absent key) means "not read yet".
    const fileCache = new Map<string, string | null>()

    for (const r of limitedResults) {
      const plainResult = {
        sectionId: r.sectionId,
        documentPath: r.documentPath,
        heading: r.heading,
        similarity: r.similarity,
      }

      const section = sectionIndex.sections[r.sectionId]
      if (!section) {
        resultsWithContext.push(plainResult)
        continue
      }

      let fileContent = fileCache.get(r.documentPath)
      if (fileContent === undefined) {
        const filePath = path.join(resolvedRoot, r.documentPath)
        // tryPromise (not promise): a rejected readFile must land on the error
        // channel so catchAll can recover. Effect.promise would turn the
        // rejection into a defect that catchAll never sees.
        fileContent = yield* Effect.tryPromise(() =>
          fs.readFile(filePath, 'utf-8'),
        ).pipe(Effect.catchAll(() => Effect.succeed(null as string | null)))
        fileCache.set(r.documentPath, fileContent)
      }

      // Unreadable or empty file: return the hit without context.
      if (!fileContent) {
        resultsWithContext.push(plainResult)
        continue
      }

      const lines = fileContent.split('\n')
      // Section lines are 1-based; array indices are 0-based.
      const matchStart = section.startLine - 1
      const startIdx = Math.max(0, matchStart - contextBefore)
      const endIdx = Math.min(lines.length, section.endLine + contextAfter)

      const contextLines: {
        lineNumber: number
        line: string
        isMatch: boolean
      }[] = []
      for (let i = startIdx; i < endIdx; i++) {
        const line = lines[i]
        if (line !== undefined) {
          contextLines.push({
            lineNumber: i + 1,
            line,
            // True for lines inside the section itself, false for padding.
            isMatch: i >= matchStart && i < section.endLine,
          })
        }
      }

      resultsWithContext.push({ ...plainResult, contextLines })
    }

    return resultsWithContext
  })
|
|
671
|
+
|
|
411
672
|
// ============================================================================
|
|
412
673
|
// Semantic Search
|
|
413
674
|
// ============================================================================
|
|
414
675
|
|
|
676
|
+
/**
|
|
677
|
+
* Perform semantic search over embedded sections.
|
|
678
|
+
*
|
|
679
|
+
* @param rootPath - Root directory containing embeddings
|
|
680
|
+
* @param query - Natural language search query
|
|
681
|
+
* @param options - Search options (limit, threshold, path filter)
|
|
682
|
+
* @returns Ranked list of matching sections by similarity
|
|
683
|
+
*
|
|
684
|
+
* @throws EmbeddingsNotFoundError - No embeddings exist (run index --embed first)
|
|
685
|
+
* @throws ApiKeyMissingError - API key not set (check provider config)
|
|
686
|
+
* @throws ApiKeyInvalidError - API key rejected by provider
|
|
687
|
+
* @throws EmbeddingError - Embedding API failure (rate limit, quota, network)
|
|
688
|
+
* @throws VectorStoreError - Cannot load or search vector index
|
|
689
|
+
* @throws DimensionMismatchError - Corpus has different dimensions than current provider
|
|
690
|
+
*/
|
|
415
691
|
export const semanticSearch = (
|
|
416
692
|
rootPath: string,
|
|
417
693
|
query: string,
|
|
418
694
|
options: SemanticSearchOptions = {},
|
|
419
|
-
): Effect.Effect<
|
|
695
|
+
): Effect.Effect<
|
|
696
|
+
readonly SemanticSearchResult[],
|
|
697
|
+
| EmbeddingsNotFoundError
|
|
698
|
+
| FileReadError
|
|
699
|
+
| IndexCorruptedError
|
|
700
|
+
| ApiKeyMissingError
|
|
701
|
+
| ApiKeyInvalidError
|
|
702
|
+
| EmbeddingError
|
|
703
|
+
| VectorStoreError
|
|
704
|
+
| DimensionMismatchError
|
|
705
|
+
> =>
|
|
420
706
|
Effect.gen(function* () {
|
|
421
707
|
const resolvedRoot = path.resolve(rootPath)
|
|
422
708
|
|
|
423
|
-
// Get
|
|
424
|
-
const
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
709
|
+
// Get active namespace to determine which embedding index to use
|
|
710
|
+
const activeProvider = yield* getActiveNamespace(resolvedRoot).pipe(
|
|
711
|
+
Effect.catchAll(() => Effect.succeed(null as ActiveProvider | null)),
|
|
712
|
+
)
|
|
713
|
+
|
|
714
|
+
if (!activeProvider) {
|
|
715
|
+
return yield* Effect.fail(
|
|
716
|
+
new EmbeddingsNotFoundError({ path: resolvedRoot }),
|
|
717
|
+
)
|
|
718
|
+
}
|
|
719
|
+
|
|
720
|
+
// Create provider for query embedding
|
|
721
|
+
const provider = yield* createEmbeddingProviderDirect(
|
|
722
|
+
options.providerConfig ?? { provider: 'openai' },
|
|
723
|
+
)
|
|
428
724
|
const dimensions = provider.dimensions
|
|
429
725
|
|
|
430
|
-
//
|
|
431
|
-
const
|
|
432
|
-
const loaded = yield* vectorStore.load()
|
|
726
|
+
// Get current provider name for error messages
|
|
727
|
+
const currentProviderName = options.providerConfig?.provider ?? 'openai'
|
|
433
728
|
|
|
434
|
-
|
|
729
|
+
// Verify dimensions match the active namespace
|
|
730
|
+
if (dimensions !== activeProvider.dimensions) {
|
|
435
731
|
return yield* Effect.fail(
|
|
436
|
-
new
|
|
732
|
+
new DimensionMismatchError({
|
|
733
|
+
corpusDimensions: activeProvider.dimensions,
|
|
734
|
+
providerDimensions: dimensions,
|
|
735
|
+
corpusProvider: `${activeProvider.provider}:${activeProvider.model}`,
|
|
736
|
+
currentProvider: currentProviderName,
|
|
737
|
+
path: resolvedRoot,
|
|
738
|
+
}),
|
|
437
739
|
)
|
|
438
740
|
}
|
|
439
741
|
|
|
440
|
-
//
|
|
441
|
-
const
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
742
|
+
// Load vector store from the active namespace
|
|
743
|
+
const vectorStore = createNamespacedVectorStore(
|
|
744
|
+
resolvedRoot,
|
|
745
|
+
activeProvider.provider,
|
|
746
|
+
activeProvider.model,
|
|
747
|
+
activeProvider.dimensions,
|
|
748
|
+
)
|
|
749
|
+
const loadResult = yield* vectorStore.load()
|
|
750
|
+
|
|
751
|
+
if (!loadResult.loaded) {
|
|
752
|
+
return yield* Effect.fail(
|
|
753
|
+
new EmbeddingsNotFoundError({ path: resolvedRoot }),
|
|
754
|
+
)
|
|
755
|
+
}
|
|
756
|
+
|
|
757
|
+
// Check for HNSW parameter mismatch
|
|
758
|
+
yield* checkHnswMismatch(loadResult.hnswMismatch)
|
|
759
|
+
|
|
760
|
+
// Determine the text to embed
|
|
761
|
+
// If HyDE is enabled, generate a hypothetical document first
|
|
762
|
+
let textToEmbed: string
|
|
763
|
+
let hydeResult: HydeResult | undefined
|
|
764
|
+
|
|
765
|
+
if (options.hyde) {
|
|
766
|
+
// Generate hypothetical document using LLM
|
|
767
|
+
hydeResult = yield* generateHypotheticalDocument(query, {
|
|
768
|
+
model: options.hydeOptions?.model,
|
|
769
|
+
maxTokens: options.hydeOptions?.maxTokens,
|
|
770
|
+
temperature: options.hydeOptions?.temperature,
|
|
771
|
+
})
|
|
772
|
+
textToEmbed = hydeResult.hypotheticalDocument
|
|
773
|
+
yield* Effect.logDebug(
|
|
774
|
+
`HyDE generated ${hydeResult.tokensUsed} tokens ($${hydeResult.cost.toFixed(6)})`,
|
|
775
|
+
)
|
|
776
|
+
} else {
|
|
777
|
+
// Preprocess query for better recall (unless disabled)
|
|
778
|
+
textToEmbed = options.skipPreprocessing ? query : preprocessQuery(query)
|
|
779
|
+
}
|
|
780
|
+
|
|
781
|
+
// Embed the query (or hypothetical document)
|
|
782
|
+
const queryResult = yield* wrapEmbedding(
|
|
783
|
+
provider.embed([textToEmbed]),
|
|
784
|
+
currentProviderName,
|
|
785
|
+
)
|
|
448
786
|
|
|
449
787
|
const queryVector = queryResult.embeddings[0]
|
|
450
788
|
if (!queryVector) {
|
|
451
|
-
return yield* Effect.fail(
|
|
789
|
+
return yield* Effect.fail(
|
|
790
|
+
new EmbeddingError({
|
|
791
|
+
reason: 'Unknown',
|
|
792
|
+
message: 'Failed to generate query embedding',
|
|
793
|
+
provider: currentProviderName,
|
|
794
|
+
}),
|
|
795
|
+
)
|
|
452
796
|
}
|
|
453
797
|
|
|
454
798
|
// Search
|
|
455
799
|
const limit = options.limit ?? 10
|
|
456
800
|
const threshold = options.threshold ?? 0
|
|
457
801
|
|
|
802
|
+
// Convert quality mode to efSearch value
|
|
803
|
+
const efSearch = options.quality
|
|
804
|
+
? QUALITY_EF_SEARCH[options.quality]
|
|
805
|
+
: undefined
|
|
806
|
+
|
|
458
807
|
const searchResults = yield* vectorStore.search(
|
|
459
808
|
queryVector,
|
|
460
809
|
limit * 2,
|
|
461
810
|
threshold,
|
|
811
|
+
{ efSearch },
|
|
462
812
|
)
|
|
463
813
|
|
|
464
814
|
// Apply path filter if specified
|
|
@@ -471,28 +821,313 @@ export const semanticSearch = (
|
|
|
471
821
|
filteredResults = searchResults.filter((r) => regex.test(r.documentPath))
|
|
472
822
|
}
|
|
473
823
|
|
|
474
|
-
//
|
|
475
|
-
const
|
|
476
|
-
|
|
477
|
-
.map((r) => ({
|
|
824
|
+
// Apply ranking boost (heading + file importance, enabled by default)
|
|
825
|
+
const applyBoost = options.headingBoost !== false
|
|
826
|
+
const boostedResults = applyBoost
|
|
827
|
+
? filteredResults.map((r) => ({
|
|
828
|
+
...r,
|
|
829
|
+
similarity: Math.min(
|
|
830
|
+
1,
|
|
831
|
+
r.similarity +
|
|
832
|
+
calculateHeadingBoost(r.heading, query) +
|
|
833
|
+
calculateFileImportanceBoost(r.documentPath),
|
|
834
|
+
),
|
|
835
|
+
}))
|
|
836
|
+
: filteredResults
|
|
837
|
+
|
|
838
|
+
// Re-sort by boosted similarity
|
|
839
|
+
const sortedResults = boostedResults.sort(
|
|
840
|
+
(a, b) => b.similarity - a.similarity,
|
|
841
|
+
)
|
|
842
|
+
const limitedResults = sortedResults.slice(0, limit)
|
|
843
|
+
|
|
844
|
+
// If context lines are requested, load section content
|
|
845
|
+
let results: readonly SemanticSearchResult[]
|
|
846
|
+
if (
|
|
847
|
+
options.contextBefore !== undefined ||
|
|
848
|
+
options.contextAfter !== undefined
|
|
849
|
+
) {
|
|
850
|
+
const storage = createStorage(resolvedRoot)
|
|
851
|
+
const sectionIndex = yield* loadSectionIndex(storage)
|
|
852
|
+
|
|
853
|
+
if (sectionIndex) {
|
|
854
|
+
results = yield* addContextLinesToResults(
|
|
855
|
+
limitedResults,
|
|
856
|
+
sectionIndex,
|
|
857
|
+
resolvedRoot,
|
|
858
|
+
options,
|
|
859
|
+
)
|
|
860
|
+
} else {
|
|
861
|
+
results = limitedResults.map((r) => ({
|
|
862
|
+
sectionId: r.sectionId,
|
|
863
|
+
documentPath: r.documentPath,
|
|
864
|
+
heading: r.heading,
|
|
865
|
+
similarity: r.similarity,
|
|
866
|
+
}))
|
|
867
|
+
}
|
|
868
|
+
} else {
|
|
869
|
+
results = limitedResults.map((r) => ({
|
|
478
870
|
sectionId: r.sectionId,
|
|
479
871
|
documentPath: r.documentPath,
|
|
480
872
|
heading: r.heading,
|
|
481
873
|
similarity: r.similarity,
|
|
482
874
|
}))
|
|
875
|
+
}
|
|
483
876
|
|
|
484
877
|
return results
|
|
485
878
|
})
|
|
486
879
|
|
|
880
|
+
/**
 * Perform semantic search and also report statistics about results that did
 * not meet the similarity threshold.
 * Use this when you want to give users feedback about near-miss results.
 *
 * Steps, in order: resolve the active embedding namespace, create the query
 * embedding provider, verify that its dimensions match the namespace, load the
 * namespaced vector store, embed the query (or a HyDE hypothetical document),
 * search, apply the optional path filter and ranking boost, sort, truncate to
 * `limit`, and optionally attach context lines.
 *
 * @param rootPath - Root directory containing embeddings
 * @param query - Natural language search query
 * @param options - Search options. This function reads `limit` (default 10),
 *   `threshold` (default 0), `pathPattern`, `quality`, `headingBoost`
 *   (boost is applied unless explicitly `false`), `hyde` / `hydeOptions`,
 *   `skipPreprocessing`, `providerConfig` (defaults to the `openai` provider),
 *   and `contextBefore` / `contextAfter`.
 * @returns `results` (at most `limit` entries, sorted by descending
 *   similarity), `belowThresholdCount` and `belowThresholdHighest` as reported
 *   by the vector store, and `totalAvailable` (number of results remaining
 *   after path filtering, before truncation to `limit`).
 *
 * @throws EmbeddingsNotFoundError - No embeddings exist (run index --embed first)
 * @throws FileReadError - Cannot read index files
 * @throws IndexCorruptedError - Index files are corrupted
 * @throws ApiKeyMissingError - API key not set (check provider config)
 * @throws ApiKeyInvalidError - API key rejected by provider
 * @throws EmbeddingError - Embedding API failure (rate limit, quota, network)
 * @throws VectorStoreError - Cannot load or search vector index
 * @throws DimensionMismatchError - Corpus has different dimensions than current provider
 */
export const semanticSearchWithStats = (
  rootPath: string,
  query: string,
  options: SemanticSearchOptions = {},
): Effect.Effect<
  SemanticSearchResultWithStats,
  | EmbeddingsNotFoundError
  | FileReadError
  | IndexCorruptedError
  | ApiKeyMissingError
  | ApiKeyInvalidError
  | EmbeddingError
  | VectorStoreError
  | DimensionMismatchError
> =>
  Effect.gen(function* () {
    const resolvedRoot = path.resolve(rootPath)

    // Get active namespace to determine which embedding index to use.
    // Any failure while resolving it is folded into "no active namespace",
    // which is reported below as EmbeddingsNotFoundError.
    const activeProvider = yield* getActiveNamespace(resolvedRoot).pipe(
      Effect.catchAll(() => Effect.succeed(null as ActiveProvider | null)),
    )

    if (!activeProvider) {
      return yield* Effect.fail(
        new EmbeddingsNotFoundError({ path: resolvedRoot }),
      )
    }

    // Create provider for query embedding
    const provider = yield* createEmbeddingProviderDirect(
      options.providerConfig ?? { provider: 'openai' },
    )
    const dimensions = provider.dimensions

    // Get current provider name for error messages
    const currentProviderName = options.providerConfig?.provider ?? 'openai'

    // Verify dimensions match the active namespace; a query vector of a
    // different size cannot be compared against the stored corpus.
    if (dimensions !== activeProvider.dimensions) {
      return yield* Effect.fail(
        new DimensionMismatchError({
          corpusDimensions: activeProvider.dimensions,
          providerDimensions: dimensions,
          corpusProvider: `${activeProvider.provider}:${activeProvider.model}`,
          currentProvider: currentProviderName,
          path: resolvedRoot,
        }),
      )
    }

    // Load vector store from the active namespace
    const vectorStore = createNamespacedVectorStore(
      resolvedRoot,
      activeProvider.provider,
      activeProvider.model,
      activeProvider.dimensions,
    )
    const loadResult = yield* vectorStore.load()

    if (!loadResult.loaded) {
      return yield* Effect.fail(
        new EmbeddingsNotFoundError({ path: resolvedRoot }),
      )
    }

    // Check for HNSW parameter mismatch
    yield* checkHnswMismatch(loadResult.hnswMismatch)

    // Determine the text to embed.
    // If HyDE is enabled, generate a hypothetical document first and embed
    // that instead of the raw query.
    let textToEmbed: string

    if (options.hyde) {
      // Generate hypothetical document using LLM
      const hydeResult = yield* generateHypotheticalDocument(query, {
        model: options.hydeOptions?.model,
        maxTokens: options.hydeOptions?.maxTokens,
        temperature: options.hydeOptions?.temperature,
      })
      textToEmbed = hydeResult.hypotheticalDocument
      yield* Effect.logDebug(
        `HyDE generated ${hydeResult.tokensUsed} tokens ($${hydeResult.cost.toFixed(6)})`,
      )
    } else {
      // Preprocess query for better recall (unless disabled)
      textToEmbed = options.skipPreprocessing ? query : preprocessQuery(query)
    }

    // Embed the query (or hypothetical document)
    const queryResult = yield* wrapEmbedding(
      provider.embed([textToEmbed]),
      currentProviderName,
    )

    const queryVector = queryResult.embeddings[0]
    if (!queryVector) {
      return yield* Effect.fail(
        new EmbeddingError({
          reason: 'Unknown',
          message: 'Failed to generate query embedding',
          provider: currentProviderName,
        }),
      )
    }

    // Search with stats
    const limit = options.limit ?? 10
    const threshold = options.threshold ?? 0

    // Convert quality mode to efSearch value
    const efSearch = options.quality
      ? QUALITY_EF_SEARCH[options.quality]
      : undefined

    // Twice the limit is requested from the store; presumably this leaves
    // headroom for the path filter below - TODO confirm.
    const searchResultWithStats = yield* vectorStore.searchWithStats(
      queryVector,
      limit * 2,
      threshold,
      { efSearch },
    )

    // Apply path filter if specified.
    // NOTE(review): only `.` and `*` are translated; any other regex
    // metacharacter in pathPattern is passed to RegExp unchanged.
    let filteredResults = searchResultWithStats.results
    if (options.pathPattern) {
      const pattern = options.pathPattern
        .replace(/\./g, '\\.')
        .replace(/\*/g, '.*')
      const regex = new RegExp(`^${pattern}$`, 'i')
      filteredResults = searchResultWithStats.results.filter((r) =>
        regex.test(r.documentPath),
      )
    }

    // Apply ranking boost (heading + file importance, enabled by default).
    // The boosted score is capped at 1.
    const applyBoost = options.headingBoost !== false
    const boostedResults = applyBoost
      ? filteredResults.map((r) => ({
          ...r,
          similarity: Math.min(
            1,
            r.similarity +
              calculateHeadingBoost(r.heading, query) +
              calculateFileImportanceBoost(r.documentPath),
          ),
        }))
      : filteredResults

    // Re-sort by boosted similarity. Sort a copy: without boost and without a
    // path filter, `boostedResults` is the very array returned by the vector
    // store, and Array.prototype.sort would reorder it in place.
    const sortedResults = [...boostedResults].sort(
      (a, b) => b.similarity - a.similarity,
    )
    const totalAvailable = sortedResults.length
    const limitedResults = sortedResults.slice(0, limit)

    // Projection to the public result shape, used when no context is attached
    const toPlainResults = (): readonly SemanticSearchResult[] =>
      limitedResults.map((r) => ({
        sectionId: r.sectionId,
        documentPath: r.documentPath,
        heading: r.heading,
        similarity: r.similarity,
      }))

    // If context lines are requested, load section content
    const wantsContext =
      options.contextBefore !== undefined || options.contextAfter !== undefined

    let results: readonly SemanticSearchResult[]
    if (wantsContext) {
      const storage = createStorage(resolvedRoot)
      const sectionIndex = yield* loadSectionIndex(storage)

      // Without a section index, fall back to plain results
      results = sectionIndex
        ? yield* addContextLinesToResults(
            limitedResults,
            sectionIndex,
            resolvedRoot,
            options,
          )
        : toPlainResults()
    } else {
      results = toPlainResults()
    }

    return {
      results,
      belowThresholdCount: searchResultWithStats.belowThresholdCount,
      // Normalise a nullish value from the store to `undefined`
      belowThresholdHighest:
        searchResultWithStats.belowThresholdHighest ?? undefined,
      totalAvailable,
    }
  })
|
|
1094
|
+
|
|
487
1095
|
// ============================================================================
|
|
488
1096
|
// Search with Content
|
|
489
1097
|
// ============================================================================
|
|
490
1098
|
|
|
1099
|
+
/**
|
|
1100
|
+
* Perform semantic search and include section content in results.
|
|
1101
|
+
*
|
|
1102
|
+
* @param rootPath - Root directory containing embeddings
|
|
1103
|
+
* @param query - Natural language search query
|
|
1104
|
+
* @param options - Search options (limit, threshold, path filter)
|
|
1105
|
+
* @returns Ranked list of matching sections with content
|
|
1106
|
+
*
|
|
1107
|
+
* @throws EmbeddingsNotFoundError - No embeddings exist (run index --embed first)
|
|
1108
|
+
* @throws FileReadError - Cannot read index files
|
|
1109
|
+
* @throws IndexCorruptedError - Index files are corrupted
|
|
1110
|
+
* @throws ApiKeyMissingError - API key not set (check provider config)
|
|
1111
|
+
* @throws ApiKeyInvalidError - API key rejected by provider
|
|
1112
|
+
* @throws EmbeddingError - Embedding API failure (rate limit, quota, network)
|
|
1113
|
+
* @throws VectorStoreError - Cannot load or search vector index
|
|
1114
|
+
* @throws DimensionMismatchError - Corpus has different dimensions than current provider
|
|
1115
|
+
*/
|
|
491
1116
|
export const semanticSearchWithContent = (
|
|
492
1117
|
rootPath: string,
|
|
493
1118
|
query: string,
|
|
494
1119
|
options: SemanticSearchOptions = {},
|
|
495
|
-
): Effect.Effect<
|
|
1120
|
+
): Effect.Effect<
|
|
1121
|
+
readonly SemanticSearchResult[],
|
|
1122
|
+
| EmbeddingsNotFoundError
|
|
1123
|
+
| FileReadError
|
|
1124
|
+
| IndexCorruptedError
|
|
1125
|
+
| ApiKeyMissingError
|
|
1126
|
+
| ApiKeyInvalidError
|
|
1127
|
+
| EmbeddingError
|
|
1128
|
+
| VectorStoreError
|
|
1129
|
+
| DimensionMismatchError
|
|
1130
|
+
> =>
|
|
496
1131
|
Effect.gen(function* () {
|
|
497
1132
|
const resolvedRoot = path.resolve(rootPath)
|
|
498
1133
|
const results = yield* semanticSearch(resolvedRoot, query, options)
|
|
@@ -515,23 +1150,35 @@ export const semanticSearchWithContent = (
|
|
|
515
1150
|
|
|
516
1151
|
const filePath = path.join(resolvedRoot, result.documentPath)
|
|
517
1152
|
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
const content
|
|
525
|
-
|
|
526
|
-
.
|
|
1153
|
+
// Note: catchAll is intentional - file read failures during search result
|
|
1154
|
+
// enrichment should skip content loading with a warning, not fail the search.
|
|
1155
|
+
// Results are still returned without content when files can't be read.
|
|
1156
|
+
const fileContentResult = yield* Effect.promise(() =>
|
|
1157
|
+
fs.readFile(filePath, 'utf-8'),
|
|
1158
|
+
).pipe(
|
|
1159
|
+
Effect.map((content) => ({ ok: true as const, content })),
|
|
1160
|
+
Effect.catchAll(() =>
|
|
1161
|
+
Effect.succeed({ ok: false as const, content: '' }),
|
|
1162
|
+
),
|
|
1163
|
+
)
|
|
527
1164
|
|
|
528
|
-
|
|
529
|
-
|
|
530
|
-
content
|
|
531
|
-
|
|
532
|
-
} catch {
|
|
1165
|
+
if (!fileContentResult.ok) {
|
|
1166
|
+
yield* Effect.logWarning(
|
|
1167
|
+
`Skipping content load (cannot read): ${result.documentPath}`,
|
|
1168
|
+
)
|
|
533
1169
|
resultsWithContent.push(result)
|
|
1170
|
+
continue
|
|
534
1171
|
}
|
|
1172
|
+
|
|
1173
|
+
const lines = fileContentResult.content.split('\n')
|
|
1174
|
+
const content = lines
|
|
1175
|
+
.slice(section.startLine - 1, section.endLine)
|
|
1176
|
+
.join('\n')
|
|
1177
|
+
|
|
1178
|
+
resultsWithContent.push({
|
|
1179
|
+
...result,
|
|
1180
|
+
content,
|
|
1181
|
+
})
|
|
535
1182
|
}
|
|
536
1183
|
|
|
537
1184
|
return resultsWithContent
|
|
@@ -545,22 +1192,60 @@ export interface EmbeddingStats {
|
|
|
545
1192
|
readonly hasEmbeddings: boolean
|
|
546
1193
|
readonly count: number
|
|
547
1194
|
readonly provider: string
|
|
1195
|
+
readonly model?: string | undefined
|
|
548
1196
|
readonly dimensions: number
|
|
549
1197
|
readonly totalCost: number
|
|
550
1198
|
readonly totalTokens: number
|
|
551
1199
|
}
|
|
552
1200
|
|
|
1201
|
+
/**
|
|
1202
|
+
* Get statistics about stored embeddings.
|
|
1203
|
+
* Uses the active namespace to find the current embedding index.
|
|
1204
|
+
*
|
|
1205
|
+
* @param rootPath - Root directory containing embeddings
|
|
1206
|
+
* @returns Embedding statistics (count, provider, costs)
|
|
1207
|
+
*
|
|
1208
|
+
* @throws VectorStoreError - Cannot load vector index metadata
|
|
1209
|
+
*/
|
|
553
1210
|
export const getEmbeddingStats = (
|
|
554
1211
|
rootPath: string,
|
|
555
|
-
): Effect.Effect<EmbeddingStats,
|
|
1212
|
+
): Effect.Effect<EmbeddingStats, VectorStoreError> =>
|
|
556
1213
|
Effect.gen(function* () {
|
|
557
1214
|
const resolvedRoot = path.resolve(rootPath)
|
|
558
1215
|
|
|
559
|
-
//
|
|
560
|
-
const
|
|
561
|
-
|
|
1216
|
+
// Get the active namespace to find where embeddings are stored
|
|
1217
|
+
const activeProvider = yield* getActiveNamespace(resolvedRoot).pipe(
|
|
1218
|
+
Effect.catchAll(() => Effect.succeed(null as ActiveProvider | null)),
|
|
1219
|
+
)
|
|
562
1220
|
|
|
563
|
-
if (!
|
|
1221
|
+
if (!activeProvider) {
|
|
1222
|
+
return {
|
|
1223
|
+
hasEmbeddings: false,
|
|
1224
|
+
count: 0,
|
|
1225
|
+
provider: 'none',
|
|
1226
|
+
dimensions: 0,
|
|
1227
|
+
totalCost: 0,
|
|
1228
|
+
totalTokens: 0,
|
|
1229
|
+
}
|
|
1230
|
+
}
|
|
1231
|
+
|
|
1232
|
+
// Load the namespaced vector store to get stats
|
|
1233
|
+
const vectorStore = createNamespacedVectorStore(
|
|
1234
|
+
resolvedRoot,
|
|
1235
|
+
activeProvider.provider,
|
|
1236
|
+
activeProvider.model,
|
|
1237
|
+
activeProvider.dimensions,
|
|
1238
|
+
)
|
|
1239
|
+
|
|
1240
|
+
const loadResult = yield* vectorStore
|
|
1241
|
+
.load()
|
|
1242
|
+
.pipe(
|
|
1243
|
+
Effect.catchAll(() =>
|
|
1244
|
+
Effect.succeed({ loaded: false } as VectorStoreLoadResult),
|
|
1245
|
+
),
|
|
1246
|
+
)
|
|
1247
|
+
|
|
1248
|
+
if (!loadResult.loaded) {
|
|
564
1249
|
return {
|
|
565
1250
|
hasEmbeddings: false,
|
|
566
1251
|
count: 0,
|
|
@@ -572,12 +1257,14 @@ export const getEmbeddingStats = (
|
|
|
572
1257
|
}
|
|
573
1258
|
|
|
574
1259
|
const stats = vectorStore.getStats()
|
|
1260
|
+
|
|
575
1261
|
return {
|
|
576
1262
|
hasEmbeddings: true,
|
|
577
1263
|
count: stats.count,
|
|
578
|
-
provider: stats.provider,
|
|
1264
|
+
provider: stats.provider || 'openai',
|
|
1265
|
+
model: stats.providerModel,
|
|
579
1266
|
dimensions: stats.dimensions,
|
|
580
|
-
totalCost: stats.totalCost,
|
|
581
|
-
totalTokens: stats.totalTokens,
|
|
1267
|
+
totalCost: stats.totalCost || 0,
|
|
1268
|
+
totalTokens: stats.totalTokens || 0,
|
|
582
1269
|
}
|
|
583
1270
|
})
|