mdcontext 0.0.1 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.changeset/README.md +28 -0
- package/.changeset/config.json +11 -0
- package/.claude/settings.local.json +25 -0
- package/.github/workflows/ci.yml +83 -0
- package/.github/workflows/claude-code-review.yml +44 -0
- package/.github/workflows/claude.yml +85 -0
- package/.github/workflows/release.yml +113 -0
- package/.tldrignore +112 -0
- package/BACKLOG.md +338 -0
- package/CONTRIBUTING.md +186 -0
- package/NOTES/NOTES +44 -0
- package/README.md +434 -11
- package/biome.json +36 -0
- package/cspell.config.yaml +14 -0
- package/dist/chunk-23UPXDNL.js +3044 -0
- package/dist/chunk-2W7MO2DL.js +1366 -0
- package/dist/chunk-3NUAZGMA.js +1689 -0
- package/dist/chunk-7TOWB2XB.js +366 -0
- package/dist/chunk-7XOTOADQ.js +3065 -0
- package/dist/chunk-AH2PDM2K.js +3042 -0
- package/dist/chunk-BNXWSZ63.js +3742 -0
- package/dist/chunk-BTL5DJVU.js +3222 -0
- package/dist/chunk-HDHYG7E4.js +104 -0
- package/dist/chunk-HLR4KZBP.js +3234 -0
- package/dist/chunk-IP3FRFEB.js +1045 -0
- package/dist/chunk-KHU56VDO.js +3042 -0
- package/dist/chunk-KRYIFLQR.js +88 -0
- package/dist/chunk-LBSDNLEM.js +287 -0
- package/dist/chunk-MNTQ7HCP.js +2643 -0
- package/dist/chunk-MUJELQQ6.js +1387 -0
- package/dist/chunk-MXJGMSLV.js +2199 -0
- package/dist/chunk-N6QJGC3Z.js +2636 -0
- package/dist/chunk-OBELGBPM.js +1713 -0
- package/dist/chunk-OT7R5XTA.js +3192 -0
- package/dist/chunk-P7X4RA2T.js +106 -0
- package/dist/chunk-PIDUQNC2.js +3185 -0
- package/dist/chunk-POGCDIH4.js +3187 -0
- package/dist/chunk-PSIEOQGZ.js +3043 -0
- package/dist/chunk-PVRT3IHA.js +3238 -0
- package/dist/chunk-QNN4TT23.js +1430 -0
- package/dist/chunk-RE3R45RJ.js +3042 -0
- package/dist/chunk-S7E6TFX6.js +803 -0
- package/dist/chunk-SG6GLU4U.js +1378 -0
- package/dist/chunk-SJCDV2ST.js +274 -0
- package/dist/chunk-SYE5XLF3.js +104 -0
- package/dist/chunk-T5VLYBZD.js +103 -0
- package/dist/chunk-TOQB7VWU.js +3238 -0
- package/dist/chunk-VFNMZ4ZQ.js +3228 -0
- package/dist/chunk-VVTGZNBT.js +1629 -0
- package/dist/chunk-W7Q4RFEV.js +104 -0
- package/dist/chunk-XTYYVRLO.js +3190 -0
- package/dist/chunk-Y6MDYVJD.js +3063 -0
- package/dist/cli/main.d.ts +1 -0
- package/dist/cli/main.js +5458 -0
- package/dist/index.d.ts +653 -0
- package/dist/index.js +79 -0
- package/dist/mcp/server.d.ts +1 -0
- package/dist/mcp/server.js +472 -0
- package/dist/schema-BAWSG7KY.js +22 -0
- package/dist/schema-E3QUPL26.js +20 -0
- package/dist/schema-EHL7WUT6.js +20 -0
- package/docs/019-USAGE.md +625 -0
- package/docs/020-current-implementation.md +364 -0
- package/docs/021-DOGFOODING-FINDINGS.md +175 -0
- package/docs/BACKLOG.md +80 -0
- package/docs/CONFIG.md +1123 -0
- package/docs/DESIGN.md +439 -0
- package/docs/ERRORS.md +383 -0
- package/docs/PROJECT.md +88 -0
- package/docs/ROADMAP.md +407 -0
- package/docs/summarization.md +320 -0
- package/docs/test-links.md +9 -0
- package/justfile +40 -0
- package/package.json +74 -9
- package/pnpm-workspace.yaml +5 -0
- package/research/INDEX.md +315 -0
- package/research/code-review/README.md +90 -0
- package/research/code-review/cli-error-handling-review.md +979 -0
- package/research/code-review/code-review-validation-report.md +464 -0
- package/research/code-review/main-ts-review.md +1128 -0
- package/research/config-analysis/01-current-implementation.md +470 -0
- package/research/config-analysis/02-strategy-recommendation.md +428 -0
- package/research/config-analysis/03-task-candidates.md +715 -0
- package/research/config-analysis/033-research-configuration-management.md +828 -0
- package/research/config-analysis/034-research-effect-cli-config.md +1504 -0
- package/research/config-analysis/04-consolidated-task-candidates.md +277 -0
- package/research/config-docs/SUMMARY.md +357 -0
- package/research/config-docs/TEST-RESULTS.md +776 -0
- package/research/config-docs/TODO.md +542 -0
- package/research/config-docs/analysis.md +744 -0
- package/research/config-docs/fix-validation.md +502 -0
- package/research/config-docs/help-audit.md +264 -0
- package/research/config-docs/help-system-analysis.md +890 -0
- package/research/dogfood/consolidated-tool-evaluation.md +373 -0
- package/research/dogfood/strategy-a/a-synthesis.md +184 -0
- package/research/dogfood/strategy-a/a1-docs.md +226 -0
- package/research/dogfood/strategy-a/a2-amorphic.md +156 -0
- package/research/dogfood/strategy-a/a3-llm.md +164 -0
- package/research/dogfood/strategy-b/b-synthesis.md +228 -0
- package/research/dogfood/strategy-b/b1-architecture.md +207 -0
- package/research/dogfood/strategy-b/b2-gaps.md +258 -0
- package/research/dogfood/strategy-b/b3-workflows.md +250 -0
- package/research/dogfood/strategy-c/c-synthesis.md +451 -0
- package/research/dogfood/strategy-c/c1-explorer.md +192 -0
- package/research/dogfood/strategy-c/c2-diver-memory.md +145 -0
- package/research/dogfood/strategy-c/c3-diver-control.md +148 -0
- package/research/dogfood/strategy-c/c4-diver-failure.md +151 -0
- package/research/dogfood/strategy-c/c5-diver-execution.md +221 -0
- package/research/dogfood/strategy-c/c6-diver-org.md +221 -0
- package/research/effect-cli-error-handling.md +845 -0
- package/research/effect-errors-as-values.md +943 -0
- package/research/errors-task-analysis/00-consolidated-tasks.md +207 -0
- package/research/errors-task-analysis/cli-commands-analysis.md +909 -0
- package/research/errors-task-analysis/embeddings-analysis.md +709 -0
- package/research/errors-task-analysis/index-search-analysis.md +812 -0
- package/research/frontmatter/COMMENTS-ARE-SKIPPED.md +149 -0
- package/research/frontmatter/LLM-CODE-NAVIGATION.md +276 -0
- package/research/issue-review.md +603 -0
- package/research/llm-summarization/agent-cli-tools-2026.md +1082 -0
- package/research/llm-summarization/alternative-providers-2026.md +1428 -0
- package/research/llm-summarization/anthropic-2026.md +367 -0
- package/research/llm-summarization/claude-cli-integration.md +1706 -0
- package/research/llm-summarization/cli-integration-patterns.md +3155 -0
- package/research/llm-summarization/openai-2026.md +473 -0
- package/research/llm-summarization/openai-compatible-providers-2026.md +1022 -0
- package/research/llm-summarization/opencode-cli-integration.md +1552 -0
- package/research/llm-summarization/prompt-engineering-2026.md +1426 -0
- package/research/llm-summarization/prototype-results.md +56 -0
- package/research/llm-summarization/provider-switching-patterns-2026.md +2153 -0
- package/research/llm-summarization/typescript-llm-libraries-2026.md +2436 -0
- package/research/mdcontext-error-analysis.md +521 -0
- package/research/mdcontext-pudding/00-EXECUTIVE-SUMMARY.md +282 -0
- package/research/mdcontext-pudding/01-index-embed.md +956 -0
- package/research/mdcontext-pudding/02-search-COMMANDS.md +142 -0
- package/research/mdcontext-pudding/02-search-SUMMARY.md +146 -0
- package/research/mdcontext-pudding/02-search.md +970 -0
- package/research/mdcontext-pudding/03-context.md +779 -0
- package/research/mdcontext-pudding/04-navigation-and-analytics.md +803 -0
- package/research/mdcontext-pudding/04-tree.md +704 -0
- package/research/mdcontext-pudding/05-config.md +1038 -0
- package/research/mdcontext-pudding/06-links-summary.txt +87 -0
- package/research/mdcontext-pudding/06-links.md +679 -0
- package/research/mdcontext-pudding/07-stats.md +693 -0
- package/research/mdcontext-pudding/BUG-FIX-PLAN.md +388 -0
- package/research/mdcontext-pudding/P0-BUG-VALIDATION.md +167 -0
- package/research/mdcontext-pudding/README.md +168 -0
- package/research/mdcontext-pudding/TESTING-SUMMARY.md +128 -0
- package/research/npm_publish/011-npm-workflow-research-agent2.md +792 -0
- package/research/npm_publish/012-npm-workflow-research-agent1.md +530 -0
- package/research/npm_publish/013-npm-workflow-research-agent3.md +722 -0
- package/research/npm_publish/014-npm-workflow-synthesis.md +556 -0
- package/research/npm_publish/031-npm-workflow-task-analysis.md +134 -0
- package/research/research-quality-review.md +834 -0
- package/research/semantic-search/002-research-embedding-models.md +490 -0
- package/research/semantic-search/003-research-rag-alternatives.md +523 -0
- package/research/semantic-search/004-research-vector-search.md +841 -0
- package/research/semantic-search/032-research-semantic-search.md +427 -0
- package/research/semantic-search/embedding-text-analysis.md +156 -0
- package/research/semantic-search/multi-word-failure-reproduction.md +171 -0
- package/research/semantic-search/query-processing-analysis.md +207 -0
- package/research/semantic-search/root-cause-and-solution.md +114 -0
- package/research/semantic-search/threshold-validation-report.md +69 -0
- package/research/semantic-search/vector-search-analysis.md +63 -0
- package/research/task-management-2026/00-synthesis-recommendations.md +295 -0
- package/research/task-management-2026/01-ai-workflow-tools.md +416 -0
- package/research/task-management-2026/02-agent-framework-patterns.md +476 -0
- package/research/task-management-2026/03-lightweight-file-based.md +567 -0
- package/research/task-management-2026/04-established-tools-ai-features.md +541 -0
- package/research/task-management-2026/linear/01-core-features-workflow.md +771 -0
- package/research/task-management-2026/linear/02-api-integrations.md +930 -0
- package/research/task-management-2026/linear/03-ai-features.md +368 -0
- package/research/task-management-2026/linear/04-pricing-setup.md +205 -0
- package/research/task-management-2026/linear/05-usage-patterns-best-practices.md +605 -0
- package/research/test-path-issues.md +276 -0
- package/review/ALP-76/1-error-type-design.md +962 -0
- package/review/ALP-76/2-error-handling-patterns.md +906 -0
- package/review/ALP-76/3-error-presentation.md +624 -0
- package/review/ALP-76/4-test-coverage.md +625 -0
- package/review/ALP-76/5-migration-completeness.md +440 -0
- package/review/ALP-76/6-effect-best-practices.md +755 -0
- package/scripts/apply-branch-protection.sh +47 -0
- package/scripts/branch-protection-templates.json +79 -0
- package/scripts/prototype-summarization.ts +346 -0
- package/scripts/rebuild-hnswlib.js +58 -0
- package/scripts/setup-branch-protection.sh +64 -0
- package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/active-provider.json +7 -0
- package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/bm25.json +541 -0
- package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/bm25.meta.json +5 -0
- package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/config.json +8 -0
- package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/embeddings/openai_text-embedding-3-small_512/vectors.bin +0 -0
- package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/embeddings/openai_text-embedding-3-small_512/vectors.meta.bin +0 -0
- package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/indexes/documents.json +60 -0
- package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/indexes/links.json +13 -0
- package/src/__tests__/fixtures/semantic-search/multi-word-corpus/.mdcontext/indexes/sections.json +1197 -0
- package/src/__tests__/fixtures/semantic-search/multi-word-corpus/configuration-management.md +99 -0
- package/src/__tests__/fixtures/semantic-search/multi-word-corpus/distributed-systems.md +92 -0
- package/src/__tests__/fixtures/semantic-search/multi-word-corpus/error-handling.md +78 -0
- package/src/__tests__/fixtures/semantic-search/multi-word-corpus/failure-automation.md +55 -0
- package/src/__tests__/fixtures/semantic-search/multi-word-corpus/job-context.md +69 -0
- package/src/__tests__/fixtures/semantic-search/multi-word-corpus/process-orchestration.md +99 -0
- package/src/cli/argv-preprocessor.test.ts +210 -0
- package/src/cli/argv-preprocessor.ts +202 -0
- package/src/cli/cli.test.ts +627 -0
- package/src/cli/commands/backlinks.ts +54 -0
- package/src/cli/commands/config-cmd.ts +642 -0
- package/src/cli/commands/context.ts +285 -0
- package/src/cli/commands/duplicates.ts +122 -0
- package/src/cli/commands/embeddings.ts +529 -0
- package/src/cli/commands/index-cmd.ts +480 -0
- package/src/cli/commands/index.ts +16 -0
- package/src/cli/commands/links.ts +52 -0
- package/src/cli/commands/search.ts +1281 -0
- package/src/cli/commands/stats.ts +149 -0
- package/src/cli/commands/tree.ts +128 -0
- package/src/cli/config-layer.ts +176 -0
- package/src/cli/error-handler.test.ts +235 -0
- package/src/cli/error-handler.ts +655 -0
- package/src/cli/flag-schemas.ts +341 -0
- package/src/cli/help.ts +588 -0
- package/src/cli/index.ts +9 -0
- package/src/cli/main.ts +435 -0
- package/src/cli/options.ts +41 -0
- package/src/cli/shared-error-handling.ts +199 -0
- package/src/cli/typo-suggester.test.ts +105 -0
- package/src/cli/typo-suggester.ts +130 -0
- package/src/cli/utils.ts +259 -0
- package/src/config/file-provider.test.ts +320 -0
- package/src/config/file-provider.ts +273 -0
- package/src/config/index.ts +72 -0
- package/src/config/integration.test.ts +667 -0
- package/src/config/precedence.test.ts +277 -0
- package/src/config/precedence.ts +451 -0
- package/src/config/schema.test.ts +414 -0
- package/src/config/schema.ts +603 -0
- package/src/config/service.test.ts +320 -0
- package/src/config/service.ts +243 -0
- package/src/config/testing.test.ts +264 -0
- package/src/config/testing.ts +110 -0
- package/src/core/index.ts +1 -0
- package/src/core/types.ts +113 -0
- package/src/duplicates/detector.test.ts +183 -0
- package/src/duplicates/detector.ts +414 -0
- package/src/duplicates/index.ts +18 -0
- package/src/embeddings/embedding-namespace.test.ts +300 -0
- package/src/embeddings/embedding-namespace.ts +947 -0
- package/src/embeddings/heading-boost.test.ts +222 -0
- package/src/embeddings/hnsw-build-options.test.ts +198 -0
- package/src/embeddings/hyde.test.ts +272 -0
- package/src/embeddings/hyde.ts +264 -0
- package/src/embeddings/index.ts +10 -0
- package/src/embeddings/openai-provider.ts +414 -0
- package/src/embeddings/pricing.json +22 -0
- package/src/embeddings/provider-constants.ts +204 -0
- package/src/embeddings/provider-errors.test.ts +967 -0
- package/src/embeddings/provider-errors.ts +565 -0
- package/src/embeddings/provider-factory.test.ts +240 -0
- package/src/embeddings/provider-factory.ts +225 -0
- package/src/embeddings/provider-integration.test.ts +788 -0
- package/src/embeddings/query-preprocessing.test.ts +187 -0
- package/src/embeddings/semantic-search-threshold.test.ts +508 -0
- package/src/embeddings/semantic-search.ts +1270 -0
- package/src/embeddings/types.ts +359 -0
- package/src/embeddings/vector-store.ts +708 -0
- package/src/embeddings/voyage-provider.ts +313 -0
- package/src/errors/errors.test.ts +845 -0
- package/src/errors/index.ts +533 -0
- package/src/index/ignore-patterns.test.ts +354 -0
- package/src/index/ignore-patterns.ts +305 -0
- package/src/index/index.ts +4 -0
- package/src/index/indexer.ts +684 -0
- package/src/index/storage.ts +260 -0
- package/src/index/types.ts +147 -0
- package/src/index/watcher.ts +189 -0
- package/src/index.ts +30 -0
- package/src/integration/search-keyword.test.ts +678 -0
- package/src/mcp/server.ts +612 -0
- package/src/parser/index.ts +1 -0
- package/src/parser/parser.test.ts +291 -0
- package/src/parser/parser.ts +394 -0
- package/src/parser/section-filter.test.ts +277 -0
- package/src/parser/section-filter.ts +392 -0
- package/src/search/__tests__/hybrid-search.test.ts +650 -0
- package/src/search/bm25-store.ts +366 -0
- package/src/search/cross-encoder.test.ts +253 -0
- package/src/search/cross-encoder.ts +406 -0
- package/src/search/fuzzy-search.test.ts +419 -0
- package/src/search/fuzzy-search.ts +273 -0
- package/src/search/hybrid-search.ts +448 -0
- package/src/search/path-matcher.test.ts +276 -0
- package/src/search/path-matcher.ts +33 -0
- package/src/search/query-parser.test.ts +260 -0
- package/src/search/query-parser.ts +319 -0
- package/src/search/searcher.test.ts +280 -0
- package/src/search/searcher.ts +724 -0
- package/src/search/wink-bm25.d.ts +30 -0
- package/src/summarization/cli-providers/claude.ts +202 -0
- package/src/summarization/cli-providers/detection.test.ts +273 -0
- package/src/summarization/cli-providers/detection.ts +118 -0
- package/src/summarization/cli-providers/index.ts +8 -0
- package/src/summarization/cost.test.ts +139 -0
- package/src/summarization/cost.ts +102 -0
- package/src/summarization/error-handler.test.ts +127 -0
- package/src/summarization/error-handler.ts +111 -0
- package/src/summarization/index.ts +102 -0
- package/src/summarization/pipeline.test.ts +498 -0
- package/src/summarization/pipeline.ts +231 -0
- package/src/summarization/prompts.test.ts +269 -0
- package/src/summarization/prompts.ts +133 -0
- package/src/summarization/provider-factory.test.ts +396 -0
- package/src/summarization/provider-factory.ts +178 -0
- package/src/summarization/types.ts +184 -0
- package/src/summarize/budget-bugs.test.ts +620 -0
- package/src/summarize/formatters.ts +419 -0
- package/src/summarize/index.ts +20 -0
- package/src/summarize/summarizer.test.ts +275 -0
- package/src/summarize/summarizer.ts +597 -0
- package/src/summarize/verify-bugs.test.ts +238 -0
- package/src/types/huggingface-transformers.d.ts +66 -0
- package/src/utils/index.ts +1 -0
- package/src/utils/tokens.test.ts +142 -0
- package/src/utils/tokens.ts +186 -0
- package/tests/fixtures/cli/.mdcontext/active-provider.json +7 -0
- package/tests/fixtures/cli/.mdcontext/config.json +8 -0
- package/tests/fixtures/cli/.mdcontext/embeddings/openai_text-embedding-3-small_512/vectors.bin +0 -0
- package/tests/fixtures/cli/.mdcontext/embeddings/openai_text-embedding-3-small_512/vectors.meta.bin +0 -0
- package/tests/fixtures/cli/.mdcontext/indexes/documents.json +33 -0
- package/tests/fixtures/cli/.mdcontext/indexes/links.json +12 -0
- package/tests/fixtures/cli/.mdcontext/indexes/sections.json +247 -0
- package/tests/fixtures/cli/README.md +9 -0
- package/tests/fixtures/cli/api-reference.md +11 -0
- package/tests/fixtures/cli/getting-started.md +11 -0
- package/tests/integration/embed-index.test.ts +712 -0
- package/tests/integration/search-context.test.ts +469 -0
- package/tests/integration/search-semantic.test.ts +522 -0
- package/tsconfig.json +26 -0
- package/vitest.config.ts +16 -0
- package/vitest.setup.ts +12 -0
|
@@ -0,0 +1,1270 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Semantic search functionality
|
|
3
|
+
*/
|
|
4
|
+
|
|
5
|
+
import * as fs from 'node:fs/promises'
|
|
6
|
+
import * as path from 'node:path'
|
|
7
|
+
import { Effect } from 'effect'
|
|
8
|
+
import {
|
|
9
|
+
type ApiKeyInvalidError,
|
|
10
|
+
type ApiKeyMissingError,
|
|
11
|
+
DimensionMismatchError,
|
|
12
|
+
EmbeddingError,
|
|
13
|
+
EmbeddingsNotFoundError,
|
|
14
|
+
type FileReadError,
|
|
15
|
+
type IndexCorruptedError,
|
|
16
|
+
IndexNotFoundError,
|
|
17
|
+
type VectorStoreError,
|
|
18
|
+
} from '../errors/index.js'
|
|
19
|
+
import {
|
|
20
|
+
createStorage,
|
|
21
|
+
loadDocumentIndex,
|
|
22
|
+
loadSectionIndex,
|
|
23
|
+
} from '../index/storage.js'
|
|
24
|
+
import type { SectionEntry } from '../index/types.js'
|
|
25
|
+
import {
|
|
26
|
+
type ActiveProvider,
|
|
27
|
+
generateNamespace,
|
|
28
|
+
getActiveNamespace,
|
|
29
|
+
writeActiveProvider,
|
|
30
|
+
} from './embedding-namespace.js'
|
|
31
|
+
import { generateHypotheticalDocument, type HydeResult } from './hyde.js'
|
|
32
|
+
import {
|
|
33
|
+
checkPricingFreshness,
|
|
34
|
+
getPricingDate,
|
|
35
|
+
PRICING_DATA,
|
|
36
|
+
wrapEmbedding,
|
|
37
|
+
} from './openai-provider.js'
|
|
38
|
+
import {
|
|
39
|
+
createEmbeddingProviderDirect,
|
|
40
|
+
type ProviderFactoryConfig,
|
|
41
|
+
} from './provider-factory.js'
|
|
42
|
+
import {
|
|
43
|
+
calculateFileImportanceBoost,
|
|
44
|
+
calculateHeadingBoost,
|
|
45
|
+
type EmbeddingProvider,
|
|
46
|
+
hasProviderMetadata,
|
|
47
|
+
preprocessQuery,
|
|
48
|
+
QUALITY_EF_SEARCH,
|
|
49
|
+
type SemanticSearchOptions,
|
|
50
|
+
type SemanticSearchResult,
|
|
51
|
+
type SemanticSearchResultWithStats,
|
|
52
|
+
type VectorEntry,
|
|
53
|
+
} from './types.js'
|
|
54
|
+
import {
|
|
55
|
+
createNamespacedVectorStore,
|
|
56
|
+
type HnswBuildOptions,
|
|
57
|
+
type HnswMismatchWarning,
|
|
58
|
+
type HnswVectorStore,
|
|
59
|
+
type VectorSearchResult,
|
|
60
|
+
type VectorStoreLoadResult,
|
|
61
|
+
} from './vector-store.js'
|
|
62
|
+
|
|
63
|
+
// ============================================================================
|
|
64
|
+
// HNSW Parameter Warning
|
|
65
|
+
// ============================================================================
|
|
66
|
+
|
|
67
|
+
/**
 * Log a warning when the HNSW parameters in the current config differ from
 * the parameters recorded for the stored index.
 *
 * @param mismatch - Mismatch descriptor, or `undefined` when there is nothing to report
 * @returns An effect that does nothing when `mismatch` is falsy, otherwise one
 *   that logs a single warning naming both parameter sets and the rebuild command
 */
const checkHnswMismatch = (
  mismatch: HnswMismatchWarning | undefined,
): Effect.Effect<void, never, never> => {
  // Nothing to report: hand back the no-op effect.
  if (!mismatch) return Effect.void

  const wanted = mismatch.configParams
  const built = mismatch.indexParams

  const message = [
    `HNSW parameter mismatch: Index was built with M=${built.m}, efConstruction=${built.efConstruction}, `,
    `but config specifies M=${wanted.m}, efConstruction=${wanted.efConstruction}. `,
    `HNSW parameters only affect index construction. Run 'mdcontext index --embed --force' to rebuild with new parameters.`,
  ].join('')

  return Effect.logWarning(message)
}
|
|
85
|
+
|
|
86
|
+
// ============================================================================
|
|
87
|
+
// Embedding Text Generation
|
|
88
|
+
// ============================================================================
|
|
89
|
+
|
|
90
|
+
/**
 * Build the text that is sent to the embedding model for one section.
 *
 * Layout (newline separated): a `# <heading>` line, an optional
 * `Parent section: ...` line, a `Document: ...` line, one blank line, then
 * the section content.
 *
 * @param section - Section entry; only its `heading` is read here
 * @param content - Section body text, appended verbatim
 * @param documentTitle - Title of the document that owns the section
 * @param parentHeading - Heading of the parent section; the line is omitted when this is falsy
 * @returns The assembled embedding text
 */
const generateEmbeddingText = (
  section: SectionEntry,
  content: string,
  documentTitle: string,
  parentHeading?: string | undefined,
): string => {
  const header = [`# ${section.heading}`]
  if (parentHeading) {
    header.push(`Parent section: ${parentHeading}`)
  }
  header.push(`Document: ${documentTitle}`)

  // The empty string yields the blank line between the header and the body.
  return [...header, '', content].join('\n')
}
|
|
108
|
+
|
|
109
|
+
// ============================================================================
|
|
110
|
+
// Cost Estimation
|
|
111
|
+
// ============================================================================
|
|
112
|
+
|
|
113
|
+
// Price per one million tokens used by the cost estimates in this module:
// the `text-embedding-3-small` entry of PRICING_DATA, or 0.02 when that entry
// is null/undefined.
const EMBEDDING_PRICE_PER_MILLION = PRICING_DATA.prices['text-embedding-3-small'] ?? 0.02
|
|
116
|
+
|
|
117
|
+
// Re-export pricing utilities for CLI use
|
|
118
|
+
export { checkPricingFreshness, getPricingDate }
|
|
119
|
+
|
|
120
|
+
/**
 * Per-directory slice of an embedding cost estimate, as produced by
 * `estimateEmbeddingCost`.
 */
export interface DirectoryEstimate {
  /** Directory name, taken from `path.dirname` of the section's document path. */
  readonly directory: string
  /** Number of distinct documents in this directory that contributed sections. */
  readonly fileCount: number
  /** Number of sections counted for this directory. */
  readonly sectionCount: number
  /** Sum of the `tokenCount` of the counted sections. */
  readonly estimatedTokens: number
  /** `estimatedTokens / 1_000_000` multiplied by the per-million-token price. */
  readonly estimatedCost: number
}
|
|
127
|
+
|
|
128
|
+
/**
 * Whole-index embedding cost estimate returned by `estimateEmbeddingCost`.
 */
export interface EmbeddingEstimate {
  /** Sum of the per-directory file counts. */
  readonly totalFiles: number
  /** Sum of the per-directory section counts. */
  readonly totalSections: number
  /** Sum of the per-directory token counts. */
  readonly totalTokens: number
  /** `totalTokens / 1_000_000` multiplied by the per-million-token price. */
  readonly totalCost: number
  /** Rough duration: 1.5 seconds per started group of 100 sections. */
  readonly estimatedTimeSeconds: number
  /** Per-directory breakdown, sorted by directory name. */
  readonly byDirectory: readonly DirectoryEstimate[]
}
|
|
136
|
+
|
|
137
|
+
/**
 * Estimate the cost of generating embeddings for a directory.
 *
 * Sections with fewer than 10 tokens are ignored, as are sections whose
 * document path matches one of the exclude patterns. The remaining sections
 * are grouped by the directory of their document and priced per million
 * tokens.
 *
 * Exclude patterns are simple globs matched against the whole document path:
 * `*` matches any run of characters (including `/`), `?` matches exactly one
 * character, and every other character is matched literally.
 *
 * @param rootPath - Root directory containing indexed markdown files
 * @param options - Optional exclude patterns
 * @returns Estimate with token counts and costs
 *
 * @throws IndexNotFoundError - Index doesn't exist at path
 * @throws FileReadError - Cannot read index files
 * @throws IndexCorruptedError - Index files are corrupted
 */
export const estimateEmbeddingCost = (
  rootPath: string,
  options: { excludePatterns?: readonly string[] | undefined } = {},
): Effect.Effect<
  EmbeddingEstimate,
  IndexNotFoundError | FileReadError | IndexCorruptedError
> =>
  Effect.gen(function* () {
    const resolvedRoot = path.resolve(rootPath)
    const storage = createStorage(resolvedRoot)

    const docIndex = yield* loadDocumentIndex(storage)
    const sectionIndex = yield* loadSectionIndex(storage)

    if (!docIndex || !sectionIndex) {
      return yield* Effect.fail(new IndexNotFoundError({ path: resolvedRoot }))
    }

    // Compile each exclude pattern once, up front, instead of once per section.
    // Regex metacharacters other than the glob wildcards are escaped first so
    // that e.g. the '.' in '*.md' stays literal and patterns containing '+',
    // '(' or '[' cannot produce an invalid RegExp.
    // NOTE(review): if other code paths apply excludePatterns with their own
    // matching rules, keep them in sync with this conversion.
    const excludeMatchers = (options.excludePatterns ?? []).map((pattern) => {
      const escaped = pattern.replace(/[.+^${}()|[\]\\]/g, '\\$&')
      const source = escaped.replace(/\*/g, '.*').replace(/\?/g, '.')
      return new RegExp(`^${source}$`)
    })

    // Group by directory
    const byDir = new Map<
      string,
      { files: Set<string>; sections: number; tokens: number }
    >()

    for (const section of Object.values(sectionIndex.sections)) {
      // Skip very short sections (< 10 tokens)
      if (section.tokenCount < 10) continue

      // Check exclude patterns
      if (excludeMatchers.some((matcher) => matcher.test(section.documentPath))) {
        continue
      }

      const dir = path.dirname(section.documentPath) || '.'
      let entry = byDir.get(dir)
      if (entry === undefined) {
        entry = { files: new Set(), sections: 0, tokens: 0 }
        byDir.set(dir, entry)
      }
      entry.files.add(section.documentPath)
      entry.sections++
      entry.tokens += section.tokenCount
    }

    const directoryEstimates: DirectoryEstimate[] = []
    let totalFiles = 0
    let totalSections = 0
    let totalTokens = 0

    for (const [dir, data] of byDir) {
      directoryEstimates.push({
        directory: dir,
        fileCount: data.files.size,
        sectionCount: data.sections,
        estimatedTokens: data.tokens,
        estimatedCost: (data.tokens / 1_000_000) * EMBEDDING_PRICE_PER_MILLION,
      })
      totalFiles += data.files.size
      totalSections += data.sections
      totalTokens += data.tokens
    }

    // Sort by directory name
    directoryEstimates.sort((a, b) => a.directory.localeCompare(b.directory))

    // Estimate time: ~1.5s per 100 sections (API batch processing)
    const estimatedTimeSeconds = Math.ceil(totalSections / 100) * 1.5

    return {
      totalFiles,
      totalSections,
      totalTokens,
      totalCost: (totalTokens / 1_000_000) * EMBEDDING_PRICE_PER_MILLION,
      estimatedTimeSeconds,
      byDirectory: directoryEstimates,
    }
  })
|
|
230
|
+
|
|
231
|
+
// ============================================================================
|
|
232
|
+
// Build Embeddings
|
|
233
|
+
// ============================================================================
|
|
234
|
+
|
|
235
|
+
/**
 * Payload handed to `BuildEmbeddingsOptions.onFileProgress`.
 *
 * NOTE(review): the code that fills these fields is outside this view; the
 * field notes below are read from the names and should be confirmed there.
 */
export interface FileProgress {
  /** Position of the current file; presumably within `totalFiles` (0- or 1-based: TODO confirm). */
  readonly fileIndex: number
  /** Presumably the number of files being processed in this run. */
  readonly totalFiles: number
  /** Presumably the path of the file currently being processed. */
  readonly filePath: string
  /** Presumably the number of sections in the current file. */
  readonly sectionCount: number
}
|
|
241
|
+
|
|
242
|
+
/**
 * Payload handed to `BuildEmbeddingsOptions.onBatchProgress`.
 *
 * NOTE(review): the code that fills these fields is outside this view; the
 * field notes below are read from the names and should be confirmed there.
 */
export interface EmbeddingBatchProgress {
  /** Position of the current batch; presumably within `totalBatches` (0- or 1-based: TODO confirm). */
  readonly batchIndex: number
  /** Presumably the number of batches in this run. */
  readonly totalBatches: number
  /** Presumably the number of sections embedded so far. */
  readonly processedSections: number
  /** Presumably the number of sections to embed in this run. */
  readonly totalSections: number
}
|
|
248
|
+
|
|
249
|
+
/**
 * Options accepted by `buildEmbeddings`. Every field is optional.
 */
export interface BuildEmbeddingsOptions {
  /** When truthy, existing vectors are not loaded before building. */
  readonly force?: boolean | undefined
  /** Explicit provider instance; takes priority over `providerConfig`. */
  readonly provider?: EmbeddingProvider | undefined
  /** Config used to create a provider when `provider` is absent; defaults to `{ provider: 'openai' }`. */
  readonly providerConfig?: ProviderFactoryConfig | undefined
  /** Patterns for documents to leave out (NOTE(review): matching rules are not visible here). */
  readonly excludePatterns?: readonly string[] | undefined
  /** Callback receiving per-file progress. */
  readonly onFileProgress?: ((progress: FileProgress) => void) | undefined
  /** Callback for batch progress during embedding API calls */
  readonly onBatchProgress?:
    | ((progress: EmbeddingBatchProgress) => void)
    | undefined
  /** HNSW build parameters for vector index construction */
  readonly hnswOptions?: HnswBuildOptions | undefined
}
|
|
262
|
+
|
|
263
|
+
/**
 * Outcome of a `buildEmbeddings` run.
 *
 * NOTE(review): the places that construct this object are outside this view;
 * the field notes below are partly read from the names and should be
 * confirmed there.
 */
export interface BuildEmbeddingsResult {
  /** Presumably the number of sections embedded by this run. */
  readonly sectionsEmbedded: number
  /** Presumably the number of tokens sent to the provider by this run. */
  readonly tokensUsed: number
  /** Presumably the cost of this run (units follow the pricing table: TODO confirm). */
  readonly cost: number
  /** Run duration; the visible cached path derives it from a `Date.now()` difference (milliseconds). */
  readonly duration: number
  /** Presumably the number of files handled by this run. */
  readonly filesProcessed: number
  /** Presumably set when existing embeddings were reused instead of rebuilt. */
  readonly cacheHit?: boolean | undefined
  /** Presumably the number of vectors already present in the store. */
  readonly existingVectors?: number | undefined
  /** Presumably the cost avoided by reusing existing embeddings. */
  readonly estimatedSavings?: number | undefined
}
|
|
273
|
+
|
|
274
|
+
/**
|
|
275
|
+
* Build embeddings for all indexed sections in a directory.
|
|
276
|
+
*
|
|
277
|
+
* @param rootPath - Root directory containing indexed markdown files
|
|
278
|
+
* @param options - Build options (force rebuild, progress callbacks)
|
|
279
|
+
* @returns Result with embedding counts, costs, and timing
|
|
280
|
+
*
|
|
281
|
+
* @throws IndexNotFoundError - Index doesn't exist at path
|
|
282
|
+
* @throws FileReadError - Cannot read index or source files
|
|
283
|
+
* @throws IndexCorruptedError - Index files are corrupted
|
|
284
|
+
* @throws ApiKeyMissingError - API key not set (check provider config)
|
|
285
|
+
* @throws ApiKeyInvalidError - API key rejected by provider
|
|
286
|
+
* @throws EmbeddingError - Embedding API failure (rate limit, quota, network)
|
|
287
|
+
* @throws VectorStoreError - Cannot save vector index
|
|
288
|
+
* @throws DimensionMismatchError - Existing embeddings have different dimensions
|
|
289
|
+
*/
|
|
290
|
+
export const buildEmbeddings = (
  rootPath: string,
  options: BuildEmbeddingsOptions = {},
): Effect.Effect<
  BuildEmbeddingsResult,
  | IndexNotFoundError
  | FileReadError
  | IndexCorruptedError
  | ApiKeyMissingError
  | ApiKeyInvalidError
  | EmbeddingError
  | VectorStoreError
  | DimensionMismatchError
> =>
  Effect.gen(function* () {
    const startTime = Date.now()
    const resolvedRoot = path.resolve(rootPath)
    const storage = createStorage(resolvedRoot)

    // Both the document index and the section index are required.
    const docIndex = yield* loadDocumentIndex(storage)
    const sectionIndex = yield* loadSectionIndex(storage)

    if (!docIndex || !sectionIndex) {
      return yield* Effect.fail(new IndexNotFoundError({ path: resolvedRoot }))
    }

    // Get or create provider - use factory for config-driven provider selection.
    // Priority: explicit provider > providerConfig > default (openai)
    const providerConfig = options.providerConfig ?? { provider: 'openai' }
    const provider =
      options.provider ?? (yield* createEmbeddingProviderDirect(providerConfig))
    const dimensions = provider.dimensions

    // Derive the namespace identity from the actual provider instance so the
    // values are correct even when options.provider is passed explicitly.
    let providerName: string
    let providerModel: string

    if (hasProviderMetadata(provider)) {
      // provider.name has the format "provider:model"; keep the provider part.
      const nameParts = provider.name.split(':')
      providerName = nameParts[0] || 'openai'
      providerModel = provider.model
    } else {
      // Fallback to config values for providers without metadata
      providerName = providerConfig.provider ?? 'openai'
      providerModel = providerConfig.model ?? 'text-embedding-3-small'
    }

    // Create namespaced vector store for this provider/model/dimensions combination
    const vectorStore = createNamespacedVectorStore(
      resolvedRoot,
      providerName,
      providerModel,
      dimensions,
      options.hnswOptions,
    ) as HnswVectorStore

    // Record provider metadata on the store
    if (hasProviderMetadata(provider)) {
      vectorStore.setProvider(provider.name, provider.model, provider.baseURL)
    } else {
      vectorStore.setProvider(providerName, providerModel, undefined)
    }

    // Unless forced, reuse an existing index: any stored vector counts as a
    // cache hit and no embedding work is done.
    if (!options.force) {
      const loadResult = yield* vectorStore.load()
      if (loadResult.loaded) {
        const stats = vectorStore.getStats()
        if (stats.count > 0) {
          const duration = Date.now() - startTime
          // Estimate savings based on the tokens already stored
          const estimatedSavings =
            (stats.totalTokens / 1_000_000) * EMBEDDING_PRICE_PER_MILLION
          return {
            sectionsEmbedded: 0,
            tokensUsed: 0,
            cost: 0,
            duration,
            filesProcessed: 0,
            cacheHit: true,
            existingVectors: stats.count,
            estimatedSavings,
          }
        }
      }
    }

    // Compile the exclude patterns once. Regex metacharacters are escaped so
    // that e.g. "*.md" matches a literal ".md" and characters such as "+" or
    // "(" cannot make RegExp construction throw; "*" matches any run of
    // characters and "?" matches exactly one character.
    const excludeRegexes = (options.excludePatterns ?? []).map((pattern) => {
      const source = pattern
        .replace(/[.+^${}()|[\]\\]/g, '\\$&')
        .replace(/\*/g, '.*')
        .replace(/\?/g, '.')
      return new RegExp(`^${source}$`)
    })
    const isExcluded = (docPath: string): boolean =>
      excludeRegexes.some((regex) => regex.test(docPath))

    // Group sections by document so each file is read only once
    const sectionsByDoc: Map<
      string,
      { section: SectionEntry; parentHeading: string | undefined }[]
    > = new Map()

    for (const section of Object.values(sectionIndex.sections)) {
      const document = docIndex.documents[section.documentPath]
      if (!document) continue

      // Skip very short sections (< 10 tokens)
      if (section.tokenCount < 10) continue

      // Check exclude patterns
      if (isExcluded(section.documentPath)) continue

      // Parent heading: the last section (in byDocument order) that is exactly
      // one level up and starts before this section.
      let parentHeading: string | undefined
      if (section.level > 1) {
        const docSections = sectionIndex.byDocument[document.id] ?? []
        for (const sibId of docSections) {
          const sib = sectionIndex.sections[sibId]
          if (
            sib &&
            sib.level === section.level - 1 &&
            sib.startLine < section.startLine
          ) {
            parentHeading = sib.heading
          }
        }
      }

      const group = sectionsByDoc.get(section.documentPath)
      if (group) {
        group.push({ section, parentHeading })
      } else {
        sectionsByDoc.set(section.documentPath, [{ section, parentHeading }])
      }
    }

    if (sectionsByDoc.size === 0) {
      const duration = Date.now() - startTime
      return {
        sectionsEmbedded: 0,
        tokensUsed: 0,
        cost: 0,
        duration,
        filesProcessed: 0,
      }
    }

    // Prepare sections for embedding by reading file content
    const sectionsToEmbed: { section: SectionEntry; text: string }[] = []
    const docGroups = Array.from(sectionsByDoc)
    let filesProcessed = 0

    for (const [fileIndex, [docPath, sections]] of docGroups.entries()) {
      const document = docIndex.documents[docPath]
      if (!document) continue

      // Report file progress (1-based index)
      if (options.onFileProgress) {
        options.onFileProgress({
          fileIndex: fileIndex + 1,
          totalFiles: docGroups.length,
          filePath: docPath,
          sectionCount: sections.length,
        })
      }

      const filePath = path.join(resolvedRoot, docPath)

      // A file that cannot be read is skipped with a warning rather than
      // aborting the whole build. Effect.tryPromise is required here:
      // Effect.promise would turn a rejected read into a defect, which
      // catchAll does not intercept.
      const fileContentResult = yield* Effect.tryPromise(() =>
        fs.readFile(filePath, 'utf-8'),
      ).pipe(
        Effect.map((content) => ({ ok: true as const, content })),
        Effect.catchAll(() =>
          Effect.succeed({ ok: false as const, content: '' }),
        ),
      )

      if (!fileContentResult.ok) {
        yield* Effect.logWarning(`Skipping file (cannot read): ${docPath}`)
        continue
      }

      filesProcessed++
      const lines = fileContentResult.content.split('\n')

      for (const { section, parentHeading } of sections) {
        // startLine/endLine are used as a 1-based inclusive range
        const content = lines
          .slice(section.startLine - 1, section.endLine)
          .join('\n')

        const text = generateEmbeddingText(
          section,
          content,
          document.title,
          parentHeading,
        )
        sectionsToEmbed.push({ section, text })
      }
    }

    if (sectionsToEmbed.length === 0) {
      const duration = Date.now() - startTime
      return {
        sectionsEmbedded: 0,
        tokensUsed: 0,
        cost: 0,
        duration,
        filesProcessed,
      }
    }

    // Generate embeddings
    const texts = sectionsToEmbed.map((s) => s.text)
    const result = yield* wrapEmbedding(
      provider.embed(texts, {
        onBatchProgress: options.onBatchProgress
          ? (p) =>
              options.onBatchProgress?.({
                batchIndex: p.batchIndex,
                totalBatches: p.totalBatches,
                processedSections: p.processedTexts,
                totalSections: p.totalTexts,
              })
          : undefined,
      }),
      providerConfig.provider ?? 'openai',
    )

    // Pair each section with the embedding at the same position; positions
    // without an embedding are skipped.
    const entries: VectorEntry[] = []
    sectionsToEmbed.forEach(({ section }, i) => {
      const embedding = result.embeddings[i]
      if (!embedding) return

      entries.push({
        id: section.id,
        sectionId: section.id,
        documentPath: section.documentPath,
        heading: section.heading,
        embedding,
      })
    })

    // Add to vector store and persist
    yield* vectorStore.add(entries)
    vectorStore.addCost(result.cost, result.tokensUsed)
    yield* vectorStore.save()

    // Set this namespace as the active provider
    const namespace = generateNamespace(providerName, providerModel, dimensions)
    yield* writeActiveProvider(resolvedRoot, {
      namespace,
      provider: providerName,
      model: providerModel,
      dimensions,
      activatedAt: new Date().toISOString(),
    }).pipe(
      Effect.catchAll((e) => {
        // Don't fail the build if we can't write the active provider file
        console.warn(`Warning: Could not set active provider: ${e.message}`)
        return Effect.succeed(undefined)
      }),
    )

    const duration = Date.now() - startTime

    return {
      sectionsEmbedded: entries.length,
      tokensUsed: result.tokensUsed,
      cost: result.cost,
      duration,
      filesProcessed,
    }
  })
|
|
578
|
+
|
|
579
|
+
// ============================================================================
|
|
580
|
+
// Context Lines Helper
|
|
581
|
+
// ============================================================================
|
|
582
|
+
|
|
583
|
+
/**
 * Add context lines to search results by loading section content from files.
 * This helper is used by both semanticSearch and semanticSearchWithStats to avoid code duplication.
 *
 * Each result whose section is present in `sectionIndex` and whose file can
 * be read gains a `contextLines` array covering the section plus
 * `contextBefore` / `contextAfter` extra lines (both default to 0). Lines
 * inside the section itself are flagged with `isMatch: true`. Results whose
 * section is unknown, or whose file is unreadable or empty, are returned
 * without `contextLines`.
 *
 * @param limitedResults - Search hits to enrich, already sorted and limited
 * @param sectionIndex - Section lookup keyed by section id
 * @param resolvedRoot - Absolute root that document paths are joined onto
 * @param options - Number of extra lines to include before/after the section
 * @returns One result per input hit, in the same order
 */
const addContextLinesToResults = (
  limitedResults: readonly VectorSearchResult[],
  sectionIndex: { sections: Record<string, SectionEntry> },
  resolvedRoot: string,
  options: {
    contextBefore?: number | undefined
    contextAfter?: number | undefined
  },
): Effect.Effect<readonly SemanticSearchResult[], FileReadError, never> =>
  Effect.gen(function* () {
    const contextBefore = options.contextBefore ?? 0
    const contextAfter = options.contextAfter ?? 0

    const resultsWithContext: SemanticSearchResult[] = []
    // Per-call cache of file contents keyed by document path. `null` records
    // a failed read so an unreadable file is attempted only once.
    const fileCache = new Map<string, string | null>()

    for (const r of limitedResults) {
      const plainResult: SemanticSearchResult = {
        sectionId: r.sectionId,
        documentPath: r.documentPath,
        heading: r.heading,
        similarity: r.similarity,
      }

      const section = sectionIndex.sections[r.sectionId]
      if (!section) {
        resultsWithContext.push(plainResult)
        continue
      }

      let fileContent = fileCache.get(r.documentPath)
      if (fileContent === undefined) {
        const filePath = path.join(resolvedRoot, r.documentPath)
        // Effect.tryPromise (not Effect.promise) so that a rejected read is a
        // recoverable failure that catchAll can turn into `null`.
        fileContent = yield* Effect.tryPromise(() =>
          fs.readFile(filePath, 'utf-8'),
        ).pipe(Effect.catchAll(() => Effect.succeed(null as string | null)))
        fileCache.set(r.documentPath, fileContent)
      }

      // Unreadable and empty files both yield a result without context.
      if (!fileContent) {
        resultsWithContext.push(plainResult)
        continue
      }

      const lines = fileContent.split('\n')
      // startLine/endLine are used as a 1-based inclusive range; the window
      // is clamped to the bounds of the file.
      const startIdx = Math.max(0, section.startLine - 1 - contextBefore)
      const endIdx = Math.min(lines.length, section.endLine + contextAfter)

      const contextLines: {
        lineNumber: number
        line: string
        isMatch: boolean
      }[] = []
      for (let i = startIdx; i < endIdx; i++) {
        const line = lines[i]
        if (line !== undefined) {
          contextLines.push({
            lineNumber: i + 1,
            line,
            isMatch: i >= section.startLine - 1 && i < section.endLine,
          })
        }
      }

      resultsWithContext.push({ ...plainResult, contextLines })
    }

    return resultsWithContext
  })
|
|
671
|
+
|
|
672
|
+
// ============================================================================
|
|
673
|
+
// Semantic Search
|
|
674
|
+
// ============================================================================
|
|
675
|
+
|
|
676
|
+
/**
 * Perform semantic search over embedded sections.
 *
 * The query (or a HyDE-generated hypothetical document when `options.hyde`
 * is set) is embedded and matched against the active namespace's vector
 * index. Twice the requested `limit` is fetched so that path filtering and
 * ranking boosts still leave enough candidates; the result is then re-sorted
 * and truncated to `limit`.
 *
 * @param rootPath - Root directory containing embeddings
 * @param query - Natural language search query
 * @param options - Search options (limit, threshold, path filter)
 * @returns Ranked list of matching sections by similarity
 *
 * @throws EmbeddingsNotFoundError - No embeddings exist (run index --embed first)
 * @throws ApiKeyMissingError - API key not set (check provider config)
 * @throws ApiKeyInvalidError - API key rejected by provider
 * @throws EmbeddingError - Embedding API failure (rate limit, quota, network)
 * @throws VectorStoreError - Cannot load or search vector index
 * @throws DimensionMismatchError - Corpus has different dimensions than current provider
 */
export const semanticSearch = (
  rootPath: string,
  query: string,
  options: SemanticSearchOptions = {},
): Effect.Effect<
  readonly SemanticSearchResult[],
  | EmbeddingsNotFoundError
  | FileReadError
  | IndexCorruptedError
  | ApiKeyMissingError
  | ApiKeyInvalidError
  | EmbeddingError
  | VectorStoreError
  | DimensionMismatchError
> =>
  Effect.gen(function* () {
    const resolvedRoot = path.resolve(rootPath)

    // Get active namespace to determine which embedding index to use.
    // Any failure to resolve it is reported as "no embeddings".
    const activeProvider = yield* getActiveNamespace(resolvedRoot).pipe(
      Effect.catchAll(() => Effect.succeed(null as ActiveProvider | null)),
    )

    if (!activeProvider) {
      return yield* Effect.fail(
        new EmbeddingsNotFoundError({ path: resolvedRoot }),
      )
    }

    // Create provider for query embedding
    const provider = yield* createEmbeddingProviderDirect(
      options.providerConfig ?? { provider: 'openai' },
    )
    const dimensions = provider.dimensions

    // Get current provider name for error messages
    const currentProviderName = options.providerConfig?.provider ?? 'openai'

    // Verify dimensions match the active namespace
    if (dimensions !== activeProvider.dimensions) {
      return yield* Effect.fail(
        new DimensionMismatchError({
          corpusDimensions: activeProvider.dimensions,
          providerDimensions: dimensions,
          corpusProvider: `${activeProvider.provider}:${activeProvider.model}`,
          currentProvider: currentProviderName,
          path: resolvedRoot,
        }),
      )
    }

    // Load vector store from the active namespace
    const vectorStore = createNamespacedVectorStore(
      resolvedRoot,
      activeProvider.provider,
      activeProvider.model,
      activeProvider.dimensions,
    )
    const loadResult = yield* vectorStore.load()

    if (!loadResult.loaded) {
      return yield* Effect.fail(
        new EmbeddingsNotFoundError({ path: resolvedRoot }),
      )
    }

    // Check for HNSW parameter mismatch
    yield* checkHnswMismatch(loadResult.hnswMismatch)

    // Determine the text to embed.
    // If HyDE is enabled, generate a hypothetical document first.
    let textToEmbed: string

    if (options.hyde) {
      const hydeResult: HydeResult = yield* generateHypotheticalDocument(
        query,
        {
          model: options.hydeOptions?.model,
          maxTokens: options.hydeOptions?.maxTokens,
          temperature: options.hydeOptions?.temperature,
        },
      )
      textToEmbed = hydeResult.hypotheticalDocument
      yield* Effect.logDebug(
        `HyDE generated ${hydeResult.tokensUsed} tokens ($${hydeResult.cost.toFixed(6)})`,
      )
    } else {
      // Preprocess query for better recall (unless disabled)
      textToEmbed = options.skipPreprocessing ? query : preprocessQuery(query)
    }

    // Embed the query (or hypothetical document)
    const queryResult = yield* wrapEmbedding(
      provider.embed([textToEmbed]),
      currentProviderName,
    )

    const queryVector = queryResult.embeddings[0]
    if (!queryVector) {
      return yield* Effect.fail(
        new EmbeddingError({
          reason: 'Unknown',
          message: 'Failed to generate query embedding',
          provider: currentProviderName,
        }),
      )
    }

    // Search
    const limit = options.limit ?? 10
    const threshold = options.threshold ?? 0

    // Convert quality mode to efSearch value
    const efSearch = options.quality
      ? QUALITY_EF_SEARCH[options.quality]
      : undefined

    // Over-fetch (2x) so filtering and boosting still leave `limit` results.
    const searchResults = yield* vectorStore.search(
      queryVector,
      limit * 2,
      threshold,
      { efSearch },
    )

    // Apply path filter if specified. The pattern is matched against the
    // whole document path, case-insensitively. Regex metacharacters are
    // escaped so paths such as "c++/*" cannot make RegExp construction throw;
    // "*" matches any run of characters and "?" exactly one character.
    let filteredResults = searchResults
    if (options.pathPattern) {
      const pattern = options.pathPattern
        .replace(/[.+^${}()|[\]\\]/g, '\\$&')
        .replace(/\*/g, '.*')
        .replace(/\?/g, '.')
      const regex = new RegExp(`^${pattern}$`, 'i')
      filteredResults = searchResults.filter((r) => regex.test(r.documentPath))
    }

    // Apply ranking boost (heading + file importance, enabled by default).
    // The boosted score is capped at 1.
    const applyBoost = options.headingBoost !== false
    const boostedResults = applyBoost
      ? filteredResults.map((r) => ({
          ...r,
          similarity: Math.min(
            1,
            r.similarity +
              calculateHeadingBoost(r.heading, query) +
              calculateFileImportanceBoost(r.documentPath),
          ),
        }))
      : filteredResults

    // Re-sort by boosted similarity. Sort a copy: without boost or filter,
    // boostedResults is the very array returned by the vector store.
    const sortedResults = [...boostedResults].sort(
      (a, b) => b.similarity - a.similarity,
    )
    const limitedResults = sortedResults.slice(0, limit)

    // Results reduced to the public fields, without context lines
    const plainResults = (): readonly SemanticSearchResult[] =>
      limitedResults.map((r) => ({
        sectionId: r.sectionId,
        documentPath: r.documentPath,
        heading: r.heading,
        similarity: r.similarity,
      }))

    // If context lines are requested, load section content. This needs the
    // section index; when it is unavailable fall back to plain results.
    const wantsContext =
      options.contextBefore !== undefined || options.contextAfter !== undefined
    if (!wantsContext) {
      return plainResults()
    }

    const storage = createStorage(resolvedRoot)
    const sectionIndex = yield* loadSectionIndex(storage)
    if (!sectionIndex) {
      return plainResults()
    }

    return yield* addContextLinesToResults(
      limitedResults,
      sectionIndex,
      resolvedRoot,
      options,
    )
  })
|
|
879
|
+
|
|
880
|
+
/**
 * Perform semantic search with stats about below-threshold results.
 * Use this when you want to provide feedback to users about results that
 * didn't meet the threshold.
 *
 * Behaves like `semanticSearch`, but additionally reports how many
 * candidates fell below the threshold, the highest below-threshold
 * similarity (if any), and `totalAvailable`: the number of results left
 * after path filtering and before truncation to `limit`.
 *
 * @param rootPath - Root directory containing embeddings
 * @param query - Natural language search query
 * @param options - Search options (limit, threshold, path filter)
 * @returns Results with optional below-threshold stats
 *
 * @throws EmbeddingsNotFoundError - No embeddings exist (run index --embed first)
 * @throws ApiKeyMissingError - API key not set (check provider config)
 * @throws ApiKeyInvalidError - API key rejected by provider
 * @throws EmbeddingError - Embedding API failure (rate limit, quota, network)
 * @throws VectorStoreError - Cannot load or search vector index
 * @throws DimensionMismatchError - Corpus has different dimensions than current provider
 */
export const semanticSearchWithStats = (
  rootPath: string,
  query: string,
  options: SemanticSearchOptions = {},
): Effect.Effect<
  SemanticSearchResultWithStats,
  | EmbeddingsNotFoundError
  | FileReadError
  | IndexCorruptedError
  | ApiKeyMissingError
  | ApiKeyInvalidError
  | EmbeddingError
  | VectorStoreError
  | DimensionMismatchError
> =>
  Effect.gen(function* () {
    const resolvedRoot = path.resolve(rootPath)

    // Get active namespace to determine which embedding index to use.
    // Any failure to resolve it is reported as "no embeddings".
    const activeProvider = yield* getActiveNamespace(resolvedRoot).pipe(
      Effect.catchAll(() => Effect.succeed(null as ActiveProvider | null)),
    )

    if (!activeProvider) {
      return yield* Effect.fail(
        new EmbeddingsNotFoundError({ path: resolvedRoot }),
      )
    }

    // Create provider for query embedding
    const provider = yield* createEmbeddingProviderDirect(
      options.providerConfig ?? { provider: 'openai' },
    )
    const dimensions = provider.dimensions

    // Get current provider name for error messages
    const currentProviderName = options.providerConfig?.provider ?? 'openai'

    // Verify dimensions match the active namespace
    if (dimensions !== activeProvider.dimensions) {
      return yield* Effect.fail(
        new DimensionMismatchError({
          corpusDimensions: activeProvider.dimensions,
          providerDimensions: dimensions,
          corpusProvider: `${activeProvider.provider}:${activeProvider.model}`,
          currentProvider: currentProviderName,
          path: resolvedRoot,
        }),
      )
    }

    // Load vector store from the active namespace
    const vectorStore = createNamespacedVectorStore(
      resolvedRoot,
      activeProvider.provider,
      activeProvider.model,
      activeProvider.dimensions,
    )
    const loadResult = yield* vectorStore.load()

    if (!loadResult.loaded) {
      return yield* Effect.fail(
        new EmbeddingsNotFoundError({ path: resolvedRoot }),
      )
    }

    // Check for HNSW parameter mismatch
    yield* checkHnswMismatch(loadResult.hnswMismatch)

    // Determine the text to embed.
    // If HyDE is enabled, generate a hypothetical document first.
    let textToEmbed: string

    if (options.hyde) {
      const hydeResult: HydeResult = yield* generateHypotheticalDocument(
        query,
        {
          model: options.hydeOptions?.model,
          maxTokens: options.hydeOptions?.maxTokens,
          temperature: options.hydeOptions?.temperature,
        },
      )
      textToEmbed = hydeResult.hypotheticalDocument
      yield* Effect.logDebug(
        `HyDE generated ${hydeResult.tokensUsed} tokens ($${hydeResult.cost.toFixed(6)})`,
      )
    } else {
      // Preprocess query for better recall (unless disabled)
      textToEmbed = options.skipPreprocessing ? query : preprocessQuery(query)
    }

    // Embed the query (or hypothetical document)
    const queryResult = yield* wrapEmbedding(
      provider.embed([textToEmbed]),
      currentProviderName,
    )

    const queryVector = queryResult.embeddings[0]
    if (!queryVector) {
      return yield* Effect.fail(
        new EmbeddingError({
          reason: 'Unknown',
          message: 'Failed to generate query embedding',
          provider: currentProviderName,
        }),
      )
    }

    // Search with stats
    const limit = options.limit ?? 10
    const threshold = options.threshold ?? 0

    // Convert quality mode to efSearch value
    const efSearch = options.quality
      ? QUALITY_EF_SEARCH[options.quality]
      : undefined

    // Over-fetch (2x) so filtering and boosting still leave `limit` results.
    const searchResultWithStats = yield* vectorStore.searchWithStats(
      queryVector,
      limit * 2,
      threshold,
      { efSearch },
    )

    // Apply path filter if specified. The pattern is matched against the
    // whole document path, case-insensitively. Regex metacharacters are
    // escaped so paths such as "c++/*" cannot make RegExp construction throw;
    // "*" matches any run of characters and "?" exactly one character.
    let filteredResults = searchResultWithStats.results
    if (options.pathPattern) {
      const pattern = options.pathPattern
        .replace(/[.+^${}()|[\]\\]/g, '\\$&')
        .replace(/\*/g, '.*')
        .replace(/\?/g, '.')
      const regex = new RegExp(`^${pattern}$`, 'i')
      filteredResults = searchResultWithStats.results.filter((r) =>
        regex.test(r.documentPath),
      )
    }

    // Apply ranking boost (heading + file importance, enabled by default).
    // The boosted score is capped at 1.
    const applyBoost = options.headingBoost !== false
    const boostedResults = applyBoost
      ? filteredResults.map((r) => ({
          ...r,
          similarity: Math.min(
            1,
            r.similarity +
              calculateHeadingBoost(r.heading, query) +
              calculateFileImportanceBoost(r.documentPath),
          ),
        }))
      : filteredResults

    // Re-sort by boosted similarity. Sort a copy: without boost or filter,
    // boostedResults is the very array held by searchResultWithStats.
    const sortedResults = [...boostedResults].sort(
      (a, b) => b.similarity - a.similarity,
    )
    const totalAvailable = sortedResults.length
    const limitedResults = sortedResults.slice(0, limit)

    // Results reduced to the public fields, without context lines
    const plainResults = (): readonly SemanticSearchResult[] =>
      limitedResults.map((r) => ({
        sectionId: r.sectionId,
        documentPath: r.documentPath,
        heading: r.heading,
        similarity: r.similarity,
      }))

    // If context lines are requested, load section content. This needs the
    // section index; when it is unavailable fall back to plain results.
    let results: readonly SemanticSearchResult[]
    const wantsContext =
      options.contextBefore !== undefined || options.contextAfter !== undefined
    if (wantsContext) {
      const storage = createStorage(resolvedRoot)
      const sectionIndex = yield* loadSectionIndex(storage)

      results = sectionIndex
        ? yield* addContextLinesToResults(
            limitedResults,
            sectionIndex,
            resolvedRoot,
            options,
          )
        : plainResults()
    } else {
      results = plainResults()
    }

    return {
      results,
      belowThresholdCount: searchResultWithStats.belowThresholdCount,
      belowThresholdHighest:
        searchResultWithStats.belowThresholdHighest ?? undefined,
      totalAvailable,
    }
  })
|
|
1094
|
+
|
|
1095
|
+
// ============================================================================
|
|
1096
|
+
// Search with Content
|
|
1097
|
+
// ============================================================================
|
|
1098
|
+
|
|
1099
|
+
/**
 * Perform semantic search and include section content in results.
 *
 * Runs `semanticSearch` and then attaches each hit's section text (the
 * section's startLine..endLine range of its source file) as `content`.
 * Enrichment is best-effort: when the section index is unavailable, a
 * section is unknown, or a file cannot be read, the affected results are
 * returned unchanged, without `content`.
 *
 * @param rootPath - Root directory containing embeddings
 * @param query - Natural language search query
 * @param options - Search options (limit, threshold, path filter)
 * @returns Ranked list of matching sections with content
 *
 * @throws EmbeddingsNotFoundError - No embeddings exist (run index --embed first)
 * @throws FileReadError - Cannot read index files
 * @throws IndexCorruptedError - Index files are corrupted
 * @throws ApiKeyMissingError - API key not set (check provider config)
 * @throws ApiKeyInvalidError - API key rejected by provider
 * @throws EmbeddingError - Embedding API failure (rate limit, quota, network)
 * @throws VectorStoreError - Cannot load or search vector index
 * @throws DimensionMismatchError - Corpus has different dimensions than current provider
 */
export const semanticSearchWithContent = (
  rootPath: string,
  query: string,
  options: SemanticSearchOptions = {},
): Effect.Effect<
  readonly SemanticSearchResult[],
  | EmbeddingsNotFoundError
  | FileReadError
  | IndexCorruptedError
  | ApiKeyMissingError
  | ApiKeyInvalidError
  | EmbeddingError
  | VectorStoreError
  | DimensionMismatchError
> =>
  Effect.gen(function* () {
    const resolvedRoot = path.resolve(rootPath)
    const results = yield* semanticSearch(resolvedRoot, query, options)

    const storage = createStorage(resolvedRoot)
    const sectionIndex = yield* loadSectionIndex(storage)

    if (!sectionIndex) {
      return results
    }

    const resultsWithContent: SemanticSearchResult[] = []
    // Per-call cache of split file lines keyed by document path, so several
    // hits from the same document read the file once. `null` marks a file
    // that could not be read.
    const linesByDocument = new Map<string, readonly string[] | null>()

    for (const result of results) {
      const section = sectionIndex.sections[result.sectionId]
      if (!section) {
        resultsWithContent.push(result)
        continue
      }

      let lines = linesByDocument.get(result.documentPath)
      if (lines === undefined) {
        const filePath = path.join(resolvedRoot, result.documentPath)

        // A read failure during enrichment skips content loading with a
        // warning instead of failing the search. Effect.tryPromise is
        // required: Effect.promise would turn a rejected read into a defect,
        // which catchAll does not intercept.
        const fileContentResult = yield* Effect.tryPromise(() =>
          fs.readFile(filePath, 'utf-8'),
        ).pipe(
          Effect.map((content) => ({ ok: true as const, content })),
          Effect.catchAll(() =>
            Effect.succeed({ ok: false as const, content: '' }),
          ),
        )

        if (fileContentResult.ok) {
          lines = fileContentResult.content.split('\n')
        } else {
          yield* Effect.logWarning(
            `Skipping content load (cannot read): ${result.documentPath}`,
          )
          lines = null
        }
        linesByDocument.set(result.documentPath, lines)
      }

      if (lines === null) {
        resultsWithContent.push(result)
        continue
      }

      // startLine/endLine are used as a 1-based inclusive range
      const content = lines
        .slice(section.startLine - 1, section.endLine)
        .join('\n')

      resultsWithContent.push({
        ...result,
        content,
      })
    }

    return resultsWithContent
  })
|
|
1186
|
+
|
|
1187
|
+
// ============================================================================
|
|
1188
|
+
// Get Embedding Stats
|
|
1189
|
+
// ============================================================================
|
|
1190
|
+
|
|
1191
|
+
/**
 * Summary of the embeddings stored for a root directory, as produced by
 * `getEmbeddingStats`.
 */
export interface EmbeddingStats {
  /** `true` only when a vector store was found and loaded. */
  readonly hasEmbeddings: boolean
  /** Entry count reported by the vector store; `0` when nothing is loaded. */
  readonly count: number
  /** Embedding provider name; `'none'` when no embeddings are available. */
  readonly provider: string
  /** Provider model name as reported by the vector store, when it has one. */
  readonly model?: string | undefined
  /** Vector dimensionality; `0` when no embeddings are available. */
  readonly dimensions: number
  /** Accumulated embedding cost reported by the store (defaults to `0`). */
  readonly totalCost: number
  /** Accumulated token count reported by the store (defaults to `0`). */
  readonly totalTokens: number
}
|
|
1200
|
+
|
|
1201
|
+
/**
 * Report statistics about the embeddings stored under a root directory.
 * The active namespace decides which embedding index is inspected.
 *
 * A zeroed record with `hasEmbeddings: false` and provider `'none'` is
 * returned when the active namespace cannot be resolved or when the
 * namespaced vector store does not load; failures from both of those steps
 * are caught here rather than propagated.
 *
 * @param rootPath - Root directory containing embeddings
 * @returns Embedding statistics (count, provider, costs)
 *
 * @throws VectorStoreError - Cannot load vector index metadata (declared in
 *   the signature)
 */
export const getEmbeddingStats = (
  rootPath: string,
): Effect.Effect<EmbeddingStats, VectorStoreError> =>
  Effect.gen(function* () {
    const root = path.resolve(rootPath)

    // Builds the "nothing indexed" answer shared by both early exits below.
    const noEmbeddings = (): EmbeddingStats => ({
      hasEmbeddings: false,
      count: 0,
      provider: 'none',
      dimensions: 0,
      totalCost: 0,
      totalTokens: 0,
    })

    // The active namespace tells us where embeddings live; any failure while
    // resolving it is treated the same as having no namespace at all.
    const active = yield* getActiveNamespace(root).pipe(
      Effect.catchAll(() => Effect.succeed(null as ActiveProvider | null)),
    )
    if (!active) {
      return noEmbeddings()
    }

    // Open the vector store that belongs to that namespace.
    const store = createNamespacedVectorStore(
      root,
      active.provider,
      active.model,
      active.dimensions,
    )

    // A failed load is downgraded to "not loaded" instead of failing the call.
    const loadOutcome = yield* store
      .load()
      .pipe(
        Effect.catchAll(() =>
          Effect.succeed({ loaded: false } as VectorStoreLoadResult),
        ),
      )
    if (!loadOutcome.loaded) {
      return noEmbeddings()
    }

    const {
      count,
      provider,
      providerModel,
      dimensions,
      totalCost,
      totalTokens,
    } = store.getStats()

    return {
      hasEmbeddings: true,
      count,
      // Stores that report no provider are labelled 'openai'.
      provider: provider || 'openai',
      model: providerModel,
      dimensions,
      totalCost: totalCost || 0,
      totalTokens: totalTokens || 0,
    }
  })
|