scythe-context-mcp 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +21 -0
- package/CHANGELOG.md +27 -0
- package/LICENSE +201 -0
- package/README.en.md +197 -0
- package/README.md +197 -0
- package/README.zh-CN.md +197 -0
- package/dist/config.js +61 -0
- package/dist/index.js +16 -0
- package/dist/indexing/binary.js +15 -0
- package/dist/indexing/chunker.js +64 -0
- package/dist/indexing/contextPack.js +54 -0
- package/dist/indexing/defaults.js +6 -0
- package/dist/indexing/dryRun.js +48 -0
- package/dist/indexing/embeddingWriter.js +102 -0
- package/dist/indexing/hash.js +4 -0
- package/dist/indexing/hybridSearch.js +67 -0
- package/dist/indexing/indexStatus.js +224 -0
- package/dist/indexing/indexWriter.js +106 -0
- package/dist/indexing/keywordSearch.js +86 -0
- package/dist/indexing/relatedFiles.js +137 -0
- package/dist/indexing/relatedSnippets.js +105 -0
- package/dist/indexing/resultFormat.js +69 -0
- package/dist/indexing/scanner.js +123 -0
- package/dist/indexing/semanticSearch.js +48 -0
- package/dist/indexing/symbolGraph.js +121 -0
- package/dist/indexing/types.js +1 -0
- package/dist/providers/gemini.js +149 -0
- package/dist/providers/types.js +1 -0
- package/dist/storage/schema.js +187 -0
- package/dist/storage/sqliteVec.js +17 -0
- package/dist/tools/registerTools.js +364 -0
- package/docs/architecture.md +280 -0
- package/docs/codex-integration.md +114 -0
- package/docs/development-plan.md +218 -0
- package/docs/gemini-compatibility.md +214 -0
- package/docs/tech-stack.md +122 -0
- package/package.json +58 -0
package/README.zh-CN.md
ADDED
|
@@ -0,0 +1,197 @@
|
|
|
1
|
+
# Scythe Context MCP
|
|
2
|
+
|
|
3
|
+
[CI](https://github.com/Lianye-Scythe/scythe-context-mcp/actions/workflows/ci.yml)
|
|
4
|
+
[License](LICENSE)
|
|
5
|
+
[npm package](package.json)
|
|
6
|
+
|
|
7
|
+
[繁體中文](README.md) | [English](README.en.md) | [简体中文](README.zh-CN.md)
|
|
8
|
+
|
|
9
|
+
Scythe Context MCP 是给 Codex App / Codex CLI 使用的本地代码上下文引擎。它在 repo 内建立 SQLite/sqlite-vec 索引,结合语义搜索、关键字搜索、符号/依赖关系与 context packing,让 Codex 更快拿到可操作的文件、行号、片段与相关路径。
|
|
10
|
+
|
|
11
|
+
## 为什么用它
|
|
12
|
+
|
|
13
|
+
- **本地优先**:metadata、FTS 与向量索引都存在 repo 内的 `.scythe-context/`。
|
|
14
|
+
- **混合搜索**:结合 Gemini embeddings、SQLite FTS5、path/symbol ranking,避免只靠单一召回方式。
|
|
15
|
+
- **Codex 友好输出**:返回 line ranges、snippets、match reasons、grep keywords、related files 与 suggested paths。
|
|
16
|
+
- **可接自己的 provider**:支持官方 Gemini API,也支持第三方 Gemini-compatible v1beta proxy。
|
|
17
|
+
- **可诊断**:内置 provider probe、index freshness、embedding coverage 与可修复建议。
|
|
18
|
+
|
|
19
|
+
隐私提醒:只有在执行 embedding 相关功能时,query 或 chunk text 才会发送到你配置的 Gemini-compatible endpoint。第三方 proxy 应视为可以看到这些文字。
|
|
20
|
+
|
|
21
|
+
## 功能状态
|
|
22
|
+
|
|
23
|
+
已完成:
|
|
24
|
+
|
|
25
|
+
- repo 扫描、binary/large-file skip、chunking
|
|
26
|
+
- SQLite metadata、SQLite FTS5、sqlite-vec 向量索引
|
|
27
|
+
- Gemini Embedding 2 provider 与 batch fallback
|
|
28
|
+
- semantic / keyword / hybrid search
|
|
29
|
+
- 轻量 symbol/dependency graph
|
|
30
|
+
- related-file lookup、bounded multi-hop traversal
|
|
31
|
+
- `repo_context_pack` context budgeting 与 related snippet packing
|
|
32
|
+
- provider diagnostics 与 index freshness diagnostics
|
|
33
|
+
|
|
34
|
+
下一步:
|
|
35
|
+
|
|
36
|
+
- provider capability cache
|
|
37
|
+
- 更完整的安装/原生依赖 doctor
|
|
38
|
+
- embedding 失败时的 keyword-only fallback
|
|
39
|
+
- 必要时加入 tree-sitter symbol extraction
|
|
40
|
+
|
|
41
|
+
## 安装
|
|
42
|
+
|
|
43
|
+
### 从 npm 安装
|
|
44
|
+
|
|
45
|
+
使用 npm 安装全局命令:
|
|
46
|
+
|
|
47
|
+
```bash
|
|
48
|
+
npm install -g scythe-context-mcp
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
### 从源码安装
|
|
52
|
+
|
|
53
|
+
```bash
|
|
54
|
+
git clone https://github.com/Lianye-Scythe/scythe-context-mcp.git
|
|
55
|
+
cd scythe-context-mcp
|
|
56
|
+
npm install
|
|
57
|
+
cp .env.example .env
|
|
58
|
+
npm run build
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
Runtime 目标是 Node.js 24 LTS。Node 26 可能可用,但在进入 LTS 前不作为主要验收基准。
|
|
62
|
+
|
|
63
|
+
旧项目名 `repo-beacon-mcp` 已改为 `scythe-context-mcp`。旧的 `REPO_BEACON_*` 环境变量仍作为 fallback 兼容,但新配置应改用 `SCYTHE_CONTEXT_*`。
|
|
64
|
+
|
|
65
|
+
## Codex 配置
|
|
66
|
+
|
|
67
|
+
### npm binary
|
|
68
|
+
|
|
69
|
+
如果已用 npm 全局安装:
|
|
70
|
+
|
|
71
|
+
```toml
|
|
72
|
+
[mcp_servers.scythe_context]
|
|
73
|
+
command = "scythe-context-mcp"
|
|
74
|
+
enabled = true
|
|
75
|
+
required = false
|
|
76
|
+
startup_timeout_sec = 20
|
|
77
|
+
tool_timeout_sec = 120
|
|
78
|
+
env_vars = ["GEMINI_API_KEY"]
|
|
79
|
+
enabled_tools = [
|
|
80
|
+
"repo_index_status",
|
|
81
|
+
"repo_reindex",
|
|
82
|
+
"repo_context_pack",
|
|
83
|
+
"repo_semantic_search",
|
|
84
|
+
"repo_related_files",
|
|
85
|
+
"gemini_embedding_probe"
|
|
86
|
+
]
|
|
87
|
+
|
|
88
|
+
[mcp_servers.scythe_context.env]
|
|
89
|
+
GEMINI_OUTPUT_DIMENSIONALITY = "1536"
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
### 本地 checkout
|
|
93
|
+
|
|
94
|
+
如果从源码执行:
|
|
95
|
+
|
|
96
|
+
```toml
|
|
97
|
+
[mcp_servers.scythe_context]
|
|
98
|
+
command = "node"
|
|
99
|
+
args = ["/path/to/scythe-context-mcp/dist/index.js"]
|
|
100
|
+
cwd = "/path/to/scythe-context-mcp"
|
|
101
|
+
enabled = true
|
|
102
|
+
required = false
|
|
103
|
+
startup_timeout_sec = 20
|
|
104
|
+
tool_timeout_sec = 120
|
|
105
|
+
env_vars = ["GEMINI_API_KEY"]
|
|
106
|
+
|
|
107
|
+
[mcp_servers.scythe_context.env]
|
|
108
|
+
GEMINI_OUTPUT_DIMENSIONALITY = "1536"
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
### 第三方 v1beta proxy
|
|
112
|
+
|
|
113
|
+
```toml
|
|
114
|
+
[mcp_servers.scythe_context.env]
|
|
115
|
+
GEMINI_BASE_URL = "https://your-proxy.example.com/v1beta"
|
|
116
|
+
GEMINI_AUTH_MODE = "bearer"
|
|
117
|
+
GEMINI_OUTPUT_DIMENSIONALITY = "1536"
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
支持的 auth mode:
|
|
121
|
+
|
|
122
|
+
- `x-goog-api-key`
|
|
123
|
+
- `bearer`
|
|
124
|
+
- `query`
|
|
125
|
+
|
|
126
|
+
启动 Codex 前在 shell 或系统环境中设置 `GEMINI_API_KEY`,避免把 key 写进会同步或会提交的配置文件。
|
|
127
|
+
|
|
128
|
+
## 常用工作流
|
|
129
|
+
|
|
130
|
+
1. 先检查索引状态:
|
|
131
|
+
|
|
132
|
+
```text
|
|
133
|
+
repo_index_status
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
2. 如果 metadata 不存在或 freshness 显示 stale:
|
|
137
|
+
|
|
138
|
+
```text
|
|
139
|
+
repo_reindex({ "dry_run": false })
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
3. 需要语义搜索或 context pack 时,再建立 embeddings:
|
|
143
|
+
|
|
144
|
+
```text
|
|
145
|
+
repo_reindex({ "dry_run": false, "index_embeddings": true })
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
4. 让 Codex 针对任务拿上下文:
|
|
149
|
+
|
|
150
|
+
```text
|
|
151
|
+
repo_context_pack({ "query": "where is auth token validation handled?" })
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
5. 对某个命中文件展开 imports / reverse imports:
|
|
155
|
+
|
|
156
|
+
```text
|
|
157
|
+
repo_related_files({ "path": "src/server/auth.ts" })
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
## MCP 工具
|
|
161
|
+
|
|
162
|
+
| Tool | 用途 |
|
|
163
|
+
| --- | --- |
|
|
164
|
+
| `repo_index_status` | 查看 index path、metadata/embedding coverage、freshness diagnostics 与建议动作。 |
|
|
165
|
+
| `repo_reindex` | 扫描项目并写入 metadata;设置 `index_embeddings=true` 时才会调用 embedding provider。 |
|
|
166
|
+
| `repo_context_pack` | 针对任务查询打包 primary snippets、match reasons、related files 与 suggested paths。 |
|
|
167
|
+
| `repo_semantic_search` | 对已索引 chunks 做 hybrid 或 semantic search,适合排查 ranking。 |
|
|
168
|
+
| `repo_related_files` | 查看单一文件的 symbols、imports、importedBy。 |
|
|
169
|
+
| `gemini_embedding_probe` | 测试 Gemini 或 proxy 兼容性,返回 endpoint、latency、错误分类与可修复建议。 |
|
|
170
|
+
|
|
171
|
+
## 隐私与本地文件
|
|
172
|
+
|
|
173
|
+
- `.scythe-context/`: 默认索引目录,不提交。
|
|
174
|
+
- `.repo-beacon/`: 旧索引目录名称,仍被 ignore。
|
|
175
|
+
- `local/`: 私密 API 测试文件、参考 HTML、截图等本地资料,不提交。
|
|
176
|
+
- `.env`: 本地配置,不提交。
|
|
177
|
+
|
|
178
|
+
不要把 API key、proxy token、私有代码片段或 index database 放进 issue、PR 或公开 logs。
|
|
179
|
+
|
|
180
|
+
## 文档
|
|
181
|
+
|
|
182
|
+
- [架构设计](docs/architecture.md)
|
|
183
|
+
- [开发计划](docs/development-plan.md)
|
|
184
|
+
- [Gemini 兼容性](docs/gemini-compatibility.md)
|
|
185
|
+
- [技术栈](docs/tech-stack.md)
|
|
186
|
+
- [Codex 集成审查](docs/codex-integration.md)
|
|
187
|
+
|
|
188
|
+
## 开发与发布检查
|
|
189
|
+
|
|
190
|
+
```bash
|
|
191
|
+
npm test
|
|
192
|
+
npm run build
|
|
193
|
+
npm audit --omit=dev
|
|
194
|
+
npm pack --dry-run
|
|
195
|
+
```
|
|
196
|
+
|
|
197
|
+
确认 package 不包含 `.env`、`.scythe-context/`、`.repo-beacon/`、`local/`、API key 或私密参考文件。
|
package/dist/config.js
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
import path from "node:path";
|
|
2
|
+
import process from "node:process";
|
|
3
|
+
import { DEFAULT_INDEXING_LIMITS } from "./indexing/defaults.js";
|
|
4
|
+
/**
 * Reads a strictly positive, finite number from an environment variable.
 *
 * @param {string} name - Environment variable to read.
 * @param {number} fallback - Returned when the variable is unset or empty.
 * @returns {number} The parsed value, or `fallback` when nothing is configured.
 * @throws {Error} When the variable is set but is not a finite number greater than zero.
 */
function numberFromEnv(name, fallback) {
    const raw = process.env[name];
    if (raw) {
        const numeric = Number(raw);
        const isPositiveFinite = Number.isFinite(numeric) && numeric > 0;
        if (isPositiveFinite) {
            return numeric;
        }
        throw new Error(`${name} must be a positive number`);
    }
    // Unset and empty-string values both mean "not configured".
    return fallback;
}
|
|
14
|
+
/**
 * Looks up an environment variable, falling back to a legacy variable name.
 *
 * @param {string} name - Preferred variable name.
 * @param {string} [legacyName] - Older variable name consulted when the preferred one is unset or empty.
 * @returns {string | undefined} The preferred value when non-empty, otherwise the legacy value (if a legacy name was given).
 */
function envValue(name, legacyName) {
    const current = process.env[name];
    if (current) {
        return current;
    }
    if (legacyName) {
        return process.env[legacyName];
    }
    return undefined;
}
|
|
17
|
+
/**
 * Reads a strictly positive, finite number from an environment variable that
 * also has a legacy alias (resolved through `envValue`).
 *
 * @param {string} name - Preferred variable name.
 * @param {string} legacyName - Legacy variable name used as a fallback source.
 * @param {number} fallback - Returned when neither variable yields a non-empty value.
 * @returns {number} The parsed value, or `fallback`.
 * @throws {Error} When a value is present but is not a finite number greater than zero.
 */
function numberFromEnvAlias(name, legacyName, fallback) {
    const raw = envValue(name, legacyName);
    if (raw) {
        const numeric = Number(raw);
        const isPositiveFinite = Number.isFinite(numeric) && numeric > 0;
        if (isPositiveFinite) {
            return numeric;
        }
        throw new Error(`${name} or ${legacyName} must be a positive number`);
    }
    return fallback;
}
|
|
27
|
+
/**
 * Validates the configured Gemini authentication mode.
 *
 * @param {string | undefined} value - Raw `GEMINI_AUTH_MODE` value.
 * @returns {"x-goog-api-key" | "bearer" | "query"} The validated mode; `"x-goog-api-key"` when `value` is unset or empty.
 * @throws {Error} When `value` is set to anything other than the three supported modes.
 */
function authModeFromEnv(value) {
    if (!value) {
        return "x-goog-api-key";
    }
    switch (value) {
        case "x-goog-api-key":
        case "bearer":
        case "query":
            return value;
        default:
            throw new Error("GEMINI_AUTH_MODE must be one of: x-goog-api-key, bearer, query");
    }
}
|
|
35
|
+
/**
 * Builds the runtime configuration object from environment variables.
 *
 * `SCYTHE_CONTEXT_*` variables take precedence; the matching `REPO_BEACON_*`
 * names are consulted as legacy fallbacks. Numeric limits must be positive
 * finite numbers. The `GEMINI_*` variables have no legacy alias.
 *
 * @returns {{
 *   defaultProjectPath: string,
 *   indexDirName: string,
 *   indexing: {
 *     maxFileBytes: number,
 *     targetChunkChars: number,
 *     chunkOverlapChars: number,
 *     maxChunksPerFile: number,
 *     embeddingBatchSize: number,
 *     maxEmbeddingChunks: number,
 *   },
 *   gemini: {
 *     apiKey: string | undefined,
 *     baseUrl: string,
 *     model: string,
 *     outputDimensionality: number,
 *     authMode: string,
 *     apiKeyHeader: string,
 *     apiKeyQueryParam: string,
 *   },
 * }} The resolved configuration.
 * @throws {Error} When a numeric variable is not a positive number, or `GEMINI_AUTH_MODE` is unsupported.
 */
export function loadConfig() {
    return {
        // Resolved to an absolute path; defaults to the process working directory.
        defaultProjectPath: path.resolve(envValue("SCYTHE_CONTEXT_DEFAULT_PROJECT", "REPO_BEACON_DEFAULT_PROJECT") || process.cwd()),
        indexDirName: envValue("SCYTHE_CONTEXT_INDEX_DIR", "REPO_BEACON_INDEX_DIR") || ".scythe-context",
        // numberFromEnvAlias already returns its third argument when nothing is
        // configured, so no additional `??` fallback is needed here.
        indexing: {
            maxFileBytes: numberFromEnvAlias("SCYTHE_CONTEXT_MAX_FILE_BYTES", "REPO_BEACON_MAX_FILE_BYTES", DEFAULT_INDEXING_LIMITS.maxFileBytes),
            targetChunkChars: numberFromEnvAlias("SCYTHE_CONTEXT_TARGET_CHUNK_CHARS", "REPO_BEACON_TARGET_CHUNK_CHARS", DEFAULT_INDEXING_LIMITS.targetChunkChars),
            chunkOverlapChars: numberFromEnvAlias("SCYTHE_CONTEXT_CHUNK_OVERLAP_CHARS", "REPO_BEACON_CHUNK_OVERLAP_CHARS", DEFAULT_INDEXING_LIMITS.chunkOverlapChars),
            maxChunksPerFile: numberFromEnvAlias("SCYTHE_CONTEXT_MAX_CHUNKS_PER_FILE", "REPO_BEACON_MAX_CHUNKS_PER_FILE", DEFAULT_INDEXING_LIMITS.maxChunksPerFile),
            embeddingBatchSize: numberFromEnvAlias("SCYTHE_CONTEXT_EMBEDDING_BATCH_SIZE", "REPO_BEACON_EMBEDDING_BATCH_SIZE", 16),
            maxEmbeddingChunks: numberFromEnvAlias("SCYTHE_CONTEXT_MAX_EMBEDDING_CHUNKS", "REPO_BEACON_MAX_EMBEDDING_CHUNKS", 256),
        },
        gemini: {
            // Passed through as-is; may be undefined when no key is configured.
            apiKey: process.env.GEMINI_API_KEY,
            baseUrl: process.env.GEMINI_BASE_URL || "https://generativelanguage.googleapis.com/v1beta",
            model: process.env.GEMINI_MODEL || "gemini-embedding-2",
            outputDimensionality: numberFromEnv("GEMINI_OUTPUT_DIMENSIONALITY", 1536),
            authMode: authModeFromEnv(process.env.GEMINI_AUTH_MODE),
            apiKeyHeader: process.env.GEMINI_API_KEY_HEADER || "x-goog-api-key",
            apiKeyQueryParam: process.env.GEMINI_API_KEY_QUERY_PARAM || "key",
        },
    };
}
|
package/dist/index.js
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
import "dotenv/config";
|
|
3
|
+
import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
|
|
4
|
+
import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
|
|
5
|
+
import { loadConfig } from "./config.js";
|
|
6
|
+
import { registerTools } from "./tools/registerTools.js";
|
|
7
|
+
// Guidance text handed to McpServer through its `instructions` option.
const SERVER_INSTRUCTIONS = "Scythe Context is a local code-context MCP server for Codex. Use repo_index_status first, repo_reindex(dry_run=false) when metadata is missing or stale, and repo_context_pack for task lookup. Only set index_embeddings=true when semantic vectors are needed because chunk text is sent to the configured embedding endpoint. Keep max_context_chars and max_related_context_chars bounded. Use repo_semantic_search for ranking/debugging and repo_related_files for a focused file graph.";
// Resolve configuration before constructing the server so that invalid
// environment values throw before anything else runs.
const runtimeConfig = loadConfig();
const serverInfo = {
    name: "scythe-context-mcp",
    version: "0.1.0",
};
const mcpServer = new McpServer(serverInfo, { instructions: SERVER_INSTRUCTIONS });
registerTools(mcpServer, runtimeConfig);
// Connect the server over a stdio transport.
await mcpServer.connect(new StdioServerTransport());
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
import { TextDecoder } from "node:util";
|
|
2
|
+
// Strict decoder: with `fatal: true`, decode() throws on malformed UTF-8
// instead of substituting replacement characters.
const utf8Decoder = new TextDecoder("utf-8", { fatal: true });
/**
 * Heuristically decides whether a byte buffer holds binary (non-text) data.
 *
 * An empty buffer is treated as text. Any buffer containing a zero byte, or
 * one that cannot be decoded as strict UTF-8, is treated as binary.
 *
 * @param {Buffer | Uint8Array} buffer - Raw bytes to inspect.
 * @returns {boolean} `true` when the content looks binary, `false` otherwise.
 */
export function isProbablyBinary(buffer) {
    const isEmpty = buffer.length === 0;
    if (isEmpty) {
        return false;
    }
    const hasNulByte = buffer.includes(0);
    if (hasNulByte) {
        return true;
    }
    let decodesAsUtf8 = true;
    try {
        utf8Decoder.decode(buffer);
    }
    catch {
        decodesAsUtf8 = false;
    }
    return !decodesAsUtf8;
}
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
import { sha256Hex } from "./hash.js";
|
|
2
|
+
const MAX_LINE_EXTENSION_CHARS = 240;
|
|
3
|
+
/**
 * Finds the index just past the end of the line that contains `startIndex`.
 *
 * @param {string} text - Text to scan.
 * @param {number} startIndex - Position to start searching for a newline.
 * @returns {number} Index of the character after the next `"\n"`, or `text.length` when there is none.
 */
function lineEndIndex(text, startIndex) {
    const newlineAt = text.indexOf("\n", startIndex);
    if (newlineAt === -1) {
        return text.length;
    }
    return newlineAt + 1;
}
|
|
7
|
+
/**
 * Picks where a chunk should end, preferring a line boundary near `targetEnd`.
 *
 * @param {string} text - Full text being chunked.
 * @param {number} targetEnd - Desired end offset for the chunk.
 * @returns {number} `text.length` when the target reaches the end of the text;
 *   otherwise the offset just past the next newline if that newline lies within
 *   `MAX_LINE_EXTENSION_CHARS` of `targetEnd`; otherwise `targetEnd` itself.
 */
function chooseChunkEnd(text, targetEnd) {
    const totalLength = text.length;
    if (targetEnd >= totalLength) {
        return totalLength;
    }
    const newlineAt = text.indexOf("\n", targetEnd);
    const extension = newlineAt - targetEnd;
    // Only stretch the chunk when the line ends reasonably soon.
    const canExtendToLineEnd = newlineAt !== -1 && extension <= MAX_LINE_EXTENSION_CHARS;
    return canExtendToLineEnd ? newlineAt + 1 : targetEnd;
}
|
|
16
|
+
/**
 * Counts the `"\n"` characters in a string.
 *
 * @param {string} text - Text to inspect.
 * @returns {number} Number of newline characters (0 for an empty string).
 */
function countNewlines(text) {
    let total = 0;
    for (let at = text.indexOf("\n"); at !== -1; at = text.indexOf("\n", at + 1)) {
        total += 1;
    }
    return total;
}
|
|
19
|
+
/**
 * Computes where the next chunk should start so that it overlaps the tail of
 * the current chunk, aligned to a line start when possible.
 *
 * @param {string} text - Full text being chunked.
 * @param {number} chunkStart - Start offset of the current chunk.
 * @param {number} chunkEnd - End offset (exclusive) of the current chunk.
 * @param {number} overlapChars - Desired overlap size; values <= 0 disable overlap.
 * @returns {number} Start offset for the next chunk.
 */
function findOverlapStart(text, chunkStart, chunkEnd, overlapChars) {
    if (overlapChars <= 0) {
        return chunkEnd;
    }
    // Never reach back before the current chunk's own start.
    const roughStart = Math.max(chunkStart, chunkEnd - overlapChars);
    const alignedStart = lineEndIndex(text, roughStart);
    if (alignedStart > chunkEnd) {
        // No line boundary inside the chunk tail: fall back to the raw offset.
        return roughStart;
    }
    return alignedStart;
}
|
|
26
|
+
/**
 * Splits file text into overlapping chunks annotated with 1-based line ranges.
 *
 * Empty text yields a single empty chunk covering line 1. Otherwise chunks are
 * produced until the text is exhausted or `limits.maxChunksPerFile` is reached.
 *
 * @param {string} relativePath - File path recorded on every chunk and mixed into its hash.
 * @param {string} text - File contents.
 * @param {{ targetChunkChars: number, chunkOverlapChars: number, maxChunksPerFile: number }} limits - Chunking limits.
 * @returns {Array<{ relativePath: string, startLine: number, endLine: number, text: string, hash: string }>} The chunks in file order.
 */
export function chunkText(relativePath, text, limits) {
    if (text.length === 0) {
        const emptyChunk = {
            relativePath,
            startLine: 1,
            endLine: 1,
            text: "",
            hash: sha256Hex(`${relativePath}\n1\n1\n`),
        };
        return [emptyChunk];
    }
    const { targetChunkChars, chunkOverlapChars, maxChunksPerFile } = limits;
    const chunks = [];
    let cursor = 0;
    let cursorLine = 1;
    while (cursor < text.length && chunks.length < maxChunksPerFile) {
        const hardLimit = Math.min(text.length, cursor + targetChunkChars);
        const preferredEnd = chooseChunkEnd(text, hardLimit);
        // Guard against a non-advancing end offset.
        const sliceEnd = preferredEnd <= cursor ? hardLimit : preferredEnd;
        const body = text.slice(cursor, sliceEnd);
        // A chunk that does not end with a newline still occupies its last, partial line.
        const trailingPartialLine = body.endsWith("\n") ? 0 : 1;
        const lineSpan = Math.max(1, countNewlines(body) + trailingPartialLine);
        const lastLine = cursorLine + lineSpan - 1;
        chunks.push({
            relativePath,
            startLine: cursorLine,
            endLine: lastLine,
            text: body,
            hash: sha256Hex(`${relativePath}\n${cursorLine}\n${lastLine}\n${body}`),
        });
        if (sliceEnd >= text.length) {
            break;
        }
        const overlapStart = findOverlapStart(text, cursor, sliceEnd, chunkOverlapChars);
        // If the overlap would not move forward, continue from the chunk end instead.
        const nextCursor = overlapStart <= cursor ? sliceEnd : overlapStart;
        cursorLine += countNewlines(text.slice(cursor, nextCursor));
        cursor = nextCursor;
    }
    return chunks;
}
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
import { classifyRelatedPath } from "./relatedFiles.js";
|
|
2
|
+
import { formatSearchResults, } from "./resultFormat.js";
|
|
3
|
+
/**
 * Appends `value` to `values` unless it is falsy or already present.
 *
 * @param {Array<*>} values - Array mutated in place.
 * @param {*} value - Candidate value.
 * @returns {void}
 */
function addUnique(values, value) {
    const shouldAppend = Boolean(value) && !values.includes(value);
    if (shouldAppend) {
        values.push(value);
    }
}
|
|
7
|
+
/**
 * Produces a trimmed view of a related-file record.
 *
 * @param {object} related - Record with `path`, `symbols`, `imports`, `importedBy`, and optionally `depth`/`via`.
 * @param {number} maxRelatedItems - Maximum entries kept in each of `symbols`, `imports` and `importedBy`.
 * @returns {object} Object with `sourcePath`, `role`, optional `depth`/`via`, and the truncated lists.
 */
function compactRelatedFile(related, maxRelatedItems) {
    const compact = {
        sourcePath: related.path,
        role: classifyRelatedPath(related.path),
    };
    // Traversal details are copied only when the record carries a `depth` key.
    if ("depth" in related) {
        compact.depth = related.depth;
        compact.via = related.via;
    }
    compact.symbols = related.symbols.slice(0, maxRelatedItems);
    compact.imports = related.imports.slice(0, maxRelatedItems);
    compact.importedBy = related.importedBy.slice(0, maxRelatedItems);
    return compact;
}
|
|
17
|
+
/**
 * Assembles a context pack: formatted primary results, compacted related
 * files, related snippets, and a de-duplicated list of suggested paths.
 *
 * @param {string} query - Query forwarded to `formatSearchResults`.
 * @param {Array<object>} searchResults - Raw search hits to format.
 * @param {Array<object>} relatedFiles - Related-file records; only the first `options.maxRelatedFiles` are used.
 * @param {{ maxContextChars: number, maxRelatedFiles: number, maxRelatedItems: number, relatedSnippets?: { snippets: Array<object>, summary: object } }} options - Packing options.
 * @returns {{ primaryResults: Array<object>, relatedFiles: Array<object>, relatedSnippets: Array<object>, suggestedPaths: string[], context: object }} The assembled pack.
 */
export function buildContextPack(query, searchResults, relatedFiles, options) {
    const { results: primaryResults, summary: primarySummary } = formatSearchResults(query, searchResults, { maxContextChars: options.maxContextChars });
    // Empty snippet pack used when the caller supplied none.
    const snippetPack = options.relatedSnippets ??
        {
            snippets: [],
            summary: {
                maxRelatedContextChars: 0,
                usedRelatedContextChars: 0,
                relatedSnippetCount: 0,
                truncatedRelatedSnippets: 0,
            },
        };
    const compactedRelated = relatedFiles
        .slice(0, options.maxRelatedFiles)
        .map((item) => compactRelatedFile(item, options.maxRelatedItems));
    // Primary hit paths come first, then imports and importers of related files.
    const suggestedPaths = [];
    for (const { path } of primaryResults) {
        addUnique(suggestedPaths, path);
    }
    for (const { imports, importedBy } of compactedRelated) {
        for (const { resolvedPath } of imports) {
            addUnique(suggestedPaths, resolvedPath);
        }
        for (const { path } of importedBy) {
            addUnique(suggestedPaths, path);
        }
    }
    return {
        primaryResults,
        relatedFiles: compactedRelated,
        relatedSnippets: snippetPack.snippets,
        suggestedPaths,
        context: {
            ...primarySummary,
            ...snippetPack.summary,
            primaryResultCount: primaryResults.length,
            relatedFileCount: compactedRelated.length,
        },
    };
}
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
import fs from "node:fs/promises";
|
|
2
|
+
import path from "node:path";
|
|
3
|
+
import { chunkText } from "./chunker.js";
|
|
4
|
+
import { DEFAULT_INDEXING_LIMITS } from "./defaults.js";
|
|
5
|
+
import { sha256Hex } from "./hash.js";
|
|
6
|
+
import { scanProject } from "./scanner.js";
|
|
7
|
+
/**
 * Resolves the four indexing limits, using `DEFAULT_INDEXING_LIMITS` for any
 * that are `null` or `undefined` on `options`.
 *
 * @param {{ maxFileBytes?: number, targetChunkChars?: number, chunkOverlapChars?: number, maxChunksPerFile?: number }} options - Caller overrides.
 * @returns {{ maxFileBytes: number, targetChunkChars: number, chunkOverlapChars: number, maxChunksPerFile: number }} The effective limits.
 */
export function resolveIndexingLimits(options) {
    const limitKeys = ["maxFileBytes", "targetChunkChars", "chunkOverlapChars", "maxChunksPerFile"];
    const resolved = {};
    for (const key of limitKeys) {
        resolved[key] = options[key] ?? DEFAULT_INDEXING_LIMITS[key];
    }
    return resolved;
}
|
|
15
|
+
/**
 * Scans a project and chunks every indexable file, returning the resulting
 * statistics. This function itself writes nothing.
 *
 * @param {{ projectPath: string, maxFileBytes?: number, targetChunkChars?: number, chunkOverlapChars?: number, maxChunksPerFile?: number }} options - Project path plus optional limit overrides.
 * @returns {Promise<{ projectPath: string, dryRun: true, limits: object, stats: { scannedFiles: number, indexedFiles: number, skippedFiles: number, chunks: number, bytes: number }, files: Array<{ path: string, size: number, hash: string, chunks: number }>, skipped: Array<object> }>} Dry-run report.
 */
export async function reindexDryRun(options) {
    const projectPath = path.resolve(options.projectPath);
    const limits = resolveIndexingLimits(options);
    const { files: indexable, skipped } = await scanProject(projectPath, limits);
    const files = [];
    const totals = { chunks: 0, bytes: 0 };
    // Files are read one at a time, in scan order.
    for (const { absolutePath, relativePath, size } of indexable) {
        const content = await fs.readFile(absolutePath, "utf8");
        const chunkCount = chunkText(relativePath, content, limits).length;
        totals.chunks += chunkCount;
        totals.bytes += size;
        files.push({
            path: relativePath,
            size,
            hash: sha256Hex(content),
            chunks: chunkCount,
        });
    }
    return {
        projectPath,
        dryRun: true,
        limits,
        stats: {
            scannedFiles: indexable.length + skipped.length,
            indexedFiles: indexable.length,
            skippedFiles: skipped.length,
            chunks: totals.chunks,
            bytes: totals.bytes,
        },
        files,
        skipped,
    };
}
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
import Database from "better-sqlite3";
|
|
2
|
+
import { sha256Hex } from "./hash.js";
|
|
3
|
+
import { getOrCreateEmbeddingRecord, getOrCreateEmbeddingSet, initializeStorageSchema, vectorTableName, } from "../storage/schema.js";
|
|
4
|
+
import { vectorToFloat32Buffer } from "../storage/sqliteVec.js";
|
|
5
|
+
/**
 * Returns a display title for a chunk.
 *
 * @param {{ title?: string, path: string, startLine: number, endLine: number }} chunk - Chunk row.
 * @returns {string} The chunk's own title when non-empty, otherwise `"<path>:<startLine>-<endLine>"`.
 */
function chunkTitle(chunk) {
    if (chunk.title) {
        return chunk.title;
    }
    const { path, startLine, endLine } = chunk;
    return `${path}:${startLine}-${endLine}`;
}
|
|
8
|
+
/**
 * Converts a chunk row into the request object handed to the embedding provider.
 *
 * @param {{ title?: string, path: string, startLine: number, endLine: number, text: string }} chunk - Chunk row.
 * @returns {{ kind: "document", title: string, text: string }} Embedding request for the chunk.
 */
function toEmbeddingRequest(chunk) {
    const title = chunkTitle(chunk);
    const { text } = chunk;
    return { kind: "document", title, text };
}
|
|
15
|
+
/**
 * Splits an array into consecutive batches of at most `batchSize` items.
 *
 * @template T
 * @param {T[]} items - Items to split; not mutated.
 * @param {number} batchSize - Maximum batch length; must be a positive integer.
 * @returns {T[][]} Batches in original order (empty when `items` is empty).
 * @throws {RangeError} When `batchSize` is not a positive integer. Without this
 *   guard a size of 0 would never advance the loop index and spin forever.
 */
function chunkArray(items, batchSize) {
    if (!Number.isInteger(batchSize) || batchSize <= 0) {
        throw new RangeError("batchSize must be a positive integer");
    }
    const batches = [];
    for (let index = 0; index < items.length; index += batchSize) {
        batches.push(items.slice(index, index + batchSize));
    }
    return batches;
}
|
|
22
|
+
/**
 * Embeds a list of requests, preferring the provider's batch call and falling
 * back to one-by-one embedding when the batch call is unusable.
 *
 * The batch path is used only when it resolves to an array with exactly one
 * result per request. Callers index results by request position, so a short
 * or malformed batch response is treated the same as a batch failure.
 *
 * @param {{ embedBatch: (requests: object[]) => Promise<object[]>, embed: (request: object) => Promise<object> }} provider - Embedding provider.
 * @param {object[]} requests - Embedding requests, in order.
 * @returns {Promise<{ results: object[], usedFallback: boolean }>} One result per request, plus whether the per-item fallback was used.
 * @throws Whatever `provider.embed` throws if the per-item fallback also fails.
 */
async function embedWithFallback(provider, requests) {
    try {
        const batchResults = await provider.embedBatch(requests);
        if (Array.isArray(batchResults) && batchResults.length === requests.length) {
            return { results: batchResults, usedFallback: false };
        }
    }
    catch {
        // Deliberate best effort: any batch failure falls through to the
        // sequential per-item path below.
    }
    const results = [];
    for (const request of requests) {
        results.push(await provider.embed(request));
    }
    return { results, usedFallback: true };
}
|
|
34
|
+
/**
 * Embeds every chunk that has no embedding row for the current embedding set
 * and stores the resulting vectors.
 *
 * Pending chunks are selected in `files.path`, `chunks.start_line` order
 * (optionally capped by `options.maxChunks`), embedded in batches of
 * `options.batchSize`, and written one chunk per transaction. The database
 * handle is always closed before returning or throwing.
 *
 * @param {object} options
 * @param {string} options.dbPath - Path of the SQLite database to open.
 * @param {number} options.batchSize - Chunks per provider call; must be a positive integer.
 * @param {number} options.dimensions - Expected vector dimensionality.
 * @param {number} [options.maxChunks] - Optional cap on chunks processed; a falsy value means no cap.
 * @param {string} options.providerName - Provider name recorded on the embedding set.
 * @param {string} options.providerBaseUrl - Provider base URL; only its hash is recorded.
 * @param {string} options.model - Model name recorded on the embedding set.
 * @param {object} options.provider - Provider passed to `embedWithFallback`.
 * @returns {Promise<{ status: "embeddings_indexed", dbPath: string, embeddingSetId: *, dimensions: number, stats: { pendingChunks: number, embeddedChunks: number, skippedChunks: number, batches: number, batchFallbacks: number } }>} Summary of the run.
 * @throws {Error} When `batchSize` is invalid or a returned vector has the wrong dimensionality.
 */
export async function indexMissingEmbeddings(options) {
    const batchSizeIsValid = Number.isInteger(options.batchSize) && options.batchSize > 0;
    if (!batchSizeIsValid) {
        throw new Error("batchSize must be a positive integer");
    }
    const db = new Database(options.dbPath);
    const stats = {
        pendingChunks: 0,
        embeddedChunks: 0,
        skippedChunks: 0,
        batches: 0,
        batchFallbacks: 0,
    };
    try {
        const { dimensions } = options;
        initializeStorageSchema(db, { vectorDimensions: dimensions });
        const embeddingSetId = getOrCreateEmbeddingSet(db, {
            provider: options.providerName,
            baseUrlHash: sha256Hex(options.providerBaseUrl),
            model: options.model,
            dimensions,
        });
        // The limit clause is added only when a cap was requested.
        const limitClause = options.maxChunks ? "limit @maxChunks" : "";
        const selectPending = db.prepare(`
select chunks.id, files.path, chunks.start_line as startLine, chunks.end_line as endLine,
chunks.title, chunks.text
from chunks
join files on files.id = chunks.file_id
left join embeddings
on embeddings.chunk_id = chunks.id
and embeddings.embedding_set_id = @embeddingSetId
where embeddings.id is null
order by files.path, chunks.start_line
${limitClause}
`);
        const pending = selectPending.all({ embeddingSetId, maxChunks: options.maxChunks });
        stats.pendingChunks = pending.length;
        const vectorTable = vectorTableName(dimensions);
        const insertVector = db.prepare(`insert or replace into ${vectorTable}(rowid, embedding) values (?, ?)`);
        // The embedding record and its vector row are written in one transaction.
        const persistEmbedding = db.transaction((chunkId, vector) => {
            const embeddingId = getOrCreateEmbeddingRecord(db, { chunkId, embeddingSetId });
            insertVector.run(BigInt(embeddingId), vectorToFloat32Buffer(vector, dimensions));
        });
        for (const batch of chunkArray(pending, options.batchSize)) {
            stats.batches += 1;
            const requests = batch.map((chunk) => toEmbeddingRequest(chunk));
            const { results, usedFallback } = await embedWithFallback(options.provider, requests);
            if (usedFallback) {
                stats.batchFallbacks += 1;
            }
            for (const [position, chunk] of batch.entries()) {
                const embedded = results[position];
                if (embedded.dimensions !== dimensions) {
                    throw new Error(`Embedding dimensions mismatch for chunk ${chunk.id}: expected ${dimensions}, got ${embedded.dimensions}`);
                }
                persistEmbedding(chunk.id, embedded.vector);
                stats.embeddedChunks += 1;
            }
        }
        return {
            status: "embeddings_indexed",
            dbPath: options.dbPath,
            embeddingSetId,
            dimensions,
            stats,
        };
    }
    finally {
        db.close();
    }
}
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
import { searchByKeyword } from "./keywordSearch.js";
|
|
2
|
+
import { searchByVector } from "./semanticSearch.js";
|
|
3
|
+
/**
 * Builds the identity key used to match the same chunk across result lists.
 *
 * @param {{ path: string, startLine: number, endLine: number }} result - Search hit.
 * @returns {string} `"<path>:<startLine>:<endLine>"`.
 */
function keyOf(result) {
    const { path, startLine, endLine } = result;
    return `${path}:${startLine}:${endLine}`;
}
|
|
6
|
+
/**
 * Scores a semantic hit from its vector distance and its rank in the list.
 *
 * @param {{ distance: number }} result - Semantic hit; negative distances are clamped to 0.
 * @param {number} index - Zero-based rank of the hit.
 * @returns {number} `1 / (1 + distance)` plus a rank bonus that starts at 0.2,
 *   drops by 0.01 per rank, and never goes below 0.
 */
function semanticScore(result, index) {
    const clampedDistance = Math.max(0, result.distance);
    const closeness = 1 / (1 + clampedDistance);
    const rankBonus = Math.max(0, 0.2 - index * 0.01);
    return closeness + rankBonus;
}
|
|
9
|
+
/**
 * Scores a keyword hit from its rank and the magnitude of its raw score.
 *
 * @param {{ score: number }} result - Keyword hit; only the absolute value of `score` is used.
 * @param {number} index - Zero-based rank of the hit.
 * @returns {number} A base of 1.2, plus a rank bonus (0.3 minus 0.02 per rank,
 *   floored at 0), plus `|score| / 10` capped at 0.5.
 */
function keywordScore(result, index) {
    const base = 1.2;
    const rankBonus = Math.max(0, 0.3 - index * 0.02);
    const strengthBonus = Math.min(0.5, Math.abs(result.score) / 10);
    return base + rankBonus + strengthBonus;
}
|
|
12
|
+
/**
 * Merges semantic and keyword hits into a single ranked list.
 *
 * Hits are matched by `keyOf` (path plus line range). A chunk found by both
 * searches gets the sum of both scores, records both match types, and keeps
 * the longer of the two snippets. The output is sorted by descending score,
 * then path, then start line, and truncated to `maxResults`.
 *
 * @param {Array<{ path: string, startLine: number, endLine: number, distance: number, snippet: string }>} semanticResults - Vector-search hits in rank order.
 * @param {Array<{ path: string, startLine: number, endLine: number, score: number, snippet: string }>} keywordResults - Keyword-search hits in rank order.
 * @param {number} maxResults - Maximum number of merged results to return.
 * @returns {Array<object>} Merged hits with `score`, `matchTypes`, and `distance` and/or `keywordScore`.
 */
export function mergeHybridResults(semanticResults, keywordResults, maxResults) {
    const byChunk = new Map();
    for (const [rank, hit] of semanticResults.entries()) {
        byChunk.set(keyOf(hit), {
            path: hit.path,
            startLine: hit.startLine,
            endLine: hit.endLine,
            score: semanticScore(hit, rank),
            distance: hit.distance,
            snippet: hit.snippet,
            matchTypes: ["semantic"],
        });
    }
    for (const [rank, hit] of keywordResults.entries()) {
        const key = keyOf(hit);
        const contribution = keywordScore(hit, rank);
        const previous = byChunk.get(key);
        if (!previous) {
            byChunk.set(key, {
                path: hit.path,
                startLine: hit.startLine,
                endLine: hit.endLine,
                score: contribution,
                keywordScore: hit.score,
                snippet: hit.snippet,
                matchTypes: ["keyword"],
            });
            continue;
        }
        // Same chunk already seen: combine the evidence.
        previous.score += contribution;
        previous.keywordScore = hit.score;
        if (!previous.matchTypes.includes("keyword")) {
            previous.matchTypes.push("keyword");
        }
        if (hit.snippet.length > previous.snippet.length) {
            previous.snippet = hit.snippet;
        }
    }
    const byScoreThenLocation = (a, b) => {
        const scoreDelta = b.score - a.score;
        if (scoreDelta) {
            return scoreDelta;
        }
        const pathOrder = a.path.localeCompare(b.path);
        if (pathOrder) {
            return pathOrder;
        }
        return a.startLine - b.startLine;
    };
    const ranked = Array.from(byChunk.values());
    ranked.sort(byScoreThenLocation);
    return ranked.slice(0, maxResults);
}
|
|
52
|
+
/**
 * Runs the vector search and the keyword search against the same index and
 * merges their results.
 *
 * Each underlying search is asked for `Math.max(maxResults * 2, maxResults)`
 * candidates; the merged list is then cut down to `maxResults`.
 *
 * @param {{ dbPath: string, dimensions: number, queryVector: *, query: string, maxResults: number, maxSnippetChars: number }} options - Search options.
 * @returns {Array<object>} Merged results as produced by `mergeHybridResults`.
 */
export function searchHybrid(options) {
    const { dbPath, maxResults, maxSnippetChars } = options;
    const candidateLimit = Math.max(maxResults * 2, maxResults);
    const semanticResults = searchByVector({
        dbPath,
        dimensions: options.dimensions,
        queryVector: options.queryVector,
        maxResults: candidateLimit,
        maxSnippetChars,
    });
    const keywordResults = searchByKeyword({
        dbPath,
        query: options.query,
        maxResults: candidateLimit,
        maxSnippetChars,
    });
    return mergeHybridResults(semanticResults, keywordResults, maxResults);
}
|