@tobilu/qmd 1.1.1 → 1.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +51 -0
- package/README.md +29 -3
- package/dist/collections.d.ts +1 -0
- package/dist/llm.d.ts +19 -5
- package/dist/llm.js +98 -50
- package/dist/mcp.js +80 -9
- package/dist/qmd.js +103 -34
- package/dist/store.d.ts +44 -9
- package/dist/store.js +148 -16
- package/package.json +4 -3
package/CHANGELOG.md
CHANGED
|
@@ -2,6 +2,57 @@
|
|
|
2
2
|
|
|
3
3
|
## [Unreleased]
|
|
4
4
|
|
|
5
|
+
## [1.1.2] - 2026-03-07
|
|
6
|
+
|
|
7
|
+
13 community PRs merged. GPU initialization replaced with node-llama-cpp's
|
|
8
|
+
built-in `autoAttempt` — deleting ~220 lines of manual fallback code and
|
|
9
|
+
fixing GPU issues reported across 10+ PRs in one shot. Reranking is faster
|
|
10
|
+
through chunk deduplication and a parallelism cap that prevents VRAM
|
|
11
|
+
exhaustion.
|
|
12
|
+
|
|
13
|
+
### Changes
|
|
14
|
+
|
|
15
|
+
- **GPU init**: use node-llama-cpp's `build: "autoAttempt"` instead of manual
|
|
16
|
+
GPU backend detection. Automatically tries Metal/CUDA/Vulkan and falls back
|
|
17
|
+
gracefully. #310 (thanks @giladgd — the node-llama-cpp author)
|
|
18
|
+
- **Query `--explain`**: `qmd query --explain` exposes retrieval score traces
|
|
19
|
+
— backend scores, per-list RRF contributions, top-rank bonus, reranker
|
|
20
|
+
score, and final blended score. Works in JSON and CLI output. #242
|
|
21
|
+
(thanks @vyalamar)
|
|
22
|
+
- **Collection ignore patterns**: `ignore: ["Sessions/**", "*.tmp"]` in
|
|
23
|
+
collection config to exclude files from indexing. #304 (thanks @sebkouba)
|
|
24
|
+
- **Multilingual embeddings**: `QMD_EMBED_MODEL` env var lets you swap in
|
|
25
|
+
models like Qwen3-Embedding for non-English collections. #273 (thanks
|
|
26
|
+
@daocoding)
|
|
27
|
+
- **Configurable expansion context**: `QMD_EXPAND_CONTEXT_SIZE` env var
|
|
28
|
+
(default 2048) — previously used the model's full 40960-token window,
|
|
29
|
+
wasting VRAM. #313 (thanks @0xble)
|
|
30
|
+
- **`candidateLimit` exposed**: `-C` / `--candidate-limit` flag and MCP
|
|
31
|
+
parameter to tune how many candidates reach the reranker. #255 (thanks
|
|
32
|
+
@pandysp)
|
|
33
|
+
- **MCP multi-session**: HTTP transport now supports multiple concurrent
|
|
34
|
+
client sessions, each with its own server instance. #286 (thanks @joelev)
|
|
35
|
+
|
|
36
|
+
### Fixes
|
|
37
|
+
|
|
38
|
+
- **Reranking performance**: cap parallel rerank contexts at 4 to prevent
|
|
39
|
+
VRAM exhaustion on high-core machines. Deduplicate identical chunk texts
|
|
40
|
+
before reranking — same content from different files now shares a single
|
|
41
|
+
reranker call. Cache scores by content hash instead of file path.
|
|
42
|
+
- Deactivate stale docs when all files are removed from a collection and
|
|
43
|
+
`qmd update` is run. #312 (thanks @0xble)
|
|
44
|
+
- Handle emoji-only filenames (`🐘.md` → `1f418.md`) instead of crashing.
|
|
45
|
+
#308 (thanks @debugerman)
|
|
46
|
+
- Skip unreadable files during indexing (e.g. iCloud-evicted files returning
|
|
47
|
+
EAGAIN) instead of crashing. #253 (thanks @jimmynail)
|
|
48
|
+
- Suppress progress bar escape sequences when stderr is not a TTY. #230
|
|
49
|
+
(thanks @dgilperez)
|
|
50
|
+
- Emit format-appropriate empty output (`[]` for JSON, CSV header for CSV,
|
|
51
|
+
etc.) instead of plain text "No results." #228 (thanks @amsminn)
|
|
52
|
+
- Correct Windows sqlite-vec package name (`sqlite-vec-windows-x64`) and add
|
|
53
|
+
`sqlite-vec-linux-arm64`. #225 (thanks @ilepn)
|
|
54
|
+
- Fix claude plugin setup CLI commands in README. #311 (thanks @gi11es)
|
|
55
|
+
|
|
5
56
|
## [1.1.1] - 2026-03-06
|
|
6
57
|
|
|
7
58
|
### Fixes
|
package/README.md
CHANGED
|
@@ -97,8 +97,8 @@ Although the tool works perfectly fine when you just tell your agent to use it o
|
|
|
97
97
|
**Claude Code** — Install the plugin (recommended):
|
|
98
98
|
|
|
99
99
|
```bash
|
|
100
|
-
claude marketplace add tobi/qmd
|
|
101
|
-
claude plugin
|
|
100
|
+
claude plugin marketplace add tobi/qmd
|
|
101
|
+
claude plugin install qmd@qmd
|
|
102
102
|
```
|
|
103
103
|
|
|
104
104
|
Or configure MCP manually in `~/.claude/settings.json`:
|
|
@@ -252,12 +252,34 @@ QMD uses three local GGUF models (auto-downloaded on first use):
|
|
|
252
252
|
|
|
253
253
|
| Model | Purpose | Size |
|
|
254
254
|
|-------|---------|------|
|
|
255
|
-
| `embeddinggemma-300M-Q8_0` | Vector embeddings | ~300MB |
|
|
255
|
+
| `embeddinggemma-300M-Q8_0` | Vector embeddings (default) | ~300MB |
|
|
256
256
|
| `qwen3-reranker-0.6b-q8_0` | Re-ranking | ~640MB |
|
|
257
257
|
| `qmd-query-expansion-1.7B-q4_k_m` | Query expansion (fine-tuned) | ~1.1GB |
|
|
258
258
|
|
|
259
259
|
Models are downloaded from HuggingFace and cached in `~/.cache/qmd/models/`.
|
|
260
260
|
|
|
261
|
+
### Custom Embedding Model
|
|
262
|
+
|
|
263
|
+
Override the default embedding model via the `QMD_EMBED_MODEL` environment variable.
|
|
264
|
+
This is useful for multilingual corpora (e.g. Chinese, Japanese, Korean) where
|
|
265
|
+
`embeddinggemma-300M` has limited coverage.
|
|
266
|
+
|
|
267
|
+
```sh
|
|
268
|
+
# Use Qwen3-Embedding-0.6B for better multilingual (CJK) support
|
|
269
|
+
export QMD_EMBED_MODEL="hf:Qwen/Qwen3-Embedding-0.6B-GGUF/qwen3-embedding-0.6b-q8_0.gguf"
|
|
270
|
+
|
|
271
|
+
# After changing the model, re-embed all collections:
|
|
272
|
+
qmd embed -f
|
|
273
|
+
```
|
|
274
|
+
|
|
275
|
+
Supported model families:
|
|
276
|
+
- **embeddinggemma** (default) — English-optimized, small footprint
|
|
277
|
+
- **Qwen3-Embedding** — Multilingual (119 languages including CJK), MTEB top-ranked
|
|
278
|
+
|
|
279
|
+
> **Note:** When switching embedding models, you must re-index with `qmd embed -f`
|
|
280
|
+
> since vectors are not cross-compatible between models. The prompt format is
|
|
281
|
+
> automatically adjusted for each model family.
|
|
282
|
+
|
|
261
283
|
## Installation
|
|
262
284
|
|
|
263
285
|
```sh
|
|
@@ -366,6 +388,7 @@ qmd query "user authentication"
|
|
|
366
388
|
--min-score <num> # Minimum score threshold (default: 0)
|
|
367
389
|
--full # Show full document content
|
|
368
390
|
--line-numbers # Add line numbers to output
|
|
391
|
+
--explain # Include retrieval score traces (query, JSON/CLI output)
|
|
369
392
|
--index <name> # Use named index
|
|
370
393
|
|
|
371
394
|
# Output formats (for search and multi-get)
|
|
@@ -428,6 +451,9 @@ qmd search --md --full "error handling"
|
|
|
428
451
|
# JSON output for scripting
|
|
429
452
|
qmd query --json "quarterly reports"
|
|
430
453
|
|
|
454
|
+
# Inspect how each result was scored (RRF + rerank blend)
|
|
455
|
+
qmd query --json --explain "quarterly reports"
|
|
456
|
+
|
|
431
457
|
# Use separate index for different knowledge base
|
|
432
458
|
qmd --index work search "quarterly reports"
|
|
433
459
|
```
|
package/dist/collections.d.ts
CHANGED
package/dist/llm.d.ts
CHANGED
|
@@ -4,16 +4,23 @@
|
|
|
4
4
|
* Provides embeddings, text generation, and reranking using local GGUF models.
|
|
5
5
|
*/
|
|
6
6
|
import { type Token as LlamaToken } from "node-llama-cpp";
|
|
7
|
+
/**
|
|
8
|
+
* Detect if a model URI uses the Qwen3-Embedding format.
|
|
9
|
+
* Qwen3-Embedding uses a different prompting style than nomic/embeddinggemma.
|
|
10
|
+
*/
|
|
11
|
+
export declare function isQwen3EmbeddingModel(modelUri: string): boolean;
|
|
7
12
|
/**
|
|
8
13
|
* Format a query for embedding.
|
|
9
|
-
* Uses nomic-style task prefix format for embeddinggemma.
|
|
14
|
+
* Uses nomic-style task prefix format for embeddinggemma (default).
|
|
15
|
+
* Uses Qwen3-Embedding instruct format when a Qwen embedding model is active.
|
|
10
16
|
*/
|
|
11
|
-
export declare function formatQueryForEmbedding(query: string): string;
|
|
17
|
+
export declare function formatQueryForEmbedding(query: string, modelUri?: string): string;
|
|
12
18
|
/**
|
|
13
19
|
* Format a document for embedding.
|
|
14
|
-
* Uses nomic-style format with title and text fields.
|
|
20
|
+
* Uses nomic-style format with title and text fields (default).
|
|
21
|
+
* Qwen3-Embedding encodes documents as raw text without special prefixes.
|
|
15
22
|
*/
|
|
16
|
-
export declare function formatDocForEmbedding(text: string, title?: string): string;
|
|
23
|
+
export declare function formatDocForEmbedding(text: string, title?: string, modelUri?: string): string;
|
|
17
24
|
/**
|
|
18
25
|
* Token with log probability
|
|
19
26
|
*/
|
|
@@ -130,7 +137,7 @@ export type RerankDocument = {
|
|
|
130
137
|
};
|
|
131
138
|
export declare const LFM2_GENERATE_MODEL = "hf:LiquidAI/LFM2-1.2B-GGUF/LFM2-1.2B-Q4_K_M.gguf";
|
|
132
139
|
export declare const LFM2_INSTRUCT_MODEL = "hf:LiquidAI/LFM2.5-1.2B-Instruct-GGUF/LFM2.5-1.2B-Instruct-Q4_K_M.gguf";
|
|
133
|
-
export declare const DEFAULT_EMBED_MODEL_URI
|
|
140
|
+
export declare const DEFAULT_EMBED_MODEL_URI: string;
|
|
134
141
|
export declare const DEFAULT_RERANK_MODEL_URI = "hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf";
|
|
135
142
|
export declare const DEFAULT_GENERATE_MODEL_URI = "hf:tobil/qmd-query-expansion-1.7B-gguf/qmd-query-expansion-1.7B-q4_k_m.gguf";
|
|
136
143
|
export declare const DEFAULT_MODEL_CACHE_DIR: string;
|
|
@@ -183,6 +190,11 @@ export type LlamaCppConfig = {
|
|
|
183
190
|
generateModel?: string;
|
|
184
191
|
rerankModel?: string;
|
|
185
192
|
modelCacheDir?: string;
|
|
193
|
+
/**
|
|
194
|
+
* Context size used for query expansion generation contexts.
|
|
195
|
+
* Default: 2048. Can also be set via QMD_EXPAND_CONTEXT_SIZE.
|
|
196
|
+
*/
|
|
197
|
+
expandContextSize?: number;
|
|
186
198
|
/**
|
|
187
199
|
* Inactivity timeout in ms before unloading contexts (default: 5 minutes, 0 to disable).
|
|
188
200
|
*
|
|
@@ -210,6 +222,7 @@ export declare class LlamaCpp implements LLM {
|
|
|
210
222
|
private generateModelUri;
|
|
211
223
|
private rerankModelUri;
|
|
212
224
|
private modelCacheDir;
|
|
225
|
+
private expandContextSize;
|
|
213
226
|
private embedModelLoadPromise;
|
|
214
227
|
private generateModelLoadPromise;
|
|
215
228
|
private rerankModelLoadPromise;
|
|
@@ -319,6 +332,7 @@ export declare class LlamaCpp implements LLM {
|
|
|
319
332
|
includeLexical?: boolean;
|
|
320
333
|
}): Promise<Queryable[]>;
|
|
321
334
|
private static readonly RERANK_TEMPLATE_OVERHEAD;
|
|
335
|
+
private static readonly RERANK_TARGET_DOCS_PER_CONTEXT;
|
|
322
336
|
rerank(query: string, documents: RerankDocument[], options?: RerankOptions): Promise<RerankResult>;
|
|
323
337
|
/**
|
|
324
338
|
* Get device/GPU info for status display.
|
package/dist/llm.js
CHANGED
|
@@ -3,25 +3,43 @@
|
|
|
3
3
|
*
|
|
4
4
|
* Provides embeddings, text generation, and reranking using local GGUF models.
|
|
5
5
|
*/
|
|
6
|
-
import { getLlama,
|
|
6
|
+
import { getLlama, resolveModelFile, LlamaChatSession, LlamaLogLevel, } from "node-llama-cpp";
|
|
7
7
|
import { homedir } from "os";
|
|
8
8
|
import { join } from "path";
|
|
9
9
|
import { existsSync, mkdirSync, statSync, unlinkSync, readdirSync, readFileSync, writeFileSync } from "fs";
|
|
10
10
|
// =============================================================================
|
|
11
11
|
// Embedding Formatting Functions
|
|
12
12
|
// =============================================================================
|
|
13
|
+
/**
|
|
14
|
+
* Detect if a model URI uses the Qwen3-Embedding format.
|
|
15
|
+
* Qwen3-Embedding uses a different prompting style than nomic/embeddinggemma.
|
|
16
|
+
*/
|
|
17
|
+
export function isQwen3EmbeddingModel(modelUri) {
|
|
18
|
+
return /qwen.*embed/i.test(modelUri) || /embed.*qwen/i.test(modelUri);
|
|
19
|
+
}
|
|
13
20
|
/**
|
|
14
21
|
* Format a query for embedding.
|
|
15
|
-
* Uses nomic-style task prefix format for embeddinggemma.
|
|
22
|
+
* Uses nomic-style task prefix format for embeddinggemma (default).
|
|
23
|
+
* Uses Qwen3-Embedding instruct format when a Qwen embedding model is active.
|
|
16
24
|
*/
|
|
17
|
-
export function formatQueryForEmbedding(query) {
|
|
25
|
+
export function formatQueryForEmbedding(query, modelUri) {
|
|
26
|
+
const uri = modelUri ?? process.env.QMD_EMBED_MODEL ?? DEFAULT_EMBED_MODEL;
|
|
27
|
+
if (isQwen3EmbeddingModel(uri)) {
|
|
28
|
+
return `Instruct: Retrieve relevant documents for the given query\nQuery: ${query}`;
|
|
29
|
+
}
|
|
18
30
|
return `task: search result | query: ${query}`;
|
|
19
31
|
}
|
|
20
32
|
/**
|
|
21
33
|
* Format a document for embedding.
|
|
22
|
-
* Uses nomic-style format with title and text fields.
|
|
34
|
+
* Uses nomic-style format with title and text fields (default).
|
|
35
|
+
* Qwen3-Embedding encodes documents as raw text without special prefixes.
|
|
23
36
|
*/
|
|
24
|
-
export function formatDocForEmbedding(text, title) {
|
|
37
|
+
export function formatDocForEmbedding(text, title, modelUri) {
|
|
38
|
+
const uri = modelUri ?? process.env.QMD_EMBED_MODEL ?? DEFAULT_EMBED_MODEL;
|
|
39
|
+
if (isQwen3EmbeddingModel(uri)) {
|
|
40
|
+
// Qwen3-Embedding: documents are raw text, no task prefix
|
|
41
|
+
return title ? `${title}\n${text}` : text;
|
|
42
|
+
}
|
|
25
43
|
return `title: ${title || "none"} | text: ${text}`;
|
|
26
44
|
}
|
|
27
45
|
// =============================================================================
|
|
@@ -29,7 +47,8 @@ export function formatDocForEmbedding(text, title) {
|
|
|
29
47
|
// =============================================================================
|
|
30
48
|
// HuggingFace model URIs for node-llama-cpp
|
|
31
49
|
// Format: hf:<user>/<repo>/<file>
|
|
32
|
-
|
|
50
|
+
// Override via QMD_EMBED_MODEL env var (e.g. hf:Qwen/Qwen3-Embedding-0.6B-GGUF/qwen3-embedding-0.6b-q8_0.gguf)
|
|
51
|
+
const DEFAULT_EMBED_MODEL = process.env.QMD_EMBED_MODEL ?? "hf:ggml-org/embeddinggemma-300M-GGUF/embeddinggemma-300M-Q8_0.gguf";
|
|
33
52
|
const DEFAULT_RERANK_MODEL = "hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf";
|
|
34
53
|
// const DEFAULT_GENERATE_MODEL = "hf:ggml-org/Qwen3-0.6B-GGUF/Qwen3-0.6B-Q8_0.gguf";
|
|
35
54
|
const DEFAULT_GENERATE_MODEL = "hf:tobil/qmd-query-expansion-1.7B-gguf/qmd-query-expansion-1.7B-q4_k_m.gguf";
|
|
@@ -126,6 +145,24 @@ export async function pullModels(models, options = {}) {
|
|
|
126
145
|
*/
|
|
127
146
|
// Default inactivity timeout: 5 minutes (keep models warm during typical search sessions)
|
|
128
147
|
const DEFAULT_INACTIVITY_TIMEOUT_MS = 5 * 60 * 1000;
|
|
148
|
+
const DEFAULT_EXPAND_CONTEXT_SIZE = 2048;
|
|
149
|
+
function resolveExpandContextSize(configValue) {
|
|
150
|
+
if (configValue !== undefined) {
|
|
151
|
+
if (!Number.isInteger(configValue) || configValue <= 0) {
|
|
152
|
+
throw new Error(`Invalid expandContextSize: ${configValue}. Must be a positive integer.`);
|
|
153
|
+
}
|
|
154
|
+
return configValue;
|
|
155
|
+
}
|
|
156
|
+
const envValue = process.env.QMD_EXPAND_CONTEXT_SIZE?.trim();
|
|
157
|
+
if (!envValue)
|
|
158
|
+
return DEFAULT_EXPAND_CONTEXT_SIZE;
|
|
159
|
+
const parsed = Number.parseInt(envValue, 10);
|
|
160
|
+
if (!Number.isInteger(parsed) || parsed <= 0) {
|
|
161
|
+
process.stderr.write(`QMD Warning: invalid QMD_EXPAND_CONTEXT_SIZE="${envValue}", using default ${DEFAULT_EXPAND_CONTEXT_SIZE}.\n`);
|
|
162
|
+
return DEFAULT_EXPAND_CONTEXT_SIZE;
|
|
163
|
+
}
|
|
164
|
+
return parsed;
|
|
165
|
+
}
|
|
129
166
|
export class LlamaCpp {
|
|
130
167
|
llama = null;
|
|
131
168
|
embedModel = null;
|
|
@@ -137,6 +174,7 @@ export class LlamaCpp {
|
|
|
137
174
|
generateModelUri;
|
|
138
175
|
rerankModelUri;
|
|
139
176
|
modelCacheDir;
|
|
177
|
+
expandContextSize;
|
|
140
178
|
// Ensure we don't load the same model/context concurrently (which can allocate duplicate VRAM).
|
|
141
179
|
embedModelLoadPromise = null;
|
|
142
180
|
generateModelLoadPromise = null;
|
|
@@ -152,6 +190,7 @@ export class LlamaCpp {
|
|
|
152
190
|
this.generateModelUri = config.generateModel || DEFAULT_GENERATE_MODEL;
|
|
153
191
|
this.rerankModelUri = config.rerankModel || DEFAULT_RERANK_MODEL;
|
|
154
192
|
this.modelCacheDir = config.modelCacheDir || MODEL_CACHE_DIR;
|
|
193
|
+
this.expandContextSize = resolveExpandContextSize(config.expandContextSize);
|
|
155
194
|
this.inactivityTimeoutMs = config.inactivityTimeoutMs ?? DEFAULT_INACTIVITY_TIMEOUT_MS;
|
|
156
195
|
this.disposeModelsOnInactivity = config.disposeModelsOnInactivity ?? false;
|
|
157
196
|
}
|
|
@@ -249,27 +288,12 @@ export class LlamaCpp {
|
|
|
249
288
|
*/
|
|
250
289
|
async ensureLlama() {
|
|
251
290
|
if (!this.llama) {
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
const preferred = ["cuda", "metal", "vulkan"].find(g => gpuTypes.includes(g));
|
|
259
|
-
let llama;
|
|
260
|
-
if (preferred) {
|
|
261
|
-
try {
|
|
262
|
-
llama = await getLlama({ gpu: preferred, logLevel: LlamaLogLevel.error });
|
|
263
|
-
}
|
|
264
|
-
catch {
|
|
265
|
-
llama = await getLlama({ gpu: false, logLevel: LlamaLogLevel.error });
|
|
266
|
-
process.stderr.write(`QMD Warning: ${preferred} reported available but failed to initialize. Falling back to CPU.\n`);
|
|
267
|
-
}
|
|
268
|
-
}
|
|
269
|
-
else {
|
|
270
|
-
llama = await getLlama({ gpu: false, logLevel: LlamaLogLevel.error });
|
|
271
|
-
}
|
|
272
|
-
if (!llama.gpu) {
|
|
291
|
+
const llama = await getLlama({
|
|
292
|
+
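// attempt to build; replaces manual GPU backend detection — node-llama-cpp tries Metal/CUDA/Vulkan itself and falls back gracefully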
|
|
293
|
+
build: "autoAttempt",
|
|
294
|
+
logLevel: LlamaLogLevel.error
|
|
295
|
+
});
|
|
296
|
+
if (llama.gpu === false) {
|
|
273
297
|
process.stderr.write("QMD Warning: no GPU acceleration, running on CPU (slow). Run 'qmd status' for details.\n");
|
|
274
298
|
}
|
|
275
299
|
this.llama = llama;
|
|
@@ -466,7 +490,7 @@ export class LlamaCpp {
|
|
|
466
490
|
if (this.rerankContexts.length === 0) {
|
|
467
491
|
const model = await this.ensureRerankModel();
|
|
468
492
|
// ~960 MB per context with flash attention at contextSize 2048
|
|
469
|
-
const n = await this.computeParallelism(1000);
|
|
493
|
+
const n = Math.min(await this.computeParallelism(1000), 4);
|
|
470
494
|
const threads = await this.threadsPerContext(n);
|
|
471
495
|
for (let i = 0; i < n; i++) {
|
|
472
496
|
try {
|
|
@@ -668,8 +692,10 @@ export class LlamaCpp {
|
|
|
668
692
|
`
|
|
669
693
|
});
|
|
670
694
|
const prompt = `/no_think Expand this search query: ${query}`;
|
|
671
|
-
// Create
|
|
672
|
-
const genContext = await this.generateModel.createContext(
|
|
695
|
+
// Create a bounded context for expansion to prevent large default VRAM allocations.
|
|
696
|
+
const genContext = await this.generateModel.createContext({
|
|
697
|
+
contextSize: this.expandContextSize,
|
|
698
|
+
});
|
|
673
699
|
const sequence = genContext.getSequence();
|
|
674
700
|
const session = new LlamaChatSession({ contextSequence: sequence });
|
|
675
701
|
try {
|
|
@@ -733,6 +759,7 @@ export class LlamaCpp {
|
|
|
733
759
|
}
|
|
734
760
|
// Qwen3 reranker chat template overhead (system prompt, tags, separators)
|
|
735
761
|
static RERANK_TEMPLATE_OVERHEAD = 200;
|
|
762
|
+
static RERANK_TARGET_DOCS_PER_CONTEXT = 10;
|
|
736
763
|
async rerank(query, documents, options = {}) {
|
|
737
764
|
// Ping activity at start to keep models alive during this operation
|
|
738
765
|
this.touchActivity();
|
|
@@ -742,41 +769,61 @@ export class LlamaCpp {
|
|
|
742
769
|
// Budget = contextSize - template overhead - query tokens
|
|
743
770
|
const queryTokens = model.tokenize(query).length;
|
|
744
771
|
const maxDocTokens = LlamaCpp.RERANK_CONTEXT_SIZE - LlamaCpp.RERANK_TEMPLATE_OVERHEAD - queryTokens;
|
|
772
|
+
const truncationCache = new Map();
|
|
745
773
|
const truncatedDocs = documents.map((doc) => {
|
|
774
|
+
const cached = truncationCache.get(doc.text);
|
|
775
|
+
if (cached !== undefined) {
|
|
776
|
+
return cached === doc.text ? doc : { ...doc, text: cached };
|
|
777
|
+
}
|
|
746
778
|
const tokens = model.tokenize(doc.text);
|
|
747
|
-
|
|
779
|
+
const truncatedText = tokens.length <= maxDocTokens
|
|
780
|
+
? doc.text
|
|
781
|
+
: model.detokenize(tokens.slice(0, maxDocTokens));
|
|
782
|
+
truncationCache.set(doc.text, truncatedText);
|
|
783
|
+
if (truncatedText === doc.text)
|
|
748
784
|
return doc;
|
|
749
|
-
const truncatedText = model.detokenize(tokens.slice(0, maxDocTokens));
|
|
750
785
|
return { ...doc, text: truncatedText };
|
|
751
786
|
});
|
|
752
|
-
//
|
|
753
|
-
|
|
787
|
+
// Deduplicate identical effective texts before scoring.
|
|
788
|
+
// This avoids redundant work for repeated chunks and fixes collisions where
|
|
789
|
+
// multiple docs map to the same chunk text.
|
|
790
|
+
const textToDocs = new Map();
|
|
754
791
|
truncatedDocs.forEach((doc, index) => {
|
|
755
|
-
|
|
792
|
+
const existing = textToDocs.get(doc.text);
|
|
793
|
+
if (existing) {
|
|
794
|
+
existing.push({ file: doc.file, index });
|
|
795
|
+
}
|
|
796
|
+
else {
|
|
797
|
+
textToDocs.set(doc.text, [{ file: doc.file, index }]);
|
|
798
|
+
}
|
|
756
799
|
});
|
|
757
800
|
// Extract just the text for ranking
|
|
758
|
-
const texts =
|
|
801
|
+
const texts = Array.from(textToDocs.keys());
|
|
759
802
|
// Split documents across contexts for parallel evaluation.
|
|
760
803
|
// Each context has its own sequence with a lock, so parallelism comes
|
|
761
804
|
// from multiple contexts evaluating different chunks simultaneously.
|
|
762
|
-
const
|
|
763
|
-
const
|
|
764
|
-
const
|
|
765
|
-
const
|
|
805
|
+
const activeContextCount = Math.max(1, Math.min(contexts.length, Math.ceil(texts.length / LlamaCpp.RERANK_TARGET_DOCS_PER_CONTEXT)));
|
|
806
|
+
const activeContexts = contexts.slice(0, activeContextCount);
|
|
807
|
+
const chunkSize = Math.ceil(texts.length / activeContexts.length);
|
|
808
|
+
const chunks = Array.from({ length: activeContexts.length }, (_, i) => texts.slice(i * chunkSize, (i + 1) * chunkSize)).filter(chunk => chunk.length > 0);
|
|
809
|
+
const allScores = await Promise.all(chunks.map((chunk, i) => activeContexts[i].rankAll(query, chunk)));
|
|
766
810
|
// Reassemble scores in original order and sort
|
|
767
811
|
const flatScores = allScores.flat();
|
|
768
812
|
const ranked = texts
|
|
769
813
|
.map((text, i) => ({ document: text, score: flatScores[i] }))
|
|
770
814
|
.sort((a, b) => b.score - a.score);
|
|
771
|
-
// Map back to our result format
|
|
772
|
-
const results =
|
|
773
|
-
|
|
774
|
-
|
|
775
|
-
|
|
776
|
-
|
|
777
|
-
|
|
778
|
-
|
|
779
|
-
|
|
815
|
+
// Map back to our result format.
|
|
816
|
+
const results = [];
|
|
817
|
+
for (const item of ranked) {
|
|
818
|
+
const docInfos = textToDocs.get(item.document) ?? [];
|
|
819
|
+
for (const docInfo of docInfos) {
|
|
820
|
+
results.push({
|
|
821
|
+
file: docInfo.file,
|
|
822
|
+
score: item.score,
|
|
823
|
+
index: docInfo.index,
|
|
824
|
+
});
|
|
825
|
+
}
|
|
826
|
+
}
|
|
780
827
|
return {
|
|
781
828
|
results,
|
|
782
829
|
model: this.rerankModelUri,
|
|
@@ -1033,7 +1080,8 @@ let defaultLlamaCpp = null;
|
|
|
1033
1080
|
*/
|
|
1034
1081
|
export function getDefaultLlamaCpp() {
|
|
1035
1082
|
if (!defaultLlamaCpp) {
|
|
1036
|
-
|
|
1083
|
+
const embedModel = process.env.QMD_EMBED_MODEL;
|
|
1084
|
+
defaultLlamaCpp = new LlamaCpp(embedModel ? { embedModel } : {});
|
|
1037
1085
|
}
|
|
1038
1086
|
return defaultLlamaCpp;
|
|
1039
1087
|
}
|
package/dist/mcp.js
CHANGED
|
@@ -12,6 +12,7 @@ import { fileURLToPath } from "url";
|
|
|
12
12
|
import { McpServer, ResourceTemplate } from "@modelcontextprotocol/sdk/server/mcp.js";
|
|
13
13
|
import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
|
|
14
14
|
import { WebStandardStreamableHTTPServerTransport } from "@modelcontextprotocol/sdk/server/webStandardStreamableHttp.js";
|
|
15
|
+
import { isInitializeRequest } from "@modelcontextprotocol/sdk/types.js";
|
|
15
16
|
import { z } from "zod";
|
|
16
17
|
import { createStore, extractSnippet, addLineNumbers, structuredSearch, DEFAULT_MULTI_GET_MAX_BYTES, } from "./store.js";
|
|
17
18
|
import { getCollection, getGlobalContext, getDefaultCollectionNames } from "./collections.js";
|
|
@@ -233,9 +234,10 @@ Intent-aware lex (C++ performance, not sports):
|
|
|
233
234
|
searches: z.array(subSearchSchema).min(1).max(10).describe("Typed sub-queries to execute (lex/vec/hyde). First gets 2x weight."),
|
|
234
235
|
limit: z.number().optional().default(10).describe("Max results (default: 10)"),
|
|
235
236
|
minScore: z.number().optional().default(0).describe("Min relevance 0-1 (default: 0)"),
|
|
237
|
+
candidateLimit: z.number().optional().describe("Maximum candidates to rerank (default: 40, lower = faster but may miss results)"),
|
|
236
238
|
collections: z.array(z.string()).optional().describe("Filter to collections (OR match)"),
|
|
237
239
|
},
|
|
238
|
-
}, async ({ searches, limit, minScore, collections }) => {
|
|
240
|
+
}, async ({ searches, limit, minScore, candidateLimit, collections }) => {
|
|
239
241
|
// Map to internal format
|
|
240
242
|
const subSearches = searches.map(s => ({
|
|
241
243
|
type: s.type,
|
|
@@ -247,6 +249,7 @@ Intent-aware lex (C++ performance, not sports):
|
|
|
247
249
|
collections: effectiveCollections.length > 0 ? effectiveCollections : undefined,
|
|
248
250
|
limit,
|
|
249
251
|
minScore,
|
|
252
|
+
candidateLimit,
|
|
250
253
|
});
|
|
251
254
|
// Use first lex or vec query for snippet extraction
|
|
252
255
|
const primaryQuery = searches.find(s => s.type === 'lex')?.query
|
|
@@ -425,12 +428,27 @@ export async function startMcpServer() {
|
|
|
425
428
|
*/
|
|
426
429
|
export async function startMcpHttpServer(port, options) {
|
|
427
430
|
const store = createStore();
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
431
|
+
// Session map: each client gets its own McpServer + Transport pair (MCP spec requirement).
|
|
432
|
+
// The store is shared — it's stateless SQLite, safe for concurrent access.
|
|
433
|
+
const sessions = new Map();
|
|
434
|
+
async function createSession() {
|
|
435
|
+
const transport = new WebStandardStreamableHTTPServerTransport({
|
|
436
|
+
sessionIdGenerator: () => randomUUID(),
|
|
437
|
+
enableJsonResponse: true,
|
|
438
|
+
onsessioninitialized: (sessionId) => {
|
|
439
|
+
sessions.set(sessionId, transport);
|
|
440
|
+
log(`${ts()} New session ${sessionId} (${sessions.size} active)`);
|
|
441
|
+
},
|
|
442
|
+
});
|
|
443
|
+
const server = createMcpServer(store);
|
|
444
|
+
await server.connect(transport);
|
|
445
|
+
transport.onclose = () => {
|
|
446
|
+
if (transport.sessionId) {
|
|
447
|
+
sessions.delete(transport.sessionId);
|
|
448
|
+
}
|
|
449
|
+
};
|
|
450
|
+
return transport;
|
|
451
|
+
}
|
|
434
452
|
const startTime = Date.now();
|
|
435
453
|
const quiet = options?.quiet ?? false;
|
|
436
454
|
/** Format timestamp for request logging */
|
|
@@ -500,6 +518,7 @@ export async function startMcpHttpServer(port, options) {
|
|
|
500
518
|
collections: effectiveCollections.length > 0 ? effectiveCollections : undefined,
|
|
501
519
|
limit: params.limit ?? 10,
|
|
502
520
|
minScore: params.minScore ?? 0,
|
|
521
|
+
candidateLimit: params.candidateLimit,
|
|
503
522
|
});
|
|
504
523
|
// Use first lex or vec query for snippet extraction
|
|
505
524
|
const primaryQuery = params.searches.find((s) => s.type === 'lex')?.query
|
|
@@ -531,6 +550,34 @@ export async function startMcpHttpServer(port, options) {
|
|
|
531
550
|
if (typeof v === "string")
|
|
532
551
|
headers[k] = v;
|
|
533
552
|
}
|
|
553
|
+
// Route to existing session or create new one on initialize
|
|
554
|
+
const sessionId = headers["mcp-session-id"];
|
|
555
|
+
let transport;
|
|
556
|
+
if (sessionId) {
|
|
557
|
+
const existing = sessions.get(sessionId);
|
|
558
|
+
if (!existing) {
|
|
559
|
+
nodeRes.writeHead(404, { "Content-Type": "application/json" });
|
|
560
|
+
nodeRes.end(JSON.stringify({
|
|
561
|
+
jsonrpc: "2.0",
|
|
562
|
+
error: { code: -32001, message: "Session not found" },
|
|
563
|
+
id: body?.id ?? null,
|
|
564
|
+
}));
|
|
565
|
+
return;
|
|
566
|
+
}
|
|
567
|
+
transport = existing;
|
|
568
|
+
}
|
|
569
|
+
else if (isInitializeRequest(body)) {
|
|
570
|
+
transport = await createSession();
|
|
571
|
+
}
|
|
572
|
+
else {
|
|
573
|
+
nodeRes.writeHead(400, { "Content-Type": "application/json" });
|
|
574
|
+
nodeRes.end(JSON.stringify({
|
|
575
|
+
jsonrpc: "2.0",
|
|
576
|
+
error: { code: -32000, message: "Bad Request: Missing session ID" },
|
|
577
|
+
id: body?.id ?? null,
|
|
578
|
+
}));
|
|
579
|
+
return;
|
|
580
|
+
}
|
|
534
581
|
const request = new Request(url, { method: "POST", headers, body: rawBody });
|
|
535
582
|
const response = await transport.handleRequest(request, { parsedBody: body });
|
|
536
583
|
nodeRes.writeHead(response.status, Object.fromEntries(response.headers));
|
|
@@ -539,12 +586,33 @@ export async function startMcpHttpServer(port, options) {
|
|
|
539
586
|
return;
|
|
540
587
|
}
|
|
541
588
|
if (pathname === "/mcp") {
|
|
542
|
-
const url = `http://localhost:${port}${pathname}`;
|
|
543
589
|
const headers = {};
|
|
544
590
|
for (const [k, v] of Object.entries(nodeReq.headers)) {
|
|
545
591
|
if (typeof v === "string")
|
|
546
592
|
headers[k] = v;
|
|
547
593
|
}
|
|
594
|
+
// GET/DELETE must have a valid session
|
|
595
|
+
const sessionId = headers["mcp-session-id"];
|
|
596
|
+
if (!sessionId) {
|
|
597
|
+
nodeRes.writeHead(400, { "Content-Type": "application/json" });
|
|
598
|
+
nodeRes.end(JSON.stringify({
|
|
599
|
+
jsonrpc: "2.0",
|
|
600
|
+
error: { code: -32000, message: "Bad Request: Missing session ID" },
|
|
601
|
+
id: null,
|
|
602
|
+
}));
|
|
603
|
+
return;
|
|
604
|
+
}
|
|
605
|
+
const transport = sessions.get(sessionId);
|
|
606
|
+
if (!transport) {
|
|
607
|
+
nodeRes.writeHead(404, { "Content-Type": "application/json" });
|
|
608
|
+
nodeRes.end(JSON.stringify({
|
|
609
|
+
jsonrpc: "2.0",
|
|
610
|
+
error: { code: -32001, message: "Session not found" },
|
|
611
|
+
id: null,
|
|
612
|
+
}));
|
|
613
|
+
return;
|
|
614
|
+
}
|
|
615
|
+
const url = `http://localhost:${port}${pathname}`;
|
|
548
616
|
const rawBody = nodeReq.method !== "GET" && nodeReq.method !== "HEAD" ? await collectBody(nodeReq) : undefined;
|
|
549
617
|
const request = new Request(url, { method: nodeReq.method || "GET", headers, ...(rawBody ? { body: rawBody } : {}) });
|
|
550
618
|
const response = await transport.handleRequest(request);
|
|
@@ -571,7 +639,10 @@ export async function startMcpHttpServer(port, options) {
|
|
|
571
639
|
if (stopping)
|
|
572
640
|
return;
|
|
573
641
|
stopping = true;
|
|
574
|
-
|
|
642
|
+
for (const transport of sessions.values()) {
|
|
643
|
+
await transport.close();
|
|
644
|
+
}
|
|
645
|
+
sessions.clear();
|
|
575
646
|
httpServer.close();
|
|
576
647
|
store.close();
|
|
577
648
|
await disposeDefaultLlamaCpp();
|