@tobilu/qmd 0.9.0 → 1.0.0
- package/CHANGELOG.md +34 -0
- package/README.md +12 -5
- package/package.json +18 -13
- package/qmd +15 -24
- package/src/bench-rerank.ts +327 -0
- package/src/db.ts +52 -0
- package/src/llm.ts +245 -56
- package/src/mcp.ts +63 -30
- package/src/qmd.ts +74 -41
- package/src/store.ts +84 -99
package/CHANGELOG.md
CHANGED
@@ -2,6 +2,39 @@
 
 All notable changes to QMD will be documented in this file.
 
+## [1.0.0] - 2026-02-15
+
+### Node.js Compatibility
+
+QMD now runs on both **Node.js (>=22)** and **Bun**. Install with `npm install -g @tobilu/qmd` or `bun install -g @tobilu/qmd` — your choice. The `qmd` wrapper auto-detects Node.js via `tsx` and works out of the box with mise, asdf, nvm, and Homebrew installs.
+
+### Performance
+
+- **Parallel embedding & reranking** — multiple contexts split work across CPU cores (or VRAM on GPU), delivering up to **2.7x faster reranking** and significantly faster embedding on multi-core machines
+- **Flash attention** — ~20% less VRAM per reranking context, enabling more parallel contexts on GPU
+- **Right-sized contexts** — reranker context dropped from 40960 to 2048 tokens (17x less memory), since chunks are capped at ~900 tokens
+- **Adaptive parallelism** — automatically scales context count based on available VRAM (GPU) or CPU math cores
+- **CPU thread splitting** — each context runs on its own cores for true parallelism instead of contending on a single context
+
+### GPU Auto-Detection
+
+- Probes for CUDA, Metal, and Vulkan at startup — uses the best available backend
+- Falls back gracefully to CPU with a warning if GPU init fails
+- `qmd status` now shows device info (GPU type, VRAM usage)
+
+### Test Suite
+
+- Tests split into `src/*.test.ts` (unit), `src/models/*.test.ts` (model), and `src/integration/*.test.ts` (CLI/integration)
+- Vitest config for Node.js; bun test still works for Bun
+- New `eval-bm25` and `store.helpers.unit` test suites
+
+### Fixes
+
+- Prevent VRAM waste from duplicate context creation during concurrent loads
+- Collection-aware FTS filtering for scoped keyword search
+
+---
+
 ## [0.9.0] - 2026-02-15
 
 Initial public release.
@@ -30,5 +63,6 @@ Initial public release.
 - BM25 score normalization with Math.abs
 - Bun UTF-8 path corruption workaround
 
+[1.0.0]: https://github.com/tobi/qmd/releases/tag/v1.0.0
 [0.9.0]: https://github.com/tobi/qmd/releases/tag/v0.9.0
 
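The Performance entries above all describe one mechanism: instead of a single large context, QMD creates several small ranking contexts and splits the candidate documents across them. A minimal sketch of that scheme, reusing the node-llama-cpp calls that appear in `src/bench-rerank.ts` later in this diff; the 300 MB per-context VRAM budget is an illustrative assumption, not a value QMD uses:

```ts
import { getLlama } from "node-llama-cpp";

// Illustrative VRAM budget per ranking context (assumption, not QMD's measured value).
const ASSUMED_VRAM_PER_CTX = 300 * 1024 * 1024;

async function rankParallel(modelPath: string, query: string, docs: string[]) {
  const llama = await getLlama();
  const model = await llama.loadModel({ modelPath });

  // Adaptive parallelism: size by free VRAM on GPU, by math cores (4 per context) on CPU.
  const parallelism = llama.gpu
    ? Math.max(1, Math.floor((await llama.getVramState()).free / ASSUMED_VRAM_PER_CTX))
    : Math.max(1, Math.floor(llama.cpuMathCores / 4));

  // CPU thread splitting: each context gets its own slice of the math cores.
  const threads = llama.gpu ? 0 : Math.floor(llama.cpuMathCores / parallelism);

  const contexts = [];
  for (let i = 0; i < parallelism; i++) {
    contexts.push(await model.createRankingContext({
      contextSize: 2048,    // right-sized: chunks are capped at ~900 tokens
      flashAttention: true, // ~20% less VRAM per context
      ...(threads > 0 ? { threads } : {}),
    }));
  }

  // Split the candidates evenly and rank each chunk on its own context.
  const chunkSize = Math.ceil(docs.length / contexts.length);
  const scores = await Promise.all(
    contexts.map((ctx, i) => {
      const chunk = docs.slice(i * chunkSize, (i + 1) * chunkSize);
      return chunk.length > 0 ? ctx.rankAll(query, chunk) : Promise.resolve([] as number[]);
    })
  );

  for (const ctx of contexts) await ctx.dispose();
  return scores.flat();
}
```

The GPU Auto-Detection behavior (probing CUDA/Metal/Vulkan with a CPU fallback) is shown concretely in the `main()` of `bench-rerank.ts` below.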
package/README.md
CHANGED
@@ -9,9 +9,15 @@ QMD combines BM25 full-text search, vector semantic search, and LLM re-ranking
 ## Quick Start
 
 ```sh
-# Install globally
+# Install globally (Node or Bun)
+npm install -g @tobilu/qmd
+# or
 bun install -g @tobilu/qmd
 
+# Or run directly
+npx @tobilu/qmd ...
+bunx @tobilu/qmd ...
+
 # Create collections for your notes, docs, and meeting transcripts
 qmd collection add ~/notes --name notes
 qmd collection add ~/Documents/meetings --name meetings
@@ -231,6 +237,7 @@ The `query` command uses **Reciprocal Rank Fusion (RRF)** with position-aware bl
 
 ### System Requirements
 
+- **Node.js** >= 22
 - **Bun** >= 1.0.0
 - **macOS**: Homebrew SQLite (for extension support)
 ```sh
@@ -252,18 +259,18 @@ Models are downloaded from HuggingFace and cached in `~/.cache/qmd/models/`.
 ## Installation
 
 ```sh
+npm install -g @tobilu/qmd
+# or
 bun install -g @tobilu/qmd
 ```
 
-Make sure `~/.bun/bin` is in your PATH.
-
 ### Development
 
 ```sh
 git clone https://github.com/tobi/qmd
 cd qmd
-
-
+npm install
+npm link
 ```
 
 ## Usage
package/package.json
CHANGED
@@ -1,10 +1,10 @@
 {
   "name": "@tobilu/qmd",
-  "version": "0.9.0",
+  "version": "1.0.0",
   "description": "Query Markup Documents - On-device hybrid search for markdown files with BM25, vector search, and LLM reranking",
   "type": "module",
   "bin": {
-    "qmd": "
+    "qmd": "qmd"
   },
   "files": [
     "src/**/*.ts",
@@ -15,15 +15,14 @@
     "CHANGELOG.md"
   ],
   "scripts": {
-    "test": "
-    "qmd": "
-    "index": "
-    "vector": "
-    "search": "
-    "vsearch": "
-    "rerank": "
-    "
-    "inspector": "npx @modelcontextprotocol/inspector bun src/qmd.ts mcp",
+    "test": "vitest run --reporter=verbose test/",
+    "qmd": "tsx src/qmd.ts",
+    "index": "tsx src/qmd.ts index",
+    "vector": "tsx src/qmd.ts vector",
+    "search": "tsx src/qmd.ts search",
+    "vsearch": "tsx src/qmd.ts vsearch",
+    "rerank": "tsx src/qmd.ts rerank",
+    "inspector": "npx @modelcontextprotocol/inspector tsx src/qmd.ts mcp",
     "release": "./scripts/release.sh"
   },
   "publishConfig": {
@@ -39,7 +38,10 @@
   },
   "dependencies": {
     "@modelcontextprotocol/sdk": "^1.25.1",
+    "better-sqlite3": "^11.0.0",
+    "fast-glob": "^3.3.0",
     "node-llama-cpp": "^3.14.5",
+    "picomatch": "^4.0.0",
     "sqlite-vec": "^0.1.7-alpha.2",
     "yaml": "^2.8.2",
     "zod": "^4.2.1"
@@ -51,13 +53,15 @@
     "sqlite-vec-win32-x64": "^0.1.7-alpha.2"
   },
   "devDependencies": {
-    "@types/
+    "@types/better-sqlite3": "^7.6.0",
+    "tsx": "^4.0.0",
+    "vitest": "^3.0.0"
   },
   "peerDependencies": {
     "typescript": "^5.9.3"
   },
   "engines": {
-    "
+    "node": ">=22.0.0"
   },
   "keywords": [
     "markdown",
@@ -76,5 +80,6 @@
     "local-ai",
     "llm"
   ],
+  "author": "Tobi Lutke <tobi@lutke.com>",
   "license": "MIT"
 }
package/qmd
CHANGED
@@ -2,13 +2,13 @@
 # qmd - Quick Markdown Search
 set -euo pipefail
 
-# Find
-
-
-
-  local ver
-  if [[ "$
-    command -v
+# Find node - prefer PATH, fallback to known locations
+find_node() {
+  if command -v node &>/dev/null; then
+    local ver=$(node --version 2>/dev/null | sed 's/^v//' || echo "0")
+    local major="${ver%%.*}"
+    if [[ "$major" -ge 22 ]]; then
+      command -v node
       return 0
     fi
   fi
@@ -16,23 +16,14 @@ find_bun() {
   # Fallback: derive paths (need HOME)
   : "${HOME:=$(eval echo ~)}"
 
-  # If running from .bun tree, use that bun
-  if [[ "${BASH_SOURCE[0]}" == */.bun/* ]]; then
-    local bun_home="${BASH_SOURCE[0]%%/.bun/*}/.bun"
-    if [[ -x "$bun_home/bin/bun" ]]; then
-      echo "$bun_home/bin/bun"
-      return 0
-    fi
-  fi
-
   # Check known locations
   local candidates=(
-    "$HOME/.local/share/mise/installs/
-    "$HOME/.local/share/mise/shims/
-    "$HOME/.asdf/shims/
-    "/opt/homebrew/bin/
-    "/usr/local/bin/
-    "$HOME/.
+    "$HOME/.local/share/mise/installs/node/latest/bin/node"
+    "$HOME/.local/share/mise/shims/node"
+    "$HOME/.asdf/shims/node"
+    "/opt/homebrew/bin/node"
+    "/usr/local/bin/node"
+    "$HOME/.nvm/current/bin/node"
   )
   for c in "${candidates[@]}"; do
     [[ -x "$c" ]] && { echo "$c"; return 0; }
@@ -41,7 +32,7 @@ find_bun() {
   return 1
 }
 
-
+NODE=$(find_node) || { echo "Error: node (>=22) not found. Install from https://nodejs.org" >&2; exit 1; }
 
 # Resolve symlinks to find script location
 SOURCE="${BASH_SOURCE[0]}"
@@ -52,4 +43,4 @@ while [[ -L "$SOURCE" ]]; do
 done
 SCRIPT_DIR="$(cd -P "$(dirname "$SOURCE")" && pwd)"
 
-exec "$
+exec "$NODE" --import tsx "$SCRIPT_DIR/src/qmd.ts" "$@"
package/src/bench-rerank.ts
ADDED
@@ -0,0 +1,327 @@
+#!/usr/bin/env bun
+/**
+ * QMD Reranker Benchmark
+ *
+ * Measures reranking performance across different configurations.
+ * Reports device, parallelism, memory, VRAM, and throughput.
+ *
+ * Usage:
+ *   bun src/bench-rerank.ts             # full benchmark
+ *   bun src/bench-rerank.ts --quick     # quick smoke test (10 docs, 1 iteration)
+ *   bun src/bench-rerank.ts --docs 100  # custom doc count
+ */
+
+import {
+  getLlama,
+  getLlamaGpuTypes,
+  resolveModelFile,
+  LlamaLogLevel,
+  type Llama,
+  type LlamaModel,
+} from "node-llama-cpp";
+import { homedir } from "os";
+import { join } from "path";
+import { cpus } from "os";
+
+// ============================================================================
+// Config
+// ============================================================================
+
+const RERANK_MODEL = "hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf";
+const MODEL_CACHE = join(homedir(), ".cache", "qmd", "models");
+const CONTEXT_SIZE = 2048;
+
+const args = process.argv.slice(2);
+const quick = args.includes("--quick");
+const docsIdx = args.indexOf("--docs");
+const DOC_COUNT = docsIdx >= 0 ? parseInt(args[docsIdx + 1]!) : (quick ? 10 : 40);
+const ITERATIONS = quick ? 1 : 3;
+const PARALLEL_CONFIGS = quick ? [1, 4] : [1, 2, 4, 8];
+
+// ============================================================================
+// Test data — realistic-ish chunks of varying length
+// ============================================================================
+
+const QUERY = "How do AI agents work and what are their limitations?";
+
+function generateDocs(n: number): string[] {
+  const templates = [
+    "Artificial intelligence agents are software systems that perceive their environment and take actions to achieve goals. They use techniques like reinforcement learning, planning, and natural language processing to operate autonomously.",
+    "The transformer architecture, introduced in 2017, revolutionized natural language processing. Self-attention mechanisms allow models to weigh the importance of different parts of input sequences when generating outputs.",
+    "Machine learning models require careful evaluation to avoid overfitting. Cross-validation, holdout sets, and metrics like precision, recall, and F1 score help assess generalization performance.",
+    "Retrieval-augmented generation combines information retrieval with language models. Documents are embedded into vector spaces, retrieved based on query similarity, and used as context for generation.",
+    "Neural network training involves forward propagation, loss computation, and backpropagation. Optimizers like Adam and SGD adjust weights to minimize the loss function over training iterations.",
+    "Large language models exhibit emergent capabilities at scale, including few-shot learning, chain-of-thought reasoning, and instruction following. These properties were not explicitly trained for.",
+    "Embedding models convert text into dense vector representations that capture semantic meaning. Similar texts produce similar vectors, enabling efficient similarity search and clustering.",
+    "Autonomous agents face challenges including hallucination, lack of grounding, limited planning horizons, and difficulty with multi-step reasoning. Safety and alignment remain open research problems.",
+    "The attention mechanism computes query-key-value interactions to determine which parts of the input are most relevant. Multi-head attention allows the model to attend to different representation subspaces.",
+    "Fine-tuning adapts a pre-trained model to specific tasks using domain-specific data. Techniques like LoRA reduce the number of trainable parameters while maintaining performance.",
+  ];
+  return Array.from({ length: n }, (_, i) => templates[i % templates.length]!);
+}
+
+// ============================================================================
+// Helpers
+// ============================================================================
+
+function formatBytes(bytes: number): string {
+  if (bytes < 1024 * 1024) return `${(bytes / 1024).toFixed(1)} KB`;
+  if (bytes < 1024 * 1024 * 1024) return `${(bytes / (1024 * 1024)).toFixed(1)} MB`;
+  return `${(bytes / (1024 * 1024 * 1024)).toFixed(2)} GB`;
+}
+
+function getMemUsage(): { rss: number; heapUsed: number } {
+  const m = process.memoryUsage();
+  return { rss: m.rss, heapUsed: m.heapUsed };
+}
+
+function median(arr: number[]): number {
+  const sorted = [...arr].sort((a, b) => a - b);
+  const mid = Math.floor(sorted.length / 2);
+  return sorted.length % 2 !== 0 ? sorted[mid]! : (sorted[mid - 1]! + sorted[mid]!) / 2;
+}
+
+// ============================================================================
+// Benchmark runner
+// ============================================================================
+
+interface BenchResult {
+  parallelism: number;
+  contextSize: number;
+  flashAttention: boolean;
+  times: number[]; // ms per run
+  medianMs: number;
+  docsPerSec: number;
+  vramPerContext: number; // bytes
+  totalVram: number; // bytes
+  peakRss: number; // bytes
+}
+
+async function benchmarkConfig(
+  model: LlamaModel,
+  llama: Llama,
+  docs: string[],
+  parallelism: number,
+  flash: boolean,
+): Promise<BenchResult> {
+  // Measure VRAM before
+  const vramBefore = llama.gpu ? await llama.getVramState() : null;
+  const rssBefore = getMemUsage().rss;
+
+  // Create contexts. On CPU, split threads evenly across contexts.
+  const cpuThreads = !llama.gpu ? Math.floor(llama.cpuMathCores / parallelism) : 0;
+  const contexts = [];
+  for (let i = 0; i < parallelism; i++) {
+    try {
+      contexts.push(await model.createRankingContext({
+        contextSize: CONTEXT_SIZE,
+        flashAttention: flash,
+        ...(cpuThreads > 0 ? { threads: cpuThreads } : {}),
+      }));
+    } catch {
+      if (contexts.length === 0) {
+        // Try without flash
+        contexts.push(await model.createRankingContext({
+          contextSize: CONTEXT_SIZE,
+          ...(cpuThreads > 0 ? { threads: cpuThreads } : {}),
+        }));
+      }
+      break;
+    }
+  }
+  const actualParallelism = contexts.length;
+
+  // Measure VRAM after context creation
+  const vramAfter = llama.gpu ? await llama.getVramState() : null;
+  const vramUsed = vramBefore && vramAfter ? vramAfter.used - vramBefore.used : 0;
+  const vramPerCtx = actualParallelism > 0 ? vramUsed / actualParallelism : 0;
+
+  // Warm up
+  await contexts[0]!.rankAll(QUERY, docs.slice(0, 2));
+
+  // Benchmark iterations
+  const times: number[] = [];
+  let peakRss = getMemUsage().rss;
+
+  for (let iter = 0; iter < ITERATIONS; iter++) {
+    const chunkSize = Math.ceil(docs.length / actualParallelism);
+
+    const t0 = performance.now();
+    const allScores = await Promise.all(
+      Array.from({ length: actualParallelism }, (_, i) => {
+        const chunk = docs.slice(i * chunkSize, (i + 1) * chunkSize);
+        return chunk.length > 0 ? contexts[i]!.rankAll(QUERY, chunk) : Promise.resolve([]);
+      })
+    );
+    const elapsed = performance.now() - t0;
+    times.push(elapsed);
+
+    // Verify scores are valid
+    const flat = allScores.flat();
+    if (flat.some(s => s < 0 || s > 1 || isNaN(s))) {
+      throw new Error("Invalid scores detected");
+    }
+
+    const currentRss = getMemUsage().rss;
+    if (currentRss > peakRss) peakRss = currentRss;
+  }
+
+  // Cleanup
+  for (const ctx of contexts) await ctx.dispose();
+
+  const med = median(times);
+  return {
+    parallelism: actualParallelism,
+    contextSize: CONTEXT_SIZE,
+    flashAttention: flash,
+    times,
+    medianMs: med,
+    docsPerSec: (docs.length / med) * 1000,
+    vramPerContext: vramPerCtx,
+    totalVram: vramUsed,
+    peakRss,
+  };
+}
+
+// ============================================================================
+// Main
+// ============================================================================
+
+async function main() {
+  console.log("═══════════════════════════════════════════════════════════════");
+  console.log("  QMD Reranker Benchmark");
+  console.log("═══════════════════════════════════════════════════════════════\n");
+
+  // Detect GPU
+  const gpuTypes = await getLlamaGpuTypes();
+  const preferred = (["cuda", "metal", "vulkan"] as const).find(g => gpuTypes.includes(g));
+
+  let llama: Llama;
+  let gpuLabel: string;
+  if (preferred) {
+    try {
+      llama = await getLlama({ gpu: preferred, logLevel: LlamaLogLevel.error });
+      gpuLabel = `${preferred}`;
+    } catch {
+      llama = await getLlama({ gpu: false, logLevel: LlamaLogLevel.error });
+      gpuLabel = "cpu (gpu init failed)";
+    }
+  } else {
+    llama = await getLlama({ gpu: false, logLevel: LlamaLogLevel.error });
+    gpuLabel = "cpu";
+  }
+
+  // System info
+  const cpuInfo = cpus();
+  const cpuModel = cpuInfo[0]?.model || "unknown";
+  const cpuCount = cpuInfo.length;
+
+  console.log("System");
+  console.log(`  CPU:    ${cpuModel}`);
+  console.log(`  Cores:  ${cpuCount} (${llama.cpuMathCores} math)`);
+  console.log(`  Device: ${gpuLabel}`);
+
+  if (llama.gpu) {
+    const gpuNames = await llama.getGpuDeviceNames();
+    const counts = new Map<string, number>();
+    for (const name of gpuNames) counts.set(name, (counts.get(name) || 0) + 1);
+    const devStr = Array.from(counts.entries())
+      .map(([name, n]) => n > 1 ? `${n}× ${name}` : name).join(", ");
+    console.log(`  GPU:    ${devStr}`);
+    const vram = await llama.getVramState();
+    console.log(`  VRAM:   ${formatBytes(vram.total)} total, ${formatBytes(vram.free)} free`);
+  }
+
+  console.log(`  RAM:    ${formatBytes(getMemUsage().rss)} RSS at start`);
+
+  // Load model
+  console.log(`\nModel`);
+  console.log(`  URI:    ${RERANK_MODEL}`);
+  const modelPath = await resolveModelFile(RERANK_MODEL, MODEL_CACHE);
+  const vramPreModel = llama.gpu ? await llama.getVramState() : null;
+  const model = await llama.loadModel({ modelPath });
+  const vramPostModel = llama.gpu ? await llama.getVramState() : null;
+  const modelVram = vramPreModel && vramPostModel ? vramPostModel.used - vramPreModel.used : 0;
+  console.log(`  Params: ${model.trainContextSize} train ctx`);
+  if (modelVram > 0) console.log(`  VRAM:   ${formatBytes(modelVram)} (model weights)`);
+
+  // Generate test docs
+  const docs = generateDocs(DOC_COUNT);
+  console.log(`\nBenchmark`);
+  console.log(`  Documents: ${DOC_COUNT}`);
+  console.log(`  Ctx size:  ${CONTEXT_SIZE}`);
+  console.log(`  Iterations:${ITERATIONS}`);
+  console.log(`  Query:     "${QUERY.slice(0, 50)}..."`);
+
+  // Run benchmarks
+  const results: BenchResult[] = [];
+
+  for (const p of PARALLEL_CONFIGS) {
+    if (!llama.gpu && p > 1) {
+      // CPU: only test if we have enough cores (at least 4 per context)
+      if (llama.cpuMathCores < p * 4) {
+        console.log(`\n  [${p} ctx] skipped (need ${p * 4} cores, have ${llama.cpuMathCores})`);
+        continue;
+      }
+    }
+
+    // Test with flash attention
+    process.stdout.write(`\n  [${p} ctx, flash] running...`);
+    try {
+      const r = await benchmarkConfig(model, llama, docs, p, true);
+      results.push(r);
+      process.stdout.write(` ${r.medianMs.toFixed(0)}ms (${r.docsPerSec.toFixed(1)} docs/s)\n`);
+    } catch (e: any) {
+      process.stdout.write(` failed: ${e.message}\n`);
+      // Try without flash
+      process.stdout.write(`  [${p} ctx, no flash] running...`);
+      try {
+        const r = await benchmarkConfig(model, llama, docs, p, false);
+        results.push(r);
+        process.stdout.write(` ${r.medianMs.toFixed(0)}ms (${r.docsPerSec.toFixed(1)} docs/s)\n`);
+      } catch (e2: any) {
+        process.stdout.write(` failed: ${e2.message}\n`);
+      }
+    }
+  }
+
+  // Summary table
+  console.log("\n═══════════════════════════════════════════════════════════════");
+  console.log("  Results");
+  console.log("═══════════════════════════════════════════════════════════════\n");
+
+  const header = "  Ctx   Flash   Median   Docs/s   VRAM/ctx   Total VRAM   Peak RSS";
+  const sep = "  ───   ─────   ──────   ──────   ────────   ──────────   ────────";
+  console.log(header);
+  console.log(sep);
+
+  const baseline = results[0]?.medianMs ?? 1;
+  for (const r of results) {
+    const speedup = baseline / r.medianMs;
+    const speedupStr = r === results[0] ? "        " : `(${speedup.toFixed(1)}×)`;
+    console.log(
+      `  ${String(r.parallelism).padStart(3)}   ` +
+      `${r.flashAttention ? " yes " : " no  "}  ` +
+      `${r.medianMs.toFixed(0).padStart(5)}ms  ` +
+      `${r.docsPerSec.toFixed(1).padStart(6)}   ` +
+      `${formatBytes(r.vramPerContext).padStart(8)}   ` +
+      `${formatBytes(r.totalVram).padStart(10)}   ` +
+      `${formatBytes(r.peakRss).padStart(8)}  ` +
+      speedupStr
+    );
+  }
+
+  // Best config
+  if (results.length > 0) {
+    const best = results.reduce((a, b) => a.docsPerSec > b.docsPerSec ? a : b);
+    console.log(`\n  Best: ${best.parallelism} contexts, flash=${best.flashAttention}`);
+    console.log(`  ${best.medianMs.toFixed(0)}ms for ${DOC_COUNT} docs (${best.docsPerSec.toFixed(1)} docs/s)`);
+    if (best.totalVram > 0) console.log(`  ${formatBytes(best.totalVram)} VRAM`);
+  }
+
+  console.log("");
+  await model.dispose();
+  await llama.dispose();
+}
+
+main().catch(console.error);
package/src/db.ts
ADDED
@@ -0,0 +1,52 @@
+/**
+ * db.ts - Cross-runtime SQLite compatibility layer
+ *
+ * Provides a unified Database export that works under both Bun (bun:sqlite)
+ * and Node.js (better-sqlite3). The APIs are nearly identical — the main
+ * difference is the import path.
+ */
+
+export const isBun = typeof globalThis.Bun !== "undefined";
+
+let _Database: any;
+let _sqliteVecLoad: (db: any) => void;
+
+if (isBun) {
+  _Database = (await import("bun:sqlite")).Database;
+  const { getLoadablePath } = await import("sqlite-vec");
+  _sqliteVecLoad = (db: any) => db.loadExtension(getLoadablePath());
+} else {
+  _Database = (await import("better-sqlite3")).default;
+  const sqliteVec = await import("sqlite-vec");
+  _sqliteVecLoad = (db: any) => sqliteVec.load(db);
+}
+
+/**
+ * Open a SQLite database. Works with both bun:sqlite and better-sqlite3.
+ */
+export function openDatabase(path: string): Database {
+  return new _Database(path) as Database;
+}
+
+/**
+ * Common subset of the Database interface used throughout QMD.
+ */
+export interface Database {
+  exec(sql: string): void;
+  prepare(sql: string): Statement;
+  loadExtension(path: string): void;
+  close(): void;
+}
+
+export interface Statement {
+  run(...params: any[]): { changes: number; lastInsertRowid: number | bigint };
+  get(...params: any[]): any;
+  all(...params: any[]): any[];
+}
+
+/**
+ * Load the sqlite-vec extension into a database.
+ */
+export function loadSqliteVec(db: Database): void {
+  _sqliteVecLoad(db);
+}
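To see how this layer is meant to be consumed, here is a hypothetical caller; the `notes` table and queries are illustrative and not part of QMD's schema:

```ts
import { openDatabase, loadSqliteVec, isBun } from "./db.ts";

// Open a database and enable sqlite-vec; db.ts picks bun:sqlite's
// loadExtension() under Bun and sqlite-vec's load() under Node.
const db = openDatabase("/tmp/example.db");
loadSqliteVec(db);

// Illustrative schema and queries (not QMD's real tables).
db.exec("CREATE TABLE IF NOT EXISTS notes (id INTEGER PRIMARY KEY, body TEXT)");
db.prepare("INSERT INTO notes (body) VALUES (?)").run("hello");
const rows = db.prepare("SELECT id, body FROM notes").all();

console.log(`runtime=${isBun ? "bun" : "node"} rows=${rows.length}`);
db.close();
```

Because both drivers expose the same synchronous `exec`/`prepare`/`run`/`get`/`all` surface, the rest of QMD can hold a single `Database` type and stay runtime-agnostic.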