@tobilu/qmd 2.0.1 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -2,6 +2,102 @@
2
2
 
3
3
  ## [Unreleased]
4
4
 
5
+ ## [2.1.0] - 2026-04-05
6
+
7
+ Code files now chunk at function and class boundaries via tree-sitter,
8
+ clickable editor links land you at the right line from search results,
9
+ and per-collection model configuration means you can point different
10
+ collections at different embedding models. 25+ community PRs fix
11
+ embedding stability, BM25 accuracy, and cross-platform launcher issues.
12
+
13
+ ### Changes
14
+
15
+ - AST-aware chunking for code files via `web-tree-sitter`. Supported
16
+ languages: TypeScript/JavaScript, Python, Go, and Rust. Code files
17
+ are chunked at function, class, and import boundaries instead of
18
+ arbitrary text positions. Markdown and unknown file types are unchanged.
19
+ `--chunk-strategy <auto|regex>` flag on `qmd embed` and `qmd query`
20
+ (default `regex`). SDK: `chunkStrategy` option on `embed()` and
21
+ `search()`. `qmd status` shows grammar availability.
22
+ - `qmd bench <fixture.json>` command for search quality benchmarks.
23
+ Measures precision@k, recall, MRR, and F1 across BM25, vector, hybrid,
24
+ and full pipeline backends. Ships with an example fixture against
25
+ the eval-docs test collection. #470 (thanks @jmilinovich)
26
+ - `models:` section in `index.yml` lets you configure `embed`, `rerank`,
27
+ and `generate` model URIs per collection. Resolution order is
28
+ config > env var (`QMD_EMBED_MODEL`, `QMD_RERANK_MODEL`,
29
+ `QMD_GENERATE_MODEL`) > built-in default. #502
30
+ (thanks @JohnRichardEnders)
31
+ - CLI search output now emits clickable OSC 8 terminal hyperlinks when
32
+ stdout is a TTY. Links resolve `qmd://` paths to absolute filesystem
33
+ paths and open in editors via URI templates (default:
34
+ `vscode://file/{path}:{line}:{col}`). Configure with `QMD_EDITOR_URI`
35
+ or `editor_uri` in the YAML config. #508 (thanks @danmackinlay)
36
+ - `--no-rerank` flag skips the reranking step in `qmd query` — useful
37
+ when you want fast results or don't have a GPU. Also exposed as
38
+ `rerank: false` on the MCP `query` tool. #370 (thanks @mvanhorn),
39
+ #478 (thanks @zestyboy)
40
+ - ONNX conversion script for deploying embedding models via
41
+ Transformers.js. #399 (thanks @shreyaskarnik)
42
+ - GitHub Actions workflow to build the Nix flake on Linux and macOS.
43
+
44
+ ### Fixes
45
+
46
+ - Embedding: prevent `qmd embed` from running indefinitely when the
47
+ embedding loop stalls. #458 (thanks @ccc-fff)
48
+ - Embedding: truncate oversized text before embedding to prevent GGML
49
+ crash, and bound memory usage during batch embedding. #393
50
+ (thanks @lskun), #395 (thanks @ProgramCaiCai)
51
+ - Embedding: set explicit embed context size (default 2048, configurable
52
+ via `QMD_EMBED_CONTEXT_SIZE`) instead of using the model's full
53
+ window. #500
54
+ - Embedding: error on dimension mismatch instead of silently rebuilding
55
+ the vec0 table. #501
56
+ - Embedding: handle vec0 `OR REPLACE` limitation in `insertEmbedding`.
57
+ #456 (thanks @antonio-mello-ai)
58
+ - Embedding: fix model selection when multiple models are configured.
59
+ #494
60
+ - BM25: correct field weights to include all 3 FTS columns — title,
61
+ body, and path were not weighted correctly. #462 (thanks @goldsr09)
62
+ - BM25: handle hyphenated tokens in FTS5 lex queries so terms like
63
+ "real-time" match correctly. #463 (thanks @goldsr09)
64
+ - BM25: preserve underscores in search terms instead of stripping them.
65
+ #404
66
+ - BM25: use CTE in `searchFTS` to prevent query planner regression with
67
+ collection filter.
68
+ - Reranker: increase default context size 2048→4096 and make
69
+ configurable via `QMD_RERANK_CONTEXT_SIZE`. Fix template overhead
70
+ underestimate 200→512. #453 (thanks @builderjarvis)
71
+ - GPU: catch initialization failures and fall back to CPU instead of
72
+ crashing.
73
+ - MCP: read version from `package.json` instead of hardcoding. #431
74
+ - MCP: include collection name in status output. #416
75
+ - Multi-get: support brace expansion patterns in glob matching. #424
76
+ - Launcher: prioritize `package-lock.json` to prevent Bun false
77
+ positive. #385 (thanks @rymalia)
78
+ - Launcher: remove `$BUN_INSTALL` check that caused false Bun detection.
79
+ #362 (thanks @syedair)
80
+ - Launcher: skip Git Bash path detection on WSL. #371
81
+ (thanks @oysteinkrog)
82
+ - Model cache: respect `XDG_CACHE_HOME` for model cache directory. #457
83
+ (thanks @antonio-mello-ai)
84
+ - SQLite: add macOS Homebrew SQLite support for Bun and restore
85
+ actionable errors. #377 (thanks @serhii12)
86
+ - Pin zod to exact 4.2.1 to fix `tsc` build failure. #382
87
+ (thanks @rymalia)
88
+ - Preserve dots and original case in `handelize()` — filenames like
89
+ `MEMORY.md` no longer become `memory-md`. #475 (thanks @alexei-led)
90
+ - Include `line` in `--json` search output so editor integrations can
91
+ jump directly to `file:line`. #506 (thanks @danmackinlay)
92
+ - Nix: fix paths in flake and make Bun dependency a fixed-output
93
+ derivation so sandboxed Linux builds work offline. #479
94
+ (thanks @surma-dump)
95
+ - Sync stale `bun.lock` (`better-sqlite3` 11.x → 12.x). CI and release
96
+ script now use `--frozen-lockfile` to prevent recurrence. #386
97
+ (thanks @Mic92)
98
+ - Approve native build scripts in pnpm so `better-sqlite3` and
99
+ tree-sitter modules compile correctly. Update vitest ^3.0.0 → ^3.2.4.
100
+
5
101
  ## [2.0.1] - 2026-03-10
6
102
 
7
103
  ### Changes
package/README.md CHANGED
@@ -318,6 +318,7 @@ const result = await store.update({
318
318
  // Generate vector embeddings
319
319
  const embedResult = await store.embed({
320
320
  force: false, // true to re-embed everything
321
+ chunkStrategy: "auto", // "regex" (default) or "auto" (AST for code files)
321
322
  onProgress: ({ current, total, collection }) => {
322
323
  console.log(`Embedding ${current}/${total}`)
323
324
  },
@@ -564,8 +565,27 @@ qmd embed
564
565
 
565
566
  # Force re-embed everything
566
567
  qmd embed -f
568
+
569
+ # Enable AST-aware chunking for code files (TS, JS, Python, Go, Rust)
570
+ qmd embed --chunk-strategy auto
571
+
572
+ # Also works with query for consistent chunk selection
573
+ qmd query "auth flow" --chunk-strategy auto
567
574
  ```
568
575
 
576
+ **AST-aware chunking** (`--chunk-strategy auto`) uses tree-sitter to chunk code
577
+ files at function, class, and import boundaries instead of arbitrary text
578
+ positions. This produces higher-quality chunks and better search results for
579
+ codebases. Markdown and other file types always use regex-based chunking
580
+ regardless of strategy.
581
+
582
+ The default is `regex` (existing behavior). Use `--chunk-strategy auto` to
583
+ opt in. Run `qmd status` to verify which grammars are available.
584
+
585
+ > **Note:** Tree-sitter grammars are optional dependencies. If they are not
586
+ > installed, `--chunk-strategy auto` falls back to regex-only chunking
587
+ > automatically. Tested on both Node.js and Bun.
588
+
569
589
  ### Context Management
570
590
 
571
591
  Context adds descriptive metadata to collections and paths, helping search understand your content.
@@ -644,7 +664,13 @@ qmd get <file>[:line] # Get document, optionally starting at line
644
664
 
645
665
  ### Output Format
646
666
 
647
- Default output is colorized CLI format (respects `NO_COLOR` env):
667
+ Default output is colorized CLI format (respects `NO_COLOR` env).
668
+
669
+ When stdout is a TTY, result paths are emitted as clickable terminal hyperlinks (OSC 8). Clicking a path opens the file in your editor using an editor URI template.
670
+
671
+ When stdout is not a TTY (for example piped to another command or redirected to a file), QMD emits plain text paths with no escape sequences.
672
+
673
+ TTY example:
648
674
 
649
675
  ```
650
676
  docs/guide.md:42 #a1b2c3
@@ -666,6 +692,27 @@ Discussion about code quality and craftsmanship
666
692
  in the development process.
667
693
  ```
668
694
 
695
+ Configure the editor link target with `QMD_EDITOR_URI` (or `editor_uri` in config):
696
+
697
+ ```sh
698
+ # VS Code (default)
699
+ export QMD_EDITOR_URI="vscode://file/{path}:{line}:{col}"
700
+
701
+ # Cursor
702
+ export QMD_EDITOR_URI="cursor://file/{path}:{line}:{col}"
703
+
704
+ # Zed
705
+ export QMD_EDITOR_URI="zed://file/{path}:{line}:{col}"
706
+
707
+ # Sublime Text
708
+ export QMD_EDITOR_URI="subl://open?url=file://{path}&line={line}"
709
+ ```
710
+
711
+ Template placeholders:
712
+ - `{path}` absolute filesystem path (URI-encoded)
713
+ - `{line}` 1-based line number
714
+ - `{col}` or `{column}` 1-based column number
715
+
669
716
  - **Path**: Collection-relative path (e.g., `docs/guide.md`)
670
717
  - **Docid**: Short hash identifier (e.g., `#a1b2c3`) - use with `qmd get #a1b2c3`
671
718
  - **Title**: Extracted from document (first heading or filename)
@@ -813,6 +860,19 @@ The squared distance decay means a heading 200 tokens back (score ~30) still bea
813
860
 
814
861
  **Code Fence Protection:** Break points inside code blocks are ignored—code stays together. If a code block exceeds the chunk size, it's kept whole when possible.
815
862
 
863
+ **AST-Aware Chunking (Code Files):**
864
+
865
+ For supported code files, QMD also parses the source with [tree-sitter](https://tree-sitter.github.io/) and adds AST-derived break points that are merged with the regex scores above:
866
+
867
+ | AST Node | Score | Languages |
868
+ |----------|-------|-----------|
869
+ | Class / interface / struct / impl / trait | 100 | All |
870
+ | Function / method | 90 | All |
871
+ | Type alias / enum | 80 | All |
872
+ | Import / use declaration | 60 | All |
873
+
874
+ Supported for `.ts`, `.mts`, `.cts`, `.tsx`, `.js`, `.mjs`, `.cjs`, `.jsx`, `.py`, `.go`, and `.rs` files. Enable with `--chunk-strategy auto`. Markdown and other file types always use regex chunking.
875
+
816
876
  ### Query Flow (Hybrid)
817
877
 
818
878
  ```
package/bin/qmd CHANGED
@@ -15,8 +15,17 @@ done
15
15
  # to avoid native module ABI mismatches (e.g., better-sqlite3 compiled for bun vs node)
16
16
  DIR="$(cd -P "$(dirname "$SOURCE")/.." && pwd)"
17
17
 
18
- # Check if we were installed with bun (look for bun.lock or bun-lockb)
19
- if [ -f "$DIR/bun.lock" ] || [ -f "$DIR/bun.lockb" ] || [ -n "$BUN_INSTALL" ]; then
18
+ # Detect the package manager that installed dependencies by checking lockfiles.
19
+ # $BUN_INSTALL is intentionally NOT checked: it only indicates that bun exists
20
+ # on the system, not that it was used to install this package (see #361).
21
+ #
22
+ # package-lock.json takes priority: if it exists, npm installed the native
23
+ # modules for Node. The repo ships bun.lock, so without this check, source
24
+ # builds that use npm would be incorrectly routed to bun, causing ABI
25
+ # mismatches with better-sqlite3 / sqlite-vec (see #381).
26
+ if [ -f "$DIR/package-lock.json" ]; then
27
+ exec node "$DIR/dist/cli/qmd.js" "$@"
28
+ elif [ -f "$DIR/bun.lock" ] || [ -f "$DIR/bun.lockb" ]; then
20
29
  exec bun "$DIR/dist/cli/qmd.js" "$@"
21
30
  else
22
31
  exec node "$DIR/dist/cli/qmd.js" "$@"
package/dist/ast.d.ts ADDED
@@ -0,0 +1,64 @@
1
+ /**
2
+ * AST-aware chunking support via web-tree-sitter.
3
+ *
4
+ * Provides language detection, AST break point extraction for supported
5
+ * code file types, and a stub for future symbol extraction.
6
+ *
7
+ * All functions degrade gracefully: parse failures or unsupported languages
8
+ * return empty arrays, falling back to regex-only chunking.
9
+ *
10
+ * ## Dependency Note
11
+ *
12
+ * Grammar packages (tree-sitter-typescript, etc.) are listed as
13
+ * optionalDependencies with pinned versions. They ship native prebuilds
14
+ * and source files (~72 MB total) but QMD only uses the .wasm files
15
+ * (~5 MB). If install size becomes a concern, the .wasm files can be
16
+ * bundled directly in the repo (e.g. assets/grammars/) and resolved
17
+ * via import.meta.url instead of require.resolve(), eliminating the
18
+ * grammar packages entirely.
19
+ */
20
+ import type { BreakPoint } from "./store.js";
21
+ export type SupportedLanguage = "typescript" | "tsx" | "javascript" | "python" | "go" | "rust";
22
+ /**
23
+ * Detect language from file path extension.
24
+ * Returns null for unsupported or unknown extensions (including .md).
25
+ */
26
+ export declare function detectLanguage(filepath: string): SupportedLanguage | null;
27
+ /**
28
+ * Parse a source file and return break points at AST node boundaries.
29
+ *
30
+ * Returns an empty array for unsupported languages, parse failures,
31
+ * or grammar loading failures. Never throws.
32
+ *
33
+ * @param content - The file content to parse.
34
+ * @param filepath - The file path (used for language detection).
35
+ * @returns Array of BreakPoint objects suitable for merging with regex break points.
36
+ */
37
+ export declare function getASTBreakPoints(content: string, filepath: string): Promise<BreakPoint[]>;
38
+ /**
39
+ * Check which tree-sitter grammars are available.
40
+ * Returns a status object for each supported language.
41
+ */
42
+ export declare function getASTStatus(): Promise<{
43
+ available: boolean;
44
+ languages: {
45
+ language: SupportedLanguage;
46
+ available: boolean;
47
+ error?: string;
48
+ }[];
49
+ }>;
50
+ /**
51
+ * Metadata about a code symbol within a chunk.
52
+ * Reserved for Phase 2 — extractSymbols() always returns an empty array in Phase 1, so no SymbolInfo values are produced yet.
53
+ */
54
+ export interface SymbolInfo {
55
+ name: string;
56
+ kind: string;
57
+ signature?: string;
58
+ line: number;
59
+ }
60
+ /**
61
+ * Extract symbol metadata for code within a byte range.
62
+ * Stubbed for Phase 2 — returns empty array.
63
+ */
64
+ export declare function extractSymbols(_content: string, _language: string, _startPos: number, _endPos: number): SymbolInfo[];
package/dist/ast.js ADDED
@@ -0,0 +1,324 @@
1
+ /**
2
+ * AST-aware chunking support via web-tree-sitter.
3
+ *
4
+ * Provides language detection, AST break point extraction for supported
5
+ * code file types, and a stub for future symbol extraction.
6
+ *
7
+ * All functions degrade gracefully: parse failures or unsupported languages
8
+ * return empty arrays, falling back to regex-only chunking.
9
+ *
10
+ * ## Dependency Note
11
+ *
12
+ * Grammar packages (tree-sitter-typescript, etc.) are listed as
13
+ * optionalDependencies with pinned versions. They ship native prebuilds
14
+ * and source files (~72 MB total) but QMD only uses the .wasm files
15
+ * (~5 MB). If install size becomes a concern, the .wasm files can be
16
+ * bundled directly in the repo (e.g. assets/grammars/) and resolved
17
+ * via import.meta.url instead of require.resolve(), eliminating the
18
+ * grammar packages entirely.
19
+ */
20
+ import { createRequire } from "node:module";
21
+ import { extname } from "node:path";
22
/**
 * Lookup table keyed by lowercase file extension (leading dot included).
 * Each value is the language identifier used to select a grammar and query.
 * Note that `.jsx` is deliberately routed to the `tsx` entry.
 */
const EXTENSION_MAP = {
    ".ts": "typescript",
    ".tsx": "tsx",
    ".js": "javascript",
    ".jsx": "tsx",
    ".mts": "typescript",
    ".cts": "typescript",
    ".mjs": "javascript",
    ".cjs": "javascript",
    ".py": "python",
    ".go": "go",
    ".rs": "rust",
};
/**
 * Map a file path to one of the supported languages using its extension.
 *
 * The extension is compared case-insensitively. Paths whose extension is
 * missing or not listed in EXTENSION_MAP (for example `.md`) yield null.
 *
 * @param filepath - Path or file name to inspect.
 * @returns The language identifier, or null when the extension is not mapped.
 */
export function detectLanguage(filepath) {
    const suffix = extname(filepath).toLowerCase();
    const language = EXTENSION_MAP[suffix];
    return language === undefined ? null : language;
}
43
+ // =============================================================================
44
+ // Grammar Resolution
45
+ // =============================================================================
46
+ /**
47
+ * Maps language to the npm package and wasm filename for the grammar.
48
+ */
49
+ const GRAMMAR_MAP = {
50
+ typescript: { pkg: "tree-sitter-typescript", wasm: "tree-sitter-typescript.wasm" },
51
+ tsx: { pkg: "tree-sitter-typescript", wasm: "tree-sitter-tsx.wasm" },
52
+ javascript: { pkg: "tree-sitter-typescript", wasm: "tree-sitter-typescript.wasm" },
53
+ python: { pkg: "tree-sitter-python", wasm: "tree-sitter-python.wasm" },
54
+ go: { pkg: "tree-sitter-go", wasm: "tree-sitter-go.wasm" },
55
+ rust: { pkg: "tree-sitter-rust", wasm: "tree-sitter-rust.wasm" },
56
+ };
57
+ // =============================================================================
58
+ // Per-Language Query Definitions
59
+ // =============================================================================
60
+ /**
61
+ * Tree-sitter S-expression queries for each language.
62
+ * Each capture name maps to a break point score via SCORE_MAP.
63
+ *
64
+ * For TypeScript/JavaScript, we match export_statement wrappers to get the
65
+ * correct start position (before `export`), plus bare declarations for
66
+ * non-exported code.
67
+ */
68
+ const LANGUAGE_QUERIES = {
69
+ typescript: `
70
+ (export_statement) @export
71
+ (class_declaration) @class
72
+ (function_declaration) @func
73
+ (method_definition) @method
74
+ (interface_declaration) @iface
75
+ (type_alias_declaration) @type
76
+ (enum_declaration) @enum
77
+ (import_statement) @import
78
+ (lexical_declaration (variable_declarator value: (arrow_function))) @func
79
+ (lexical_declaration (variable_declarator value: (function_expression))) @func
80
+ `,
81
+ tsx: `
82
+ (export_statement) @export
83
+ (class_declaration) @class
84
+ (function_declaration) @func
85
+ (method_definition) @method
86
+ (interface_declaration) @iface
87
+ (type_alias_declaration) @type
88
+ (enum_declaration) @enum
89
+ (import_statement) @import
90
+ (lexical_declaration (variable_declarator value: (arrow_function))) @func
91
+ (lexical_declaration (variable_declarator value: (function_expression))) @func
92
+ `,
93
+ javascript: `
94
+ (export_statement) @export
95
+ (class_declaration) @class
96
+ (function_declaration) @func
97
+ (method_definition) @method
98
+ (import_statement) @import
99
+ (lexical_declaration (variable_declarator value: (arrow_function))) @func
100
+ (lexical_declaration (variable_declarator value: (function_expression))) @func
101
+ `,
102
+ python: `
103
+ (class_definition) @class
104
+ (function_definition) @func
105
+ (decorated_definition) @decorated
106
+ (import_statement) @import
107
+ (import_from_statement) @import
108
+ `,
109
+ go: `
110
+ (type_declaration) @type
111
+ (function_declaration) @func
112
+ (method_declaration) @method
113
+ (import_declaration) @import
114
+ `,
115
+ rust: `
116
+ (struct_item) @struct
117
+ (impl_item) @impl
118
+ (function_item) @func
119
+ (trait_item) @trait
120
+ (enum_item) @enum
121
+ (use_declaration) @import
122
+ (type_item) @type
123
+ (mod_item) @mod
124
+ `,
125
+ };
126
+ /**
127
+ * Score mapping from capture names to break point scores.
128
+ * Aligned with the markdown BREAK_PATTERNS scale (h1=100, h2=90, etc.)
129
+ * so findBestCutoff() decay works unchanged.
130
+ */
131
+ const SCORE_MAP = {
132
+ class: 100,
133
+ iface: 100,
134
+ struct: 100,
135
+ trait: 100,
136
+ impl: 100,
137
+ mod: 100,
138
+ export: 90,
139
+ func: 90,
140
+ method: 90,
141
+ decorated: 90,
142
+ type: 80,
143
+ enum: 80,
144
+ import: 60,
145
+ };
146
+ // =============================================================================
147
+ // Parser Caching & Initialization
148
+ // =============================================================================
149
+ let ParserClass = null;
150
+ let LanguageClass = null;
151
+ let QueryClass = null;
152
+ let initPromise = null;
153
+ /** Languages that have already failed to load — warn only once per process. */
154
+ const failedLanguages = new Set();
155
+ /** Cached grammar load promises. */
156
+ const grammarCache = new Map();
157
+ /** Cached compiled queries per language. */
158
+ const queryCache = new Map();
159
/**
 * Load web-tree-sitter and run its one-time initialization.
 *
 * The first call starts the work and stores the resulting promise in
 * `initPromise`; every later call returns that same promise, so the module
 * is imported and `Parser.init()` is invoked at most once per process.
 * A rejected promise stays cached, so a failed initialization is reported
 * to every subsequent caller as well.
 */
async function ensureInit() {
    if (initPromise) {
        return initPromise;
    }
    const boot = async () => {
        const treeSitter = await import("web-tree-sitter");
        ParserClass = treeSitter.Parser;
        LanguageClass = treeSitter.Language;
        QueryClass = treeSitter.Query;
        await ParserClass.init();
    };
    initPromise = boot();
    return initPromise;
}
174
/**
 * Locate the grammar .wasm file for a language on disk.
 *
 * Looks up the package name and wasm file name in GRAMMAR_MAP and resolves
 * `<pkg>/<wasm>` with a `require` anchored at this module's URL. Throws if
 * the specifier cannot be resolved.
 *
 * @param language - A key of GRAMMAR_MAP.
 * @returns The resolved path of the grammar .wasm file.
 */
function resolveGrammarPath(language) {
    const entry = GRAMMAR_MAP[language];
    const specifier = [entry.pkg, entry.wasm].join("/");
    const localRequire = createRequire(import.meta.url);
    return localRequire.resolve(specifier);
}
183
/**
 * Load and cache the tree-sitter grammar for the given language.
 *
 * Loads are cached per wasm file name, so languages that share a wasm file
 * (see GRAMMAR_MAP) share a single load. Callers must have awaited
 * ensureInit() first, because LanguageClass is only set there.
 *
 * On failure the language is recorded in `failedLanguages`, the cached
 * promise is evicted, and a warning is printed. The warning is emitted at
 * most once per language, even when several callers were awaiting the same
 * failing load concurrently.
 *
 * @param language - A key of GRAMMAR_MAP.
 * @returns The loaded grammar, or null if it could not be loaded.
 */
async function loadGrammar(language) {
    if (failedLanguages.has(language))
        return null;
    const wasmKey = GRAMMAR_MAP[language].wasm;
    let pending = grammarCache.get(wasmKey);
    if (!pending) {
        // Wrapped in an async IIFE so a synchronous throw from
        // resolveGrammarPath() becomes a rejection handled below.
        pending = (async () => LanguageClass.load(resolveGrammarPath(language)))();
        grammarCache.set(wasmKey, pending);
    }
    try {
        return await pending;
    }
    catch (err) {
        // Evict only the promise we awaited; another caller may already have
        // replaced it with a fresh attempt for the same wasm file.
        if (grammarCache.get(wasmKey) === pending) {
            grammarCache.delete(wasmKey);
        }
        // Concurrent awaiters of the same rejected promise all land here;
        // only the first one for this language should log.
        if (!failedLanguages.has(language)) {
            failedLanguages.add(language);
            console.warn(`[qmd] Failed to load tree-sitter grammar for ${language}: ${err}`);
        }
        return null;
    }
}
207
/**
 * Return the compiled tree-sitter query for a language, compiling it on
 * first use.
 *
 * Compiled queries are stored in `queryCache` keyed by language. If
 * compilation throws, nothing is cached and the error propagates.
 *
 * @param language - A key of LANGUAGE_QUERIES.
 * @param grammar - The loaded grammar the query is compiled against.
 * @returns The cached or newly compiled query.
 */
function getQuery(language, grammar) {
    if (queryCache.has(language)) {
        return queryCache.get(language);
    }
    const compiled = new QueryClass(grammar, LANGUAGE_QUERIES[language]);
    queryCache.set(language, compiled);
    return compiled;
}
218
+ // =============================================================================
219
+ // AST Break Point Extraction
220
+ // =============================================================================
221
/**
 * Parse a source file and return break points at AST node boundaries.
 *
 * Returns an empty array for unsupported languages, parse failures,
 * or grammar loading failures. Never throws: any error is logged with
 * console.warn and an empty array is returned so the caller can fall back
 * to regex-only chunking.
 *
 * The parser and syntax tree are released in a `finally` block, so they are
 * freed even when parsing, query compilation, or capture extraction throws.
 *
 * @param content - The file content to parse.
 * @param filepath - The file path (used for language detection).
 * @returns Array of BreakPoint objects (`{ pos, score, type }`) sorted by
 *          ascending `pos`, suitable for merging with regex break points.
 */
export async function getASTBreakPoints(content, filepath) {
    const language = detectLanguage(filepath);
    if (!language)
        return [];
    let parser = null;
    let tree = null;
    try {
        await ensureInit();
        const grammar = await loadGrammar(language);
        if (!grammar)
            return [];
        parser = new ParserClass();
        parser.setLanguage(grammar);
        tree = parser.parse(content);
        if (!tree)
            return [];
        const query = getQuery(language, grammar);
        // Deduplicate by start index: when several captures begin at the same
        // position, keep only the highest-scoring one.
        // NOTE(review): captures that begin at different offsets are all kept.
        // A wrapper such as export_statement and the declaration it wraps
        // presumably start at different offsets, so both survive as separate
        // break points. The earlier comment implied the outermost position
        // should win — confirm whether inner captures should be folded into
        // their wrapper.
        const bestByPos = new Map();
        for (const cap of query.captures(tree.rootNode)) {
            const pos = cap.node.startIndex;
            // Capture names without a SCORE_MAP entry get a low default score.
            const score = SCORE_MAP[cap.name] ?? 20;
            const existing = bestByPos.get(pos);
            if (!existing || score > existing.score) {
                bestByPos.set(pos, { pos, score, type: `ast:${cap.name}` });
            }
        }
        return Array.from(bestByPos.values()).sort((a, b) => a.pos - b.pos);
    }
    catch (err) {
        console.warn(`[qmd] AST parse failed for ${filepath}, falling back to regex: ${err instanceof Error ? err.message : err}`);
        return [];
    }
    finally {
        // Best-effort cleanup. Failures are ignored on purpose so that this
        // function keeps its "never throws" contract.
        try {
            tree?.delete();
        }
        catch {
            // ignore
        }
        try {
            parser?.delete();
        }
        catch {
            // ignore
        }
    }
}
271
+ // =============================================================================
272
+ // Health / Status
273
+ // =============================================================================
274
/**
 * Report which tree-sitter grammars can be used.
 *
 * If web-tree-sitter itself fails to initialize, every language in
 * GRAMMAR_MAP is reported as unavailable with the init error message.
 * Otherwise each language is probed in turn: its grammar is loaded and its
 * query is compiled, and any failure is recorded in that language's entry.
 *
 * @returns `available` is true when at least one language is usable;
 *          `languages` holds one `{ language, available, error? }` entry per
 *          GRAMMAR_MAP key.
 */
export async function getASTStatus() {
    const supported = Object.keys(GRAMMAR_MAP);
    try {
        await ensureInit();
    }
    catch (err) {
        const allUnavailable = supported.map(language => ({
            language,
            available: false,
            error: `web-tree-sitter init failed: ${err instanceof Error ? err.message : err}`,
        }));
        return { available: false, languages: allUnavailable };
    }
    const languages = [];
    for (const language of supported) {
        let entry;
        try {
            const grammar = await loadGrammar(language);
            if (!grammar) {
                entry = { language, available: false, error: "grammar failed to load" };
            }
            else {
                // Compiling the query confirms it is valid for this grammar.
                getQuery(language, grammar);
                entry = { language, available: true };
            }
        }
        catch (err) {
            const message = err instanceof Error ? err.message : String(err);
            entry = { language, available: false, error: message };
        }
        languages.push(entry);
    }
    const available = languages.some(entry => entry.available);
    return { available, languages };
}
318
/**
 * Placeholder for Phase 2 symbol extraction.
 *
 * All four arguments are currently ignored and a new empty array is
 * returned on every call.
 *
 * @param _content - Source text (unused).
 * @param _language - Language identifier (unused).
 * @param _startPos - Start of the range of interest (unused).
 * @param _endPos - End of the range of interest (unused).
 * @returns An empty array.
 */
export function extractSymbols(_content, _language, _startPos, _endPos) {
    const noSymbols = [];
    return noSymbols;
}
@@ -0,0 +1,21 @@
1
+ /**
2
+ * QMD Benchmark Harness
3
+ *
4
+ * Runs queries from a fixture file against multiple search backends
5
+ * and measures precision@k, recall, MRR, F1, and latency.
6
+ *
7
+ * Usage:
8
+ * qmd bench <fixture.json> [--json] [--collection <name>]
9
+ *
10
+ * Backends tested:
11
+ * - bm25: BM25 keyword search (searchLex)
12
+ * - vector: Vector similarity search (searchVector)
13
+ * - hybrid: BM25 + vector RRF fusion without reranking
14
+ * - full: Full hybrid pipeline with LLM reranking
15
+ */
16
+ import type { BenchmarkResult } from "./types.js";
17
+ export declare function runBenchmark(fixturePath: string, options?: {
18
+ json?: boolean;
19
+ collection?: string;
20
+ backends?: string[];
21
+ }): Promise<BenchmarkResult>;