@tobilu/qmd 2.0.1 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +96 -0
- package/README.md +61 -1
- package/bin/qmd +11 -2
- package/dist/ast.d.ts +64 -0
- package/dist/ast.js +324 -0
- package/dist/bench/bench.d.ts +21 -0
- package/dist/bench/bench.js +185 -0
- package/dist/bench/score.d.ts +26 -0
- package/dist/bench/score.js +67 -0
- package/dist/bench/types.d.ts +67 -0
- package/dist/bench/types.js +8 -0
- package/dist/cli/formatter.js +5 -1
- package/dist/cli/qmd.d.ts +2 -1
- package/dist/cli/qmd.js +171 -9
- package/dist/collections.d.ts +11 -0
- package/dist/db.d.ts +8 -0
- package/dist/db.js +44 -3
- package/dist/index.d.ts +7 -1
- package/dist/index.js +13 -3
- package/dist/llm.d.ts +12 -3
- package/dist/llm.js +94 -24
- package/dist/mcp/server.js +29 -5
- package/dist/store.d.ts +56 -6
- package/dist/store.js +401 -138
- package/package.json +34 -17
package/CHANGELOG.md
CHANGED
|
@@ -2,6 +2,102 @@
|
|
|
2
2
|
|
|
3
3
|
## [Unreleased]
|
|
4
4
|
|
|
5
|
+
## [2.1.0] - 2026-04-05
|
|
6
|
+
|
|
7
|
+
Code files now chunk at function and class boundaries via tree-sitter,
|
|
8
|
+
clickable editor links land you at the right line from search results,
|
|
9
|
+
and per-collection model configuration means you can point different
|
|
10
|
+
collections at different embedding models. 25+ community PRs fix
|
|
11
|
+
embedding stability, BM25 accuracy, and cross-platform launcher issues.
|
|
12
|
+
|
|
13
|
+
### Changes
|
|
14
|
+
|
|
15
|
+
- AST-aware chunking for code files via `web-tree-sitter`. Supported
|
|
16
|
+
languages: TypeScript/JavaScript, Python, Go, and Rust. Code files
|
|
17
|
+
are chunked at function, class, and import boundaries instead of
|
|
18
|
+
arbitrary text positions. Markdown and unknown file types are unchanged.
|
|
19
|
+
`--chunk-strategy <auto|regex>` flag on `qmd embed` and `qmd query`
|
|
20
|
+
(default `regex`). SDK: `chunkStrategy` option on `embed()` and
|
|
21
|
+
`search()`. `qmd status` shows grammar availability.
|
|
22
|
+
- `qmd bench <fixture.json>` command for search quality benchmarks.
|
|
23
|
+
Measures precision@k, recall, MRR, and F1 across BM25, vector, hybrid,
|
|
24
|
+
and full pipeline backends. Ships with an example fixture against
|
|
25
|
+
the eval-docs test collection. #470 (thanks @jmilinovich)
|
|
26
|
+
- `models:` section in `index.yml` lets you configure `embed`, `rerank`,
|
|
27
|
+
and `generate` model URIs per collection. Resolution order is
|
|
28
|
+
config > env var (`QMD_EMBED_MODEL`, `QMD_RERANK_MODEL`,
|
|
29
|
+
`QMD_GENERATE_MODEL`) > built-in default. #502
|
|
30
|
+
(thanks @JohnRichardEnders)
|
|
31
|
+
- CLI search output now emits clickable OSC 8 terminal hyperlinks when
|
|
32
|
+
stdout is a TTY. Links resolve `qmd://` paths to absolute filesystem
|
|
33
|
+
paths and open in editors via URI templates (default:
|
|
34
|
+
`vscode://file/{path}:{line}:{col}`). Configure with `QMD_EDITOR_URI`
|
|
35
|
+
or `editor_uri` in the YAML config. #508 (thanks @danmackinlay)
|
|
36
|
+
- `--no-rerank` flag skips the reranking step in `qmd query` — useful
|
|
37
|
+
when you want fast results or don't have a GPU. Also exposed as
|
|
38
|
+
`rerank: false` on the MCP `query` tool. #370 (thanks @mvanhorn),
|
|
39
|
+
#478 (thanks @zestyboy)
|
|
40
|
+
- ONNX conversion script for deploying embedding models via
|
|
41
|
+
Transformers.js. #399 (thanks @shreyaskarnik)
|
|
42
|
+
- GitHub Actions workflow to build the Nix flake on Linux and macOS.
|
|
43
|
+
|
|
44
|
+
### Fixes
|
|
45
|
+
|
|
46
|
+
- Embedding: prevent `qmd embed` from running indefinitely when the
|
|
47
|
+
embedding loop stalls. #458 (thanks @ccc-fff)
|
|
48
|
+
- Embedding: truncate oversized text before embedding to prevent GGML
|
|
49
|
+
crash, and bound memory usage during batch embedding. #393
|
|
50
|
+
(thanks @lskun), #395 (thanks @ProgramCaiCai)
|
|
51
|
+
- Embedding: set explicit embed context size (default 2048, configurable
|
|
52
|
+
via `QMD_EMBED_CONTEXT_SIZE`) instead of using the model's full
|
|
53
|
+
window. #500
|
|
54
|
+
- Embedding: error on dimension mismatch instead of silently rebuilding
|
|
55
|
+
the vec0 table. #501
|
|
56
|
+
- Embedding: handle vec0 `OR REPLACE` limitation in `insertEmbedding`.
|
|
57
|
+
#456 (thanks @antonio-mello-ai)
|
|
58
|
+
- Embedding: fix model selection when multiple models are configured.
|
|
59
|
+
#494
|
|
60
|
+
- BM25: correct field weights to include all 3 FTS columns — title,
|
|
61
|
+
body, and path were not weighted correctly. #462 (thanks @goldsr09)
|
|
62
|
+
- BM25: handle hyphenated tokens in FTS5 lex queries so terms like
|
|
63
|
+
"real-time" match correctly. #463 (thanks @goldsr09)
|
|
64
|
+
- BM25: preserve underscores in search terms instead of stripping them.
|
|
65
|
+
#404
|
|
66
|
+
- BM25: use CTE in `searchFTS` to prevent query planner regression with
|
|
67
|
+
collection filter.
|
|
68
|
+
- Reranker: increase default context size 2048→4096 and make
|
|
69
|
+
configurable via `QMD_RERANK_CONTEXT_SIZE`. Fix template overhead
|
|
70
|
+
underestimate 200→512. #453 (thanks @builderjarvis)
|
|
71
|
+
- GPU: catch initialization failures and fall back to CPU instead of
|
|
72
|
+
crashing.
|
|
73
|
+
- MCP: read version from `package.json` instead of hardcoding. #431
|
|
74
|
+
- MCP: include collection name in status output. #416
|
|
75
|
+
- Multi-get: support brace expansion patterns in glob matching. #424
|
|
76
|
+
- Launcher: prioritize `package-lock.json` to prevent Bun false
|
|
77
|
+
positive. #385 (thanks @rymalia)
|
|
78
|
+
- Launcher: remove `$BUN_INSTALL` check that caused false Bun detection.
|
|
79
|
+
#362 (thanks @syedair)
|
|
80
|
+
- Launcher: skip Git Bash path detection on WSL. #371
|
|
81
|
+
(thanks @oysteinkrog)
|
|
82
|
+
- Model cache: respect `XDG_CACHE_HOME` for model cache directory. #457
|
|
83
|
+
(thanks @antonio-mello-ai)
|
|
84
|
+
- SQLite: add macOS Homebrew SQLite support for Bun and restore
|
|
85
|
+
actionable errors. #377 (thanks @serhii12)
|
|
86
|
+
- Pin zod to exact 4.2.1 to fix `tsc` build failure. #382
|
|
87
|
+
(thanks @rymalia)
|
|
88
|
+
- Preserve dots and original case in `handelize()` — filenames like
|
|
89
|
+
`MEMORY.md` no longer become `memory-md`. #475 (thanks @alexei-led)
|
|
90
|
+
- Include `line` in `--json` search output so editor integrations can
|
|
91
|
+
jump directly to `file:line`. #506 (thanks @danmackinlay)
|
|
92
|
+
- Nix: fix paths in flake and make Bun dependency a fixed-output
|
|
93
|
+
derivation so sandboxed Linux builds work offline. #479
|
|
94
|
+
(thanks @surma-dump)
|
|
95
|
+
- Sync stale `bun.lock` (`better-sqlite3` 11.x → 12.x). CI and release
|
|
96
|
+
script now use `--frozen-lockfile` to prevent recurrence. #386
|
|
97
|
+
(thanks @Mic92)
|
|
98
|
+
- Approve native build scripts in pnpm so `better-sqlite3` and
|
|
99
|
+
tree-sitter modules compile correctly. Update vitest ^3.0.0 → ^3.2.4.
|
|
100
|
+
|
|
5
101
|
## [2.0.1] - 2026-03-10
|
|
6
102
|
|
|
7
103
|
### Changes
|
package/README.md
CHANGED
|
@@ -318,6 +318,7 @@ const result = await store.update({
|
|
|
318
318
|
// Generate vector embeddings
|
|
319
319
|
const embedResult = await store.embed({
|
|
320
320
|
force: false, // true to re-embed everything
|
|
321
|
+
chunkStrategy: "auto", // "regex" (default) or "auto" (AST for code files)
|
|
321
322
|
onProgress: ({ current, total, collection }) => {
|
|
322
323
|
console.log(`Embedding ${current}/${total}`)
|
|
323
324
|
},
|
|
@@ -564,8 +565,27 @@ qmd embed
|
|
|
564
565
|
|
|
565
566
|
# Force re-embed everything
|
|
566
567
|
qmd embed -f
|
|
568
|
+
|
|
569
|
+
# Enable AST-aware chunking for code files (TS, JS, Python, Go, Rust)
|
|
570
|
+
qmd embed --chunk-strategy auto
|
|
571
|
+
|
|
572
|
+
# Also works with query for consistent chunk selection
|
|
573
|
+
qmd query "auth flow" --chunk-strategy auto
|
|
567
574
|
```
|
|
568
575
|
|
|
576
|
+
**AST-aware chunking** (`--chunk-strategy auto`) uses tree-sitter to chunk code
|
|
577
|
+
files at function, class, and import boundaries instead of arbitrary text
|
|
578
|
+
positions. This produces higher-quality chunks and better search results for
|
|
579
|
+
codebases. Markdown and other file types always use regex-based chunking
|
|
580
|
+
regardless of strategy.
|
|
581
|
+
|
|
582
|
+
The default is `regex` (existing behavior). Use `--chunk-strategy auto` to
|
|
583
|
+
opt in. Run `qmd status` to verify which grammars are available.
|
|
584
|
+
|
|
585
|
+
> **Note:** Tree-sitter grammars are optional dependencies. If they are not
|
|
586
|
+
> installed, `--chunk-strategy auto` falls back to regex-only chunking
|
|
587
|
+
> automatically. Tested on both Node.js and Bun.
|
|
588
|
+
|
|
569
589
|
### Context Management
|
|
570
590
|
|
|
571
591
|
Context adds descriptive metadata to collections and paths, helping search understand your content.
|
|
@@ -644,7 +664,13 @@ qmd get <file>[:line] # Get document, optionally starting at line
|
|
|
644
664
|
|
|
645
665
|
### Output Format
|
|
646
666
|
|
|
647
|
-
Default output is colorized CLI format (respects `NO_COLOR` env)
|
|
667
|
+
Default output is colorized CLI format (respects `NO_COLOR` env).
|
|
668
|
+
|
|
669
|
+
When stdout is a TTY, result paths are emitted as clickable terminal hyperlinks (OSC 8). Clicking a path opens the file in your editor using an editor URI template.
|
|
670
|
+
|
|
671
|
+
When stdout is not a TTY (for example piped to another command or redirected to a file), QMD emits plain text paths with no escape sequences.
|
|
672
|
+
|
|
673
|
+
TTY example:
|
|
648
674
|
|
|
649
675
|
```
|
|
650
676
|
docs/guide.md:42 #a1b2c3
|
|
@@ -666,6 +692,27 @@ Discussion about code quality and craftsmanship
|
|
|
666
692
|
in the development process.
|
|
667
693
|
```
|
|
668
694
|
|
|
695
|
+
Configure the editor link target with `QMD_EDITOR_URI` (or `editor_uri` in config):
|
|
696
|
+
|
|
697
|
+
```sh
|
|
698
|
+
# VS Code (default)
|
|
699
|
+
export QMD_EDITOR_URI="vscode://file/{path}:{line}:{col}"
|
|
700
|
+
|
|
701
|
+
# Cursor
|
|
702
|
+
export QMD_EDITOR_URI="cursor://file/{path}:{line}:{col}"
|
|
703
|
+
|
|
704
|
+
# Zed
|
|
705
|
+
export QMD_EDITOR_URI="zed://file/{path}:{line}:{col}"
|
|
706
|
+
|
|
707
|
+
# Sublime Text
|
|
708
|
+
export QMD_EDITOR_URI="subl://open?url=file://{path}&line={line}"
|
|
709
|
+
```
|
|
710
|
+
|
|
711
|
+
Template placeholders:
|
|
712
|
+
- `{path}` absolute filesystem path (URI-encoded)
|
|
713
|
+
- `{line}` 1-based line number
|
|
714
|
+
- `{col}` or `{column}` 1-based column number
|
|
715
|
+
|
|
669
716
|
- **Path**: Collection-relative path (e.g., `docs/guide.md`)
|
|
670
717
|
- **Docid**: Short hash identifier (e.g., `#a1b2c3`) - use with `qmd get #a1b2c3`
|
|
671
718
|
- **Title**: Extracted from document (first heading or filename)
|
|
@@ -813,6 +860,19 @@ The squared distance decay means a heading 200 tokens back (score ~30) still bea
|
|
|
813
860
|
|
|
814
861
|
**Code Fence Protection:** Break points inside code blocks are ignored—code stays together. If a code block exceeds the chunk size, it's kept whole when possible.
|
|
815
862
|
|
|
863
|
+
**AST-Aware Chunking (Code Files):**
|
|
864
|
+
|
|
865
|
+
For supported code files, QMD also parses the source with [tree-sitter](https://tree-sitter.github.io/) and adds AST-derived break points that are merged with the regex scores above:
|
|
866
|
+
|
|
867
|
+
| AST Node | Score | Languages |
|
|
868
|
+
|----------|-------|-----------|
|
|
869
|
+
| Class / interface / struct / impl / trait / module | 100 | All |
|
|
870
|
+
| Function / method / export statement / decorated definition | 90 | All |
|
|
871
|
+
| Type alias / enum | 80 | All |
|
|
872
|
+
| Import / use declaration | 60 | All |
|
|
873
|
+
|
|
874
|
+
Supported for `.ts`, `.tsx`, `.mts`, `.cts`, `.js`, `.jsx`, `.mjs`, `.cjs`, `.py`, `.go`, and `.rs` files. Enable with `--chunk-strategy auto`. Markdown and other file types always use regex chunking.
|
|
875
|
+
|
|
816
876
|
### Query Flow (Hybrid)
|
|
817
877
|
|
|
818
878
|
```
|
package/bin/qmd
CHANGED
|
@@ -15,8 +15,17 @@ done
|
|
|
15
15
|
# to avoid native module ABI mismatches (e.g., better-sqlite3 compiled for bun vs node)
|
|
16
16
|
DIR="$(cd -P "$(dirname "$SOURCE")/.." && pwd)"
|
|
17
17
|
|
|
18
|
-
#
|
|
19
|
-
|
|
18
|
+
# Detect the package manager that installed dependencies by checking lockfiles.
|
|
19
|
+
# $BUN_INSTALL is intentionally NOT checked — it only indicates that bun exists
|
|
20
|
+
# on the system, not that it was used to install this package (see #361).
|
|
21
|
+
#
|
|
22
|
+
# package-lock.json takes priority: if it exists, npm installed the native
|
|
23
|
+
# modules for Node. The repo ships bun.lock, so without this check, source
|
|
24
|
+
# builds that use npm would be incorrectly routed to bun, causing ABI
|
|
25
|
+
# mismatches with better-sqlite3 / sqlite-vec (see #381).
|
|
26
|
+
if [ -f "$DIR/package-lock.json" ]; then
|
|
27
|
+
exec node "$DIR/dist/cli/qmd.js" "$@"
|
|
28
|
+
elif [ -f "$DIR/bun.lock" ] || [ -f "$DIR/bun.lockb" ]; then
|
|
20
29
|
exec bun "$DIR/dist/cli/qmd.js" "$@"
|
|
21
30
|
else
|
|
22
31
|
exec node "$DIR/dist/cli/qmd.js" "$@"
|
package/dist/ast.d.ts
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* AST-aware chunking support via web-tree-sitter.
|
|
3
|
+
*
|
|
4
|
+
* Provides language detection, AST break point extraction for supported
|
|
5
|
+
* code file types, and a stub for future symbol extraction.
|
|
6
|
+
*
|
|
7
|
+
* All functions degrade gracefully: parse failures or unsupported languages
|
|
8
|
+
* return empty arrays, falling back to regex-only chunking.
|
|
9
|
+
*
|
|
10
|
+
* ## Dependency Note
|
|
11
|
+
*
|
|
12
|
+
* Grammar packages (tree-sitter-typescript, etc.) are listed as
|
|
13
|
+
* optionalDependencies with pinned versions. They ship native prebuilds
|
|
14
|
+
* and source files (~72 MB total) but QMD only uses the .wasm files
|
|
15
|
+
* (~5 MB). If install size becomes a concern, the .wasm files can be
|
|
16
|
+
* bundled directly in the repo (e.g. assets/grammars/) and resolved
|
|
17
|
+
* via import.meta.url instead of require.resolve(), eliminating the
|
|
18
|
+
* grammar packages entirely.
|
|
19
|
+
*/
|
|
20
|
+
import type { BreakPoint } from "./store.js";
|
|
21
|
+
export type SupportedLanguage = "typescript" | "tsx" | "javascript" | "python" | "go" | "rust";
|
|
22
|
+
/**
|
|
23
|
+
* Detect language from file path extension.
|
|
24
|
+
* Returns null for unsupported or unknown extensions (including .md).
|
|
25
|
+
*/
|
|
26
|
+
export declare function detectLanguage(filepath: string): SupportedLanguage | null;
|
|
27
|
+
/**
|
|
28
|
+
* Parse a source file and return break points at AST node boundaries.
|
|
29
|
+
*
|
|
30
|
+
* Returns an empty array for unsupported languages, parse failures,
|
|
31
|
+
* or grammar loading failures. Never throws.
|
|
32
|
+
*
|
|
33
|
+
* @param content - The file content to parse.
|
|
34
|
+
* @param filepath - The file path (used for language detection).
|
|
35
|
+
* @returns Array of BreakPoint objects suitable for merging with regex break points.
|
|
36
|
+
*/
|
|
37
|
+
export declare function getASTBreakPoints(content: string, filepath: string): Promise<BreakPoint[]>;
|
|
38
|
+
/**
|
|
39
|
+
* Check which tree-sitter grammars are available.
|
|
40
|
+
* Returns a status object for each supported language.
|
|
41
|
+
*/
|
|
42
|
+
export declare function getASTStatus(): Promise<{
|
|
43
|
+
available: boolean;
|
|
44
|
+
languages: {
|
|
45
|
+
language: SupportedLanguage;
|
|
46
|
+
available: boolean;
|
|
47
|
+
error?: string;
|
|
48
|
+
}[];
|
|
49
|
+
}>;
|
|
50
|
+
/**
|
|
51
|
+
* Metadata about a code symbol within a chunk.
|
|
52
|
+
* Reserved for Phase 2 — `extractSymbols()` is a stub that returns an empty array in Phase 1.
|
|
53
|
+
*/
|
|
54
|
+
export interface SymbolInfo {
|
|
55
|
+
name: string;
|
|
56
|
+
kind: string;
|
|
57
|
+
signature?: string;
|
|
58
|
+
line: number;
|
|
59
|
+
}
|
|
60
|
+
/**
|
|
61
|
+
* Extract symbol metadata for code within a byte range.
|
|
62
|
+
* Stubbed for Phase 2 — returns empty array.
|
|
63
|
+
*/
|
|
64
|
+
export declare function extractSymbols(_content: string, _language: string, _startPos: number, _endPos: number): SymbolInfo[];
|
package/dist/ast.js
ADDED
|
@@ -0,0 +1,324 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* AST-aware chunking support via web-tree-sitter.
|
|
3
|
+
*
|
|
4
|
+
* Provides language detection, AST break point extraction for supported
|
|
5
|
+
* code file types, and a stub for future symbol extraction.
|
|
6
|
+
*
|
|
7
|
+
* All functions degrade gracefully: parse failures or unsupported languages
|
|
8
|
+
* return empty arrays, falling back to regex-only chunking.
|
|
9
|
+
*
|
|
10
|
+
* ## Dependency Note
|
|
11
|
+
*
|
|
12
|
+
* Grammar packages (tree-sitter-typescript, etc.) are listed as
|
|
13
|
+
* optionalDependencies with pinned versions. They ship native prebuilds
|
|
14
|
+
* and source files (~72 MB total) but QMD only uses the .wasm files
|
|
15
|
+
* (~5 MB). If install size becomes a concern, the .wasm files can be
|
|
16
|
+
* bundled directly in the repo (e.g. assets/grammars/) and resolved
|
|
17
|
+
* via import.meta.url instead of require.resolve(), eliminating the
|
|
18
|
+
* grammar packages entirely.
|
|
19
|
+
*/
|
|
20
|
+
import { createRequire } from "node:module";
|
|
21
|
+
import { extname } from "node:path";
|
|
22
|
+
/**
 * Lower-case file extensions (leading dot included) mapped to the language
 * key used for grammar and query lookup.
 */
const EXTENSION_MAP = {
    // TypeScript family
    ".ts": "typescript",
    ".mts": "typescript",
    ".cts": "typescript",
    ".tsx": "tsx",
    // JavaScript family (.jsx is handled by the "tsx" language entry)
    ".js": "javascript",
    ".mjs": "javascript",
    ".cjs": "javascript",
    ".jsx": "tsx",
    // Other supported languages
    ".py": "python",
    ".go": "go",
    ".rs": "rust",
};
/**
 * Work out which supported language a file belongs to from its extension.
 * The comparison is case-insensitive.
 *
 * @param filepath - Path or file name to inspect.
 * @returns The language key, or null when the extension is missing or not
 *          listed in EXTENSION_MAP (for example `.md`).
 */
export function detectLanguage(filepath) {
    const extension = extname(filepath).toLowerCase();
    const language = EXTENSION_MAP[extension];
    if (language === undefined || language === null) {
        return null;
    }
    return language;
}
|
|
43
|
+
// =============================================================================
|
|
44
|
+
// Grammar Resolution
|
|
45
|
+
// =============================================================================
|
|
46
|
+
/**
 * For every supported language: the npm package that ships its grammar and
 * the name of the .wasm file inside that package.
 *
 * Note that "javascript" points at the same .wasm file as "typescript".
 */
const GRAMMAR_MAP = {
    typescript: {
        pkg: "tree-sitter-typescript",
        wasm: "tree-sitter-typescript.wasm",
    },
    tsx: {
        pkg: "tree-sitter-typescript",
        wasm: "tree-sitter-tsx.wasm",
    },
    javascript: {
        pkg: "tree-sitter-typescript",
        wasm: "tree-sitter-typescript.wasm",
    },
    python: {
        pkg: "tree-sitter-python",
        wasm: "tree-sitter-python.wasm",
    },
    go: {
        pkg: "tree-sitter-go",
        wasm: "tree-sitter-go.wasm",
    },
    rust: {
        pkg: "tree-sitter-rust",
        wasm: "tree-sitter-rust.wasm",
    },
};
|
|
57
|
+
// =============================================================================
|
|
58
|
+
// Per-Language Query Definitions
|
|
59
|
+
// =============================================================================
|
|
60
|
+
/**
 * Query text shared by the "typescript" and "tsx" entries below.
 *
 * export_statement wrappers are captured so a break point exists before the
 * `export` keyword; bare declarations are captured for non-exported code.
 */
const TS_FAMILY_QUERY = `
(export_statement) @export
(class_declaration) @class
(function_declaration) @func
(method_definition) @method
(interface_declaration) @iface
(type_alias_declaration) @type
(enum_declaration) @enum
(import_statement) @import
(lexical_declaration (variable_declarator value: (arrow_function))) @func
(lexical_declaration (variable_declarator value: (function_expression))) @func
`;
/**
 * Tree-sitter S-expression queries, one per supported language.
 * The capture name of each pattern (`@class`, `@func`, ...) is what gets
 * looked up in SCORE_MAP to score the resulting break point.
 */
const LANGUAGE_QUERIES = {
    typescript: TS_FAMILY_QUERY,
    tsx: TS_FAMILY_QUERY,
    // Same as the TypeScript query minus interface, type alias and enum patterns.
    javascript: `
(export_statement) @export
(class_declaration) @class
(function_declaration) @func
(method_definition) @method
(import_statement) @import
(lexical_declaration (variable_declarator value: (arrow_function))) @func
(lexical_declaration (variable_declarator value: (function_expression))) @func
`,
    python: `
(class_definition) @class
(function_definition) @func
(decorated_definition) @decorated
(import_statement) @import
(import_from_statement) @import
`,
    go: `
(type_declaration) @type
(function_declaration) @func
(method_declaration) @method
(import_declaration) @import
`,
    rust: `
(struct_item) @struct
(impl_item) @impl
(function_item) @func
(trait_item) @trait
(enum_item) @enum
(use_declaration) @import
(type_item) @type
(mod_item) @mod
`,
};
|
|
126
|
+
/**
 * Break point score for each query capture name.
 *
 * The values follow the same scale as the markdown BREAK_PATTERNS
 * (h1=100, h2=90, etc.) so that findBestCutoff() decay works unchanged.
 */
const SCORE_MAP = {
    // Type-level containers
    class: 100,
    iface: 100,
    struct: 100,
    trait: 100,
    impl: 100,
    mod: 100,
    // Callables and their wrappers
    export: 90,
    func: 90,
    method: 90,
    decorated: 90,
    // Smaller type-level declarations
    type: 80,
    enum: 80,
    // Import / use lines
    import: 60,
};
|
|
146
|
+
// =============================================================================
|
|
147
|
+
// Parser Caching & Initialization
|
|
148
|
+
// =============================================================================
|
|
149
|
+
/** web-tree-sitter classes; null until ensureInit() has assigned them. */
let ParserClass = null;
let LanguageClass = null;
let QueryClass = null;
/** Promise for the one-time initialization; null until ensureInit() is first called. */
let initPromise = null;
/** Languages whose grammar already failed to load — used to warn only once per process. */
const failedLanguages = new Set();
/** Grammar load promises, keyed by .wasm file name. */
const grammarCache = new Map();
/** Compiled queries, keyed by language. */
const queryCache = new Map();
/**
 * Import web-tree-sitter, remember its Parser / Language / Query classes in
 * the module-level variables above, and run Parser.init().
 */
async function initTreeSitter() {
    const treeSitter = await import("web-tree-sitter");
    ParserClass = treeSitter.Parser;
    LanguageClass = treeSitter.Language;
    QueryClass = treeSitter.Query;
    await ParserClass.init();
}
/**
 * Initialize web-tree-sitter at most once.
 *
 * The promise is stored before it settles, so concurrent callers share one
 * initialization, and a failed initialization is remembered too: later calls
 * get the same rejected promise instead of retrying.
 *
 * @returns The shared initialization promise.
 */
async function ensureInit() {
    if (initPromise === null) {
        initPromise = initTreeSitter();
    }
    return initPromise;
}
|
|
174
|
+
/**
 * Locate the grammar .wasm file for a language on disk.
 *
 * The specifier `<pkg>/<wasm>` from GRAMMAR_MAP is resolved with a require
 * function created relative to this module, so the file is found inside the
 * installed grammar package.
 *
 * @param language - A key of GRAMMAR_MAP.
 * @returns The resolved path of the .wasm file.
 * @throws If the specifier cannot be resolved (e.g. the package is not installed).
 */
function resolveGrammarPath(language) {
    const entry = GRAMMAR_MAP[language];
    const specifier = `${entry.pkg}/${entry.wasm}`;
    const localRequire = createRequire(import.meta.url);
    return localRequire.resolve(specifier);
}
|
|
183
|
+
/**
 * Load the grammar for a language, caching the load promise per .wasm file.
 *
 * Languages that share a .wasm file (see GRAMMAR_MAP) share one cache entry.
 * A language that has failed once is short-circuited to null on later calls.
 *
 * @param language - A key of GRAMMAR_MAP.
 * @returns The loaded grammar, or null if it could not be resolved or loaded.
 *          Failures are reported with console.warn once per language.
 */
async function loadGrammar(language) {
    if (failedLanguages.has(language))
        return null;
    const wasmKey = GRAMMAR_MAP[language].wasm;
    let pending = grammarCache.get(wasmKey);
    if (!pending) {
        // Wrapped in an async function so a synchronous throw from
        // resolveGrammarPath() becomes a rejection of the cached promise.
        pending = (async () => LanguageClass.load(resolveGrammarPath(language)))();
        grammarCache.set(wasmKey, pending);
    }
    try {
        return await pending;
    }
    catch (err) {
        // Several callers can be awaiting the same rejected promise; only the
        // first one to get here reports the failure for this language.
        const alreadyReported = failedLanguages.has(language);
        failedLanguages.add(language);
        // Evict only the promise we awaited, so a newer attempt stored under
        // the same .wasm key by another caller is not discarded.
        if (grammarCache.get(wasmKey) === pending) {
            grammarCache.delete(wasmKey);
        }
        if (!alreadyReported) {
            console.warn(`[qmd] Failed to load tree-sitter grammar for ${language}: ${err}`);
        }
        return null;
    }
}
|
|
207
|
+
/**
 * Return the compiled query for a language, compiling it on first use.
 *
 * The query source comes from LANGUAGE_QUERIES and the result is cached per
 * language. If constructing the query throws, nothing is cached and the
 * error propagates to the caller.
 *
 * @param language - A key of LANGUAGE_QUERIES.
 * @param grammar - The loaded grammar to compile the query against.
 * @returns The cached or newly compiled query.
 */
function getQuery(language, grammar) {
    if (queryCache.has(language)) {
        return queryCache.get(language);
    }
    const compiled = new QueryClass(grammar, LANGUAGE_QUERIES[language]);
    queryCache.set(language, compiled);
    return compiled;
}
|
|
218
|
+
// =============================================================================
|
|
219
|
+
// AST Break Point Extraction
|
|
220
|
+
// =============================================================================
|
|
221
|
+
/**
 * Parse a source file and return break points at AST node boundaries.
 *
 * Returns an empty array for unsupported languages, parse failures,
 * or grammar loading failures. Never throws: any error is reported with
 * console.warn and an empty array is returned so the caller can fall back
 * to regex-only chunking.
 *
 * @param content - The file content to parse.
 * @param filepath - The file path (used for language detection).
 * @returns Array of BreakPoint objects ({ pos, score, type }) sorted by
 *          ascending position, suitable for merging with regex break points.
 */
export async function getASTBreakPoints(content, filepath) {
    const language = detectLanguage(filepath);
    if (!language)
        return [];
    // Declared outside the try block so the finally clause can release them
    // on every exit path, including errors thrown while compiling or running
    // the query.
    let parser = null;
    let tree = null;
    try {
        await ensureInit();
        const grammar = await loadGrammar(language);
        if (!grammar)
            return [];
        parser = new ParserClass();
        parser.setLanguage(grammar);
        tree = parser.parse(content);
        if (!tree)
            return [];
        const query = getQuery(language, grammar);
        const captures = query.captures(tree.rootNode);
        // Deduplicate: when several captures start at the same index, keep the
        // highest-scoring one (the first wins on ties). Captures that start at
        // different indices are all kept.
        // NOTE(review): an export_statement and the declaration it wraps start
        // at different indices, so both produce a break point — confirm this
        // is what the chunker expects.
        const byPosition = new Map();
        for (const cap of captures) {
            const pos = cap.node.startIndex;
            // Capture names missing from SCORE_MAP get a low default score.
            const score = SCORE_MAP[cap.name] ?? 20;
            const existing = byPosition.get(pos);
            if (!existing || score > existing.score) {
                byPosition.set(pos, { pos, score, type: `ast:${cap.name}` });
            }
        }
        return Array.from(byPosition.values()).sort((a, b) => a.pos - b.pos);
    }
    catch (err) {
        console.warn(`[qmd] AST parse failed for ${filepath}, falling back to regex: ${err instanceof Error ? err.message : err}`);
        return [];
    }
    finally {
        // Release the tree and parser explicitly. Each delete() is guarded so
        // a cleanup failure cannot break the "never throws" contract or stop
        // the other object from being released.
        for (const resource of [tree, parser]) {
            try {
                resource?.delete();
            }
            catch {
                // Best-effort cleanup; nothing useful to report.
            }
        }
    }
}
|
|
271
|
+
// =============================================================================
|
|
272
|
+
// Health / Status
|
|
273
|
+
// =============================================================================
|
|
274
|
+
/**
 * Check one language: load its grammar and make sure its query compiles.
 *
 * @param language - A key of GRAMMAR_MAP.
 * @returns A status entry; `error` is set only when `available` is false.
 */
async function probeGrammar(language) {
    try {
        const grammar = await loadGrammar(language);
        if (!grammar) {
            return { language, available: false, error: "grammar failed to load" };
        }
        // Also verify the query compiles
        getQuery(language, grammar);
        return { language, available: true };
    }
    catch (err) {
        return {
            language,
            available: false,
            error: err instanceof Error ? err.message : String(err),
        };
    }
}
/**
 * Report which tree-sitter grammars can be used.
 *
 * If web-tree-sitter itself cannot be initialized, every language is
 * reported unavailable with the same init error. Otherwise each language in
 * GRAMMAR_MAP is probed in turn, one after another.
 *
 * @returns `available` is true when at least one language is usable;
 *          `languages` holds one entry per GRAMMAR_MAP key, in key order.
 */
export async function getASTStatus() {
    const supported = Object.keys(GRAMMAR_MAP);
    try {
        await ensureInit();
    }
    catch (err) {
        const reason = `web-tree-sitter init failed: ${err instanceof Error ? err.message : err}`;
        return {
            available: false,
            languages: supported.map(language => ({
                language,
                available: false,
                error: reason,
            })),
        };
    }
    const languages = [];
    for (const language of supported) {
        languages.push(await probeGrammar(language));
    }
    return {
        available: languages.some(entry => entry.available),
        languages,
    };
}
|
|
318
|
+
/**
 * Placeholder for Phase 2 symbol extraction.
 *
 * All four arguments are currently ignored and a new empty array is
 * returned on every call.
 *
 * @param _content - Source text (unused).
 * @param _language - Language key (unused).
 * @param _startPos - Start of the range (unused).
 * @param _endPos - End of the range (unused).
 * @returns An empty array.
 */
export function extractSymbols(_content, _language, _startPos, _endPos) {
    const symbols = [];
    return symbols;
}
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* QMD Benchmark Harness
|
|
3
|
+
*
|
|
4
|
+
* Runs queries from a fixture file against multiple search backends
|
|
5
|
+
* and measures precision@k, recall, MRR, F1, and latency.
|
|
6
|
+
*
|
|
7
|
+
* Usage:
|
|
8
|
+
* qmd bench <fixture.json> [--json] [--collection <name>]
|
|
9
|
+
*
|
|
10
|
+
* Backends tested:
|
|
11
|
+
* - bm25: BM25 keyword search (searchLex)
|
|
12
|
+
* - vector: Vector similarity search (searchVector)
|
|
13
|
+
* - hybrid: BM25 + vector RRF fusion without reranking
|
|
14
|
+
* - full: Full hybrid pipeline with LLM reranking
|
|
15
|
+
*/
|
|
16
|
+
import type { BenchmarkResult } from "./types.js";
|
|
17
|
+
export declare function runBenchmark(fixturePath: string, options?: {
|
|
18
|
+
json?: boolean;
|
|
19
|
+
collection?: string;
|
|
20
|
+
backends?: string[];
|
|
21
|
+
}): Promise<BenchmarkResult>;
|