@stablemodels/qmd-cf 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/dist/chunker.d.ts +11 -0
- package/dist/chunker.d.ts.map +1 -0
- package/dist/chunker.js +199 -0
- package/dist/chunker.js.map +1 -0
- package/dist/fts.d.ts +19 -0
- package/dist/fts.d.ts.map +1 -0
- package/dist/fts.js +109 -0
- package/dist/fts.js.map +1 -0
- package/dist/hash.d.ts +7 -0
- package/dist/hash.d.ts.map +1 -0
- package/dist/hash.js +14 -0
- package/dist/hash.js.map +1 -0
- package/dist/index.d.ts +56 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +57 -0
- package/dist/index.js.map +1 -0
- package/dist/qmd.d.ts +158 -0
- package/dist/qmd.d.ts.map +1 -0
- package/dist/qmd.js +462 -0
- package/dist/qmd.js.map +1 -0
- package/dist/rrf.d.ts +22 -0
- package/dist/rrf.d.ts.map +1 -0
- package/dist/rrf.js +92 -0
- package/dist/rrf.js.map +1 -0
- package/dist/schema.d.ts +14 -0
- package/dist/schema.d.ts.map +1 -0
- package/dist/schema.js +128 -0
- package/dist/schema.js.map +1 -0
- package/dist/testing.d.ts +77 -0
- package/dist/testing.d.ts.map +1 -0
- package/dist/testing.js +242 -0
- package/dist/testing.js.map +1 -0
- package/dist/types.d.ts +118 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +9 -0
- package/dist/types.js.map +1 -0
- package/dist/vector.d.ts +38 -0
- package/dist/vector.d.ts.map +1 -0
- package/dist/vector.js +174 -0
- package/dist/vector.js.map +1 -0
- package/package.json +49 -0
- package/src/bun-sqlite.d.ts +17 -0
- package/src/chunker.ts +250 -0
- package/src/fts.ts +140 -0
- package/src/hash.ts +13 -0
- package/src/index.ts +72 -0
- package/src/qmd.ts +706 -0
- package/src/rrf.ts +115 -0
- package/src/schema.ts +147 -0
- package/src/testing.ts +303 -0
- package/src/types.ts +124 -0
- package/src/vector.ts +236 -0
package/src/fts.ts
ADDED
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
import type { FtsResult, SearchOptions } from "./types.js";
|
|
2
|
+
|
|
3
|
+
/**
 * Build an FTS5 query string from a natural language query.
 *
 * Strategy (from qmd):
 * - No usable terms: the empty phrase (`""`)
 * - Single term: prefix match (`"term" *`)
 * - Multi-term: exact phrase OR NEAR(terms, 10) OR individual terms ORed
 *
 * This gives good recall while still ranking exact matches highest via BM25.
 *
 * Characters that carry meaning in FTS5 query syntax are removed from every
 * whitespace-separated token *before* empty tokens are discarded, so a token
 * made up solely of such characters (e.g. `*` or `()`) never reaches the
 * query as an empty quoted string.
 *
 * @param query - Free-form user input.
 * @returns A string suitable for binding to an FTS5 `MATCH` expression.
 */
export function buildFts5Query(query: string): string {
  const terms = query
    .split(/\s+/)
    // Strip characters that break FTS5 syntax
    .map((t) => t.replace(/['"(){}[\]*^~:]/g, ""))
    // Drop tokens that were empty to begin with or became empty after stripping
    .filter((t) => t.length > 0);

  if (terms.length === 0) return '""';

  if (terms.length === 1) {
    // Single term: prefix match
    return `"${terms[0]}" *`;
  }

  // Multi-term: combine strategies for best recall
  const quoted = terms.map((t) => `"${t}"`);
  const phrase = `"${terms.join(" ")}"`;
  const near = `NEAR(${quoted.join(" ")}, 10)`;
  const orTerms = quoted.join(" OR ");

  return `(${phrase}) OR (${near}) OR (${orTerms})`;
}
|
|
34
|
+
|
|
35
|
+
/**
 * Normalize a raw BM25 score to [0, 1), where a higher value means a better match.
 *
 * SQLite FTS5 bm25() returns negative values where lower (more negative) = better match,
 * so relevance is carried by the magnitude of the raw value. We convert to:
 *   score = abs(raw) / (1 + abs(raw))
 * which grows towards 1 as the match gets stronger and is 0 for a raw value of 0.
 *
 * @param raw - Value produced by bm25() for a matching row.
 * @returns The normalized relevance score.
 */
function normalizeBm25(raw: number): number {
  const magnitude = Math.abs(raw);
  return magnitude / (1 + magnitude);
}
|
|
43
|
+
|
|
44
|
+
/**
 * Shape of one row returned by the search query in `searchFts`.
 *
 * Kept as a `type` alias (not an `interface`) so it stays assignable to
 * record-style generic constraints on the SQL `exec` call.
 */
type FtsRow = {
  // Columns selected from the FTS table (`qmd_chunks_fts`).
  doc_id: string;
  seq: number;
  content: string;

  // Raw `bm25(...)` value for the row.
  rank: number;

  // Columns selected from the joined `qmd_documents` row.
  title: string | null;
  doc_type: string | null;
  namespace: string | null;
  // Serialized JSON; parsed with JSON.parse by the consumer when non-empty.
  metadata: string | null;
};
|
|
54
|
+
|
|
55
|
+
/**
 * Execute a full-text search using FTS5 BM25 ranking.
 *
 * BM25 weights: title gets 10x boost, content gets 1x.
 * Results are deduplicated by document (keeping the best-scoring chunk).
 *
 * Optional filters:
 * - `options.docType`: exact match on the document's `doc_type`.
 * - `options.namespace`: exact match, or — when the value contains `*` — a
 *   prefix match on `<prefix>/...` after trailing `*` and `/` characters are
 *   removed. The prefix is matched literally: `%`, `_` and `\` in it are
 *   escaped so they are not interpreted as LIKE wildcards.
 *
 * @param sql - SQL storage handle the query is executed against.
 * @param query - Natural language query; converted with `buildFts5Query`.
 * @param options - Result limit (default 10) and optional filters.
 * @returns At most `limit` results, one per document, best score first.
 */
export function searchFts(
  sql: SqlStorage,
  query: string,
  options: SearchOptions = {},
): FtsResult[] {
  const ftsQuery = buildFts5Query(query);
  const limit = options.limit ?? 10;

  // Build WHERE clauses for optional filters
  const filters: string[] = [];
  const bindings: unknown[] = [ftsQuery];

  if (options.docType) {
    filters.push("d.doc_type = ?");
    bindings.push(options.docType);
  }
  if (options.namespace) {
    if (options.namespace.includes("*")) {
      const prefix = options.namespace.replace(/\*+$/, "").replace(/\/+$/, "");
      // Escape LIKE metacharacters so e.g. "team_a/*" cannot match "teamXa/...".
      const escapedPrefix = prefix.replace(/[\\%_]/g, "\\$&");
      filters.push("d.namespace LIKE ? ESCAPE '\\'");
      bindings.push(`${escapedPrefix}/%`);
    } else {
      filters.push("d.namespace = ?");
      bindings.push(options.namespace);
    }
  }

  const whereClause = filters.length > 0 ? `AND ${filters.join(" AND ")}` : "";

  // BM25 weights: doc_id (unindexed, 0), seq (unindexed, 0), title (10.0), content (1.0)
  // NOTE(review): the weight order must match the FTS table's column order,
  // which is declared in the schema module and not visible here.
  const rows = sql
    .exec<FtsRow>(
      `
      SELECT
        f.doc_id,
        f.seq,
        f.content,
        bm25(qmd_chunks_fts, 0, 0, 10.0, 1.0) as rank,
        d.title,
        d.doc_type,
        d.namespace,
        d.metadata
      FROM qmd_chunks_fts f
      JOIN qmd_documents d ON d.id = f.doc_id
      WHERE qmd_chunks_fts MATCH ?
      ${whereClause}
      ORDER BY rank
      LIMIT ?
    `,
      ...bindings,
      // Fetch extra to allow dedup: several chunks of one document may match.
      limit * 3,
    )
    .toArray();

  // Deduplicate by doc_id, keeping the best-scoring chunk
  const seen = new Map<string, FtsResult>();

  for (const row of rows) {
    const score = normalizeBm25(row.rank);
    const existing = seen.get(row.doc_id);

    if (!existing || score > existing.score) {
      seen.set(row.doc_id, {
        docId: row.doc_id,
        score,
        snippet: row.content,
        seq: row.seq,
        title: row.title,
        docType: row.doc_type,
        namespace: row.namespace,
        // Metadata is stored as serialized JSON; absent/empty means "none".
        metadata: row.metadata ? JSON.parse(row.metadata) : null,
      });
    }
  }

  return Array.from(seen.values())
    .sort((a, b) => b.score - a.score)
    .slice(0, limit);
}
|
package/src/hash.ts
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
/**
 * FNV-1a 32-bit hash — fast, deterministic, non-cryptographic.
 * Used to detect content changes for skip-on-unchanged indexing.
 *
 * The input is consumed one UTF-16 code unit at a time (`charCodeAt`), not
 * byte by byte, so results for non-ASCII text differ from byte-oriented
 * FNV-1a implementations.
 *
 * @param input - Text to hash.
 * @returns An 8-character lowercase hex string.
 */
export function fnv1a32(input: string): string {
  const OFFSET_BASIS = 0x811c9dc5;
  const PRIME = 0x01000193;

  let acc = OFFSET_BASIS;
  for (let idx = 0, len = input.length; idx < len; idx += 1) {
    // XOR in the code unit, then multiply with 32-bit wraparound.
    acc = Math.imul(acc ^ input.charCodeAt(idx), PRIME);
  }

  // Reinterpret as unsigned before formatting so the hex is never negative.
  const unsigned = acc >>> 0;
  return unsigned.toString(16).padStart(8, "0");
}
|
package/src/index.ts
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @stablemodels/qmd-cf — Hybrid full-text + vector search for Cloudflare Durable Objects.
|
|
3
|
+
*
|
|
4
|
+
* A DO-native reimagination of qmd (https://github.com/tobi/qmd).
|
|
5
|
+
*
|
|
6
|
+
* FTS5 runs co-located in the Durable Object's SQLite for zero-latency BM25 keyword search.
|
|
7
|
+
* Optionally, Cloudflare Vectorize adds semantic vector search, fused via Reciprocal Rank Fusion.
|
|
8
|
+
*
|
|
9
|
+
* Cloudflare platform types (SqlStorage, Vectorize, VectorizeVector, etc.) are
|
|
10
|
+
* ambient from @cloudflare/workers-types — consumers access them directly.
|
|
11
|
+
*
|
|
12
|
+
* @example FTS-only (zero external dependencies)
|
|
13
|
+
* ```ts
|
|
14
|
+
* import { Qmd } from "@stablemodels/qmd-cf";
|
|
15
|
+
*
|
|
16
|
+
* export class MyDurableObject extends DurableObject {
|
|
17
|
+
* qmd: Qmd;
|
|
18
|
+
*
|
|
19
|
+
* constructor(ctx: DurableObjectState, env: Env) {
|
|
20
|
+
* super(ctx, env);
|
|
21
|
+
* this.qmd = new Qmd(ctx.storage.sql);
|
|
22
|
+
* }
|
|
23
|
+
*
|
|
24
|
+
* async search(query: string) {
|
|
25
|
+
* return this.qmd.search(query);
|
|
26
|
+
* }
|
|
27
|
+
* }
|
|
28
|
+
* ```
|
|
29
|
+
*
|
|
30
|
+
* @example Hybrid FTS + Vector search
|
|
31
|
+
* ```ts
|
|
32
|
+
* import { Qmd } from "@stablemodels/qmd-cf";
|
|
33
|
+
*
|
|
34
|
+
* export class MyDurableObject extends DurableObject {
|
|
35
|
+
* qmd: Qmd;
|
|
36
|
+
*
|
|
37
|
+
* constructor(ctx: DurableObjectState, env: Env) {
|
|
38
|
+
* super(ctx, env);
|
|
39
|
+
* this.qmd = new Qmd(ctx.storage.sql, {
|
|
40
|
+
* vectorize: env.VECTORIZE,
|
|
41
|
+
* embedFn: (texts) =>
|
|
42
|
+
* env.AI.run("@cf/baai/bge-m3", { text: texts })
|
|
43
|
+
* .then(r => r.data),
|
|
44
|
+
* });
|
|
45
|
+
* }
|
|
46
|
+
* }
|
|
47
|
+
* ```
|
|
48
|
+
*/
|
|
49
|
+
|
|
50
|
+
// Main class
|
|
51
|
+
export { Qmd } from "./qmd.js";
|
|
52
|
+
|
|
53
|
+
// Domain types
|
|
54
|
+
export type {
|
|
55
|
+
Document,
|
|
56
|
+
Chunk,
|
|
57
|
+
FtsResult,
|
|
58
|
+
VectorResult,
|
|
59
|
+
SearchResult,
|
|
60
|
+
SearchOptions,
|
|
61
|
+
HybridSearchOptions,
|
|
62
|
+
QmdConfig,
|
|
63
|
+
IndexStats,
|
|
64
|
+
EmbedFn,
|
|
65
|
+
} from "./types.js";
|
|
66
|
+
|
|
67
|
+
// Utilities (useful for custom pipelines)
|
|
68
|
+
export { chunkText } from "./chunker.js";
|
|
69
|
+
export { buildFts5Query } from "./fts.js";
|
|
70
|
+
export { fnv1a32 } from "./hash.js";
|
|
71
|
+
export { reciprocalRankFusion } from "./rrf.js";
|
|
72
|
+
export { formatDocForEmbedding, formatQueryForEmbedding } from "./vector.js";
|