@stablemodels/qmd-cf 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. package/LICENSE +21 -0
  2. package/dist/chunker.d.ts +11 -0
  3. package/dist/chunker.d.ts.map +1 -0
  4. package/dist/chunker.js +199 -0
  5. package/dist/chunker.js.map +1 -0
  6. package/dist/fts.d.ts +19 -0
  7. package/dist/fts.d.ts.map +1 -0
  8. package/dist/fts.js +109 -0
  9. package/dist/fts.js.map +1 -0
  10. package/dist/hash.d.ts +7 -0
  11. package/dist/hash.d.ts.map +1 -0
  12. package/dist/hash.js +14 -0
  13. package/dist/hash.js.map +1 -0
  14. package/dist/index.d.ts +56 -0
  15. package/dist/index.d.ts.map +1 -0
  16. package/dist/index.js +57 -0
  17. package/dist/index.js.map +1 -0
  18. package/dist/qmd.d.ts +158 -0
  19. package/dist/qmd.d.ts.map +1 -0
  20. package/dist/qmd.js +462 -0
  21. package/dist/qmd.js.map +1 -0
  22. package/dist/rrf.d.ts +22 -0
  23. package/dist/rrf.d.ts.map +1 -0
  24. package/dist/rrf.js +92 -0
  25. package/dist/rrf.js.map +1 -0
  26. package/dist/schema.d.ts +14 -0
  27. package/dist/schema.d.ts.map +1 -0
  28. package/dist/schema.js +128 -0
  29. package/dist/schema.js.map +1 -0
  30. package/dist/testing.d.ts +77 -0
  31. package/dist/testing.d.ts.map +1 -0
  32. package/dist/testing.js +242 -0
  33. package/dist/testing.js.map +1 -0
  34. package/dist/types.d.ts +118 -0
  35. package/dist/types.d.ts.map +1 -0
  36. package/dist/types.js +9 -0
  37. package/dist/types.js.map +1 -0
  38. package/dist/vector.d.ts +38 -0
  39. package/dist/vector.d.ts.map +1 -0
  40. package/dist/vector.js +174 -0
  41. package/dist/vector.js.map +1 -0
  42. package/package.json +49 -0
  43. package/src/bun-sqlite.d.ts +17 -0
  44. package/src/chunker.ts +250 -0
  45. package/src/fts.ts +140 -0
  46. package/src/hash.ts +13 -0
  47. package/src/index.ts +72 -0
  48. package/src/qmd.ts +706 -0
  49. package/src/rrf.ts +115 -0
  50. package/src/schema.ts +147 -0
  51. package/src/testing.ts +303 -0
  52. package/src/types.ts +124 -0
  53. package/src/vector.ts +236 -0
package/src/rrf.ts ADDED
@@ -0,0 +1,115 @@
1
+ import type { FtsResult, SearchResult, VectorResult } from "./types.js";
2
+
3
+ /**
4
+ * Reciprocal Rank Fusion (RRF) — merge ranked result lists into a single ranking.
5
+ *
6
+ * RRF score for document d = Σ(weight_i / (k + rank_i + 1)) across all lists
7
+ * where rank_i is the 0-based position in list i.
8
+ *
9
+ * From qmd: k=60 is the standard constant. Higher k reduces the impact of
10
+ * being ranked #1 vs #5, making the fusion more conservative.
11
+ *
12
+ * Additionally applies a top-rank bonus (from qmd):
13
+ * - Rank #1 in any list: +0.05
14
+ * - Rank #2-3 in any list: +0.02
15
+ * This prevents exact matches from being diluted by expansion queries.
16
+ */
17
+ export function reciprocalRankFusion(
18
+ ftsResults: FtsResult[],
19
+ vectorResults: VectorResult[],
20
+ options: {
21
+ ftsWeight?: number;
22
+ vectorWeight?: number;
23
+ k?: number;
24
+ limit?: number;
25
+ } = {},
26
+ ): SearchResult[] {
27
+ const k = options.k ?? 60;
28
+ const ftsWeight = options.ftsWeight ?? 1.0;
29
+ const vectorWeight = options.vectorWeight ?? 1.0;
30
+ const limit = options.limit ?? 10;
31
+
32
+ const scores = new Map<
33
+ string,
34
+ {
35
+ rrfScore: number;
36
+ topRank: number;
37
+ sources: Set<"fts" | "vector">;
38
+ sourceScores: { fts?: number; vector?: number };
39
+ bestResult: FtsResult | VectorResult;
40
+ }
41
+ >();
42
+
43
+ // Process FTS results
44
+ for (let rank = 0; rank < ftsResults.length; rank++) {
45
+ const r = ftsResults[rank];
46
+ const contribution = ftsWeight / (k + rank + 1);
47
+ const entry = scores.get(r.docId);
48
+
49
+ if (entry) {
50
+ entry.rrfScore += contribution;
51
+ entry.topRank = Math.min(entry.topRank, rank);
52
+ entry.sources.add("fts");
53
+ entry.sourceScores.fts = r.score;
54
+ } else {
55
+ scores.set(r.docId, {
56
+ rrfScore: contribution,
57
+ topRank: rank,
58
+ sources: new Set(["fts"]),
59
+ sourceScores: { fts: r.score },
60
+ bestResult: r,
61
+ });
62
+ }
63
+ }
64
+
65
+ // Process vector results
66
+ for (let rank = 0; rank < vectorResults.length; rank++) {
67
+ const r = vectorResults[rank];
68
+ const contribution = vectorWeight / (k + rank + 1);
69
+ const entry = scores.get(r.docId);
70
+
71
+ if (entry) {
72
+ entry.rrfScore += contribution;
73
+ entry.topRank = Math.min(entry.topRank, rank);
74
+ entry.sources.add("vector");
75
+ entry.sourceScores.vector = r.score;
76
+ // Keep the result with better snippet context
77
+ if (r.score > (entry.sourceScores.fts ?? 0)) {
78
+ entry.bestResult = r;
79
+ }
80
+ } else {
81
+ scores.set(r.docId, {
82
+ rrfScore: contribution,
83
+ topRank: rank,
84
+ sources: new Set(["vector"]),
85
+ sourceScores: { vector: r.score },
86
+ bestResult: r,
87
+ });
88
+ }
89
+ }
90
+
91
+ // Apply top-rank bonus
92
+ for (const entry of scores.values()) {
93
+ if (entry.topRank === 0) {
94
+ entry.rrfScore += 0.05;
95
+ } else if (entry.topRank <= 2) {
96
+ entry.rrfScore += 0.02;
97
+ }
98
+ }
99
+
100
+ // Sort by RRF score and return
101
+ return Array.from(scores.entries())
102
+ .sort((a, b) => b[1].rrfScore - a[1].rrfScore)
103
+ .slice(0, limit)
104
+ .map(([docId, entry]) => ({
105
+ docId,
106
+ score: entry.rrfScore,
107
+ snippet: entry.bestResult.snippet,
108
+ sources: Array.from(entry.sources),
109
+ sourceScores: entry.sourceScores,
110
+ title: entry.bestResult.title,
111
+ docType: entry.bestResult.docType,
112
+ namespace: entry.bestResult.namespace,
113
+ metadata: entry.bestResult.metadata,
114
+ }));
115
+ }
package/src/schema.ts ADDED
@@ -0,0 +1,147 @@
1
+ const CURRENT_VERSION = 2;
2
+
3
+ /**
4
+ * Initialize the FTS5 schema on a Durable Object's SQL storage.
5
+ *
6
+ * Tables created:
7
+ * - `qmd_documents` — document metadata (id, title, docType, namespace, metadata JSON, content_hash)
8
+ * - `qmd_chunks` — chunked content with parent doc reference
9
+ * - `qmd_chunks_fts` — FTS5 virtual table for full-text search over chunks
10
+ * - `qmd_contexts` — semantic context descriptions for path prefixes
11
+ * - `qmd_meta` — schema version tracking
12
+ *
13
+ * The FTS5 table is kept in sync via triggers on `qmd_chunks`.
14
+ */
15
+ export function initSchema(sql: SqlStorage, tokenizer = "unicode61"): void {
16
+ // Create version tracking table first so we can check if already initialized
17
+ sql.exec(`
18
+ CREATE TABLE IF NOT EXISTS qmd_meta (
19
+ key TEXT PRIMARY KEY,
20
+ version INTEGER NOT NULL
21
+ )
22
+ `);
23
+
24
+ const existing = sql
25
+ .exec<{ version: number }>("SELECT version FROM qmd_meta LIMIT 1")
26
+ .toArray();
27
+ const currentVersion = existing.length > 0 ? existing[0].version : 0;
28
+
29
+ if (currentVersion >= CURRENT_VERSION) {
30
+ return; // Already at current version
31
+ }
32
+
33
+ // Version 0 -> 1: initial schema
34
+ if (currentVersion < 1) {
35
+ // Document metadata table
36
+ sql.exec(`
37
+ CREATE TABLE IF NOT EXISTS qmd_documents (
38
+ id TEXT PRIMARY KEY,
39
+ title TEXT,
40
+ doc_type TEXT,
41
+ namespace TEXT,
42
+ metadata TEXT,
43
+ created_at TEXT DEFAULT (datetime('now')),
44
+ updated_at TEXT DEFAULT (datetime('now'))
45
+ )
46
+ `);
47
+
48
+ // Chunk content table
49
+ sql.exec(`
50
+ CREATE TABLE IF NOT EXISTS qmd_chunks (
51
+ doc_id TEXT NOT NULL,
52
+ seq INTEGER NOT NULL,
53
+ content TEXT NOT NULL,
54
+ char_offset INTEGER NOT NULL DEFAULT 0,
55
+ PRIMARY KEY (doc_id, seq),
56
+ FOREIGN KEY (doc_id) REFERENCES qmd_documents(id) ON DELETE CASCADE
57
+ )
58
+ `);
59
+
60
+ // FTS5 virtual table — indexes chunk content with document title for boosted relevance
61
+ sql.exec(`
62
+ CREATE VIRTUAL TABLE IF NOT EXISTS qmd_chunks_fts USING fts5(
63
+ doc_id UNINDEXED,
64
+ seq UNINDEXED,
65
+ title,
66
+ content,
67
+ tokenize='${tokenizer}'
68
+ )
69
+ `);
70
+
71
+ // Triggers to keep FTS in sync with chunks table
72
+ sql.exec(`
73
+ CREATE TRIGGER IF NOT EXISTS qmd_chunks_ai AFTER INSERT ON qmd_chunks
74
+ BEGIN
75
+ INSERT INTO qmd_chunks_fts(doc_id, seq, title, content)
76
+ SELECT NEW.doc_id, NEW.seq, d.title, NEW.content
77
+ FROM qmd_documents d WHERE d.id = NEW.doc_id;
78
+ END
79
+ `);
80
+
81
+ sql.exec(`
82
+ CREATE TRIGGER IF NOT EXISTS qmd_chunks_ad AFTER DELETE ON qmd_chunks
83
+ BEGIN
84
+ DELETE FROM qmd_chunks_fts
85
+ WHERE doc_id = OLD.doc_id AND seq = OLD.seq;
86
+ END
87
+ `);
88
+
89
+ sql.exec(`
90
+ CREATE TRIGGER IF NOT EXISTS qmd_chunks_au AFTER UPDATE ON qmd_chunks
91
+ BEGIN
92
+ DELETE FROM qmd_chunks_fts
93
+ WHERE doc_id = OLD.doc_id AND seq = OLD.seq;
94
+ INSERT INTO qmd_chunks_fts(doc_id, seq, title, content)
95
+ SELECT NEW.doc_id, NEW.seq, d.title, NEW.content
96
+ FROM qmd_documents d WHERE d.id = NEW.doc_id;
97
+ END
98
+ `);
99
+
100
+ // Index for namespace-scoped lookups
101
+ sql.exec(`
102
+ CREATE INDEX IF NOT EXISTS idx_qmd_documents_namespace
103
+ ON qmd_documents(namespace)
104
+ `);
105
+
106
+ // Index for doc_type filtering
107
+ sql.exec(`
108
+ CREATE INDEX IF NOT EXISTS idx_qmd_documents_doc_type
109
+ ON qmd_documents(doc_type)
110
+ `);
111
+ }
112
+
113
+ // Version 1 -> 2: add content_hash column + contexts table
114
+ if (currentVersion < 2) {
115
+ // Add content_hash column for skip-on-unchanged indexing.
116
+ // For fresh installs (currentVersion === 0), the column doesn't exist yet on the just-created table.
117
+ // For upgrades from v1, ALTER TABLE adds the column to the existing table.
118
+ // Both paths use ALTER TABLE which is idempotent-safe with IF NOT EXISTS workaround.
119
+ const cols = sql
120
+ .exec<{ name: string }>("PRAGMA table_info(qmd_documents)")
121
+ .toArray()
122
+ .map((c) => c.name);
123
+ if (!cols.includes("content_hash")) {
124
+ sql.exec("ALTER TABLE qmd_documents ADD COLUMN content_hash TEXT");
125
+ }
126
+
127
+ // Semantic context descriptions for path prefixes
128
+ sql.exec(`
129
+ CREATE TABLE IF NOT EXISTS qmd_contexts (
130
+ prefix TEXT NOT NULL,
131
+ namespace TEXT NOT NULL DEFAULT '',
132
+ description TEXT NOT NULL,
133
+ PRIMARY KEY (prefix, namespace)
134
+ )
135
+ `);
136
+
137
+ sql.exec(
138
+ "CREATE INDEX IF NOT EXISTS idx_qmd_contexts_namespace ON qmd_contexts(namespace)",
139
+ );
140
+ }
141
+
142
+ // Record schema version
143
+ sql.exec(
144
+ "INSERT OR REPLACE INTO qmd_meta (key, version) VALUES ('schema', ?)",
145
+ CURRENT_VERSION,
146
+ );
147
+ }
package/src/testing.ts ADDED
@@ -0,0 +1,303 @@
1
+ /**
2
+ * Testing utilities for @stablemodels/qmd-cf.
3
+ *
4
+ * Provides mock implementations of Cloudflare's SqlStorage, Vectorize, and
5
+ * the EmbedFn type so consuming projects can test their Qmd integration
6
+ * without Cloudflare Workers or Vectorize services.
7
+ *
8
+ * MockSqlStorage wraps bun:sqlite's in-memory Database, giving you real FTS5
9
+ * execution while matching the Cloudflare DO SqlStorage interface.
10
+ *
11
+ * MockVectorize is an in-memory vector store with brute-force cosine similarity
12
+ * for testing the full hybrid search pipeline locally.
13
+ *
14
+ * @example
15
+ * ```ts
16
+ * import { Qmd } from "@stablemodels/qmd-cf";
17
+ * import { MockSqlStorage, MockVectorize, createMockEmbedFn } from "@stablemodels/qmd-cf/testing";
18
+ *
19
+ * // FTS-only testing
20
+ * const sql = new MockSqlStorage();
21
+ * const qmd = new Qmd(sql);
22
+ * await qmd.index({ id: "doc1", content: "Hello world" });
23
+ * const results = qmd.searchFts("hello");
24
+ *
25
+ * // Hybrid search testing
26
+ * const vectorize = new MockVectorize();
27
+ * const embedFn = createMockEmbedFn();
28
+ * const qmd = new Qmd(sql, { vectorize, embedFn });
29
+ * ```
30
+ */
31
+ import { Database } from "bun:sqlite";
32
+ import type { EmbedFn } from "./types.js";
33
+
34
+ // ─── BunSqlCursor ───────────────────────────────────────────────────
35
+
36
+ /**
37
+ * Cursor wrapping bun:sqlite results. Structurally compatible with
38
+ * Cloudflare's SqlStorageCursor.
39
+ */
40
+ class BunSqlCursor<T extends Record<string, SqlStorageValue>> {
41
+ private rows: T[];
42
+ private index = 0;
43
+ readonly columnNames: string[];
44
+ readonly rowsRead: number;
45
+ readonly rowsWritten: number;
46
+
47
+ constructor(
48
+ rows: T[],
49
+ columnNames: string[],
50
+ rowsRead: number,
51
+ rowsWritten: number,
52
+ ) {
53
+ this.rows = rows;
54
+ this.columnNames = columnNames;
55
+ this.rowsRead = rowsRead;
56
+ this.rowsWritten = rowsWritten;
57
+ }
58
+
59
+ toArray(): T[] {
60
+ return this.rows;
61
+ }
62
+
63
+ one(): T {
64
+ if (this.rows.length !== 1) {
65
+ throw new Error(`Expected exactly one row, got ${this.rows.length}`);
66
+ }
67
+ return this.rows[0];
68
+ }
69
+
70
+ next(): { done?: false; value: T } | { done: true; value?: never } {
71
+ if (this.index < this.rows.length) {
72
+ return { value: this.rows[this.index++] };
73
+ }
74
+ return { done: true };
75
+ }
76
+
77
+ raw<U extends SqlStorageValue[]>(): IterableIterator<U> {
78
+ const data = this.rows.map((row) => Object.values(row)) as unknown as U[];
79
+ return data[Symbol.iterator]() as IterableIterator<U>;
80
+ }
81
+
82
+ [Symbol.iterator](): IterableIterator<T> {
83
+ return this.rows[Symbol.iterator]() as IterableIterator<T>;
84
+ }
85
+ }
86
+
87
+ // ─── MockSqlStorage ─────────────────────────────────────────────────
88
+
89
+ /**
90
+ * In-memory SqlStorage backed by bun:sqlite.
91
+ *
92
+ * Provides real SQLite with FTS5 support, structurally compatible with
93
+ * Cloudflare Durable Object's `ctx.storage.sql` interface.
94
+ */
95
+ export class MockSqlStorage {
96
+ private db: Database;
97
+
98
+ constructor() {
99
+ this.db = new Database(":memory:");
100
+ this.db.exec("PRAGMA journal_mode=WAL");
101
+ this.db.exec("PRAGMA foreign_keys=ON");
102
+ }
103
+
104
+ exec<T extends Record<string, SqlStorageValue>>(
105
+ query: string,
106
+ ...bindings: any[]
107
+ ): BunSqlCursor<T> {
108
+ const stmt = this.db.prepare(query);
109
+
110
+ const trimmed = query.trimStart().toUpperCase();
111
+ const isSelect =
112
+ trimmed.startsWith("SELECT") ||
113
+ trimmed.startsWith("WITH") ||
114
+ trimmed.startsWith("PRAGMA");
115
+ const isInsertReturning = trimmed.includes("RETURNING");
116
+
117
+ if (isSelect || isInsertReturning) {
118
+ const rows = stmt.all(...bindings) as T[];
119
+ const columnNames = rows.length > 0 ? Object.keys(rows[0] as object) : [];
120
+ return new BunSqlCursor<T>(rows, columnNames, rows.length, 0);
121
+ }
122
+
123
+ const result = stmt.run(...bindings);
124
+ return new BunSqlCursor<T>([], [], 0, result.changes);
125
+ }
126
+
127
+ get databaseSize(): number {
128
+ return 0;
129
+ }
130
+
131
+ get Cursor(): any {
132
+ return BunSqlCursor;
133
+ }
134
+
135
+ get Statement(): any {
136
+ return class {};
137
+ }
138
+
139
+ close(): void {
140
+ this.db.close();
141
+ }
142
+ }
143
+
144
+ // ─── MockVectorize ──────────────────────────────────────────────────
145
+
146
/** Internal record for a vector held by MockVectorize. */
interface StoredVector {
  /** Vector id (unique key in the store). */
  id: string;
  /** Embedding values (copied from the caller's array on store). */
  values: number[];
  /** Optional namespace for scoped queries. */
  namespace?: string;
  /** Optional metadata returned when a query asks for it. */
  metadata?: Record<string, VectorizeVectorMetadata>;
}
152
+
153
+ function cosineSimilarity(a: number[], b: number[]): number {
154
+ let dot = 0;
155
+ let normA = 0;
156
+ let normB = 0;
157
+ for (let i = 0; i < a.length; i++) {
158
+ dot += a[i] * b[i];
159
+ normA += a[i] * a[i];
160
+ normB += b[i] * b[i];
161
+ }
162
+ const denom = Math.sqrt(normA) * Math.sqrt(normB);
163
+ return denom === 0 ? 0 : dot / denom;
164
+ }
165
+
166
+ /**
167
+ * In-memory Vectorize with brute-force cosine similarity.
168
+ *
169
+ * Structurally compatible with Cloudflare's Vectorize abstract class.
170
+ * Supports insert, upsert, query, queryById, getByIds, deleteByIds.
171
+ */
172
+ export class MockVectorize {
173
+ private vectors: Map<string, StoredVector> = new Map();
174
+
175
+ /** Inspect stored vectors for test assertions. */
176
+ get storedVectors(): Map<string, StoredVector> {
177
+ return this.vectors;
178
+ }
179
+
180
+ async describe(): Promise<VectorizeIndexInfo> {
181
+ return {
182
+ vectorCount: this.vectors.size,
183
+ dimensions: 0,
184
+ processedUpToDatetime: 0,
185
+ processedUpToMutation: 0,
186
+ };
187
+ }
188
+
189
+ async insert(vectors: VectorizeVector[]): Promise<VectorizeAsyncMutation> {
190
+ for (const v of vectors) {
191
+ if (this.vectors.has(v.id)) {
192
+ throw new Error(`Vector ${v.id} already exists`);
193
+ }
194
+ this.vectors.set(v.id, {
195
+ id: v.id,
196
+ values: Array.from(v.values),
197
+ namespace: v.namespace,
198
+ metadata: v.metadata,
199
+ });
200
+ }
201
+ return { mutationId: `mock-insert-${Date.now()}` };
202
+ }
203
+
204
+ async upsert(vectors: VectorizeVector[]): Promise<VectorizeAsyncMutation> {
205
+ for (const v of vectors) {
206
+ this.vectors.set(v.id, {
207
+ id: v.id,
208
+ values: Array.from(v.values),
209
+ namespace: v.namespace,
210
+ metadata: v.metadata,
211
+ });
212
+ }
213
+ return { mutationId: `mock-upsert-${Date.now()}` };
214
+ }
215
+
216
+ async query(
217
+ vector: number[] | Float32Array,
218
+ options?: VectorizeQueryOptions,
219
+ ): Promise<VectorizeMatches> {
220
+ const queryVec = Array.from(vector);
221
+ const topK = options?.topK ?? 5;
222
+
223
+ let candidates = Array.from(this.vectors.values());
224
+
225
+ if (options?.namespace) {
226
+ candidates = candidates.filter((v) => v.namespace === options.namespace);
227
+ }
228
+
229
+ const scored = candidates.map((v) => ({
230
+ id: v.id,
231
+ score: cosineSimilarity(queryVec, v.values),
232
+ namespace: v.namespace,
233
+ metadata: options?.returnMetadata === "all" ? v.metadata : undefined,
234
+ values: options?.returnValues ? v.values : undefined,
235
+ }));
236
+
237
+ scored.sort((a, b) => b.score - a.score);
238
+ const matches = scored.slice(0, topK);
239
+
240
+ return { matches, count: matches.length };
241
+ }
242
+
243
+ async queryById(
244
+ vectorId: string,
245
+ options?: VectorizeQueryOptions,
246
+ ): Promise<VectorizeMatches> {
247
+ const vec = this.vectors.get(vectorId);
248
+ if (!vec) return { matches: [], count: 0 };
249
+ return this.query(vec.values, options);
250
+ }
251
+
252
+ async getByIds(ids: string[]): Promise<VectorizeVector[]> {
253
+ return ids
254
+ .map((id) => this.vectors.get(id))
255
+ .filter((v): v is StoredVector => v !== undefined)
256
+ .map((v) => ({
257
+ id: v.id,
258
+ values: v.values,
259
+ namespace: v.namespace,
260
+ metadata: v.metadata,
261
+ }));
262
+ }
263
+
264
+ async deleteByIds(ids: string[]): Promise<VectorizeAsyncMutation> {
265
+ for (const id of ids) {
266
+ this.vectors.delete(id);
267
+ }
268
+ return { mutationId: `mock-delete-${Date.now()}` };
269
+ }
270
+
271
+ /** Reset all stored vectors. */
272
+ clear(): void {
273
+ this.vectors.clear();
274
+ }
275
+ }
276
+
277
+ // ─── Mock EmbedFn ───────────────────────────────────────────────────
278
+
279
+ /**
280
+ * Create a deterministic mock embedding function.
281
+ *
282
+ * Generates consistent vectors based on character frequency distribution.
283
+ * Similar texts produce similar vectors, enabling meaningful cosine similarity
284
+ * in tests without calling a real embedding model.
285
+ *
286
+ * @param dims - Number of embedding dimensions (default: 8)
287
+ */
288
+ export function createMockEmbedFn(dims = 8): EmbedFn {
289
+ return async (texts: string[]): Promise<number[][]> => {
290
+ return texts.map((text) => {
291
+ const lower = text.toLowerCase();
292
+ const vec = new Array(dims).fill(0);
293
+ for (let i = 0; i < lower.length; i++) {
294
+ const code = lower.charCodeAt(i);
295
+ vec[code % dims] += 1;
296
+ }
297
+ const norm = Math.sqrt(
298
+ vec.reduce((s: number, v: number) => s + v * v, 0),
299
+ );
300
+ return norm > 0 ? vec.map((v: number) => v / norm) : vec;
301
+ });
302
+ };
303
+ }
package/src/types.ts ADDED
@@ -0,0 +1,124 @@
1
+ /**
2
+ * Domain types for qmd-cf.
3
+ *
4
+ * Cloudflare platform types (SqlStorage, SqlStorageCursor, Vectorize,
5
+ * VectorizeVector, etc.) are ambient — provided by @cloudflare/workers-types
6
+ * via tsconfig's "types" array. They don't need to be imported or re-exported.
7
+ */
8
+
9
/** A document to be indexed. */
export interface Document {
  /** Unique identifier for this document (e.g. file path). */
  id: string;
  /** The full text content. */
  content: string;
  /** Optional title (boosts search relevance when matched). */
  title?: string;
  /** Optional document type for filtering (e.g. "fact", "daily_note", "summary"). */
  docType?: string;
  /** Optional namespace for scoped search (e.g. entity path, agent ID). */
  namespace?: string;
  /** Arbitrary metadata stored alongside the document. */
  metadata?: Record<string, string | number | boolean | null>;
}

/** A single chunk produced from a document by the chunker. */
export interface Chunk {
  /** Parent document ID. */
  docId: string;
  /** Sequence index within the document (0-based). */
  seq: number;
  /** The chunk text content. */
  text: string;
  /** Character offset of this chunk's start in the original document. */
  charOffset: number;
}
36
+
37
/** A search result returned from BM25 full-text search. */
export interface FtsResult {
  /** ID of the matched document. */
  docId: string;
  /** BM25 score normalized to (0, 1] — higher is better. */
  score: number;
  /** The matching chunk text (snippet). */
  snippet: string;
  /** Chunk sequence number. */
  seq: number;
  /** Document title, if one was indexed. */
  title: string | null;
  /** Document type, if one was indexed. */
  docType: string | null;
  /** Document namespace, if one was indexed. */
  namespace: string | null;
  /** Metadata stored with the document, if any. */
  metadata: Record<string, string | number | boolean | null> | null;
}

/** A search result returned from vector similarity search. */
export interface VectorResult {
  /** ID of the matched document. */
  docId: string;
  /** Cosine similarity score in [0, 1] — higher is better. */
  score: number;
  /** The matching chunk text. */
  snippet: string;
  /** Chunk sequence number. */
  seq: number;
  /** Document title, if one was indexed. */
  title: string | null;
  /** Document type, if one was indexed. */
  docType: string | null;
  /** Document namespace, if one was indexed. */
  namespace: string | null;
  /** Metadata stored with the document, if any. */
  metadata: Record<string, string | number | boolean | null> | null;
}

/** A merged search result after hybrid fusion. */
export interface SearchResult {
  /** ID of the matched document. */
  docId: string;
  /** Final fused score — higher is better. */
  score: number;
  /** The best matching chunk text. */
  snippet: string;
  /** Source of the result: which retrieval methods contributed. */
  sources: Array<"fts" | "vector">;
  /** Individual scores from each source. */
  sourceScores: { fts?: number; vector?: number };
  /** Document title, if one was indexed. */
  title: string | null;
  /** Document type, if one was indexed. */
  docType: string | null;
  /** Document namespace, if one was indexed. */
  namespace: string | null;
  /** Metadata stored with the document, if any. */
  metadata: Record<string, string | number | boolean | null> | null;
}
83
+
84
/** Options for search queries. */
export interface SearchOptions {
  /** Maximum number of results to return. Default: 10. */
  limit?: number;
  /** Filter by document type. */
  docType?: string;
  /** Filter by namespace. */
  namespace?: string;
}

/** Options for hybrid search queries (extends SearchOptions). */
export interface HybridSearchOptions extends SearchOptions {
  /** Weight for FTS results in RRF fusion. Default: 1.0. */
  ftsWeight?: number;
  /** Weight for vector results in RRF fusion. Default: 1.0. */
  vectorWeight?: number;
  /** RRF constant k. Higher values reduce the impact of high rankings. Default: 60. */
  rrfK?: number;
}

/** Configuration for the QMD index. */
export interface QmdConfig {
  /** Maximum characters per chunk. Default: 3200 (~800 tokens). */
  chunkSize?: number;
  /** Overlap characters between chunks. Default: 480 (15% of chunkSize). */
  chunkOverlap?: number;
  /** FTS5 tokenizer configuration. Default: "unicode61". */
  tokenizer?: string;
}
113
+
114
/** Embedding function signature — maps a batch of texts to one vector each. */
export type EmbedFn = (texts: string[]) => Promise<number[][]>;

/** Index statistics. */
export interface IndexStats {
  /** Number of indexed documents. */
  totalDocuments: number;
  /** Number of stored chunks across all documents. */
  totalChunks: number;
  /** Number of stored embedding vectors. */
  totalVectors: number;
  /** Distinct namespaces present in the index. */
  namespaces: string[];
  /** Distinct document types present in the index. */
  docTypes: string[];
}