@stablemodels/qmd-cf 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. package/LICENSE +21 -0
  2. package/dist/chunker.d.ts +11 -0
  3. package/dist/chunker.d.ts.map +1 -0
  4. package/dist/chunker.js +199 -0
  5. package/dist/chunker.js.map +1 -0
  6. package/dist/fts.d.ts +19 -0
  7. package/dist/fts.d.ts.map +1 -0
  8. package/dist/fts.js +109 -0
  9. package/dist/fts.js.map +1 -0
  10. package/dist/hash.d.ts +7 -0
  11. package/dist/hash.d.ts.map +1 -0
  12. package/dist/hash.js +14 -0
  13. package/dist/hash.js.map +1 -0
  14. package/dist/index.d.ts +56 -0
  15. package/dist/index.d.ts.map +1 -0
  16. package/dist/index.js +57 -0
  17. package/dist/index.js.map +1 -0
  18. package/dist/qmd.d.ts +158 -0
  19. package/dist/qmd.d.ts.map +1 -0
  20. package/dist/qmd.js +462 -0
  21. package/dist/qmd.js.map +1 -0
  22. package/dist/rrf.d.ts +22 -0
  23. package/dist/rrf.d.ts.map +1 -0
  24. package/dist/rrf.js +92 -0
  25. package/dist/rrf.js.map +1 -0
  26. package/dist/schema.d.ts +14 -0
  27. package/dist/schema.d.ts.map +1 -0
  28. package/dist/schema.js +128 -0
  29. package/dist/schema.js.map +1 -0
  30. package/dist/testing.d.ts +77 -0
  31. package/dist/testing.d.ts.map +1 -0
  32. package/dist/testing.js +242 -0
  33. package/dist/testing.js.map +1 -0
  34. package/dist/types.d.ts +118 -0
  35. package/dist/types.d.ts.map +1 -0
  36. package/dist/types.js +9 -0
  37. package/dist/types.js.map +1 -0
  38. package/dist/vector.d.ts +38 -0
  39. package/dist/vector.d.ts.map +1 -0
  40. package/dist/vector.js +174 -0
  41. package/dist/vector.js.map +1 -0
  42. package/package.json +49 -0
  43. package/src/bun-sqlite.d.ts +17 -0
  44. package/src/chunker.ts +250 -0
  45. package/src/fts.ts +140 -0
  46. package/src/hash.ts +13 -0
  47. package/src/index.ts +72 -0
  48. package/src/qmd.ts +706 -0
  49. package/src/rrf.ts +115 -0
  50. package/src/schema.ts +147 -0
  51. package/src/testing.ts +303 -0
  52. package/src/types.ts +124 -0
  53. package/src/vector.ts +236 -0
@@ -0,0 +1,118 @@
1
/**
 * Domain types for qmd-cf.
 *
 * Cloudflare platform types (SqlStorage, SqlStorageCursor, Vectorize,
 * VectorizeVector, etc.) are ambient — provided by @cloudflare/workers-types
 * via tsconfig's "types" array. They don't need to be imported or re-exported.
 */
// NOTE: compiled output of src/types.ts — edit the TypeScript source, not this file.
/** A document to be indexed. */
export interface Document {
    /** Unique identifier for this document (e.g. file path). */
    id: string;
    /** The full text content. */
    content: string;
    /** Optional title (boosts search relevance when matched). */
    title?: string;
    /** Optional document type for filtering (e.g. "fact", "daily_note", "summary"). */
    docType?: string;
    /** Optional namespace for scoped search (e.g. entity path, agent ID). */
    namespace?: string;
    /** Arbitrary metadata stored alongside the document. */
    metadata?: Record<string, string | number | boolean | null>;
}
/** A single chunk produced from a document. */
export interface Chunk {
    /** Parent document ID. */
    docId: string;
    /** Sequence index within the document (0-based). */
    seq: number;
    /** The chunk text content. */
    text: string;
    /** Character offset in the original document. */
    charOffset: number;
}
/** A search result returned from BM25 full-text search. */
export interface FtsResult {
    /** Matching document's ID. */
    docId: string;
    /** BM25 score normalized to (0, 1] — higher is better. */
    score: number;
    /** The matching chunk text (snippet). */
    snippet: string;
    /** Chunk sequence number. */
    seq: number;
    /** Document title, or null when the document has none. */
    title: string | null;
    /** Document type, or null when the document has none. */
    docType: string | null;
    /** Document namespace, or null when the document has none. */
    namespace: string | null;
    /** Stored document metadata, or null when the document has none. */
    metadata: Record<string, string | number | boolean | null> | null;
}
/** A search result returned from vector similarity search. */
export interface VectorResult {
    /** Matching document's ID. */
    docId: string;
    /** Cosine similarity score in [0, 1] — higher is better. */
    score: number;
    /** The matching chunk text. */
    snippet: string;
    /** Chunk sequence number. */
    seq: number;
    /** Document title, or null when the document has none. */
    title: string | null;
    /** Document type, or null when the document has none. */
    docType: string | null;
    /** Document namespace, or null when the document has none. */
    namespace: string | null;
    /** Stored document metadata, or null when the document has none. */
    metadata: Record<string, string | number | boolean | null> | null;
}
/** A merged search result after hybrid fusion. */
export interface SearchResult {
    /** Matching document's ID. */
    docId: string;
    /** Final fused score — higher is better. */
    score: number;
    /** The best matching chunk text. */
    snippet: string;
    /** Source of the result: which retrieval methods contributed. */
    sources: Array<"fts" | "vector">;
    /** Individual scores from each source. */
    sourceScores: {
        fts?: number;
        vector?: number;
    };
    /** Document title, or null when the document has none. */
    title: string | null;
    /** Document type, or null when the document has none. */
    docType: string | null;
    /** Document namespace, or null when the document has none. */
    namespace: string | null;
    /** Stored document metadata, or null when the document has none. */
    metadata: Record<string, string | number | boolean | null> | null;
}
/** Options for search queries. */
export interface SearchOptions {
    /** Maximum number of results to return. Default: 10. */
    limit?: number;
    /** Filter by document type. */
    docType?: string;
    /** Filter by namespace. */
    namespace?: string;
}
/** Options for hybrid search queries (extends SearchOptions). */
export interface HybridSearchOptions extends SearchOptions {
    /** Weight for FTS results in RRF fusion. Default: 1.0. */
    ftsWeight?: number;
    /** Weight for vector results in RRF fusion. Default: 1.0. */
    vectorWeight?: number;
    /** RRF constant k. Higher values reduce the impact of high rankings. Default: 60. */
    rrfK?: number;
}
/** Configuration for the QMD index. */
export interface QmdConfig {
    /** Maximum characters per chunk. Default: 3200 (~800 tokens). */
    chunkSize?: number;
    /** Overlap characters between chunks. Default: 480 (15% of chunkSize). */
    chunkOverlap?: number;
    /** FTS5 tokenizer configuration. Default: "unicode61". */
    tokenizer?: string;
}
/** Embedding function signature — maps text to a vector. */
export type EmbedFn = (texts: string[]) => Promise<number[][]>;
/** Index statistics. */
export interface IndexStats {
    totalDocuments: number;
    totalChunks: number;
    totalVectors: number;
    namespaces: string[];
    docTypes: string[];
}
//# sourceMappingURL=types.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AAEH,gCAAgC;AAChC,MAAM,WAAW,QAAQ;IACxB,4DAA4D;IAC5D,EAAE,EAAE,MAAM,CAAC;IACX,6BAA6B;IAC7B,OAAO,EAAE,MAAM,CAAC;IAChB,6DAA6D;IAC7D,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,mFAAmF;IACnF,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,yEAAyE;IACzE,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,wDAAwD;IACxD,QAAQ,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,GAAG,MAAM,GAAG,OAAO,GAAG,IAAI,CAAC,CAAC;CAC5D;AAED,+CAA+C;AAC/C,MAAM,WAAW,KAAK;IACrB,0BAA0B;IAC1B,KAAK,EAAE,MAAM,CAAC;IACd,oDAAoD;IACpD,GAAG,EAAE,MAAM,CAAC;IACZ,8BAA8B;IAC9B,IAAI,EAAE,MAAM,CAAC;IACb,iDAAiD;IACjD,UAAU,EAAE,MAAM,CAAC;CACnB;AAED,2DAA2D;AAC3D,MAAM,WAAW,SAAS;IACzB,KAAK,EAAE,MAAM,CAAC;IACd,0DAA0D;IAC1D,KAAK,EAAE,MAAM,CAAC;IACd,yCAAyC;IACzC,OAAO,EAAE,MAAM,CAAC;IAChB,6BAA6B;IAC7B,GAAG,EAAE,MAAM,CAAC;IACZ,KAAK,EAAE,MAAM,GAAG,IAAI,CAAC;IACrB,OAAO,EAAE,MAAM,GAAG,IAAI,CAAC;IACvB,SAAS,EAAE,MAAM,GAAG,IAAI,CAAC;IACzB,QAAQ,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,GAAG,MAAM,GAAG,OAAO,GAAG,IAAI,CAAC,GAAG,IAAI,CAAC;CAClE;AAED,8DAA8D;AAC9D,MAAM,WAAW,YAAY;IAC5B,KAAK,EAAE,MAAM,CAAC;IACd,4DAA4D;IAC5D,KAAK,EAAE,MAAM,CAAC;IACd,+BAA+B;IAC/B,OAAO,EAAE,MAAM,CAAC;IAChB,6BAA6B;IAC7B,GAAG,EAAE,MAAM,CAAC;IACZ,KAAK,EAAE,MAAM,GAAG,IAAI,CAAC;IACrB,OAAO,EAAE,MAAM,GAAG,IAAI,CAAC;IACvB,SAAS,EAAE,MAAM,GAAG,IAAI,CAAC;IACzB,QAAQ,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,GAAG,MAAM,GAAG,OAAO,GAAG,IAAI,CAAC,GAAG,IAAI,CAAC;CAClE;AAED,kDAAkD;AAClD,MAAM,WAAW,YAAY;IAC5B,KAAK,EAAE,MAAM,CAAC;IACd,4CAA4C;IAC5C,KAAK,EAAE,MAAM,CAAC;IACd,oCAAoC;IACpC,OAAO,EAAE,MAAM,CAAC;IAChB,iEAAiE;IACjE,OAAO,EAAE,KAAK,CAAC,KAAK,GAAG,QAAQ,CAAC,CAAC;IACjC,0CAA0C;IAC1C,YAAY,EAAE;QAAE,GAAG,CAAC,EAAE,MAAM,CAAC;QAAC,MAAM,CAAC,EAAE,MAAM,CAAA;KAAE,CAAC;IAChD,KAAK,EAAE,MAAM,GAAG,IAAI,CAAC;IACrB,OAAO,EAAE,MAAM,GAAG,IAAI,CAAC;IACvB,SAAS,EAAE,MAAM,GAAG,IAAI,CAAC;IACzB,QAAQ,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,GAAG,MAAM,GAAG,OAAO,GAAG,IAAI,CAAC,GAAG,IAAI,CAAC;CAClE;AAED,kCAAkC;AAClC,MAAM,WAAW,aAAa;IAC7B,wDAAwD;IACxD,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,+B
AA+B;IAC/B,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,2BAA2B;IAC3B,SAAS,CAAC,EAAE,MAAM,CAAC;CACnB;AAED,iEAAiE;AACjE,MAAM,WAAW,mBAAoB,SAAQ,aAAa;IACzD,0DAA0D;IAC1D,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,6DAA6D;IAC7D,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,qFAAqF;IACrF,IAAI,CAAC,EAAE,MAAM,CAAC;CACd;AAED,uCAAuC;AACvC,MAAM,WAAW,SAAS;IACzB,iEAAiE;IACjE,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,0EAA0E;IAC1E,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,0DAA0D;IAC1D,SAAS,CAAC,EAAE,MAAM,CAAC;CACnB;AAED,4DAA4D;AAC5D,MAAM,MAAM,OAAO,GAAG,CAAC,KAAK,EAAE,MAAM,EAAE,KAAK,OAAO,CAAC,MAAM,EAAE,EAAE,CAAC,CAAC;AAE/D,wBAAwB;AACxB,MAAM,WAAW,UAAU;IAC1B,cAAc,EAAE,MAAM,CAAC;IACvB,WAAW,EAAE,MAAM,CAAC;IACpB,YAAY,EAAE,MAAM,CAAC;IACrB,UAAU,EAAE,MAAM,EAAE,CAAC;IACrB,QAAQ,EAAE,MAAM,EAAE,CAAC;CACnB"}
package/dist/types.js ADDED
@@ -0,0 +1,9 @@
1
+ /**
2
+ * Domain types for qmd-cf.
3
+ *
4
+ * Cloudflare platform types (SqlStorage, SqlStorageCursor, Vectorize,
5
+ * VectorizeVector, etc.) are ambient — provided by @cloudflare/workers-types
6
+ * via tsconfig's "types" array. They don't need to be imported or re-exported.
7
+ */
8
+ export {};
9
+ //# sourceMappingURL=types.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"types.js","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG"}
@@ -0,0 +1,38 @@
1
import type { EmbedFn, SearchOptions, VectorResult } from "./types.js";
// NOTE: compiled output of src/vector.ts — edit the TypeScript source, not this file.
/**
 * Format text for embedding (document indexing).
 * Follows nomic/qmd convention of prefixing with title context.
 */
export declare function formatDocForEmbedding(text: string, title?: string, context?: string): string;
/**
 * Format a query string for embedding (search time).
 */
export declare function formatQueryForEmbedding(query: string): string;
/**
 * Index chunks into Vectorize with embeddings.
 *
 * Each chunk gets a vector ID of "{docId}_{seq}" which maps back to qmd_chunks.
 * Vectors are stored in a namespace matching the document's namespace for scoped search.
 */
export declare function indexVectors(vectorize: Vectorize, embedFn: EmbedFn, chunks: Array<{
    docId: string;
    seq: number;
    text: string;
    title?: string;
    namespace?: string;
    docType?: string;
    context?: string;
}>): Promise<void>;
/**
 * Remove all vectors for a document from Vectorize.
 */
export declare function removeVectors(vectorize: Vectorize, sql: SqlStorage, docId: string): Promise<void>;
/**
 * Execute a vector similarity search via Vectorize.
 *
 * 1. Embed the query
 * 2. Query Vectorize for nearest neighbors (scoped by namespace if provided)
 * 3. Look up chunk content from the local SQLite for snippet extraction
 */
export declare function searchVector(vectorize: Vectorize, embedFn: EmbedFn, sql: SqlStorage, query: string, options?: SearchOptions): Promise<VectorResult[]>;
//# sourceMappingURL=vector.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"vector.d.ts","sourceRoot":"","sources":["../src/vector.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,OAAO,EAAE,aAAa,EAAE,YAAY,EAAE,MAAM,YAAY,CAAC;AAEvE;;;GAGG;AACH,wBAAgB,qBAAqB,CACpC,IAAI,EAAE,MAAM,EACZ,KAAK,CAAC,EAAE,MAAM,EACd,OAAO,CAAC,EAAE,MAAM,GACd,MAAM,CAMR;AAED;;GAEG;AACH,wBAAgB,uBAAuB,CAAC,KAAK,EAAE,MAAM,GAAG,MAAM,CAE7D;AAED;;;;;GAKG;AACH,wBAAsB,YAAY,CACjC,SAAS,EAAE,SAAS,EACpB,OAAO,EAAE,OAAO,EAChB,MAAM,EAAE,KAAK,CAAC;IACb,KAAK,EAAE,MAAM,CAAC;IACd,GAAG,EAAE,MAAM,CAAC;IACZ,IAAI,EAAE,MAAM,CAAC;IACb,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,OAAO,CAAC,EAAE,MAAM,CAAC;CACjB,CAAC,GACA,OAAO,CAAC,IAAI,CAAC,CA8Bf;AAED;;GAEG;AACH,wBAAsB,aAAa,CAClC,SAAS,EAAE,SAAS,EACpB,GAAG,EAAE,UAAU,EACf,KAAK,EAAE,MAAM,GACX,OAAO,CAAC,IAAI,CAAC,CAUf;AAYD;;;;;;GAMG;AACH,wBAAsB,YAAY,CACjC,SAAS,EAAE,SAAS,EACpB,OAAO,EAAE,OAAO,EAChB,GAAG,EAAE,UAAU,EACf,KAAK,EAAE,MAAM,EACb,OAAO,GAAE,aAAkB,GACzB,OAAO,CAAC,YAAY,EAAE,CAAC,CAsHzB"}
package/dist/vector.js ADDED
@@ -0,0 +1,174 @@
1
/**
 * Format document text for embedding at index time.
 * Follows the nomic/qmd convention of prefixing the body with its title
 * (and optional context), pipe-separated: "context: … | title: … | text: …".
 */
export function formatDocForEmbedding(text, title, context) {
    let formatted = "";
    // Context is optional and, when present, leads the string.
    if (context)
        formatted += `context: ${context} | `;
    // A missing title is encoded as the literal word "none".
    formatted += `title: ${title || "none"} | text: ${text}`;
    return formatted;
}
13
/**
 * Format a query string for embedding at search time
 * (prefixes the raw query with "search_query: ").
 */
export function formatQueryForEmbedding(query) {
    return ["search_query", query].join(": ");
}
19
+ /**
20
+ * Index chunks into Vectorize with embeddings.
21
+ *
22
+ * Each chunk gets a vector ID of "{docId}_{seq}" which maps back to qmd_chunks.
23
+ * Vectors are stored in a namespace matching the document's namespace for scoped search.
24
+ */
25
+ export async function indexVectors(vectorize, embedFn, chunks) {
26
+ if (chunks.length === 0)
27
+ return;
28
+ // Format texts for embedding (includes context if provided)
29
+ const texts = chunks.map((c) => formatDocForEmbedding(c.text, c.title, c.context));
30
+ // Generate embeddings in batch (Workers AI supports up to 100 at a time)
31
+ const batchSize = 100;
32
+ for (let i = 0; i < texts.length; i += batchSize) {
33
+ const batchTexts = texts.slice(i, i + batchSize);
34
+ const batchChunks = chunks.slice(i, i + batchSize);
35
+ const embeddings = await embedFn(batchTexts);
36
+ const vectors = batchChunks.map((c, j) => ({
37
+ id: `${c.docId}_${c.seq}`,
38
+ values: embeddings[j],
39
+ namespace: c.namespace ? c.namespace.split("/")[0] : undefined,
40
+ metadata: {
41
+ docId: c.docId,
42
+ seq: c.seq,
43
+ docType: c.docType ?? "",
44
+ directory: c.namespace ?? "",
45
+ },
46
+ }));
47
+ await vectorize.upsert(vectors);
48
+ }
49
+ }
50
+ /**
51
+ * Remove all vectors for a document from Vectorize.
52
+ */
53
+ export async function removeVectors(vectorize, sql, docId) {
54
+ // Look up all chunk seq numbers for this document
55
+ const chunks = sql
56
+ .exec("SELECT seq FROM qmd_chunks WHERE doc_id = ?", docId)
57
+ .toArray();
58
+ if (chunks.length === 0)
59
+ return;
60
+ const ids = chunks.map((c) => `${docId}_${c.seq}`);
61
+ await vectorize.deleteByIds(ids);
62
+ }
63
+ /**
64
+ * Execute a vector similarity search via Vectorize.
65
+ *
66
+ * 1. Embed the query
67
+ * 2. Query Vectorize for nearest neighbors (scoped by namespace if provided)
68
+ * 3. Look up chunk content from the local SQLite for snippet extraction
69
+ */
70
+ export async function searchVector(vectorize, embedFn, sql, query, options = {}) {
71
+ const limit = options.limit ?? 10;
72
+ // Embed the query
73
+ const queryText = formatQueryForEmbedding(query);
74
+ const [queryVector] = await embedFn([queryText]);
75
+ // Resolve namespace for Vectorize query: use first path segment for glob/path patterns
76
+ let vectorizeNamespace;
77
+ let directoryPrefix;
78
+ if (options.namespace) {
79
+ if (options.namespace.includes("*")) {
80
+ // Glob pattern: people/* → Vectorize ns "people", no post-filter needed for top-level
81
+ const prefix = options.namespace.replace(/\*+$/, "").replace(/\/+$/, "");
82
+ vectorizeNamespace = prefix.split("/")[0];
83
+ // Only need post-filter if glob is deeper than top-level (e.g. projects/ember/*)
84
+ if (prefix.includes("/")) {
85
+ directoryPrefix = `${prefix}/`;
86
+ }
87
+ }
88
+ else {
89
+ // Exact directory: people/ryan → Vectorize ns "people", post-filter by full path
90
+ vectorizeNamespace = options.namespace.split("/")[0];
91
+ if (options.namespace.includes("/")) {
92
+ directoryPrefix = options.namespace;
93
+ }
94
+ }
95
+ }
96
+ // Query Vectorize
97
+ const matches = await vectorize.query(queryVector, {
98
+ topK: limit * 3, // Fetch extra for dedup
99
+ returnMetadata: "all",
100
+ namespace: vectorizeNamespace,
101
+ });
102
+ if (matches.matches.length === 0)
103
+ return [];
104
+ // Collect chunk IDs to look up content from local SQLite
105
+ const chunkKeys = matches.matches.map((m) => {
106
+ const meta = m.metadata;
107
+ return {
108
+ vectorId: m.id,
109
+ score: m.score,
110
+ docId: meta?.docId ?? m.id.split("_").slice(0, -1).join("_"),
111
+ seq: meta?.seq ?? Number.parseInt(m.id.split("_").pop() ?? "0", 10),
112
+ };
113
+ });
114
+ // Filter by docType if specified (Vectorize metadata filtering could also do this,
115
+ // but we filter here for portability)
116
+ let filteredKeys = options.docType
117
+ ? chunkKeys.filter((k) => {
118
+ const meta = matches.matches.find((m) => m.id === k.vectorId)?.metadata;
119
+ return meta?.docType === options.docType;
120
+ })
121
+ : chunkKeys;
122
+ // Post-filter by directory prefix when namespace is deeper than first segment
123
+ if (directoryPrefix) {
124
+ filteredKeys = filteredKeys.filter((k) => {
125
+ const meta = matches.matches.find((m) => m.id === k.vectorId)?.metadata;
126
+ const dir = meta?.directory;
127
+ if (!dir)
128
+ return false;
129
+ return dir === directoryPrefix || dir.startsWith(`${directoryPrefix}/`);
130
+ });
131
+ }
132
+ if (filteredKeys.length === 0)
133
+ return [];
134
+ // Batch look up chunk content from SQLite
135
+ const placeholders = filteredKeys.map(() => "(?, ?)").join(", ");
136
+ const bindings = filteredKeys.flatMap((k) => [k.docId, k.seq]);
137
+ const rows = sql
138
+ .exec(`
139
+ SELECT c.doc_id, c.seq, c.content, d.title, d.doc_type, d.namespace, d.metadata
140
+ FROM qmd_chunks c
141
+ JOIN qmd_documents d ON d.id = c.doc_id
142
+ WHERE (c.doc_id, c.seq) IN (VALUES ${placeholders})
143
+ `, ...bindings)
144
+ .toArray();
145
+ // Build lookup map
146
+ const chunkMap = new Map();
147
+ for (const row of rows) {
148
+ chunkMap.set(`${row.doc_id}_${row.seq}`, row);
149
+ }
150
+ // Merge scores with content, dedup by docId
151
+ const seen = new Map();
152
+ for (const key of filteredKeys) {
153
+ const row = chunkMap.get(`${key.docId}_${key.seq}`);
154
+ if (!row)
155
+ continue;
156
+ const existing = seen.get(key.docId);
157
+ if (!existing || key.score > existing.score) {
158
+ seen.set(key.docId, {
159
+ docId: key.docId,
160
+ score: key.score,
161
+ snippet: row.content,
162
+ seq: key.seq,
163
+ title: row.title,
164
+ docType: row.doc_type,
165
+ namespace: row.namespace,
166
+ metadata: row.metadata ? JSON.parse(row.metadata) : null,
167
+ });
168
+ }
169
+ }
170
+ return Array.from(seen.values())
171
+ .sort((a, b) => b.score - a.score)
172
+ .slice(0, limit);
173
+ }
174
+ //# sourceMappingURL=vector.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"vector.js","sourceRoot":"","sources":["../src/vector.ts"],"names":[],"mappings":"AAEA;;;GAGG;AACH,MAAM,UAAU,qBAAqB,CACpC,IAAY,EACZ,KAAc,EACd,OAAgB;IAEhB,MAAM,KAAK,GAAa,EAAE,CAAC;IAC3B,IAAI,OAAO;QAAE,KAAK,CAAC,IAAI,CAAC,YAAY,OAAO,EAAE,CAAC,CAAC;IAC/C,KAAK,CAAC,IAAI,CAAC,UAAU,KAAK,IAAI,MAAM,EAAE,CAAC,CAAC;IACxC,KAAK,CAAC,IAAI,CAAC,SAAS,IAAI,EAAE,CAAC,CAAC;IAC5B,OAAO,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;AAC1B,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,uBAAuB,CAAC,KAAa;IACpD,OAAO,iBAAiB,KAAK,EAAE,CAAC;AACjC,CAAC;AAED;;;;;GAKG;AACH,MAAM,CAAC,KAAK,UAAU,YAAY,CACjC,SAAoB,EACpB,OAAgB,EAChB,MAQE;IAEF,IAAI,MAAM,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO;IAEhC,4DAA4D;IAC5D,MAAM,KAAK,GAAG,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAC9B,qBAAqB,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,KAAK,EAAE,CAAC,CAAC,OAAO,CAAC,CACjD,CAAC;IAEF,yEAAyE;IACzE,MAAM,SAAS,GAAG,GAAG,CAAC;IACtB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,IAAI,SAAS,EAAE,CAAC;QAClD,MAAM,UAAU,GAAG,KAAK,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,GAAG,SAAS,CAAC,CAAC;QACjD,MAAM,WAAW,GAAG,MAAM,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,GAAG,SAAS,CAAC,CAAC;QAEnD,MAAM,UAAU,GAAG,MAAM,OAAO,CAAC,UAAU,CAAC,CAAC;QAE7C,MAAM,OAAO,GAAG,WAAW,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC;YAC1C,EAAE,EAAE,GAAG,CAAC,CAAC,KAAK,IAAI,CAAC,CAAC,GAAG,EAAE;YACzB,MAAM,EAAE,UAAU,CAAC,CAAC,CAAC;YACrB,SAAS,EAAE,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,SAAS;YAC9D,QAAQ,EAAE;gBACT,KAAK,EAAE,CAAC,CAAC,KAAK;gBACd,GAAG,EAAE,CAAC,CAAC,GAAG;gBACV,OAAO,EAAE,CAAC,CAAC,OAAO,IAAI,EAAE;gBACxB,SAAS,EAAE,CAAC,CAAC,SAAS,IAAI,EAAE;aAC5B;SACD,CAAC,CAAC,CAAC;QAEJ,MAAM,SAAS,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;IACjC,CAAC;AACF,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,aAAa,CAClC,SAAoB,EACpB,GAAe,EACf,KAAa;IAEb,kDAAkD;IAClD,MAAM,MAAM,GAAG,GAAG;SAChB,IAAI,CAAkB,6CAA6C,EAAE,KAAK,CAAC;SAC3E,OAAO,EAAE,CAAC;IAEZ,IAAI,MAAM,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO;IAEhC,MAAM,GAAG,GAAG,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,GAAG,KAAK,IAAI,CAA
C,CAAC,GAAG,EAAE,CAAC,CAAC;IACnD,MAAM,SAAS,CAAC,WAAW,CAAC,GAAG,CAAC,CAAC;AAClC,CAAC;AAYD;;;;;;GAMG;AACH,MAAM,CAAC,KAAK,UAAU,YAAY,CACjC,SAAoB,EACpB,OAAgB,EAChB,GAAe,EACf,KAAa,EACb,UAAyB,EAAE;IAE3B,MAAM,KAAK,GAAG,OAAO,CAAC,KAAK,IAAI,EAAE,CAAC;IAElC,kBAAkB;IAClB,MAAM,SAAS,GAAG,uBAAuB,CAAC,KAAK,CAAC,CAAC;IACjD,MAAM,CAAC,WAAW,CAAC,GAAG,MAAM,OAAO,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC;IAEjD,uFAAuF;IACvF,IAAI,kBAAsC,CAAC;IAC3C,IAAI,eAAmC,CAAC;IACxC,IAAI,OAAO,CAAC,SAAS,EAAE,CAAC;QACvB,IAAI,OAAO,CAAC,SAAS,CAAC,QAAQ,CAAC,GAAG,CAAC,EAAE,CAAC;YACrC,sFAAsF;YACtF,MAAM,MAAM,GAAG,OAAO,CAAC,SAAS,CAAC,OAAO,CAAC,MAAM,EAAE,EAAE,CAAC,CAAC,OAAO,CAAC,MAAM,EAAE,EAAE,CAAC,CAAC;YACzE,kBAAkB,GAAG,MAAM,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;YAC1C,iFAAiF;YACjF,IAAI,MAAM,CAAC,QAAQ,CAAC,GAAG,CAAC,EAAE,CAAC;gBAC1B,eAAe,GAAG,GAAG,MAAM,GAAG,CAAC;YAChC,CAAC;QACF,CAAC;aAAM,CAAC;YACP,iFAAiF;YACjF,kBAAkB,GAAG,OAAO,CAAC,SAAS,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;YACrD,IAAI,OAAO,CAAC,SAAS,CAAC,QAAQ,CAAC,GAAG,CAAC,EAAE,CAAC;gBACrC,eAAe,GAAG,OAAO,CAAC,SAAS,CAAC;YACrC,CAAC;QACF,CAAC;IACF,CAAC;IAED,kBAAkB;IAClB,MAAM,OAAO,GAAG,MAAM,SAAS,CAAC,KAAK,CAAC,WAAW,EAAE;QAClD,IAAI,EAAE,KAAK,GAAG,CAAC,EAAE,wBAAwB;QACzC,cAAc,EAAE,KAAK;QACrB,SAAS,EAAE,kBAAkB;KAC7B,CAAC,CAAC;IAEH,IAAI,OAAO,CAAC,OAAO,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,EAAE,CAAC;IAE5C,yDAAyD;IACzD,MAAM,SAAS,GAAG,OAAO,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE;QAC3C,MAAM,IAAI,GAAG,CAAC,CAAC,QAEH,CAAC;QACb,OAAO;YACN,QAAQ,EAAE,CAAC,CAAC,EAAE;YACd,KAAK,EAAE,CAAC,CAAC,KAAK;YACd,KAAK,EAAE,IAAI,EAAE,KAAK,IAAI,CAAC,CAAC,EAAE,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC;YAC5D,GAAG,EAAE,IAAI,EAAE,GAAG,IAAI,MAAM,CAAC,QAAQ,CAAC,CAAC,CAAC,EAAE,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,GAAG,EAAE,IAAI,GAAG,EAAE,EAAE,CAAC;SACnE,CAAC;IACH,CAAC,CAAC,CAAC;IAEH,mFAAmF;IACnF,sCAAsC;IACtC,IAAI,YAAY,GAAG,OAAO,CAAC,OAAO;QACjC,CAAC,CAAC,SAAS,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE;YACvB,MAAM,IAAI,GAAG,OAAO,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAA
C,CAAC,CAAC,EAAE,KAAK,CAAC,CAAC,QAAQ,CAAC,EAAE,QAAQ,CAAC;YACxE,OAAO,IAAI,EAAE,OAAO,KAAK,OAAO,CAAC,OAAO,CAAC;QAC1C,CAAC,CAAC;QACH,CAAC,CAAC,SAAS,CAAC;IAEb,8EAA8E;IAC9E,IAAI,eAAe,EAAE,CAAC;QACrB,YAAY,GAAG,YAAY,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE;YACxC,MAAM,IAAI,GAAG,OAAO,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,KAAK,CAAC,CAAC,QAAQ,CAAC,EAAE,QAAQ,CAAC;YACxE,MAAM,GAAG,GAAG,IAAI,EAAE,SAA0B,CAAC;YAC7C,IAAI,CAAC,GAAG;gBAAE,OAAO,KAAK,CAAC;YACvB,OAAO,GAAG,KAAK,eAAe,IAAI,GAAG,CAAC,UAAU,CAAC,GAAG,eAAe,GAAG,CAAC,CAAC;QACzE,CAAC,CAAC,CAAC;IACJ,CAAC;IAED,IAAI,YAAY,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,EAAE,CAAC;IAEzC,0CAA0C;IAC1C,MAAM,YAAY,GAAG,YAAY,CAAC,GAAG,CAAC,GAAG,EAAE,CAAC,QAAQ,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IACjE,MAAM,QAAQ,GAAG,YAAY,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,KAAK,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;IAE/D,MAAM,IAAI,GAAG,GAAG;SACd,IAAI,CACJ;;;;wCAIqC,YAAY;GACjD,EACA,GAAG,QAAQ,CACX;SACA,OAAO,EAAE,CAAC;IAEZ,mBAAmB;IACnB,MAAM,QAAQ,GAAG,IAAI,GAAG,EAAoB,CAAC;IAC7C,KAAK,MAAM,GAAG,IAAI,IAAI,EAAE,CAAC;QACxB,QAAQ,CAAC,GAAG,CAAC,GAAG,GAAG,CAAC,MAAM,IAAI,GAAG,CAAC,GAAG,EAAE,EAAE,GAAG,CAAC,CAAC;IAC/C,CAAC;IAED,4CAA4C;IAC5C,MAAM,IAAI,GAAG,IAAI,GAAG,EAAwB,CAAC;IAE7C,KAAK,MAAM,GAAG,IAAI,YAAY,EAAE,CAAC;QAChC,MAAM,GAAG,GAAG,QAAQ,CAAC,GAAG,CAAC,GAAG,GAAG,CAAC,KAAK,IAAI,GAAG,CAAC,GAAG,EAAE,CAAC,CAAC;QACpD,IAAI,CAAC,GAAG;YAAE,SAAS;QAEnB,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC;QACrC,IAAI,CAAC,QAAQ,IAAI,GAAG,CAAC,KAAK,GAAG,QAAQ,CAAC,KAAK,EAAE,CAAC;YAC7C,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,KAAK,EAAE;gBACnB,KAAK,EAAE,GAAG,CAAC,KAAK;gBAChB,KAAK,EAAE,GAAG,CAAC,KAAK;gBAChB,OAAO,EAAE,GAAG,CAAC,OAAO;gBACpB,GAAG,EAAE,GAAG,CAAC,GAAG;gBACZ,KAAK,EAAE,GAAG,CAAC,KAAK;gBAChB,OAAO,EAAE,GAAG,CAAC,QAAQ;gBACrB,SAAS,EAAE,GAAG,CAAC,SAAS;gBACxB,QAAQ,EAAE,GAAG,CAAC,QAAQ,CAAC,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,QAAkB,CAAC,CAAC,CAAC,CAAC,IAAI;aAClE,CAAC,CAAC;QACJ,CAAC;IACF,CAAC;IAED,OAAO,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,MAAM,EAAE,CAAC;SAC9B,IAAI,CAAC,CAAC,CAA
C,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,KAAK,CAAC;SACjC,KAAK,CAAC,CAAC,EAAE,KAAK,CAAC,CAAC;AACnB,CAAC"}
package/package.json ADDED
@@ -0,0 +1,49 @@
1
+ {
2
+ "name": "@stablemodels/qmd-cf",
3
+ "version": "0.1.0",
4
+ "description": "Hybrid full-text + vector search for Cloudflare Durable Objects. A DO-native reimagination of qmd.",
5
+ "type": "module",
6
+ "exports": {
7
+ ".": {
8
+ "types": "./dist/index.d.ts",
9
+ "import": "./dist/index.js"
10
+ },
11
+ "./testing": {
12
+ "types": "./dist/testing.d.ts",
13
+ "import": "./dist/testing.js"
14
+ }
15
+ },
16
+ "main": "dist/index.js",
17
+ "types": "dist/index.d.ts",
18
+ "files": ["dist", "src", "README.md"],
19
+ "scripts": {
20
+ "build": "tsc",
21
+ "check": "biome check .",
22
+ "format": "biome check --write .",
23
+ "test": "bun test tests/*.test.ts && vitest run --config vitest.config.ts",
24
+ "test:unit": "bun test tests/*.test.ts",
25
+ "test:cf": "vitest run --config vitest.config.ts"
26
+ },
27
+ "repository": {
28
+ "type": "git",
29
+ "url": "https://github.com/StableModels/qmd-cf"
30
+ },
31
+ "peerDependencies": {
32
+ "@cloudflare/workers-types": ">=4.0.0"
33
+ },
34
+ "peerDependenciesMeta": {
35
+ "@cloudflare/workers-types": {
36
+ "optional": true
37
+ }
38
+ },
39
+ "devDependencies": {
40
+ "@biomejs/biome": "1.9.4",
41
+ "@cloudflare/vitest-pool-workers": "^0.12.18",
42
+ "@cloudflare/workers-types": "^4.20251231.0",
43
+ "@vitest/runner": "3.2.0",
44
+ "@vitest/snapshot": "3.2.0",
45
+ "typescript": "^5.7.0",
46
+ "vitest": "3.2.0",
47
+ "wrangler": "^4.0.0"
48
+ }
49
+ }
@@ -0,0 +1,17 @@
1
/**
 * Minimal type declarations for bun:sqlite used by the testing module.
 * Kept intentionally narrow to avoid conflicts with @cloudflare/workers-types.
 */
declare module "bun:sqlite" {
	/** Subset of Bun's SQLite Database API used by the tests. */
	class Database {
		/** Open (or create) the database at `filename`. */
		constructor(filename: string);
		/** Execute SQL without returning rows (DDL, pragmas, etc.). */
		exec(query: string): void;
		/** Compile a SQL statement for repeated parameterized execution. */
		prepare(query: string): Statement;
		/** Close the underlying database handle. */
		close(): void;
	}

	/** Subset of Bun's prepared-statement API used by the tests. */
	class Statement {
		/** Execute the statement; reports the number of rows changed. */
		run(...params: unknown[]): { changes: number };
		/** Execute the statement and return all result rows. */
		all(...params: unknown[]): unknown[];
	}
}
package/src/chunker.ts ADDED
@@ -0,0 +1,250 @@
1
+ import type { Chunk } from "./types.js";
2
+
3
// Chunking defaults used by chunkText when the caller passes no explicit sizes.
const DEFAULT_CHUNK_SIZE = 3200; // ~800 tokens at ~4 chars/token
const DEFAULT_CHUNK_OVERLAP = 480; // 15% overlap

/** Break point scores — spread wide so headings decisively win over paragraphs. */
const BREAK_SCORES: Record<string, number> = {
	h1: 100,
	h2: 90,
	h3: 80,
	h4: 70,
	h5: 60,
	h6: 50,
	code_fence: 80, // breaking right before a ``` fence keeps code blocks intact
	hr: 60,
	paragraph: 20,
	list_item: 5,
	newline: 1, // any line end works as a last-resort break
};

/** A candidate split position produced by scanBreakPoints. */
interface BreakPoint {
	// Character offset into the document where a new chunk may begin.
	offset: number;
	// Structural desirability of breaking here (one of BREAK_SCORES).
	score: number;
}
25
+
26
+ /**
27
+ * Chunk a document into overlapping segments, seeking intelligent break points.
28
+ *
29
+ * Uses a scored break point system (from qmd) that pre-scans the entire document
30
+ * for structural markers (headings, code fences, paragraphs, etc.) and picks the
31
+ * highest-scoring break point within a window around the target cut position.
32
+ * Avoids splitting inside fenced code blocks.
33
+ */
34
+ export function chunkText(
35
+ docId: string,
36
+ content: string,
37
+ maxChars: number = DEFAULT_CHUNK_SIZE,
38
+ overlapChars: number = DEFAULT_CHUNK_OVERLAP,
39
+ ): Chunk[] {
40
+ if (content.length === 0) {
41
+ return [];
42
+ }
43
+
44
+ // Short content: single chunk, no splitting needed
45
+ if (content.length <= maxChars) {
46
+ return [{ docId, seq: 0, text: content, charOffset: 0 }];
47
+ }
48
+
49
+ const breakPoints = scanBreakPoints(content);
50
+ const codeFences = findCodeFences(content);
51
+ const chunks: Chunk[] = [];
52
+ let pos = 0;
53
+ let seq = 0;
54
+
55
+ while (pos < content.length) {
56
+ const remaining = content.length - pos;
57
+
58
+ if (remaining <= maxChars) {
59
+ chunks.push({ docId, seq, text: content.slice(pos), charOffset: pos });
60
+ break;
61
+ }
62
+
63
+ const targetEnd = pos + maxChars;
64
+ const cutoff = findBestCutoff(
65
+ content,
66
+ breakPoints,
67
+ codeFences,
68
+ targetEnd,
69
+ maxChars,
70
+ );
71
+
72
+ // Ensure we make forward progress
73
+ const endPos = cutoff > pos ? cutoff : pos + maxChars;
74
+
75
+ chunks.push({
76
+ docId,
77
+ seq,
78
+ text: content.slice(pos, endPos),
79
+ charOffset: pos,
80
+ });
81
+
82
+ // Advance position, subtracting overlap
83
+ const advance = endPos - pos - overlapChars;
84
+ pos += Math.max(advance, 1);
85
+
86
+ // Don't start the next chunk inside a code fence — skip to fence end
87
+ for (const [fStart, fEnd] of codeFences) {
88
+ if (pos > fStart && pos < fEnd) {
89
+ pos = fEnd;
90
+ break;
91
+ }
92
+ }
93
+
94
+ seq++;
95
+ }
96
+
97
+ return chunks;
98
+ }
99
+
100
+ /**
101
+ * Pre-scan the document for structural break points with scores.
102
+ *
103
+ * Returns break points sorted by offset. Each offset points to the first
104
+ * character of the new section (i.e., right after the structural marker).
105
+ */
106
+ function scanBreakPoints(text: string): BreakPoint[] {
107
+ const points: BreakPoint[] = [];
108
+ const lines = text.split("\n");
109
+ let offset = 0;
110
+
111
+ for (let i = 0; i < lines.length; i++) {
112
+ const line = lines[i];
113
+ const lineStart = offset;
114
+ const nextLineStart = offset + line.length + 1; // +1 for the \n
115
+
116
+ // Headings (must be at start of line)
117
+ if (line.startsWith("###### ")) {
118
+ points.push({ offset: lineStart, score: BREAK_SCORES.h6 });
119
+ } else if (line.startsWith("##### ")) {
120
+ points.push({ offset: lineStart, score: BREAK_SCORES.h5 });
121
+ } else if (line.startsWith("#### ")) {
122
+ points.push({ offset: lineStart, score: BREAK_SCORES.h4 });
123
+ } else if (line.startsWith("### ")) {
124
+ points.push({ offset: lineStart, score: BREAK_SCORES.h3 });
125
+ } else if (line.startsWith("## ")) {
126
+ points.push({ offset: lineStart, score: BREAK_SCORES.h2 });
127
+ } else if (line.startsWith("# ")) {
128
+ points.push({ offset: lineStart, score: BREAK_SCORES.h1 });
129
+ }
130
+
131
+ // Code fences (``` at start of line) — break before the fence
132
+ if (line.startsWith("```")) {
133
+ points.push({ offset: lineStart, score: BREAK_SCORES.code_fence });
134
+ }
135
+
136
+ // Horizontal rules (---, ***, ___ with optional spaces)
137
+ if (/^(\s*[-*_]\s*){3,}$/.test(line)) {
138
+ points.push({ offset: lineStart, score: BREAK_SCORES.hr });
139
+ }
140
+
141
+ // Paragraph boundary (empty line followed by content)
142
+ if (line === "" && i > 0) {
143
+ points.push({ offset: nextLineStart, score: BREAK_SCORES.paragraph });
144
+ }
145
+
146
+ // List items
147
+ if (/^(\s*[-*+]\s|\s*\d+\.\s)/.test(line)) {
148
+ points.push({ offset: lineStart, score: BREAK_SCORES.list_item });
149
+ }
150
+
151
+ // Every newline is a minimal break point
152
+ if (i < lines.length - 1) {
153
+ points.push({ offset: nextLineStart, score: BREAK_SCORES.newline });
154
+ }
155
+
156
+ offset = nextLineStart;
157
+ }
158
+
159
+ return points;
160
+ }
161
+
162
+ /**
163
+ * Find matched code fence (```) ranges. Returns [start, end] pairs
164
+ * where start is the offset of the opening fence and end is the offset
165
+ * just after the closing fence line's newline.
166
+ */
167
+ function findCodeFences(text: string): Array<[number, number]> {
168
+ const ranges: Array<[number, number]> = [];
169
+ const lines = text.split("\n");
170
+ let offset = 0;
171
+ let fenceStart: number | null = null;
172
+
173
+ for (const line of lines) {
174
+ if (line.startsWith("```")) {
175
+ if (fenceStart === null) {
176
+ fenceStart = offset;
177
+ } else {
178
+ // Close the fence — end is after this line
179
+ ranges.push([fenceStart, offset + line.length + 1]);
180
+ fenceStart = null;
181
+ }
182
+ }
183
+ offset += line.length + 1;
184
+ }
185
+
186
+ return ranges;
187
+ }
188
+
189
+ /**
190
+ * Check if an offset falls inside any code fence range.
191
+ */
192
+ function isInsideCodeFence(
193
+ offset: number,
194
+ codeFences: Array<[number, number]>,
195
+ ): boolean {
196
+ for (const [start, end] of codeFences) {
197
+ // Inside means strictly between the opening and closing fence lines.
198
+ // Breaking AT the start of a fence (before it) is fine.
199
+ if (offset > start && offset < end) return true;
200
+ }
201
+ return false;
202
+ }
203
+
204
+ /**
205
+ * Find the best break point near the target cut position.
206
+ *
207
+ * Searches a window from 50% to 100% of maxChars around the chunk start.
208
+ * Applies squared distance decay so breaks closer to the target are preferred.
209
+ * Rejects candidates inside code fences.
210
+ */
211
+ function findBestCutoff(
212
+ text: string,
213
+ breakPoints: BreakPoint[],
214
+ codeFences: Array<[number, number]>,
215
+ targetEnd: number,
216
+ maxChars: number,
217
+ ): number {
218
+ const windowStart = targetEnd - Math.floor(maxChars * 0.5);
219
+ const windowEnd = targetEnd;
220
+ const windowSize = windowEnd - windowStart;
221
+
222
+ let bestScore = -1;
223
+ let bestOffset = targetEnd;
224
+
225
+ for (const bp of breakPoints) {
226
+ if (bp.offset < windowStart || bp.offset > windowEnd) continue;
227
+ if (isInsideCodeFence(bp.offset, codeFences)) continue;
228
+
229
+ // Squared distance decay: prefer breaks closer to targetEnd
230
+ const dist = Math.abs(bp.offset - targetEnd);
231
+ const normalizedDist = dist / windowSize;
232
+ const multiplier = 1.0 - normalizedDist * normalizedDist * 0.7;
233
+ const weightedScore = bp.score * multiplier;
234
+
235
+ if (weightedScore > bestScore) {
236
+ bestScore = weightedScore;
237
+ bestOffset = bp.offset;
238
+ }
239
+ }
240
+
241
+ // Fallback: if no structural break points found, try word boundary (last space)
242
+ if (bestScore < 0) {
243
+ const lastSpace = text.lastIndexOf(" ", targetEnd);
244
+ if (lastSpace >= windowStart) {
245
+ return lastSpace + 1;
246
+ }
247
+ }
248
+
249
+ return bestOffset;
250
+ }