@stablemodels/qmd-cf 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. package/LICENSE +21 -0
  2. package/dist/chunker.d.ts +11 -0
  3. package/dist/chunker.d.ts.map +1 -0
  4. package/dist/chunker.js +199 -0
  5. package/dist/chunker.js.map +1 -0
  6. package/dist/fts.d.ts +19 -0
  7. package/dist/fts.d.ts.map +1 -0
  8. package/dist/fts.js +109 -0
  9. package/dist/fts.js.map +1 -0
  10. package/dist/hash.d.ts +7 -0
  11. package/dist/hash.d.ts.map +1 -0
  12. package/dist/hash.js +14 -0
  13. package/dist/hash.js.map +1 -0
  14. package/dist/index.d.ts +56 -0
  15. package/dist/index.d.ts.map +1 -0
  16. package/dist/index.js +57 -0
  17. package/dist/index.js.map +1 -0
  18. package/dist/qmd.d.ts +158 -0
  19. package/dist/qmd.d.ts.map +1 -0
  20. package/dist/qmd.js +462 -0
  21. package/dist/qmd.js.map +1 -0
  22. package/dist/rrf.d.ts +22 -0
  23. package/dist/rrf.d.ts.map +1 -0
  24. package/dist/rrf.js +92 -0
  25. package/dist/rrf.js.map +1 -0
  26. package/dist/schema.d.ts +14 -0
  27. package/dist/schema.d.ts.map +1 -0
  28. package/dist/schema.js +128 -0
  29. package/dist/schema.js.map +1 -0
  30. package/dist/testing.d.ts +77 -0
  31. package/dist/testing.d.ts.map +1 -0
  32. package/dist/testing.js +242 -0
  33. package/dist/testing.js.map +1 -0
  34. package/dist/types.d.ts +118 -0
  35. package/dist/types.d.ts.map +1 -0
  36. package/dist/types.js +9 -0
  37. package/dist/types.js.map +1 -0
  38. package/dist/vector.d.ts +38 -0
  39. package/dist/vector.d.ts.map +1 -0
  40. package/dist/vector.js +174 -0
  41. package/dist/vector.js.map +1 -0
  42. package/package.json +49 -0
  43. package/src/bun-sqlite.d.ts +17 -0
  44. package/src/chunker.ts +250 -0
  45. package/src/fts.ts +140 -0
  46. package/src/hash.ts +13 -0
  47. package/src/index.ts +72 -0
  48. package/src/qmd.ts +706 -0
  49. package/src/rrf.ts +115 -0
  50. package/src/schema.ts +147 -0
  51. package/src/testing.ts +303 -0
  52. package/src/types.ts +124 -0
  53. package/src/vector.ts +236 -0
@@ -0,0 +1,118 @@
1
/**
 * Domain types for qmd-cf.
 *
 * Cloudflare platform types (SqlStorage, SqlStorageCursor, Vectorize,
 * VectorizeVector, etc.) are ambient — provided by @cloudflare/workers-types
 * via tsconfig's "types" array. They don't need to be imported or re-exported.
 */
// NOTE: compiled output of src/types.ts — edit the TypeScript source, not this file.
/** A document to be indexed. */
export interface Document {
    /** Unique identifier for this document (e.g. file path). */
    id: string;
    /** The full text content. */
    content: string;
    /** Optional title (boosts search relevance when matched). */
    title?: string;
    /** Optional document type for filtering (e.g. "fact", "daily_note", "summary"). */
    docType?: string;
    /** Optional namespace for scoped search (e.g. entity path, agent ID). */
    namespace?: string;
    /** Arbitrary metadata stored alongside the document. */
    metadata?: Record<string, string | number | boolean | null>;
}
/** A single chunk produced from a document. */
export interface Chunk {
    /** Parent document ID. */
    docId: string;
    /** Sequence index within the document (0-based). */
    seq: number;
    /** The chunk text content. */
    text: string;
    /** Character offset in the original document. */
    charOffset: number;
}
/** A search result returned from BM25 full-text search. */
export interface FtsResult {
    /** Matching document's ID. */
    docId: string;
    /** BM25 score normalized to (0, 1] — higher is better. */
    score: number;
    /** The matching chunk text (snippet). */
    snippet: string;
    /** Chunk sequence number. */
    seq: number;
    /** Document title, or null when the document has none. */
    title: string | null;
    /** Document type, or null when the document has none. */
    docType: string | null;
    /** Document namespace, or null when the document has none. */
    namespace: string | null;
    /** Stored document metadata, or null when the document has none. */
    metadata: Record<string, string | number | boolean | null> | null;
}
/** A search result returned from vector similarity search. */
export interface VectorResult {
    /** Matching document's ID. */
    docId: string;
    /** Cosine similarity score in [0, 1] — higher is better. */
    score: number;
    /** The matching chunk text. */
    snippet: string;
    /** Chunk sequence number. */
    seq: number;
    /** Document title, or null when the document has none. */
    title: string | null;
    /** Document type, or null when the document has none. */
    docType: string | null;
    /** Document namespace, or null when the document has none. */
    namespace: string | null;
    /** Stored document metadata, or null when the document has none. */
    metadata: Record<string, string | number | boolean | null> | null;
}
/** A merged search result after hybrid fusion. */
export interface SearchResult {
    /** Matching document's ID. */
    docId: string;
    /** Final fused score — higher is better. */
    score: number;
    /** The best matching chunk text. */
    snippet: string;
    /** Source of the result: which retrieval methods contributed. */
    sources: Array<"fts" | "vector">;
    /** Individual scores from each source. */
    sourceScores: {
        fts?: number;
        vector?: number;
    };
    /** Document title, or null when the document has none. */
    title: string | null;
    /** Document type, or null when the document has none. */
    docType: string | null;
    /** Document namespace, or null when the document has none. */
    namespace: string | null;
    /** Stored document metadata, or null when the document has none. */
    metadata: Record<string, string | number | boolean | null> | null;
}
/** Options for search queries. */
export interface SearchOptions {
    /** Maximum number of results to return. Default: 10. */
    limit?: number;
    /** Filter by document type. */
    docType?: string;
    /** Filter by namespace. */
    namespace?: string;
}
/** Options for hybrid search queries (extends SearchOptions). */
export interface HybridSearchOptions extends SearchOptions {
    /** Weight for FTS results in RRF fusion. Default: 1.0. */
    ftsWeight?: number;
    /** Weight for vector results in RRF fusion. Default: 1.0. */
    vectorWeight?: number;
    /** RRF constant k. Higher values reduce the impact of high rankings. Default: 60. */
    rrfK?: number;
}
/** Configuration for the QMD index. */
export interface QmdConfig {
    /** Maximum characters per chunk. Default: 3200 (~800 tokens). */
    chunkSize?: number;
    /** Overlap characters between chunks. Default: 480 (15% of chunkSize). */
    chunkOverlap?: number;
    /** FTS5 tokenizer configuration. Default: "unicode61". */
    tokenizer?: string;
}
/** Embedding function signature — maps text to a vector. */
export type EmbedFn = (texts: string[]) => Promise<number[][]>;
/** Index statistics. */
export interface IndexStats {
    totalDocuments: number;
    totalChunks: number;
    totalVectors: number;
    namespaces: string[];
    docTypes: string[];
}
//# sourceMappingURL=types.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AAEH,gCAAgC;AAChC,MAAM,WAAW,QAAQ;IACxB,4DAA4D;IAC5D,EAAE,EAAE,MAAM,CAAC;IACX,6BAA6B;IAC7B,OAAO,EAAE,MAAM,CAAC;IAChB,6DAA6D;IAC7D,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,mFAAmF;IACnF,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,yEAAyE;IACzE,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,wDAAwD;IACxD,QAAQ,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,GAAG,MAAM,GAAG,OAAO,GAAG,IAAI,CAAC,CAAC;CAC5D;AAED,+CAA+C;AAC/C,MAAM,WAAW,KAAK;IACrB,0BAA0B;IAC1B,KAAK,EAAE,MAAM,CAAC;IACd,oDAAoD;IACpD,GAAG,EAAE,MAAM,CAAC;IACZ,8BAA8B;IAC9B,IAAI,EAAE,MAAM,CAAC;IACb,iDAAiD;IACjD,UAAU,EAAE,MAAM,CAAC;CACnB;AAED,2DAA2D;AAC3D,MAAM,WAAW,SAAS;IACzB,KAAK,EAAE,MAAM,CAAC;IACd,0DAA0D;IAC1D,KAAK,EAAE,MAAM,CAAC;IACd,yCAAyC;IACzC,OAAO,EAAE,MAAM,CAAC;IAChB,6BAA6B;IAC7B,GAAG,EAAE,MAAM,CAAC;IACZ,KAAK,EAAE,MAAM,GAAG,IAAI,CAAC;IACrB,OAAO,EAAE,MAAM,GAAG,IAAI,CAAC;IACvB,SAAS,EAAE,MAAM,GAAG,IAAI,CAAC;IACzB,QAAQ,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,GAAG,MAAM,GAAG,OAAO,GAAG,IAAI,CAAC,GAAG,IAAI,CAAC;CAClE;AAED,8DAA8D;AAC9D,MAAM,WAAW,YAAY;IAC5B,KAAK,EAAE,MAAM,CAAC;IACd,4DAA4D;IAC5D,KAAK,EAAE,MAAM,CAAC;IACd,+BAA+B;IAC/B,OAAO,EAAE,MAAM,CAAC;IAChB,6BAA6B;IAC7B,GAAG,EAAE,MAAM,CAAC;IACZ,KAAK,EAAE,MAAM,GAAG,IAAI,CAAC;IACrB,OAAO,EAAE,MAAM,GAAG,IAAI,CAAC;IACvB,SAAS,EAAE,MAAM,GAAG,IAAI,CAAC;IACzB,QAAQ,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,GAAG,MAAM,GAAG,OAAO,GAAG,IAAI,CAAC,GAAG,IAAI,CAAC;CAClE;AAED,kDAAkD;AAClD,MAAM,WAAW,YAAY;IAC5B,KAAK,EAAE,MAAM,CAAC;IACd,4CAA4C;IAC5C,KAAK,EAAE,MAAM,CAAC;IACd,oCAAoC;IACpC,OAAO,EAAE,MAAM,CAAC;IAChB,iEAAiE;IACjE,OAAO,EAAE,KAAK,CAAC,KAAK,GAAG,QAAQ,CAAC,CAAC;IACjC,0CAA0C;IAC1C,YAAY,EAAE;QAAE,GAAG,CAAC,EAAE,MAAM,CAAC;QAAC,MAAM,CAAC,EAAE,MAAM,CAAA;KAAE,CAAC;IAChD,KAAK,EAAE,MAAM,GAAG,IAAI,CAAC;IACrB,OAAO,EAAE,MAAM,GAAG,IAAI,CAAC;IACvB,SAAS,EAAE,MAAM,GAAG,IAAI,CAAC;IACzB,QAAQ,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,GAAG,MAAM,GAAG,OAAO,GAAG,IAAI,CAAC,GAAG,IAAI,CAAC;CAClE;AAED,kCAAkC;AAClC,MAAM,WAAW,aAAa;IAC7B,wDAAwD;IACxD,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,+B
AA+B;IAC/B,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,2BAA2B;IAC3B,SAAS,CAAC,EAAE,MAAM,CAAC;CACnB;AAED,iEAAiE;AACjE,MAAM,WAAW,mBAAoB,SAAQ,aAAa;IACzD,0DAA0D;IAC1D,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,6DAA6D;IAC7D,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,qFAAqF;IACrF,IAAI,CAAC,EAAE,MAAM,CAAC;CACd;AAED,uCAAuC;AACvC,MAAM,WAAW,SAAS;IACzB,iEAAiE;IACjE,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,0EAA0E;IAC1E,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,0DAA0D;IAC1D,SAAS,CAAC,EAAE,MAAM,CAAC;CACnB;AAED,4DAA4D;AAC5D,MAAM,MAAM,OAAO,GAAG,CAAC,KAAK,EAAE,MAAM,EAAE,KAAK,OAAO,CAAC,MAAM,EAAE,EAAE,CAAC,CAAC;AAE/D,wBAAwB;AACxB,MAAM,WAAW,UAAU;IAC1B,cAAc,EAAE,MAAM,CAAC;IACvB,WAAW,EAAE,MAAM,CAAC;IACpB,YAAY,EAAE,MAAM,CAAC;IACrB,UAAU,EAAE,MAAM,EAAE,CAAC;IACrB,QAAQ,EAAE,MAAM,EAAE,CAAC;CACnB"}
package/dist/types.js ADDED
@@ -0,0 +1,9 @@
1
+ /**
2
+ * Domain types for qmd-cf.
3
+ *
4
+ * Cloudflare platform types (SqlStorage, SqlStorageCursor, Vectorize,
5
+ * VectorizeVector, etc.) are ambient — provided by @cloudflare/workers-types
6
+ * via tsconfig's "types" array. They don't need to be imported or re-exported.
7
+ */
8
+ export {};
9
+ //# sourceMappingURL=types.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"types.js","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG"}
@@ -0,0 +1,38 @@
1
import type { EmbedFn, SearchOptions, VectorResult } from "./types.js";
// NOTE: compiled output of src/vector.ts — edit the TypeScript source, not this file.
/**
 * Format text for embedding (document indexing).
 * Follows nomic/qmd convention of prefixing with title context.
 */
export declare function formatDocForEmbedding(text: string, title?: string, context?: string): string;
/**
 * Format a query string for embedding (search time).
 */
export declare function formatQueryForEmbedding(query: string): string;
/**
 * Index chunks into Vectorize with embeddings.
 *
 * Each chunk gets a vector ID of "{docId}_{seq}" which maps back to qmd_chunks.
 * Vectors are stored in a namespace matching the document's namespace for scoped search.
 */
export declare function indexVectors(vectorize: Vectorize, embedFn: EmbedFn, chunks: Array<{
    docId: string;
    seq: number;
    text: string;
    title?: string;
    namespace?: string;
    docType?: string;
    context?: string;
}>): Promise<void>;
/**
 * Remove all vectors for a document from Vectorize.
 */
export declare function removeVectors(vectorize: Vectorize, sql: SqlStorage, docId: string): Promise<void>;
/**
 * Execute a vector similarity search via Vectorize.
 *
 * 1. Embed the query
 * 2. Query Vectorize for nearest neighbors (scoped by namespace if provided)
 * 3. Look up chunk content from the local SQLite for snippet extraction
 */
export declare function searchVector(vectorize: Vectorize, embedFn: EmbedFn, sql: SqlStorage, query: string, options?: SearchOptions): Promise<VectorResult[]>;
//# sourceMappingURL=vector.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"vector.d.ts","sourceRoot":"","sources":["../src/vector.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,OAAO,EAAE,aAAa,EAAE,YAAY,EAAE,MAAM,YAAY,CAAC;AAEvE;;;GAGG;AACH,wBAAgB,qBAAqB,CACpC,IAAI,EAAE,MAAM,EACZ,KAAK,CAAC,EAAE,MAAM,EACd,OAAO,CAAC,EAAE,MAAM,GACd,MAAM,CAMR;AAED;;GAEG;AACH,wBAAgB,uBAAuB,CAAC,KAAK,EAAE,MAAM,GAAG,MAAM,CAE7D;AAED;;;;;GAKG;AACH,wBAAsB,YAAY,CACjC,SAAS,EAAE,SAAS,EACpB,OAAO,EAAE,OAAO,EAChB,MAAM,EAAE,KAAK,CAAC;IACb,KAAK,EAAE,MAAM,CAAC;IACd,GAAG,EAAE,MAAM,CAAC;IACZ,IAAI,EAAE,MAAM,CAAC;IACb,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,OAAO,CAAC,EAAE,MAAM,CAAC;CACjB,CAAC,GACA,OAAO,CAAC,IAAI,CAAC,CA8Bf;AAED;;GAEG;AACH,wBAAsB,aAAa,CAClC,SAAS,EAAE,SAAS,EACpB,GAAG,EAAE,UAAU,EACf,KAAK,EAAE,MAAM,GACX,OAAO,CAAC,IAAI,CAAC,CAUf;AAYD;;;;;;GAMG;AACH,wBAAsB,YAAY,CACjC,SAAS,EAAE,SAAS,EACpB,OAAO,EAAE,OAAO,EAChB,GAAG,EAAE,UAAU,EACf,KAAK,EAAE,MAAM,EACb,OAAO,GAAE,aAAkB,GACzB,OAAO,CAAC,YAAY,EAAE,CAAC,CAsHzB"}
package/dist/vector.js ADDED
@@ -0,0 +1,174 @@
1
/**
 * Format document text for embedding at index time.
 * Follows the nomic/qmd convention of prefixing the body with its title
 * (and optional context), pipe-separated: "context: … | title: … | text: …".
 */
export function formatDocForEmbedding(text, title, context) {
    let formatted = "";
    // Context is optional and, when present, leads the string.
    if (context)
        formatted += `context: ${context} | `;
    // A missing title is encoded as the literal word "none".
    formatted += `title: ${title || "none"} | text: ${text}`;
    return formatted;
}
13
/**
 * Format a query string for embedding at search time
 * (prefixes the raw query with "search_query: ").
 */
export function formatQueryForEmbedding(query) {
    return ["search_query", query].join(": ");
}
19
+ /**
20
+ * Index chunks into Vectorize with embeddings.
21
+ *
22
+ * Each chunk gets a vector ID of "{docId}_{seq}" which maps back to qmd_chunks.
23
+ * Vectors are stored in a namespace matching the document's namespace for scoped search.
24
+ */
25
+ export async function indexVectors(vectorize, embedFn, chunks) {
26
+ if (chunks.length === 0)
27
+ return;
28
+ // Format texts for embedding (includes context if provided)
29
+ const texts = chunks.map((c) => formatDocForEmbedding(c.text, c.title, c.context));
30
+ // Generate embeddings in batch (Workers AI supports up to 100 at a time)
31
+ const batchSize = 100;
32
+ for (let i = 0; i < texts.length; i += batchSize) {
33
+ const batchTexts = texts.slice(i, i + batchSize);
34
+ const batchChunks = chunks.slice(i, i + batchSize);
35
+ const embeddings = await embedFn(batchTexts);
36
+ const vectors = batchChunks.map((c, j) => ({
37
+ id: `${c.docId}_${c.seq}`,
38
+ values: embeddings[j],
39
+ namespace: c.namespace ? c.namespace.split("/")[0] : undefined,
40
+ metadata: {
41
+ docId: c.docId,
42
+ seq: c.seq,
43
+ docType: c.docType ?? "",
44
+ directory: c.namespace ?? "",
45
+ },
46
+ }));
47
+ await vectorize.upsert(vectors);
48
+ }
49
+ }
50
+ /**
51
+ * Remove all vectors for a document from Vectorize.
52
+ */
53
+ export async function removeVectors(vectorize, sql, docId) {
54
+ // Look up all chunk seq numbers for this document
55
+ const chunks = sql
56
+ .exec("SELECT seq FROM qmd_chunks WHERE doc_id = ?", docId)
57
+ .toArray();
58
+ if (chunks.length === 0)
59
+ return;
60
+ const ids = chunks.map((c) => `${docId}_${c.seq}`);
61
+ await vectorize.deleteByIds(ids);
62
+ }
63
+ /**
64
+ * Execute a vector similarity search via Vectorize.
65
+ *
66
+ * 1. Embed the query
67
+ * 2. Query Vectorize for nearest neighbors (scoped by namespace if provided)
68
+ * 3. Look up chunk content from the local SQLite for snippet extraction
69
+ */
70
+ export async function searchVector(vectorize, embedFn, sql, query, options = {}) {
71
+ const limit = options.limit ?? 10;
72
+ // Embed the query
73
+ const queryText = formatQueryForEmbedding(query);
74
+ const [queryVector] = await embedFn([queryText]);
75
+ // Resolve namespace for Vectorize query: use first path segment for glob/path patterns
76
+ let vectorizeNamespace;
77
+ let directoryPrefix;
78
+ if (options.namespace) {
79
+ if (options.namespace.includes("*")) {
80
+ // Glob pattern: people/* → Vectorize ns "people", no post-filter needed for top-level
81
+ const prefix = options.namespace.replace(/\*+$/, "").replace(/\/+$/, "");
82
+ vectorizeNamespace = prefix.split("/")[0];
83
+ // Only need post-filter if glob is deeper than top-level (e.g. projects/ember/*)
84
+ if (prefix.includes("/")) {
85
+ directoryPrefix = `${prefix}/`;
86
+ }
87
+ }
88
+ else {
89
+ // Exact directory: people/ryan → Vectorize ns "people", post-filter by full path
90
+ vectorizeNamespace = options.namespace.split("/")[0];
91
+ if (options.namespace.includes("/")) {
92
+ directoryPrefix = options.namespace;
93
+ }
94
+ }
95
+ }
96
+ // Query Vectorize
97
+ const matches = await vectorize.query(queryVector, {
98
+ topK: limit * 3, // Fetch extra for dedup
99
+ returnMetadata: "all",
100
+ namespace: vectorizeNamespace,
101
+ });
102
+ if (matches.matches.length === 0)
103
+ return [];
104
+ // Collect chunk IDs to look up content from local SQLite
105
+ const chunkKeys = matches.matches.map((m) => {
106
+ const meta = m.metadata;
107
+ return {
108
+ vectorId: m.id,
109
+ score: m.score,
110
+ docId: meta?.docId ?? m.id.split("_").slice(0, -1).join("_"),
111
+ seq: meta?.seq ?? Number.parseInt(m.id.split("_").pop() ?? "0", 10),
112
+ };
113
+ });
114
+ // Filter by docType if specified (Vectorize metadata filtering could also do this,
115
+ // but we filter here for portability)
116
+ let filteredKeys = options.docType
117
+ ? chunkKeys.filter((k) => {
118
+ const meta = matches.matches.find((m) => m.id === k.vectorId)?.metadata;
119
+ return meta?.docType === options.docType;
120
+ })
121
+ : chunkKeys;
122
+ // Post-filter by directory prefix when namespace is deeper than first segment
123
+ if (directoryPrefix) {
124
+ filteredKeys = filteredKeys.filter((k) => {
125
+ const meta = matches.matches.find((m) => m.id === k.vectorId)?.metadata;
126
+ const dir = meta?.directory;
127
+ if (!dir)
128
+ return false;
129
+ return dir === directoryPrefix || dir.startsWith(`${directoryPrefix}/`);
130
+ });
131
+ }
132
+ if (filteredKeys.length === 0)
133
+ return [];
134
+ // Batch look up chunk content from SQLite
135
+ const placeholders = filteredKeys.map(() => "(?, ?)").join(", ");
136
+ const bindings = filteredKeys.flatMap((k) => [k.docId, k.seq]);
137
+ const rows = sql
138
+ .exec(`
139
+ SELECT c.doc_id, c.seq, c.content, d.title, d.doc_type, d.namespace, d.metadata
140
+ FROM qmd_chunks c
141
+ JOIN qmd_documents d ON d.id = c.doc_id
142
+ WHERE (c.doc_id, c.seq) IN (VALUES ${placeholders})
143
+ `, ...bindings)
144
+ .toArray();
145
+ // Build lookup map
146
+ const chunkMap = new Map();
147
+ for (const row of rows) {
148
+ chunkMap.set(`${row.doc_id}_${row.seq}`, row);
149
+ }
150
+ // Merge scores with content, dedup by docId
151
+ const seen = new Map();
152
+ for (const key of filteredKeys) {
153
+ const row = chunkMap.get(`${key.docId}_${key.seq}`);
154
+ if (!row)
155
+ continue;
156
+ const existing = seen.get(key.docId);
157
+ if (!existing || key.score > existing.score) {
158
+ seen.set(key.docId, {
159
+ docId: key.docId,
160
+ score: key.score,
161
+ snippet: row.content,
162
+ seq: key.seq,
163
+ title: row.title,
164
+ docType: row.doc_type,
165
+ namespace: row.namespace,
166
+ metadata: row.metadata ? JSON.parse(row.metadata) : null,
167
+ });
168
+ }
169
+ }
170
+ return Array.from(seen.values())
171
+ .sort((a, b) => b.score - a.score)
172
+ .slice(0, limit);
173
+ }
174
+ //# sourceMappingURL=vector.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"vector.js","sourceRoot":"","sources":["../src/vector.ts"],"names":[],"mappings":"AAEA;;;GAGG;AACH,MAAM,UAAU,qBAAqB,CACpC,IAAY,EACZ,KAAc,EACd,OAAgB;IAEhB,MAAM,KAAK,GAAa,EAAE,CAAC;IAC3B,IAAI,OAAO;QAAE,KAAK,CAAC,IAAI,CAAC,YAAY,OAAO,EAAE,CAAC,CAAC;IAC/C,KAAK,CAAC,IAAI,CAAC,UAAU,KAAK,IAAI,MAAM,EAAE,CAAC,CAAC;IACxC,KAAK,CAAC,IAAI,CAAC,SAAS,IAAI,EAAE,CAAC,CAAC;IAC5B,OAAO,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;AAC1B,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,uBAAuB,CAAC,KAAa;IACpD,OAAO,iBAAiB,KAAK,EAAE,CAAC;AACjC,CAAC;AAED;;;;;GAKG;AACH,MAAM,CAAC,KAAK,UAAU,YAAY,CACjC,SAAoB,EACpB,OAAgB,EAChB,MAQE;IAEF,IAAI,MAAM,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO;IAEhC,4DAA4D;IAC5D,MAAM,KAAK,GAAG,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAC9B,qBAAqB,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,KAAK,EAAE,CAAC,CAAC,OAAO,CAAC,CACjD,CAAC;IAEF,yEAAyE;IACzE,MAAM,SAAS,GAAG,GAAG,CAAC;IACtB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,IAAI,SAAS,EAAE,CAAC;QAClD,MAAM,UAAU,GAAG,KAAK,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,GAAG,SAAS,CAAC,CAAC;QACjD,MAAM,WAAW,GAAG,MAAM,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,GAAG,SAAS,CAAC,CAAC;QAEnD,MAAM,UAAU,GAAG,MAAM,OAAO,CAAC,UAAU,CAAC,CAAC;QAE7C,MAAM,OAAO,GAAG,WAAW,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC;YAC1C,EAAE,EAAE,GAAG,CAAC,CAAC,KAAK,IAAI,CAAC,CAAC,GAAG,EAAE;YACzB,MAAM,EAAE,UAAU,CAAC,CAAC,CAAC;YACrB,SAAS,EAAE,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,SAAS;YAC9D,QAAQ,EAAE;gBACT,KAAK,EAAE,CAAC,CAAC,KAAK;gBACd,GAAG,EAAE,CAAC,CAAC,GAAG;gBACV,OAAO,EAAE,CAAC,CAAC,OAAO,IAAI,EAAE;gBACxB,SAAS,EAAE,CAAC,CAAC,SAAS,IAAI,EAAE;aAC5B;SACD,CAAC,CAAC,CAAC;QAEJ,MAAM,SAAS,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;IACjC,CAAC;AACF,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,aAAa,CAClC,SAAoB,EACpB,GAAe,EACf,KAAa;IAEb,kDAAkD;IAClD,MAAM,MAAM,GAAG,GAAG;SAChB,IAAI,CAAkB,6CAA6C,EAAE,KAAK,CAAC;SAC3E,OAAO,EAAE,CAAC;IAEZ,IAAI,MAAM,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO;IAEhC,MAAM,GAAG,GAAG,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,GAAG,KAAK,IAAI,CAA
C,CAAC,GAAG,EAAE,CAAC,CAAC;IACnD,MAAM,SAAS,CAAC,WAAW,CAAC,GAAG,CAAC,CAAC;AAClC,CAAC;AAYD;;;;;;GAMG;AACH,MAAM,CAAC,KAAK,UAAU,YAAY,CACjC,SAAoB,EACpB,OAAgB,EAChB,GAAe,EACf,KAAa,EACb,UAAyB,EAAE;IAE3B,MAAM,KAAK,GAAG,OAAO,CAAC,KAAK,IAAI,EAAE,CAAC;IAElC,kBAAkB;IAClB,MAAM,SAAS,GAAG,uBAAuB,CAAC,KAAK,CAAC,CAAC;IACjD,MAAM,CAAC,WAAW,CAAC,GAAG,MAAM,OAAO,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC;IAEjD,uFAAuF;IACvF,IAAI,kBAAsC,CAAC;IAC3C,IAAI,eAAmC,CAAC;IACxC,IAAI,OAAO,CAAC,SAAS,EAAE,CAAC;QACvB,IAAI,OAAO,CAAC,SAAS,CAAC,QAAQ,CAAC,GAAG,CAAC,EAAE,CAAC;YACrC,sFAAsF;YACtF,MAAM,MAAM,GAAG,OAAO,CAAC,SAAS,CAAC,OAAO,CAAC,MAAM,EAAE,EAAE,CAAC,CAAC,OAAO,CAAC,MAAM,EAAE,EAAE,CAAC,CAAC;YACzE,kBAAkB,GAAG,MAAM,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;YAC1C,iFAAiF;YACjF,IAAI,MAAM,CAAC,QAAQ,CAAC,GAAG,CAAC,EAAE,CAAC;gBAC1B,eAAe,GAAG,GAAG,MAAM,GAAG,CAAC;YAChC,CAAC;QACF,CAAC;aAAM,CAAC;YACP,iFAAiF;YACjF,kBAAkB,GAAG,OAAO,CAAC,SAAS,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;YACrD,IAAI,OAAO,CAAC,SAAS,CAAC,QAAQ,CAAC,GAAG,CAAC,EAAE,CAAC;gBACrC,eAAe,GAAG,OAAO,CAAC,SAAS,CAAC;YACrC,CAAC;QACF,CAAC;IACF,CAAC;IAED,kBAAkB;IAClB,MAAM,OAAO,GAAG,MAAM,SAAS,CAAC,KAAK,CAAC,WAAW,EAAE;QAClD,IAAI,EAAE,KAAK,GAAG,CAAC,EAAE,wBAAwB;QACzC,cAAc,EAAE,KAAK;QACrB,SAAS,EAAE,kBAAkB;KAC7B,CAAC,CAAC;IAEH,IAAI,OAAO,CAAC,OAAO,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,EAAE,CAAC;IAE5C,yDAAyD;IACzD,MAAM,SAAS,GAAG,OAAO,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE;QAC3C,MAAM,IAAI,GAAG,CAAC,CAAC,QAEH,CAAC;QACb,OAAO;YACN,QAAQ,EAAE,CAAC,CAAC,EAAE;YACd,KAAK,EAAE,CAAC,CAAC,KAAK;YACd,KAAK,EAAE,IAAI,EAAE,KAAK,IAAI,CAAC,CAAC,EAAE,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC;YAC5D,GAAG,EAAE,IAAI,EAAE,GAAG,IAAI,MAAM,CAAC,QAAQ,CAAC,CAAC,CAAC,EAAE,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,GAAG,EAAE,IAAI,GAAG,EAAE,EAAE,CAAC;SACnE,CAAC;IACH,CAAC,CAAC,CAAC;IAEH,mFAAmF;IACnF,sCAAsC;IACtC,IAAI,YAAY,GAAG,OAAO,CAAC,OAAO;QACjC,CAAC,CAAC,SAAS,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE;YACvB,MAAM,IAAI,GAAG,OAAO,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAA
C,CAAC,CAAC,EAAE,KAAK,CAAC,CAAC,QAAQ,CAAC,EAAE,QAAQ,CAAC;YACxE,OAAO,IAAI,EAAE,OAAO,KAAK,OAAO,CAAC,OAAO,CAAC;QAC1C,CAAC,CAAC;QACH,CAAC,CAAC,SAAS,CAAC;IAEb,8EAA8E;IAC9E,IAAI,eAAe,EAAE,CAAC;QACrB,YAAY,GAAG,YAAY,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE;YACxC,MAAM,IAAI,GAAG,OAAO,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,KAAK,CAAC,CAAC,QAAQ,CAAC,EAAE,QAAQ,CAAC;YACxE,MAAM,GAAG,GAAG,IAAI,EAAE,SAA0B,CAAC;YAC7C,IAAI,CAAC,GAAG;gBAAE,OAAO,KAAK,CAAC;YACvB,OAAO,GAAG,KAAK,eAAe,IAAI,GAAG,CAAC,UAAU,CAAC,GAAG,eAAe,GAAG,CAAC,CAAC;QACzE,CAAC,CAAC,CAAC;IACJ,CAAC;IAED,IAAI,YAAY,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,EAAE,CAAC;IAEzC,0CAA0C;IAC1C,MAAM,YAAY,GAAG,YAAY,CAAC,GAAG,CAAC,GAAG,EAAE,CAAC,QAAQ,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IACjE,MAAM,QAAQ,GAAG,YAAY,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,KAAK,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;IAE/D,MAAM,IAAI,GAAG,GAAG;SACd,IAAI,CACJ;;;;wCAIqC,YAAY;GACjD,EACA,GAAG,QAAQ,CACX;SACA,OAAO,EAAE,CAAC;IAEZ,mBAAmB;IACnB,MAAM,QAAQ,GAAG,IAAI,GAAG,EAAoB,CAAC;IAC7C,KAAK,MAAM,GAAG,IAAI,IAAI,EAAE,CAAC;QACxB,QAAQ,CAAC,GAAG,CAAC,GAAG,GAAG,CAAC,MAAM,IAAI,GAAG,CAAC,GAAG,EAAE,EAAE,GAAG,CAAC,CAAC;IAC/C,CAAC;IAED,4CAA4C;IAC5C,MAAM,IAAI,GAAG,IAAI,GAAG,EAAwB,CAAC;IAE7C,KAAK,MAAM,GAAG,IAAI,YAAY,EAAE,CAAC;QAChC,MAAM,GAAG,GAAG,QAAQ,CAAC,GAAG,CAAC,GAAG,GAAG,CAAC,KAAK,IAAI,GAAG,CAAC,GAAG,EAAE,CAAC,CAAC;QACpD,IAAI,CAAC,GAAG;YAAE,SAAS;QAEnB,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC;QACrC,IAAI,CAAC,QAAQ,IAAI,GAAG,CAAC,KAAK,GAAG,QAAQ,CAAC,KAAK,EAAE,CAAC;YAC7C,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,KAAK,EAAE;gBACnB,KAAK,EAAE,GAAG,CAAC,KAAK;gBAChB,KAAK,EAAE,GAAG,CAAC,KAAK;gBAChB,OAAO,EAAE,GAAG,CAAC,OAAO;gBACpB,GAAG,EAAE,GAAG,CAAC,GAAG;gBACZ,KAAK,EAAE,GAAG,CAAC,KAAK;gBAChB,OAAO,EAAE,GAAG,CAAC,QAAQ;gBACrB,SAAS,EAAE,GAAG,CAAC,SAAS;gBACxB,QAAQ,EAAE,GAAG,CAAC,QAAQ,CAAC,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,QAAkB,CAAC,CAAC,CAAC,CAAC,IAAI;aAClE,CAAC,CAAC;QACJ,CAAC;IACF,CAAC;IAED,OAAO,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,MAAM,EAAE,CAAC;SAC9B,IAAI,CAAC,CAAC,CAA
C,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,KAAK,CAAC;SACjC,KAAK,CAAC,CAAC,EAAE,KAAK,CAAC,CAAC;AACnB,CAAC"}
package/package.json ADDED
@@ -0,0 +1,49 @@
1
+ {
2
+ "name": "@stablemodels/qmd-cf",
3
+ "version": "0.1.0",
4
+ "description": "Hybrid full-text + vector search for Cloudflare Durable Objects. A DO-native reimagination of qmd.",
5
+ "type": "module",
6
+ "exports": {
7
+ ".": {
8
+ "types": "./dist/index.d.ts",
9
+ "import": "./dist/index.js"
10
+ },
11
+ "./testing": {
12
+ "types": "./dist/testing.d.ts",
13
+ "import": "./dist/testing.js"
14
+ }
15
+ },
16
+ "main": "dist/index.js",
17
+ "types": "dist/index.d.ts",
18
+ "files": ["dist", "src", "README.md"],
19
+ "scripts": {
20
+ "build": "tsc",
21
+ "check": "biome check .",
22
+ "format": "biome check --write .",
23
+ "test": "bun test tests/*.test.ts && vitest run --config vitest.config.ts",
24
+ "test:unit": "bun test tests/*.test.ts",
25
+ "test:cf": "vitest run --config vitest.config.ts"
26
+ },
27
+ "repository": {
28
+ "type": "git",
29
+ "url": "https://github.com/StableModels/qmd-cf"
30
+ },
31
+ "peerDependencies": {
32
+ "@cloudflare/workers-types": ">=4.0.0"
33
+ },
34
+ "peerDependenciesMeta": {
35
+ "@cloudflare/workers-types": {
36
+ "optional": true
37
+ }
38
+ },
39
+ "devDependencies": {
40
+ "@biomejs/biome": "1.9.4",
41
+ "@cloudflare/vitest-pool-workers": "^0.12.18",
42
+ "@cloudflare/workers-types": "^4.20251231.0",
43
+ "@vitest/runner": "3.2.0",
44
+ "@vitest/snapshot": "3.2.0",
45
+ "typescript": "^5.7.0",
46
+ "vitest": "3.2.0",
47
+ "wrangler": "^4.0.0"
48
+ }
49
+ }
@@ -0,0 +1,17 @@
1
/**
 * Minimal type declarations for bun:sqlite used by the testing module.
 * Kept intentionally narrow to avoid conflicts with @cloudflare/workers-types.
 */
declare module "bun:sqlite" {
	/** Subset of Bun's SQLite Database API used by the tests. */
	class Database {
		/** Open (or create) the database at `filename`. */
		constructor(filename: string);
		/** Execute SQL without returning rows (DDL, pragmas, etc.). */
		exec(query: string): void;
		/** Compile a SQL statement for repeated parameterized execution. */
		prepare(query: string): Statement;
		/** Close the underlying database handle. */
		close(): void;
	}

	/** Subset of Bun's prepared-statement API used by the tests. */
	class Statement {
		/** Execute the statement; reports the number of rows changed. */
		run(...params: unknown[]): { changes: number };
		/** Execute the statement and return all result rows. */
		all(...params: unknown[]): unknown[];
	}
}
package/src/chunker.ts ADDED
@@ -0,0 +1,250 @@
1
+ import type { Chunk } from "./types.js";
2
+
3
// Chunking defaults used by chunkText when the caller passes no explicit sizes.
const DEFAULT_CHUNK_SIZE = 3200; // ~800 tokens at ~4 chars/token
const DEFAULT_CHUNK_OVERLAP = 480; // 15% overlap

/** Break point scores — spread wide so headings decisively win over paragraphs. */
const BREAK_SCORES: Record<string, number> = {
	h1: 100,
	h2: 90,
	h3: 80,
	h4: 70,
	h5: 60,
	h6: 50,
	code_fence: 80, // breaking right before a ``` fence keeps code blocks intact
	hr: 60,
	paragraph: 20,
	list_item: 5,
	newline: 1, // any line end works as a last-resort break
};

/** A candidate split position produced by scanBreakPoints. */
interface BreakPoint {
	// Character offset into the document where a new chunk may begin.
	offset: number;
	// Structural desirability of breaking here (one of BREAK_SCORES).
	score: number;
}
25
+
26
+ /**
27
+ * Chunk a document into overlapping segments, seeking intelligent break points.
28
+ *
29
+ * Uses a scored break point system (from qmd) that pre-scans the entire document
30
+ * for structural markers (headings, code fences, paragraphs, etc.) and picks the
31
+ * highest-scoring break point within a window around the target cut position.
32
+ * Avoids splitting inside fenced code blocks.
33
+ */
34
+ export function chunkText(
35
+ docId: string,
36
+ content: string,
37
+ maxChars: number = DEFAULT_CHUNK_SIZE,
38
+ overlapChars: number = DEFAULT_CHUNK_OVERLAP,
39
+ ): Chunk[] {
40
+ if (content.length === 0) {
41
+ return [];
42
+ }
43
+
44
+ // Short content: single chunk, no splitting needed
45
+ if (content.length <= maxChars) {
46
+ return [{ docId, seq: 0, text: content, charOffset: 0 }];
47
+ }
48
+
49
+ const breakPoints = scanBreakPoints(content);
50
+ const codeFences = findCodeFences(content);
51
+ const chunks: Chunk[] = [];
52
+ let pos = 0;
53
+ let seq = 0;
54
+
55
+ while (pos < content.length) {
56
+ const remaining = content.length - pos;
57
+
58
+ if (remaining <= maxChars) {
59
+ chunks.push({ docId, seq, text: content.slice(pos), charOffset: pos });
60
+ break;
61
+ }
62
+
63
+ const targetEnd = pos + maxChars;
64
+ const cutoff = findBestCutoff(
65
+ content,
66
+ breakPoints,
67
+ codeFences,
68
+ targetEnd,
69
+ maxChars,
70
+ );
71
+
72
+ // Ensure we make forward progress
73
+ const endPos = cutoff > pos ? cutoff : pos + maxChars;
74
+
75
+ chunks.push({
76
+ docId,
77
+ seq,
78
+ text: content.slice(pos, endPos),
79
+ charOffset: pos,
80
+ });
81
+
82
+ // Advance position, subtracting overlap
83
+ const advance = endPos - pos - overlapChars;
84
+ pos += Math.max(advance, 1);
85
+
86
+ // Don't start the next chunk inside a code fence — skip to fence end
87
+ for (const [fStart, fEnd] of codeFences) {
88
+ if (pos > fStart && pos < fEnd) {
89
+ pos = fEnd;
90
+ break;
91
+ }
92
+ }
93
+
94
+ seq++;
95
+ }
96
+
97
+ return chunks;
98
+ }
99
+
100
+ /**
101
+ * Pre-scan the document for structural break points with scores.
102
+ *
103
+ * Returns break points sorted by offset. Each offset points to the first
104
+ * character of the new section (i.e., right after the structural marker).
105
+ */
106
+ function scanBreakPoints(text: string): BreakPoint[] {
107
+ const points: BreakPoint[] = [];
108
+ const lines = text.split("\n");
109
+ let offset = 0;
110
+
111
+ for (let i = 0; i < lines.length; i++) {
112
+ const line = lines[i];
113
+ const lineStart = offset;
114
+ const nextLineStart = offset + line.length + 1; // +1 for the \n
115
+
116
+ // Headings (must be at start of line)
117
+ if (line.startsWith("###### ")) {
118
+ points.push({ offset: lineStart, score: BREAK_SCORES.h6 });
119
+ } else if (line.startsWith("##### ")) {
120
+ points.push({ offset: lineStart, score: BREAK_SCORES.h5 });
121
+ } else if (line.startsWith("#### ")) {
122
+ points.push({ offset: lineStart, score: BREAK_SCORES.h4 });
123
+ } else if (line.startsWith("### ")) {
124
+ points.push({ offset: lineStart, score: BREAK_SCORES.h3 });
125
+ } else if (line.startsWith("## ")) {
126
+ points.push({ offset: lineStart, score: BREAK_SCORES.h2 });
127
+ } else if (line.startsWith("# ")) {
128
+ points.push({ offset: lineStart, score: BREAK_SCORES.h1 });
129
+ }
130
+
131
+ // Code fences (``` at start of line) — break before the fence
132
+ if (line.startsWith("```")) {
133
+ points.push({ offset: lineStart, score: BREAK_SCORES.code_fence });
134
+ }
135
+
136
+ // Horizontal rules (---, ***, ___ with optional spaces)
137
+ if (/^(\s*[-*_]\s*){3,}$/.test(line)) {
138
+ points.push({ offset: lineStart, score: BREAK_SCORES.hr });
139
+ }
140
+
141
+ // Paragraph boundary (empty line followed by content)
142
+ if (line === "" && i > 0) {
143
+ points.push({ offset: nextLineStart, score: BREAK_SCORES.paragraph });
144
+ }
145
+
146
+ // List items
147
+ if (/^(\s*[-*+]\s|\s*\d+\.\s)/.test(line)) {
148
+ points.push({ offset: lineStart, score: BREAK_SCORES.list_item });
149
+ }
150
+
151
+ // Every newline is a minimal break point
152
+ if (i < lines.length - 1) {
153
+ points.push({ offset: nextLineStart, score: BREAK_SCORES.newline });
154
+ }
155
+
156
+ offset = nextLineStart;
157
+ }
158
+
159
+ return points;
160
+ }
161
+
162
+ /**
163
+ * Find matched code fence (```) ranges. Returns [start, end] pairs
164
+ * where start is the offset of the opening fence and end is the offset
165
+ * just after the closing fence line's newline.
166
+ */
167
+ function findCodeFences(text: string): Array<[number, number]> {
168
+ const ranges: Array<[number, number]> = [];
169
+ const lines = text.split("\n");
170
+ let offset = 0;
171
+ let fenceStart: number | null = null;
172
+
173
+ for (const line of lines) {
174
+ if (line.startsWith("```")) {
175
+ if (fenceStart === null) {
176
+ fenceStart = offset;
177
+ } else {
178
+ // Close the fence — end is after this line
179
+ ranges.push([fenceStart, offset + line.length + 1]);
180
+ fenceStart = null;
181
+ }
182
+ }
183
+ offset += line.length + 1;
184
+ }
185
+
186
+ return ranges;
187
+ }
188
+
189
+ /**
190
+ * Check if an offset falls inside any code fence range.
191
+ */
192
+ function isInsideCodeFence(
193
+ offset: number,
194
+ codeFences: Array<[number, number]>,
195
+ ): boolean {
196
+ for (const [start, end] of codeFences) {
197
+ // Inside means strictly between the opening and closing fence lines.
198
+ // Breaking AT the start of a fence (before it) is fine.
199
+ if (offset > start && offset < end) return true;
200
+ }
201
+ return false;
202
+ }
203
+
204
+ /**
205
+ * Find the best break point near the target cut position.
206
+ *
207
+ * Searches a window from 50% to 100% of maxChars around the chunk start.
208
+ * Applies squared distance decay so breaks closer to the target are preferred.
209
+ * Rejects candidates inside code fences.
210
+ */
211
+ function findBestCutoff(
212
+ text: string,
213
+ breakPoints: BreakPoint[],
214
+ codeFences: Array<[number, number]>,
215
+ targetEnd: number,
216
+ maxChars: number,
217
+ ): number {
218
+ const windowStart = targetEnd - Math.floor(maxChars * 0.5);
219
+ const windowEnd = targetEnd;
220
+ const windowSize = windowEnd - windowStart;
221
+
222
+ let bestScore = -1;
223
+ let bestOffset = targetEnd;
224
+
225
+ for (const bp of breakPoints) {
226
+ if (bp.offset < windowStart || bp.offset > windowEnd) continue;
227
+ if (isInsideCodeFence(bp.offset, codeFences)) continue;
228
+
229
+ // Squared distance decay: prefer breaks closer to targetEnd
230
+ const dist = Math.abs(bp.offset - targetEnd);
231
+ const normalizedDist = dist / windowSize;
232
+ const multiplier = 1.0 - normalizedDist * normalizedDist * 0.7;
233
+ const weightedScore = bp.score * multiplier;
234
+
235
+ if (weightedScore > bestScore) {
236
+ bestScore = weightedScore;
237
+ bestOffset = bp.offset;
238
+ }
239
+ }
240
+
241
+ // Fallback: if no structural break points found, try word boundary (last space)
242
+ if (bestScore < 0) {
243
+ const lastSpace = text.lastIndexOf(" ", targetEnd);
244
+ if (lastSpace >= windowStart) {
245
+ return lastSpace + 1;
246
+ }
247
+ }
248
+
249
+ return bestOffset;
250
+ }