@convex-dev/rag 0.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +201 -0
- package/README.md +371 -0
- package/dist/client/_generated/_ignore.d.ts +1 -0
- package/dist/client/_generated/_ignore.d.ts.map +1 -0
- package/dist/client/_generated/_ignore.js +3 -0
- package/dist/client/_generated/_ignore.js.map +1 -0
- package/dist/client/defaultChunker.d.ts +15 -0
- package/dist/client/defaultChunker.d.ts.map +1 -0
- package/dist/client/defaultChunker.js +148 -0
- package/dist/client/defaultChunker.js.map +1 -0
- package/dist/client/fileUtils.d.ts +24 -0
- package/dist/client/fileUtils.d.ts.map +1 -0
- package/dist/client/fileUtils.js +179 -0
- package/dist/client/fileUtils.js.map +1 -0
- package/dist/client/index.d.ts +442 -0
- package/dist/client/index.d.ts.map +1 -0
- package/dist/client/index.js +597 -0
- package/dist/client/index.js.map +1 -0
- package/dist/client/types.d.ts +29 -0
- package/dist/client/types.d.ts.map +1 -0
- package/dist/client/types.js +2 -0
- package/dist/client/types.js.map +1 -0
- package/dist/component/_generated/api.d.ts +439 -0
- package/dist/component/_generated/api.d.ts.map +1 -0
- package/dist/component/_generated/api.js +22 -0
- package/dist/component/_generated/api.js.map +1 -0
- package/dist/component/_generated/dataModel.d.ts +60 -0
- package/dist/component/_generated/server.d.ts +149 -0
- package/dist/component/_generated/server.d.ts.map +1 -0
- package/dist/component/_generated/server.js +74 -0
- package/dist/component/_generated/server.js.map +1 -0
- package/dist/component/chunks.d.ts +139 -0
- package/dist/component/chunks.d.ts.map +1 -0
- package/dist/component/chunks.js +413 -0
- package/dist/component/chunks.js.map +1 -0
- package/dist/component/convex.config.d.ts +3 -0
- package/dist/component/convex.config.d.ts.map +1 -0
- package/dist/component/convex.config.js +6 -0
- package/dist/component/convex.config.js.map +1 -0
- package/dist/component/embeddings/importance.d.ts +21 -0
- package/dist/component/embeddings/importance.d.ts.map +1 -0
- package/dist/component/embeddings/importance.js +67 -0
- package/dist/component/embeddings/importance.js.map +1 -0
- package/dist/component/embeddings/index.d.ts +23 -0
- package/dist/component/embeddings/index.d.ts.map +1 -0
- package/dist/component/embeddings/index.js +54 -0
- package/dist/component/embeddings/index.js.map +1 -0
- package/dist/component/embeddings/tables.d.ts +39 -0
- package/dist/component/embeddings/tables.d.ts.map +1 -0
- package/dist/component/embeddings/tables.js +53 -0
- package/dist/component/embeddings/tables.js.map +1 -0
- package/dist/component/entries.d.ts +167 -0
- package/dist/component/entries.d.ts.map +1 -0
- package/dist/component/entries.js +409 -0
- package/dist/component/entries.js.map +1 -0
- package/dist/component/filters.d.ts +46 -0
- package/dist/component/filters.d.ts.map +1 -0
- package/dist/component/filters.js +72 -0
- package/dist/component/filters.js.map +1 -0
- package/dist/component/namespaces.d.ts +131 -0
- package/dist/component/namespaces.d.ts.map +1 -0
- package/dist/component/namespaces.js +222 -0
- package/dist/component/namespaces.js.map +1 -0
- package/dist/component/schema.d.ts +1697 -0
- package/dist/component/schema.d.ts.map +1 -0
- package/dist/component/schema.js +88 -0
- package/dist/component/schema.js.map +1 -0
- package/dist/component/search.d.ts +20 -0
- package/dist/component/search.d.ts.map +1 -0
- package/dist/component/search.js +69 -0
- package/dist/component/search.js.map +1 -0
- package/dist/package.json +3 -0
- package/dist/react/index.d.ts +2 -0
- package/dist/react/index.d.ts.map +1 -0
- package/dist/react/index.js +6 -0
- package/dist/react/index.js.map +1 -0
- package/dist/shared.d.ts +479 -0
- package/dist/shared.d.ts.map +1 -0
- package/dist/shared.js +98 -0
- package/dist/shared.js.map +1 -0
- package/package.json +97 -0
- package/src/client/_generated/_ignore.ts +1 -0
- package/src/client/defaultChunker.test.ts +243 -0
- package/src/client/defaultChunker.ts +183 -0
- package/src/client/fileUtils.ts +179 -0
- package/src/client/index.test.ts +475 -0
- package/src/client/index.ts +1125 -0
- package/src/client/setup.test.ts +28 -0
- package/src/client/types.ts +69 -0
- package/src/component/_generated/api.d.ts +439 -0
- package/src/component/_generated/api.js +23 -0
- package/src/component/_generated/dataModel.d.ts +60 -0
- package/src/component/_generated/server.d.ts +149 -0
- package/src/component/_generated/server.js +90 -0
- package/src/component/chunks.test.ts +915 -0
- package/src/component/chunks.ts +555 -0
- package/src/component/convex.config.ts +7 -0
- package/src/component/embeddings/importance.test.ts +249 -0
- package/src/component/embeddings/importance.ts +75 -0
- package/src/component/embeddings/index.test.ts +482 -0
- package/src/component/embeddings/index.ts +99 -0
- package/src/component/embeddings/tables.ts +114 -0
- package/src/component/entries.test.ts +341 -0
- package/src/component/entries.ts +546 -0
- package/src/component/filters.ts +119 -0
- package/src/component/namespaces.ts +299 -0
- package/src/component/schema.ts +106 -0
- package/src/component/search.test.ts +445 -0
- package/src/component/search.ts +97 -0
- package/src/component/setup.test.ts +5 -0
- package/src/react/index.ts +7 -0
- package/src/shared.ts +247 -0
- package/src/vitest.config.ts +7 -0
|
@@ -0,0 +1,179 @@
|
|
|
1
|
+
/** Lower-cased file extension -> MIME type, for every extension we recognize. */
const MIME_TYPE_BY_EXTENSION = new Map<string, string>([
  ["pdf", "application/pdf"],
  ["txt", "text/plain"],
  ["rtf", "text/plain"],
  ["json", "application/json"],
  ["xml", "application/xml"],
  ["html", "text/html"],
  ["css", "text/css"],
  ["js", "text/javascript"],
  ["cjs", "text/javascript"],
  ["mjs", "text/javascript"],
  ["jsx", "text/javascript"],
  ["ts", "text/javascript"],
  ["tsx", "text/javascript"],
  ["md", "text/markdown"],
  ["mdx", "text/markdown"],
  ["csv", "text/csv"],
  ["zip", "application/zip"],
  ["apng", "image/apng"],
  ["png", "image/png"],
  ["avif", "image/avif"],
  ["gif", "image/gif"],
  ["svg", "image/svg+xml"],
  ["webp", "image/webp"],
  ["tiff", "image/tiff"],
  ["ico", "image/x-icon"],
  ["jpeg", "image/jpeg"],
  ["jpg", "image/jpeg"],
  ["mp1", "audio/mpeg"],
  ["mp2", "audio/mpeg"],
  ["mp3", "audio/mpeg"],
  ["mp4", "video/mp4"],
]);

/**
 * Guess a MIME type from a filename's extension.
 *
 * The extension is the text after the last "." (or the whole name when it has
 * no "."), matched case-insensitively.
 *
 * @param filename The file name to inspect.
 * @returns `undefined` when the extension is empty or contains a space, the
 * mapped MIME type for a recognized extension, and
 * `"application/octet-stream"` for any other extension.
 */
export function guessMimeTypeFromExtension(
  filename: string
): string | undefined {
  // lastIndexOf yields -1 when there is no ".", so the slice is the full name.
  const suffix = filename.slice(filename.lastIndexOf(".") + 1);
  if (suffix === "" || suffix.includes(" ")) {
    return undefined;
  }
  return (
    MIME_TYPE_BY_EXTENSION.get(suffix.toLowerCase()) ??
    "application/octet-stream"
  );
}
|
|
65
|
+
/** True when the ASCII bytes "<svg" occur within the first `limit` bytes. */
function hasSvgOpeningTag(buf: ArrayBuffer, limit = 4096): boolean {
  const head = new Uint8Array(buf.slice(0, limit));
  for (let i = 0; i + 3 < head.length; i++) {
    if (
      head[i] === 0x3c && // <
      head[i + 1] === 0x73 && // s
      head[i + 2] === 0x76 && // v
      head[i + 3] === 0x67 // g
    ) {
      return true;
    }
  }
  return false;
}

/**
 * Return a best-guess MIME type for some content.
 *
 * - For a string: when it is a base64 `data:` URL, the MIME type embedded in
 *   the URL is returned (parameters such as `;charset=...` are skipped);
 *   any other string is reported as `"text/plain"`.
 * - For an ArrayBuffer: the type is guessed from the magic-number signature
 *   found in the first 12 bytes. Buffers shorter than 4 bytes are not
 *   inspected.
 *
 * @param buf – the source ArrayBuffer, or a string / data URL
 * @returns the detected MIME type, or `"application/octet-stream"` if unknown
 */
export function guessMimeTypeFromContents(buf: ArrayBuffer | string): string {
  if (typeof buf === "string") {
    // MIME tokens may contain "+", "-" and "." (e.g. image/svg+xml).
    const mime = /^data:([\w.+-]+\/[\w.+-]+)(?:;[^;,]+)*;base64/.exec(buf)?.[1];
    return mime ?? "text/plain";
  }
  if (buf.byteLength < 4) return "application/octet-stream";

  // Read the first 12 bytes (enough for all signatures below)
  const bytes = new Uint8Array(buf.slice(0, 12));
  const hex = Array.from(bytes, (b) => b.toString(16).padStart(2, "0")).join(
    ""
  );

  // Helper so we can look at only the needed prefix
  const startsWith = (sig: string) => hex.startsWith(sig.toLowerCase());

  // --- image formats ---
  if (startsWith("89504e47")) return "image/png"; // PNG - 89 50 4E 47
  // JPEG: SOI (FF D8) followed by any marker (FF xx), e.g. DB / E0 / E1 / EE.
  if (startsWith("ffd8ff")) return "image/jpeg";
  if (startsWith("47494638")) return "image/gif"; // GIF
  if (startsWith("424d")) return "image/bmp"; // BMP
  if (startsWith("52494646") && hex.slice(16, 24) === "57454250")
    return "image/webp"; // WEBP (RIFF....WEBP)
  // TIFF, little-endian ("II*\0") and big-endian ("MM\0*")
  if (startsWith("49492a00") || startsWith("4d4d002a")) return "image/tiff";

  // "<svg" in hex is 3c 73 76 67
  if (startsWith("3c737667")) return "image/svg+xml";
  // "<?xml" in hex is 3c 3f 78 6d 6c. SVG files often begin with an XML
  // declaration, so look a little further for the <svg tag before deciding.
  if (startsWith("3c3f786d6c")) {
    return hasSvgOpeningTag(buf) ? "image/svg+xml" : "application/xml";
  }

  // --- audio/video ---
  if (startsWith("494433")) return "audio/mpeg"; // MP3 (ID3)
  if (startsWith("000001ba") || startsWith("000001b3")) return "video/mpeg"; // MPEG container
  if (startsWith("1a45dfa3")) return "video/webm"; // WEBM / Matroska
  // ISO base media files: bytes 0-3 are the box size (varies by file),
  // bytes 4-7 are "ftyp" and bytes 8-11 are the major brand.
  if (hex.slice(8, 16) === "66747970") {
    const brand = String.fromCharCode(...bytes.subarray(8, 12));
    switch (brand) {
      case "avif":
      case "avis":
        return "image/avif";
      case "heic":
      case "heix":
        return "image/heic";
      case "qt  ":
        return "video/quicktime";
      case "M4A ":
        return "audio/mp4";
      default:
        return "video/mp4"; // MP4
    }
  }
  if (startsWith("4f676753")) return "audio/ogg"; // OGG / Opus

  // --- documents & archives ---
  if (startsWith("25504446")) return "application/pdf"; // PDF
  if (
    startsWith("504b0304") ||
    startsWith("504b0506") ||
    startsWith("504b0708")
  )
    return "application/zip"; // ZIP / DOCX / PPTX / XLSX / EPUB
  if (startsWith("52617221")) return "application/x-rar-compressed"; // RAR
  if (startsWith("7f454c46")) return "application/x-elf"; // ELF binaries
  if (startsWith("1f8b08")) return "application/gzip"; // GZIP
  if (startsWith("425a68")) return "application/x-bzip2"; // BZIP2

  // Plain text, JSON and others are trickier—fallback:
  return "application/octet-stream";
}
|
|
133
|
+
/**
 * Compute the SHA-256 digest of a buffer and render it as a lowercase hex
 * string, so that identical content can be recognized by comparing hashes.
 * (Intended to match the content hash in the File Storage metadata.)
 * @param buffer The contents to hash
 * @returns sha256 hash of the contents, as 64 hex characters
 */
export async function contentHashFromArrayBuffer(
  buffer: ArrayBuffer
): Promise<string> {
  const digest = await crypto.subtle.digest("SHA-256", buffer);
  let hexDigest = "";
  for (const byte of new Uint8Array(digest)) {
    // Two hex digits per byte, zero-padded.
    hexDigest += byte.toString(16).padStart(2, "0");
  }
  return hexDigest;
}
|
|
147
|
+
|
|
148
|
+
/**
 * Turn a filename into a keyword-friendly string by appending the pieces of
 * any camelCase / TitleCase words, separated by spaces.
 * e.g. "MyFile is soGreat.txt" -> "MyFile is soGreat.txt My File so Great"
 *
 * The text after the last "." is treated as the extension and is never split.
 * If that text contains a space, the title is assumed not to be a filename
 * and is returned unchanged. A title with no "." at all is also returned
 * unchanged.
 *
 * @param title The filename or title; may be undefined.
 * @returns `undefined` for an undefined or empty title, otherwise the title
 * followed by any extra keywords.
 */
export function splitFilename(title: string | undefined): string | undefined {
  if (!title) return undefined;

  const segments = title.split(".");
  const extension = segments.pop();
  if (extension?.includes(" ")) {
    // There isn't an extension, so don't treat it as a filename
    return title;
  }

  const keywords: string[] = [title];
  for (const segment of segments) {
    for (const word of segment.split(" ")) {
      // Break before every capital letter: "soGreat" -> ["so", "Great"].
      const pieces = word.split(/(?=[A-Z])/);
      // An unsplit word is already present verbatim in the title.
      if (pieces.length > 1) {
        keywords.push(...pieces);
      }
    }
  }
  return keywords.join(" ");
}
|
|
@@ -0,0 +1,475 @@
|
|
|
1
|
+
import { describe, expect, test } from "vitest";
|
|
2
|
+
import { RAG } from "./index.js";
|
|
3
|
+
import type { DataModelFromSchemaDefinition } from "convex/server";
|
|
4
|
+
import {
|
|
5
|
+
anyApi,
|
|
6
|
+
queryGeneric,
|
|
7
|
+
mutationGeneric,
|
|
8
|
+
actionGeneric,
|
|
9
|
+
} from "convex/server";
|
|
10
|
+
import type {
|
|
11
|
+
ApiFromModules,
|
|
12
|
+
ActionBuilder,
|
|
13
|
+
MutationBuilder,
|
|
14
|
+
QueryBuilder,
|
|
15
|
+
} from "convex/server";
|
|
16
|
+
import { v } from "convex/values";
|
|
17
|
+
import { defineSchema } from "convex/server";
|
|
18
|
+
import { components, initConvexTest } from "./setup.test.js";
|
|
19
|
+
import { openai } from "@ai-sdk/openai";
|
|
20
|
+
|
|
21
|
+
// The tests need no application tables, so the schema is empty.
const schema = defineSchema({});
type DataModel = DataModelFromSchemaDefinition<typeof schema>;

// Generic Convex function builders, narrowed to this test data model.
const action = actionGeneric as ActionBuilder<DataModel, "public">;
const mutation = mutationGeneric as MutationBuilder<DataModel, "public">;
const query = queryGeneric as QueryBuilder<DataModel, "public">;

// Client under test. The 1536 dimension matches the vectors that the
// dummyEmbeddings helper in this file produces.
const rag = new RAG(components.rag, {
  embeddingDimension: 1536,
  textEmbeddingModel: openai.textEmbeddingModel("text-embedding-3-small"),
  filterNames: ["simpleString", "arrayOfStrings", "customObject"],
});
|
|
34
|
+
|
|
35
|
+
/**
 * Query wrapper that forwards namespace, key and contentHash to
 * `rag.findExistingEntryByContentHash`, so it can be called through the
 * test harness.
 */
export const findExistingEntryByContentHash = query({
  args: { namespace: v.string(), key: v.string(), contentHash: v.string() },
  handler: async (ctx, { namespace, key, contentHash }) =>
    rag.findExistingEntryByContentHash(ctx, { namespace, key, contentHash }),
});
|
|
45
|
+
|
|
46
|
+
// One chunk supplied by the caller, with its embedding already computed.
const chunkValidator = v.object({
  text: v.string(),
  metadata: v.record(v.string(), v.any()),
  embedding: v.array(v.number()),
});

// One filter value; `name` must be one of the filterNames given to `rag`.
const filterValueValidator = v.union(
  v.object({
    name: v.literal("simpleString"),
    value: v.string(),
  }),
  v.object({
    name: v.literal("arrayOfStrings"),
    value: v.array(v.string()),
  }),
  v.object({
    name: v.literal("customObject"),
    value: v.record(v.string(), v.any()),
  })
);

/**
 * Mutation wrapper around `rag.add`: the validated arguments are passed
 * through unchanged and the result of `rag.add` is returned.
 */
export const add = mutation({
  args: {
    key: v.string(),
    chunks: v.array(chunkValidator),
    namespace: v.string(),
    title: v.optional(v.string()),
    filterValues: v.optional(v.array(filterValueValidator)),
    importance: v.optional(v.number()),
    contentHash: v.optional(v.string()),
  },
  handler: async (ctx, args) => rag.add(ctx, args),
});
|
|
83
|
+
|
|
84
|
+
/**
 * Action wrapper around `rag.search` for a pre-computed embedding.
 * `limit` defaults to 10 and `chunkContext` to no surrounding chunks.
 * Returns the `results`, `text` and `entries` fields of the search result.
 */
export const search = action({
  args: {
    embedding: v.array(v.number()),
    namespace: v.string(),
    limit: v.optional(v.number()),
    chunkContext: v.optional(
      v.object({
        before: v.number(),
        after: v.number(),
      })
    ),
  },
  handler: async (ctx, { embedding, namespace, limit, chunkContext }) => {
    const found = await rag.search(ctx, {
      embedding,
      namespace,
      limit: limit ?? 10,
      chunkContext: chunkContext ?? { before: 0, after: 0 },
    });

    return {
      results: found.results,
      text: found.text,
      entries: found.entries,
    };
  },
});
|
|
111
|
+
|
|
112
|
+
// Function references for the wrappers exported from this file, typed as if
// they lived in a module named "fns".
type TestFunctions = ApiFromModules<{
  fns: {
    findExistingEntryByContentHash: typeof findExistingEntryByContentHash;
    add: typeof add;
    search: typeof search;
  };
}>["fns"];

// anyApi is untyped, so the cast goes through `any`.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
const testApi: TestFunctions = anyApi["index.test"] as any;
|
|
120
|
+
|
|
121
|
+
/**
 * Build a deterministic fake embedding for tests.
 *
 * Only the first component depends on the input: it is the char code of the
 * first character divided by 256. Every other component is 0.1, so texts
 * that start with the same character get identical vectors.
 *
 * @param text Text whose first character seeds the vector. An empty string
 * seeds with 0 instead of producing NaN.
 * @param dimension Length of the vector; defaults to 1536.
 * @returns An array of `dimension` numbers.
 */
function dummyEmbeddings(text: string, dimension = 1536): number[] {
  // charCodeAt(0) is NaN for "", which would poison the whole vector.
  const lead = text.length > 0 ? text.charCodeAt(0) / 256 : 0;
  return Array.from({ length: dimension }, (_, i) => (i === 0 ? lead : 0.1));
}
|
|
126
|
+
|
|
127
|
+
describe("RAG thick client", () => {
|
|
128
|
+
test("should add a entry and be able to list it", async () => {
  const harness = initConvexTest(schema);
  const chunks = ["A", "B", "C"].map((text) => ({
    text,
    metadata: {},
    embedding: dummyEmbeddings(text),
  }));

  const added = await harness.mutation(testApi.add, {
    key: "test",
    chunks,
    namespace: "test",
  });
  expect(added.entryId).toBeDefined();
  expect(added.status).toBe("ready");

  // The chunks should be listed back in insertion order on one page.
  await harness.run(async (ctx) => {
    const listing = await rag.listChunks(ctx, {
      entryId: added.entryId,
      paginationOpts: { numItems: 10, cursor: null },
    });
    expect(listing.page.length).toBe(3);
    expect(listing.isDone).toBe(true);
    for (const position of [0, 1, 2]) {
      expect(listing.page[position].order).toBe(position);
    }
  });
});
|
|
153
|
+
|
|
154
|
+
test("should work from a test function", async () => {
  const t = initConvexTest(schema);
  const { entryId, status } = await t.mutation(testApi.add, {
    key: "test",
    chunks: [
      { text: "A", metadata: {}, embedding: dummyEmbeddings("A") },
      { text: "B", metadata: {}, embedding: dummyEmbeddings("B") },
      { text: "C", metadata: {}, embedding: dummyEmbeddings("C") },
    ],
    namespace: "test",
  });
  // Without assertions this test could only fail if the mutation threw.
  // Check that the wrapper hands back a usable result.
  expect(entryId).toBeDefined();
  expect(status).toBe("ready");
});
|
|
167
|
+
|
|
168
|
+
describe("text formatting validation", () => {
|
|
169
|
+
test("should format single entry with sequential chunks correctly", async () => {
  const harness = initConvexTest(schema);
  const namespace = "format-test";
  const chunkTexts = ["Chunk 1 content", "Chunk 2 content", "Chunk 3 content"];

  // One titled entry whose chunks are stored in order.
  await harness.mutation(testApi.add, {
    key: "sequential-test",
    chunks: chunkTexts.map((text) => ({
      text,
      metadata: {},
      embedding: dummyEmbeddings(text),
    })),
    namespace,
    title: "Test Document",
  });

  const found = await harness.action(testApi.search, {
    embedding: dummyEmbeddings("content"),
    namespace,
    limit: 10,
  });

  // README format: "# Title:\n{entry.text}"
  expect(found.text).toContain("# Test Document:");
  expect(found.entries).toHaveLength(1);
  // Adjacent chunks are joined with a single newline.
  expect(found.entries[0].text).toBe(
    "Chunk 1 content\nChunk 2 content\nChunk 3 content"
  );

  // The overall text is the title line followed by the entry text.
  expect(found.text).toBe(
    "# Test Document:\nChunk 1 content\nChunk 2 content\nChunk 3 content"
  );
});
|
|
215
|
+
|
|
216
|
+
test("should format single entry without title correctly", async () => {
  const harness = initConvexTest(schema);
  const namespace = "format-test-notitle";
  const body = "Content without title";

  // No `title` is supplied for this entry.
  await harness.mutation(testApi.add, {
    key: "no-title-test",
    chunks: [{ text: body, metadata: {}, embedding: dummyEmbeddings(body) }],
    namespace,
  });

  const found = await harness.action(testApi.search, {
    embedding: dummyEmbeddings("content"),
    namespace,
    limit: 10,
  });

  // With no title there must be no "# " heading line.
  expect(found.text).not.toContain("# ");
  expect(found.entries).toHaveLength(1);
  expect(found.entries[0].text).toBe("Content without title");
  expect(found.text).toBe("Content without title");
});
|
|
244
|
+
|
|
245
|
+
test("should format non-sequential chunks with ellipsis separator", async () => {
  const harness = initConvexTest(schema);
  const namespace = "ellipsis-test";

  // [chunk text, text used to seed its embedding]. The dummy embedding only
  // looks at the first character, so seeds starting with "A" / "B" sit
  // closest to the "A ..." query below while the "C..." chunks sit further
  // away.
  const seededChunks: Array<[string, string]> = [
    ["Chunk 1 content", "Chunk 1 content"],
    ["Chunk 2 content", "Chunk 2 content"],
    ["Important chunk", "A important chunk"],
    ["Chunk 4 content", "Chunk 4 content"],
    ["Another important chunk", "B important chunk"],
  ];

  await harness.mutation(testApi.add, {
    key: "doc1",
    chunks: seededChunks.map(([text, seed]) => ({
      text,
      metadata: {},
      embedding: dummyEmbeddings(seed),
    })),
    namespace,
    title: "Document with gaps",
  });

  // Ask for the two best matches only, with no surrounding context chunks.
  const found = await harness.action(testApi.search, {
    embedding: dummyEmbeddings("A important chunk"),
    namespace,
    limit: 2,
    chunkContext: { before: 0, after: 0 },
  });

  expect(found.entries).toHaveLength(1);

  // Both matching chunks belong to the same entry...
  expect(found.entries[0].text).toContain("Important chunk");
  expect(found.entries[0].text).toContain("Another important chunk");

  // ...and, not being adjacent, are separated by "\n...\n".
  expect(found.text).toMatch(/\n\.\.\.\n/);
});
|
|
301
|
+
|
|
302
|
+
test("should format multiple entries with separators", async () => {
  const harness = initConvexTest(schema);
  const namespace = "multi-entry-test";

  // Two independent single-chunk entries in the same namespace.
  const documents = [
    { key: "first-doc", title: "First Document", body: "First document content" },
    { key: "second-doc", title: "Second Document", body: "Second document content" },
  ];
  for (const doc of documents) {
    await harness.mutation(testApi.add, {
      key: doc.key,
      chunks: [
        { text: doc.body, metadata: {}, embedding: dummyEmbeddings(doc.body) },
      ],
      namespace,
      title: doc.title,
    });
  }

  const found = await harness.action(testApi.search, {
    embedding: dummyEmbeddings("document"),
    namespace,
    limit: 10,
  });

  // Entries are separated by "\n---\n" as per README
  expect(found.text).toContain("---");
  expect(found.text).toMatch(/# .+:\n.+\n---\n# .+:\n.+/);

  // Each entry carries its own "# <title>:" heading.
  expect(found.text).toContain("# First Document:");
  expect(found.text).toContain("# Second Document:");

  expect(found.entries).toHaveLength(2);
});
|
|
348
|
+
|
|
349
|
+
test("should format mixed entries (with and without titles)", async () => {
  const harness = initConvexTest(schema);
  const namespace = "mixed-test";
  const singleChunk = (text: string) => [
    { text, metadata: {}, embedding: dummyEmbeddings(text) },
  ];

  // One entry with a title...
  await harness.mutation(testApi.add, {
    key: "titled-doc",
    chunks: singleChunk("Content with title"),
    namespace,
    title: "Titled Document",
  });

  // ...and one without.
  await harness.mutation(testApi.add, {
    key: "untitled-doc",
    chunks: singleChunk("Content without title"),
    namespace,
  });

  const found = await harness.action(testApi.search, {
    embedding: dummyEmbeddings("content"),
    namespace,
    limit: 10,
  });

  expect(found.text).toContain("---"); // Entries should be separated
  expect(found.text).toContain("# Titled Document:"); // Titled entry should have prefix

  // Exactly two sections: one starts with a heading, the other does not.
  const sections = found.text.split("\n---\n");
  expect(sections).toHaveLength(2);

  const isHeading = (section: string) => section.startsWith("# ");
  expect(sections.some(isHeading)).toBe(true);
  expect(!sections.every(isHeading)).toBe(true);

  expect(found.entries).toHaveLength(2);
});
|
|
400
|
+
|
|
401
|
+
test("should match exact README format specification", async () => {
  const harness = initConvexTest(schema);
  const namespace = "readme-format-test";

  // Reproduce the README example: two titled entries of two chunks each.
  const documents = [
    {
      key: "title1-doc",
      title: "Title 1",
      chunkTexts: ["Chunk 1 contents", "Chunk 2 contents"],
    },
    {
      key: "title2-doc",
      title: "Title 2",
      chunkTexts: ["Chunk 3 contents", "Chunk 4 contents"],
    },
  ];
  for (const doc of documents) {
    await harness.mutation(testApi.add, {
      key: doc.key,
      chunks: doc.chunkTexts.map((text) => ({
        text,
        metadata: {},
        embedding: dummyEmbeddings(text),
      })),
      namespace,
      title: doc.title,
    });
  }

  const found = await harness.action(testApi.search, {
    embedding: dummyEmbeddings("contents"),
    namespace,
    limit: 10,
  });

  // Basic structure: both headings and a separator are present.
  expect(found.text).toContain("# Title 1:");
  expect(found.text).toContain("# Title 2:");
  expect(found.text).toContain("---");

  // Splitting on the separator yields one section per entry.
  const sections = found.text.split("\n---\n");
  expect(sections).toHaveLength(2);

  // Every section starts with its "# Title X:" heading.
  for (const section of sections) {
    expect(section).toMatch(/^# Title \d+:/);
  }

  expect(found.entries).toHaveLength(2);

  // The complete text must match the README example exactly.
  expect(found.text).toBe(
    `# Title 1:
Chunk 1 contents
Chunk 2 contents
---
# Title 2:
Chunk 3 contents
Chunk 4 contents`
  );
});
|
|
474
|
+
});
|
|
475
|
+
});
|