@convex-dev/rag 0.7.0 → 0.7.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/client/hybridRank.d.ts +1 -1
- package/dist/client/hybridRank.js +1 -1
- package/dist/client/index.d.ts +35 -3
- package/dist/client/index.d.ts.map +1 -1
- package/dist/client/index.js +32 -16
- package/dist/client/index.js.map +1 -1
- package/dist/component/_generated/component.d.ts +6 -1
- package/dist/component/_generated/component.d.ts.map +1 -1
- package/dist/component/_generated/server.d.ts.map +1 -1
- package/dist/component/chunks.d.ts +9 -2
- package/dist/component/chunks.d.ts.map +1 -1
- package/dist/component/chunks.js +66 -63
- package/dist/component/chunks.js.map +1 -1
- package/dist/component/schema.d.ts +34 -34
- package/dist/component/schema.d.ts.map +1 -1
- package/dist/component/schema.js +0 -1
- package/dist/component/schema.js.map +1 -1
- package/dist/component/search.d.ts +44 -1
- package/dist/component/search.d.ts.map +1 -1
- package/dist/component/search.js +188 -17
- package/dist/component/search.js.map +1 -1
- package/dist/shared.d.ts +2 -0
- package/dist/shared.d.ts.map +1 -1
- package/dist/shared.js +1 -0
- package/dist/shared.js.map +1 -1
- package/package.json +30 -28
- package/src/client/hybridRank.ts +1 -1
- package/src/client/index.ts +76 -16
- package/src/component/_generated/component.ts +6 -1
- package/src/component/_generated/server.ts +0 -5
- package/src/component/chunks.ts +102 -92
- package/src/component/schema.ts +0 -1
- package/src/component/search.test.ts +303 -1
- package/src/component/search.ts +266 -19
- package/src/shared.ts +7 -0
|
@@ -409,12 +409,17 @@ export type ComponentApi<Name extends string | undefined = string | undefined> =
|
|
|
409
409
|
"internal",
|
|
410
410
|
{
|
|
411
411
|
chunkContext?: { after: number; before: number };
|
|
412
|
-
|
|
412
|
+
dimension?: number;
|
|
413
|
+
embedding?: Array<number>;
|
|
413
414
|
filters: Array<{ name: string; value: any }>;
|
|
414
415
|
limit: number;
|
|
415
416
|
modelId: string;
|
|
416
417
|
namespace: string;
|
|
418
|
+
searchType?: "vector" | "text" | "hybrid";
|
|
419
|
+
textQuery?: string;
|
|
420
|
+
textWeight?: number;
|
|
417
421
|
vectorScoreThreshold?: number;
|
|
422
|
+
vectorWeight?: number;
|
|
418
423
|
},
|
|
419
424
|
{
|
|
420
425
|
entries: Array<{
|
|
package/src/component/_generated/server.ts
CHANGED
|
@@ -107,11 +107,6 @@ export const internalAction: ActionBuilder<DataModel, "internal"> =
|
|
|
107
107
|
*/
|
|
108
108
|
export const httpAction: HttpActionBuilder = httpActionGeneric;
|
|
109
109
|
|
|
110
|
-
type GenericCtx =
|
|
111
|
-
| GenericActionCtx<DataModel>
|
|
112
|
-
| GenericMutationCtx<DataModel>
|
|
113
|
-
| GenericQueryCtx<DataModel>;
|
|
114
|
-
|
|
115
110
|
/**
|
|
116
111
|
* A set of services for use within Convex query functions.
|
|
117
112
|
*
|
package/src/component/chunks.ts
CHANGED
|
@@ -311,6 +311,107 @@ export const vRangeResult = v.object({
|
|
|
311
311
|
),
|
|
312
312
|
});
|
|
313
313
|
|
|
314
|
+
export async function buildRanges(
|
|
315
|
+
ctx: QueryCtx,
|
|
316
|
+
chunks: (Doc<"chunks"> | null)[],
|
|
317
|
+
chunkContext: { before: number; after: number },
|
|
318
|
+
): Promise<{
|
|
319
|
+
ranges: (null | Infer<typeof vRangeResult>)[];
|
|
320
|
+
entries: Entry[];
|
|
321
|
+
}> {
|
|
322
|
+
// Note: This preserves order of entries as they first appeared.
|
|
323
|
+
const entryDocs = (
|
|
324
|
+
await Promise.all(
|
|
325
|
+
Array.from(
|
|
326
|
+
new Set(chunks.filter((c) => c !== null).map((c) => c.entryId)),
|
|
327
|
+
).map((id) => ctx.db.get(id)),
|
|
328
|
+
)
|
|
329
|
+
).filter((d): d is Doc<"entries"> => d !== null);
|
|
330
|
+
const entries = entryDocs.map(publicEntry);
|
|
331
|
+
const entryDocById = new Map(entryDocs.map((d) => [d._id, d]));
|
|
332
|
+
|
|
333
|
+
const entryOrders = chunks
|
|
334
|
+
.filter((c) => c !== null)
|
|
335
|
+
.map((c) => [c.entryId, c.order] as const)
|
|
336
|
+
.reduce(
|
|
337
|
+
(acc, [entryId, order]) => {
|
|
338
|
+
if (acc[entryId]?.includes(order)) {
|
|
339
|
+
// De-dupe orders
|
|
340
|
+
return acc;
|
|
341
|
+
}
|
|
342
|
+
acc[entryId] = [...(acc[entryId] ?? []), order].sort((a, b) => a - b);
|
|
343
|
+
return acc;
|
|
344
|
+
},
|
|
345
|
+
{} as Record<Id<"entries">, number[]>,
|
|
346
|
+
);
|
|
347
|
+
|
|
348
|
+
const result: Array<Infer<typeof vRangeResult> | null> = [];
|
|
349
|
+
|
|
350
|
+
for (const chunk of chunks) {
|
|
351
|
+
if (chunk === null) {
|
|
352
|
+
result.push(null);
|
|
353
|
+
continue;
|
|
354
|
+
}
|
|
355
|
+
// Note: if we parallelize this in the future, we could have a race
|
|
356
|
+
// instead we'd check that other chunks are not the same doc/order
|
|
357
|
+
if (
|
|
358
|
+
result.find(
|
|
359
|
+
(r) => r?.entryId === chunk.entryId && r?.order === chunk.order,
|
|
360
|
+
)
|
|
361
|
+
) {
|
|
362
|
+
// De-dupe chunks
|
|
363
|
+
result.push(null);
|
|
364
|
+
continue;
|
|
365
|
+
}
|
|
366
|
+
const entryId = chunk.entryId;
|
|
367
|
+
const entry = entryDocById.get(entryId);
|
|
368
|
+
assert(entry, `Entry ${entryId} not found`);
|
|
369
|
+
const otherOrders = entryOrders[entryId] ?? [chunk.order];
|
|
370
|
+
const ourOrderIndex = otherOrders.indexOf(chunk.order);
|
|
371
|
+
const previousOrder = otherOrders[ourOrderIndex - 1] ?? -Infinity;
|
|
372
|
+
const nextOrder = otherOrders[ourOrderIndex + 1] ?? Infinity;
|
|
373
|
+
// We absorb all previous context up to the previous chunk.
|
|
374
|
+
const startOrder = Math.max(
|
|
375
|
+
chunk.order - chunkContext.before,
|
|
376
|
+
0,
|
|
377
|
+
Math.min(previousOrder + 1, chunk.order),
|
|
378
|
+
);
|
|
379
|
+
// We stop short if the next chunk order's "before" context will cover it.
|
|
380
|
+
const endOrder = Math.min(
|
|
381
|
+
chunk.order + chunkContext.after + 1,
|
|
382
|
+
Math.max(nextOrder - chunkContext.before, chunk.order + 1),
|
|
383
|
+
);
|
|
384
|
+
const contentIds: Id<"content">[] = [];
|
|
385
|
+
if (startOrder === chunk.order && endOrder === chunk.order + 1) {
|
|
386
|
+
contentIds.push(chunk.contentId);
|
|
387
|
+
} else {
|
|
388
|
+
const rangeChunks = await ctx.db
|
|
389
|
+
.query("chunks")
|
|
390
|
+
.withIndex("entryId_order", (q) =>
|
|
391
|
+
q
|
|
392
|
+
.eq("entryId", entryId)
|
|
393
|
+
.gte("order", startOrder)
|
|
394
|
+
.lt("order", endOrder),
|
|
395
|
+
)
|
|
396
|
+
.collect();
|
|
397
|
+
for (const c of rangeChunks) {
|
|
398
|
+
contentIds.push(c.contentId);
|
|
399
|
+
}
|
|
400
|
+
}
|
|
401
|
+
const content = await Promise.all(
|
|
402
|
+
contentIds.map(async (contentId) => {
|
|
403
|
+
const content = await ctx.db.get(contentId);
|
|
404
|
+
assert(content, `Content ${contentId} not found`);
|
|
405
|
+
return { text: content.text, metadata: content.metadata };
|
|
406
|
+
}),
|
|
407
|
+
);
|
|
408
|
+
|
|
409
|
+
result.push({ entryId, order: chunk.order, startOrder, content });
|
|
410
|
+
}
|
|
411
|
+
|
|
412
|
+
return { ranges: result, entries };
|
|
413
|
+
}
|
|
414
|
+
|
|
314
415
|
export const getRangesOfChunks = internalQuery({
|
|
315
416
|
args: {
|
|
316
417
|
embeddingIds: v.array(vVectorId),
|
|
@@ -339,98 +440,7 @@ export const getRangesOfChunks = internalQuery({
|
|
|
339
440
|
.first(),
|
|
340
441
|
),
|
|
341
442
|
);
|
|
342
|
-
|
|
343
|
-
// Note: This preserves order of entries as they first appeared.
|
|
344
|
-
const entries = (
|
|
345
|
-
await Promise.all(
|
|
346
|
-
Array.from(
|
|
347
|
-
new Set(chunks.filter((c) => c !== null).map((c) => c.entryId)),
|
|
348
|
-
).map((id) => ctx.db.get(id)),
|
|
349
|
-
)
|
|
350
|
-
)
|
|
351
|
-
.filter((d) => d !== null)
|
|
352
|
-
.map(publicEntry);
|
|
353
|
-
|
|
354
|
-
const entryOders = chunks
|
|
355
|
-
.filter((c) => c !== null)
|
|
356
|
-
.map((c) => [c.entryId, c.order] as const)
|
|
357
|
-
.reduce(
|
|
358
|
-
(acc, [entryId, order]) => {
|
|
359
|
-
if (acc[entryId]?.includes(order)) {
|
|
360
|
-
// De-dupe orders
|
|
361
|
-
return acc;
|
|
362
|
-
}
|
|
363
|
-
acc[entryId] = [...(acc[entryId] ?? []), order].sort((a, b) => a - b);
|
|
364
|
-
return acc;
|
|
365
|
-
},
|
|
366
|
-
{} as Record<Id<"entries">, number[]>,
|
|
367
|
-
);
|
|
368
|
-
|
|
369
|
-
const result: Array<Infer<typeof vRangeResult> | null> = [];
|
|
370
|
-
|
|
371
|
-
for (const chunk of chunks) {
|
|
372
|
-
if (chunk === null) {
|
|
373
|
-
result.push(null);
|
|
374
|
-
continue;
|
|
375
|
-
}
|
|
376
|
-
// Note: if we parallelize this in the future, we could have a race
|
|
377
|
-
// instead we'd check that other chunks are not the same doc/order
|
|
378
|
-
if (
|
|
379
|
-
result.find(
|
|
380
|
-
(r) => r?.entryId === chunk.entryId && r?.order === chunk.order,
|
|
381
|
-
)
|
|
382
|
-
) {
|
|
383
|
-
// De-dupe chunks
|
|
384
|
-
result.push(null);
|
|
385
|
-
continue;
|
|
386
|
-
}
|
|
387
|
-
const entryId = chunk.entryId;
|
|
388
|
-
const entry = await ctx.db.get(entryId);
|
|
389
|
-
assert(entry, `Entry ${entryId} not found`);
|
|
390
|
-
const otherOrders = entryOders[entryId] ?? [chunk.order];
|
|
391
|
-
const ourOrderIndex = otherOrders.indexOf(chunk.order);
|
|
392
|
-
const previousOrder = otherOrders[ourOrderIndex - 1] ?? -Infinity;
|
|
393
|
-
const nextOrder = otherOrders[ourOrderIndex + 1] ?? Infinity;
|
|
394
|
-
// We absorb all previous context up to the previous chunk.
|
|
395
|
-
const startOrder = Math.max(
|
|
396
|
-
chunk.order - chunkContext.before,
|
|
397
|
-
0,
|
|
398
|
-
Math.min(previousOrder + 1, chunk.order),
|
|
399
|
-
);
|
|
400
|
-
// We stop short if the next chunk order's "before" context will cover it.
|
|
401
|
-
const endOrder = Math.min(
|
|
402
|
-
chunk.order + chunkContext.after + 1,
|
|
403
|
-
Math.max(nextOrder - chunkContext.before, chunk.order + 1),
|
|
404
|
-
);
|
|
405
|
-
const contentIds: Id<"content">[] = [];
|
|
406
|
-
if (startOrder === chunk.order && endOrder === chunk.order + 1) {
|
|
407
|
-
contentIds.push(chunk.contentId);
|
|
408
|
-
} else {
|
|
409
|
-
const chunks = await ctx.db
|
|
410
|
-
.query("chunks")
|
|
411
|
-
.withIndex("entryId_order", (q) =>
|
|
412
|
-
q
|
|
413
|
-
.eq("entryId", entryId)
|
|
414
|
-
.gte("order", startOrder)
|
|
415
|
-
.lt("order", endOrder),
|
|
416
|
-
)
|
|
417
|
-
.collect();
|
|
418
|
-
for (const chunk of chunks) {
|
|
419
|
-
contentIds.push(chunk.contentId);
|
|
420
|
-
}
|
|
421
|
-
}
|
|
422
|
-
const content = await Promise.all(
|
|
423
|
-
contentIds.map(async (contentId) => {
|
|
424
|
-
const content = await ctx.db.get(contentId);
|
|
425
|
-
assert(content, `Content ${contentId} not found`);
|
|
426
|
-
return { text: content.text, metadata: content.metadata };
|
|
427
|
-
}),
|
|
428
|
-
);
|
|
429
|
-
|
|
430
|
-
result.push({ entryId, order: chunk.order, startOrder, content });
|
|
431
|
-
}
|
|
432
|
-
|
|
433
|
-
return { ranges: result, entries };
|
|
443
|
+
return buildRanges(ctx, chunks, chunkContext);
|
|
434
444
|
},
|
|
435
445
|
});
|
|
436
446
|
|
package/src/component/schema.ts
CHANGED
|
package/src/component/search.test.ts
CHANGED
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
import { describe, expect, test } from "vitest";
|
|
4
4
|
import { convexTest, type TestConvex } from "convex-test";
|
|
5
5
|
import schema from "./schema.js";
|
|
6
|
-
import { api } from "./_generated/api.js";
|
|
6
|
+
import { api, internal } from "./_generated/api.js";
|
|
7
7
|
import { modules } from "./setup.test.js";
|
|
8
8
|
import { insertChunks } from "./chunks.js";
|
|
9
9
|
import type { Id } from "./_generated/dataModel.js";
|
|
@@ -442,4 +442,306 @@ describe("search", () => {
|
|
|
442
442
|
);
|
|
443
443
|
}
|
|
444
444
|
});
|
|
445
|
+
|
|
446
|
+
describe("hybrid search", () => {
|
|
447
|
+
function createSearchableChunks(texts: string[], baseEmbedding = 0.1) {
|
|
448
|
+
return texts.map((text, i) => ({
|
|
449
|
+
content: { text, metadata: { index: i } },
|
|
450
|
+
embedding: [...Array(127).fill(0.01), baseEmbedding + i * 0.01],
|
|
451
|
+
searchableText: text,
|
|
452
|
+
}));
|
|
453
|
+
}
|
|
454
|
+
|
|
455
|
+
test("textSearch internal query finds chunks by text content", async () => {
|
|
456
|
+
const t = convexTest(schema, modules);
|
|
457
|
+
const namespaceId = await setupTestNamespace(t);
|
|
458
|
+
const entryId = await setupTestEntry(t, namespaceId);
|
|
459
|
+
|
|
460
|
+
const chunks = createSearchableChunks([
|
|
461
|
+
"The quick brown fox jumps over the lazy dog",
|
|
462
|
+
"A fast red car drives on the highway",
|
|
463
|
+
"The brown bear sleeps in the forest",
|
|
464
|
+
]);
|
|
465
|
+
|
|
466
|
+
await t.run(async (ctx) => {
|
|
467
|
+
await insertChunks(ctx, { entryId, startOrder: 0, chunks });
|
|
468
|
+
});
|
|
469
|
+
|
|
470
|
+
const results = await t.query(internal.search.textSearch, {
|
|
471
|
+
query: "brown",
|
|
472
|
+
namespaceId,
|
|
473
|
+
filters: [],
|
|
474
|
+
limit: 10,
|
|
475
|
+
});
|
|
476
|
+
|
|
477
|
+
expect(results.length).toBeGreaterThan(0);
|
|
478
|
+
for (const r of results) {
|
|
479
|
+
expect(r.entryId).toBe(entryId);
|
|
480
|
+
}
|
|
481
|
+
});
|
|
482
|
+
|
|
483
|
+
test("textSearch scopes results to the given namespace", async () => {
|
|
484
|
+
const t = convexTest(schema, modules);
|
|
485
|
+
const ns1Id = await setupTestNamespace(t, "namespace-1");
|
|
486
|
+
const ns2Id = await setupTestNamespace(t, "namespace-2");
|
|
487
|
+
const entry1Id = await setupTestEntry(t, ns1Id, "entry-1");
|
|
488
|
+
const entry2Id = await setupTestEntry(t, ns2Id, "entry-2");
|
|
489
|
+
|
|
490
|
+
await t.run(async (ctx) => {
|
|
491
|
+
await insertChunks(ctx, {
|
|
492
|
+
entryId: entry1Id,
|
|
493
|
+
startOrder: 0,
|
|
494
|
+
chunks: createSearchableChunks(["alpha bravo charlie"]),
|
|
495
|
+
});
|
|
496
|
+
await insertChunks(ctx, {
|
|
497
|
+
entryId: entry2Id,
|
|
498
|
+
startOrder: 0,
|
|
499
|
+
chunks: createSearchableChunks(["alpha delta echo"]),
|
|
500
|
+
});
|
|
501
|
+
});
|
|
502
|
+
|
|
503
|
+
const ns1Results = await t.query(internal.search.textSearch, {
|
|
504
|
+
query: "alpha",
|
|
505
|
+
namespaceId: ns1Id,
|
|
506
|
+
filters: [],
|
|
507
|
+
limit: 10,
|
|
508
|
+
});
|
|
509
|
+
|
|
510
|
+
// All results should belong to namespace-1's entry.
|
|
511
|
+
for (const r of ns1Results) {
|
|
512
|
+
expect(r.entryId).toBe(entry1Id);
|
|
513
|
+
}
|
|
514
|
+
});
|
|
515
|
+
|
|
516
|
+
test("textSearch applies numbered filters", async () => {
|
|
517
|
+
const t = convexTest(schema, modules);
|
|
518
|
+
const namespaceId = await setupTestNamespace(t, "filtered-ns", 128, [
|
|
519
|
+
"category",
|
|
520
|
+
]);
|
|
521
|
+
|
|
522
|
+
const cat1Entry = await setupTestEntry(t, namespaceId, "cat1", 0, [
|
|
523
|
+
{ name: "category", value: "docs" },
|
|
524
|
+
]);
|
|
525
|
+
const cat2Entry = await setupTestEntry(t, namespaceId, "cat2", 0, [
|
|
526
|
+
{ name: "category", value: "blogs" },
|
|
527
|
+
]);
|
|
528
|
+
|
|
529
|
+
await t.run(async (ctx) => {
|
|
530
|
+
await insertChunks(ctx, {
|
|
531
|
+
entryId: cat1Entry,
|
|
532
|
+
startOrder: 0,
|
|
533
|
+
chunks: createSearchableChunks(["shared keyword content"]),
|
|
534
|
+
});
|
|
535
|
+
await insertChunks(ctx, {
|
|
536
|
+
entryId: cat2Entry,
|
|
537
|
+
startOrder: 0,
|
|
538
|
+
chunks: createSearchableChunks(["shared keyword content"]),
|
|
539
|
+
});
|
|
540
|
+
});
|
|
541
|
+
|
|
542
|
+
// Filter to "docs" category only (filter index 0 = "category").
|
|
543
|
+
const results = await t.query(internal.search.textSearch, {
|
|
544
|
+
query: "shared keyword",
|
|
545
|
+
namespaceId,
|
|
546
|
+
filters: [{ 0: "docs" }],
|
|
547
|
+
limit: 10,
|
|
548
|
+
});
|
|
549
|
+
|
|
550
|
+
expect(results.length).toBeGreaterThan(0);
|
|
551
|
+
for (const r of results) {
|
|
552
|
+
expect(r.entryId).toBe(cat1Entry);
|
|
553
|
+
}
|
|
554
|
+
});
|
|
555
|
+
|
|
556
|
+
test("text-only search returns results via dimension arg", async () => {
|
|
557
|
+
const t = convexTest(schema, modules);
|
|
558
|
+
const namespaceId = await setupTestNamespace(t);
|
|
559
|
+
const entryId = await setupTestEntry(t, namespaceId);
|
|
560
|
+
|
|
561
|
+
const chunks = createSearchableChunks([
|
|
562
|
+
"Machine learning is a subset of artificial intelligence",
|
|
563
|
+
"Deep learning uses neural networks with many layers",
|
|
564
|
+
"Natural language processing handles text data",
|
|
565
|
+
]);
|
|
566
|
+
|
|
567
|
+
await t.run(async (ctx) => {
|
|
568
|
+
await insertChunks(ctx, { entryId, startOrder: 0, chunks });
|
|
569
|
+
});
|
|
570
|
+
|
|
571
|
+
// Text-only: no embedding, provide dimension instead.
|
|
572
|
+
const result = await t.action(api.search.search, {
|
|
573
|
+
namespace: "test-namespace",
|
|
574
|
+
dimension: 128,
|
|
575
|
+
modelId: "test-model",
|
|
576
|
+
filters: [],
|
|
577
|
+
limit: 10,
|
|
578
|
+
textQuery: "neural networks",
|
|
579
|
+
});
|
|
580
|
+
|
|
581
|
+
expect(result.results.length).toBeGreaterThan(0);
|
|
582
|
+
expect(result.entries).toHaveLength(1);
|
|
583
|
+
|
|
584
|
+
// Text-only scores are position-based.
|
|
585
|
+
expect(result.results[0].score).toBe(1.0);
|
|
586
|
+
for (let i = 1; i < result.results.length; i++) {
|
|
587
|
+
expect(result.results[i].score).toBeLessThan(
|
|
588
|
+
result.results[i - 1].score,
|
|
589
|
+
);
|
|
590
|
+
}
|
|
591
|
+
});
|
|
592
|
+
|
|
593
|
+
test("hybrid search returns results when textQuery is provided", async () => {
|
|
594
|
+
const t = convexTest(schema, modules);
|
|
595
|
+
const namespaceId = await setupTestNamespace(t);
|
|
596
|
+
const entryId = await setupTestEntry(t, namespaceId);
|
|
597
|
+
|
|
598
|
+
const chunks = createSearchableChunks([
|
|
599
|
+
"Machine learning is a subset of artificial intelligence",
|
|
600
|
+
"Deep learning uses neural networks with many layers",
|
|
601
|
+
"Natural language processing handles text data",
|
|
602
|
+
]);
|
|
603
|
+
|
|
604
|
+
await t.run(async (ctx) => {
|
|
605
|
+
await insertChunks(ctx, { entryId, startOrder: 0, chunks });
|
|
606
|
+
});
|
|
607
|
+
|
|
608
|
+
const result = await t.action(api.search.search, {
|
|
609
|
+
namespace: "test-namespace",
|
|
610
|
+
embedding: [...Array(127).fill(0.01), 0.1],
|
|
611
|
+
modelId: "test-model",
|
|
612
|
+
filters: [],
|
|
613
|
+
limit: 10,
|
|
614
|
+
textQuery: "neural networks",
|
|
615
|
+
});
|
|
616
|
+
|
|
617
|
+
expect(result.results.length).toBeGreaterThan(0);
|
|
618
|
+
expect(result.entries).toHaveLength(1);
|
|
619
|
+
|
|
620
|
+
// Hybrid scores are position-based (1.0 for top, decreasing linearly).
|
|
621
|
+
expect(result.results[0].score).toBe(1.0);
|
|
622
|
+
for (let i = 1; i < result.results.length; i++) {
|
|
623
|
+
expect(result.results[i].score).toBeLessThan(
|
|
624
|
+
result.results[i - 1].score,
|
|
625
|
+
);
|
|
626
|
+
}
|
|
627
|
+
});
|
|
628
|
+
|
|
629
|
+
test("hybrid search deduplicates results from vector and text paths", async () => {
|
|
630
|
+
const t = convexTest(schema, modules);
|
|
631
|
+
const namespaceId = await setupTestNamespace(t);
|
|
632
|
+
const entryId = await setupTestEntry(t, namespaceId);
|
|
633
|
+
|
|
634
|
+
const chunks = createSearchableChunks([
|
|
635
|
+
"Unique content about quantum computing",
|
|
636
|
+
"Another chunk about classical physics",
|
|
637
|
+
]);
|
|
638
|
+
|
|
639
|
+
await t.run(async (ctx) => {
|
|
640
|
+
await insertChunks(ctx, { entryId, startOrder: 0, chunks });
|
|
641
|
+
});
|
|
642
|
+
|
|
643
|
+
const result = await t.action(api.search.search, {
|
|
644
|
+
namespace: "test-namespace",
|
|
645
|
+
embedding: [...Array(127).fill(0.01), 0.1],
|
|
646
|
+
modelId: "test-model",
|
|
647
|
+
filters: [],
|
|
648
|
+
limit: 10,
|
|
649
|
+
textQuery: "quantum computing",
|
|
650
|
+
});
|
|
651
|
+
|
|
652
|
+
// Each chunk should appear at most once in the results.
|
|
653
|
+
const entryOrderPairs = result.results.map(
|
|
654
|
+
(r) => `${r.entryId}:${r.order}`,
|
|
655
|
+
);
|
|
656
|
+
const uniquePairs = new Set(entryOrderPairs);
|
|
657
|
+
expect(uniquePairs.size).toBe(entryOrderPairs.length);
|
|
658
|
+
});
|
|
659
|
+
|
|
660
|
+
test("vector-only search is unchanged when textQuery is not provided", async () => {
|
|
661
|
+
const t = convexTest(schema, modules);
|
|
662
|
+
const namespaceId = await setupTestNamespace(t);
|
|
663
|
+
const entryId = await setupTestEntry(t, namespaceId);
|
|
664
|
+
|
|
665
|
+
const targetEmbedding = [...Array(127).fill(0.5), 1];
|
|
666
|
+
const chunks = [
|
|
667
|
+
{
|
|
668
|
+
content: { text: "Target chunk", metadata: {} },
|
|
669
|
+
embedding: targetEmbedding,
|
|
670
|
+
searchableText: "Target chunk",
|
|
671
|
+
},
|
|
672
|
+
{
|
|
673
|
+
content: { text: "Other chunk", metadata: {} },
|
|
674
|
+
embedding: [...Array(127).fill(0.1), 0],
|
|
675
|
+
searchableText: "Other chunk",
|
|
676
|
+
},
|
|
677
|
+
];
|
|
678
|
+
|
|
679
|
+
await t.run(async (ctx) => {
|
|
680
|
+
await insertChunks(ctx, { entryId, startOrder: 0, chunks });
|
|
681
|
+
});
|
|
682
|
+
|
|
683
|
+
const result = await t.action(api.search.search, {
|
|
684
|
+
namespace: "test-namespace",
|
|
685
|
+
embedding: targetEmbedding,
|
|
686
|
+
modelId: "test-model",
|
|
687
|
+
filters: [],
|
|
688
|
+
limit: 10,
|
|
689
|
+
});
|
|
690
|
+
|
|
691
|
+
// Without textQuery, scores should be cosine similarity (not position-based).
|
|
692
|
+
expect(result.results).toHaveLength(2);
|
|
693
|
+
expect(result.results[0].score).toBeGreaterThan(result.results[1].score);
|
|
694
|
+
// Cosine similarity scores are typically between -1 and 1, not exactly 1.0.
|
|
695
|
+
// Position-based would give exactly 1.0 for the first result.
|
|
696
|
+
// With cosine similarity the first result can be 1.0 if exact match,
|
|
697
|
+
// but the second should not follow the linear decrease pattern.
|
|
698
|
+
expect(result.results[0].content[0].text).toBe("Target chunk");
|
|
699
|
+
});
|
|
700
|
+
|
|
701
|
+
test("textWeight and vectorWeight influence hybrid ranking", async () => {
|
|
702
|
+
const t = convexTest(schema, modules);
|
|
703
|
+
const namespaceId = await setupTestNamespace(t);
|
|
704
|
+
const entryId = await setupTestEntry(t, namespaceId);
|
|
705
|
+
|
|
706
|
+
const chunks = createSearchableChunks([
|
|
707
|
+
"Alpha topic with specific terminology",
|
|
708
|
+
"Beta topic with different keywords",
|
|
709
|
+
"Gamma topic about something else entirely",
|
|
710
|
+
]);
|
|
711
|
+
|
|
712
|
+
await t.run(async (ctx) => {
|
|
713
|
+
await insertChunks(ctx, { entryId, startOrder: 0, chunks });
|
|
714
|
+
});
|
|
715
|
+
|
|
716
|
+
const embedding = [...Array(127).fill(0.01), 0.1];
|
|
717
|
+
|
|
718
|
+
// Search with heavy text weight.
|
|
719
|
+
const textHeavy = await t.action(api.search.search, {
|
|
720
|
+
namespace: "test-namespace",
|
|
721
|
+
embedding,
|
|
722
|
+
modelId: "test-model",
|
|
723
|
+
filters: [],
|
|
724
|
+
limit: 10,
|
|
725
|
+
textQuery: "specific terminology",
|
|
726
|
+
textWeight: 10,
|
|
727
|
+
vectorWeight: 1,
|
|
728
|
+
});
|
|
729
|
+
|
|
730
|
+
// Search with heavy vector weight.
|
|
731
|
+
const vectorHeavy = await t.action(api.search.search, {
|
|
732
|
+
namespace: "test-namespace",
|
|
733
|
+
embedding,
|
|
734
|
+
modelId: "test-model",
|
|
735
|
+
filters: [],
|
|
736
|
+
limit: 10,
|
|
737
|
+
textQuery: "specific terminology",
|
|
738
|
+
textWeight: 1,
|
|
739
|
+
vectorWeight: 10,
|
|
740
|
+
});
|
|
741
|
+
|
|
742
|
+
// Both should return results.
|
|
743
|
+
expect(textHeavy.results.length).toBeGreaterThan(0);
|
|
744
|
+
expect(vectorHeavy.results.length).toBeGreaterThan(0);
|
|
745
|
+
});
|
|
746
|
+
});
|
|
445
747
|
});
|