unrag 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
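The hunks below add four files: an index and store module for a Prisma-backed pgvector connector, followed by the same pair for a raw node-postgres connector. Both stores read and write the same three tables. The migrations themselves are not part of this diff, so what follows is only a sketch of a plausible schema inferred from the insert and select statements; the column types and the unpinned vector dimension are assumptions, not the package's actual DDL.

// Hypothetical schema inferred from the queries in this diff -- not the
// package's real migration. Assumes Postgres with the pgvector extension.
import { Pool } from "pg";

const pool = new Pool({ connectionString: process.env.DATABASE_URL });

await pool.query(`
  create extension if not exists vector;

  create table if not exists documents (
    id uuid primary key,
    source_id text not null,
    content text not null,
    metadata jsonb
  );

  create table if not exists chunks (
    id uuid primary key,
    document_id uuid not null references documents (id),
    source_id text not null,
    idx integer not null,
    content text not null,
    token_count integer not null,
    metadata jsonb
  );

  create table if not exists embeddings (
    chunk_id uuid primary key references chunks (id),
    -- dimension left unpinned in this sketch; real schemas usually fix it,
    -- e.g. vector(1536)
    embedding vector not null,
    embedding_dimension integer not null
  );
`);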
@@ -0,0 +1,3 @@
+ export { createPrismaVectorStore } from "./store";
+
+
@@ -0,0 +1,133 @@
+ import type { Chunk, VectorStore } from "../../core/types";
+ import type { PrismaClient } from "@prisma/client";
+ import { empty, sqltag as sql } from "@prisma/client/runtime/library";
+
+ const sanitizeMetadata = (metadata: unknown) => {
+   if (metadata === undefined) return null;
+   try {
+     return JSON.parse(JSON.stringify(metadata));
+   } catch {
+     return null;
+   }
+ };
+
+ const toVectorLiteral = (embedding: number[]) => `[${embedding.join(",")}]`;
+
+ export const createPrismaVectorStore = (prisma: PrismaClient): VectorStore => ({
+   upsert: async (chunkItems) => {
+     if (chunkItems.length === 0) return;
+
+     const head = chunkItems[0]!;
+     const documentMetadata = sanitizeMetadata(head.metadata);
+
+     await prisma.$transaction(async (tx: { $executeRaw: (query: unknown) => Promise<unknown> }) => {
+       await tx.$executeRaw(
+         sql`
+           insert into documents (id, source_id, content, metadata)
+           values (${head.documentId}::uuid, ${head.sourceId}, ${head.documentContent ?? ""}, ${
+             JSON.stringify(documentMetadata)
+           }::jsonb)
+           on conflict (id) do update set
+             source_id = excluded.source_id,
+             content = excluded.content,
+             metadata = excluded.metadata
+         `
+       );
+
+       for (const chunk of chunkItems) {
+         const chunkMetadata = sanitizeMetadata(chunk.metadata);
+
+         await tx.$executeRaw(
+           sql`
+             insert into chunks (id, document_id, source_id, idx, content, token_count, metadata)
+             values (
+               ${chunk.id}::uuid,
+               ${chunk.documentId}::uuid,
+               ${chunk.sourceId},
+               ${chunk.index},
+               ${chunk.content},
+               ${chunk.tokenCount},
+               ${JSON.stringify(chunkMetadata)}::jsonb
+             )
+             on conflict (id) do update set
+               document_id = excluded.document_id,
+               source_id = excluded.source_id,
+               idx = excluded.idx,
+               content = excluded.content,
+               token_count = excluded.token_count,
+               metadata = excluded.metadata
+           `
+         );
+
+         if (!chunk.embedding) continue;
+         const embeddingLiteral = toVectorLiteral(chunk.embedding);
+
+         await tx.$executeRaw(
+           sql`
+             insert into embeddings (chunk_id, embedding, embedding_dimension)
+             values (${chunk.id}::uuid, ${embeddingLiteral}::vector, ${
+               chunk.embedding.length
+             })
+             on conflict (chunk_id) do update set
+               embedding = excluded.embedding,
+               embedding_dimension = excluded.embedding_dimension
+           `
+         );
+       }
+     });
+   },
+
+   query: async ({ embedding, topK, scope = {} }) => {
+     type QueryRow = {
+       id: string;
+       document_id: string;
+       source_id: string;
+       idx: number;
+       content: string;
+       token_count: number;
+       metadata: unknown;
+       score: number;
+     };
+
+     const vectorLiteral = toVectorLiteral(embedding);
+
+     const whereSql = scope.sourceId
+       ? // Interpret scope.sourceId as a prefix so callers can namespace content
+         // (e.g. `tenant:acme:`) without needing separate tables.
+         sql`where c.source_id like ${scope.sourceId + "%"}`
+       : empty;
+
+     const rows = (await prisma.$queryRaw(
+       sql`
+         select
+           c.id,
+           c.document_id,
+           c.source_id,
+           c.idx,
+           c.content,
+           c.token_count,
+           c.metadata,
+           (e.embedding <=> ${vectorLiteral}::vector) as score
+         from chunks as c
+         join embeddings as e on e.chunk_id = c.id
+         join documents as d on d.id = c.document_id
+         ${whereSql}
+         order by score asc
+         limit ${topK}
+       `
+     )) as QueryRow[];
+
+     return rows.map((row: QueryRow) => ({
+       id: String(row.id),
+       documentId: String(row.document_id),
+       sourceId: String(row.source_id),
+       index: Number(row.idx),
+       content: String(row.content),
+       tokenCount: Number(row.token_count),
+       metadata: (row.metadata ?? {}) as Chunk["metadata"],
+       score: Number(row.score),
+     }));
+   },
+ });
+
+
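For orientation, here is a minimal usage sketch of the Prisma-backed store. The chunk fields mirror what the store reads above; the import path from the package root and the tiny 3-dimensional embedding are assumptions for illustration only.

// Usage sketch -- assumes a generated PrismaClient and the pgvector tables in place.
import { randomUUID } from "node:crypto";
import { PrismaClient } from "@prisma/client";
import { createPrismaVectorStore } from "unrag"; // import path is an assumption

const prisma = new PrismaClient();
const store = createPrismaVectorStore(prisma);
const documentId = randomUUID();

// Upsert one document carrying a single embedded chunk.
await store.upsert([
  {
    id: randomUUID(),
    documentId,
    sourceId: "tenant:acme:docs/intro",
    index: 0,
    content: "Hello, pgvector.",
    tokenCount: 3,
    metadata: { lang: "en" },
    documentContent: "Hello, pgvector.",
    embedding: [0.1, 0.2, 0.3], // toy vector; real embeddings are model-sized
  },
]);

// scope.sourceId is a prefix filter, so "tenant:acme:" matches every source
// under that namespace. Score is pgvector cosine distance (lower = closer).
const results = await store.query({
  embedding: [0.1, 0.2, 0.3],
  topK: 5,
  scope: { sourceId: "tenant:acme:" },
});

Note that the document, chunk, and embedding writes all run inside one prisma.$transaction, so a document and its chunks land atomically.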
@@ -0,0 +1,3 @@
+ export { createRawSqlVectorStore } from "./store";
+
+
@@ -0,0 +1,154 @@
+ import type { Chunk, VectorStore } from "../../core/types";
+ import type { Pool, PoolClient } from "pg";
+
+ const sanitizeMetadata = (metadata: unknown) => {
+   if (metadata === undefined) return null;
+   try {
+     return JSON.parse(JSON.stringify(metadata));
+   } catch {
+     return null;
+   }
+ };
+
+ const toVectorLiteral = (embedding: number[]) => `[${embedding.join(",")}]`;
+
+ const withTx = async <T>(
+   pool: Pool,
+   fn: (client: PoolClient) => Promise<T>
+ ): Promise<T> => {
+   const client = await pool.connect();
+   try {
+     await client.query("begin");
+     const result = await fn(client);
+     await client.query("commit");
+     return result;
+   } catch (err) {
+     try {
+       await client.query("rollback");
+     } catch {
+       // ignore rollback errors
+     }
+     throw err;
+   } finally {
+     client.release();
+   }
+ };
+
+ export const createRawSqlVectorStore = (pool: Pool): VectorStore => ({
+   upsert: async (chunkItems) => {
+     if (chunkItems.length === 0) return;
+
+     await withTx(pool, async (client) => {
+       const head = chunkItems[0]!;
+       const documentMetadata = sanitizeMetadata(head.metadata);
+
+       await client.query(
+         `
+           insert into documents (id, source_id, content, metadata)
+           values ($1, $2, $3, $4::jsonb)
+           on conflict (id) do update set
+             source_id = excluded.source_id,
+             content = excluded.content,
+             metadata = excluded.metadata
+         `,
+         [
+           head.documentId,
+           head.sourceId,
+           head.documentContent ?? "",
+           JSON.stringify(documentMetadata),
+         ]
+       );
+
+       for (const chunk of chunkItems) {
+         const chunkMetadata = sanitizeMetadata(chunk.metadata);
+
+         await client.query(
+           `
+             insert into chunks (id, document_id, source_id, idx, content, token_count, metadata)
+             values ($1, $2, $3, $4, $5, $6, $7::jsonb)
+             on conflict (id) do update set
+               document_id = excluded.document_id,
+               source_id = excluded.source_id,
+               idx = excluded.idx,
+               content = excluded.content,
+               token_count = excluded.token_count,
+               metadata = excluded.metadata
+           `,
+           [
+             chunk.id,
+             chunk.documentId,
+             chunk.sourceId,
+             chunk.index,
+             chunk.content,
+             chunk.tokenCount,
+             JSON.stringify(chunkMetadata),
+           ]
+         );
+
+         if (!chunk.embedding) continue;
+
+         const embeddingLiteral = toVectorLiteral(chunk.embedding);
+         await client.query(
+           `
+             insert into embeddings (chunk_id, embedding, embedding_dimension)
+             values ($1, $2::vector, $3)
+             on conflict (chunk_id) do update set
+               embedding = excluded.embedding,
+               embedding_dimension = excluded.embedding_dimension
+           `,
+           [chunk.id, embeddingLiteral, chunk.embedding.length]
+         );
+       }
+     });
+   },
+
+   query: async ({ embedding, topK, scope = {} }) => {
+     const vectorLiteral = toVectorLiteral(embedding);
+
+     const values: unknown[] = [vectorLiteral, topK];
+     const where: string[] = [];
+
+     if (scope.sourceId) {
+       // Interpret scope.sourceId as a prefix so callers can namespace content
+       // (e.g. `tenant:acme:`) without needing separate tables.
+       values.push(scope.sourceId + "%");
+       where.push(`c.source_id like $${values.length}`);
+     }
+
+     const whereSql = where.length ? `where ${where.join(" and ")}` : "";
+
+     const res = await pool.query(
+       `
+         select
+           c.id,
+           c.document_id,
+           c.source_id,
+           c.idx,
+           c.content,
+           c.token_count,
+           c.metadata,
+           (e.embedding <=> $1::vector) as score
+         from chunks as c
+         join embeddings as e on e.chunk_id = c.id
+         join documents as d on d.id = c.document_id
+         ${whereSql}
+         order by score asc
+         limit $2
+       `,
+       values
+     );
+
+     return res.rows.map((row) => ({
+       id: String(row.id),
+       documentId: String(row.document_id),
+       sourceId: String(row.source_id),
+       index: Number(row.idx),
+       content: String(row.content),
+       tokenCount: Number(row.token_count),
+       metadata: (row.metadata ?? {}) as Chunk["metadata"],
+       score: Number(row.score),
+     }));
+   },
+ });
+
+
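The raw-SQL store exposes the same surface. A minimal sketch, assuming DATABASE_URL points at a Postgres database with pgvector and the tables shown earlier (the import path is again an assumption):

// Usage sketch for the node-postgres-backed store.
import { Pool } from "pg";
import { createRawSqlVectorStore } from "unrag"; // import path is an assumption

const pool = new Pool({ connectionString: process.env.DATABASE_URL });
const store = createRawSqlVectorStore(pool);

// An empty scope searches across all sources.
const results = await store.query({
  embedding: Array.from({ length: 1536 }, () => Math.random()), // placeholder; use a real model embedding
  topK: 10,
  scope: {},
});

await pool.end();

One design note grounded in the code above: upsert wraps all writes in withTx, so a failed embedding insert rolls back the document and chunk rows from the same batch rather than leaving them half-written.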