vectra-js 0.9.5 → 0.9.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.github/workflows/npm-publish.yml +10 -27
- package/README.md +2 -1
- package/bin/vectra.js +0 -0
- package/examples/chromadb.js +96 -0
- package/examples/pg-prisma.js +119 -0
- package/examples/postgress.js +115 -0
- package/package.json +13 -7
- package/src/backends/gemini.js +15 -8
- package/src/backends/postgres_store.js +191 -0
- package/src/core.js +2 -0
- package/src/processor.js +31 -1
package/.github/workflows/npm-publish.yml
CHANGED

@@ -1,42 +1,25 @@
-name: Publish
+name: Publish Package

 on:
   push:
     branches:
       - master

+permissions:
+  id-token: write
+  contents: read
+
 jobs:
   publish:
     runs-on: ubuntu-latest
     environment: Build
-
-    permissions:
-      contents: read
-
     steps:
       - uses: actions/checkout@v4

       - uses: actions/setup-node@v4
         with:
-          node-version:
-          registry-url: https://registry.npmjs.org
-
-      -
-
-          version: 9
-
-      - name: Install dependencies
-        run: pnpm install --frozen-lockfile
-
-      - name: Configure npm auth
-        run: |
-          printf "//registry.npmjs.org/:_authToken=%s" "$NODE_AUTH_TOKEN" > "$NPM_CONFIG_USERCONFIG"
-        env:
-          NODE_AUTH_TOKEN: ${{ secrets.NPM_KEY }}
-
-
-      - name: Verify auth (CI check)
-        run: npm whoami
-
-      - name: Publish to npm
-        run: pnpm publish --no-git-checks --access public
+          node-version: '24'
+          registry-url: 'https://registry.npmjs.org'
+      - run: npm install
+      - run: npm run build --if-present
+      - run: npm publish
package/README.md
CHANGED

@@ -6,7 +6,8 @@ A production-ready, provider-agnostic Node.js SDK for End-to-End RAG (Retrieval-

 * **Multi-Provider Support**: First-class support for **OpenAI**, **Gemini**, **Anthropic**, **OpenRouter**, and **Hugging Face**.
 * **Modular Vector Store**:
-  * **Prisma**: Use your existing PostgreSQL database with `pgvector
+  * **Prisma**: Use your existing PostgreSQL database with `pgvector` (via Prisma).
+  * **Native PostgreSQL**: Direct connection to PostgreSQL using `pg` driver (no ORM required).
   * **ChromaDB**: Native support for the open-source vector database.
   * **Qdrant & Milvus**: Additional backends for portability.
   * **Extensible**: Easily add others by extending the `VectorStore` class.
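The new native PostgreSQL backend referenced here (added as examples/postgress.js and src/backends/postgres_store.js later in this diff) is selected by setting `database.type` to `'postgres'` and passing a `pg` Pool (or Client) as `clientInstance`. A minimal sketch, assuming a reachable Postgres with the pgvector extension and `DATABASE_URL`/`GEMINI_KEY` in the environment; the table and column names are illustrative, taken from the example files in this diff:

```js
// Minimal usage sketch based on examples/postgress.js in this diff.
const { Pool } = require('pg');
const { VectraClient, ProviderType, RetrievalStrategy } = require('vectra-js');

const pool = new Pool({ connectionString: process.env.DATABASE_URL });

const client = new VectraClient({
  embedding: { provider: ProviderType.GEMINI, apiKey: process.env.GEMINI_KEY, modelName: 'gemini-embedding-001', dimensions: 1536 },
  llm: { provider: ProviderType.GEMINI, apiKey: process.env.GEMINI_KEY, modelName: 'gemini-2.5-flash-lite' },
  database: {
    type: 'postgres',                 // routes to the new PostgresVectorStore
    tableName: 'Document',            // illustrative table name
    clientInstance: pool,             // pg.Pool or pg.Client
    columnMap: { content: 'content', vector: 'embedding', metadata: 'metadata' },
  },
  retrieval: { strategy: RetrievalStrategy.HYBRID },
});

// await client.vectorStore.ensureIndexes();          // creates extension/table/index if missing
// const { answer } = await client.queryRAG('What is RAG?');
```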
package/bin/vectra.js
CHANGED

File without changes
package/examples/chromadb.js
ADDED

@@ -0,0 +1,96 @@
+const path = require('path');
+require('dotenv').config({ path: path.join(__dirname, '.env') });
+const { ChromaClient } = require('chromadb');
+const { VectraClient, ProviderType, ChunkingStrategy, RetrievalStrategy } = require('../vectra-js');
+const { LoggingCallbackHandler } = require('../vectra-js/src/callbacks');
+
+async function runSimulation() {
+  console.log('=== Starting Vectra SDK Simulation (Node.js) ===\n');
+
+  const chroma = new ChromaClient({
+    ssl: false,
+    host: "localhost",
+    port: 8000,
+    headers: {},
+  });
+
+  const config = {
+    embedding: {
+      provider: ProviderType.GEMINI,
+      apiKey: process.env.GEMINI_KEY,
+      modelName: 'gemini-embedding-001',
+      dimensions: 1536,
+    },
+    llm: {
+      provider: ProviderType.GEMINI,
+      apiKey: process.env.GEMINI_KEY,
+      modelName: 'gemini-2.5-flash-lite',
+    },
+    chunking: {
+      strategy: ChunkingStrategy.RECURSIVE,
+      chunkSize: 500,
+      chunkOverlap: 200,
+    },
+    database: {
+      type: 'chroma',
+      tableName: 'rag_collection',
+      clientInstance: chroma,
+      columnMap: { content: 'content', vector: 'embedding', metadata: 'metadata' },
+    },
+    retrieval: {
+      strategy: RetrievalStrategy.HYBRID,
+    },
+    reranking: {
+      enabled: true,
+      topN: 5,
+      windowSize: 20,
+      llmConfig: {
+        provider: ProviderType.GEMINI,
+        apiKey: process.env.GEMINI_KEY,
+        modelName: 'gemini-2.5-flash-lite',
+      }
+    },
+    observability: {
+      enabled: true,
+      projectId: "node-test-project",
+      sqlitePath: path.resolve(__dirname, "db/node-observability.db")
+    },
+    callbacks: [
+      new LoggingCallbackHandler(),
+      { onEmbeddingStart: (c) => console.info(`[RAG] Embedding ${c} chunks...`) }
+    ],
+  };
+
+  console.log('Initializing Client...');
+  const client = new VectraClient(config);
+  if (config.database.type === 'prisma' && client.vectorStore.ensureIndexes) {
+    await client.vectorStore.ensureIndexes();
+  }
+  await client.ingestDocuments('data/llm-ebook-part1-1.pdf');
+
+  console.log('\n--- Step 1: Standard Query (Hybrid) ---\n');
+  try {
+    const result = await client.queryRAG('What is LLM?');
+    console.log('Answer:', result.answer);
+  } catch (error) {
+    console.error('Query failed:', error);
+  }
+
+  console.log('\n--- Step 2: Streaming Query ---\n');
+  try {
+    const stream = await client.queryRAG('Tell me more about LLM...', null, true);
+    process.stdout.write('Stream Output: ');
+    for await (const chunk of stream) {
+      if (typeof chunk === 'string') {
+        process.stdout.write(chunk);
+      } else if (chunk && chunk.delta) {
+        process.stdout.write(chunk.delta);
+      }
+    }
+    process.stdout.write('\n');
+  } catch (error) {
+    console.error('Streaming failed:', error);
+  }
+}
+
+runSimulation();
package/examples/pg-prisma.js
ADDED

@@ -0,0 +1,119 @@
+const path = require('path');
+require('dotenv').config({ path: path.join(__dirname, '.env') });
+const { PrismaClient } = require('@prisma/client');
+const { PrismaPg } = require('@prisma/adapter-pg');
+const { Pool } = require('pg');
+const { VectraClient, ProviderType, ChunkingStrategy, RetrievalStrategy } = require('../vectra-js');
+const { LoggingCallbackHandler } = require('../vectra-js/src/callbacks');
+
+async function runSimulation() {
+  console.log('=== Starting Vectra SDK Simulation (Node.js + Prisma) ===\n');
+
+  const connectionString = process.env.DATABASE_URL;
+  const pool = new Pool({ connectionString });
+  const adapter = new PrismaPg(pool);
+  const prisma = new PrismaClient({ adapter });
+
+  const config = {
+    embedding: {
+      provider: ProviderType.GEMINI,
+      apiKey: process.env.GEMINI_KEY,
+      modelName: 'gemini-embedding-001',
+      dimensions: 1536,
+    },
+    llm: {
+      provider: ProviderType.GEMINI,
+      apiKey: process.env.GEMINI_KEY,
+      modelName: 'gemini-2.5-flash-lite',
+    },
+    chunking: {
+      strategy: ChunkingStrategy.RECURSIVE,
+      chunkSize: 1000,
+      chunkOverlap: 200,
+    },
+    database: {
+      type: 'prisma',
+      tableName: 'Document',
+      clientInstance: prisma,
+      columnMap: { content: 'content', vector: 'embedding', metadata: 'metadata' },
+    },
+    retrieval: {
+      strategy: RetrievalStrategy.HYBRID,
+    },
+    reranking: {
+      enabled: true,
+      topN: 5,
+      windowSize: 20,
+      llmConfig: {
+        provider: ProviderType.GEMINI,
+        apiKey: process.env.GEMINI_KEY,
+        modelName: 'gemini-2.5-flash-lite',
+      }
+    },
+    observability: {
+      enabled: true,
+      projectId: "node-test-project",
+      sqlitePath: path.resolve(__dirname, "db/node-observability.db")
+    },
+    callbacks: [
+      new LoggingCallbackHandler(),
+      { onEmbeddingStart: (c) => console.info(`[RAG] Embedding ${c} chunks...`) }
+    ],
+  };
+
+  console.log('Initializing Client...');
+  const client = new VectraClient(config);
+
+  if (config.database.type === 'prisma' && client.vectorStore.ensureIndexes) {
+    try {
+      await client.vectorStore.ensureIndexes();
+      console.log('Database indexes ensured.');
+    } catch (e) {
+      console.warn('Index creation warning (may already exist):', e.message);
+    }
+  }
+
+  // Clean up table for simulation using Prisma
+  try {
+    // Note: Prisma doesn't support deleteMany on tables with unsupported types (like vector) easily in all versions
+    // or sometimes we need to use executeRaw.
+    // Since Document model has unsupported field, standard deleteMany might work but let's check.
+    // However, it is safer to use raw query if standard model usage is limited.
+    await prisma.$executeRawUnsafe(`DELETE FROM "Document"`);
+    console.log('Cleared existing documents from table.');
+  } catch (e) {
+    console.warn('Could not clear table:', e.message);
+  }
+
+  await client.ingestDocuments('data/sample.txt');
+
+  console.log('\n--- Step 1: Standard Query (Hybrid) ---\n');
+  try {
+    const result = await client.queryRAG('What is RAG?');
+    console.log('Answer:', result.answer);
+  } catch (error) {
+    console.error('Query failed:', error);
+  }
+
+  console.log('\n--- Step 2: Streaming Query ---\n');
+  try {
+    const stream = await client.queryRAG('Tell me more...', null, true);
+    process.stdout.write('Stream Output: ');
+    for await (const chunk of stream) {
+      if (typeof chunk === 'string') {
+        process.stdout.write(chunk);
+      } else if (chunk && chunk.delta) {
+        process.stdout.write(chunk.delta);
+      }
+    }
+    process.stdout.write('\n');
+  } catch (error) {
+    console.error('Streaming failed:', error);
+  }
+
+  // Clean up
+  await prisma.$disconnect();
+  await pool.end();
+}
+
+runSimulation();
package/examples/postgress.js
ADDED

@@ -0,0 +1,115 @@
+const path = require('path');
+require('dotenv').config({ path: path.join(__dirname, '.env') });
+const { PrismaClient } = require('@prisma/client');
+const { PrismaPg } = require('@prisma/adapter-pg');
+const { Pool } = require('pg');
+const { VectraClient, ProviderType, ChunkingStrategy, RetrievalStrategy } = require('../vectra-js');
+const { LoggingCallbackHandler } = require('../vectra-js/src/callbacks');
+
+async function runSimulation() {
+  console.log('=== Starting Vectra SDK Simulation (Node.js + Prisma) ===\n');
+
+  const connectionString = process.env.DATABASE_URL;
+  const pool = new Pool({ connectionString });
+  // const adapter = new PrismaPg({ pool });
+  // const prisma = new PrismaClient({ adapter });
+
+  const config = {
+    embedding: {
+      provider: ProviderType.GEMINI,
+      apiKey: process.env.GEMINI_KEY,
+      modelName: 'gemini-embedding-001',
+      dimensions: 1536,
+    },
+    llm: {
+      provider: ProviderType.GEMINI,
+      apiKey: process.env.GEMINI_KEY,
+      modelName: 'gemini-2.5-flash-lite',
+    },
+    chunking: {
+      strategy: ChunkingStrategy.RECURSIVE,
+      chunkSize: 1000,
+      chunkOverlap: 200,
+    },
+    database: {
+      type: 'postgres',
+      tableName: 'Document',
+      clientInstance: pool,
+      columnMap: { content: 'content', vector: 'embedding', metadata: 'metadata' },
+    },
+    retrieval: {
+      strategy: RetrievalStrategy.HYBRID,
+    },
+    reranking: {
+      enabled: true,
+      topN: 5,
+      windowSize: 20,
+      llmConfig: {
+        provider: ProviderType.GEMINI,
+        apiKey: process.env.GEMINI_KEY,
+        modelName: 'gemini-2.5-flash-lite',
+      }
+    },
+    observability: {
+      enabled: true,
+      projectId: "node-test-project",
+      sqlitePath: path.resolve(__dirname, "db/node-observability.db")
+    },
+    callbacks: [
+      new LoggingCallbackHandler(),
+      { onEmbeddingStart: (c) => console.info(`[RAG] Embedding ${c} chunks...`) }
+    ],
+  };
+
+  console.log('Initializing Client...');
+  const client = new VectraClient(config);
+
+  if ((config.database.type === 'prisma' || config.database.type === 'postgres') && client.vectorStore.ensureIndexes) {
+    try {
+      await client.vectorStore.ensureIndexes();
+      console.log('Database indexes ensured.');
+    } catch (e) {
+      console.warn('Index creation warning (may already exist):', e.message);
+    }
+  }
+
+  // Clean up table for simulation
+  if (config.database.type === 'postgres') {
+    try {
+      await pool.query(`DELETE FROM "${config.database.tableName}"`); // Use quoted identifier
+      console.log('Cleared existing documents from table.');
+    } catch (e) {
+      console.warn('Could not clear table:', e.message);
+    }
+  }
+
+  await client.ingestDocuments('data/sample.txt');
+
+  console.log('\n--- Step 1: Standard Query (Hybrid) ---\n');
+  try {
+    const result = await client.queryRAG('What is RAG?');
+    console.log('Answer:', result.answer);
+  } catch (error) {
+    console.error('Query failed:', error);
+  }
+
+  console.log('\n--- Step 2: Streaming Query ---\n');
+  try {
+    const stream = await client.queryRAG('Tell me more...', null, true);
+    process.stdout.write('Stream Output: ');
+    for await (const chunk of stream) {
+      if (typeof chunk === 'string') {
+        process.stdout.write(chunk);
+      } else if (chunk && chunk.delta) {
+        process.stdout.write(chunk.delta);
+      }
+    }
+    process.stdout.write('\n');
+  } catch (error) {
+    console.error('Streaming failed:', error);
+  }
+
+  await pool.end();
+}
+
+runSimulation();
package/package.json
CHANGED

@@ -1,8 +1,14 @@
 {
   "name": "vectra-js",
-  "version": "0.9.5",
+  "version": "0.9.7",
   "description": "A production-ready, provider-agnostic Node.js SDK for End-to-End RAG pipelines.",
   "main": "index.js",
+  "scripts": {
+    "test": "echo \"Error: no test specified\" && exit 1",
+    "prisma:generate": "prisma generate",
+    "lint": "eslint . --ext .js,.cjs,.mjs",
+    "lint:fix": "eslint . --ext .js,.cjs,.mjs --fix"
+  },
   "bin": {
     "vectra": "bin/vectra.js"
   },

@@ -34,6 +40,7 @@
     "mammoth": "^1.11.0",
     "openai": "^6.15.0",
     "pdf-parse": "^2.4.5",
+    "pg": "^8.16.3",
     "sqlite3": "^5.1.7",
     "uuid": "^9.0.1",
     "xlsx": "^0.18.5",

@@ -47,10 +54,9 @@
     "globals": "^16.5.0",
     "prisma": "^7.2.0"
   },
-    "
-    "
-
-
-    "lint:fix": "eslint . --ext .js,.cjs,.mjs --fix"
+  "pnpm": {
+    "onlyBuiltDependencies": [
+      "sqlite3"
+    ]
   }
-}
+}
package/src/backends/gemini.js
CHANGED

@@ -19,14 +19,21 @@ class GeminiBackend {
   }

   async embedDocuments(texts) {
-    const
-
-
-
-
-
-
-
+    const BATCH_SIZE = 100;
+    const allEmbeddings = [];
+
+    for (let i = 0; i < texts.length; i += BATCH_SIZE) {
+      const batch = texts.slice(i, i + BATCH_SIZE);
+      const res = await this._retry(() => this.client.models.embedContent({
+        model: this.config.modelName,
+        contents: batch,
+        config: { outputDimensionality: this.config.dimensions }
+      }));
+      const out = res?.embeddings || res?.data?.embeddings;
+      if (!out || !Array.isArray(out)) throw new Error('Gemini embedding response missing embeddings');
+      allEmbeddings.push(...out.map(e => e.values || e.embedding?.values || e));
+    }
+    return allEmbeddings;
   }
   async embedQuery(text) {
     const res = await this._retry(() => this.client.models.embedContent({
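The rewritten `embedDocuments` above batches its inputs (100 texts per `embedContent` call) and flattens the per-batch results, so callers still receive exactly one vector per input text. A standalone sketch of that batching pattern, with `embed` as a placeholder for any async per-batch embedding call (not a vectra-js API):

```js
// Generic batching helper mirroring the loop above; `embed(batch)` is assumed
// to return one vector per text in `batch`.
async function embedInBatches(texts, embed, batchSize = 100) {
  const all = [];
  for (let i = 0; i < texts.length; i += batchSize) {
    const batch = texts.slice(i, i + batchSize);
    all.push(...(await embed(batch)));
  }
  return all;
}
```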
package/src/backends/postgres_store.js
ADDED

@@ -0,0 +1,191 @@
+const { v4: uuidv4 } = require('uuid');
+const { VectorStore } = require('../interfaces');
+
+const isSafeIdentifier = (value) => typeof value === 'string' && /^[A-Za-z_][A-Za-z0-9_]*$/.test(value);
+const assertSafeIdentifier = (value, label) => {
+  if (!isSafeIdentifier(value)) throw new Error(`Unsafe SQL identifier for ${label}`);
+};
+const quoteIdentifier = (value, label) => {
+  assertSafeIdentifier(value, label);
+  return `"${value}"`;
+};
+const quoteTableName = (value, label) => {
+  if (typeof value !== 'string' || value.trim().length === 0) throw new Error(`Unsafe SQL identifier for ${label}`);
+  const parts = value.split('.').map(p => p.trim()).filter(Boolean);
+  if (parts.length === 0 || parts.length > 2) throw new Error(`Unsafe SQL identifier for ${label}`);
+  parts.forEach((p, i) => assertSafeIdentifier(p, i === 0 && parts.length === 2 ? `${label} schema` : `${label} table`));
+  return parts.map(p => `"${p}"`).join('.');
+};
+
+class PostgresVectorStore extends VectorStore {
+  constructor(config) {
+    super();
+    this.config = config;
+    const tableName = config.tableName || 'document';
+    const columnMap = config.columnMap || {};
+    this._table = quoteTableName(tableName, 'tableName');
+    this._cContent = quoteIdentifier(columnMap.content || 'content', 'columnMap.content');
+    this._cMeta = quoteIdentifier(columnMap.metadata || 'metadata', 'columnMap.metadata');
+    this._cVec = quoteIdentifier(columnMap.vector || 'vector', 'columnMap.vector');
+
+    // We expect config.clientInstance to be a pg.Pool or pg.Client
+    if (!this.config.clientInstance) {
+      throw new Error('PostgresVectorStore requires a clientInstance (pg.Pool or pg.Client)');
+    }
+    this.client = this.config.clientInstance;
+  }
+
+  normalizeVector(v) {
+    const m = Math.sqrt(v.reduce((s, x) => s + x * x, 0));
+    return m === 0 ? v : v.map(x => x / m);
+  }
+
+  // Helper to ensure table and extension exist
+  async ensureIndexes() {
+    // Enable pgvector extension
+    await this.client.query('CREATE EXTENSION IF NOT EXISTS vector');
+
+    // Create table if not exists
+    // Note: We need to know vector dimensions. We'll try to guess or use default 1536
+    // If embedding dimensions are provided in config, use them
+    // But store config usually doesn't have embedding config directly unless passed down
+    // For now we will assume the user creates the table or we default to 1536 (OpenAI)
+    // A better approach is to rely on user schema, but for convenience:
+    const dim = 1536; // Default to OpenAI dimension if unknown.
+    // However, if the table exists, we don't change it.
+
+    const createTableQuery = `
+      CREATE TABLE IF NOT EXISTS ${this._table} (
+        "id" TEXT PRIMARY KEY,
+        ${this._cContent} TEXT,
+        ${this._cMeta} JSONB,
+        ${this._cVec} vector(${dim}),
+        "createdAt" TIMESTAMP WITH TIME ZONE DEFAULT NOW()
+      )
+    `;
+    await this.client.query(createTableQuery);
+
+    // Create HNSW index for faster search
+    // checking if index exists is hard in raw sql cross-version,
+    // simpler to CREATE INDEX IF NOT EXISTS which pg supports in recent versions
+    // or catch error
+    try {
+      await this.client.query(`CREATE INDEX IF NOT EXISTS "${this._table.replace(/"/g, '')}_vec_idx" ON ${this._table} USING hnsw (${this._cVec} vector_cosine_ops)`);
+    } catch (e) {
+      console.warn('Could not create vector index (might be fine if not supported):', e.message);
+    }
+  }
+
+  async addDocuments(docs) {
+    const q = `INSERT INTO ${this._table} ("id", ${this._cContent}, ${this._cMeta}, ${this._cVec}, "createdAt") VALUES ($1, $2, $3, $4, NOW())`;
+
+    for (const doc of docs) {
+      const id = doc.id || uuidv4();
+      const vec = `[${this.normalizeVector(doc.embedding).join(',')}]`; // pgvector format
+      try {
+        await this.client.query(q, [id, doc.content, doc.metadata, vec]);
+      } catch (e) {
+        const msg = e?.message || String(e);
+        if (msg.includes('vector') && msg.includes('dimension')) {
+          throw new Error('DimensionMismatchError: Embedding dimension does not match pgvector column.');
+        }
+        throw e;
+      }
+    }
+  }
+
+  async upsertDocuments(docs) {
+    const q = `
+      INSERT INTO ${this._table} ("id", ${this._cContent}, ${this._cMeta}, ${this._cVec}, "createdAt")
+      VALUES ($1, $2, $3, $4, NOW())
+      ON CONFLICT ("id")
+      DO UPDATE SET
+        ${this._cContent} = EXCLUDED.${this._cContent},
+        ${this._cMeta} = EXCLUDED.${this._cMeta},
+        ${this._cVec} = EXCLUDED.${this._cVec}
+    `;
+
+    for (const doc of docs) {
+      const id = doc.id || uuidv4();
+      const vec = `[${this.normalizeVector(doc.embedding).join(',')}]`;
+      await this.client.query(q, [id, doc.content, doc.metadata, vec]);
+    }
+  }
+
+  async similaritySearch(vector, limit = 5, filter = null) {
+    const vec = `[${this.normalizeVector(vector).join(',')}]`;
+    let where = "";
+    const params = [vec];
+
+    if (filter) {
+      where = `WHERE ${this._cMeta} @> $2`;
+      params.push(filter);
+    }
+
+    const limitIdx = params.length + 1;
+    // <=> is cosine distance. 1 - distance = similarity (roughly)
+    const q = `
+      SELECT ${this._cContent} as content, ${this._cMeta} as metadata, 1 - (${this._cVec} <=> $1) as score
+      FROM ${this._table}
+      ${where}
+      ORDER BY ${this._cVec} <=> $1 ASC
+      LIMIT $${limitIdx}
+    `;
+    params.push(Math.max(1, Number(limit) || 5));
+
+    const res = await this.client.query(q, params);
+    return res.rows.map(r => ({ content: r.content, metadata: r.metadata, score: r.score }));
+  }
+
+  async hybridSearch(text, vector, limit = 5, filter = null) {
+    // 1. Semantic search
+    const semantic = await this.similaritySearch(vector, limit * 2, filter);
+
+    // 2. Keyword search using to_tsvector
+    // We assume english config 'simple' or 'english'
+    const params = [text];
+    let where = "";
+    if (filter) {
+      where = `AND ${this._cMeta} @> $2`;
+      params.push(filter);
+    }
+    const limitIdx = params.length + 1;
+
+    const q = `
+      SELECT ${this._cContent} as content, ${this._cMeta} as metadata
+      FROM ${this._table}
+      WHERE to_tsvector('english', ${this._cContent}) @@ plainto_tsquery('english', $1)
+      ${where}
+      LIMIT $${limitIdx}
+    `;
+    params.push(Math.max(1, Number(limit) || 5) * 2);
+
+    let lexical = [];
+    try {
+      const res = await this.client.query(q, params);
+      lexical = res.rows.map(r => ({ content: r.content, metadata: r.metadata, score: 1.0 }));
+    } catch (e) {
+      console.warn("Keyword search failed (maybe missing indexes):", e.message);
+      lexical = [];
+    }
+
+    // 3. Reciprocal Rank Fusion
+    const combined = {};
+    const add = (list, weight = 1) => {
+      list.forEach((doc, idx) => {
+        const key = doc.content; // Use content as key if id not returned, ideally use id
+        // But doc structure returned by similaritySearch might not have id unless we select it
+        // existing implementations use content as key often in simple RRF
+        const score = 1 / (60 + idx + 1) * weight;
+        if (!combined[key]) combined[key] = { ...doc, score: 0 };
+        combined[key].score += score;
+      });
+    };
+    add(semantic, 1);
+    add(lexical, 1);
+
+    return Object.values(combined).sort((a, b) => b.score - a.score).slice(0, limit);
+  }
+}
+
+module.exports = { PostgresVectorStore };
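The `hybridSearch` method above merges the semantic and keyword result lists with Reciprocal Rank Fusion: each document contributes `1 / (k + rank + 1)` per list it appears in, with `k = 60` and a 0-based rank, and the summed scores decide the final ordering. A standalone sketch of that fusion step, keyed by content exactly as the store does:

```js
// Reciprocal Rank Fusion as used in hybridSearch above (k = 60, content as key).
function rrfMerge(resultLists, k = 60, limit = 5) {
  const combined = {};
  for (const list of resultLists) {
    list.forEach((doc, rank) => {
      const key = doc.content;
      if (!combined[key]) combined[key] = { ...doc, score: 0 };
      combined[key].score += 1 / (k + rank + 1);
    });
  }
  return Object.values(combined).sort((a, b) => b.score - a.score).slice(0, limit);
}
```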
package/src/core.js
CHANGED

@@ -10,6 +10,7 @@ const { OpenRouterBackend } = require('./backends/openrouter');
 const { HuggingFaceBackend } = require('./backends/huggingface');
 const { PrismaVectorStore } = require('./backends/prisma_store');
 const { ChromaVectorStore } = require('./backends/chroma_store');
+const { PostgresVectorStore } = require('./backends/postgres_store');
 const { QdrantVectorStore } = require('./backends/qdrant_store');
 const { MilvusVectorStore } = require('./backends/milvus_store');
 const { LLMReranker } = require('./reranker');

@@ -98,6 +99,7 @@ class VectraClient {
     if (!dbConfig || !dbConfig.type) throw new Error('Database config missing type');
     const t = dbConfig.type.toLowerCase();
     if (t === 'prisma') return new PrismaVectorStore(dbConfig);
+    if (t === 'postgres') return new PostgresVectorStore(dbConfig);
     if (t === 'chroma') return new ChromaVectorStore(dbConfig);
     if (t === 'qdrant') return new QdrantVectorStore(dbConfig);
     if (t === 'milvus') return new MilvusVectorStore(dbConfig);
package/src/processor.js
CHANGED

@@ -16,8 +16,38 @@ class DocumentProcessor {
     const ext = path.extname(filePath).toLowerCase();
     const buffer = await fs.promises.readFile(filePath);
     if (ext === '.pdf') {
+      let PDFParse = pdf.PDFParse;
+      if (!PDFParse && pdf.default && pdf.default.PDFParse) {
+        PDFParse = pdf.default.PDFParse;
+      }
+
+      if (PDFParse) {
+        // Handle pdf-parse v2
+        const parser = new PDFParse({ data: buffer });
+        const info = await parser.getInfo();
+        const total = info.total;
+        const pages = [];
+        let fullText = '';
+
+        for (let i = 1; i <= total; i++) {
+          const pageRes = await parser.getText({ partial: [i] });
+          const pageText = pageRes.text || '';
+          pages.push(pageText);
+          fullText += pageText + '\n';
+        }
+        await parser.destroy();
+        this._lastPages = pages;
+        return fullText;
+      }
+
+      // Fallback for v1 (or if PDFParse class not found)
+      let pdfFunc = pdf;
+      if (typeof pdfFunc !== 'function' && pdfFunc.default) {
+        pdfFunc = pdfFunc.default;
+      }
+
       const pages = [];
-      const res = await
+      const res = await pdfFunc(buffer, {
         pagerender: pageData => pageData.getTextContent().then(tc => {
           const s = tc.items.map(it => it.str).join(' ');
           pages.push(s);