prescient 0.0.0 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.env.example +37 -0
- data/.rubocop.yml +326 -0
- data/Dockerfile.example +41 -0
- data/README.md +859 -13
- data/Rakefile +25 -3
- data/VECTOR_SEARCH_GUIDE.md +450 -0
- data/db/init/01_enable_pgvector.sql +30 -0
- data/db/init/02_create_schema.sql +108 -0
- data/db/init/03_create_indexes.sql +96 -0
- data/db/init/04_insert_sample_data.sql +121 -0
- data/db/migrate/001_create_prescient_tables.rb +158 -0
- data/docker-compose.yml +153 -0
- data/examples/basic_usage.rb +123 -0
- data/examples/custom_contexts.rb +355 -0
- data/examples/custom_prompts.rb +212 -0
- data/examples/vector_search.rb +330 -0
- data/lib/prescient/base.rb +270 -0
- data/lib/prescient/client.rb +107 -0
- data/lib/prescient/provider/anthropic.rb +146 -0
- data/lib/prescient/provider/huggingface.rb +202 -0
- data/lib/prescient/provider/ollama.rb +172 -0
- data/lib/prescient/provider/openai.rb +181 -0
- data/lib/prescient/version.rb +1 -1
- data/lib/prescient.rb +84 -2
- data/prescient.gemspec +51 -0
- data/scripts/setup-ollama-models.sh +77 -0
- metadata +215 -12
- data/.vscode/settings.json +0 -1
data/Rakefile
CHANGED
@@ -1,8 +1,30 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
require "bundler/gem_tasks"
|
4
|
-
require "
|
4
|
+
require "rake/testtask"
|
5
|
+
require "rubocop/rake_task"
|
5
6
|
|
6
|
-
|
7
|
+
Rake::TestTask.new(:test) do |t|
|
8
|
+
t.libs << "test"
|
9
|
+
t.libs << "lib"
|
10
|
+
t.test_files = FileList["test/**/*_test.rb"]
|
11
|
+
end
|
7
12
|
|
8
|
-
|
13
|
+
RuboCop::RakeTask.new
|
14
|
+
|
15
|
+
desc "Run tests and linting"
|
16
|
+
task default: %w[test rubocop]
|
17
|
+
|
18
|
+
desc "Run tests with coverage"
|
19
|
+
task :coverage do
|
20
|
+
ENV['COVERAGE'] = 'true'
|
21
|
+
Rake::Task[:test].execute
|
22
|
+
end
|
23
|
+
|
24
|
+
desc "Console with gem loaded"
|
25
|
+
task :console do
|
26
|
+
require "bundler/setup"
|
27
|
+
require "prescient"
|
28
|
+
require "irb"
|
29
|
+
IRB.start
|
30
|
+
end
|
@@ -0,0 +1,450 @@
|
|
1
|
+
# Vector Search with Prescient and pgvector
|
2
|
+
|
3
|
+
This guide provides a comprehensive overview of using Prescient with PostgreSQL's pgvector extension for semantic search and similarity matching.
|
4
|
+
|
5
|
+
## Quick Start
|
6
|
+
|
7
|
+
### 1. Start Services
|
8
|
+
|
9
|
+
```bash
|
10
|
+
# Start PostgreSQL with pgvector and Ollama
|
11
|
+
docker-compose up -d postgres ollama
|
12
|
+
|
13
|
+
# Wait for services to be ready
|
14
|
+
docker-compose logs -f postgres ollama
|
15
|
+
```
|
16
|
+
|
17
|
+
### 2. Initialize Models
|
18
|
+
|
19
|
+
```bash
|
20
|
+
# Pull required Ollama models
|
21
|
+
docker-compose up ollama-init
|
22
|
+
|
23
|
+
# Or manually:
|
24
|
+
./scripts/setup-ollama-models.sh
|
25
|
+
```
|
26
|
+
|
27
|
+
### 3. Run Vector Search Example
|
28
|
+
|
29
|
+
```bash
|
30
|
+
# Set environment variables
|
31
|
+
export DB_HOST=localhost
|
32
|
+
export OLLAMA_URL=http://localhost:11434
|
33
|
+
|
34
|
+
# Run the example
|
35
|
+
ruby examples/vector_search.rb
|
36
|
+
```
|
37
|
+
|
38
|
+
## Architecture Overview
|
39
|
+
|
40
|
+
### Database Schema
|
41
|
+
|
42
|
+
```
|
43
|
+
documents
|
44
|
+
├── id (Primary Key)
|
45
|
+
├── title
|
46
|
+
├── content
|
47
|
+
├── source_type
|
48
|
+
├── source_url
|
49
|
+
├── metadata (JSONB)
|
50
|
+
└── timestamps
|
51
|
+
|
52
|
+
document_embeddings
|
53
|
+
├── id (Primary Key)
|
54
|
+
├── document_id (Foreign Key)
|
55
|
+
├── embedding_provider
|
56
|
+
├── embedding_model
|
57
|
+
├── embedding_dimensions
|
58
|
+
├── embedding (VECTOR)
|
59
|
+
├── embedding_text
|
60
|
+
└── timestamps
|
61
|
+
|
62
|
+
document_chunks
|
63
|
+
├── id (Primary Key)
|
64
|
+
├── document_id (Foreign Key)
|
65
|
+
├── chunk_index
|
66
|
+
├── chunk_text
|
67
|
+
├── chunk_metadata (JSONB)
|
68
|
+
└── timestamps
|
69
|
+
|
70
|
+
chunk_embeddings
|
71
|
+
├── id (Primary Key)
|
72
|
+
├── chunk_id (Foreign Key)
|
73
|
+
├── document_id (Foreign Key)
|
74
|
+
├── embedding_provider
|
75
|
+
├── embedding_model
|
76
|
+
├── embedding_dimensions
|
77
|
+
├── embedding (VECTOR)
|
78
|
+
└── timestamps
|
79
|
+
```
|
80
|
+
|
81
|
+
### Vector Indexes
|
82
|
+
|
83
|
+
The setup automatically creates HNSW indexes for optimal performance:
|
84
|
+
|
85
|
+
- **Cosine Distance**: `embedding <=> query_vector`
|
86
|
+
- **L2 Distance**: `embedding <-> query_vector`
|
87
|
+
- **Inner Product**: `embedding <#> query_vector`
|
88
|
+
|
89
|
+
## Common Workflows
|
90
|
+
|
91
|
+
### 1. Document Ingestion
|
92
|
+
|
93
|
+
```ruby
|
94
|
+
require 'prescient'
|
95
|
+
require 'pg'
|
96
|
+
|
97
|
+
# Connect to database
|
98
|
+
db = PG.connect(
|
99
|
+
host: 'localhost',
|
100
|
+
dbname: 'prescient_development',
|
101
|
+
user: 'prescient',
|
102
|
+
password: 'prescient_password'
|
103
|
+
)
|
104
|
+
|
105
|
+
client = Prescient.client(:ollama)
|
106
|
+
|
107
|
+
# Insert document
|
108
|
+
doc_result = db.exec_params(
|
109
|
+
"INSERT INTO documents (title, content, source_type, metadata) VALUES ($1, $2, $3, $4) RETURNING id",
|
110
|
+
[title, content, 'article', metadata.to_json]
|
111
|
+
)
|
112
|
+
document_id = doc_result[0]['id']
|
113
|
+
|
114
|
+
# Generate and store embedding
|
115
|
+
embedding = client.generate_embedding(content)
|
116
|
+
vector_str = "[#{embedding.join(',')}]"
|
117
|
+
|
118
|
+
db.exec_params(
|
119
|
+
"INSERT INTO document_embeddings (document_id, embedding_provider, embedding_model, embedding_dimensions, embedding, embedding_text) VALUES ($1, $2, $3, $4, $5, $6)",
|
120
|
+
[document_id, 'ollama', 'nomic-embed-text', 768, vector_str, content]
|
121
|
+
)
|
122
|
+
```
|
123
|
+
|
124
|
+
### 2. Similarity Search
|
125
|
+
|
126
|
+
```ruby
|
127
|
+
# Basic similarity search
|
128
|
+
query_text = "machine learning algorithms"
|
129
|
+
query_embedding = client.generate_embedding(query_text)
|
130
|
+
query_vector = "[#{query_embedding.join(',')}]"
|
131
|
+
|
132
|
+
results = db.exec_params(
|
133
|
+
"SELECT d.title, d.content, de.embedding <=> $1::vector AS distance
|
134
|
+
FROM documents d
|
135
|
+
JOIN document_embeddings de ON d.id = de.document_id
|
136
|
+
ORDER BY de.embedding <=> $1::vector
|
137
|
+
LIMIT 5",
|
138
|
+
[query_vector]
|
139
|
+
)
|
140
|
+
|
141
|
+
results.each do |row|
|
142
|
+
similarity = 1 - row['distance'].to_f
|
143
|
+
puts "#{row['title']} (#{(similarity * 100).round(1)}% similar)"
|
144
|
+
end
|
145
|
+
```
|
146
|
+
|
147
|
+
### 3. Filtered Search
|
148
|
+
|
149
|
+
```ruby
|
150
|
+
# Search with metadata filtering
|
151
|
+
results = db.exec_params(
|
152
|
+
"SELECT d.title, de.embedding <=> $1::vector as distance
|
153
|
+
FROM documents d
|
154
|
+
JOIN document_embeddings de ON d.id = de.document_id
|
155
|
+
WHERE d.metadata->'tags' ? 'programming'
|
156
|
+
AND d.metadata->>'difficulty' = 'beginner'
|
157
|
+
ORDER BY de.embedding <=> $1::vector
|
158
|
+
LIMIT 10",
|
159
|
+
[query_vector]
|
160
|
+
)
|
161
|
+
```
|
162
|
+
|
163
|
+
### 4. Document Chunking
|
164
|
+
|
165
|
+
For large documents, split into chunks for better search granularity:
|
166
|
+
|
167
|
+
```ruby
|
168
|
+
def chunk_document(text, chunk_size: 500, overlap: 50)
|
169
|
+
chunks = []
|
170
|
+
start = 0
|
171
|
+
|
172
|
+
while start < text.length
|
173
|
+
end_pos = [start + chunk_size, text.length].min
|
174
|
+
|
175
|
+
# Find word boundary to avoid cutting words
|
176
|
+
if end_pos < text.length
|
177
|
+
while end_pos > start && text[end_pos] != ' '
|
178
|
+
end_pos -= 1
|
179
|
+
end
|
180
|
+
end
|
181
|
+
|
182
|
+
chunk = text[start...end_pos].strip
|
183
|
+
chunks << {
|
184
|
+
text: chunk,
|
185
|
+
start_pos: start,
|
186
|
+
end_pos: end_pos,
|
187
|
+
index: chunks.length
|
188
|
+
}
|
189
|
+
|
190
|
+
start = end_pos - overlap
|
191
|
+
break if start >= text.length
|
192
|
+
end
|
193
|
+
|
194
|
+
chunks
|
195
|
+
end
|
196
|
+
|
197
|
+
# Process chunks
|
198
|
+
chunks = chunk_document(document.content)
|
199
|
+
chunks.each do |chunk|
|
200
|
+
# Insert chunk
|
201
|
+
chunk_result = db.exec_params(
|
202
|
+
"INSERT INTO document_chunks (document_id, chunk_index, chunk_text, chunk_metadata) VALUES ($1, $2, $3, $4) RETURNING id",
|
203
|
+
[document_id, chunk[:index], chunk[:text], {start_pos: chunk[:start_pos], end_pos: chunk[:end_pos]}.to_json]
|
204
|
+
)
|
205
|
+
chunk_id = chunk_result[0]['id']
|
206
|
+
|
207
|
+
# Generate embedding for chunk
|
208
|
+
chunk_embedding = client.generate_embedding(chunk[:text])
|
209
|
+
chunk_vector = "[#{chunk_embedding.join(',')}]"
|
210
|
+
|
211
|
+
# Store chunk embedding
|
212
|
+
db.exec_params(
|
213
|
+
"INSERT INTO chunk_embeddings (chunk_id, document_id, embedding_provider, embedding_model, embedding_dimensions, embedding) VALUES ($1, $2, $3, $4, $5, $6)",
|
214
|
+
[chunk_id, document_id, 'ollama', 'nomic-embed-text', 768, chunk_vector]
|
215
|
+
)
|
216
|
+
end
|
217
|
+
```
|
218
|
+
|
219
|
+
## Performance Optimization
|
220
|
+
|
221
|
+
### Index Tuning
|
222
|
+
|
223
|
+
For different dataset sizes and performance requirements:
|
224
|
+
|
225
|
+
```sql
|
226
|
+
-- Small datasets (< 100K vectors): Fast build, good accuracy
|
227
|
+
CREATE INDEX idx_embeddings_small
|
228
|
+
ON document_embeddings
|
229
|
+
USING hnsw (embedding vector_cosine_ops)
|
230
|
+
WITH (m = 8, ef_construction = 32);
|
231
|
+
|
232
|
+
-- Medium datasets (100K - 1M vectors): Balanced
|
233
|
+
CREATE INDEX idx_embeddings_medium
|
234
|
+
ON document_embeddings
|
235
|
+
USING hnsw (embedding vector_cosine_ops)
|
236
|
+
WITH (m = 16, ef_construction = 64);
|
237
|
+
|
238
|
+
-- Large datasets (> 1M vectors): High accuracy
|
239
|
+
CREATE INDEX idx_embeddings_large
|
240
|
+
ON document_embeddings
|
241
|
+
USING hnsw (embedding vector_cosine_ops)
|
242
|
+
WITH (m = 32, ef_construction = 128);
|
243
|
+
```
|
244
|
+
|
245
|
+
### Query Optimization
|
246
|
+
|
247
|
+
```sql
|
248
|
+
-- Adjust search quality vs speed
|
249
|
+
SET hnsw.ef_search = 40;  -- Default: fast search, lower accuracy
|
250
|
+
SET hnsw.ef_search = 100; -- Balanced accuracy vs. speed
|
251
|
+
SET hnsw.ef_search = 200; -- High accuracy, slower
|
252
|
+
|
253
|
+
-- Monitor query performance
|
254
|
+
EXPLAIN (ANALYZE, BUFFERS)
|
255
|
+
SELECT * FROM document_embeddings
|
256
|
+
ORDER BY embedding <=> '[0.1,0.2,...]'::vector
|
257
|
+
LIMIT 10;
|
258
|
+
```
|
259
|
+
|
260
|
+
### Batch Operations
|
261
|
+
|
262
|
+
```ruby
|
263
|
+
# Batch embed multiple texts for efficiency
|
264
|
+
texts = documents.map(&:content)
|
265
|
+
embeddings = []
|
266
|
+
|
267
|
+
texts.each_slice(10) do |batch|
|
268
|
+
batch.each do |text|
|
269
|
+
embedding = client.generate_embedding(text)
|
270
|
+
embeddings << embedding
|
271
|
+
|
272
|
+
# Small delay to avoid rate limiting
|
273
|
+
sleep(0.1)
|
274
|
+
end
|
275
|
+
end
|
276
|
+
|
277
|
+
# Batch insert embeddings
|
278
|
+
db.transaction do
|
279
|
+
embeddings.each_with_index do |embedding, index|
|
280
|
+
vector_str = "[#{embedding.join(',')}]"
|
281
|
+
db.exec_params(
|
282
|
+
"INSERT INTO document_embeddings (...) VALUES (...)",
|
283
|
+
[documents[index].id, 'ollama', 'nomic-embed-text', 768, vector_str, texts[index]]
|
284
|
+
)
|
285
|
+
end
|
286
|
+
end
|
287
|
+
```
|
288
|
+
|
289
|
+
## Advanced Features
|
290
|
+
|
291
|
+
### Hybrid Search
|
292
|
+
|
293
|
+
Combine vector similarity with traditional text search:
|
294
|
+
|
295
|
+
```sql
|
296
|
+
WITH vector_results AS (
|
297
|
+
SELECT document_id, embedding <=> $1::vector as distance
|
298
|
+
FROM document_embeddings
|
299
|
+
ORDER BY embedding <=> $1::vector
|
300
|
+
LIMIT 20
|
301
|
+
),
|
302
|
+
text_results AS (
|
303
|
+
SELECT id as document_id, ts_rank(to_tsvector(content), plainto_tsquery($2)) as rank
|
304
|
+
FROM documents
|
305
|
+
WHERE to_tsvector(content) @@ plainto_tsquery($2)
|
306
|
+
)
|
307
|
+
SELECT d.title, d.content,
|
308
|
+
COALESCE(vr.distance, 1.0) as vector_distance,
|
309
|
+
COALESCE(tr.rank, 0.0) as text_rank,
|
310
|
+
(COALESCE(1 - vr.distance, 0) * 0.7 + COALESCE(tr.rank, 0) * 0.3) as combined_score
|
311
|
+
FROM documents d
|
312
|
+
LEFT JOIN vector_results vr ON d.id = vr.document_id
|
313
|
+
LEFT JOIN text_results tr ON d.id = tr.document_id
|
314
|
+
WHERE vr.document_id IS NOT NULL OR tr.document_id IS NOT NULL
|
315
|
+
ORDER BY combined_score DESC
|
316
|
+
LIMIT 10;
|
317
|
+
```
|
318
|
+
|
319
|
+
### Multi-Model Embeddings
|
320
|
+
|
321
|
+
Store embeddings from multiple providers for comparison:
|
322
|
+
|
323
|
+
```ruby
|
324
|
+
providers = [
|
325
|
+
{ client: Prescient.client(:ollama), name: 'ollama', model: 'nomic-embed-text', dims: 768 },
|
326
|
+
{ client: Prescient.client(:openai), name: 'openai', model: 'text-embedding-3-small', dims: 1536 }
|
327
|
+
]
|
328
|
+
|
329
|
+
providers.each do |provider|
|
330
|
+
next unless provider[:client].available?
|
331
|
+
|
332
|
+
embedding = provider[:client].generate_embedding(text)
|
333
|
+
vector_str = "[#{embedding.join(',')}]"
|
334
|
+
|
335
|
+
db.exec_params(
|
336
|
+
"INSERT INTO document_embeddings (document_id, embedding_provider, embedding_model, embedding_dimensions, embedding, embedding_text) VALUES ($1, $2, $3, $4, $5, $6)",
|
337
|
+
[document_id, provider[:name], provider[:model], provider[:dims], vector_str, text]
|
338
|
+
)
|
339
|
+
end
|
340
|
+
```
|
341
|
+
|
342
|
+
## Monitoring and Analytics
|
343
|
+
|
344
|
+
### Search Performance Tracking
|
345
|
+
|
346
|
+
```ruby
|
347
|
+
# Track search queries and results
|
348
|
+
def track_search(query_text, results, provider, model)
|
349
|
+
query_embedding = client.generate_embedding(query_text)
|
350
|
+
query_vector = "[#{query_embedding.join(',')}]"
|
351
|
+
|
352
|
+
# Insert search query
|
353
|
+
query_result = db.exec_params(
|
354
|
+
"INSERT INTO search_queries (query_text, embedding_provider, embedding_model, query_embedding, result_count) VALUES ($1, $2, $3, $4, $5) RETURNING id",
|
355
|
+
[query_text, provider, model, query_vector, results.length]
|
356
|
+
)
|
357
|
+
query_id = query_result[0]['id']
|
358
|
+
|
359
|
+
# Insert query results
|
360
|
+
results.each_with_index do |result, index|
|
361
|
+
db.exec_params(
|
362
|
+
"INSERT INTO query_results (query_id, document_id, similarity_score, rank_position) VALUES ($1, $2, $3, $4)",
|
363
|
+
[query_id, result['document_id'], result['similarity_score'], index + 1]
|
364
|
+
)
|
365
|
+
end
|
366
|
+
end
|
367
|
+
```
|
368
|
+
|
369
|
+
### Analytics Queries
|
370
|
+
|
371
|
+
```sql
|
372
|
+
-- Popular search terms
|
373
|
+
SELECT query_text, COUNT(*) as search_count
|
374
|
+
FROM search_queries
|
375
|
+
WHERE created_at > NOW() - INTERVAL '7 days'
|
376
|
+
GROUP BY query_text
|
377
|
+
ORDER BY search_count DESC
|
378
|
+
LIMIT 10;
|
379
|
+
|
380
|
+
-- Average similarity scores
|
381
|
+
SELECT embedding_provider, embedding_model,
|
382
|
+
AVG(similarity_score) as avg_similarity,
|
383
|
+
COUNT(*) as result_count
|
384
|
+
FROM query_results qr
|
385
|
+
JOIN search_queries sq ON qr.query_id = sq.id
|
386
|
+
GROUP BY embedding_provider, embedding_model;
|
387
|
+
|
388
|
+
-- Search performance over time
|
389
|
+
SELECT DATE_TRUNC('hour', created_at) as hour,
|
390
|
+
COUNT(*) as searches,
|
391
|
+
AVG(result_count) as avg_results
|
392
|
+
FROM search_queries
|
393
|
+
WHERE created_at > NOW() - INTERVAL '24 hours'
|
394
|
+
GROUP BY hour
|
395
|
+
ORDER BY hour;
|
396
|
+
```
|
397
|
+
|
398
|
+
## Troubleshooting
|
399
|
+
|
400
|
+
### Common Issues
|
401
|
+
|
402
|
+
**Slow queries:**
|
403
|
+
```sql
|
404
|
+
-- Check if indexes are being used
|
405
|
+
EXPLAIN (ANALYZE, BUFFERS)
|
406
|
+
SELECT * FROM document_embeddings
|
407
|
+
ORDER BY embedding <=> '[...]'::vector
|
408
|
+
LIMIT 10;
|
409
|
+
|
410
|
+
-- Rebuild indexes if needed
|
411
|
+
REINDEX INDEX idx_document_embeddings_cosine;
|
412
|
+
```
|
413
|
+
|
414
|
+
**Memory issues:**
|
415
|
+
```sql
|
416
|
+
-- Check index sizes
|
417
|
+
SELECT schemaname, tablename, indexname, pg_size_pretty(pg_relation_size(indexrelid)) as size
|
418
|
+
FROM pg_stat_user_indexes
|
419
|
+
WHERE tablename LIKE '%embedding%'
|
420
|
+
ORDER BY pg_relation_size(indexrelid) DESC;
|
421
|
+
|
422
|
+
-- Adjust work_mem for index building
|
423
|
+
SET work_mem = '256MB';
|
424
|
+
```
|
425
|
+
|
426
|
+
**Dimension mismatches:**
|
427
|
+
```ruby
|
428
|
+
# Validate embedding dimensions before storing
|
429
|
+
expected_dims = 768
|
430
|
+
if embedding.length != expected_dims
|
431
|
+
raise "Expected #{expected_dims} dimensions, got #{embedding.length}"
|
432
|
+
end
|
433
|
+
```
|
434
|
+
|
435
|
+
## Best Practices
|
436
|
+
|
437
|
+
1. **Choose appropriate chunk sizes** based on your content and use case
|
438
|
+
2. **Monitor query performance** and adjust indexes as needed
|
439
|
+
3. **Use metadata filtering** to improve search relevance
|
440
|
+
4. **Implement caching** for frequently accessed embeddings
|
441
|
+
5. **Regular maintenance** of vector indexes for optimal performance
|
442
|
+
6. **Test different distance functions** to find what works best for your data
|
443
|
+
7. **Consider hybrid search** combining vector and text search for better results
|
444
|
+
|
445
|
+
## Resources
|
446
|
+
|
447
|
+
- [pgvector Documentation](https://github.com/pgvector/pgvector)
|
448
|
+
- [HNSW Algorithm](https://arxiv.org/abs/1603.09320)
|
449
|
+
- [Vector Database Concepts](https://www.pinecone.io/learn/vector-database/)
|
450
|
+
- [Embedding Best Practices](https://platform.openai.com/docs/guides/embeddings/what-are-embeddings)
|
@@ -0,0 +1,30 @@
|
|
1
|
+
-- Enable pgvector extension for vector operations
|
2
|
+
-- This script runs automatically when the PostgreSQL container starts
|
3
|
+
|
4
|
+
-- Enable the pgvector extension
|
5
|
+
CREATE EXTENSION IF NOT EXISTS vector;
|
6
|
+
|
7
|
+
-- Verify the extension is loaded
|
8
|
+
SELECT * FROM pg_extension WHERE extname = 'vector';
|
9
|
+
|
10
|
+
-- Create a custom vector function for cosine similarity (if needed)
|
11
|
+
CREATE OR REPLACE FUNCTION cosine_similarity(a vector, b vector)
|
12
|
+
RETURNS float AS $$
|
13
|
+
BEGIN
|
14
|
+
RETURN 1 - (a <=> b);
|
15
|
+
END;
|
16
|
+
$$ LANGUAGE plpgsql IMMUTABLE STRICT PARALLEL SAFE;
|
17
|
+
|
18
|
+
-- Create a custom function for euclidean distance (if needed)
|
19
|
+
CREATE OR REPLACE FUNCTION euclidean_distance(a vector, b vector)
|
20
|
+
RETURNS float AS $$
|
21
|
+
BEGIN
|
22
|
+
RETURN a <-> b;
|
23
|
+
END;
|
24
|
+
$$ LANGUAGE plpgsql IMMUTABLE STRICT PARALLEL SAFE;
|
25
|
+
|
26
|
+
-- Log successful initialization
|
27
|
+
DO $$
|
28
|
+
BEGIN
|
29
|
+
RAISE NOTICE 'pgvector extension enabled successfully';
|
30
|
+
END $$;
|
@@ -0,0 +1,108 @@
|
|
1
|
+
-- Create database schema for storing embeddings and documents
|
2
|
+
-- This demonstrates a typical setup for vector similarity search
|
3
|
+
|
4
|
+
-- Documents table to store original content
|
5
|
+
CREATE TABLE IF NOT EXISTS documents (
|
6
|
+
id SERIAL PRIMARY KEY,
|
7
|
+
title VARCHAR(255) NOT NULL,
|
8
|
+
content TEXT NOT NULL,
|
9
|
+
source_type VARCHAR(50), -- e.g., 'pdf', 'webpage', 'text', 'api'
|
10
|
+
source_url VARCHAR(500),
|
11
|
+
metadata JSONB, -- Additional flexible metadata
|
12
|
+
created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
|
13
|
+
updated_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP
|
14
|
+
);
|
15
|
+
|
16
|
+
-- Document embeddings table for vector search
|
17
|
+
CREATE TABLE IF NOT EXISTS document_embeddings (
|
18
|
+
id SERIAL PRIMARY KEY,
|
19
|
+
document_id INTEGER NOT NULL REFERENCES documents(id) ON DELETE CASCADE,
|
20
|
+
embedding_provider VARCHAR(50) NOT NULL, -- e.g., 'ollama', 'openai', 'huggingface'
|
21
|
+
embedding_model VARCHAR(100) NOT NULL, -- e.g., 'nomic-embed-text', 'text-embedding-3-small'
|
22
|
+
embedding_dimensions INTEGER NOT NULL, -- e.g., 768, 1536, 384
|
23
|
+
embedding VECTOR NOT NULL, -- The actual vector embedding
|
24
|
+
embedding_text TEXT, -- The specific text that was embedded (may be subset of document)
|
25
|
+
created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP
|
26
|
+
);
|
27
|
+
|
28
|
+
-- Document chunks table for large documents split into smaller pieces
|
29
|
+
CREATE TABLE IF NOT EXISTS document_chunks (
|
30
|
+
id SERIAL PRIMARY KEY,
|
31
|
+
document_id INTEGER NOT NULL REFERENCES documents(id) ON DELETE CASCADE,
|
32
|
+
chunk_index INTEGER NOT NULL, -- Order of chunks within document
|
33
|
+
chunk_text TEXT NOT NULL,
|
34
|
+
chunk_metadata JSONB, -- Start/end positions, etc.
|
35
|
+
created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
|
36
|
+
|
37
|
+
UNIQUE(document_id, chunk_index)
|
38
|
+
);
|
39
|
+
|
40
|
+
-- Chunk embeddings table for chunked document search
|
41
|
+
CREATE TABLE IF NOT EXISTS chunk_embeddings (
|
42
|
+
id SERIAL PRIMARY KEY,
|
43
|
+
chunk_id INTEGER NOT NULL REFERENCES document_chunks(id) ON DELETE CASCADE,
|
44
|
+
document_id INTEGER NOT NULL REFERENCES documents(id) ON DELETE CASCADE,
|
45
|
+
embedding_provider VARCHAR(50) NOT NULL,
|
46
|
+
embedding_model VARCHAR(100) NOT NULL,
|
47
|
+
embedding_dimensions INTEGER NOT NULL,
|
48
|
+
embedding VECTOR NOT NULL,
|
49
|
+
created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP
|
50
|
+
);
|
51
|
+
|
52
|
+
-- Search queries table to store user queries and results
|
53
|
+
CREATE TABLE IF NOT EXISTS search_queries (
|
54
|
+
id SERIAL PRIMARY KEY,
|
55
|
+
query_text TEXT NOT NULL,
|
56
|
+
embedding_provider VARCHAR(50) NOT NULL,
|
57
|
+
embedding_model VARCHAR(100) NOT NULL,
|
58
|
+
query_embedding VECTOR,
|
59
|
+
result_count INTEGER,
|
60
|
+
search_metadata JSONB, -- Search parameters, filters, etc.
|
61
|
+
created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP
|
62
|
+
);
|
63
|
+
|
64
|
+
-- Query results table to store search results for analysis
|
65
|
+
CREATE TABLE IF NOT EXISTS query_results (
|
66
|
+
id SERIAL PRIMARY KEY,
|
67
|
+
query_id INTEGER NOT NULL REFERENCES search_queries(id) ON DELETE CASCADE,
|
68
|
+
document_id INTEGER REFERENCES documents(id) ON DELETE CASCADE,
|
69
|
+
chunk_id INTEGER REFERENCES document_chunks(id) ON DELETE CASCADE,
|
70
|
+
similarity_score FLOAT NOT NULL,
|
71
|
+
rank_position INTEGER NOT NULL,
|
72
|
+
created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP
|
73
|
+
);
|
74
|
+
|
75
|
+
-- Add indexes for better performance
|
76
|
+
CREATE INDEX IF NOT EXISTS idx_documents_source_type ON documents(source_type);
|
77
|
+
CREATE INDEX IF NOT EXISTS idx_documents_created_at ON documents(created_at);
|
78
|
+
CREATE INDEX IF NOT EXISTS idx_documents_metadata ON documents USING GIN(metadata);
|
79
|
+
|
80
|
+
CREATE INDEX IF NOT EXISTS idx_document_embeddings_document_id ON document_embeddings(document_id);
|
81
|
+
CREATE INDEX IF NOT EXISTS idx_document_embeddings_provider_model ON document_embeddings(embedding_provider, embedding_model);
|
82
|
+
CREATE INDEX IF NOT EXISTS idx_document_embeddings_dimensions ON document_embeddings(embedding_dimensions);
|
83
|
+
|
84
|
+
CREATE INDEX IF NOT EXISTS idx_chunk_embeddings_chunk_id ON chunk_embeddings(chunk_id);
|
85
|
+
CREATE INDEX IF NOT EXISTS idx_chunk_embeddings_document_id ON chunk_embeddings(document_id);
|
86
|
+
CREATE INDEX IF NOT EXISTS idx_chunk_embeddings_provider_model ON chunk_embeddings(embedding_provider, embedding_model);
|
87
|
+
|
88
|
+
CREATE INDEX IF NOT EXISTS idx_search_queries_created_at ON search_queries(created_at);
|
89
|
+
CREATE INDEX IF NOT EXISTS idx_query_results_query_id ON query_results(query_id);
|
90
|
+
CREATE INDEX IF NOT EXISTS idx_query_results_similarity_score ON query_results(similarity_score);
|
91
|
+
|
92
|
+
-- Add comments for documentation
|
93
|
+
COMMENT ON TABLE documents IS 'Stores original documents and content';
|
94
|
+
COMMENT ON TABLE document_embeddings IS 'Stores vector embeddings for entire documents';
|
95
|
+
COMMENT ON TABLE document_chunks IS 'Stores chunks of large documents for better search granularity';
|
96
|
+
COMMENT ON TABLE chunk_embeddings IS 'Stores vector embeddings for document chunks';
|
97
|
+
COMMENT ON TABLE search_queries IS 'Stores user search queries and their embeddings';
|
98
|
+
COMMENT ON TABLE query_results IS 'Stores search results for analysis and optimization';
|
99
|
+
|
100
|
+
COMMENT ON COLUMN document_embeddings.embedding IS 'Vector embedding generated by AI provider';
|
101
|
+
COMMENT ON COLUMN chunk_embeddings.embedding IS 'Vector embedding for document chunk';
|
102
|
+
COMMENT ON COLUMN search_queries.query_embedding IS 'Vector embedding of the search query';
|
103
|
+
|
104
|
+
-- Log successful schema creation
|
105
|
+
DO $$
|
106
|
+
BEGIN
|
107
|
+
RAISE NOTICE 'Database schema created successfully';
|
108
|
+
END $$;
|