prescient 0.0.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Rakefile CHANGED
@@ -1,8 +1,31 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require "bundler/gem_tasks"
4
- require "rspec/core/rake_task"
4
+ require "rake/testtask"
5
+ require "rubocop/rake_task"
5
6
 
6
- RSpec::Core::RakeTask.new(:spec)
7
+ Rake::TestTask.new(:test) do |t|
8
+ t.libs << "test"
9
+ t.libs << "lib"
10
+ t.test_files = FileList["test/**/*_test.rb"]
11
+ end
7
12
 
8
- task default: :spec
13
+ RuboCop::RakeTask.new
14
+
15
+ desc "Run tests and linting"
16
+ task default: %w[test rubocop]
17
+
18
+ desc "Run tests with coverage"
19
+ task :coverage do
20
+ ENV['COVERAGE'] = 'true'
21
+ Rake::Task[:test].execute
22
+ end
23
+
24
+ desc "Console with gem loaded"
25
+ task :console do
26
+ require "bundler/setup"
27
+ require "prescient"
28
+ require "irb"
29
+ ARGV.clear
30
+ IRB.start
31
+ end
@@ -0,0 +1,453 @@
1
+ # Vector Search with Prescient and pgvector
2
+
3
+ This guide provides a comprehensive overview of using Prescient with PostgreSQL's pgvector extension for semantic search and similarity matching.
4
+
5
+ ## Quick Start
6
+
7
+ ### 1. Start Services
8
+
9
+ ```bash
10
+ # Start PostgreSQL with pgvector and Ollama
11
+ docker-compose up -d postgres ollama
12
+
13
+ # Wait for services to be ready
14
+ docker-compose logs -f postgres ollama
15
+ ```
16
+
17
+ ### 2. Initialize Models
18
+
19
+ ```bash
20
+ # Pull required Ollama models
21
+ docker-compose up ollama-init
22
+
23
+ # Or manually:
24
+ ./scripts/setup-ollama-models.sh
25
+ ```
26
+
27
+ ### 3. Run Vector Search Example
28
+
29
+ ```bash
30
+ # Set environment variables
31
+ export DB_HOST=localhost
32
+ export OLLAMA_URL=http://localhost:11434
33
+
34
+ # Run the example
35
+ ruby examples/vector_search.rb
36
+ ```
37
+
38
+ ## Architecture Overview
39
+
40
+ ### Database Schema
41
+
42
+ ```
43
+ documents
44
+ ├── id (Primary Key)
45
+ ├── title
46
+ ├── content
47
+ ├── source_type
48
+ ├── source_url
49
+ ├── metadata (JSONB)
50
+ └── timestamps
51
+
52
+ document_embeddings
53
+ ├── id (Primary Key)
54
+ ├── document_id (Foreign Key)
55
+ ├── embedding_provider
56
+ ├── embedding_model
57
+ ├── embedding_dimensions
58
+ ├── embedding (VECTOR)
59
+ ├── embedding_text
60
+ └── timestamps
61
+
62
+ document_chunks
63
+ ├── id (Primary Key)
64
+ ├── document_id (Foreign Key)
65
+ ├── chunk_index
66
+ ├── chunk_text
67
+ ├── chunk_metadata (JSONB)
68
+ └── timestamps
69
+
70
+ chunk_embeddings
71
+ ├── id (Primary Key)
72
+ ├── chunk_id (Foreign Key)
73
+ ├── document_id (Foreign Key)
74
+ ├── embedding_provider
75
+ ├── embedding_model
76
+ ├── embedding_dimensions
77
+ ├── embedding (VECTOR)
78
+ └── timestamps
79
+ ```
80
+
81
+ ### Vector Indexes
82
+
83
+ The setup automatically creates HNSW indexes for optimal performance:
84
+
85
+ - **Cosine Distance**: `embedding <=> query_vector`
86
+ - **L2 Distance**: `embedding <-> query_vector`
87
+ - **Inner Product**: `embedding <#> query_vector`
88
+
89
+ ## Common Workflows
90
+
91
+ ### 1. Document Ingestion
92
+
93
+ ```ruby
94
+ require 'prescient'
95
+ require 'pg'
96
+
97
+ # Connect to database
98
+ db = PG.connect(
99
+ host: 'localhost',
100
+ dbname: 'prescient_development',
101
+ user: 'prescient',
102
+ password: 'prescient_password'
103
+ )
104
+
105
+ client = Prescient.client(:ollama)
106
+
107
+ # Insert document
108
+ doc_result = db.exec_params(
109
+ "INSERT INTO documents (title, content, source_type, metadata) VALUES ($1, $2, $3, $4) RETURNING id",
110
+ [title, content, 'article', metadata.to_json]
111
+ )
112
+ document_id = doc_result[0]['id']
113
+
114
+ # Generate and store embedding
115
+ embedding = client.generate_embedding(content)
116
+ vector_str = "[#{embedding.join(',')}]"
117
+
118
+ db.exec_params(
119
+ "INSERT INTO document_embeddings (document_id, embedding_provider, embedding_model, embedding_dimensions, embedding, embedding_text) VALUES ($1, $2, $3, $4, $5, $6)",
120
+ [document_id, 'ollama', 'nomic-embed-text', 768, vector_str, content]
121
+ )
122
+ ```
123
+
124
+ ### 2. Similarity Search
125
+
126
+ ```ruby
127
+ # Basic similarity search
128
+ query_text = "machine learning algorithms"
129
+ query_embedding = client.generate_embedding(query_text)
130
+ query_vector = "[#{query_embedding.join(',')}]"
131
+
132
+ results = db.exec_params(
133
+ "SELECT d.title, d.content, de.embedding <=> $1::vector AS distance
134
+ FROM documents d
135
+ JOIN document_embeddings de ON d.id = de.document_id
136
+ ORDER BY de.embedding <=> $1::vector
137
+ LIMIT 5",
138
+ [query_vector]
139
+ )
140
+
141
+ results.each do |row|
142
+ similarity = 1 - row['distance'].to_f
143
+ puts "#{row['title']} (#{(similarity * 100).round(1)}% similar)"
144
+ end
145
+ ```
146
+
147
+ ### 3. Filtered Search
148
+
149
+ ```ruby
150
+ # Search with metadata filtering
151
+ results = db.exec_params(
152
+ "SELECT d.title, de.embedding <=> $1::vector as distance
153
+ FROM documents d
154
+ JOIN document_embeddings de ON d.id = de.document_id
155
+ WHERE d.metadata->'tags' ? 'programming'
156
+ AND d.metadata->>'difficulty' = 'beginner'
157
+ ORDER BY de.embedding <=> $1::vector
158
+ LIMIT 10",
159
+ [query_vector]
160
+ )
161
+ ```
162
+
163
+ ### 4. Document Chunking
164
+
165
+ For large documents, split into chunks for better search granularity:
166
+
167
+ ```ruby
168
+ def chunk_document(text, chunk_size: 500, overlap: 50)
169
+ chunks = []
170
+ start = 0
171
+
172
+ while start < text.length
173
+ end_pos = [start + chunk_size, text.length].min
174
+
175
+ # Find word boundary to avoid cutting words
176
+ if end_pos < text.length
177
+ while end_pos > start && text[end_pos] != ' '
178
+ end_pos -= 1
179
+ end
180
+ end
181
+
182
+ chunk = text[start...end_pos].strip
183
+ chunks << {
184
+ text: chunk,
185
+ start_pos: start,
186
+ end_pos: end_pos,
187
+ index: chunks.length
188
+ }
189
+
190
+ break if end_pos >= text.length
192
+ start = end_pos - overlap
192
+ end
193
+
194
+ chunks
195
+ end
196
+
197
+ # Process chunks
198
+ chunks = chunk_document(document.content)
199
+ chunks.each do |chunk|
200
+ # Insert chunk
201
+ chunk_result = db.exec_params(
202
+ "INSERT INTO document_chunks (document_id, chunk_index, chunk_text, chunk_metadata) VALUES ($1, $2, $3, $4) RETURNING id",
203
+ [document_id, chunk[:index], chunk[:text], { start_pos: chunk[:start_pos], end_pos: chunk[:end_pos] }.to_json]
204
+ )
205
+ chunk_id = chunk_result[0]['id']
206
+
207
+ # Generate embedding for chunk
208
+ chunk_embedding = client.generate_embedding(chunk[:text])
209
+ chunk_vector = "[#{chunk_embedding.join(',')}]"
210
+
211
+ # Store chunk embedding
212
+ db.exec_params(
213
+ "INSERT INTO chunk_embeddings (chunk_id, document_id, embedding_provider, embedding_model, embedding_dimensions, embedding) VALUES ($1, $2, $3, $4, $5, $6)",
214
+ [chunk_id, document_id, 'ollama', 'nomic-embed-text', 768, chunk_vector]
215
+ )
216
+ end
217
+ ```
218
+
219
+ ## Performance Optimization
220
+
221
+ ### Index Tuning
222
+
223
+ For different dataset sizes and performance requirements:
224
+
225
+ ```sql
226
+ -- Small datasets (< 100K vectors): Fast build, good accuracy
227
+ CREATE INDEX idx_embeddings_small
228
+ ON document_embeddings
229
+ USING hnsw (embedding vector_cosine_ops)
230
+ WITH (m = 8, ef_construction = 32);
231
+
232
+ -- Medium datasets (100K - 1M vectors): Balanced
233
+ CREATE INDEX idx_embeddings_medium
234
+ ON document_embeddings
235
+ USING hnsw (embedding vector_cosine_ops)
236
+ WITH (m = 16, ef_construction = 64);
237
+
238
+ -- Large datasets (> 1M vectors): High accuracy
239
+ CREATE INDEX idx_embeddings_large
240
+ ON document_embeddings
241
+ USING hnsw (embedding vector_cosine_ops)
242
+ WITH (m = 32, ef_construction = 128);
243
+ ```
244
+
245
+ ### Query Optimization
246
+
247
+ ```sql
248
+ -- Adjust search quality vs speed
249
+ SET hnsw.ef_search = 40; -- Fast search, lower accuracy (default)
250
+ SET hnsw.ef_search = 100; -- Balanced
251
+ SET hnsw.ef_search = 200; -- High accuracy, slower
252
+
253
+ -- Monitor query performance
254
+ EXPLAIN (ANALYZE, BUFFERS)
255
+ SELECT * FROM document_embeddings
256
+ ORDER BY embedding <=> '[0.1,0.2,...]'::vector
257
+ LIMIT 10;
258
+ ```
259
+
260
+ ### Batch Operations
261
+
262
+ ```ruby
263
+ # Batch embed multiple texts for efficiency
264
+ texts = documents.map(&:content)
265
+ embeddings = []
266
+
267
+ texts.each_slice(10) do |batch|
268
+ batch.each do |text|
269
+ embedding = client.generate_embedding(text)
270
+ embeddings << embedding
271
+
272
+ # Small delay to avoid rate limiting
273
+ sleep(0.1)
274
+ end
275
+ end
276
+
277
+ # Batch insert embeddings
278
+ db.transaction do
279
+ embeddings.each_with_index do |embedding, index|
280
+ vector_str = "[#{embedding.join(',')}]"
281
+ db.exec_params(
282
+ "INSERT INTO document_embeddings (...) VALUES (...)",
283
+ [documents[index].id, 'ollama', 'nomic-embed-text', 768, vector_str, texts[index]]
284
+ )
285
+ end
286
+ end
287
+ ```
288
+
289
+ ## Advanced Features
290
+
291
+ ### Hybrid Search
292
+
293
+ Combine vector similarity with traditional text search:
294
+
295
+ ```sql
296
+ WITH vector_results AS (
297
+ SELECT document_id, embedding <=> $1::vector as distance
298
+ FROM document_embeddings
299
+ ORDER BY embedding <=> $1::vector
300
+ LIMIT 20
301
+ ),
302
+ text_results AS (
303
+ SELECT id as document_id, ts_rank(to_tsvector(content), plainto_tsquery($2)) as rank
304
+ FROM documents
305
+ WHERE to_tsvector(content) @@ plainto_tsquery($2)
306
+ )
307
+ SELECT d.title, d.content,
308
+ COALESCE(vr.distance, 1.0) as vector_distance,
309
+ COALESCE(tr.rank, 0.0) as text_rank,
310
+ (COALESCE(1 - vr.distance, 0) * 0.7 + COALESCE(tr.rank, 0) * 0.3) as combined_score
311
+ FROM documents d
312
+ LEFT JOIN vector_results vr ON d.id = vr.document_id
313
+ LEFT JOIN text_results tr ON d.id = tr.document_id
314
+ WHERE vr.document_id IS NOT NULL OR tr.document_id IS NOT NULL
315
+ ORDER BY combined_score DESC
316
+ LIMIT 10;
317
+ ```
318
+
319
+ ### Multi-Model Embeddings
320
+
321
+ Store embeddings from multiple providers for comparison:
322
+
323
+ ```ruby
324
+ providers = [
325
+ { client: Prescient.client(:ollama), name: 'ollama', model: 'nomic-embed-text', dims: 768 },
326
+ { client: Prescient.client(:openai), name: 'openai', model: 'text-embedding-3-small', dims: 1536 }
327
+ ]
328
+
329
+ providers.each do |provider|
330
+ next unless provider[:client].available?
331
+
332
+ embedding = provider[:client].generate_embedding(text)
333
+ vector_str = "[#{embedding.join(',')}]"
334
+
335
+ db.exec_params(
336
+ "INSERT INTO document_embeddings (document_id, embedding_provider, embedding_model, embedding_dimensions, embedding, embedding_text) VALUES ($1, $2, $3, $4, $5, $6)",
337
+ [document_id, provider[:name], provider[:model], provider[:dims], vector_str, text]
338
+ )
339
+ end
340
+ ```
341
+
342
+ ## Monitoring and Analytics
343
+
344
+ ### Search Performance Tracking
345
+
346
+ ```ruby
347
+ # Track search queries and results
348
+ def track_search(query_text, results, provider, model)
349
+ query_embedding = client.generate_embedding(query_text)
350
+ query_vector = "[#{query_embedding.join(',')}]"
351
+
352
+ # Insert search query
353
+ query_result = db.exec_params(
354
+ "INSERT INTO search_queries (query_text, embedding_provider, embedding_model, query_embedding, result_count) VALUES ($1, $2, $3, $4, $5) RETURNING id",
355
+ [query_text, provider, model, query_vector, results.length]
356
+ )
357
+ query_id = query_result[0]['id']
358
+
359
+ # Insert query results
360
+ results.each_with_index do |result, index|
361
+ db.exec_params(
362
+ "INSERT INTO query_results (query_id, document_id, similarity_score, rank_position) VALUES ($1, $2, $3, $4)",
363
+ [query_id, result['document_id'], result['similarity_score'], index + 1]
364
+ )
365
+ end
366
+ end
367
+ ```
368
+
369
+ ### Analytics Queries
370
+
371
+ ```sql
372
+ -- Popular search terms
373
+ SELECT query_text, COUNT(*) as search_count
374
+ FROM search_queries
375
+ WHERE created_at > NOW() - INTERVAL '7 days'
376
+ GROUP BY query_text
377
+ ORDER BY search_count DESC
378
+ LIMIT 10;
379
+
380
+ -- Average similarity scores
381
+ SELECT embedding_provider, embedding_model,
382
+ AVG(similarity_score) as avg_similarity,
383
+ COUNT(*) as result_count
384
+ FROM query_results qr
385
+ JOIN search_queries sq ON qr.query_id = sq.id
386
+ GROUP BY embedding_provider, embedding_model;
387
+
388
+ -- Search performance over time
389
+ SELECT DATE_TRUNC('hour', created_at) as hour,
390
+ COUNT(*) as searches,
391
+ AVG(result_count) as avg_results
392
+ FROM search_queries
393
+ WHERE created_at > NOW() - INTERVAL '24 hours'
394
+ GROUP BY hour
395
+ ORDER BY hour;
396
+ ```
397
+
398
+ ## Troubleshooting
399
+
400
+ ### Common Issues
401
+
402
+ **Slow queries:**
403
+
404
+ ```sql
405
+ -- Check if indexes are being used
406
+ EXPLAIN (ANALYZE, BUFFERS)
407
+ SELECT * FROM document_embeddings
408
+ ORDER BY embedding <=> '[...]'::vector
409
+ LIMIT 10;
410
+
411
+ -- Rebuild indexes if needed
412
+ REINDEX INDEX idx_document_embeddings_cosine;
413
+ ```
414
+
415
+ **Memory issues:**
416
+
417
+ ```sql
418
+ -- Check index sizes
419
+ SELECT schemaname, relname, indexrelname, pg_size_pretty(pg_relation_size(indexrelid)) as size
420
+ FROM pg_stat_user_indexes
421
+ WHERE relname LIKE '%embedding%'
422
+ ORDER BY pg_relation_size(indexrelid) DESC;
423
+
424
+ -- Adjust work_mem for index building
425
+ SET work_mem = '256MB';
426
+ ```
427
+
428
+ **Dimension mismatches:**
429
+
430
+ ```ruby
431
+ # Validate embedding dimensions before storing
432
+ expected_dims = 768
433
+ if embedding.length != expected_dims
434
+ raise "Expected #{expected_dims} dimensions, got #{embedding.length}"
435
+ end
436
+ ```
437
+
438
+ ## Best Practices
439
+
440
+ 1. **Choose appropriate chunk sizes** based on your content and use case
441
+ 2. **Monitor query performance** and adjust indexes as needed
442
+ 3. **Use metadata filtering** to improve search relevance
443
+ 4. **Implement caching** for frequently accessed embeddings
444
+ 5. **Regular maintenance** of vector indexes for optimal performance
445
+ 6. **Test different distance functions** to find what works best for your data
446
+ 7. **Consider hybrid search** combining vector and text search for better results
447
+
448
+ ## Resources
449
+
450
+ - [pgvector Documentation](https://github.com/pgvector/pgvector)
451
+ - [HNSW Algorithm](https://arxiv.org/abs/1603.09320)
452
+ - [Vector Database Concepts](https://www.pinecone.io/learn/vector-database/)
453
+ - [Embedding Best Practices](https://platform.openai.com/docs/guides/embeddings/what-are-embeddings)
@@ -0,0 +1,30 @@
1
+ -- Enable pgvector extension for vector operations
2
+ -- This script runs automatically when the PostgreSQL container starts
3
+
4
+ -- Enable the pgvector extension
5
+ CREATE EXTENSION IF NOT EXISTS vector;
6
+
7
+ -- Verify the extension is loaded
8
+ SELECT * FROM pg_extension WHERE extname = 'vector';
9
+
10
+ -- Create a custom vector function for cosine similarity (if needed)
11
+ CREATE OR REPLACE FUNCTION cosine_similarity(a vector, b vector)
12
+ RETURNS float AS $$
13
+ BEGIN
14
+ RETURN 1 - (a <=> b);
15
+ END;
16
+ $$ LANGUAGE plpgsql IMMUTABLE STRICT PARALLEL SAFE;
17
+
18
+ -- Create a custom function for euclidean distance (if needed)
19
+ CREATE OR REPLACE FUNCTION euclidean_distance(a vector, b vector)
20
+ RETURNS float AS $$
21
+ BEGIN
22
+ RETURN a <-> b;
23
+ END;
24
+ $$ LANGUAGE plpgsql IMMUTABLE STRICT PARALLEL SAFE;
25
+
26
+ -- Log successful initialization
27
+ DO $$
28
+ BEGIN
29
+ RAISE NOTICE 'pgvector extension enabled successfully';
30
+ END $$;
@@ -0,0 +1,108 @@
1
+ -- Create database schema for storing embeddings and documents
2
+ -- This demonstrates a typical setup for vector similarity search
3
+
4
+ -- Documents table to store original content
5
+ CREATE TABLE IF NOT EXISTS documents (
6
+ id SERIAL PRIMARY KEY,
7
+ title VARCHAR(255) NOT NULL,
8
+ content TEXT NOT NULL,
9
+ source_type VARCHAR(50), -- e.g., 'pdf', 'webpage', 'text', 'api'
10
+ source_url VARCHAR(500),
11
+ metadata JSONB, -- Additional flexible metadata
12
+ created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
13
+ updated_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP
14
+ );
15
+
16
+ -- Document embeddings table for vector search
17
+ CREATE TABLE IF NOT EXISTS document_embeddings (
18
+ id SERIAL PRIMARY KEY,
19
+ document_id INTEGER NOT NULL REFERENCES documents(id) ON DELETE CASCADE,
20
+ embedding_provider VARCHAR(50) NOT NULL, -- e.g., 'ollama', 'openai', 'huggingface'
21
+ embedding_model VARCHAR(100) NOT NULL, -- e.g., 'nomic-embed-text', 'text-embedding-3-small'
22
+ embedding_dimensions INTEGER NOT NULL, -- e.g., 768, 1536, 384
23
+ embedding VECTOR NOT NULL, -- The actual vector embedding
24
+ embedding_text TEXT, -- The specific text that was embedded (may be subset of document)
25
+ created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP
26
+ );
27
+
28
+ -- Document chunks table for large documents split into smaller pieces
29
+ CREATE TABLE IF NOT EXISTS document_chunks (
30
+ id SERIAL PRIMARY KEY,
31
+ document_id INTEGER NOT NULL REFERENCES documents(id) ON DELETE CASCADE,
32
+ chunk_index INTEGER NOT NULL, -- Order of chunks within document
33
+ chunk_text TEXT NOT NULL,
34
+ chunk_metadata JSONB, -- Start/end positions, etc.
35
+ created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
36
+
37
+ UNIQUE(document_id, chunk_index)
38
+ );
39
+
40
+ -- Chunk embeddings table for chunked document search
41
+ CREATE TABLE IF NOT EXISTS chunk_embeddings (
42
+ id SERIAL PRIMARY KEY,
43
+ chunk_id INTEGER NOT NULL REFERENCES document_chunks(id) ON DELETE CASCADE,
44
+ document_id INTEGER NOT NULL REFERENCES documents(id) ON DELETE CASCADE,
45
+ embedding_provider VARCHAR(50) NOT NULL,
46
+ embedding_model VARCHAR(100) NOT NULL,
47
+ embedding_dimensions INTEGER NOT NULL,
48
+ embedding VECTOR NOT NULL,
49
+ created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP
50
+ );
51
+
52
+ -- Search queries table to store user queries and results
53
+ CREATE TABLE IF NOT EXISTS search_queries (
54
+ id SERIAL PRIMARY KEY,
55
+ query_text TEXT NOT NULL,
56
+ embedding_provider VARCHAR(50) NOT NULL,
57
+ embedding_model VARCHAR(100) NOT NULL,
58
+ query_embedding VECTOR,
59
+ result_count INTEGER,
60
+ search_metadata JSONB, -- Search parameters, filters, etc.
61
+ created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP
62
+ );
63
+
64
+ -- Query results table to store search results for analysis
65
+ CREATE TABLE IF NOT EXISTS query_results (
66
+ id SERIAL PRIMARY KEY,
67
+ query_id INTEGER NOT NULL REFERENCES search_queries(id) ON DELETE CASCADE,
68
+ document_id INTEGER REFERENCES documents(id) ON DELETE CASCADE,
69
+ chunk_id INTEGER REFERENCES document_chunks(id) ON DELETE CASCADE,
70
+ similarity_score FLOAT NOT NULL,
71
+ rank_position INTEGER NOT NULL,
72
+ created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP
73
+ );
74
+
75
+ -- Add indexes for better performance
76
+ CREATE INDEX IF NOT EXISTS idx_documents_source_type ON documents(source_type);
77
+ CREATE INDEX IF NOT EXISTS idx_documents_created_at ON documents(created_at);
78
+ CREATE INDEX IF NOT EXISTS idx_documents_metadata ON documents USING GIN(metadata);
79
+
80
+ CREATE INDEX IF NOT EXISTS idx_document_embeddings_document_id ON document_embeddings(document_id);
81
+ CREATE INDEX IF NOT EXISTS idx_document_embeddings_provider_model ON document_embeddings(embedding_provider, embedding_model);
82
+ CREATE INDEX IF NOT EXISTS idx_document_embeddings_dimensions ON document_embeddings(embedding_dimensions);
83
+
84
+ CREATE INDEX IF NOT EXISTS idx_chunk_embeddings_chunk_id ON chunk_embeddings(chunk_id);
85
+ CREATE INDEX IF NOT EXISTS idx_chunk_embeddings_document_id ON chunk_embeddings(document_id);
86
+ CREATE INDEX IF NOT EXISTS idx_chunk_embeddings_provider_model ON chunk_embeddings(embedding_provider, embedding_model);
87
+
88
+ CREATE INDEX IF NOT EXISTS idx_search_queries_created_at ON search_queries(created_at);
89
+ CREATE INDEX IF NOT EXISTS idx_query_results_query_id ON query_results(query_id);
90
+ CREATE INDEX IF NOT EXISTS idx_query_results_similarity_score ON query_results(similarity_score);
91
+
92
+ -- Add comments for documentation
93
+ COMMENT ON TABLE documents IS 'Stores original documents and content';
94
+ COMMENT ON TABLE document_embeddings IS 'Stores vector embeddings for entire documents';
95
+ COMMENT ON TABLE document_chunks IS 'Stores chunks of large documents for better search granularity';
96
+ COMMENT ON TABLE chunk_embeddings IS 'Stores vector embeddings for document chunks';
97
+ COMMENT ON TABLE search_queries IS 'Stores user search queries and their embeddings';
98
+ COMMENT ON TABLE query_results IS 'Stores search results for analysis and optimization';
99
+
100
+ COMMENT ON COLUMN document_embeddings.embedding IS 'Vector embedding generated by AI provider';
101
+ COMMENT ON COLUMN chunk_embeddings.embedding IS 'Vector embedding for document chunk';
102
+ COMMENT ON COLUMN search_queries.query_embedding IS 'Vector embedding of the search query';
103
+
104
+ -- Log successful schema creation
105
+ DO $$
106
+ BEGIN
107
+ RAISE NOTICE 'Database schema created successfully';
108
+ END $$;