ragdoll 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +353 -0
  3. data/Rakefile +21 -0
  4. data/db/migrate/001_enable_postgresql_extensions.rb +23 -0
  5. data/db/migrate/004_create_ragdoll_documents.rb +70 -0
  6. data/db/migrate/005_create_ragdoll_embeddings.rb +41 -0
  7. data/db/migrate/006_create_ragdoll_contents.rb +47 -0
  8. data/lib/ragdoll/core/client.rb +315 -0
  9. data/lib/ragdoll/core/configuration.rb +273 -0
  10. data/lib/ragdoll/core/database.rb +141 -0
  11. data/lib/ragdoll/core/document_management.rb +110 -0
  12. data/lib/ragdoll/core/document_processor.rb +344 -0
  13. data/lib/ragdoll/core/embedding_service.rb +183 -0
  14. data/lib/ragdoll/core/errors.rb +11 -0
  15. data/lib/ragdoll/core/jobs/extract_keywords.rb +32 -0
  16. data/lib/ragdoll/core/jobs/extract_text.rb +42 -0
  17. data/lib/ragdoll/core/jobs/generate_embeddings.rb +32 -0
  18. data/lib/ragdoll/core/jobs/generate_summary.rb +29 -0
  19. data/lib/ragdoll/core/metadata_schemas.rb +334 -0
  20. data/lib/ragdoll/core/models/audio_content.rb +175 -0
  21. data/lib/ragdoll/core/models/content.rb +126 -0
  22. data/lib/ragdoll/core/models/document.rb +678 -0
  23. data/lib/ragdoll/core/models/embedding.rb +204 -0
  24. data/lib/ragdoll/core/models/image_content.rb +227 -0
  25. data/lib/ragdoll/core/models/text_content.rb +169 -0
  26. data/lib/ragdoll/core/search_engine.rb +50 -0
  27. data/lib/ragdoll/core/services/image_description_service.rb +230 -0
  28. data/lib/ragdoll/core/services/metadata_generator.rb +335 -0
  29. data/lib/ragdoll/core/shrine_config.rb +71 -0
  30. data/lib/ragdoll/core/text_chunker.rb +210 -0
  31. data/lib/ragdoll/core/text_generation_service.rb +360 -0
  32. data/lib/ragdoll/core/version.rb +8 -0
  33. data/lib/ragdoll/core.rb +73 -0
  34. data/lib/ragdoll-core.rb +3 -0
  35. data/lib/ragdoll.rb +249 -0
  36. data/lib/tasks/annotate.rake +126 -0
  37. data/lib/tasks/db.rake +338 -0
  38. metadata +80 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 2d5c41105ebbbb39c2c38db7518607f45d446e6cd3b024abde75beb434ff3d2b
4
+ data.tar.gz: 2d9714a078bd7b9a8adda80869af3f983e3cb637356cba64e1550f0fe77911ee
5
+ SHA512:
6
+ metadata.gz: bf742b9919e1d542b45325e11e197bc9aca313b892c34567ea51d4efd7e7815ecc8081165af5cfef12e2453b86ee215d3a92a807a3c98d486b3797ac3dbaf214
7
+ data.tar.gz: da808c5780e3fecd02ef2ab0c4414b0f70cf3eef6a69ba08311c473ed19f98f3d3d4d0b5912c294dc0441a03f58a3ff510be126736d47881190cf56841e6233d
data/README.md ADDED
@@ -0,0 +1,353 @@
1
+ <div align="center" style="background-color: yellow; color: black; padding: 20px; margin: 20px 0; border: 2px solid black; font-size: 48px; font-weight: bold;">
2
+ ⚠️ CAUTION ⚠️<br />
3
+ Software Under Development by a Crazy Man
4
+ </div>
5
+ <br />
6
+ <div align="center">
7
+ <table>
8
+ <tr>
9
+ <td width="50%">
10
+ <a href="https://research.ibm.com/blog/retrieval-augmented-generation-RAG" target="_blank">
11
+ <img src="rag_doll.png" alt="Ragdoll" width="800">
12
+ </a>
13
+ </td>
14
+ <td width="50%" valign="top">
15
+ <p>Multi-modal RAG (Retrieval-Augmented Generation) is an architecture that integrates multiple data types (such as text, images, and audio) to enhance AI response generation. It combines retrieval-based methods, which fetch relevant information from a knowledge base, with generative large language models (LLMs) that create coherent and contextually appropriate outputs. This approach allows for more comprehensive and engaging user interactions, such as chatbots that respond with both text and images or educational tools that incorporate visual aids into learning materials. By leveraging various modalities, multi-modal RAG systems improve context understanding and user experience.</p>
16
+ </td>
17
+ </tr>
18
+ </table>
19
+ </div>
20
+
21
+ # Ragdoll::Core
22
+
23
+ Database-oriented multi-modal RAG (Retrieval-Augmented Generation) library built on ActiveRecord. Features PostgreSQL + pgvector for high-performance semantic search, polymorphic content architecture, and dual metadata design for sophisticated document analysis.
24
+
25
+ ## Quick Start
26
+
27
+ ```ruby
28
+ require 'ragdoll'
29
+
30
+ # Configure with PostgreSQL + pgvector
31
+ Ragdoll::Core.configure do |config|
32
+ # Database configuration (PostgreSQL only)
33
+ config.database_config = {
34
+ adapter: 'postgresql',
35
+ database: 'ragdoll_production',
36
+ username: 'ragdoll',
37
+ password: ENV['DATABASE_PASSWORD'],
38
+ host: 'localhost',
39
+ port: 5432,
40
+ auto_migrate: true
41
+ }
42
+
43
+ # Ruby LLM configuration
44
+ config.ruby_llm_config[:openai][:api_key] = ENV['OPENAI_API_KEY']
45
+ config.ruby_llm_config[:openai][:organization] = ENV['OPENAI_ORGANIZATION']
46
+ config.ruby_llm_config[:openai][:project] = ENV['OPENAI_PROJECT']
47
+
48
+ # Model configuration
49
+ config.models[:default] = 'openai/gpt-4o'
50
+ config.models[:embedding][:text] = 'text-embedding-3-small'
51
+
52
+ # Logging configuration
53
+ config.logging_config[:log_level] = :warn
54
+ config.logging_config[:log_filepath] = File.join(Dir.home, '.ragdoll', 'ragdoll.log')
55
+ end
56
+
57
+ # Add documents - returns detailed result
58
+ result = Ragdoll::Core.add_document(path: 'research_paper.pdf')
59
+ puts result[:message] # "Document 'research_paper' added successfully with ID 123"
60
+ doc_id = result[:document_id]
61
+
62
+ # Check document status
63
+ status = Ragdoll::Core.document_status(id: doc_id)
64
+ puts status[:message] # Shows processing status and embeddings count
65
+
66
+ # Search across content
67
+ results = Ragdoll::Core.search(query: 'neural networks')
68
+
69
+ # Get detailed document information
70
+ document = Ragdoll::Core.get_document(id: doc_id)
71
+ ```
72
+
73
+ ## High-Level API
74
+
75
+ The `Ragdoll::Core` module provides a convenient high-level API for common operations:
76
+
77
+ ### Document Management
78
+
79
+ ```ruby
80
+ # Add single document - returns detailed result hash
81
+ result = Ragdoll::Core.add_document(path: 'document.pdf')
82
+ puts result[:success] # true
83
+ puts result[:document_id] # "123"
84
+ puts result[:message] # "Document 'document' added successfully with ID 123"
85
+ puts result[:embeddings_queued] # true
86
+
87
+ # Check document processing status
88
+ status = Ragdoll::Core.document_status(id: result[:document_id])
89
+ puts status[:status] # "processed"
90
+ puts status[:embeddings_count] # 15
91
+ puts status[:embeddings_ready] # true
92
+ puts status[:message] # "Document processed successfully with 15 embeddings"
93
+
94
+ # Get detailed document information
95
+ document = Ragdoll::Core.get_document(id: result[:document_id])
96
+ puts document[:title] # "document"
97
+ puts document[:status] # "processed"
98
+ puts document[:embeddings_count] # 15
99
+ puts document[:content_length] # 5000
100
+
101
+ # Update document metadata
102
+ Ragdoll::Core.update_document(id: result[:document_id], title: 'New Title')
103
+
104
+ # Delete document
105
+ Ragdoll::Core.delete_document(id: result[:document_id])
106
+
107
+ # List all documents
108
+ documents = Ragdoll::Core.list_documents(limit: 10)
109
+
110
+ # System statistics
111
+ stats = Ragdoll::Core.stats
112
+ puts stats[:total_documents] # 50
113
+ puts stats[:total_embeddings] # 1250
114
+ ```
115
+
116
+ ### Search and Retrieval
117
+
118
+ ```ruby
119
+ # Semantic search across all content types
120
+ results = Ragdoll::Core.search(query: 'artificial intelligence')
121
+
122
+ # Search specific content types
123
+ text_results = Ragdoll::Core.search(query: 'machine learning', content_type: 'text')
124
+ image_results = Ragdoll::Core.search(query: 'neural network diagram', content_type: 'image')
125
+ audio_results = Ragdoll::Core.search(query: 'AI discussion', content_type: 'audio')
126
+
127
+ # Advanced search with metadata filters
128
+ results = Ragdoll::Core.search(
129
+ query: 'deep learning',
130
+ classification: 'research',
131
+ keywords: ['AI', 'neural networks'],
132
+ tags: ['technical']
133
+ )
134
+
135
+ # Get context for RAG applications
136
+ context = Ragdoll::Core.get_context(query: 'machine learning', limit: 5)
137
+
138
+ # Enhanced prompt with context
139
+ enhanced = Ragdoll::Core.enhance_prompt(
140
+ prompt: 'What is machine learning?',
141
+ context_limit: 5
142
+ )
143
+
144
+ # Hybrid search combining semantic and full-text (in development; see "Current Implementation Status")
145
+ results = Ragdoll::Core.hybrid_search(
146
+ query: 'neural networks',
147
+ semantic_weight: 0.7,
148
+ text_weight: 0.3
149
+ )
150
+ ```
151
+
152
+ ### System Operations
153
+
154
+ ```ruby
155
+ # Get system statistics
156
+ stats = Ragdoll::Core.stats
157
+ # Returns information about documents, content types, embeddings, etc.
158
+
159
+ # Health check
160
+ healthy = Ragdoll::Core.healthy?
161
+
162
+ # Get configuration
163
+ config = Ragdoll::Core.configuration
164
+
165
+ # Reset configuration (useful for testing)
166
+ Ragdoll::Core.reset_configuration!
167
+ ```
168
+
169
+ ### Configuration
170
+
171
+ ```ruby
172
+ # Configure the system
173
+ Ragdoll::Core.configure do |config|
174
+ # Database configuration (PostgreSQL only - REQUIRED)
175
+ config.database_config = {
176
+ adapter: 'postgresql',
177
+ database: 'ragdoll_production',
178
+ username: 'ragdoll',
179
+ password: ENV['DATABASE_PASSWORD'],
180
+ host: 'localhost',
181
+ port: 5432,
182
+ auto_migrate: true
183
+ }
184
+
185
+ # Ruby LLM configuration for multiple providers
186
+ config.ruby_llm_config[:openai][:api_key] = ENV['OPENAI_API_KEY']
187
+ config.ruby_llm_config[:openai][:organization] = ENV['OPENAI_ORGANIZATION']
188
+ config.ruby_llm_config[:openai][:project] = ENV['OPENAI_PROJECT']
189
+
190
+ config.ruby_llm_config[:anthropic][:api_key] = ENV['ANTHROPIC_API_KEY']
191
+ config.ruby_llm_config[:google][:api_key] = ENV['GOOGLE_API_KEY']
192
+
193
+ # Model configuration
194
+ config.models[:default] = 'openai/gpt-4o'
195
+ config.models[:summary] = 'openai/gpt-4o'
196
+ config.models[:keywords] = 'openai/gpt-4o'
197
+ config.models[:embedding][:text] = 'text-embedding-3-small'
198
+ config.models[:embedding][:image] = 'image-embedding-3-small'
199
+ config.models[:embedding][:audio] = 'audio-embedding-3-small'
200
+
201
+ # Logging configuration
202
+ config.logging_config[:log_level] = :warn # :debug, :info, :warn, :error, :fatal
203
+ config.logging_config[:log_filepath] = File.join(Dir.home, '.ragdoll', 'ragdoll.log')
204
+
205
+ # Processing settings
206
+ config.chunking[:text][:max_tokens] = 1000
207
+ config.chunking[:text][:overlap] = 200
208
+ config.search[:similarity_threshold] = 0.7
209
+ config.search[:max_results] = 10
210
+ end
211
+ ```
212
+
213
+ ## Current Implementation Status
214
+
215
+ ### ✅ **Fully Implemented**
216
+ - **Text document processing**: PDF, DOCX, HTML, Markdown, plain text files
217
+ - **Embedding generation**: Text chunking and vector embedding creation
218
+ - **Database schema**: Multi-modal polymorphic architecture with PostgreSQL + pgvector
219
+ - **Dual metadata architecture**: Separate LLM-generated content analysis and file properties
220
+ - **Search functionality**: Semantic search with cosine similarity and usage analytics
221
+ - **Document management**: Add, update, delete, list operations
222
+ - **Background processing**: ActiveJob integration for async embedding generation
223
+ - **LLM metadata generation**: AI-powered structured content analysis with schema validation
224
+ - **Logging**: Configurable file-based logging with multiple levels
225
+
226
+ ### 🚧 **In Development**
227
+ - **Image processing**: Framework exists but vision AI integration needs completion
228
+ - **Audio processing**: Framework exists but speech-to-text integration needs completion
229
+ - **Hybrid search**: Combining semantic and full-text search capabilities
230
+
231
+ ### 📋 **Planned Features**
232
+ - **Multi-modal search**: Search across text, image, and audio content types
233
+ - **Content-type specific embedding models**: Different models for text, image, audio
234
+ - **Enhanced metadata schemas**: Domain-specific metadata templates
235
+
236
+ ## Architecture Highlights
237
+
238
+ ### Dual Metadata Design
239
+
240
+ Ragdoll uses a sophisticated dual metadata architecture to separate concerns:
241
+
242
+ - **`metadata` (JSON)**: LLM-generated content analysis including summary, keywords, classification, topics, sentiment, and domain-specific insights
243
+ - **`file_metadata` (JSON)**: System-generated file properties including size, MIME type, dimensions, processing parameters, and technical characteristics
244
+
245
+ This separation enables both semantic search operations on content meaning and efficient file management operations.
246
+
247
+ ### Polymorphic Multi-Modal Architecture
248
+
249
+ The database schema uses polymorphic associations to elegantly support multiple content types:
250
+
251
+ - **Documents**: Central entity with dual metadata columns
252
+ - **Content Types**: Specialized tables for `text_contents`, `image_contents`, `audio_contents`
253
+ - **Embeddings**: Unified vector storage via polymorphic `embeddable` associations
254
+
255
+ ## Text Document Processing (Current)
256
+
257
+ Currently, Ragdoll processes text documents through:
258
+
259
+ 1. **Content Extraction**: Extracts text from PDF, DOCX, HTML, Markdown, and plain text
260
+ 2. **Metadata Generation**: AI-powered analysis creates structured content metadata
261
+ 3. **Text Chunking**: Splits content into manageable chunks with configurable size/overlap
262
+ 4. **Embedding Generation**: Creates vector embeddings using OpenAI or other providers
263
+ 5. **Database Storage**: Stores in polymorphic multi-modal architecture with dual metadata
264
+ 6. **Search**: Semantic search using cosine similarity with usage analytics
265
+
266
+ ### Example Usage
267
+ ```ruby
268
+ # Add a text document
269
+ result = Ragdoll::Core.add_document(path: 'document.pdf')
270
+
271
+ # Check processing status
272
+ status = Ragdoll::Core.document_status(id: result[:document_id])
273
+
274
+ # Search the content
275
+ results = Ragdoll::Core.search(query: 'machine learning')
276
+ ```
277
+
278
+ ## PostgreSQL + pgvector Configuration
279
+
280
+ ### Database Setup
281
+
282
+ ```bash
283
+ # Install PostgreSQL and pgvector
284
+ brew install postgresql pgvector # macOS
285
+ # or
286
+ apt-get install postgresql postgresql-contrib postgresql-16-pgvector # Ubuntu (use the pgvector package that matches your PostgreSQL major version)
287
+
288
+ # Create database and enable pgvector extension
289
+ createdb ragdoll_production
290
+ psql -d ragdoll_production -c "CREATE EXTENSION IF NOT EXISTS vector;"
291
+ ```
292
+
293
+ ### Configuration Example
294
+
295
+ ```ruby
296
+ Ragdoll::Core.configure do |config|
297
+ config.database_config = {
298
+ adapter: 'postgresql',
299
+ database: 'ragdoll_production',
300
+ username: 'ragdoll',
301
+ password: ENV['DATABASE_PASSWORD'],
302
+ host: 'localhost',
303
+ port: 5432,
304
+ pool: 20,
305
+ auto_migrate: true
306
+ }
307
+ end
308
+ ```
309
+
310
+ ## Performance Features
311
+
312
+ - **Native pgvector**: Hardware-accelerated similarity search
313
+ - **IVFFlat indexing**: Fast approximate nearest neighbor search
314
+ - **Polymorphic embeddings**: Unified search across content types
315
+ - **Batch processing**: Efficient bulk operations
316
+ - **Background jobs**: Asynchronous document processing
317
+ - **Connection pooling**: High-concurrency support
318
+
319
+ ## Installation
320
+
321
+ ```bash
322
+ # Install system dependencies
323
+ brew install postgresql pgvector # macOS
324
+ # or
325
+ apt-get install postgresql postgresql-contrib postgresql-16-pgvector # Ubuntu (use the pgvector package that matches your PostgreSQL major version)
326
+
327
+ # Install gem
328
+ gem install ragdoll
329
+
330
+ # Or add to Gemfile
331
+ gem 'ragdoll'
332
+ ```
333
+
334
+ ## Requirements
335
+
336
+ - **Ruby**: 3.2+
337
+ - **PostgreSQL**: 12+ with pgvector extension (REQUIRED - no other databases supported)
338
+ - **Dependencies**: activerecord, pg, pgvector, neighbor, ruby_llm, pdf-reader, docx, rubyzip, shrine, rmagick, opensearch-ruby, searchkick, ruby-progressbar
339
+
340
+ ## Related Projects
341
+
342
+ - **ragdoll-cli**: Standalone CLI application using ragdoll
343
+ - **ragdoll-rails**: Rails engine with web interface for ragdoll
344
+
345
+ ## Key Design Principles
346
+
347
+ 1. **Database-Oriented**: Built on ActiveRecord with PostgreSQL + pgvector for production performance
348
+ 2. **Multi-Modal First**: Text, image, and audio content as first-class citizens via polymorphic architecture
349
+ 3. **Dual Metadata Design**: Separates LLM-generated content analysis from file properties
350
+ 4. **LLM-Enhanced**: Structured metadata generation with schema validation using latest AI capabilities
351
+ 5. **High-Level API**: Simple, intuitive interface for complex operations
352
+ 6. **Scalable**: Designed for production workloads with background processing and proper indexing
353
+ 7. **Extensible**: Easy to add new content types and embedding models through polymorphic design
data/Rakefile ADDED
@@ -0,0 +1,21 @@
# frozen_string_literal: true

# Development Rakefile: sets up coverage, the gem packaging tasks, the test
# task, and the project's own rake tasks.

require "simplecov"
# NOTE(review): this starts coverage in the rake process. Rake::TestTask runs
# the tests in a separate ruby process, so confirm the test helper also starts
# SimpleCov if test coverage is what is wanted here.
SimpleCov.start

# Suppress bundler/rubygems warnings.
# Setting $VERBOSE to nil silences every Ruby warning in this process, not only
# the bundler/rubygems ones.
$VERBOSE = nil

require "bundler/gem_tasks"
require "rake/testtask"

# `rake test` runs every test/**/*_test.rb with test/ and lib/ on the load path.
Rake::TestTask.new(:test) do |t|
  t.libs << "test" << "lib"
  t.test_files = FileList["test/**/*_test.rb"]
end

# Load every project task file under lib/tasks (not only the annotate tasks).
# The path is anchored to this Rakefile so it does not depend on the current
# working directory.
Dir.glob(File.join(__dir__, "lib", "tasks", "*.rake")).each { |task_file| load task_file }

task default: :test
data/db/migrate/001_enable_postgresql_extensions.rb ADDED
@@ -0,0 +1,23 @@
# Ensures the PostgreSQL extensions used by Ragdoll are installed in the
# database this migration runs against.
class EnablePostgresqlExtensions < ActiveRecord::Migration[7.0]
  # Installs the extensions. Every statement uses IF NOT EXISTS, so running it
  # against a database where they already exist is a no-op.
  def up
    # This migration is now handled by the db:create rake task
    # Just ensure required extensions are available

    # Vector similarity search (required for embeddings)
    execute "CREATE EXTENSION IF NOT EXISTS vector"

    # Useful optional extensions for text processing and search
    execute "CREATE EXTENSION IF NOT EXISTS unaccent" # Remove accents from text
    execute "CREATE EXTENSION IF NOT EXISTS pg_trgm"  # Trigram matching for fuzzy search

    # UUID support (useful for generating unique identifiers)
    execute "CREATE EXTENSION IF NOT EXISTS \"uuid-ossp\""
  end

  # Reverses #up by dropping the same extensions in the opposite order.
  #
  # The previous implementation dropped the `ragdoll_development` database and
  # the `ragdoll` role instead. That is not the inverse of #up, and PostgreSQL
  # rejects DROP DATABASE inside a transaction block and for the database the
  # session is connected to.
  #
  # CASCADE is deliberately not used: if any remaining object still depends on
  # an extension, PostgreSQL raises instead of silently dropping that object.
  def down
    execute "DROP EXTENSION IF EXISTS \"uuid-ossp\""
    execute "DROP EXTENSION IF EXISTS pg_trgm"
    execute "DROP EXTENSION IF EXISTS unaccent"
    execute "DROP EXTENSION IF EXISTS vector"
  end
end
data/db/migrate/004_create_ragdoll_documents.rb ADDED
@@ -0,0 +1,70 @@
# Creates the ragdoll_documents table (one row per source document) together
# with its lookup, filtering and full-text search indexes.
class CreateRagdollDocuments < ActiveRecord::Migration[7.0]
  def change
    # Expression behind the GIN full-text index. Apart from `title`, it reads
    # summary / keywords / description from the `metadata` JSON column, not
    # from the dedicated `summary` and `keywords` columns.
    fulltext_expression = "to_tsvector('english', COALESCE(title, '') ||
      ' ' ||
      COALESCE(metadata->>'summary', '') ||
      ' ' || COALESCE(metadata->>'keywords', '') ||
      ' ' || COALESCE(metadata->>'description', ''))"

    create_table(
      :ragdoll_documents,
      comment: "Core documents table with LLM-generated structured metadata"
    ) do |t|
      # --- Columns ---------------------------------------------------------
      t.string(:location,
               null: false,
               comment: "Source location of document (file path, URL, or identifier)")

      t.string(:title,
               null: false,
               comment: "Human-readable document title for display and search")

      t.text(:summary,
             null: false,
             default: "",
             comment: "LLM-generated summary of document content")

      t.text(:keywords,
             null: false,
             default: "",
             comment: "LLM-generated comma-separated keywords of document")

      t.string(:document_type,
               null: false,
               default: "text",
               comment: "Document format type")

      t.string(:status,
               null: false,
               default: "pending",
               comment: "Document processing status")

      t.json(:metadata,
             default: {},
             comment: "LLM-generated structured metadata about the file")

      # Defaults to the insertion time when the caller supplies no value.
      t.timestamp(:file_modified_at,
                  null: false,
                  default: -> { "CURRENT_TIMESTAMP" },
                  comment: "Timestamp when the source file was last modified")

      t.timestamps(null: false,
                   comment: "Standard creation and update timestamps")

      # --- Indexes ---------------------------------------------------------
      # `location` is the only unique key: one row per source location.
      t.index(:location,
              unique: true,
              comment: "Unique index for document source lookup")

      t.index(:title, comment: "Index for title-based search")
      t.index(:document_type, comment: "Index for filtering by document type")
      t.index(:status, comment: "Index for filtering by processing status")
      t.index(:created_at, comment: "Index for chronological sorting")

      t.index(%i[document_type status],
              comment: "Composite index for type+status filtering")

      t.index(fulltext_expression,
              using: :gin,
              name: "index_ragdoll_documents_on_fulltext_search",
              comment: "Full-text search across title and metadata fields")

      # Expression indexes on individual keys of the metadata JSON.
      t.index("(metadata->>'document_type')",
              name: "index_ragdoll_documents_on_metadata_type",
              comment: "Index for filtering by document type")

      t.index("(metadata->>'classification')",
              name: "index_ragdoll_documents_on_metadata_classification",
              comment: "Index for filtering by document classification")
    end
  end
end
data/db/migrate/005_create_ragdoll_embeddings.rb ADDED
@@ -0,0 +1,41 @@
# Creates the ragdoll_embeddings table: vector embeddings attached to any
# "embeddable" record through a polymorphic reference, plus the indexes used
# to look them up and to run cosine-similarity searches.
class CreateRagdollEmbeddings < ActiveRecord::Migration[7.0]
  def change
    create_table :ragdoll_embeddings,
                 comment: "Polymorphic vector embeddings storage for semantic similarity search" do |t|
      # index: false because the explicit, commented index on
      # (embeddable_type, embeddable_id) is declared below. Without it,
      # `references` adds its own index on the same two columns and the table
      # ends up with two identical indexes.
      t.references :embeddable, polymorphic: true, null: false, index: false,
                   comment: "Polymorphic reference to embeddable content"

      t.text :content, null: false, default: "",
             comment: "Original text content that was embedded"

      # NOTE(review): `limit` looks like the vector dimension count (1536);
      # confirm it matches the output size of the configured embedding model.
      t.vector :embedding_vector, limit: 1536, null: false,
               comment: "Vector embedding using pgvector"

      t.integer :chunk_index, null: false,
                comment: "Chunk index for ordering embeddings"

      t.integer :usage_count, default: 0,
                comment: "Number of times used in similarity searches"

      t.datetime :returned_at,
                 comment: "Timestamp of most recent usage"

      t.json :metadata, default: {},
             comment: "Embedding-specific metadata (positions, processing info)"

      t.timestamps null: false,
                   comment: "Standard creation and update timestamps"

      ###########
      # Indexes #
      ###########

      t.index %i[embeddable_type embeddable_id],
              comment: "Index for finding embeddings by embeddable content"

      # NOTE(review): this IVFFlat index is built while the table is empty.
      # pgvector's guidance is to build IVFFlat indexes once data is present,
      # so consider a REINDEX after the first bulk load.
      t.index :embedding_vector, using: :ivfflat, opclass: :vector_cosine_ops,
              name: "index_ragdoll_embeddings_on_embedding_vector_cosine",
              comment: "IVFFlat index for fast cosine similarity search"
    end
  end
end
data/db/migrate/006_create_ragdoll_contents.rb ADDED
@@ -0,0 +1,47 @@
# Creates the ragdoll_contents table. A single table holds every content kind;
# the `type` column stores which one a row is (the column comment names
# AudioContent, ImageContent and TextContent). Each row belongs to a row in
# ragdoll_documents.
class CreateRagdollContents < ActiveRecord::Migration[7.0]
  def change
    create_table(
      :ragdoll_contents,
      comment: "Content storage for polymorphic embedding architecture using STI"
    ) do |t|
      # --- Columns ---------------------------------------------------------
      t.string(:type,
               null: false,
               comment: "Type of content (e.g., AudioContent, ImageContent, TextContent)")

      # Foreign key targets ragdoll_documents explicitly, because the column
      # name (document_id) alone would not resolve to the prefixed table name.
      t.references(:document,
                   null: false,
                   foreign_key: { to_table: :ragdoll_documents },
                   comment: "Reference to parent document")

      t.string(:embedding_model,
               null: false,
               comment: "Embedding model to use for this content")

      t.text(:content, comment: "Text content or description of the file")
      t.text(:data, comment: "Raw data from file")

      t.json(:metadata,
             default: {},
             comment: "Additional metadata about the file's raw data")

      # Audio-only attributes; nullable, so other content kinds can leave them unset.
      t.float(:duration, comment: "Duration of audio in seconds (for audio content)")
      t.integer(:sample_rate, comment: "Audio sample rate in Hz (for audio content)")

      t.timestamps(null: false,
                   comment: "Standard creation and update timestamps")

      # --- Indexes ---------------------------------------------------------
      t.index(:embedding_model, comment: "Index for filtering by embedding model")
      t.index(:type, comment: "Index for filtering by content type")

      t.index("to_tsvector('english', COALESCE(content, ''))",
              using: :gin,
              name: "index_ragdoll_contents_on_fulltext_search",
              comment: "Full-text search index for text content")
    end
  end
end