ragdoll 0.1.0 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +318 -40
- data/Rakefile +66 -4
- data/app/jobs/ragdoll/extract_keywords_job.rb +28 -0
- data/app/jobs/ragdoll/extract_text_job.rb +38 -0
- data/app/jobs/ragdoll/generate_embeddings_job.rb +28 -0
- data/app/jobs/ragdoll/generate_summary_job.rb +25 -0
- data/app/lib/ragdoll/metadata_schemas.rb +332 -0
- data/app/models/ragdoll/audio_content.rb +142 -0
- data/app/models/ragdoll/content.rb +95 -0
- data/app/models/ragdoll/document.rb +606 -4
- data/app/models/ragdoll/embedding.rb +172 -5
- data/app/models/ragdoll/image_content.rb +194 -0
- data/app/models/ragdoll/text_content.rb +137 -0
- data/app/services/ragdoll/configuration_service.rb +113 -0
- data/app/services/ragdoll/document_management.rb +108 -0
- data/app/services/ragdoll/document_processor.rb +342 -0
- data/app/services/ragdoll/embedding_service.rb +202 -0
- data/app/services/ragdoll/image_description_service.rb +230 -0
- data/app/services/ragdoll/metadata_generator.rb +329 -0
- data/app/services/ragdoll/model_resolver.rb +72 -0
- data/app/services/ragdoll/search_engine.rb +51 -0
- data/app/services/ragdoll/text_chunker.rb +208 -0
- data/app/services/ragdoll/text_generation_service.rb +355 -0
- data/db/migrate/001_enable_postgresql_extensions.rb +23 -0
- data/db/migrate/004_create_ragdoll_documents.rb +70 -0
- data/db/migrate/005_create_ragdoll_embeddings.rb +41 -0
- data/db/migrate/006_create_ragdoll_contents.rb +47 -0
- data/lib/ragdoll/core/client.rb +306 -0
- data/lib/ragdoll/core/configuration.rb +257 -0
- data/lib/ragdoll/core/database.rb +141 -0
- data/lib/ragdoll/core/errors.rb +11 -0
- data/lib/ragdoll/core/model.rb +45 -0
- data/lib/ragdoll/core/shrine_config.rb +71 -0
- data/lib/ragdoll/core/version.rb +8 -0
- data/lib/ragdoll/core.rb +91 -0
- data/lib/ragdoll-core.rb +3 -0
- data/lib/ragdoll.rb +243 -6
- data/lib/tasks/annotate.rake +126 -0
- data/lib/tasks/db.rake +338 -0
- metadata +42 -35
- data/config/initializers/ragdoll.rb +0 -6
- data/config/routes.rb +0 -5
- data/db/migrate/20250218123456_create_documents.rb +0 -20
- data/lib/config/database.yml +0 -28
- data/lib/config/ragdoll.yml +0 -31
- data/lib/ragdoll/engine.rb +0 -16
- data/lib/ragdoll/import_job.rb +0 -15
- data/lib/ragdoll/ingestion.rb +0 -30
- data/lib/ragdoll/search.rb +0 -18
- data/lib/ragdoll/version.rb +0 -7
- data/lib/tasks/import_task.thor +0 -32
- data/lib/tasks/jobs_task.thor +0 -40
- data/lib/tasks/ragdoll_tasks.thor +0 -7
- data/lib/tasks/search_task.thor +0 -55
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2016536d66d295c1fe5054aedb77526271692d7562131df9de9e1ad756309459
|
4
|
+
data.tar.gz: 725a221ab132fd9ce77f623114c034d675c626428c9d5d8c72e45e275b08feea
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 221c7d3408a9ec1b4c2f735bf733ae40aab896fdc07858b69de0866acb684c1eb65a3fb054342a6d20cd8a6e0b4e3f0c866f1df3a5bd8e5a475d6c3d72062b1a
|
7
|
+
data.tar.gz: 3228762fd152ff2a2fd5c0f514ae39e11e483dba698b8139f6c0696437a70209fb0576d67fb271eed45c4c7a2c08247dcbd68a2eab8f19cda144c01d38c2299f
|
data/README.md
CHANGED
@@ -1,75 +1,353 @@
|
|
1
|
-
|
1
|
+
<div align="center" style="background-color: yellow; color: black; padding: 20px; margin: 20px 0; border: 2px solid black; font-size: 48px; font-weight: bold;">
|
2
|
+
⚠️ CAUTION ⚠️<br />
|
3
|
+
Software Under Development by a Crazy Man
|
4
|
+
</div>
|
5
|
+
<br />
|
6
|
+
<div align="center">
|
7
|
+
<table>
|
8
|
+
<tr>
|
9
|
+
<td width="50%">
|
10
|
+
<a href="https://research.ibm.com/blog/retrieval-augmented-generation-RAG" target="_blank">
|
11
|
+
<img src="ragdoll.png" alt="Ragdoll" width="800">
|
12
|
+
</a>
|
13
|
+
</td>
|
14
|
+
<td width="50%" valign="top">
|
15
|
+
<p>Multi-modal RAG (Retrieval-Augmented Generation) is an architecture that integrates multiple data types (such as text, images, and audio) to enhance AI response generation. It combines retrieval-based methods, which fetch relevant information from a knowledge base, with generative large language models (LLMs) that create coherent and contextually appropriate outputs. This approach allows for more comprehensive and engaging user interactions, such as chatbots that respond with both text and images or educational tools that incorporate visual aids into learning materials. By leveraging various modalities, multi-modal RAG systems improve context understanding and user experience.</p>
|
16
|
+
</td>
|
17
|
+
</tr>
|
18
|
+
</table>
|
19
|
+
</div>
|
2
20
|
|
3
|
-
Ragdoll
|
21
|
+
# Ragdoll::Core
|
4
22
|
|
5
|
-
|
23
|
+
Database-oriented multi-modal RAG (Retrieval-Augmented Generation) library built on ActiveRecord. Features PostgreSQL + pgvector for high-performance semantic search, polymorphic content architecture, and dual metadata design for sophisticated document analysis.
|
6
24
|
|
7
|
-
|
25
|
+
## Quick Start
|
8
26
|
|
9
|
-
```
|
10
|
-
|
27
|
+
```ruby
|
28
|
+
require 'ragdoll'
|
29
|
+
|
30
|
+
# Configure with PostgreSQL + pgvector
|
31
|
+
Ragdoll::Core.configure do |config|
|
32
|
+
# Database configuration (PostgreSQL only)
|
33
|
+
config.database_config = {
|
34
|
+
adapter: 'postgresql',
|
35
|
+
database: 'ragdoll_production',
|
36
|
+
username: 'ragdoll',
|
37
|
+
password: ENV['DATABASE_PASSWORD'],
|
38
|
+
host: 'localhost',
|
39
|
+
port: 5432,
|
40
|
+
auto_migrate: true
|
41
|
+
}
|
42
|
+
|
43
|
+
# Ruby LLM configuration
|
44
|
+
config.ruby_llm_config[:openai][:api_key] = ENV['OPENAI_API_KEY']
|
45
|
+
config.ruby_llm_config[:openai][:organization] = ENV['OPENAI_ORGANIZATION']
|
46
|
+
config.ruby_llm_config[:openai][:project] = ENV['OPENAI_PROJECT']
|
47
|
+
|
48
|
+
# Model configuration
|
49
|
+
config.models[:default] = 'openai/gpt-4o'
|
50
|
+
config.models[:embedding][:text] = 'text-embedding-3-small'
|
51
|
+
|
52
|
+
# Logging configuration
|
53
|
+
config.logging_config[:log_level] = :warn
|
54
|
+
config.logging_config[:log_filepath] = File.join(Dir.home, '.ragdoll', 'ragdoll.log')
|
55
|
+
end
|
56
|
+
|
57
|
+
# Add documents - returns detailed result
|
58
|
+
result = Ragdoll::Core.add_document(path: 'research_paper.pdf')
|
59
|
+
puts result[:message] # "Document 'research_paper' added successfully with ID 123"
|
60
|
+
doc_id = result[:document_id]
|
61
|
+
|
62
|
+
# Check document status
|
63
|
+
status = Ragdoll::Core.document_status(id: doc_id)
|
64
|
+
puts status[:message] # Shows processing status and embeddings count
|
65
|
+
|
66
|
+
# Search across content
|
67
|
+
results = Ragdoll::Core.search(query: 'neural networks')
|
68
|
+
|
69
|
+
# Get detailed document information
|
70
|
+
document = Ragdoll::Core.get_document(id: doc_id)
|
11
71
|
```
|
12
72
|
|
13
|
-
|
73
|
+
## High-Level API
|
14
74
|
|
15
|
-
|
16
|
-
|
75
|
+
The `Ragdoll::Core` module provides a convenient high-level API for common operations:
|
76
|
+
|
77
|
+
### Document Management
|
78
|
+
|
79
|
+
```ruby
|
80
|
+
# Add single document - returns detailed result hash
|
81
|
+
result = Ragdoll::Core.add_document(path: 'document.pdf')
|
82
|
+
puts result[:success] # true
|
83
|
+
puts result[:document_id] # "123"
|
84
|
+
puts result[:message] # "Document 'document' added successfully with ID 123"
|
85
|
+
puts result[:embeddings_queued] # true
|
86
|
+
|
87
|
+
# Check document processing status
|
88
|
+
status = Ragdoll::Core.document_status(id: result[:document_id])
|
89
|
+
puts status[:status] # "processed"
|
90
|
+
puts status[:embeddings_count] # 15
|
91
|
+
puts status[:embeddings_ready] # true
|
92
|
+
puts status[:message] # "Document processed successfully with 15 embeddings"
|
93
|
+
|
94
|
+
# Get detailed document information
|
95
|
+
document = Ragdoll::Core.get_document(id: result[:document_id])
|
96
|
+
puts document[:title] # "document"
|
97
|
+
puts document[:status] # "processed"
|
98
|
+
puts document[:embeddings_count] # 15
|
99
|
+
puts document[:content_length] # 5000
|
100
|
+
|
101
|
+
# Update document metadata
|
102
|
+
Ragdoll::Core.update_document(id: result[:document_id], title: 'New Title')
|
103
|
+
|
104
|
+
# Delete document
|
105
|
+
Ragdoll::Core.delete_document(id: result[:document_id])
|
106
|
+
|
107
|
+
# List all documents
|
108
|
+
documents = Ragdoll::Core.list_documents(limit: 10)
|
109
|
+
|
110
|
+
# System statistics
|
111
|
+
stats = Ragdoll::Core.stats
|
112
|
+
puts stats[:total_documents] # 50
|
113
|
+
puts stats[:total_embeddings] # 1250
|
17
114
|
```
|
18
115
|
|
19
|
-
|
116
|
+
### Search and Retrieval
|
20
117
|
|
21
|
-
```
|
22
|
-
|
118
|
+
```ruby
|
119
|
+
# Semantic search across all content types
|
120
|
+
results = Ragdoll::Core.search(query: 'artificial intelligence')
|
121
|
+
|
122
|
+
# Search specific content types (image and audio processing are still in development)
|
123
|
+
text_results = Ragdoll::Core.search(query: 'machine learning', content_type: 'text')
|
124
|
+
image_results = Ragdoll::Core.search(query: 'neural network diagram', content_type: 'image')
|
125
|
+
audio_results = Ragdoll::Core.search(query: 'AI discussion', content_type: 'audio')
|
126
|
+
|
127
|
+
# Advanced search with metadata filters
|
128
|
+
results = Ragdoll::Core.search(
|
129
|
+
query: 'deep learning',
|
130
|
+
classification: 'research',
|
131
|
+
keywords: ['AI', 'neural networks'],
|
132
|
+
tags: ['technical']
|
133
|
+
)
|
134
|
+
|
135
|
+
# Get context for RAG applications
|
136
|
+
context = Ragdoll::Core.get_context(query: 'machine learning', limit: 5)
|
137
|
+
|
138
|
+
# Enhanced prompt with context
|
139
|
+
enhanced = Ragdoll::Core.enhance_prompt(
|
140
|
+
prompt: 'What is machine learning?',
|
141
|
+
context_limit: 5
|
142
|
+
)
|
143
|
+
|
144
|
+
# Hybrid search combining semantic and full-text (in development; see "Current Implementation Status")
|
145
|
+
results = Ragdoll::Core.hybrid_search(
|
146
|
+
query: 'neural networks',
|
147
|
+
semantic_weight: 0.7,
|
148
|
+
text_weight: 0.3
|
149
|
+
)
|
23
150
|
```
|
24
151
|
|
25
|
-
|
152
|
+
### System Operations
|
26
153
|
|
27
|
-
|
154
|
+
```ruby
|
155
|
+
# Get system statistics
|
156
|
+
stats = Ragdoll::Core.stats
|
157
|
+
# Returns information about documents, content types, embeddings, etc.
|
28
158
|
|
29
|
-
|
159
|
+
# Health check
|
160
|
+
healthy = Ragdoll::Core.healthy?
|
30
161
|
|
31
|
-
|
32
|
-
|
162
|
+
# Get configuration
|
163
|
+
config = Ragdoll::Core.configuration
|
164
|
+
|
165
|
+
# Reset configuration (useful for testing)
|
166
|
+
Ragdoll::Core.reset_configuration!
|
33
167
|
```
|
34
168
|
|
35
|
-
|
36
|
-
- Use the `-r` or `--recursive` option to import files recursively from directories.
|
37
|
-
- Use the `-j` or `--jobs` option to specify the number of concurrent import jobs.
|
169
|
+
### Configuration
|
38
170
|
|
39
|
-
|
171
|
+
```ruby
|
172
|
+
# Configure the system
|
173
|
+
Ragdoll::Core.configure do |config|
|
174
|
+
# Database configuration (PostgreSQL only - REQUIRED)
|
175
|
+
config.database_config = {
|
176
|
+
adapter: 'postgresql',
|
177
|
+
database: 'ragdoll_production',
|
178
|
+
username: 'ragdoll',
|
179
|
+
password: ENV['DATABASE_PASSWORD'],
|
180
|
+
host: 'localhost',
|
181
|
+
port: 5432,
|
182
|
+
auto_migrate: true
|
183
|
+
}
|
40
184
|
|
41
|
-
|
185
|
+
# Ruby LLM configuration for multiple providers
|
186
|
+
config.ruby_llm_config[:openai][:api_key] = ENV['OPENAI_API_KEY']
|
187
|
+
config.ruby_llm_config[:openai][:organization] = ENV['OPENAI_ORGANIZATION']
|
188
|
+
config.ruby_llm_config[:openai][:project] = ENV['OPENAI_PROJECT']
|
42
189
|
|
43
|
-
|
44
|
-
|
190
|
+
config.ruby_llm_config[:anthropic][:api_key] = ENV['ANTHROPIC_API_KEY']
|
191
|
+
config.ruby_llm_config[:google][:api_key] = ENV['GOOGLE_API_KEY']
|
192
|
+
|
193
|
+
# Model configuration
|
194
|
+
config.models[:default] = 'openai/gpt-4o'
|
195
|
+
config.models[:summary] = 'openai/gpt-4o'
|
196
|
+
config.models[:keywords] = 'openai/gpt-4o'
|
197
|
+
config.models[:embedding][:text] = 'text-embedding-3-small'
|
198
|
+
config.models[:embedding][:image] = 'image-embedding-3-small'
|
199
|
+
config.models[:embedding][:audio] = 'audio-embedding-3-small'
|
200
|
+
|
201
|
+
# Logging configuration
|
202
|
+
config.logging_config[:log_level] = :warn # :debug, :info, :warn, :error, :fatal
|
203
|
+
config.logging_config[:log_filepath] = File.join(Dir.home, '.ragdoll', 'ragdoll.log')
|
204
|
+
|
205
|
+
# Processing settings
|
206
|
+
config.chunking[:text][:max_tokens] = 1000
|
207
|
+
config.chunking[:text][:overlap] = 200
|
208
|
+
config.search[:similarity_threshold] = 0.7
|
209
|
+
config.search[:max_results] = 10
|
210
|
+
end
|
45
211
|
```
|
46
212
|
|
47
|
-
|
48
|
-
|
49
|
-
|
213
|
+
## Current Implementation Status
|
214
|
+
|
215
|
+
### ✅ **Fully Implemented**
|
216
|
+
- **Text document processing**: PDF, DOCX, HTML, Markdown, plain text files
|
217
|
+
- **Embedding generation**: Text chunking and vector embedding creation
|
218
|
+
- **Database schema**: Multi-modal polymorphic architecture with PostgreSQL + pgvector
|
219
|
+
- **Dual metadata architecture**: Separate LLM-generated content analysis and file properties
|
220
|
+
- **Search functionality**: Semantic search with cosine similarity and usage analytics
|
221
|
+
- **Document management**: Add, update, delete, list operations
|
222
|
+
- **Background processing**: ActiveJob integration for async embedding generation
|
223
|
+
- **LLM metadata generation**: AI-powered structured content analysis with schema validation
|
224
|
+
- **Logging**: Configurable file-based logging with multiple levels
|
225
|
+
|
226
|
+
### 🚧 **In Development**
|
227
|
+
- **Image processing**: Framework exists but vision AI integration needs completion
|
228
|
+
- **Audio processing**: Framework exists but speech-to-text integration needs completion
|
229
|
+
- **Hybrid search**: Combining semantic and full-text search capabilities
|
230
|
+
|
231
|
+
### 📋 **Planned Features**
|
232
|
+
- **Multi-modal search**: Search across text, image, and audio content types
|
233
|
+
- **Content-type specific embedding models**: Different models for text, image, audio
|
234
|
+
- **Enhanced metadata schemas**: Domain-specific metadata templates
|
235
|
+
|
236
|
+
## Architecture Highlights
|
237
|
+
|
238
|
+
### Dual Metadata Design
|
239
|
+
|
240
|
+
Ragdoll uses a sophisticated dual metadata architecture to separate concerns:
|
241
|
+
|
242
|
+
- **`metadata` (JSON)**: LLM-generated content analysis including summary, keywords, classification, topics, sentiment, and domain-specific insights
|
243
|
+
- **`file_metadata` (JSON)**: System-generated file properties including size, MIME type, dimensions, processing parameters, and technical characteristics
|
244
|
+
|
245
|
+
This separation enables both semantic search operations on content meaning and efficient file management operations.
|
246
|
+
|
247
|
+
### Polymorphic Multi-Modal Architecture
|
248
|
+
|
249
|
+
The database schema uses polymorphic associations to elegantly support multiple content types:
|
250
|
+
|
251
|
+
- **Documents**: Central entity with dual metadata columns
|
252
|
+
- **Content Types**: Specialized tables for `text_contents`, `image_contents`, `audio_contents`
|
253
|
+
- **Embeddings**: Unified vector storage via polymorphic `embeddable` associations
|
254
|
+
|
255
|
+
## Text Document Processing (Current)
|
256
|
+
|
257
|
+
Currently, Ragdoll processes text documents through:
|
258
|
+
|
259
|
+
1. **Content Extraction**: Extracts text from PDF, DOCX, HTML, Markdown, and plain text
|
260
|
+
2. **Metadata Generation**: AI-powered analysis creates structured content metadata
|
261
|
+
3. **Text Chunking**: Splits content into manageable chunks with configurable size/overlap
|
262
|
+
4. **Embedding Generation**: Creates vector embeddings using OpenAI or other providers
|
263
|
+
5. **Database Storage**: Stores in polymorphic multi-modal architecture with dual metadata
|
264
|
+
6. **Search**: Semantic search using cosine similarity with usage analytics
|
50
265
|
|
51
|
-
###
|
266
|
+
### Example Usage
|
267
|
+
```ruby
|
268
|
+
# Add a text document
|
269
|
+
result = Ragdoll::Core.add_document(path: 'document.pdf')
|
52
270
|
|
53
|
-
|
271
|
+
# Check processing status
|
272
|
+
status = Ragdoll::Core.document_status(id: result[:document_id])
|
273
|
+
|
274
|
+
# Search the content
|
275
|
+
results = Ragdoll::Core.search(query: 'machine learning')
|
276
|
+
```
|
277
|
+
|
278
|
+
## PostgreSQL + pgvector Configuration
|
279
|
+
|
280
|
+
### Database Setup
|
54
281
|
|
55
282
|
```bash
|
56
|
-
|
283
|
+
# Install PostgreSQL and pgvector
|
284
|
+
brew install postgresql pgvector # macOS
|
285
|
+
# or
|
286
|
+
apt-get install postgresql postgresql-contrib # Ubuntu (pgvector is not included; install it separately)
|
287
|
+
|
288
|
+
# Create database and enable pgvector extension
|
289
|
+
createdb ragdoll_production
|
290
|
+
psql -d ragdoll_production -c "CREATE EXTENSION IF NOT EXISTS vector;"
|
291
|
+
```
|
292
|
+
|
293
|
+
### Configuration Example
|
294
|
+
|
295
|
+
```ruby
|
296
|
+
Ragdoll::Core.configure do |config|
|
297
|
+
config.database_config = {
|
298
|
+
adapter: 'postgresql',
|
299
|
+
database: 'ragdoll_production',
|
300
|
+
username: 'ragdoll',
|
301
|
+
password: ENV['DATABASE_PASSWORD'],
|
302
|
+
host: 'localhost',
|
303
|
+
port: 5432,
|
304
|
+
pool: 20,
|
305
|
+
auto_migrate: true
|
306
|
+
}
|
307
|
+
end
|
57
308
|
```
|
58
309
|
|
59
|
-
|
60
|
-
- Use the `--max_count` option to specify the maximum number of results to return.
|
61
|
-
- Use the `--rerank` option to rerank results using keyword search.
|
310
|
+
## Performance Features
|
62
311
|
|
63
|
-
|
312
|
+
- **Native pgvector**: Hardware-accelerated similarity search
|
313
|
+
- **IVFFlat indexing**: Fast approximate nearest neighbor search
|
314
|
+
- **Polymorphic embeddings**: Unified search across content types
|
315
|
+
- **Batch processing**: Efficient bulk operations
|
316
|
+
- **Background jobs**: Asynchronous document processing
|
317
|
+
- **Connection pooling**: High-concurrency support
|
318
|
+
|
319
|
+
## Installation
|
320
|
+
|
321
|
+
```bash
|
322
|
+
# Install system dependencies
|
323
|
+
brew install postgresql pgvector # macOS
|
324
|
+
# or
|
325
|
+
apt-get install postgresql postgresql-contrib # Ubuntu (pgvector is not included; install it separately)
|
326
|
+
|
327
|
+
# Install gem
|
328
|
+
gem install ragdoll
|
329
|
+
|
330
|
+
# Or add to Gemfile
|
331
|
+
gem 'ragdoll'
|
332
|
+
```
|
64
333
|
|
65
|
-
|
334
|
+
## Requirements
|
66
335
|
|
67
|
-
|
336
|
+
- **Ruby**: 3.2+
|
337
|
+
- **PostgreSQL**: 12+ with pgvector extension (REQUIRED - no other databases supported)
|
338
|
+
- **Dependencies**: activerecord, pg, pgvector, neighbor, ruby_llm, pdf-reader, docx, rubyzip, shrine, rmagick, opensearch-ruby, searchkick, ruby-progressbar
|
68
339
|
|
69
|
-
##
|
340
|
+
## Related Projects
|
70
341
|
|
71
|
-
|
342
|
+
- **ragdoll-cli**: Standalone CLI application using ragdoll
|
343
|
+
- **ragdoll-rails**: Rails engine with web interface for ragdoll
|
72
344
|
|
73
|
-
##
|
345
|
+
## Key Design Principles
|
74
346
|
|
75
|
-
|
347
|
+
1. **Database-Oriented**: Built on ActiveRecord with PostgreSQL + pgvector for production performance
|
348
|
+
2. **Multi-Modal First**: Text, image, and audio content as first-class citizens via polymorphic architecture
|
349
|
+
3. **Dual Metadata Design**: Separates LLM-generated content analysis from file properties
|
350
|
+
4. **LLM-Enhanced**: Structured metadata generation with schema validation using latest AI capabilities
|
351
|
+
5. **High-Level API**: Simple, intuitive interface for complex operations
|
352
|
+
6. **Scalable**: Designed for production workloads with background processing and proper indexing
|
353
|
+
7. **Extensible**: Easy to add new content types and embedding models through polymorphic design
|
data/Rakefile
CHANGED
@@ -1,10 +1,72 @@
|
|
1
|
-
# This file defines the Rake tasks for the Ragdoll gem, including tasks for testing.
|
2
|
-
|
3
1
|
# frozen_string_literal: true
|
4
2
|
|
3
|
+
require "simplecov"
|
4
|
+
SimpleCov.start
|
5
|
+
|
6
|
+
# Suppress bundler/rubygems warnings
|
7
|
+
$VERBOSE = nil
|
8
|
+
|
5
9
|
require "bundler/gem_tasks"
|
6
|
-
require "
|
10
|
+
require "rake/testtask"
|
11
|
+
|
12
|
+
# Reports whether database-backed test setup should be skipped.
#
# @return [Boolean] true when either the CI or the
#   RAGDOLL_SKIP_DATABASE_TESTS environment variable is exactly "true"
def ci_environment?
  %w[CI RAGDOLL_SKIP_DATABASE_TESTS].any? { |flag| ENV[flag] == "true" }
end
|
15
|
+
|
16
|
+
desc "Setup test database"
# Prepares the `ragdoll_test` PostgreSQL database: connects (creating the
# database when it is missing), tries to enable the pgvector extension, and
# then runs Ragdoll::Core::Database.setup with auto_migrate enabled.
# Exits the process with status 1 when PostgreSQL cannot be reached.
task :setup_test_db do
  require_relative "lib/ragdoll-core"

  # Database configuration for tests; credentials, host and port come from
  # RAGDOLL_POSTGRES_* environment variables with local defaults.
  test_db_config = {
    adapter: "postgresql",
    database: "ragdoll_test",
    username: ENV.fetch("RAGDOLL_POSTGRES_USER", "postgres"),
    password: ENV.fetch("RAGDOLL_POSTGRES_PASSWORD", ""),
    host: ENV.fetch("RAGDOLL_POSTGRES_HOST", "localhost"),
    port: ENV.fetch("RAGDOLL_POSTGRES_PORT", 5432)
  }

  # Ensure database exists
  begin
    # Try to connect to the database
    ActiveRecord::Base.establish_connection(test_db_config)
    ActiveRecord::Base.connection.execute("SELECT 1")
  rescue ActiveRecord::NoDatabaseError
    # Database doesn't exist, create it via the maintenance database
    puts "Creating ragdoll_test database..."
    admin_config = test_db_config.merge(database: "postgres")
    ActiveRecord::Base.establish_connection(admin_config)
    ActiveRecord::Base.connection.execute("CREATE DATABASE ragdoll_test")
    ActiveRecord::Base.establish_connection(test_db_config)
  rescue ActiveRecord::ConnectionNotEstablished, PG::ConnectionBad => e
    # ActiveRecord's PostgreSQL adapter wraps PG connection failures in
    # ActiveRecord::ConnectionNotEstablished, so rescue both to keep the
    # friendly message instead of an unhandled backtrace.
    puts "Error connecting to PostgreSQL: #{e.message}"
    puts "Please ensure PostgreSQL is running and accessible"
    exit 1
  end

  # Ensure pgvector extension is installed; a failure here is only a warning
  begin
    ActiveRecord::Base.connection.execute("CREATE EXTENSION IF NOT EXISTS vector")
  rescue StandardError => e
    puts "Warning: Could not install pgvector extension: #{e.message}"
  end

  # Run migrations
  Ragdoll::Core::Database.setup(test_db_config.merge(auto_migrate: true, logger: nil))
  puts "Test database setup complete"
end
|
59
|
+
|
60
|
+
# Defines the :test task: adds "test" and "lib" to the load path and runs
# every file matching test/**/*_test.rb.
Rake::TestTask.new(:test) do |test_task|
  test_task.libs.push("test", "lib")
  test_task.test_files = FileList["test/**/*_test.rb"]
end
|
65
|
+
|
66
|
+
# Make test task depend on database setup only if not skipping database tests
|
67
|
+
task test: :setup_test_db unless ci_environment?
|
7
68
|
|
8
|
-
|
69
|
+
# Load annotate tasks
|
70
|
+
Dir.glob("lib/tasks/*.rake").each { |r| load r }
|
9
71
|
|
10
72
|
task default: :test
|
@@ -0,0 +1,28 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "active_job"
|
4
|
+
|
5
|
+
module Ragdoll
  # Background job that fills in a document's keywords when the document has
  # content but no keywords yet.
  class ExtractKeywordsJob < ActiveJob::Base
    queue_as :default

    # Extracts keywords from the document content and stores them on the
    # document as a single ", "-joined string.
    #
    # Does nothing when the document has no content, already has keywords,
    # or can no longer be found.
    #
    # @param document_id [Object] id handed to Ragdoll::Document.find
    # @raise [StandardError] any failure other than a missing record is
    #   logged (when a Rails logger is available) and re-raised
    def perform(document_id)
      document = Ragdoll::Document.find(document_id)
      return unless document.content.present?
      return if document.keywords.present?

      keywords = Ragdoll::TextGenerationService.new.extract_keywords(document.content)
      document.update!(keywords: keywords.join(", ")) if keywords.present?
    rescue ActiveRecord::RecordNotFound
      # Document was deleted, nothing to do
    rescue StandardError => e
      log_perform_failure(document_id, e)
      raise e
    end

    private

    # Logs the failure through Rails.logger when one is available.
    # Guarded so that a defined-but-loggerless Rails constant cannot raise
    # here and hide the original error.
    def log_perform_failure(document_id, error)
      return unless defined?(Rails) && Rails.respond_to?(:logger)

      Rails.logger&.error "Failed to generate keywords for document #{document_id}: #{error.message}"
    end
  end
end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "active_job"
|
4
|
+
|
5
|
+
module Ragdoll
  # Background job that extracts text from a document's attached file and
  # queues the follow-up summary, keyword and embedding jobs.
  class ExtractTextJob < ActiveJob::Base
    queue_as :default

    # Extracts text for the given document and records the outcome in its
    # status: "processing" while running, then "processed" on success or
    # "error" when nothing was extracted or an exception occurred.
    #
    # Does nothing when the document has no attached file, already has
    # content, or can no longer be found.
    #
    # @param document_id [Object] id handed to Ragdoll::Document.find
    # @raise [StandardError] any failure other than a missing record is
    #   re-raised after the document has been marked as "error"
    def perform(document_id)
      document = Ragdoll::Document.find(document_id)
      return unless document.file_attached?
      return if document.content.present?

      document.update!(status: "processing")

      extracted_content = document.extract_text_from_file

      if extracted_content.present?
        document.update!(
          content: extracted_content,
          status: "processed"
        )

        # Queue follow-up jobs
        Ragdoll::GenerateSummaryJob.perform_later(document_id)
        Ragdoll::ExtractKeywordsJob.perform_later(document_id)
        Ragdoll::GenerateEmbeddingsJob.perform_later(document_id)
      else
        document.update!(status: "error")
      end
    rescue ActiveRecord::RecordNotFound
      # Document was deleted, nothing to do
    rescue StandardError => e
      mark_document_failed(document)
      raise e
    end

    private

    # Best-effort status update used from the error path. A failure to
    # persist the "error" status is deliberately swallowed so the exception
    # that caused the job to fail is the one that propagates.
    def mark_document_failed(document)
      document&.update!(status: "error")
    rescue StandardError
      nil
    end
  end
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "active_job"
|
4
|
+
|
5
|
+
module Ragdoll
  # Background job that generates embeddings for every content record of a
  # document and then marks the document as processed.
  class GenerateEmbeddingsJob < ActiveJob::Base
    queue_as :default

    # Generates embeddings for the document's content records.
    #
    # Does nothing when the document has no content, already has at least
    # one embedding, or can no longer be found.
    #
    # @param document_id [Object] id handed to Ragdoll::Document.find
    # @param chunk_size [Object, nil] accepted for callers; not used by this
    #   method body
    # @param chunk_overlap [Object, nil] accepted for callers; not used by
    #   this method body
    # @raise [StandardError] any failure other than a missing record is
    #   logged (when a Rails logger is available) and re-raised
    def perform(document_id, chunk_size: nil, chunk_overlap: nil)
      document = Ragdoll::Document.find(document_id)
      return unless document.content.present?
      return if document.all_embeddings.exists?

      # Process all content records using their own generate_embeddings! methods
      document.contents.each(&:generate_embeddings!)

      # Update document status to processed
      document.update!(status: "processed")
    rescue ActiveRecord::RecordNotFound
      # Document was deleted, nothing to do
    rescue StandardError => e
      log_perform_failure(document_id, e)
      raise e
    end

    private

    # Logs the failure through Rails.logger when one is available.
    # Guarded so that a defined-but-loggerless Rails constant cannot raise
    # here and hide the original error.
    def log_perform_failure(document_id, error)
      return unless defined?(Rails) && Rails.respond_to?(:logger)

      Rails.logger&.error "Failed to generate embeddings for document #{document_id}: #{error.message}"
    end
  end
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "active_job"
|
4
|
+
|
5
|
+
module Ragdoll
  # Background job that fills in a document's summary when the document has
  # content but no summary yet.
  class GenerateSummaryJob < ActiveJob::Base
    queue_as :default

    # Generates a summary from the document content and stores it on the
    # document.
    #
    # Does nothing when the document has no content, already has a summary,
    # or can no longer be found.
    #
    # @param document_id [Object] id handed to Ragdoll::Document.find
    # @raise [StandardError] any failure other than a missing record is
    #   logged (when a Rails logger is available) and re-raised
    def perform(document_id)
      document = Ragdoll::Document.find(document_id)
      return unless document.content.present?
      return if document.summary.present?

      summary = Ragdoll::TextGenerationService.new.generate_summary(document.content)
      document.update!(summary: summary) if summary.present?
    rescue ActiveRecord::RecordNotFound
      # Document was deleted, nothing to do
    rescue StandardError => e
      log_perform_failure(document_id, e)
      raise e
    end

    private

    # Logs the failure through Rails.logger when one is available.
    # Guarded so that a defined-but-loggerless Rails constant cannot raise
    # here and hide the original error.
    def log_perform_failure(document_id, error)
      return unless defined?(Rails) && Rails.respond_to?(:logger)

      Rails.logger&.error "Failed to generate summary for document #{document_id}: #{error.message}"
    end
  end
end
|