ragdoll 0.1.3 → 0.1.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +201 -0
- data/README.md +160 -31
- data/Rakefile +0 -3
- data/app/models/ragdoll/embedding.rb +74 -0
- data/app/models/ragdoll/search.rb +165 -0
- data/app/models/ragdoll/search_result.rb +121 -0
- data/app/services/ragdoll/configuration_service.rb +3 -3
- data/app/services/ragdoll/document_processor.rb +124 -1
- data/app/services/ragdoll/embedding_service.rb +10 -0
- data/app/services/ragdoll/search_engine.rb +64 -6
- data/db/migrate/007_create_ragdoll_searches.rb +73 -0
- data/db/migrate/008_create_ragdoll_search_results.rb +49 -0
- data/lib/ragdoll/core/client.rb +75 -8
- data/lib/ragdoll/core/model.rb +13 -0
- data/lib/ragdoll/core/version.rb +1 -1
- data/lib/ragdoll/core.rb +2 -0
- data/lib/ragdoll.rb +17 -0
- data/lib/tasks/db.rake +13 -13
- metadata +371 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: cde84c4b5bbf1e8296bdd762ee78acb2f69663e493ce23b0941ada9d1201bdcd
|
4
|
+
data.tar.gz: f8bc456d3c536a295920bc1c806974b2b39f08977a8761604c7a192b83e756d2
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c1ce0e46be45fe8004930ec231a83a59f31039f4908be2a0e0ba67043237f1ea03bc00991820f6928a6ef5baa6ca910547876f21ddad5a7ead2d6384192e7708
|
7
|
+
data.tar.gz: e3f50e1205b4ba755c6a978acb06240b7b1fa729f4fa9bef33f956a9b245ad3d3323612f300902051237ffa71a763fc6db8d8e0fedc4f2761c46a977b42d6958
|
data/CHANGELOG.md
ADDED
@@ -0,0 +1,201 @@
|
|
1
|
+
# Changelog
|
2
|
+
|
3
|
+
All notable changes to the Ragdoll Core project will be documented in this file.
|
4
|
+
|
5
|
+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
6
|
+
|
7
|
+
## [Unreleased]
|
8
|
+
|
9
|
+
*Note: These features will be included in the next release (likely v0.1.9) featuring comprehensive search tracking and analytics capabilities.*
|
10
|
+
|
11
|
+
### Added
|
12
|
+
- **Initial CHANGELOG**: Added comprehensive CHANGELOG.md following Keep a Changelog format
|
13
|
+
- Complete version history from git log analysis
|
14
|
+
- Feature status tracking (implemented vs planned)
|
15
|
+
- Migration guides and breaking changes documentation
|
16
|
+
- Structured release notes with proper categorization
|
17
|
+
- **Search Tracking System**: Comprehensive analytics with query embeddings, click-through tracking, and performance monitoring
|
18
|
+
- Automatic search recording with vector embeddings for similarity analysis
|
19
|
+
- Click-through rate tracking and user engagement monitoring
|
20
|
+
- Session and user behavior tracking capabilities
|
21
|
+
- Performance metrics including execution time and result quality analysis
|
22
|
+
- Search similarity analysis using vector embeddings
|
23
|
+
- Automatic cleanup of orphaned and unused searches
|
24
|
+
- **Enhanced README**: Updated documentation with search tracking examples and analytics usage
|
25
|
+
- Comprehensive search analytics examples and usage patterns
|
26
|
+
- Updated API examples to use proper top-level Ragdoll methods
|
27
|
+
- Added search tracking configuration and usage examples
|
28
|
+
- **API Method Consistency**: Added `hybrid_search` delegation to top-level Ragdoll namespace
|
29
|
+
- Complete documentation with examples and parameter descriptions
|
30
|
+
- Consistent API experience across all search methods
|
31
|
+
- Verified method availability at both Ragdoll and Ragdoll::Core levels
|
32
|
+
|
33
|
+
### Fixed
|
34
|
+
- **Model Resolution Warning**: Fixed "undefined method 'empty?' for an instance of Ragdoll::Core::Model" warning
|
35
|
+
- Added defensive `empty?` method to Model class
|
36
|
+
- Enhanced constructor to handle polymorphic Model objects
|
37
|
+
- Added nil/empty checks in embedding service
|
38
|
+
|
39
|
+
### Changed
|
40
|
+
- **Test Coverage**: Added coverage directory to .gitignore for cleaner repository state
|
41
|
+
|
42
|
+
### Technical Details
|
43
|
+
- Commits: `9186067`, `cb952d3`, `e902a5f`, `632527b`
|
44
|
+
- All changes maintain backward compatibility
|
45
|
+
- No breaking API changes
|
46
|
+
|
47
|
+
## [0.1.8] - 2025-01-04
|
48
|
+
|
49
|
+
### Added
|
50
|
+
- **Search Analytics Foundation**: Added `Ragdoll::Search` model with query embedding and result tracking capabilities
|
51
|
+
- **Embedding Service Enhancements**: Fallback mechanism for model resolution in embedding service
|
52
|
+
- **Test Coverage**: Added coverage directory to gitignore and improved test infrastructure
|
53
|
+
|
54
|
+
### Changed
|
55
|
+
- Updated Gemfile.lock with latest gem versions
|
56
|
+
- Enhanced runtime dependencies and version management
|
57
|
+
|
58
|
+
### Fixed
|
59
|
+
- Package directory exclusion in gitignore
|
60
|
+
|
61
|
+
## [0.1.7] - 2025-01-04
|
62
|
+
|
63
|
+
### Added
|
64
|
+
- **Multi-Modal Content Models**: Added AudioContent model for comprehensive audio processing support
|
65
|
+
- **Background Job Processing**: New Ragdoll job classes for asynchronous document processing
|
66
|
+
- **Metadata Schemas**: Structured metadata schemas for text and image documents with validation
|
67
|
+
|
68
|
+
### Changed
|
69
|
+
- Updated ragdoll gem dependencies
|
70
|
+
- Improved submodule management for documentation
|
71
|
+
|
72
|
+
## [0.1.6] - 2025-01-04
|
73
|
+
|
74
|
+
### Added
|
75
|
+
- **Documentation Restructure**: Replaced local docs with ragdoll-docs submodule
|
76
|
+
- **Conventional Commits**: Updated and restructured Conventional Commits specification
|
77
|
+
- **CI/CD Improvements**: Enhanced GitHub Actions workflow and dropped JRuby support for RMagick compatibility
|
78
|
+
|
79
|
+
### Fixed
|
80
|
+
- Test skipping logic for CI environments
|
81
|
+
- Automated release workflow adjustments
|
82
|
+
|
83
|
+
## [0.1.5] - 2025-01-04
|
84
|
+
|
85
|
+
### Added
|
86
|
+
- Enhanced document processing pipeline
|
87
|
+
- Improved error handling and logging
|
88
|
+
|
89
|
+
### Fixed
|
90
|
+
- Version management and release process refinements
|
91
|
+
|
92
|
+
## [0.1.4] - 2025-01-04
|
93
|
+
|
94
|
+
### Added
|
95
|
+
- Extended multi-modal architecture support
|
96
|
+
- Performance optimizations for large document processing
|
97
|
+
|
98
|
+
### Changed
|
99
|
+
- Refined version numbering and release process
|
100
|
+
|
101
|
+
## [0.1.3] - 2025-01-04
|
102
|
+
|
103
|
+
### Added
|
104
|
+
- **Core RAG Architecture**: Multi-modal RAG (Retrieval-Augmented Generation) library built on ActiveRecord
|
105
|
+
- **PostgreSQL + pgvector Integration**: High-performance semantic search with vector similarity
|
106
|
+
- **Polymorphic Content Architecture**: Unified handling of text, image, and audio content types
|
107
|
+
- **Dual Metadata Design**: Separation of LLM-generated content analysis and system file properties
|
108
|
+
- **Document Processing Pipeline**: Support for PDF, DOCX, HTML, Markdown, and plain text files
|
109
|
+
- **Embedding Generation**: Text chunking and vector embedding creation with multiple LLM provider support
|
110
|
+
- **Semantic Search**: Cosine similarity search with usage analytics
|
111
|
+
- **Background Processing**: ActiveJob integration for asynchronous document processing
|
112
|
+
- **Logging System**: Configurable file-based logging with multiple levels
|
113
|
+
|
114
|
+
### Technical Features
|
115
|
+
- **Database Schema**: Multi-modal polymorphic architecture optimized for PostgreSQL
|
116
|
+
- **IVFFlat Indexing**: Fast approximate nearest neighbor search for vector similarity
|
117
|
+
- **Connection Pooling**: High-concurrency support for production workloads
|
118
|
+
- **Configuration Management**: Comprehensive configuration system for LLM providers and processing settings
|
119
|
+
|
120
|
+
## [0.1.1] - 2024-12-XX
|
121
|
+
|
122
|
+
### Added
|
123
|
+
- Initial project structure and basic functionality
|
124
|
+
- Core document management capabilities
|
125
|
+
- Basic search and retrieval features
|
126
|
+
|
127
|
+
## [0.0.2] - 2024-12-XX
|
128
|
+
|
129
|
+
### Added
|
130
|
+
- Initial alpha release
|
131
|
+
- Basic RAG architecture foundation
|
132
|
+
- PostgreSQL database integration
|
133
|
+
|
134
|
+
---
|
135
|
+
|
136
|
+
## Feature Status
|
137
|
+
|
138
|
+
### ✅ Fully Implemented
|
139
|
+
- **Text Document Processing**: PDF, DOCX, HTML, Markdown, plain text files
|
140
|
+
- **Embedding Generation**: Text chunking and vector embedding creation
|
141
|
+
- **Database Schema**: Multi-modal polymorphic architecture with PostgreSQL + pgvector
|
142
|
+
- **Dual Metadata Architecture**: Separate LLM-generated content analysis and file properties
|
143
|
+
- **Search Functionality**: Semantic search with cosine similarity and usage analytics
|
144
|
+
- **Search Tracking System**: Comprehensive analytics with query embeddings, click-through tracking, and performance monitoring
|
145
|
+
- **Document Management**: Add, update, delete, list operations
|
146
|
+
- **Background Processing**: ActiveJob integration for async embedding generation
|
147
|
+
- **LLM Metadata Generation**: AI-powered structured content analysis with schema validation
|
148
|
+
- **Logging**: Configurable file-based logging with multiple levels
|
149
|
+
|
150
|
+
### 🚧 In Development
|
151
|
+
- **Image Processing**: Framework exists but vision AI integration needs completion
|
152
|
+
- **Audio Processing**: Framework exists but speech-to-text integration needs completion
|
153
|
+
- **Hybrid Search**: Combining semantic and full-text search capabilities
|
154
|
+
|
155
|
+
### 📋 Planned Features
|
156
|
+
- **Multi-modal Search**: Search across text, image, and audio content types
|
157
|
+
- **Content-type Specific Embedding Models**: Different models for text, image, audio
|
158
|
+
- **Enhanced Metadata Schemas**: Domain-specific metadata templates
|
159
|
+
|
160
|
+
---
|
161
|
+
|
162
|
+
## Migration Guide
|
163
|
+
|
164
|
+
### From 0.1.7 to 0.1.8
|
165
|
+
- New search tracking tables will be automatically created via migrations
|
166
|
+
- No breaking changes to existing API
|
167
|
+
- Search tracking is enabled by default but can be disabled per search
|
168
|
+
|
169
|
+
### From 0.1.6 to 0.1.7
|
170
|
+
- AudioContent model added - existing installations will auto-migrate
|
171
|
+
- New background job classes available for improved processing
|
172
|
+
- Metadata schemas provide enhanced validation
|
173
|
+
|
174
|
+
### From 0.1.5 to 0.1.6
|
175
|
+
- Documentation moved to submodule - update local references
|
176
|
+
- CI/CD improvements may affect development workflows
|
177
|
+
- JRuby support removed due to RMagick dependency
|
178
|
+
|
179
|
+
---
|
180
|
+
|
181
|
+
## Breaking Changes
|
182
|
+
|
183
|
+
### Version 0.1.6
|
184
|
+
- **JRuby Support Removed**: RMagick dependency incompatibility
|
185
|
+
- **Documentation Structure**: Local docs replaced with submodule
|
186
|
+
|
187
|
+
---
|
188
|
+
|
189
|
+
## Contributors
|
190
|
+
|
191
|
+
- **Dewayne VanHoozer** - Primary developer and maintainer
|
192
|
+
|
193
|
+
---
|
194
|
+
|
195
|
+
## License
|
196
|
+
|
197
|
+
This project is licensed under the MIT License - see the LICENSE file for details.
|
198
|
+
|
199
|
+
---
|
200
|
+
|
201
|
+
*This changelog is automatically maintained and reflects the actual implementation status of features.*
|
data/README.md
CHANGED
@@ -18,17 +18,63 @@
|
|
18
18
|
</table>
|
19
19
|
</div>
|
20
20
|
|
21
|
-
# Ragdoll
|
21
|
+
# Ragdoll
|
22
22
|
|
23
23
|
Database-oriented multi-modal RAG (Retrieval-Augmented Generation) library built on ActiveRecord. Features PostgreSQL + pgvector for high-performance semantic search, polymorphic content architecture, and dual metadata design for sophisticated document analysis.
|
24
24
|
|
25
|
+
## Overview
|
26
|
+
|
27
|
+
Ragdoll is a database-first, multi-modal Retrieval-Augmented Generation (RAG) library for Ruby. It pairs PostgreSQL + pgvector with an ActiveRecord-driven schema to deliver fast, production-grade semantic search and clean data modeling. Today it ships with robust text processing; image and audio pipelines are scaffolded and actively being completed.
|
28
|
+
|
29
|
+
The library emphasizes a dual-metadata design: LLM-derived semantic metadata for understanding content, and system file metadata for managing assets. With built-in analytics, background processing, and a high-level API, you can go from ingest to answer quickly—and scale confidently.
|
30
|
+
|
31
|
+
### Why Ragdoll?
|
32
|
+
|
33
|
+
- Database-first foundation on ActiveRecord (PostgreSQL + pgvector only) for performance and reliability
|
34
|
+
- Multi-modal architecture (text today; image/audio next) via polymorphic content design
|
35
|
+
- Dual metadata model separating semantic analysis from file properties
|
36
|
+
- Provider-agnostic LLM integration via `ruby_llm` (OpenAI, Anthropic, Google)
|
37
|
+
- Production-friendly: background jobs, connection pooling, indexing, and search analytics
|
38
|
+
- Simple, ergonomic high-level API to keep your application code clean
|
39
|
+
|
40
|
+
### Key Capabilities
|
41
|
+
|
42
|
+
- Semantic search with vector similarity (cosine) across polymorphic content
|
43
|
+
- Text ingestion, chunking, and embedding generation
|
44
|
+
- LLM-powered structured metadata with schema validation
|
45
|
+
- Search tracking and analytics (CTR, performance, similarity of queries)
|
46
|
+
- Hybrid search (semantic + full-text) planned
|
47
|
+
- Extensible model and configuration system
|
48
|
+
|
49
|
+
## Table of Contents
|
50
|
+
|
51
|
+
- [Quick Start](#quick-start)
|
52
|
+
- [API Overview](#api-overview)
|
53
|
+
- [Search and Retrieval](#search-and-retrieval)
|
54
|
+
- [Search Analytics and Tracking](#search-analytics-and-tracking)
|
55
|
+
- [System Operations](#system-operations)
|
56
|
+
- [Configuration](#configuration)
|
57
|
+
- [Current Implementation Status](#current-implementation-status)
|
58
|
+
- [Architecture Highlights](#architecture-highlights)
|
59
|
+
- [Text Document Processing](#text-document-processing-current)
|
60
|
+
- [PostgreSQL + pgvector Configuration](#postgresql--pgvector-configuration)
|
61
|
+
- [Performance Features](#performance-features)
|
62
|
+
- [Installation](#installation)
|
63
|
+
- [Requirements](#requirements)
|
64
|
+
- [Use Cases](#use-cases)
|
65
|
+
- [Environment Variables](#environment-variables)
|
66
|
+
- [Troubleshooting](#troubleshooting)
|
67
|
+
- [Related Projects](#related-projects)
|
68
|
+
- [Key Design Principles](#key-design-principles)
|
69
|
+
- [Contributing & Support](#contributing--support)
|
70
|
+
|
25
71
|
## Quick Start
|
26
72
|
|
27
73
|
```ruby
|
28
74
|
require 'ragdoll'
|
29
75
|
|
30
76
|
# Configure with PostgreSQL + pgvector
|
31
|
-
Ragdoll::Core.configure do |config|
|
77
|
+
Ragdoll.configure do |config|
|
32
78
|
# Database configuration (PostgreSQL only)
|
33
79
|
config.database_config = {
|
34
80
|
adapter: 'postgresql',
|
@@ -55,22 +101,22 @@ Ragdoll::Core.configure do |config|
|
|
55
101
|
end
|
56
102
|
|
57
103
|
# Add documents - returns detailed result
|
58
|
-
result = Ragdoll
|
104
|
+
result = Ragdoll.add_document(path: 'research_paper.pdf')
|
59
105
|
puts result[:message] # "Document 'research_paper' added successfully with ID 123"
|
60
106
|
doc_id = result[:document_id]
|
61
107
|
|
62
108
|
# Check document status
|
63
|
-
status = Ragdoll
|
109
|
+
status = Ragdoll.document_status(id: doc_id)
|
64
110
|
puts status[:message] # Shows processing status and embeddings count
|
65
111
|
|
66
112
|
# Search across content
|
67
|
-
results = Ragdoll
|
113
|
+
results = Ragdoll.search(query: 'neural networks')
|
68
114
|
|
69
115
|
# Get detailed document information
|
70
|
-
document = Ragdoll
|
116
|
+
document = Ragdoll.get_document(id: doc_id)
|
71
117
|
```
|
72
118
|
|
73
|
-
##
|
119
|
+
## API Overview
|
74
120
|
|
75
121
|
The `Ragdoll` module provides a convenient high-level API for common operations:
|
76
122
|
|
@@ -78,37 +124,37 @@ The `Ragdoll` module provides a convenient high-level API for common operations:
|
|
78
124
|
|
79
125
|
```ruby
|
80
126
|
# Add single document - returns detailed result hash
|
81
|
-
result = Ragdoll
|
127
|
+
result = Ragdoll.add_document(path: 'document.pdf')
|
82
128
|
puts result[:success] # true
|
83
129
|
puts result[:document_id] # "123"
|
84
130
|
puts result[:message] # "Document 'document' added successfully with ID 123"
|
85
131
|
puts result[:embeddings_queued] # true
|
86
132
|
|
87
133
|
# Check document processing status
|
88
|
-
status = Ragdoll
|
134
|
+
status = Ragdoll.document_status(id: result[:document_id])
|
89
135
|
puts status[:status] # "processed"
|
90
136
|
puts status[:embeddings_count] # 15
|
91
137
|
puts status[:embeddings_ready] # true
|
92
138
|
puts status[:message] # "Document processed successfully with 15 embeddings"
|
93
139
|
|
94
140
|
# Get detailed document information
|
95
|
-
document = Ragdoll
|
141
|
+
document = Ragdoll.get_document(id: result[:document_id])
|
96
142
|
puts document[:title] # "document"
|
97
143
|
puts document[:status] # "processed"
|
98
144
|
puts document[:embeddings_count] # 15
|
99
145
|
puts document[:content_length] # 5000
|
100
146
|
|
101
147
|
# Update document metadata
|
102
|
-
Ragdoll
|
148
|
+
Ragdoll.update_document(id: result[:document_id], title: 'New Title')
|
103
149
|
|
104
150
|
# Delete document
|
105
|
-
Ragdoll
|
151
|
+
Ragdoll.delete_document(id: result[:document_id])
|
106
152
|
|
107
153
|
# List all documents
|
108
|
-
documents = Ragdoll
|
154
|
+
documents = Ragdoll.list_documents(limit: 10)
|
109
155
|
|
110
156
|
# System statistics
|
111
|
-
stats = Ragdoll
|
157
|
+
stats = Ragdoll.stats
|
112
158
|
puts stats[:total_documents] # 50
|
113
159
|
puts stats[:total_embeddings] # 1250
|
114
160
|
```
|
@@ -117,15 +163,22 @@ puts stats[:total_embeddings] # 1250
|
|
117
163
|
|
118
164
|
```ruby
|
119
165
|
# Semantic search across all content types
|
120
|
-
results = Ragdoll
|
166
|
+
results = Ragdoll.search(query: 'artificial intelligence')
|
167
|
+
|
168
|
+
# Search with automatic tracking (default)
|
169
|
+
results = Ragdoll.search(
|
170
|
+
query: 'machine learning',
|
171
|
+
session_id: 123, # Optional: track user sessions
|
172
|
+
user_id: 456 # Optional: track by user
|
173
|
+
)
|
121
174
|
|
122
175
|
# Search specific content types
|
123
|
-
text_results = Ragdoll
|
124
|
-
image_results = Ragdoll
|
125
|
-
audio_results = Ragdoll
|
176
|
+
text_results = Ragdoll.search(query: 'machine learning', content_type: 'text')
|
177
|
+
image_results = Ragdoll.search(query: 'neural network diagram', content_type: 'image')
|
178
|
+
audio_results = Ragdoll.search(query: 'AI discussion', content_type: 'audio')
|
126
179
|
|
127
180
|
# Advanced search with metadata filters
|
128
|
-
results = Ragdoll::Core.search(
|
181
|
+
results = Ragdoll.search(
|
129
182
|
query: 'deep learning',
|
130
183
|
classification: 'research',
|
131
184
|
keywords: ['AI', 'neural networks'],
|
@@ -133,44 +186,77 @@ results = Ragdoll::Core.search(
|
|
133
186
|
)
|
134
187
|
|
135
188
|
# Get context for RAG applications
|
136
|
-
context = Ragdoll
|
189
|
+
context = Ragdoll.get_context(query: 'machine learning', limit: 5)
|
137
190
|
|
138
191
|
# Enhanced prompt with context
|
139
|
-
enhanced = Ragdoll
|
192
|
+
enhanced = Ragdoll.enhance_prompt(
|
140
193
|
prompt: 'What is machine learning?',
|
141
194
|
context_limit: 5
|
142
195
|
)
|
143
196
|
|
144
197
|
# Hybrid search combining semantic and full-text
|
145
|
-
results = Ragdoll
|
198
|
+
results = Ragdoll.hybrid_search(
|
146
199
|
query: 'neural networks',
|
147
200
|
semantic_weight: 0.7,
|
148
201
|
text_weight: 0.3
|
149
202
|
)
|
150
203
|
```
|
151
204
|
|
205
|
+
### Search Analytics and Tracking
|
206
|
+
|
207
|
+
Ragdoll automatically tracks all searches to provide comprehensive analytics and improve search relevance over time:
|
208
|
+
|
209
|
+
```ruby
|
210
|
+
# Get search analytics for the last 30 days
|
211
|
+
analytics = Ragdoll::Search.search_analytics(days: 30)
|
212
|
+
puts "Total searches: #{analytics[:total_searches]}"
|
213
|
+
puts "Unique queries: #{analytics[:unique_queries]}"
|
214
|
+
puts "Average execution time: #{analytics[:avg_execution_time]}ms"
|
215
|
+
puts "Click-through rate: #{analytics[:click_through_rate]}%"
|
216
|
+
|
217
|
+
# Find similar searches using vector similarity
|
218
|
+
search = Ragdoll::Search.first
|
219
|
+
similar_searches = search.nearest_neighbors(:query_embedding, distance: :cosine).limit(5)
|
220
|
+
|
221
|
+
similar_searches.each do |similar|
|
222
|
+
puts "Query: #{similar.query}"
|
223
|
+
puts "Similarity: #{similar.neighbor_distance}"
|
224
|
+
puts "Results: #{similar.results_count}"
|
225
|
+
end
|
226
|
+
|
227
|
+
# Track user interactions (clicks on search results)
|
228
|
+
search_result = Ragdoll::SearchResult.first
|
229
|
+
search_result.mark_as_clicked!
|
230
|
+
|
231
|
+
# Disable tracking for specific searches if needed
|
232
|
+
results = Ragdoll.search(
|
233
|
+
query: 'private query',
|
234
|
+
track_search: false
|
235
|
+
)
|
236
|
+
```
|
237
|
+
|
152
238
|
### System Operations
|
153
239
|
|
154
240
|
```ruby
|
155
241
|
# Get system statistics
|
156
|
-
stats = Ragdoll
|
242
|
+
stats = Ragdoll.stats
|
157
243
|
# Returns information about documents, content types, embeddings, etc.
|
158
244
|
|
159
245
|
# Health check
|
160
|
-
healthy = Ragdoll
|
246
|
+
healthy = Ragdoll.healthy?
|
161
247
|
|
162
248
|
# Get configuration
|
163
|
-
config = Ragdoll
|
249
|
+
config = Ragdoll.configuration
|
164
250
|
|
165
251
|
# Reset configuration (useful for testing)
|
166
|
-
Ragdoll
|
252
|
+
Ragdoll.reset_configuration!
|
167
253
|
```
|
168
254
|
|
169
255
|
### Configuration
|
170
256
|
|
171
257
|
```ruby
|
172
258
|
# Configure the system
|
173
|
-
Ragdoll::Core.configure do |config|
|
259
|
+
Ragdoll.configure do |config|
|
174
260
|
# Database configuration (PostgreSQL only - REQUIRED)
|
175
261
|
config.database_config = {
|
176
262
|
adapter: 'postgresql',
|
@@ -218,6 +304,7 @@ end
|
|
218
304
|
- **Database schema**: Multi-modal polymorphic architecture with PostgreSQL + pgvector
|
219
305
|
- **Dual metadata architecture**: Separate LLM-generated content analysis and file properties
|
220
306
|
- **Search functionality**: Semantic search with cosine similarity and usage analytics
|
307
|
+
- **Search tracking system**: Comprehensive analytics with query embeddings, click-through tracking, and performance monitoring
|
221
308
|
- **Document management**: Add, update, delete, list operations
|
222
309
|
- **Background processing**: ActiveJob integration for async embedding generation
|
223
310
|
- **LLM metadata generation**: AI-powered structured content analysis with schema validation
|
@@ -264,15 +351,16 @@ Currently, Ragdoll processes text documents through:
|
|
264
351
|
6. **Search**: Semantic search using cosine similarity with usage analytics
|
265
352
|
|
266
353
|
### Example Usage
|
354
|
+
|
267
355
|
```ruby
|
268
356
|
# Add a text document
|
269
|
-
result = Ragdoll
|
357
|
+
result = Ragdoll.add_document(path: 'document.pdf')
|
270
358
|
|
271
359
|
# Check processing status
|
272
|
-
status = Ragdoll
|
360
|
+
status = Ragdoll.document_status(id: result[:document_id])
|
273
361
|
|
274
362
|
# Search the content
|
275
|
-
results = Ragdoll
|
363
|
+
results = Ragdoll.search(query: 'machine learning')
|
276
364
|
```
|
277
365
|
|
278
366
|
## PostgreSQL + pgvector Configuration
|
@@ -293,7 +381,7 @@ psql -d ragdoll_production -c "CREATE EXTENSION IF NOT EXISTS vector;"
|
|
293
381
|
### Configuration Example
|
294
382
|
|
295
383
|
```ruby
|
296
|
-
Ragdoll
|
384
|
+
Ragdoll.configure do |config|
|
297
385
|
config.database_config = {
|
298
386
|
adapter: 'postgresql',
|
299
387
|
database: 'ragdoll_production',
|
@@ -337,11 +425,52 @@ gem 'ragdoll'
|
|
337
425
|
- **PostgreSQL**: 12+ with pgvector extension (REQUIRED - no other databases supported)
|
338
426
|
- **Dependencies**: activerecord, pg, pgvector, neighbor, ruby_llm, pdf-reader, docx, rubyzip, shrine, rmagick, opensearch-ruby, searchkick, ruby-progressbar
|
339
427
|
|
428
|
+
## Use Cases
|
429
|
+
|
430
|
+
- Internal knowledge bases and chat assistants grounded in your documents
|
431
|
+
- Product documentation and support search with analytics and relevance feedback
|
432
|
+
- Research corpora exploration (summaries, topics, similarity) across large text sets
|
433
|
+
- Incident retrospectives and operational analytics with searchable write-ups
|
434
|
+
- Media libraries preparing for text + image + audio pipelines (image/audio in progress)
|
435
|
+
|
436
|
+
## Environment Variables
|
437
|
+
|
438
|
+
Set the following as environment variables (do not commit secrets to source control):
|
439
|
+
|
440
|
+
- `OPENAI_API_KEY` — required for OpenAI models
|
441
|
+
- `OPENAI_ORGANIZATION` — optional, for OpenAI org scoping
|
442
|
+
- `OPENAI_PROJECT` — optional, for OpenAI project scoping
|
443
|
+
- `ANTHROPIC_API_KEY` — optional, for Anthropic models
|
444
|
+
- `GOOGLE_API_KEY` — optional, for Google models
|
445
|
+
- `DATABASE_PASSWORD` — your PostgreSQL password if not using peer auth
|
446
|
+
|
447
|
+
## Troubleshooting
|
448
|
+
|
449
|
+
### pgvector extension missing
|
450
|
+
|
451
|
+
- Ensure the extension is enabled in your database:
|
452
|
+
|
453
|
+
```bash
|
454
|
+
psql -d ragdoll_production -c "CREATE EXTENSION IF NOT EXISTS vector;"
|
455
|
+
```
|
456
|
+
|
457
|
+
- If the command fails, verify PostgreSQL and pgvector are installed and that you’re connecting to the correct database.
|
458
|
+
|
459
|
+
### Document stuck in "processing"
|
460
|
+
|
461
|
+
- Confirm your API keys are set and valid.
|
462
|
+
- Ensure `auto_migrate: true` in configuration (or run migrations if you manage schema yourself).
|
463
|
+
- Check logs at the path configured by `logging_config[:log_filepath]` for errors.
|
464
|
+
|
340
465
|
## Related Projects
|
341
466
|
|
342
467
|
- **ragdoll-cli**: Standalone CLI application using ragdoll
|
343
468
|
- **ragdoll-rails**: Rails engine with web interface for ragdoll
|
344
469
|
|
470
|
+
## Contributing & Support
|
471
|
+
|
472
|
+
Contributions are welcome! If you find a bug or have a feature request, please open an issue or submit a pull request. For questions and feedback, open an issue in this repository.
|
473
|
+
|
345
474
|
## Key Design Principles
|
346
475
|
|
347
476
|
1. **Database-Oriented**: Built on ActiveRecord with PostgreSQL + pgvector for production performance
|
data/Rakefile
CHANGED
data/app/models/ragdoll/embedding.rb
CHANGED
@@ -11,6 +11,8 @@ module Ragdoll
|
|
11
11
|
has_neighbors :embedding_vector
|
12
12
|
|
13
13
|
belongs_to :embeddable, polymorphic: true
|
14
|
+
has_many :search_results, class_name: "Ragdoll::SearchResult", dependent: :destroy
|
15
|
+
has_many :searches, through: :search_results
|
14
16
|
|
15
17
|
validates :embeddable_id, presence: true
|
16
18
|
validates :embeddable_type, presence: true
|
@@ -72,6 +74,24 @@ module Ragdoll
|
|
72
74
|
search_with_pgvector(query_embedding, scope, limit, threshold)
|
73
75
|
end
|
74
76
|
|
77
|
+
# Enhanced search that returns both results and similarity statistics
|
78
|
+
def self.search_similar_with_stats(query_embedding, limit: 20, threshold: 0.8, filters: {})
|
79
|
+
# Apply filters
|
80
|
+
scope = all
|
81
|
+
scope = scope.where(embeddable_id: filters[:embeddable_id]) if filters[:embeddable_id]
|
82
|
+
scope = scope.where(embeddable_type: filters[:embeddable_type]) if filters[:embeddable_type]
|
83
|
+
scope = scope.by_model(filters[:embedding_model]) if filters[:embedding_model]
|
84
|
+
|
85
|
+
# Document-level filters require joining through embeddable (STI Content) to documents
|
86
|
+
if filters[:document_type]
|
87
|
+
scope = scope.joins("JOIN ragdoll_contents ON ragdoll_contents.id = ragdoll_embeddings.embeddable_id")
|
88
|
+
.joins("JOIN ragdoll_documents ON ragdoll_documents.id = ragdoll_contents.document_id")
|
89
|
+
.where("ragdoll_documents.document_type = ?", filters[:document_type])
|
90
|
+
end
|
91
|
+
|
92
|
+
search_with_pgvector_stats(query_embedding, scope, limit, threshold)
|
93
|
+
end
|
94
|
+
|
75
95
|
# Fast search using pgvector with neighbor gem
|
76
96
|
def self.search_with_pgvector(query_embedding, scope, limit, threshold)
|
77
97
|
# Use pgvector for similarity search
|
@@ -103,6 +123,60 @@ module Ragdoll
|
|
103
123
|
results
|
104
124
|
end
|
105
125
|
|
126
|
+
# Enhanced search with statistics
|
127
|
+
def self.search_with_pgvector_stats(query_embedding, scope, limit, threshold)
|
128
|
+
# Use pgvector for similarity search - get more results to analyze
|
129
|
+
# Note: We convert to array immediately to avoid SQL conflicts with count operations
|
130
|
+
neighbor_results = scope
|
131
|
+
.includes(:embeddable)
|
132
|
+
.nearest_neighbors(:embedding_vector, query_embedding, distance: "cosine")
|
133
|
+
.limit([limit * 3, 50].max) # Get enough for statistics
|
134
|
+
.to_a # Convert to array to avoid SQL conflicts
|
135
|
+
|
136
|
+
results = []
|
137
|
+
all_similarities = []
|
138
|
+
highest_similarity = 0.0
|
139
|
+
lowest_similarity = 1.0
|
140
|
+
total_checked = neighbor_results.length
|
141
|
+
|
142
|
+
neighbor_results.each do |embedding|
|
143
|
+
# Calculate cosine similarity (neighbor returns distance, we want similarity)
|
144
|
+
similarity = 1.0 - embedding.neighbor_distance
|
145
|
+
all_similarities << similarity
|
146
|
+
|
147
|
+
highest_similarity = similarity if similarity > highest_similarity
|
148
|
+
lowest_similarity = similarity if similarity < lowest_similarity
|
149
|
+
|
150
|
+
next if similarity < threshold
|
151
|
+
|
152
|
+
usage_score = calculate_usage_score(embedding)
|
153
|
+
combined_score = similarity + usage_score
|
154
|
+
|
155
|
+
results << build_result_hash(embedding, query_embedding, similarity, highest_similarity,
|
156
|
+
usage_score, combined_score)
|
157
|
+
end
|
158
|
+
|
159
|
+
# Sort by combined score and limit
|
160
|
+
results = results.sort_by { |r| -r[:combined_score] }.take(limit)
|
161
|
+
mark_embeddings_as_used(results)
|
162
|
+
|
163
|
+
# Calculate statistics
|
164
|
+
stats = {
|
165
|
+
total_embeddings_checked: total_checked,
|
166
|
+
threshold_used: threshold,
|
167
|
+
highest_similarity: highest_similarity,
|
168
|
+
lowest_similarity: lowest_similarity,
|
169
|
+
average_similarity: all_similarities.empty? ? 0.0 : (all_similarities.sum / all_similarities.length),
|
170
|
+
similarities_above_threshold: all_similarities.count { |s| s >= threshold },
|
171
|
+
total_similarities_calculated: all_similarities.length
|
172
|
+
}
|
173
|
+
|
174
|
+
{
|
175
|
+
results: results,
|
176
|
+
statistics: stats
|
177
|
+
}
|
178
|
+
end
|
179
|
+
|
106
180
|
private
|
107
181
|
|
108
182
|
# Calculate usage score for ranking
|