ragdoll-rails 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +501 -0
- data/Rakefile +40 -0
- data/app/models/ragdoll/document.rb +120 -0
- data/app/models/ragdoll/embedding.rb +31 -0
- data/app/models/ragdoll/search.rb +201 -0
- data/config/initializers/ragdoll.rb +6 -0
- data/config/routes.rb +5 -0
- data/db/migrate/20250218123456_create_documents.rb +46 -0
- data/db/migrate/20250219123456_create_ragdoll_embeddings.rb +41 -0
- data/db/migrate/20250220123456_update_embeddings_vector_column.rb +41 -0
- data/db/migrate/20250223123457_add_metadata_and_foreign_key_to_ragdoll_tables.rb +37 -0
- data/db/migrate/20250225123456_add_summary_to_ragdoll_documents.rb +17 -0
- data/db/migrate/20250226123456_add_usage_tracking_to_ragdoll_embeddings.rb +28 -0
- data/lib/generators/ragdoll/init/init_generator.rb +26 -0
- data/lib/generators/ragdoll/init/templates/INSTALL +56 -0
- data/lib/generators/ragdoll/init/templates/ragdoll_config.rb +96 -0
- data/lib/ragdoll/rails/configuration.rb +33 -0
- data/lib/ragdoll/rails/engine.rb +32 -0
- data/lib/ragdoll/rails/version.rb +9 -0
- data/lib/ragdoll/rails.rb +29 -0
- data/lib/ragdoll-rails.rb +11 -0
- data/lib/tasks/rspec.rake +19 -0
- metadata +67 -0
@@ -0,0 +1,201 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Ragdoll
|
4
|
+
module Rails
|
5
|
+
class Search < ApplicationRecord
|
6
|
+
self.table_name = 'ragdoll_searches'
|
7
|
+
|
8
|
+
# Override dangerous attribute to allow access to model_name column
|
9
|
+
# Lets ActiveRecord generate attribute methods for the +model_name+ column.
# Any other name is handed to the inherited implementation unchanged.
#
# NOTE(review): an attribute reader called +model_name+ presumably shadows the
# ActiveModel naming helper of the same name on instances -- confirm nothing
# (error messages, routing helpers) depends on that helper for this model.
#
# @param name [String, Symbol] candidate attribute method name
# @return [Boolean] false for "model_name", otherwise whatever +super+ returns
def self.dangerous_attribute_method?(name)
  return false if name.to_s == 'model_name'

  super
end
|
12
|
+
|
13
|
+
# Validations
|
14
|
+
# Attribute constraints checked before a search row is written.
validates :query,
          presence: true,
          length: { minimum: 1, maximum: 10_000 }
validates :search_type,
          presence: true,
          inclusion: { in: %w[semantic keyword hybrid] }
validates :result_count,
          presence: true,
          numericality: { greater_than_or_equal_to: 0 }
# search_time is optional; when given it must be strictly positive.
validates :search_time,
          numericality: { greater_than: 0 },
          allow_nil: true
validates :model_name,
          presence: true,
          length: { maximum: 255 }
|
19
|
+
|
20
|
+
# Scopes
|
21
|
+
# Query scopes used by the analytics helpers below.
scope :recent, lambda { order(created_at: :desc) }
scope :by_type, lambda { |type| where(search_type: type) }
# "successful" means at least one result; "failed" means exactly zero.
scope :successful, lambda { where('result_count > 0') }
scope :failed, lambda { where(result_count: 0) }
scope :by_model, lambda { |model| where(model_name: model) }
# Everything created from +days+ days ago up to now (endless range).
scope :within_days, lambda { |days| where(created_at: days.days.ago..) }
# Searches whose recorded time exceeds +threshold+ (compared against search_time).
scope :slow_searches, lambda { |threshold = 2.0| where('search_time > ?', threshold) }
|
28
|
+
|
29
|
+
# Callbacks
|
30
|
+
# Lifecycle hooks: fill defaults before validating, tidy the query text
# before saving, and log the search after creation when analytics are enabled.
before_validation :set_defaults
before_save       :normalize_query
after_create      :update_analytics,
                  if: -> { Ragdoll.configuration.enable_search_analytics }
|
33
|
+
|
34
|
+
# Class methods
|
35
|
+
# Builds a summary report over the searches created in the last +days+ days.
#
# @param days [Integer] size of the reporting window, in days
# @return [Hash] totals, averages, success rate, top queries, per-type and
#   per-model counts, and timing statistics for the window
def self.analytics(days: 30)
  window = within_days(days)
  report = {}

  report[:total_searches] = window.count
  report[:unique_queries] = window.distinct.count(:query)
  # Averages are nil when the window is empty; report 0 in that case.
  report[:average_results] = window.average(:result_count)&.round(2) || 0
  report[:average_search_time] =
    window.where.not(search_time: nil).average(:search_time)&.round(3) || 0
  report[:success_rate] = calculate_success_rate(window)
  report[:most_common_queries] = most_common_queries(window)
  report[:search_types] = window.group(:search_type).count
  report[:models_used] = window.group(:model_name).count
  report[:performance_stats] = performance_statistics(window)

  report
end
|
50
|
+
|
51
|
+
# Ranks query strings by how often they occur.
#
# @param searches [#group] relation to aggregate (defaults to every search)
# @param limit [Integer] maximum number of entries to return
# @return [Array<Hash>] +{ query:, count: }+ hashes, most frequent first
def self.most_common_queries(searches = all, limit: 10)
  tallies = searches.group(:query).count
  ranked = tallies.sort_by { |_text, hits| -hits }

  ranked.first(limit).map do |text, hits|
    { query: text, count: hits }
  end
end
|
59
|
+
|
60
|
+
# Percentage of searches that returned at least one result.
#
# @param searches [#count, #successful] relation to measure (defaults to all)
# @return [Numeric] 0 when there are no searches, otherwise a percentage
#   rounded to two decimals
def self.calculate_success_rate(searches = all)
  total = searches.count
  return 0 if total.zero?

  hits = searches.successful.count
  (hits.to_f / total * 100).round(2)
end
|
67
|
+
|
68
|
+
# Timing statistics for the searches that recorded a search_time.
#
# The times are plucked once and emptiness is tested on the resulting array.
# This avoids asking the relation for +empty?+ and then plucking afterwards,
# which costs an extra round trip and can disagree if rows change in between.
#
# @param searches [#where, #slow_searches] relation to inspect (defaults to all)
# @return [Hash] empty when no search has a recorded time; otherwise
#   +fastest+, +slowest+, +median+, +percentile_95+ and +slow_search_count+
def self.performance_statistics(searches = all)
  times = searches.where.not(search_time: nil).pluck(:search_time).sort
  return {} if times.empty?

  {
    fastest: times.first,
    slowest: times.last,
    median: calculate_median(times),
    percentile_95: calculate_percentile(times, 95),
    # Uses the scope's default threshold.
    slow_search_count: searches.slow_searches.count
  }
end
|
82
|
+
|
83
|
+
# Median of an already-sorted list of numbers.
#
# @param sorted_array [Array<Numeric>] values in ascending order
# @return [Numeric] 0 for an empty list; the middle element for odd sizes;
#   the float mean of the two middle elements for even sizes
def self.calculate_median(sorted_array)
  size = sorted_array.length
  return 0 if size.zero?

  middle = size / 2
  return sorted_array[middle] if size.odd?

  (sorted_array[middle - 1] + sorted_array[middle]) / 2.0
end
|
93
|
+
|
94
|
+
# Picks the element at the requested percentile of an already-sorted list.
#
# The percentile is clamped to 0..100 so out-of-range requests resolve to the
# first or last element. Without the clamp, a value above 100 indexes past the
# end (nil) and a negative value indexes from the end of the array.
#
# @param sorted_array [Array<Numeric>] values in ascending order
# @param percentile [Numeric] requested percentile, expected in 0..100
# @return [Numeric] 0 for an empty list, otherwise the selected element
def self.calculate_percentile(sorted_array, percentile)
  return 0 if sorted_array.empty?

  bounded = percentile.clamp(0, 100)
  index = (bounded / 100.0 * (sorted_array.length - 1)).round
  sorted_array[index]
end
|
100
|
+
|
101
|
+
# Instance methods
|
102
|
+
# Whether the search returned at least one result.
#
# A nil result_count (for example on a record whose defaults have not been
# applied yet) is treated as zero instead of raising NoMethodError, matching
# +failed?+, which already tolerates nil.
#
# @return [Boolean]
def successful?
  result_count.to_i.positive?
end
|
105
|
+
|
106
|
+
# Whether the search came back with exactly zero results.
#
# @return [Boolean] true only when result_count equals 0
def failed?
  hits = result_count
  hits == 0
end
|
109
|
+
|
110
|
+
# Whether the recorded search time exceeds +threshold+.
#
# @param threshold [Numeric] limit compared against search_time
# @return [Boolean, nil] nil when no search_time was recorded
def slow?(threshold = 2.0)
  elapsed = search_time
  return elapsed unless elapsed

  elapsed > threshold
end
|
113
|
+
|
114
|
+
# Returns the stored query embedding in parsed form.
#
# @return [Object, nil] nil when no embedding is stored or when a String value
#   is not valid JSON; the JSON-parsed value for Strings; any other stored
#   value unchanged
def embedding_vector
  raw = query_embedding
  return nil unless raw
  return raw unless raw.is_a?(String)

  # String form: parse it as a JSON document.
  JSON.parse(raw)
rescue JSON::ParserError
  nil
end
|
126
|
+
|
127
|
+
# Extracts the list stored under "result_ids" in the results payload.
#
# @return [Object] the value under the string key, else the symbol key, else
#   an empty array; always an empty array when results is not a Hash
def result_ids
  payload = results
  return [] unless payload.is_a?(Hash)

  ids = payload['result_ids']
  ids ||= payload[:result_ids]
  ids || []
end
|
132
|
+
|
133
|
+
# Human-readable one-line description of the applied filters.
#
# @return [String] "None" when filters is blank, otherwise "key: value" pairs
#   joined with ", "
def filter_summary
  return 'None' if filters.blank?

  filters.map { |key, value| "#{key}: #{value}" }.join(', ')
end
|
143
|
+
|
144
|
+
# Buckets the recorded search time into a coarse speed label.
#
# Bands are checked in order, so a value sitting on a shared boundary
# (0.5, 1.0) takes the earlier, faster label.
#
# @return [String] "unknown" without a search_time; otherwise "fast",
#   "normal", "slow", or "very_slow" for anything outside the bands
def performance_category
  elapsed = search_time
  return 'unknown' unless elapsed

  bands = [
    [0..0.5, 'fast'],
    [0.5..1.0, 'normal'],
    [1.0..2.0, 'slow']
  ]
  _band, label = bands.find { |band, _name| band.cover?(elapsed) }
  label || 'very_slow'
end
|
158
|
+
|
159
|
+
# Flattens this search into a plain hash for reporting.
#
# @return [Hash] stored attributes plus the derived performance_category,
#   successful flag and filter summary, keyed by symbol
def to_analytics_hash
  summary = {}

  # Stored attributes
  summary[:id] = id
  summary[:query] = query
  summary[:search_type] = search_type
  summary[:result_count] = result_count
  summary[:search_time] = search_time

  # Derived values
  summary[:performance_category] = performance_category
  summary[:successful] = successful?

  summary[:model_name] = model_name
  summary[:filters] = filter_summary
  summary[:created_at] = created_at

  summary
end
|
173
|
+
|
174
|
+
private
|
175
|
+
|
176
|
+
# Fills in any attribute that is still unset before validation runs.
# The model name falls back to the configured embedding model.
def set_defaults
  fallbacks = {
    search_type: 'semantic',
    result_count: 0,
    filters: {},
    results: {}
  }
  fallbacks.each do |attribute, fallback|
    public_send("#{attribute}=", fallback) unless public_send(attribute)
  end

  self.model_name ||= Ragdoll.configuration.embedding_model
end
|
183
|
+
|
184
|
+
# Tidies the query text: trims the ends, collapses every whitespace run to a
# single space, and shortens anything still longer than 10,000 characters.
#
# NOTE(review): the length validation on query already caps it at 10,000
# characters, so the truncation presumably only matters when validations
# are skipped -- confirm.
def normalize_query
  raw = query
  return unless raw

  self.query = raw.strip.gsub(/\s+/, ' ')
  return unless query.length > 10_000

  self.query = query.truncate(10_000)
end
|
193
|
+
|
194
|
+
# Placeholder analytics hook: writes a debug line describing the search.
#
# The logger is reached through ::Rails because this class sits inside
# Ragdoll::Rails, where a bare +Rails+ constant resolves to that module
# rather than to the framework.
def update_analytics
  ::Rails.logger.debug "Search recorded: #{query} (#{result_count} results in #{search_time}s)"
end
|
199
|
+
end
|
200
|
+
end
|
201
|
+
end
|
data/config/routes.rb
ADDED
@@ -0,0 +1,46 @@
|
|
1
|
+
# Creates the core documents table for the Ragdoll RAG (Retrieval-Augmented Generation) system.
|
2
|
+
# This table stores document metadata, content, and processing status information.
|
3
|
+
class CreateRagdollDocuments < ActiveRecord::Migration[8.0]
  # Enables the required PostgreSQL extensions, creates ragdoll_documents,
  # and adds its lookup indexes.
  def change
    # pg_trgm: trigram matching; fuzzystrmatch: fuzzy string matching;
    # vector: embedding similarity. Each is enabled only if not already present.
    %w[pg_trgm fuzzystrmatch vector].each do |extension|
      enable_extension extension unless extension_enabled?(extension)
    end

    create_table :ragdoll_documents,
                 comment: 'Core documents table storing files and content for RAG processing. Each row represents a document that can be chunked into embeddings for semantic search and AI retrieval.' do |t|
      # Source location of the original document
      t.string :location, null: false,
               comment: 'File system path or URL to the original document. Required field that uniquely identifies the source location of the document for re-processing or reference.'

      # Extracted content and its summary
      t.text :content,
             comment: 'Full extracted text content of the document. This is the raw text that will be chunked and embedded for semantic search. May be large for documents like PDFs or web pages.'
      t.text :summary,
             comment: 'AI-generated summary of the document content. Created during processing to provide quick overviews and improve search relevance. Generated by summarization models.'

      # Classification
      t.string :document_type,
               comment: 'Classification of the document type (e.g., "pdf", "text", "markdown", "html"). Used for applying type-specific processing rules and display formatting.'
      t.string :title,
               comment: 'Human-readable title of the document. May be extracted from filename, document metadata, or content analysis. Used for display and identification purposes.'
      t.string :source_type,
               comment: 'Origin source of the document (e.g., "file", "url", "api", "upload"). Helps track how the document entered the system for auditing and re-processing.'

      # Chunking settings
      t.integer :chunk_size,
                comment: 'Number of characters per chunk when splitting document for embedding. Larger chunks capture more context but may exceed model limits. Typical values: 500-2000 characters.'
      t.integer :chunk_overlap,
                comment: 'Number of characters to overlap between consecutive chunks. Prevents context loss at chunk boundaries. Typically 10-20% of chunk_size to maintain coherence.'

      # Free-form metadata
      t.jsonb :metadata, default: {},
              comment: 'Flexible JSON storage for document-specific metadata such as author, creation date, file size, extraction settings, or custom tags. Indexed with GIN for efficient querying.'

      # Processing lifecycle
      t.datetime :processing_started_at,
                 comment: 'Timestamp when document processing (chunking and embedding generation) began. Used for monitoring processing duration and identifying stuck jobs.'
      t.datetime :processing_finished_at,
                 comment: 'Timestamp when document processing completed successfully or failed. Used with started_at to calculate processing time and identify performance issues.'
      t.string :status, default: 'pending',
               comment: 'Current processing status of the document. Values: "pending" (not yet processed), "processing" (currently being processed), "processed" (successfully completed), "failed" (processing error occurred). Used for job queue management and user feedback.'

      t.timestamps comment: 'Standard Rails created_at and updated_at timestamps for tracking when document records are created and modified in the database.'
    end

    table = :ragdoll_documents
    add_index table, :location, unique: true,
              comment: 'Unique constraint on location prevents duplicate documents and enables fast lookups by file path or URL'
    add_index table, :document_type,
              comment: 'Index for filtering documents by type, commonly used in admin interfaces and type-specific processing queries'
    add_index table, :status,
              comment: 'Index for filtering by processing status, critical for job queue management and monitoring dashboard queries'
    add_index table, :metadata, using: :gin,
              comment: 'GIN index on JSONB metadata field enables efficient queries on nested JSON properties and full-text search within metadata'
    add_index table, :processing_started_at,
              comment: 'Index for sorting and filtering by processing start time, used for monitoring processing queues and performance analysis'
  end
end
|
@@ -0,0 +1,41 @@
|
|
1
|
+
# Creates the embeddings table for storing document chunks and their vector representations.
|
2
|
+
# This table is the core of the RAG system's semantic search capabilities.
|
3
|
+
class CreateRagdollEmbeddings < ActiveRecord::Migration[8.0]
  # Creates ragdoll_embeddings (one row per document chunk) and its indexes.
  def change
    create_table :ragdoll_embeddings, comment: 'Stores document chunks and their vector embeddings for semantic search. Each row represents a chunk of text from a document along with its AI-generated vector representation for similarity matching.' do |t|
      # Parent document. index: false because the commented index on
      # document_id is added explicitly below; letting t.references create its
      # default index as well would produce two indexes with the same name.
      t.references :document, null: false, index: false, foreign_key: { to_table: :ragdoll_documents }, comment: 'Foreign key reference to the parent document in ragdoll_documents table. Required field that links each chunk back to its source document for context and retrieval.'

      # Text of this chunk
      t.text :content, null: false, comment: 'The actual text content of this chunk extracted from the parent document. Required field containing the text that will be used for semantic matching and returned in search results. Typically 500-2000 characters.'

      # pgvector column holding the embedding
      t.vector :embedding, limit: 1536, comment: 'High-dimensional vector representation of the content generated by AI embedding models. Stored in pgvector format for efficient cosine similarity calculations. Dimension typically 1536 for OpenAI models.'

      # Which model produced the embedding
      t.string :model_name, comment: 'Name or identifier of the AI model used to generate this embedding (e.g., "text-embedding-3-small", "sentence-transformers/all-MiniLM-L6-v2"). Critical for ensuring embedding compatibility during search operations.'

      # Token usage
      t.integer :token_count, comment: 'Number of tokens consumed by the AI model when generating this embedding. Used for cost tracking, billing analysis, and optimizing chunk sizes to minimize API costs while maintaining quality.'

      # Position of the chunk inside its document
      t.integer :chunk_index, comment: 'Sequential position of this chunk within the parent document (0-based). Used for maintaining document order, reconstructing original text flow, and providing contextual information in search results.'

      # Free-form metadata
      t.jsonb :metadata, default: {}, comment: 'Flexible JSON storage for chunk-specific metadata such as section headers, page numbers, processing parameters, or semantic tags. Indexed with GIN for efficient querying of nested properties.'

      # Kind of content embedded
      t.string :embedding_type, default: 'text', comment: 'Type of content that was embedded (e.g., "text", "code", "table", "heading"). Allows for type-specific search strategies and filtering. Default "text" covers most document content.'

      t.timestamps comment: 'Standard Rails created_at and updated_at timestamps for tracking when embedding records are created and modified. Critical for debugging and performance analysis.'
    end

    add_index :ragdoll_embeddings, :document_id, comment: 'Index for efficiently retrieving all chunks belonging to a specific document. Essential for document reprocessing and chunk management operations.'
    add_index :ragdoll_embeddings, :chunk_index, comment: 'Index for sorting chunks by their position within documents. Used for maintaining document order and providing sequential context in search results.'
    add_index :ragdoll_embeddings, :embedding_type, comment: 'Index for filtering embeddings by content type. Enables type-specific search strategies (e.g., searching only code chunks or headings) and analytics on content distribution.'
    add_index :ragdoll_embeddings, :metadata, using: :gin, comment: 'GIN index on JSONB metadata field enables efficient queries on nested JSON properties and supports complex filtering on chunk-specific attributes.'
    add_index :ragdoll_embeddings, :embedding, using: :hnsw, opclass: :vector_cosine_ops, comment: 'Hierarchical Navigable Small World (HNSW) index optimized for cosine similarity searches on vector embeddings. Critical for fast semantic search performance at scale.'
  end
end
|
@@ -0,0 +1,41 @@
|
|
1
|
+
# Updates the embeddings table to support variable vector dimensions and optimize queries.
|
2
|
+
# Keeps the pgvector column type but removes its fixed dimension limit, and records each vector's dimension count in a new column.
|
3
|
+
class UpdateEmbeddingsVectorColumn < ActiveRecord::Migration[8.0]
  # Drops the fixed dimension from the embedding column, adds and backfills
  # embedding_dimensions, then indexes it alone and together with model_name.
  #
  # NOTE(review): pgvector HNSW indexes presumably need a column with fixed
  # dimensions -- confirm this type change succeeds while the HNSW index on
  # embedding still exists.
  def up
    table = :ragdoll_embeddings

    # limit: nil removes the dimension constraint on the vector column
    change_column table, :embedding, :vector,
                  limit: nil,
                  comment: 'High-dimensional vector representation with variable dimensions removed to support multiple embedding models. Allows for mixing different model outputs while maintaining pgvector compatibility.'

    add_column table, :embedding_dimensions, :integer,
               comment: 'Number of dimensions in the embedding vector (e.g., 1536 for OpenAI text-embedding-3-small). Critical for ensuring embedding compatibility during similarity searches and preventing dimension mismatches.'

    # Backfill the dimension count from the vectors already stored
    execute <<~SQL
      UPDATE ragdoll_embeddings
      SET embedding_dimensions = array_length(embedding::real[], 1)
      WHERE embedding IS NOT NULL
    SQL

    add_index table, :embedding_dimensions,
              comment: 'Index for filtering embeddings by vector dimension size. Essential for ensuring only compatible embeddings are compared during similarity searches and avoiding dimension mismatch errors.'

    add_index table, [:model_name, :embedding_dimensions],
              name: 'index_ragdoll_embeddings_on_model_and_dimensions',
              comment: 'Composite index on model name and vector dimensions. Optimizes the common query pattern of finding embeddings from the same model with matching dimensions for similarity calculations.'
  end

  # Removes both indexes and the dimensions column, then restores the
  # 1536 limit on the embedding column.
  def down
    table = :ragdoll_embeddings

    remove_index table, :embedding_dimensions
    remove_index table, name: 'index_ragdoll_embeddings_on_model_and_dimensions'
    remove_column table, :embedding_dimensions

    # Restoring the limit will fail if vectors of other dimensions are stored
    change_column table, :embedding, :vector, limit: 1536
  end
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
# Creates the searches table for tracking user search queries and analytics.
|
2
|
+
# This table enables search analytics, caching, and performance monitoring for the RAG system.
|
3
|
+
class CreateRagdollSearches < ActiveRecord::Migration[8.0]
  # Creates ragdoll_searches (one row per executed search) and its indexes.
  def change
    create_table :ragdoll_searches,
                 comment: 'Tracks user search queries and results for analytics and performance monitoring. Each row represents a search performed by a user, storing both the query and metadata about results.' do |t|
      # Query text and its embedding
      t.text :query, null: false,
             comment: 'The original search query text entered by the user. Required field that captures the exact search terms for analytics, popular queries tracking, and search result caching.'
      t.vector :query_embedding, limit: 1536,
               comment: 'Vector embedding of the search query. Generated using the same model as document embeddings to enable semantic similarity calculations. Stored in pgvector format for efficient similarity operations.'

      # How the search was run
      t.string :search_type, default: 'semantic',
               comment: 'Type of search performed (e.g., "semantic", "keyword", "hybrid"). Allows for different search strategies and helps analyze which search methods are most effective for users.'
      t.jsonb :filters, default: {},
              comment: 'JSON object containing search filters applied (e.g., document_type, date_range, status). Used for analyzing how users refine searches and for recreating search contexts.'

      # What came back
      t.jsonb :results, default: {},
              comment: 'JSON object containing search result metadata such as result IDs, similarity scores, and ranking information. Enables search result caching and detailed analytics on result quality.'
      t.integer :result_count, default: 0,
                comment: 'Number of results returned by this search. Used for analytics on search effectiveness and identifying queries that return too few or too many results.'

      # Timing
      t.float :search_time,
              comment: 'Time in seconds taken to execute this search query. Critical for performance monitoring, identifying slow queries, and optimizing search algorithms.'

      # Embedding model used for the query
      t.string :model_name,
               comment: 'Name of the AI model used to generate the query embedding (e.g., "text-embedding-3-small"). Ensures search compatibility and enables analytics by model performance.'

      t.timestamps comment: 'Standard Rails created_at and updated_at timestamps. The created_at timestamp is particularly important for search analytics and identifying search patterns over time.'
    end

    table = :ragdoll_searches
    add_index table, :search_type,
              comment: 'Index for filtering searches by type, used in analytics dashboards to compare effectiveness of different search strategies.'
    add_index table, :query_embedding, using: :hnsw, opclass: :vector_cosine_ops,
              comment: 'HNSW index on query embeddings for finding similar queries. Enables query recommendation, duplicate detection, and search clustering analysis.'
    add_index table, :created_at,
              comment: 'Index for time-based queries and analytics. Essential for generating search trend reports, popular queries over time, and performance monitoring.'
  end
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
# Adds summary metadata tracking to documents table for AI-generated summaries.
|
2
|
+
# Enhances the existing summary field with provenance and timing information.
|
3
|
+
class AddSummaryMetadataToRagdollDocuments < ActiveRecord::Migration[8.0]
  # Adds summary_generated_at and summary_model to ragdoll_documents and
  # indexes both columns.
  def change
    table = :ragdoll_documents

    # When the summary was produced
    add_column table, :summary_generated_at, :timestamp,
               comment: 'Timestamp when the AI-generated summary was created. Used for cache invalidation, determining summary freshness, and analytics on summary generation performance.'

    # Which model produced it
    add_column table, :summary_model, :string,
               comment: 'Name/identifier of the AI model used to generate the summary (e.g., "gpt-3.5-turbo", "claude-3-haiku"). Critical for tracking summary quality, cost analysis, and ensuring consistency in summary style.'

    add_index table, :summary_generated_at,
              comment: 'Index for filtering and sorting documents by summary generation time. Used for cache invalidation queries and identifying documents with stale summaries that need regeneration.'
    add_index table, :summary_model,
              comment: 'Index for filtering documents by the AI model used for summarization. Enables analytics on summary quality by model and batch re-summarization with newer models.'
  end
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
# Adds usage tracking functionality to embeddings for intelligent ranking and caching.
|
2
|
+
# Enables the system to learn which embeddings are most valuable and prioritize them in search results.
|
3
|
+
class AddUsageTrackingToRagdollEmbeddings < ActiveRecord::Migration[8.0]
  # Adds usage_count and returned_at to ragdoll_embeddings, indexes them
  # individually and together, then runs a one-way backfill statement.
  def change
    table = :ragdoll_embeddings

    # How often the embedding has been returned
    add_column table, :usage_count, :integer,
               default: 0, null: false,
               comment: 'Number of times this embedding has been returned in search results. Incremented each time the embedding appears in search results. Used for frequency-based ranking to surface more relevant content.'

    # When it was last returned
    add_column table, :returned_at, :timestamp,
               comment: 'Timestamp of the most recent time this embedding was returned in search results. Used for recency-based ranking algorithms and cache warming strategies. NULL indicates never used.'

    add_index table, :usage_count,
              name: 'index_ragdoll_embeddings_on_usage_count',
              comment: 'Index for sorting embeddings by usage frequency. Critical for popularity-based ranking algorithms and identifying most/least used content.'
    add_index table, :returned_at,
              name: 'index_ragdoll_embeddings_on_returned_at',
              comment: 'Index for sorting embeddings by recency of use. Enables temporal ranking algorithms and cache warming strategies based on recent usage patterns.'

    # Frequency and recency combined
    add_index table, [:usage_count, :returned_at],
              name: 'index_ragdoll_embeddings_on_usage_and_recency',
              comment: 'Composite index for complex ranking algorithms that combine usage frequency with recency. Optimizes queries that balance popular content with recently accessed content for intelligent search result ranking.'

    # Runs only when migrating up; nothing is undone on rollback. The column
    # is declared above with default 0 and null: false.
    reversible do |direction|
      direction.up do
        execute "UPDATE ragdoll_embeddings SET usage_count = 0 WHERE usage_count IS NULL"
      end
    end
  end
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'rails/generators'
|
4
|
+
|
5
|
+
module Ragdoll
|
6
|
+
module Generators
|
7
|
+
# Generator that copies the Ragdoll initializer template into the host
# application and prints the INSTALL notes.
#
# The framework is referenced as ::Rails throughout: this class is nested in
# the Ragdoll namespace, so once Ragdoll::Rails is loaded a bare +Rails+
# constant would resolve to that module instead of the framework.
class InitGenerator < ::Rails::Generators::Base
  desc "Create Ragdoll configuration initializer"
  source_root File.expand_path("templates", __dir__)

  # Renders templates/ragdoll_config.rb into config/initializers.
  def create_initializer_file
    template "ragdoll_config.rb", "config/initializers/ragdoll_config.rb"
  end

  # Shows the INSTALL text, but only when generating (not when reverting).
  def show_readme
    readme "INSTALL" if behavior == :invoke
  end

  private

  # Underscored top-level namespace of the host application's class name.
  def application_name
    ::Rails.application.class.name.split("::").first.underscore
  end
end
|
25
|
+
end
|
26
|
+
end
|
@@ -0,0 +1,56 @@
|
|
1
|
+
===============================================================================
|
2
|
+
|
3
|
+
Ragdoll Configuration Created Successfully!
|
4
|
+
|
5
|
+
===============================================================================
|
6
|
+
|
7
|
+
The Ragdoll initializer has been created at:
|
8
|
+
config/initializers/ragdoll_config.rb
|
9
|
+
|
10
|
+
Next steps:
|
11
|
+
|
12
|
+
1. Configure your LLM provider API keys in environment variables:
|
13
|
+
|
14
|
+
For OpenAI:
|
15
|
+
export OPENAI_API_KEY="your-api-key-here"
|
16
|
+
|
17
|
+
For Anthropic:
|
18
|
+
export ANTHROPIC_API_KEY="your-api-key-here"
|
19
|
+
|
20
|
+
For Google:
|
21
|
+
export GOOGLE_API_KEY="your-api-key-here"
|
22
|
+
export GOOGLE_PROJECT_ID="your-project-id"
|
23
|
+
|
24
|
+
2. Install and run database migrations:
|
25
|
+
|
26
|
+
rails ragdoll:install:migrations
|
27
|
+
rails db:migrate
|
28
|
+
|
29
|
+
3. Ensure you have PostgreSQL with pgvector extension installed:
|
30
|
+
|
31
|
+
# In PostgreSQL console:
|
32
|
+
CREATE EXTENSION IF NOT EXISTS vector;
|
33
|
+
|
34
|
+
4. Configure Sidekiq for background job processing (optional):
|
35
|
+
|
36
|
+
# Add to your Gemfile if not already present:
|
37
|
+
gem 'sidekiq'
|
38
|
+
|
39
|
+
# Start Sidekiq:
|
40
|
+
bundle exec sidekiq
|
41
|
+
|
42
|
+
5. Start using Ragdoll in your Rails application:
|
43
|
+
|
44
|
+
# Add documents
|
45
|
+
Ragdoll.add_document('/path/to/document.pdf')
|
46
|
+
|
47
|
+
# Enhance prompts with context
|
48
|
+
enhanced = Ragdoll.enhance_prompt('How do I configure the database?')
|
49
|
+
|
50
|
+
# Use with your LLM
|
51
|
+
response = RubyLLM.ask(enhanced[:enhanced_prompt])
|
52
|
+
|
53
|
+
For more information, visit:
|
54
|
+
https://github.com/MadBomber/ragdoll-rails
|
55
|
+
|
56
|
+
===============================================================================
|
@@ -0,0 +1,96 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# Ragdoll RAG (Retrieval-Augmented Generation) Configuration
|
4
|
+
# This initializer configures the Ragdoll Rails engine for your application.
|
5
|
+
|
6
|
+
Ragdoll.configure do |config|
|
7
|
+
# LLM Provider Configuration
|
8
|
+
# Supported providers: :openai, :anthropic, :google, :azure, :ollama, :huggingface
|
9
|
+
config.llm_provider = :openai
|
10
|
+
|
11
|
+
# Optional: Use a different provider for embeddings (defaults to llm_provider)
|
12
|
+
# config.embedding_provider = :openai
|
13
|
+
|
14
|
+
# Provider-specific API configurations
|
15
|
+
# Add your API keys and configuration here
|
16
|
+
config.llm_config = {
|
17
|
+
openai: {
|
18
|
+
api_key: ENV['OPENAI_API_KEY']
|
19
|
+
# organization: ENV['OPENAI_ORGANIZATION'], # optional
|
20
|
+
# project: ENV['OPENAI_PROJECT'] # optional
|
21
|
+
},
|
22
|
+
anthropic: {
|
23
|
+
api_key: ENV['ANTHROPIC_API_KEY']
|
24
|
+
},
|
25
|
+
google: {
|
26
|
+
api_key: ENV['GOOGLE_API_KEY'],
|
27
|
+
project_id: ENV['GOOGLE_PROJECT_ID']
|
28
|
+
},
|
29
|
+
azure: {
|
30
|
+
api_key: ENV['AZURE_API_KEY'],
|
31
|
+
endpoint: ENV['AZURE_ENDPOINT'],
|
32
|
+
api_version: ENV['AZURE_API_VERSION']
|
33
|
+
},
|
34
|
+
ollama: {
|
35
|
+
endpoint: ENV['OLLAMA_ENDPOINT'] || 'http://localhost:11434'
|
36
|
+
},
|
37
|
+
huggingface: {
|
38
|
+
api_key: ENV['HUGGINGFACE_API_KEY']
|
39
|
+
}
|
40
|
+
}
|
41
|
+
|
42
|
+
# Embedding Model Configuration
|
43
|
+
# Examples (OpenAI): 'text-embedding-3-small', 'text-embedding-3-large', 'text-embedding-ada-002'
|
44
|
+
config.embedding_model = 'text-embedding-3-small'
|
45
|
+
|
46
|
+
# Default model for chat/completion
|
47
|
+
config.default_model = 'gpt-4o-mini'
|
48
|
+
|
49
|
+
# Text Processing Configuration
|
50
|
+
config.chunk_size = 1000
|
51
|
+
config.chunk_overlap = 200
|
52
|
+
|
53
|
+
# Search Configuration
|
54
|
+
config.search_similarity_threshold = 0.7
|
55
|
+
config.max_search_results = 10
|
56
|
+
|
57
|
+
# Vector Configuration
|
58
|
+
# Maximum dimensions supported (supports variable-length vectors)
|
59
|
+
config.max_embedding_dimensions = 3072
|
60
|
+
|
61
|
+
# Background Jobs Configuration
|
62
|
+
# Set to false if you don't want to use background jobs for document processing
|
63
|
+
config.use_background_jobs = true
|
64
|
+
|
65
|
+
# Analytics and Caching Configuration
|
66
|
+
config.enable_search_analytics = true
|
67
|
+
config.cache_embeddings = true
|
68
|
+
|
69
|
+
# Custom Prompt Template (optional)
|
70
|
+
# Use {{context}} and {{prompt}} placeholders
|
71
|
+
# config.prompt_template = <<~TEMPLATE
|
72
|
+
# Based on the following context, please answer the question.
|
73
|
+
#
|
74
|
+
# Context:
|
75
|
+
# {{context}}
|
76
|
+
#
|
77
|
+
# Question: {{prompt}}
|
78
|
+
#
|
79
|
+
# Answer:
|
80
|
+
# TEMPLATE
|
81
|
+
end
|
82
|
+
|
83
|
+
# Optional: Configure Rails-specific settings
|
84
|
+
Ragdoll::Rails.configure do |config|
|
85
|
+
# Enable/disable background job processing (Rails-side setting; use_background_jobs is also set in Ragdoll.configure above)
|
86
|
+
config.use_background_jobs = true
|
87
|
+
|
88
|
+
# Background job queue name
|
89
|
+
config.queue_name = :ragdoll
|
90
|
+
|
91
|
+
# Maximum file size for uploads (in bytes)
|
92
|
+
config.max_file_size = 10.megabytes
|
93
|
+
|
94
|
+
# Allowed file types for document upload (file extensions, without the leading dot)
|
95
|
+
config.allowed_file_types = %w[pdf docx txt md html htm json xml csv]
|
96
|
+
end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Ragdoll
|
4
|
+
module Rails
|
5
|
+
class Configuration
|
6
|
+
# Rails-specific configuration options
|
7
|
+
attr_accessor :use_background_jobs, :job_queue, :job_adapter, :queue_name, :max_file_size, :allowed_file_types
|
8
|
+
|
9
|
+
def initialize
|
10
|
+
@use_background_jobs = true
|
11
|
+
@job_queue = :default
|
12
|
+
@job_adapter = :sidekiq
|
13
|
+
@queue_name = :ragdoll
|
14
|
+
@max_file_size = 10 * 1024 * 1024 # 10MB
|
15
|
+
@allowed_file_types = %w[pdf docx txt md html htm json xml csv]
|
16
|
+
end
|
17
|
+
|
18
|
+
def configure_core
|
19
|
+
# Hook for delegating to the core ragdoll gem configuration (currently a no-op)
|
20
|
+
# TODO: apply these Rails-specific settings to the core ragdoll gem configuration
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
24
|
+
def self.configuration
|
25
|
+
@configuration ||= Configuration.new
|
26
|
+
end
|
27
|
+
|
28
|
+
def self.configure
|
29
|
+
yield(configuration)
|
30
|
+
configuration.configure_core
|
31
|
+
end
|
32
|
+
end
|
33
|
+
end
|