ragdoll-rails 0.0.1 → 0.1.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 2ac0f85d4a125cc90ca0a5187ae327eeb1c88de8380c3a5929ec79c52d7db54a
4
- data.tar.gz: 7eb055ea9faecbdbdb5a51cce6256ed5350de814447de02d20d09e6a7dd3a86f
3
+ metadata.gz: c65d1b42c610da8ca427a88eae05c3e7636d2b3521a2c0ecccf09efc55af335d
4
+ data.tar.gz: d1ca9240447923aeeb896c57da732d92b874fa3ebf2645306d74f5ee435e642d
5
5
  SHA512:
6
- metadata.gz: 9e52d30d90f0641b02f7b32fab536f3465666c38ab06d9008ae90a67c0fec9e8037614049adb63479607dad320d15e6a51d32e469eca7a80b4439029c5aa031a
7
- data.tar.gz: a5397b0ae44775dd35bddf5a4551bfc456f6c856958695ee9fae0da0117b4ccfef89c612ca0e9182eb310866b61c0b42d5e7302b2160186c6144297a075239d6
6
+ metadata.gz: e656136397c64bbb15923dea45d071dc7ea8a71f3d8334ea7ed69ee1e08d2c067b0358af02e996f822f8fdf57908bfbc47de75aadebdbbe811e5273b242b0594
7
+ data.tar.gz: 1ca6e7f05d39f3135d2623cb3d2c12a6d7822553ed462f991631e4b3d05be6b0e77e667913b72219f08ce06145937b42f9366fa803cb92133adce6221c621cee
@@ -21,13 +21,5 @@ module Ragdoll
21
21
  end
22
22
  end
23
23
 
24
- def self.configuration
25
- @configuration ||= Configuration.new
26
- end
27
-
28
- def self.configure
29
- yield(configuration)
30
- configuration.configure_core
31
- end
32
24
  end
33
25
  end
@@ -4,6 +4,6 @@
4
4
 
5
5
  module Ragdoll
6
6
  module Rails
7
- VERSION = "0.0.1"
7
+ VERSION = "0.1.8"
8
8
  end
9
9
  end
data/lib/ragdoll/rails.rb CHANGED
@@ -2,7 +2,12 @@
2
2
 
3
3
  require_relative 'rails/version'
4
4
  require_relative 'rails/configuration'
5
- require_relative 'rails/engine'
5
+ begin
6
+ require_relative 'rails/engine'
7
+ rescue LoadError, NameError => e
8
+ # Skip engine loading if Rails is not available (e.g., in tests)
9
+ puts "Warning: Could not load Rails engine: #{e.message}" if ENV['RAGDOLL_VERBOSE']
10
+ end
6
11
 
7
12
  module Ragdoll
8
13
  module Rails
data/lib/ragdoll-rails.rb CHANGED
@@ -6,6 +6,7 @@ require_relative "ragdoll/rails"
6
6
  # The ragdoll gem provides the core business logic functionality
7
7
  begin
8
8
  require 'ragdoll'
9
- rescue LoadError
10
- raise LoadError, "The ragdoll gem is required for ragdoll-rails to function. Please add 'gem \"ragdoll\"' to your Gemfile."
9
+ rescue LoadError => e
10
+ puts "Warning: Could not load ragdoll gem: #{e.message}"
11
+ puts "Please ensure 'gem \"ragdoll\"' is in your Gemfile"
11
12
  end
metadata CHANGED
@@ -1,14 +1,42 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ragdoll-rails
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.1.8
5
5
  platform: ruby
6
6
  authors:
7
7
  - Dewayne VanHoozer
8
8
  bindir: bin
9
9
  cert_chain: []
10
10
  date: 1980-01-02 00:00:00.000000000 Z
11
- dependencies: []
11
+ dependencies:
12
+ - !ruby/object:Gem::Dependency
13
+ name: ragdoll
14
+ requirement: !ruby/object:Gem::Requirement
15
+ requirements:
16
+ - - ">="
17
+ - !ruby/object:Gem::Version
18
+ version: 0.1.0
19
+ type: :runtime
20
+ prerelease: false
21
+ version_requirements: !ruby/object:Gem::Requirement
22
+ requirements:
23
+ - - ">="
24
+ - !ruby/object:Gem::Version
25
+ version: 0.1.0
26
+ - !ruby/object:Gem::Dependency
27
+ name: rails
28
+ requirement: !ruby/object:Gem::Requirement
29
+ requirements:
30
+ - - ">="
31
+ - !ruby/object:Gem::Version
32
+ version: 7.0.0
33
+ type: :runtime
34
+ prerelease: false
35
+ version_requirements: !ruby/object:Gem::Requirement
36
+ requirements:
37
+ - - ">="
38
+ - !ruby/object:Gem::Version
39
+ version: 7.0.0
12
40
  description: Rails engine providing ActiveRecord integration, background jobs, and
13
41
  UI components for the Ragdoll RAG (Retrieval-Augmented Generation) system
14
42
  email:
@@ -19,17 +47,8 @@ extra_rdoc_files: []
19
47
  files:
20
48
  - README.md
21
49
  - Rakefile
22
- - app/models/ragdoll/document.rb
23
- - app/models/ragdoll/embedding.rb
24
- - app/models/ragdoll/search.rb
25
50
  - config/initializers/ragdoll.rb
26
51
  - config/routes.rb
27
- - db/migrate/20250218123456_create_documents.rb
28
- - db/migrate/20250219123456_create_ragdoll_embeddings.rb
29
- - db/migrate/20250220123456_update_embeddings_vector_column.rb
30
- - db/migrate/20250223123457_add_metadata_and_foreign_key_to_ragdoll_tables.rb
31
- - db/migrate/20250225123456_add_summary_to_ragdoll_documents.rb
32
- - db/migrate/20250226123456_add_usage_tracking_to_ragdoll_embeddings.rb
33
52
  - lib/generators/ragdoll/init/init_generator.rb
34
53
  - lib/generators/ragdoll/init/templates/INSTALL
35
54
  - lib/generators/ragdoll/init/templates/ragdoll_config.rb
@@ -1,120 +0,0 @@
1
- # This file defines the Rails-specific Document model for the Ragdoll Rails engine.
2
- # This model is separate from Ragdoll::Core::Models::Document to avoid conflicts.
3
-
4
- # frozen_string_literal: true
5
-
6
- module Ragdoll
7
- module Rails
8
- class Document < ApplicationRecord
9
- self.table_name = 'ragdoll_documents'
10
-
11
- # Associations
12
- has_many :ragdoll_embeddings, class_name: 'Ragdoll::Rails::Embedding', foreign_key: 'document_id', dependent: :destroy
13
- has_one_attached :file if respond_to?(:has_one_attached)
14
-
15
- # Validations
16
- validates :location, presence: true, uniqueness: true
17
- validates :status, inclusion: { in: %w[pending processing completed failed] }
18
- validates :chunk_size, numericality: { greater_than: 0 }, allow_nil: true
19
- validates :chunk_overlap, numericality: { greater_than_or_equal_to: 0 }, allow_nil: true
20
-
21
- # Scopes
22
- scope :completed, -> { where(status: 'completed') }
23
- scope :failed, -> { where(status: 'failed') }
24
- scope :processing, -> { where(status: 'processing') }
25
- scope :pending, -> { where(status: 'pending') }
26
- scope :by_type, ->(type) { where(document_type: type) }
27
- scope :with_summaries, -> { where.not(summary: nil) }
28
- scope :needs_summary, -> { where(summary: nil).completed }
29
-
30
- # Search configuration
31
- searchkick text_middle: [:title, :summary, :content, :metadata_name, :metadata_summary] if defined?(Searchkick)
32
-
33
- def search_data
34
- return {} unless defined?(Searchkick)
35
-
36
- {
37
- title: title,
38
- summary: summary,
39
- content: content,
40
- metadata_name: metadata&.dig('name'),
41
- metadata_summary: metadata&.dig('summary'),
42
- document_type: document_type,
43
- status: status
44
- }
45
- end
46
-
47
- # Summary-related methods
48
- def has_summary?
49
- summary.present?
50
- end
51
-
52
- def summary_stale?
53
- return false unless has_summary?
54
- return true unless summary_generated_at
55
-
56
- # Consider summary stale if document was updated after summary generation
57
- updated_at > summary_generated_at
58
- end
59
-
60
- def needs_summary?
61
- return false unless content.present?
62
- # Business logic should be handled by ragdoll gem
63
- # TODO: Delegate to Ragdoll.needs_summary?(content, summary, summary_generated_at)
64
-
65
- !has_summary? || summary_stale?
66
- end
67
-
68
- def summary_word_count
69
- return 0 unless summary.present?
70
- summary.split.length
71
- end
72
-
73
- def regenerate_summary!
74
- # Business logic for summary generation should be handled by the ragdoll gem
75
- # This is a placeholder that delegates to the core ragdoll functionality
76
- return false unless content.present?
77
-
78
- # TODO: Delegate to Ragdoll gem's summarization functionality
79
- # summarization_result = Ragdoll.generate_summary(content, options)
80
- # Update the model with the result
81
-
82
- Rails.logger.warn "Summary regeneration not implemented - should delegate to ragdoll gem"
83
- false
84
- end
85
-
86
- # Processing status helpers
87
- def completed?
88
- status == 'completed'
89
- end
90
-
91
- def failed?
92
- status == 'failed'
93
- end
94
-
95
- def processing?
96
- status == 'processing'
97
- end
98
-
99
- def pending?
100
- status == 'pending'
101
- end
102
-
103
- # Content helpers
104
- def word_count
105
- return 0 unless content.present?
106
- content.split.length
107
- end
108
-
109
- def character_count
110
- return 0 unless content.present?
111
- content.length
112
- end
113
-
114
- def processing_duration
115
- return nil unless processing_started_at && processing_finished_at
116
- processing_finished_at - processing_started_at
117
- end
118
- end
119
- end
120
- end
@@ -1,31 +0,0 @@
1
- # This file defines the Rails-specific Embedding model for the Ragdoll Rails engine.
2
- # This model is separate from Ragdoll::Core::Models::Embedding to avoid conflicts.
3
-
4
- # frozen_string_literal: true
5
-
6
- module Ragdoll
7
- module Rails
8
- class Embedding < ApplicationRecord
9
- searchkick text_middle: [:metadata_content, :metadata_propositions] if defined?(Searchkick)
10
-
11
- belongs_to :document, class_name: 'Ragdoll::Rails::Document'
12
-
13
- # Override dangerous attribute to allow access to model_name column
14
- def self.dangerous_attribute_method?(name)
15
- name.to_s == 'model_name' ? false : super
16
- end
17
-
18
- def search_data
19
- return {} unless defined?(Searchkick)
20
-
21
- {
22
- metadata_content: metadata['content'],
23
- metadata_propositions: metadata['propositions']
24
- }
25
- end
26
-
27
- # Assuming the vector column is named 'vector'
28
- neighbor :vector, method: :euclidean if respond_to?(:neighbor)
29
- end
30
- end
31
- end
@@ -1,201 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- module Ragdoll
4
- module Rails
5
- class Search < ApplicationRecord
6
- self.table_name = 'ragdoll_searches'
7
-
8
- # Override dangerous attribute to allow access to model_name column
9
- def self.dangerous_attribute_method?(name)
10
- name.to_s == 'model_name' ? false : super
11
- end
12
-
13
- # Validations
14
- validates :query, presence: true, length: { minimum: 1, maximum: 10000 }
15
- validates :search_type, presence: true, inclusion: { in: %w[semantic keyword hybrid] }
16
- validates :result_count, presence: true, numericality: { greater_than_or_equal_to: 0 }
17
- validates :search_time, numericality: { greater_than: 0 }, allow_nil: true
18
- validates :model_name, presence: true, length: { maximum: 255 }
19
-
20
- # Scopes
21
- scope :recent, -> { order(created_at: :desc) }
22
- scope :by_type, ->(type) { where(search_type: type) }
23
- scope :successful, -> { where('result_count > 0') }
24
- scope :failed, -> { where(result_count: 0) }
25
- scope :by_model, ->(model) { where(model_name: model) }
26
- scope :within_days, ->(days) { where(created_at: days.days.ago..) }
27
- scope :slow_searches, ->(threshold = 2.0) { where('search_time > ?', threshold) }
28
-
29
- # Callbacks
30
- before_validation :set_defaults
31
- before_save :normalize_query
32
- after_create :update_analytics, if: -> { Ragdoll.configuration.enable_search_analytics }
33
-
34
- # Class methods
35
- def self.analytics(days: 30)
36
- searches = within_days(days)
37
-
38
- {
39
- total_searches: searches.count,
40
- unique_queries: searches.distinct.count(:query),
41
- average_results: searches.average(:result_count)&.round(2) || 0,
42
- average_search_time: searches.where.not(search_time: nil).average(:search_time)&.round(3) || 0,
43
- success_rate: calculate_success_rate(searches),
44
- most_common_queries: most_common_queries(searches),
45
- search_types: searches.group(:search_type).count,
46
- models_used: searches.group(:model_name).count,
47
- performance_stats: performance_statistics(searches)
48
- }
49
- end
50
-
51
- def self.most_common_queries(searches = all, limit: 10)
52
- searches
53
- .group(:query)
54
- .count
55
- .sort_by { |_, count| -count }
56
- .first(limit)
57
- .map { |query, count| { query: query, count: count } }
58
- end
59
-
60
- def self.calculate_success_rate(searches = all)
61
- total = searches.count
62
- return 0 if total == 0
63
-
64
- successful_count = searches.successful.count
65
- (successful_count.to_f / total * 100).round(2)
66
- end
67
-
68
- def self.performance_statistics(searches = all)
69
- searches_with_time = searches.where.not(search_time: nil)
70
- return {} if searches_with_time.empty?
71
-
72
- times = searches_with_time.pluck(:search_time).sort
73
-
74
- {
75
- fastest: times.first,
76
- slowest: times.last,
77
- median: calculate_median(times),
78
- percentile_95: calculate_percentile(times, 95),
79
- slow_search_count: searches.slow_searches.count
80
- }
81
- end
82
-
83
- def self.calculate_median(sorted_array)
84
- length = sorted_array.length
85
- return 0 if length == 0
86
-
87
- if length.odd?
88
- sorted_array[length / 2]
89
- else
90
- (sorted_array[length / 2 - 1] + sorted_array[length / 2]) / 2.0
91
- end
92
- end
93
-
94
- def self.calculate_percentile(sorted_array, percentile)
95
- return 0 if sorted_array.empty?
96
-
97
- index = (percentile / 100.0 * (sorted_array.length - 1)).round
98
- sorted_array[index]
99
- end
100
-
101
- # Instance methods
102
- def successful?
103
- result_count > 0
104
- end
105
-
106
- def failed?
107
- result_count == 0
108
- end
109
-
110
- def slow?(threshold = 2.0)
111
- search_time && search_time > threshold
112
- end
113
-
114
- def embedding_vector
115
- return nil unless query_embedding
116
-
117
- if query_embedding.is_a?(String)
118
- # Handle string representation of vector
119
- JSON.parse(query_embedding)
120
- else
121
- query_embedding
122
- end
123
- rescue JSON::ParserError
124
- nil
125
- end
126
-
127
- def result_ids
128
- return [] unless results.is_a?(Hash)
129
-
130
- results['result_ids'] || results[:result_ids] || []
131
- end
132
-
133
- def filter_summary
134
- return 'None' if filters.blank?
135
-
136
- filter_parts = []
137
- filters.each do |key, value|
138
- filter_parts << "#{key}: #{value}"
139
- end
140
-
141
- filter_parts.join(', ')
142
- end
143
-
144
- def performance_category
145
- return 'unknown' unless search_time
146
-
147
- case search_time
148
- when 0..0.5
149
- 'fast'
150
- when 0.5..1.0
151
- 'normal'
152
- when 1.0..2.0
153
- 'slow'
154
- else
155
- 'very_slow'
156
- end
157
- end
158
-
159
- def to_analytics_hash
160
- {
161
- id: id,
162
- query: query,
163
- search_type: search_type,
164
- result_count: result_count,
165
- search_time: search_time,
166
- performance_category: performance_category,
167
- successful: successful?,
168
- model_name: model_name,
169
- filters: filter_summary,
170
- created_at: created_at
171
- }
172
- end
173
-
174
- private
175
-
176
- def set_defaults
177
- self.search_type ||= 'semantic'
178
- self.result_count ||= 0
179
- self.filters ||= {}
180
- self.results ||= {}
181
- self.model_name ||= Ragdoll.configuration.embedding_model
182
- end
183
-
184
- def normalize_query
185
- return unless query
186
-
187
- # Remove excessive whitespace
188
- self.query = query.strip.gsub(/\s+/, ' ')
189
-
190
- # Truncate if too long
191
- self.query = query.truncate(10000) if query.length > 10000
192
- end
193
-
194
- def update_analytics
195
- # This could be extended to update real-time analytics
196
- # For now, it's just a placeholder for future enhancements
197
- Rails.logger.debug "Search recorded: #{query} (#{result_count} results in #{search_time}s)"
198
- end
199
- end
200
- end
201
- end
@@ -1,46 +0,0 @@
1
- # Creates the core documents table for the Ragdoll RAG (Retrieval-Augmented Generation) system.
2
- # This table stores document metadata, content, and processing status information.
3
- class CreateRagdollDocuments < ActiveRecord::Migration[8.0]
4
- def change
5
- # Enable PostgreSQL extensions required for advanced text processing and vector operations
6
- enable_extension 'pg_trgm' unless extension_enabled?('pg_trgm') # Trigram matching for fuzzy text search
7
- enable_extension 'fuzzystrmatch' unless extension_enabled?('fuzzystrmatch') # Fuzzy string matching algorithms
8
- enable_extension 'vector' unless extension_enabled?('vector') # Vector operations for embedding similarity
9
-
10
- create_table :ragdoll_documents, comment: 'Core documents table storing files and content for RAG processing. Each row represents a document that can be chunked into embeddings for semantic search and AI retrieval.' do |t|
11
- # File system reference - where the original document is stored
12
- t.string :location, null: false, comment: 'File system path or URL to the original document. Required field that uniquely identifies the source location of the document for re-processing or reference.'
13
-
14
- # Document content and derived data
15
- t.text :content, comment: 'Full extracted text content of the document. This is the raw text that will be chunked and embedded for semantic search. May be large for documents like PDFs or web pages.'
16
- t.text :summary, comment: 'AI-generated summary of the document content. Created during processing to provide quick overviews and improve search relevance. Generated by summarization models.'
17
-
18
- # Document classification and metadata
19
- t.string :document_type, comment: 'Classification of the document type (e.g., "pdf", "text", "markdown", "html"). Used for applying type-specific processing rules and display formatting.'
20
- t.string :title, comment: 'Human-readable title of the document. May be extracted from filename, document metadata, or content analysis. Used for display and identification purposes.'
21
- t.string :source_type, comment: 'Origin source of the document (e.g., "file", "url", "api", "upload"). Helps track how the document entered the system for auditing and re-processing.'
22
-
23
- # Chunking configuration - controls how document is split for embedding
24
- t.integer :chunk_size, comment: 'Number of characters per chunk when splitting document for embedding. Larger chunks capture more context but may exceed model limits. Typical values: 500-2000 characters.'
25
- t.integer :chunk_overlap, comment: 'Number of characters to overlap between consecutive chunks. Prevents context loss at chunk boundaries. Typically 10-20% of chunk_size to maintain coherence.'
26
-
27
- # Flexible metadata storage for document-specific information
28
- t.jsonb :metadata, default: {}, comment: 'Flexible JSON storage for document-specific metadata such as author, creation date, file size, extraction settings, or custom tags. Indexed with GIN for efficient querying.'
29
-
30
- # Processing lifecycle tracking
31
- t.datetime :processing_started_at, comment: 'Timestamp when document processing (chunking and embedding generation) began. Used for monitoring processing duration and identifying stuck jobs.'
32
- t.datetime :processing_finished_at, comment: 'Timestamp when document processing completed successfully or failed. Used with started_at to calculate processing time and identify performance issues.'
33
- t.string :status, default: 'pending', comment: 'Current processing status of the document. Values: "pending" (not yet processed), "processing" (currently being processed), "processed" (successfully completed), "failed" (processing error occurred). Used for job queue management and user feedback.'
34
-
35
- # Standard Rails timestamps for audit trails
36
- t.timestamps comment: 'Standard Rails created_at and updated_at timestamps for tracking when document records are created and modified in the database.'
37
- end
38
-
39
- # Indexes for performance optimization
40
- add_index :ragdoll_documents, :location, unique: true, comment: 'Unique constraint on location prevents duplicate documents and enables fast lookups by file path or URL'
41
- add_index :ragdoll_documents, :document_type, comment: 'Index for filtering documents by type, commonly used in admin interfaces and type-specific processing queries'
42
- add_index :ragdoll_documents, :status, comment: 'Index for filtering by processing status, critical for job queue management and monitoring dashboard queries'
43
- add_index :ragdoll_documents, :metadata, using: :gin, comment: 'GIN index on JSONB metadata field enables efficient queries on nested JSON properties and full-text search within metadata'
44
- add_index :ragdoll_documents, :processing_started_at, comment: 'Index for sorting and filtering by processing start time, used for monitoring processing queues and performance analysis'
45
- end
46
- end
@@ -1,41 +0,0 @@
1
- # Creates the embeddings table for storing document chunks and their vector representations.
2
- # This table is the core of the RAG system's semantic search capabilities.
3
- class CreateRagdollEmbeddings < ActiveRecord::Migration[8.0]
4
- def change
5
- create_table :ragdoll_embeddings, comment: 'Stores document chunks and their vector embeddings for semantic search. Each row represents a chunk of text from a document along with its AI-generated vector representation for similarity matching.' do |t|
6
- # Parent document reference - establishes the chunk-to-document relationship
7
- t.references :document, null: false, foreign_key: { to_table: :ragdoll_documents }, comment: 'Foreign key reference to the parent document in ragdoll_documents table. Required field that links each chunk back to its source document for context and retrieval.'
8
-
9
- # The actual text content of this chunk
10
- t.text :content, null: false, comment: 'The actual text content of this chunk extracted from the parent document. Required field containing the text that will be used for semantic matching and returned in search results. Typically 500-2000 characters.'
11
-
12
- # Vector embedding for semantic search (pgvector format for optimal similarity calculations)
13
- t.vector :embedding, limit: 1536, comment: 'High-dimensional vector representation of the content generated by AI embedding models. Stored in pgvector format for efficient cosine similarity calculations. Dimension typically 1536 for OpenAI models.'
14
-
15
- # Model identification for compatibility and reprocessing
16
- t.string :model_name, comment: 'Name or identifier of the AI model used to generate this embedding (e.g., "text-embedding-3-small", "sentence-transformers/all-MiniLM-L6-v2"). Critical for ensuring embedding compatibility during search operations.'
17
-
18
- # Token usage tracking for cost analysis and optimization
19
- t.integer :token_count, comment: 'Number of tokens consumed by the AI model when generating this embedding. Used for cost tracking, billing analysis, and optimizing chunk sizes to minimize API costs while maintaining quality.'
20
-
21
- # Chunk ordering within the parent document
22
- t.integer :chunk_index, comment: 'Sequential position of this chunk within the parent document (0-based). Used for maintaining document order, reconstructing original text flow, and providing contextual information in search results.'
23
-
24
- # Flexible storage for chunk-specific metadata
25
- t.jsonb :metadata, default: {}, comment: 'Flexible JSON storage for chunk-specific metadata such as section headers, page numbers, processing parameters, or semantic tags. Indexed with GIN for efficient querying of nested properties.'
26
-
27
- # Classification of embedding content type
28
- t.string :embedding_type, default: 'text', comment: 'Type of content that was embedded (e.g., "text", "code", "table", "heading"). Allows for type-specific search strategies and filtering. Default "text" covers most document content.'
29
-
30
- # Standard Rails timestamps for audit trails
31
- t.timestamps comment: 'Standard Rails created_at and updated_at timestamps for tracking when embedding records are created and modified. Critical for debugging and performance analysis.'
32
- end
33
-
34
- # Performance indexes for efficient querying
35
- add_index :ragdoll_embeddings, :document_id, comment: 'Index for efficiently retrieving all chunks belonging to a specific document. Essential for document reprocessing and chunk management operations.'
36
- add_index :ragdoll_embeddings, :chunk_index, comment: 'Index for sorting chunks by their position within documents. Used for maintaining document order and providing sequential context in search results.'
37
- add_index :ragdoll_embeddings, :embedding_type, comment: 'Index for filtering embeddings by content type. Enables type-specific search strategies (e.g., searching only code chunks or headings) and analytics on content distribution.'
38
- add_index :ragdoll_embeddings, :metadata, using: :gin, comment: 'GIN index on JSONB metadata field enables efficient queries on nested JSON properties and supports complex filtering on chunk-specific attributes.'
39
- add_index :ragdoll_embeddings, :embedding, using: :hnsw, opclass: :vector_cosine_ops, comment: 'Hierarchical Navigable Small World (HNSW) index optimized for cosine similarity searches on vector embeddings. Critical for fast semantic search performance at scale.'
40
- end
41
- end
@@ -1,41 +0,0 @@
1
- # Updates the embeddings table to support variable vector dimensions and optimize queries.
2
- # Originally intended to convert to pgvector format but maintains text compatibility for broader database support.
3
- class UpdateEmbeddingsVectorColumn < ActiveRecord::Migration[8.0]
4
- def up
5
- # Remove the limit constraint to allow variable length vectors
6
- # Different AI models produce embeddings with different dimensions (e.g., 1536 for OpenAI, 384 for some sentence transformers)
7
- change_column :ragdoll_embeddings, :embedding, :vector, limit: nil, comment: 'High-dimensional vector representation with variable dimensions removed to support multiple embedding models. Allows for mixing different model outputs while maintaining pgvector compatibility.'
8
-
9
- # Add column to track embedding vector dimensions for validation and compatibility
10
- add_column :ragdoll_embeddings, :embedding_dimensions, :integer, comment: 'Number of dimensions in the embedding vector (e.g., 1536 for OpenAI text-embedding-3-small). Critical for ensuring embedding compatibility during similarity searches and preventing dimension mismatches.'
11
-
12
- # Update existing records to set their dimensions based on actual vector data
13
- # This ensures data integrity for existing embeddings
14
- execute <<~SQL
15
- UPDATE ragdoll_embeddings
16
- SET embedding_dimensions = array_length(embedding::real[], 1)
17
- WHERE embedding IS NOT NULL
18
- SQL
19
-
20
- # Add index on embedding_dimensions for efficient filtering by vector size
21
- # Used when searching embeddings to ensure only compatible vectors are compared
22
- add_index :ragdoll_embeddings, :embedding_dimensions, comment: 'Index for filtering embeddings by vector dimension size. Essential for ensuring only compatible embeddings are compared during similarity searches and avoiding dimension mismatch errors.'
23
-
24
- # Add composite index on model_name and embedding_dimensions for optimized similarity searches
25
- # This combination is frequently queried together when finding similar embeddings
26
- add_index :ragdoll_embeddings, [:model_name, :embedding_dimensions],
27
- name: 'index_ragdoll_embeddings_on_model_and_dimensions',
28
- comment: 'Composite index on model name and vector dimensions. Optimizes the common query pattern of finding embeddings from the same model with matching dimensions for similarity calculations.'
29
- end
30
-
31
- def down
32
- # Remove the new columns and indexes in reverse order
33
- remove_index :ragdoll_embeddings, :embedding_dimensions
34
- remove_index :ragdoll_embeddings, name: 'index_ragdoll_embeddings_on_model_and_dimensions'
35
- remove_column :ragdoll_embeddings, :embedding_dimensions
36
-
37
- # Note: The original plan was to restore vector type, but this would fail with mixed dimensions
38
- # Restore the original limit (this will fail if there are vectors with different dimensions)
39
- change_column :ragdoll_embeddings, :embedding, :vector, limit: 1536
40
- end
41
- end
@@ -1,37 +0,0 @@
1
- # Creates the searches table for tracking user search queries and analytics.
2
- # This table enables search analytics, caching, and performance monitoring for the RAG system.
3
- class CreateRagdollSearches < ActiveRecord::Migration[8.0]
4
- def change
5
- create_table :ragdoll_searches, comment: 'Tracks user search queries and results for analytics and performance monitoring. Each row represents a search performed by a user, storing both the query and metadata about results.' do |t|
6
- # User's original search query
7
- t.text :query, null: false, comment: 'The original search query text entered by the user. Required field that captures the exact search terms for analytics, popular queries tracking, and search result caching.'
8
-
9
- # Embedding of the search query for similarity calculations
10
- t.vector :query_embedding, limit: 1536, comment: 'Vector embedding of the search query. Generated using the same model as document embeddings to enable semantic similarity calculations. Stored in pgvector format for efficient similarity operations.'
11
-
12
- # Search classification and method
13
- t.string :search_type, default: 'semantic', comment: 'Type of search performed (e.g., "semantic", "keyword", "hybrid"). Allows for different search strategies and helps analyze which search methods are most effective for users.'
14
-
15
- # Search parameters and constraints
16
- t.jsonb :filters, default: {}, comment: 'JSON object containing search filters applied (e.g., document_type, date_range, status). Used for analyzing how users refine searches and for recreating search contexts.'
17
-
18
- # Search results metadata
19
- t.jsonb :results, default: {}, comment: 'JSON object containing search result metadata such as result IDs, similarity scores, and ranking information. Enables search result caching and detailed analytics on result quality.'
20
- t.integer :result_count, default: 0, comment: 'Number of results returned by this search. Used for analytics on search effectiveness and identifying queries that return too few or too many results.'
21
-
22
- # Performance tracking
23
- t.float :search_time, comment: 'Time in seconds taken to execute this search query. Critical for performance monitoring, identifying slow queries, and optimizing search algorithms.'
24
-
25
- # Model identification for compatibility
26
- t.string :model_name, comment: 'Name of the AI model used to generate the query embedding (e.g., "text-embedding-3-small"). Ensures search compatibility and enables analytics by model performance.'
27
-
28
- # Standard Rails timestamps
29
- t.timestamps comment: 'Standard Rails created_at and updated_at timestamps. The created_at timestamp is particularly important for search analytics and identifying search patterns over time.'
30
- end
31
-
32
- # Indexes for analytics and performance
33
- add_index :ragdoll_searches, :search_type, comment: 'Index for filtering searches by type, used in analytics dashboards to compare effectiveness of different search strategies.'
34
- add_index :ragdoll_searches, :query_embedding, using: :hnsw, opclass: :vector_cosine_ops, comment: 'HNSW index on query embeddings for finding similar queries. Enables query recommendation, duplicate detection, and search clustering analysis.'
35
- add_index :ragdoll_searches, :created_at, comment: 'Index for time-based queries and analytics. Essential for generating search trend reports, popular queries over time, and performance monitoring.'
36
- end
37
- end
@@ -1,17 +0,0 @@
1
- # Adds summary metadata tracking to documents table for AI-generated summaries.
2
- # Enhances the existing summary field with provenance and timing information.
3
- class AddSummaryMetadataToRagdollDocuments < ActiveRecord::Migration[8.0]
4
- def change
5
- # Summary field already exists from initial migration, just add metadata fields for tracking
6
-
7
- # Timestamp tracking for summary generation
8
- add_column :ragdoll_documents, :summary_generated_at, :timestamp, comment: 'Timestamp when the AI-generated summary was created. Used for cache invalidation, determining summary freshness, and analytics on summary generation performance.'
9
-
10
- # Model identification for summary provenance
11
- add_column :ragdoll_documents, :summary_model, :string, comment: 'Name/identifier of the AI model used to generate the summary (e.g., "gpt-3.5-turbo", "claude-3-haiku"). Critical for tracking summary quality, cost analysis, and ensuring consistency in summary style.'
12
-
13
- # Indexes for summary metadata queries and analytics
14
- add_index :ragdoll_documents, :summary_generated_at, comment: 'Index for filtering and sorting documents by summary generation time. Used for cache invalidation queries and identifying documents with stale summaries that need regeneration.'
15
- add_index :ragdoll_documents, :summary_model, comment: 'Index for filtering documents by the AI model used for summarization. Enables analytics on summary quality by model and batch re-summarization with newer models.'
16
- end
17
- end
@@ -1,28 +0,0 @@
1
- # Adds usage tracking functionality to embeddings for intelligent ranking and caching.
2
- # Enables the system to learn which embeddings are most valuable and prioritize them in search results.
3
- class AddUsageTrackingToRagdollEmbeddings < ActiveRecord::Migration[8.0]
4
- def change
5
- # Usage frequency tracking
6
- add_column :ragdoll_embeddings, :usage_count, :integer, default: 0, null: false, comment: 'Number of times this embedding has been returned in search results. Incremented each time the embedding appears in search results. Used for frequency-based ranking to surface more relevant content.'
7
-
8
- # Recency tracking for temporal relevance
9
- add_column :ragdoll_embeddings, :returned_at, :timestamp, comment: 'Timestamp of the most recent time this embedding was returned in search results. Used for recency-based ranking algorithms and cache warming strategies. NULL indicates never used.'
10
-
11
- # Performance indexes for usage-based ranking queries
12
- add_index :ragdoll_embeddings, :usage_count, name: 'index_ragdoll_embeddings_on_usage_count', comment: 'Index for sorting embeddings by usage frequency. Critical for popularity-based ranking algorithms and identifying most/least used content.'
13
- add_index :ragdoll_embeddings, :returned_at, name: 'index_ragdoll_embeddings_on_returned_at', comment: 'Index for sorting embeddings by recency of use. Enables temporal ranking algorithms and cache warming strategies based on recent usage patterns.'
14
-
15
- # Composite index for advanced ranking algorithms that combine frequency and recency
16
- add_index :ragdoll_embeddings, [:usage_count, :returned_at],
17
- name: 'index_ragdoll_embeddings_on_usage_and_recency',
18
- comment: 'Composite index for complex ranking algorithms that combine usage frequency with recency. Optimizes queries that balance popular content with recently accessed content for intelligent search result ranking.'
19
-
20
- # Data migration to ensure existing records have proper default values
21
- # Usage count defaults to 0, returned_at remains null until first usage
22
- reversible do |dir|
23
- dir.up do
24
- execute "UPDATE ragdoll_embeddings SET usage_count = 0 WHERE usage_count IS NULL"
25
- end
26
- end
27
- end
28
- end