ragdoll 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +353 -0
  3. data/Rakefile +21 -0
  4. data/db/migrate/001_enable_postgresql_extensions.rb +23 -0
  5. data/db/migrate/004_create_ragdoll_documents.rb +70 -0
  6. data/db/migrate/005_create_ragdoll_embeddings.rb +41 -0
  7. data/db/migrate/006_create_ragdoll_contents.rb +47 -0
  8. data/lib/ragdoll/core/client.rb +315 -0
  9. data/lib/ragdoll/core/configuration.rb +273 -0
  10. data/lib/ragdoll/core/database.rb +141 -0
  11. data/lib/ragdoll/core/document_management.rb +110 -0
  12. data/lib/ragdoll/core/document_processor.rb +344 -0
  13. data/lib/ragdoll/core/embedding_service.rb +183 -0
  14. data/lib/ragdoll/core/errors.rb +11 -0
  15. data/lib/ragdoll/core/jobs/extract_keywords.rb +32 -0
  16. data/lib/ragdoll/core/jobs/extract_text.rb +42 -0
  17. data/lib/ragdoll/core/jobs/generate_embeddings.rb +32 -0
  18. data/lib/ragdoll/core/jobs/generate_summary.rb +29 -0
  19. data/lib/ragdoll/core/metadata_schemas.rb +334 -0
  20. data/lib/ragdoll/core/models/audio_content.rb +175 -0
  21. data/lib/ragdoll/core/models/content.rb +126 -0
  22. data/lib/ragdoll/core/models/document.rb +678 -0
  23. data/lib/ragdoll/core/models/embedding.rb +204 -0
  24. data/lib/ragdoll/core/models/image_content.rb +227 -0
  25. data/lib/ragdoll/core/models/text_content.rb +169 -0
  26. data/lib/ragdoll/core/search_engine.rb +50 -0
  27. data/lib/ragdoll/core/services/image_description_service.rb +230 -0
  28. data/lib/ragdoll/core/services/metadata_generator.rb +335 -0
  29. data/lib/ragdoll/core/shrine_config.rb +71 -0
  30. data/lib/ragdoll/core/text_chunker.rb +210 -0
  31. data/lib/ragdoll/core/text_generation_service.rb +360 -0
  32. data/lib/ragdoll/core/version.rb +8 -0
  33. data/lib/ragdoll/core.rb +73 -0
  34. data/lib/ragdoll-core.rb +3 -0
  35. data/lib/ragdoll.rb +249 -0
  36. data/lib/tasks/annotate.rake +126 -0
  37. data/lib/tasks/db.rake +338 -0
  38. metadata +80 -0
@@ -0,0 +1,204 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "active_record"
4
+ require "neighbor"
5
+
6
+ # == Schema Information
7
+ #
8
+ # Table name: ragdoll_embeddings
9
+ #
10
+ # id :bigint not null, primary key
11
+ # chunk_index(Chunk index for ordering embeddings within the embeddable content) :integer not null
12
+ # content(Original text content that was embedded, typically a document chunk) :text not null
13
+ # embeddable_type :string not null
14
+ # embedding_vector(Vector embedding using pgvector for optimal similarity search performance) :vector(1536) not null
15
+ # returned_at(Timestamp of most recent usage, for recency-based ranking and cache management) :datetime
16
+ # usage_count(Number of times used in similarity searches, for caching optimization) :integer default(0)
17
+ # created_at(Standard creation and update timestamps for lifecycle tracking) :datetime not null
18
+ # updated_at(Standard creation and update timestamps for lifecycle tracking) :datetime not null
19
+ # embeddable_id(Polymorphic reference to embeddable content (text, image, audio)) :bigint not null
20
+ #
21
+ # Indexes
22
+ #
23
+ # index_ragdoll_embeddings_on_embeddable (embeddable_type,embeddable_id)
24
+ # index_ragdoll_embeddings_on_embeddable_chunk (embeddable_type,embeddable_id,chunk_index) UNIQUE
25
+ # index_ragdoll_embeddings_on_embeddable_type_and_embeddable_id (embeddable_type,embeddable_id)
26
+ # index_ragdoll_embeddings_on_embedding_vector_cosine (embedding_vector) USING ivfflat
27
+ # index_ragdoll_embeddings_on_returned_at (returned_at)
28
+ # index_ragdoll_embeddings_on_usage_count (usage_count)
29
+ #
30
+
31
module Ragdoll
  module Core
    module Models
      # ActiveRecord model for one row of +ragdoll_embeddings+: a chunk of text,
      # its pgvector embedding, and usage statistics (usage_count / returned_at)
      # that are folded into search ranking.
      #
      # Rows belong to a polymorphic +embeddable+; the scopes below reference the
      # TextContent, ImageContent and AudioContent classes by name.
      class Embedding < ActiveRecord::Base
        self.table_name = "ragdoll_embeddings"

        # Use pgvector for vector similarity search
        has_neighbors :embedding_vector

        belongs_to :embeddable, polymorphic: true

        validates :embeddable_id, presence: true
        validates :embeddable_type, presence: true
        validates :chunk_index, presence: true, uniqueness: { scope: %i[embeddable_id embeddable_type] }
        validates :embedding_vector, presence: true
        validates :content, presence: true

        # Embeddings whose embeddable row in ragdoll_contents (the STI table for
        # all content types) uses the given embedding model.
        scope :by_model, lambda { |model|
          where(
            "embeddable_id IN (SELECT id FROM ragdoll_contents WHERE embedding_model = ?)",
            model
          )
        }
        scope :recent, -> { order(created_at: :desc) }
        scope :frequently_used, -> { where("usage_count > 0").order(usage_count: :desc) }
        scope :by_chunk_order, -> { order(:chunk_index) }
        scope :by_embeddable_type, ->(type) { where(embeddable_type: type) }
        scope :text_embeddings, -> { where(embeddable_type: "Ragdoll::Core::Models::TextContent") }
        scope :image_embeddings, -> { where(embeddable_type: "Ragdoll::Core::Models::ImageContent") }
        scope :audio_embeddings, -> { where(embeddable_type: "Ragdoll::Core::Models::AudioContent") }

        # JSON columns are handled natively by PostgreSQL - no serialization needed

        # Callback for vector column updates (no-op for pgvector)
        before_save :update_vector_columns

        # @return [Integer] length of the stored vector, or 0 when none is set
        def embedding_dimensions
          embedding_vector&.length || 0
        end

        # Embedding model name, read through the polymorphic +embeddable+.
        #
        # @return [Object, nil] nil when there is no embeddable
        def embedding_model
          embeddable&.embedding_model
        end

        # Records one use of this embedding: bumps +usage_count+ and stamps
        # +returned_at+ with the current time.
        def mark_as_used!
          increment!(:usage_count)
          update!(returned_at: Time.current)
        end

        # PostgreSQL pgvector similarity search using the neighbor gem.
        #
        # @param query_embedding [Array<Numeric>] vector to search with
        # @param limit [Integer] maximum number of results returned
        # @param threshold [Numeric] minimum cosine similarity a result must reach
        # @param filters [Hash] optional :embeddable_id, :embeddable_type,
        #   :embedding_model and :document_type restrictions
        # @return [Array<Hash>] result hashes as built by +build_result_hash+
        def self.search_similar(query_embedding, limit: Ragdoll.config.search[:max_results], threshold: Ragdoll.config.search[:similarity_threshold], filters: {})
          scope = all
          scope = scope.where(embeddable_id: filters[:embeddable_id]) if filters[:embeddable_id]
          scope = scope.where(embeddable_type: filters[:embeddable_type]) if filters[:embeddable_type]
          scope = scope.by_model(filters[:embedding_model]) if filters[:embedding_model]

          # Document-level filters require joining through embeddable (STI Content) to documents
          if filters[:document_type]
            scope = scope.joins("JOIN ragdoll_contents ON ragdoll_contents.id = ragdoll_embeddings.embeddable_id")
                         .joins("JOIN ragdoll_documents ON ragdoll_documents.id = ragdoll_contents.document_id")
                         .where("ragdoll_documents.document_type = ?", filters[:document_type])
          end

          search_with_pgvector(query_embedding, scope, limit, threshold)
        end

        # Runs the nearest-neighbour query on +scope+, drops rows below
        # +threshold+, ranks the rest by similarity plus usage score, and marks
        # the returned rows as used.
        #
        # @param query_embedding [Array<Numeric>]
        # @param scope [ActiveRecord::Relation] pre-filtered embeddings
        # @param limit [Integer]
        # @param threshold [Numeric]
        # @return [Array<Hash>] at most +limit+ result hashes, best combined score first
        def self.search_with_pgvector(query_embedding, scope, limit, threshold)
          neighbor_results = scope
                             .includes(:embeddable)
                             .nearest_neighbors(:embedding_vector, query_embedding, distance: "cosine")
                             .limit(limit * 2) # Get more to filter by threshold

          results = []
          # Running maximum over the rows seen so far; it starts at 0.0, so it
          # never reports a negative value.
          highest_similarity = 0.0

          neighbor_results.each do |embedding|
            # neighbor returns a cosine distance; convert it to a similarity
            similarity = 1.0 - embedding.neighbor_distance

            highest_similarity = similarity if similarity > highest_similarity
            next if similarity < threshold

            usage_score = calculate_usage_score(embedding)
            combined_score = similarity + usage_score

            results << build_result_hash(embedding, query_embedding, similarity, highest_similarity,
                                         usage_score, combined_score)
          end

          # Sort by combined score and limit
          results = results.sort_by { |r| -r[:combined_score] }.take(limit)
          mark_embeddings_as_used(results)
          results
        end

        # NOTE: the class-level helpers below were originally written under a bare
        # `private`, which does not apply to `def self.` methods. They have always
        # been publicly callable and are kept that way so existing callers keep
        # working.

        # Usage-based ranking bonus: 0.7 * frequency score + 0.3 * recency score.
        # The frequency score is log-scaled and capped at 1.0; the recency score
        # decays exponentially with the days since +returned_at+.
        #
        # @param embedding [#usage_count, #returned_at]
        # @return [Float] 0.0 when the embedding has never been returned
        def self.calculate_usage_score(embedding)
          # usage_count is a nullable column; treat NULL as "never used" instead
          # of raising NoMethodError on nil.
          usage_count = embedding.usage_count.to_i
          return 0.0 unless embedding.returned_at && usage_count.positive?

          frequency_weight = 0.7
          recency_weight = 0.3

          frequency_score = [Math.log(usage_count + 1) / Math.log(100), 1.0].min
          days_since_use = (Time.current - embedding.returned_at) / 1.day
          recency_score = Math.exp(-days_since_use / 30)

          frequency_weight * frequency_score + recency_weight * recency_score
        end

        # Build standardized result hash for one matched embedding.
        #
        # Document fields fall back to "Unknown" when the embeddable or its
        # document is missing. +embedding_dimensions+ reports the length of the
        # query vector, not of the stored vector.
        #
        # @return [Hash]
        def self.build_result_hash(embedding, query_embedding, similarity, highest_similarity, usage_score,
                                   combined_score)
          {
            embedding_id: embedding.id.to_s,
            embeddable_id: embedding.embeddable_id.to_s,
            embeddable_type: embedding.embeddable_type,
            document_id: embedding.embeddable&.document_id&.to_s || "Unknown",
            document_title: embedding.embeddable&.document&.title || "Unknown",
            document_location: embedding.embeddable&.document&.location || "Unknown",
            content: embedding.content,
            similarity: similarity,
            highest_similarity: highest_similarity,
            distance: 1.0 - similarity,
            chunk_index: embedding.chunk_index,
            embedding_dimensions: query_embedding.length,
            embedding_model: embedding.embeddable&.embedding_model,
            usage_count: embedding.usage_count || 0,
            returned_at: embedding.returned_at,
            usage_score: usage_score,
            combined_score: combined_score
          }
        end

        # Mark embeddings as used for analytics: one bulk UPDATE that increments
        # +usage_count+ and stamps +returned_at+ for every returned result.
        #
        # @param results [Array<Hash>] result hashes carrying :embedding_id
        def self.mark_embeddings_as_used(results)
          return if results.empty?

          embedding_ids = results.map { |r| r[:embedding_id] }
          # COALESCE so rows whose usage_count is NULL start counting from 0;
          # a plain `usage_count + 1` would leave them NULL forever.
          where(id: embedding_ids).update_all(
            ["usage_count = COALESCE(usage_count, 0) + 1, returned_at = ?", Time.current]
          )
        end

        # Cosine similarity of two numeric vectors computed in Ruby.
        #
        # @return [Float] 0.0 when either vector is nil, the lengths differ, or
        #   either vector has zero magnitude
        def self.cosine_similarity(vec1, vec2)
          return 0.0 if vec1.nil? || vec2.nil? || vec1.length != vec2.length

          dot_product = vec1.zip(vec2).sum { |a, b| a * b }
          magnitude1 = Math.sqrt(vec1.sum { |a| a * a })
          magnitude2 = Math.sqrt(vec2.sum { |a| a * a })

          return 0.0 if magnitude1 == 0.0 || magnitude2 == 0.0

          dot_product / (magnitude1 * magnitude2)
        end

        private

        # Callback to update vector columns when embedding_vector changes
        def update_vector_columns
          # No additional processing needed for pgvector
        end
      end
    end
  end
end
@@ -0,0 +1,227 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "active_record"
4
+ require_relative "content"
5
+
6
+ # == Schema Information
7
+ #
8
+ # Table name: ragdoll_contents (STI)
9
+ #
10
+ # id :bigint not null, primary key
11
+ # type(Type of content - TextContent, ImageContent, AudioContent) :string not null
12
+ # document_id(Reference to parent document) :bigint not null
13
+ # embedding_model(Embedding model to use for this content) :string not null
14
+ # content(Text content or description of the file) :text
15
+ # data(Raw data from file) :text
16
+ # metadata(Additional metadata about the file's raw data) :json default({})
17
+ # duration(Duration of audio in seconds - for audio content) :float
18
+ # sample_rate(Audio sample rate in Hz - for audio content) :integer
19
+ # created_at(Standard creation and update timestamps) :datetime not null
20
+ # updated_at(Standard creation and update timestamps) :datetime not null
21
+ #
22
+ # Indexes
23
+ #
24
+ # index_ragdoll_contents_on_document_id (document_id)
25
+ # index_ragdoll_contents_on_embedding_model (embedding_model)
26
+ # index_ragdoll_contents_on_type (type)
27
+ # index_ragdoll_contents_on_fulltext_search (to_tsvector('english'::regconfig, COALESCE(content, ''::text))) USING gin
28
+ #
29
+ # Foreign Keys
30
+ #
31
+ # fk_rails_... (document_id => ragdoll_documents.id)
32
+ #
33
+
34
module Ragdoll
  module Core
    module Models
      # Image flavour of Content. The +content+ column holds the textual
      # description used for embedding, +data+ holds the image data, and the
      # JSON +metadata+ column holds raw file properties (size, dimensions, ...).
      class ImageContent < Content
        # Extensions that image_file? accepts (compared case-insensitively).
        IMAGE_EXTENSIONS = %w[.jpg .jpeg .png .gif .bmp .webp .svg .ico .tiff .tif].freeze

        validate :image_data_or_description_present

        scope :recent, -> { order(created_at: :desc) }
        scope :with_images, -> { where.not(data: [nil, ""]) }
        scope :with_descriptions, -> { where.not(content: [nil, ""]) }

        # Image content accessors - content field stores description for embedding
        def description
          content
        end

        def description=(value)
          self.content = value
        end

        # Image file data accessor
        def image_data
          data
        end

        def image_data=(value)
          self.data = value
        end

        # Image-specific technical metadata (raw file properties)
        # This metadata is about the actual image file data, not AI-generated insights
        def alt_text
          metadata_or_empty["alt_text"]
        end

        def alt_text=(value)
          self.metadata = metadata_or_empty.merge("alt_text" => value)
        end

        # @return [Integer] number of embeddings attached to this content
        def embedding_count
          embeddings.count
        end

        # Image file technical properties (stored in content metadata - raw file data)
        def image_attached?
          data.present?
        end

        # @return [Object] the 'file_size' metadata value, or 0 when absent
        def image_size
          metadata_or_empty["file_size"] || 0
        end

        def image_size=(value)
          self.metadata = metadata_or_empty.merge("file_size" => value)
        end

        def image_content_type
          metadata_or_empty["content_type"]
        end

        def image_content_type=(value)
          self.metadata = metadata_or_empty.merge("content_type" => value)
        end

        def image_filename
          metadata_or_empty["filename"]
        end

        def image_filename=(value)
          self.metadata = metadata_or_empty.merge("filename" => value)
        end

        # @return [Hash, nil] { width:, height: } or nil unless both are recorded
        def image_dimensions
          width = metadata_or_empty["width"]
          height = metadata_or_empty["height"]
          return nil unless width && height

          { width: width, height: height }
        end

        def set_image_dimensions(width, height)
          self.metadata = metadata_or_empty.merge("width" => width, "height" => height)
        end

        # Image format and technical details
        def color_space
          metadata_or_empty["color_space"]
        end

        def color_space=(value)
          self.metadata = metadata_or_empty.merge("color_space" => value)
        end

        def bit_depth
          metadata_or_empty["bit_depth"]
        end

        def bit_depth=(value)
          self.metadata = metadata_or_empty.merge("bit_depth" => value)
        end

        # Generate description from image file using LLM vision capabilities.
        #
        # Resolves a path via get_image_path, asks ImageDescriptionService for a
        # description and, when one comes back, stores it and saves the record.
        #
        # @param options [Hash] passed through to the description service
        # @return [Boolean] true only when a description was generated and saved;
        #   false when no image path is available, the service returns nothing,
        #   or any StandardError is raised (the error message is printed)
        def generate_description_from_image!(options = {})
          return false unless image_attached? || file_path_available?

          begin
            image_path = get_image_path
            return false unless image_path

            # Loaded lazily so the service is only required when actually used
            require_relative "../services/image_description_service"
            description_service = Services::ImageDescriptionService.new

            generated_description = description_service.generate_description(image_path, options)
            return false unless generated_description.present?

            self.description = generated_description
            save!
            true
          rescue StandardError => e
            puts "Failed to generate image description: #{e.message}"
            false
          end
        end

        # Override content for embedding to combine description and alt_text
        #
        # @return [String] alt_text and description (whichever are present)
        #   joined by a space; "" when neither is present
        def content_for_embedding
          [alt_text, description].select(&:present?).join(" ")
        end

        # Delegates to the base class implementation, but only when
        # should_generate_embeddings? holds (which already implies the embedding
        # content is non-blank).
        def generate_embeddings!
          return unless should_generate_embeddings?

          super
        end

        # Override should_generate_embeddings to check for content
        def should_generate_embeddings?
          content_for_embedding.present? && embeddings.empty?
        end

        # Aggregate statistics over image contents.
        #
        # @return [Hash]
        def self.stats
          {
            total_image_contents: count,
            by_model: group(:embedding_model).count,
            total_embeddings: joins(:embeddings).count,
            with_images: with_images.count,
            with_descriptions: with_descriptions.count,
            # NOTE(review): this relies on an `image_attachment` association and the
            # active_storage_blobs table, neither of which is defined in this
            # class; confirm they exist on Content, otherwise this query
            # cannot run. image_size reads metadata['file_size'] instead.
            average_image_size: joins(:image_attachment).average("active_storage_blobs.byte_size")
          }
        end

        private

        # The metadata column may be NULL; read it as an empty hash in that case
        # so accessors and the validation do not raise on nil.
        def metadata_or_empty
          metadata || {}
        end

        # True when the parent document has a location that exists on disk.
        def file_path_available?
          document&.location&.present? && File.exist?(document.location)
        end

        # Path to hand to the description service, or nil when none is usable.
        # An existing document location wins (and must look like an image file);
        # the data column is only consulted otherwise, and only when it holds an
        # absolute path.
        def get_image_path
          if file_path_available?
            document.location if image_file?(document.location)
          elsif image_attached?
            data if data&.start_with?("/")
          end
        end

        # Extension-based check against IMAGE_EXTENSIONS.
        def image_file?(file_path)
          return false unless file_path

          IMAGE_EXTENSIONS.include?(File.extname(file_path).downcase)
        end

        # Validation: at least one of image data, description or alt_text.
        def image_data_or_description_present
          return if image_attached? || description.present? || alt_text.present?

          errors.add(:base, "Must have either image data or description/alt_text")
        end
      end
    end
  end
end
@@ -0,0 +1,169 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "active_record"
4
+ require_relative "content"
5
+
6
+ # == Schema Information
7
+ #
8
+ # Table name: ragdoll_contents (STI)
9
+ #
10
+ # id :bigint not null, primary key
11
+ # type(Type of content - TextContent, ImageContent, AudioContent) :string not null
12
+ # document_id(Reference to parent document) :bigint not null
13
+ # embedding_model(Embedding model to use for this content) :string not null
14
+ # content(Text content or description of the file) :text
15
+ # data(Raw data from file) :text
16
+ # metadata(Additional metadata about the file's raw data) :json default({})
17
+ # duration(Duration of audio in seconds - for audio content) :float
18
+ # sample_rate(Audio sample rate in Hz - for audio content) :integer
19
+ # created_at(Standard creation and update timestamps) :datetime not null
20
+ # updated_at(Standard creation and update timestamps) :datetime not null
21
+ #
22
+ # Indexes
23
+ #
24
+ # index_ragdoll_contents_on_document_id (document_id)
25
+ # index_ragdoll_contents_on_embedding_model (embedding_model)
26
+ # index_ragdoll_contents_on_type (type)
27
+ # index_ragdoll_contents_on_fulltext_search (to_tsvector('english'::regconfig, COALESCE(content, ''::text))) USING gin
28
+ #
29
+ # Foreign Keys
30
+ #
31
+ # fk_rails_... (document_id => ragdoll_documents.id)
32
+ #
33
+
34
module Ragdoll
  module Core
    module Models
      # Text flavour of Content. The +content+ column holds the text itself and
      # the JSON +metadata+ column holds processing settings (chunk_size,
      # overlap) and raw file properties (encoding, line_count).
      class TextContent < Content
        # Defaults used when metadata carries no explicit value.
        DEFAULT_CHUNK_SIZE = 1000
        DEFAULT_OVERLAP = 200

        validates :content, presence: true

        scope :recent, -> { order(created_at: :desc) }

        # Text-specific processing configuration stored in content metadata
        # This metadata is about the raw content processing, not AI-generated insights
        def chunk_size
          metadata_or_empty["chunk_size"] || DEFAULT_CHUNK_SIZE
        end

        def chunk_size=(value)
          self.metadata = metadata_or_empty.merge("chunk_size" => value)
        end

        def overlap
          metadata_or_empty["overlap"] || DEFAULT_OVERLAP
        end

        def overlap=(value)
          self.metadata = metadata_or_empty.merge("overlap" => value)
        end

        # Content-specific technical metadata (file processing info)
        def encoding
          metadata_or_empty["encoding"]
        end

        def encoding=(value)
          self.metadata = metadata_or_empty.merge("encoding" => value)
        end

        def line_count
          metadata_or_empty["line_count"]
        end

        def line_count=(value)
          self.metadata = metadata_or_empty.merge("line_count" => value)
        end

        # @return [Integer] whitespace-separated word count, 0 for nil content
        def word_count
          content&.split&.length || 0
        end

        # @return [Integer] content length in characters, 0 for nil content
        def character_count
          content&.length || 0
        end

        # @return [Integer] number of embeddings attached to this content
        def embedding_count
          embeddings.count
        end

        # Splits +content+ into overlapping chunks of at most +chunk_size+
        # characters, preferring to cut at the last space before the limit.
        #
        # @return [Array<Hash>] hashes with :content (stripped text),
        #   :start_position, :end_position (exclusive) and :chunk_index;
        #   [] for blank content
        def chunks
          return [] if content.blank?

          pieces = []
          start_pos = 0

          while start_pos < content.length
            end_pos = [start_pos + chunk_size, content.length].min

            # Try to break at word boundary if not at end
            if end_pos < content.length
              last_space = content.rindex(" ", end_pos)
              end_pos = last_space if last_space && last_space > start_pos
            end

            chunk_content = content[start_pos...end_pos].strip
            if chunk_content.present?
              pieces << {
                content: chunk_content,
                start_position: start_pos,
                end_position: end_pos,
                chunk_index: pieces.length
              }
            end

            break if end_pos >= content.length

            # Step back by the overlap, but always advance by at least one
            # character so the loop terminates even when overlap >= chunk size.
            start_pos = [end_pos - overlap, start_pos + 1].max
          end

          pieces
        end

        # Replaces all embeddings of this content: destroys the existing ones,
        # splits the text with TextChunker, and stores one embedding per chunk.
        # A chunk that fails is reported on stdout and skipped; the remaining
        # chunks are still processed. Finally records "embeddings_generated_at"
        # in metadata.
        #
        # NOTE(review): chunking here goes through TextChunker.chunk(content) and does
        # not pass this record's chunk_size / overlap; confirm that is intended.
        def generate_embeddings!
          return if content.blank?

          # Clear existing embeddings
          embeddings.destroy_all

          chunk_texts = Ragdoll::Core::TextChunker.chunk(content)
          embedding_service = Ragdoll::Core::EmbeddingService.new

          chunk_texts.each_with_index do |chunk_text, index|
            begin
              vector = embedding_service.generate_embedding(chunk_text)

              embeddings.create!(
                content: chunk_text,
                embedding_vector: vector,
                chunk_index: index
              )
            rescue StandardError => e
              puts "Failed to generate embedding for chunk #{index}: #{e.message}"
            end
          end

          update!(metadata: metadata_or_empty.merge("embeddings_generated_at" => Time.current))
        end

        # Override content for embedding to use the text content
        def content_for_embedding
          content
        end

        # Aggregate statistics over text contents.
        #
        # average_word_count is an SQL approximation (number of spaces + 1).
        # average_chunk_size is read from the JSON metadata, because chunk_size
        # is a metadata key rather than a table column; rows without an explicit
        # value count as DEFAULT_CHUNK_SIZE, mirroring the chunk_size reader.
        #
        # @return [Hash]
        def self.stats
          {
            total_text_contents: count,
            by_model: group(:embedding_model).count,
            total_embeddings: joins(:embeddings).count,
            average_word_count: average("LENGTH(content) - LENGTH(REPLACE(content, ' ', '')) + 1"),
            average_chunk_size: average("COALESCE((metadata->>'chunk_size')::numeric, #{DEFAULT_CHUNK_SIZE})")
          }
        end

        private

        # The metadata column may be NULL; read it as an empty hash in that case
        # so the accessors do not raise on nil.
        def metadata_or_empty
          metadata || {}
        end
      end
    end
  end
end
@@ -0,0 +1,50 @@
1
+ # frozen_string_literal: true
2
+
3
+ # FIXME: This class needs a redesign. It does not focus on search.
4
+
5
module Ragdoll
  module Core
    # Thin search facade: turns a query into an embedding (via the injected
    # embedding service) and delegates the actual lookup to
    # Models::Embedding.search_similar.
    class SearchEngine
      # @param embedding_service [#generate_embedding] object used to embed queries
      def initialize(embedding_service)
        @embedding_service = embedding_service
      end

      # Embeds +query+ and searches for similar embeddings.
      #
      # @param query [Object] passed as-is to the embedding service
      # @param options [Hash] optional :limit, :threshold and :filters; missing
      #   values fall back to Ragdoll.config.search (filters default to {})
      # @return [Array] search results, or [] when no embedding could be generated
      def search_documents(query, options = {})
        search_by_embedding(@embedding_service.generate_embedding(query), options)
      end

      # Like search_documents, but an Array argument is treated as an
      # already-computed embedding and used directly.
      #
      # @param query_or_embedding [Array, Object] embedding vector, or a query to embed
      # @param options [Hash] see search_documents
      # @return [Array] search results, or [] when no embedding could be generated
      def search_similar_content(query_or_embedding, options = {})
        query_embedding =
          if query_or_embedding.is_a?(Array)
            query_or_embedding
          else
            @embedding_service.generate_embedding(query_or_embedding)
          end

        search_by_embedding(query_embedding, options)
      end

      private

      # Shared tail of both public entry points: resolves option defaults and
      # runs the ActiveRecord-backed similarity search.
      def search_by_embedding(query_embedding, options)
        return [] if query_embedding.nil?

        Models::Embedding.search_similar(
          query_embedding,
          limit: options[:limit] || Ragdoll.config.search[:max_results],
          threshold: options[:threshold] || Ragdoll.config.search[:similarity_threshold],
          filters: options[:filters] || {}
        )
      end
    end
  end
end