ragdoll 0.1.8 → 0.1.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +243 -0
- data/README.md +209 -31
- data/Rakefile +4 -5
- data/app/models/ragdoll/document.rb +115 -12
- data/app/models/ragdoll/embedding.rb +108 -2
- data/app/models/ragdoll/search.rb +165 -0
- data/app/models/ragdoll/search_result.rb +121 -0
- data/app/services/ragdoll/configuration_service.rb +3 -3
- data/app/services/ragdoll/document_processor.rb +124 -1
- data/app/services/ragdoll/embedding_service.rb +10 -0
- data/app/services/ragdoll/search_engine.rb +75 -6
- data/db/migrate/{001_enable_postgresql_extensions.rb → 20250815234901_enable_postgresql_extensions.rb} +7 -8
- data/db/migrate/20250815234902_create_ragdoll_documents.rb +117 -0
- data/db/migrate/{005_create_ragdoll_embeddings.rb → 20250815234903_create_ragdoll_embeddings.rb} +13 -10
- data/db/migrate/{006_create_ragdoll_contents.rb → 20250815234904_create_ragdoll_contents.rb} +14 -11
- data/db/migrate/20250815234905_create_ragdoll_searches.rb +77 -0
- data/db/migrate/20250815234906_create_ragdoll_search_results.rb +49 -0
- data/lib/ragdoll/core/client.rb +75 -8
- data/lib/ragdoll/core/database.rb +8 -3
- data/lib/ragdoll/core/model.rb +13 -0
- data/lib/ragdoll/core/version.rb +1 -1
- data/lib/ragdoll/core.rb +2 -0
- data/lib/ragdoll.rb +17 -0
- data/lib/tasks/db.rake +75 -27
- metadata +375 -6
- data/db/migrate/004_create_ragdoll_documents.rb +0 -70
@@ -142,10 +142,12 @@ module Ragdoll
|
|
142
142
|
def keywords_array
|
143
143
|
return [] unless keywords.present?
|
144
144
|
|
145
|
+
# After migration, keywords is now a PostgreSQL array
|
145
146
|
case keywords
|
146
147
|
when Array
|
147
|
-
keywords
|
148
|
+
keywords.map(&:to_s).map(&:strip).reject(&:empty?)
|
148
149
|
when String
|
150
|
+
# Fallback for any remaining string data (shouldn't happen after migration)
|
149
151
|
keywords.split(",").map(&:strip).reject(&:empty?)
|
150
152
|
else
|
151
153
|
[]
|
@@ -153,17 +155,23 @@ module Ragdoll
|
|
153
155
|
end
|
154
156
|
|
155
157
|
# Adds a keyword to this record's keyword list unless an equivalent entry
# (compared case-insensitively) is already present.
#
# The keyword is stored stripped of surrounding whitespace and lowercased.
# Blank input is ignored and leaves the list untouched.
#
# @param keyword [#to_s] keyword to add
# @return [Array<String>, nil] the new keyword list, or nil when nothing was added
def add_keyword(keyword)
  return if keyword.blank?

  candidate = keyword.to_s.strip.downcase
  existing = keywords_array
  return if existing.any? { |entry| entry.downcase == candidate }

  self.keywords = existing.push(candidate)
end
|
162
167
|
|
163
168
|
# Removes every entry equal to the given keyword (case-insensitive, after
# stripping surrounding whitespace) and reassigns the keyword list.
#
# Blank input is ignored. The list is reassigned even when no entry matched.
#
# @param keyword [#to_s] keyword to remove
# @return [Array<String>, nil] the remaining keywords, or nil for blank input
def remove_keyword(keyword)
  return if keyword.blank?

  target = keyword.to_s.strip.downcase
  self.keywords = keywords_array.reject { |entry| entry.downcase == target }
end
|
168
176
|
|
169
177
|
# Metadata accessors for common fields
|
@@ -249,15 +257,110 @@ module Ragdoll
|
|
249
257
|
puts "Metadata generation failed: #{e.message}"
|
250
258
|
end
|
251
259
|
|
252
|
-
# PostgreSQL full-text search on title and metadata fields, scored by the
# fraction of distinct query words each document matches (0.0..1.0).
#
# @param query [String] free-text query; reduced to unique lowercase alphanumeric words
# @param options [Hash]
#   :limit         - maximum number of rows returned (default 20)
#   :threshold     - minimum match ratio a row must reach (default 0.0, i.e. no filter)
#   :status        - document status to match (default "processed")
#   :document_type - optional document type to match
# @return [Array, ActiveRecord::Relation] loaded records, each exposing a
#   `fulltext_similarity` attribute, best match first; the empty relation
#   `none` when the query is blank or contains no alphanumeric words
# @raise [ArgumentError, TypeError] when :threshold cannot be converted to a Float
def self.search_content(query, **options)
  return none if query.blank?

  # Split into unique alphanumeric words
  words = query.downcase.scan(/[[:alnum:]]+/).uniq
  return none if words.empty?

  limit = options[:limit] || 20
  # Coerce to Float so the value interpolated into SQL below is always a
  # numeric literal, never caller-controlled text.
  threshold = Float(options[:threshold] || 0.0)

  # Use precomputed tsvector column if it exists, otherwise build on the fly
  tsvector =
    if column_names.include?("search_vector")
      "#{table_name}.search_vector"
    else
      text_expr =
        "COALESCE(title, '') || ' ' || " \
        "COALESCE(metadata->>'summary', '') || ' ' || " \
        "COALESCE(metadata->>'keywords', '') || ' ' || " \
        "COALESCE(metadata->>'description', '')"
      "to_tsvector('english', #{text_expr})"
    end

  # One sanitized tsquery per word
  tsqueries = words.map do |word|
    sanitize_sql_array(["plainto_tsquery('english', ?)", word])
  end

  # Combine per-word tsqueries with OR so PostgreSQL can use the GIN index
  combined_tsquery = tsqueries.join(" || ")

  # Each word contributes 1 when present and 0 otherwise; the ratio of the sum
  # to the word count is the similarity.
  score_sum = tsqueries.map { |tsq| "(#{tsvector} @@ #{tsq})::int" }.join(" + ")
  similarity_sql = "(#{score_sum})::float / #{words.size}"

  conditions = ["#{tsvector} @@ (#{combined_tsquery})"]

  # Status filter (defaults to processed). Quoted through the sanitizer rather
  # than interpolated, so a quote character in the value cannot alter the SQL.
  status = (options[:status] || "processed").to_s
  conditions << sanitize_sql_array(["#{table_name}.status = ?", status])

  if options[:document_type].present?
    conditions << sanitize_sql_array(["#{table_name}.document_type = ?", options[:document_type]])
  end

  conditions << "#{similarity_sql} >= #{threshold}" if threshold > 0.0

  # Materialize to array to avoid COUNT/SELECT alias conflicts in some AR versions
  select("#{table_name}.*, #{similarity_sql} AS fulltext_similarity")
    .where(conditions.join(" AND "))
    .order(Arel.sql("fulltext_similarity DESC, updated_at DESC"))
    .limit(limit)
    .to_a
end
|
328
|
+
|
329
|
+
# Finds documents whose `keywords` array shares at least one element with the
# given keywords (PostgreSQL `&&` overlap operator).
#
# Keywords are stripped, lowercased and de-duplicated before matching. They are
# handed to the database as bind values, so quote or brace characters inside a
# keyword cannot alter the generated SQL. Results are ordered by creation time
# only; no per-document match score is computed.
#
# @param keywords_array [Array<#to_s>, #to_s, nil] keyword or list of keywords
# @param options [Hash] :limit - maximum number of rows returned (default 20)
# @return [ActiveRecord::Relation] matching documents ordered by created_at DESC;
#   an always-empty relation when no usable keyword is given
def self.search_by_keywords(keywords_array, **options)
  return where("1 = 0") unless keywords_array

  normalized_keywords = Array(keywords_array).map { |k| k.to_s.strip.downcase }.reject(&:empty?).uniq
  return where("1 = 0") if normalized_keywords.empty?

  # An Array bound to `?` expands to a quoted, comma-separated list, which
  # yields ARRAY['a','b']::text[].
  where("keywords && ARRAY[?]::text[]", normalized_keywords)
    .order("created_at DESC")
    .limit(options[:limit] || 20)
end
|
348
|
+
|
349
|
+
# Finds documents whose `keywords` array contains ALL of the given keywords
# (PostgreSQL `@>` contains operator).
#
# Keywords are stripped, lowercased and de-duplicated before matching. They are
# handed to the database as bind values, so quote or brace characters inside a
# keyword cannot alter the generated SQL.
#
# @param keywords_array [Array<#to_s>, #to_s, nil] keyword or list of keywords
# @param options [Hash] :limit - maximum number of rows returned (default 20)
# @return [ActiveRecord::Relation] matching documents ordered by created_at DESC;
#   an always-empty relation when no usable keyword is given
def self.search_by_keywords_all(keywords_array, **options)
  return where("1 = 0") unless keywords_array

  normalized_keywords = Array(keywords_array).map { |k| k.to_s.strip.downcase }.reject(&:empty?).uniq
  return where("1 = 0") if normalized_keywords.empty?

  # An Array bound to `?` expands to a quoted, comma-separated list, which
  # yields ARRAY['a','b']::text[].
  where("keywords @> ARRAY[?]::text[]", normalized_keywords)
    .order("created_at DESC")
    .limit(options[:limit] || 20)
end
|
262
365
|
|
263
366
|
# Faceted search by metadata fields
|
@@ -11,6 +11,8 @@ module Ragdoll
|
|
11
11
|
has_neighbors :embedding_vector
|
12
12
|
|
13
13
|
belongs_to :embeddable, polymorphic: true
|
14
|
+
has_many :search_results, class_name: "Ragdoll::SearchResult", dependent: :destroy
|
15
|
+
has_many :searches, through: :search_results
|
14
16
|
|
15
17
|
validates :embeddable_id, presence: true
|
16
18
|
validates :embeddable_type, presence: true
|
@@ -62,16 +64,66 @@ module Ragdoll
|
|
62
64
|
scope = scope.by_model(filters[:embedding_model]) if filters[:embedding_model]
|
63
65
|
|
64
66
|
# Document-level filters require joining through embeddable (STI Content) to documents
|
65
|
-
|
67
|
+
needs_document_join = filters[:document_type] || filters[:keywords]
|
68
|
+
|
69
|
+
if needs_document_join
|
66
70
|
scope = scope.joins("JOIN ragdoll_contents ON ragdoll_contents.id = ragdoll_embeddings.embeddable_id")
|
67
71
|
.joins("JOIN ragdoll_documents ON ragdoll_documents.id = ragdoll_contents.document_id")
|
68
|
-
|
72
|
+
end
|
73
|
+
|
74
|
+
if filters[:document_type]
|
75
|
+
scope = scope.where("ragdoll_documents.document_type = ?", filters[:document_type])
|
76
|
+
end
|
77
|
+
|
78
|
+
# Keywords filtering using PostgreSQL array operations
|
79
|
+
if filters[:keywords] && filters[:keywords].any?
|
80
|
+
normalized_keywords = Array(filters[:keywords]).map(&:to_s).map(&:downcase).reject(&:empty?)
|
81
|
+
if normalized_keywords.any?
|
82
|
+
# Use PostgreSQL array overlap operator with proper array literal
|
83
|
+
quoted_keywords = normalized_keywords.map { |k| "\"#{k}\"" }.join(',')
|
84
|
+
array_literal = "'{#{quoted_keywords}}'::text[]"
|
85
|
+
scope = scope.where("ragdoll_documents.keywords && #{array_literal}")
|
86
|
+
end
|
69
87
|
end
|
70
88
|
|
71
89
|
# Use pgvector for similarity search
|
72
90
|
search_with_pgvector(query_embedding, scope, limit, threshold)
|
73
91
|
end
|
74
92
|
|
93
|
+
# Similarity search that returns both the matching results and statistics about
# the similarities that were examined.
#
# Builds a filtered scope and delegates the vector search to
# `search_with_pgvector_stats`.
#
# @param query_embedding [Array<Numeric>] embedding vector of the query
# @param limit [Integer] maximum number of results
# @param threshold [Numeric] minimum similarity a result must reach
# @param filters [Hash] optional filters:
#   :embeddable_id, :embeddable_type, :embedding_model - applied to embeddings
#   :document_type - applied to the joined ragdoll_documents row
#   :keywords      - String or Array; documents must share at least one keyword
# @return whatever `search_with_pgvector_stats` returns for the filtered scope
def self.search_similar_with_stats(query_embedding, limit: 20, threshold: 0.8, filters: {})
  scope = all
  scope = scope.where(embeddable_id: filters[:embeddable_id]) if filters[:embeddable_id]
  scope = scope.where(embeddable_type: filters[:embeddable_type]) if filters[:embeddable_type]
  scope = scope.by_model(filters[:embedding_model]) if filters[:embedding_model]

  # Document-level filters require joining through embeddable (STI Content) to documents
  if filters[:document_type] || filters[:keywords]
    scope = scope.joins("JOIN ragdoll_contents ON ragdoll_contents.id = ragdoll_embeddings.embeddable_id")
                 .joins("JOIN ragdoll_documents ON ragdoll_documents.id = ragdoll_contents.document_id")
  end

  if filters[:document_type]
    scope = scope.where("ragdoll_documents.document_type = ?", filters[:document_type])
  end

  # Normalize before testing for emptiness so a single String keyword (which has
  # no #any?) is accepted just like an Array.
  normalized_keywords = Array(filters[:keywords]).map { |k| k.to_s.downcase }.reject(&:empty?)
  if normalized_keywords.any?
    # Bound as values: an Array bound to `?` expands to a quoted list, so quote
    # characters inside a keyword cannot alter the SQL.
    scope = scope.where("ragdoll_documents.keywords && ARRAY[?]::text[]", normalized_keywords)
  end

  search_with_pgvector_stats(query_embedding, scope, limit, threshold)
end
|
126
|
+
|
75
127
|
# Fast search using pgvector with neighbor gem
|
76
128
|
def self.search_with_pgvector(query_embedding, scope, limit, threshold)
|
77
129
|
# Use pgvector for similarity search
|
@@ -103,6 +155,60 @@ module Ragdoll
|
|
103
155
|
results
|
104
156
|
end
|
105
157
|
|
158
|
+
# Nearest-neighbour search over `scope` that also reports statistics about the
# similarities examined.
#
# Fetches more neighbours than `limit` (at least 50) so the statistics cover a
# wider sample, converts cosine distance to similarity (1.0 - distance), keeps
# the neighbours at or above `threshold`, ranks them by similarity plus usage
# score, and marks the returned embeddings as used.
#
# @param query_embedding [Array<Numeric>] embedding vector of the query
# @param scope [ActiveRecord::Relation] pre-filtered embeddings to search
# @param limit [Integer] maximum number of results returned
# @param threshold [Numeric] minimum similarity a result must reach
# @return [Hash] { results: Array<Hash>, statistics: Hash }
def self.search_with_pgvector_stats(query_embedding, scope, limit, threshold)
  # Loaded into an Array immediately to avoid SQL conflicts with count operations.
  neighbor_results = scope
                     .includes(:embeddable)
                     .nearest_neighbors(:embedding_vector, query_embedding, distance: "cosine")
                     .limit([limit * 3, 50].max) # Get enough for statistics
                     .to_a

  # neighbor returns distance, we want similarity
  scored = neighbor_results.map { |embedding| [embedding, 1.0 - embedding.neighbor_distance] }
  all_similarities = scored.map(&:last)

  # Derived from the actual values rather than seeded with 0.0, so a sample in
  # which every similarity is negative is not reported as having a maximum of
  # 0.0. The fallbacks apply only when no neighbour was found.
  highest_similarity = all_similarities.max || 0.0
  lowest_similarity = all_similarities.min || 1.0

  results = scored.reject { |_embedding, similarity| similarity < threshold }.map do |embedding, similarity|
    usage_score = calculate_usage_score(embedding)
    build_result_hash(embedding, query_embedding, similarity, highest_similarity,
                      usage_score, similarity + usage_score)
  end

  # Sort by combined score and limit
  results = results.sort_by { |r| -r[:combined_score] }.take(limit)
  mark_embeddings_as_used(results)

  stats = {
    total_embeddings_checked: neighbor_results.length,
    threshold_used: threshold,
    highest_similarity: highest_similarity,
    lowest_similarity: lowest_similarity,
    average_similarity: all_similarities.empty? ? 0.0 : (all_similarities.sum / all_similarities.length),
    similarities_above_threshold: all_similarities.count { |s| s >= threshold },
    total_similarities_calculated: all_similarities.length
  }

  {
    results: results,
    statistics: stats
  }
end
|
211
|
+
|
106
212
|
private
|
107
213
|
|
108
214
|
# Calculate usage score for ranking
|
@@ -0,0 +1,165 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "active_record"
|
4
|
+
require "neighbor"
|
5
|
+
|
6
|
+
module Ragdoll
|
7
|
+
# Persisted record of one executed search: the query text, its embedding, the
# filters/options used, and (through SearchResult) the embeddings it returned.
class Search < ActiveRecord::Base
  self.table_name = "ragdoll_searches"

  # Use pgvector for vector similarity search on query embeddings
  has_neighbors :query_embedding

  has_many :search_results, class_name: "Ragdoll::SearchResult", foreign_key: "search_id", dependent: :destroy
  has_many :embeddings, through: :search_results

  validates :query, presence: true
  validates :query_embedding, presence: true
  validates :search_type, presence: true, inclusion: { in: %w[semantic hybrid fulltext] }
  validates :results_count, presence: true, numericality: { greater_than_or_equal_to: 0 }

  scope :by_type, ->(type) { where(search_type: type) }
  scope :by_session, ->(session_id) { where(session_id: session_id) }
  scope :by_user, ->(user_id) { where(user_id: user_id) }
  scope :recent, -> { order(created_at: :desc) }
  scope :with_results, -> { where("results_count > 0") }
  scope :popular, -> { where("results_count > 0").order(results_count: :desc) }
  scope :slow_searches, ->(threshold_ms = 1000) { where("execution_time_ms > ?", threshold_ms) }

  # Finds earlier searches whose query embedding is close to the given one.
  #
  # Fetches up to `limit * 2` nearest neighbours by cosine distance, drops those
  # whose similarity (1.0 - distance) is below `threshold`, and returns at most
  # `limit` of the rest, most similar first. Each returned record responds to
  # `similarity_score`.
  #
  # @param query_embedding [Array<Numeric>] embedding to compare against
  # @param limit [Integer] maximum number of searches returned
  # @param threshold [Numeric] minimum similarity
  # @return [Array<Ragdoll::Search>]
  def self.find_similar(query_embedding, limit: 10, threshold: 0.8)
    candidates = nearest_neighbors(:query_embedding, query_embedding, distance: "cosine").limit(limit * 2)

    matches = candidates.map do |search|
      similarity = 1.0 - search.neighbor_distance
      next if similarity < threshold

      search.define_singleton_method(:similarity_score) { similarity }
      search
    end

    matches.compact.sort_by(&:similarity_score).reverse.take(limit)
  end

  # Stores the max/min/average similarity of this search's results.
  # Does nothing when the search has no results.
  def calculate_similarity_stats!
    scores = search_results.pluck(:similarity_score)
    return if scores.empty?

    update!(
      max_similarity_score: scores.max,
      min_similarity_score: scores.min,
      avg_similarity_score: scores.sum.to_f / scores.length
    )
  end

  # Search results ordered by rank, with their embeddings preloaded.
  def ranked_results
    search_results.includes(:embedding).order(:result_rank)
  end

  # Results that were clicked, in click order.
  def clicked_results
    search_results.where(clicked: true).order(:clicked_at)
  end

  # Fraction (0.0..1.0) of this search's results that were clicked.
  # Note: the class-level analytics report click-through as a percentage.
  def click_through_rate
    return 0.0 if results_count.to_i.zero?

    search_results.where(clicked: true).count.to_f / results_count
  end

  # Records a search together with one SearchResult row per result.
  #
  # Runs inside a transaction so a failure while storing any result does not
  # leave behind a search whose `results_count` disagrees with its stored rows.
  #
  # @param query [String] the query text
  # @param query_embedding [Array<Numeric>] embedding of the query
  # @param results [Array<Hash>] hashes providing :embedding_id and :similarity,
  #   already in rank order
  # @param search_type [String] one of "semantic", "hybrid", "fulltext"
  # @param filters [Hash] stored as search_filters
  # @param options [Hash] stored as search_options
  # @param execution_time_ms [Numeric, nil]
  # @param session_id [Object, nil]
  # @param user_id [Object, nil]
  # @return [Ragdoll::Search] the persisted search
  # @raise [ActiveRecord::RecordInvalid] when the search or a result fails validation
  def self.record_search(query:, query_embedding:, results:, search_type: "semantic",
                         filters: {}, options: {}, execution_time_ms: nil,
                         session_id: nil, user_id: nil)
    transaction do
      search = create!(
        query: query,
        query_embedding: query_embedding,
        search_type: search_type,
        results_count: results.length,
        search_filters: filters,
        search_options: options,
        execution_time_ms: execution_time_ms,
        session_id: session_id,
        user_id: user_id
      )

      results.each_with_index do |result, index|
        search.search_results.create!(
          embedding_id: result[:embedding_id],
          similarity_score: result[:similarity],
          result_rank: index + 1
        )
      end

      search.calculate_similarity_stats!
      search
    end
  end

  # Aggregate figures for searches created within the last `days` days.
  #
  # @param days [Integer] size of the reporting window
  # @return [Hash] counts, averages, per-type breakdown and average
  #   click-through rate (a percentage)
  def self.search_analytics(days: 30)
    start_date = days.days.ago
    searches = where(created_at: start_date..)

    {
      total_searches: searches.count,
      unique_queries: searches.distinct.count(:query),
      avg_results_per_search: searches.average(:results_count)&.round(2),
      avg_execution_time: searches.average(:execution_time_ms)&.round(2),
      search_types: searches.group(:search_type).count,
      searches_with_results: searches.where("results_count > 0").count,
      avg_click_through_rate: calculate_avg_ctr(searches)
    }
  end

  # Destroys searches that have no remaining search results.
  #
  # @return [Integer] number of searches destroyed
  def self.cleanup_orphaned_searches
    # Subquery instead of plucking every search_id into memory. NULLs are
    # excluded because `NOT IN` over a set containing NULL matches nothing.
    referenced_ids = SearchResult.where.not(search_id: nil).select(:search_id)
    orphaned = where.not(id: referenced_ids)
    orphaned_count = orphaned.count

    if orphaned_count > 0
      orphaned.destroy_all
      log_cleanup("Cleaned up #{orphaned_count} orphaned search records")
    end

    orphaned_count
  end

  # Destroys searches older than `days` days for which no result was clicked
  # (including searches that have no results at all).
  #
  # A search is kept as soon as one of its results was clicked; joining on
  # unclicked rows instead would also remove searches that merely contain an
  # unclicked result and would count each such search once per row.
  #
  # @param days [Integer] minimum age in days
  # @return [Integer] number of searches destroyed
  def self.cleanup_old_unused_searches(days: 30)
    cutoff_date = days.days.ago
    clicked_search_ids = SearchResult.where(clicked: true).where.not(search_id: nil).select(:search_id)
    unused_searches = where(created_at: ...cutoff_date).where.not(id: clicked_search_ids)

    unused_count = unused_searches.count

    if unused_count > 0
      unused_searches.destroy_all
      log_cleanup("Cleaned up #{unused_count} old unused search records")
    end

    unused_count
  end

  # Click-through rate, as a percentage rounded to two decimals, across all
  # results of the given searches.
  #
  # Stays publicly callable: a bare `private` has no effect on `def self.`
  # methods, so this method has always been reachable from outside the class.
  #
  # @param searches [ActiveRecord::Relation] searches to aggregate over
  # @return [Float]
  def self.calculate_avg_ctr(searches)
    search_ids = searches.pluck(:id)
    return 0.0 if search_ids.empty?

    total_results = SearchResult.where(search_id: search_ids).count
    return 0.0 if total_results == 0

    clicked_results = SearchResult.where(search_id: search_ids, clicked: true).count
    (clicked_results.to_f / total_results * 100).round(2)
  end

  # Writes a cleanup message to the Rails logger when Rails and a logger are available.
  def self.log_cleanup(message)
    return unless defined?(Rails) && Rails.respond_to?(:logger)

    Rails.logger&.info(message)
  end
  private_class_method :log_cleanup
end
|
165
|
+
end
|
@@ -0,0 +1,121 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "active_record"
|
4
|
+
|
5
|
+
module Ragdoll
|
6
|
+
# One embedding returned by a recorded search, with its similarity score, its
# rank within that search, and click tracking.
class SearchResult < ActiveRecord::Base
  self.table_name = "ragdoll_search_results"

  belongs_to :search, class_name: "Ragdoll::Search"
  belongs_to :embedding, class_name: "Ragdoll::Embedding"

  validates :similarity_score, presence: true, numericality: { in: 0.0..1.0 }
  validates :result_rank, presence: true, numericality: { greater_than: 0 }
  validates :result_rank, uniqueness: { scope: :search_id }

  scope :by_rank, -> { order(:result_rank) }
  scope :clicked, -> { where(clicked: true) }
  # Rows whose clicked flag is NULL are treated as not clicked.
  scope :unclicked, -> { where(clicked: [false, nil]) }
  scope :high_similarity, ->(threshold = 0.8) { where("similarity_score >= ?", threshold) }
  scope :recent_clicks, -> { where(clicked: true).order(clicked_at: :desc) }

  # Cleanup callback to remove searches when they have no results left
  after_destroy :cleanup_empty_search

  # Marks this result as clicked and stamps the click time.
  def mark_as_clicked!
    update!(clicked: true, clicked_at: Time.current)
  end

  # Content of the associated embedding, or nil when there is no embedding.
  def content
    embedding&.content
  end

  # Document reached through the embedding's embeddable, or nil when any link is missing.
  def document
    embedding&.embeddable&.document
  end

  # Title of the associated document, if any.
  def document_title
    document&.title
  end

  # Location of the associated document, if any.
  def document_location
    document&.location
  end

  # Aggregate figures for results created within the last `days` days.
  #
  # @param days [Integer] size of the reporting window
  # @param high_threshold [Numeric] results at or above this score count as high similarity
  # @param low_threshold [Numeric] results below this score count as low similarity
  # @return [Hash] totals, click-through rate (percentage), average similarity
  #   and per-rank click performance
  def self.analytics(days: 30, high_threshold: 0.8, low_threshold: 0.5)
    start_date = days.days.ago
    results = where(created_at: start_date..)

    {
      total_results: results.count,
      clicked_results: results.where(clicked: true).count,
      click_through_rate: calculate_ctr(results),
      avg_similarity_score: results.average(:similarity_score)&.round(4),
      high_similarity_results: results.where("similarity_score >= ?", high_threshold).count,
      low_similarity_results: results.where("similarity_score < ?", low_threshold).count,
      rank_performance: rank_click_analysis(results)
    }
  end

  # Click performance grouped by result rank.
  #
  # @param results [ActiveRecord::Relation, nil] results to analyse (defaults to all)
  # @return [Hash{Integer => Hash}] rank => { total:, clicked:, ctr: } where ctr
  #   is a percentage rounded to two decimals
  def self.rank_click_analysis(results = nil)
    results ||= all

    per_rank = results.group(:result_rank).group("clicked").count
                      .each_with_object({}) do |((rank, clicked), count), hash|
      hash[rank] ||= { total: 0, clicked: 0 }
      hash[rank][:total] += count
      hash[rank][:clicked] += count if clicked
    end

    per_rank.transform_values do |stats|
      stats.merge(
        ctr: stats[:total] > 0 ? (stats[:clicked].to_f / stats[:total] * 100).round(2) : 0.0
      )
    end
  end

  # Embeddings that appear in more than one search, with appearance count,
  # average similarity, click count and click-through rate, best first.
  #
  # @param limit [Integer] maximum number of rows
  # @return [ActiveRecord::Relation]
  def self.top_performing_embeddings(limit: 20)
    joins(:embedding)
      .group(:embedding_id)
      .select(
        "embedding_id",
        "COUNT(*) as appearance_count",
        "AVG(similarity_score) as avg_similarity",
        "COUNT(CASE WHEN clicked THEN 1 END) as click_count",
        "ROUND(COUNT(CASE WHEN clicked THEN 1 END) * 100.0 / COUNT(*), 2) as ctr"
      )
      .having("COUNT(*) > 1")
      .order("avg_similarity DESC, ctr DESC")
      .limit(limit)
  end

  # Click-through rate of the given results as a percentage rounded to two decimals.
  #
  # Stays publicly callable: a bare `private` has no effect on `def self.`
  # methods, so this method has always been reachable from outside the class.
  #
  # @param results [ActiveRecord::Relation]
  # @return [Float]
  def self.calculate_ctr(results)
    total = results.count
    return 0.0 if total == 0

    clicked = results.where(clicked: true).count
    (clicked.to_f / total * 100).round(2)
  end

  private

  # Cleanup callback to remove parent search if it has no results left
  def cleanup_empty_search
    return unless search

    # EXISTS is enough here; the exact number of remaining rows is irrelevant.
    search.destroy unless search.search_results.exists?
  end
end
|
121
|
+
end
|
@@ -20,10 +20,10 @@ module Ragdoll
|
|
20
20
|
@config.embedding_model(content_type)
|
21
21
|
when :summary, :keywords
|
22
22
|
# Check for task-specific model, fall back to default
|
23
|
-
task_model = @config.models
|
24
|
-
task_model || @config.models
|
23
|
+
task_model = @config.models[:text_generation][task_type]
|
24
|
+
task_model || @config.models[:text_generation][:default]
|
25
25
|
else
|
26
|
-
@config.models
|
26
|
+
@config.models[:text_generation][:default]
|
27
27
|
end
|
28
28
|
end
|
29
29
|
|