ragdoll 0.1.8 → 0.1.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +201 -0
- data/README.md +160 -31
- data/Rakefile +0 -3
- data/app/models/ragdoll/embedding.rb +74 -0
- data/app/models/ragdoll/search.rb +165 -0
- data/app/models/ragdoll/search_result.rb +121 -0
- data/app/services/ragdoll/configuration_service.rb +3 -3
- data/app/services/ragdoll/document_processor.rb +124 -1
- data/app/services/ragdoll/embedding_service.rb +10 -0
- data/app/services/ragdoll/search_engine.rb +64 -6
- data/db/migrate/007_create_ragdoll_searches.rb +73 -0
- data/db/migrate/008_create_ragdoll_search_results.rb +49 -0
- data/lib/ragdoll/core/client.rb +75 -8
- data/lib/ragdoll/core/model.rb +13 -0
- data/lib/ragdoll/core/version.rb +1 -1
- data/lib/ragdoll/core.rb +2 -0
- data/lib/ragdoll.rb +17 -0
- data/lib/tasks/db.rake +13 -13
- metadata +371 -2
@@ -0,0 +1,165 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "active_record"
|
4
|
+
require "neighbor"
|
5
|
+
|
6
|
+
module Ragdoll
  # One executed search: the query text, its embedding, the options used and
  # summary statistics. The individual hits are stored as SearchResult rows.
  class Search < ActiveRecord::Base
    self.table_name = "ragdoll_searches"

    # Use pgvector for vector similarity search on query embeddings
    has_neighbors :query_embedding

    has_many :search_results, class_name: "Ragdoll::SearchResult", foreign_key: "search_id", dependent: :destroy
    has_many :embeddings, through: :search_results

    validates :query, presence: true
    validates :query_embedding, presence: true
    validates :search_type, presence: true, inclusion: { in: %w[semantic hybrid fulltext] }
    validates :results_count, presence: true, numericality: { greater_than_or_equal_to: 0 }

    scope :by_type, ->(type) { where(search_type: type) }
    scope :by_session, ->(session_id) { where(session_id: session_id) }
    scope :by_user, ->(user_id) { where(user_id: user_id) }
    scope :recent, -> { order(created_at: :desc) }
    scope :with_results, -> { where("results_count > 0") }
    scope :popular, -> { where("results_count > 0").order(results_count: :desc) }
    scope :slow_searches, ->(threshold_ms = 1000) { where("execution_time_ms > ?", threshold_ms) }

    # Find previously recorded searches whose query embedding is close to the
    # given one.
    #
    # @param query_embedding [Array] embedding to compare against
    # @param limit [Integer] maximum number of searches returned
    # @param threshold [Float] minimum similarity (1.0 - cosine distance)
    # @return [Array<Search>] best match first; each record responds to
    #   +similarity_score+
    def self.find_similar(query_embedding, limit: 10, threshold: 0.8)
      # Fetch twice the limit so that filtering by threshold still leaves
      # enough candidates.
      candidates = nearest_neighbors(:query_embedding, query_embedding, distance: "cosine").limit(limit * 2)

      matches = candidates.filter_map do |search|
        similarity = 1.0 - search.neighbor_distance
        next if similarity < threshold

        search.define_singleton_method(:similarity_score) { similarity }
        search
      end

      matches.sort_by(&:similarity_score).reverse.first(limit)
    end

    # Store max/min/avg similarity of this search's results on the record.
    # Does nothing (returns nil) when there are no scores to aggregate.
    #
    # @raise [ActiveRecord::RecordInvalid] if the update fails validation
    def calculate_similarity_stats!
      # A single pluck replaces the former any? + pluck pair of queries.
      scores = search_results.pluck(:similarity_score).compact
      return if scores.empty?

      update!(
        max_similarity_score: scores.max,
        min_similarity_score: scores.min,
        avg_similarity_score: scores.sum.to_f / scores.length
      )
    end

    # Results ordered by rank, with their embeddings eager-loaded.
    def ranked_results
      search_results.includes(:embedding).order(:result_rank)
    end

    # Results that were clicked, in click order.
    def clicked_results
      search_results.where(clicked: true).order(:clicked_at)
    end

    # Fraction (0.0..1.0) of this search's results that were clicked.
    # Note: this is a ratio, whereas the analytics helpers return percentages.
    def click_through_rate
      return 0.0 if results_count.to_i.zero?

      clicked_count = search_results.where(clicked: true).count
      clicked_count.to_f / results_count
    end

    # Record a search together with its results.
    #
    # The search row, its result rows and the similarity statistics are
    # written in one transaction, so a failing result insert no longer leaves
    # a partially recorded search behind.
    #
    # @param query [String] query text
    # @param query_embedding [Array] embedding of the query
    # @param results [Array<Hash>] hits with :embedding_id and :similarity keys,
    #   in rank order
    # @return [Search] the created record
    # @raise [ActiveRecord::RecordInvalid] when the search or a result is invalid
    def self.record_search(query:, query_embedding:, results:, search_type: "semantic",
                           filters: {}, options: {}, execution_time_ms: nil,
                           session_id: nil, user_id: nil)
      transaction do
        search = create!(
          query: query,
          query_embedding: query_embedding,
          search_type: search_type,
          results_count: results.length,
          search_filters: filters,
          search_options: options,
          execution_time_ms: execution_time_ms,
          session_id: session_id,
          user_id: user_id
        )

        # Ranks are 1-based and follow the order of +results+.
        results.each.with_index(1) do |result, rank|
          search.search_results.create!(
            embedding_id: result[:embedding_id],
            similarity_score: result[:similarity],
            result_rank: rank
          )
        end

        search.calculate_similarity_stats!
        search
      end
    end

    # Aggregate statistics over searches created in the last +days+ days.
    #
    # @return [Hash] totals, averages, per-type counts and average CTR (percent)
    def self.search_analytics(days: 30)
      start_date = days.days.ago
      searches = where(created_at: start_date..)

      {
        total_searches: searches.count,
        unique_queries: searches.distinct.count(:query),
        avg_results_per_search: searches.average(:results_count)&.round(2),
        avg_execution_time: searches.average(:execution_time_ms)&.round(2),
        search_types: searches.group(:search_type).count,
        searches_with_results: searches.where("results_count > 0").count,
        avg_click_through_rate: calculate_avg_ctr(searches)
      }
    end

    # Destroy searches that have no remaining search results.
    #
    # @return [Integer] number of searches removed
    def self.cleanup_orphaned_searches
      # Subquery instead of plucking every search_id into memory; NULLs are
      # excluded so the NOT IN comparison stays well-defined.
      referenced_ids = SearchResult.where.not(search_id: nil).select(:search_id)
      orphaned_searches = where.not(id: referenced_ids)
      orphaned_count = orphaned_searches.count

      if orphaned_count.positive?
        orphaned_searches.destroy_all
        Rails.logger.info "Cleaned up #{orphaned_count} orphaned search records" if defined?(Rails)
      end

      orphaned_count
    end

    # Destroy searches older than +days+ days on which no result was clicked.
    #
    # The previous LEFT JOIN matched a search as soon as any one of its
    # results was unclicked, so searches that did have clicks were deleted,
    # and the count was of joined rows rather than searches. Searches are now
    # excluded when at least one of their results is clicked.
    #
    # @return [Integer] number of searches removed
    def self.cleanup_old_unused_searches(days: 30)
      cutoff_date = days.days.ago
      clicked_search_ids = SearchResult.where(clicked: true).where.not(search_id: nil).select(:search_id)
      unused_searches = where(created_at: ...cutoff_date).where.not(id: clicked_search_ids)

      unused_count = unused_searches.count

      if unused_count.positive?
        unused_searches.destroy_all
        Rails.logger.info "Cleaned up #{unused_count} old unused search records" if defined?(Rails)
      end

      unused_count
    end

    # Internal helper for search_analytics: percentage of results belonging to
    # +searches+ that were clicked.
    #
    # NOTE: this used to sit under a bare `private`, which has no effect on
    # `def self.` methods; the misleading keyword was dropped and the method
    # remains callable exactly as before.
    #
    # @param searches [ActiveRecord::Relation] searches to aggregate over
    # @return [Float] click-through rate in percent, 0.0 when there are no results
    def self.calculate_avg_ctr(searches)
      # Subquery keeps the id list in the database instead of loading it.
      scoped_results = SearchResult.where(search_id: searches.select(:id))

      total_results = scoped_results.count
      return 0.0 if total_results.zero?

      clicked_results = scoped_results.where(clicked: true).count
      (clicked_results.to_f / total_results * 100).round(2)
    end
  end
end
|
@@ -0,0 +1,121 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "active_record"
|
4
|
+
|
5
|
+
module Ragdoll
  # One hit returned for a Search: links the search to an embedding and stores
  # its similarity score, rank and click state.
  class SearchResult < ActiveRecord::Base
    self.table_name = "ragdoll_search_results"

    belongs_to :search, class_name: "Ragdoll::Search"
    belongs_to :embedding, class_name: "Ragdoll::Embedding"

    validates :similarity_score, presence: true, numericality: { in: 0.0..1.0 }
    validates :result_rank, presence: true, numericality: { greater_than: 0 }
    validates :result_rank, uniqueness: { scope: :search_id }

    scope :by_rank, -> { order(:result_rank) }
    scope :clicked, -> { where(clicked: true) }
    # NULL counts as "not clicked", matching how Search's cleanup treats
    # clicked: [nil, false].
    scope :unclicked, -> { where(clicked: [nil, false]) }
    scope :high_similarity, ->(threshold = 0.8) { where("similarity_score >= ?", threshold) }
    scope :recent_clicks, -> { where(clicked: true).order(clicked_at: :desc) }

    # Cleanup callback to remove searches when they have no results left
    after_destroy :cleanup_empty_search

    # Mark this result as clicked, stamping the click time.
    #
    # @raise [ActiveRecord::RecordInvalid] if the update fails validation
    def mark_as_clicked!
      update!(clicked: true, clicked_at: Time.current)
    end

    # Content of the associated embedding, or nil when there is none.
    def content
      embedding&.content
    end

    # Document reached through the embedding's embeddable, or nil.
    def document
      embedding&.embeddable&.document
    end

    # Title of the associated document, or nil.
    def document_title
      document&.title
    end

    # Location of the associated document, or nil.
    def document_location
      document&.location
    end

    # Aggregate statistics over results created in the last +days+ days.
    #
    # @return [Hash] counts, CTR (percent), average similarity and per-rank stats
    def self.analytics(days: 30)
      start_date = days.days.ago
      results = where(created_at: start_date..)

      {
        total_results: results.count,
        clicked_results: results.where(clicked: true).count,
        click_through_rate: calculate_ctr(results),
        avg_similarity_score: results.average(:similarity_score)&.round(4),
        high_similarity_results: results.where("similarity_score >= 0.8").count,
        low_similarity_results: results.where("similarity_score < 0.5").count,
        rank_performance: rank_click_analysis(results)
      }
    end

    # Click performance broken down by result rank.
    #
    # @param results [ActiveRecord::Relation, nil] results to analyse (defaults to all)
    # @return [Hash] rank => { total:, clicked:, ctr: } with ctr in percent
    def self.rank_click_analysis(results = nil)
      results ||= all

      # The grouped count is keyed by [rank, clicked]; fold it into one entry
      # per rank.
      per_rank = results.group(:result_rank).group("clicked").count
                        .each_with_object({}) do |((rank, clicked), count), hash|
        hash[rank] ||= { total: 0, clicked: 0 }
        hash[rank][:total] += count
        hash[rank][:clicked] += count if clicked
      end

      per_rank.transform_values do |stats|
        ctr = stats[:total].positive? ? (stats[:clicked].to_f / stats[:total] * 100).round(2) : 0.0
        stats.merge(ctr: ctr)
      end
    end

    # Embeddings that appear in more than one search, ordered by average
    # similarity and then click-through rate.
    #
    # @param limit [Integer] maximum number of rows
    # @return [ActiveRecord::Relation] rows exposing embedding_id,
    #   appearance_count, avg_similarity, click_count and ctr
    def self.top_performing_embeddings(limit: 20)
      joins(:embedding)
        .group(:embedding_id)
        .select(
          "embedding_id",
          "COUNT(*) as appearance_count",
          "AVG(similarity_score) as avg_similarity",
          "COUNT(CASE WHEN clicked THEN 1 END) as click_count",
          "ROUND(COUNT(CASE WHEN clicked THEN 1 END) * 100.0 / COUNT(*), 2) as ctr"
        )
        .having("COUNT(*) > 1")
        .order("avg_similarity DESC, ctr DESC")
        .limit(limit)
    end

    # Internal helper for analytics: percentage of +results+ that were clicked.
    #
    # NOTE: kept above `private` on purpose - `private` never applied to
    # `def self.` methods, so this was (and stays) publicly callable.
    #
    # @return [Float] click-through rate in percent, 0.0 for an empty set
    def self.calculate_ctr(results)
      total = results.count
      return 0.0 if total.zero?

      clicked = results.where(clicked: true).count
      (clicked.to_f / total * 100).round(2)
    end

    private

    # after_destroy callback: remove the parent search once its last result is
    # gone.
    def cleanup_empty_search
      # When the parent is being destroyed and is cascading through
      # `dependent: :destroy`, it removes itself; destroying it again from
      # here would only re-enter the destroy chain.
      return if destroyed_by_association

      parent = search
      return if parent.nil? || parent.destroyed?

      parent.destroy unless parent.search_results.exists?
    end
  end
end
|
@@ -20,10 +20,10 @@ module Ragdoll
|
|
20
20
|
@config.embedding_model(content_type)
|
21
21
|
when :summary, :keywords
|
22
22
|
# Check for task-specific model, fall back to default
|
23
|
-
task_model = @config.models
|
24
|
-
task_model || @config.models
|
23
|
+
task_model = @config.models[:text_generation][task_type]
|
24
|
+
task_model || @config.models[:text_generation][:default]
|
25
25
|
else
|
26
|
-
@config.models
|
26
|
+
@config.models[:text_generation][:default]
|
27
27
|
end
|
28
28
|
end
|
29
29
|
|
@@ -3,6 +3,8 @@
|
|
3
3
|
require "pdf-reader"
|
4
4
|
require "docx"
|
5
5
|
require "rmagick"
|
6
|
+
require "yaml"
|
7
|
+
require "date"
|
6
8
|
# Image description service is auto-loaded from app/services
|
7
9
|
|
8
10
|
module Ragdoll
|
@@ -137,6 +139,11 @@ module Ragdoll
|
|
137
139
|
raise ParseError, "Unsupported PDF feature: #{e.message}"
|
138
140
|
end
|
139
141
|
|
142
|
+
# Add filepath-based title as fallback if no title was found
|
143
|
+
if metadata[:title].nil? || (metadata[:title].is_a?(String) && metadata[:title].strip.empty?)
|
144
|
+
metadata[:title] = extract_title_from_filepath
|
145
|
+
end
|
146
|
+
|
140
147
|
{
|
141
148
|
content: content.strip,
|
142
149
|
metadata: metadata,
|
@@ -192,6 +199,11 @@ module Ragdoll
|
|
192
199
|
raise ParseError, "#{__LINE__} Failed to parse DOCX: #{e.message}"
|
193
200
|
end
|
194
201
|
|
202
|
+
# Add filepath-based title as fallback if no title was found
|
203
|
+
if metadata[:title].nil? || (metadata[:title].is_a?(String) && metadata[:title].strip.empty?)
|
204
|
+
metadata[:title] = extract_title_from_filepath
|
205
|
+
end
|
206
|
+
|
195
207
|
{
|
196
208
|
content: content.strip,
|
197
209
|
metadata: metadata,
|
@@ -212,6 +224,20 @@ module Ragdoll
|
|
212
224
|
else "text"
|
213
225
|
end
|
214
226
|
|
227
|
+
# Parse YAML front matter for markdown files
|
228
|
+
if document_type == "markdown" && content.start_with?("---\n")
|
229
|
+
front_matter, body_content = parse_yaml_front_matter(content)
|
230
|
+
if front_matter
|
231
|
+
metadata.merge!(front_matter)
|
232
|
+
content = body_content
|
233
|
+
end
|
234
|
+
end
|
235
|
+
|
236
|
+
# Add filepath-based title as fallback if no title was found
|
237
|
+
if metadata[:title].nil? || (metadata[:title].is_a?(String) && metadata[:title].strip.empty?)
|
238
|
+
metadata[:title] = extract_title_from_filepath
|
239
|
+
end
|
240
|
+
|
215
241
|
{
|
216
242
|
content: content,
|
217
243
|
metadata: metadata,
|
@@ -225,16 +251,41 @@ module Ragdoll
|
|
225
251
|
encoding: "ISO-8859-1"
|
226
252
|
}
|
227
253
|
|
254
|
+
# Try to parse front matter with different encoding too
|
255
|
+
if document_type == "markdown" && content.start_with?("---\n")
|
256
|
+
front_matter, body_content = parse_yaml_front_matter(content)
|
257
|
+
if front_matter
|
258
|
+
metadata.merge!(front_matter)
|
259
|
+
content = body_content
|
260
|
+
end
|
261
|
+
end
|
262
|
+
|
263
|
+
# Add filepath-based title as fallback if no title was found
|
264
|
+
if metadata[:title].nil? || (metadata[:title].is_a?(String) && metadata[:title].strip.empty?)
|
265
|
+
metadata[:title] = extract_title_from_filepath
|
266
|
+
end
|
267
|
+
|
228
268
|
{
|
229
269
|
content: content,
|
230
270
|
metadata: metadata,
|
231
|
-
document_type: "text"
|
271
|
+
document_type: document_type.nil? ? "text" : document_type
|
232
272
|
}
|
233
273
|
end
|
234
274
|
|
235
275
|
def parse_html
|
236
276
|
content = File.read(@file_path, encoding: "UTF-8")
|
237
277
|
|
278
|
+
# Extract title from H1 tag if present
|
279
|
+
h1_match = content.match(%r{<h1[^>]*>(.*?)</h1>}mi)
|
280
|
+
title = nil
|
281
|
+
if h1_match
|
282
|
+
# Clean up the H1 content by removing any HTML tags and normalizing whitespace
|
283
|
+
title = h1_match[1]
|
284
|
+
.gsub(/<[^>]+>/, " ") # Remove any nested HTML tags
|
285
|
+
.gsub(/\s+/, " ") # Normalize whitespace
|
286
|
+
.strip
|
287
|
+
end
|
288
|
+
|
238
289
|
# Basic HTML tag stripping (for more advanced parsing, consider using Nokogiri)
|
239
290
|
clean_content = content
|
240
291
|
.gsub(%r{<script[^>]*>.*?</script>}mi, "") # Remove script tags
|
@@ -248,6 +299,13 @@ module Ragdoll
|
|
248
299
|
original_format: "html"
|
249
300
|
}
|
250
301
|
|
302
|
+
# Add title to metadata if found, otherwise use filepath fallback
|
303
|
+
if title && !title.empty?
|
304
|
+
metadata[:title] = title
|
305
|
+
else
|
306
|
+
metadata[:title] = extract_title_from_filepath
|
307
|
+
end
|
308
|
+
|
251
309
|
{
|
252
310
|
content: clean_content,
|
253
311
|
metadata: metadata,
|
@@ -286,6 +344,9 @@ module Ragdoll
|
|
286
344
|
# Use AI-generated description or fallback placeholder
|
287
345
|
content = desc && !desc.empty? ? desc : "Image file: #{File.basename(@file_path)}"
|
288
346
|
|
347
|
+
# Add filepath-based title as fallback
|
348
|
+
metadata[:title] = extract_title_from_filepath
|
349
|
+
|
289
350
|
puts "✅ DocumentProcessor: Image parsing complete. Content: '#{content[0..100]}...'"
|
290
351
|
|
291
352
|
{
|
@@ -338,5 +399,67 @@ module Ragdoll
|
|
338
399
|
else "application/octet-stream"
|
339
400
|
end
|
340
401
|
end
|
402
|
+
|
403
|
+
private
|
404
|
+
|
405
|
+
# Derive a human-readable fallback title from a file path.
#
# The extension is dropped, runs of hyphens/underscores become spaces,
# camelCase boundaries are split, whitespace is collapsed, and every word is
# capitalized (first letter upper-case, the rest lower-case).
#
# @param file_path [String] path to the file (defaults to @file_path)
# @return [String] the cleaned-up title
def extract_title_from_filepath(file_path = @file_path)
  stem = File.basename(file_path, File.extname(file_path))

  spaced = stem.gsub(/[-_]+/, ' ')
  spaced = spaced.gsub(/([a-z])([A-Z])/, '\1 \2')
  words = spaced.gsub(/\s+/, ' ').strip.split(' ')

  words.map { |word| word.capitalize }.join(' ')
end
|
421
|
+
|
422
|
+
# Parse YAML front matter from markdown content.
#
# Front matter is the block between a leading "---" line and the next line
# consisting only of "---". The original content is returned untouched
# (with a nil hash) when there is no opening or closing delimiter, when the
# YAML cannot be loaded safely, or when it does not describe a mapping -
# callers merge the result into a metadata hash, so anything but a Hash
# would fail there.
#
# @param content [String] the full content of the markdown file
# @return [Array] [front_matter_hash, body_content] or [nil, original_content]
def parse_yaml_front_matter(content)
  return [nil, content] unless content.start_with?("---\n")

  lines = content.lines
  # Line 0 is the opening delimiter; look for the first closing one after it.
  closing_index = (1...lines.length).find { |index| lines[index].strip == "---" }
  return [nil, content] unless closing_index

  yaml_content = lines[1...closing_index].join
  body_content = (lines[(closing_index + 1)..] || []).join

  begin
    # Allow Time/Date objects for date fields in YAML front matter
    front_matter = YAML.safe_load(yaml_content, permitted_classes: [Time, Date])
  rescue Psych::Exception => e
    # Psych::Exception covers syntax errors, disallowed classes and
    # aliases (Psych::BadAlias), all of which safe_load can raise.
    Rails.logger.warn "Warning: Failed to parse YAML front matter: #{e.message}" if defined?(Rails)
    return [nil, content]
  end

  # An empty front matter block loads as nil (false on older Psych); treat it
  # as an empty mapping so the delimiters are still stripped from the body.
  front_matter ||= {}
  return [nil, content] unless front_matter.is_a?(Hash)

  # Symbolize keys for consistency; to_s first because YAML keys may be
  # non-strings (e.g. integers), which do not respond to to_sym.
  [front_matter.transform_keys { |key| key.to_s.to_sym }, body_content.strip]
end
|
341
464
|
end
|
342
465
|
end
|
@@ -38,6 +38,11 @@ module Ragdoll
|
|
38
38
|
embedding_config = @model_resolver.resolve_embedding(:text)
|
39
39
|
# Use just the model name for RubyLLM
|
40
40
|
model = embedding_config.model.model
|
41
|
+
|
42
|
+
# If model is nil or empty, use fallback
|
43
|
+
if model.nil? || model.empty?
|
44
|
+
return generate_fallback_embedding
|
45
|
+
end
|
41
46
|
|
42
47
|
begin
|
43
48
|
response = RubyLLM.embed(cleaned_text, model: model)
|
@@ -93,6 +98,11 @@ module Ragdoll
|
|
93
98
|
embedding_config = @model_resolver.resolve_embedding(:text)
|
94
99
|
# Use just the model name for RubyLLM
|
95
100
|
model = embedding_config.model.model
|
101
|
+
|
102
|
+
# If model is nil or empty, use fallback
|
103
|
+
if model.nil? || model.empty?
|
104
|
+
return cleaned_texts.map { generate_fallback_embedding }
|
105
|
+
end
|
96
106
|
|
97
107
|
cleaned_texts.map do |text|
|
98
108
|
response = RubyLLM.embed(text, model: model)
|
@@ -27,25 +27,83 @@ module Ragdoll
|
|
27
27
|
end
|
28
28
|
|
29
29
|
# Run a similarity search and, optionally, record it for analytics.
#
# @param query_or_embedding [String, Array] query text, or an embedding that
#   was already computed (pass the text via options[:query] to have the
#   search tracked)
# @param options [Hash] :limit, :threshold, :filters, :session_id, :user_id,
#   :query, :track_search (defaults to true)
# @return [Hash] { results:, statistics:, execution_time_ms: } on every path.
#   When no embedding can be generated the hash carries empty results (this
#   path used to return a bare [] while all other paths returned a Hash).
def search_similar_content(query_or_embedding, options = {})
  # Monotonic clock: durations must not be affected by wall-clock changes.
  started_at = Process.clock_gettime(Process::CLOCK_MONOTONIC)
  elapsed_ms = -> { ((Process.clock_gettime(Process::CLOCK_MONOTONIC) - started_at) * 1000).round }

  search_config = @config_service.search_config
  limit = options[:limit] || search_config[:max_results]
  threshold = options[:threshold] || search_config[:similarity_threshold]
  filters = options[:filters] || {}

  # Extract tracking options
  session_id = options[:session_id]
  user_id = options[:user_id]
  track_search = options.fetch(:track_search, true)

  if query_or_embedding.is_a?(Array)
    # It's already an embedding
    query_embedding = query_or_embedding
    query_string = options[:query] # Should be provided when passing embedding directly
  else
    # It's a query string, generate embedding
    query_string = query_or_embedding
    query_embedding = @embedding_service.generate_embedding(query_string)
    return { results: [], statistics: nil, execution_time_ms: elapsed_ms.call } if query_embedding.nil?
  end

  # Try the statistics-aware search first, fall back to the plain one.
  begin
    search_response = Ragdoll::Embedding.search_similar_with_stats(query_embedding,
                                                                   limit: limit,
                                                                   threshold: threshold,
                                                                   filters: filters)
    results = search_response[:results]
    statistics = search_response[:statistics]
  rescue NoMethodError, ActiveRecord::StatementInvalid, PG::SyntaxError => e
    # ActiveRecord wraps driver errors (including PG::SyntaxError) in
    # ActiveRecord::StatementInvalid, so that is the class that actually
    # reaches this rescue for SQL problems.
    puts "Warning: Enhanced search failed (#{e.message}), using fallback" if ENV["RAGDOLL_DEBUG"]
    results = Ragdoll::Embedding.search_similar(query_embedding,
                                                limit: limit,
                                                threshold: threshold,
                                                filters: filters)
    statistics = nil
  end

  execution_time = elapsed_ms.call

  # Record search if tracking enabled and we have a query string
  if track_search && query_string && !query_string.empty?
    begin
      # Normalise results to the shape record_search expects.
      search_results = results.map do |result|
        {
          embedding_id: result[:embedding_id] || result[:id],
          similarity: result[:similarity] || result[:similarity_score] || 0.0
        }
      end

      Ragdoll::Search.record_search(
        query: query_string,
        query_embedding: query_embedding,
        results: search_results,
        search_type: "semantic",
        filters: filters,
        options: { limit: limit, threshold: threshold },
        execution_time_ms: execution_time,
        session_id: session_id,
        user_id: user_id
      )
    rescue => e
      # Tracking is best-effort: log but don't fail the search
      puts "Warning: Search tracking failed: #{e.message}" if ENV["RAGDOLL_DEBUG"]
    end
  end

  # Return results with statistics for better user feedback
  {
    results: results,
    statistics: statistics,
    execution_time_ms: execution_time
  }
end
|
50
108
|
end
|
51
109
|
end
|
@@ -0,0 +1,73 @@
|
|
1
|
+
# Creates the ragdoll_searches table: one row per executed search, holding the
# query text, its vector embedding, the options used, and summary statistics,
# plus the indexes used for similarity, filtering and full-text lookups.
class CreateRagdollSearches < ActiveRecord::Migration[7.0]
  def change
    table_comment = "Search queries and results tracking with vector similarity support"

    create_table :ragdoll_searches, comment: table_comment do |t|
      # --- Query and its embedding ---
      t.text :query,
             null: false,
             comment: "Original search query text"

      t.vector :query_embedding,
               limit: 1536,
               null: false,
               comment: "Vector embedding of the search query for similarity matching"

      t.string :search_type,
               null: false,
               default: "semantic",
               comment: "Type of search performed (semantic, hybrid, fulltext)"

      # --- Result statistics ---
      t.integer :results_count,
                null: false,
                default: 0,
                comment: "Number of results returned for this search"

      t.float :max_similarity_score, comment: "Highest similarity score from results"

      t.float :min_similarity_score, comment: "Lowest similarity score from results"

      t.float :avg_similarity_score, comment: "Average similarity score of results"

      # --- Search parameters ---
      t.json :search_filters,
             default: {},
             comment: "Filters applied during search (document_type, date_range, etc.)"

      t.json :search_options,
             default: {},
             comment: "Search configuration options (threshold, limit, etc.)"

      t.integer :execution_time_ms, comment: "Search execution time in milliseconds"

      # --- Attribution ---
      t.string :session_id, comment: "User session identifier for grouping related searches"

      t.string :user_id, comment: "User identifier if authentication is available"

      t.timestamps null: false, comment: "Standard creation and update timestamps"

      # --- Indexes ---
      t.index :query_embedding,
              using: :ivfflat,
              opclass: :vector_cosine_ops,
              name: "index_ragdoll_searches_on_query_embedding_cosine",
              comment: "IVFFlat index for finding similar search queries"

      t.index :search_type, comment: "Index for filtering by search type"

      t.index :session_id, comment: "Index for grouping searches by session"

      t.index :user_id, comment: "Index for filtering searches by user"

      t.index :created_at, comment: "Index for chronological search history"

      t.index :results_count, comment: "Index for analyzing search effectiveness"

      # Expression index over the query text.
      t.index "to_tsvector('english', query)",
              using: :gin,
              name: "index_ragdoll_searches_on_fulltext_query",
              comment: "Full-text search index for finding searches by query text"
    end
  end
end
|