ragdoll 0.1.8 → 0.1.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +243 -0
- data/README.md +209 -31
- data/Rakefile +4 -5
- data/app/models/ragdoll/document.rb +115 -12
- data/app/models/ragdoll/embedding.rb +108 -2
- data/app/models/ragdoll/search.rb +165 -0
- data/app/models/ragdoll/search_result.rb +121 -0
- data/app/services/ragdoll/configuration_service.rb +3 -3
- data/app/services/ragdoll/document_processor.rb +124 -1
- data/app/services/ragdoll/embedding_service.rb +10 -0
- data/app/services/ragdoll/search_engine.rb +75 -6
- data/db/migrate/{001_enable_postgresql_extensions.rb → 20250815234901_enable_postgresql_extensions.rb} +7 -8
- data/db/migrate/20250815234902_create_ragdoll_documents.rb +117 -0
- data/db/migrate/{005_create_ragdoll_embeddings.rb → 20250815234903_create_ragdoll_embeddings.rb} +13 -10
- data/db/migrate/{006_create_ragdoll_contents.rb → 20250815234904_create_ragdoll_contents.rb} +14 -11
- data/db/migrate/20250815234905_create_ragdoll_searches.rb +77 -0
- data/db/migrate/20250815234906_create_ragdoll_search_results.rb +49 -0
- data/lib/ragdoll/core/client.rb +75 -8
- data/lib/ragdoll/core/database.rb +8 -3
- data/lib/ragdoll/core/model.rb +13 -0
- data/lib/ragdoll/core/version.rb +1 -1
- data/lib/ragdoll/core.rb +2 -0
- data/lib/ragdoll.rb +17 -0
- data/lib/tasks/db.rake +75 -27
- metadata +375 -6
- data/db/migrate/004_create_ragdoll_documents.rb +0 -70
@@ -3,6 +3,8 @@
|
|
3
3
|
require "pdf-reader"
|
4
4
|
require "docx"
|
5
5
|
require "rmagick"
|
6
|
+
require "yaml"
|
7
|
+
require "date"
|
6
8
|
# Image description service is auto-loaded from app/services
|
7
9
|
|
8
10
|
module Ragdoll
|
@@ -137,6 +139,11 @@ module Ragdoll
|
|
137
139
|
raise ParseError, "Unsupported PDF feature: #{e.message}"
|
138
140
|
end
|
139
141
|
|
142
|
+
# Add filepath-based title as fallback if no title was found
|
143
|
+
if metadata[:title].nil? || (metadata[:title].is_a?(String) && metadata[:title].strip.empty?)
|
144
|
+
metadata[:title] = extract_title_from_filepath
|
145
|
+
end
|
146
|
+
|
140
147
|
{
|
141
148
|
content: content.strip,
|
142
149
|
metadata: metadata,
|
@@ -192,6 +199,11 @@ module Ragdoll
|
|
192
199
|
raise ParseError, "#{__LINE__} Failed to parse DOCX: #{e.message}"
|
193
200
|
end
|
194
201
|
|
202
|
+
# Add filepath-based title as fallback if no title was found
|
203
|
+
if metadata[:title].nil? || (metadata[:title].is_a?(String) && metadata[:title].strip.empty?)
|
204
|
+
metadata[:title] = extract_title_from_filepath
|
205
|
+
end
|
206
|
+
|
195
207
|
{
|
196
208
|
content: content.strip,
|
197
209
|
metadata: metadata,
|
@@ -212,6 +224,20 @@ module Ragdoll
|
|
212
224
|
else "text"
|
213
225
|
end
|
214
226
|
|
227
|
+
# Parse YAML front matter for markdown files
|
228
|
+
if document_type == "markdown" && content.start_with?("---\n")
|
229
|
+
front_matter, body_content = parse_yaml_front_matter(content)
|
230
|
+
if front_matter
|
231
|
+
metadata.merge!(front_matter)
|
232
|
+
content = body_content
|
233
|
+
end
|
234
|
+
end
|
235
|
+
|
236
|
+
# Add filepath-based title as fallback if no title was found
|
237
|
+
if metadata[:title].nil? || (metadata[:title].is_a?(String) && metadata[:title].strip.empty?)
|
238
|
+
metadata[:title] = extract_title_from_filepath
|
239
|
+
end
|
240
|
+
|
215
241
|
{
|
216
242
|
content: content,
|
217
243
|
metadata: metadata,
|
@@ -225,16 +251,41 @@ module Ragdoll
|
|
225
251
|
encoding: "ISO-8859-1"
|
226
252
|
}
|
227
253
|
|
254
|
+
# Try to parse front matter with different encoding too
|
255
|
+
if document_type == "markdown" && content.start_with?("---\n")
|
256
|
+
front_matter, body_content = parse_yaml_front_matter(content)
|
257
|
+
if front_matter
|
258
|
+
metadata.merge!(front_matter)
|
259
|
+
content = body_content
|
260
|
+
end
|
261
|
+
end
|
262
|
+
|
263
|
+
# Add filepath-based title as fallback if no title was found
|
264
|
+
if metadata[:title].nil? || (metadata[:title].is_a?(String) && metadata[:title].strip.empty?)
|
265
|
+
metadata[:title] = extract_title_from_filepath
|
266
|
+
end
|
267
|
+
|
228
268
|
{
|
229
269
|
content: content,
|
230
270
|
metadata: metadata,
|
231
|
-
document_type: "text"
|
271
|
+
document_type: document_type.nil? ? "text" : document_type
|
232
272
|
}
|
233
273
|
end
|
234
274
|
|
235
275
|
def parse_html
|
236
276
|
content = File.read(@file_path, encoding: "UTF-8")
|
237
277
|
|
278
|
+
# Extract title from H1 tag if present
|
279
|
+
h1_match = content.match(%r{<h1[^>]*>(.*?)</h1>}mi)
|
280
|
+
title = nil
|
281
|
+
if h1_match
|
282
|
+
# Clean up the H1 content by removing any HTML tags and normalizing whitespace
|
283
|
+
title = h1_match[1]
|
284
|
+
.gsub(/<[^>]+>/, " ") # Remove any nested HTML tags
|
285
|
+
.gsub(/\s+/, " ") # Normalize whitespace
|
286
|
+
.strip
|
287
|
+
end
|
288
|
+
|
238
289
|
# Basic HTML tag stripping (for more advanced parsing, consider using Nokogiri)
|
239
290
|
clean_content = content
|
240
291
|
.gsub(%r{<script[^>]*>.*?</script>}mi, "") # Remove script tags
|
@@ -248,6 +299,13 @@ module Ragdoll
|
|
248
299
|
original_format: "html"
|
249
300
|
}
|
250
301
|
|
302
|
+
# Add title to metadata if found, otherwise use filepath fallback
|
303
|
+
if title && !title.empty?
|
304
|
+
metadata[:title] = title
|
305
|
+
else
|
306
|
+
metadata[:title] = extract_title_from_filepath
|
307
|
+
end
|
308
|
+
|
251
309
|
{
|
252
310
|
content: clean_content,
|
253
311
|
metadata: metadata,
|
@@ -286,6 +344,9 @@ module Ragdoll
|
|
286
344
|
# Use AI-generated description or fallback placeholder
|
287
345
|
content = desc && !desc.empty? ? desc : "Image file: #{File.basename(@file_path)}"
|
288
346
|
|
347
|
+
# Add filepath-based title as fallback
|
348
|
+
metadata[:title] = extract_title_from_filepath
|
349
|
+
|
289
350
|
puts "✅ DocumentProcessor: Image parsing complete. Content: '#{content[0..100]}...'"
|
290
351
|
|
291
352
|
{
|
@@ -338,5 +399,67 @@ module Ragdoll
|
|
338
399
|
else "application/octet-stream"
|
339
400
|
end
|
340
401
|
end
|
402
|
+
|
403
|
+
private
|
404
|
+
|
405
|
+
# Derive a human-readable fallback title from a file path.
#
# The extension is dropped, hyphens/underscores become spaces, camelCase
# boundaries are split, and every word is capitalized (first letter upper,
# remainder lower), e.g. "/docs/userGuide_v2.pdf" => "User Guide V2".
#
# @param file_path [String, #to_s] path to derive the title from (defaults to @file_path)
# @return [String] the cleaned title; the raw base name when cleaning leaves no
#   words at all (e.g. "___.txt" => "___"); "" when the path is nil or empty
def extract_title_from_filepath(file_path = @file_path)
  # to_s keeps a nil path (or a Pathname) from raising inside File.extname
  path = file_path.to_s
  stem = File.basename(path, File.extname(path))

  words = stem
          .tr("-_", " ")                    # Hyphens and underscores act as word separators
          .gsub(/([a-z])([A-Z])/, '\1 \2')  # Add space before capital letters (camelCase)
          .split                            # Splits on whitespace runs, dropping leading/trailing blanks

  # A name made only of separators would otherwise produce an empty title
  return stem if words.empty?

  # Capitalize words for better readability
  words.map(&:capitalize).join(" ")
end
|
421
|
+
|
422
|
+
# Split a markdown document into its YAML front matter and its body.
#
# Front matter is the block between an opening "---" line at the very start of
# the content and the next line consisting solely of "---".
#
# @param content [String] the full content of the markdown file
# @return [Array] [front_matter_hash, body_content] on success: the hash has
#   symbolized keys (an empty block yields {}) and the body is stripped.
#   [nil, original_content] when there is no opening or closing delimiter,
#   the YAML is invalid or uses disallowed features, or it is not a mapping.
def parse_yaml_front_matter(content)
  # Accept both LF and CRLF line endings on the opening delimiter
  return [nil, content] unless content.match?(/\A---[ \t]*\r?\n/)

  lines = content.lines

  # Find the closing delimiter, skipping the opening --- line
  offset = lines.drop(1).index { |line| line.strip == "---" }

  # No closing delimiter found
  return [nil, content] unless offset

  closing_index = offset + 1
  yaml_content = lines[1...closing_index].join
  body_content = lines.drop(closing_index + 1).join

  # Time and Date are permitted so date fields load as objects. An empty block
  # loads as nil (false on older Psych) and is treated as an empty mapping.
  parsed = YAML.safe_load(yaml_content, permitted_classes: [Time, Date]) || {}

  # A scalar or sequence is not front matter: only a Hash can be merged into metadata
  return [nil, content] unless parsed.is_a?(Hash)

  # Symbolize keys for consistency; keys without #to_sym (Integer, Date, ...) are kept as-is
  front_matter = parsed.transform_keys { |key| key.respond_to?(:to_sym) ? key.to_sym : key }
  [front_matter, body_content.strip]
rescue Psych::Exception => e
  # Covers syntax errors, disallowed classes and aliases: return the original content
  if defined?(Rails) && Rails.respond_to?(:logger) && Rails.logger
    Rails.logger.warn "Warning: Failed to parse YAML front matter: #{e.message}"
  end
  [nil, content]
end
|
341
464
|
end
|
342
465
|
end
|
@@ -38,6 +38,11 @@ module Ragdoll
|
|
38
38
|
embedding_config = @model_resolver.resolve_embedding(:text)
|
39
39
|
# Use just the model name for RubyLLM
|
40
40
|
model = embedding_config.model.model
|
41
|
+
|
42
|
+
# If model is nil or empty, use fallback
|
43
|
+
if model.nil? || model.empty?
|
44
|
+
return generate_fallback_embedding
|
45
|
+
end
|
41
46
|
|
42
47
|
begin
|
43
48
|
response = RubyLLM.embed(cleaned_text, model: model)
|
@@ -93,6 +98,11 @@ module Ragdoll
|
|
93
98
|
embedding_config = @model_resolver.resolve_embedding(:text)
|
94
99
|
# Use just the model name for RubyLLM
|
95
100
|
model = embedding_config.model.model
|
101
|
+
|
102
|
+
# If model is nil or empty, use fallback
|
103
|
+
if model.nil? || model.empty?
|
104
|
+
return cleaned_texts.map { generate_fallback_embedding }
|
105
|
+
end
|
96
106
|
|
97
107
|
cleaned_texts.map do |text|
|
98
108
|
response = RubyLLM.embed(text, model: model)
|
@@ -27,25 +27,94 @@ module Ragdoll
|
|
27
27
|
end
|
28
28
|
|
29
29
|
# Search stored embeddings for content semantically similar to a query.
#
# @param query_or_embedding [String, Array] query text (an embedding is generated
#   through @embedding_service) or an already computed embedding vector
# @param options [Hash]
#   :limit, :threshold    - override search_config[:max_results] / [:similarity_threshold]
#   :filters              - Hash of filters forwarded to the embedding search (not mutated)
#   :keywords             - String or Array; blank entries are dropped; forwarded as filters[:keywords]
#   :query                - original query text when an embedding Array is passed (used for tracking)
#   :session_id, :user_id - forwarded to search tracking
#   :track_search         - pass false to skip recording the search (default: true)
# @return [Hash] { results:, statistics:, execution_time_ms: }. statistics is nil when
#   the fallback search is used or when no embedding could be generated.
def search_similar_content(query_or_embedding, options = {})
  # Monotonic clock: elapsed time is immune to wall-clock adjustments
  started_at = Process.clock_gettime(Process::CLOCK_MONOTONIC)
  elapsed_ms = -> { ((Process.clock_gettime(Process::CLOCK_MONOTONIC) - started_at) * 1000).round }

  search_config = @config_service.search_config
  limit = options[:limit] || search_config[:max_results]
  threshold = options[:threshold] || search_config[:similarity_threshold]

  # Work on a copy so adding keywords below never alters the caller's hash
  filters = (options[:filters] || {}).dup

  # Extract keywords option and normalize
  keywords = Array(options[:keywords] || []).map(&:to_s).reject(&:empty?)

  # Extract tracking options
  session_id = options[:session_id]
  user_id = options[:user_id]
  track_search = options.fetch(:track_search, true)

  if query_or_embedding.is_a?(Array)
    # It's already an embedding
    query_embedding = query_or_embedding
    query_string = options[:query] # Should be provided when passing embedding directly
  else
    # It's a query string, generate embedding
    query_string = query_or_embedding
    query_embedding = @embedding_service.generate_embedding(query_string)

    # Keep the return shape identical to the normal path when nothing can be searched
    return { results: [], statistics: nil, execution_time_ms: elapsed_ms.call } if query_embedding.nil?
  end

  # Add keywords to filters if provided
  filters[:keywords] = keywords if keywords.any?

  # Search using ActiveRecord models with statistics
  # Try enhanced search first, fall back to original if it fails
  begin
    search_response = Ragdoll::Embedding.search_similar_with_stats(query_embedding,
                                                                   limit: limit,
                                                                   threshold: threshold,
                                                                   filters: filters)
    results = search_response[:results]
    statistics = search_response[:statistics]
  rescue NoMethodError, PG::SyntaxError => e
    # Fall back to original search method if enhanced version fails
    puts "Warning: Enhanced search failed (#{e.message}), using fallback" if ENV["RAGDOLL_DEBUG"]
    results = Ragdoll::Embedding.search_similar(query_embedding,
                                                limit: limit,
                                                threshold: threshold,
                                                filters: filters)
    statistics = nil
  end

  execution_time = elapsed_ms.call

  # Record search if tracking enabled and we have a query string
  if track_search && query_string && !query_string.empty?
    begin
      # Format results for search recording
      search_results = results.map do |result|
        {
          embedding_id: result[:embedding_id] || result[:id],
          similarity: result[:similarity] || result[:similarity_score] || 0.0
        }
      end

      search_type = keywords.any? ? "semantic_with_keywords" : "semantic"

      Ragdoll::Search.record_search(
        query: query_string,
        query_embedding: query_embedding,
        results: search_results,
        search_type: search_type,
        filters: filters,
        options: { limit: limit, threshold: threshold, keywords: keywords },
        execution_time_ms: execution_time,
        session_id: session_id,
        user_id: user_id
      )
    rescue => e
      # Tracking is best-effort: log the error but don't fail the search
      puts "Warning: Search tracking failed: #{e.message}" if ENV["RAGDOLL_DEBUG"]
    end
  end

  # Return results with statistics for better user feedback
  {
    results: results,
    statistics: statistics,
    execution_time_ms: execution_time
  }
end
|
50
119
|
end
|
51
120
|
end
|
@@ -1,8 +1,5 @@
|
|
1
1
|
class EnablePostgresqlExtensions < ActiveRecord::Migration[7.0]
|
2
2
|
def up
|
3
|
-
# This migration is now handled by the db:create rake task
|
4
|
-
# Just ensure required extensions are available
|
5
|
-
|
6
3
|
# Vector similarity search (required for embeddings)
|
7
4
|
execute "CREATE EXTENSION IF NOT EXISTS vector"
|
8
5
|
|
@@ -15,9 +12,11 @@ class EnablePostgresqlExtensions < ActiveRecord::Migration[7.0]
|
|
15
12
|
end
|
16
13
|
|
17
14
|
def down
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
15
|
+
# Extensions are intentionally left installed: other objects in this database may still depend on them
|
16
|
+
# If you really need to drop them, uncomment the following:
|
17
|
+
# execute "DROP EXTENSION IF EXISTS vector"
|
18
|
+
# execute "DROP EXTENSION IF EXISTS unaccent"
|
19
|
+
# execute "DROP EXTENSION IF EXISTS pg_trgm"
|
20
|
+
# execute "DROP EXTENSION IF EXISTS \"uuid-ossp\""
|
22
21
|
end
|
23
|
-
end
|
22
|
+
end
|
@@ -0,0 +1,117 @@
|
|
1
|
+
# Creates the ragdoll_documents table, its lookup and full-text indexes, and a
# trigger that keeps the search_vector column in sync with title and metadata.
#
# The migration runs outside a DDL transaction (required by CREATE INDEX
# CONCURRENTLY), so a failure part-way through is NOT rolled back. Every step
# is therefore written to be safely re-runnable.
class CreateRagdollDocuments < ActiveRecord::Migration[7.0]
  # For concurrent index creation (PostgreSQL)
  disable_ddl_transaction!

  def up
    create_table :ragdoll_documents, if_not_exists: true,
      comment: "Core documents table with LLM-generated structured metadata" do |t|

      t.string :location, null: false,
        comment: "Source location of document (file path, URL, or identifier)"

      t.string :title, null: false,
        comment: "Human-readable document title for display and search"

      t.text :summary, null: false, default: "",
        comment: "LLM-generated summary of document content"

      t.string :document_type, null: false, default: "text",
        comment: "Document format type"

      t.string :status, null: false, default: "pending",
        comment: "Document processing status"

      t.json :metadata, default: {},
        comment: "LLM-generated structured metadata about the file"

      t.timestamp :file_modified_at, null: false, default: -> { "CURRENT_TIMESTAMP" },
        comment: "Timestamp when the source file was last modified"

      t.timestamps null: false,
        comment: "Standard creation and update timestamps"

      # Add tsvector column for full-text search
      t.tsvector :search_vector

      # Add keywords as array column
      t.text :keywords, array: true, default: []
    end

    ###########
    # Indexes #
    ###########

    add_index :ragdoll_documents, :location, unique: true, if_not_exists: true,
      comment: "Unique index for document source lookup"

    add_index :ragdoll_documents, :title, if_not_exists: true,
      comment: "Index for title-based search"

    add_index :ragdoll_documents, :document_type, if_not_exists: true,
      comment: "Index for filtering by document type"

    add_index :ragdoll_documents, :status, if_not_exists: true,
      comment: "Index for filtering by processing status"

    add_index :ragdoll_documents, :created_at, if_not_exists: true,
      comment: "Index for chronological sorting"

    add_index :ragdoll_documents, [:document_type, :status], if_not_exists: true,
      comment: "Composite index for type+status filtering"

    # Full-text search index over the same expression the trigger below uses
    execute <<~SQL
      CREATE INDEX CONCURRENTLY IF NOT EXISTS index_ragdoll_documents_on_fulltext_search
      ON ragdoll_documents
      USING gin(to_tsvector('english',
        COALESCE(title, '') || ' ' ||
        COALESCE(metadata->>'summary', '') || ' ' ||
        COALESCE(metadata->>'keywords', '') || ' ' ||
        COALESCE(metadata->>'description', '')
      ))
    SQL

    add_index :ragdoll_documents, "(metadata->>'document_type')",
      name: "index_ragdoll_documents_on_metadata_type", if_not_exists: true,
      comment: "Index for filtering by document type"

    add_index :ragdoll_documents, "(metadata->>'classification')",
      name: "index_ragdoll_documents_on_metadata_classification", if_not_exists: true,
      comment: "Index for filtering by document classification"

    # GIN index on search_vector
    add_index :ragdoll_documents, :search_vector, using: :gin, algorithm: :concurrently,
      if_not_exists: true

    # GIN index on keywords array
    add_index :ragdoll_documents, :keywords, using: :gin,
      name: 'index_ragdoll_documents_on_keywords_gin', if_not_exists: true

    # Trigger to keep search_vector up to date on INSERT/UPDATE.
    # OR REPLACE / DROP IF EXISTS keep this step re-runnable after a partial failure.
    execute <<~SQL
      CREATE OR REPLACE FUNCTION ragdoll_documents_vector_update() RETURNS trigger AS $$
      BEGIN
        NEW.search_vector := to_tsvector('english',
          COALESCE(NEW.title, '') || ' ' ||
          COALESCE(NEW.metadata->>'summary', '') || ' ' ||
          COALESCE(NEW.metadata->>'keywords', '') || ' ' ||
          COALESCE(NEW.metadata->>'description', '')
        );
        RETURN NEW;
      END
      $$ LANGUAGE plpgsql;

      DROP TRIGGER IF EXISTS ragdoll_search_vector_update ON ragdoll_documents;

      CREATE TRIGGER ragdoll_search_vector_update
      BEFORE INSERT OR UPDATE ON ragdoll_documents
      FOR EACH ROW EXECUTE FUNCTION ragdoll_documents_vector_update();
    SQL
  end

  def down
    # The trigger must go before the function it references
    execute <<~SQL
      DROP TRIGGER IF EXISTS ragdoll_search_vector_update ON ragdoll_documents;
      DROP FUNCTION IF EXISTS ragdoll_documents_vector_update();
    SQL

    drop_table :ragdoll_documents
  end
end
|
data/db/migrate/{005_create_ragdoll_embeddings.rb → 20250815234903_create_ragdoll_embeddings.rb}
RENAMED
@@ -3,7 +3,7 @@ class CreateRagdollEmbeddings < ActiveRecord::Migration[7.0]
|
|
3
3
|
create_table :ragdoll_embeddings,
|
4
4
|
comment: "Polymorphic vector embeddings storage for semantic similarity search" do |t|
|
5
5
|
|
6
|
-
|
6
|
+
t.references :embeddable, polymorphic: true, null: false,
|
7
7
|
comment: "Polymorphic reference to embeddable content"
|
8
8
|
|
9
9
|
t.text :content, null: false, default: "",
|
@@ -26,16 +26,19 @@ class CreateRagdollEmbeddings < ActiveRecord::Migration[7.0]
|
|
26
26
|
|
27
27
|
t.timestamps null: false,
|
28
28
|
comment: "Standard creation and update timestamps"
|
29
|
+
end
|
29
30
|
|
30
|
-
|
31
|
-
|
32
|
-
|
31
|
+
###########
|
32
|
+
# Indexes #
|
33
|
+
###########
|
33
34
|
|
34
|
-
|
35
|
-
|
35
|
+
add_index :ragdoll_embeddings, [:embeddable_type, :embeddable_id],
|
36
|
+
comment: "Index for finding embeddings by embeddable content"
|
36
37
|
|
37
|
-
|
38
|
-
|
39
|
-
|
38
|
+
add_index :ragdoll_embeddings, :embedding_vector,
|
39
|
+
using: :ivfflat,
|
40
|
+
opclass: :vector_cosine_ops,
|
41
|
+
name: "index_ragdoll_embeddings_on_embedding_vector_cosine",
|
42
|
+
comment: "IVFFlat index for fast cosine similarity search"
|
40
43
|
end
|
41
|
-
end
|
44
|
+
end
|
data/db/migrate/{006_create_ragdoll_contents.rb → 20250815234904_create_ragdoll_contents.rb}
RENAMED
@@ -29,19 +29,22 @@ class CreateRagdollContents < ActiveRecord::Migration[7.0]
|
|
29
29
|
|
30
30
|
t.timestamps null: false,
|
31
31
|
comment: "Standard creation and update timestamps"
|
32
|
+
end
|
32
33
|
|
33
|
-
|
34
|
-
|
35
|
-
|
34
|
+
###########
|
35
|
+
# Indexes #
|
36
|
+
###########
|
36
37
|
|
37
|
-
|
38
|
-
|
38
|
+
add_index :ragdoll_contents, :embedding_model,
|
39
|
+
comment: "Index for filtering by embedding model"
|
39
40
|
|
40
|
-
|
41
|
-
|
41
|
+
add_index :ragdoll_contents, :type,
|
42
|
+
comment: "Index for filtering by content type"
|
42
43
|
|
43
|
-
|
44
|
-
|
45
|
-
|
44
|
+
execute <<-SQL
|
45
|
+
CREATE INDEX index_ragdoll_contents_on_fulltext_search
|
46
|
+
ON ragdoll_contents
|
47
|
+
USING gin(to_tsvector('english', COALESCE(content, '')))
|
48
|
+
SQL
|
46
49
|
end
|
47
|
-
end
|
50
|
+
end
|
@@ -0,0 +1,77 @@
|
|
1
|
+
# Creates the ragdoll_searches table, which records each search query, its
# embedding, the options used and aggregate similarity statistics.
class CreateRagdollSearches < ActiveRecord::Migration[7.0]
  def change
    create_table :ragdoll_searches,
      comment: "Search queries and results tracking with vector similarity support" do |t|

      t.text :query, null: false,
        comment: "Original search query text"

      t.vector :query_embedding, limit: 1536, null: false,
        comment: "Vector embedding of the search query for similarity matching"

      t.string :search_type, null: false, default: "semantic",
        comment: "Type of search performed (semantic, hybrid, fulltext)"

      t.integer :results_count, null: false, default: 0,
        comment: "Number of results returned for this search"

      t.float :max_similarity_score,
        comment: "Highest similarity score from results"

      t.float :min_similarity_score,
        comment: "Lowest similarity score from results"

      t.float :avg_similarity_score,
        comment: "Average similarity score of results"

      t.json :search_filters, default: {},
        comment: "Filters applied during search (document_type, date_range, etc.)"

      t.json :search_options, default: {},
        comment: "Search configuration options (threshold, limit, etc.)"

      t.integer :execution_time_ms,
        comment: "Search execution time in milliseconds"

      t.string :session_id,
        comment: "User session identifier for grouping related searches"

      t.string :user_id,
        comment: "User identifier if authentication is available"

      t.timestamps null: false,
        comment: "Standard creation and update timestamps"
    end

    ###########
    # Indexes #
    ###########

    add_index :ragdoll_searches, :query_embedding,
      using: :ivfflat,
      opclass: :vector_cosine_ops,
      name: "index_ragdoll_searches_on_query_embedding_cosine",
      comment: "IVFFlat index for finding similar search queries"

    add_index :ragdoll_searches, :search_type,
      comment: "Index for filtering by search type"

    add_index :ragdoll_searches, :session_id,
      comment: "Index for grouping searches by session"

    add_index :ragdoll_searches, :user_id,
      comment: "Index for filtering searches by user"

    add_index :ragdoll_searches, :created_at,
      comment: "Index for chronological search history"

    add_index :ragdoll_searches, :results_count,
      comment: "Index for analyzing search effectiveness"

    # Raw SQL cannot be auto-reversed inside `change`; without an explicit
    # down step a rollback raises ActiveRecord::IrreversibleMigration.
    reversible do |dir|
      dir.up do
        execute <<~SQL
          CREATE INDEX index_ragdoll_searches_on_fulltext_query
          ON ragdoll_searches
          USING gin(to_tsvector('english', query))
        SQL
      end

      dir.down do
        execute "DROP INDEX IF EXISTS index_ragdoll_searches_on_fulltext_query"
      end
    end
  end
end
|
@@ -0,0 +1,49 @@
|
|
1
|
+
# Creates ragdoll_search_results: one row per embedding returned by a recorded
# search, with its similarity score, rank and click tracking fields.
class CreateRagdollSearchResults < ActiveRecord::Migration[7.0]
  def change
    table = :ragdoll_search_results

    # Junction table for tracking which embeddings were returned for each search
    create_table table, comment: "Junction table linking searches to their returned embeddings" do |t|
      t.belongs_to :search,
        null: false,
        foreign_key: { to_table: :ragdoll_searches },
        comment: "Reference to the search query"
      t.belongs_to :embedding,
        null: false,
        foreign_key: { to_table: :ragdoll_embeddings },
        comment: "Reference to the returned embedding"

      t.float   :similarity_score, null: false, comment: "Similarity score for this result"
      t.integer :result_rank,      null: false, comment: "Ranking position of this result (1-based)"

      t.boolean  :clicked, default: false, comment: "Whether user interacted with this result"
      t.datetime :clicked_at, comment: "Timestamp when result was clicked/selected"

      t.timestamps null: false, comment: "Standard creation and update timestamps"
    end

    # Secondary indexes as [columns, index name, comment], created in this order
    [
      [%i[search_id result_rank], "idx_search_results_search_rank", "Index for retrieving results in ranked order"],
      [%i[embedding_id similarity_score], "idx_search_results_embedding_score", "Index for analyzing embedding performance"],
      [:similarity_score, "idx_search_results_similarity", "Index for similarity score analysis"],
      [%i[clicked clicked_at], "idx_search_results_clicks", "Index for click-through analysis"]
    ].each do |columns, index_name, description|
      add_index table, columns, name: index_name, comment: description
    end
  end
end
|