ragdoll 0.1.9 → 0.1.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +68 -4
- data/README.md +86 -1
- data/Rakefile +4 -2
- data/app/models/ragdoll/document.rb +115 -12
- data/app/models/ragdoll/embedding.rb +36 -4
- data/app/models/ragdoll/search.rb +1 -1
- data/app/services/ragdoll/document_management.rb +117 -9
- data/app/services/ragdoll/document_processor.rb +67 -31
- data/app/services/ragdoll/search_engine.rb +13 -2
- data/db/migrate/{001_enable_postgresql_extensions.rb → 20250815234901_enable_postgresql_extensions.rb} +7 -8
- data/db/migrate/20250815234902_create_ragdoll_documents.rb +117 -0
- data/db/migrate/{005_create_ragdoll_embeddings.rb → 20250815234903_create_ragdoll_embeddings.rb} +13 -10
- data/db/migrate/{006_create_ragdoll_contents.rb → 20250815234904_create_ragdoll_contents.rb} +14 -11
- data/db/migrate/{007_create_ragdoll_searches.rb → 20250815234905_create_ragdoll_searches.rb} +24 -20
- data/db/migrate/{008_create_ragdoll_search_results.rb → 20250815234906_create_ragdoll_search_results.rb} +16 -16
- data/lib/ragdoll/core/client.rb +2 -2
- data/lib/ragdoll/core/database.rb +8 -3
- data/lib/ragdoll/core/version.rb +1 -1
- data/lib/tasks/db.rake +63 -15
- metadata +7 -7
- data/db/migrate/004_create_ragdoll_documents.rb +0 -70
@@ -99,8 +99,6 @@ module Ragdoll
|
|
99
99
|
else
|
100
100
|
parse_text # Default to text parsing for unknown formats
|
101
101
|
end
|
102
|
-
rescue StandardError => e # StandardError => e
|
103
|
-
raise ParseError, "#{__LINE__} Failed to parse #{@file_path}: #{e.message}"
|
104
102
|
end
|
105
103
|
|
106
104
|
private
|
@@ -109,6 +107,12 @@ module Ragdoll
|
|
109
107
|
content = ""
|
110
108
|
metadata = {}
|
111
109
|
|
110
|
+
# Add file-based metadata for duplicate detection
|
111
|
+
if File.exist?(@file_path)
|
112
|
+
metadata[:file_size] = File.size(@file_path)
|
113
|
+
metadata[:file_hash] = calculate_file_hash(@file_path)
|
114
|
+
end
|
115
|
+
|
112
116
|
begin
|
113
117
|
PDF::Reader.open(@file_path) do |reader|
|
114
118
|
# Extract metadata
|
@@ -144,6 +148,10 @@ module Ragdoll
|
|
144
148
|
metadata[:title] = extract_title_from_filepath
|
145
149
|
end
|
146
150
|
|
151
|
+
# Add content hash for duplicate detection
|
152
|
+
# Ensure content is UTF-8 encoded before checking presence
|
153
|
+
metadata[:content_hash] = calculate_content_hash(content) if content && content.length > 0
|
154
|
+
|
147
155
|
{
|
148
156
|
content: content.strip,
|
149
157
|
metadata: metadata,
|
@@ -155,6 +163,12 @@ module Ragdoll
|
|
155
163
|
content = ""
|
156
164
|
metadata = {}
|
157
165
|
|
166
|
+
# Add file-based metadata for duplicate detection
|
167
|
+
if File.exist?(@file_path)
|
168
|
+
metadata[:file_size] = File.size(@file_path)
|
169
|
+
metadata[:file_hash] = calculate_file_hash(@file_path)
|
170
|
+
end
|
171
|
+
|
158
172
|
begin
|
159
173
|
doc = Docx::Document.open(@file_path)
|
160
174
|
|
@@ -204,6 +218,10 @@ module Ragdoll
|
|
204
218
|
metadata[:title] = extract_title_from_filepath
|
205
219
|
end
|
206
220
|
|
221
|
+
# Add content hash for duplicate detection
|
222
|
+
# Ensure content is UTF-8 encoded before checking presence
|
223
|
+
metadata[:content_hash] = calculate_content_hash(content) if content && content.length > 0
|
224
|
+
|
207
225
|
{
|
208
226
|
content: content.strip,
|
209
227
|
metadata: metadata,
|
@@ -212,46 +230,31 @@ module Ragdoll
|
|
212
230
|
end
|
213
231
|
|
214
232
|
def parse_text
|
215
|
-
|
216
|
-
metadata = {
|
217
|
-
file_size: File.size(@file_path),
|
218
|
-
encoding: "UTF-8"
|
219
|
-
}
|
220
|
-
|
233
|
+
# Determine document type first (before any IO operations)
|
221
234
|
document_type = case @file_extension
|
222
235
|
when ".md", ".markdown" then "markdown"
|
223
236
|
when ".txt" then "text"
|
224
237
|
else "text"
|
225
238
|
end
|
226
239
|
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
# Add filepath-based title as fallback if no title was found
|
237
|
-
if metadata[:title].nil? || (metadata[:title].is_a?(String) && metadata[:title].strip.empty?)
|
238
|
-
metadata[:title] = extract_title_from_filepath
|
240
|
+
begin
|
241
|
+
content = File.read(@file_path, encoding: "UTF-8")
|
242
|
+
encoding = "UTF-8"
|
243
|
+
rescue Encoding::InvalidByteSequenceError, Encoding::UndefinedConversionError
|
244
|
+
# Try with different encoding - read as ISO-8859-1 and force encoding to UTF-8
|
245
|
+
content = File.read(@file_path, encoding: "ISO-8859-1").encode("UTF-8", invalid: :replace, undef: :replace, replace: "?")
|
246
|
+
encoding = "ISO-8859-1"
|
247
|
+
rescue Errno::ENOENT, Errno::EACCES => e
|
248
|
+
raise ParseError, "Failed to read file #{@file_path}: #{e.message}"
|
239
249
|
end
|
240
250
|
|
241
|
-
{
|
242
|
-
content: content,
|
243
|
-
metadata: metadata,
|
244
|
-
document_type: document_type
|
245
|
-
}
|
246
|
-
rescue Encoding::InvalidByteSequenceError
|
247
|
-
# Try with different encoding
|
248
|
-
content = File.read(@file_path, encoding: "ISO-8859-1")
|
249
251
|
metadata = {
|
250
252
|
file_size: File.size(@file_path),
|
251
|
-
|
253
|
+
file_hash: calculate_file_hash(@file_path),
|
254
|
+
encoding: encoding
|
252
255
|
}
|
253
256
|
|
254
|
-
#
|
257
|
+
# Parse YAML front matter for markdown files
|
255
258
|
if document_type == "markdown" && content.start_with?("---\n")
|
256
259
|
front_matter, body_content = parse_yaml_front_matter(content)
|
257
260
|
if front_matter
|
@@ -265,10 +268,14 @@ module Ragdoll
|
|
265
268
|
metadata[:title] = extract_title_from_filepath
|
266
269
|
end
|
267
270
|
|
271
|
+
# Add content hash for duplicate detection
|
272
|
+
# Ensure content is UTF-8 encoded before checking presence
|
273
|
+
metadata[:content_hash] = calculate_content_hash(content) if content && content.length > 0
|
274
|
+
|
268
275
|
{
|
269
276
|
content: content,
|
270
277
|
metadata: metadata,
|
271
|
-
document_type: document_type
|
278
|
+
document_type: document_type
|
272
279
|
}
|
273
280
|
end
|
274
281
|
|
@@ -296,6 +303,7 @@ module Ragdoll
|
|
296
303
|
|
297
304
|
metadata = {
|
298
305
|
file_size: File.size(@file_path),
|
306
|
+
file_hash: calculate_file_hash(@file_path),
|
299
307
|
original_format: "html"
|
300
308
|
}
|
301
309
|
|
@@ -306,6 +314,9 @@ module Ragdoll
|
|
306
314
|
metadata[:title] = extract_title_from_filepath
|
307
315
|
end
|
308
316
|
|
317
|
+
# Add content hash for duplicate detection
|
318
|
+
metadata[:content_hash] = calculate_content_hash(clean_content) if clean_content.present?
|
319
|
+
|
309
320
|
{
|
310
321
|
content: clean_content,
|
311
322
|
metadata: metadata,
|
@@ -318,6 +329,7 @@ module Ragdoll
|
|
318
329
|
|
319
330
|
metadata = {
|
320
331
|
file_size: File.size(@file_path),
|
332
|
+
file_hash: calculate_file_hash(@file_path),
|
321
333
|
file_type: @file_extension.sub(".", ""),
|
322
334
|
original_filename: File.basename(@file_path)
|
323
335
|
}
|
@@ -347,6 +359,10 @@ module Ragdoll
|
|
347
359
|
# Add filepath-based title as fallback
|
348
360
|
metadata[:title] = extract_title_from_filepath
|
349
361
|
|
362
|
+
# Add content hash for duplicate detection
|
363
|
+
# Ensure content is UTF-8 encoded before checking presence
|
364
|
+
metadata[:content_hash] = calculate_content_hash(content) if content && content.length > 0
|
365
|
+
|
350
366
|
puts "✅ DocumentProcessor: Image parsing complete. Content: '#{content[0..100]}...'"
|
351
367
|
|
352
368
|
{
|
@@ -461,5 +477,25 @@ module Ragdoll
|
|
461
477
|
[nil, content]
|
462
478
|
end
|
463
479
|
end
|
480
|
+
|
481
|
+
# Calculate SHA256 hash of file content for duplicate detection
|
482
|
+
def calculate_file_hash(file_path)
|
483
|
+
require 'digest'
|
484
|
+
Digest::SHA256.file(file_path).hexdigest
|
485
|
+
rescue StandardError => e
|
486
|
+
Rails.logger.warn "Failed to calculate file hash for #{file_path}: #{e.message}" if defined?(Rails)
|
487
|
+
puts "Warning: Failed to calculate file hash for #{file_path}: #{e.message}"
|
488
|
+
nil
|
489
|
+
end
|
490
|
+
|
491
|
+
# Calculate SHA256 hash of text content for duplicate detection
|
492
|
+
def calculate_content_hash(content)
|
493
|
+
require 'digest'
|
494
|
+
Digest::SHA256.hexdigest(content)
|
495
|
+
rescue StandardError => e
|
496
|
+
Rails.logger.warn "Failed to calculate content hash: #{e.message}" if defined?(Rails)
|
497
|
+
puts "Warning: Failed to calculate content hash: #{e.message}"
|
498
|
+
nil
|
499
|
+
end
|
464
500
|
end
|
465
501
|
end
|
@@ -33,6 +33,10 @@ module Ragdoll
|
|
33
33
|
threshold = options[:threshold] || search_config[:similarity_threshold]
|
34
34
|
filters = options[:filters] || {}
|
35
35
|
|
36
|
+
# Extract keywords option and normalize
|
37
|
+
keywords = options[:keywords] || []
|
38
|
+
keywords = Array(keywords).map(&:to_s).reject(&:empty?)
|
39
|
+
|
36
40
|
# Extract tracking options
|
37
41
|
session_id = options[:session_id]
|
38
42
|
user_id = options[:user_id]
|
@@ -49,6 +53,11 @@ module Ragdoll
|
|
49
53
|
return [] if query_embedding.nil?
|
50
54
|
end
|
51
55
|
|
56
|
+
# Add keywords to filters if provided
|
57
|
+
if keywords.any?
|
58
|
+
filters[:keywords] = keywords
|
59
|
+
end
|
60
|
+
|
52
61
|
# Search using ActiveRecord models with statistics
|
53
62
|
# Try enhanced search first, fall back to original if it fails
|
54
63
|
begin
|
@@ -81,13 +90,15 @@ module Ragdoll
|
|
81
90
|
}
|
82
91
|
end
|
83
92
|
|
93
|
+
search_type = keywords.any? ? "semantic_with_keywords" : "semantic"
|
94
|
+
|
84
95
|
Ragdoll::Search.record_search(
|
85
96
|
query: query_string,
|
86
97
|
query_embedding: query_embedding,
|
87
98
|
results: search_results,
|
88
|
-
search_type:
|
99
|
+
search_type: search_type,
|
89
100
|
filters: filters,
|
90
|
-
options: { limit: limit, threshold: threshold },
|
101
|
+
options: { limit: limit, threshold: threshold, keywords: keywords },
|
91
102
|
execution_time_ms: execution_time,
|
92
103
|
session_id: session_id,
|
93
104
|
user_id: user_id
|
@@ -1,8 +1,5 @@
|
|
1
1
|
class EnablePostgresqlExtensions < ActiveRecord::Migration[7.0]
|
2
2
|
def up
|
3
|
-
# This migration is now handled by the db:create rake task
|
4
|
-
# Just ensure required extensions are available
|
5
|
-
|
6
3
|
# Vector similarity search (required for embeddings)
|
7
4
|
execute "CREATE EXTENSION IF NOT EXISTS vector"
|
8
5
|
|
@@ -15,9 +12,11 @@ class EnablePostgresqlExtensions < ActiveRecord::Migration[7.0]
|
|
15
12
|
end
|
16
13
|
|
17
14
|
def down
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
15
|
+
# Extensions are typically not dropped as they might be used by other databases
|
16
|
+
# If you really need to drop them, uncomment the following:
|
17
|
+
# execute "DROP EXTENSION IF EXISTS vector"
|
18
|
+
# execute "DROP EXTENSION IF EXISTS unaccent"
|
19
|
+
# execute "DROP EXTENSION IF EXISTS pg_trgm"
|
20
|
+
# execute "DROP EXTENSION IF EXISTS \"uuid-ossp\""
|
22
21
|
end
|
23
|
-
end
|
22
|
+
end
|
@@ -0,0 +1,117 @@
|
|
1
|
+
class CreateRagdollDocuments < ActiveRecord::Migration[7.0]
|
2
|
+
# For concurrent index creation (PostgreSQL)
|
3
|
+
disable_ddl_transaction!
|
4
|
+
|
5
|
+
def up
|
6
|
+
create_table :ragdoll_documents,
|
7
|
+
comment: "Core documents table with LLM-generated structured metadata" do |t|
|
8
|
+
|
9
|
+
t.string :location, null: false,
|
10
|
+
comment: "Source location of document (file path, URL, or identifier)"
|
11
|
+
|
12
|
+
t.string :title, null: false,
|
13
|
+
comment: "Human-readable document title for display and search"
|
14
|
+
|
15
|
+
t.text :summary, null: false, default: "",
|
16
|
+
comment: "LLM-generated summary of document content"
|
17
|
+
|
18
|
+
t.string :document_type, null: false, default: "text",
|
19
|
+
comment: "Document format type"
|
20
|
+
|
21
|
+
t.string :status, null: false, default: "pending",
|
22
|
+
comment: "Document processing status"
|
23
|
+
|
24
|
+
t.json :metadata, default: {},
|
25
|
+
comment: "LLM-generated structured metadata about the file"
|
26
|
+
|
27
|
+
t.timestamp :file_modified_at, null: false, default: -> { "CURRENT_TIMESTAMP" },
|
28
|
+
comment: "Timestamp when the source file was last modified"
|
29
|
+
|
30
|
+
t.timestamps null: false,
|
31
|
+
comment: "Standard creation and update timestamps"
|
32
|
+
|
33
|
+
# Add tsvector column for full-text search
|
34
|
+
t.tsvector :search_vector
|
35
|
+
|
36
|
+
# Add keywords as array column
|
37
|
+
t.text :keywords, array: true, default: []
|
38
|
+
end
|
39
|
+
|
40
|
+
###########
|
41
|
+
# Indexes #
|
42
|
+
###########
|
43
|
+
|
44
|
+
add_index :ragdoll_documents, :location, unique: true,
|
45
|
+
comment: "Unique index for document source lookup"
|
46
|
+
|
47
|
+
add_index :ragdoll_documents, :title,
|
48
|
+
comment: "Index for title-based search"
|
49
|
+
|
50
|
+
add_index :ragdoll_documents, :document_type,
|
51
|
+
comment: "Index for filtering by document type"
|
52
|
+
|
53
|
+
add_index :ragdoll_documents, :status,
|
54
|
+
comment: "Index for filtering by processing status"
|
55
|
+
|
56
|
+
add_index :ragdoll_documents, :created_at,
|
57
|
+
comment: "Index for chronological sorting"
|
58
|
+
|
59
|
+
add_index :ragdoll_documents, [:document_type, :status],
|
60
|
+
comment: "Composite index for type+status filtering"
|
61
|
+
|
62
|
+
# Full-text search index
|
63
|
+
execute <<-SQL
|
64
|
+
CREATE INDEX CONCURRENTLY index_ragdoll_documents_on_fulltext_search
|
65
|
+
ON ragdoll_documents
|
66
|
+
USING gin(to_tsvector('english',
|
67
|
+
COALESCE(title, '') || ' ' ||
|
68
|
+
COALESCE(metadata->>'summary', '') || ' ' ||
|
69
|
+
COALESCE(metadata->>'keywords', '') || ' ' ||
|
70
|
+
COALESCE(metadata->>'description', '')
|
71
|
+
))
|
72
|
+
SQL
|
73
|
+
|
74
|
+
add_index :ragdoll_documents, "(metadata->>'document_type')",
|
75
|
+
name: "index_ragdoll_documents_on_metadata_type",
|
76
|
+
comment: "Index for filtering by document type"
|
77
|
+
|
78
|
+
add_index :ragdoll_documents, "(metadata->>'classification')",
|
79
|
+
name: "index_ragdoll_documents_on_metadata_classification",
|
80
|
+
comment: "Index for filtering by document classification"
|
81
|
+
|
82
|
+
# GIN index on search_vector
|
83
|
+
add_index :ragdoll_documents, :search_vector, using: :gin, algorithm: :concurrently
|
84
|
+
|
85
|
+
# GIN index on keywords array
|
86
|
+
add_index :ragdoll_documents, :keywords, using: :gin,
|
87
|
+
name: 'index_ragdoll_documents_on_keywords_gin'
|
88
|
+
|
89
|
+
# Trigger to keep search_vector up to date on INSERT/UPDATE
|
90
|
+
execute <<-SQL
|
91
|
+
CREATE FUNCTION ragdoll_documents_vector_update() RETURNS trigger AS $$
|
92
|
+
BEGIN
|
93
|
+
NEW.search_vector := to_tsvector('english',
|
94
|
+
COALESCE(NEW.title, '') || ' ' ||
|
95
|
+
COALESCE(NEW.metadata->>'summary', '') || ' ' ||
|
96
|
+
COALESCE(NEW.metadata->>'keywords', '') || ' ' ||
|
97
|
+
COALESCE(NEW.metadata->>'description', '')
|
98
|
+
);
|
99
|
+
RETURN NEW;
|
100
|
+
END
|
101
|
+
$$ LANGUAGE plpgsql;
|
102
|
+
|
103
|
+
CREATE TRIGGER ragdoll_search_vector_update
|
104
|
+
BEFORE INSERT OR UPDATE ON ragdoll_documents
|
105
|
+
FOR EACH ROW EXECUTE FUNCTION ragdoll_documents_vector_update();
|
106
|
+
SQL
|
107
|
+
end
|
108
|
+
|
109
|
+
def down
|
110
|
+
execute <<-SQL
|
111
|
+
DROP TRIGGER IF EXISTS ragdoll_search_vector_update ON ragdoll_documents;
|
112
|
+
DROP FUNCTION IF EXISTS ragdoll_documents_vector_update();
|
113
|
+
SQL
|
114
|
+
|
115
|
+
drop_table :ragdoll_documents
|
116
|
+
end
|
117
|
+
end
|
data/db/migrate/{005_create_ragdoll_embeddings.rb → 20250815234903_create_ragdoll_embeddings.rb}
RENAMED
@@ -3,7 +3,7 @@ class CreateRagdollEmbeddings < ActiveRecord::Migration[7.0]
|
|
3
3
|
create_table :ragdoll_embeddings,
|
4
4
|
comment: "Polymorphic vector embeddings storage for semantic similarity search" do |t|
|
5
5
|
|
6
|
-
|
6
|
+
t.references :embeddable, polymorphic: true, null: false,
|
7
7
|
comment: "Polymorphic reference to embeddable content"
|
8
8
|
|
9
9
|
t.text :content, null: false, default: "",
|
@@ -26,16 +26,19 @@ class CreateRagdollEmbeddings < ActiveRecord::Migration[7.0]
|
|
26
26
|
|
27
27
|
t.timestamps null: false,
|
28
28
|
comment: "Standard creation and update timestamps"
|
29
|
+
end
|
29
30
|
|
30
|
-
|
31
|
-
|
32
|
-
|
31
|
+
###########
|
32
|
+
# Indexes #
|
33
|
+
###########
|
33
34
|
|
34
|
-
|
35
|
-
|
35
|
+
add_index :ragdoll_embeddings, [:embeddable_type, :embeddable_id],
|
36
|
+
comment: "Index for finding embeddings by embeddable content"
|
36
37
|
|
37
|
-
|
38
|
-
|
39
|
-
|
38
|
+
add_index :ragdoll_embeddings, :embedding_vector,
|
39
|
+
using: :ivfflat,
|
40
|
+
opclass: :vector_cosine_ops,
|
41
|
+
name: "index_ragdoll_embeddings_on_embedding_vector_cosine",
|
42
|
+
comment: "IVFFlat index for fast cosine similarity search"
|
40
43
|
end
|
41
|
-
end
|
44
|
+
end
|
data/db/migrate/{006_create_ragdoll_contents.rb → 20250815234904_create_ragdoll_contents.rb}
RENAMED
@@ -29,19 +29,22 @@ class CreateRagdollContents < ActiveRecord::Migration[7.0]
|
|
29
29
|
|
30
30
|
t.timestamps null: false,
|
31
31
|
comment: "Standard creation and update timestamps"
|
32
|
+
end
|
32
33
|
|
33
|
-
|
34
|
-
|
35
|
-
|
34
|
+
###########
|
35
|
+
# Indexes #
|
36
|
+
###########
|
36
37
|
|
37
|
-
|
38
|
-
|
38
|
+
add_index :ragdoll_contents, :embedding_model,
|
39
|
+
comment: "Index for filtering by embedding model"
|
39
40
|
|
40
|
-
|
41
|
-
|
41
|
+
add_index :ragdoll_contents, :type,
|
42
|
+
comment: "Index for filtering by content type"
|
42
43
|
|
43
|
-
|
44
|
-
|
45
|
-
|
44
|
+
execute <<-SQL
|
45
|
+
CREATE INDEX index_ragdoll_contents_on_fulltext_search
|
46
|
+
ON ragdoll_contents
|
47
|
+
USING gin(to_tsvector('english', COALESCE(content, '')))
|
48
|
+
SQL
|
46
49
|
end
|
47
|
-
end
|
50
|
+
end
|
data/db/migrate/{007_create_ragdoll_searches.rb → 20250815234905_create_ragdoll_searches.rb}
RENAMED
@@ -41,33 +41,37 @@ class CreateRagdollSearches < ActiveRecord::Migration[7.0]
|
|
41
41
|
|
42
42
|
t.timestamps null: false,
|
43
43
|
comment: "Standard creation and update timestamps"
|
44
|
+
end
|
44
45
|
|
45
|
-
|
46
|
-
|
47
|
-
|
46
|
+
###########
|
47
|
+
# Indexes #
|
48
|
+
###########
|
48
49
|
|
49
|
-
|
50
|
-
|
51
|
-
|
50
|
+
add_index :ragdoll_searches, :query_embedding,
|
51
|
+
using: :ivfflat,
|
52
|
+
opclass: :vector_cosine_ops,
|
53
|
+
name: "index_ragdoll_searches_on_query_embedding_cosine",
|
54
|
+
comment: "IVFFlat index for finding similar search queries"
|
52
55
|
|
53
|
-
|
54
|
-
|
56
|
+
add_index :ragdoll_searches, :search_type,
|
57
|
+
comment: "Index for filtering by search type"
|
55
58
|
|
56
|
-
|
57
|
-
|
59
|
+
add_index :ragdoll_searches, :session_id,
|
60
|
+
comment: "Index for grouping searches by session"
|
58
61
|
|
59
|
-
|
60
|
-
|
62
|
+
add_index :ragdoll_searches, :user_id,
|
63
|
+
comment: "Index for filtering searches by user"
|
61
64
|
|
62
|
-
|
63
|
-
|
65
|
+
add_index :ragdoll_searches, :created_at,
|
66
|
+
comment: "Index for chronological search history"
|
64
67
|
|
65
|
-
|
66
|
-
|
68
|
+
add_index :ragdoll_searches, :results_count,
|
69
|
+
comment: "Index for analyzing search effectiveness"
|
67
70
|
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
71
|
+
execute <<-SQL
|
72
|
+
CREATE INDEX index_ragdoll_searches_on_fulltext_query
|
73
|
+
ON ragdoll_searches
|
74
|
+
USING gin(to_tsvector('english', query))
|
75
|
+
SQL
|
72
76
|
end
|
73
77
|
end
|
@@ -24,26 +24,26 @@ class CreateRagdollSearchResults < ActiveRecord::Migration[7.0]
|
|
24
24
|
|
25
25
|
t.timestamps null: false,
|
26
26
|
comment: "Standard creation and update timestamps"
|
27
|
+
end
|
27
28
|
|
28
|
-
|
29
|
-
|
30
|
-
|
29
|
+
###########
|
30
|
+
# Indexes #
|
31
|
+
###########
|
31
32
|
|
32
|
-
|
33
|
-
|
34
|
-
|
33
|
+
add_index :ragdoll_search_results, [:search_id, :result_rank],
|
34
|
+
name: "idx_search_results_search_rank",
|
35
|
+
comment: "Index for retrieving results in ranked order"
|
35
36
|
|
36
|
-
|
37
|
-
|
38
|
-
|
37
|
+
add_index :ragdoll_search_results, [:embedding_id, :similarity_score],
|
38
|
+
name: "idx_search_results_embedding_score",
|
39
|
+
comment: "Index for analyzing embedding performance"
|
39
40
|
|
40
|
-
|
41
|
-
|
42
|
-
|
41
|
+
add_index :ragdoll_search_results, :similarity_score,
|
42
|
+
name: "idx_search_results_similarity",
|
43
|
+
comment: "Index for similarity score analysis"
|
43
44
|
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
end
|
45
|
+
add_index :ragdoll_search_results, [:clicked, :clicked_at],
|
46
|
+
name: "idx_search_results_clicks",
|
47
|
+
comment: "Index for click-through analysis"
|
48
48
|
end
|
49
49
|
end
|
data/lib/ragdoll/core/client.rb
CHANGED
@@ -184,7 +184,7 @@ module Ragdoll
|
|
184
184
|
end
|
185
185
|
|
186
186
|
# Document management
|
187
|
-
def add_document(path:)
|
187
|
+
def add_document(path:, force: false)
|
188
188
|
# Parse the document
|
189
189
|
parsed = Ragdoll::DocumentProcessor.parse(path)
|
190
190
|
|
@@ -197,7 +197,7 @@ module Ragdoll
|
|
197
197
|
title: title,
|
198
198
|
document_type: parsed[:document_type],
|
199
199
|
**parsed[:metadata]
|
200
|
-
})
|
200
|
+
}, force: force)
|
201
201
|
|
202
202
|
# Queue background jobs for processing if content is available
|
203
203
|
embeddings_queued = false
|
@@ -90,10 +90,10 @@ module Ragdoll
|
|
90
90
|
# Drop all tables in correct order (respecting foreign key constraints)
|
91
91
|
# Order: dependent tables first, then parent tables
|
92
92
|
tables_to_drop = %w[
|
93
|
+
ragdoll_search_results
|
94
|
+
ragdoll_searches
|
93
95
|
ragdoll_embeddings
|
94
|
-
|
95
|
-
ragdoll_image_contents
|
96
|
-
ragdoll_audio_contents
|
96
|
+
ragdoll_contents
|
97
97
|
ragdoll_documents
|
98
98
|
schema_migrations
|
99
99
|
]
|
@@ -109,6 +109,11 @@ module Ragdoll
|
|
109
109
|
end
|
110
110
|
end
|
111
111
|
|
112
|
+
# Also drop any functions/triggers that might exist
|
113
|
+
if ActiveRecord::Base.connection.adapter_name.downcase.include?("postgresql")
|
114
|
+
ActiveRecord::Base.connection.execute("DROP FUNCTION IF EXISTS ragdoll_documents_vector_update() CASCADE")
|
115
|
+
end
|
116
|
+
|
112
117
|
migrate!
|
113
118
|
end
|
114
119
|
|