ragdoll 0.1.9 → 0.1.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -99,8 +99,6 @@ module Ragdoll
99
99
  else
100
100
  parse_text # Default to text parsing for unknown formats
101
101
  end
102
- rescue StandardError => e # StandardError => e
103
- raise ParseError, "#{__LINE__} Failed to parse #{@file_path}: #{e.message}"
104
102
  end
105
103
 
106
104
  private
@@ -109,6 +107,12 @@ module Ragdoll
109
107
  content = ""
110
108
  metadata = {}
111
109
 
110
+ # Add file-based metadata for duplicate detection
111
+ if File.exist?(@file_path)
112
+ metadata[:file_size] = File.size(@file_path)
113
+ metadata[:file_hash] = calculate_file_hash(@file_path)
114
+ end
115
+
112
116
  begin
113
117
  PDF::Reader.open(@file_path) do |reader|
114
118
  # Extract metadata
@@ -144,6 +148,10 @@ module Ragdoll
144
148
  metadata[:title] = extract_title_from_filepath
145
149
  end
146
150
 
151
+ # Add content hash for duplicate detection
152
+ # Ensure content is UTF-8 encoded before checking presence
153
+ metadata[:content_hash] = calculate_content_hash(content) if content && content.length > 0
154
+
147
155
  {
148
156
  content: content.strip,
149
157
  metadata: metadata,
@@ -155,6 +163,12 @@ module Ragdoll
155
163
  content = ""
156
164
  metadata = {}
157
165
 
166
+ # Add file-based metadata for duplicate detection
167
+ if File.exist?(@file_path)
168
+ metadata[:file_size] = File.size(@file_path)
169
+ metadata[:file_hash] = calculate_file_hash(@file_path)
170
+ end
171
+
158
172
  begin
159
173
  doc = Docx::Document.open(@file_path)
160
174
 
@@ -204,6 +218,10 @@ module Ragdoll
204
218
  metadata[:title] = extract_title_from_filepath
205
219
  end
206
220
 
221
+ # Add content hash for duplicate detection
222
+ # Ensure content is UTF-8 encoded before checking presence
223
+ metadata[:content_hash] = calculate_content_hash(content) if content && content.length > 0
224
+
207
225
  {
208
226
  content: content.strip,
209
227
  metadata: metadata,
@@ -212,46 +230,31 @@ module Ragdoll
212
230
  end
213
231
 
214
232
  def parse_text
215
- content = File.read(@file_path, encoding: "UTF-8")
216
- metadata = {
217
- file_size: File.size(@file_path),
218
- encoding: "UTF-8"
219
- }
220
-
233
+ # Determine document type first (before any IO operations)
221
234
  document_type = case @file_extension
222
235
  when ".md", ".markdown" then "markdown"
223
236
  when ".txt" then "text"
224
237
  else "text"
225
238
  end
226
239
 
227
- # Parse YAML front matter for markdown files
228
- if document_type == "markdown" && content.start_with?("---\n")
229
- front_matter, body_content = parse_yaml_front_matter(content)
230
- if front_matter
231
- metadata.merge!(front_matter)
232
- content = body_content
233
- end
234
- end
235
-
236
- # Add filepath-based title as fallback if no title was found
237
- if metadata[:title].nil? || (metadata[:title].is_a?(String) && metadata[:title].strip.empty?)
238
- metadata[:title] = extract_title_from_filepath
240
+ begin
241
+ content = File.read(@file_path, encoding: "UTF-8")
242
+ encoding = "UTF-8"
243
+ rescue Encoding::InvalidByteSequenceError, Encoding::UndefinedConversionError
244
+ # Try with different encoding - read as ISO-8859-1 and force encoding to UTF-8
245
+ content = File.read(@file_path, encoding: "ISO-8859-1").encode("UTF-8", invalid: :replace, undef: :replace, replace: "?")
246
+ encoding = "ISO-8859-1"
247
+ rescue Errno::ENOENT, Errno::EACCES => e
248
+ raise ParseError, "Failed to read file #{@file_path}: #{e.message}"
239
249
  end
240
250
 
241
- {
242
- content: content,
243
- metadata: metadata,
244
- document_type: document_type
245
- }
246
- rescue Encoding::InvalidByteSequenceError
247
- # Try with different encoding
248
- content = File.read(@file_path, encoding: "ISO-8859-1")
249
251
  metadata = {
250
252
  file_size: File.size(@file_path),
251
- encoding: "ISO-8859-1"
253
+ file_hash: calculate_file_hash(@file_path),
254
+ encoding: encoding
252
255
  }
253
256
 
254
- # Try to parse front matter with different encoding too
257
+ # Parse YAML front matter for markdown files
255
258
  if document_type == "markdown" && content.start_with?("---\n")
256
259
  front_matter, body_content = parse_yaml_front_matter(content)
257
260
  if front_matter
@@ -265,10 +268,14 @@ module Ragdoll
265
268
  metadata[:title] = extract_title_from_filepath
266
269
  end
267
270
 
271
+ # Add content hash for duplicate detection
272
+ # Ensure content is UTF-8 encoded before checking presence
273
+ metadata[:content_hash] = calculate_content_hash(content) if content && content.length > 0
274
+
268
275
  {
269
276
  content: content,
270
277
  metadata: metadata,
271
- document_type: document_type.nil? ? "text" : document_type
278
+ document_type: document_type
272
279
  }
273
280
  end
274
281
 
@@ -296,6 +303,7 @@ module Ragdoll
296
303
 
297
304
  metadata = {
298
305
  file_size: File.size(@file_path),
306
+ file_hash: calculate_file_hash(@file_path),
299
307
  original_format: "html"
300
308
  }
301
309
 
@@ -306,6 +314,9 @@ module Ragdoll
306
314
  metadata[:title] = extract_title_from_filepath
307
315
  end
308
316
 
317
+ # Add content hash for duplicate detection
318
+ metadata[:content_hash] = calculate_content_hash(clean_content) if clean_content.present?
319
+
309
320
  {
310
321
  content: clean_content,
311
322
  metadata: metadata,
@@ -318,6 +329,7 @@ module Ragdoll
318
329
 
319
330
  metadata = {
320
331
  file_size: File.size(@file_path),
332
+ file_hash: calculate_file_hash(@file_path),
321
333
  file_type: @file_extension.sub(".", ""),
322
334
  original_filename: File.basename(@file_path)
323
335
  }
@@ -347,6 +359,10 @@ module Ragdoll
347
359
  # Add filepath-based title as fallback
348
360
  metadata[:title] = extract_title_from_filepath
349
361
 
362
+ # Add content hash for duplicate detection
363
+ # Ensure content is UTF-8 encoded before checking presence
364
+ metadata[:content_hash] = calculate_content_hash(content) if content && content.length > 0
365
+
350
366
  puts "✅ DocumentProcessor: Image parsing complete. Content: '#{content[0..100]}...'"
351
367
 
352
368
  {
@@ -461,5 +477,25 @@ module Ragdoll
461
477
  [nil, content]
462
478
  end
463
479
  end
480
+
481
+ # Calculate SHA256 hash of file content for duplicate detection
482
+ def calculate_file_hash(file_path)
483
+ require 'digest'
484
+ Digest::SHA256.file(file_path).hexdigest
485
+ rescue StandardError => e
486
+ Rails.logger.warn "Failed to calculate file hash for #{file_path}: #{e.message}" if defined?(Rails)
487
+ puts "Warning: Failed to calculate file hash for #{file_path}: #{e.message}"
488
+ nil
489
+ end
490
+
491
+ # Calculate SHA256 hash of text content for duplicate detection
492
+ def calculate_content_hash(content)
493
+ require 'digest'
494
+ Digest::SHA256.hexdigest(content)
495
+ rescue StandardError => e
496
+ Rails.logger.warn "Failed to calculate content hash: #{e.message}" if defined?(Rails)
497
+ puts "Warning: Failed to calculate content hash: #{e.message}"
498
+ nil
499
+ end
464
500
  end
465
501
  end
@@ -33,6 +33,10 @@ module Ragdoll
33
33
  threshold = options[:threshold] || search_config[:similarity_threshold]
34
34
  filters = options[:filters] || {}
35
35
 
36
+ # Extract keywords option and normalize
37
+ keywords = options[:keywords] || []
38
+ keywords = Array(keywords).map(&:to_s).reject(&:empty?)
39
+
36
40
  # Extract tracking options
37
41
  session_id = options[:session_id]
38
42
  user_id = options[:user_id]
@@ -49,6 +53,11 @@ module Ragdoll
49
53
  return [] if query_embedding.nil?
50
54
  end
51
55
 
56
+ # Add keywords to filters if provided
57
+ if keywords.any?
58
+ filters[:keywords] = keywords
59
+ end
60
+
52
61
  # Search using ActiveRecord models with statistics
53
62
  # Try enhanced search first, fall back to original if it fails
54
63
  begin
@@ -81,13 +90,15 @@ module Ragdoll
81
90
  }
82
91
  end
83
92
 
93
+ search_type = keywords.any? ? "semantic_with_keywords" : "semantic"
94
+
84
95
  Ragdoll::Search.record_search(
85
96
  query: query_string,
86
97
  query_embedding: query_embedding,
87
98
  results: search_results,
88
- search_type: "semantic",
99
+ search_type: search_type,
89
100
  filters: filters,
90
- options: { limit: limit, threshold: threshold },
101
+ options: { limit: limit, threshold: threshold, keywords: keywords },
91
102
  execution_time_ms: execution_time,
92
103
  session_id: session_id,
93
104
  user_id: user_id
@@ -1,8 +1,5 @@
1
1
  class EnablePostgresqlExtensions < ActiveRecord::Migration[7.0]
2
2
  def up
3
- # This migration is now handled by the db:create rake task
4
- # Just ensure required extensions are available
5
-
6
3
  # Vector similarity search (required for embeddings)
7
4
  execute "CREATE EXTENSION IF NOT EXISTS vector"
8
5
 
@@ -15,9 +12,11 @@ class EnablePostgresqlExtensions < ActiveRecord::Migration[7.0]
15
12
  end
16
13
 
17
14
  def down
18
- execute <<-SQL
19
- DROP DATABASE IF EXISTS ragdoll_development;
20
- DROP ROLE IF EXISTS ragdoll;
21
- SQL
15
+ # Extensions are typically not dropped as they might be used by other databases
16
+ # If you really need to drop them, uncomment the following:
17
+ # execute "DROP EXTENSION IF EXISTS vector"
18
+ # execute "DROP EXTENSION IF EXISTS unaccent"
19
+ # execute "DROP EXTENSION IF EXISTS pg_trgm"
20
+ # execute "DROP EXTENSION IF EXISTS \"uuid-ossp\""
22
21
  end
23
- end
22
+ end
@@ -0,0 +1,117 @@
1
+ class CreateRagdollDocuments < ActiveRecord::Migration[7.0]
2
+ # For concurrent index creation (PostgreSQL)
3
+ disable_ddl_transaction!
4
+
5
+ def up
6
+ create_table :ragdoll_documents,
7
+ comment: "Core documents table with LLM-generated structured metadata" do |t|
8
+
9
+ t.string :location, null: false,
10
+ comment: "Source location of document (file path, URL, or identifier)"
11
+
12
+ t.string :title, null: false,
13
+ comment: "Human-readable document title for display and search"
14
+
15
+ t.text :summary, null: false, default: "",
16
+ comment: "LLM-generated summary of document content"
17
+
18
+ t.string :document_type, null: false, default: "text",
19
+ comment: "Document format type"
20
+
21
+ t.string :status, null: false, default: "pending",
22
+ comment: "Document processing status"
23
+
24
+ t.json :metadata, default: {},
25
+ comment: "LLM-generated structured metadata about the file"
26
+
27
+ t.timestamp :file_modified_at, null: false, default: -> { "CURRENT_TIMESTAMP" },
28
+ comment: "Timestamp when the source file was last modified"
29
+
30
+ t.timestamps null: false,
31
+ comment: "Standard creation and update timestamps"
32
+
33
+ # Add tsvector column for full-text search
34
+ t.tsvector :search_vector
35
+
36
+ # Add keywords as array column
37
+ t.text :keywords, array: true, default: []
38
+ end
39
+
40
+ ###########
41
+ # Indexes #
42
+ ###########
43
+
44
+ add_index :ragdoll_documents, :location, unique: true,
45
+ comment: "Unique index for document source lookup"
46
+
47
+ add_index :ragdoll_documents, :title,
48
+ comment: "Index for title-based search"
49
+
50
+ add_index :ragdoll_documents, :document_type,
51
+ comment: "Index for filtering by document type"
52
+
53
+ add_index :ragdoll_documents, :status,
54
+ comment: "Index for filtering by processing status"
55
+
56
+ add_index :ragdoll_documents, :created_at,
57
+ comment: "Index for chronological sorting"
58
+
59
+ add_index :ragdoll_documents, [:document_type, :status],
60
+ comment: "Composite index for type+status filtering"
61
+
62
+ # Full-text search index
63
+ execute <<-SQL
64
+ CREATE INDEX CONCURRENTLY index_ragdoll_documents_on_fulltext_search
65
+ ON ragdoll_documents
66
+ USING gin(to_tsvector('english',
67
+ COALESCE(title, '') || ' ' ||
68
+ COALESCE(metadata->>'summary', '') || ' ' ||
69
+ COALESCE(metadata->>'keywords', '') || ' ' ||
70
+ COALESCE(metadata->>'description', '')
71
+ ))
72
+ SQL
73
+
74
+ add_index :ragdoll_documents, "(metadata->>'document_type')",
75
+ name: "index_ragdoll_documents_on_metadata_type",
76
+ comment: "Index for filtering by document type"
77
+
78
+ add_index :ragdoll_documents, "(metadata->>'classification')",
79
+ name: "index_ragdoll_documents_on_metadata_classification",
80
+ comment: "Index for filtering by document classification"
81
+
82
+ # GIN index on search_vector
83
+ add_index :ragdoll_documents, :search_vector, using: :gin, algorithm: :concurrently
84
+
85
+ # GIN index on keywords array
86
+ add_index :ragdoll_documents, :keywords, using: :gin,
87
+ name: 'index_ragdoll_documents_on_keywords_gin'
88
+
89
+ # Trigger to keep search_vector up to date on INSERT/UPDATE
90
+ execute <<-SQL
91
+ CREATE FUNCTION ragdoll_documents_vector_update() RETURNS trigger AS $$
92
+ BEGIN
93
+ NEW.search_vector := to_tsvector('english',
94
+ COALESCE(NEW.title, '') || ' ' ||
95
+ COALESCE(NEW.metadata->>'summary', '') || ' ' ||
96
+ COALESCE(NEW.metadata->>'keywords', '') || ' ' ||
97
+ COALESCE(NEW.metadata->>'description', '')
98
+ );
99
+ RETURN NEW;
100
+ END
101
+ $$ LANGUAGE plpgsql;
102
+
103
+ CREATE TRIGGER ragdoll_search_vector_update
104
+ BEFORE INSERT OR UPDATE ON ragdoll_documents
105
+ FOR EACH ROW EXECUTE FUNCTION ragdoll_documents_vector_update();
106
+ SQL
107
+ end
108
+
109
+ def down
110
+ execute <<-SQL
111
+ DROP TRIGGER IF EXISTS ragdoll_search_vector_update ON ragdoll_documents;
112
+ DROP FUNCTION IF EXISTS ragdoll_documents_vector_update();
113
+ SQL
114
+
115
+ drop_table :ragdoll_documents
116
+ end
117
+ end
@@ -3,7 +3,7 @@ class CreateRagdollEmbeddings < ActiveRecord::Migration[7.0]
3
3
  create_table :ragdoll_embeddings,
4
4
  comment: "Polymorphic vector embeddings storage for semantic similarity search" do |t|
5
5
 
6
- t.references :embeddable, polymorphic: true, null: false,
6
+ t.references :embeddable, polymorphic: true, null: false,
7
7
  comment: "Polymorphic reference to embeddable content"
8
8
 
9
9
  t.text :content, null: false, default: "",
@@ -26,16 +26,19 @@ class CreateRagdollEmbeddings < ActiveRecord::Migration[7.0]
26
26
 
27
27
  t.timestamps null: false,
28
28
  comment: "Standard creation and update timestamps"
29
+ end
29
30
 
30
- ###########
31
- # Indexes #
32
- ###########
31
+ ###########
32
+ # Indexes #
33
+ ###########
33
34
 
34
- t.index %i[embeddable_type embeddable_id],
35
- comment: "Index for finding embeddings by embeddable content"
35
+ add_index :ragdoll_embeddings, [:embeddable_type, :embeddable_id],
36
+ comment: "Index for finding embeddings by embeddable content"
36
37
 
37
- t.index :embedding_vector, using: :ivfflat, opclass: :vector_cosine_ops, name: "index_ragdoll_embeddings_on_embedding_vector_cosine",
38
- comment: "IVFFlat index for fast cosine similarity search"
39
- end
38
+ add_index :ragdoll_embeddings, :embedding_vector,
39
+ using: :ivfflat,
40
+ opclass: :vector_cosine_ops,
41
+ name: "index_ragdoll_embeddings_on_embedding_vector_cosine",
42
+ comment: "IVFFlat index for fast cosine similarity search"
40
43
  end
41
- end
44
+ end
@@ -29,19 +29,22 @@ class CreateRagdollContents < ActiveRecord::Migration[7.0]
29
29
 
30
30
  t.timestamps null: false,
31
31
  comment: "Standard creation and update timestamps"
32
+ end
32
33
 
33
- ###########
34
- # Indexes #
35
- ###########
34
+ ###########
35
+ # Indexes #
36
+ ###########
36
37
 
37
- t.index :embedding_model,
38
- comment: "Index for filtering by embedding model"
38
+ add_index :ragdoll_contents, :embedding_model,
39
+ comment: "Index for filtering by embedding model"
39
40
 
40
- t.index :type,
41
- comment: "Index for filtering by content type"
41
+ add_index :ragdoll_contents, :type,
42
+ comment: "Index for filtering by content type"
42
43
 
43
- t.index "to_tsvector('english', COALESCE(content, ''))", using: :gin, name: "index_ragdoll_contents_on_fulltext_search",
44
- comment: "Full-text search index for text content"
45
- end
44
+ execute <<-SQL
45
+ CREATE INDEX index_ragdoll_contents_on_fulltext_search
46
+ ON ragdoll_contents
47
+ USING gin(to_tsvector('english', COALESCE(content, '')))
48
+ SQL
46
49
  end
47
- end
50
+ end
@@ -41,33 +41,37 @@ class CreateRagdollSearches < ActiveRecord::Migration[7.0]
41
41
 
42
42
  t.timestamps null: false,
43
43
  comment: "Standard creation and update timestamps"
44
+ end
44
45
 
45
- ###########
46
- # Indexes #
47
- ###########
46
+ ###########
47
+ # Indexes #
48
+ ###########
48
49
 
49
- t.index :query_embedding, using: :ivfflat, opclass: :vector_cosine_ops,
50
- name: "index_ragdoll_searches_on_query_embedding_cosine",
51
- comment: "IVFFlat index for finding similar search queries"
50
+ add_index :ragdoll_searches, :query_embedding,
51
+ using: :ivfflat,
52
+ opclass: :vector_cosine_ops,
53
+ name: "index_ragdoll_searches_on_query_embedding_cosine",
54
+ comment: "IVFFlat index for finding similar search queries"
52
55
 
53
- t.index :search_type,
54
- comment: "Index for filtering by search type"
56
+ add_index :ragdoll_searches, :search_type,
57
+ comment: "Index for filtering by search type"
55
58
 
56
- t.index :session_id,
57
- comment: "Index for grouping searches by session"
59
+ add_index :ragdoll_searches, :session_id,
60
+ comment: "Index for grouping searches by session"
58
61
 
59
- t.index :user_id,
60
- comment: "Index for filtering searches by user"
62
+ add_index :ragdoll_searches, :user_id,
63
+ comment: "Index for filtering searches by user"
61
64
 
62
- t.index :created_at,
63
- comment: "Index for chronological search history"
65
+ add_index :ragdoll_searches, :created_at,
66
+ comment: "Index for chronological search history"
64
67
 
65
- t.index :results_count,
66
- comment: "Index for analyzing search effectiveness"
68
+ add_index :ragdoll_searches, :results_count,
69
+ comment: "Index for analyzing search effectiveness"
67
70
 
68
- t.index "to_tsvector('english', query)", using: :gin,
69
- name: "index_ragdoll_searches_on_fulltext_query",
70
- comment: "Full-text search index for finding searches by query text"
71
- end
71
+ execute <<-SQL
72
+ CREATE INDEX index_ragdoll_searches_on_fulltext_query
73
+ ON ragdoll_searches
74
+ USING gin(to_tsvector('english', query))
75
+ SQL
72
76
  end
73
77
  end
@@ -24,26 +24,26 @@ class CreateRagdollSearchResults < ActiveRecord::Migration[7.0]
24
24
 
25
25
  t.timestamps null: false,
26
26
  comment: "Standard creation and update timestamps"
27
+ end
27
28
 
28
- ###########
29
- # Indexes #
30
- ###########
29
+ ###########
30
+ # Indexes #
31
+ ###########
31
32
 
32
- t.index [:search_id, :result_rank],
33
- name: "idx_search_results_search_rank",
34
- comment: "Index for retrieving results in ranked order"
33
+ add_index :ragdoll_search_results, [:search_id, :result_rank],
34
+ name: "idx_search_results_search_rank",
35
+ comment: "Index for retrieving results in ranked order"
35
36
 
36
- t.index [:embedding_id, :similarity_score],
37
- name: "idx_search_results_embedding_score",
38
- comment: "Index for analyzing embedding performance"
37
+ add_index :ragdoll_search_results, [:embedding_id, :similarity_score],
38
+ name: "idx_search_results_embedding_score",
39
+ comment: "Index for analyzing embedding performance"
39
40
 
40
- t.index :similarity_score,
41
- name: "idx_search_results_similarity",
42
- comment: "Index for similarity score analysis"
41
+ add_index :ragdoll_search_results, :similarity_score,
42
+ name: "idx_search_results_similarity",
43
+ comment: "Index for similarity score analysis"
43
44
 
44
- t.index [:clicked, :clicked_at],
45
- name: "idx_search_results_clicks",
46
- comment: "Index for click-through analysis"
47
- end
45
+ add_index :ragdoll_search_results, [:clicked, :clicked_at],
46
+ name: "idx_search_results_clicks",
47
+ comment: "Index for click-through analysis"
48
48
  end
49
49
  end
@@ -184,7 +184,7 @@ module Ragdoll
184
184
  end
185
185
 
186
186
  # Document management
187
- def add_document(path:)
187
+ def add_document(path:, force: false)
188
188
  # Parse the document
189
189
  parsed = Ragdoll::DocumentProcessor.parse(path)
190
190
 
@@ -197,7 +197,7 @@ module Ragdoll
197
197
  title: title,
198
198
  document_type: parsed[:document_type],
199
199
  **parsed[:metadata]
200
- })
200
+ }, force: force)
201
201
 
202
202
  # Queue background jobs for processing if content is available
203
203
  embeddings_queued = false
@@ -90,10 +90,10 @@ module Ragdoll
90
90
  # Drop all tables in correct order (respecting foreign key constraints)
91
91
  # Order: dependent tables first, then parent tables
92
92
  tables_to_drop = %w[
93
+ ragdoll_search_results
94
+ ragdoll_searches
93
95
  ragdoll_embeddings
94
- ragdoll_text_contents
95
- ragdoll_image_contents
96
- ragdoll_audio_contents
96
+ ragdoll_contents
97
97
  ragdoll_documents
98
98
  schema_migrations
99
99
  ]
@@ -109,6 +109,11 @@ module Ragdoll
109
109
  end
110
110
  end
111
111
 
112
+ # Also drop any functions/triggers that might exist
113
+ if ActiveRecord::Base.connection.adapter_name.downcase.include?("postgresql")
114
+ ActiveRecord::Base.connection.execute("DROP FUNCTION IF EXISTS ragdoll_documents_vector_update() CASCADE")
115
+ end
116
+
112
117
  migrate!
113
118
  end
114
119
 
@@ -3,6 +3,6 @@
3
3
 
4
4
  module Ragdoll
5
5
  module Core
6
- VERSION = "0.1.9"
6
+ VERSION = "0.1.11"
7
7
  end
8
8
  end