ragdoll 0.1.11 → 0.1.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,300 @@
1
# frozen_string_literal: true

require "fileutils"
require "tempfile"

module Ragdoll
  # Unified document management service for the text-based RAG system.
  #
  # Drives the pipeline visible in this file: determine a document's type,
  # convert it to text through Ragdoll::DocumentConverter, persist a document
  # record (Ragdoll::UnifiedDocument when that constant is defined, otherwise
  # Ragdoll::Document) and trigger processing / embedding generation.
  class UnifiedDocumentManagement
    # Raised when processing fails or an upload cannot be read.
    class ProcessingError < StandardError; end

    # Embedding model recorded on newly created unified content rows.
    DEFAULT_EMBEDDING_MODEL = "text-embedding-3-large"

    # Convenience wrapper around {#add_document} on a fresh instance.
    def self.add_document(file_path, **options)
      new.add_document(file_path, **options)
    end

    # Convenience wrapper around {#add_document_from_upload} on a fresh instance.
    def self.add_document_from_upload(uploaded_file, **options)
      new.add_document_from_upload(uploaded_file, **options)
    end

    # Convenience wrapper around {#process_document} on a fresh instance.
    def self.process_document(document_id)
      new.process_document(document_id)
    end

    def initialize
      @converter = Ragdoll::DocumentConverter.new
    end

    # Add a document from a file path.
    #
    # @param file_path [String] path of the file to ingest
    # @param options [Hash] :title, :metadata, :file_modified_at are forwarded
    #   to the created record; :async selects the (currently synchronous)
    #   async processing path
    # @return [Object, nil] the created document, or nil when file_path does
    #   not exist
    # @raise [ProcessingError] when processing the created document fails
    def add_document(file_path, **options)
      return nil unless File.exist?(file_path)

      document_type = @converter.determine_document_type(file_path)
      text_content = @converter.convert_to_text(file_path, document_type)

      ingest(
        location: File.expand_path(file_path),
        document_type: document_type,
        text_content: text_content,
        **options
      )
    end

    # Add a document from an uploaded file.
    #
    # The upload is copied into a temporary file for conversion; that file is
    # always closed and removed before this method returns or raises.
    #
    # @param uploaded_file [#original_filename] must also respond to #read or #path
    # @param options [Hash] same keys as {#add_document}
    # @return [Object] the created document
    # @raise [ProcessingError] when the upload cannot be read or processing fails
    def add_document_from_upload(uploaded_file, **options)
      temp_file = create_temp_file_from_upload(uploaded_file)
      document_type = @converter.determine_document_type(temp_file.path)
      text_content = @converter.convert_to_text(temp_file.path, document_type)

      ingest(
        location: uploaded_file.original_filename || "uploaded_file",
        document_type: document_type,
        text_content: text_content,
        **options
      )
    ensure
      # close! closes and unlinks; temp_file is nil if creation itself failed.
      temp_file&.close!
    end

    # Process a document by ID.
    #
    # @param document_id [Object] primary key passed to the model's `find`
    # @return [Object] the processed document
    # @raise [ProcessingError] when processing fails
    def process_document(document_id)
      process_document_sync(find_document(document_id))
    end

    # Reprocess a document with a fresh text conversion of its source file.
    #
    # @param document_id [Object] primary key passed to the model's `find`
    # @param options [Hash] forwarded to the converter's `convert_to_text`
    # @return [Object, nil] the processed document, or nil when the document's
    #   location no longer exists on disk
    # @raise [ProcessingError] when processing fails
    def reprocess_document(document_id, **options)
      document = find_document(document_id)
      return nil unless File.exist?(document.location)

      document_type = @converter.determine_document_type(document.location)
      text_content = @converter.convert_to_text(document.location, document_type, **options)

      if document.respond_to?(:unified_contents)
        if document.unified_contents.any?
          document.unified_contents.first.update!(content: text_content)
        else
          document.unified_contents.create!(
            content: text_content,
            original_media_type: document_type,
            embedding_model: DEFAULT_EMBEDDING_MODEL,
            metadata: { "reprocessed_at" => Time.current }
          )
        end
      else
        # NOTE(review): the new content is only assigned here, not saved;
        # persistence relies on whatever process_document_sync triggers.
        # Confirm against Ragdoll::Document.
        document.content = text_content
      end

      process_document_sync(document)
    end

    # Batch processing for multiple documents.
    #
    # A path for which {#add_document} returns nil (file does not exist) is
    # reported under :errors instead of being counted as a success.
    #
    # @param file_paths [Array<String>] paths to ingest
    # @param options [Hash] forwarded to {#add_document} for every path
    # @return [Hash] :processed (documents), :errors (hashes with :file_path
    #   and :error), :total, :success_count, :error_count
    def batch_process_documents(file_paths, **options)
      results = []
      errors = []

      file_paths.each do |file_path|
        document = add_document(file_path, **options)
        if document
          results << document
        else
          errors << { file_path: file_path, error: "File not found: #{file_path}" }
        end
      rescue StandardError => e
        errors << { file_path: file_path, error: e.message }
      end

      {
        processed: results,
        errors: errors,
        total: file_paths.length,
        success_count: results.length,
        error_count: errors.length
      }
    end

    # Search across all documents by delegating to the model's `search_content`.
    def search_documents(query, **options)
      document_model.search_content(query, **options)
    end

    # Get processing statistics.
    #
    # @return [Hash] :documents and :content hold the models' own `stats`;
    #   :processing_summary is derived from the document stats
    def processing_stats
      if defined?(Ragdoll::UnifiedDocument)
        base_stats = Ragdoll::UnifiedDocument.stats
        content_stats = Ragdoll::UnifiedContent.stats
      else
        base_stats = Ragdoll::Document.stats
        content_stats = Ragdoll::Content.stats
      end

      {
        documents: base_stats,
        content: content_stats,
        processing_summary: {
          total_documents: base_stats[:total_documents],
          processed_documents: base_stats.dig(:by_status, "processed") || 0,
          total_embeddings: base_stats[:total_embeddings],
          average_processing_time: estimate_average_processing_time
        }
      }
    end

    private

    # Model class used for lookups and search; prefers the unified model.
    def document_model
      defined?(Ragdoll::UnifiedDocument) ? Ragdoll::UnifiedDocument : Ragdoll::Document
    end

    # Look a document up by ID on whichever model is available.
    def find_document(document_id)
      document_model.find(document_id)
    end

    # Create the document record, then process it via the async or sync path.
    def ingest(location:, document_type:, text_content:, **options)
      document = create_unified_document(
        location: location,
        document_type: document_type,
        text_content: text_content,
        **options
      )

      if options[:async]
        process_document_async(document.id)
      else
        process_document_sync(document)
      end

      document
    end

    # Persist a new document in "pending" status together with its text.
    def create_unified_document(location:, document_type:, text_content:, **options)
      title = options[:title] || extract_title_from_location(location)

      if defined?(Ragdoll::UnifiedDocument)
        document = Ragdoll::UnifiedDocument.create!(
          location: location,
          title: title,
          document_type: document_type,
          status: "pending",
          file_modified_at: options[:file_modified_at] || Time.current,
          metadata: options[:metadata] || {}
        )

        document.unified_contents.create!(
          content: text_content,
          original_media_type: document_type,
          embedding_model: DEFAULT_EMBEDDING_MODEL,
          metadata: {
            "created_at" => Time.current,
            "conversion_method" => "unified_converter",
            "original_filename" => File.basename(location)
          }
        )
      else
        # Fallback to regular Document, which stores the text on the record.
        document = Ragdoll::Document.create!(
          location: location,
          title: title,
          content: text_content,
          document_type: document_type,
          status: "pending",
          file_modified_at: options[:file_modified_at] || Time.current,
          metadata: options[:metadata] || {}
        )
      end

      document
    end

    # Process a document now; any failure is recorded on the document and
    # re-raised as ProcessingError.
    def process_document_sync(document)
      if document.respond_to?(:process_document!)
        document.process_document!
      else
        # Fallback processing
        document.update!(status: "processing")
        generate_embeddings_for_document(document)
        document.update!(status: "processed")
      end

      document
    rescue StandardError => e
      record_processing_failure(document, e)
      raise ProcessingError, "Failed to process document #{document.id}: #{e.message}"
    end

    # Best-effort write of the error status. A failure here is only warned
    # about so it cannot replace the original processing error.
    def record_processing_failure(document, error)
      document.update!(status: "error", metadata: (document.metadata || {}).merge("error" => error.message))
    rescue StandardError => e
      warn "Ragdoll: could not record error status for document #{document.id}: #{e.message}"
    end

    def process_document_async(document_id)
      # In a real application, this would enqueue a background job.
      # For now it processes synchronously; the notice goes to stderr so a
      # library call does not write to the host application's stdout.
      warn "Note: Async processing not implemented, processing synchronously"
      process_document(document_id)
    end

    # Call generate_embeddings! on each content record of the document.
    def generate_embeddings_for_document(document)
      if document.respond_to?(:unified_contents)
        document.unified_contents.each(&:generate_embeddings!)
      elsif document.respond_to?(:contents)
        document.contents.each(&:generate_embeddings!)
      end
    end

    # Copy an upload into a Tempfile that keeps the original base name and
    # extension. The caller owns the returned file and must close/unlink it.
    #
    # @raise [ProcessingError] when the upload responds to neither #read nor #path
    def create_temp_file_from_upload(uploaded_file)
      original_name = uploaded_file.original_filename
      temp_file = Tempfile.new([
        File.basename(original_name || "upload", ".*"),
        File.extname(original_name || "")
      ])
      # Uploads may be binary (PDF, images, audio); avoid any transcoding.
      temp_file.binmode

      begin
        if uploaded_file.respond_to?(:read)
          temp_file.write(uploaded_file.read)
        elsif uploaded_file.respond_to?(:path)
          FileUtils.cp(uploaded_file.path, temp_file.path)
        else
          raise ProcessingError, "Unknown upload file format"
        end

        temp_file.flush
        temp_file.rewind
        temp_file
      rescue StandardError
        # The caller never receives the handle on failure, so clean up here.
        temp_file.close!
        raise
      end
    end

    # Derive a human-readable title from a file location,
    # e.g. "my_report-final.txt" => "My Report Final".
    def extract_title_from_location(location)
      filename = File.basename(location, File.extname(location))

      # Separators become spaces and camelCase boundaries are split.
      title = filename
              .gsub(/[-_]+/, " ")
              .gsub(/([a-z])([A-Z])/, '\1 \2')
              .gsub(/\s+/, " ")
              .strip

      title.split(" ").map(&:capitalize).join(" ")
    end

    def estimate_average_processing_time
      # This would be calculated from actual processing logs in production.
      # For now, return a placeholder.
      "~2.5 seconds"
    end
  end
end
@@ -0,0 +1,87 @@
1
+ # frozen_string_literal: true
2
+
3
# Creates (or tops up) the ragdoll_unified_contents table, its lookup
# indexes, and a GIN full-text index over `content`.
#
# Written with explicit `up`/`down` rather than `change`: the steps depend on
# the current schema state and include raw SQL issued through `execute`,
# which Active Record cannot invert automatically.
class CreateRagdollUnifiedContents < ActiveRecord::Migration[7.0]
  TABLE = :ragdoll_unified_contents
  FULLTEXT_INDEX = "index_ragdoll_unified_contents_on_fulltext_search"

  def up
    if table_exists?(TABLE)
      add_missing_columns
    else
      create_unified_contents_table
    end

    add_lookup_indexes
    add_fulltext_index
  end

  # Dropping the table also removes every index created in `up`.
  # NOTE(review): this also drops a table that already existed before `up`
  # ran (the add-missing-columns path) — confirm that is acceptable.
  def down
    drop_table TABLE, if_exists: true
  end

  private

  def create_unified_contents_table
    create_table TABLE,
                 comment: "Unified content storage for text-based RAG architecture" do |t|
      t.references :document, null: false, foreign_key: { to_table: :ragdoll_documents },
                              comment: "Reference to parent document"

      t.text :content, null: false,
                       comment: "Text content (original text, extracted text, image description, audio transcript, etc.)"

      t.string :original_media_type, null: false,
                                     comment: "Original media type (text, image, audio, video, pdf, docx, html, markdown, unknown)"

      t.string :embedding_model, null: false,
                                 comment: "Embedding model used for this content"

      t.string :conversion_method,
               comment: "Method used to convert to text (text_extraction, image_to_text, audio_transcription, etc.)"

      t.integer :word_count, default: 0,
                             comment: "Number of words in the content"

      t.integer :character_count, default: 0,
                                  comment: "Number of characters in the content"

      t.float :content_quality_score, default: 0.0,
                                      comment: "Quality score of the converted content (0.0-1.0)"

      t.json :metadata, default: {},
                        comment: "Additional metadata about the conversion and content"

      t.timestamps null: false,
                   comment: "Standard creation and update timestamps"
    end
  end

  # Add the columns a pre-existing table may be missing.
  def add_missing_columns
    add_column TABLE, :original_media_type, :string unless column_exists?(TABLE, :original_media_type)
    add_column TABLE, :conversion_method, :string unless column_exists?(TABLE, :conversion_method)
    add_column TABLE, :word_count, :integer, default: 0 unless column_exists?(TABLE, :word_count)
    add_column TABLE, :character_count, :integer, default: 0 unless column_exists?(TABLE, :character_count)
    add_column TABLE, :content_quality_score, :float, default: 0.0 unless column_exists?(TABLE, :content_quality_score)
  end

  # Single-column and composite b-tree indexes, each added only when absent.
  def add_lookup_indexes
    unless index_exists?(TABLE, :embedding_model)
      add_index TABLE, :embedding_model,
                comment: "Index for filtering by embedding model"
    end

    unless index_exists?(TABLE, :original_media_type)
      add_index TABLE, :original_media_type,
                comment: "Index for filtering by original media type"
    end

    unless index_exists?(TABLE, :conversion_method)
      add_index TABLE, :conversion_method,
                comment: "Index for filtering by conversion method"
    end

    unless index_exists?(TABLE, :content_quality_score)
      add_index TABLE, :content_quality_score,
                comment: "Index for filtering by content quality"
    end

    unless index_exists?(TABLE, [:document_id, :original_media_type], name: "index_unified_contents_on_doc_and_media_type")
      add_index TABLE, [:document_id, :original_media_type],
                name: "index_unified_contents_on_doc_and_media_type",
                comment: "Index for finding content by document and media type"
    end
  end

  # Full-text search index (PostgreSQL GIN over an english tsvector of content).
  def add_fulltext_index
    return if connection.execute("SELECT 1 FROM pg_indexes WHERE indexname = 'index_ragdoll_unified_contents_on_fulltext_search'").any?

    execute <<~SQL
      CREATE INDEX #{FULLTEXT_INDEX}
      ON ragdoll_unified_contents
      USING gin(to_tsvector('english', COALESCE(content, '')))
    SQL
  end
end
@@ -3,6 +3,6 @@
3
3
 
4
4
  module Ragdoll
5
5
  module Core
6
- VERSION = "0.1.11"
6
+ VERSION = "0.1.12"
7
7
  end
8
8
  end
data/lib/ragdoll/core.rb CHANGED
@@ -25,6 +25,8 @@ require_relative "core/shrine_config"
25
25
 
26
26
  # Require models from app/models/ragdoll
27
27
  require "ragdoll/document"
28
+ require "ragdoll/unified_document"
29
+ require "ragdoll/unified_content"
28
30
  require "ragdoll/embedding"
29
31
  require "ragdoll/content"
30
32
  require "ragdoll/text_content"
@@ -34,11 +36,16 @@ require "ragdoll/search"
34
36
  require "ragdoll/search_result"
35
37
  require "ragdoll/document_processor"
36
38
  require "ragdoll/document_management"
39
+ require "ragdoll/unified_document_management"
40
+ require "ragdoll/document_converter"
41
+ require "ragdoll/migration_service"
37
42
  require "ragdoll/text_chunker"
38
43
  require "ragdoll/embedding_service"
39
44
  require "ragdoll/text_generation_service"
40
45
  require "ragdoll/search_engine"
41
46
  require "ragdoll/image_description_service"
47
+ require "ragdoll/image_to_text_service"
48
+ require "ragdoll/text_extraction_service"
42
49
  require "ragdoll/metadata_generator"
43
50
  # Require from app/lib/ragdoll
44
51
  require "ragdoll/metadata_schemas"
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ragdoll
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.11
4
+ version: 0.1.12
5
5
  platform: ruby
6
6
  authors:
7
7
  - Dewayne VanHoozer
@@ -396,22 +396,31 @@ files:
396
396
  - app/models/ragdoll/search.rb
397
397
  - app/models/ragdoll/search_result.rb
398
398
  - app/models/ragdoll/text_content.rb
399
+ - app/models/ragdoll/unified_content.rb
400
+ - app/models/ragdoll/unified_document.rb
401
+ - app/services/ragdoll/audio_to_text_service.rb
399
402
  - app/services/ragdoll/configuration_service.rb
403
+ - app/services/ragdoll/document_converter.rb
400
404
  - app/services/ragdoll/document_management.rb
401
405
  - app/services/ragdoll/document_processor.rb
402
406
  - app/services/ragdoll/embedding_service.rb
403
407
  - app/services/ragdoll/image_description_service.rb
408
+ - app/services/ragdoll/image_to_text_service.rb
404
409
  - app/services/ragdoll/metadata_generator.rb
410
+ - app/services/ragdoll/migration_service.rb
405
411
  - app/services/ragdoll/model_resolver.rb
406
412
  - app/services/ragdoll/search_engine.rb
407
413
  - app/services/ragdoll/text_chunker.rb
414
+ - app/services/ragdoll/text_extraction_service.rb
408
415
  - app/services/ragdoll/text_generation_service.rb
416
+ - app/services/ragdoll/unified_document_management.rb
409
417
  - db/migrate/20250815234901_enable_postgresql_extensions.rb
410
418
  - db/migrate/20250815234902_create_ragdoll_documents.rb
411
419
  - db/migrate/20250815234903_create_ragdoll_embeddings.rb
412
420
  - db/migrate/20250815234904_create_ragdoll_contents.rb
413
421
  - db/migrate/20250815234905_create_ragdoll_searches.rb
414
422
  - db/migrate/20250815234906_create_ragdoll_search_results.rb
423
+ - db/migrate/20250923000001_create_ragdoll_unified_contents.rb
415
424
  - lib/ragdoll-core.rb
416
425
  - lib/ragdoll.rb
417
426
  - lib/ragdoll/core.rb
@@ -447,7 +456,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
447
456
  - !ruby/object:Gem::Version
448
457
  version: '0'
449
458
  requirements: []
450
- rubygems_version: 3.7.1
459
+ rubygems_version: 3.7.2
451
460
  specification_version: 4
452
461
  summary: Multi-Modal Retrieval Augmented Generation
453
462
  test_files: []