ragdoll 0.1.11 → 0.1.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +323 -384
- data/app/models/ragdoll/document.rb +1 -1
- data/app/models/ragdoll/unified_content.rb +216 -0
- data/app/models/ragdoll/unified_document.rb +338 -0
- data/app/services/ragdoll/audio_to_text_service.rb +200 -0
- data/app/services/ragdoll/document_converter.rb +216 -0
- data/app/services/ragdoll/document_processor.rb +197 -331
- data/app/services/ragdoll/image_to_text_service.rb +322 -0
- data/app/services/ragdoll/migration_service.rb +340 -0
- data/app/services/ragdoll/text_extraction_service.rb +422 -0
- data/app/services/ragdoll/unified_document_management.rb +300 -0
- data/db/migrate/20250923000001_create_ragdoll_unified_contents.rb +87 -0
- data/lib/ragdoll/core/version.rb +1 -1
- data/lib/ragdoll/core.rb +7 -0
- metadata +11 -2
@@ -0,0 +1,300 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Ragdoll
  # Unified document management service for the text-based RAG system.
  #
  # Each source file is converted to text through Ragdoll::DocumentConverter,
  # stored as a document record (Ragdoll::UnifiedDocument when that constant
  # is defined, otherwise Ragdoll::Document), and then handed to processing.
  class UnifiedDocumentManagement
    # Raised when processing a stored document fails or an upload cannot be read.
    class ProcessingError < StandardError; end

    # Embedding model name recorded on every unified content row created here.
    DEFAULT_EMBEDDING_MODEL = "text-embedding-3-large"

    # Convenience wrapper: builds a service instance and calls #add_document.
    def self.add_document(file_path, **options)
      new.add_document(file_path, **options)
    end

    # Convenience wrapper: builds a service instance and calls #add_document_from_upload.
    def self.add_document_from_upload(uploaded_file, **options)
      new.add_document_from_upload(uploaded_file, **options)
    end

    # Convenience wrapper: builds a service instance and calls #process_document.
    def self.process_document(document_id)
      new.process_document(document_id)
    end

    def initialize
      @converter = Ragdoll::DocumentConverter.new
    end

    # Add a document from a file path.
    #
    # @param file_path [String] path of the file to ingest
    # @param options [Hash] :async selects the async processing path; :title,
    #   :file_modified_at and :metadata are stored on the created record
    # @return [Object, nil] the created document, or nil when the file does not exist
    # @raise [ProcessingError] when processing the created document fails
    def add_document(file_path, **options)
      return nil unless File.exist?(file_path)

      document_type = @converter.determine_document_type(file_path)
      text_content = @converter.convert_to_text(file_path, document_type)

      document = create_unified_document(
        location: File.expand_path(file_path),
        document_type: document_type,
        text_content: text_content,
        **options
      )

      run_processing(document, async: options[:async])
      document
    end

    # Add a document from an uploaded file object.
    #
    # The upload is spooled to a temporary file so the converter can work on a
    # path; the temporary file is always closed and removed afterwards.
    #
    # @param uploaded_file [#original_filename] must also respond to #read or #path
    # @param options [Hash] same keys as #add_document
    # @return [Object] the created document
    # @raise [ProcessingError] when the upload cannot be read or processing fails
    def add_document_from_upload(uploaded_file, **options)
      temp_file = create_temp_file_from_upload(uploaded_file)
      document_type = @converter.determine_document_type(temp_file.path)
      text_content = @converter.convert_to_text(temp_file.path, document_type)

      document = create_unified_document(
        location: uploaded_file.original_filename || "uploaded_file",
        document_type: document_type,
        text_content: text_content,
        **options
      )

      run_processing(document, async: options[:async])
      document
    ensure
      # temp_file is nil when creating it raised; close! closes and unlinks.
      temp_file&.close!
    end

    # Process an already stored document by ID.
    #
    # @raise [ProcessingError] when processing fails
    def process_document(document_id)
      process_document_sync(find_document(document_id))
    end

    # Re-run text conversion for a stored document and process it again.
    #
    # @param options [Hash] forwarded to the converter's convert_to_text
    # @return [Object, nil] the document, or nil when its location is not an
    #   existing file
    def reprocess_document(document_id, **options)
      document = find_document(document_id)
      return nil unless File.exist?(document.location)

      document_type = @converter.determine_document_type(document.location)
      text_content = @converter.convert_to_text(document.location, document_type, **options)

      if document.respond_to?(:unified_contents)
        if document.unified_contents.any?
          document.unified_contents.first.update!(content: text_content)
        else
          document.unified_contents.create!(
            content: text_content,
            original_media_type: document_type,
            embedding_model: DEFAULT_EMBEDDING_MODEL,
            metadata: { "reprocessed_at" => Time.current }
          )
        end
      else
        # Fallback to the content field. NOTE(review): only assigned here, not
        # saved; presumably persisted during processing -- confirm.
        document.content = text_content
      end

      process_document_sync(document)
    end

    # Add several files, collecting per-file failures instead of aborting.
    #
    # A path that does not exist is reported under :errors rather than being
    # counted as a success with a nil document.
    #
    # @param file_paths [Array<String>]
    # @return [Hash] :processed, :errors, :total, :success_count, :error_count
    def batch_process_documents(file_paths, **options)
      results = []
      errors = []

      file_paths.each do |file_path|
        begin
          document = add_document(file_path, **options)
          if document
            results << document
          else
            # add_document returns nil only when the file is missing.
            errors << { file_path: file_path, error: "File not found: #{file_path}" }
          end
        rescue StandardError => e
          errors << { file_path: file_path, error: e.message }
        end
      end

      {
        processed: results,
        errors: errors,
        total: file_paths.length,
        success_count: results.length,
        error_count: errors.length
      }
    end

    # Search across all documents by delegating to the document class.
    def search_documents(query, **options)
      document_class.search_content(query, **options)
    end

    # Collect document and content statistics plus a short summary.
    #
    # @return [Hash] :documents, :content, :processing_summary
    def processing_stats
      base_stats = document_class.stats
      content_stats =
        if defined?(Ragdoll::UnifiedDocument)
          Ragdoll::UnifiedContent.stats
        else
          Ragdoll::Content.stats
        end

      {
        documents: base_stats,
        content: content_stats,
        processing_summary: {
          total_documents: base_stats[:total_documents],
          processed_documents: base_stats.dig(:by_status, "processed") || 0,
          total_embeddings: base_stats[:total_embeddings],
          average_processing_time: estimate_average_processing_time
        }
      }
    end

    private

    # Model class used for storage: UnifiedDocument when loaded, else Document.
    def document_class
      defined?(Ragdoll::UnifiedDocument) ? Ragdoll::UnifiedDocument : Ragdoll::Document
    end

    # Look up a stored document by ID on the active document class.
    def find_document(document_id)
      document_class.find(document_id)
    end

    # Route a freshly created document to the async or sync processing path.
    def run_processing(document, async:)
      if async
        process_document_async(document.id)
      else
        process_document_sync(document)
      end
    end

    # Create the document record (and, for unified documents, its content row).
    # The title defaults to one derived from the location.
    def create_unified_document(location:, document_type:, text_content:, **options)
      title = options[:title] || extract_title_from_location(location)

      if defined?(Ragdoll::UnifiedDocument)
        document = Ragdoll::UnifiedDocument.create!(
          location: location,
          title: title,
          document_type: document_type,
          status: "pending",
          file_modified_at: options[:file_modified_at] || Time.current,
          metadata: options[:metadata] || {}
        )

        document.unified_contents.create!(
          content: text_content,
          original_media_type: document_type,
          embedding_model: DEFAULT_EMBEDDING_MODEL,
          metadata: {
            "created_at" => Time.current,
            "conversion_method" => "unified_converter",
            "original_filename" => File.basename(location)
          }
        )
      else
        # Fallback to regular Document, which stores the text on the record itself.
        document = Ragdoll::Document.create!(
          location: location,
          title: title,
          content: text_content,
          document_type: document_type,
          status: "pending",
          file_modified_at: options[:file_modified_at] || Time.current,
          metadata: options[:metadata] || {}
        )
      end

      document
    end

    # Process a document in the calling thread.
    #
    # Uses the document's own process_document! when available; otherwise
    # drives the status transitions and embedding generation here. Any failure
    # is recorded on the document and re-raised as ProcessingError.
    def process_document_sync(document)
      if document.respond_to?(:process_document!)
        document.process_document!
      else
        document.update!(status: "processing")
        generate_embeddings_for_document(document)
        document.update!(status: "processed")
      end

      document
    rescue StandardError => e
      document.update!(status: "error", metadata: (document.metadata || {}).merge("error" => e.message))
      raise ProcessingError, "Failed to process document #{document.id}: #{e.message}"
    end

    # Placeholder for background processing: no job is enqueued yet, so the
    # document is looked up again and processed synchronously.
    def process_document_async(document_id)
      puts "Note: Async processing not implemented, processing synchronously"
      process_document(document_id)
    end

    # Generate embeddings for every content row attached to the document.
    def generate_embeddings_for_document(document)
      if document.respond_to?(:unified_contents)
        document.unified_contents.each(&:generate_embeddings!)
      elsif document.respond_to?(:contents)
        document.contents.each(&:generate_embeddings!)
      end
    end

    # Spool an upload into a Tempfile that keeps the original extension.
    #
    # @return [Tempfile] flushed and rewound; the caller must close and unlink it
    # @raise [ProcessingError] when the upload responds to neither #read nor #path
    def create_temp_file_from_upload(uploaded_file)
      original_name = uploaded_file.original_filename
      temp_file = Tempfile.new([
        File.basename(original_name || "upload", ".*"),
        File.extname(original_name || "")
      ])
      # Uploads may be images or audio; write the bytes untranslated.
      temp_file.binmode

      begin
        if uploaded_file.respond_to?(:read)
          temp_file.write(uploaded_file.read)
        elsif uploaded_file.respond_to?(:path)
          FileUtils.cp(uploaded_file.path, temp_file.path)
        else
          raise ProcessingError, "Unknown upload file format"
        end

        temp_file.flush
        temp_file.rewind
      rescue StandardError
        # The caller never receives the handle on failure, so clean up here.
        temp_file.close!
        raise
      end

      temp_file
    end

    # Derive a human-readable title from a file location:
    # "my_greatFile-name.txt" becomes "My Great File Name".
    def extract_title_from_location(location)
      filename = File.basename(location, File.extname(location))

      title = filename
              .gsub(/[-_]+/, ' ')
              .gsub(/([a-z])([A-Z])/, '\1 \2')
              .gsub(/\s+/, ' ')
              .strip

      title.split(' ').map(&:capitalize).join(' ')
    end

    # Placeholder value; not derived from any measurement.
    def estimate_average_processing_time
      "~2.5 seconds"
    end
  end
end
|
@@ -0,0 +1,87 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# Creates (or tops up) the ragdoll_unified_contents table and its indexes.
#
# Written as explicit up/down instead of `change`: the steps are guarded by
# checks against the live schema and include raw SQL, neither of which can be
# automatically reversed.
class CreateRagdollUnifiedContents < ActiveRecord::Migration[7.0]
  TABLE = :ragdoll_unified_contents
  FULLTEXT_INDEX = "index_ragdoll_unified_contents_on_fulltext_search"
  DOC_MEDIA_TYPE_INDEX = "index_unified_contents_on_doc_and_media_type"

  # Single-column indexes and their comments.
  INDEXED_COLUMNS = {
    embedding_model: "Index for filtering by embedding model",
    original_media_type: "Index for filtering by original media type",
    conversion_method: "Index for filtering by conversion method",
    content_quality_score: "Index for filtering by content quality"
  }.freeze

  def up
    if table_exists?(TABLE)
      add_missing_columns
    else
      create_unified_contents_table
    end

    add_lookup_indexes
    add_fulltext_index
  end

  # Drops the table together with its indexes.
  # NOTE(review): this also removes a table that already existed before `up`
  # ran (the add-missing-columns path) -- confirm that is acceptable.
  def down
    drop_table TABLE, if_exists: true
  end

  private

  def create_unified_contents_table
    create_table TABLE,
                 comment: "Unified content storage for text-based RAG architecture" do |t|
      t.references :document, null: false, foreign_key: { to_table: :ragdoll_documents },
                              comment: "Reference to parent document"

      t.text :content, null: false,
                       comment: "Text content (original text, extracted text, image description, audio transcript, etc.)"

      t.string :original_media_type, null: false,
                                     comment: "Original media type (text, image, audio, video, pdf, docx, html, markdown, unknown)"

      t.string :embedding_model, null: false,
                                 comment: "Embedding model used for this content"

      t.string :conversion_method,
               comment: "Method used to convert to text (text_extraction, image_to_text, audio_transcription, etc.)"

      t.integer :word_count, default: 0,
                             comment: "Number of words in the content"

      t.integer :character_count, default: 0,
                                  comment: "Number of characters in the content"

      t.float :content_quality_score, default: 0.0,
                                      comment: "Quality score of the converted content (0.0-1.0)"

      t.json :metadata, default: {},
                        comment: "Additional metadata about the conversion and content"

      t.timestamps null: false,
                   comment: "Standard creation and update timestamps"
    end
  end

  # Add missing columns to an existing table.
  def add_missing_columns
    add_column TABLE, :original_media_type, :string unless column_exists?(TABLE, :original_media_type)
    add_column TABLE, :conversion_method, :string unless column_exists?(TABLE, :conversion_method)
    add_column TABLE, :word_count, :integer, default: 0 unless column_exists?(TABLE, :word_count)
    add_column TABLE, :character_count, :integer, default: 0 unless column_exists?(TABLE, :character_count)
    add_column TABLE, :content_quality_score, :float, default: 0.0 unless column_exists?(TABLE, :content_quality_score)
  end

  def add_lookup_indexes
    INDEXED_COLUMNS.each do |column, comment|
      add_index TABLE, column, comment: comment unless index_exists?(TABLE, column)
    end

    return if index_exists?(TABLE, [:document_id, :original_media_type], name: DOC_MEDIA_TYPE_INDEX)

    add_index TABLE, [:document_id, :original_media_type],
              name: DOC_MEDIA_TYPE_INDEX,
              comment: "Index for finding content by document and media type"
  end

  # Full-text search index; IF NOT EXISTS keeps the step safe to re-run.
  def add_fulltext_index
    execute <<~SQL
      CREATE INDEX IF NOT EXISTS #{FULLTEXT_INDEX}
      ON ragdoll_unified_contents
      USING gin(to_tsvector('english', COALESCE(content, '')))
    SQL
  end
end
|
data/lib/ragdoll/core/version.rb
CHANGED
data/lib/ragdoll/core.rb
CHANGED
@@ -25,6 +25,8 @@ require_relative "core/shrine_config"
|
|
25
25
|
|
26
26
|
# Require models from app/models/ragdoll
|
27
27
|
require "ragdoll/document"
|
28
|
+
require "ragdoll/unified_document"
|
29
|
+
require "ragdoll/unified_content"
|
28
30
|
require "ragdoll/embedding"
|
29
31
|
require "ragdoll/content"
|
30
32
|
require "ragdoll/text_content"
|
@@ -34,11 +36,16 @@ require "ragdoll/search"
|
|
34
36
|
require "ragdoll/search_result"
|
35
37
|
require "ragdoll/document_processor"
|
36
38
|
require "ragdoll/document_management"
|
39
|
+
require "ragdoll/unified_document_management"
|
40
|
+
require "ragdoll/document_converter"
|
41
|
+
require "ragdoll/migration_service"
|
37
42
|
require "ragdoll/text_chunker"
|
38
43
|
require "ragdoll/embedding_service"
|
39
44
|
require "ragdoll/text_generation_service"
|
40
45
|
require "ragdoll/search_engine"
|
41
46
|
require "ragdoll/image_description_service"
|
47
|
+
require "ragdoll/image_to_text_service"
|
48
|
+
require "ragdoll/text_extraction_service"
|
42
49
|
require "ragdoll/metadata_generator"
|
43
50
|
# Require from app/lib/ragdoll
|
44
51
|
require "ragdoll/metadata_schemas"
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ragdoll
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.11
|
4
|
+
version: 0.1.12
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Dewayne VanHoozer
|
@@ -396,22 +396,31 @@ files:
|
|
396
396
|
- app/models/ragdoll/search.rb
|
397
397
|
- app/models/ragdoll/search_result.rb
|
398
398
|
- app/models/ragdoll/text_content.rb
|
399
|
+
- app/models/ragdoll/unified_content.rb
|
400
|
+
- app/models/ragdoll/unified_document.rb
|
401
|
+
- app/services/ragdoll/audio_to_text_service.rb
|
399
402
|
- app/services/ragdoll/configuration_service.rb
|
403
|
+
- app/services/ragdoll/document_converter.rb
|
400
404
|
- app/services/ragdoll/document_management.rb
|
401
405
|
- app/services/ragdoll/document_processor.rb
|
402
406
|
- app/services/ragdoll/embedding_service.rb
|
403
407
|
- app/services/ragdoll/image_description_service.rb
|
408
|
+
- app/services/ragdoll/image_to_text_service.rb
|
404
409
|
- app/services/ragdoll/metadata_generator.rb
|
410
|
+
- app/services/ragdoll/migration_service.rb
|
405
411
|
- app/services/ragdoll/model_resolver.rb
|
406
412
|
- app/services/ragdoll/search_engine.rb
|
407
413
|
- app/services/ragdoll/text_chunker.rb
|
414
|
+
- app/services/ragdoll/text_extraction_service.rb
|
408
415
|
- app/services/ragdoll/text_generation_service.rb
|
416
|
+
- app/services/ragdoll/unified_document_management.rb
|
409
417
|
- db/migrate/20250815234901_enable_postgresql_extensions.rb
|
410
418
|
- db/migrate/20250815234902_create_ragdoll_documents.rb
|
411
419
|
- db/migrate/20250815234903_create_ragdoll_embeddings.rb
|
412
420
|
- db/migrate/20250815234904_create_ragdoll_contents.rb
|
413
421
|
- db/migrate/20250815234905_create_ragdoll_searches.rb
|
414
422
|
- db/migrate/20250815234906_create_ragdoll_search_results.rb
|
423
|
+
- db/migrate/20250923000001_create_ragdoll_unified_contents.rb
|
415
424
|
- lib/ragdoll-core.rb
|
416
425
|
- lib/ragdoll.rb
|
417
426
|
- lib/ragdoll/core.rb
|
@@ -447,7 +456,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
447
456
|
- !ruby/object:Gem::Version
|
448
457
|
version: '0'
|
449
458
|
requirements: []
|
450
|
-
rubygems_version: 3.7.
|
459
|
+
rubygems_version: 3.7.2
|
451
460
|
specification_version: 4
|
452
461
|
summary: Multi-Modal Retrieval Augmented Generation
|
453
462
|
test_files: []
|