ragdoll 0.1.11 → 0.1.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -40,7 +40,7 @@ module Ragdoll
40
40
  validates :location, presence: true
41
41
  validates :title, presence: true
42
42
  validates :document_type, presence: true,
43
- inclusion: { in: %w[text image audio pdf docx html markdown mixed] }
43
+ inclusion: { in: %w[text image audio pdf docx html markdown csv json xml mixed] }
44
44
  validates :summary, presence: false # Allow empty summaries initially
45
45
  validates :keywords, presence: false # Allow empty keywords initially
46
46
  validates :status, inclusion: { in: %w[pending processing processed error] }
@@ -0,0 +1,216 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "active_record"
4
+
5
module Ragdoll
  # Unified content model for the text-based RAG system.
  #
  # All content types (text, image, audio, video) are converted to text and
  # stored in a single +content+ field for unified embedding generation.
  class UnifiedContent < ActiveRecord::Base
    self.table_name = "ragdoll_unified_contents"

    # Values accepted for +original_media_type+. Ragdoll::UnifiedDocument
    # copies its +document_type+ into this column when it creates content, so
    # this list must cover every document type that model accepts (csv, json,
    # xml and yaml included); otherwise creating content for such documents
    # fails validation.
    MEDIA_TYPES = %w[
      text image audio video pdf docx html markdown csv json xml yaml unknown
    ].freeze

    # Media types that #text_content? reports as textual.
    TEXT_MEDIA_TYPES = %w[text markdown html pdf docx].freeze

    belongs_to :document,
               class_name: "Ragdoll::Document",
               foreign_key: "document_id"

    has_many :embeddings,
             class_name: "Ragdoll::Embedding",
             as: :embeddable,
             dependent: :destroy

    validates :content, presence: true
    validates :embedding_model, presence: true
    validates :document_id, presence: true
    validates :original_media_type, presence: true,
                                    inclusion: { in: MEDIA_TYPES }

    # JSON columns are handled natively by PostgreSQL

    scope :by_media_type, ->(media_type) { where(original_media_type: media_type) }
    scope :with_embeddings, -> { joins(:embeddings).distinct }
    scope :without_embeddings, -> { left_joins(:embeddings).where(embeddings: { id: nil }) }

    # Chunks +content+, creates one embedding record per chunk and stamps
    # "embeddings_generated_at" into the metadata.
    #
    # Does nothing unless #should_generate_embeddings? is true. A failure on
    # an individual chunk is reported on stderr and that chunk is skipped; the
    # remaining chunks are still processed.
    #
    # @return [Boolean, nil] result of the final +update!+, or nil when skipped
    # @raise [ActiveRecord::RecordInvalid] if the final metadata update fails
    def generate_embeddings!
      return unless should_generate_embeddings?

      # With the current guard there are never embeddings left at this point;
      # kept so that the method stays correct if the guard is ever relaxed.
      embeddings.destroy_all

      chunks = Ragdoll::TextChunker.chunk(content)
      embedding_service = Ragdoll::EmbeddingService.new

      chunks.each_with_index do |chunk_text, index|
        begin
          vector = embedding_service.generate_embedding(chunk_text)

          embeddings.create!(
            content: chunk_text,
            embedding_vector: vector,
            chunk_index: index
          )
        rescue StandardError => e
          # Best effort: one bad chunk must not abort the whole run.
          warn "Failed to generate embedding for chunk #{index}: #{e.message}"
        end
      end

      update!(metadata: metadata_hash.merge("embeddings_generated_at" => Time.current))
    end

    # Whether embeddings should be generated: there is content and no
    # embeddings exist yet.
    #
    # @return [Boolean]
    def should_generate_embeddings?
      content.present? && embeddings.empty?
    end

    # Number of whitespace-separated words in +content+.
    #
    # Counts runs of non-whitespace characters, so leading or trailing
    # whitespace does not add a phantom empty "word".
    #
    # @return [Integer] 0 when content is blank
    def word_count
      return 0 unless content.present?

      content.scan(/\S+/).length
    end

    # @return [Integer] length of +content+, 0 when it is nil
    def character_count
      content&.length || 0
    end

    # @return [Integer] number of embedding records attached to this content
    def embedding_count
      embeddings.count
    end

    # Media type specific accessors for backward compatibility

    # @return [Boolean] true for text, markdown, html, pdf and docx sources
    def text_content?
      TEXT_MEDIA_TYPES.include?(original_media_type)
    end

    # @return [Boolean]
    def image_content?
      original_media_type == "image"
    end

    # @return [Boolean]
    def audio_content?
      original_media_type == "audio"
    end

    # @return [Boolean]
    def video_content?
      original_media_type == "video"
    end

    # Original media metadata. All accessors below read from / write to the
    # +metadata+ hash and tolerate a nil +metadata+ column.

    # @return [Object, nil] value stored under "original_filename"
    def original_filename
      metadata_hash["original_filename"]
    end

    def original_filename=(value)
      self.metadata = metadata_hash.merge("original_filename" => value)
    end

    # @return [Object] value stored under "file_size", 0 when absent
    def file_size
      metadata_hash["file_size"] || 0
    end

    def file_size=(value)
      self.metadata = metadata_hash.merge("file_size" => value)
    end

    # @return [Object, nil] value stored under "conversion_method"
    def conversion_method
      metadata_hash["conversion_method"]
    end

    def conversion_method=(value)
      self.metadata = metadata_hash.merge("conversion_method" => value)
    end

    # Image-specific metadata (for backward compatibility)

    # @return [Object, nil] value stored under "width"
    def image_width
      metadata_hash["width"]
    end

    # @return [Object, nil] value stored under "height"
    def image_height
      metadata_hash["height"]
    end

    # @return [Hash{Symbol=>Object}, nil] +{ width:, height: }+, or nil unless
    #   both values are present
    def image_dimensions
      width = image_width
      height = image_height
      return nil unless width && height

      { width: width, height: height }
    end

    # Audio-specific metadata

    # @return [Object, nil] value stored under "duration"
    def audio_duration
      metadata_hash["duration"]
    end

    def audio_duration=(value)
      self.metadata = metadata_hash.merge("duration" => value)
    end

    # Heuristic quality score for this content.
    #
    # Composition: 0.3 for having content at all, up to 0.4 from the word
    # count bucket and up to 0.3 from the original media type.
    #
    # @return [Float] score in 0.0..1.0 (0.0 for blank content)
    def content_quality_score
      return 0.0 if content.blank?

      score = 0.3 # base score for having content
      words = word_count
      score += length_quality_score(words) * 0.4 if words > 0
      score += media_type_quality_score * 0.3

      [score, 1.0].min # Cap at 1.0
    end

    # Full-text search over +content+ using PostgreSQL +plainto_tsquery+.
    #
    # @param query [String] free-text query; blank returns an empty relation
    # @param options [Hash] +:limit+ caps the result size (default 20)
    # @return [ActiveRecord::Relation]
    def self.search_content(query, **options)
      return none if query.blank?

      where(
        "to_tsvector('english', COALESCE(content, '')) @@ plainto_tsquery('english', ?)",
        query
      ).limit(options[:limit] || 20)
    end

    # Aggregate statistics over the current scope.
    #
    # +average_word_count+ is an SQL approximation (number of spaces + 1) and
    # the quality distribution is bucketed purely by character length.
    #
    # @return [Hash{Symbol=>Object}]
    def self.stats
      {
        total_contents: count,
        by_media_type: group(:original_media_type).count,
        by_model: group(:embedding_model).count,
        total_embeddings: joins(:embeddings).count,
        with_embeddings: with_embeddings.count,
        without_embeddings: without_embeddings.count,
        average_word_count: average("LENGTH(content) - LENGTH(REPLACE(content, ' ', '')) + 1"),
        average_character_count: average("LENGTH(content)"),
        content_quality_distribution: {
          high: where("LENGTH(content) > 1000").count,
          medium: where("LENGTH(content) BETWEEN 100 AND 1000").count,
          low: where("LENGTH(content) < 100").count
        }
      }
    end

    private

    # The metadata hash, or an empty hash when the column is nil, so that
    # readers and writers never call Hash methods on nil.
    def metadata_hash
      metadata || {}
    end

    # Length component of the quality score for a given word count.
    def length_quality_score(words)
      case words
      when 0..10 then 0.1
      when 11..50 then 0.5
      when 51..500 then 1.0
      when 501..2000 then 0.9
      when 2001..5000 then 0.7
      else 0.5
      end
    end

    # Media-type component of the quality score. Content containing an
    # "<Kind> file:" marker scores lower.
    # NOTE(review): presumably that marker is the placeholder text written
    # when conversion could not extract real content - confirm with the
    # converter.
    def media_type_quality_score
      case original_media_type
      when "text", "markdown" then 1.0
      when "pdf", "docx", "html" then 0.9
      when "image" then content.include?("Image file:") ? 0.3 : 0.8
      when "audio" then content.include?("Audio file:") ? 0.3 : 0.8
      when "video" then content.include?("Video file:") ? 0.3 : 0.7
      else 0.5
      end
    end
  end
end
@@ -0,0 +1,338 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "active_record"
4
+
5
module Ragdoll
  # Unified document model for the text-based RAG system.
  #
  # All documents have their content converted to text (stored in
  # Ragdoll::UnifiedContent rows) for unified search and embedding.
  class UnifiedDocument < ActiveRecord::Base
    self.table_name = "ragdoll_documents"

    # Values accepted for +document_type+.
    DOCUMENT_TYPES = %w[
      text image audio video pdf docx html markdown csv json xml yaml unknown
    ].freeze

    # Values accepted for +status+.
    STATUSES = %w[pending processing processed error].freeze

    # Location prefixes that are treated as remote resources rather than
    # file-system paths (never expanded, never stat'ed).
    REMOTE_LOCATION_PREFIXES = %w[http:// https:// ftp:// sftp://].freeze

    # Default number of rows returned by .search_content.
    DEFAULT_SEARCH_LIMIT = 20

    # Unified content relationship - all content converted to text
    has_many :unified_contents,
             class_name: "Ragdoll::UnifiedContent",
             foreign_key: "document_id",
             dependent: :destroy

    # All embeddings through unified content
    has_many :embeddings, through: :unified_contents

    validates :location, presence: true
    validates :title, presence: true
    validates :document_type, presence: true,
                              inclusion: { in: DOCUMENT_TYPES }
    validates :status, inclusion: { in: STATUSES }
    validates :file_modified_at, presence: true

    # Ensure location is always an absolute path for file paths
    before_validation :normalize_location
    before_validation :set_default_file_modified_at

    scope :processed, -> { where(status: "processed") }
    scope :by_type, ->(type) { where(document_type: type) }
    scope :recent, -> { order(created_at: :desc) }
    scope :with_content, -> { joins(:unified_contents).distinct }
    scope :without_content, -> { left_joins(:unified_contents).where(unified_contents: { id: nil }) }

    # Content assigned before the record was saved is written once the
    # transaction commits.
    after_commit :create_unified_content_from_pending, on: %i[create update],
                                                       if: :has_pending_content?

    # @return [Boolean] true when status is "processed"
    def processed?
      status == "processed"
    end

    # All unified content of this document joined with blank lines.
    #
    # @return [String] empty string when there is no content
    def content
      unified_contents.pluck(:content).compact.join("\n\n")
    end

    # Assigns text content. On a persisted record the content row is written
    # immediately; otherwise the value is held until the after_commit
    # callback runs.
    #
    # @param value [String]
    def content=(value)
      @pending_content = value

      return unless persisted?

      create_unified_content_from_pending
    end

    # Content statistics

    # @return [Integer] sum of the word counts of all content rows
    def total_word_count
      unified_contents.sum(&:word_count)
    end

    # @return [Integer] sum of the character counts of all content rows
    def total_character_count
      unified_contents.sum(&:character_count)
    end

    # @return [Integer] number of embeddings reachable through the content rows
    def total_embedding_count
      embeddings.count
    end

    # Runs the whole pipeline: convert to text, store the unified content,
    # generate embeddings and metadata, then mark the document "processed".
    #
    # On failure the document is marked "error" (best effort) and the
    # original exception is re-raised.
    #
    # @return [Boolean, nil] nil when the document was already processed
    # @raise [StandardError] whatever the failing step raised
    def process_document!
      return if processed?

      begin
        update!(status: "processing")

        text_content = Ragdoll::DocumentConverter.convert_to_text(location, document_type)
        create_or_update_unified_content(text_content)
        generate_embeddings_for_content!
        generate_metadata!

        update!(status: "processed")
      rescue StandardError => e
        warn "Document processing failed: #{e.message}"
        record_processing_error(e)
        # Re-raise the original error explicitly so that a failure while
        # recording it can never replace it.
        raise e
      end
    end

    # Generate embeddings for all content
    def generate_embeddings_for_content!
      unified_contents.each(&:generate_embeddings!)
    end

    # Derives basic metadata (lengths, timestamp, media type and a
    # type-specific source marker) from the content and saves it.
    #
    # Keys are written as strings so they merge cleanly with the string keys
    # of metadata loaded from the JSON column instead of creating
    # symbol/string duplicates. Failures are reported on stderr and
    # swallowed: metadata generation is best effort.
    #
    # @return [Boolean, nil]
    def generate_metadata!
      full_content = content
      return if full_content.blank?

      generated_metadata = {
        "content_length" => full_content.length,
        # scan(/\S+/) ignores leading whitespace, unlike split(/\s+/).
        "word_count" => full_content.scan(/\S+/).length,
        "generated_at" => Time.current,
        "original_media_type" => document_type
      }

      # Add document type specific metadata
      case document_type
      when "image"
        generated_metadata["description_source"] = "ai_generated"
      when "audio"
        generated_metadata["transcript_source"] = "auto_generated"
      when "video"
        generated_metadata["content_source"] = "mixed_media_conversion"
      end

      self.metadata = current_metadata.merge(generated_metadata)
      save!
    rescue StandardError => e
      warn "Metadata generation failed: #{e.message}"
    end

    # Search title and unified content using PostgreSQL full-text search.
    #
    # The query is split into alphanumeric words. A document matches when any
    # content row (combined with the title) matches at least one word. Its
    # +fulltext_similarity+ is the best fraction of query words matched by a
    # single row.
    #
    # @param query [String] free-text query
    # @param options [Hash]
    #   +:limit+ (default 20), +:threshold+ (minimum similarity, default 0.0),
    #   +:status+ (default "processed"), +:document_type+ (optional filter)
    # @return [ActiveRecord::Relation, Array<UnifiedDocument>] an empty
    #   relation for a blank query, otherwise the loaded records with extra
    #   +content+ and +fulltext_similarity+ attributes
    # @raise [ArgumentError, TypeError] if +:threshold+ is not numeric
    def self.search_content(query, **options)
      return none if query.blank?

      words = query.downcase.scan(/[[:alnum:]]+/).uniq
      return none if words.empty?

      limit = options[:limit] || DEFAULT_SEARCH_LIMIT
      threshold = Float(options[:threshold] || 0.0)
      status = options[:status] || "processed"

      # Always qualify columns with the real table names: the association is
      # called unified_contents, the table is not.
      documents = quoted_table_name
      contents = Ragdoll::UnifiedContent.quoted_table_name

      text_expr = "COALESCE(#{documents}.title, '') || ' ' || COALESCE(#{contents}.content, '')"
      tsvector = "to_tsvector('english', #{text_expr})"

      # Words are alphanumeric only, and are additionally quoted here.
      tsqueries = words.map do |word|
        sanitize_sql_array(["plainto_tsquery('english', ?)", word])
      end
      combined_tsquery = tsqueries.join(" || ")

      score_sum = tsqueries.map { |tsq| "(#{tsvector} @@ #{tsq})::int" }.join(" + ")
      similarity_sql = "(#{score_sum})::float / #{words.size}"

      # similarity_sql reads the joined content column, so it has to be
      # aggregated to be legal alongside GROUP BY on the document id.
      results = joins(:unified_contents)
                .select("#{documents}.*, " \
                        "string_agg(#{contents}.content, ' ') AS content, " \
                        "MAX(#{similarity_sql}) AS fulltext_similarity")
                .group("#{documents}.id")
                .where("#{tsvector} @@ (#{combined_tsquery})")
                .where("#{documents}.status = ?", status)

      if options[:document_type].present?
        results = results.where("#{documents}.document_type = ?", options[:document_type])
      end

      results = results.where("#{similarity_sql} >= ?", threshold) if threshold > 0.0

      results
        .order(Arel.sql("fulltext_similarity DESC, #{documents}.updated_at DESC"))
        .limit(limit)
        .to_a
    end

    # Average quality score of all content rows.
    #
    # @return [Float] 0.0 when the document has no content
    def content_quality_score
      scores = unified_contents.map(&:content_quality_score)
      return 0.0 if scores.empty?

      scores.sum / scores.length
    end

    # @return [Boolean] true when the average quality score is at least 0.7
    def high_quality_content?
      content_quality_score >= 0.7
    end

    # Get all unique original media types
    #
    # @return [Array<String>] sorted, without nils
    def self.all_media_types
      media_type_column = "#{Ragdoll::UnifiedContent.quoted_table_name}.original_media_type"

      joins(:unified_contents).distinct.pluck(Arel.sql(media_type_column)).compact.sort
    end

    # Aggregate statistics over the current scope. The quality buckets count
    # documents having at least one content row in the given length range.
    #
    # @return [Hash{Symbol=>Object}]
    def self.stats
      content_length = "LENGTH(#{Ragdoll::UnifiedContent.quoted_table_name}.content)"
      with_contents = joins(:unified_contents)

      {
        total_documents: count,
        by_status: group(:status).count,
        by_type: group(:document_type).count,
        with_content: with_content.count,
        without_content: without_content.count,
        total_unified_contents: with_contents.count,
        total_embeddings: joins(:embeddings).count,
        content_quality: {
          high: with_contents.where("#{content_length} > 1000").distinct.count,
          medium: with_contents.where("#{content_length} BETWEEN 100 AND 1000").distinct.count,
          low: with_contents.where("#{content_length} < 100").distinct.count
        },
        storage_type: "unified_text_based"
      }
    end

    # Convert document to hash representation
    #
    # @param include_content [Boolean] also include the full text and
    #   per-content details
    # @return [Hash{Symbol=>Object}]
    def to_hash(include_content: false)
      full_content = content # fetched once, reused below

      {
        id: id.to_s,
        title: title,
        location: location,
        document_type: document_type,
        status: status,
        content_length: full_content&.length || 0,
        word_count: total_word_count,
        embedding_count: total_embedding_count,
        content_quality_score: content_quality_score,
        file_modified_at: file_modified_at&.iso8601,
        created_at: created_at&.iso8601,
        updated_at: updated_at&.iso8601,
        metadata: current_metadata
      }.tap do |hash|
        if include_content
          hash[:content] = full_content
          hash[:content_details] = unified_contents.map do |uc|
            {
              original_media_type: uc.original_media_type,
              content: uc.content,
              word_count: uc.word_count,
              embedding_count: uc.embedding_count,
              conversion_method: uc.conversion_method
            }
          end
        end
      end
    end

    private

    # The metadata hash, or an empty hash when the column is nil.
    def current_metadata
      metadata || {}
    end

    # Callback guard: content was assigned but not yet written.
    def has_pending_content?
      @pending_content.present?
    end

    # Writes the content held by #content= and clears the pending value
    # first, so a re-entrant callback cannot write it twice.
    def create_unified_content_from_pending
      return unless @pending_content.present?

      value = @pending_content
      @pending_content = nil

      create_or_update_unified_content(value)
    end

    # Stores +text_content+ in the first unified content row, creating the
    # row when the document has none. Blank text is ignored.
    #
    # NOTE(review): an updated row keeps its existing embeddings, and
    # UnifiedContent#should_generate_embeddings? skips rows that already have
    # some, so embeddings are not refreshed after a content change - confirm
    # this is intended. "manually_set" is also written when the text comes
    # from the converter.
    def create_or_update_unified_content(text_content)
      return if text_content.blank?

      existing = unified_contents.first

      if existing
        existing.update!(
          content: text_content,
          metadata: (existing.metadata || {}).merge(
            "updated_at" => Time.current,
            "manually_set" => true
          )
        )
      else
        unified_contents.create!(
          content: text_content,
          original_media_type: document_type,
          embedding_model: default_embedding_model,
          metadata: {
            "created_at" => Time.current,
            "conversion_method" => "unified_converter",
            "original_filename" => File.basename(location)
          }
        )
      end
    end

    def default_embedding_model
      "text-embedding-3-large"
    end

    # Marks the document as failed without letting a persistence problem
    # (for example a validation error) hide the error that caused it.
    def record_processing_error(error)
      update!(status: "error", metadata: current_metadata.merge("error" => error.message))
    rescue StandardError => e
      warn "Failed to record processing error: #{e.message}"
    end

    # True when +location+ starts with one of REMOTE_LOCATION_PREFIXES.
    # Callers must ensure +location+ is present.
    def remote_location?
      location.start_with?(*REMOTE_LOCATION_PREFIXES)
    end

    # Normalize location to absolute path for file paths; remote locations
    # are left untouched.
    def normalize_location
      return if location.blank? || remote_location?

      self.location = File.expand_path(location)
    end

    # Set default file_modified_at if not provided: the file's mtime when
    # location is an existing local file, the current time otherwise (remote
    # locations, missing files, blank location).
    def set_default_file_modified_at
      return if file_modified_at.present?

      self.file_modified_at = local_file_mtime || Time.current
    end

    # mtime of the local file behind +location+, or nil when location is
    # blank, remote or does not exist.
    def local_file_mtime
      return nil if location.blank? || remote_location?

      expanded_location = File.expand_path(location)
      File.mtime(expanded_location) if File.exist?(expanded_location)
    end
  end
end