ragdoll 0.1.1 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +1 -1
  3. data/Rakefile +52 -1
  4. data/app/jobs/ragdoll/extract_keywords_job.rb +28 -0
  5. data/app/jobs/ragdoll/extract_text_job.rb +38 -0
  6. data/app/jobs/ragdoll/generate_embeddings_job.rb +28 -0
  7. data/app/jobs/ragdoll/generate_summary_job.rb +25 -0
  8. data/app/lib/ragdoll/metadata_schemas.rb +332 -0
  9. data/app/models/ragdoll/audio_content.rb +142 -0
  10. data/app/models/ragdoll/content.rb +95 -0
  11. data/app/models/ragdoll/document.rb +611 -0
  12. data/app/models/ragdoll/embedding.rb +176 -0
  13. data/app/models/ragdoll/image_content.rb +194 -0
  14. data/app/models/ragdoll/text_content.rb +137 -0
  15. data/app/services/ragdoll/configuration_service.rb +113 -0
  16. data/app/services/ragdoll/document_management.rb +108 -0
  17. data/app/services/ragdoll/document_processor.rb +342 -0
  18. data/app/services/ragdoll/embedding_service.rb +202 -0
  19. data/app/services/ragdoll/image_description_service.rb +230 -0
  20. data/app/services/ragdoll/metadata_generator.rb +329 -0
  21. data/app/services/ragdoll/model_resolver.rb +72 -0
  22. data/app/services/ragdoll/search_engine.rb +51 -0
  23. data/app/services/ragdoll/text_chunker.rb +208 -0
  24. data/app/services/ragdoll/text_generation_service.rb +355 -0
  25. data/lib/ragdoll/core/client.rb +32 -41
  26. data/lib/ragdoll/core/configuration.rb +140 -156
  27. data/lib/ragdoll/core/database.rb +1 -1
  28. data/lib/ragdoll/core/model.rb +45 -0
  29. data/lib/ragdoll/core/version.rb +1 -1
  30. data/lib/ragdoll/core.rb +35 -17
  31. data/lib/ragdoll.rb +1 -1
  32. data/lib/tasks/annotate.rake +1 -1
  33. data/lib/tasks/db.rake +2 -2
  34. metadata +24 -20
  35. data/lib/ragdoll/core/document_management.rb +0 -110
  36. data/lib/ragdoll/core/document_processor.rb +0 -344
  37. data/lib/ragdoll/core/embedding_service.rb +0 -183
  38. data/lib/ragdoll/core/jobs/extract_keywords.rb +0 -32
  39. data/lib/ragdoll/core/jobs/extract_text.rb +0 -42
  40. data/lib/ragdoll/core/jobs/generate_embeddings.rb +0 -32
  41. data/lib/ragdoll/core/jobs/generate_summary.rb +0 -29
  42. data/lib/ragdoll/core/metadata_schemas.rb +0 -334
  43. data/lib/ragdoll/core/models/audio_content.rb +0 -175
  44. data/lib/ragdoll/core/models/content.rb +0 -126
  45. data/lib/ragdoll/core/models/document.rb +0 -678
  46. data/lib/ragdoll/core/models/embedding.rb +0 -204
  47. data/lib/ragdoll/core/models/image_content.rb +0 -227
  48. data/lib/ragdoll/core/models/text_content.rb +0 -169
  49. data/lib/ragdoll/core/search_engine.rb +0 -50
  50. data/lib/ragdoll/core/services/image_description_service.rb +0 -230
  51. data/lib/ragdoll/core/services/metadata_generator.rb +0 -335
  52. data/lib/ragdoll/core/text_chunker.rb +0 -210
  53. data/lib/ragdoll/core/text_generation_service.rb +0 -360
@@ -0,0 +1,176 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "active_record"
4
+ require "neighbor"
5
+
6
module Ragdoll
  # ActiveRecord model backed by the +ragdoll_embeddings+ table. Each row holds
  # one embedded chunk (+content+, +embedding_vector+, +chunk_index+) that
  # belongs to a polymorphic +embeddable+ record, together with usage
  # statistics (+usage_count+, +returned_at+) that feed into search ranking.
  class Embedding < ActiveRecord::Base
    self.table_name = "ragdoll_embeddings"

    # Use pgvector for vector similarity search
    has_neighbors :embedding_vector

    belongs_to :embeddable, polymorphic: true

    validates :embeddable_id, presence: true
    validates :embeddable_type, presence: true
    validates :chunk_index, presence: true, uniqueness: { scope: %i[embeddable_id embeddable_type] }
    validates :embedding_vector, presence: true
    validates :content, presence: true

    # Restricts to embeddings whose embeddable row in ragdoll_contents
    # (the STI table for all content types) uses the given embedding model.
    scope :by_model, lambda { |model|
      where(
        "embeddable_id IN (SELECT id FROM ragdoll_contents WHERE embedding_model = ?)",
        model
      )
    }
    scope :recent, -> { order(created_at: :desc) }
    scope :frequently_used, -> { where("usage_count > 0").order(usage_count: :desc) }
    scope :by_chunk_order, -> { order(:chunk_index) }
    scope :by_embeddable_type, ->(type) { where(embeddable_type: type) }
    scope :text_embeddings, -> { where(embeddable_type: "Ragdoll::TextContent") }
    scope :image_embeddings, -> { where(embeddable_type: "Ragdoll::ImageContent") }
    scope :audio_embeddings, -> { where(embeddable_type: "Ragdoll::AudioContent") }

    # JSON columns are handled natively by PostgreSQL - no serialization needed

    # Callback for vector column updates (no-op for pgvector)
    before_save :update_vector_columns

    # @return [Integer] length of the stored vector, or 0 when none is set
    def embedding_dimensions
      embedding_vector&.length || 0
    end

    # Access embedding_model via polymorphic relationship.
    # @return the embeddable's embedding_model, or nil without an embeddable
    def embedding_model
      embeddable&.embedding_model
    end

    # Bumps +usage_count+ by one and stamps +returned_at+ with the current time.
    def mark_as_used!
      increment!(:usage_count)
      update!(returned_at: Time.current)
    end

    # PostgreSQL pgvector similarity search using the neighbor gem.
    #
    # @param query_embedding [Array<Numeric>] vector to search with
    # @param limit [Integer] maximum number of results returned
    # @param threshold [Float] minimum similarity (1 - cosine distance) to keep
    # @param filters [Hash] optional :embeddable_id, :embeddable_type,
    #   :embedding_model and :document_type restrictions
    # @return [Array<Hash>] result hashes as built by .build_result_hash,
    #   ordered by descending combined score
    def self.search_similar(query_embedding, limit: 20, threshold: 0.8, filters: {})
      scope = all
      scope = scope.where(embeddable_id: filters[:embeddable_id]) if filters[:embeddable_id]
      scope = scope.where(embeddable_type: filters[:embeddable_type]) if filters[:embeddable_type]
      scope = scope.by_model(filters[:embedding_model]) if filters[:embedding_model]

      # Document-level filters require joining through embeddable (STI Content) to documents
      if filters[:document_type]
        scope = scope
                .joins("JOIN ragdoll_contents ON ragdoll_contents.id = ragdoll_embeddings.embeddable_id")
                .joins("JOIN ragdoll_documents ON ragdoll_documents.id = ragdoll_contents.document_id")
                .where("ragdoll_documents.document_type = ?", filters[:document_type])
      end

      search_with_pgvector(query_embedding, scope, limit, threshold)
    end

    # Runs the nearest-neighbour query on +scope+, drops candidates below
    # +threshold+, ranks the rest by similarity plus usage score, and records
    # the returned embeddings as used.
    #
    # @return [Array<Hash>] at most +limit+ result hashes
    def self.search_with_pgvector(query_embedding, scope, limit, threshold)
      candidates = scope
                   .includes(:embeddable)
                   .nearest_neighbors(:embedding_vector, query_embedding, distance: "cosine")
                   .limit(limit * 2) # Get more to filter by threshold

      results = []
      highest_similarity = 0.0

      candidates.each do |embedding|
        # neighbor returns a distance; convert it to a similarity
        similarity = 1.0 - embedding.neighbor_distance

        highest_similarity = similarity if similarity > highest_similarity
        next if similarity < threshold

        usage_score = calculate_usage_score(embedding)
        results << build_result_hash(embedding, query_embedding, similarity, highest_similarity,
                                     usage_score, similarity + usage_score)
      end

      # Sort by combined score and limit
      results = results.sort_by { |result| -result[:combined_score] }.take(limit)
      mark_embeddings_as_used(results)
      results
    end

    # NOTE: the class-level helpers below are public. In the original layout
    # they sat under a bare `private`, which does not apply to `def self.`
    # methods, so their visibility is unchanged here.

    # Calculate usage score for ranking: 0.7 * frequency + 0.3 * recency.
    # Frequency is log(count + 1) / log(100) capped at 1.0; recency decays as
    # exp(-days_since_use / 30). A nil usage_count is treated as 0, matching
    # the nil handling in .build_result_hash.
    #
    # @return [Float] 0.0 when the embedding has never been returned
    def self.calculate_usage_score(embedding)
      usage_count = embedding.usage_count.to_i
      return 0.0 unless embedding.returned_at && usage_count.positive?

      frequency_weight = 0.7
      recency_weight = 0.3

      frequency_score = [Math.log(usage_count + 1) / Math.log(100), 1.0].min
      days_since_use = (Time.current - embedding.returned_at) / 1.day
      recency_score = Math.exp(-days_since_use / 30)

      frequency_weight * frequency_score + recency_weight * recency_score
    end

    # Build standardized result hash. Document fields fall back to "Unknown"
    # when the embeddable or its document is missing.
    def self.build_result_hash(embedding, query_embedding, similarity, highest_similarity, usage_score,
                               combined_score)
      embeddable = embedding.embeddable
      document = embeddable&.document

      {
        embedding_id: embedding.id.to_s,
        embeddable_id: embedding.embeddable_id.to_s,
        embeddable_type: embedding.embeddable_type,
        document_id: embeddable&.document_id&.to_s || "Unknown",
        document_title: document&.title || "Unknown",
        document_location: document&.location || "Unknown",
        content: embedding.content,
        similarity: similarity,
        highest_similarity: highest_similarity,
        distance: 1.0 - similarity,
        chunk_index: embedding.chunk_index,
        embedding_dimensions: query_embedding.length,
        embedding_model: embeddable&.embedding_model,
        usage_count: embedding.usage_count || 0,
        returned_at: embedding.returned_at,
        usage_score: usage_score,
        combined_score: combined_score
      }
    end

    # Mark embeddings as used for analytics: one UPDATE that increments
    # usage_count and sets returned_at for every returned embedding.
    # NOTE(review): `usage_count + 1` stays NULL for rows whose usage_count is
    # NULL - confirm the column has a non-null default.
    def self.mark_embeddings_as_used(results)
      return if results.empty?

      embedding_ids = results.map { |result| result[:embedding_id] }
      where(id: embedding_ids).update_all(
        usage_count: arel_table[:usage_count] + 1,
        returned_at: Time.current
      )
    end

    # Cosine similarity of two equal-length numeric vectors.
    #
    # @return [Float] 0.0 when either vector is nil, lengths differ, or either
    #   vector has zero magnitude
    def self.cosine_similarity(vec1, vec2)
      return 0.0 if vec1.nil? || vec2.nil? || vec1.length != vec2.length

      dot_product = vec1.zip(vec2).sum { |a, b| a * b }
      magnitude1 = Math.sqrt(vec1.sum { |a| a * a })
      magnitude2 = Math.sqrt(vec2.sum { |a| a * a })

      return 0.0 if magnitude1 == 0.0 || magnitude2 == 0.0

      dot_product / (magnitude1 * magnitude2)
    end

    private

    # Callback to update vector columns when embedding_vector changes.
    # No additional processing needed for pgvector.
    def update_vector_columns; end
  end
end
@@ -0,0 +1,194 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "active_record"
4
+ require_relative "content"
5
+
6
module Ragdoll
  # Content subclass for images. The +content+ column stores the textual
  # description used for embedding, +data+ stores the image data (or a file
  # path), and file-level properties live in the +metadata+ hash.
  class ImageContent < Content
    # File extensions treated as image files by #image_file?
    IMAGE_EXTENSIONS = %w[.jpg .jpeg .png .gif .bmp .webp .svg .ico .tiff .tif].freeze

    validate :image_data_or_description_present

    scope :recent, -> { order(created_at: :desc) }
    scope :with_images, -> { where.not(data: [nil, ""]) }
    scope :with_descriptions, -> { where.not(content: [nil, ""]) }

    # Image content accessors - content field stores description for embedding
    def description
      content
    end

    def description=(value)
      self.content = value
    end

    # Image file data accessor
    def image_data
      data
    end

    def image_data=(value)
      self.data = value
    end

    # Image-specific technical metadata (raw file properties)
    # This metadata is about the actual image file data, not AI-generated insights
    def alt_text
      metadata_or_empty["alt_text"]
    end

    def alt_text=(value)
      self.metadata = metadata_or_empty.merge("alt_text" => value)
    end

    # @return [Integer] number of embeddings attached to this content
    def embedding_count
      embeddings.count
    end

    # @return [Boolean] true when +data+ is present
    def image_attached?
      data.present?
    end

    # @return the "file_size" metadata entry, or 0 when it is missing
    def image_size
      metadata_or_empty["file_size"] || 0
    end

    def image_size=(value)
      self.metadata = metadata_or_empty.merge("file_size" => value)
    end

    def image_content_type
      metadata_or_empty["content_type"]
    end

    def image_content_type=(value)
      self.metadata = metadata_or_empty.merge("content_type" => value)
    end

    def image_filename
      metadata_or_empty["filename"]
    end

    def image_filename=(value)
      self.metadata = metadata_or_empty.merge("filename" => value)
    end

    # @return [Hash, nil] { width:, height: } when both are stored, else nil
    def image_dimensions
      width = metadata_or_empty["width"]
      height = metadata_or_empty["height"]
      return nil unless width && height

      { width: width, height: height }
    end

    def set_image_dimensions(width, height)
      self.metadata = metadata_or_empty.merge("width" => width, "height" => height)
    end

    # Image format and technical details
    def color_space
      metadata_or_empty["color_space"]
    end

    def color_space=(value)
      self.metadata = metadata_or_empty.merge("color_space" => value)
    end

    def bit_depth
      metadata_or_empty["bit_depth"]
    end

    def bit_depth=(value)
      self.metadata = metadata_or_empty.merge("bit_depth" => value)
    end

    # Generate description from image file using LLM vision capabilities.
    # Stores the generated text as the description and saves the record.
    #
    # @param options [Hash] passed through to ImageDescriptionService#generate_description
    # @return [Boolean] true when a description was generated and saved;
    #   false when no image path is available, the service returned nothing,
    #   or any StandardError was raised (the error message is printed)
    def generate_description_from_image!(options = {})
      return false unless image_attached? || file_path_available?

      begin
        image_path = get_image_path
        return false unless image_path

        generated_description = Ragdoll::ImageDescriptionService.new.generate_description(image_path, options)
        return false unless generated_description.present?

        self.description = generated_description
        save!
        true
      rescue StandardError => e
        puts "Failed to generate image description: #{e.message}"
        false
      end
    end

    # Override content for embedding to combine alt_text and description
    # (in that order), separated by a single space.
    def content_for_embedding
      [alt_text, description].select(&:present?).join(" ")
    end

    # Delegates to the base implementation only when there is embeddable
    # text and no embeddings exist yet.
    def generate_embeddings!
      return unless should_generate_embeddings?
      return if content_for_embedding.blank?

      # Generate embeddings using the base class method
      super
    end

    # Override should_generate_embeddings to check for content
    def should_generate_embeddings?
      content_for_embedding.present? && embeddings.empty?
    end

    # Aggregate statistics over image contents.
    # NOTE(review): no :image_attachment association is declared in this
    # class; presumably it is provided by Content or is stale - confirm
    # before relying on :average_image_size.
    def self.stats
      {
        total_image_contents: count,
        by_model: group(:embedding_model).count,
        total_embeddings: joins(:embeddings).count,
        with_images: with_images.count,
        with_descriptions: with_descriptions.count,
        average_image_size: joins(:image_attachment).average("active_storage_blobs.byte_size")
      }
    end

    private

    # Metadata hash, tolerating a nil +metadata+ attribute.
    def metadata_or_empty
      metadata || {}
    end

    # True when the document has a location that exists on disk.
    def file_path_available?
      document&.location&.present? && File.exist?(document.location)
    end

    # Picks the path handed to the description service: the document
    # location when it is an existing image file, otherwise stored data that
    # looks like an absolute path. The second option is tried even when the
    # document file exists but is not an image.
    def get_image_path
      return document.location if file_path_available? && image_file?(document.location)

      # Try to get path from stored data (if it's a file path)
      data if image_attached? && data&.start_with?("/")
    end

    # True when +file_path+ has one of the IMAGE_EXTENSIONS (case-insensitive).
    def image_file?(file_path)
      return false unless file_path

      IMAGE_EXTENSIONS.include?(File.extname(file_path).downcase)
    end

    # Validation: at least one of image data, description or alt_text is required.
    def image_data_or_description_present
      return if image_attached? || description.present? || alt_text.present?

      errors.add(:base, "Must have either image data or description/alt_text")
    end
  end
end
@@ -0,0 +1,137 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "active_record"
4
+ require_relative "content"
5
+
6
module Ragdoll
  # Content subclass for plain text. The +content+ column holds the text;
  # processing settings (chunk size, overlap) and file details (encoding,
  # line count) live in the +metadata+ hash.
  class TextContent < Content
    validates :content, presence: true

    scope :recent, -> { order(created_at: :desc) }

    # Text-specific processing configuration stored in content metadata
    # This metadata is about the raw content processing, not AI-generated insights

    # @return the "chunk_size" metadata entry, defaulting to 1000
    def chunk_size
      metadata_or_empty["chunk_size"] || 1000
    end

    def chunk_size=(value)
      self.metadata = metadata_or_empty.merge("chunk_size" => value)
    end

    # @return the "overlap" metadata entry, defaulting to 200
    def overlap
      metadata_or_empty["overlap"] || 200
    end

    def overlap=(value)
      self.metadata = metadata_or_empty.merge("overlap" => value)
    end

    # Content-specific technical metadata (file processing info)
    def encoding
      metadata_or_empty["encoding"]
    end

    def encoding=(value)
      self.metadata = metadata_or_empty.merge("encoding" => value)
    end

    def line_count
      metadata_or_empty["line_count"]
    end

    def line_count=(value)
      self.metadata = metadata_or_empty.merge("line_count" => value)
    end

    # @return [Integer] number of whitespace-separated words, 0 without content
    def word_count
      content&.split&.length || 0
    end

    # @return [Integer] number of characters, 0 without content
    def character_count
      content&.length || 0
    end

    # @return [Integer] number of embeddings attached to this content
    def embedding_count
      embeddings.count
    end

    # Splits +content+ into overlapping windows of at most +chunk_size+
    # characters, preferring to end a window at the last space before the
    # limit. Consecutive windows overlap by up to +overlap+ characters, and
    # the start always advances by at least one character.
    #
    # @return [Array<Hash>] hashes with :content, :start_position,
    #   :end_position and :chunk_index; empty when content is blank
    def chunks
      return [] if content.blank?

      pieces = []
      start_pos = 0

      while start_pos < content.length
        end_pos = [start_pos + chunk_size, content.length].min

        # Try to break at word boundary if not at end
        if end_pos < content.length
          last_space = content.rindex(" ", end_pos)
          end_pos = last_space if last_space && last_space > start_pos
        end

        piece = content[start_pos...end_pos].strip
        if piece.present?
          pieces << {
            content: piece,
            start_position: start_pos,
            end_position: end_pos,
            chunk_index: pieces.length
          }
        end

        break if end_pos >= content.length

        # Step back by the overlap, but never stall on the same position
        start_pos = [end_pos - overlap, start_pos + 1].max
      end

      pieces
    end

    # Rebuilds all embeddings for this content: destroys the existing ones,
    # chunks the text with Ragdoll::TextChunker, embeds each chunk with
    # Ragdoll::EmbeddingService, and stamps "embeddings_generated_at" in
    # metadata. A failure on one chunk is printed and skipped, so the
    # remaining chunks are still processed. Does nothing when content is blank.
    def generate_embeddings!
      return if content.blank?

      # Clear existing embeddings
      embeddings.destroy_all

      # Use TextChunker to split content into manageable chunks
      chunk_texts = Ragdoll::TextChunker.chunk(content)

      # Generate embeddings for each chunk
      embedding_service = Ragdoll::EmbeddingService.new

      chunk_texts.each_with_index do |chunk_text, index|
        begin
          vector = embedding_service.generate_embedding(chunk_text)

          embeddings.create!(
            content: chunk_text,
            embedding_vector: vector,
            chunk_index: index
          )
        rescue StandardError => e
          puts "Failed to generate embedding for chunk #{index}: #{e.message}"
        end
      end

      update!(metadata: metadata_or_empty.merge("embeddings_generated_at" => Time.current))
    end

    # Override content for embedding to use the text content
    def content_for_embedding
      content
    end

    # Aggregate statistics over text contents.
    # NOTE(review): #chunk_size reads from metadata, while average(:chunk_size)
    # presumes a chunk_size column - confirm against the schema.
    def self.stats
      {
        total_text_contents: count,
        by_model: group(:embedding_model).count,
        total_embeddings: joins(:embeddings).count,
        average_word_count: average("LENGTH(content) - LENGTH(REPLACE(content, ' ', '')) + 1"),
        average_chunk_size: average(:chunk_size)
      }
    end

    private

    # Metadata hash, tolerating a nil +metadata+ attribute.
    def metadata_or_empty
      metadata || {}
    end
  end
end
@@ -0,0 +1,113 @@
1
+ # frozen_string_literal: true
2
+
3
module Ragdoll
  # Service class for centralized configuration logic.
  # Provides a clean interface for accessing configuration with validation.
  class ConfigurationService
    # The wrapped configuration object (public for backward compatibility).
    attr_reader :config

    # @param config [Object, nil] configuration to wrap; defaults to Ragdoll.config
    def initialize(config = nil)
      @config = config || Ragdoll.config
    end

    # Resolve model for a task with inheritance support.
    #
    # @param task_type [Symbol] :embedding, :summary, :keywords or anything else
    # @param content_type [Symbol] only used for :embedding
    # @return the embedding model for +content_type+, the task-specific text
    #   generation model (falling back to the default), or the default text
    #   generation model for unknown task types
    def resolve_model(task_type, content_type = :text)
      text_generation = @config.models.text_generation

      case task_type
      when :embedding
        @config.embedding_model(content_type)
      when :summary, :keywords
        # Check for task-specific model, fall back to default
        text_generation[task_type] || text_generation[:default]
      else
        text_generation[:default]
      end
    end

    # Get provider credentials with fallback to default provider.
    #
    # @raise [Ragdoll::ConfigurationError] when the provider has no entry
    def provider_credentials(provider = nil)
      provider ||= @config.llm_providers[:default_provider]
      credentials = @config.llm_providers[provider]
      raise Ragdoll::ConfigurationError, "Provider '#{provider}' not configured" if credentials.nil?

      credentials
    end

    # Get chunking configuration for content type, falling back to the
    # :default processing entry.
    def chunking_config(content_type = :text)
      @config.processing[content_type]&.dig(:chunking) ||
        @config.processing[:default][:chunking]
    end

    # Get search configuration
    def search_config
      @config.processing[:search]
    end

    # Get prompt template with validation.
    #
    # @raise [Ragdoll::ConfigurationError] when the template is missing
    def prompt_template(template_name = :rag_enhancement)
      template = @config.prompt_templates[template_name]
      raise Ragdoll::ConfigurationError, "Prompt template '#{template_name}' not found" if template.nil?

      template
    end

    # Validate configuration completeness: database password, default LLM
    # provider and its API key, text embedding model, and log directory.
    #
    # @return [true] when every check passes
    # @raise [Ragdoll::ConfigurationError] listing every failed check
    def validate!
      errors = [
        database_error,
        default_provider_error,
        embedding_error,
        log_directory_error
      ].compact

      unless errors.empty?
        raise Ragdoll::ConfigurationError, "Configuration validation failed:\n - #{errors.join("\n - ")}"
      end

      true
    end

    # Check if configuration is valid without raising
    def valid?
      validate!
      true
    rescue Ragdoll::ConfigurationError
      false
    end

    private

    # Error message when the database password is missing, else nil.
    def database_error
      "Database password not configured" if @config.database[:password].nil?
    end

    # Error message for a missing or incomplete default LLM provider, else nil.
    def default_provider_error
      default_provider = @config.llm_providers[:default_provider]
      return "Default LLM provider not specified" if default_provider.nil?

      provider_config = @config.llm_providers[default_provider]
      if provider_config.nil?
        "Default provider '#{default_provider}' not configured"
      elsif provider_config[:api_key].nil?
        "API key for default provider '#{default_provider}' not configured"
      end
    end

    # Error message when no text embedding model is configured, else nil.
    def embedding_error
      "Text embedding model not configured" if @config.models.embedding[:text].nil?
    end

    # Error message when the log directory neither exists nor can be created
    # in a writable parent, else nil. A missing log file path is reported as
    # a validation error so that #valid? returns false instead of raising.
    def log_directory_error
      log_path = @config.logging[:filepath]
      return "Log file path not configured" if log_path.nil?

      log_dir = File.dirname(log_path)
      return if Dir.exist?(log_dir) || File.writable?(File.dirname(log_dir))

      "Cannot create log directory '#{log_dir}'"
    end
  end
end