ragdoll 0.1.1 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +1 -1
  3. data/Rakefile +52 -1
  4. data/app/jobs/ragdoll/extract_keywords_job.rb +28 -0
  5. data/app/jobs/ragdoll/extract_text_job.rb +38 -0
  6. data/app/jobs/ragdoll/generate_embeddings_job.rb +28 -0
  7. data/app/jobs/ragdoll/generate_summary_job.rb +25 -0
  8. data/app/lib/ragdoll/metadata_schemas.rb +332 -0
  9. data/app/models/ragdoll/audio_content.rb +142 -0
  10. data/app/models/ragdoll/content.rb +95 -0
  11. data/app/models/ragdoll/document.rb +611 -0
  12. data/app/models/ragdoll/embedding.rb +176 -0
  13. data/app/models/ragdoll/image_content.rb +194 -0
  14. data/app/models/ragdoll/text_content.rb +137 -0
  15. data/app/services/ragdoll/configuration_service.rb +113 -0
  16. data/app/services/ragdoll/document_management.rb +108 -0
  17. data/app/services/ragdoll/document_processor.rb +342 -0
  18. data/app/services/ragdoll/embedding_service.rb +202 -0
  19. data/app/services/ragdoll/image_description_service.rb +230 -0
  20. data/app/services/ragdoll/metadata_generator.rb +329 -0
  21. data/app/services/ragdoll/model_resolver.rb +72 -0
  22. data/app/services/ragdoll/search_engine.rb +51 -0
  23. data/app/services/ragdoll/text_chunker.rb +208 -0
  24. data/app/services/ragdoll/text_generation_service.rb +355 -0
  25. data/lib/ragdoll/core/client.rb +32 -41
  26. data/lib/ragdoll/core/configuration.rb +140 -156
  27. data/lib/ragdoll/core/database.rb +1 -1
  28. data/lib/ragdoll/core/model.rb +45 -0
  29. data/lib/ragdoll/core/version.rb +1 -1
  30. data/lib/ragdoll/core.rb +35 -17
  31. data/lib/ragdoll.rb +1 -1
  32. data/lib/tasks/annotate.rake +1 -1
  33. data/lib/tasks/db.rake +2 -2
  34. metadata +24 -20
  35. data/lib/ragdoll/core/document_management.rb +0 -110
  36. data/lib/ragdoll/core/document_processor.rb +0 -344
  37. data/lib/ragdoll/core/embedding_service.rb +0 -183
  38. data/lib/ragdoll/core/jobs/extract_keywords.rb +0 -32
  39. data/lib/ragdoll/core/jobs/extract_text.rb +0 -42
  40. data/lib/ragdoll/core/jobs/generate_embeddings.rb +0 -32
  41. data/lib/ragdoll/core/jobs/generate_summary.rb +0 -29
  42. data/lib/ragdoll/core/metadata_schemas.rb +0 -334
  43. data/lib/ragdoll/core/models/audio_content.rb +0 -175
  44. data/lib/ragdoll/core/models/content.rb +0 -126
  45. data/lib/ragdoll/core/models/document.rb +0 -678
  46. data/lib/ragdoll/core/models/embedding.rb +0 -204
  47. data/lib/ragdoll/core/models/image_content.rb +0 -227
  48. data/lib/ragdoll/core/models/text_content.rb +0 -169
  49. data/lib/ragdoll/core/search_engine.rb +0 -50
  50. data/lib/ragdoll/core/services/image_description_service.rb +0 -230
  51. data/lib/ragdoll/core/services/metadata_generator.rb +0 -335
  52. data/lib/ragdoll/core/text_chunker.rb +0 -210
  53. data/lib/ragdoll/core/text_generation_service.rb +0 -360
@@ -0,0 +1,108 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Ragdoll
4
+ class DocumentManagement
5
+ class << self
6
# Register a document for +location+ and return its id as a String.
#
# Local paths are expanded to absolute paths; http(s)/ftp URLs are stored
# verbatim. When a document with the same location and modification time is
# already stored, its id is returned and nothing new is created.
#
# @param location [String] file path or http(s)/ftp URL
# @param content [String, nil] assigned through Document#content= when present
# @param metadata [Hash] may carry :title / :document_type (symbol or string
#   keys); any non-Hash value is treated as an empty Hash
# @return [String] id of the existing or newly created document
def add_document(location, content, metadata = {})
  # Normalise first so the key lookups below cannot fail on nil or other types.
  metadata = {} unless metadata.is_a?(Hash)

  # Only a real URL scheme marks a remote location. A bare prefix test would
  # misclassify local files such as "httpd.conf" or "ftp_notes.txt".
  remote = location.match?(%r{\A(?:https?|ftp)://}i)
  absolute_location = remote ? location : File.expand_path(location)

  # Remote locations and missing files have no mtime, so the current time is used.
  file_modified_at = if !remote && File.exist?(absolute_location)
                       File.mtime(absolute_location)
                     else
                       Time.current
                     end

  # Skip duplicates: same location and same modification time.
  existing_document = Ragdoll::Document.find_by(
    location: absolute_location,
    file_modified_at: file_modified_at
  )
  return existing_document.id.to_s if existing_document

  document = Ragdoll::Document.create!(
    location: absolute_location,
    title: metadata[:title] || metadata["title"] || extract_title_from_location(location),
    document_type: metadata[:document_type] || metadata["document_type"] || "text",
    metadata: metadata,
    status: "pending",
    file_modified_at: file_modified_at
  )

  # Go through the model's setter rather than a column write.
  document.content = content if content.present?

  document.id.to_s
end
40
+
41
# Look up a document by id and return it as a hash that includes its content.
#
# @param id [Integer, String] value matched against the document's id
# @return [Hash, nil] Document#to_hash with an added :content entry, or nil
#   when no document matches
def get_document(id)
  record = Ragdoll::Document.find_by(id: id)
  return if record.nil?

  record.to_hash.tap { |attributes| attributes[:content] = record.content }
end
49
+
50
# Update the whitelisted attributes of a document.
#
# Only :title, :metadata, :status and :document_type are forwarded to
# Document#update!; every other key in +updates+ is ignored.
#
# @param id [Integer, String] value matched against the document's id
# @param updates [Hash] candidate attribute changes
# @return [Hash, nil] Document#to_hash after the update, or nil when no
#   document matches
def update_document(id, **updates)
  record = Ragdoll::Document.find_by(id: id)
  return if record.nil?

  permitted = updates.slice(:title, :metadata, :status, :document_type)
  record.update!(permitted) unless permitted.empty?

  record.to_hash
end
60
+
61
# Destroy the document with the given id.
#
# @param id [Integer, String] value matched against the document's id
# @return [true, nil] true once destroy! has run, nil when no document matches
def delete_document(id)
  record = Ragdoll::Document.find_by(id: id)
  return if record.nil?

  record.destroy!
  true
end
68
+
69
# Return one page of documents as hashes.
#
# @param options [Hash] :limit (defaults to 100) and :offset (defaults to 0)
# @return [Array<Hash>] Document#to_hash for every record of the page, taken
#   from Document.offset(...).limit(...).recent
def list_documents(options = {})
  page_size = options[:limit] || 100
  skipped = options[:offset] || 0

  page = Ragdoll::Document.offset(skipped).limit(page_size).recent
  page.map { |record| record.to_hash }
end
75
+
76
# Return whatever Ragdoll::Document.stats reports; the shape of the result is
# defined by that method, not here.
def get_document_stats
  document_model = Ragdoll::Document
  document_model.stats
end
79
+
80
+ # FIXME: should this be here?
81
+
82
# Store one embedding vector for a content record and return the new row's id.
#
# @param embeddable_id [Integer, String] id of the owning content record
# @param chunk_index [Integer] position of the chunk within that content
# @param embedding_vector [Array<Numeric>] vector to persist
# @param metadata [Hash] optional :embeddable_type and :content (chunk text,
#   defaults to "")
# @return [String] id of the created Ragdoll::Embedding
def add_embedding(embeddable_id, chunk_index, embedding_vector, metadata = {})
  # Prefer the caller-supplied type; otherwise load the content row and use the
  # name of its concrete class rather than the base class.
  owner_type = metadata[:embeddable_type] || Ragdoll::Content.find(embeddable_id).class.name

  record = Ragdoll::Embedding.create!(
    embeddable_id: embeddable_id,
    embeddable_type: owner_type,
    chunk_index: chunk_index,
    embedding_vector: embedding_vector,
    content: metadata[:content] || ""
  )
  record.id.to_s
end
100
+
101
+ private
102
+
103
# Derive a title from a path or URL: the last path segment with its final
# extension removed.
#
# @param location [String]
# @return [String]
def extract_title_from_location(location)
  extension = File.extname(location)
  File.basename(location, extension)
end
106
+ end
107
+ end
108
+ end
@@ -0,0 +1,342 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "pdf-reader"
4
+ require "docx"
5
+ require "rmagick"
6
+ # Image description service is auto-loaded from app/services
7
+
8
+ module Ragdoll
9
+ class DocumentProcessor
10
+ class ParseError < Ragdoll::Core::DocumentError; end
11
+ class UnsupportedFormatError < ParseError; end
12
+
13
# Convenience entry point: build a processor for +file_path+ and run #parse.
#
# @param file_path [String]
# @return [Object] whatever the instance-level #parse returns
def self.parse(file_path)
  processor = new(file_path)
  processor.parse
end
16
+
17
# Parse the contents of a Shrine attached file.
#
# Shrine's UploadedFile#open yields the IO returned by the storage, which for
# remote storages is not a local file and has no #path. UploadedFile#download
# always yields a Tempfile on local disk, so it is preferred whenever the
# attachment offers it. Objects exposing only #open are handled as before.
#
# @param attached_file [#download, #open] the attachment to read
# @return [Object] result of #parse for the local copy
def self.parse_attachment(attached_file)
  reader = attached_file.respond_to?(:download) ? :download : :open

  attached_file.public_send(reader) do |tempfile|
    new(tempfile.path, attached_file).parse
  end
end
23
+
24
# Parse a file on disk and create a "processed" document from it.
#
# @param file_path [String] path of the file to import
# @param options [Hash] extra attributes forwarded to Document.create!
# @return [Ragdoll::Document] the created document; when the file exists it
#   is also assigned to Document#file=
def self.create_document_from_file(file_path, **options)
  parsed = parse(file_path)

  # Fall back to "now" when the file has no readable mtime.
  file_modified_at = File.exist?(file_path) ? File.mtime(file_path) : Time.current

  document = Ragdoll::Document.create!(
    location: File.expand_path(file_path),
    title: parsed[:title] || File.basename(file_path, File.extname(file_path)),
    content: parsed[:content],
    document_type: determine_document_type(file_path),
    metadata: parsed[:metadata] || {},
    status: "processed",
    file_modified_at: file_modified_at,
    **options
  )

  # Block form so the descriptor is closed instead of leaking until GC.
  # NOTE(review): assumes Document#file= consumes the IO during assignment
  # (Shrine's attachment setter uploads to cache storage immediately) — confirm.
  if File.exist?(file_path)
    File.open(file_path) { |io| document.file = io }
  end

  document
end
47
+
48
# Create a document from an uploaded file (Shrine compatible).
#
# The document is created first with empty content and status "processing",
# the upload is attached, and — when an attachment is present afterwards — the
# parsed content, title and metadata are written back with status "processed".
#
# @param uploaded_file [#original_filename, #mime_type] the upload
# @param options [Hash] optional :title and :metadata
# @return [Ragdoll::Document]
def self.create_document_from_upload(uploaded_file, **options)
  default_name = "uploaded_file"

  document = Ragdoll::Document.create!(
    location: uploaded_file.original_filename || default_name,
    title: options[:title] || File.basename(
      uploaded_file.original_filename || default_name,
      File.extname(uploaded_file.original_filename || "")
    ),
    content: "", # filled in once the attachment has been parsed
    document_type: determine_document_type_from_content_type(uploaded_file.mime_type),
    status: "processing",
    metadata: options[:metadata] || {},
    file_modified_at: Time.current
  )

  document.file = uploaded_file
  return document unless document.file.present?

  # Parsed values win over the provisional ones; parsed metadata is merged on top.
  parsed = parse_attachment(document.file)
  document.update!(
    content: parsed[:content],
    title: parsed[:title] || document.title,
    metadata: document.metadata.merge(parsed[:metadata] || {}),
    status: "processed"
  )

  document
end
78
+
79
# @param file_path [String] path of the file to parse; its lower-cased
#   extension selects the parser used by #parse
# @param attached_file [Object, nil] the attachment the path came from, if any
def initialize(file_path, attached_file = nil)
  @file_path, @attached_file = file_path, attached_file

  extension = File.extname(file_path)
  @file_extension = extension.downcase
end
84
+
85
# Dispatch to the parser matching the file extension.
#
# @return [Object] result of the selected parse_* helper
# @raise [ParseError] wrapping any StandardError raised while parsing
def parse
  image_extensions = %w[.jpg .jpeg .png .gif .bmp .webp .svg .ico .tiff .tif]

  case @file_extension
  when ".pdf" then parse_pdf
  when ".docx" then parse_docx
  when ".html", ".htm" then parse_html
  when *image_extensions then parse_image
  else
    # ".txt", ".md", ".markdown" and every unrecognised extension are read as text.
    parse_text
  end
rescue StandardError => e
  raise ParseError, "#{__LINE__} Failed to parse #{@file_path}: #{e.message}"
end
103
+
104
+ private
105
+
106
# Extract text and document-info metadata from the PDF at @file_path.
#
# Blank pages are skipped. Every page after the first non-blank one is
# preceded by a "--- Page N ---" separator, where N is the page's 1-based
# position in the file.
#
# @return [Hash] :content (String), :metadata (Hash), :document_type ("pdf")
# @raise [ParseError] for malformed PDFs or unsupported PDF features
def parse_pdf
  # metadata key => key in the PDF info dictionary
  info_fields = {
    title: :Title,
    author: :Author,
    subject: :Subject,
    creator: :Creator,
    producer: :Producer,
    creation_date: :CreationDate,
    modification_date: :ModDate
  }
  metadata = {}
  segments = []

  begin
    PDF::Reader.open(@file_path) do |reader|
      info = reader.info
      if info
        info_fields.each do |key, pdf_key|
          value = info[pdf_key]
          metadata[key] = value if value
        end
      end

      metadata[:page_count] = reader.page_count

      reader.pages.each_with_index do |page, index|
        text = page.text.strip
        next if text.empty?

        segments << "\n\n--- Page #{index + 1} ---\n\n" unless segments.empty?
        segments << text
      end
    end
  rescue PDF::Reader::MalformedPDFError => e
    raise ParseError, "Malformed PDF: #{e.message}"
  rescue PDF::Reader::UnsupportedFeatureError => e
    raise ParseError, "Unsupported PDF feature: #{e.message}"
  end

  { content: segments.join.strip, metadata: metadata, document_type: "pdf" }
end
146
+
147
# Extract text, tables and core-property metadata from the DOCX at @file_path.
#
# Non-blank paragraphs are emitted first, each followed by a blank line. Every
# table follows under a "--- Table N ---" heading with its cells joined by
# " | "; rows that are blank after joining are skipped.
#
# @return [Hash] :content (String), :metadata (Hash), :document_type ("docx")
# @raise [ParseError] wrapping any StandardError raised while reading
def parse_docx
  # metadata key => reader on the document's core properties
  property_fields = {
    title: :title,
    author: :creator,
    subject: :subject,
    description: :description,
    keywords: :keywords,
    created: :created,
    modified: :modified,
    last_modified_by: :last_modified_by
  }
  metadata = {}
  segments = []

  begin
    doc = Docx::Document.open(@file_path)

    properties = doc.core_properties
    if properties
      property_fields.each do |key, reader|
        value = properties.public_send(reader)
        metadata[key] = value if value
      end
    end

    doc.paragraphs.each do |paragraph|
      text = paragraph.text.strip
      segments << "#{text}\n\n" unless text.empty?
    end

    doc.tables.each_with_index do |table, table_index|
      segments << "\n--- Table #{table_index + 1} ---\n\n"

      table.rows.each do |row|
        line = row.cells.map(&:text).join(" | ")
        segments << "#{line}\n" unless line.strip.empty?
      end

      segments << "\n"
    end

    metadata[:paragraph_count] = doc.paragraphs.count
    metadata[:table_count] = doc.tables.count
  rescue StandardError => e
    raise ParseError, "#{__LINE__} Failed to parse DOCX: #{e.message}"
  end

  { content: segments.join.strip, metadata: metadata, document_type: "docx" }
end
201
+
202
# Read @file_path as text, preferring UTF-8 and falling back to ISO-8859-1.
#
# File.read with an :encoding option only tags the bytes it returns; it does
# not validate them, so invalid UTF-8 never raises on its own. The bytes are
# therefore checked with String#valid_encoding? and the file is re-read as
# ISO-8859-1 when they are not valid UTF-8. The rescue is kept for setups
# where a read can raise while transcoding.
#
# @return [Hash] :content (String), :metadata (:file_size, :encoding) and
#   :document_type ("markdown" for .md/.markdown read as UTF-8, "text"
#   otherwise, including every ISO-8859-1 fallback read)
def parse_text
  content = begin
    File.read(@file_path, encoding: "UTF-8")
  rescue Encoding::InvalidByteSequenceError
    nil
  end

  if content&.valid_encoding?
    encoding = "UTF-8"
    document_type = %w[.md .markdown].include?(@file_extension) ? "markdown" : "text"
  else
    encoding = "ISO-8859-1"
    content = File.read(@file_path, encoding: encoding)
    document_type = "text"
  end

  {
    content: content,
    metadata: { file_size: File.size(@file_path), encoding: encoding },
    document_type: document_type
  }
end
234
+
235
# Read the HTML file at @file_path and reduce it to plain text.
#
# Regex-based stripping only: script and style elements are removed with
# their contents, every remaining tag becomes a space, and runs of whitespace
# collapse to a single space. Character entities are left as written.
#
# @return [Hash] :content (String), :metadata (:file_size, :original_format)
#   and :document_type ("html")
def parse_html
  raw_html = File.read(@file_path, encoding: "UTF-8")

  # Applied in order; each pair is [pattern, replacement].
  substitutions = [
    [%r{<script[^>]*>.*?</script>}mi, ""],
    [%r{<style[^>]*>.*?</style>}mi, ""],
    [/<[^>]+>/, " "],
    [/\s+/, " "]
  ]
  stripped = substitutions.reduce(raw_html) do |text, (pattern, replacement)|
    text.gsub(pattern, replacement)
  end

  {
    content: stripped.strip,
    metadata: { file_size: File.size(@file_path), original_format: "html" },
    document_type: "html"
  }
end
257
+
258
# Build a text representation of the image at @file_path.
#
# Dimensions are read with RMagick; when that fails, width and height are
# recorded as nil and parsing continues. The text content is the description
# returned by Ragdoll::ImageDescriptionService, or an "Image file: <name>"
# placeholder when the description is nil or empty. Progress is printed to
# stdout.
#
# @return [Hash] :content (String), :metadata (Hash), :document_type ("image")
def parse_image
  puts "🖼️ DocumentProcessor: Starting image parsing for #{@file_path}"

  filename = File.basename(@file_path)
  metadata = {
    file_size: File.size(@file_path),
    file_type: @file_extension.sub(".", ""),
    original_filename: filename
  }

  # Best effort: unreadable images still produce a result, just without dimensions.
  width, height = begin
    image = Magick::Image.read(@file_path).first
    dimensions = [image.columns, image.rows]
    puts "📏 DocumentProcessor: Image dimensions: #{dimensions[0]}x#{dimensions[1]}"
    dimensions
  rescue StandardError => e
    puts "❌ DocumentProcessor: Failed to get image dimensions: #{e.message}"
    [nil, nil]
  end
  metadata[:width] = width
  metadata[:height] = height

  puts "🤖 DocumentProcessor: Creating ImageDescriptionService and calling generate_description..."
  desc = Ragdoll::ImageDescriptionService.new.generate_description(@file_path)

  puts "📝 DocumentProcessor: Received description: '#{desc}'"

  described = desc && !desc.empty?
  metadata[:description] = desc if described

  content = described ? desc : "Image file: #{File.basename(@file_path)}"

  puts "✅ DocumentProcessor: Image parsing complete. Content: '#{content[0..100]}...'"

  { content: content, metadata: metadata, document_type: "image" }
end
297
+
298
+ # Helper methods for document type determination
299
# Map a file's extension (case-insensitive) to a document type name.
#
# @param file_path [String]
# @return [String] "pdf", "docx", "text", "markdown", "html" or "image";
#   unknown extensions map to "text"
def self.determine_document_type(file_path)
  extension = File.extname(file_path).downcase

  image_extensions = %w[.jpg .jpeg .png .gif .bmp .webp .svg .ico .tiff .tif]
  return "image" if image_extensions.include?(extension)

  types_by_extension = {
    ".pdf" => "pdf",
    ".docx" => "docx",
    ".txt" => "text",
    ".md" => "markdown",
    ".markdown" => "markdown",
    ".html" => "html",
    ".htm" => "html"
  }
  types_by_extension.fetch(extension, "text")
end
310
+
311
# Map a MIME type to a document type name.
#
# @param content_type [String, nil]
# @return [String] "pdf", "docx", "text", "markdown", "html" or "image";
#   anything unrecognised (including nil) maps to "text"
def self.determine_document_type_from_content_type(content_type)
  # Matchers are tried in order with ===, the same comparison `case` performs.
  matchers = [
    ["application/pdf", "pdf"],
    ["application/vnd.openxmlformats-officedocument.wordprocessingml.document", "docx"],
    ["text/plain", "text"],
    ["text/markdown", "markdown"],
    ["text/html", "html"],
    [%r{^image/}, "image"]
  ]

  _matcher, document_type = matchers.find { |matcher, _type| matcher === content_type }
  document_type || "text"
end
322
+
323
# Map a file's extension (case-insensitive) to a MIME type.
#
# @param file_path [String]
# @return [String] the MIME type, or "application/octet-stream" for unknown
#   extensions
def self.determine_content_type(file_path)
  mime_by_extension = {
    ".pdf" => "application/pdf",
    ".docx" => "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    ".txt" => "text/plain",
    ".md" => "text/markdown",
    ".markdown" => "text/markdown",
    ".html" => "text/html",
    ".htm" => "text/html",
    ".jpg" => "image/jpeg",
    ".jpeg" => "image/jpeg",
    ".png" => "image/png",
    ".gif" => "image/gif",
    ".webp" => "image/webp",
    ".bmp" => "image/bmp",
    ".svg" => "image/svg+xml",
    ".ico" => "image/x-icon",
    ".tiff" => "image/tiff",
    ".tif" => "image/tiff"
  }

  extension = File.extname(file_path).downcase
  mime_by_extension.fetch(extension, "application/octet-stream")
end
341
+ end
342
+ end
@@ -0,0 +1,202 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "ruby_llm"
4
+
5
+ module Ragdoll
6
+ class EmbeddingService
7
# @param client [#embed, nil] custom embedding client; when given, RubyLLM is
#   not configured by this constructor
# @param config_service [Object, nil] defaults to a new Ragdoll::ConfigurationService
# @param model_resolver [Object, nil] defaults to a Ragdoll::ModelResolver
#   built on the config service
def initialize(client: nil, config_service: nil, model_resolver: nil)
  @client = client

  @config_service = config_service
  @config_service ||= Ragdoll::ConfigurationService.new

  @model_resolver = model_resolver
  @model_resolver ||= Ragdoll::ModelResolver.new(@config_service)

  return if @client

  configure_ruby_llm
end
13
+
14
# Generate an embedding vector for a single text.
#
# With a custom client (@client), the response must carry the vector under
# "embeddings" or under "data"[0]["embedding"]; any failure is raised as
# Ragdoll::Core::EmbeddingError. Without a client, RubyLLM is used and any
# failure yields the fallback embedding instead of an error.
#
# @param text [String, nil]
# @return [Array, nil] the embedding, or nil for nil/blank text
# @raise [Ragdoll::Core::EmbeddingError] only when a custom client is in use
def generate_embedding(text)
  return nil if text.nil? || text.strip.empty?

  cleaned_text = clean_text(text)

  begin
    embedding_config = @model_resolver.resolve_embedding(:text)

    if @client
      response = @client.embed(
        input: cleaned_text,
        model: embedding_config.model.to_s
      )

      if response && response["embeddings"]&.first
        response["embeddings"].first
      elsif response && response["data"]&.first && response["data"].first["embedding"]
        response["data"].first["embedding"]
      else
        raise Ragdoll::Core::EmbeddingError, "Invalid response format from embedding API"
      end
    else
      # RubyLLM takes the bare model name.
      model = embedding_config.model.model

      begin
        response = RubyLLM.embed(cleaned_text, model: model)

        # Prefer the public reader; the instance variable is only consulted
        # for response objects that do not expose one.
        vectors = if response.respond_to?(:vectors)
                    response.vectors
                  else
                    response.instance_variable_get(:@vectors)
                  end

        vectors.is_a?(Array) ? vectors : generate_fallback_embedding
      rescue StandardError
        # A failing RubyLLM call degrades to the fallback embedding.
        generate_fallback_embedding
      end
    end
  rescue StandardError => e
    # With a custom client the error is surfaced to the caller.
    raise Ragdoll::Core::EmbeddingError, "Failed to generate embedding: #{e.message}" if @client

    # Without one, a failure here is treated as a RubyLLM configuration problem.
    puts "Warning: Embedding generation failed (#{e.message}), using fallback"
    generate_fallback_embedding
  end
end
67
+
68
# Generate embeddings for several texts.
#
# Texts that are blank after cleaning are dropped before embedding, so the
# result can be shorter than +texts+. With a custom client the whole cleaned
# list is sent in one call; without one, RubyLLM is called once per text and
# a failing text gets the fallback embedding.
#
# @param texts [Array<String, nil>, nil]
# @return [Array<Array>] one embedding per non-blank text; [] for nil/empty
#   input. When setup fails without a custom client, one fallback embedding
#   per original entry of +texts+ is returned.
# @raise [Ragdoll::Core::EmbeddingError] only when a custom client is in use
def generate_embeddings_batch(texts)
  return [] if texts.nil? || texts.empty?

  cleaned_texts = texts.map { |text| clean_text(text) }.reject { |t| t.nil? || t.strip.empty? }
  return [] if cleaned_texts.empty?

  begin
    embedding_config = @model_resolver.resolve_embedding(:text)

    if @client
      response = @client.embed(
        input: cleaned_texts,
        model: embedding_config.model.to_s
      )

      if response && response["embeddings"]
        response["embeddings"]
      elsif response && response["data"]
        response["data"].map { |item| item["embedding"] }
      else
        raise Ragdoll::Core::EmbeddingError, "Invalid response format from embedding API"
      end
    else
      # RubyLLM takes the bare model name.
      model = embedding_config.model.model

      cleaned_texts.map do |text|
        response = RubyLLM.embed(text, model: model)

        # Prefer the public reader; the instance variable is only consulted
        # for response objects that do not expose one.
        vectors = if response.respond_to?(:vectors)
                    response.vectors
                  else
                    response.instance_variable_get(:@vectors)
                  end

        vectors.is_a?(Array) ? vectors : generate_fallback_embedding
      rescue StandardError
        # A failing RubyLLM call degrades to the fallback embedding for that text.
        generate_fallback_embedding
      end
    end
  rescue StandardError => e
    # With a custom client the error is surfaced to the caller.
    raise Ragdoll::Core::EmbeddingError, "Failed to generate embeddings: #{e.message}" if @client

    # Without one, a failure here is treated as a RubyLLM configuration problem.
    puts "Warning: Batch embedding generation failed (#{e.message}), using fallback"
    texts.map { generate_fallback_embedding }
  end
end
122
+
123
# Cosine similarity of two vectors.
#
# @param embedding1 [Array<Numeric>, nil]
# @param embedding2 [Array<Numeric>, nil]
# @return [Float] dot product divided by the product of the magnitudes; 0.0
#   when either vector is nil, the lengths differ, or either magnitude is zero
def cosine_similarity(embedding1, embedding2)
  return 0.0 if embedding1.nil? || embedding2.nil? || embedding1.length != embedding2.length

  dot = embedding1.zip(embedding2).sum { |left, right| left * right }

  magnitude = ->(vector) { Math.sqrt(vector.sum { |component| component * component }) }
  norm1 = magnitude.call(embedding1)
  norm2 = magnitude.call(embedding2)

  return 0.0 if norm1.zero? || norm2.zero?

  dot / (norm1 * norm2)
end
135
+
136
+ private
137
+
138
# Copy the default provider's credentials from the Ragdoll configuration into
# RubyLLM's configuration.
#
# Apart from the OpenAI API key, a setting is only written when the RubyLLM
# configuration object responds to the corresponding setter. Unknown
# providers print a warning instead of raising.
def configure_ruby_llm
  provider = @config_service.config.llm_providers[:default_provider]
  credentials = @config_service.provider_credentials(provider)

  RubyLLM.configure do |llm|
    # Write credentials[key] whenever the setter exists, even when the value is nil.
    assign = lambda do |setter, key|
      llm.public_send(setter, credentials[key]) if llm.respond_to?(setter)
    end
    # Write credentials[key] only when it is set and the setter exists.
    assign_optional = lambda do |setter, key|
      llm.public_send(setter, credentials[key]) if credentials[key] && llm.respond_to?(setter)
    end

    case provider
    when :openai
      llm.openai_api_key = credentials[:api_key]
      assign_optional.call(:openai_organization=, :organization)
      assign_optional.call(:openai_project=, :project)
    when :anthropic
      assign.call(:anthropic_api_key=, :api_key)
    when :google
      assign.call(:google_api_key=, :api_key)
      assign_optional.call(:google_project_id=, :project_id)
    when :azure
      assign.call(:azure_api_key=, :api_key)
      assign_optional.call(:azure_endpoint=, :endpoint)
      assign_optional.call(:azure_api_version=, :api_version)
    when :ollama
      assign_optional.call(:ollama_endpoint=, :endpoint)
    when :huggingface
      assign.call(:huggingface_api_key=, :api_key)
    when :openrouter
      assign.call(:openrouter_api_key=, :api_key)
    else
      # Not an error: RubyLLM may simply not support this provider yet.
      puts "Warning: Unsupported embedding provider: #{provider}"
    end
  end
end
179
+
180
# Normalise text before it is sent to an embedding model.
#
# Leading and trailing whitespace is removed and every run of whitespace
# (spaces, tabs, newlines) collapses to a single space. Separate newline and
# tab passes are not needed: once \s+ has been replaced, no newline or tab is
# left for them to match. The result is cut to 8000 characters.
#
# @param text [String, nil]
# @return [String] cleaned text; "" for nil
def clean_text(text)
  return "" if text.nil?

  cleaned = text.strip.gsub(/\s+/, " ")

  # Conservative character budget, since embedding models limit input size.
  max_chars = 8000
  cleaned.length > max_chars ? cleaned[0, max_chars] : cleaned
end
193
+
194
# Generate a placeholder embedding for use when no embedding service is available.
#
# The generator is seeded with this object's object_id on every call, so each
# call on the same instance returns the same vector, whatever text it stands
# in for.
#
# @param dimensions [Integer] vector length
# @return [Array<Float>] +dimensions+ values in the range [-1.0, 1.0)
def generate_fallback_embedding(dimensions = 1536)
  generator = Random.new(object_id)
  Array.new(dimensions) { generator.rand * 2.0 - 1.0 }
end
201
+ end
202
+ end