ragdoll 0.1.0 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +318 -40
- data/Rakefile +66 -4
- data/app/jobs/ragdoll/extract_keywords_job.rb +28 -0
- data/app/jobs/ragdoll/extract_text_job.rb +38 -0
- data/app/jobs/ragdoll/generate_embeddings_job.rb +28 -0
- data/app/jobs/ragdoll/generate_summary_job.rb +25 -0
- data/app/lib/ragdoll/metadata_schemas.rb +332 -0
- data/app/models/ragdoll/audio_content.rb +142 -0
- data/app/models/ragdoll/content.rb +95 -0
- data/app/models/ragdoll/document.rb +606 -4
- data/app/models/ragdoll/embedding.rb +172 -5
- data/app/models/ragdoll/image_content.rb +194 -0
- data/app/models/ragdoll/text_content.rb +137 -0
- data/app/services/ragdoll/configuration_service.rb +113 -0
- data/app/services/ragdoll/document_management.rb +108 -0
- data/app/services/ragdoll/document_processor.rb +342 -0
- data/app/services/ragdoll/embedding_service.rb +202 -0
- data/app/services/ragdoll/image_description_service.rb +230 -0
- data/app/services/ragdoll/metadata_generator.rb +329 -0
- data/app/services/ragdoll/model_resolver.rb +72 -0
- data/app/services/ragdoll/search_engine.rb +51 -0
- data/app/services/ragdoll/text_chunker.rb +208 -0
- data/app/services/ragdoll/text_generation_service.rb +355 -0
- data/db/migrate/001_enable_postgresql_extensions.rb +23 -0
- data/db/migrate/004_create_ragdoll_documents.rb +70 -0
- data/db/migrate/005_create_ragdoll_embeddings.rb +41 -0
- data/db/migrate/006_create_ragdoll_contents.rb +47 -0
- data/lib/ragdoll/core/client.rb +306 -0
- data/lib/ragdoll/core/configuration.rb +257 -0
- data/lib/ragdoll/core/database.rb +141 -0
- data/lib/ragdoll/core/errors.rb +11 -0
- data/lib/ragdoll/core/model.rb +45 -0
- data/lib/ragdoll/core/shrine_config.rb +71 -0
- data/lib/ragdoll/core/version.rb +8 -0
- data/lib/ragdoll/core.rb +91 -0
- data/lib/ragdoll-core.rb +3 -0
- data/lib/ragdoll.rb +243 -6
- data/lib/tasks/annotate.rake +126 -0
- data/lib/tasks/db.rake +338 -0
- metadata +42 -35
- data/config/initializers/ragdoll.rb +0 -6
- data/config/routes.rb +0 -5
- data/db/migrate/20250218123456_create_documents.rb +0 -20
- data/lib/config/database.yml +0 -28
- data/lib/config/ragdoll.yml +0 -31
- data/lib/ragdoll/engine.rb +0 -16
- data/lib/ragdoll/import_job.rb +0 -15
- data/lib/ragdoll/ingestion.rb +0 -30
- data/lib/ragdoll/search.rb +0 -18
- data/lib/ragdoll/version.rb +0 -7
- data/lib/tasks/import_task.thor +0 -32
- data/lib/tasks/jobs_task.thor +0 -40
- data/lib/tasks/ragdoll_tasks.thor +0 -7
- data/lib/tasks/search_task.thor +0 -55
@@ -0,0 +1,108 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Ragdoll
|
4
|
+
class DocumentManagement
|
5
|
+
class << self
|
6
|
+
# Creates a pending Ragdoll::Document for +location+ unless one already exists
# with the same location and modification time, and returns the document id.
#
# @param location [String] a local file path or an http(s)/ftp(s) URL
# @param content [String, nil] text assigned through Document#content= when present
# @param metadata [Hash] optional attributes; :title / "title" and
#   :document_type / "document_type" are read from it. Anything that is not a
#   Hash is treated as an empty Hash.
# @return [String] id of the existing or newly created document
def add_document(location, content, metadata = {})
  # Normalize once so the key lookups below cannot raise on nil / non-Hash input.
  metadata = {} unless metadata.is_a?(Hash)

  # Require an explicit scheme separator: a bare prefix check would treat local
  # files such as "http_notes.txt" or "ftp-dump.csv" as remote locations.
  remote = location.match?(%r{\A(?:https?|ftps?)://}i)
  absolute_location = remote ? location : File.expand_path(location)

  # Local files carry their own modification time; everything else is stamped "now".
  file_modified_at = if !remote && File.exist?(absolute_location)
                       File.mtime(absolute_location)
                     else
                       Time.current
                     end

  # Skip duplicates: same location and same modification time.
  existing_document = Ragdoll::Document.find_by(
    location: absolute_location,
    file_modified_at: file_modified_at
  )
  return existing_document.id.to_s if existing_document

  document = Ragdoll::Document.create!(
    location: absolute_location,
    title: metadata[:title] || metadata["title"] || extract_title_from_location(location),
    document_type: metadata[:document_type] || metadata["document_type"] || "text",
    metadata: metadata,
    status: "pending",
    file_modified_at: file_modified_at
  )

  # Assign through the model's setter (the original comment says this triggers
  # TextContent creation; that logic lives in the model, not here).
  document.content = content if content.present?

  document.id.to_s
end
|
40
|
+
|
41
|
+
# Looks up a document by id and returns its hash form with the content added.
#
# @param id [Object] value passed to Ragdoll::Document.find_by(id:)
# @return [Hash, nil] document.to_hash with :content set to document.content,
#   or nil when no document matches
def get_document(id)
  record = Ragdoll::Document.find_by(id: id)
  return nil unless record

  record.to_hash.tap { |attributes| attributes[:content] = record.content }
end
|
49
|
+
|
50
|
+
# Applies a whitelisted subset of +updates+ to the document with the given id.
#
# Only :title, :metadata, :status and :document_type are forwarded to update!;
# every other key is ignored. update! is not called when nothing is left.
#
# @param id [Object] value passed to Ragdoll::Document.find_by(id:)
# @param updates [Hash] attribute changes given as keyword arguments
# @return [Hash, nil] document.to_hash after the update, or nil when not found
def update_document(id, **updates)
  record = Ragdoll::Document.find_by(id: id)
  return nil unless record

  permitted = updates.slice(*%i[title metadata status document_type])
  record.update!(permitted) unless permitted.empty?

  record.to_hash
end
|
60
|
+
|
61
|
+
# Destroys the document with the given id.
#
# @param id [Object] value passed to Ragdoll::Document.find_by(id:)
# @return [true, nil] true after destroy! succeeds, nil when no document matches
def delete_document(id)
  record = Ragdoll::Document.find_by(id: id)

  # An `if` without `else` evaluates to nil, which is the "not found" result.
  if record
    record.destroy!
    true
  end
end
|
68
|
+
|
69
|
+
# Returns one page of documents as hashes.
#
# @param options [Hash] :limit (defaults to 100) and :offset (defaults to 0);
#   only symbol keys are read
# @return [Array<Hash>] to_hash of each record produced by
#   Document.offset(...).limit(...).recent
def list_documents(options = {})
  page_size = options[:limit] || 100
  skipped = options[:offset] || 0

  page = Ragdoll::Document.offset(skipped).limit(page_size).recent
  page.map { |document| document.to_hash }
end
|
75
|
+
|
76
|
+
# Delegates to Ragdoll::Document.stats and returns its result unchanged.
#
# @return [Object] whatever Ragdoll::Document.stats returns
def get_document_stats
  Ragdoll::Document.stats
end
|
79
|
+
|
80
|
+
# FIXME: should this be here?
|
81
|
+
|
82
|
+
# Creates a Ragdoll::Embedding row for one chunk and returns its id.
#
# @param embeddable_id [Object] id of the content record the embedding belongs to
# @param chunk_index [Integer] position of the chunk
# @param embedding_vector [Array] the vector to store
# @param metadata [Hash] optional :embeddable_type and :content; when
#   :embeddable_type is missing, the class name of
#   Ragdoll::Content.find(embeddable_id) is used so the concrete subclass is stored
# @return [String] id of the created embedding
def add_embedding(embeddable_id, chunk_index, embedding_vector, metadata = {})
  # The content lookup only happens when the caller did not name the type.
  embeddable_type = metadata[:embeddable_type] ||
                    Ragdoll::Content.find(embeddable_id).class.name

  embedding = Ragdoll::Embedding.create!(
    embeddable_id: embeddable_id,
    embeddable_type: embeddable_type,
    chunk_index: chunk_index,
    embedding_vector: embedding_vector,
    content: metadata[:content] || ""
  )
  embedding.id.to_s
end
|
100
|
+
|
101
|
+
private
|
102
|
+
|
103
|
+
# Derives a title from a location: the final path component without its
# last extension.
#
# @param location [String] file path or URL
# @return [String] basename of +location+ with File.extname(location) removed
def extract_title_from_location(location)
  extension = File.extname(location)
  File.basename(location, extension)
end
|
106
|
+
end
|
107
|
+
end
|
108
|
+
end
|
@@ -0,0 +1,342 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "pdf-reader"
|
4
|
+
require "docx"
|
5
|
+
require "rmagick"
|
6
|
+
# Image description service is auto-loaded from app/services
|
7
|
+
|
8
|
+
module Ragdoll
|
9
|
+
class DocumentProcessor
|
10
|
+
# Raised when a document cannot be parsed; #parse re-raises StandardError failures as this class.
class ParseError < Ragdoll::Core::DocumentError; end
|
11
|
+
# ParseError subclass for unsupported formats. NOTE(review): nothing in this file raises it.
class UnsupportedFormatError < ParseError; end
|
12
|
+
|
13
|
+
# Convenience entry point: builds a processor for +file_path+ and parses it.
#
# @param file_path [String] path of the file to parse
# @return [Hash] the result of #parse (:content, :metadata, :document_type)
# @raise [ParseError] when #parse fails
def self.parse(file_path)
  processor = new(file_path)
  processor.parse
end
|
16
|
+
|
17
|
+
# Parse from Shrine attached file
|
18
|
+
# Parses an attached file by opening it and handing the yielded object's path
# to a new processor.
#
# @param attached_file [#open] attachment whose #open yields an object responding to #path
# @return [Object] whatever attached_file.open returns for the block
#   (presumably the parse result; confirm against the attachment library)
def self.parse_attachment(attached_file)
  attached_file.open { |tempfile| new(tempfile.path, attached_file).parse }
end
|
23
|
+
|
24
|
+
# Create document from file path
|
25
|
+
# Parses the file at +file_path+, creates a "processed" Ragdoll::Document from
# the result and attaches the file to it.
#
# @param file_path [String] path of the file to import
# @param options [Hash] extra attributes splatted last into Document.create!,
#   so they override the computed ones
# @return [Ragdoll::Document] the created document
# @raise [ParseError] when parsing fails
def self.create_document_from_file(file_path, **options)
  parsed = parse(file_path)

  file_modified_at = File.exist?(file_path) ? File.mtime(file_path) : Time.current

  document = Ragdoll::Document.create!(
    location: File.expand_path(file_path),
    # NOTE(review): the parsers in this file put titles under parsed[:metadata][:title],
    # not parsed[:title], so this normally falls back to the file name. Confirm intent.
    title: parsed[:title] || File.basename(file_path, File.extname(file_path)),
    content: parsed[:content],
    document_type: determine_document_type(file_path),
    metadata: parsed[:metadata] || {},
    status: "processed",
    file_modified_at: file_modified_at,
    **options
  )

  # Attach inside a block so the handle is closed again instead of leaking a
  # descriptor per imported file. Binary mode keeps PDFs and images intact.
  # assumes the attachment setter consumes the IO during assignment — TODO confirm
  if File.exist?(file_path)
    File.open(file_path, "rb") { |io| document.file = io }
  end

  document
end
|
47
|
+
|
48
|
+
# Create document from uploaded file (Shrine compatible)
|
49
|
+
# Creates a document for an uploaded file, attaches the upload and then fills
# in content, title and metadata from the parsed attachment.
#
# The record is first created with empty content and status "processing".
# When an attachment is present afterwards it is parsed and the record is
# updated to status "processed".
#
# @param uploaded_file [Object] responds to #original_filename and #mime_type
# @param options [Hash] optional :title and :metadata
# @return [Ragdoll::Document] the created (and possibly updated) document
def self.create_document_from_upload(uploaded_file, **options)
  filename = uploaded_file.original_filename

  document = Ragdoll::Document.create!(
    location: filename || "uploaded_file",
    # The basename fallback is only evaluated when no explicit title is given.
    title: options[:title] || File.basename(filename || "uploaded_file", File.extname(filename || "")),
    content: "", # Will be extracted after file attachment
    document_type: determine_document_type_from_content_type(uploaded_file.mime_type),
    status: "processing",
    metadata: options[:metadata] || {},
    file_modified_at: Time.current
  )

  document.file = uploaded_file
  return document unless document.file.present?

  # Extract content from the attachment and merge what the parser found.
  parsed = parse_attachment(document.file)
  document.update!(
    content: parsed[:content],
    title: parsed[:title] || document.title,
    metadata: document.metadata.merge(parsed[:metadata] || {}),
    status: "processed"
  )

  document
end
|
78
|
+
|
79
|
+
# @param file_path [String] path of the file to parse
# @param attached_file [Object, nil] attachment the path was obtained from, if any
#
# The lower-cased extension of +file_path+ is cached because #parse dispatches on it.
def initialize(file_path, attached_file = nil)
  @file_extension = File.extname(file_path).downcase
  @attached_file = attached_file
  @file_path = file_path
end
|
84
|
+
|
85
|
+
# Dispatches to the parser matching the file extension. Unknown extensions
# are parsed as plain text.
#
# @return [Hash] :content, :metadata and :document_type as produced by the
#   selected parse_* method
# @raise [ParseError] when parsing fails. Errors that already are ParseErrors
#   (e.g. "Malformed PDF: ...") pass through unchanged; any other StandardError
#   is wrapped.
def parse
  case @file_extension
  when ".pdf" then parse_pdf
  when ".docx" then parse_docx
  when ".txt", ".md", ".markdown" then parse_text
  when ".html", ".htm" then parse_html
  when ".jpg", ".jpeg", ".png", ".gif", ".bmp", ".webp", ".svg", ".ico", ".tiff", ".tif" then parse_image
  else parse_text # Default to text parsing for unknown formats
  end
rescue ParseError
  # Already carries a specific message; wrapping it again would only prefix
  # "Failed to parse ..." onto it.
  raise
rescue StandardError => e
  raise ParseError, "#{__LINE__} Failed to parse #{@file_path}: #{e.message}"
end
|
103
|
+
|
104
|
+
private
|
105
|
+
|
106
|
+
# Extracts text and document-info metadata from a PDF via PDF::Reader.
#
# Blank pages are skipped. Every non-blank page after the first is preceded by
# a "--- Page N ---" separator, where N is the 1-based page position.
#
# @return [Hash] :content (stripped text), :metadata (info fields that are set,
#   plus :page_count) and :document_type "pdf"
# @raise [ParseError] for malformed PDFs or unsupported PDF features
def parse_pdf
  metadata = {}
  sections = []

  begin
    PDF::Reader.open(@file_path) do |reader|
      info = reader.info
      if info
        # metadata key => key in the PDF info dictionary
        {
          title: :Title,
          author: :Author,
          subject: :Subject,
          creator: :Creator,
          producer: :Producer,
          creation_date: :CreationDate,
          modification_date: :ModDate
        }.each do |key, info_key|
          value = info[info_key]
          metadata[key] = value if value
        end
      end

      metadata[:page_count] = reader.page_count

      reader.pages.each_with_index do |page, index|
        page_text = page.text.strip
        next if page_text.empty?

        # Separator only between pages, never before the first emitted one.
        sections << "\n\n--- Page #{index + 1} ---\n\n" unless sections.empty?
        sections << page_text
      end
    end
  rescue PDF::Reader::MalformedPDFError => e
    raise ParseError, "Malformed PDF: #{e.message}"
  rescue PDF::Reader::UnsupportedFeatureError => e
    raise ParseError, "Unsupported PDF feature: #{e.message}"
  end

  {
    content: sections.join.strip,
    metadata: metadata,
    document_type: "pdf"
  }
end
|
146
|
+
|
147
|
+
# Extracts text, tables and core properties from a DOCX file via Docx::Document.
#
# Non-blank paragraphs are emitted first, each followed by a blank line. Every
# table follows under a "--- Table N ---" heading, one line per non-blank row
# with cells joined by " | ".
#
# @return [Hash] :content (stripped text), :metadata (core properties that are
#   set, plus :paragraph_count and :table_count) and :document_type "docx"
# @raise [ParseError] when anything goes wrong while reading the document
def parse_docx
  metadata = {}
  pieces = []

  begin
    doc = Docx::Document.open(@file_path)

    properties = doc.core_properties
    if properties
      # metadata key => reader on the core properties object
      {
        title: :title,
        author: :creator,
        subject: :subject,
        description: :description,
        keywords: :keywords,
        created: :created,
        modified: :modified,
        last_modified_by: :last_modified_by
      }.each do |key, reader|
        value = properties.public_send(reader)
        metadata[key] = value if value
      end
    end

    doc.paragraphs.each do |paragraph|
      text = paragraph.text.strip
      pieces << "#{text}\n\n" unless text.empty?
    end

    doc.tables.each_with_index do |table, table_index|
      pieces << "\n--- Table #{table_index + 1} ---\n\n"

      table.rows.each do |row|
        line = row.cells.map(&:text).join(" | ")
        pieces << "#{line}\n" unless line.strip.empty?
      end

      pieces << "\n"
    end

    metadata[:paragraph_count] = doc.paragraphs.count
    metadata[:table_count] = doc.tables.count
  rescue StandardError => e # StandardError => e
    raise ParseError, "#{__LINE__} Failed to parse DOCX: #{e.message}"
  end

  {
    content: pieces.join.strip,
    metadata: metadata,
    document_type: "docx"
  }
end
|
201
|
+
|
202
|
+
# Reads a plain-text or Markdown file.
#
# The file is read as UTF-8. File.read only tags the bytes with that encoding
# and never raises on invalid sequences, so validity is checked explicitly.
# Invalid content is reinterpreted as ISO-8859-1 and transcoded to UTF-8, so
# the returned string is always valid UTF-8.
#
# @return [Hash] :content, :metadata (:file_size, and :encoding naming the
#   encoding the bytes were read as) and :document_type ("markdown" for
#   .md/.markdown, otherwise "text")
def parse_text
  content = File.read(@file_path, encoding: "UTF-8")
  source_encoding = "UTF-8"

  unless content.valid_encoding?
    # Every byte is a valid ISO-8859-1 character, so this conversion cannot fail.
    content = content.force_encoding(Encoding::ISO_8859_1).encode(Encoding::UTF_8)
    source_encoding = "ISO-8859-1"
  end

  document_type = %w[.md .markdown].include?(@file_extension) ? "markdown" : "text"

  {
    content: content,
    metadata: {
      file_size: File.size(@file_path),
      encoding: source_encoding
    },
    document_type: document_type
  }
end
|
234
|
+
|
235
|
+
# Reads an HTML file and reduces it to plain text with basic regex stripping:
# <script> and <style> elements are removed with their bodies, every remaining
# tag becomes a space, and runs of whitespace collapse to a single space.
#
# Bytes that are not valid UTF-8 are replaced with U+FFFD before stripping.
# Without that step the first gsub raises ArgumentError on such files.
#
# @return [Hash] :content (stripped text), :metadata (:file_size and
#   :original_format "html") and :document_type "html"
def parse_html
  raw = File.read(@file_path, encoding: "UTF-8")
  raw = raw.scrub unless raw.valid_encoding?

  # Basic HTML tag stripping (for more advanced parsing, consider using Nokogiri)
  clean_content = raw
                  .gsub(%r{<script[^>]*>.*?</script>}mi, "") # Remove script tags
                  .gsub(%r{<style[^>]*>.*?</style>}mi, "") # Remove style tags
                  .gsub(/<[^>]+>/, " ") # Remove all HTML tags
                  .gsub(/\s+/, " ") # Normalize whitespace
                  .strip

  {
    content: clean_content,
    metadata: {
      file_size: File.size(@file_path),
      original_format: "html"
    },
    document_type: "html"
  }
end
|
257
|
+
|
258
|
+
# Builds a text representation of an image: file details, pixel dimensions and
# a description from Ragdoll::ImageDescriptionService.
#
# Dimensions are read with Magick::Image.ping, which loads image attributes
# without decoding pixel data. If that fails, :width and :height are set to
# nil and parsing continues.
#
# Progress is printed to stdout with puts.
#
# @return [Hash] :content (the description, or "Image file: <basename>" when no
#   description is available), :metadata (:file_size, :file_type,
#   :original_filename, :width, :height and, when available, :description)
#   and :document_type "image"
def parse_image
  puts "🖼️ DocumentProcessor: Starting image parsing for #{@file_path}"

  metadata = {
    file_size: File.size(@file_path),
    file_type: @file_extension.sub(".", ""),
    original_filename: File.basename(@file_path)
  }

  # Extract image dimensions
  begin
    img = Magick::Image.ping(@file_path).first
    metadata[:width] = img.columns
    metadata[:height] = img.rows
    puts "📏 DocumentProcessor: Image dimensions: #{img.columns}x#{img.rows}"
  rescue StandardError => e
    puts "❌ DocumentProcessor: Failed to get image dimensions: #{e.message}"
    metadata[:width] = nil
    metadata[:height] = nil
  end

  puts "🤖 DocumentProcessor: Creating ImageDescriptionService and calling generate_description..."
  desc = Ragdoll::ImageDescriptionService.new.generate_description(@file_path)

  puts "📝 DocumentProcessor: Received description: '#{desc}'"

  described = desc && !desc.empty?
  metadata[:description] = desc if described

  # Use AI-generated description or fallback placeholder
  content = described ? desc : "Image file: #{File.basename(@file_path)}"

  puts "✅ DocumentProcessor: Image parsing complete. Content: '#{content[0..100]}...'"

  {
    content: content,
    metadata: metadata,
    document_type: "image"
  }
end
|
297
|
+
|
298
|
+
# Helper methods for document type determination
|
299
|
+
# Maps a file's extension (case-insensitive) to a document type name.
#
# @param file_path [String] path whose extension is inspected
# @return [String] "pdf", "docx", "text", "markdown", "html" or "image";
#   "text" for any extension not listed
def self.determine_document_type(file_path)
  extension = File.extname(file_path).downcase
  return "image" if %w[.jpg .jpeg .png .gif .bmp .webp .svg .ico .tiff .tif].include?(extension)

  {
    ".pdf" => "pdf",
    ".docx" => "docx",
    ".txt" => "text",
    ".md" => "markdown",
    ".markdown" => "markdown",
    ".html" => "html",
    ".htm" => "html"
  }.fetch(extension, "text")
end
|
310
|
+
|
311
|
+
# Maps a MIME content type to a document type name.
#
# Matching uses `===`, exactly as a case/when would: the listed strings must
# match exactly, and any value with a line starting with "image/" counts as an image.
#
# @param content_type [String, nil] MIME type such as "application/pdf"
# @return [String] "pdf", "docx", "text", "markdown", "html" or "image";
#   "text" when nothing matches
def self.determine_document_type_from_content_type(content_type)
  rules = [
    ["application/pdf", "pdf"],
    ["application/vnd.openxmlformats-officedocument.wordprocessingml.document", "docx"],
    ["text/plain", "text"],
    ["text/markdown", "markdown"],
    ["text/html", "html"],
    [%r{^image/}, "image"]
  ]

  match = rules.find { |pattern, _type| pattern === content_type }
  match ? match.last : "text"
end
|
322
|
+
|
323
|
+
# Maps a file's extension (case-insensitive) to a MIME content type.
#
# @param file_path [String] path whose extension is inspected
# @return [String] the MIME type for the extension, or
#   "application/octet-stream" when the extension is not listed
def self.determine_content_type(file_path)
  mime_types = {
    ".pdf" => "application/pdf",
    ".docx" => "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    ".txt" => "text/plain",
    ".md" => "text/markdown",
    ".markdown" => "text/markdown",
    ".html" => "text/html",
    ".htm" => "text/html",
    ".jpg" => "image/jpeg",
    ".jpeg" => "image/jpeg",
    ".png" => "image/png",
    ".gif" => "image/gif",
    ".webp" => "image/webp",
    ".bmp" => "image/bmp",
    ".svg" => "image/svg+xml",
    ".ico" => "image/x-icon",
    ".tiff" => "image/tiff",
    ".tif" => "image/tiff"
  }

  mime_types.fetch(File.extname(file_path).downcase, "application/octet-stream")
end
|
341
|
+
end
|
342
|
+
end
|
@@ -0,0 +1,202 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "ruby_llm"
|
4
|
+
|
5
|
+
module Ragdoll
|
6
|
+
class EmbeddingService
|
7
|
+
# @param client [Object, nil] embedding client used instead of RubyLLM when given
#   (the inline comments in this class describe it as a client for testing)
# @param config_service [Object, nil] defaults to a new Ragdoll::ConfigurationService
# @param model_resolver [Object, nil] defaults to a Ragdoll::ModelResolver built
#   on the configuration service
#
# RubyLLM is only configured when no client is supplied.
def initialize(client: nil, config_service: nil, model_resolver: nil)
  @client = client
  @config_service = config_service || Ragdoll::ConfigurationService.new
  @model_resolver = model_resolver || Ragdoll::ModelResolver.new(@config_service)
  return if @client

  configure_ruby_llm
end
|
13
|
+
|
14
|
+
# Generates an embedding vector for a single text.
#
# With an injected client, the client's #embed response is unpacked
# ("embeddings" or "data"/"embedding" shapes) and any failure is raised as
# Ragdoll::Core::EmbeddingError. Without a client, RubyLLM.embed is used and
# every failure degrades to generate_fallback_embedding (with a warning on
# stdout for failures outside the RubyLLM call itself).
#
# @param text [String, nil] text to embed
# @return [Array, nil] the embedding vector, or nil for nil/blank text
# @raise [Ragdoll::Core::EmbeddingError] only when a client was injected
def generate_embedding(text)
  return nil if text.nil? || text.strip.empty?

  cleaned_text = clean_text(text)

  begin
    embedding_config = @model_resolver.resolve_embedding(:text)

    if @client
      response = @client.embed(
        input: cleaned_text,
        model: embedding_config.model.to_s
      )

      if response && response["embeddings"]&.first
        response["embeddings"].first
      elsif response && response["data"]&.first && response["data"].first["embedding"]
        response["data"].first["embedding"]
      else
        raise Ragdoll::Core::EmbeddingError, "Invalid response format from embedding API"
      end
    else
      # Use just the model name for RubyLLM
      model = embedding_config.model.model

      begin
        response = RubyLLM.embed(cleaned_text, model: model)

        # Prefer the public reader; the ivar lookup is kept only for response
        # objects that expose no reader.
        vectors = if response.respond_to?(:vectors)
                    response.vectors
                  else
                    response.instance_variable_get(:@vectors)
                  end

        vectors.is_a?(Array) ? vectors : generate_fallback_embedding
      rescue StandardError
        # If RubyLLM fails, use fallback
        generate_fallback_embedding
      end
    end
  rescue StandardError => e
    # With an injected client the error must surface to the caller.
    raise Ragdoll::Core::EmbeddingError, "Failed to generate embedding: #{e.message}" if @client

    # No client - this is a RubyLLM configuration issue, use fallback
    puts "Warning: Embedding generation failed (#{e.message}), using fallback"
    generate_fallback_embedding
  end
end
|
67
|
+
|
68
|
+
# Generates embeddings for several texts.
#
# Texts are cleaned first and blank ones are dropped, so the result holds one
# vector per non-blank text. This holds on the fallback path too. With an
# injected client the whole batch is sent in one #embed call and failures are
# raised; without one, RubyLLM.embed is called per text and each failure
# degrades to generate_fallback_embedding.
#
# @param texts [Array<String>, nil] texts to embed
# @return [Array<Array>] one vector per non-blank text; [] for nil/empty input
# @raise [Ragdoll::Core::EmbeddingError] only when a client was injected
def generate_embeddings_batch(texts)
  return [] if texts.nil? || texts.empty?

  cleaned_texts = texts.map { |text| clean_text(text) }.reject { |t| t.nil? || t.strip.empty? }
  return [] if cleaned_texts.empty?

  begin
    embedding_config = @model_resolver.resolve_embedding(:text)

    if @client
      response = @client.embed(
        input: cleaned_texts,
        model: embedding_config.model.to_s
      )

      if response && response["embeddings"]
        response["embeddings"]
      elsif response && response["data"]
        response["data"].map { |item| item["embedding"] }
      else
        raise Ragdoll::Core::EmbeddingError, "Invalid response format from embedding API"
      end
    else
      # Use just the model name for RubyLLM
      model = embedding_config.model.model

      cleaned_texts.map do |text|
        response = RubyLLM.embed(text, model: model)

        # Prefer the public reader; the ivar lookup is kept only for response
        # objects that expose no reader.
        vectors = if response.respond_to?(:vectors)
                    response.vectors
                  else
                    response.instance_variable_get(:@vectors)
                  end

        vectors.is_a?(Array) ? vectors : generate_fallback_embedding
      rescue StandardError
        # If RubyLLM fails for this text, use fallback
        generate_fallback_embedding
      end
    end
  rescue StandardError => e
    # With an injected client the error must surface to the caller.
    raise Ragdoll::Core::EmbeddingError, "Failed to generate embeddings: #{e.message}" if @client

    # No client - this is a RubyLLM configuration issue, use fallback.
    # Sized by cleaned_texts so the length matches the success path.
    puts "Warning: Batch embedding generation failed (#{e.message}), using fallback"
    cleaned_texts.map { generate_fallback_embedding }
  end
end
|
122
|
+
|
123
|
+
# Computes the cosine similarity of two vectors.
#
# @param embedding1 [Array<Numeric>, nil]
# @param embedding2 [Array<Numeric>, nil]
# @return [Float] dot product divided by the product of the magnitudes; 0.0
#   when either vector is nil, the lengths differ, or a magnitude is zero
def cosine_similarity(embedding1, embedding2)
  return 0.0 if embedding1.nil? || embedding2.nil? || embedding1.length != embedding2.length

  magnitude = ->(vector) { Math.sqrt(vector.sum { |component| component * component }) }
  first_magnitude = magnitude.call(embedding1)
  second_magnitude = magnitude.call(embedding2)
  return 0.0 if first_magnitude == 0.0 || second_magnitude == 0.0

  dot_product = embedding1.zip(embedding2).sum { |pair| pair[0] * pair[1] }
  dot_product / (first_magnitude * second_magnitude)
end
|
135
|
+
|
136
|
+
private
|
137
|
+
|
138
|
+
# Copies the default provider's credentials from the configuration service
# into RubyLLM's configuration.
#
# The provider is read from config.llm_providers[:default_provider]. For an
# unknown provider only a warning is printed; no error is raised.
def configure_ruby_llm
  provider = @config_service.config.llm_providers[:default_provider]
  credentials = @config_service.provider_credentials(provider)

  # provider => [[setter, credential key, mode], ...]
  #   :always     assign unconditionally
  #   :supported  assign whenever RubyLLM's config has the setter (value may be nil)
  #   :present    assign only when the credential is set and the setter exists
  assignments = {
    openai: [
      [:openai_api_key=, :api_key, :always],
      [:openai_organization=, :organization, :present],
      [:openai_project=, :project, :present]
    ],
    anthropic: [[:anthropic_api_key=, :api_key, :supported]],
    google: [
      [:google_api_key=, :api_key, :supported],
      [:google_project_id=, :project_id, :present]
    ],
    azure: [
      [:azure_api_key=, :api_key, :supported],
      [:azure_endpoint=, :endpoint, :present],
      [:azure_api_version=, :api_version, :present]
    ],
    ollama: [[:ollama_endpoint=, :endpoint, :present]],
    huggingface: [[:huggingface_api_key=, :api_key, :supported]],
    openrouter: [[:openrouter_api_key=, :api_key, :supported]]
  }

  RubyLLM.configure do |ruby_llm_config|
    steps = assignments[provider]

    if steps
      steps.each do |setter, credential_key, mode|
        value = credentials[credential_key]
        next if mode == :present && !value
        next if mode != :always && !ruby_llm_config.respond_to?(setter)

        ruby_llm_config.public_send(setter, value)
      end
    else
      # Don't raise error for unsupported providers in case RubyLLM doesn't support them yet
      puts "Warning: Unsupported embedding provider: #{provider}"
    end
  end
end
|
179
|
+
|
180
|
+
# Normalizes text before it is sent to an embedding model.
#
# Leading and trailing whitespace is removed, and every run of whitespace
# (spaces, tabs, newlines) collapses to one space. The result is then cut to
# +max_chars+ characters.
#
# @param text [String, nil] raw text
# @param max_chars [Integer] maximum length of the result; defaults to 8000,
#   which the original comment calls a conservative limit for most embedding models
# @return [String] cleaned text; "" for nil input
def clean_text(text, max_chars: 8000)
  return "" if text.nil?

  # A single \s+ pass already covers newlines and tabs; separate passes for
  # them could never match afterwards.
  cleaned = text.strip.gsub(/\s+/, " ")

  cleaned.length > max_chars ? cleaned[0, max_chars] : cleaned
end
|
193
|
+
|
194
|
+
# Generate a fallback embedding for testing/development when LLM services are unavailable
|
195
|
+
# Produces a pseudo-random stand-in vector for when no real embedding is available.
#
# The generator is seeded with this object's object_id, so repeated calls on
# the same object return the same vector, whatever text was being embedded.
#
# @param dimensions [Integer] length of the vector (defaults to 1536)
# @return [Array<Float>] +dimensions+ values computed as rand * 2.0 - 1.0
def generate_fallback_embedding(dimensions = 1536)
  generator = Random.new(object_id)
  Array.new(dimensions) { generator.rand * 2.0 - 1.0 }
end
|
201
|
+
end
|
202
|
+
end
|