ragdoll 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. checksums.yaml +7 -0
  2. data/README.md +353 -0
  3. data/Rakefile +21 -0
  4. data/db/migrate/001_enable_postgresql_extensions.rb +23 -0
  5. data/db/migrate/004_create_ragdoll_documents.rb +70 -0
  6. data/db/migrate/005_create_ragdoll_embeddings.rb +41 -0
  7. data/db/migrate/006_create_ragdoll_contents.rb +47 -0
  8. data/lib/ragdoll/core/client.rb +315 -0
  9. data/lib/ragdoll/core/configuration.rb +273 -0
  10. data/lib/ragdoll/core/database.rb +141 -0
  11. data/lib/ragdoll/core/document_management.rb +110 -0
  12. data/lib/ragdoll/core/document_processor.rb +344 -0
  13. data/lib/ragdoll/core/embedding_service.rb +183 -0
  14. data/lib/ragdoll/core/errors.rb +11 -0
  15. data/lib/ragdoll/core/jobs/extract_keywords.rb +32 -0
  16. data/lib/ragdoll/core/jobs/extract_text.rb +42 -0
  17. data/lib/ragdoll/core/jobs/generate_embeddings.rb +32 -0
  18. data/lib/ragdoll/core/jobs/generate_summary.rb +29 -0
  19. data/lib/ragdoll/core/metadata_schemas.rb +334 -0
  20. data/lib/ragdoll/core/models/audio_content.rb +175 -0
  21. data/lib/ragdoll/core/models/content.rb +126 -0
  22. data/lib/ragdoll/core/models/document.rb +678 -0
  23. data/lib/ragdoll/core/models/embedding.rb +204 -0
  24. data/lib/ragdoll/core/models/image_content.rb +227 -0
  25. data/lib/ragdoll/core/models/text_content.rb +169 -0
  26. data/lib/ragdoll/core/search_engine.rb +50 -0
  27. data/lib/ragdoll/core/services/image_description_service.rb +230 -0
  28. data/lib/ragdoll/core/services/metadata_generator.rb +335 -0
  29. data/lib/ragdoll/core/shrine_config.rb +71 -0
  30. data/lib/ragdoll/core/text_chunker.rb +210 -0
  31. data/lib/ragdoll/core/text_generation_service.rb +360 -0
  32. data/lib/ragdoll/core/version.rb +8 -0
  33. data/lib/ragdoll/core.rb +73 -0
  34. data/lib/ragdoll-core.rb +3 -0
  35. data/lib/ragdoll.rb +249 -0
  36. data/lib/tasks/annotate.rake +126 -0
  37. data/lib/tasks/db.rake +338 -0
  38. metadata +80 -0
@@ -0,0 +1,141 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "active_record"
4
+ require "logger"
5
+
6
+ module Ragdoll
7
+ module Core
8
+ class Database
9
# Establishes the ActiveRecord connection and optionally runs migrations.
#
# Caller-supplied settings are merged over +default_config+. Keys may be
# strings or symbols; they are symbolized first so that "database" replaces
# the default :database instead of ending up next to it in the hash.
#
# @param config [Hash, nil] connection settings plus the Ragdoll-specific
#   :logger and :auto_migrate options (nil is treated as an empty hash)
# @return [Object, nil] the result of +migrate!+ when :auto_migrate is
#   truthy, otherwise nil
def self.setup(config = {})
  overrides = (config || {}).to_h.transform_keys(&:to_sym)
  database_config = default_config.merge(overrides)

  # :logger and :auto_migrate are Ragdoll options, not connection
  # parameters, so they are not handed to ActiveRecord.
  connection_config = database_config.reject { |key, _value| %i[logger auto_migrate].include?(key) }
  ActiveRecord::Base.establish_connection(connection_config)

  ActiveRecord::Base.logger = database_config[:logger] if database_config[:logger]

  migrate! if database_config[:auto_migrate]
end
23
+
24
# Runs every pending migration found in the gem's db/migrate directory.
#
# Migrations are loaded and executed by hand rather than through an
# ActiveRecord migration context. A migration counts as applied when the
# leading segment of its filename (the text before the first "_") is
# present in the schema_migrations table, which is created on demand.
#
# Side effect: ActiveRecord::Migration.verbose is set to true on entry and
# is switched off only while each individual migration runs.
#
# @return [Array<String>] the sorted migration file paths that were examined
def self.migrate!
  # This file is lib/ragdoll/core/database.rb, so the gem root is three
  # directories above it.
  gem_root = File.expand_path(File.join(File.dirname(__FILE__), "..", "..", ".."))
  migration_dir = File.join(gem_root, "db", "migrate")
  connection = ActiveRecord::Base.connection

  ActiveRecord::Migration.verbose = true

  # Ensure the bookkeeping table exists before querying it.
  unless connection.table_exists?("schema_migrations")
    connection.create_table("schema_migrations", id: false) do |t|
      t.string :version, null: false
    end
    connection.add_index("schema_migrations", :version, unique: true)
  end

  Dir[File.join(migration_dir, "*.rb")].sort.each do |migration_file|
    name_parts = File.basename(migration_file, ".rb").split("_")
    version = name_parts.first
    # Quote through the adapter instead of interpolating raw text into SQL.
    quoted_version = connection.quote(version)

    # Skip if already migrated
    next if connection.select_values(
      "SELECT version FROM schema_migrations WHERE version = #{quoted_version}"
    ).any?

    # Load the migration file to define the class
    require migration_file

    # Class name is the filename without its version prefix, CamelCased.
    migration_class_name = name_parts[1..].map(&:capitalize).join

    begin
      migration_class = Object.const_get(migration_class_name)
    rescue NameError
      puts "Warning: Could not find migration class #{migration_class_name} in #{migration_file}"
      next
    end

    # Run quietly, and restore the verbosity flag even when the migration
    # raises so a failure does not leave the global setting switched off.
    previous_verbose = ActiveRecord::Migration.verbose
    begin
      ActiveRecord::Migration.verbose = false
      migration_class.migrate(:up)
    ensure
      ActiveRecord::Migration.verbose = previous_verbose
    end

    # Record the migration
    connection.insert("INSERT INTO schema_migrations (version) VALUES (#{quoted_version})")
  end
end
86
+
87
# Drops every Ragdoll table (plus schema_migrations) and re-runs migrations.
#
# Tables are listed children-first so that adapters without CASCADE support
# can drop them without violating foreign keys. Migration output is
# silenced before dropping.
#
# @return [Object] whatever +migrate!+ returns
def self.reset!
  ActiveRecord::Migration.verbose = false

  doomed_tables = %w[
    ragdoll_embeddings
    ragdoll_text_contents
    ragdoll_image_contents
    ragdoll_audio_contents
    ragdoll_documents
    schema_migrations
  ]

  doomed_tables.each do |table_name|
    db = ActiveRecord::Base.connection
    next unless db.table_exists?(table_name)

    if db.adapter_name.downcase.include?("postgresql")
      # PostgreSQL can remove dependent objects along with the table.
      db.execute("DROP TABLE IF EXISTS #{table_name} CASCADE")
    else
      db.drop_table(table_name)
    end
  end

  migrate!
end
114
+
115
# Reports whether ActiveRecord::Base currently has a database connection.
#
# @return [Boolean] the value of ActiveRecord::Base.connected?
def self.connected?
  active_record = ActiveRecord::Base
  active_record.connected?
end
118
+
119
# Closes all connections held through ActiveRecord::Base.
#
# @return [Object] whatever ActiveRecord::Base.clear_all_connections! returns
def self.disconnect!
  active_record = ActiveRecord::Base
  active_record.clear_all_connections!
end
122
+
123
# Default settings used by +setup+ when the caller does not override them.
#
# Targets a local PostgreSQL database; the password is read from the
# RAGDOLL_DATABASE_PASSWORD environment variable (nil when unset). A fresh
# WARN-level logger writing to $stdout is built on every call.
#
# @return [Hash{Symbol => Object}] connection settings plus :auto_migrate
#   and :logger
def self.default_config
  warn_logger = Logger.new($stdout, level: Logger::WARN)

  connection_defaults = {
    adapter: "postgresql",
    database: "ragdoll_development",
    username: "ragdoll",
    password: ENV["RAGDOLL_DATABASE_PASSWORD"],
    host: "localhost",
    port: 5432
  }

  connection_defaults.merge(auto_migrate: true, logger: warn_logger)
end
135
+
136
# Directories that contain the gem's migrations.
#
# This file is lib/ragdoll/core/database.rb, so the gem root is three
# directories above it (the same computation +migrate!+ uses). The previous
# four-level climb pointed at the parent of the gem root.
#
# @return [Array<String>] a single absolute path to <gem root>/db/migrate
def self.migration_paths
  gem_root = File.expand_path(File.join(File.dirname(__FILE__), "..", "..", ".."))
  [File.join(gem_root, "db", "migrate")]
end
139
+ end
140
+ end
141
+ end
@@ -0,0 +1,110 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Ragdoll
4
+ module Core
5
+ class DocumentManagement
6
+ class << self
7
# Registers a document, skipping creation when an identical one exists.
#
# A location with an http(s):// or ftp(s):// scheme is stored as given;
# anything else is treated as a filesystem path and expanded to an
# absolute path. Only a real URL scheme counts as remote, so local files
# whose names merely begin with "http" or "ftp" are still expanded.
#
# A document is considered a duplicate when one already exists with the
# same location and file_modified_at.
#
# @param location [String] file path or URL of the document
# @param content [String, nil] text assigned through the model's content
#   setter when present
# @param metadata [Hash] optional attributes; :title / "title" and
#   :document_type / "document_type" are honoured. Non-Hash values are
#   treated as an empty hash.
# @return [String] id of the existing or newly created document
def add_document(location, content, metadata = {})
  meta = metadata.is_a?(Hash) ? metadata : {}
  remote = location.match?(%r{\A(?:https?|ftps?)://}i)
  absolute_location = remote ? location : File.expand_path(location)

  # Use the file's mtime for local files; fall back to "now" otherwise.
  file_modified_at = if !remote && File.exist?(absolute_location)
                       File.mtime(absolute_location)
                     else
                       Time.current
                     end

  existing_document = Models::Document.find_by(
    location: absolute_location,
    file_modified_at: file_modified_at
  )

  # Return existing document ID if found (skip duplicate)
  return existing_document.id.to_s if existing_document

  document = Models::Document.create!(
    location: absolute_location,
    title: meta[:title] || meta["title"] || extract_title_from_location(location),
    document_type: meta[:document_type] || meta["document_type"] || "text",
    metadata: meta,
    status: "pending",
    file_modified_at: file_modified_at
  )

  # Assign through the model's setter rather than passing it to create!.
  document.content = content if content.present?

  document.id.to_s
end
41
+
42
# Looks up a document and returns its hash form with the content added.
#
# @param id [Object] primary key of the document
# @return [Hash, nil] the document's +to_hash+ with a :content entry, or
#   nil when no document has that id
def get_document(id)
  record = Models::Document.find_by(id: id)
  return nil unless record

  record.to_hash.tap { |attributes| attributes[:content] = record.content }
end
50
+
51
# Updates the whitelisted attributes of a document.
#
# Only :title, :metadata, :status and :document_type are applied; any
# other keyword is ignored.
#
# @param id [Object] primary key of the document
# @param updates [Hash] attribute changes
# @return [Hash, nil] the document's +to_hash+, or nil when not found
def update_document(id, **updates)
  record = Models::Document.find_by(id: id)
  return nil unless record

  permitted = updates.slice(:title, :metadata, :status, :document_type)
  record.update!(permitted) unless permitted.empty?

  record.to_hash
end
61
+
62
# Destroys the document with the given id.
#
# @param id [Object] primary key of the document
# @return [true, nil] true after a successful destroy, nil when no
#   document has that id
def delete_document(id)
  record = Models::Document.find_by(id: id)
  return nil unless record

  record.destroy!
  true
end
69
+
70
# Returns a page of documents as hashes, using the model's +recent+ scope.
#
# @param options [Hash] :limit (default 100) and :offset (default 0)
# @return [Array<Hash>] each document's +to_hash+
def list_documents(options = {})
  page_size = options[:limit] || 100
  skip = options[:offset] || 0

  page = Models::Document.offset(skip).limit(page_size).recent
  page.map(&:to_hash)
end
76
+
77
# Returns aggregate statistics by delegating to Models::Document.stats.
#
# @return [Object] whatever Models::Document.stats returns
def get_document_stats
  document_model = Models::Document
  document_model.stats
end
80
+
81
+ # FIXME: should this be here?
82
+
83
# Stores one embedding vector for a chunk of a content record.
#
# The embeddable type is taken from metadata[:embeddable_type] when given;
# otherwise the content record is loaded and its concrete class name is
# used, so the STI subclass is recorded rather than the base class.
#
# @param embeddable_id [Object] id of the content record being embedded
# @param chunk_index [Integer] position of the chunk within the content
# @param embedding_vector [Object] the vector to store
# @param metadata [Hash] optional :embeddable_type and :content
# @return [String] id of the created embedding
def add_embedding(embeddable_id, chunk_index, embedding_vector, metadata = {})
  embeddable_type = metadata[:embeddable_type] ||
                    Models::Content.find(embeddable_id).class.name

  embedding = Models::Embedding.create!(
    embeddable_id: embeddable_id,
    embeddable_type: embeddable_type,
    chunk_index: chunk_index,
    embedding_vector: embedding_vector,
    content: metadata[:content] || ""
  )

  embedding.id.to_s
end
101
+
102
+ private
103
+
104
# Derives a default title: the last path segment without its extension.
#
# @param location [String] file path or URL
# @return [String] basename of +location+ with the extension removed
def extract_title_from_location(location)
  extension = File.extname(location)
  File.basename(location, extension)
end
107
+ end
108
+ end
109
+ end
110
+ end
@@ -0,0 +1,344 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "pdf-reader"
4
+ require "docx"
5
+ require "rmagick"
6
+ require_relative "services/image_description_service"
7
+
8
+ module Ragdoll
9
+ module Core
10
+ class DocumentProcessor
11
# Raised when a document cannot be parsed.
class ParseError < DocumentError
end

# ParseError subclass reserved for formats the processor cannot handle.
class UnsupportedFormatError < ParseError
end
13
+
14
# Convenience entry point: builds a processor for the path and parses it.
#
# @param file_path [String] path of the file to parse
# @return [Hash] the result of the instance-level +parse+
def self.parse(file_path)
  processor = new(file_path)
  processor.parse
end
17
+
18
# Parses a Shrine attached file by opening it and processing the
# resulting temporary file.
#
# @param attached_file [Object] attachment responding to +open+ with a
#   block that yields an object exposing +path+
# @return [Object] whatever +attached_file.open+ returns for the block
def self.parse_attachment(attached_file)
  attached_file.open do |io|
    processor = new(io.path, attached_file)
    processor.parse
  end
end
24
+
25
# Parses a file on disk and creates a processed Document for it.
#
# The file is attached to the document when it exists. It is opened with
# the block form of File.open so the handle is closed once the attachment
# has been assigned, instead of staying open until garbage collection.
#
# NOTE(review): the attachment is assigned after create! and no save is
# issued here — confirm the model persists it elsewhere.
#
# @param file_path [String] path of the file to import
# @param options [Hash] extra attributes forwarded to Document.create!;
#   they override the values computed here
# @return [Object] the created document
def self.create_document_from_file(file_path, **options)
  parsed = parse(file_path)

  file_present = File.exist?(file_path)
  file_modified_at = file_present ? File.mtime(file_path) : Time.current

  document = Models::Document.create!(
    location: File.expand_path(file_path),
    title: parsed[:title] || File.basename(file_path, File.extname(file_path)),
    content: parsed[:content],
    document_type: determine_document_type(file_path),
    metadata: parsed[:metadata] || {},
    status: "processed",
    file_modified_at: file_modified_at,
    **options
  )

  # Attach the file if it exists, closing the handle afterwards.
  File.open(file_path) { |io| document.file = io } if file_present

  document
end
48
+
49
# Creates a Document from an uploaded file (Shrine compatible).
#
# The document is created in "processing" state with empty content, the
# upload is attached, and the attachment is then parsed to fill in
# content, title and metadata before the status becomes "processed".
#
# @param uploaded_file [Object] upload responding to +original_filename+
#   and +mime_type+
# @param options [Hash] optional :title and :metadata
# @return [Object] the created document
def self.create_document_from_upload(uploaded_file, **options)
  filename = uploaded_file.original_filename || "uploaded_file"
  fallback_title = File.basename(filename, File.extname(filename))

  document = Models::Document.create!(
    location: filename,
    title: options[:title] || fallback_title,
    content: "", # Will be extracted after file attachment
    document_type: determine_document_type_from_content_type(uploaded_file.mime_type),
    status: "processing",
    metadata: options[:metadata] || {},
    file_modified_at: Time.current
  )

  document.file = uploaded_file
  return document unless document.file.present?

  # Extract content from the attached file and merge it into the record.
  parsed = parse_attachment(document.file)
  document.update!(
    content: parsed[:content],
    title: parsed[:title] || document.title,
    metadata: document.metadata.merge(parsed[:metadata] || {}),
    status: "processed"
  )

  document
end
79
+
80
# @param file_path [String] path of the file to parse
# @param attached_file [Object, nil] originating attachment, if any
def initialize(file_path, attached_file = nil)
  @file_extension = File.extname(file_path).downcase
  @attached_file = attached_file
  @file_path = file_path
end
85
+
86
# Dispatches to the parser matching the file extension.
#
# Unknown extensions are parsed as plain text. A ParseError raised by one
# of the format parsers is passed through untouched so its message is not
# wrapped a second time; any other StandardError is converted to a
# ParseError that names the file.
#
# @return [Hash] :content, :metadata and :document_type
# @raise [ParseError] when the file cannot be parsed
def parse
  case @file_extension
  when ".pdf"
    parse_pdf
  when ".docx"
    parse_docx
  when ".txt", ".md", ".markdown"
    parse_text
  when ".html", ".htm"
    parse_html
  when ".jpg", ".jpeg", ".png", ".gif", ".bmp", ".webp", ".svg", ".ico", ".tiff", ".tif"
    parse_image
  else
    parse_text # Default to text parsing for unknown formats
  end
rescue ParseError
  # Already descriptive; re-raise as is.
  raise
rescue StandardError => e
  raise ParseError, "Failed to parse #{@file_path}: #{e.message}"
end
104
+
105
+ private
106
+
107
# Extracts text and document info from a PDF.
#
# Blank pages are skipped. Every page after the first non-blank one is
# preceded by a "--- Page N ---" separator, where N is the page's position
# in the document.
#
# @return [Hash] :content, :metadata and :document_type ("pdf")
# @raise [ParseError] for malformed PDFs or unsupported PDF features
def parse_pdf
  text = +""
  metadata = {}

  begin
    PDF::Reader.open(@file_path) do |reader|
      info = reader.info
      if info
        # Metadata key => PDF info dictionary key.
        {
          title: :Title,
          author: :Author,
          subject: :Subject,
          creator: :Creator,
          producer: :Producer,
          creation_date: :CreationDate,
          modification_date: :ModDate
        }.each do |target, source|
          value = info[source]
          metadata[target] = value if value
        end
      end

      metadata[:page_count] = reader.page_count

      reader.pages.each_with_index do |page, position|
        page_text = page.text.strip
        next if page_text.empty?

        text << "\n\n--- Page #{position + 1} ---\n\n" unless text.empty?
        text << page_text
      end
    end
  rescue PDF::Reader::MalformedPDFError => e
    raise ParseError, "Malformed PDF: #{e.message}"
  rescue PDF::Reader::UnsupportedFeatureError => e
    raise ParseError, "Unsupported PDF feature: #{e.message}"
  end

  { content: text.strip, metadata: metadata, document_type: "pdf" }
end
147
+
148
# Extracts text, tables and core properties from a DOCX file.
#
# Non-blank paragraphs are emitted first, each followed by a blank line.
# Every table follows under a "--- Table N ---" heading with cells joined
# by " | ". Text is accumulated in one mutable buffer rather than being
# rebuilt with += on every paragraph and row.
#
# @return [Hash] :content, :metadata and :document_type ("docx")
# @raise [ParseError] when the document cannot be opened or read
def parse_docx
  text = +""
  metadata = {}

  begin
    doc = Docx::Document.open(@file_path)

    properties = doc.core_properties
    if properties
      # Metadata key => core-properties reader.
      {
        title: :title,
        author: :creator,
        subject: :subject,
        description: :description,
        keywords: :keywords,
        created: :created,
        modified: :modified,
        last_modified_by: :last_modified_by
      }.each do |target, reader|
        value = properties.public_send(reader)
        metadata[target] = value if value
      end
    end

    doc.paragraphs.each do |paragraph|
      paragraph_text = paragraph.text.strip
      text << "#{paragraph_text}\n\n" unless paragraph_text.empty?
    end

    doc.tables.each_with_index do |table, table_index|
      text << "\n--- Table #{table_index + 1} ---\n\n"

      table.rows.each do |row|
        row_text = row.cells.map(&:text).join(" | ")
        text << "#{row_text}\n" unless row_text.strip.empty?
      end

      text << "\n"
    end

    metadata[:paragraph_count] = doc.paragraphs.count
    metadata[:table_count] = doc.tables.count
  rescue StandardError => e
    # The message no longer carries a source line number prefix.
    raise ParseError, "Failed to parse DOCX: #{e.message}"
  end

  { content: text.strip, metadata: metadata, document_type: "docx" }
end
202
+
203
# Reads a plain-text or Markdown file.
#
# File.read with an :encoding option only labels the bytes; it does not
# validate them and so never raises for invalid UTF-8. The content is
# therefore checked with valid_encoding?. Bytes that are not valid UTF-8
# are reinterpreted as ISO-8859-1 and transcoded, so the returned content
# is always valid UTF-8. metadata[:encoding] reports the encoding the file
# was read as.
#
# @return [Hash] :content, :metadata (:file_size, :encoding) and
#   :document_type ("markdown" for .md/.markdown, otherwise "text")
def parse_text
  content = File.read(@file_path, encoding: "UTF-8")
  source_encoding = "UTF-8"

  unless content.valid_encoding?
    # Every byte sequence is valid ISO-8859-1, so this fallback cannot fail.
    content = content.force_encoding(Encoding::ISO_8859_1).encode(Encoding::UTF_8)
    source_encoding = "ISO-8859-1"
  end

  document_type = %w[.md .markdown].include?(@file_extension) ? "markdown" : "text"

  {
    content: content,
    metadata: {
      file_size: File.size(@file_path),
      encoding: source_encoding
    },
    document_type: document_type
  }
end
235
+
236
# Reads an HTML file and reduces it to plain text with regex-based
# stripping (no real HTML parser is involved).
#
# @return [Hash] :content (tag-free, whitespace-normalised text),
#   :metadata (:file_size, :original_format) and :document_type ("html")
def parse_html
  raw_html = File.read(@file_path, encoding: "UTF-8")

  # Applied in order: drop script and style elements with their bodies,
  # turn every remaining tag into a space, then collapse whitespace runs.
  substitutions = [
    [%r{<script[^>]*>.*?</script>}mi, ""],
    [%r{<style[^>]*>.*?</style>}mi, ""],
    [/<[^>]+>/, " "],
    [/\s+/, " "]
  ]

  plain_text = substitutions.reduce(raw_html) do |markup, (pattern, replacement)|
    markup.gsub(pattern, replacement)
  end.strip

  {
    content: plain_text,
    metadata: { file_size: File.size(@file_path), original_format: "html" },
    document_type: "html"
  }
end
258
+
259
# Builds a document entry for an image: file facts, pixel dimensions and
# an AI-generated description used as the searchable content.
#
# Progress is printed to standard output. Failure to read the dimensions
# is tolerated (width and height become nil). When no description is
# produced, a placeholder naming the file is used as content.
#
# @return [Hash] :content, :metadata and :document_type ("image")
def parse_image
  puts "🖼️ DocumentProcessor: Starting image parsing for #{@file_path}"

  metadata = {
    file_size: File.size(@file_path),
    file_type: @file_extension.sub(".", ""),
    original_filename: File.basename(@file_path)
  }

  # Extract image dimensions; best effort only.
  begin
    image = Magick::Image.read(@file_path).first
    metadata[:width] = image.columns
    metadata[:height] = image.rows
    puts "📏 DocumentProcessor: Image dimensions: #{image.columns}x#{image.rows}"
  rescue StandardError => e
    puts "❌ DocumentProcessor: Failed to get image dimensions: #{e.message}"
    metadata[:width] = nil
    metadata[:height] = nil
  end

  puts "🤖 DocumentProcessor: Creating ImageDescriptionService and calling generate_description..."
  describer = Services::ImageDescriptionService.new
  desc = describer.generate_description(@file_path)

  puts "📝 DocumentProcessor: Received description: '#{desc}'"

  described = desc && !desc.empty?
  metadata[:description] = desc if described

  # Use AI-generated description or fallback placeholder
  content = if described
              desc
            else
              "Image file: #{File.basename(@file_path)}"
            end

  puts "✅ DocumentProcessor: Image parsing complete. Content: '#{content[0..100]}...'"

  { content: content, metadata: metadata, document_type: "image" }
end
298
+
299
+ # Helper methods for document type determination
300
# Maps a file's extension (case-insensitive) to a document type name.
#
# @param file_path [String] path or filename
# @return [String] "pdf", "docx", "text", "markdown", "html" or "image";
#   unknown extensions map to "text"
def self.determine_document_type(file_path)
  extension = File.extname(file_path).downcase

  image_extensions = %w[.jpg .jpeg .png .gif .bmp .webp .svg .ico .tiff .tif]
  return "image" if image_extensions.include?(extension)

  {
    ".pdf" => "pdf",
    ".docx" => "docx",
    ".txt" => "text",
    ".md" => "markdown",
    ".markdown" => "markdown",
    ".html" => "html",
    ".htm" => "html"
  }.fetch(extension, "text")
end
311
+
312
# Maps a MIME type to a document type name.
#
# Exact matches are looked up first; any type matching "image/" at the
# start of a line maps to "image"; everything else (including nil) maps
# to "text".
#
# @param content_type [String, nil] MIME type of the upload
# @return [String] "pdf", "docx", "text", "markdown", "html" or "image"
def self.determine_document_type_from_content_type(content_type)
  exact_matches = {
    "application/pdf" => "pdf",
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document" => "docx",
    "text/plain" => "text",
    "text/markdown" => "markdown",
    "text/html" => "html"
  }

  exact_matches.fetch(content_type) do
    case content_type
    when %r{^image/} then "image"
    else "text"
    end
  end
end
323
+
324
# Maps a file's extension (case-insensitive) to a MIME type.
#
# @param file_path [String] path or filename
# @return [String] the MIME type, or "application/octet-stream" when the
#   extension is not recognised
def self.determine_content_type(file_path)
  mime_by_extension = {
    ".pdf" => "application/pdf",
    ".docx" => "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    ".txt" => "text/plain",
    ".md" => "text/markdown",
    ".markdown" => "text/markdown",
    ".html" => "text/html",
    ".htm" => "text/html",
    ".jpg" => "image/jpeg",
    ".jpeg" => "image/jpeg",
    ".png" => "image/png",
    ".gif" => "image/gif",
    ".webp" => "image/webp",
    ".bmp" => "image/bmp",
    ".svg" => "image/svg+xml",
    ".ico" => "image/x-icon",
    ".tiff" => "image/tiff",
    ".tif" => "image/tiff"
  }

  extension = File.extname(file_path).downcase
  mime_by_extension.fetch(extension, "application/octet-stream")
end
342
+ end
343
+ end
344
+ end