ragdoll 0.1.0 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. checksums.yaml +4 -4
  2. data/README.md +318 -40
  3. data/Rakefile +66 -4
  4. data/app/jobs/ragdoll/extract_keywords_job.rb +28 -0
  5. data/app/jobs/ragdoll/extract_text_job.rb +38 -0
  6. data/app/jobs/ragdoll/generate_embeddings_job.rb +28 -0
  7. data/app/jobs/ragdoll/generate_summary_job.rb +25 -0
  8. data/app/lib/ragdoll/metadata_schemas.rb +332 -0
  9. data/app/models/ragdoll/audio_content.rb +142 -0
  10. data/app/models/ragdoll/content.rb +95 -0
  11. data/app/models/ragdoll/document.rb +606 -4
  12. data/app/models/ragdoll/embedding.rb +172 -5
  13. data/app/models/ragdoll/image_content.rb +194 -0
  14. data/app/models/ragdoll/text_content.rb +137 -0
  15. data/app/services/ragdoll/configuration_service.rb +113 -0
  16. data/app/services/ragdoll/document_management.rb +108 -0
  17. data/app/services/ragdoll/document_processor.rb +342 -0
  18. data/app/services/ragdoll/embedding_service.rb +202 -0
  19. data/app/services/ragdoll/image_description_service.rb +230 -0
  20. data/app/services/ragdoll/metadata_generator.rb +329 -0
  21. data/app/services/ragdoll/model_resolver.rb +72 -0
  22. data/app/services/ragdoll/search_engine.rb +51 -0
  23. data/app/services/ragdoll/text_chunker.rb +208 -0
  24. data/app/services/ragdoll/text_generation_service.rb +355 -0
  25. data/db/migrate/001_enable_postgresql_extensions.rb +23 -0
  26. data/db/migrate/004_create_ragdoll_documents.rb +70 -0
  27. data/db/migrate/005_create_ragdoll_embeddings.rb +41 -0
  28. data/db/migrate/006_create_ragdoll_contents.rb +47 -0
  29. data/lib/ragdoll/core/client.rb +306 -0
  30. data/lib/ragdoll/core/configuration.rb +257 -0
  31. data/lib/ragdoll/core/database.rb +141 -0
  32. data/lib/ragdoll/core/errors.rb +11 -0
  33. data/lib/ragdoll/core/model.rb +45 -0
  34. data/lib/ragdoll/core/shrine_config.rb +71 -0
  35. data/lib/ragdoll/core/version.rb +8 -0
  36. data/lib/ragdoll/core.rb +91 -0
  37. data/lib/ragdoll-core.rb +3 -0
  38. data/lib/ragdoll.rb +243 -6
  39. data/lib/tasks/annotate.rake +126 -0
  40. data/lib/tasks/db.rake +338 -0
  41. metadata +42 -35
  42. data/config/initializers/ragdoll.rb +0 -6
  43. data/config/routes.rb +0 -5
  44. data/db/migrate/20250218123456_create_documents.rb +0 -20
  45. data/lib/config/database.yml +0 -28
  46. data/lib/config/ragdoll.yml +0 -31
  47. data/lib/ragdoll/engine.rb +0 -16
  48. data/lib/ragdoll/import_job.rb +0 -15
  49. data/lib/ragdoll/ingestion.rb +0 -30
  50. data/lib/ragdoll/search.rb +0 -18
  51. data/lib/ragdoll/version.rb +0 -7
  52. data/lib/tasks/import_task.thor +0 -32
  53. data/lib/tasks/jobs_task.thor +0 -40
  54. data/lib/tasks/ragdoll_tasks.thor +0 -7
  55. data/lib/tasks/search_task.thor +0 -55
data/app/lib/ragdoll/metadata_schemas.rb
@@ -0,0 +1,332 @@
+ # frozen_string_literal: true
+
+ module Ragdoll
+   # Document metadata schemas for LLM structured output
+   # Each document type has a specific schema that guides LLM generation
+   module MetadataSchemas
+     # Text document metadata schema
+     TEXT_SCHEMA = {
+       type: "object",
+       properties: {
+         summary: {
+           type: "string",
+           description: "Concise summary of the text content (2-3 paragraphs)"
+         },
+         keywords: {
+           type: "array",
+           items: { type: "string" },
+           description: "Relevant keywords and phrases extracted from the text",
+           maxItems: 10
+         },
+         classification: {
+           type: "string",
+           enum: %w[research article blog documentation technical legal financial marketing other],
+           description: "Document classification category"
+         },
+         topics: {
+           type: "array",
+           items: { type: "string" },
+           description: "Main topics discussed in the document",
+           maxItems: 5
+         },
+         sentiment: {
+           type: "string",
+           enum: %w[positive negative neutral mixed],
+           description: "Overall sentiment of the text"
+         },
+         reading_time_minutes: {
+           type: "integer",
+           description: "Estimated reading time in minutes"
+         },
+         language: {
+           type: "string",
+           description: "Primary language of the text (ISO 639-1 code)"
+         },
+         complexity_level: {
+           type: "string",
+           enum: %w[beginner intermediate advanced expert],
+           description: "Complexity/difficulty level of the content"
+         },
+         tags: {
+           type: "array",
+           items: { type: "string" },
+           description: "User-defined or AI-suggested tags for organization"
+         }
+       },
+       required: %w[summary keywords classification]
+     }.freeze
+
+     # Image document metadata schema
+     IMAGE_SCHEMA = {
+       type: "object",
+       properties: {
+         description: {
+           type: "string",
+           description: "Detailed description of what is shown in the image"
+         },
+         summary: {
+           type: "string",
+           description: "Brief summary of the image content (1 paragraph)"
+         },
+         objects: {
+           type: "array",
+           items: { type: "string" },
+           description: "List of objects, people, or items visible in the image",
+           maxItems: 15
+         },
+         scene_type: {
+           type: "string",
+           enum: %w[indoor outdoor portrait landscape diagram chart screenshot artwork photo other],
+           description: "Type of scene or image category"
+         },
+         colors: {
+           type: "array",
+           items: { type: "string" },
+           description: "Dominant colors in the image",
+           maxItems: 5
+         },
+         style: {
+           type: "string",
+           enum: %w[photograph illustration diagram chart screenshot artwork technical drawing other],
+           description: "Visual style or format of the image"
+         },
+         mood: {
+           type: "string",
+           enum: %w[professional casual formal technical artistic dramatic serene energetic other],
+           description: "Overall mood or tone of the image"
+         },
+         text_content: {
+           type: "string",
+           description: "Any visible text in the image (OCR extracted)"
+         },
+         keywords: {
+           type: "array",
+           items: { type: "string" },
+           description: "Relevant keywords for image search and categorization",
+           maxItems: 10
+         },
+         classification: {
+           type: "string",
+           enum: %w[technical diagram photo artwork chart screenshot document other],
+           description: "Image classification category"
+         },
+         tags: {
+           type: "array",
+           items: { type: "string" },
+           description: "User-defined or AI-suggested tags for organization"
+         }
+       },
+       required: %w[description summary scene_type classification]
+     }.freeze
+
+     # Audio document metadata schema
+     AUDIO_SCHEMA = {
+       type: "object",
+       properties: {
+         summary: {
+           type: "string",
+           description: "Summary of audio content (speech transcript summary or music description)"
+         },
+         content_type: {
+           type: "string",
+           enum: %w[speech music podcast interview lecture presentation sound_effect meeting other],
+           description: "Type of audio content"
+         },
+         keywords: {
+           type: "array",
+           items: { type: "string" },
+           description: "Relevant keywords extracted from transcript or describing music",
+           maxItems: 10
+         },
+         classification: {
+           type: "string",
+           enum: %w[educational entertainment business technical musical interview podcast other],
+           description: "Audio content classification"
+         },
+         topics: {
+           type: "array",
+           items: { type: "string" },
+           description: "Main topics discussed (for speech) or musical elements (for music)",
+           maxItems: 5
+         },
+         language: {
+           type: "string",
+           description: "Language of speech content (ISO 639-1 code) or N/A for music"
+         },
+         speakers: {
+           type: "array",
+           items: { type: "string" },
+           description: "Number or names of speakers (for speech content)",
+           maxItems: 10
+         },
+         mood: {
+           type: "string",
+           enum: %w[formal casual energetic calm professional educational entertaining informative other],
+           description: "Overall mood or tone of the audio"
+         },
+         genre: {
+           type: "string",
+           description: "Music genre (for musical content) or speech type (for spoken content)"
+         },
+         key_quotes: {
+           type: "array",
+           items: { type: "string" },
+           description: "Important quotes or phrases from speech content",
+           maxItems: 3
+         },
+         tags: {
+           type: "array",
+           items: { type: "string" },
+           description: "User-defined or AI-suggested tags for organization"
+         }
+       },
+       required: %w[summary content_type classification]
+     }.freeze
+
+     # PDF document metadata schema (combines text analysis with document structure)
+     PDF_SCHEMA = {
+       type: "object",
+       properties: {
+         summary: {
+           type: "string",
+           description: "Summary of the PDF document content"
+         },
+         document_type: {
+           type: "string",
+           enum: %w[research_paper report manual presentation legal financial technical academic other],
+           description: "Type of PDF document"
+         },
+         keywords: {
+           type: "array",
+           items: { type: "string" },
+           description: "Keywords extracted from the document text",
+           maxItems: 15
+         },
+         classification: {
+           type: "string",
+           enum: %w[academic business legal technical manual report presentation other],
+           description: "Document classification category"
+         },
+         topics: {
+           type: "array",
+           items: { type: "string" },
+           description: "Main topics covered in the document",
+           maxItems: 8
+         },
+         structure: {
+           type: "object",
+           properties: {
+             has_table_of_contents: { type: "boolean" },
+             has_bibliography: { type: "boolean" },
+             has_figures: { type: "boolean" },
+             has_tables: { type: "boolean" },
+             estimated_pages: { type: "integer" }
+           }
+         },
+         reading_time_minutes: {
+           type: "integer",
+           description: "Estimated reading time in minutes"
+         },
+         complexity_level: {
+           type: "string",
+           enum: %w[beginner intermediate advanced expert],
+           description: "Complexity level of the content"
+         },
+         language: {
+           type: "string",
+           description: "Primary language of the document"
+         },
+         tags: {
+           type: "array",
+           items: { type: "string" },
+           description: "User-defined or AI-suggested tags for organization"
+         }
+       },
+       required: %w[summary document_type classification]
+     }.freeze
+
+     # Mixed/multi-modal document metadata schema
+     MIXED_SCHEMA = {
+       type: "object",
+       properties: {
+         summary: {
+           type: "string",
+           description: "Overall summary combining all content types in the document"
+         },
+         content_types: {
+           type: "array",
+           items: { type: "string", enum: %w[text image audio] },
+           description: "Types of content present in this multi-modal document"
+         },
+         primary_content_type: {
+           type: "string",
+           enum: %w[text image audio],
+           description: "The primary or dominant content type"
+         },
+         keywords: {
+           type: "array",
+           items: { type: "string" },
+           description: "Keywords extracted from all content types",
+           maxItems: 15
+         },
+         classification: {
+           type: "string",
+           enum: %w[multimedia_presentation research educational marketing technical training other],
+           description: "Multi-modal document classification"
+         },
+         topics: {
+           type: "array",
+           items: { type: "string" },
+           description: "Main topics across all content types",
+           maxItems: 8
+         },
+         cohesion_analysis: {
+           type: "string",
+           description: "How well the different content types work together"
+         },
+         tags: {
+           type: "array",
+           items: { type: "string" },
+           description: "User-defined or AI-suggested tags for organization"
+         }
+       },
+       required: %w[summary content_types primary_content_type classification]
+     }.freeze
+
+     # Get schema for document type
+     def self.schema_for(document_type)
+       case document_type.to_s.downcase
+       when "text", "markdown", "html"
+         TEXT_SCHEMA
+       when "image"
+         IMAGE_SCHEMA
+       when "audio"
+         AUDIO_SCHEMA
+       when "pdf", "docx"
+         PDF_SCHEMA
+       when "mixed"
+         MIXED_SCHEMA
+       else
+         TEXT_SCHEMA # fallback
+       end
+     end
+
+     # Get required fields for document type
+     def self.required_fields_for(document_type)
+       schema_for(document_type)[:required] || []
+     end
+
+     # Validate metadata against schema
+     def self.validate_metadata(document_type, metadata)
+       schema = schema_for(document_type)
+       required_fields = schema[:required] || []
+
+       errors = []
+       required_fields.each do |field|
+         errors << "Missing required field: #{field}" unless metadata.key?(field)
+       end
+
+       errors
+     end
+   end
+ end
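
A quick usage sketch of the new MetadataSchemas module, based only on the code above; the metadata hash keys and values are illustrative, not taken from the package docs:

    schema = Ragdoll::MetadataSchemas.schema_for("pdf")
    schema[:required]
    # => ["summary", "document_type", "classification"]

    # validate_metadata only checks key presence, so use string keys to match the %w[] field names.
    metadata = { "summary" => "Quarterly report", "document_type" => "report" }
    Ragdoll::MetadataSchemas.validate_metadata("pdf", metadata)
    # => ["Missing required field: classification"]
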
data/app/models/ragdoll/audio_content.rb
@@ -0,0 +1,142 @@
+ # frozen_string_literal: true
+
+ require "active_record"
+ require_relative "content"
+
+ module Ragdoll
+   class AudioContent < Content
+     validate :audio_data_or_transcript_present
+     validates :duration, numericality: { greater_than: 0 }, allow_nil: true
+     validates :sample_rate, numericality: { greater_than: 0 }, allow_nil: true
+
+     scope :recent, -> { order(created_at: :desc) }
+     scope :with_audio, -> { where.not(data: [nil, ""]) }
+     scope :with_transcripts, -> { where.not(content: [nil, ""]) }
+     scope :by_duration, lambda { |min_duration, max_duration = nil|
+       scope = where("duration >= ?", min_duration)
+       scope = scope.where("duration <= ?", max_duration) if max_duration
+       scope
+     }
+
+     # Audio content accessors - content field stores transcript for embedding
+     def transcript
+       content
+     end
+
+     def transcript=(value)
+       self.content = value
+     end
+
+     # Audio file data accessor
+     def audio_data
+       data
+     end
+
+     def audio_data=(value)
+       self.data = value
+     end
+
+     # Audio file technical properties (stored in content metadata - raw file data)
+     def audio_attached?
+       data.present?
+     end
+
+     def audio_size
+       metadata.dig("file_size") || 0
+     end
+
+     def audio_size=(value)
+       self.metadata = metadata.merge("file_size" => value)
+     end
+
+     def audio_content_type
+       metadata.dig("content_type")
+     end
+
+     def audio_content_type=(value)
+       self.metadata = metadata.merge("content_type" => value)
+     end
+
+     def audio_filename
+       metadata.dig("filename")
+     end
+
+     def audio_filename=(value)
+       self.metadata = metadata.merge("filename" => value)
+     end
+
+     # Audio format and technical details
+     def codec
+       metadata.dig("codec")
+     end
+
+     def codec=(value)
+       self.metadata = metadata.merge("codec" => value)
+     end
+
+     def bitrate
+       metadata.dig("bitrate")
+     end
+
+     def bitrate=(value)
+       self.metadata = metadata.merge("bitrate" => value)
+     end
+
+     def channels
+       metadata.dig("channels")
+     end
+
+     def channels=(value)
+       self.metadata = metadata.merge("channels" => value)
+     end
+
+     def duration_formatted
+       return "Unknown" unless duration
+
+       minutes = (duration / 60).floor
+       seconds = (duration % 60).round
+       "#{minutes}:#{seconds.to_s.rjust(2, '0')}"
+     end
+
+     # Override content for embedding to use transcript
+     def content_for_embedding
+       transcript.presence || "Audio content without transcript"
+     end
+
+     def generate_embeddings!
+       return unless should_generate_embeddings?
+
+       embedding_content = content_for_embedding
+       return if embedding_content.blank?
+
+       # Generate embeddings using the base class method
+       super
+     end
+
+     # Override should_generate_embeddings to check for transcript
+     def should_generate_embeddings?
+       content_for_embedding.present? && embeddings.empty?
+     end
+
+     def self.stats
+       {
+         total_audio_contents: count,
+         by_model: group(:embedding_model).count,
+         total_embeddings: joins(:embeddings).count,
+         with_audio: with_audio.count,
+         with_transcripts: with_transcripts.count,
+         total_duration: sum(:duration),
+         average_duration: average(:duration),
+         average_audio_size: joins(:audio_attachment).average("active_storage_blobs.byte_size")
+       }
+     end
+
+     private
+
+     def audio_data_or_transcript_present
+       return if audio_attached? || transcript.present?
+
+       errors.add(:base, "Must have either audio data or transcript")
+     end
+   end
+ end
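
A minimal sketch of how the transcript-backed AudioContent model might be exercised, assuming document is an existing Ragdoll::Document record and an embedding provider is configured; the attribute values and model name are illustrative:

    audio = Ragdoll::AudioContent.new(
      document: document,
      embedding_model: "text-embedding-3-small",  # assumption: any configured embedding model
      duration: 125.0,
      transcript: "Welcome to the ragdoll walkthrough..."
    )
    audio.duration_formatted     # => "2:05"
    audio.content_for_embedding  # => the transcript text
    audio.save!
    audio.generate_embeddings!   # chunks the transcript via the base Content pipeline
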
data/app/models/ragdoll/content.rb
@@ -0,0 +1,95 @@
+ # frozen_string_literal: true
+
+ require "active_record"
+
+ module Ragdoll
+   class Content < ActiveRecord::Base
+     self.table_name = "ragdoll_contents"
+
+     belongs_to :document,
+                class_name: "Ragdoll::Document",
+                foreign_key: "document_id"
+
+     has_many :embeddings,
+              class_name: "Ragdoll::Embedding",
+              as: :embeddable,
+              dependent: :destroy
+
+     validates :type, presence: true
+     validates :embedding_model, presence: true
+     validates :document_id, presence: true
+
+     # JSON columns are handled natively by PostgreSQL
+
+     scope :by_type, ->(content_type) { where(type: content_type) }
+     scope :with_embeddings, -> { joins(:embeddings).distinct }
+     scope :without_embeddings, -> { left_joins(:embeddings).where(embeddings: { id: nil }) }
+
+     # Generate embeddings for this content
+     def generate_embeddings!
+       return unless should_generate_embeddings?
+
+       embedding_content = content_for_embedding
+       return if embedding_content.blank?
+
+       # Clear existing embeddings
+       embeddings.destroy_all
+
+       # Use TextChunker to split content into chunks
+       chunks = Ragdoll::TextChunker.chunk(embedding_content)
+
+       # Generate embeddings for each chunk
+       embedding_service = Ragdoll::EmbeddingService.new
+
+       chunks.each_with_index do |chunk_text, index|
+         begin
+           vector = embedding_service.generate_embedding(chunk_text)
+
+           embeddings.create!(
+             content: chunk_text,
+             embedding_vector: vector,
+             chunk_index: index
+           )
+         rescue StandardError => e
+           puts "Failed to generate embedding for chunk #{index}: #{e.message}"
+         end
+       end
+
+       update!(metadata: metadata.merge("embeddings_generated_at" => Time.current))
+     end
+
+     # Content to use for embedding generation (overridden by subclasses)
+     def content_for_embedding
+       content
+     end
+
+     # Whether this content should generate embeddings
+     def should_generate_embeddings?
+       content_for_embedding.present? && embeddings.empty?
+     end
+
+     # Statistics
+     def word_count
+       return 0 unless content.present?
+       content.split(/\s+/).length
+     end
+
+     def character_count
+       content&.length || 0
+     end
+
+     def embedding_count
+       embeddings.count
+     end
+
+     # Search within this content type
+     def self.search_content(query, **options)
+       return none if query.blank?
+
+       where(
+         "to_tsvector('english', COALESCE(content, '')) @@ plainto_tsquery('english', ?)",
+         query
+       ).limit(options[:limit] || 20)
+     end
+   end
+ end
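
The base Content class centralizes the chunk-and-embed pipeline and a PostgreSQL full-text search helper shared by every content subtype. A short sketch of how the scopes and search might be used, assuming Ragdoll::TextContent (also added in this release) and a configured EmbeddingService; the query string is illustrative:

    # Backfill embeddings for any text content that has none yet.
    Ragdoll::TextContent.without_embeddings.find_each(&:generate_embeddings!)

    # Full-text search backed by to_tsvector/plainto_tsquery.
    results = Ragdoll::Content.search_content("vector embeddings", limit: 10)
    results.each { |c| puts "#{c.type}: #{c.word_count} words, #{c.embedding_count} embeddings" }
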