ragdoll 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. checksums.yaml +4 -4
  2. data/README.md +318 -40
  3. data/Rakefile +15 -4
  4. data/db/migrate/001_enable_postgresql_extensions.rb +23 -0
  5. data/db/migrate/004_create_ragdoll_documents.rb +70 -0
  6. data/db/migrate/005_create_ragdoll_embeddings.rb +41 -0
  7. data/db/migrate/006_create_ragdoll_contents.rb +47 -0
  8. data/lib/ragdoll/core/client.rb +315 -0
  9. data/lib/ragdoll/core/configuration.rb +273 -0
  10. data/lib/ragdoll/core/database.rb +141 -0
  11. data/lib/ragdoll/core/document_management.rb +110 -0
  12. data/lib/ragdoll/core/document_processor.rb +344 -0
  13. data/lib/ragdoll/core/embedding_service.rb +183 -0
  14. data/lib/ragdoll/core/errors.rb +11 -0
  15. data/lib/ragdoll/core/jobs/extract_keywords.rb +32 -0
  16. data/lib/ragdoll/core/jobs/extract_text.rb +42 -0
  17. data/lib/ragdoll/core/jobs/generate_embeddings.rb +32 -0
  18. data/lib/ragdoll/core/jobs/generate_summary.rb +29 -0
  19. data/lib/ragdoll/core/metadata_schemas.rb +334 -0
  20. data/lib/ragdoll/core/models/audio_content.rb +175 -0
  21. data/lib/ragdoll/core/models/content.rb +126 -0
  22. data/lib/ragdoll/core/models/document.rb +678 -0
  23. data/lib/ragdoll/core/models/embedding.rb +204 -0
  24. data/lib/ragdoll/core/models/image_content.rb +227 -0
  25. data/lib/ragdoll/core/models/text_content.rb +169 -0
  26. data/lib/ragdoll/core/search_engine.rb +50 -0
  27. data/lib/ragdoll/core/services/image_description_service.rb +230 -0
  28. data/lib/ragdoll/core/services/metadata_generator.rb +335 -0
  29. data/lib/ragdoll/core/shrine_config.rb +71 -0
  30. data/lib/ragdoll/core/text_chunker.rb +210 -0
  31. data/lib/ragdoll/core/text_generation_service.rb +360 -0
  32. data/lib/ragdoll/core/version.rb +8 -0
  33. data/lib/ragdoll/core.rb +73 -0
  34. data/lib/ragdoll-core.rb +3 -0
  35. data/lib/ragdoll.rb +243 -6
  36. data/lib/tasks/annotate.rake +126 -0
  37. data/lib/tasks/db.rake +338 -0
  38. metadata +40 -37
  39. data/app/models/ragdoll/document.rb +0 -9
  40. data/app/models/ragdoll/embedding.rb +0 -9
  41. data/config/initializers/ragdoll.rb +0 -6
  42. data/config/routes.rb +0 -5
  43. data/db/migrate/20250218123456_create_documents.rb +0 -20
  44. data/lib/config/database.yml +0 -28
  45. data/lib/config/ragdoll.yml +0 -31
  46. data/lib/ragdoll/engine.rb +0 -16
  47. data/lib/ragdoll/import_job.rb +0 -15
  48. data/lib/ragdoll/ingestion.rb +0 -30
  49. data/lib/ragdoll/search.rb +0 -18
  50. data/lib/ragdoll/version.rb +0 -7
  51. data/lib/tasks/import_task.thor +0 -32
  52. data/lib/tasks/jobs_task.thor +0 -40
  53. data/lib/tasks/ragdoll_tasks.thor +0 -7
  54. data/lib/tasks/search_task.thor +0 -55
@@ -0,0 +1,183 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "ruby_llm"
4
+
5
+ module Ragdoll
6
+ module Core
7
+ class EmbeddingService
8
+ def initialize(client: nil)
9
+ @client = client
10
+ configure_ruby_llm unless @client
11
+ end
12
+
13
+ def generate_embedding(text)
14
+ return nil if text.nil? || text.strip.empty?
15
+
16
+ # Clean and prepare text
17
+ cleaned_text = clean_text(text)
18
+
19
+ begin
20
+ if @client
21
+ # Use custom client for testing
22
+ # FIXME: embedding_model is not in current config structure
23
+ response = @client.embed(
24
+ input: cleaned_text,
25
+ model: Ragdoll.config.models[:embedding][:text]
26
+ )
27
+
28
+ if response && response["embeddings"]&.first
29
+ response["embeddings"].first
30
+ elsif response && response["data"]&.first && response["data"].first["embedding"]
31
+ response["data"].first["embedding"]
32
+ else
33
+ raise EmbeddingError, "Invalid response format from embedding API"
34
+ end
35
+ else
36
+ # Use RubyLLM for real embedding generation
37
+ model = Ragdoll.config.models[:embedding][:text] || 'text-embedding-3-small'
38
+ response = RubyLLM.embed(cleaned_text, model: model)
39
+
40
+ # Extract the embedding vector from RubyLLM::Embedding object
41
+ if response.respond_to?(:instance_variable_get)
42
+ vectors = response.instance_variable_get(:@vectors)
43
+ if vectors && vectors.is_a?(Array)
44
+ vectors
45
+ else
46
+ raise EmbeddingError, "No vectors found in RubyLLM response"
47
+ end
48
+ else
49
+ raise EmbeddingError, "Unexpected response type from RubyLLM: #{response.class}"
50
+ end
51
+ end
52
+ rescue StandardError => e
53
+ raise EmbeddingError, "Failed to generate embedding: #{e.message}"
54
+ end
55
+ end
56
+
57
+ def generate_embeddings_batch(texts)
58
+ return [] if texts.empty?
59
+
60
+ # Clean all texts
61
+ cleaned_texts = texts.map { |text| clean_text(text) }.reject { |t| t.nil? || t.strip.empty? }
62
+ return [] if cleaned_texts.empty?
63
+
64
+ begin
65
+ if @client
66
+ # Use custom client for testing
67
+ # FIXME: embedding_model is not in current config structure
68
+ response = @client.embed(
69
+ input: cleaned_texts,
70
+ model: Ragdoll.config.models[:embedding][:text]
71
+ )
72
+
73
+ if response && response["embeddings"]
74
+ response["embeddings"]
75
+ elsif response && response["data"]
76
+ response["data"].map { |item| item["embedding"] }
77
+ else
78
+ raise EmbeddingError, "Invalid response format from embedding API"
79
+ end
80
+ else
81
+ # Use RubyLLM for real embedding generation (batch mode)
82
+ model = Ragdoll.config.models[:embedding][:text] || 'text-embedding-3-small'
83
+
84
+ cleaned_texts.map do |text|
85
+ response = RubyLLM.embed(text, model: model)
86
+
87
+ # Extract the embedding vector from RubyLLM::Embedding object
88
+ if response.respond_to?(:instance_variable_get)
89
+ vectors = response.instance_variable_get(:@vectors)
90
+ if vectors && vectors.is_a?(Array)
91
+ vectors
92
+ else
93
+ raise EmbeddingError, "No vectors found in RubyLLM response"
94
+ end
95
+ else
96
+ raise EmbeddingError, "Unexpected response type from RubyLLM: #{response.class}"
97
+ end
98
+ end
99
+ end
100
+ rescue StandardError => e
101
+ raise EmbeddingError, "Failed to generate embeddings: #{e.message}"
102
+ end
103
+ end
104
+
105
+ def cosine_similarity(embedding1, embedding2)
106
+ return 0.0 if embedding1.nil? || embedding2.nil?
107
+ return 0.0 if embedding1.length != embedding2.length
108
+
109
+ dot_product = embedding1.zip(embedding2).sum { |a, b| a * b }
110
+ magnitude1 = Math.sqrt(embedding1.sum { |a| a * a })
111
+ magnitude2 = Math.sqrt(embedding2.sum { |a| a * a })
112
+
113
+ return 0.0 if magnitude1 == 0.0 || magnitude2 == 0.0
114
+
115
+ dot_product / (magnitude1 * magnitude2)
116
+ end
117
+
118
+ private
119
+
120
+ def configure_ruby_llm
121
+ # Configure ruby_llm based on Ragdoll configuration
122
+ # FIXME: embedding_provider and llm_provider are not in current config structure
123
+ # FIXME: llm_config is not in current config structure, should use ruby_llm_config directly
124
+ provider = :openai # Default provider
125
+ config = Ragdoll.config.ruby_llm_config[provider] || {}
126
+
127
+ RubyLLM.configure do |ruby_llm_config|
128
+ case provider
129
+ when :openai
130
+ ruby_llm_config.openai_api_key = config[:api_key]
131
+ # Set organization and project if methods exist
132
+ if config[:organization] && ruby_llm_config.respond_to?(:openai_organization=)
133
+ ruby_llm_config.openai_organization = config[:organization]
134
+ end
135
+ if config[:project] && ruby_llm_config.respond_to?(:openai_project=)
136
+ ruby_llm_config.openai_project = config[:project]
137
+ end
138
+ when :anthropic
139
+ ruby_llm_config.anthropic_api_key = config[:api_key] if ruby_llm_config.respond_to?(:anthropic_api_key=)
140
+ when :google
141
+ ruby_llm_config.google_api_key = config[:api_key] if ruby_llm_config.respond_to?(:google_api_key=)
142
+ if config[:project_id] && ruby_llm_config.respond_to?(:google_project_id=)
143
+ ruby_llm_config.google_project_id = config[:project_id]
144
+ end
145
+ when :azure
146
+ ruby_llm_config.azure_api_key = config[:api_key] if ruby_llm_config.respond_to?(:azure_api_key=)
147
+ if config[:endpoint] && ruby_llm_config.respond_to?(:azure_endpoint=)
148
+ ruby_llm_config.azure_endpoint = config[:endpoint]
149
+ end
150
+ if config[:api_version] && ruby_llm_config.respond_to?(:azure_api_version=)
151
+ ruby_llm_config.azure_api_version = config[:api_version]
152
+ end
153
+ when :ollama
154
+ if config[:endpoint] && ruby_llm_config.respond_to?(:ollama_endpoint=)
155
+ ruby_llm_config.ollama_endpoint = config[:endpoint]
156
+ end
157
+ when :huggingface
158
+ ruby_llm_config.huggingface_api_key = config[:api_key] if ruby_llm_config.respond_to?(:huggingface_api_key=)
159
+ when :openrouter
160
+ ruby_llm_config.openrouter_api_key = config[:api_key] if ruby_llm_config.respond_to?(:openrouter_api_key=)
161
+ else
162
+ # Don't raise error for unsupported providers in case RubyLLM doesn't support them yet
163
+ puts "Warning: Unsupported embedding provider: #{provider}"
164
+ end
165
+ end
166
+ end
167
+
168
+ def clean_text(text)
169
+ return "" if text.nil?
170
+
171
+ # Remove excessive whitespace and normalize
172
+ cleaned = text.strip
173
+ .gsub(/\s+/, " ") # Multiple spaces to single space
174
+ .gsub(/\n+/, "\n") # Multiple newlines to single newline
175
+ .gsub(/\t+/, " ") # Tabs to spaces
176
+
177
+ # Truncate if too long (most embedding models have token limits)
178
+ max_chars = 8000 # Conservative limit for most embedding models
179
+ cleaned.length > max_chars ? cleaned[0, max_chars] : cleaned
180
+ end
181
+ end
182
+ end
183
+ end
@@ -0,0 +1,11 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Ragdoll
4
+ module Core
5
+ class Error < StandardError; end
6
+ class EmbeddingError < Error; end
7
+ class SearchError < Error; end
8
+ class DocumentError < Error; end
9
+ class ConfigurationError < Error; end
10
+ end
11
+ end
@@ -0,0 +1,32 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "active_job"
4
+
5
+ module Ragdoll
6
+ module Core
7
+ module Jobs
8
+ class ExtractKeywords < ActiveJob::Base
9
+ queue_as :default
10
+
11
+ def perform(document_id)
12
+ document = Models::Document.find(document_id)
13
+ return unless document.content.present?
14
+ return if document.keywords.present?
15
+
16
+ text_service = TextGenerationService.new
17
+ keywords_array = text_service.extract_keywords(document.content)
18
+
19
+ if keywords_array.present?
20
+ keywords_string = keywords_array.join(", ")
21
+ document.update!(keywords: keywords_string)
22
+ end
23
+ rescue ActiveRecord::RecordNotFound
24
+ # Document was deleted, nothing to do
25
+ rescue StandardError => e
26
+ Rails.logger.error "Failed to generate keywords for document #{document_id}: #{e.message}" if defined?(Rails)
27
+ raise e
28
+ end
29
+ end
30
+ end
31
+ end
32
+ end
@@ -0,0 +1,42 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "active_job"
4
+
5
+ module Ragdoll
6
+ module Core
7
+ module Jobs
8
+ class ExtractText < ActiveJob::Base
9
+ queue_as :default
10
+
11
+ def perform(document_id)
12
+ document = Models::Document.find(document_id)
13
+ return unless document.file_attached?
14
+ return if document.content.present?
15
+
16
+ document.update!(status: "processing")
17
+
18
+ extracted_content = document.extract_text_from_file
19
+
20
+ if extracted_content.present?
21
+ document.update!(
22
+ content: extracted_content,
23
+ status: "processed"
24
+ )
25
+
26
+ # Queue follow-up jobs
27
+ GenerateSummaryJob.perform_later(document_id)
28
+ GenerateKeywordsJob.perform_later(document_id)
29
+ GenerateEmbeddingsJob.perform_later(document_id)
30
+ else
31
+ document.update!(status: "error")
32
+ end
33
+ rescue ActiveRecord::RecordNotFound
34
+ # Document was deleted, nothing to do
35
+ rescue StandardError => e
36
+ document&.update!(status: "error")
37
+ raise e
38
+ end
39
+ end
40
+ end
41
+ end
42
+ end
@@ -0,0 +1,32 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "active_job"
4
+
5
+ module Ragdoll
6
+ module Core
7
+ module Jobs
8
+ class GenerateEmbeddings < ActiveJob::Base
9
+ queue_as :default
10
+
11
+ def perform(document_id, chunk_size: nil, chunk_overlap: nil)
12
+ document = Models::Document.find(document_id)
13
+ return unless document.content.present?
14
+ return if document.all_embeddings.exists?
15
+
16
+ # Process all content records using their own generate_embeddings! methods
17
+ document.contents.each(&:generate_embeddings!)
18
+
19
+ # Update document status to processed
20
+ document.update!(status: "processed")
21
+ rescue ActiveRecord::RecordNotFound
22
+ # Document was deleted, nothing to do
23
+ rescue StandardError => e
24
+ if defined?(Rails)
25
+ Rails.logger.error "Failed to generate embeddings for document #{document_id}: #{e.message}"
26
+ end
27
+ raise e
28
+ end
29
+ end
30
+ end
31
+ end
32
+ end
@@ -0,0 +1,29 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "active_job"
4
+
5
+ module Ragdoll
6
+ module Core
7
+ module Jobs
8
+ class GenerateSummary < ActiveJob::Base
9
+ queue_as :default
10
+
11
+ def perform(document_id)
12
+ document = Models::Document.find(document_id)
13
+ return unless document.content.present?
14
+ return if document.summary.present?
15
+
16
+ text_service = TextGenerationService.new
17
+ summary = text_service.generate_summary(document.content)
18
+
19
+ document.update!(summary: summary) if summary.present?
20
+ rescue ActiveRecord::RecordNotFound
21
+ # Document was deleted, nothing to do
22
+ rescue StandardError => e
23
+ Rails.logger.error "Failed to generate summary for document #{document_id}: #{e.message}" if defined?(Rails)
24
+ raise e
25
+ end
26
+ end
27
+ end
28
+ end
29
+ end
@@ -0,0 +1,334 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Ragdoll
4
+ module Core
5
+ # Document metadata schemas for LLM structured output
6
+ # Each document type has a specific schema that guides LLM generation
7
+ module MetadataSchemas
8
+ # Text document metadata schema
9
+ TEXT_SCHEMA = {
10
+ type: "object",
11
+ properties: {
12
+ summary: {
13
+ type: "string",
14
+ description: "Concise summary of the text content (2-3 sentences)"
15
+ },
16
+ keywords: {
17
+ type: "array",
18
+ items: { type: "string" },
19
+ description: "Relevant keywords and phrases extracted from the text",
20
+ maxItems: 10
21
+ },
22
+ classification: {
23
+ type: "string",
24
+ enum: %w[research article blog documentation technical legal financial marketing other],
25
+ description: "Document classification category"
26
+ },
27
+ topics: {
28
+ type: "array",
29
+ items: { type: "string" },
30
+ description: "Main topics discussed in the document",
31
+ maxItems: 5
32
+ },
33
+ sentiment: {
34
+ type: "string",
35
+ enum: %w[positive negative neutral mixed],
36
+ description: "Overall sentiment of the text"
37
+ },
38
+ reading_time_minutes: {
39
+ type: "integer",
40
+ description: "Estimated reading time in minutes"
41
+ },
42
+ language: {
43
+ type: "string",
44
+ description: "Primary language of the text (ISO 639-1 code)"
45
+ },
46
+ complexity_level: {
47
+ type: "string",
48
+ enum: %w[beginner intermediate advanced expert],
49
+ description: "Complexity/difficulty level of the content"
50
+ },
51
+ tags: {
52
+ type: "array",
53
+ items: { type: "string" },
54
+ description: "User-defined or AI-suggested tags for organization"
55
+ }
56
+ },
57
+ required: %w[summary keywords classification]
58
+ }.freeze
59
+
60
+ # Image document metadata schema
61
+ IMAGE_SCHEMA = {
62
+ type: "object",
63
+ properties: {
64
+ description: {
65
+ type: "string",
66
+ description: "Detailed description of what is shown in the image"
67
+ },
68
+ summary: {
69
+ type: "string",
70
+ description: "Brief summary of the image content (1 sentence)"
71
+ },
72
+ objects: {
73
+ type: "array",
74
+ items: { type: "string" },
75
+ description: "List of objects, people, or items visible in the image",
76
+ maxItems: 15
77
+ },
78
+ scene_type: {
79
+ type: "string",
80
+ enum: %w[indoor outdoor portrait landscape diagram chart screenshot artwork photo other],
81
+ description: "Type of scene or image category"
82
+ },
83
+ colors: {
84
+ type: "array",
85
+ items: { type: "string" },
86
+ description: "Dominant colors in the image",
87
+ maxItems: 5
88
+ },
89
+ style: {
90
+ type: "string",
91
+ enum: %w[photograph illustration diagram chart screenshot artwork technical drawing other],
92
+ description: "Visual style or format of the image"
93
+ },
94
+ mood: {
95
+ type: "string",
96
+ enum: %w[professional casual formal technical artistic dramatic serene energetic other],
97
+ description: "Overall mood or tone of the image"
98
+ },
99
+ text_content: {
100
+ type: "string",
101
+ description: "Any visible text in the image (OCR extracted)"
102
+ },
103
+ keywords: {
104
+ type: "array",
105
+ items: { type: "string" },
106
+ description: "Relevant keywords for image search and categorization",
107
+ maxItems: 10
108
+ },
109
+ classification: {
110
+ type: "string",
111
+ enum: %w[technical diagram photo artwork chart screenshot document other],
112
+ description: "Image classification category"
113
+ },
114
+ tags: {
115
+ type: "array",
116
+ items: { type: "string" },
117
+ description: "User-defined or AI-suggested tags for organization"
118
+ }
119
+ },
120
+ required: %w[description summary scene_type classification]
121
+ }.freeze
122
+
123
+ # Audio document metadata schema
124
+ AUDIO_SCHEMA = {
125
+ type: "object",
126
+ properties: {
127
+ summary: {
128
+ type: "string",
129
+ description: "Summary of audio content (speech transcript summary or music description)"
130
+ },
131
+ content_type: {
132
+ type: "string",
133
+ enum: %w[speech music podcast interview lecture presentation sound_effect other],
134
+ description: "Type of audio content"
135
+ },
136
+ keywords: {
137
+ type: "array",
138
+ items: { type: "string" },
139
+ description: "Relevant keywords extracted from transcript or describing music",
140
+ maxItems: 10
141
+ },
142
+ classification: {
143
+ type: "string",
144
+ enum: %w[educational entertainment business technical musical interview podcast other],
145
+ description: "Audio content classification"
146
+ },
147
+ topics: {
148
+ type: "array",
149
+ items: { type: "string" },
150
+ description: "Main topics discussed (for speech) or musical elements (for music)",
151
+ maxItems: 5
152
+ },
153
+ language: {
154
+ type: "string",
155
+ description: "Language of speech content (ISO 639-1 code) or N/A for music"
156
+ },
157
+ speakers: {
158
+ type: "array",
159
+ items: { type: "string" },
160
+ description: "Number or names of speakers (for speech content)",
161
+ maxItems: 10
162
+ },
163
+ mood: {
164
+ type: "string",
165
+ enum: %w[formal casual energetic calm professional educational entertaining informative other],
166
+ description: "Overall mood or tone of the audio"
167
+ },
168
+ genre: {
169
+ type: "string",
170
+ description: "Music genre (for musical content) or speech type (for spoken content)"
171
+ },
172
+ key_quotes: {
173
+ type: "array",
174
+ items: { type: "string" },
175
+ description: "Important quotes or phrases from speech content",
176
+ maxItems: 3
177
+ },
178
+ tags: {
179
+ type: "array",
180
+ items: { type: "string" },
181
+ description: "User-defined or AI-suggested tags for organization"
182
+ }
183
+ },
184
+ required: %w[summary content_type classification]
185
+ }.freeze
186
+
187
+ # PDF document metadata schema (combines text analysis with document structure)
188
+ PDF_SCHEMA = {
189
+ type: "object",
190
+ properties: {
191
+ summary: {
192
+ type: "string",
193
+ description: "Summary of the PDF document content"
194
+ },
195
+ document_type: {
196
+ type: "string",
197
+ enum: %w[research_paper report manual presentation legal financial technical academic other],
198
+ description: "Type of PDF document"
199
+ },
200
+ keywords: {
201
+ type: "array",
202
+ items: { type: "string" },
203
+ description: "Keywords extracted from the document text",
204
+ maxItems: 15
205
+ },
206
+ classification: {
207
+ type: "string",
208
+ enum: %w[academic business legal technical manual report presentation other],
209
+ description: "Document classification category"
210
+ },
211
+ topics: {
212
+ type: "array",
213
+ items: { type: "string" },
214
+ description: "Main topics covered in the document",
215
+ maxItems: 8
216
+ },
217
+ structure: {
218
+ type: "object",
219
+ properties: {
220
+ has_table_of_contents: { type: "boolean" },
221
+ has_bibliography: { type: "boolean" },
222
+ has_figures: { type: "boolean" },
223
+ has_tables: { type: "boolean" },
224
+ estimated_pages: { type: "integer" }
225
+ }
226
+ },
227
+ reading_time_minutes: {
228
+ type: "integer",
229
+ description: "Estimated reading time in minutes"
230
+ },
231
+ complexity_level: {
232
+ type: "string",
233
+ enum: %w[beginner intermediate advanced expert],
234
+ description: "Complexity level of the content"
235
+ },
236
+ language: {
237
+ type: "string",
238
+ description: "Primary language of the document"
239
+ },
240
+ tags: {
241
+ type: "array",
242
+ items: { type: "string" },
243
+ description: "User-defined or AI-suggested tags for organization"
244
+ }
245
+ },
246
+ required: %w[summary document_type classification]
247
+ }.freeze
248
+
249
+ # Mixed/multi-modal document metadata schema
250
+ MIXED_SCHEMA = {
251
+ type: "object",
252
+ properties: {
253
+ summary: {
254
+ type: "string",
255
+ description: "Overall summary combining all content types in the document"
256
+ },
257
+ content_types: {
258
+ type: "array",
259
+ items: { type: "string", enum: %w[text image audio] },
260
+ description: "Types of content present in this multi-modal document"
261
+ },
262
+ primary_content_type: {
263
+ type: "string",
264
+ enum: %w[text image audio],
265
+ description: "The primary or dominant content type"
266
+ },
267
+ keywords: {
268
+ type: "array",
269
+ items: { type: "string" },
270
+ description: "Keywords extracted from all content types",
271
+ maxItems: 15
272
+ },
273
+ classification: {
274
+ type: "string",
275
+ enum: %w[multimedia_presentation research educational marketing technical training other],
276
+ description: "Multi-modal document classification"
277
+ },
278
+ topics: {
279
+ type: "array",
280
+ items: { type: "string" },
281
+ description: "Main topics across all content types",
282
+ maxItems: 8
283
+ },
284
+ cohesion_analysis: {
285
+ type: "string",
286
+ description: "How well the different content types work together"
287
+ },
288
+ tags: {
289
+ type: "array",
290
+ items: { type: "string" },
291
+ description: "User-defined or AI-suggested tags for organization"
292
+ }
293
+ },
294
+ required: %w[summary content_types primary_content_type classification]
295
+ }.freeze
296
+
297
+ # Get schema for document type
298
+ def self.schema_for(document_type)
299
+ case document_type.to_s.downcase
300
+ when "text", "markdown", "html"
301
+ TEXT_SCHEMA
302
+ when "image"
303
+ IMAGE_SCHEMA
304
+ when "audio"
305
+ AUDIO_SCHEMA
306
+ when "pdf", "docx"
307
+ PDF_SCHEMA
308
+ when "mixed"
309
+ MIXED_SCHEMA
310
+ else
311
+ TEXT_SCHEMA # fallback
312
+ end
313
+ end
314
+
315
+ # Get required fields for document type
316
+ def self.required_fields_for(document_type)
317
+ schema_for(document_type)[:required] || []
318
+ end
319
+
320
+ # Validate metadata against schema
321
+ def self.validate_metadata(document_type, metadata)
322
+ schema = schema_for(document_type)
323
+ required_fields = schema[:required] || []
324
+
325
+ errors = []
326
+ required_fields.each do |field|
327
+ errors << "Missing required field: #{field}" unless metadata.key?(field)
328
+ end
329
+
330
+ errors
331
+ end
332
+ end
333
+ end
334
+ end