ragdoll 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +353 -0
- data/Rakefile +21 -0
- data/db/migrate/001_enable_postgresql_extensions.rb +23 -0
- data/db/migrate/004_create_ragdoll_documents.rb +70 -0
- data/db/migrate/005_create_ragdoll_embeddings.rb +41 -0
- data/db/migrate/006_create_ragdoll_contents.rb +47 -0
- data/lib/ragdoll/core/client.rb +315 -0
- data/lib/ragdoll/core/configuration.rb +273 -0
- data/lib/ragdoll/core/database.rb +141 -0
- data/lib/ragdoll/core/document_management.rb +110 -0
- data/lib/ragdoll/core/document_processor.rb +344 -0
- data/lib/ragdoll/core/embedding_service.rb +183 -0
- data/lib/ragdoll/core/errors.rb +11 -0
- data/lib/ragdoll/core/jobs/extract_keywords.rb +32 -0
- data/lib/ragdoll/core/jobs/extract_text.rb +42 -0
- data/lib/ragdoll/core/jobs/generate_embeddings.rb +32 -0
- data/lib/ragdoll/core/jobs/generate_summary.rb +29 -0
- data/lib/ragdoll/core/metadata_schemas.rb +334 -0
- data/lib/ragdoll/core/models/audio_content.rb +175 -0
- data/lib/ragdoll/core/models/content.rb +126 -0
- data/lib/ragdoll/core/models/document.rb +678 -0
- data/lib/ragdoll/core/models/embedding.rb +204 -0
- data/lib/ragdoll/core/models/image_content.rb +227 -0
- data/lib/ragdoll/core/models/text_content.rb +169 -0
- data/lib/ragdoll/core/search_engine.rb +50 -0
- data/lib/ragdoll/core/services/image_description_service.rb +230 -0
- data/lib/ragdoll/core/services/metadata_generator.rb +335 -0
- data/lib/ragdoll/core/shrine_config.rb +71 -0
- data/lib/ragdoll/core/text_chunker.rb +210 -0
- data/lib/ragdoll/core/text_generation_service.rb +360 -0
- data/lib/ragdoll/core/version.rb +8 -0
- data/lib/ragdoll/core.rb +73 -0
- data/lib/ragdoll-core.rb +3 -0
- data/lib/ragdoll.rb +249 -0
- data/lib/tasks/annotate.rake +126 -0
- data/lib/tasks/db.rake +338 -0
- metadata +80 -0
@@ -0,0 +1,183 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "ruby_llm"
|
4
|
+
|
5
|
+
module Ragdoll
  module Core
    # Generates vector embeddings for text, either through an injected client
    # (used in tests) or through RubyLLM (production path). Also provides a
    # plain cosine-similarity helper for comparing two embedding vectors.
    class EmbeddingService
      # Conservative character cap applied before embedding; most embedding
      # models enforce token limits comfortably above this.
      MAX_EMBEDDING_CHARS = 8000

      # @param client [#embed, nil] optional embedding client (primarily for
      #   testing). When nil, RubyLLM is configured from Ragdoll.config and
      #   used for all embedding calls.
      def initialize(client: nil)
        @client = client
        configure_ruby_llm unless @client
      end

      # Generates an embedding vector for a single text.
      #
      # @param text [String, nil] input text
      # @return [Array<Float>, nil] embedding vector, or nil for nil/blank input
      # @raise [EmbeddingError] when the provider response is malformed or the
      #   underlying call fails
      def generate_embedding(text)
        return nil if text.nil? || text.strip.empty?

        # Clean and prepare text
        cleaned_text = clean_text(text)

        if @client
          # Use custom client for testing
          # FIXME: embedding_model is not in current config structure
          response = @client.embed(
            input: cleaned_text,
            model: Ragdoll.config.models[:embedding][:text]
          )
          first_embedding_from(response)
        else
          embed_with_ruby_llm(cleaned_text)
        end
      rescue EmbeddingError
        # BUGFIX: previously EmbeddingErrors raised above were caught by the
        # StandardError clause and re-wrapped, double-prefixing the message.
        raise
      rescue StandardError => e
        raise EmbeddingError, "Failed to generate embedding: #{e.message}"
      end

      # Generates embedding vectors for a list of texts. Blank entries are
      # dropped before embedding, so the result may be shorter than the input.
      #
      # @param texts [Array<String>] input texts
      # @return [Array<Array<Float>>] one embedding per non-blank input
      # @raise [EmbeddingError] when the provider response is malformed or the
      #   underlying call fails
      def generate_embeddings_batch(texts)
        return [] if texts.empty?

        # Clean all texts
        cleaned_texts = texts.map { |text| clean_text(text) }.reject { |t| t.nil? || t.strip.empty? }
        return [] if cleaned_texts.empty?

        if @client
          # Use custom client for testing
          # FIXME: embedding_model is not in current config structure
          response = @client.embed(
            input: cleaned_texts,
            model: Ragdoll.config.models[:embedding][:text]
          )
          all_embeddings_from(response)
        else
          # RubyLLM path issues one request per text — there is no true batch
          # API in use here, so large batches mean N sequential network calls.
          cleaned_texts.map { |text| embed_with_ruby_llm(text) }
        end
      rescue EmbeddingError
        raise
      rescue StandardError => e
        raise EmbeddingError, "Failed to generate embeddings: #{e.message}"
      end

      # Computes the cosine similarity between two vectors.
      #
      # @param embedding1 [Array<Numeric>, nil]
      # @param embedding2 [Array<Numeric>, nil]
      # @return [Float] similarity in [-1.0, 1.0]; 0.0 for nil, length-mismatched,
      #   or zero-magnitude inputs
      def cosine_similarity(embedding1, embedding2)
        return 0.0 if embedding1.nil? || embedding2.nil?
        return 0.0 if embedding1.length != embedding2.length

        dot_product = embedding1.zip(embedding2).sum { |a, b| a * b }
        magnitude1 = Math.sqrt(embedding1.sum { |a| a * a })
        magnitude2 = Math.sqrt(embedding2.sum { |a| a * a })

        return 0.0 if magnitude1 == 0.0 || magnitude2 == 0.0

        dot_product / (magnitude1 * magnitude2)
      end

      private

      # Parses a single-embedding response from the injected client.
      # Accepts {"embeddings" => [[...]]} or the OpenAI-style
      # {"data" => [{"embedding" => [...]}]} shape.
      def first_embedding_from(response)
        if response && response["embeddings"]&.first
          response["embeddings"].first
        elsif response && response["data"]&.first && response["data"].first["embedding"]
          response["data"].first["embedding"]
        else
          raise EmbeddingError, "Invalid response format from embedding API"
        end
      end

      # Parses a batch response from the injected client (same two shapes as
      # first_embedding_from, but returning every vector).
      def all_embeddings_from(response)
        if response && response["embeddings"]
          response["embeddings"]
        elsif response && response["data"]
          response["data"].map { |item| item["embedding"] }
        else
          raise EmbeddingError, "Invalid response format from embedding API"
        end
      end

      # Calls RubyLLM and extracts the vector array from the returned
      # RubyLLM::Embedding object.
      #
      # NOTE(review): the original guarded on respond_to?(:instance_variable_get),
      # which every Ruby object satisfies, so its failure branch was dead code.
      # We prefer the public +vectors+ reader when present and fall back to the
      # @vectors ivar for older RubyLLM versions.
      def embed_with_ruby_llm(text)
        model = Ragdoll.config.models[:embedding][:text] || "text-embedding-3-small"
        response = RubyLLM.embed(text, model: model)

        vectors =
          if response.respond_to?(:vectors)
            response.vectors
          else
            response.instance_variable_get(:@vectors)
          end

        raise EmbeddingError, "No vectors found in RubyLLM response" unless vectors.is_a?(Array)

        vectors
      end

      # Configure ruby_llm based on Ragdoll configuration. Only called when no
      # custom client was injected.
      # FIXME: embedding_provider and llm_provider are not in current config structure
      # FIXME: llm_config is not in current config structure, should use ruby_llm_config directly
      def configure_ruby_llm
        provider = :openai # Default provider
        config = Ragdoll.config.ruby_llm_config[provider] || {}

        RubyLLM.configure do |ruby_llm_config|
          case provider
          when :openai
            ruby_llm_config.openai_api_key = config[:api_key]
            # Set organization and project if methods exist
            if config[:organization] && ruby_llm_config.respond_to?(:openai_organization=)
              ruby_llm_config.openai_organization = config[:organization]
            end
            if config[:project] && ruby_llm_config.respond_to?(:openai_project=)
              ruby_llm_config.openai_project = config[:project]
            end
          when :anthropic
            ruby_llm_config.anthropic_api_key = config[:api_key] if ruby_llm_config.respond_to?(:anthropic_api_key=)
          when :google
            ruby_llm_config.google_api_key = config[:api_key] if ruby_llm_config.respond_to?(:google_api_key=)
            if config[:project_id] && ruby_llm_config.respond_to?(:google_project_id=)
              ruby_llm_config.google_project_id = config[:project_id]
            end
          when :azure
            ruby_llm_config.azure_api_key = config[:api_key] if ruby_llm_config.respond_to?(:azure_api_key=)
            if config[:endpoint] && ruby_llm_config.respond_to?(:azure_endpoint=)
              ruby_llm_config.azure_endpoint = config[:endpoint]
            end
            if config[:api_version] && ruby_llm_config.respond_to?(:azure_api_version=)
              ruby_llm_config.azure_api_version = config[:api_version]
            end
          when :ollama
            if config[:endpoint] && ruby_llm_config.respond_to?(:ollama_endpoint=)
              ruby_llm_config.ollama_endpoint = config[:endpoint]
            end
          when :huggingface
            ruby_llm_config.huggingface_api_key = config[:api_key] if ruby_llm_config.respond_to?(:huggingface_api_key=)
          when :openrouter
            ruby_llm_config.openrouter_api_key = config[:api_key] if ruby_llm_config.respond_to?(:openrouter_api_key=)
          else
            # Don't raise error for unsupported providers in case RubyLLM doesn't support them yet
            puts "Warning: Unsupported embedding provider: #{provider}"
          end
        end
      end

      # Normalizes whitespace and truncates text to MAX_EMBEDDING_CHARS so it
      # stays inside typical embedding-model token limits.
      def clean_text(text)
        return "" if text.nil?

        # Remove excessive whitespace and normalize
        cleaned = text.strip
                      .gsub(/\s+/, " ") # Multiple spaces to single space
                      .gsub(/\n+/, "\n") # Multiple newlines to single newline
                      .gsub(/\t+/, " ") # Tabs to spaces

        # Truncate if too long (most embedding models have token limits)
        cleaned.length > MAX_EMBEDDING_CHARS ? cleaned[0, MAX_EMBEDDING_CHARS] : cleaned
      end
    end
  end
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "active_job"
|
4
|
+
|
5
|
+
module Ragdoll
  module Core
    module Jobs
      # Background job that asks the TextGenerationService for keywords and
      # stores them on the document as a comma-separated string. Skips
      # documents without content or that already have keywords.
      class ExtractKeywords < ActiveJob::Base
        queue_as :default

        # @param document_id [Integer] id of the Models::Document to annotate
        def perform(document_id)
          doc = Models::Document.find(document_id)
          return if doc.content.blank? || doc.keywords.present?

          extracted = TextGenerationService.new.extract_keywords(doc.content)
          doc.update!(keywords: extracted.join(", ")) if extracted.present?
        rescue ActiveRecord::RecordNotFound
          # Document vanished before the job ran — nothing to do.
        rescue StandardError => e
          Rails.logger.error "Failed to generate keywords for document #{document_id}: #{e.message}" if defined?(Rails)
          raise e
        end
      end
    end
  end
end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "active_job"
|
4
|
+
|
5
|
+
module Ragdoll
  module Core
    module Jobs
      # Background job that extracts text from a document's attached file,
      # stores it on the document, and enqueues the follow-up jobs
      # (summary, keywords, embeddings).
      class ExtractText < ActiveJob::Base
        queue_as :default

        # @param document_id [Integer] id of the Models::Document to process
        def perform(document_id)
          document = Models::Document.find(document_id)
          return unless document.file_attached?
          return if document.content.present?

          document.update!(status: "processing")

          extracted_content = document.extract_text_from_file

          if extracted_content.present?
            document.update!(
              content: extracted_content,
              status: "processed"
            )

            # Queue follow-up jobs.
            # BUGFIX: the sibling job classes in this gem are GenerateSummary,
            # ExtractKeywords and GenerateEmbeddings; the previously referenced
            # *Job-suffixed constants are not defined anywhere in the package
            # and raised NameError at enqueue time.
            GenerateSummary.perform_later(document_id)
            ExtractKeywords.perform_later(document_id)
            GenerateEmbeddings.perform_later(document_id)
          else
            document.update!(status: "error")
          end
        rescue ActiveRecord::RecordNotFound
          # Document was deleted, nothing to do
        rescue StandardError => e
          # Best effort: flag the document as errored before re-raising so the
          # failure is visible on the record as well as in the job backend.
          document&.update!(status: "error")
          raise e
        end
      end
    end
  end
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "active_job"
|
4
|
+
|
5
|
+
module Ragdoll
  module Core
    module Jobs
      # Background job that generates embeddings for every content record
      # belonging to a document, then marks the document processed. Skips
      # documents without content or that already have embeddings.
      class GenerateEmbeddings < ActiveJob::Base
        queue_as :default

        # @param document_id [Integer]
        # @param chunk_size [Integer, nil] accepted for interface compatibility;
        #   chunking is delegated to each content record's generate_embeddings!
        # @param chunk_overlap [Integer, nil] see chunk_size
        def perform(document_id, chunk_size: nil, chunk_overlap: nil)
          doc = Models::Document.find(document_id)
          return if doc.content.blank?
          return if doc.all_embeddings.exists?

          # Each content record knows how to embed itself.
          doc.contents.each(&:generate_embeddings!)

          doc.update!(status: "processed")
        rescue ActiveRecord::RecordNotFound
          # Document vanished before the job ran — nothing to do.
        rescue StandardError => e
          if defined?(Rails)
            Rails.logger.error "Failed to generate embeddings for document #{document_id}: #{e.message}"
          end
          raise e
        end
      end
    end
  end
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "active_job"
|
4
|
+
|
5
|
+
module Ragdoll
  module Core
    module Jobs
      # Background job that produces a document summary via the
      # TextGenerationService and persists it on the document. Skips
      # documents without content or that are already summarized.
      class GenerateSummary < ActiveJob::Base
        queue_as :default

        # @param document_id [Integer] id of the Models::Document to summarize
        def perform(document_id)
          doc = Models::Document.find(document_id)
          return if doc.content.blank? || doc.summary.present?

          generated = TextGenerationService.new.generate_summary(doc.content)
          doc.update!(summary: generated) if generated.present?
        rescue ActiveRecord::RecordNotFound
          # Document vanished before the job ran — nothing to do.
        rescue StandardError => e
          Rails.logger.error "Failed to generate summary for document #{document_id}: #{e.message}" if defined?(Rails)
          raise e
        end
      end
    end
  end
end
|
@@ -0,0 +1,334 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Ragdoll
  module Core
    # Document metadata schemas for LLM structured output.
    # Each document type has a specific JSON-Schema-style hash that guides
    # LLM generation; schema_for maps a document type to its schema, and
    # validate_metadata checks required-field presence.
    module MetadataSchemas
      # Text document metadata schema
      TEXT_SCHEMA = {
        type: "object",
        properties: {
          summary: {
            type: "string",
            description: "Concise summary of the text content (2-3 sentences)"
          },
          keywords: {
            type: "array",
            items: { type: "string" },
            description: "Relevant keywords and phrases extracted from the text",
            maxItems: 10
          },
          classification: {
            type: "string",
            enum: %w[research article blog documentation technical legal financial marketing other],
            description: "Document classification category"
          },
          topics: {
            type: "array",
            items: { type: "string" },
            description: "Main topics discussed in the document",
            maxItems: 5
          },
          sentiment: {
            type: "string",
            enum: %w[positive negative neutral mixed],
            description: "Overall sentiment of the text"
          },
          reading_time_minutes: {
            type: "integer",
            description: "Estimated reading time in minutes"
          },
          language: {
            type: "string",
            description: "Primary language of the text (ISO 639-1 code)"
          },
          complexity_level: {
            type: "string",
            enum: %w[beginner intermediate advanced expert],
            description: "Complexity/difficulty level of the content"
          },
          tags: {
            type: "array",
            items: { type: "string" },
            description: "User-defined or AI-suggested tags for organization"
          }
        },
        required: %w[summary keywords classification]
      }.freeze

      # Image document metadata schema
      IMAGE_SCHEMA = {
        type: "object",
        properties: {
          description: {
            type: "string",
            description: "Detailed description of what is shown in the image"
          },
          summary: {
            type: "string",
            description: "Brief summary of the image content (1 sentence)"
          },
          objects: {
            type: "array",
            items: { type: "string" },
            description: "List of objects, people, or items visible in the image",
            maxItems: 15
          },
          scene_type: {
            type: "string",
            enum: %w[indoor outdoor portrait landscape diagram chart screenshot artwork photo other],
            description: "Type of scene or image category"
          },
          colors: {
            type: "array",
            items: { type: "string" },
            description: "Dominant colors in the image",
            maxItems: 5
          },
          style: {
            type: "string",
            # NOTE(review): %w splits on whitespace, so "technical drawing"
            # yields two separate enum entries here — preserved as-is.
            enum: %w[photograph illustration diagram chart screenshot artwork technical drawing other],
            description: "Visual style or format of the image"
          },
          mood: {
            type: "string",
            enum: %w[professional casual formal technical artistic dramatic serene energetic other],
            description: "Overall mood or tone of the image"
          },
          text_content: {
            type: "string",
            description: "Any visible text in the image (OCR extracted)"
          },
          keywords: {
            type: "array",
            items: { type: "string" },
            description: "Relevant keywords for image search and categorization",
            maxItems: 10
          },
          classification: {
            type: "string",
            enum: %w[technical diagram photo artwork chart screenshot document other],
            description: "Image classification category"
          },
          tags: {
            type: "array",
            items: { type: "string" },
            description: "User-defined or AI-suggested tags for organization"
          }
        },
        required: %w[description summary scene_type classification]
      }.freeze

      # Audio document metadata schema
      AUDIO_SCHEMA = {
        type: "object",
        properties: {
          summary: {
            type: "string",
            description: "Summary of audio content (speech transcript summary or music description)"
          },
          content_type: {
            type: "string",
            enum: %w[speech music podcast interview lecture presentation sound_effect other],
            description: "Type of audio content"
          },
          keywords: {
            type: "array",
            items: { type: "string" },
            description: "Relevant keywords extracted from transcript or describing music",
            maxItems: 10
          },
          classification: {
            type: "string",
            enum: %w[educational entertainment business technical musical interview podcast other],
            description: "Audio content classification"
          },
          topics: {
            type: "array",
            items: { type: "string" },
            description: "Main topics discussed (for speech) or musical elements (for music)",
            maxItems: 5
          },
          language: {
            type: "string",
            description: "Language of speech content (ISO 639-1 code) or N/A for music"
          },
          speakers: {
            type: "array",
            items: { type: "string" },
            description: "Number or names of speakers (for speech content)",
            maxItems: 10
          },
          mood: {
            type: "string",
            enum: %w[formal casual energetic calm professional educational entertaining informative other],
            description: "Overall mood or tone of the audio"
          },
          genre: {
            type: "string",
            description: "Music genre (for musical content) or speech type (for spoken content)"
          },
          key_quotes: {
            type: "array",
            items: { type: "string" },
            description: "Important quotes or phrases from speech content",
            maxItems: 3
          },
          tags: {
            type: "array",
            items: { type: "string" },
            description: "User-defined or AI-suggested tags for organization"
          }
        },
        required: %w[summary content_type classification]
      }.freeze

      # PDF document metadata schema (combines text analysis with document structure)
      PDF_SCHEMA = {
        type: "object",
        properties: {
          summary: {
            type: "string",
            description: "Summary of the PDF document content"
          },
          document_type: {
            type: "string",
            enum: %w[research_paper report manual presentation legal financial technical academic other],
            description: "Type of PDF document"
          },
          keywords: {
            type: "array",
            items: { type: "string" },
            description: "Keywords extracted from the document text",
            maxItems: 15
          },
          classification: {
            type: "string",
            enum: %w[academic business legal technical manual report presentation other],
            description: "Document classification category"
          },
          topics: {
            type: "array",
            items: { type: "string" },
            description: "Main topics covered in the document",
            maxItems: 8
          },
          structure: {
            type: "object",
            properties: {
              has_table_of_contents: { type: "boolean" },
              has_bibliography: { type: "boolean" },
              has_figures: { type: "boolean" },
              has_tables: { type: "boolean" },
              estimated_pages: { type: "integer" }
            }
          },
          reading_time_minutes: {
            type: "integer",
            description: "Estimated reading time in minutes"
          },
          complexity_level: {
            type: "string",
            enum: %w[beginner intermediate advanced expert],
            description: "Complexity level of the content"
          },
          language: {
            type: "string",
            description: "Primary language of the document"
          },
          tags: {
            type: "array",
            items: { type: "string" },
            description: "User-defined or AI-suggested tags for organization"
          }
        },
        required: %w[summary document_type classification]
      }.freeze

      # Mixed/multi-modal document metadata schema
      MIXED_SCHEMA = {
        type: "object",
        properties: {
          summary: {
            type: "string",
            description: "Overall summary combining all content types in the document"
          },
          content_types: {
            type: "array",
            items: { type: "string", enum: %w[text image audio] },
            description: "Types of content present in this multi-modal document"
          },
          primary_content_type: {
            type: "string",
            enum: %w[text image audio],
            description: "The primary or dominant content type"
          },
          keywords: {
            type: "array",
            items: { type: "string" },
            description: "Keywords extracted from all content types",
            maxItems: 15
          },
          classification: {
            type: "string",
            enum: %w[multimedia_presentation research educational marketing technical training other],
            description: "Multi-modal document classification"
          },
          topics: {
            type: "array",
            items: { type: "string" },
            description: "Main topics across all content types",
            maxItems: 8
          },
          cohesion_analysis: {
            type: "string",
            description: "How well the different content types work together"
          },
          tags: {
            type: "array",
            items: { type: "string" },
            description: "User-defined or AI-suggested tags for organization"
          }
        },
        required: %w[summary content_types primary_content_type classification]
      }.freeze

      # Returns the schema for a document type (case-insensitive; accepts
      # strings or symbols). Unknown types fall back to TEXT_SCHEMA.
      #
      # @param document_type [String, Symbol, nil]
      # @return [Hash] frozen schema hash
      def self.schema_for(document_type)
        case document_type.to_s.downcase
        when "text", "markdown", "html"
          TEXT_SCHEMA
        when "image"
          IMAGE_SCHEMA
        when "audio"
          AUDIO_SCHEMA
        when "pdf", "docx"
          PDF_SCHEMA
        when "mixed"
          MIXED_SCHEMA
        else
          TEXT_SCHEMA # fallback
        end
      end

      # Returns the required field names (strings) for a document type.
      #
      # @return [Array<String>]
      def self.required_fields_for(document_type)
        schema_for(document_type)[:required] || []
      end

      # Validates metadata against the schema for the given document type.
      # Only presence of required fields is checked (no type validation).
      #
      # BUGFIX: required fields are declared as strings, but metadata hashes
      # built internally often use symbol keys; we now accept either key
      # style instead of falsely reporting symbol-keyed fields as missing.
      #
      # @param document_type [String, Symbol]
      # @param metadata [Hash] metadata hash (string or symbol keys)
      # @return [Array<String>] human-readable error messages (empty if valid)
      def self.validate_metadata(document_type, metadata)
        required_fields_for(document_type).each_with_object([]) do |field, errors|
          next if metadata.key?(field) || metadata.key?(field.to_sym)

          errors << "Missing required field: #{field}"
        end
      end
    end
  end
end
|