ragdoll 0.1.1 → 0.1.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. checksums.yaml +4 -4
  2. data/README.md +1 -1
  3. data/Rakefile +52 -1
  4. data/app/jobs/ragdoll/extract_keywords_job.rb +28 -0
  5. data/app/jobs/ragdoll/extract_text_job.rb +38 -0
  6. data/app/jobs/ragdoll/generate_embeddings_job.rb +28 -0
  7. data/app/jobs/ragdoll/generate_summary_job.rb +25 -0
  8. data/app/lib/ragdoll/metadata_schemas.rb +332 -0
  9. data/app/models/ragdoll/audio_content.rb +142 -0
  10. data/app/models/ragdoll/content.rb +95 -0
  11. data/app/models/ragdoll/document.rb +611 -0
  12. data/app/models/ragdoll/embedding.rb +176 -0
  13. data/app/models/ragdoll/image_content.rb +194 -0
  14. data/app/models/ragdoll/text_content.rb +137 -0
  15. data/app/services/ragdoll/configuration_service.rb +113 -0
  16. data/app/services/ragdoll/document_management.rb +108 -0
  17. data/app/services/ragdoll/document_processor.rb +342 -0
  18. data/app/services/ragdoll/embedding_service.rb +202 -0
  19. data/app/services/ragdoll/image_description_service.rb +230 -0
  20. data/app/services/ragdoll/metadata_generator.rb +329 -0
  21. data/app/services/ragdoll/model_resolver.rb +72 -0
  22. data/app/services/ragdoll/search_engine.rb +51 -0
  23. data/app/services/ragdoll/text_chunker.rb +208 -0
  24. data/app/services/ragdoll/text_generation_service.rb +355 -0
  25. data/lib/ragdoll/core/client.rb +32 -41
  26. data/lib/ragdoll/core/configuration.rb +140 -156
  27. data/lib/ragdoll/core/database.rb +1 -1
  28. data/lib/ragdoll/core/model.rb +45 -0
  29. data/lib/ragdoll/core/version.rb +1 -1
  30. data/lib/ragdoll/core.rb +35 -17
  31. data/lib/ragdoll.rb +1 -1
  32. data/lib/tasks/annotate.rake +1 -1
  33. data/lib/tasks/db.rake +2 -2
  34. metadata +24 -20
  35. data/lib/ragdoll/core/document_management.rb +0 -110
  36. data/lib/ragdoll/core/document_processor.rb +0 -344
  37. data/lib/ragdoll/core/embedding_service.rb +0 -183
  38. data/lib/ragdoll/core/jobs/extract_keywords.rb +0 -32
  39. data/lib/ragdoll/core/jobs/extract_text.rb +0 -42
  40. data/lib/ragdoll/core/jobs/generate_embeddings.rb +0 -32
  41. data/lib/ragdoll/core/jobs/generate_summary.rb +0 -29
  42. data/lib/ragdoll/core/metadata_schemas.rb +0 -334
  43. data/lib/ragdoll/core/models/audio_content.rb +0 -175
  44. data/lib/ragdoll/core/models/content.rb +0 -126
  45. data/lib/ragdoll/core/models/document.rb +0 -678
  46. data/lib/ragdoll/core/models/embedding.rb +0 -204
  47. data/lib/ragdoll/core/models/image_content.rb +0 -227
  48. data/lib/ragdoll/core/models/text_content.rb +0 -169
  49. data/lib/ragdoll/core/search_engine.rb +0 -50
  50. data/lib/ragdoll/core/services/image_description_service.rb +0 -230
  51. data/lib/ragdoll/core/services/metadata_generator.rb +0 -335
  52. data/lib/ragdoll/core/text_chunker.rb +0 -210
  53. data/lib/ragdoll/core/text_generation_service.rb +0 -360
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: '00886bb2bc00baa882b99ce87ff32219cbd8b99c9b7dece3d87d90968a440303'
4
- data.tar.gz: 73449d8f094966cd6cd46c7aed59bedbe0ca1fb12d12fcc4d23e49d84c031158
3
+ metadata.gz: 7fb2f70ebe6d95bfcfca1ba44e84f140f1d75d17e27ead66ce9b7643f3571688
4
+ data.tar.gz: 61e3ccb7dc45bb6196e70770d4eaed9cae17602a9b442e5f525752c1e4a53445
5
5
  SHA512:
6
- metadata.gz: 4d29e99949b98558e6a554a96f3b05ca26dd50ca448ad34a05203c5072db7d50e9525e7103a6abe61cd1433e676d8aabd578c8f59b9a642d12b0e53b40ba3300
7
- data.tar.gz: b5a6f9decda306d972122716f7c2baca82e9c0530eaa9cfe95c9eea3d82a37f773d19176e5887dfaab135aa98602380f770d994cd304766edd6ec23dbf570db5
6
+ metadata.gz: 318e00ff0df2e4b075b9379ffc4a13de4700c4fa6c2c544be8678b700e4810d7cc80479eed3f709e6f25891a394741a8dccfc8e1fed6017d31607946c9267549
7
+ data.tar.gz: a8261e8a3f2740599564f4dd3b2c31914903339035664c01bfdea4800227858f071d25675ffd17c419b59d47baf8c0eb91313600355ac86bfc8d21eaf5e34add
data/README.md CHANGED
@@ -8,7 +8,7 @@
8
8
  <tr>
9
9
  <td width="50%">
10
10
  <a href="https://research.ibm.com/blog/retrieval-augmented-generation-RAG" target="_blank">
11
- <img src="rag_doll.png" alt="Ragdoll" width="800">
11
+ <img src="ragdoll.png" alt="Ragdoll" width="800">
12
12
  </a>
13
13
  </td>
14
14
  <td width="50%" valign="top">
data/Rakefile CHANGED
@@ -1,6 +1,6 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require 'simplecov'
3
+ require "simplecov"
4
4
  SimpleCov.start
5
5
 
6
6
  # Suppress bundler/rubygems warnings
@@ -9,12 +9,63 @@ $VERBOSE = nil
9
9
  require "bundler/gem_tasks"
10
10
  require "rake/testtask"
11
11
 
12
+ def ci_environment?
13
+ ENV["CI"] == "true" || ENV["RAGDOLL_SKIP_DATABASE_TESTS"] == "true"
14
+ end
15
+
16
+ desc "Setup test database"
17
+ task :setup_test_db do
18
+ require_relative "lib/ragdoll-core"
19
+
20
+ # Database configuration for tests
21
+ test_db_config = {
22
+ adapter: "postgresql",
23
+ database: "ragdoll_test",
24
+ username: ENV.fetch("RAGDOLL_POSTGRES_USER", "postgres"),
25
+ password: ENV.fetch("RAGDOLL_POSTGRES_PASSWORD", ""),
26
+ host: ENV.fetch("RAGDOLL_POSTGRES_HOST", "localhost"),
27
+ port: ENV.fetch("RAGDOLL_POSTGRES_PORT", 5432)
28
+ }
29
+
30
+ # Ensure database exists
31
+ begin
32
+ # Try to connect to the database
33
+ ActiveRecord::Base.establish_connection(test_db_config)
34
+ ActiveRecord::Base.connection.execute("SELECT 1")
35
+ rescue ActiveRecord::NoDatabaseError
36
+ # Database doesn't exist, create it
37
+ puts "Creating ragdoll_test database..."
38
+ admin_config = test_db_config.merge(database: "postgres")
39
+ ActiveRecord::Base.establish_connection(admin_config)
40
+ ActiveRecord::Base.connection.execute("CREATE DATABASE ragdoll_test")
41
+ ActiveRecord::Base.establish_connection(test_db_config)
42
+ rescue PG::ConnectionBad => e
43
+ puts "Error connecting to PostgreSQL: #{e.message}"
44
+ puts "Please ensure PostgreSQL is running and accessible"
45
+ exit 1
46
+ end
47
+
48
+ # Ensure pgvector extension is installed
49
+ begin
50
+ ActiveRecord::Base.connection.execute("CREATE EXTENSION IF NOT EXISTS vector")
51
+ rescue StandardError => e
52
+ puts "Warning: Could not install pgvector extension: #{e.message}"
53
+ end
54
+
55
+ # Run migrations
56
+ Ragdoll::Core::Database.setup(test_db_config.merge(auto_migrate: true, logger: nil))
57
+ puts "Test database setup complete"
58
+ end
59
+
12
60
  Rake::TestTask.new(:test) do |t|
13
61
  t.libs << "test"
14
62
  t.libs << "lib"
15
63
  t.test_files = FileList["test/**/*_test.rb"]
16
64
  end
17
65
 
66
+ # Make test task depend on database setup only if not skipping database tests
67
+ task test: :setup_test_db unless ci_environment?
68
+
18
69
  # Load annotate tasks
19
70
  Dir.glob("lib/tasks/*.rake").each { |r| load r }
20
71
 
data/app/jobs/ragdoll/extract_keywords_job.rb ADDED
@@ -0,0 +1,28 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "active_job"
4
+
5
+ module Ragdoll
6
+ class ExtractKeywordsJob < ActiveJob::Base
7
+ queue_as :default
8
+
9
+ def perform(document_id)
10
+ document = Ragdoll::Document.find(document_id)
11
+ return unless document.content.present?
12
+ return if document.keywords.present?
13
+
14
+ text_service = Ragdoll::TextGenerationService.new
15
+ keywords_array = text_service.extract_keywords(document.content)
16
+
17
+ if keywords_array.present?
18
+ keywords_string = keywords_array.join(", ")
19
+ document.update!(keywords: keywords_string)
20
+ end
21
+ rescue ActiveRecord::RecordNotFound
22
+ # Document was deleted, nothing to do
23
+ rescue StandardError => e
24
+ Rails.logger.error "Failed to generate keywords for document #{document_id}: #{e.message}" if defined?(Rails)
25
+ raise e
26
+ end
27
+ end
28
+ end
data/app/jobs/ragdoll/extract_text_job.rb ADDED
@@ -0,0 +1,38 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "active_job"
4
+
5
+ module Ragdoll
6
+ class ExtractTextJob < ActiveJob::Base
7
+ queue_as :default
8
+
9
+ def perform(document_id)
10
+ document = Ragdoll::Document.find(document_id)
11
+ return unless document.file_attached?
12
+ return if document.content.present?
13
+
14
+ document.update!(status: "processing")
15
+
16
+ extracted_content = document.extract_text_from_file
17
+
18
+ if extracted_content.present?
19
+ document.update!(
20
+ content: extracted_content,
21
+ status: "processed"
22
+ )
23
+
24
+ # Queue follow-up jobs
25
+ Ragdoll::GenerateSummaryJob.perform_later(document_id)
26
+ Ragdoll::ExtractKeywordsJob.perform_later(document_id)
27
+ Ragdoll::GenerateEmbeddingsJob.perform_later(document_id)
28
+ else
29
+ document.update!(status: "error")
30
+ end
31
+ rescue ActiveRecord::RecordNotFound
32
+ # Document was deleted, nothing to do
33
+ rescue StandardError => e
34
+ document&.update!(status: "error")
35
+ raise e
36
+ end
37
+ end
38
+ end
data/app/jobs/ragdoll/generate_embeddings_job.rb ADDED
@@ -0,0 +1,28 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "active_job"
4
+
5
+ module Ragdoll
6
+ class GenerateEmbeddingsJob < ActiveJob::Base
7
+ queue_as :default
8
+
9
+ def perform(document_id, chunk_size: nil, chunk_overlap: nil)
10
+ document = Ragdoll::Document.find(document_id)
11
+ return unless document.content.present?
12
+ return if document.all_embeddings.exists?
13
+
14
+ # Process all content records using their own generate_embeddings! methods
15
+ document.contents.each(&:generate_embeddings!)
16
+
17
+ # Update document status to processed
18
+ document.update!(status: "processed")
19
+ rescue ActiveRecord::RecordNotFound
20
+ # Document was deleted, nothing to do
21
+ rescue StandardError => e
22
+ if defined?(Rails)
23
+ Rails.logger.error "Failed to generate embeddings for document #{document_id}: #{e.message}"
24
+ end
25
+ raise e
26
+ end
27
+ end
28
+ end
data/app/jobs/ragdoll/generate_summary_job.rb ADDED
@@ -0,0 +1,25 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "active_job"
4
+
5
+ module Ragdoll
6
+ class GenerateSummaryJob < ActiveJob::Base
7
+ queue_as :default
8
+
9
+ def perform(document_id)
10
+ document = Ragdoll::Document.find(document_id)
11
+ return unless document.content.present?
12
+ return if document.summary.present?
13
+
14
+ text_service = Ragdoll::TextGenerationService.new
15
+ summary = text_service.generate_summary(document.content)
16
+
17
+ document.update!(summary: summary) if summary.present?
18
+ rescue ActiveRecord::RecordNotFound
19
+ # Document was deleted, nothing to do
20
+ rescue StandardError => e
21
+ Rails.logger.error "Failed to generate summary for document #{document_id}: #{e.message}" if defined?(Rails)
22
+ raise e
23
+ end
24
+ end
25
+ end
data/app/lib/ragdoll/metadata_schemas.rb ADDED
@@ -0,0 +1,332 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Ragdoll
4
+ # Document metadata schemas for LLM structured output
5
+ # Each document type has a specific schema that guides LLM generation
6
+ module MetadataSchemas
7
+ # Text document metadata schema
8
+ TEXT_SCHEMA = {
9
+ type: "object",
10
+ properties: {
11
+ summary: {
12
+ type: "string",
13
+ description: "Concise summary of the text content (2-3 paragraphs)"
14
+ },
15
+ keywords: {
16
+ type: "array",
17
+ items: { type: "string" },
18
+ description: "Relevant keywords and phrases extracted from the text",
19
+ maxItems: 10
20
+ },
21
+ classification: {
22
+ type: "string",
23
+ enum: %w[research article blog documentation technical legal financial marketing other],
24
+ description: "Document classification category"
25
+ },
26
+ topics: {
27
+ type: "array",
28
+ items: { type: "string" },
29
+ description: "Main topics discussed in the document",
30
+ maxItems: 5
31
+ },
32
+ sentiment: {
33
+ type: "string",
34
+ enum: %w[positive negative neutral mixed],
35
+ description: "Overall sentiment of the text"
36
+ },
37
+ reading_time_minutes: {
38
+ type: "integer",
39
+ description: "Estimated reading time in minutes"
40
+ },
41
+ language: {
42
+ type: "string",
43
+ description: "Primary language of the text (ISO 639-1 code)"
44
+ },
45
+ complexity_level: {
46
+ type: "string",
47
+ enum: %w[beginner intermediate advanced expert],
48
+ description: "Complexity/difficulty level of the content"
49
+ },
50
+ tags: {
51
+ type: "array",
52
+ items: { type: "string" },
53
+ description: "User-defined or AI-suggested tags for organization"
54
+ }
55
+ },
56
+ required: %w[summary keywords classification]
57
+ }.freeze
58
+
59
+ # Image document metadata schema
60
+ IMAGE_SCHEMA = {
61
+ type: "object",
62
+ properties: {
63
+ description: {
64
+ type: "string",
65
+ description: "Detailed description of what is shown in the image"
66
+ },
67
+ summary: {
68
+ type: "string",
69
+ description: "Brief summary of the image content (1 paragraph)"
70
+ },
71
+ objects: {
72
+ type: "array",
73
+ items: { type: "string" },
74
+ description: "List of objects, people, or items visible in the image",
75
+ maxItems: 15
76
+ },
77
+ scene_type: {
78
+ type: "string",
79
+ enum: %w[indoor outdoor portrait landscape diagram chart screenshot artwork photo other],
80
+ description: "Type of scene or image category"
81
+ },
82
+ colors: {
83
+ type: "array",
84
+ items: { type: "string" },
85
+ description: "Dominant colors in the image",
86
+ maxItems: 5
87
+ },
88
+ style: {
89
+ type: "string",
90
+ enum: %w[photograph illustration diagram chart screenshot artwork technical drawing other],
91
+ description: "Visual style or format of the image"
92
+ },
93
+ mood: {
94
+ type: "string",
95
+ enum: %w[professional casual formal technical artistic dramatic serene energetic other],
96
+ description: "Overall mood or tone of the image"
97
+ },
98
+ text_content: {
99
+ type: "string",
100
+ description: "Any visible text in the image (OCR extracted)"
101
+ },
102
+ keywords: {
103
+ type: "array",
104
+ items: { type: "string" },
105
+ description: "Relevant keywords for image search and categorization",
106
+ maxItems: 10
107
+ },
108
+ classification: {
109
+ type: "string",
110
+ enum: %w[technical diagram photo artwork chart screenshot document other],
111
+ description: "Image classification category"
112
+ },
113
+ tags: {
114
+ type: "array",
115
+ items: { type: "string" },
116
+ description: "User-defined or AI-suggested tags for organization"
117
+ }
118
+ },
119
+ required: %w[description summary scene_type classification]
120
+ }.freeze
121
+
122
+ # Audio document metadata schema
123
+ AUDIO_SCHEMA = {
124
+ type: "object",
125
+ properties: {
126
+ summary: {
127
+ type: "string",
128
+ description: "Summary of audio content (speech transcript summary or music description)"
129
+ },
130
+ content_type: {
131
+ type: "string",
132
+ enum: %w[speech music podcast interview lecture presentation sound_effect meeting other],
133
+ description: "Type of audio content"
134
+ },
135
+ keywords: {
136
+ type: "array",
137
+ items: { type: "string" },
138
+ description: "Relevant keywords extracted from transcript or describing music",
139
+ maxItems: 10
140
+ },
141
+ classification: {
142
+ type: "string",
143
+ enum: %w[educational entertainment business technical musical interview podcast other],
144
+ description: "Audio content classification"
145
+ },
146
+ topics: {
147
+ type: "array",
148
+ items: { type: "string" },
149
+ description: "Main topics discussed (for speech) or musical elements (for music)",
150
+ maxItems: 5
151
+ },
152
+ language: {
153
+ type: "string",
154
+ description: "Language of speech content (ISO 639-1 code) or N/A for music"
155
+ },
156
+ speakers: {
157
+ type: "array",
158
+ items: { type: "string" },
159
+ description: "Number or names of speakers (for speech content)",
160
+ maxItems: 10
161
+ },
162
+ mood: {
163
+ type: "string",
164
+ enum: %w[formal casual energetic calm professional educational entertaining informative other],
165
+ description: "Overall mood or tone of the audio"
166
+ },
167
+ genre: {
168
+ type: "string",
169
+ description: "Music genre (for musical content) or speech type (for spoken content)"
170
+ },
171
+ key_quotes: {
172
+ type: "array",
173
+ items: { type: "string" },
174
+ description: "Important quotes or phrases from speech content",
175
+ maxItems: 3
176
+ },
177
+ tags: {
178
+ type: "array",
179
+ items: { type: "string" },
180
+ description: "User-defined or AI-suggested tags for organization"
181
+ }
182
+ },
183
+ required: %w[summary content_type classification]
184
+ }.freeze
185
+
186
+ # PDF document metadata schema (combines text analysis with document structure)
187
+ PDF_SCHEMA = {
188
+ type: "object",
189
+ properties: {
190
+ summary: {
191
+ type: "string",
192
+ description: "Summary of the PDF document content"
193
+ },
194
+ document_type: {
195
+ type: "string",
196
+ enum: %w[research_paper report manual presentation legal financial technical academic other],
197
+ description: "Type of PDF document"
198
+ },
199
+ keywords: {
200
+ type: "array",
201
+ items: { type: "string" },
202
+ description: "Keywords extracted from the document text",
203
+ maxItems: 15
204
+ },
205
+ classification: {
206
+ type: "string",
207
+ enum: %w[academic business legal technical manual report presentation other],
208
+ description: "Document classification category"
209
+ },
210
+ topics: {
211
+ type: "array",
212
+ items: { type: "string" },
213
+ description: "Main topics covered in the document",
214
+ maxItems: 8
215
+ },
216
+ structure: {
217
+ type: "object",
218
+ properties: {
219
+ has_table_of_contents: { type: "boolean" },
220
+ has_bibliography: { type: "boolean" },
221
+ has_figures: { type: "boolean" },
222
+ has_tables: { type: "boolean" },
223
+ estimated_pages: { type: "integer" }
224
+ }
225
+ },
226
+ reading_time_minutes: {
227
+ type: "integer",
228
+ description: "Estimated reading time in minutes"
229
+ },
230
+ complexity_level: {
231
+ type: "string",
232
+ enum: %w[beginner intermediate advanced expert],
233
+ description: "Complexity level of the content"
234
+ },
235
+ language: {
236
+ type: "string",
237
+ description: "Primary language of the document"
238
+ },
239
+ tags: {
240
+ type: "array",
241
+ items: { type: "string" },
242
+ description: "User-defined or AI-suggested tags for organization"
243
+ }
244
+ },
245
+ required: %w[summary document_type classification]
246
+ }.freeze
247
+
248
+ # Mixed/multi-modal document metadata schema
249
+ MIXED_SCHEMA = {
250
+ type: "object",
251
+ properties: {
252
+ summary: {
253
+ type: "string",
254
+ description: "Overall summary combining all content types in the document"
255
+ },
256
+ content_types: {
257
+ type: "array",
258
+ items: { type: "string", enum: %w[text image audio] },
259
+ description: "Types of content present in this multi-modal document"
260
+ },
261
+ primary_content_type: {
262
+ type: "string",
263
+ enum: %w[text image audio],
264
+ description: "The primary or dominant content type"
265
+ },
266
+ keywords: {
267
+ type: "array",
268
+ items: { type: "string" },
269
+ description: "Keywords extracted from all content types",
270
+ maxItems: 15
271
+ },
272
+ classification: {
273
+ type: "string",
274
+ enum: %w[multimedia_presentation research educational marketing technical training other],
275
+ description: "Multi-modal document classification"
276
+ },
277
+ topics: {
278
+ type: "array",
279
+ items: { type: "string" },
280
+ description: "Main topics across all content types",
281
+ maxItems: 8
282
+ },
283
+ cohesion_analysis: {
284
+ type: "string",
285
+ description: "How well the different content types work together"
286
+ },
287
+ tags: {
288
+ type: "array",
289
+ items: { type: "string" },
290
+ description: "User-defined or AI-suggested tags for organization"
291
+ }
292
+ },
293
+ required: %w[summary content_types primary_content_type classification]
294
+ }.freeze
295
+
296
+ # Get schema for document type
297
+ def self.schema_for(document_type)
298
+ case document_type.to_s.downcase
299
+ when "text", "markdown", "html"
300
+ TEXT_SCHEMA
301
+ when "image"
302
+ IMAGE_SCHEMA
303
+ when "audio"
304
+ AUDIO_SCHEMA
305
+ when "pdf", "docx"
306
+ PDF_SCHEMA
307
+ when "mixed"
308
+ MIXED_SCHEMA
309
+ else
310
+ TEXT_SCHEMA # fallback
311
+ end
312
+ end
313
+
314
+ # Get required fields for document type
315
+ def self.required_fields_for(document_type)
316
+ schema_for(document_type)[:required] || []
317
+ end
318
+
319
+ # Validate metadata against schema
320
+ def self.validate_metadata(document_type, metadata)
321
+ schema = schema_for(document_type)
322
+ required_fields = schema[:required] || []
323
+
324
+ errors = []
325
+ required_fields.each do |field|
326
+ errors << "Missing required field: #{field}" unless metadata.key?(field)
327
+ end
328
+
329
+ errors
330
+ end
331
+ end
332
+ end
data/app/models/ragdoll/audio_content.rb ADDED
@@ -0,0 +1,142 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "active_record"
4
+ require_relative "content"
5
+
6
+ module Ragdoll
7
+ class AudioContent < Content
8
+ validate :audio_data_or_transcript_present
9
+ validates :duration, numericality: { greater_than: 0 }, allow_nil: true
10
+ validates :sample_rate, numericality: { greater_than: 0 }, allow_nil: true
11
+
12
+ scope :recent, -> { order(created_at: :desc) }
13
+ scope :with_audio, -> { where.not(data: [nil, ""]) }
14
+ scope :with_transcripts, -> { where.not(content: [nil, ""]) }
15
+ scope :by_duration, lambda { |min_duration, max_duration = nil|
16
+ scope = where("duration >= ?", min_duration)
17
+ scope = scope.where("duration <= ?", max_duration) if max_duration
18
+ scope
19
+ }
20
+
21
+ # Audio content accessors - content field stores transcript for embedding
22
+ def transcript
23
+ content
24
+ end
25
+
26
+ def transcript=(value)
27
+ self.content = value
28
+ end
29
+
30
+ # Audio file data accessor
31
+ def audio_data
32
+ data
33
+ end
34
+
35
+ def audio_data=(value)
36
+ self.data = value
37
+ end
38
+
39
+ # Audio file technical properties (stored in content metadata - raw file data)
40
+ def audio_attached?
41
+ data.present?
42
+ end
43
+
44
+ def audio_size
45
+ metadata.dig("file_size") || 0
46
+ end
47
+
48
+ def audio_size=(value)
49
+ self.metadata = metadata.merge("file_size" => value)
50
+ end
51
+
52
+ def audio_content_type
53
+ metadata.dig("content_type")
54
+ end
55
+
56
+ def audio_content_type=(value)
57
+ self.metadata = metadata.merge("content_type" => value)
58
+ end
59
+
60
+ def audio_filename
61
+ metadata.dig("filename")
62
+ end
63
+
64
+ def audio_filename=(value)
65
+ self.metadata = metadata.merge("filename" => value)
66
+ end
67
+
68
+ # Audio format and technical details
69
+ def codec
70
+ metadata.dig("codec")
71
+ end
72
+
73
+ def codec=(value)
74
+ self.metadata = metadata.merge("codec" => value)
75
+ end
76
+
77
+ def bitrate
78
+ metadata.dig("bitrate")
79
+ end
80
+
81
+ def bitrate=(value)
82
+ self.metadata = metadata.merge("bitrate" => value)
83
+ end
84
+
85
+ def channels
86
+ metadata.dig("channels")
87
+ end
88
+
89
+ def channels=(value)
90
+ self.metadata = metadata.merge("channels" => value)
91
+ end
92
+
93
+ def duration_formatted
94
+ return "Unknown" unless duration
95
+
96
+ minutes = (duration / 60).floor
97
+ seconds = (duration % 60).round
98
+ "#{minutes}:#{seconds.to_s.rjust(2, '0')}"
99
+ end
100
+
101
+ # Override content for embedding to use transcript
102
+ def content_for_embedding
103
+ transcript.presence || "Audio content without transcript"
104
+ end
105
+
106
+ def generate_embeddings!
107
+ return unless should_generate_embeddings?
108
+
109
+ embedding_content = content_for_embedding
110
+ return if embedding_content.blank?
111
+
112
+ # Generate embeddings using the base class method
113
+ super
114
+ end
115
+
116
+ # Override should_generate_embeddings to check for transcript
117
+ def should_generate_embeddings?
118
+ content_for_embedding.present? && embeddings.empty?
119
+ end
120
+
121
+ def self.stats
122
+ {
123
+ total_audio_contents: count,
124
+ by_model: group(:embedding_model).count,
125
+ total_embeddings: joins(:embeddings).count,
126
+ with_audio: with_audio.count,
127
+ with_transcripts: with_transcripts.count,
128
+ total_duration: sum(:duration),
129
+ average_duration: average(:duration),
130
+ average_audio_size: joins(:audio_attachment).average("active_storage_blobs.byte_size")
131
+ }
132
+ end
133
+
134
+ private
135
+
136
+ def audio_data_or_transcript_present
137
+ return if audio_attached? || transcript.present?
138
+
139
+ errors.add(:base, "Must have either audio data or transcript")
140
+ end
141
+ end
142
+ end