ragdoll 0.1.1 → 0.1.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +1 -1
  3. data/Rakefile +52 -1
  4. data/app/jobs/ragdoll/extract_keywords_job.rb +28 -0
  5. data/app/jobs/ragdoll/extract_text_job.rb +38 -0
  6. data/app/jobs/ragdoll/generate_embeddings_job.rb +28 -0
  7. data/app/jobs/ragdoll/generate_summary_job.rb +25 -0
  8. data/app/lib/ragdoll/metadata_schemas.rb +332 -0
  9. data/app/models/ragdoll/audio_content.rb +142 -0
  10. data/app/models/ragdoll/content.rb +95 -0
  11. data/app/models/ragdoll/document.rb +611 -0
  12. data/app/models/ragdoll/embedding.rb +176 -0
  13. data/app/models/ragdoll/image_content.rb +194 -0
  14. data/app/models/ragdoll/text_content.rb +137 -0
  15. data/app/services/ragdoll/configuration_service.rb +113 -0
  16. data/app/services/ragdoll/document_management.rb +108 -0
  17. data/app/services/ragdoll/document_processor.rb +342 -0
  18. data/app/services/ragdoll/embedding_service.rb +202 -0
  19. data/app/services/ragdoll/image_description_service.rb +230 -0
  20. data/app/services/ragdoll/metadata_generator.rb +329 -0
  21. data/app/services/ragdoll/model_resolver.rb +72 -0
  22. data/app/services/ragdoll/search_engine.rb +51 -0
  23. data/app/services/ragdoll/text_chunker.rb +208 -0
  24. data/app/services/ragdoll/text_generation_service.rb +355 -0
  25. data/lib/ragdoll/core/client.rb +32 -41
  26. data/lib/ragdoll/core/configuration.rb +140 -156
  27. data/lib/ragdoll/core/database.rb +1 -1
  28. data/lib/ragdoll/core/model.rb +45 -0
  29. data/lib/ragdoll/core/version.rb +1 -1
  30. data/lib/ragdoll/core.rb +35 -17
  31. data/lib/ragdoll.rb +1 -1
  32. data/lib/tasks/annotate.rake +1 -1
  33. data/lib/tasks/db.rake +2 -2
  34. metadata +24 -20
  35. data/lib/ragdoll/core/document_management.rb +0 -110
  36. data/lib/ragdoll/core/document_processor.rb +0 -344
  37. data/lib/ragdoll/core/embedding_service.rb +0 -183
  38. data/lib/ragdoll/core/jobs/extract_keywords.rb +0 -32
  39. data/lib/ragdoll/core/jobs/extract_text.rb +0 -42
  40. data/lib/ragdoll/core/jobs/generate_embeddings.rb +0 -32
  41. data/lib/ragdoll/core/jobs/generate_summary.rb +0 -29
  42. data/lib/ragdoll/core/metadata_schemas.rb +0 -334
  43. data/lib/ragdoll/core/models/audio_content.rb +0 -175
  44. data/lib/ragdoll/core/models/content.rb +0 -126
  45. data/lib/ragdoll/core/models/document.rb +0 -678
  46. data/lib/ragdoll/core/models/embedding.rb +0 -204
  47. data/lib/ragdoll/core/models/image_content.rb +0 -227
  48. data/lib/ragdoll/core/models/text_content.rb +0 -169
  49. data/lib/ragdoll/core/search_engine.rb +0 -50
  50. data/lib/ragdoll/core/services/image_description_service.rb +0 -230
  51. data/lib/ragdoll/core/services/metadata_generator.rb +0 -335
  52. data/lib/ragdoll/core/text_chunker.rb +0 -210
  53. data/lib/ragdoll/core/text_generation_service.rb +0 -360
@@ -1,678 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require "active_record"
4
- require_relative "../metadata_schemas"
5
-
6
- # == Schema Information
7
- #
8
- # Table name: ragdoll_documents
9
- #
10
- # id :bigint not null, primary key
11
- # document_type(Document format type (text, image, audio, pdf, docx, html, markdown, mixed)) :string default("text"), not null
12
- # (no file_data column - file data stored in content models via STI)
13
- # file_metadata(File properties and processing metadata, separate from AI-generated content) :json
14
- # location(Source location of document (file path, URL, or identifier)) :string not null
15
- # metadata(LLM-generated structured metadata using document-type-specific schemas) :json
16
- # status(Document processing status: pending, processing, processed, error) :string default("pending"), not null
17
- # title(Human-readable document title for display and search) :string not null
18
- # file_modified_at(Timestamp when the source file was last modified) :datetime not null
19
- # created_at(Standard creation and update timestamps) :datetime not null
20
- # updated_at(Standard creation and update timestamps) :datetime not null
21
- #
22
- # Indexes
23
- #
24
- # index_ragdoll_documents_on_created_at (created_at)
25
- # index_ragdoll_documents_on_document_type (document_type)
26
- # index_ragdoll_documents_on_document_type_and_status (document_type,status)
27
- # index_ragdoll_documents_on_fulltext_search (to_tsvector('english'::regconfig, (((((((COALESCE(title, ''::character varying))::text || ' '::text) || COALESCE((metadata ->> 'summary'::text), ''::text)) || ' '::text) || COALESCE((metadata ->> 'keywords'::text), ''::text)) || ' '::text) || COALESCE((metadata ->> 'description'::text), ''::text)))) USING gin
28
- # index_ragdoll_documents_on_location (location) UNIQUE
29
- # index_ragdoll_documents_on_metadata_classification (((metadata ->> 'classification'::text)))
30
- # index_ragdoll_documents_on_metadata_type (((metadata ->> 'document_type'::text)))
31
- # index_ragdoll_documents_on_status (status)
32
- # index_ragdoll_documents_on_title (title)
33
- #
34
-
35
- module Ragdoll
36
- module Core
37
- module Models
38
- class Document < ActiveRecord::Base
39
- self.table_name = "ragdoll_documents"
40
-
41
- # PostgreSQL full-text search on summary and keywords
42
- # Uses PostgreSQL's built-in full-text search capabilities
43
-
44
- # File handling moved to content models - no Shrine attachment at document level
45
-
46
- # Multi-modal content relationships using STI
47
# Every content row owned by this document, regardless of its STI type.
has_many :contents,
         class_name: "Ragdoll::Core::Models::Content",
         foreign_key: "document_id",
         dependent: :destroy

# Type-filtered views over the same content rows.
has_many :text_contents,
         -> { where(type: "Ragdoll::Core::Models::TextContent") },
         class_name: "Ragdoll::Core::Models::TextContent",
         foreign_key: "document_id"

has_many :image_contents,
         -> { where(type: "Ragdoll::Core::Models::ImageContent") },
         class_name: "Ragdoll::Core::Models::ImageContent",
         foreign_key: "document_id"

has_many :audio_contents,
         -> { where(type: "Ragdoll::Core::Models::AudioContent") },
         class_name: "Ragdoll::Core::Models::AudioContent",
         foreign_key: "document_id"

# All embeddings across content types
has_many :text_embeddings, through: :text_contents, source: :embeddings
has_many :image_embeddings, through: :image_contents, source: :embeddings
has_many :audio_embeddings, through: :audio_contents, source: :embeddings

validates :location, presence: true
validates :title, presence: true
validates :document_type, presence: true,
                          inclusion: { in: %w[text image audio pdf docx html markdown mixed] }
# Summaries and keywords may be empty initially, so they are deliberately not
# validated. (The former `presence: false` declarations registered no validator.)
validates :status, inclusion: { in: %w[pending processing processed error] }
validates :file_modified_at, presence: true

# Ensure location is always an absolute path for file paths
before_validation :normalize_location
before_validation :set_default_file_modified_at

# JSON columns are handled natively by PostgreSQL - no serialization needed

scope :processed, -> { where(status: "processed") }
scope :by_type, ->(type) { where(document_type: type) }
scope :recent, -> { order(created_at: :desc) }
scope :with_content, -> { joins(:contents).distinct }
scope :without_content, -> { left_joins(:contents).where(contents: { id: nil }) }

# Turn a value assigned through `content=` into a content row once the
# surrounding transaction has committed.
after_commit :create_content_from_pending, on: %i[create update],
                                           if: :has_pending_content?
97
# Whether processing of this document has finished.
#
# @return [Boolean] true only when +status+ is "processed"
def processed?
  "processed" == status
end
100
-
101
- # Multi-modal content type detection
102
# Whether the document carries content of at least two different kinds.
#
# @return [Boolean]
def multi_modal?
  content_types.size >= 2
end
105
-
106
# Lists the kinds of content attached to this document.
#
# @return [Array<String>] any of "text", "image", "audio" (in that order)
#   for which the matching association has at least one row
def content_types
  {
    "text" => text_contents,
    "image" => image_contents,
    "audio" => audio_contents
  }.select { |_kind, relation| relation.any? }.keys
end
113
-
114
# Picks the single content kind that best represents this document.
#
# A +document_type+ of "text", "image" or "audio" is used as-is. Any other
# type (pdf, docx, ...) falls back to the first kind of attached content,
# and finally to "text".
#
# @return [String] "text", "image" or "audio"
def primary_content_type
  return document_type if %w[text image audio].include?(document_type)

  # Evaluate content_types once: every call re-checks each association.
  content_types.first || "text"
end
120
-
121
- # Dynamic content method that forwards to appropriate content table
122
# Combined textual content of the document.
#
# Reads the +content+ column of the rows matching the primary content type
# (text bodies, image descriptions or audio transcripts), drops nils and joins
# the rest with a blank line. Any other primary type reads from all contents.
#
# @return [String] possibly empty
def content
  source =
    case primary_content_type
    when "text" then text_contents
    when "image" then image_contents
    when "audio" then audio_contents
    else contents
    end

  source.pluck(:content).compact.join("\n\n")
end
138
-
139
- # Set content method for backwards compatibility
140
# Backwards-compatible content setter.
#
# The value is parked in +@pending_content+. For an already persisted
# document the content row is written immediately; otherwise the pending
# value is left for +create_content_from_pending+ to pick up later.
#
# @param value [String] content to store
def content=(value)
  @pending_content = value
  create_content_from_pending if persisted?
end
149
-
150
- # Content statistics
151
# Sum of +word_count+ over all text contents.
#
# @return [Integer]
def total_word_count
  text_contents.sum(&:word_count)
end
154
-
155
# Sum of +character_count+ over all text contents.
#
# @return [Integer]
def total_character_count
  text_contents.sum(&:character_count)
end
158
-
159
# Number of embeddings across text, image and audio contents.
#
# @return [Integer]
def total_embedding_count
  [text_embeddings, image_embeddings, audio_embeddings].sum(&:count)
end
162
-
163
# Embedding counts broken down by content kind.
#
# @return [Hash{Symbol => Integer}] with keys :text, :image and :audio
def embeddings_by_type
  text_total = text_embeddings.count
  image_total = image_embeddings.count
  audio_total = audio_embeddings.count

  { text: text_total, image: image_total, audio: audio_total }
end
170
-
171
- # Document metadata methods - now using dedicated columns
172
# Whether a non-blank summary is available.
#
# NOTE(review): the schema annotation at the top of this file lists no
# `summary` column - confirm where the `summary` reader is defined.
#
# @return [Boolean]
def has_summary?
  !summary.blank?
end
175
-
176
# Whether any keywords are recorded.
#
# NOTE(review): the schema annotation at the top of this file lists no
# `keywords` column - confirm where the `keywords` reader is defined.
#
# @return [Boolean]
def has_keywords?
  !keywords.blank?
end
179
-
180
# Keywords as a list, whatever form they are stored in.
#
# @return [Array] the stored array itself when +keywords+ is an Array; the
#   comma-separated pieces (stripped, empties removed) when it is a String;
#   an empty array for blank or any other value
def keywords_array
  stored = keywords
  return [] if stored.blank?
  return stored if stored.is_a?(Array)
  return [] unless stored.is_a?(String)

  stored.split(",").map(&:strip).reject(&:empty?)
end
192
-
193
# Appends a keyword unless it is already listed.
#
# @param keyword [String] keyword to add; surrounding whitespace is ignored
# @return [String, nil] the new comma-separated keyword string, or nil when
#   the keyword was already present
def add_keyword(keyword)
  existing = keywords_array
  candidate = keyword.strip
  return if existing.include?(candidate)

  self.keywords = (existing << candidate).join(", ")
end
200
-
201
# Drops a keyword from the list and stores the remainder.
#
# @param keyword [String] keyword to remove; surrounding whitespace is ignored
# @return [String] the new comma-separated keyword string
def remove_keyword(keyword)
  remaining = keywords_array.tap { |list| list.delete(keyword.strip) }
  self.keywords = remaining.join(", ")
end
206
-
207
- # Metadata accessors for common fields
208
# Reads the "description" entry of the metadata JSON.
#
# Tolerates a document whose +metadata+ is still nil.
#
# @return [Object, nil] the stored description, or nil when absent
def description
  (metadata || {})["description"]
end
211
-
212
# Stores +value+ under the "description" key of the metadata JSON.
#
# Other metadata keys are kept; a nil +metadata+ is treated as empty.
#
# @param value [Object] description to store
def description=(value)
  self.metadata = (metadata || {}).merge("description" => value)
end
215
-
216
# Reads the "classification" entry of the metadata JSON.
#
# Tolerates a document whose +metadata+ is still nil.
#
# @return [Object, nil] the stored classification, or nil when absent
def classification
  (metadata || {})["classification"]
end
219
-
220
# Stores +value+ under the "classification" key of the metadata JSON.
#
# Other metadata keys are kept; a nil +metadata+ is treated as empty.
#
# @param value [Object] classification to store
def classification=(value)
  self.metadata = (metadata || {}).merge("classification" => value)
end
223
-
224
# Reads the "tags" entry of the metadata JSON.
#
# @return [Object] the stored tags, or an empty array when +metadata+ is nil
#   or has no "tags" entry
def tags
  (metadata || {})["tags"] || []
end
227
-
228
# Stores +value+ (coerced with Array()) under the "tags" key of the metadata JSON.
#
# Other metadata keys are kept; a nil +metadata+ is treated as empty.
#
# @param value [Object] a single tag, a list of tags, or nil
def tags=(value)
  self.metadata = (metadata || {}).merge("tags" => Array(value))
end
231
-
232
- # File-related helper methods - now delegated to content models
233
# Whether at least one content row has non-blank +data+.
#
# @return [Boolean]
def has_files?
  contents.any? { |item| !item.data.blank? }
end
236
-
237
# Adds up the 'file_size' entry of every content row's metadata.
#
# Rows without a 'file_size' entry contribute 0.
#
# @return [Numeric]
def total_file_size
  contents.inject(0) do |bytes, item|
    bytes + (item.metadata.dig('file_size') || 0)
  end
end
241
-
242
# The document's primary file type, which is simply its +document_type+.
#
# @return [String]
def primary_file_type
  document_type
end
246
-
247
- # Content processing for multi-modal documents
248
# Runs the document-level processing pipeline.
#
# Order matters: embeddings are generated for every content row first, then
# the structured metadata, and only afterwards is the status set to
# "processed" via +update!+.
def process_content!
  generate_embeddings_for_all_content!
  generate_metadata!

  update!(status: "processed")
end
260
-
261
- # Generate embeddings for all content types
262
# Calls +generate_embeddings!+ on every text, image and audio content row,
# in that order.
def generate_embeddings_for_all_content!
  [text_contents, image_contents, audio_contents].each do |group|
    group.each(&:generate_embeddings!)
  end
end
267
-
268
- # Generate structured metadata using LLM
269
# Generates structured metadata for this document and saves it.
#
# The generated hash is checked with MetadataSchemas.validate_metadata;
# validation problems are reported but do not stop the save. Generated keys
# are merged over the existing metadata, so on a key clash the generated
# value wins.
# NOTE(review): the original comment said user-set values are preserved,
# which is not what +merge+ does here - confirm the intended precedence.
#
# Best-effort: any StandardError is reported and swallowed.
def generate_metadata!
  require_relative "../services/metadata_generator"

  generated_metadata = Services::MetadataGenerator.new.generate_for_document(self)

  # Named so it does not shadow the model's own +errors+ collection.
  validation_errors = MetadataSchemas.validate_metadata(document_type, generated_metadata)
  if validation_errors.any?
    warning = "Metadata validation errors: #{validation_errors.join(', ')}"
    Rails.logger.warn(warning) if defined?(Rails)
    puts warning
  end

  # A never-processed document may still have nil metadata; treat it as empty
  # instead of failing into the rescue below and saving nothing.
  self.metadata = (metadata || {}).merge(generated_metadata)
  save!
rescue StandardError => e
  failure = "Metadata generation failed: #{e.message}"
  Rails.logger.error(failure) if defined?(Rails)
  puts failure
end
289
-
290
- # PostgreSQL full-text search on metadata fields
291
# Full-text search over title and the summary, keywords and description
# entries of the metadata JSON.
#
# @param query [String] free-text query; a blank query yields an empty relation
# @param options [Hash] only +:limit+ is read (default 20)
# @return [ActiveRecord::Relation]
def self.search_content(query, **options)
  return none if query.blank?

  fulltext_condition = "to_tsvector('english', COALESCE(title, '') || ' ' || COALESCE(metadata->>'summary', '') || ' ' || COALESCE(metadata->>'keywords', '') || ' ' || COALESCE(metadata->>'description', '')) @@ plainto_tsquery('english', ?)"
  max_rows = options[:limit] || 20

  where(fulltext_condition, query).limit(max_rows)
end
300
-
301
- # Faceted search by metadata fields
302
# Narrows documents by metadata facets and an optional full-text query.
#
# All given filters are ANDed together.
#
# @param query [String, nil] optional full-text query
# @param keywords [Array<String>, nil] each must occur (case-insensitively)
#   in the metadata keywords text
# @param classification [String, nil] exact classification to match
# @param tags [Array<String>, nil] each must be contained in the metadata tags
# @param options [Hash] only +:limit+ is read (default 20)
# @return [ActiveRecord::Relation]
def self.faceted_search(query: nil, keywords: [], classification: nil, tags: [], **options)
  scope = all

  # Escape LIKE wildcards so a keyword such as "50%" is matched literally.
  Array(keywords).each do |keyword|
    scope = scope.where("metadata->>'keywords' ILIKE ?", "%#{sanitize_sql_like(keyword.to_s)}%")
  end

  scope = scope.where("metadata->>'classification' = ?", classification) if classification.present?

  # The key-exists operator `?` is itself a question mark, so a positional
  # bind here makes ActiveRecord count two placeholders for one value and
  # raise. A named bind leaves the operator untouched.
  # NOTE(review): `?` and `@>` are jsonb operators, while the schema
  # annotation lists metadata as json - confirm the real column type.
  Array(tags).each do |tag|
    scope = scope.where("metadata ? 'tags' AND metadata->'tags' @> :tag_json", tag_json: [tag].to_json)
  end

  if query.present?
    scope = scope.where(
      "to_tsvector('english', COALESCE(title, '') || ' ' || COALESCE(metadata->>'summary', '') || ' ' || COALESCE(metadata->>'keywords', '') || ' ' || COALESCE(metadata->>'description', '')) @@ plainto_tsquery('english', ?)",
      query
    )
  end

  scope.limit(options[:limit] || 20)
end
332
-
333
- # Get all unique keywords from metadata
334
# Collects every distinct keyword found in document metadata.
#
# Array values are taken as-is; String values are split on commas and
# stripped; any other value is ignored.
#
# @return [Array] unique keywords, sorted
def self.all_keywords
  gathered = where("metadata ? 'keywords'").pluck(:metadata).flat_map do |meta|
    raw = meta["keywords"]
    if raw.is_a?(Array)
      raw
    elsif raw.is_a?(String)
      raw.split(",").map(&:strip)
    else
      []
    end
  end

  gathered.uniq.sort
end
346
-
347
- # Get all unique classifications
348
# Lists every distinct classification found in document metadata.
#
# The plucked expression is raw SQL, so it is wrapped in Arel.sql: recent
# ActiveRecord versions refuse a bare string that is not a plain column name.
#
# @return [Array<String>] unique classifications, sorted, nils removed
def self.all_classifications
  where("metadata ? 'classification'")
    .distinct
    .pluck(Arel.sql("metadata->>'classification'"))
    .compact
    .sort
end
351
-
352
- # Get all unique tags
353
# Collects every distinct tag found in document metadata.
#
# @return [Array] unique tags, sorted
def self.all_tags
  where("metadata ? 'tags'")
    .pluck(:metadata)
    .flat_map { |meta| Array(meta["tags"]) }
    .uniq
    .sort
end
360
-
361
- # Get keyword frequencies for faceted search
362
# Counts how often each keyword occurs across document metadata.
#
# Array values are counted element by element; String values are split on
# commas and stripped first; any other value is ignored.
#
# @return [Hash{Object => Integer}] keyword => occurrences, most frequent first
def self.keyword_frequencies
  rows = where("metadata ? 'keywords'").pluck(:metadata)

  counts = rows.each_with_object(Hash.new(0)) do |meta, seen|
    raw = meta["keywords"]
    terms =
      if raw.is_a?(Array)
        raw
      elsif raw.is_a?(String)
        raw.split(",").map(&:strip)
      else
        []
      end
    terms.each { |term| seen[term] += 1 }
  end

  counts.sort_by { |_term, hits| -hits }.to_h
end
374
-
375
- # Hybrid search combining semantic and PostgreSQL full-text search
376
# Combines semantic (embedding) search with PostgreSQL full-text search.
#
# Semantic hits are scored as combined_score * semantic_weight. Full-text
# hits are scored by rank: (limit - index) / limit * text_weight. Hits for the
# same document_id are merged: the best-scoring hit is kept, its
# :combined_score becomes the sum of all weighted scores and :search_types
# lists which searches found it.
#
# @param query [String] free-text query
# @param query_embedding [Object, nil] embedding of the query; semantic search
#   is skipped when nil
# @param options [Hash] +:limit+ (default 20), +:semantic_weight+ (default
#   0.7) and +:text_weight+ (default 0.3)
# @return [Array<Hash>] at most +limit+ results, best combined score first
def self.hybrid_search(query, query_embedding: nil, **options)
  limit = options[:limit] || 20
  semantic_weight = options[:semantic_weight] || 0.7
  text_weight = options[:text_weight] || 0.3

  semantic_hits =
    if query_embedding
      embeddings_search(query_embedding, limit: limit).map do |hit|
        hit.merge(
          search_type: "semantic",
          weighted_score: hit[:combined_score] * semantic_weight
        )
      end
    else
      []
    end

  text_hits = search_content(query, limit: limit).each_with_index.map do |doc, rank|
    {
      document_id: doc.id.to_s,
      document_title: doc.title,
      document_location: doc.location,
      content: doc.content[0..500], # Preview
      search_type: "full_text",
      weighted_score: (limit - rank).to_f / limit * text_weight,
      document: doc
    }
  end

  # One entry per document: keep the strongest hit, add up all its scores.
  merged = (semantic_hits + text_hits).group_by { |hit| hit[:document_id] }.values.map do |hits|
    strongest = hits.max_by { |hit| hit[:weighted_score] }
    strongest.merge(
      combined_score: hits.sum { |hit| hit[:weighted_score] },
      search_types: hits.map { |hit| hit[:search_type] }.uniq
    )
  end

  merged.sort_by { |hit| -hit[:combined_score] }.take(limit)
end
424
-
425
- # Extract keywords from query string (words > 4 characters)
426
# Picks the longer words out of a query string.
#
# @param query [String, nil] free-text query
# @return [Array<String>] whitespace-separated words longer than 4
#   characters, in their original order; empty for a nil or blank query
def self.extract_keywords(query:)
  return [] if query.nil? || query.strip.empty?

  query.split(/\s+/).each_with_object([]) do |token, picked|
    word = token.strip
    picked << word if !word.empty? && word.length > 4
  end
end
434
-
435
- # Get search data for indexing
436
# Flat hash describing this document for search indexing.
#
# Core attributes and content statistics use symbol keys. Document metadata
# entries are added under "metadata_<key>" and file metadata entries under
# "file_<key>" when those hashes are present.
#
# @return [Hash]
def search_data
  payload = {
    title: title,
    document_type: document_type,
    location: location,
    status: status,
    total_word_count: total_word_count,
    total_character_count: total_character_count,
    total_embedding_count: total_embedding_count,
    content_types: content_types,
    multi_modal: multi_modal?
  }

  payload.update(metadata.transform_keys { |key| "metadata_#{key}" }) if metadata.present?
  payload.update(file_metadata.transform_keys { |key| "file_#{key}" }) if file_metadata.present?

  payload
end
457
-
458
# Embeddings belonging to this document's content rows.
#
# @param content_type [String, Symbol, nil] "text", "image" or "audio" to
#   restrict the result to one kind; nil for all kinds; any other value
#   yields an empty relation
# @return [ActiveRecord::Relation] relation over Embedding
def all_embeddings(content_type: nil)
  relations =
    if content_type
      case content_type.to_s
      when 'text' then [text_contents]
      when 'image' then [image_contents]
      when 'audio' then [audio_contents]
      else []
      end
    else
      [text_contents, image_contents, audio_contents]
    end

  # pluck already returns [] for an empty association, so no separate
  # existence query is needed before it.
  content_ids = relations.flat_map { |relation| relation.pluck(:id) }
  return Ragdoll::Core::Models::Embedding.none if content_ids.empty?

  # Use the base STI class name 'Ragdoll::Core::Models::Content' as that's what's stored
  # in polymorphic associations with STI
  Ragdoll::Core::Models::Embedding.where(
    embeddable_type: 'Ragdoll::Core::Models::Content',
    embeddable_id: content_ids
  )
end
487
-
488
- private
489
-
490
-
491
# Whether a value assigned through +content=+ is still waiting to be stored.
#
# @return [Boolean]
def has_pending_content?
  !@pending_content.blank?
end
494
-
495
# Writes the pending content value into the matching content association.
#
# The pending value is cleared before writing. For a primary content type of
# "text", "image" or "audio" the first existing row of that kind is updated,
# or a new row is created when there is none. The +content+ column holds the
# text body, image description or audio transcript respectively. New rows get
# the default embedding model for their kind and +manually_set+ metadata.
def create_content_from_pending
  return if @pending_content.blank?

  value = @pending_content
  @pending_content = nil

  kind = primary_content_type
  relation, embedding_model =
    case kind
    when "image" then [image_contents, default_image_model]
    when "audio" then [audio_contents, default_audio_model]
    else [text_contents, default_text_model]
    end

  # An unrecognized kind never updates an existing row; it always adds text.
  recognized = %w[text image audio].include?(kind)
  if recognized && relation.any?
    relation.first.update!(content: value)
  else
    relation.create!(
      content: value,
      embedding_model: embedding_model,
      metadata: { manually_set: true }
    )
  end
end
544
-
545
-
546
# Forwards a similarity search to the Embedding model.
#
# @param query_embedding [Object] embedding to compare against
# @param options [Hash] passed through unchanged to Embedding.search_similar
# @return [Object] whatever Embedding.search_similar returns
def self.embeddings_search(query_embedding, **options)
  embedding_model = Ragdoll::Core::Models::Embedding
  embedding_model.search_similar(query_embedding, **options)
end
549
-
550
- # File processing is now handled by DocumentProcessor and content models
551
- # These methods are no longer needed at the document level
552
-
553
- # Default model names for each content type
554
# Embedding model name recorded on text content created by this document.
#
# @return [String]
def default_text_model
  "text-embedding-3-large"
end
557
-
558
# Embedding model name recorded on image content created by this document.
#
# @return [String]
def default_image_model
  "clip-vit-large-patch14"
end
561
-
562
# Embedding model name recorded on audio content created by this document.
#
# @return [String]
def default_audio_model
  "whisper-embedding-v1"
end
565
-
566
- # File extraction is now handled by DocumentProcessor
567
- # Content-specific extraction is handled by individual content models
568
-
569
- # Get document statistics
570
# Aggregate statistics for the documents in the current scope.
#
# A document counts as multi-modal when it has content of at least two
# kinds. Ids from the three pairwise joins are unioned, so a document that
# has text, image and audio content is counted once rather than three times.
#
# @return [Hash] totals by status and type, content and embedding counts
def self.stats
  kind_pairs = [
    %i[text_contents image_contents],
    %i[text_contents audio_contents],
    %i[image_contents audio_contents]
  ]
  multi_modal_ids = kind_pairs.flat_map { |pair| joins(*pair).distinct.pluck(:id) }.uniq

  {
    total_documents: count,
    by_status: group(:status).count,
    by_type: group(:document_type).count,
    multi_modal_documents: multi_modal_ids.size,
    total_text_contents: joins(:text_contents).count,
    total_image_contents: joins(:image_contents).count,
    total_audio_contents: joins(:audio_contents).count,
    total_embeddings: {
      text: joins(:text_embeddings).count,
      image: joins(:image_embeddings).count,
      audio: joins(:audio_embeddings).count
    },
    storage_type: "activerecord_polymorphic"
  }
end
589
-
590
- public
591
-
592
- # Convert document to hash representation for API responses
593
# Hash representation of the document for API responses.
#
# @param include_content [Boolean] when true, adds a :content_details entry
#   holding the text contents, image descriptions and audio transcripts
# @return [Hash]
def to_hash(include_content: false)
  payload = {
    id: id.to_s,
    title: title,
    location: location,
    document_type: document_type,
    status: status,
    content_length: content&.length || 0,
    file_modified_at: file_modified_at&.iso8601,
    created_at: created_at&.iso8601,
    updated_at: updated_at&.iso8601,
    metadata: metadata || {},
    content_summary: {
      text_contents: text_contents.count,
      image_contents: image_contents.count,
      audio_contents: audio_contents.count,
      embeddings_count: total_embeddings_count,
      embeddings_ready: status == "processed"
    }
  }
  return payload unless include_content

  payload[:content_details] = {
    text_content: text_contents.map(&:content),
    image_descriptions: image_contents.map(&:description),
    audio_transcripts: audio_contents.map(&:transcript)
  }
  payload
end
622
-
623
- private
624
-
625
# Counts embeddings by visiting every content row and asking it for the
# number of its own embeddings.
#
# @return [Integer]
def total_embeddings_count
  [text_contents, image_contents, audio_contents].sum do |group|
    group.sum { |item| item.embeddings.count }
  end
end
646
-
647
- # Normalize location to absolute path for file paths
648
# Expands a file-path location to an absolute path.
#
# Blank locations and locations starting with http://, https://, ftp:// or
# sftp:// are left untouched.
def normalize_location
  return if location.blank?

  remote = location.start_with?("http://", "https://", "ftp://", "sftp://")
  self.location = File.expand_path(location) unless remote
end
657
-
658
- # Set default file_modified_at if not provided
659
# Fills in +file_modified_at+ when none was given.
#
# A location that is a path to an existing file uses that file's mtime.
# URLs (http, https, ftp, sftp), blank locations and missing files use the
# current time.
def set_default_file_modified_at
  return if file_modified_at.present?

  local_path = location.present? &&
               !location.start_with?("http://", "https://", "ftp://", "sftp://")

  self.file_modified_at =
    if local_path && File.exist?(File.expand_path(location))
      File.mtime(File.expand_path(location))
    else
      Time.current
    end
end
675
- end
676
- end
677
- end
678
- end