ragdoll 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +353 -0
  3. data/Rakefile +21 -0
  4. data/db/migrate/001_enable_postgresql_extensions.rb +23 -0
  5. data/db/migrate/004_create_ragdoll_documents.rb +70 -0
  6. data/db/migrate/005_create_ragdoll_embeddings.rb +41 -0
  7. data/db/migrate/006_create_ragdoll_contents.rb +47 -0
  8. data/lib/ragdoll/core/client.rb +315 -0
  9. data/lib/ragdoll/core/configuration.rb +273 -0
  10. data/lib/ragdoll/core/database.rb +141 -0
  11. data/lib/ragdoll/core/document_management.rb +110 -0
  12. data/lib/ragdoll/core/document_processor.rb +344 -0
  13. data/lib/ragdoll/core/embedding_service.rb +183 -0
  14. data/lib/ragdoll/core/errors.rb +11 -0
  15. data/lib/ragdoll/core/jobs/extract_keywords.rb +32 -0
  16. data/lib/ragdoll/core/jobs/extract_text.rb +42 -0
  17. data/lib/ragdoll/core/jobs/generate_embeddings.rb +32 -0
  18. data/lib/ragdoll/core/jobs/generate_summary.rb +29 -0
  19. data/lib/ragdoll/core/metadata_schemas.rb +334 -0
  20. data/lib/ragdoll/core/models/audio_content.rb +175 -0
  21. data/lib/ragdoll/core/models/content.rb +126 -0
  22. data/lib/ragdoll/core/models/document.rb +678 -0
  23. data/lib/ragdoll/core/models/embedding.rb +204 -0
  24. data/lib/ragdoll/core/models/image_content.rb +227 -0
  25. data/lib/ragdoll/core/models/text_content.rb +169 -0
  26. data/lib/ragdoll/core/search_engine.rb +50 -0
  27. data/lib/ragdoll/core/services/image_description_service.rb +230 -0
  28. data/lib/ragdoll/core/services/metadata_generator.rb +335 -0
  29. data/lib/ragdoll/core/shrine_config.rb +71 -0
  30. data/lib/ragdoll/core/text_chunker.rb +210 -0
  31. data/lib/ragdoll/core/text_generation_service.rb +360 -0
  32. data/lib/ragdoll/core/version.rb +8 -0
  33. data/lib/ragdoll/core.rb +73 -0
  34. data/lib/ragdoll-core.rb +3 -0
  35. data/lib/ragdoll.rb +249 -0
  36. data/lib/tasks/annotate.rake +126 -0
  37. data/lib/tasks/db.rake +338 -0
  38. metadata +80 -0
@@ -0,0 +1,175 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "active_record"
4
+ require_relative "content"
5
+
6
+ # == Schema Information
7
+ #
8
+ # Table name: ragdoll_contents (STI)
9
+ #
10
+ # id :bigint not null, primary key
11
+ # type(Type of content - TextContent, ImageContent, AudioContent) :string not null
12
+ # document_id(Reference to parent document) :bigint not null
13
+ # embedding_model(Embedding model to use for this content) :string not null
14
+ # content(Text content or description of the file) :text
15
+ # data(Raw data from file) :text
16
+ # metadata(Additional metadata about the file's raw data) :json default({})
17
+ # duration(Duration of audio in seconds - for audio content) :float
18
+ # sample_rate(Audio sample rate in Hz - for audio content) :integer
19
+ # created_at(Standard creation and update timestamps) :datetime not null
20
+ # updated_at(Standard creation and update timestamps) :datetime not null
21
+ #
22
+ # Indexes
23
+ #
24
+ # index_ragdoll_contents_on_document_id (document_id)
25
+ # index_ragdoll_contents_on_embedding_model (embedding_model)
26
+ # index_ragdoll_contents_on_type (type)
27
+ # index_ragdoll_contents_on_fulltext_search (to_tsvector('english'::regconfig, COALESCE(content, ''::text))) USING gin
28
+ #
29
+ # Foreign Keys
30
+ #
31
+ # fk_rails_... (document_id => ragdoll_documents.id)
32
+ #
33
+
34
module Ragdoll
  module Core
    module Models
      # STI subclass of Content describing one audio file.
      #
      # Column usage (see the schema annotation for this table):
      # * +content+  - the transcript; this is the text that gets embedded
      # * +data+     - raw data from the audio file
      # * +metadata+ - file-level details stored under string keys
      #   ('file_size', 'content_type', 'filename', 'codec', 'bitrate', 'channels')
      # * +duration+ / +sample_rate+ - dedicated numeric columns
      class AudioContent < Content
        validate :audio_data_or_transcript_present
        validates :duration, numericality: { greater_than: 0 }, allow_nil: true
        validates :sample_rate, numericality: { greater_than: 0 }, allow_nil: true

        scope :recent, -> { order(created_at: :desc) }
        scope :with_audio, -> { where.not(data: [nil, ""]) }
        scope :with_transcripts, -> { where.not(content: [nil, ""]) }
        # Records whose duration is at least +min_duration+ and, when given,
        # at most +max_duration+ (both in seconds).
        scope :by_duration, lambda { |min_duration, max_duration = nil|
          # Local is deliberately not called `scope`, which would shadow the class macro.
          relation = where("duration >= ?", min_duration)
          relation = relation.where("duration <= ?", max_duration) if max_duration
          relation
        }

        # Audio content accessors - the content column stores the transcript for embedding.
        #
        # @return [String, nil] the transcript text
        def transcript
          content
        end

        # @param value [String, nil] transcript text to store in +content+
        def transcript=(value)
          self.content = value
        end

        # @return [String, nil] the raw audio payload held in +data+
        def audio_data
          data
        end

        # @param value [String, nil] raw audio payload to store in +data+
        def audio_data=(value)
          self.data = value
        end

        # @return [Boolean] true when +data+ is non-blank
        def audio_attached?
          data.present?
        end

        # Audio file technical properties (stored in the metadata hash).

        # @return [Integer, Object] metadata 'file_size', or 0 when it is not set
        def audio_size
          audio_metadata_value("file_size") || 0
        end

        def audio_size=(value)
          assign_audio_metadata("file_size", value)
        end

        # @return [Object, nil] metadata 'content_type'
        def audio_content_type
          audio_metadata_value("content_type")
        end

        def audio_content_type=(value)
          assign_audio_metadata("content_type", value)
        end

        # @return [Object, nil] metadata 'filename'
        def audio_filename
          audio_metadata_value("filename")
        end

        def audio_filename=(value)
          assign_audio_metadata("filename", value)
        end

        # Audio format and technical details.

        # @return [Object, nil] metadata 'codec'
        def codec
          audio_metadata_value("codec")
        end

        def codec=(value)
          assign_audio_metadata("codec", value)
        end

        # @return [Object, nil] metadata 'bitrate'
        def bitrate
          audio_metadata_value("bitrate")
        end

        def bitrate=(value)
          assign_audio_metadata("bitrate", value)
        end

        # @return [Object, nil] metadata 'channels'
        def channels
          audio_metadata_value("channels")
        end

        def channels=(value)
          assign_audio_metadata("channels", value)
        end

        # Human-readable duration as "M:SS", or "Unknown" when duration is nil.
        #
        # The total is rounded to whole seconds *before* it is split into
        # minutes and seconds. Rounding only the remainder turned 59.6 into
        # "0:60" instead of "1:00".
        #
        # @return [String]
        def duration_formatted
          return "Unknown" unless duration

          minutes, seconds = duration.round.divmod(60)
          "#{minutes}:#{seconds.to_s.rjust(2, '0')}"
        end

        # Text handed to the embedding pipeline: the transcript when one is
        # present, otherwise a fixed placeholder string.
        #
        # @return [String]
        def content_for_embedding
          transcript.presence || "Audio content without transcript"
        end

        # Generates embeddings through the base-class implementation, but only
        # when #should_generate_embeddings? allows it.
        def generate_embeddings!
          return unless should_generate_embeddings?

          # The base class performs chunking, embedding and the blank-content check.
          super
        end

        # True when there is text to embed and no embeddings exist yet.
        # NOTE: #content_for_embedding falls back to a non-blank placeholder, so
        # in practice this only depends on whether embeddings already exist.
        #
        # @return [Boolean]
        def should_generate_embeddings?
          content_for_embedding.present? && embeddings.empty?
        end

        # Aggregate statistics over audio contents.
        #
        # +average_audio_size+ is computed from the 'file_size' metadata key,
        # which is where #audio_size= stores it. The previous implementation
        # joined an +audio_attachment+ association and read
        # +active_storage_blobs.byte_size+; neither is defined for this model,
        # so the call raised. Blank values are mapped to NULL so the cast
        # cannot fail on an empty string; AVG ignores NULLs.
        #
        # @return [Hash{Symbol => Object}]
        def self.stats
          {
            total_audio_contents: count,
            by_model: group(:embedding_model).count,
            total_embeddings: joins(:embeddings).count,
            with_audio: with_audio.count,
            with_transcripts: with_transcripts.count,
            total_duration: sum(:duration),
            average_duration: average(:duration),
            average_audio_size: average("NULLIF(metadata->>'file_size', '')::numeric")
          }
        end

        private

        # Validation: a record needs audio data, a transcript, or both.
        def audio_data_or_transcript_present
          return if audio_attached? || transcript.present?

          errors.add(:base, "Must have either audio data or transcript")
        end

        # Reads +key+ from the metadata hash, tolerating a nil metadata column.
        def audio_metadata_value(key)
          (metadata || {})[key]
        end

        # Writes +key+ into a copy of the metadata hash and reassigns the
        # attribute, so the change is registered as an attribute assignment.
        def assign_audio_metadata(key, value)
          self.metadata = (metadata || {}).merge(key => value)
        end
      end
    end
  end
end
@@ -0,0 +1,126 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "active_record"
4
+
5
+ # == Schema Information
6
+ #
7
+ # Table name: ragdoll_contents
8
+ #
9
+ # id :bigint not null, primary key
10
+ # type :string not null
11
+ # document_id :bigint not null
12
+ # embedding_model :string not null
13
+ # content :text
14
+ # data :text
15
+ # metadata :json default({})
16
+ # duration :float
17
+ # sample_rate :integer
18
+ # created_at :datetime not null
19
+ # updated_at :datetime not null
20
+ #
21
+ # Indexes
22
+ #
23
+ # index_ragdoll_contents_on_embedding_model (embedding_model)
24
+ # index_ragdoll_contents_on_type (type)
25
+ # index_ragdoll_contents_on_fulltext_search (to_tsvector('english'::regconfig, COALESCE(content, ''::text))) USING gin
26
+ #
27
+ # Foreign Keys
28
+ #
29
+ # fk_rails_... (document_id => ragdoll_documents.id)
30
+ #
31
+
32
module Ragdoll
  module Core
    module Models
      # Base model for the +ragdoll_contents+ table.
      #
      # Each row belongs to a Document and owns a set of Embedding records via
      # the polymorphic +embeddable+ association. Subclasses are expected to
      # override #content_for_embedding to choose the text that is embedded.
      class Content < ActiveRecord::Base
        self.table_name = "ragdoll_contents"

        belongs_to :document,
                   class_name: "Ragdoll::Core::Models::Document",
                   foreign_key: "document_id"

        has_many :embeddings,
                 class_name: "Ragdoll::Core::Models::Embedding",
                 as: :embeddable,
                 dependent: :destroy

        validates :type, presence: true
        validates :embedding_model, presence: true
        validates :document_id, presence: true

        # JSON columns are handled natively by PostgreSQL

        scope :by_type, ->(content_type) { where(type: content_type) }
        # Contents that have at least one embedding. Implemented as an
        # `id IN (subquery)` rather than `joins(...).distinct`: per the schema
        # annotation +metadata+ is a PostgreSQL +json+ column, which has no
        # equality operator, so SELECT DISTINCT over the full row fails.
        scope :with_embeddings, -> { where(id: joins(:embeddings).select(:id)) }
        scope :without_embeddings, -> { left_joins(:embeddings).where(embeddings: { id: nil }) }

        # Generate embeddings for this content.
        #
        # Does nothing unless #should_generate_embeddings? is true and
        # #content_for_embedding is non-blank. Otherwise the text is split with
        # TextChunker, one Embedding is created per chunk, and the metadata
        # key "embeddings_generated_at" is set to the current time.
        #
        # A failure on one chunk is reported and skipped so the remaining
        # chunks are still processed (best effort).
        #
        # @raise [ActiveRecord::RecordInvalid] if the final metadata update fails validation
        def generate_embeddings!
          return unless should_generate_embeddings?

          embedding_content = content_for_embedding
          return if embedding_content.blank?

          # Clear existing embeddings
          embeddings.destroy_all

          # Use TextChunker to split content into chunks
          chunks = Ragdoll::Core::TextChunker.chunk(embedding_content)

          # Generate embeddings for each chunk
          embedding_service = Ragdoll::Core::EmbeddingService.new

          chunks.each_with_index do |chunk_text, index|
            begin
              vector = embedding_service.generate_embedding(chunk_text)

              embeddings.create!(
                content: chunk_text,
                embedding_vector: vector,
                chunk_index: index
              )
            rescue StandardError => e
              # Diagnostics go to stderr so a library call does not pollute stdout.
              warn "Failed to generate embedding for chunk #{index}: #{e.message}"
            end
          end

          # `metadata || {}` keeps this working when the column is NULL.
          update!(metadata: (metadata || {}).merge("embeddings_generated_at" => Time.current))
        end

        # Content to use for embedding generation (overridden by subclasses).
        #
        # @return [String, nil]
        def content_for_embedding
          content
        end

        # Whether this content should generate embeddings: there must be text
        # to embed and no embeddings may exist yet.
        #
        # @return [Boolean]
        def should_generate_embeddings?
          content_for_embedding.present? && embeddings.empty?
        end

        # Number of whitespace-separated words in +content+ (0 when blank).
        #
        # Uses argument-less String#split, which ignores leading whitespace.
        # `split(/\s+/)` produced a leading empty string for content such as
        # " hello world" and over-counted by one.
        #
        # @return [Integer]
        def word_count
          return 0 unless content.present?

          content.split.length
        end

        # @return [Integer] length of +content+, or 0 when it is nil
        def character_count
          content&.length || 0
        end

        # @return [Integer] number of embeddings associated with this content
        def embedding_count
          embeddings.count
        end

        # Full-text search over +content+ using PostgreSQL's English text
        # search configuration.
        #
        # @param query [String] search phrase; a blank query yields an empty relation
        # @param options [Hash] +:limit+ caps the number of rows (default 20)
        # @return [ActiveRecord::Relation]
        def self.search_content(query, **options)
          return none if query.blank?

          where(
            "to_tsvector('english', COALESCE(content, '')) @@ plainto_tsquery('english', ?)",
            query
          ).limit(options[:limit] || 20)
        end
      end
    end
  end
end