ragdoll 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +353 -0
- data/Rakefile +21 -0
- data/db/migrate/001_enable_postgresql_extensions.rb +23 -0
- data/db/migrate/004_create_ragdoll_documents.rb +70 -0
- data/db/migrate/005_create_ragdoll_embeddings.rb +41 -0
- data/db/migrate/006_create_ragdoll_contents.rb +47 -0
- data/lib/ragdoll/core/client.rb +315 -0
- data/lib/ragdoll/core/configuration.rb +273 -0
- data/lib/ragdoll/core/database.rb +141 -0
- data/lib/ragdoll/core/document_management.rb +110 -0
- data/lib/ragdoll/core/document_processor.rb +344 -0
- data/lib/ragdoll/core/embedding_service.rb +183 -0
- data/lib/ragdoll/core/errors.rb +11 -0
- data/lib/ragdoll/core/jobs/extract_keywords.rb +32 -0
- data/lib/ragdoll/core/jobs/extract_text.rb +42 -0
- data/lib/ragdoll/core/jobs/generate_embeddings.rb +32 -0
- data/lib/ragdoll/core/jobs/generate_summary.rb +29 -0
- data/lib/ragdoll/core/metadata_schemas.rb +334 -0
- data/lib/ragdoll/core/models/audio_content.rb +175 -0
- data/lib/ragdoll/core/models/content.rb +126 -0
- data/lib/ragdoll/core/models/document.rb +678 -0
- data/lib/ragdoll/core/models/embedding.rb +204 -0
- data/lib/ragdoll/core/models/image_content.rb +227 -0
- data/lib/ragdoll/core/models/text_content.rb +169 -0
- data/lib/ragdoll/core/search_engine.rb +50 -0
- data/lib/ragdoll/core/services/image_description_service.rb +230 -0
- data/lib/ragdoll/core/services/metadata_generator.rb +335 -0
- data/lib/ragdoll/core/shrine_config.rb +71 -0
- data/lib/ragdoll/core/text_chunker.rb +210 -0
- data/lib/ragdoll/core/text_generation_service.rb +360 -0
- data/lib/ragdoll/core/version.rb +8 -0
- data/lib/ragdoll/core.rb +73 -0
- data/lib/ragdoll-core.rb +3 -0
- data/lib/ragdoll.rb +249 -0
- data/lib/tasks/annotate.rake +126 -0
- data/lib/tasks/db.rake +338 -0
- metadata +80 -0
@@ -0,0 +1,175 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "active_record"
|
4
|
+
require_relative "content"
|
5
|
+
|
6
|
+
# == Schema Information
|
7
|
+
#
|
8
|
+
# Table name: ragdoll_contents (STI)
|
9
|
+
#
|
10
|
+
# id :bigint not null, primary key
|
11
|
+
# type(Type of content - TextContent, ImageContent, AudioContent) :string not null
|
12
|
+
# document_id(Reference to parent document) :bigint not null
|
13
|
+
# embedding_model(Embedding model to use for this content) :string not null
|
14
|
+
# content(Text content or description of the file) :text
|
15
|
+
# data(Raw data from file) :text
|
16
|
+
# metadata(Additional metadata about the file's raw data) :json default({})
|
17
|
+
# duration(Duration of audio in seconds - for audio content) :float
|
18
|
+
# sample_rate(Audio sample rate in Hz - for audio content) :integer
|
19
|
+
# created_at(Standard creation and update timestamps) :datetime not null
|
20
|
+
# updated_at(Standard creation and update timestamps) :datetime not null
|
21
|
+
#
|
22
|
+
# Indexes
|
23
|
+
#
|
24
|
+
# index_ragdoll_contents_on_document_id (document_id)
|
25
|
+
# index_ragdoll_contents_on_embedding_model (embedding_model)
|
26
|
+
# index_ragdoll_contents_on_type (type)
|
27
|
+
# index_ragdoll_contents_on_fulltext_search (to_tsvector('english'::regconfig, COALESCE(content, ''::text))) USING gin
|
28
|
+
#
|
29
|
+
# Foreign Keys
|
30
|
+
#
|
31
|
+
# fk_rails_... (document_id => ragdoll_documents.id)
|
32
|
+
#
|
33
|
+
|
34
|
+
module Ragdoll
|
35
|
+
module Core
|
36
|
+
module Models
|
37
|
+
class AudioContent < Content
|
38
|
+
# A record must carry audio data or a transcript; duration and sample rate,
# when set, must be positive numbers.
validate :audio_data_or_transcript_present
validates :duration, numericality: { greater_than: 0 }, allow_nil: true
validates :sample_rate, numericality: { greater_than: 0 }, allow_nil: true

# Newest records first.
scope :recent, -> { order(created_at: :desc) }
# Records whose +data+ column is neither NULL nor the empty string.
scope :with_audio, -> { where.not(data: [nil, ""]) }
# Records whose +content+ (transcript) is neither NULL nor the empty string.
scope :with_transcripts, -> { where.not(content: [nil, ""]) }
# Records lasting at least +min_duration+ and, when +max_duration+ is given,
# at most +max_duration+.
scope :by_duration, ->(min_duration, max_duration = nil) {
  at_least = where("duration >= ?", min_duration)
  max_duration ? at_least.where("duration <= ?", max_duration) : at_least
}
|
50
|
+
|
51
|
+
|
52
|
+
# Audio content accessors - content field stores transcript for embedding
|
53
|
+
# Transcript of the audio. It is kept in the +content+ attribute, which is
# the text used for embedding.
#
# @return [String, nil]
def transcript
  content
end
|
56
|
+
|
57
|
+
# Writes the transcript into the +content+ attribute.
#
# @param value [String, nil] transcript text
def transcript=(value)
  self.content = value
end
|
60
|
+
|
61
|
+
# Audio file data accessor
|
62
|
+
# Raw audio payload, read from the +data+ attribute.
#
# @return [String, nil]
def audio_data
  data
end
|
65
|
+
|
66
|
+
# Writes the raw audio payload into the +data+ attribute.
#
# @param value [String, nil] audio payload
def audio_data=(value)
  self.data = value
end
|
69
|
+
|
70
|
+
# Audio file technical properties (stored in content metadata - raw file data)
|
71
|
+
# Whether this record carries any audio payload, i.e. +data+ is present.
#
# @return [Boolean]
def audio_attached?
  data.present?
end
|
74
|
+
|
75
|
+
# Size of the audio file as recorded under the "file_size" key of +metadata+.
# Falls back to 0 when the key is missing or +metadata+ itself is nil.
#
# @return [Object] the stored value, or 0
def audio_size
  (metadata || {})['file_size'] || 0
end
|
78
|
+
|
79
|
+
# Records the audio file size under the "file_size" key of +metadata+,
# keeping all other keys. A nil +metadata+ is treated as an empty hash.
#
# @param value [Object] size to store
def audio_size=(value)
  self.metadata = (metadata || {}).merge('file_size' => value)
end
|
82
|
+
|
83
|
+
# Value stored under the "content_type" key of +metadata+.
# Returns nil when the key is missing or +metadata+ itself is nil.
#
# @return [Object, nil]
def audio_content_type
  (metadata || {})['content_type']
end
|
86
|
+
|
87
|
+
# Records a value under the "content_type" key of +metadata+, keeping all
# other keys. A nil +metadata+ is treated as an empty hash.
#
# @param value [Object] content type to store
def audio_content_type=(value)
  self.metadata = (metadata || {}).merge('content_type' => value)
end
|
90
|
+
|
91
|
+
# Value stored under the "filename" key of +metadata+.
# Returns nil when the key is missing or +metadata+ itself is nil.
#
# @return [Object, nil]
def audio_filename
  (metadata || {})['filename']
end
|
94
|
+
|
95
|
+
# Records a value under the "filename" key of +metadata+, keeping all other
# keys. A nil +metadata+ is treated as an empty hash.
#
# @param value [Object] filename to store
def audio_filename=(value)
  self.metadata = (metadata || {}).merge('filename' => value)
end
|
98
|
+
|
99
|
+
# Audio format and technical details
|
100
|
+
# Value stored under the "codec" key of +metadata+.
# Returns nil when the key is missing or +metadata+ itself is nil.
#
# @return [Object, nil]
def codec
  (metadata || {})['codec']
end
|
103
|
+
|
104
|
+
# Records a value under the "codec" key of +metadata+, keeping all other
# keys. A nil +metadata+ is treated as an empty hash.
#
# @param value [Object] codec to store
def codec=(value)
  self.metadata = (metadata || {}).merge('codec' => value)
end
|
107
|
+
|
108
|
+
# Value stored under the "bitrate" key of +metadata+.
# Returns nil when the key is missing or +metadata+ itself is nil.
#
# @return [Object, nil]
def bitrate
  (metadata || {})['bitrate']
end
|
111
|
+
|
112
|
+
# Records a value under the "bitrate" key of +metadata+, keeping all other
# keys. A nil +metadata+ is treated as an empty hash.
#
# @param value [Object] bitrate to store
def bitrate=(value)
  self.metadata = (metadata || {}).merge('bitrate' => value)
end
|
115
|
+
|
116
|
+
# Value stored under the "channels" key of +metadata+.
# Returns nil when the key is missing or +metadata+ itself is nil.
#
# @return [Object, nil]
def channels
  (metadata || {})['channels']
end
|
119
|
+
|
120
|
+
# Records a value under the "channels" key of +metadata+, keeping all other
# keys. A nil +metadata+ is treated as an empty hash.
#
# @param value [Object] channel count to store
def channels=(value)
  self.metadata = (metadata || {}).merge('channels' => value)
end
|
123
|
+
|
124
|
+
# Human-readable duration as "M:SS", or "Unknown" when +duration+ is not set.
#
# The duration is rounded to whole seconds first and only then split into
# minutes and seconds. Rounding the seconds part on its own would let a
# value such as 59.6 render as "0:60" instead of "1:00".
#
# @return [String]
def duration_formatted
  return "Unknown" unless duration

  minutes, seconds = duration.round.divmod(60)
  format("%d:%02d", minutes, seconds)
end
|
131
|
+
|
132
|
+
# Override content for embedding to use transcript
|
133
|
+
# Text to embed for this audio: the transcript when one is present,
# otherwise a fixed placeholder string.
#
# @return [String]
def content_for_embedding
  spoken_text = transcript
  spoken_text.present? ? spoken_text : "Audio content without transcript"
end
|
136
|
+
|
137
|
+
# Generates embeddings for this audio content.
#
# Returns early unless +should_generate_embeddings?+ holds and
# +content_for_embedding+ is non-blank, then hands over to the parent
# class implementation.
def generate_embeddings!
  return unless should_generate_embeddings?
  return if content_for_embedding.blank?

  super
end
|
146
|
+
|
147
|
+
# Override should_generate_embeddings to check for transcript
|
148
|
+
# Whether embeddings should be generated: there must be text to embed and
# no embeddings may exist yet.
#
# @return [Boolean]
def should_generate_embeddings?
  return false unless content_for_embedding.present?

  embeddings.empty?
end
|
151
|
+
|
152
|
+
# Aggregate statistics over the audio contents in the current scope.
# Each entry is computed by its own query.
#
# +average_audio_size+ averages the "file_size" value kept in the +metadata+
# JSON column, which is the value +audio_size+ reads. Rows without that key
# yield NULL and are ignored by AVG.
# NOTE(review): assumes "file_size" is always stored as a number; confirm
# against whatever writes the metadata.
#
# @return [Hash{Symbol => Object}]
def self.stats
  {
    total_audio_contents: count,
    by_model: group(:embedding_model).count,
    total_embeddings: joins(:embeddings).count,
    with_audio: with_audio.count,
    with_transcripts: with_transcripts.count,
    total_duration: sum(:duration),
    average_duration: average(:duration),
    # Neither this class nor its parent declares an +audio_attachment+
    # association, so joining on it cannot work; use the recorded size.
    average_audio_size: average(Arel.sql("(metadata->>'file_size')::numeric"))
  }
end
|
164
|
+
|
165
|
+
private
|
166
|
+
|
167
|
+
# Validation: adds a base error unless the record has audio data or a
# transcript.
def audio_data_or_transcript_present
  has_payload = audio_attached? || transcript.present?
  errors.add(:base, "Must have either audio data or transcript") unless has_payload
end
|
172
|
+
end
|
173
|
+
end
|
174
|
+
end
|
175
|
+
end
|
@@ -0,0 +1,126 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "active_record"
|
4
|
+
|
5
|
+
# == Schema Information
|
6
|
+
#
|
7
|
+
# Table name: ragdoll_contents
|
8
|
+
#
|
9
|
+
# id :bigint not null, primary key
|
10
|
+
# type :string not null
|
11
|
+
# document_id :bigint not null
|
12
|
+
# embedding_model :string not null
|
13
|
+
# content :text
|
14
|
+
# data :text
|
15
|
+
# metadata :json default({})
|
16
|
+
# duration :float
|
17
|
+
# sample_rate :integer
|
18
|
+
# created_at :datetime not null
|
19
|
+
# updated_at :datetime not null
|
20
|
+
#
|
21
|
+
# Indexes
|
22
|
+
#
|
23
|
+
# index_ragdoll_contents_on_embedding_model (embedding_model)
|
24
|
+
# index_ragdoll_contents_on_type (type)
|
25
|
+
# index_ragdoll_contents_on_fulltext_search (to_tsvector('english'::regconfig, COALESCE(content, ''::text))) USING gin
|
26
|
+
#
|
27
|
+
# Foreign Keys
|
28
|
+
#
|
29
|
+
# fk_rails_... (document_id => ragdoll_documents.id)
|
30
|
+
#
|
31
|
+
|
32
|
+
module Ragdoll
|
33
|
+
module Core
|
34
|
+
module Models
|
35
|
+
class Content < ActiveRecord::Base
|
36
|
+
self.table_name = "ragdoll_contents"

# Document this content row belongs to.
belongs_to :document, class_name: "Ragdoll::Core::Models::Document", foreign_key: "document_id"

# Embeddings attached polymorphically (as +embeddable+); destroyed together
# with this record.
has_many :embeddings, class_name: "Ragdoll::Core::Models::Embedding", as: :embeddable, dependent: :destroy

validates :type, presence: true
validates :embedding_model, presence: true
validates :document_id, presence: true

# JSON columns are handled natively by PostgreSQL

# Contents of the given STI +type+.
scope :by_type, ->(content_type) { where(type: content_type) }
# Contents having at least one embedding (deduplicated).
scope :with_embeddings, -> { joins(:embeddings).distinct }
# Contents having no embedding at all.
scope :without_embeddings, -> { left_joins(:embeddings).where(embeddings: { id: nil }) }
|
56
|
+
|
57
|
+
# Generate embeddings for this content
|
58
|
+
# Generates and stores one embedding per text chunk of this content.
#
# Does nothing unless +should_generate_embeddings?+ holds and
# +content_for_embedding+ is non-blank. The text is split with TextChunker
# and each chunk is embedded through EmbeddingService. A chunk that fails is
# reported on stderr and skipped, so the remaining chunks are still
# processed. Afterwards "embeddings_generated_at" is stamped into +metadata+.
# NOTE(review): the timestamp is written even when every chunk failed.
def generate_embeddings!
  return unless should_generate_embeddings?

  embedding_content = content_for_embedding
  return if embedding_content.blank?

  # Clear existing embeddings
  embeddings.destroy_all

  chunks = Ragdoll::Core::TextChunker.chunk(embedding_content)
  embedding_service = Ragdoll::Core::EmbeddingService.new

  chunks.each_with_index do |chunk_text, index|
    begin
      vector = embedding_service.generate_embedding(chunk_text)
      embeddings.create!(content: chunk_text, embedding_vector: vector, chunk_index: index)
    rescue StandardError => e
      # Best effort: report on stderr (not stdout) and carry on with the next chunk.
      warn "Failed to generate embedding for chunk #{index}: #{e.message}"
    end
  end

  # A nil metadata column is treated as an empty hash.
  update!(metadata: (metadata || {}).merge("embeddings_generated_at" => Time.current))
end
|
89
|
+
|
90
|
+
# Content to use for embedding generation (overridden by subclasses)
|
91
|
+
# Text that +generate_embeddings!+ chunks and embeds. The base
# implementation returns the +content+ attribute; subclasses may override.
#
# @return [String, nil]
def content_for_embedding
  content
end
|
94
|
+
|
95
|
+
# Whether this content should generate embeddings
|
96
|
+
# Whether embeddings should be generated: there must be text to embed and
# no embeddings may exist yet.
#
# @return [Boolean]
def should_generate_embeddings?
  return false unless content_for_embedding.present?

  embeddings.empty?
end
|
99
|
+
|
100
|
+
# Statistics
|
101
|
+
# Number of whitespace-separated words in +content+; 0 for nil or blank
# content.
#
# Uses argument-less String#split, which ignores leading and trailing
# whitespace. Splitting on /\s+/ yields an empty first element when the
# text starts with whitespace, which inflates the count by one.
#
# @return [Integer]
def word_count
  content.to_s.split.length
end
|
105
|
+
|
106
|
+
# Length of +content+ in characters; 0 when +content+ is nil.
#
# @return [Integer]
def character_count
  text = content
  text.nil? ? 0 : text.length
end
|
109
|
+
|
110
|
+
# Number of embeddings associated with this content, as reported by
# +embeddings.count+.
#
# @return [Integer]
def embedding_count
  embeddings.count
end
|
113
|
+
|
114
|
+
# Search within this content type
|
115
|
+
# Full-text search over +content+ using PostgreSQL's English text-search
# configuration. The query is passed as a bind parameter.
#
# @param query [String] search terms; a blank query yields an empty relation
# @param options [Hash] +:limit+ caps the number of rows (default 20)
# @return [ActiveRecord::Relation]
def self.search_content(query, **options)
  return none if query.blank?

  max_rows = options[:limit] || 20
  matches = where(
    "to_tsvector('english', COALESCE(content, '')) @@ plainto_tsquery('english', ?)",
    query
  )
  matches.limit(max_rows)
end
|
123
|
+
end
|
124
|
+
end
|
125
|
+
end
|
126
|
+
end
|