ragdoll 0.1.11 → 0.1.12
This diff shows the content that changed between publicly released versions of the package, as published to one of the supported registries, and is provided for informational purposes only.
- checksums.yaml +4 -4
- data/README.md +323 -384
- data/app/models/ragdoll/document.rb +1 -1
- data/app/models/ragdoll/unified_content.rb +216 -0
- data/app/models/ragdoll/unified_document.rb +338 -0
- data/app/services/ragdoll/audio_to_text_service.rb +200 -0
- data/app/services/ragdoll/document_converter.rb +216 -0
- data/app/services/ragdoll/document_processor.rb +197 -331
- data/app/services/ragdoll/image_to_text_service.rb +322 -0
- data/app/services/ragdoll/migration_service.rb +340 -0
- data/app/services/ragdoll/text_extraction_service.rb +422 -0
- data/app/services/ragdoll/unified_document_management.rb +300 -0
- data/db/migrate/20250923000001_create_ragdoll_unified_contents.rb +87 -0
- data/lib/ragdoll/core/version.rb +1 -1
- data/lib/ragdoll/core.rb +7 -0
- metadata +11 -2
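Two of the new service files, AudioToTextService and DocumentConverter, are reproduced in full below. For orientation, the following sketch shows how their class-level entry points are intended to be called; it is based only on the signatures visible in this diff, the file paths are hypothetical, and it assumes the gem's top-level require exposes these classes.

  require "ragdoll"  # assumption: the usual gem require loads the Ragdoll namespace

  # DocumentConverter picks a handler from the file extension and returns plain text.
  Ragdoll::DocumentConverter.convert_to_text("notes/meeting.md")   # text formats -> TextExtractionService
  Ragdoll::DocumentConverter.convert_to_text("media/diagram.png")  # images       -> ImageToTextService
  Ragdoll::DocumentConverter.convert_to_text("media/call.mp3")     # audio        -> AudioToTextService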
data/app/services/ragdoll/audio_to_text_service.rb (new file)
@@ -0,0 +1,200 @@
+# frozen_string_literal: true
+
+require "ruby_llm"
+
+module Ragdoll
+  class AudioToTextService
+    class TranscriptionError < StandardError; end
+
+    DEFAULT_OPTIONS = {
+      model: "whisper-1",
+      provider: :openai,
+      temperature: 0.0,
+      language: nil # Auto-detect
+    }.freeze
+
+    def self.transcribe(file_path, **options)
+      new(**options).transcribe(file_path)
+    end
+
+    def initialize(**options)
+      @options = DEFAULT_OPTIONS.merge(options)
+      configure_transcription_service
+    end
+
+    def transcribe(file_path)
+      return "" unless File.exist?(file_path)
+      return "" unless audio_file?(file_path)
+
+      begin
+        # Use RubyLLM for transcription
+        # Note: This is a placeholder implementation
+        # Real implementation would depend on the transcription service available
+
+        if transcription_available?
+          perform_transcription(file_path)
+        else
+          generate_fallback_transcript(file_path)
+        end
+      rescue StandardError => e
+        puts "Warning: Audio transcription failed (#{e.message}), using fallback"
+        generate_fallback_transcript(file_path)
+      end
+    end
+
+    def supported_formats
+      %w[.mp3 .wav .m4a .flac .ogg .aac .wma .mp4 .mov .avi .webm]
+    end
+
+    private
+
+    def configure_transcription_service
+      # Configure transcription service based on provider
+      case @options[:provider]
+      when :openai
+        configure_openai_transcription
+      when :azure
+        configure_azure_transcription
+      when :google
+        configure_google_transcription
+      when :whisper_local
+        configure_local_whisper
+      else
+        puts "Warning: Unsupported transcription provider: #{@options[:provider]}"
+      end
+    end
+
+    def configure_openai_transcription
+      # OpenAI Whisper API configuration
+      @api_key = ENV["OPENAI_API_KEY"]
+      @endpoint = "https://api.openai.com/v1/audio/transcriptions"
+    end
+
+    def configure_azure_transcription
+      # Azure Speech Services configuration
+      @api_key = ENV["AZURE_SPEECH_KEY"]
+      @region = ENV["AZURE_SPEECH_REGION"]
+    end
+
+    def configure_google_transcription
+      # Google Cloud Speech-to-Text configuration
+      @api_key = ENV["GOOGLE_CLOUD_API_KEY"]
+      @project_id = ENV["GOOGLE_CLOUD_PROJECT_ID"]
+    end
+
+    def configure_local_whisper
+      # Local Whisper installation configuration
+      @whisper_command = `which whisper`.strip
+    end
+
+    def transcription_available?
+      case @options[:provider]
+      when :openai
+        !@api_key.nil? && !@api_key.empty?
+      when :azure
+        !@api_key.nil? && !@api_key.empty? && !@region.nil?
+      when :google
+        !@api_key.nil? && !@api_key.empty?
+      when :whisper_local
+        !@whisper_command.empty? && File.executable?(@whisper_command)
+      else
+        false
+      end
+    end
+
+    def perform_transcription(file_path)
+      case @options[:provider]
+      when :openai
+        transcribe_with_openai(file_path)
+      when :azure
+        transcribe_with_azure(file_path)
+      when :google
+        transcribe_with_google(file_path)
+      when :whisper_local
+        transcribe_with_local_whisper(file_path)
+      else
+        raise TranscriptionError, "Unsupported transcription provider"
+      end
+    end
+
+    def transcribe_with_openai(file_path)
+      # Placeholder for OpenAI Whisper API implementation
+      # This would use HTTP requests to OpenAI's API
+      # For now, return a placeholder
+      generate_fallback_transcript(file_path)
+    end
+
+    def transcribe_with_azure(file_path)
+      # Placeholder for Azure Speech Services implementation
+      generate_fallback_transcript(file_path)
+    end
+
+    def transcribe_with_google(file_path)
+      # Placeholder for Google Cloud Speech-to-Text implementation
+      generate_fallback_transcript(file_path)
+    end
+
+    def transcribe_with_local_whisper(file_path)
+      # Use local Whisper installation
+      output_file = "#{file_path}.txt"
+
+      begin
+        # Run whisper command
+        command = "#{@whisper_command} \"#{file_path}\" --output_format txt --output_dir \"#{File.dirname(file_path)}\""
+        command += " --language #{@options[:language]}" if @options[:language]
+        command += " --temperature #{@options[:temperature]}"
+
+        result = `#{command} 2>&1`
+
+        if $?.success? && File.exist?(output_file)
+          transcript = File.read(output_file)
+          File.delete(output_file) # Cleanup
+          transcript.strip
+        else
+          raise TranscriptionError, "Whisper command failed: #{result}"
+        end
+      rescue StandardError => e
+        raise TranscriptionError, "Local Whisper transcription failed: #{e.message}"
+      end
+    end
+
+    def audio_file?(file_path)
+      extension = File.extname(file_path).downcase
+      supported_formats.include?(extension)
+    end
+
+    def generate_fallback_transcript(file_path)
+      filename = File.basename(file_path)
+      duration = estimate_duration(file_path)
+
+      if duration
+        "[Audio file: #{filename} (#{format_duration(duration)})]"
+      else
+        "[Audio file: #{filename}]"
+      end
+    end
+
+    def estimate_duration(file_path)
+      # Try to get duration using file size estimation
+      # This is very rough and not accurate
+      begin
+        file_size = File.size(file_path)
+        # Rough estimation: 1MB per minute for compressed audio
+        estimated_minutes = file_size / (1024 * 1024)
+        estimated_minutes > 0 ? estimated_minutes : nil
+      rescue StandardError
+        nil
+      end
+    end
+
+    def format_duration(minutes)
+      if minutes < 60
+        "#{minutes.round}m"
+      else
+        hours = minutes / 60
+        remaining_minutes = minutes % 60
+        "#{hours.round}h #{remaining_minutes.round}m"
+      end
+    end
+  end
+end
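A minimal usage sketch for the service above: the provider symbols and option keys come from DEFAULT_OPTIONS and configure_transcription_service, while the file path and environment state are hypothetical.

  # The class-level helper builds an instance and transcribes in one call.
  transcript = Ragdoll::AudioToTextService.transcribe(
    "recordings/standup.wav",   # hypothetical path
    provider: :openai,          # also :azure, :google, :whisper_local
    language: "en"              # nil (the default) lets the provider auto-detect
  )

  # If the chosen provider is not configured (e.g. OPENAI_API_KEY is unset),
  # transcription_available? is false and the call degrades to a placeholder
  # such as "[Audio file: standup.wav (3m)]" instead of raising.

  Ragdoll::AudioToTextService.new.supported_formats.include?(".flac")  # => true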
data/app/services/ragdoll/document_converter.rb (new file)
@@ -0,0 +1,216 @@
+# frozen_string_literal: true
+
+module Ragdoll
+  class DocumentConverter
+    class ConversionError < StandardError; end
+
+    def self.convert_to_text(file_path, document_type = nil, **options)
+      new(**options).convert_to_text(file_path, document_type)
+    end
+
+    def initialize(**options)
+      @options = options
+    end
+
+    def convert_to_text(file_path, document_type = nil)
+      return "" unless File.exist?(file_path)
+
+      document_type ||= determine_document_type(file_path)
+
+      begin
+        case document_type
+        when "text", "markdown", "html", "pdf", "docx", "csv", "json", "xml", "yaml"
+          convert_text_based_document(file_path, document_type)
+        when "image"
+          convert_image_to_text(file_path)
+        when "audio"
+          convert_audio_to_text(file_path)
+        when "video"
+          convert_video_to_text(file_path)
+        else
+          convert_unknown_document(file_path)
+        end
+      rescue StandardError => e
+        puts "Warning: Document conversion failed for #{file_path}: #{e.message}"
+        generate_fallback_text(file_path, document_type)
+      end
+    end
+
+    def determine_document_type(file_path)
+      extension = File.extname(file_path).downcase
+
+      case extension
+      when ".pdf" then "pdf"
+      when ".docx" then "docx"
+      when ".txt" then "text"
+      when ".md", ".markdown" then "markdown"
+      when ".html", ".htm" then "html"
+      when ".jpg", ".jpeg", ".png", ".gif", ".bmp", ".webp", ".svg", ".ico", ".tiff", ".tif"
+        "image"
+      when ".mp3", ".wav", ".m4a", ".flac", ".ogg", ".aac", ".wma"
+        "audio"
+      when ".mp4", ".mov", ".avi", ".webm", ".mkv"
+        "video"
+      when ".csv" then "csv"
+      when ".json" then "json"
+      when ".xml" then "xml"
+      when ".yml", ".yaml" then "yaml"
+      else
+        "text" # Default to text for unknown extensions
+      end
+    end
+
+    def supported_formats
+      {
+        text: %w[.txt .md .markdown .html .htm .csv .json .xml .yml .yaml],
+        documents: %w[.pdf .docx],
+        images: %w[.jpg .jpeg .png .gif .bmp .webp .svg .ico .tiff .tif],
+        audio: %w[.mp3 .wav .m4a .flac .ogg .aac .wma],
+        video: %w[.mp4 .mov .avi .webm .mkv]
+      }
+    end
+
+    private
+
+    def convert_text_based_document(file_path, document_type)
+      service = Ragdoll::TextExtractionService.new(file_path, document_type)
+      service.extract
+    end
+
+    def convert_image_to_text(file_path)
+      service = Ragdoll::ImageToTextService.new(@options)
+      service.convert(file_path)
+    end
+
+    def convert_audio_to_text(file_path)
+      service = Ragdoll::AudioToTextService.new(@options)
+      service.transcribe(file_path)
+    end
+
+    def convert_video_to_text(file_path)
+      # For video files, we'll extract audio and transcribe it
+      # This is a simplified approach - in production you might want to:
+      # 1. Extract keyframes as images and describe them
+      # 2. Extract audio track and transcribe it
+      # 3. Combine both approaches
+
+      begin
+        # Try to extract basic metadata
+        video_info = extract_video_metadata(file_path)
+        audio_text = attempt_video_audio_extraction(file_path)
+
+        if audio_text && !audio_text.empty?
+          if video_info
+            "Video content: #{video_info}\n\nAudio transcript: #{audio_text}"
+          else
+            "Video with audio transcript: #{audio_text}"
+          end
+        else
+          video_info || generate_fallback_text(file_path, "video")
+        end
+      rescue StandardError => e
+        puts "Warning: Video conversion failed: #{e.message}"
+        generate_fallback_text(file_path, "video")
+      end
+    end
+
+    def convert_unknown_document(file_path)
+      # Try to read as text first
+      begin
+        content = File.read(file_path, encoding: "UTF-8")
+        return content if looks_like_text?(content)
+      rescue Encoding::InvalidByteSequenceError, Encoding::UndefinedConversionError
+        # Try with different encoding
+        begin
+          content = File.read(file_path, encoding: "ISO-8859-1")
+                        .encode("UTF-8", invalid: :replace, undef: :replace, replace: "?")
+          return content if looks_like_text?(content)
+        rescue StandardError
+          # Fall through to binary handling
+        end
+      rescue StandardError
+        # Fall through to fallback
+      end
+
+      # If not readable as text, generate metadata-based description
+      generate_fallback_text(file_path, "unknown")
+    end
+
+    def extract_video_metadata(file_path)
+      # Basic video metadata extraction
+      # In production, you might use ffmpeg or similar tools
+      file_size = File.size(file_path)
+      filename = File.basename(file_path, File.extname(file_path))
+
+      # Extract meaningful parts from filename
+      descriptive_parts = filename
+                          .gsub(/[-_]+/, ' ')
+                          .gsub(/([a-z])([A-Z])/, '\1 \2')
+                          .split(' ')
+                          .reject { |part| part.match?(/^\d+$/) }
+                          .map(&:capitalize)
+
+      if descriptive_parts.any?
+        "Video: #{descriptive_parts.join(' ')} (#{format_file_size(file_size)})"
+      else
+        "Video file: #{File.basename(file_path)} (#{format_file_size(file_size)})"
+      end
+    end
+
+    def attempt_video_audio_extraction(file_path)
+      # Placeholder for video audio extraction
+      # In production, you would:
+      # 1. Use ffmpeg to extract audio track
+      # 2. Save to temporary audio file
+      # 3. Transcribe the audio file
+      # 4. Clean up temporary file
+
+      # For now, return nil to indicate no audio extraction
+      nil
+    end
+
+    def looks_like_text?(content)
+      # Simple heuristic to determine if content is text
+      return false if content.empty?
+
+      # Check for reasonable ratio of printable characters
+      printable_chars = content.count(" -~")
+      total_chars = content.length
+
+      printable_ratio = printable_chars.to_f / total_chars
+      printable_ratio > 0.8 && total_chars > 0
+    end
+
+    def generate_fallback_text(file_path, document_type)
+      filename = File.basename(file_path)
+      file_size = File.size(file_path)
+
+      case document_type
+      when "image"
+        "Image file: #{filename} (#{format_file_size(file_size)})"
+      when "audio"
+        "Audio file: #{filename} (#{format_file_size(file_size)})"
+      when "video"
+        "Video file: #{filename} (#{format_file_size(file_size)})"
+      else
+        "Document: #{filename} (#{format_file_size(file_size)})"
+      end
+    end
+
+    def format_file_size(size)
+      units = %w[B KB MB GB TB]
+      unit_index = 0
+
+      while size >= 1024 && unit_index < units.length - 1
+        size /= 1024.0
+        unit_index += 1
+      end
+
+      if unit_index == 0
+        "#{size} #{units[unit_index]}"
+      else
+        "#{size.round(1)} #{units[unit_index]}"
+      end
+    end
+  end
+end
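A companion sketch for DocumentConverter's public surface, again with hypothetical file paths; the type strings match determine_document_type above.

  converter = Ragdoll::DocumentConverter.new

  converter.determine_document_type("paper.pdf")   # => "pdf"
  converter.determine_document_type("clip.webm")   # => "video"
  converter.supported_formats[:audio]              # => [".mp3", ".wav", ".m4a", ...]

  # If a type-specific handler raises, the rescue in convert_to_text logs a
  # warning and falls back to a metadata description such as
  # "Document: data.bin (4.2 KB)" rather than propagating the error.
  converter.convert_to_text("data.bin")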