ragdoll 0.1.11 → 0.1.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,200 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "ruby_llm"
4
+
5
+ module Ragdoll
6
# Turns an audio file into text using a configurable transcription provider.
#
# Only the +:whisper_local+ provider performs a real transcription here (by
# running a +whisper+ executable found on PATH). The +:openai+, +:azure+ and
# +:google+ providers are placeholders that return the same descriptive
# fallback string that is used when no provider is available.
class AudioToTextService
  # Raised when the provider is unsupported or a transcription backend fails.
  class TranscriptionError < StandardError; end

  # Defaults merged with the caller's options in #initialize.
  DEFAULT_OPTIONS = {
    model: "whisper-1",
    provider: :openai,
    temperature: 0.0,
    language: nil # Auto-detect
  }.freeze

  # Convenience wrapper: builds a service with +options+ and transcribes once.
  #
  # @param file_path [String] path to the audio file
  # @param options [Hash] overrides for DEFAULT_OPTIONS
  # @return [String] see #transcribe
  def self.transcribe(file_path, **options)
    new(**options).transcribe(file_path)
  end

  # @param options [Hash] overrides for DEFAULT_OPTIONS
  #   (+:model+, +:provider+, +:temperature+, +:language+)
  def initialize(**options)
    @options = DEFAULT_OPTIONS.merge(options)
    configure_transcription_service
  end

  # Transcribes +file_path+.
  #
  # @param file_path [String, nil] path to the audio file
  # @return [String] "" when the path is nil, missing, or does not have a
  #   supported extension; otherwise the transcript, or a bracketed
  #   "[Audio file: ...]" description when transcription is unavailable or fails
  def transcribe(file_path)
    return "" if file_path.nil?
    return "" unless File.exist?(file_path)
    return "" unless audio_file?(file_path)

    begin
      if transcription_available?
        perform_transcription(file_path)
      else
        generate_fallback_transcript(file_path)
      end
    rescue StandardError => e
      # Best effort: a backend failure degrades to the fallback description.
      # Reported on stderr so it cannot mix with data written to stdout.
      warn "Warning: Audio transcription failed (#{e.message}), using fallback"
      generate_fallback_transcript(file_path)
    end
  end

  # @return [Array<String>] lowercase file extensions (with leading dot) that
  #   #transcribe accepts
  def supported_formats
    %w[.mp3 .wav .m4a .flac .ogg .aac .wma .mp4 .mov .avi .webm]
  end

  private

  # Loads provider-specific settings into instance variables.
  def configure_transcription_service
    case @options[:provider]
    when :openai then configure_openai_transcription
    when :azure then configure_azure_transcription
    when :google then configure_google_transcription
    when :whisper_local then configure_local_whisper
    else
      warn "Warning: Unsupported transcription provider: #{@options[:provider]}"
    end
  end

  # OpenAI Whisper API configuration.
  def configure_openai_transcription
    @api_key = ENV["OPENAI_API_KEY"]
    @endpoint = "https://api.openai.com/v1/audio/transcriptions"
  end

  # Azure Speech Services configuration.
  def configure_azure_transcription
    @api_key = ENV["AZURE_SPEECH_KEY"]
    @region = ENV["AZURE_SPEECH_REGION"]
  end

  # Google Cloud Speech-to-Text configuration.
  def configure_google_transcription
    @api_key = ENV["GOOGLE_CLOUD_API_KEY"]
    @project_id = ENV["GOOGLE_CLOUD_PROJECT_ID"]
  end

  # Local Whisper installation: remembers the executable's path, or "" when
  # none is found.
  def configure_local_whisper
    @whisper_command = locate_executable("whisper")
  end

  # Searches PATH for an executable file called +name+ without spawning a
  # shell, so it also works where the `which` utility is not installed.
  #
  # @return [String] full path of the first match, or "" when there is none
  def locate_executable(name)
    ENV.fetch("PATH", "").split(File::PATH_SEPARATOR).each do |dir|
      next if dir.empty?

      candidate = File.join(dir, name)
      return candidate if File.file?(candidate) && File.executable?(candidate)
    end
    ""
  end

  # @return [Boolean] whether the configured provider has what it needs to run
  def transcription_available?
    case @options[:provider]
    when :openai, :google
      credential_set?(@api_key)
    when :azure
      credential_set?(@api_key) && credential_set?(@region)
    when :whisper_local
      credential_set?(@whisper_command) && File.executable?(@whisper_command)
    else
      false
    end
  end

  # @return [Boolean] true when +value+ is neither nil nor empty
  def credential_set?(value)
    !value.nil? && !value.empty?
  end

  # Dispatches to the backend for the configured provider.
  #
  # @raise [TranscriptionError] for an unsupported provider
  def perform_transcription(file_path)
    case @options[:provider]
    when :openai then transcribe_with_openai(file_path)
    when :azure then transcribe_with_azure(file_path)
    when :google then transcribe_with_google(file_path)
    when :whisper_local then transcribe_with_local_whisper(file_path)
    else
      raise TranscriptionError, "Unsupported transcription provider"
    end
  end

  # Placeholder for the OpenAI Whisper API; currently returns the fallback.
  def transcribe_with_openai(file_path)
    generate_fallback_transcript(file_path)
  end

  # Placeholder for Azure Speech Services; currently returns the fallback.
  def transcribe_with_azure(file_path)
    generate_fallback_transcript(file_path)
  end

  # Placeholder for Google Cloud Speech-to-Text; currently returns the fallback.
  def transcribe_with_google(file_path)
    generate_fallback_transcript(file_path)
  end

  # Runs the local whisper executable and returns the transcript it writes.
  #
  # The command is passed as an argument vector, so no shell is involved and
  # file names containing quotes, spaces or `$(...)` are passed through
  # verbatim instead of being interpreted.
  #
  # @return [String] the transcript with surrounding whitespace removed
  # @raise [TranscriptionError] when the command fails or writes no transcript
  def transcribe_with_local_whisper(file_path)
    argv = [
      @whisper_command, file_path,
      "--output_format", "txt",
      "--output_dir", File.dirname(file_path)
    ]
    argv.push("--language", @options[:language].to_s) if @options[:language]
    argv.push("--temperature", @options[:temperature].to_s)

    # stderr is folded into stdout so a failure message carries the diagnostics.
    output = IO.popen(argv, err: %i[child out], &:read)
    succeeded = Process.last_status.success?
    transcript_file = whisper_output_candidates(file_path).find { |path| File.exist?(path) }

    unless succeeded && transcript_file
      raise TranscriptionError, "Whisper command failed: #{output}"
    end

    transcript = File.read(transcript_file)
    File.delete(transcript_file) # Cleanup
    transcript.strip
  rescue TranscriptionError
    raise
  rescue StandardError => e
    raise TranscriptionError, "Local Whisper transcription failed: #{e.message}"
  end

  # Paths where the whisper CLI may have written its transcript.
  # NOTE(review): recent CLI releases appear to name the output after the input
  # with its extension removed, while older ones appended ".txt" to the full
  # file name; both are checked - confirm against the installed version.
  def whisper_output_candidates(file_path)
    [
      "#{file_path}.txt",
      File.join(File.dirname(file_path), "#{File.basename(file_path, '.*')}.txt")
    ]
  end

  # @return [Boolean] whether the file extension is one of #supported_formats
  def audio_file?(file_path)
    supported_formats.include?(File.extname(file_path).downcase)
  end

  # Describes the file when no transcript can be produced.
  #
  # @return [String] "[Audio file: NAME]" or "[Audio file: NAME (DURATION)]"
  def generate_fallback_transcript(file_path)
    filename = File.basename(file_path)
    duration = estimate_duration(file_path)

    if duration
      "[Audio file: #{filename} (#{format_duration(duration)})]"
    else
      "[Audio file: #{filename}]"
    end
  end

  # Very rough duration guess from the file size, assuming about 1 MB per
  # minute of compressed audio.
  #
  # @return [Integer, nil] whole minutes, or nil for files under 1 MB or when
  #   the size cannot be read
  def estimate_duration(file_path)
    estimated_minutes = File.size(file_path) / (1024 * 1024)
    estimated_minutes.positive? ? estimated_minutes : nil
  rescue StandardError
    nil
  end

  # @param minutes [Numeric] duration in minutes
  # @return [String] "Nm" below one hour, otherwise "Hh Mm"
  def format_duration(minutes)
    total = minutes.round
    return "#{total}m" if total < 60

    hours, remainder = total.divmod(60)
    "#{hours}h #{remainder}m"
  end
end
200
+ end
@@ -0,0 +1,216 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Ragdoll
4
# Converts a file of (almost) any kind into plain text by dispatching on its
# document type: text-like formats go to TextExtractionService, images to
# ImageToTextService, audio to AudioToTextService, and video to a
# metadata-based description. Any failure degrades to a short
# "<Kind> file: <name> (<size>)" description instead of raising.
class DocumentConverter
  # Error class reserved for conversion failures.
  class ConversionError < StandardError; end

  # Extension lists shared by #determine_document_type and #supported_formats.
  TEXT_EXTENSIONS = %w[.txt .md .markdown .html .htm .csv .json .xml .yml .yaml].freeze
  DOCUMENT_EXTENSIONS = %w[.pdf .docx].freeze
  IMAGE_EXTENSIONS = %w[.jpg .jpeg .png .gif .bmp .webp .svg .ico .tiff .tif].freeze
  AUDIO_EXTENSIONS = %w[.mp3 .wav .m4a .flac .ogg .aac .wma].freeze
  VIDEO_EXTENSIONS = %w[.mp4 .mov .avi .webm .mkv].freeze

  # Document types handled by TextExtractionService.
  TEXT_BASED_TYPES = %w[text markdown html pdf docx csv json xml yaml].freeze

  # Units used by #format_file_size, smallest first.
  SIZE_UNITS = %w[B KB MB GB TB].freeze

  # Convenience wrapper: builds a converter with +options+ and converts once.
  #
  # @param file_path [String] path of the file to convert
  # @param document_type [String, nil] explicit type; detected from the
  #   extension when nil
  # @param options [Hash] forwarded to the image/audio services
  # @return [String] see #convert_to_text
  def self.convert_to_text(file_path, document_type = nil, **options)
    new(**options).convert_to_text(file_path, document_type)
  end

  # @param options [Hash] stored and forwarded to the image/audio services
  def initialize(**options)
    @options = options
  end

  # Converts +file_path+ to text.
  #
  # @param file_path [String] path of the file to convert
  # @param document_type [String, nil] one of the strings returned by
  #   #determine_document_type; any other value is treated as unknown content
  # @return [String] "" when the file does not exist; otherwise the extracted
  #   text, or a fallback description when conversion raises
  def convert_to_text(file_path, document_type = nil)
    return "" unless File.exist?(file_path)

    document_type ||= determine_document_type(file_path)

    begin
      case document_type
      when *TEXT_BASED_TYPES then convert_text_based_document(file_path, document_type)
      when "image" then convert_image_to_text(file_path)
      when "audio" then convert_audio_to_text(file_path)
      when "video" then convert_video_to_text(file_path)
      else convert_unknown_document(file_path)
      end
    rescue StandardError => e
      # Best effort: report on stderr and degrade to a description.
      warn "Warning: Document conversion failed for #{file_path}: #{e.message}"
      generate_fallback_text(file_path, document_type)
    end
  end

  # Maps a file extension (case-insensitive) to a document type string.
  #
  # @param file_path [String]
  # @return [String] "pdf", "docx", "text", "markdown", "html", "image",
  #   "audio", "video", "csv", "json", "xml" or "yaml"; unknown extensions
  #   default to "text"
  def determine_document_type(file_path)
    case File.extname(file_path).downcase
    when ".pdf" then "pdf"
    when ".docx" then "docx"
    when ".txt" then "text"
    when ".md", ".markdown" then "markdown"
    when ".html", ".htm" then "html"
    when *IMAGE_EXTENSIONS then "image"
    when *AUDIO_EXTENSIONS then "audio"
    when *VIDEO_EXTENSIONS then "video"
    when ".csv" then "csv"
    when ".json" then "json"
    when ".xml" then "xml"
    when ".yml", ".yaml" then "yaml"
    else "text" # Default to text for unknown extensions
    end
  end

  # @return [Hash{Symbol => Array<String>}] recognised extensions grouped by
  #   category; the arrays are fresh copies the caller may modify
  def supported_formats
    {
      text: TEXT_EXTENSIONS.dup,
      documents: DOCUMENT_EXTENSIONS.dup,
      images: IMAGE_EXTENSIONS.dup,
      audio: AUDIO_EXTENSIONS.dup,
      video: VIDEO_EXTENSIONS.dup
    }
  end

  private

  # Delegates text-like formats to TextExtractionService.
  def convert_text_based_document(file_path, document_type)
    Ragdoll::TextExtractionService.new(file_path, document_type).extract
  end

  # Delegates images to ImageToTextService.
  # NOTE(review): the options hash is passed positionally. If
  # ImageToTextService#initialize accepts only keyword arguments (as
  # AudioToTextService does), this raises ArgumentError on Ruby 3 and should
  # become `new(**@options)` - confirm against that class.
  def convert_image_to_text(file_path)
    Ragdoll::ImageToTextService.new(@options).convert(file_path)
  end

  # Delegates audio to AudioToTextService. The options are splatted because
  # that constructor takes keyword arguments only; passing the hash
  # positionally raises ArgumentError on Ruby 3.
  def convert_audio_to_text(file_path)
    Ragdoll::AudioToTextService.new(**@options).transcribe(file_path)
  end

  # Describes a video from its metadata and, when available, its audio
  # transcript. Audio extraction is currently a placeholder, so in practice
  # the metadata description is returned.
  def convert_video_to_text(file_path)
    video_info = extract_video_metadata(file_path)
    audio_text = attempt_video_audio_extraction(file_path)

    if audio_text.nil? || audio_text.empty?
      video_info || generate_fallback_text(file_path, "video")
    elsif video_info
      "Video content: #{video_info}\n\nAudio transcript: #{audio_text}"
    else
      "Video with audio transcript: #{audio_text}"
    end
  rescue StandardError => e
    warn "Warning: Video conversion failed: #{e.message}"
    generate_fallback_text(file_path, "video")
  end

  # Returns the file's content when it reads as text, otherwise a
  # metadata-based description.
  def convert_unknown_document(file_path)
    content = read_as_utf8(file_path)
    return content if content && looks_like_text?(content)

    generate_fallback_text(file_path, "unknown")
  end

  # Reads the file as UTF-8; when the bytes are not valid UTF-8 they are
  # reinterpreted as ISO-8859-1 and transcoded. Validity is checked explicitly
  # because reading with an encoding label does not raise on malformed bytes.
  #
  # @return [String, nil] UTF-8 text, or nil when the file cannot be read
  def read_as_utf8(file_path)
    content = File.binread(file_path).force_encoding(Encoding::UTF_8)
    return content if content.valid_encoding?

    content.force_encoding(Encoding::ISO_8859_1)
           .encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: "?")
  rescue StandardError
    nil
  end

  # Builds a description from the file name and size.
  # In production, you might use ffmpeg or similar tools.
  def extract_video_metadata(file_path)
    file_size = File.size(file_path)
    stem = File.basename(file_path, File.extname(file_path))

    # Split on separators and camelCase humps, drop purely numeric parts.
    descriptive_parts = stem
                        .gsub(/[-_]+/, " ")
                        .gsub(/([a-z])([A-Z])/, '\1 \2')
                        .split(" ")
                        .reject { |part| part.match?(/\A\d+\z/) }
                        .map(&:capitalize)

    if descriptive_parts.any?
      "Video: #{descriptive_parts.join(' ')} (#{format_file_size(file_size)})"
    else
      "Video file: #{File.basename(file_path)} (#{format_file_size(file_size)})"
    end
  end

  # Placeholder for video audio extraction. In production, you would extract
  # the audio track (e.g. with ffmpeg) to a temporary file, transcribe it and
  # clean up. For now, returns nil to indicate no audio extraction.
  def attempt_video_audio_extraction(_file_path)
    nil
  end

  # Heuristic: content is text when more than 80% of its characters are
  # printable ASCII or common whitespace (tab, newline, carriage return).
  # Text that is mostly non-ASCII is not recognised by this check.
  def looks_like_text?(content)
    return false if content.empty?

    printable_chars = content.count("\t\n\r -~")
    printable_chars.to_f / content.length > 0.8
  end

  # @return [String] "<Kind> file: NAME (SIZE)" for image/audio/video,
  #   "Document: NAME (SIZE)" for everything else
  def generate_fallback_text(file_path, document_type)
    filename = File.basename(file_path)
    size_label = format_file_size(File.size(file_path))

    case document_type
    when "image" then "Image file: #{filename} (#{size_label})"
    when "audio" then "Audio file: #{filename} (#{size_label})"
    when "video" then "Video file: #{filename} (#{size_label})"
    else "Document: #{filename} (#{size_label})"
    end
  end

  # Formats a byte count using 1024-based units.
  #
  # @param size [Numeric] size in bytes
  # @return [String] e.g. "512 B", "1.5 KB"; bytes are shown unrounded,
  #   larger units with one decimal place
  def format_file_size(size)
    unit_index = 0

    while size >= 1024 && unit_index < SIZE_UNITS.length - 1
      size /= 1024.0
      unit_index += 1
    end

    shown = unit_index.zero? ? size : size.round(1)
    "#{shown} #{SIZE_UNITS[unit_index]}"
  end
end
216
+ end