ragdoll 0.1.10 → 0.1.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,200 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "ruby_llm"
4
+
5
module Ragdoll
  # Produces a text transcript for an audio (or audio-bearing video) file.
  #
  # Only the :whisper_local provider performs a real transcription, by running
  # a `whisper` executable found on PATH. The :openai, :azure and :google
  # providers are placeholders that return the fallback transcript, a short
  # bracketed description such as "[Audio file: talk.mp3 (3m)]".
  #
  # Warnings are written to standard error so they do not mix with whatever
  # the host program prints on standard output.
  class AudioToTextService
    # Raised when a provider is unsupported or a transcription attempt fails.
    # #transcribe rescues it and falls back, so callers of the public API do
    # not normally see it.
    class TranscriptionError < StandardError; end

    DEFAULT_OPTIONS = {
      model: "whisper-1",
      provider: :openai,
      temperature: 0.0,
      language: nil # Auto-detect
    }.freeze

    # Convenience wrapper: builds a service with +options+ and transcribes
    # +file_path+ in one call.
    #
    # @param file_path [String] path of the audio file
    # @param options [Hash] overrides for DEFAULT_OPTIONS
    # @return [String] see #transcribe
    def self.transcribe(file_path, **options)
      new(**options).transcribe(file_path)
    end

    # @param options [Hash] overrides for DEFAULT_OPTIONS (:model, :provider,
    #   :temperature, :language)
    def initialize(**options)
      @options = DEFAULT_OPTIONS.merge(options)
      configure_transcription_service
    end

    # Transcribes +file_path+.
    #
    # @param file_path [String, nil] path of the audio file
    # @return [String] "" when the path is nil, missing, or does not have one
    #   of the #supported_formats extensions; otherwise the transcript, or the
    #   fallback description when the provider is not configured or fails
    def transcribe(file_path)
      return "" if file_path.nil?
      return "" unless File.exist?(file_path)
      return "" unless audio_file?(file_path)

      begin
        if transcription_available?
          perform_transcription(file_path)
        else
          generate_fallback_transcript(file_path)
        end
      rescue StandardError => e
        # Best effort by design: a failed transcription degrades to the
        # fallback description instead of propagating.
        warn "Warning: Audio transcription failed (#{e.message}), using fallback"
        generate_fallback_transcript(file_path)
      end
    end

    # @return [Array<String>] lower-case extensions (leading dot included)
    #   accepted by #transcribe
    def supported_formats
      %w[.mp3 .wav .m4a .flac .ogg .aac .wma .mp4 .mov .avi .webm]
    end

    private

    # Loads provider-specific settings into instance variables. An unknown
    # provider only produces a warning; #transcribe then uses the fallback.
    def configure_transcription_service
      case @options[:provider]
      when :openai
        configure_openai_transcription
      when :azure
        configure_azure_transcription
      when :google
        configure_google_transcription
      when :whisper_local
        configure_local_whisper
      else
        warn "Warning: Unsupported transcription provider: #{@options[:provider]}"
      end
    end

    def configure_openai_transcription
      @api_key = ENV["OPENAI_API_KEY"]
      @endpoint = "https://api.openai.com/v1/audio/transcriptions"
    end

    def configure_azure_transcription
      @api_key = ENV["AZURE_SPEECH_KEY"]
      @region = ENV["AZURE_SPEECH_REGION"]
    end

    def configure_google_transcription
      @api_key = ENV["GOOGLE_CLOUD_API_KEY"]
      @project_id = ENV["GOOGLE_CLOUD_PROJECT_ID"]
    end

    # Records the path of the `whisper` executable, or "" when none is found.
    def configure_local_whisper
      @whisper_command = find_executable("whisper")
    end

    # Searches the PATH directories for an executable file called +name+,
    # without spawning a helper process.
    #
    # @return [String] full path of the first match, or "" when there is none
    def find_executable(name)
      ENV.fetch("PATH", "").split(File::PATH_SEPARATOR).each do |dir|
        next if dir.empty?

        candidate = File.join(dir, name)
        return candidate if File.file?(candidate) && File.executable?(candidate)
      end
      ""
    end

    # @return [Boolean] whether the selected provider has the settings it needs
    def transcription_available?
      case @options[:provider]
      when :openai, :google
        configured?(@api_key)
      when :azure
        configured?(@api_key) && configured?(@region)
      when :whisper_local
        configured?(@whisper_command) && File.executable?(@whisper_command)
      else
        false
      end
    end

    # True for a non-nil, non-empty setting.
    def configured?(value)
      !value.nil? && !value.empty?
    end

    # Dispatches to the transcriber for the configured provider.
    #
    # @raise [TranscriptionError] for a provider with no transcriber
    def perform_transcription(file_path)
      case @options[:provider]
      when :openai
        transcribe_with_openai(file_path)
      when :azure
        transcribe_with_azure(file_path)
      when :google
        transcribe_with_google(file_path)
      when :whisper_local
        transcribe_with_local_whisper(file_path)
      else
        raise TranscriptionError, "Unsupported transcription provider"
      end
    end

    # Placeholder: no request is sent to OpenAI yet.
    def transcribe_with_openai(file_path)
      generate_fallback_transcript(file_path)
    end

    # Placeholder: no request is sent to Azure yet.
    def transcribe_with_azure(file_path)
      generate_fallback_transcript(file_path)
    end

    # Placeholder: no request is sent to Google yet.
    def transcribe_with_google(file_path)
      generate_fallback_transcript(file_path)
    end

    # Runs the local whisper executable on +file_path+ and returns the text it
    # wrote, deleting the generated .txt file afterwards.
    #
    # The command is passed as an argument vector, so no shell is involved and
    # characters such as quotes, `$(...)` or `;` in the file name are not
    # interpreted.
    #
    # @raise [TranscriptionError] when the command fails or leaves no transcript
    def transcribe_with_local_whisper(file_path)
      require "open3"

      audio_path = file_path.to_s
      command = [
        @whisper_command, audio_path,
        "--output_format", "txt",
        "--output_dir", File.dirname(audio_path)
      ]
      command.push("--language", @options[:language].to_s) if @options[:language]
      command.push("--temperature", @options[:temperature].to_s)

      begin
        output, status = Open3.capture2e(*command)
        output_file = whisper_output_candidates(audio_path).find { |path| File.exist?(path) }

        unless status.success? && output_file
          raise TranscriptionError, "Whisper command failed: #{output}"
        end

        transcript = File.read(output_file)
        File.delete(output_file) # Cleanup
        transcript.strip
      rescue StandardError => e
        raise TranscriptionError, "Local Whisper transcription failed: #{e.message}"
      end
    end

    # Paths where the transcript may appear. The name with the audio extension
    # kept ("talk.mp3.txt") is tried first, then the name with it removed
    # ("talk.txt").
    # NOTE(review): which of the two is produced appears to depend on the
    # installed whisper CLI version - confirm against the targeted release.
    def whisper_output_candidates(audio_path)
      stem = File.basename(audio_path, ".*")
      ["#{audio_path}.txt", File.join(File.dirname(audio_path), "#{stem}.txt")]
    end

    def audio_file?(file_path)
      supported_formats.include?(File.extname(file_path).downcase)
    end

    # Builds the bracketed description used when no transcript is available.
    def generate_fallback_transcript(file_path)
      filename = File.basename(file_path)
      duration = estimate_duration(file_path)

      if duration
        "[Audio file: #{filename} (#{format_duration(duration)})]"
      else
        "[Audio file: #{filename}]"
      end
    end

    # Very rough duration guess: one minute per whole MiB of file size.
    #
    # @return [Integer, nil] minutes, or nil for files under 1 MiB or when the
    #   size cannot be read
    def estimate_duration(file_path)
      estimated_minutes = File.size(file_path) / (1024 * 1024)
      estimated_minutes > 0 ? estimated_minutes : nil
    rescue StandardError
      nil
    end

    # Formats a minute count as "12m" or "1h 5m". The value is rounded to whole
    # minutes before splitting so the hour and minute parts always add up.
    def format_duration(minutes)
      total = minutes.round
      return "#{total}m" if total < 60

      hours, remaining_minutes = total.divmod(60)
      "#{hours}h #{remaining_minutes}m"
    end
  end
end
@@ -0,0 +1,216 @@
1
+ # frozen_string_literal: true
2
+
3
module Ragdoll
  # Converts a file of almost any kind into plain text.
  #
  # Text-like documents are handed to Ragdoll::TextExtractionService, images to
  # Ragdoll::ImageToTextService and audio to Ragdoll::AudioToTextService. Video
  # files currently yield a description derived from the file name and size.
  # When a conversion raises, a one-line description of the file is returned
  # instead and a warning is written to standard error.
  class DocumentConverter
    class ConversionError < StandardError; end

    # Convenience wrapper around +new(**options).convert_to_text+.
    #
    # @param file_path [String] file to convert
    # @param document_type [String, nil] type name; inferred from the extension
    #   when nil
    # @param options [Hash] forwarded to the image and audio services
    # @return [String] extracted or descriptive text
    def self.convert_to_text(file_path, document_type = nil, **options)
      new(**options).convert_to_text(file_path, document_type)
    end

    # @param options [Hash] forwarded to the image and audio services
    def initialize(**options)
      @options = options
    end

    # Converts +file_path+ to text.
    #
    # @param file_path [String] file to convert
    # @param document_type [String, nil] one of the names returned by
    #   #determine_document_type; any other value is treated as unknown
    # @return [String] "" when the file does not exist, otherwise the converted
    #   text or a fallback description
    def convert_to_text(file_path, document_type = nil)
      return "" unless File.exist?(file_path)

      document_type ||= determine_document_type(file_path)

      begin
        case document_type
        when "text", "markdown", "html", "pdf", "docx", "csv", "json", "xml", "yaml"
          convert_text_based_document(file_path, document_type)
        when "image"
          convert_image_to_text(file_path)
        when "audio"
          convert_audio_to_text(file_path)
        when "video"
          convert_video_to_text(file_path)
        else
          convert_unknown_document(file_path)
        end
      rescue StandardError => e
        warn "Warning: Document conversion failed for #{file_path}: #{e.message}"
        generate_fallback_text(file_path, document_type)
      end
    end

    # Maps a file extension (case-insensitive) to a document type name.
    # Unrecognised extensions are reported as "text".
    #
    # @return [String]
    def determine_document_type(file_path)
      case File.extname(file_path).downcase
      when ".pdf" then "pdf"
      when ".docx" then "docx"
      when ".txt" then "text"
      when ".md", ".markdown" then "markdown"
      when ".html", ".htm" then "html"
      when ".jpg", ".jpeg", ".png", ".gif", ".bmp", ".webp", ".svg", ".ico", ".tiff", ".tif"
        "image"
      when ".mp3", ".wav", ".m4a", ".flac", ".ogg", ".aac", ".wma"
        "audio"
      when ".mp4", ".mov", ".avi", ".webm", ".mkv"
        "video"
      when ".csv" then "csv"
      when ".json" then "json"
      when ".xml" then "xml"
      when ".yml", ".yaml" then "yaml"
      else
        "text" # Default to text for unknown extensions
      end
    end

    # @return [Hash{Symbol => Array<String>}] recognised extensions by category
    def supported_formats
      {
        text: %w[.txt .md .markdown .html .htm .csv .json .xml .yml .yaml],
        documents: %w[.pdf .docx],
        images: %w[.jpg .jpeg .png .gif .bmp .webp .svg .ico .tiff .tif],
        audio: %w[.mp3 .wav .m4a .flac .ogg .aac .wma],
        video: %w[.mp4 .mov .avi .webm .mkv]
      }
    end

    private

    def convert_text_based_document(file_path, document_type)
      Ragdoll::TextExtractionService.new(file_path, document_type).extract
    end

    def convert_image_to_text(file_path)
      # NOTE(review): ImageToTextService is defined outside this file. The
      # options are splatted on the assumption that its initializer accepts
      # keywords (or an optional hash), like AudioToTextService - confirm.
      Ragdoll::ImageToTextService.new(**@options).convert(file_path)
    end

    def convert_audio_to_text(file_path)
      # AudioToTextService#initialize takes keywords only; handing it the hash
      # positionally raises ArgumentError on Ruby 3, hence the double splat.
      Ragdoll::AudioToTextService.new(**@options).transcribe(file_path)
    end

    # Describes a video file. Audio extraction is not implemented yet, so the
    # result is normally the metadata line from #extract_video_metadata.
    def convert_video_to_text(file_path)
      video_info = extract_video_metadata(file_path)
      audio_text = attempt_video_audio_extraction(file_path)

      if audio_text.nil? || audio_text.empty?
        video_info || generate_fallback_text(file_path, "video")
      elsif video_info
        "Video content: #{video_info}\n\nAudio transcript: #{audio_text}"
      else
        "Video with audio transcript: #{audio_text}"
      end
    rescue StandardError => e
      warn "Warning: Video conversion failed: #{e.message}"
      generate_fallback_text(file_path, "video")
    end

    # Returns the file content when it looks like text, otherwise a fallback
    # description.
    #
    # Reading with an encoding label never raises for malformed bytes, so the
    # UTF-8 reading is validated explicitly; content that is not valid UTF-8 is
    # re-read as ISO-8859-1 and transcoded to UTF-8.
    def convert_unknown_document(file_path)
      begin
        content = File.read(file_path, encoding: "UTF-8")
        unless content.valid_encoding?
          content = File.read(file_path, encoding: "ISO-8859-1")
                        .encode("UTF-8", invalid: :replace, undef: :replace, replace: "?")
        end
        return content if looks_like_text?(content)
      rescue StandardError
        # Unreadable content is described rather than returned.
      end

      generate_fallback_text(file_path, "unknown")
    end

    # Builds a description from the file name: dashes/underscores and camelCase
    # boundaries become spaces, purely numeric words are dropped, and the rest
    # is capitalized.
    def extract_video_metadata(file_path)
      readable_size = format_file_size(File.size(file_path))
      stem = File.basename(file_path, File.extname(file_path))

      words = stem
              .gsub(/[-_]+/, " ")
              .gsub(/([a-z])([A-Z])/, '\1 \2')
              .split(" ")
              .reject { |word| word.match?(/^\d+$/) }
              .map(&:capitalize)

      if words.any?
        "Video: #{words.join(' ')} (#{readable_size})"
      else
        "Video file: #{File.basename(file_path)} (#{readable_size})"
      end
    end

    # Placeholder for extracting and transcribing a video's audio track.
    #
    # @return [nil] always, until extraction is implemented
    def attempt_video_audio_extraction(_file_path)
      nil
    end

    # Heuristic: content counts as text when more than 80% of its characters
    # are printable ASCII (space through tilde). Newlines, tabs and non-ASCII
    # characters all count against the ratio.
    def looks_like_text?(content)
      return false if content.empty?

      content.count(" -~").to_f / content.length > 0.8
    end

    # One-line description used whenever real conversion is unavailable.
    def generate_fallback_text(file_path, document_type)
      label = case document_type
              when "image" then "Image file"
              when "audio" then "Audio file"
              when "video" then "Video file"
              else "Document"
              end

      "#{label}: #{File.basename(file_path)} (#{format_file_size(File.size(file_path))})"
    end

    # Formats a byte count using 1024-based units: whole bytes below 1 KB,
    # one decimal place from KB upwards.
    def format_file_size(size)
      units = %w[B KB MB GB TB]
      unit_index = 0

      while size >= 1024 && unit_index < units.length - 1
        size /= 1024.0
        unit_index += 1
      end

      amount = unit_index.zero? ? size : size.round(1)
      "#{amount} #{units[unit_index]}"
    end
  end
end
@@ -1,9 +1,11 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require "securerandom"
4
+
3
5
  module Ragdoll
4
6
  class DocumentManagement
5
7
  class << self
6
- def add_document(location, content, metadata = {})
8
+ def add_document(location, content, metadata = {}, force: false)
7
9
  # Ensure location is an absolute path if it's a file path
8
10
  absolute_location = location.start_with?("http") || location.start_with?("ftp") ? location : File.expand_path(location)
9
11
 
@@ -14,17 +16,21 @@ module Ragdoll
14
16
  Time.current
15
17
  end
16
18
 
17
- # Check if document already exists with same location and file_modified_at
18
- existing_document = Ragdoll::Document.find_by(
19
- location: absolute_location,
20
- file_modified_at: file_modified_at
21
- )
19
+ # Skip duplicate detection if force is true
20
+ unless force
21
+ existing_document = find_duplicate_document(absolute_location, content, metadata, file_modified_at)
22
+ return existing_document.id.to_s if existing_document
23
+ end
22
24
 
23
- # Return existing document ID if found (skip duplicate)
24
- return existing_document.id.to_s if existing_document
25
+ # Modify location if force is used to avoid unique constraint violation
26
+ final_location = if force
27
+ "#{absolute_location}#forced_#{Time.current.to_i}_#{SecureRandom.hex(4)}"
28
+ else
29
+ absolute_location
30
+ end
25
31
 
26
32
  document = Ragdoll::Document.create!(
27
- location: absolute_location,
33
+ location: final_location,
28
34
  title: metadata[:title] || metadata["title"] || extract_title_from_location(location),
29
35
  document_type: metadata[:document_type] || metadata["document_type"] || "text",
30
36
  metadata: metadata.is_a?(Hash) ? metadata : {},
@@ -100,6 +106,108 @@ module Ragdoll
100
106
 
101
107
  private
102
108
 
109
# Looks for a stored document that the incoming one would duplicate.
#
# Checks, in order:
#   1. a document with exactly the same location;
#   2. for an existing local file: a document whose metadata file_hash equals
#      the file's SHA-256, then documents with the same metadata file_size
#      that also pass documents_are_duplicates?;
#   3. for anything that is not an existing file (URLs etc.): a content-based
#      match via find_content_based_duplicate.
#
# @param location [String] absolute path or URL of the incoming document
# @param content [String] text content of the incoming document
# @param metadata [Hash] metadata supplied for the incoming document
# @param file_modified_at [Time] kept for interface compatibility; a lookup on
#   location plus this timestamp can never match once the location-only lookup
#   has missed, so it is not consulted
# @return [Ragdoll::Document, nil] the duplicate, or nil when none is found
def find_duplicate_document(location, content, metadata, file_modified_at)
  existing = Ragdoll::Document.find_by(location: location)
  return existing if existing

  # Anything that is not a file on disk is compared by content only.
  return find_content_based_duplicate(content, metadata) unless File.exist?(location)
  return nil if location.start_with?("http")

  file_size = File.size(location)
  content_hash = calculate_file_hash(location)

  # calculate_file_hash returns nil when hashing fails; a nil hash cannot
  # match any stored value, so the query is skipped.
  if content_hash
    same_hash = Ragdoll::Document.where("metadata->>'file_hash' = ?", content_hash).first
    return same_hash if same_hash
  end

  # Same size alone is weak evidence, so each candidate is compared in detail.
  Ragdoll::Document.where("metadata->>'file_size' = ?", file_size.to_s).find do |doc|
    documents_are_duplicates?(doc, location, content, metadata, file_size, content_hash)
  end
end
144
+
145
# Decides whether +existing_doc+ and the incoming document describe the same
# file. All of the following must hold:
#   * the file names match once directory and extension are removed;
#   * when both have content, the lengths differ by at most 5% of the longer;
#   * the document types match (the incoming type defaults to "text");
#   * the titles match whenever both are available.
#
# +file_size+ and +content_hash+ are accepted but not consulted here.
#
# @return [Boolean]
def documents_are_duplicates?(existing_doc, location, content, metadata, file_size, content_hash)
  stored_location = existing_doc.location
  stored_stem = File.basename(stored_location, File.extname(stored_location))
  incoming_stem = File.basename(location, File.extname(location))
  return false if stored_stem != incoming_stem

  if content.present? && existing_doc.content.present?
    lengths = [content.length, existing_doc.content.length]
    longest = lengths.max
    gap = (lengths.first - lengths.last).abs
    return false if longest > 0 && (gap.to_f / longest) > 0.05
  end

  stored_meta = existing_doc.metadata || {}
  incoming_meta = metadata || {}

  incoming_type = incoming_meta[:document_type] || incoming_meta["document_type"] || "text"
  return false unless existing_doc.document_type == incoming_type

  stored_title = stored_meta["title"] || existing_doc.title
  incoming_title = incoming_meta[:title] || incoming_meta["title"] || extract_title_from_location(location)

  # Titles only rule a pair out when both exist and disagree.
  !(stored_title && incoming_title && stored_title != incoming_title)
end
175
+
176
# Finds a stored document that matches the incoming one by content alone.
#
# A document whose metadata content_hash equals the SHA-256 of +content+ wins.
# Failing that, when a title is supplied, the first document with that title
# and a content length within 5% is returned.
#
# @param content [String] incoming text; blank content never matches
# @param metadata [Hash, nil] incoming metadata; nil is treated as empty, as
#   documents_are_duplicates? already does
# @return [Ragdoll::Document, nil]
def find_content_based_duplicate(content, metadata)
  return nil unless content.present?

  content_hash = calculate_content_hash(content)
  same_hash = Ragdoll::Document.where("metadata->>'content_hash' = ?", content_hash).first
  return same_hash if same_hash

  details = metadata || {}
  title = details[:title] || details["title"]
  title ? find_by_title_and_content_similarity(title, content) : nil
end
187
+
188
# Returns the first document titled +title+ whose content length is within 5%
# of the length of +content+, or nil when there is none. Documents without
# content are never matched.
def find_by_title_and_content_similarity(title, content)
  target_length = content.length
  allowed_gap = target_length * 0.05

  Ragdoll::Document.where(title: title).find do |candidate|
    next false unless candidate.content.present?

    (candidate.content.length - target_length).abs <= allowed_gap
  end
end
197
+
198
# Computes the SHA-256 hex digest of the file at +file_path+.
#
# @return [String, nil] the digest, or nil when the file cannot be hashed; in
#   that case a warning goes to Rails.logger if Rails is loaded
def calculate_file_hash(file_path)
  require 'digest'

  begin
    digest = Digest::SHA256.file(file_path)
    digest.hexdigest
  rescue StandardError => error
    if defined?(Rails)
      Rails.logger.warn "Failed to calculate file hash for #{file_path}: #{error.message}"
    end
    nil
  end
end
205
+
206
# Returns the SHA-256 hex digest of the string +content+.
def calculate_content_hash(content)
  require 'digest'

  hasher = Digest::SHA256.new
  hasher.update(content)
  hasher.hexdigest
end
210
+
103
211
  def extract_title_from_location(location)
104
212
  File.basename(location, File.extname(location))
105
213
  end