ragdoll 0.1.10 → 0.1.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +22 -0
- data/README.md +326 -351
- data/app/models/ragdoll/document.rb +1 -1
- data/app/models/ragdoll/search.rb +1 -1
- data/app/models/ragdoll/unified_content.rb +216 -0
- data/app/models/ragdoll/unified_document.rb +338 -0
- data/app/services/ragdoll/audio_to_text_service.rb +200 -0
- data/app/services/ragdoll/document_converter.rb +216 -0
- data/app/services/ragdoll/document_management.rb +117 -9
- data/app/services/ragdoll/document_processor.rb +213 -311
- data/app/services/ragdoll/image_to_text_service.rb +322 -0
- data/app/services/ragdoll/migration_service.rb +340 -0
- data/app/services/ragdoll/text_extraction_service.rb +422 -0
- data/app/services/ragdoll/unified_document_management.rb +300 -0
- data/db/migrate/20250923000001_create_ragdoll_unified_contents.rb +87 -0
- data/lib/ragdoll/core/client.rb +2 -2
- data/lib/ragdoll/core/version.rb +1 -1
- data/lib/ragdoll/core.rb +7 -0
- metadata +11 -2
@@ -0,0 +1,200 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "ruby_llm"
|
4
|
+
|
5
|
+
module Ragdoll
  # Turns an audio file into text.
  #
  # Only the :whisper_local provider performs real transcription (by running a
  # locally installed `whisper` executable). The :openai, :azure and :google
  # providers are placeholders that currently return the same bracketed
  # filename description used as the fallback.
  #
  # Any error raised while transcribing is reported with a "Warning:" line on
  # stdout and replaced by the fallback description, so #transcribe never
  # raises for an existing audio file.
  class AudioToTextService
    # Raised internally when a provider cannot produce a transcript.
    class TranscriptionError < StandardError; end

    DEFAULT_OPTIONS = {
      model: "whisper-1",
      provider: :openai,
      temperature: 0.0,
      language: nil # Auto-detect
    }.freeze

    # Convenience wrapper: builds a service with +options+ and transcribes
    # +file_path+.
    #
    # @param file_path [String] path to the audio file
    # @param options [Hash] overrides for DEFAULT_OPTIONS
    # @return [String] transcript, fallback description, or "" (see #transcribe)
    def self.transcribe(file_path, **options)
      new(**options).transcribe(file_path)
    end

    # @param options [Hash] overrides for DEFAULT_OPTIONS (:model, :provider,
    #   :temperature, :language)
    def initialize(**options)
      @options = DEFAULT_OPTIONS.merge(options)
      configure_transcription_service
    end

    # Transcribes +file_path+.
    #
    # @param file_path [String] path to the audio file
    # @return [String] "" when the file does not exist or its extension is not
    #   in #supported_formats; otherwise the transcript or, when the provider
    #   is not configured or fails, a "[Audio file: ...]" description
    def transcribe(file_path)
      return "" unless File.exist?(file_path)
      return "" unless audio_file?(file_path)

      begin
        if transcription_available?
          perform_transcription(file_path)
        else
          generate_fallback_transcript(file_path)
        end
      rescue StandardError => e
        puts "Warning: Audio transcription failed (#{e.message}), using fallback"
        generate_fallback_transcript(file_path)
      end
    end

    # @return [Array<String>] lower-case file extensions accepted by #transcribe
    def supported_formats
      %w[.mp3 .wav .m4a .flac .ogg .aac .wma .mp4 .mov .avi .webm]
    end

    private

    # Loads provider-specific settings into instance variables. An unknown
    # provider only prints a warning; #transcription_available? then reports
    # false and the fallback description is used.
    def configure_transcription_service
      case @options[:provider]
      when :openai
        configure_openai_transcription
      when :azure
        configure_azure_transcription
      when :google
        configure_google_transcription
      when :whisper_local
        configure_local_whisper
      else
        puts "Warning: Unsupported transcription provider: #{@options[:provider]}"
      end
    end

    # OpenAI Whisper API configuration
    def configure_openai_transcription
      @api_key = ENV["OPENAI_API_KEY"]
      @endpoint = "https://api.openai.com/v1/audio/transcriptions"
    end

    # Azure Speech Services configuration
    def configure_azure_transcription
      @api_key = ENV["AZURE_SPEECH_KEY"]
      @region = ENV["AZURE_SPEECH_REGION"]
    end

    # Google Cloud Speech-to-Text configuration
    def configure_google_transcription
      @api_key = ENV["GOOGLE_CLOUD_API_KEY"]
      @project_id = ENV["GOOGLE_CLOUD_PROJECT_ID"]
    end

    # Local Whisper installation configuration: records the path printed by
    # `which whisper` (empty string when nothing is printed).
    def configure_local_whisper
      @whisper_command = `which whisper`.strip
    end

    # @return [Boolean] whether the selected provider has the settings it needs
    def transcription_available?
      case @options[:provider]
      when :openai, :google
        setting_present?(@api_key)
      when :azure
        # The region gets the same empty-string check as the key; an empty
        # AZURE_SPEECH_REGION is as unusable as a missing one.
        setting_present?(@api_key) && setting_present?(@region)
      when :whisper_local
        setting_present?(@whisper_command) && File.executable?(@whisper_command)
      else
        false
      end
    end

    # @return [Boolean] true when +value+ is neither nil nor empty
    def setting_present?(value)
      !value.nil? && !value.empty?
    end

    # Dispatches to the provider-specific implementation.
    #
    # @raise [TranscriptionError] for an unknown provider
    def perform_transcription(file_path)
      case @options[:provider]
      when :openai
        transcribe_with_openai(file_path)
      when :azure
        transcribe_with_azure(file_path)
      when :google
        transcribe_with_google(file_path)
      when :whisper_local
        transcribe_with_local_whisper(file_path)
      else
        raise TranscriptionError, "Unsupported transcription provider"
      end
    end

    # Placeholder for OpenAI Whisper API implementation; returns the fallback
    # description for now.
    def transcribe_with_openai(file_path)
      generate_fallback_transcript(file_path)
    end

    # Placeholder for Azure Speech Services implementation; returns the
    # fallback description for now.
    def transcribe_with_azure(file_path)
      generate_fallback_transcript(file_path)
    end

    # Placeholder for Google Cloud Speech-to-Text implementation; returns the
    # fallback description for now.
    def transcribe_with_google(file_path)
      generate_fallback_transcript(file_path)
    end

    # Runs the local whisper executable on +file_path+ and returns the
    # stripped transcript.
    #
    # The command is passed to Open3 as an argument vector, so no shell is
    # involved and quotes, spaces or `$()` in the path are harmless. Output is
    # written to a throw-away directory: nothing is created next to the source
    # file, the transcript is found whatever naming scheme the installed
    # whisper uses, and it is removed even when the run fails.
    #
    # @raise [TranscriptionError] when whisper exits non-zero, produces no
    #   .txt file, or anything else goes wrong
    def transcribe_with_local_whisper(file_path)
      # Lazy-loaded: only this provider needs them.
      require "open3"
      require "tmpdir"

      Dir.mktmpdir("ragdoll-whisper") do |output_dir|
        command = [@whisper_command, file_path, "--output_format", "txt", "--output_dir", output_dir]
        command.push("--language", @options[:language].to_s) if @options[:language]
        command.push("--temperature", @options[:temperature].to_s)

        result, status = Open3.capture2e(*command)
        transcript_name = Dir.children(output_dir).find { |name| name.end_with?(".txt") }

        unless status.success? && transcript_name
          raise TranscriptionError, "Whisper command failed: #{result}"
        end

        File.read(File.join(output_dir, transcript_name)).strip
      end
    rescue StandardError => e
      raise TranscriptionError, "Local Whisper transcription failed: #{e.message}"
    end

    # @return [Boolean] whether the (case-insensitive) extension is supported
    def audio_file?(file_path)
      extension = File.extname(file_path).downcase
      supported_formats.include?(extension)
    end

    # Builds the bracketed description used when no transcript is available,
    # including a rough duration when one can be estimated.
    def generate_fallback_transcript(file_path)
      filename = File.basename(file_path)
      duration = estimate_duration(file_path)

      if duration
        "[Audio file: #{filename} (#{format_duration(duration)})]"
      else
        "[Audio file: #{filename}]"
      end
    end

    # Very rough duration guess from file size (1 MB per minute, whole minutes
    # only).
    #
    # @return [Integer, nil] estimated minutes, or nil when the file is
    #   smaller than 1 MB or its size cannot be read
    def estimate_duration(file_path)
      estimated_minutes = File.size(file_path) / (1024 * 1024)
      estimated_minutes.positive? ? estimated_minutes : nil
    rescue StandardError
      nil
    end

    # Formats a minute count as "45m" or "1h 30m".
    #
    # Minutes are rounded once up front and split with divmod so a Float such
    # as 90.0 yields "1h 30m" instead of rounding 1.5 hours up to 2.
    def format_duration(minutes)
      total_minutes = minutes.round
      return "#{total_minutes}m" if total_minutes < 60

      hours, remaining_minutes = total_minutes.divmod(60)
      "#{hours}h #{remaining_minutes}m"
    end
  end
end
|
@@ -0,0 +1,216 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Ragdoll
  # Converts a file of any supported kind into plain text.
  #
  # Text-like formats are delegated to Ragdoll::TextExtractionService, images
  # to Ragdoll::ImageToTextService and audio to Ragdoll::AudioToTextService.
  # Video currently yields a filename/size description only, because audio
  # extraction is still a placeholder. Any error during conversion is reported
  # with a "Warning:" line on stdout and replaced by a short
  # "<Kind> file: name (size)" description.
  class DocumentConverter
    class ConversionError < StandardError; end

    # Convenience wrapper around #convert_to_text.
    #
    # @param file_path [String] path of the file to convert
    # @param document_type [String, nil] explicit type; detected from the
    #   extension when nil
    # @param options [Hash] forwarded to the image/audio services
    # @return [String] extracted text, a fallback description, or "" when the
    #   file does not exist
    def self.convert_to_text(file_path, document_type = nil, **options)
      new(**options).convert_to_text(file_path, document_type)
    end

    # @param options [Hash] forwarded to the image/audio services
    def initialize(**options)
      @options = options
    end

    # Converts +file_path+ to text according to +document_type+.
    #
    # @param file_path [String] path of the file to convert
    # @param document_type [String, nil] explicit type; detected from the
    #   extension when nil. Types not listed below are read as text if they
    #   look like text.
    # @return [String] extracted text, a fallback description, or "" when the
    #   file does not exist
    def convert_to_text(file_path, document_type = nil)
      return "" unless File.exist?(file_path)

      document_type ||= determine_document_type(file_path)

      begin
        case document_type
        when "text", "markdown", "html", "pdf", "docx", "csv", "json", "xml", "yaml"
          convert_text_based_document(file_path, document_type)
        when "image"
          convert_image_to_text(file_path)
        when "audio"
          convert_audio_to_text(file_path)
        when "video"
          convert_video_to_text(file_path)
        else
          convert_unknown_document(file_path)
        end
      rescue StandardError => e
        puts "Warning: Document conversion failed for #{file_path}: #{e.message}"
        generate_fallback_text(file_path, document_type)
      end
    end

    # Maps a file extension (case-insensitive) to a document type name.
    #
    # @return [String] one of "pdf", "docx", "text", "markdown", "html",
    #   "image", "audio", "video", "csv", "json", "xml", "yaml"; unknown
    #   extensions default to "text"
    def determine_document_type(file_path)
      extension = File.extname(file_path).downcase

      case extension
      when ".pdf" then "pdf"
      when ".docx" then "docx"
      when ".txt" then "text"
      when ".md", ".markdown" then "markdown"
      when ".html", ".htm" then "html"
      when ".jpg", ".jpeg", ".png", ".gif", ".bmp", ".webp", ".svg", ".ico", ".tiff", ".tif"
        "image"
      when ".mp3", ".wav", ".m4a", ".flac", ".ogg", ".aac", ".wma"
        "audio"
      when ".mp4", ".mov", ".avi", ".webm", ".mkv"
        "video"
      when ".csv" then "csv"
      when ".json" then "json"
      when ".xml" then "xml"
      when ".yml", ".yaml" then "yaml"
      else
        "text" # Default to text for unknown extensions
      end
    end

    # @return [Hash{Symbol => Array<String>}] recognised extensions by category
    def supported_formats
      {
        text: %w[.txt .md .markdown .html .htm .csv .json .xml .yml .yaml],
        documents: %w[.pdf .docx],
        images: %w[.jpg .jpeg .png .gif .bmp .webp .svg .ico .tiff .tif],
        audio: %w[.mp3 .wav .m4a .flac .ogg .aac .wma],
        video: %w[.mp4 .mov .avi .webm .mkv]
      }
    end

    private

    def convert_text_based_document(file_path, document_type)
      service = Ragdoll::TextExtractionService.new(file_path, document_type)
      service.extract
    end

    # Options are splatted as keywords rather than passed as one positional
    # Hash.
    # NOTE(review): ImageToTextService#initialize is not visible here; this
    # assumes it accepts keyword options like AudioToTextService - confirm.
    def convert_image_to_text(file_path)
      service = Ragdoll::ImageToTextService.new(**@options)
      service.convert(file_path)
    end

    # AudioToTextService#initialize takes keyword arguments only, so the
    # options must be splatted; a positional Hash raises ArgumentError on
    # Ruby 3 and every audio file would silently degrade to the fallback text.
    def convert_audio_to_text(file_path)
      service = Ragdoll::AudioToTextService.new(**@options)
      service.transcribe(file_path)
    end

    # Describes a video file. Audio extraction is not implemented yet
    # (#attempt_video_audio_extraction returns nil), so in practice this
    # returns the metadata description.
    def convert_video_to_text(file_path)
      video_info = extract_video_metadata(file_path)
      audio_text = attempt_video_audio_extraction(file_path)

      if audio_text && !audio_text.empty?
        if video_info
          "Video content: #{video_info}\n\nAudio transcript: #{audio_text}"
        else
          "Video with audio transcript: #{audio_text}"
        end
      else
        video_info || generate_fallback_text(file_path, "video")
      end
    rescue StandardError => e
      puts "Warning: Video conversion failed: #{e.message}"
      generate_fallback_text(file_path, "video")
    end

    # Returns the file's content when it looks like text, otherwise a
    # metadata-based description.
    def convert_unknown_document(file_path)
      content = read_as_utf8_text(file_path)
      return content if content && looks_like_text?(content)

      generate_fallback_text(file_path, "unknown")
    end

    # Reads +file_path+ as UTF-8, reinterpreting the bytes as ISO-8859-1 when
    # they are not valid UTF-8.
    #
    # File.read does not validate encodings, so validity is checked explicitly
    # instead of waiting for an Encoding error that reading never raises.
    #
    # @return [String, nil] UTF-8 text, or nil when the file cannot be read
    def read_as_utf8_text(file_path)
      content = File.read(file_path, encoding: "UTF-8")
      return content if content.valid_encoding?

      content
        .force_encoding("ISO-8859-1")
        .encode("UTF-8", invalid: :replace, undef: :replace, replace: "?")
    rescue StandardError
      nil
    end

    # Builds a description from the file name and size.
    # In production, you might use ffmpeg or similar tools.
    def extract_video_metadata(file_path)
      file_size = File.size(file_path)
      filename = File.basename(file_path, File.extname(file_path))

      # Extract meaningful parts from filename: split on separators and
      # camelCase boundaries, drop purely numeric parts.
      descriptive_parts = filename
                          .gsub(/[-_]+/, ' ')
                          .gsub(/([a-z])([A-Z])/, '\1 \2')
                          .split(' ')
                          .reject { |part| part.match?(/^\d+$/) }
                          .map(&:capitalize)

      if descriptive_parts.any?
        "Video: #{descriptive_parts.join(' ')} (#{format_file_size(file_size)})"
      else
        "Video file: #{File.basename(file_path)} (#{format_file_size(file_size)})"
      end
    end

    # Placeholder for video audio extraction.
    # In production, you would:
    # 1. Use ffmpeg to extract audio track
    # 2. Save to temporary audio file
    # 3. Transcribe the audio file
    # 4. Clean up temporary file
    #
    # @return [nil] always, to indicate no audio extraction
    def attempt_video_audio_extraction(file_path)
      nil
    end

    # Simple heuristic: content is text when more than 80% of its characters
    # are printable ASCII or common whitespace (tab, newline, carriage
    # return). Whitespace is counted so line-heavy text is not mistaken for
    # binary data.
    def looks_like_text?(content)
      return false if content.empty?

      printable_chars = content.count(" -~\t\n\r")
      printable_chars.to_f / content.length > 0.8
    end

    # Short description used when no text could be extracted.
    def generate_fallback_text(file_path, document_type)
      filename = File.basename(file_path)
      file_size = File.size(file_path)

      case document_type
      when "image"
        "Image file: #{filename} (#{format_file_size(file_size)})"
      when "audio"
        "Audio file: #{filename} (#{format_file_size(file_size)})"
      when "video"
        "Video file: #{filename} (#{format_file_size(file_size)})"
      else
        "Document: #{filename} (#{format_file_size(file_size)})"
      end
    end

    # Formats a byte count with binary (1024-based) units; bytes are shown
    # exactly, larger units rounded to one decimal.
    def format_file_size(size)
      units = %w[B KB MB GB TB]
      unit_index = 0

      while size >= 1024 && unit_index < units.length - 1
        size /= 1024.0
        unit_index += 1
      end

      if unit_index == 0
        "#{size} #{units[unit_index]}"
      else
        "#{size.round(1)} #{units[unit_index]}"
      end
    end
  end
end
|
@@ -1,9 +1,11 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
+
require "securerandom"
|
4
|
+
|
3
5
|
module Ragdoll
|
4
6
|
class DocumentManagement
|
5
7
|
class << self
|
6
|
-
def add_document(location, content, metadata = {})
|
8
|
+
def add_document(location, content, metadata = {}, force: false)
|
7
9
|
# Ensure location is an absolute path if it's a file path
|
8
10
|
absolute_location = location.start_with?("http") || location.start_with?("ftp") ? location : File.expand_path(location)
|
9
11
|
|
@@ -14,17 +16,21 @@ module Ragdoll
|
|
14
16
|
Time.current
|
15
17
|
end
|
16
18
|
|
17
|
-
#
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
19
|
+
# Skip duplicate detection if force is true
|
20
|
+
unless force
|
21
|
+
existing_document = find_duplicate_document(absolute_location, content, metadata, file_modified_at)
|
22
|
+
return existing_document.id.to_s if existing_document
|
23
|
+
end
|
22
24
|
|
23
|
-
#
|
24
|
-
|
25
|
+
# Modify location if force is used to avoid unique constraint violation
|
26
|
+
final_location = if force
|
27
|
+
"#{absolute_location}#forced_#{Time.current.to_i}_#{SecureRandom.hex(4)}"
|
28
|
+
else
|
29
|
+
absolute_location
|
30
|
+
end
|
25
31
|
|
26
32
|
document = Ragdoll::Document.create!(
|
27
|
-
location:
|
33
|
+
location: final_location,
|
28
34
|
title: metadata[:title] || metadata["title"] || extract_title_from_location(location),
|
29
35
|
document_type: metadata[:document_type] || metadata["document_type"] || "text",
|
30
36
|
metadata: metadata.is_a?(Hash) ? metadata : {},
|
@@ -100,6 +106,108 @@ module Ragdoll
|
|
100
106
|
|
101
107
|
private
|
102
108
|
|
109
|
+
# Looks for an already stored document that duplicates the one being added.
#
# Checks, in order:
# 1. a document with exactly the same location;
# 2. for an existing local file: a document whose metadata file_hash equals
#    this file's SHA-256, then documents with the same metadata file_size that
#    documents_are_duplicates? accepts;
# 3. for a location that is not an existing file (URLs, etc): a content-based
#    match via find_content_based_duplicate.
#
# @param location [String] absolute path or URL of the new document
# @param content [String, nil] text content of the new document
# @param metadata [Hash, nil] metadata supplied for the new document
# @param file_modified_at [Time, nil] kept for interface compatibility; the
#   former location + file_modified_at lookup could never match once the
#   location-only lookup had found nothing, so it is no longer queried
# @return [Ragdoll::Document, nil] the duplicate, or nil when none is found
def find_duplicate_document(location, content, metadata, file_modified_at)
  # Primary check: exact location match (simple duplicate detection)
  existing = Ragdoll::Document.find_by(location: location)
  return existing if existing

  # For non-file documents (URLs, etc), check content-based duplicates
  return find_content_based_duplicate(content, metadata) unless File.exist?(location)

  # An existing path that starts with "http" is not treated as a file.
  return nil if location.start_with?("http")

  file_size = File.size(location)
  content_hash = calculate_file_hash(location)

  # Same file hash is the most reliable signal. A nil hash (hashing failed)
  # cannot match anything, so the query is skipped.
  if content_hash
    hash_match = Ragdoll::Document.where("metadata->>'file_hash' = ?", content_hash).first
    return hash_match if hash_match
  end

  # Otherwise fall back to documents with the same file size and similar metadata
  Ragdoll::Document.where("metadata->>'file_size' = ?", file_size.to_s).find do |doc|
    documents_are_duplicates?(doc, location, content, metadata, file_size, content_hash)
  end
end
|
144
|
+
|
145
|
+
# Decides whether +existing_doc+ and the document being added are likely the
# same document, by comparing several factors.
#
# @param existing_doc [Ragdoll::Document] stored candidate
# @param location [String] location of the new document
# @param content [String, nil] content of the new document
# @param metadata [Hash, nil] metadata of the new document
# @param file_size [Integer] size of the new file (not compared here; the
#   visible caller passes only candidates with the same stored size)
# @param content_hash [String, nil] SHA-256 of the new file, when known
# @return [Boolean] true when every check passes
def documents_are_duplicates?(existing_doc, location, content, metadata, file_size, content_hash)
  existing_metadata = existing_doc.metadata || {}
  new_metadata = metadata || {}

  # A known, different file hash proves the files differ, whatever the
  # name/size heuristics below would say.
  # NOTE(review): assumes the stored metadata uses the string key "file_hash",
  # as the caller's "metadata->>'file_hash'" query suggests - confirm.
  existing_hash = existing_metadata["file_hash"]
  return false if content_hash && existing_hash && existing_hash != content_hash

  # Check filename similarity (basename without extension)
  existing_basename = File.basename(existing_doc.location, File.extname(existing_doc.location))
  new_basename = File.basename(location, File.extname(location))
  return false unless existing_basename == new_basename

  # Check content length similarity (within 5% tolerance)
  if content.present? && existing_doc.content.present?
    lengths = [content.length, existing_doc.content.length]
    max_length = lengths.max
    content_length_diff = (lengths.first - lengths.last).abs
    return false if max_length > 0 && (content_length_diff.to_f / max_length) > 0.05
  end

  # Compare file type/document type
  new_document_type = new_metadata[:document_type] || new_metadata["document_type"] || "text"
  return false if existing_doc.document_type != new_document_type

  # Compare title if available
  existing_title = existing_metadata["title"] || existing_doc.title
  new_title = new_metadata[:title] || new_metadata["title"] || extract_title_from_location(location)
  return false if existing_title && new_title && existing_title != new_title

  # If we reach here, documents are likely duplicates
  true
end
|
175
|
+
|
176
|
+
# Finds a stored document that duplicates +content+, for documents that are
# not local files.
#
# Looks first for a document whose metadata content_hash equals the SHA-256 of
# +content+; failing that, and only when a title is supplied, for a document
# with the same title and similar content length (within 5% tolerance).
#
# @param content [String, nil] content of the new document
# @param metadata [Hash, nil] metadata of the new document; nil is treated as
#   empty, matching documents_are_duplicates?
# @return [Ragdoll::Document, nil] the duplicate, or nil when content is
#   blank or nothing matches
def find_content_based_duplicate(content, metadata)
  return nil unless content.present?

  supplied_metadata = metadata || {}
  title = supplied_metadata[:title] || supplied_metadata["title"]

  content_hash = calculate_content_hash(content)
  hash_match = Ragdoll::Document.where("metadata->>'content_hash' = ?", content_hash).first
  return hash_match if hash_match

  title ? find_by_title_and_content_similarity(title, content) : nil
end
|
187
|
+
|
188
|
+
# Returns the first stored document titled +title+ whose content length is
# within 5% of the length of +content+, or nil when there is none.
#
# @param title [String] title to match exactly
# @param content [String] content of the new document
# @return [Ragdoll::Document, nil]
def find_by_title_and_content_similarity(title, content)
  target_length = content.length
  allowed_difference = target_length * 0.05
  same_title = Ragdoll::Document.where(title: title)

  same_title.find do |candidate|
    # Documents without content can never match.
    next false unless candidate.content.present?

    (candidate.content.length - target_length).abs <= allowed_difference
  end
end
|
197
|
+
|
198
|
+
# Computes the SHA-256 hex digest of the file at +file_path+.
#
# Failures are not raised: they are logged through Rails.logger when a usable
# one is available, and nil is returned instead.
#
# @param file_path [String] path of the file to hash
# @return [String, nil] 64-character hex digest, or nil when hashing fails
def calculate_file_hash(file_path)
  require 'digest'
  Digest::SHA256.file(file_path).hexdigest
rescue StandardError => e
  # Guard the logger too: a Rails constant without a configured logger must
  # not turn a best-effort hash into a NoMethodError.
  if defined?(Rails) && Rails.respond_to?(:logger) && Rails.logger
    Rails.logger.warn "Failed to calculate file hash for #{file_path}: #{e.message}"
  end
  nil
end
|
205
|
+
|
206
|
+
# Computes the SHA-256 hex digest of +content+.
#
# @param content [String] text to hash
# @return [String] 64-character lowercase hex digest
def calculate_content_hash(content)
  require 'digest'
  hasher = Digest::SHA256.new
  hasher.update(content)
  hasher.hexdigest
end
|
210
|
+
|
103
211
|
# Derives a default title from +location+: its last path component with the
# file extension removed.
#
# @param location [String] file path or URL
# @return [String] basename without extension
def extract_title_from_location(location)
  extension = File.extname(location)
  File.basename(location, extension)
end
|