ragdoll 0.1.11 → 0.1.12

This diff shows the changes between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in the respective public registry.
@@ -5,7 +5,6 @@ require "docx"
5
5
  require "rmagick"
6
6
  require "yaml"
7
7
  require "date"
8
- # Image description service is auto-loaded from app/services
9
8
 
10
9
  module Ragdoll
11
10
  class DocumentProcessor
@@ -34,7 +33,7 @@ module Ragdoll
34
33
  location: File.expand_path(file_path),
35
34
  title: parsed[:title] || File.basename(file_path, File.extname(file_path)),
36
35
  content: parsed[:content],
37
- document_type: determine_document_type(file_path),
36
+ document_type: parsed[:document_type] || determine_document_type(file_path),
38
37
  metadata: parsed[:metadata] || {},
39
38
  status: "processed",
40
39
  file_modified_at: file_modified_at,
@@ -85,399 +84,237 @@ module Ragdoll
85
84
  end
86
85
 
87
86
  def parse
88
- case @file_extension
89
- when ".pdf"
90
- parse_pdf
91
- when ".docx"
92
- parse_docx
93
- when ".txt", ".md", ".markdown"
94
- parse_text
95
- when ".html", ".htm"
96
- parse_html
97
- when ".jpg", ".jpeg", ".png", ".gif", ".bmp", ".webp", ".svg", ".ico", ".tiff", ".tif"
98
- parse_image
99
- else
100
- parse_text # Default to text parsing for unknown formats
87
+ # Check if file exists first
88
+ unless File.exist?(@file_path)
89
+ raise ParseError, "File does not exist: #{@file_path}"
101
90
  end
102
- end
103
91
 
104
- private
92
+ # Use the new unified document converter
93
+ document_type = determine_document_type(@file_path)
105
94
 
106
- def parse_pdf
107
- content = ""
108
- metadata = {}
95
+ begin
96
+ # Convert to text using the unified pipeline
97
+ text_content = Ragdoll::DocumentConverter.convert_to_text(@file_path, document_type)
109
98
 
110
- # Add file-based metadata for duplicate detection
111
- if File.exist?(@file_path)
112
- metadata[:file_size] = File.size(@file_path)
113
- metadata[:file_hash] = calculate_file_hash(@file_path)
114
- end
99
+ # Extract metadata based on document type
100
+ metadata = extract_metadata_for_type(document_type)
115
101
 
116
- begin
117
- PDF::Reader.open(@file_path) do |reader|
118
- # Extract metadata
119
- if reader.info
120
- metadata[:title] = reader.info[:Title] if reader.info[:Title]
121
- metadata[:author] = reader.info[:Author] if reader.info[:Author]
122
- metadata[:subject] = reader.info[:Subject] if reader.info[:Subject]
123
- metadata[:creator] = reader.info[:Creator] if reader.info[:Creator]
124
- metadata[:producer] = reader.info[:Producer] if reader.info[:Producer]
125
- metadata[:creation_date] = reader.info[:CreationDate] if reader.info[:CreationDate]
126
- metadata[:modification_date] = reader.info[:ModDate] if reader.info[:ModDate]
127
- end
102
+ # Add encoding information for text files
103
+ if %w[text markdown html].include?(document_type)
104
+ encoding = detect_file_encoding(@file_path) || "UTF-8"
105
+ metadata[:encoding] = encoding
106
+ end
128
107
 
129
- metadata[:page_count] = reader.page_count
108
+ # Get title from metadata or filename
109
+ title = metadata[:title] || extract_title_from_filepath
110
+
111
+ {
112
+ content: text_content,
113
+ metadata: metadata,
114
+ title: title,
115
+ document_type: document_type
116
+ }
117
+ rescue StandardError => e
118
+ raise ParseError, "Failed to parse document: #{e.message}"
119
+ end
120
+ end
130
121
 
131
- # Extract text from all pages
132
- reader.pages.each_with_index do |page, index|
133
- page_text = page.text.strip
134
- next if page_text.empty?
122
+ # Helper methods for document type determination
123
+ def self.determine_document_type(file_path)
124
+ Ragdoll::DocumentConverter.new.determine_document_type(file_path)
125
+ end
135
126
 
136
- content += "\n\n--- Page #{index + 1} ---\n\n" if content.length.positive?
137
- content += page_text
138
- end
139
- end
140
- rescue PDF::Reader::MalformedPDFError => e
141
- raise ParseError, "Malformed PDF: #{e.message}"
142
- rescue PDF::Reader::UnsupportedFeatureError => e
143
- raise ParseError, "Unsupported PDF feature: #{e.message}"
127
+ def self.determine_document_type_from_content_type(content_type)
128
+ case content_type
129
+ when "application/pdf" then "pdf"
130
+ when "application/vnd.openxmlformats-officedocument.wordprocessingml.document" then "docx"
131
+ when "text/plain" then "text"
132
+ when "text/markdown" then "markdown"
133
+ when "text/html" then "html"
134
+ when %r{^image/} then "image"
135
+ when %r{^audio/} then "audio"
136
+ when %r{^video/} then "video"
137
+ else "text"
144
138
  end
139
+ end
145
140
 
146
- # Add filepath-based title as fallback if no title was found
147
- if metadata[:title].nil? || (metadata[:title].is_a?(String) && metadata[:title].strip.empty?)
148
- metadata[:title] = extract_title_from_filepath
141
+ def self.determine_content_type(file_path)
142
+ case File.extname(file_path).downcase
143
+ when ".pdf" then "application/pdf"
144
+ when ".docx" then "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
145
+ when ".txt" then "text/plain"
146
+ when ".md", ".markdown" then "text/markdown"
147
+ when ".html", ".htm" then "text/html"
148
+ when ".jpg", ".jpeg" then "image/jpeg"
149
+ when ".png" then "image/png"
150
+ when ".gif" then "image/gif"
151
+ when ".webp" then "image/webp"
152
+ when ".bmp" then "image/bmp"
153
+ when ".svg" then "image/svg+xml"
154
+ when ".ico" then "image/x-icon"
155
+ when ".tiff", ".tif" then "image/tiff"
156
+ when ".mp3" then "audio/mpeg"
157
+ when ".wav" then "audio/wav"
158
+ when ".m4a" then "audio/mp4"
159
+ when ".flac" then "audio/flac"
160
+ when ".ogg" then "audio/ogg"
161
+ when ".mp4" then "video/mp4"
162
+ when ".mov" then "video/quicktime"
163
+ when ".avi" then "video/x-msvideo"
164
+ when ".webm" then "video/webm"
165
+ else "application/octet-stream"
149
166
  end
167
+ end
150
168
 
151
- # Add content hash for duplicate detection
152
- # Ensure content is UTF-8 encoded before checking presence
153
- metadata[:content_hash] = calculate_content_hash(content) if content && content.length > 0
169
+ private
154
170
 
155
- {
156
- content: content.strip,
157
- metadata: metadata,
158
- document_type: "pdf"
159
- }
171
+ def determine_document_type(file_path)
172
+ Ragdoll::DocumentConverter.new.determine_document_type(file_path)
173
+ end
174
+
175
+ def extract_metadata_for_type(document_type)
176
+ metadata = basic_file_metadata
177
+
178
+ case document_type
179
+ when "pdf"
180
+ metadata.merge!(extract_pdf_metadata)
181
+ when "docx"
182
+ metadata.merge!(extract_docx_metadata)
183
+ when "image"
184
+ metadata.merge!(extract_image_metadata)
185
+ when "audio"
186
+ metadata.merge!(extract_audio_metadata)
187
+ when "video"
188
+ metadata.merge!(extract_video_metadata)
189
+ end
190
+
191
+ metadata
160
192
  end
161
193
 
162
- def parse_docx
163
- content = ""
194
+ def basic_file_metadata
164
195
  metadata = {}
165
196
 
166
- # Add file-based metadata for duplicate detection
167
197
  if File.exist?(@file_path)
168
198
  metadata[:file_size] = File.size(@file_path)
169
199
  metadata[:file_hash] = calculate_file_hash(@file_path)
200
+ metadata[:file_modified_at] = File.mtime(@file_path)
170
201
  end
171
202
 
172
- begin
173
- doc = Docx::Document.open(@file_path)
174
-
175
- # Extract core properties
176
- if doc.core_properties
177
- metadata[:title] = doc.core_properties.title if doc.core_properties.title
178
- metadata[:author] = doc.core_properties.creator if doc.core_properties.creator
179
- metadata[:subject] = doc.core_properties.subject if doc.core_properties.subject
180
- metadata[:description] = doc.core_properties.description if doc.core_properties.description
181
- metadata[:keywords] = doc.core_properties.keywords if doc.core_properties.keywords
182
- metadata[:created] = doc.core_properties.created if doc.core_properties.created
183
- metadata[:modified] = doc.core_properties.modified if doc.core_properties.modified
184
- if doc.core_properties.last_modified_by
185
- metadata[:last_modified_by] =
186
- doc.core_properties.last_modified_by
187
- end
188
- end
189
-
190
- # Extract text from paragraphs
191
- doc.paragraphs.each do |paragraph|
192
- paragraph_text = paragraph.text.strip
193
- next if paragraph_text.empty?
194
-
195
- content += "#{paragraph_text}\n\n"
196
- end
203
+ metadata[:original_filename] = File.basename(@file_path)
204
+ metadata[:file_extension] = File.extname(@file_path).downcase
205
+ metadata
206
+ end
197
207
 
198
- # Extract text from tables
199
- doc.tables.each_with_index do |table, table_index|
200
- content += "\n--- Table #{table_index + 1} ---\n\n"
208
+ def extract_pdf_metadata
209
+ return {} unless File.exist?(@file_path)
201
210
 
202
- table.rows.each do |row|
203
- row_text = row.cells.map(&:text).join(" | ")
204
- content += "#{row_text}\n" unless row_text.strip.empty?
211
+ begin
212
+ metadata = {}
213
+ PDF::Reader.open(@file_path) do |reader|
214
+ if reader.info
215
+ metadata[:pdf_title] = reader.info[:Title] if reader.info[:Title]
216
+ metadata[:pdf_author] = reader.info[:Author] if reader.info[:Author]
217
+ metadata[:pdf_subject] = reader.info[:Subject] if reader.info[:Subject]
218
+ metadata[:pdf_creator] = reader.info[:Creator] if reader.info[:Creator]
219
+ metadata[:pdf_producer] = reader.info[:Producer] if reader.info[:Producer]
220
+ metadata[:pdf_creation_date] = reader.info[:CreationDate] if reader.info[:CreationDate]
221
+ metadata[:pdf_modification_date] = reader.info[:ModDate] if reader.info[:ModDate]
205
222
  end
206
-
207
- content += "\n"
223
+ metadata[:page_count] = reader.page_count
208
224
  end
209
225
 
210
- metadata[:paragraph_count] = doc.paragraphs.count
211
- metadata[:table_count] = doc.tables.count
212
- rescue StandardError => e # StandardError => e
213
- raise ParseError, "#{__LINE__} Failed to parse DOCX: #{e.message}"
214
- end
215
-
216
- # Add filepath-based title as fallback if no title was found
217
- if metadata[:title].nil? || (metadata[:title].is_a?(String) && metadata[:title].strip.empty?)
218
- metadata[:title] = extract_title_from_filepath
226
+ # Use PDF title as main title if available
227
+ metadata[:title] = metadata[:pdf_title] if metadata[:pdf_title]
228
+ metadata
229
+ rescue StandardError => e
230
+ puts "Warning: Failed to extract PDF metadata: #{e.message}"
231
+ {}
219
232
  end
220
-
221
- # Add content hash for duplicate detection
222
- # Ensure content is UTF-8 encoded before checking presence
223
- metadata[:content_hash] = calculate_content_hash(content) if content && content.length > 0
224
-
225
- {
226
- content: content.strip,
227
- metadata: metadata,
228
- document_type: "docx"
229
- }
230
233
  end
231
234
 
232
- def parse_text
233
- # Determine document type first (before any IO operations)
234
- document_type = case @file_extension
235
- when ".md", ".markdown" then "markdown"
236
- when ".txt" then "text"
237
- else "text"
238
- end
235
+ def extract_docx_metadata
236
+ return {} unless File.exist?(@file_path)
239
237
 
240
238
  begin
241
- content = File.read(@file_path, encoding: "UTF-8")
242
- encoding = "UTF-8"
243
- rescue Encoding::InvalidByteSequenceError, Encoding::UndefinedConversionError
244
- # Try with different encoding - read as ISO-8859-1 and force encoding to UTF-8
245
- content = File.read(@file_path, encoding: "ISO-8859-1").encode("UTF-8", invalid: :replace, undef: :replace, replace: "?")
246
- encoding = "ISO-8859-1"
247
- rescue Errno::ENOENT, Errno::EACCES => e
248
- raise ParseError, "Failed to read file #{@file_path}: #{e.message}"
249
- end
250
-
251
- metadata = {
252
- file_size: File.size(@file_path),
253
- file_hash: calculate_file_hash(@file_path),
254
- encoding: encoding
255
- }
239
+ metadata = {}
240
+ doc = Docx::Document.open(@file_path)
256
241
 
257
- # Parse YAML front matter for markdown files
258
- if document_type == "markdown" && content.start_with?("---\n")
259
- front_matter, body_content = parse_yaml_front_matter(content)
260
- if front_matter
261
- metadata.merge!(front_matter)
262
- content = body_content
242
+ if doc.core_properties
243
+ metadata[:docx_title] = doc.core_properties.title if doc.core_properties.title
244
+ metadata[:docx_author] = doc.core_properties.creator if doc.core_properties.creator
245
+ metadata[:docx_subject] = doc.core_properties.subject if doc.core_properties.subject
246
+ metadata[:docx_description] = doc.core_properties.description if doc.core_properties.description
247
+ metadata[:docx_keywords] = doc.core_properties.keywords if doc.core_properties.keywords
248
+ metadata[:docx_created] = doc.core_properties.created if doc.core_properties.created
249
+ metadata[:docx_modified] = doc.core_properties.modified if doc.core_properties.modified
250
+ metadata[:docx_last_modified_by] = doc.core_properties.last_modified_by if doc.core_properties.last_modified_by
263
251
  end
264
- end
265
-
266
- # Add filepath-based title as fallback if no title was found
267
- if metadata[:title].nil? || (metadata[:title].is_a?(String) && metadata[:title].strip.empty?)
268
- metadata[:title] = extract_title_from_filepath
269
- end
270
252
 
271
- # Add content hash for duplicate detection
272
- # Ensure content is UTF-8 encoded before checking presence
273
- metadata[:content_hash] = calculate_content_hash(content) if content && content.length > 0
274
-
275
- {
276
- content: content,
277
- metadata: metadata,
278
- document_type: document_type
279
- }
280
- end
281
-
282
- def parse_html
283
- content = File.read(@file_path, encoding: "UTF-8")
284
-
285
- # Extract title from H1 tag if present
286
- h1_match = content.match(%r{<h1[^>]*>(.*?)</h1>}mi)
287
- title = nil
288
- if h1_match
289
- # Clean up the H1 content by removing any HTML tags and normalizing whitespace
290
- title = h1_match[1]
291
- .gsub(/<[^>]+>/, " ") # Remove any nested HTML tags
292
- .gsub(/\s+/, " ") # Normalize whitespace
293
- .strip
294
- end
295
-
296
- # Basic HTML tag stripping (for more advanced parsing, consider using Nokogiri)
297
- clean_content = content
298
- .gsub(%r{<script[^>]*>.*?</script>}mi, "") # Remove script tags
299
- .gsub(%r{<style[^>]*>.*?</style>}mi, "") # Remove style tags
300
- .gsub(/<[^>]+>/, " ") # Remove all HTML tags
301
- .gsub(/\s+/, " ") # Normalize whitespace
302
- .strip
303
-
304
- metadata = {
305
- file_size: File.size(@file_path),
306
- file_hash: calculate_file_hash(@file_path),
307
- original_format: "html"
308
- }
253
+ metadata[:paragraph_count] = doc.paragraphs.count
254
+ metadata[:table_count] = doc.tables.count
309
255
 
310
- # Add title to metadata if found, otherwise use filepath fallback
311
- if title && !title.empty?
312
- metadata[:title] = title
313
- else
314
- metadata[:title] = extract_title_from_filepath
256
+ # Use DOCX title as main title if available
257
+ metadata[:title] = metadata[:docx_title] if metadata[:docx_title]
258
+ metadata
259
+ rescue StandardError => e
260
+ puts "Warning: Failed to extract DOCX metadata: #{e.message}"
261
+ {}
315
262
  end
316
-
317
- # Add content hash for duplicate detection
318
- metadata[:content_hash] = calculate_content_hash(clean_content) if clean_content.present?
319
-
320
- {
321
- content: clean_content,
322
- metadata: metadata,
323
- document_type: "html"
324
- }
325
263
  end
326
264
 
327
- def parse_image
328
- puts "🖼️ DocumentProcessor: Starting image parsing for #{@file_path}"
265
+ def extract_image_metadata
266
+ return {} unless File.exist?(@file_path)
329
267
 
330
- metadata = {
331
- file_size: File.size(@file_path),
332
- file_hash: calculate_file_hash(@file_path),
333
- file_type: @file_extension.sub(".", ""),
334
- original_filename: File.basename(@file_path)
335
- }
336
-
337
- # Extract image dimensions
338
268
  begin
269
+ metadata = {}
339
270
  img = Magick::Image.read(@file_path).first
340
- metadata[:width] = img.columns
271
+
272
+ metadata[:width] = img.columns
341
273
  metadata[:height] = img.rows
342
- puts "📏 DocumentProcessor: Image dimensions: #{img.columns}x#{img.rows}"
343
- rescue StandardError => e # StandardError
344
- puts "❌ DocumentProcessor: Failed to get image dimensions: #{e.message}"
345
- metadata[:width] = nil
346
- metadata[:height] = nil
274
+ metadata[:image_format] = img.format
275
+ metadata[:mime_type] = img.mime_type
276
+ metadata[:number_colors] = img.number_colors
277
+
278
+ metadata
279
+ rescue StandardError => e
280
+ puts "Warning: Failed to extract image metadata: #{e.message}"
281
+ {}
347
282
  end
283
+ end
348
284
 
349
- puts "🤖 DocumentProcessor: Creating ImageDescriptionService and calling generate_description..."
350
- desc = Ragdoll::ImageDescriptionService.new.generate_description(@file_path)
351
-
352
- puts "📝 DocumentProcessor: Received description: '#{desc}'"
353
-
354
- metadata[:description] = desc if desc && !desc.empty?
355
-
356
- # Use AI-generated description or fallback placeholder
357
- content = desc && !desc.empty? ? desc : "Image file: #{File.basename(@file_path)}"
358
-
359
- # Add filepath-based title as fallback
360
- metadata[:title] = extract_title_from_filepath
361
-
362
- # Add content hash for duplicate detection
363
- # Ensure content is UTF-8 encoded before checking presence
364
- metadata[:content_hash] = calculate_content_hash(content) if content && content.length > 0
365
-
366
- puts "✅ DocumentProcessor: Image parsing complete. Content: '#{content[0..100]}...'"
367
-
285
+ def extract_audio_metadata
286
+ # Basic audio file metadata
287
+ # In production, you might use audio analysis libraries
368
288
  {
369
- content: content,
370
- metadata: metadata,
371
- document_type: "image"
289
+ media_type: "audio",
290
+ file_type: File.extname(@file_path).sub(".", "")
372
291
  }
373
292
  end
374
293
 
375
- # Helper methods for document type determination
376
- def self.determine_document_type(file_path)
377
- case File.extname(file_path).downcase
378
- when ".pdf" then "pdf"
379
- when ".docx" then "docx"
380
- when ".txt" then "text"
381
- when ".md", ".markdown" then "markdown"
382
- when ".html", ".htm" then "html"
383
- when ".jpg", ".jpeg", ".png", ".gif", ".bmp", ".webp", ".svg", ".ico", ".tiff", ".tif" then "image"
384
- else "text"
385
- end
386
- end
387
-
388
- def self.determine_document_type_from_content_type(content_type)
389
- case content_type
390
- when "application/pdf" then "pdf"
391
- when "application/vnd.openxmlformats-officedocument.wordprocessingml.document" then "docx"
392
- when "text/plain" then "text"
393
- when "text/markdown" then "markdown"
394
- when "text/html" then "html"
395
- when %r{^image/} then "image"
396
- else "text"
397
- end
398
- end
399
-
400
- def self.determine_content_type(file_path)
401
- case File.extname(file_path).downcase
402
- when ".pdf" then "application/pdf"
403
- when ".docx" then "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
404
- when ".txt" then "text/plain"
405
- when ".md", ".markdown" then "text/markdown"
406
- when ".html", ".htm" then "text/html"
407
- when ".jpg", ".jpeg" then "image/jpeg"
408
- when ".png" then "image/png"
409
- when ".gif" then "image/gif"
410
- when ".webp" then "image/webp"
411
- when ".bmp" then "image/bmp"
412
- when ".svg" then "image/svg+xml"
413
- when ".ico" then "image/x-icon"
414
- when ".tiff", ".tif" then "image/tiff"
415
- else "application/octet-stream"
416
- end
294
+ def extract_video_metadata
295
+ # Basic video file metadata
296
+ # In production, you might use video analysis libraries
297
+ {
298
+ media_type: "video",
299
+ file_type: File.extname(@file_path).sub(".", "")
300
+ }
417
301
  end
418
302
 
419
- private
420
-
421
303
  # Extract a meaningful title from the file path as a fallback
422
- # @param file_path [String] the full file path
423
- # @return [String] a cleaned title derived from the filename
424
304
  def extract_title_from_filepath(file_path = @file_path)
425
305
  filename = File.basename(file_path, File.extname(file_path))
426
-
306
+
427
307
  # Clean up common patterns in filenames to make them more readable
428
308
  title = filename
429
309
  .gsub(/[-_]+/, ' ') # Replace hyphens and underscores with spaces
430
310
  .gsub(/([a-z])([A-Z])/, '\1 \2') # Add space before capital letters (camelCase)
431
311
  .gsub(/\s+/, ' ') # Normalize multiple spaces
432
312
  .strip
433
-
313
+
434
314
  # Capitalize words for better readability
435
315
  title.split(' ').map(&:capitalize).join(' ')
436
316
  end
437
317
 
438
- # Parse YAML front matter from markdown content
439
- # @param content [String] the full content of the markdown file
440
- # @return [Array] returns [front_matter_hash, body_content] or [nil, original_content]
441
- def parse_yaml_front_matter(content)
442
- # Check if content starts with YAML front matter delimiter
443
- return [nil, content] unless content.start_with?("---\n")
444
-
445
- # Find the closing delimiter
446
- lines = content.lines
447
- closing_index = nil
448
-
449
- lines.each_with_index do |line, index|
450
- next if index == 0 # Skip the opening ---
451
- if line.strip == "---"
452
- closing_index = index
453
- break
454
- end
455
- end
456
-
457
- # No closing delimiter found
458
- return [nil, content] unless closing_index
459
-
460
- # Extract YAML content and body
461
- yaml_lines = lines[1...closing_index]
462
- body_lines = lines[(closing_index + 1)..-1]
463
-
464
- yaml_content = yaml_lines.join
465
- body_content = body_lines&.join || ""
466
-
467
- # Parse YAML
468
- begin
469
- # Allow Time objects for date fields in YAML front matter
470
- front_matter = YAML.safe_load(yaml_content, permitted_classes: [Time, Date])
471
- # Convert string keys to symbols for consistency
472
- front_matter = front_matter.transform_keys(&:to_sym) if front_matter.is_a?(Hash)
473
- [front_matter, body_content.strip]
474
- rescue YAML::SyntaxError, Psych::DisallowedClass => e
475
- # If YAML parsing fails, return original content
476
- Rails.logger.warn "Warning: Failed to parse YAML front matter: #{e.message}" if defined?(Rails)
477
- [nil, content]
478
- end
479
- end
480
-
481
318
  # Calculate SHA256 hash of file content for duplicate detection
482
319
  def calculate_file_hash(file_path)
483
320
  require 'digest'
@@ -497,5 +334,34 @@ module Ragdoll
497
334
  puts "Warning: Failed to calculate content hash: #{e.message}"
498
335
  nil
499
336
  end
337
+
338
+ # Detect file encoding for text files
339
+ def detect_file_encoding(file_path)
340
+ return nil unless File.exist?(file_path)
341
+
342
+ # Read a sample to detect encoding
343
+ sample = File.read(file_path, 1000, encoding: 'ASCII-8BIT')
344
+
345
+ # Check for common encodings
346
+ if sample.valid_encoding?
347
+ # Try to convert to UTF-8
348
+ utf8_content = sample.encode('UTF-8', invalid: :replace, undef: :replace)
349
+ return 'UTF-8' if utf8_content.valid_encoding?
350
+ end
351
+
352
+ # Try common encodings
353
+ ['UTF-8', 'ISO-8859-1', 'Windows-1252'].each do |encoding|
354
+ begin
355
+ test_content = sample.force_encoding(encoding)
356
+ return encoding if test_content.valid_encoding?
357
+ rescue Encoding::InvalidByteSequenceError, Encoding::UndefinedConversionError
358
+ next
359
+ end
360
+ end
361
+
362
+ 'UTF-8' # Default fallback
363
+ rescue StandardError
364
+ 'UTF-8'
365
+ end
500
366
  end
501
367
  end