ragdoll 0.1.10 → 0.1.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -5,7 +5,6 @@ require "docx"
5
5
  require "rmagick"
6
6
  require "yaml"
7
7
  require "date"
8
- # Image description service is auto-loaded from app/services
9
8
 
10
9
  module Ragdoll
11
10
  class DocumentProcessor
@@ -34,7 +33,7 @@ module Ragdoll
34
33
  location: File.expand_path(file_path),
35
34
  title: parsed[:title] || File.basename(file_path, File.extname(file_path)),
36
35
  content: parsed[:content],
37
- document_type: determine_document_type(file_path),
36
+ document_type: parsed[:document_type] || determine_document_type(file_path),
38
37
  metadata: parsed[:metadata] || {},
39
38
  status: "processed",
40
39
  file_modified_at: file_modified_at,
@@ -85,288 +84,44 @@ module Ragdoll
85
84
  end
86
85
 
87
86
  def parse
88
- case @file_extension
89
- when ".pdf"
90
- parse_pdf
91
- when ".docx"
92
- parse_docx
93
- when ".txt", ".md", ".markdown"
94
- parse_text
95
- when ".html", ".htm"
96
- parse_html
97
- when ".jpg", ".jpeg", ".png", ".gif", ".bmp", ".webp", ".svg", ".ico", ".tiff", ".tif"
98
- parse_image
99
- else
100
- parse_text # Default to text parsing for unknown formats
87
+ # Check if file exists first
88
+ unless File.exist?(@file_path)
89
+ raise ParseError, "File does not exist: #{@file_path}"
101
90
  end
102
- rescue StandardError => e # StandardError => e
103
- raise ParseError, "#{__LINE__} Failed to parse #{@file_path}: #{e.message}"
104
- end
105
91
 
106
- private
107
-
108
- def parse_pdf
109
- content = ""
110
- metadata = {}
92
+ # Use the new unified document converter
93
+ document_type = determine_document_type(@file_path)
111
94
 
112
95
  begin
113
- PDF::Reader.open(@file_path) do |reader|
114
- # Extract metadata
115
- if reader.info
116
- metadata[:title] = reader.info[:Title] if reader.info[:Title]
117
- metadata[:author] = reader.info[:Author] if reader.info[:Author]
118
- metadata[:subject] = reader.info[:Subject] if reader.info[:Subject]
119
- metadata[:creator] = reader.info[:Creator] if reader.info[:Creator]
120
- metadata[:producer] = reader.info[:Producer] if reader.info[:Producer]
121
- metadata[:creation_date] = reader.info[:CreationDate] if reader.info[:CreationDate]
122
- metadata[:modification_date] = reader.info[:ModDate] if reader.info[:ModDate]
123
- end
124
-
125
- metadata[:page_count] = reader.page_count
126
-
127
- # Extract text from all pages
128
- reader.pages.each_with_index do |page, index|
129
- page_text = page.text.strip
130
- next if page_text.empty?
131
-
132
- content += "\n\n--- Page #{index + 1} ---\n\n" if content.length.positive?
133
- content += page_text
134
- end
135
- end
136
- rescue PDF::Reader::MalformedPDFError => e
137
- raise ParseError, "Malformed PDF: #{e.message}"
138
- rescue PDF::Reader::UnsupportedFeatureError => e
139
- raise ParseError, "Unsupported PDF feature: #{e.message}"
140
- end
141
-
142
- # Add filepath-based title as fallback if no title was found
143
- if metadata[:title].nil? || (metadata[:title].is_a?(String) && metadata[:title].strip.empty?)
144
- metadata[:title] = extract_title_from_filepath
145
- end
146
-
147
- {
148
- content: content.strip,
149
- metadata: metadata,
150
- document_type: "pdf"
151
- }
152
- end
153
-
154
- def parse_docx
155
- content = ""
156
- metadata = {}
157
-
158
- begin
159
- doc = Docx::Document.open(@file_path)
160
-
161
- # Extract core properties
162
- if doc.core_properties
163
- metadata[:title] = doc.core_properties.title if doc.core_properties.title
164
- metadata[:author] = doc.core_properties.creator if doc.core_properties.creator
165
- metadata[:subject] = doc.core_properties.subject if doc.core_properties.subject
166
- metadata[:description] = doc.core_properties.description if doc.core_properties.description
167
- metadata[:keywords] = doc.core_properties.keywords if doc.core_properties.keywords
168
- metadata[:created] = doc.core_properties.created if doc.core_properties.created
169
- metadata[:modified] = doc.core_properties.modified if doc.core_properties.modified
170
- if doc.core_properties.last_modified_by
171
- metadata[:last_modified_by] =
172
- doc.core_properties.last_modified_by
173
- end
174
- end
175
-
176
- # Extract text from paragraphs
177
- doc.paragraphs.each do |paragraph|
178
- paragraph_text = paragraph.text.strip
179
- next if paragraph_text.empty?
180
-
181
- content += "#{paragraph_text}\n\n"
182
- end
183
-
184
- # Extract text from tables
185
- doc.tables.each_with_index do |table, table_index|
186
- content += "\n--- Table #{table_index + 1} ---\n\n"
187
-
188
- table.rows.each do |row|
189
- row_text = row.cells.map(&:text).join(" | ")
190
- content += "#{row_text}\n" unless row_text.strip.empty?
191
- end
192
-
193
- content += "\n"
194
- end
195
-
196
- metadata[:paragraph_count] = doc.paragraphs.count
197
- metadata[:table_count] = doc.tables.count
198
- rescue StandardError => e # StandardError => e
199
- raise ParseError, "#{__LINE__} Failed to parse DOCX: #{e.message}"
200
- end
201
-
202
- # Add filepath-based title as fallback if no title was found
203
- if metadata[:title].nil? || (metadata[:title].is_a?(String) && metadata[:title].strip.empty?)
204
- metadata[:title] = extract_title_from_filepath
205
- end
206
-
207
- {
208
- content: content.strip,
209
- metadata: metadata,
210
- document_type: "docx"
211
- }
212
- end
213
-
214
- def parse_text
215
- content = File.read(@file_path, encoding: "UTF-8")
216
- metadata = {
217
- file_size: File.size(@file_path),
218
- encoding: "UTF-8"
219
- }
96
+ # Convert to text using the unified pipeline
97
+ text_content = Ragdoll::DocumentConverter.convert_to_text(@file_path, document_type)
220
98
 
221
- document_type = case @file_extension
222
- when ".md", ".markdown" then "markdown"
223
- when ".txt" then "text"
224
- else "text"
225
- end
226
-
227
- # Parse YAML front matter for markdown files
228
- if document_type == "markdown" && content.start_with?("---\n")
229
- front_matter, body_content = parse_yaml_front_matter(content)
230
- if front_matter
231
- metadata.merge!(front_matter)
232
- content = body_content
233
- end
234
- end
235
-
236
- # Add filepath-based title as fallback if no title was found
237
- if metadata[:title].nil? || (metadata[:title].is_a?(String) && metadata[:title].strip.empty?)
238
- metadata[:title] = extract_title_from_filepath
239
- end
240
-
241
- {
242
- content: content,
243
- metadata: metadata,
244
- document_type: document_type
245
- }
246
- rescue Encoding::InvalidByteSequenceError
247
- # Try with different encoding
248
- content = File.read(@file_path, encoding: "ISO-8859-1")
249
- metadata = {
250
- file_size: File.size(@file_path),
251
- encoding: "ISO-8859-1"
252
- }
99
+ # Extract metadata based on document type
100
+ metadata = extract_metadata_for_type(document_type)
253
101
 
254
- # Try to parse front matter with different encoding too
255
- if document_type == "markdown" && content.start_with?("---\n")
256
- front_matter, body_content = parse_yaml_front_matter(content)
257
- if front_matter
258
- metadata.merge!(front_matter)
259
- content = body_content
102
+ # Add encoding information for text files
103
+ if %w[text markdown html].include?(document_type)
104
+ encoding = detect_file_encoding(@file_path) || "UTF-8"
105
+ metadata[:encoding] = encoding
260
106
  end
261
- end
262
107
 
263
- # Add filepath-based title as fallback if no title was found
264
- if metadata[:title].nil? || (metadata[:title].is_a?(String) && metadata[:title].strip.empty?)
265
- metadata[:title] = extract_title_from_filepath
108
+ # Get title from metadata or filename
109
+ title = metadata[:title] || extract_title_from_filepath
110
+
111
+ {
112
+ content: text_content,
113
+ metadata: metadata,
114
+ title: title,
115
+ document_type: document_type
116
+ }
117
+ rescue StandardError => e
118
+ raise ParseError, "Failed to parse document: #{e.message}"
266
119
  end
267
-
268
- {
269
- content: content,
270
- metadata: metadata,
271
- document_type: document_type.nil? ? "text" : document_type
272
- }
273
- end
274
-
275
- def parse_html
276
- content = File.read(@file_path, encoding: "UTF-8")
277
-
278
- # Extract title from H1 tag if present
279
- h1_match = content.match(%r{<h1[^>]*>(.*?)</h1>}mi)
280
- title = nil
281
- if h1_match
282
- # Clean up the H1 content by removing any HTML tags and normalizing whitespace
283
- title = h1_match[1]
284
- .gsub(/<[^>]+>/, " ") # Remove any nested HTML tags
285
- .gsub(/\s+/, " ") # Normalize whitespace
286
- .strip
287
- end
288
-
289
- # Basic HTML tag stripping (for more advanced parsing, consider using Nokogiri)
290
- clean_content = content
291
- .gsub(%r{<script[^>]*>.*?</script>}mi, "") # Remove script tags
292
- .gsub(%r{<style[^>]*>.*?</style>}mi, "") # Remove style tags
293
- .gsub(/<[^>]+>/, " ") # Remove all HTML tags
294
- .gsub(/\s+/, " ") # Normalize whitespace
295
- .strip
296
-
297
- metadata = {
298
- file_size: File.size(@file_path),
299
- original_format: "html"
300
- }
301
-
302
- # Add title to metadata if found, otherwise use filepath fallback
303
- if title && !title.empty?
304
- metadata[:title] = title
305
- else
306
- metadata[:title] = extract_title_from_filepath
307
- end
308
-
309
- {
310
- content: clean_content,
311
- metadata: metadata,
312
- document_type: "html"
313
- }
314
- end
315
-
316
- def parse_image
317
- puts "🖼️ DocumentProcessor: Starting image parsing for #{@file_path}"
318
-
319
- metadata = {
320
- file_size: File.size(@file_path),
321
- file_type: @file_extension.sub(".", ""),
322
- original_filename: File.basename(@file_path)
323
- }
324
-
325
- # Extract image dimensions
326
- begin
327
- img = Magick::Image.read(@file_path).first
328
- metadata[:width] = img.columns
329
- metadata[:height] = img.rows
330
- puts "📏 DocumentProcessor: Image dimensions: #{img.columns}x#{img.rows}"
331
- rescue StandardError => e # StandardError
332
- puts "❌ DocumentProcessor: Failed to get image dimensions: #{e.message}"
333
- metadata[:width] = nil
334
- metadata[:height] = nil
335
- end
336
-
337
- puts "🤖 DocumentProcessor: Creating ImageDescriptionService and calling generate_description..."
338
- desc = Ragdoll::ImageDescriptionService.new.generate_description(@file_path)
339
-
340
- puts "📝 DocumentProcessor: Received description: '#{desc}'"
341
-
342
- metadata[:description] = desc if desc && !desc.empty?
343
-
344
- # Use AI-generated description or fallback placeholder
345
- content = desc && !desc.empty? ? desc : "Image file: #{File.basename(@file_path)}"
346
-
347
- # Add filepath-based title as fallback
348
- metadata[:title] = extract_title_from_filepath
349
-
350
- puts "✅ DocumentProcessor: Image parsing complete. Content: '#{content[0..100]}...'"
351
-
352
- {
353
- content: content,
354
- metadata: metadata,
355
- document_type: "image"
356
- }
357
120
  end
358
121
 
359
122
  # Helper methods for document type determination
360
123
  def self.determine_document_type(file_path)
361
- case File.extname(file_path).downcase
362
- when ".pdf" then "pdf"
363
- when ".docx" then "docx"
364
- when ".txt" then "text"
365
- when ".md", ".markdown" then "markdown"
366
- when ".html", ".htm" then "html"
367
- when ".jpg", ".jpeg", ".png", ".gif", ".bmp", ".webp", ".svg", ".ico", ".tiff", ".tif" then "image"
368
- else "text"
369
- end
124
+ Ragdoll::DocumentConverter.new.determine_document_type(file_path)
370
125
  end
371
126
 
372
127
  def self.determine_document_type_from_content_type(content_type)
@@ -377,6 +132,8 @@ module Ragdoll
377
132
  when "text/markdown" then "markdown"
378
133
  when "text/html" then "html"
379
134
  when %r{^image/} then "image"
135
+ when %r{^audio/} then "audio"
136
+ when %r{^video/} then "video"
380
137
  else "text"
381
138
  end
382
139
  end
@@ -396,70 +153,215 @@ module Ragdoll
396
153
  when ".svg" then "image/svg+xml"
397
154
  when ".ico" then "image/x-icon"
398
155
  when ".tiff", ".tif" then "image/tiff"
156
+ when ".mp3" then "audio/mpeg"
157
+ when ".wav" then "audio/wav"
158
+ when ".m4a" then "audio/mp4"
159
+ when ".flac" then "audio/flac"
160
+ when ".ogg" then "audio/ogg"
161
+ when ".mp4" then "video/mp4"
162
+ when ".mov" then "video/quicktime"
163
+ when ".avi" then "video/x-msvideo"
164
+ when ".webm" then "video/webm"
399
165
  else "application/octet-stream"
400
166
  end
401
167
  end
402
168
 
403
169
  private
404
170
 
171
+ def determine_document_type(file_path)
172
+ Ragdoll::DocumentConverter.new.determine_document_type(file_path)
173
+ end
174
+
175
+ def extract_metadata_for_type(document_type)
176
+ metadata = basic_file_metadata
177
+
178
+ case document_type
179
+ when "pdf"
180
+ metadata.merge!(extract_pdf_metadata)
181
+ when "docx"
182
+ metadata.merge!(extract_docx_metadata)
183
+ when "image"
184
+ metadata.merge!(extract_image_metadata)
185
+ when "audio"
186
+ metadata.merge!(extract_audio_metadata)
187
+ when "video"
188
+ metadata.merge!(extract_video_metadata)
189
+ end
190
+
191
+ metadata
192
+ end
193
+
194
+ def basic_file_metadata
195
+ metadata = {}
196
+
197
+ if File.exist?(@file_path)
198
+ metadata[:file_size] = File.size(@file_path)
199
+ metadata[:file_hash] = calculate_file_hash(@file_path)
200
+ metadata[:file_modified_at] = File.mtime(@file_path)
201
+ end
202
+
203
+ metadata[:original_filename] = File.basename(@file_path)
204
+ metadata[:file_extension] = File.extname(@file_path).downcase
205
+ metadata
206
+ end
207
+
208
+ def extract_pdf_metadata
209
+ return {} unless File.exist?(@file_path)
210
+
211
+ begin
212
+ metadata = {}
213
+ PDF::Reader.open(@file_path) do |reader|
214
+ if reader.info
215
+ metadata[:pdf_title] = reader.info[:Title] if reader.info[:Title]
216
+ metadata[:pdf_author] = reader.info[:Author] if reader.info[:Author]
217
+ metadata[:pdf_subject] = reader.info[:Subject] if reader.info[:Subject]
218
+ metadata[:pdf_creator] = reader.info[:Creator] if reader.info[:Creator]
219
+ metadata[:pdf_producer] = reader.info[:Producer] if reader.info[:Producer]
220
+ metadata[:pdf_creation_date] = reader.info[:CreationDate] if reader.info[:CreationDate]
221
+ metadata[:pdf_modification_date] = reader.info[:ModDate] if reader.info[:ModDate]
222
+ end
223
+ metadata[:page_count] = reader.page_count
224
+ end
225
+
226
+ # Use PDF title as main title if available
227
+ metadata[:title] = metadata[:pdf_title] if metadata[:pdf_title]
228
+ metadata
229
+ rescue StandardError => e
230
+ puts "Warning: Failed to extract PDF metadata: #{e.message}"
231
+ {}
232
+ end
233
+ end
234
+
235
+ def extract_docx_metadata
236
+ return {} unless File.exist?(@file_path)
237
+
238
+ begin
239
+ metadata = {}
240
+ doc = Docx::Document.open(@file_path)
241
+
242
+ if doc.core_properties
243
+ metadata[:docx_title] = doc.core_properties.title if doc.core_properties.title
244
+ metadata[:docx_author] = doc.core_properties.creator if doc.core_properties.creator
245
+ metadata[:docx_subject] = doc.core_properties.subject if doc.core_properties.subject
246
+ metadata[:docx_description] = doc.core_properties.description if doc.core_properties.description
247
+ metadata[:docx_keywords] = doc.core_properties.keywords if doc.core_properties.keywords
248
+ metadata[:docx_created] = doc.core_properties.created if doc.core_properties.created
249
+ metadata[:docx_modified] = doc.core_properties.modified if doc.core_properties.modified
250
+ metadata[:docx_last_modified_by] = doc.core_properties.last_modified_by if doc.core_properties.last_modified_by
251
+ end
252
+
253
+ metadata[:paragraph_count] = doc.paragraphs.count
254
+ metadata[:table_count] = doc.tables.count
255
+
256
+ # Use DOCX title as main title if available
257
+ metadata[:title] = metadata[:docx_title] if metadata[:docx_title]
258
+ metadata
259
+ rescue StandardError => e
260
+ puts "Warning: Failed to extract DOCX metadata: #{e.message}"
261
+ {}
262
+ end
263
+ end
264
+
265
+ def extract_image_metadata
266
+ return {} unless File.exist?(@file_path)
267
+
268
+ begin
269
+ metadata = {}
270
+ img = Magick::Image.read(@file_path).first
271
+
272
+ metadata[:width] = img.columns
273
+ metadata[:height] = img.rows
274
+ metadata[:image_format] = img.format
275
+ metadata[:mime_type] = img.mime_type
276
+ metadata[:number_colors] = img.number_colors
277
+
278
+ metadata
279
+ rescue StandardError => e
280
+ puts "Warning: Failed to extract image metadata: #{e.message}"
281
+ {}
282
+ end
283
+ end
284
+
285
+ def extract_audio_metadata
286
+ # Basic audio file metadata
287
+ # In production, you might use audio analysis libraries
288
+ {
289
+ media_type: "audio",
290
+ file_type: File.extname(@file_path).sub(".", "")
291
+ }
292
+ end
293
+
294
+ def extract_video_metadata
295
+ # Basic video file metadata
296
+ # In production, you might use video analysis libraries
297
+ {
298
+ media_type: "video",
299
+ file_type: File.extname(@file_path).sub(".", "")
300
+ }
301
+ end
302
+
405
303
  # Extract a meaningful title from the file path as a fallback
406
- # @param file_path [String] the full file path
407
- # @return [String] a cleaned title derived from the filename
408
304
  def extract_title_from_filepath(file_path = @file_path)
409
305
  filename = File.basename(file_path, File.extname(file_path))
410
-
306
+
411
307
  # Clean up common patterns in filenames to make them more readable
412
308
  title = filename
413
309
  .gsub(/[-_]+/, ' ') # Replace hyphens and underscores with spaces
414
310
  .gsub(/([a-z])([A-Z])/, '\1 \2') # Add space before capital letters (camelCase)
415
311
  .gsub(/\s+/, ' ') # Normalize multiple spaces
416
312
  .strip
417
-
313
+
418
314
  # Capitalize words for better readability
419
315
  title.split(' ').map(&:capitalize).join(' ')
420
316
  end
421
317
 
422
- # Parse YAML front matter from markdown content
423
- # @param content [String] the full content of the markdown file
424
- # @return [Array] returns [front_matter_hash, body_content] or [nil, original_content]
425
- def parse_yaml_front_matter(content)
426
- # Check if content starts with YAML front matter delimiter
427
- return [nil, content] unless content.start_with?("---\n")
428
-
429
- # Find the closing delimiter
430
- lines = content.lines
431
- closing_index = nil
432
-
433
- lines.each_with_index do |line, index|
434
- next if index == 0 # Skip the opening ---
435
- if line.strip == "---"
436
- closing_index = index
437
- break
438
- end
439
- end
318
+ # Calculate SHA256 hash of file content for duplicate detection
319
+ def calculate_file_hash(file_path)
320
+ require 'digest'
321
+ Digest::SHA256.file(file_path).hexdigest
322
+ rescue StandardError => e
323
+ Rails.logger.warn "Failed to calculate file hash for #{file_path}: #{e.message}" if defined?(Rails)
324
+ puts "Warning: Failed to calculate file hash for #{file_path}: #{e.message}"
325
+ nil
326
+ end
440
327
 
441
- # No closing delimiter found
442
- return [nil, content] unless closing_index
328
+ # Calculate SHA256 hash of text content for duplicate detection
329
+ def calculate_content_hash(content)
330
+ require 'digest'
331
+ Digest::SHA256.hexdigest(content)
332
+ rescue StandardError => e
333
+ Rails.logger.warn "Failed to calculate content hash: #{e.message}" if defined?(Rails)
334
+ puts "Warning: Failed to calculate content hash: #{e.message}"
335
+ nil
336
+ end
443
337
 
444
- # Extract YAML content and body
445
- yaml_lines = lines[1...closing_index]
446
- body_lines = lines[(closing_index + 1)..-1]
338
+ # Detect file encoding for text files
339
+ def detect_file_encoding(file_path)
340
+ return nil unless File.exist?(file_path)
447
341
 
448
- yaml_content = yaml_lines.join
449
- body_content = body_lines&.join || ""
342
+ # Read a sample to detect encoding
343
+ sample = File.read(file_path, 1000, encoding: 'ASCII-8BIT')
450
344
 
451
- # Parse YAML
452
- begin
453
- # Allow Time objects for date fields in YAML front matter
454
- front_matter = YAML.safe_load(yaml_content, permitted_classes: [Time, Date])
455
- # Convert string keys to symbols for consistency
456
- front_matter = front_matter.transform_keys(&:to_sym) if front_matter.is_a?(Hash)
457
- [front_matter, body_content.strip]
458
- rescue YAML::SyntaxError, Psych::DisallowedClass => e
459
- # If YAML parsing fails, return original content
460
- Rails.logger.warn "Warning: Failed to parse YAML front matter: #{e.message}" if defined?(Rails)
461
- [nil, content]
345
+ # Check for common encodings
346
+ if sample.valid_encoding?
347
+ # Try to convert to UTF-8
348
+ utf8_content = sample.encode('UTF-8', invalid: :replace, undef: :replace)
349
+ return 'UTF-8' if utf8_content.valid_encoding?
462
350
  end
351
+
352
+ # Try common encodings
353
+ ['UTF-8', 'ISO-8859-1', 'Windows-1252'].each do |encoding|
354
+ begin
355
+ test_content = sample.force_encoding(encoding)
356
+ return encoding if test_content.valid_encoding?
357
+ rescue Encoding::InvalidByteSequenceError, Encoding::UndefinedConversionError
358
+ next
359
+ end
360
+ end
361
+
362
+ 'UTF-8' # Default fallback
363
+ rescue StandardError
364
+ 'UTF-8'
463
365
  end
464
366
  end
465
367
  end