ragdoll 0.1.11 → 0.1.12
This diff compares publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as published in the public registry.
- checksums.yaml +4 -4
- data/README.md +323 -384
- data/app/models/ragdoll/document.rb +1 -1
- data/app/models/ragdoll/unified_content.rb +216 -0
- data/app/models/ragdoll/unified_document.rb +338 -0
- data/app/services/ragdoll/audio_to_text_service.rb +200 -0
- data/app/services/ragdoll/document_converter.rb +216 -0
- data/app/services/ragdoll/document_processor.rb +197 -331
- data/app/services/ragdoll/image_to_text_service.rb +322 -0
- data/app/services/ragdoll/migration_service.rb +340 -0
- data/app/services/ragdoll/text_extraction_service.rb +422 -0
- data/app/services/ragdoll/unified_document_management.rb +300 -0
- data/db/migrate/20250923000001_create_ragdoll_unified_contents.rb +87 -0
- data/lib/ragdoll/core/version.rb +1 -1
- data/lib/ragdoll/core.rb +7 -0
- metadata +11 -2
@@ -5,7 +5,6 @@ require "docx"
 require "rmagick"
 require "yaml"
 require "date"
-# Image description service is auto-loaded from app/services
 
 module Ragdoll
   class DocumentProcessor
@@ -34,7 +33,7 @@ module Ragdoll
         location: File.expand_path(file_path),
         title: parsed[:title] || File.basename(file_path, File.extname(file_path)),
         content: parsed[:content],
-        document_type: determine_document_type(file_path),
+        document_type: parsed[:document_type] || determine_document_type(file_path),
         metadata: parsed[:metadata] || {},
         status: "processed",
         file_modified_at: file_modified_at,
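Note: the one-line change above makes a parser-supplied `document_type` take precedence over extension-based detection. A minimal illustrative sketch of that precedence (the `parsed` hash and file name below are hypothetical, and a simple extension lookup stands in for `determine_document_type`):

```ruby
# Hypothetical values; only the "||" precedence mirrors the change above.
parsed = { content: "# Release notes", document_type: "markdown" }
file_path = "notes.txt"

fallback_type = File.extname(file_path) == ".md" ? "markdown" : "text"
document_type = parsed[:document_type] || fallback_type
# => "markdown", even though the ".txt" extension alone would yield "text"
```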
@@ -85,399 +84,237 @@ module Ragdoll
     end
 
     def parse
-
-
-
-      when ".docx"
-        parse_docx
-      when ".txt", ".md", ".markdown"
-        parse_text
-      when ".html", ".htm"
-        parse_html
-      when ".jpg", ".jpeg", ".png", ".gif", ".bmp", ".webp", ".svg", ".ico", ".tiff", ".tif"
-        parse_image
-      else
-        parse_text # Default to text parsing for unknown formats
+      # Check if file exists first
+      unless File.exist?(@file_path)
+        raise ParseError, "File does not exist: #{@file_path}"
       end
-    end
 
-
+      # Use the new unified document converter
+      document_type = determine_document_type(@file_path)
 
-
-
-
+      begin
+        # Convert to text using the unified pipeline
+        text_content = Ragdoll::DocumentConverter.convert_to_text(@file_path, document_type)
 
-
-
-        metadata[:file_size] = File.size(@file_path)
-        metadata[:file_hash] = calculate_file_hash(@file_path)
-      end
+        # Extract metadata based on document type
+        metadata = extract_metadata_for_type(document_type)
 
-
-
-
-
-
-        metadata[:author] = reader.info[:Author] if reader.info[:Author]
-        metadata[:subject] = reader.info[:Subject] if reader.info[:Subject]
-        metadata[:creator] = reader.info[:Creator] if reader.info[:Creator]
-        metadata[:producer] = reader.info[:Producer] if reader.info[:Producer]
-        metadata[:creation_date] = reader.info[:CreationDate] if reader.info[:CreationDate]
-        metadata[:modification_date] = reader.info[:ModDate] if reader.info[:ModDate]
-      end
+        # Add encoding information for text files
+        if %w[text markdown html].include?(document_type)
+          encoding = detect_file_encoding(@file_path) || "UTF-8"
+          metadata[:encoding] = encoding
+        end
 
-
+        # Get title from metadata or filename
+        title = metadata[:title] || extract_title_from_filepath
+
+        {
+          content: text_content,
+          metadata: metadata,
+          title: title,
+          document_type: document_type
+        }
+      rescue StandardError => e
+        raise ParseError, "Failed to parse document: #{e.message}"
+      end
+    end
 
-
-
-
-
+    # Helper methods for document type determination
+    def self.determine_document_type(file_path)
+      Ragdoll::DocumentConverter.new.determine_document_type(file_path)
+    end
 
-
-
-
-
-
-
-
-
+    def self.determine_document_type_from_content_type(content_type)
+      case content_type
+      when "application/pdf" then "pdf"
+      when "application/vnd.openxmlformats-officedocument.wordprocessingml.document" then "docx"
+      when "text/plain" then "text"
+      when "text/markdown" then "markdown"
+      when "text/html" then "html"
+      when %r{^image/} then "image"
+      when %r{^audio/} then "audio"
+      when %r{^video/} then "video"
+      else "text"
       end
+    end
 
-
-
-
+    def self.determine_content_type(file_path)
+      case File.extname(file_path).downcase
+      when ".pdf" then "application/pdf"
+      when ".docx" then "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
+      when ".txt" then "text/plain"
+      when ".md", ".markdown" then "text/markdown"
+      when ".html", ".htm" then "text/html"
+      when ".jpg", ".jpeg" then "image/jpeg"
+      when ".png" then "image/png"
+      when ".gif" then "image/gif"
+      when ".webp" then "image/webp"
+      when ".bmp" then "image/bmp"
+      when ".svg" then "image/svg+xml"
+      when ".ico" then "image/x-icon"
+      when ".tiff", ".tif" then "image/tiff"
+      when ".mp3" then "audio/mpeg"
+      when ".wav" then "audio/wav"
+      when ".m4a" then "audio/mp4"
+      when ".flac" then "audio/flac"
+      when ".ogg" then "audio/ogg"
+      when ".mp4" then "video/mp4"
+      when ".mov" then "video/quicktime"
+      when ".avi" then "video/x-msvideo"
+      when ".webm" then "video/webm"
+      else "application/octet-stream"
       end
+    end
 
-
-      # Ensure content is UTF-8 encoded before checking presence
-      metadata[:content_hash] = calculate_content_hash(content) if content && content.length > 0
+    private
 
-
-
-
-
-
+    def determine_document_type(file_path)
+      Ragdoll::DocumentConverter.new.determine_document_type(file_path)
+    end
+
+    def extract_metadata_for_type(document_type)
+      metadata = basic_file_metadata
+
+      case document_type
+      when "pdf"
+        metadata.merge!(extract_pdf_metadata)
+      when "docx"
+        metadata.merge!(extract_docx_metadata)
+      when "image"
+        metadata.merge!(extract_image_metadata)
+      when "audio"
+        metadata.merge!(extract_audio_metadata)
+      when "video"
+        metadata.merge!(extract_video_metadata)
+      end
+
+      metadata
     end
 
-    def
-      content = ""
+    def basic_file_metadata
       metadata = {}
 
-      # Add file-based metadata for duplicate detection
       if File.exist?(@file_path)
         metadata[:file_size] = File.size(@file_path)
         metadata[:file_hash] = calculate_file_hash(@file_path)
+        metadata[:file_modified_at] = File.mtime(@file_path)
       end
 
-
-
-
-
-      if doc.core_properties
-        metadata[:title] = doc.core_properties.title if doc.core_properties.title
-        metadata[:author] = doc.core_properties.creator if doc.core_properties.creator
-        metadata[:subject] = doc.core_properties.subject if doc.core_properties.subject
-        metadata[:description] = doc.core_properties.description if doc.core_properties.description
-        metadata[:keywords] = doc.core_properties.keywords if doc.core_properties.keywords
-        metadata[:created] = doc.core_properties.created if doc.core_properties.created
-        metadata[:modified] = doc.core_properties.modified if doc.core_properties.modified
-        if doc.core_properties.last_modified_by
-          metadata[:last_modified_by] =
-            doc.core_properties.last_modified_by
-        end
-      end
-
-      # Extract text from paragraphs
-      doc.paragraphs.each do |paragraph|
-        paragraph_text = paragraph.text.strip
-        next if paragraph_text.empty?
-
-        content += "#{paragraph_text}\n\n"
-      end
+      metadata[:original_filename] = File.basename(@file_path)
+      metadata[:file_extension] = File.extname(@file_path).downcase
+      metadata
+    end
 
-
-
-        content += "\n--- Table #{table_index + 1} ---\n\n"
+    def extract_pdf_metadata
+      return {} unless File.exist?(@file_path)
 
-
-
-
+      begin
+        metadata = {}
+        PDF::Reader.open(@file_path) do |reader|
+          if reader.info
+            metadata[:pdf_title] = reader.info[:Title] if reader.info[:Title]
+            metadata[:pdf_author] = reader.info[:Author] if reader.info[:Author]
+            metadata[:pdf_subject] = reader.info[:Subject] if reader.info[:Subject]
+            metadata[:pdf_creator] = reader.info[:Creator] if reader.info[:Creator]
+            metadata[:pdf_producer] = reader.info[:Producer] if reader.info[:Producer]
+            metadata[:pdf_creation_date] = reader.info[:CreationDate] if reader.info[:CreationDate]
+            metadata[:pdf_modification_date] = reader.info[:ModDate] if reader.info[:ModDate]
           end
-
-        content += "\n"
+          metadata[:page_count] = reader.page_count
         end
 
-
-      metadata[:
-
-
-
-
-      # Add filepath-based title as fallback if no title was found
-      if metadata[:title].nil? || (metadata[:title].is_a?(String) && metadata[:title].strip.empty?)
-        metadata[:title] = extract_title_from_filepath
+        # Use PDF title as main title if available
+        metadata[:title] = metadata[:pdf_title] if metadata[:pdf_title]
+        metadata
+      rescue StandardError => e
+        puts "Warning: Failed to extract PDF metadata: #{e.message}"
+        {}
       end
-
-      # Add content hash for duplicate detection
-      # Ensure content is UTF-8 encoded before checking presence
-      metadata[:content_hash] = calculate_content_hash(content) if content && content.length > 0
-
-      {
-        content: content.strip,
-        metadata: metadata,
-        document_type: "docx"
-      }
     end
 
-    def
-
-      document_type = case @file_extension
-                      when ".md", ".markdown" then "markdown"
-                      when ".txt" then "text"
-                      else "text"
-                      end
+    def extract_docx_metadata
+      return {} unless File.exist?(@file_path)
 
       begin
-
-
-      rescue Encoding::InvalidByteSequenceError, Encoding::UndefinedConversionError
-        # Try with different encoding - read as ISO-8859-1 and force encoding to UTF-8
-        content = File.read(@file_path, encoding: "ISO-8859-1").encode("UTF-8", invalid: :replace, undef: :replace, replace: "?")
-        encoding = "ISO-8859-1"
-      rescue Errno::ENOENT, Errno::EACCES => e
-        raise ParseError, "Failed to read file #{@file_path}: #{e.message}"
-      end
-
-      metadata = {
-        file_size: File.size(@file_path),
-        file_hash: calculate_file_hash(@file_path),
-        encoding: encoding
-      }
+        metadata = {}
+        doc = Docx::Document.open(@file_path)
 
-
-
-
-
-        metadata.
-
+        if doc.core_properties
+          metadata[:docx_title] = doc.core_properties.title if doc.core_properties.title
+          metadata[:docx_author] = doc.core_properties.creator if doc.core_properties.creator
+          metadata[:docx_subject] = doc.core_properties.subject if doc.core_properties.subject
+          metadata[:docx_description] = doc.core_properties.description if doc.core_properties.description
+          metadata[:docx_keywords] = doc.core_properties.keywords if doc.core_properties.keywords
+          metadata[:docx_created] = doc.core_properties.created if doc.core_properties.created
+          metadata[:docx_modified] = doc.core_properties.modified if doc.core_properties.modified
+          metadata[:docx_last_modified_by] = doc.core_properties.last_modified_by if doc.core_properties.last_modified_by
         end
-      end
-
-      # Add filepath-based title as fallback if no title was found
-      if metadata[:title].nil? || (metadata[:title].is_a?(String) && metadata[:title].strip.empty?)
-        metadata[:title] = extract_title_from_filepath
-      end
 
-
-
-      metadata[:content_hash] = calculate_content_hash(content) if content && content.length > 0
-
-      {
-        content: content,
-        metadata: metadata,
-        document_type: document_type
-      }
-    end
-
-    def parse_html
-      content = File.read(@file_path, encoding: "UTF-8")
-
-      # Extract title from H1 tag if present
-      h1_match = content.match(%r{<h1[^>]*>(.*?)</h1>}mi)
-      title = nil
-      if h1_match
-        # Clean up the H1 content by removing any HTML tags and normalizing whitespace
-        title = h1_match[1]
-                .gsub(/<[^>]+>/, " ") # Remove any nested HTML tags
-                .gsub(/\s+/, " ") # Normalize whitespace
-                .strip
-      end
-
-      # Basic HTML tag stripping (for more advanced parsing, consider using Nokogiri)
-      clean_content = content
-                      .gsub(%r{<script[^>]*>.*?</script>}mi, "") # Remove script tags
-                      .gsub(%r{<style[^>]*>.*?</style>}mi, "") # Remove style tags
-                      .gsub(/<[^>]+>/, " ") # Remove all HTML tags
-                      .gsub(/\s+/, " ") # Normalize whitespace
-                      .strip
-
-      metadata = {
-        file_size: File.size(@file_path),
-        file_hash: calculate_file_hash(@file_path),
-        original_format: "html"
-      }
+        metadata[:paragraph_count] = doc.paragraphs.count
+        metadata[:table_count] = doc.tables.count
 
-
-
-      metadata
-
-      metadata
+        # Use DOCX title as main title if available
+        metadata[:title] = metadata[:docx_title] if metadata[:docx_title]
+        metadata
+      rescue StandardError => e
+        puts "Warning: Failed to extract DOCX metadata: #{e.message}"
+        {}
       end
-
-      # Add content hash for duplicate detection
-      metadata[:content_hash] = calculate_content_hash(clean_content) if clean_content.present?
-
-      {
-        content: clean_content,
-        metadata: metadata,
-        document_type: "html"
-      }
     end
 
-    def
-
+    def extract_image_metadata
+      return {} unless File.exist?(@file_path)
 
-      metadata = {
-        file_size: File.size(@file_path),
-        file_hash: calculate_file_hash(@file_path),
-        file_type: @file_extension.sub(".", ""),
-        original_filename: File.basename(@file_path)
-      }
-
-      # Extract image dimensions
       begin
+        metadata = {}
         img = Magick::Image.read(@file_path).first
-
+
+        metadata[:width] = img.columns
         metadata[:height] = img.rows
-
-
-
-
-        metadata
+        metadata[:image_format] = img.format
+        metadata[:mime_type] = img.mime_type
+        metadata[:number_colors] = img.number_colors
+
+        metadata
+      rescue StandardError => e
+        puts "Warning: Failed to extract image metadata: #{e.message}"
+        {}
       end
+    end
 
-
-
-
-      puts "📝 DocumentProcessor: Received description: '#{desc}'"
-
-      metadata[:description] = desc if desc && !desc.empty?
-
-      # Use AI-generated description or fallback placeholder
-      content = desc && !desc.empty? ? desc : "Image file: #{File.basename(@file_path)}"
-
-      # Add filepath-based title as fallback
-      metadata[:title] = extract_title_from_filepath
-
-      # Add content hash for duplicate detection
-      # Ensure content is UTF-8 encoded before checking presence
-      metadata[:content_hash] = calculate_content_hash(content) if content && content.length > 0
-
-      puts "✅ DocumentProcessor: Image parsing complete. Content: '#{content[0..100]}...'"
-
+    def extract_audio_metadata
+      # Basic audio file metadata
+      # In production, you might use audio analysis libraries
       {
-
-
-        document_type: "image"
+        media_type: "audio",
+        file_type: File.extname(@file_path).sub(".", "")
       }
     end
 
-
-
-
-
-
-
-
-      when ".html", ".htm" then "html"
-      when ".jpg", ".jpeg", ".png", ".gif", ".bmp", ".webp", ".svg", ".ico", ".tiff", ".tif" then "image"
-      else "text"
-      end
-    end
-
-    def self.determine_document_type_from_content_type(content_type)
-      case content_type
-      when "application/pdf" then "pdf"
-      when "application/vnd.openxmlformats-officedocument.wordprocessingml.document" then "docx"
-      when "text/plain" then "text"
-      when "text/markdown" then "markdown"
-      when "text/html" then "html"
-      when %r{^image/} then "image"
-      else "text"
-      end
-    end
-
-    def self.determine_content_type(file_path)
-      case File.extname(file_path).downcase
-      when ".pdf" then "application/pdf"
-      when ".docx" then "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
-      when ".txt" then "text/plain"
-      when ".md", ".markdown" then "text/markdown"
-      when ".html", ".htm" then "text/html"
-      when ".jpg", ".jpeg" then "image/jpeg"
-      when ".png" then "image/png"
-      when ".gif" then "image/gif"
-      when ".webp" then "image/webp"
-      when ".bmp" then "image/bmp"
-      when ".svg" then "image/svg+xml"
-      when ".ico" then "image/x-icon"
-      when ".tiff", ".tif" then "image/tiff"
-      else "application/octet-stream"
-      end
+    def extract_video_metadata
+      # Basic video file metadata
+      # In production, you might use video analysis libraries
+      {
+        media_type: "video",
+        file_type: File.extname(@file_path).sub(".", "")
+      }
     end
 
-    private
-
     # Extract a meaningful title from the file path as a fallback
-    # @param file_path [String] the full file path
-    # @return [String] a cleaned title derived from the filename
     def extract_title_from_filepath(file_path = @file_path)
       filename = File.basename(file_path, File.extname(file_path))
-
+
       # Clean up common patterns in filenames to make them more readable
       title = filename
               .gsub(/[-_]+/, ' ') # Replace hyphens and underscores with spaces
              .gsub(/([a-z])([A-Z])/, '\1 \2') # Add space before capital letters (camelCase)
              .gsub(/\s+/, ' ') # Normalize multiple spaces
              .strip
-
+
       # Capitalize words for better readability
       title.split(' ').map(&:capitalize).join(' ')
     end
 
-    # Parse YAML front matter from markdown content
-    # @param content [String] the full content of the markdown file
-    # @return [Array] returns [front_matter_hash, body_content] or [nil, original_content]
-    def parse_yaml_front_matter(content)
-      # Check if content starts with YAML front matter delimiter
-      return [nil, content] unless content.start_with?("---\n")
-
-      # Find the closing delimiter
-      lines = content.lines
-      closing_index = nil
-
-      lines.each_with_index do |line, index|
-        next if index == 0 # Skip the opening ---
-        if line.strip == "---"
-          closing_index = index
-          break
-        end
-      end
-
-      # No closing delimiter found
-      return [nil, content] unless closing_index
-
-      # Extract YAML content and body
-      yaml_lines = lines[1...closing_index]
-      body_lines = lines[(closing_index + 1)..-1]
-
-      yaml_content = yaml_lines.join
-      body_content = body_lines&.join || ""
-
-      # Parse YAML
-      begin
-        # Allow Time objects for date fields in YAML front matter
-        front_matter = YAML.safe_load(yaml_content, permitted_classes: [Time, Date])
-        # Convert string keys to symbols for consistency
-        front_matter = front_matter.transform_keys(&:to_sym) if front_matter.is_a?(Hash)
-        [front_matter, body_content.strip]
-      rescue YAML::SyntaxError, Psych::DisallowedClass => e
-        # If YAML parsing fails, return original content
-        Rails.logger.warn "Warning: Failed to parse YAML front matter: #{e.message}" if defined?(Rails)
-        [nil, content]
-      end
-    end
-
     # Calculate SHA256 hash of file content for duplicate detection
     def calculate_file_hash(file_path)
       require 'digest'
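Note: taken together, the rewritten `#parse` replaces the per-format `parse_*` methods with a single conversion pipeline. A rough sketch of that flow, using only the calls visible in the hunk above (the file name is hypothetical, and the conversion itself lives in the new `document_converter.rb`, which is not shown in this hunk):

```ruby
# Illustrative walk through the unified pipeline wired up by #parse.
file_path = "docs/quarterly_report.pdf" # hypothetical path

# Class-level helper added above; it delegates to the new converter.
document_type = Ragdoll::DocumentProcessor.determine_document_type(file_path)
# => "pdf"

# Single entry point that #parse now uses for every format.
text_content = Ragdoll::DocumentConverter.convert_to_text(file_path, document_type)

# #parse pairs this text with extract_metadata_for_type(document_type) and
# returns a hash shaped like:
#   { content: text_content, metadata: {...}, title: "Quarterly Report", document_type: "pdf" }
```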
@@ -497,5 +334,34 @@ module Ragdoll
       puts "Warning: Failed to calculate content hash: #{e.message}"
       nil
     end
+
+    # Detect file encoding for text files
+    def detect_file_encoding(file_path)
+      return nil unless File.exist?(file_path)
+
+      # Read a sample to detect encoding
+      sample = File.read(file_path, 1000, encoding: 'ASCII-8BIT')
+
+      # Check for common encodings
+      if sample.valid_encoding?
+        # Try to convert to UTF-8
+        utf8_content = sample.encode('UTF-8', invalid: :replace, undef: :replace)
+        return 'UTF-8' if utf8_content.valid_encoding?
+      end
+
+      # Try common encodings
+      ['UTF-8', 'ISO-8859-1', 'Windows-1252'].each do |encoding|
+        begin
+          test_content = sample.force_encoding(encoding)
+          return encoding if test_content.valid_encoding?
+        rescue Encoding::InvalidByteSequenceError, Encoding::UndefinedConversionError
+          next
+        end
+      end
+
+      'UTF-8' # Default fallback
+    rescue StandardError
+      'UTF-8'
+    end
   end
 end