ragdoll 0.1.10 → 0.1.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +22 -0
- data/README.md +326 -351
- data/app/models/ragdoll/document.rb +1 -1
- data/app/models/ragdoll/search.rb +1 -1
- data/app/models/ragdoll/unified_content.rb +216 -0
- data/app/models/ragdoll/unified_document.rb +338 -0
- data/app/services/ragdoll/audio_to_text_service.rb +200 -0
- data/app/services/ragdoll/document_converter.rb +216 -0
- data/app/services/ragdoll/document_management.rb +117 -9
- data/app/services/ragdoll/document_processor.rb +213 -311
- data/app/services/ragdoll/image_to_text_service.rb +322 -0
- data/app/services/ragdoll/migration_service.rb +340 -0
- data/app/services/ragdoll/text_extraction_service.rb +422 -0
- data/app/services/ragdoll/unified_document_management.rb +300 -0
- data/db/migrate/20250923000001_create_ragdoll_unified_contents.rb +87 -0
- data/lib/ragdoll/core/client.rb +2 -2
- data/lib/ragdoll/core/version.rb +1 -1
- data/lib/ragdoll/core.rb +7 -0
- metadata +11 -2
@@ -5,7 +5,6 @@ require "docx"
|
|
5
5
|
require "rmagick"
|
6
6
|
require "yaml"
|
7
7
|
require "date"
|
8
|
-
# Image description service is auto-loaded from app/services
|
9
8
|
|
10
9
|
module Ragdoll
|
11
10
|
class DocumentProcessor
|
@@ -34,7 +33,7 @@ module Ragdoll
|
|
34
33
|
location: File.expand_path(file_path),
|
35
34
|
title: parsed[:title] || File.basename(file_path, File.extname(file_path)),
|
36
35
|
content: parsed[:content],
|
37
|
-
document_type: determine_document_type(file_path),
|
36
|
+
document_type: parsed[:document_type] || determine_document_type(file_path),
|
38
37
|
metadata: parsed[:metadata] || {},
|
39
38
|
status: "processed",
|
40
39
|
file_modified_at: file_modified_at,
|
@@ -85,288 +84,44 @@ module Ragdoll
|
|
85
84
|
end
|
86
85
|
|
87
86
|
def parse
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
when ".docx"
|
92
|
-
parse_docx
|
93
|
-
when ".txt", ".md", ".markdown"
|
94
|
-
parse_text
|
95
|
-
when ".html", ".htm"
|
96
|
-
parse_html
|
97
|
-
when ".jpg", ".jpeg", ".png", ".gif", ".bmp", ".webp", ".svg", ".ico", ".tiff", ".tif"
|
98
|
-
parse_image
|
99
|
-
else
|
100
|
-
parse_text # Default to text parsing for unknown formats
|
87
|
+
# Check if file exists first
|
88
|
+
unless File.exist?(@file_path)
|
89
|
+
raise ParseError, "File does not exist: #{@file_path}"
|
101
90
|
end
|
102
|
-
rescue StandardError => e # StandardError => e
|
103
|
-
raise ParseError, "#{__LINE__} Failed to parse #{@file_path}: #{e.message}"
|
104
|
-
end
|
105
91
|
|
106
|
-
|
107
|
-
|
108
|
-
def parse_pdf
|
109
|
-
content = ""
|
110
|
-
metadata = {}
|
92
|
+
# Use the new unified document converter
|
93
|
+
document_type = determine_document_type(@file_path)
|
111
94
|
|
112
95
|
begin
|
113
|
-
|
114
|
-
|
115
|
-
if reader.info
|
116
|
-
metadata[:title] = reader.info[:Title] if reader.info[:Title]
|
117
|
-
metadata[:author] = reader.info[:Author] if reader.info[:Author]
|
118
|
-
metadata[:subject] = reader.info[:Subject] if reader.info[:Subject]
|
119
|
-
metadata[:creator] = reader.info[:Creator] if reader.info[:Creator]
|
120
|
-
metadata[:producer] = reader.info[:Producer] if reader.info[:Producer]
|
121
|
-
metadata[:creation_date] = reader.info[:CreationDate] if reader.info[:CreationDate]
|
122
|
-
metadata[:modification_date] = reader.info[:ModDate] if reader.info[:ModDate]
|
123
|
-
end
|
124
|
-
|
125
|
-
metadata[:page_count] = reader.page_count
|
126
|
-
|
127
|
-
# Extract text from all pages
|
128
|
-
reader.pages.each_with_index do |page, index|
|
129
|
-
page_text = page.text.strip
|
130
|
-
next if page_text.empty?
|
131
|
-
|
132
|
-
content += "\n\n--- Page #{index + 1} ---\n\n" if content.length.positive?
|
133
|
-
content += page_text
|
134
|
-
end
|
135
|
-
end
|
136
|
-
rescue PDF::Reader::MalformedPDFError => e
|
137
|
-
raise ParseError, "Malformed PDF: #{e.message}"
|
138
|
-
rescue PDF::Reader::UnsupportedFeatureError => e
|
139
|
-
raise ParseError, "Unsupported PDF feature: #{e.message}"
|
140
|
-
end
|
141
|
-
|
142
|
-
# Add filepath-based title as fallback if no title was found
|
143
|
-
if metadata[:title].nil? || (metadata[:title].is_a?(String) && metadata[:title].strip.empty?)
|
144
|
-
metadata[:title] = extract_title_from_filepath
|
145
|
-
end
|
146
|
-
|
147
|
-
{
|
148
|
-
content: content.strip,
|
149
|
-
metadata: metadata,
|
150
|
-
document_type: "pdf"
|
151
|
-
}
|
152
|
-
end
|
153
|
-
|
154
|
-
def parse_docx
|
155
|
-
content = ""
|
156
|
-
metadata = {}
|
157
|
-
|
158
|
-
begin
|
159
|
-
doc = Docx::Document.open(@file_path)
|
160
|
-
|
161
|
-
# Extract core properties
|
162
|
-
if doc.core_properties
|
163
|
-
metadata[:title] = doc.core_properties.title if doc.core_properties.title
|
164
|
-
metadata[:author] = doc.core_properties.creator if doc.core_properties.creator
|
165
|
-
metadata[:subject] = doc.core_properties.subject if doc.core_properties.subject
|
166
|
-
metadata[:description] = doc.core_properties.description if doc.core_properties.description
|
167
|
-
metadata[:keywords] = doc.core_properties.keywords if doc.core_properties.keywords
|
168
|
-
metadata[:created] = doc.core_properties.created if doc.core_properties.created
|
169
|
-
metadata[:modified] = doc.core_properties.modified if doc.core_properties.modified
|
170
|
-
if doc.core_properties.last_modified_by
|
171
|
-
metadata[:last_modified_by] =
|
172
|
-
doc.core_properties.last_modified_by
|
173
|
-
end
|
174
|
-
end
|
175
|
-
|
176
|
-
# Extract text from paragraphs
|
177
|
-
doc.paragraphs.each do |paragraph|
|
178
|
-
paragraph_text = paragraph.text.strip
|
179
|
-
next if paragraph_text.empty?
|
180
|
-
|
181
|
-
content += "#{paragraph_text}\n\n"
|
182
|
-
end
|
183
|
-
|
184
|
-
# Extract text from tables
|
185
|
-
doc.tables.each_with_index do |table, table_index|
|
186
|
-
content += "\n--- Table #{table_index + 1} ---\n\n"
|
187
|
-
|
188
|
-
table.rows.each do |row|
|
189
|
-
row_text = row.cells.map(&:text).join(" | ")
|
190
|
-
content += "#{row_text}\n" unless row_text.strip.empty?
|
191
|
-
end
|
192
|
-
|
193
|
-
content += "\n"
|
194
|
-
end
|
195
|
-
|
196
|
-
metadata[:paragraph_count] = doc.paragraphs.count
|
197
|
-
metadata[:table_count] = doc.tables.count
|
198
|
-
rescue StandardError => e # StandardError => e
|
199
|
-
raise ParseError, "#{__LINE__} Failed to parse DOCX: #{e.message}"
|
200
|
-
end
|
201
|
-
|
202
|
-
# Add filepath-based title as fallback if no title was found
|
203
|
-
if metadata[:title].nil? || (metadata[:title].is_a?(String) && metadata[:title].strip.empty?)
|
204
|
-
metadata[:title] = extract_title_from_filepath
|
205
|
-
end
|
206
|
-
|
207
|
-
{
|
208
|
-
content: content.strip,
|
209
|
-
metadata: metadata,
|
210
|
-
document_type: "docx"
|
211
|
-
}
|
212
|
-
end
|
213
|
-
|
214
|
-
def parse_text
|
215
|
-
content = File.read(@file_path, encoding: "UTF-8")
|
216
|
-
metadata = {
|
217
|
-
file_size: File.size(@file_path),
|
218
|
-
encoding: "UTF-8"
|
219
|
-
}
|
96
|
+
# Convert to text using the unified pipeline
|
97
|
+
text_content = Ragdoll::DocumentConverter.convert_to_text(@file_path, document_type)
|
220
98
|
|
221
|
-
|
222
|
-
|
223
|
-
when ".txt" then "text"
|
224
|
-
else "text"
|
225
|
-
end
|
226
|
-
|
227
|
-
# Parse YAML front matter for markdown files
|
228
|
-
if document_type == "markdown" && content.start_with?("---\n")
|
229
|
-
front_matter, body_content = parse_yaml_front_matter(content)
|
230
|
-
if front_matter
|
231
|
-
metadata.merge!(front_matter)
|
232
|
-
content = body_content
|
233
|
-
end
|
234
|
-
end
|
235
|
-
|
236
|
-
# Add filepath-based title as fallback if no title was found
|
237
|
-
if metadata[:title].nil? || (metadata[:title].is_a?(String) && metadata[:title].strip.empty?)
|
238
|
-
metadata[:title] = extract_title_from_filepath
|
239
|
-
end
|
240
|
-
|
241
|
-
{
|
242
|
-
content: content,
|
243
|
-
metadata: metadata,
|
244
|
-
document_type: document_type
|
245
|
-
}
|
246
|
-
rescue Encoding::InvalidByteSequenceError
|
247
|
-
# Try with different encoding
|
248
|
-
content = File.read(@file_path, encoding: "ISO-8859-1")
|
249
|
-
metadata = {
|
250
|
-
file_size: File.size(@file_path),
|
251
|
-
encoding: "ISO-8859-1"
|
252
|
-
}
|
99
|
+
# Extract metadata based on document type
|
100
|
+
metadata = extract_metadata_for_type(document_type)
|
253
101
|
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
metadata.merge!(front_matter)
|
259
|
-
content = body_content
|
102
|
+
# Add encoding information for text files
|
103
|
+
if %w[text markdown html].include?(document_type)
|
104
|
+
encoding = detect_file_encoding(@file_path) || "UTF-8"
|
105
|
+
metadata[:encoding] = encoding
|
260
106
|
end
|
261
|
-
end
|
262
107
|
|
263
|
-
|
264
|
-
|
265
|
-
|
108
|
+
# Get title from metadata or filename
|
109
|
+
title = metadata[:title] || extract_title_from_filepath
|
110
|
+
|
111
|
+
{
|
112
|
+
content: text_content,
|
113
|
+
metadata: metadata,
|
114
|
+
title: title,
|
115
|
+
document_type: document_type
|
116
|
+
}
|
117
|
+
rescue StandardError => e
|
118
|
+
raise ParseError, "Failed to parse document: #{e.message}"
|
266
119
|
end
|
267
|
-
|
268
|
-
{
|
269
|
-
content: content,
|
270
|
-
metadata: metadata,
|
271
|
-
document_type: document_type.nil? ? "text" : document_type
|
272
|
-
}
|
273
|
-
end
|
274
|
-
|
275
|
-
def parse_html
|
276
|
-
content = File.read(@file_path, encoding: "UTF-8")
|
277
|
-
|
278
|
-
# Extract title from H1 tag if present
|
279
|
-
h1_match = content.match(%r{<h1[^>]*>(.*?)</h1>}mi)
|
280
|
-
title = nil
|
281
|
-
if h1_match
|
282
|
-
# Clean up the H1 content by removing any HTML tags and normalizing whitespace
|
283
|
-
title = h1_match[1]
|
284
|
-
.gsub(/<[^>]+>/, " ") # Remove any nested HTML tags
|
285
|
-
.gsub(/\s+/, " ") # Normalize whitespace
|
286
|
-
.strip
|
287
|
-
end
|
288
|
-
|
289
|
-
# Basic HTML tag stripping (for more advanced parsing, consider using Nokogiri)
|
290
|
-
clean_content = content
|
291
|
-
.gsub(%r{<script[^>]*>.*?</script>}mi, "") # Remove script tags
|
292
|
-
.gsub(%r{<style[^>]*>.*?</style>}mi, "") # Remove style tags
|
293
|
-
.gsub(/<[^>]+>/, " ") # Remove all HTML tags
|
294
|
-
.gsub(/\s+/, " ") # Normalize whitespace
|
295
|
-
.strip
|
296
|
-
|
297
|
-
metadata = {
|
298
|
-
file_size: File.size(@file_path),
|
299
|
-
original_format: "html"
|
300
|
-
}
|
301
|
-
|
302
|
-
# Add title to metadata if found, otherwise use filepath fallback
|
303
|
-
if title && !title.empty?
|
304
|
-
metadata[:title] = title
|
305
|
-
else
|
306
|
-
metadata[:title] = extract_title_from_filepath
|
307
|
-
end
|
308
|
-
|
309
|
-
{
|
310
|
-
content: clean_content,
|
311
|
-
metadata: metadata,
|
312
|
-
document_type: "html"
|
313
|
-
}
|
314
|
-
end
|
315
|
-
|
316
|
-
def parse_image
|
317
|
-
puts "🖼️ DocumentProcessor: Starting image parsing for #{@file_path}"
|
318
|
-
|
319
|
-
metadata = {
|
320
|
-
file_size: File.size(@file_path),
|
321
|
-
file_type: @file_extension.sub(".", ""),
|
322
|
-
original_filename: File.basename(@file_path)
|
323
|
-
}
|
324
|
-
|
325
|
-
# Extract image dimensions
|
326
|
-
begin
|
327
|
-
img = Magick::Image.read(@file_path).first
|
328
|
-
metadata[:width] = img.columns
|
329
|
-
metadata[:height] = img.rows
|
330
|
-
puts "📏 DocumentProcessor: Image dimensions: #{img.columns}x#{img.rows}"
|
331
|
-
rescue StandardError => e # StandardError
|
332
|
-
puts "❌ DocumentProcessor: Failed to get image dimensions: #{e.message}"
|
333
|
-
metadata[:width] = nil
|
334
|
-
metadata[:height] = nil
|
335
|
-
end
|
336
|
-
|
337
|
-
puts "🤖 DocumentProcessor: Creating ImageDescriptionService and calling generate_description..."
|
338
|
-
desc = Ragdoll::ImageDescriptionService.new.generate_description(@file_path)
|
339
|
-
|
340
|
-
puts "📝 DocumentProcessor: Received description: '#{desc}'"
|
341
|
-
|
342
|
-
metadata[:description] = desc if desc && !desc.empty?
|
343
|
-
|
344
|
-
# Use AI-generated description or fallback placeholder
|
345
|
-
content = desc && !desc.empty? ? desc : "Image file: #{File.basename(@file_path)}"
|
346
|
-
|
347
|
-
# Add filepath-based title as fallback
|
348
|
-
metadata[:title] = extract_title_from_filepath
|
349
|
-
|
350
|
-
puts "✅ DocumentProcessor: Image parsing complete. Content: '#{content[0..100]}...'"
|
351
|
-
|
352
|
-
{
|
353
|
-
content: content,
|
354
|
-
metadata: metadata,
|
355
|
-
document_type: "image"
|
356
|
-
}
|
357
120
|
end
|
358
121
|
|
359
122
|
# Helper methods for document type determination
|
360
123
|
def self.determine_document_type(file_path)
|
361
|
-
|
362
|
-
when ".pdf" then "pdf"
|
363
|
-
when ".docx" then "docx"
|
364
|
-
when ".txt" then "text"
|
365
|
-
when ".md", ".markdown" then "markdown"
|
366
|
-
when ".html", ".htm" then "html"
|
367
|
-
when ".jpg", ".jpeg", ".png", ".gif", ".bmp", ".webp", ".svg", ".ico", ".tiff", ".tif" then "image"
|
368
|
-
else "text"
|
369
|
-
end
|
124
|
+
Ragdoll::DocumentConverter.new.determine_document_type(file_path)
|
370
125
|
end
|
371
126
|
|
372
127
|
def self.determine_document_type_from_content_type(content_type)
|
@@ -377,6 +132,8 @@ module Ragdoll
|
|
377
132
|
when "text/markdown" then "markdown"
|
378
133
|
when "text/html" then "html"
|
379
134
|
when %r{^image/} then "image"
|
135
|
+
when %r{^audio/} then "audio"
|
136
|
+
when %r{^video/} then "video"
|
380
137
|
else "text"
|
381
138
|
end
|
382
139
|
end
|
@@ -396,70 +153,215 @@ module Ragdoll
|
|
396
153
|
when ".svg" then "image/svg+xml"
|
397
154
|
when ".ico" then "image/x-icon"
|
398
155
|
when ".tiff", ".tif" then "image/tiff"
|
156
|
+
when ".mp3" then "audio/mpeg"
|
157
|
+
when ".wav" then "audio/wav"
|
158
|
+
when ".m4a" then "audio/mp4"
|
159
|
+
when ".flac" then "audio/flac"
|
160
|
+
when ".ogg" then "audio/ogg"
|
161
|
+
when ".mp4" then "video/mp4"
|
162
|
+
when ".mov" then "video/quicktime"
|
163
|
+
when ".avi" then "video/x-msvideo"
|
164
|
+
when ".webm" then "video/webm"
|
399
165
|
else "application/octet-stream"
|
400
166
|
end
|
401
167
|
end
|
402
168
|
|
403
169
|
private
|
404
170
|
|
171
|
+
def determine_document_type(file_path)
|
172
|
+
Ragdoll::DocumentConverter.new.determine_document_type(file_path)
|
173
|
+
end
|
174
|
+
|
175
|
+
def extract_metadata_for_type(document_type)
|
176
|
+
metadata = basic_file_metadata
|
177
|
+
|
178
|
+
case document_type
|
179
|
+
when "pdf"
|
180
|
+
metadata.merge!(extract_pdf_metadata)
|
181
|
+
when "docx"
|
182
|
+
metadata.merge!(extract_docx_metadata)
|
183
|
+
when "image"
|
184
|
+
metadata.merge!(extract_image_metadata)
|
185
|
+
when "audio"
|
186
|
+
metadata.merge!(extract_audio_metadata)
|
187
|
+
when "video"
|
188
|
+
metadata.merge!(extract_video_metadata)
|
189
|
+
end
|
190
|
+
|
191
|
+
metadata
|
192
|
+
end
|
193
|
+
|
194
|
+
def basic_file_metadata
|
195
|
+
metadata = {}
|
196
|
+
|
197
|
+
if File.exist?(@file_path)
|
198
|
+
metadata[:file_size] = File.size(@file_path)
|
199
|
+
metadata[:file_hash] = calculate_file_hash(@file_path)
|
200
|
+
metadata[:file_modified_at] = File.mtime(@file_path)
|
201
|
+
end
|
202
|
+
|
203
|
+
metadata[:original_filename] = File.basename(@file_path)
|
204
|
+
metadata[:file_extension] = File.extname(@file_path).downcase
|
205
|
+
metadata
|
206
|
+
end
|
207
|
+
|
208
|
+
def extract_pdf_metadata
|
209
|
+
return {} unless File.exist?(@file_path)
|
210
|
+
|
211
|
+
begin
|
212
|
+
metadata = {}
|
213
|
+
PDF::Reader.open(@file_path) do |reader|
|
214
|
+
if reader.info
|
215
|
+
metadata[:pdf_title] = reader.info[:Title] if reader.info[:Title]
|
216
|
+
metadata[:pdf_author] = reader.info[:Author] if reader.info[:Author]
|
217
|
+
metadata[:pdf_subject] = reader.info[:Subject] if reader.info[:Subject]
|
218
|
+
metadata[:pdf_creator] = reader.info[:Creator] if reader.info[:Creator]
|
219
|
+
metadata[:pdf_producer] = reader.info[:Producer] if reader.info[:Producer]
|
220
|
+
metadata[:pdf_creation_date] = reader.info[:CreationDate] if reader.info[:CreationDate]
|
221
|
+
metadata[:pdf_modification_date] = reader.info[:ModDate] if reader.info[:ModDate]
|
222
|
+
end
|
223
|
+
metadata[:page_count] = reader.page_count
|
224
|
+
end
|
225
|
+
|
226
|
+
# Use PDF title as main title if available
|
227
|
+
metadata[:title] = metadata[:pdf_title] if metadata[:pdf_title]
|
228
|
+
metadata
|
229
|
+
rescue StandardError => e
|
230
|
+
puts "Warning: Failed to extract PDF metadata: #{e.message}"
|
231
|
+
{}
|
232
|
+
end
|
233
|
+
end
|
234
|
+
|
235
|
+
def extract_docx_metadata
|
236
|
+
return {} unless File.exist?(@file_path)
|
237
|
+
|
238
|
+
begin
|
239
|
+
metadata = {}
|
240
|
+
doc = Docx::Document.open(@file_path)
|
241
|
+
|
242
|
+
if doc.core_properties
|
243
|
+
metadata[:docx_title] = doc.core_properties.title if doc.core_properties.title
|
244
|
+
metadata[:docx_author] = doc.core_properties.creator if doc.core_properties.creator
|
245
|
+
metadata[:docx_subject] = doc.core_properties.subject if doc.core_properties.subject
|
246
|
+
metadata[:docx_description] = doc.core_properties.description if doc.core_properties.description
|
247
|
+
metadata[:docx_keywords] = doc.core_properties.keywords if doc.core_properties.keywords
|
248
|
+
metadata[:docx_created] = doc.core_properties.created if doc.core_properties.created
|
249
|
+
metadata[:docx_modified] = doc.core_properties.modified if doc.core_properties.modified
|
250
|
+
metadata[:docx_last_modified_by] = doc.core_properties.last_modified_by if doc.core_properties.last_modified_by
|
251
|
+
end
|
252
|
+
|
253
|
+
metadata[:paragraph_count] = doc.paragraphs.count
|
254
|
+
metadata[:table_count] = doc.tables.count
|
255
|
+
|
256
|
+
# Use DOCX title as main title if available
|
257
|
+
metadata[:title] = metadata[:docx_title] if metadata[:docx_title]
|
258
|
+
metadata
|
259
|
+
rescue StandardError => e
|
260
|
+
puts "Warning: Failed to extract DOCX metadata: #{e.message}"
|
261
|
+
{}
|
262
|
+
end
|
263
|
+
end
|
264
|
+
|
265
|
+
def extract_image_metadata
|
266
|
+
return {} unless File.exist?(@file_path)
|
267
|
+
|
268
|
+
begin
|
269
|
+
metadata = {}
|
270
|
+
img = Magick::Image.read(@file_path).first
|
271
|
+
|
272
|
+
metadata[:width] = img.columns
|
273
|
+
metadata[:height] = img.rows
|
274
|
+
metadata[:image_format] = img.format
|
275
|
+
metadata[:mime_type] = img.mime_type
|
276
|
+
metadata[:number_colors] = img.number_colors
|
277
|
+
|
278
|
+
metadata
|
279
|
+
rescue StandardError => e
|
280
|
+
puts "Warning: Failed to extract image metadata: #{e.message}"
|
281
|
+
{}
|
282
|
+
end
|
283
|
+
end
|
284
|
+
|
285
|
+
def extract_audio_metadata
|
286
|
+
# Basic audio file metadata
|
287
|
+
# In production, you might use audio analysis libraries
|
288
|
+
{
|
289
|
+
media_type: "audio",
|
290
|
+
file_type: File.extname(@file_path).sub(".", "")
|
291
|
+
}
|
292
|
+
end
|
293
|
+
|
294
|
+
def extract_video_metadata
|
295
|
+
# Basic video file metadata
|
296
|
+
# In production, you might use video analysis libraries
|
297
|
+
{
|
298
|
+
media_type: "video",
|
299
|
+
file_type: File.extname(@file_path).sub(".", "")
|
300
|
+
}
|
301
|
+
end
|
302
|
+
|
405
303
|
# Extract a meaningful title from the file path as a fallback
|
406
|
-
# @param file_path [String] the full file path
|
407
|
-
# @return [String] a cleaned title derived from the filename
|
408
304
|
def extract_title_from_filepath(file_path = @file_path)
|
409
305
|
filename = File.basename(file_path, File.extname(file_path))
|
410
|
-
|
306
|
+
|
411
307
|
# Clean up common patterns in filenames to make them more readable
|
412
308
|
title = filename
|
413
309
|
.gsub(/[-_]+/, ' ') # Replace hyphens and underscores with spaces
|
414
310
|
.gsub(/([a-z])([A-Z])/, '\1 \2') # Add space before capital letters (camelCase)
|
415
311
|
.gsub(/\s+/, ' ') # Normalize multiple spaces
|
416
312
|
.strip
|
417
|
-
|
313
|
+
|
418
314
|
# Capitalize words for better readability
|
419
315
|
title.split(' ').map(&:capitalize).join(' ')
|
420
316
|
end
|
421
317
|
|
422
|
-
#
|
423
|
-
|
424
|
-
|
425
|
-
|
426
|
-
|
427
|
-
|
428
|
-
|
429
|
-
|
430
|
-
|
431
|
-
closing_index = nil
|
432
|
-
|
433
|
-
lines.each_with_index do |line, index|
|
434
|
-
next if index == 0 # Skip the opening ---
|
435
|
-
if line.strip == "---"
|
436
|
-
closing_index = index
|
437
|
-
break
|
438
|
-
end
|
439
|
-
end
|
318
|
+
# Calculate SHA256 hash of file content for duplicate detection
|
319
|
+
def calculate_file_hash(file_path)
|
320
|
+
require 'digest'
|
321
|
+
Digest::SHA256.file(file_path).hexdigest
|
322
|
+
rescue StandardError => e
|
323
|
+
Rails.logger.warn "Failed to calculate file hash for #{file_path}: #{e.message}" if defined?(Rails)
|
324
|
+
puts "Warning: Failed to calculate file hash for #{file_path}: #{e.message}"
|
325
|
+
nil
|
326
|
+
end
|
440
327
|
|
441
|
-
|
442
|
-
|
328
|
+
# Calculate SHA256 hash of text content for duplicate detection
|
329
|
+
def calculate_content_hash(content)
|
330
|
+
require 'digest'
|
331
|
+
Digest::SHA256.hexdigest(content)
|
332
|
+
rescue StandardError => e
|
333
|
+
Rails.logger.warn "Failed to calculate content hash: #{e.message}" if defined?(Rails)
|
334
|
+
puts "Warning: Failed to calculate content hash: #{e.message}"
|
335
|
+
nil
|
336
|
+
end
|
443
337
|
|
444
|
-
|
445
|
-
|
446
|
-
|
338
|
+
# Detect file encoding for text files
|
339
|
+
def detect_file_encoding(file_path)
|
340
|
+
return nil unless File.exist?(file_path)
|
447
341
|
|
448
|
-
|
449
|
-
|
342
|
+
# Read a sample to detect encoding
|
343
|
+
sample = File.read(file_path, 1000, encoding: 'ASCII-8BIT')
|
450
344
|
|
451
|
-
#
|
452
|
-
|
453
|
-
#
|
454
|
-
|
455
|
-
|
456
|
-
front_matter = front_matter.transform_keys(&:to_sym) if front_matter.is_a?(Hash)
|
457
|
-
[front_matter, body_content.strip]
|
458
|
-
rescue YAML::SyntaxError, Psych::DisallowedClass => e
|
459
|
-
# If YAML parsing fails, return original content
|
460
|
-
Rails.logger.warn "Warning: Failed to parse YAML front matter: #{e.message}" if defined?(Rails)
|
461
|
-
[nil, content]
|
345
|
+
# Check for common encodings
|
346
|
+
if sample.valid_encoding?
|
347
|
+
# Try to convert to UTF-8
|
348
|
+
utf8_content = sample.encode('UTF-8', invalid: :replace, undef: :replace)
|
349
|
+
return 'UTF-8' if utf8_content.valid_encoding?
|
462
350
|
end
|
351
|
+
|
352
|
+
# Try common encodings
|
353
|
+
['UTF-8', 'ISO-8859-1', 'Windows-1252'].each do |encoding|
|
354
|
+
begin
|
355
|
+
test_content = sample.force_encoding(encoding)
|
356
|
+
return encoding if test_content.valid_encoding?
|
357
|
+
rescue Encoding::InvalidByteSequenceError, Encoding::UndefinedConversionError
|
358
|
+
next
|
359
|
+
end
|
360
|
+
end
|
361
|
+
|
362
|
+
'UTF-8' # Default fallback
|
363
|
+
rescue StandardError
|
364
|
+
'UTF-8'
|
463
365
|
end
|
464
366
|
end
|
465
367
|
end
|