ragdoll 0.1.11 → 0.1.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +323 -384
- data/app/models/ragdoll/document.rb +1 -1
- data/app/models/ragdoll/unified_content.rb +216 -0
- data/app/models/ragdoll/unified_document.rb +338 -0
- data/app/services/ragdoll/audio_to_text_service.rb +200 -0
- data/app/services/ragdoll/document_converter.rb +216 -0
- data/app/services/ragdoll/document_processor.rb +197 -331
- data/app/services/ragdoll/image_to_text_service.rb +322 -0
- data/app/services/ragdoll/migration_service.rb +340 -0
- data/app/services/ragdoll/text_extraction_service.rb +422 -0
- data/app/services/ragdoll/unified_document_management.rb +300 -0
- data/db/migrate/20250923000001_create_ragdoll_unified_contents.rb +87 -0
- data/lib/ragdoll/core/version.rb +1 -1
- data/lib/ragdoll/core.rb +7 -0
- metadata +11 -2
@@ -0,0 +1,422 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "date"
require "json"
require "yaml"

require "docx"
require "pdf-reader"
|
7
|
+
|
8
|
+
module Ragdoll
|
9
|
+
class TextExtractionService
|
10
|
+
# Raised by the extractors in this class when a file cannot be read or parsed.
class ExtractionError < StandardError; end
|
11
|
+
|
12
|
+
# Convenience entry point: builds a service for +file_path+ and runs it.
#
# @param file_path [String] path of the file to extract text from
# @param document_type [String, nil] explicit type; forwarded unchanged to the constructor
# @return whatever the instance-level #extract returns
def self.extract(file_path, document_type = nil)
  service = new(file_path, document_type)
  service.extract
end
|
15
|
+
|
16
|
+
# Stores the path and resolves the document type.
#
# @param file_path [String] path of the file to extract text from
# @param document_type [String, nil] explicit type; when nil the type is
#   derived from the file extension via determine_document_type
def initialize(file_path, document_type = nil)
  @file_path = file_path
  # The extension must be assigned before determine_document_type runs,
  # because that method switches on @file_extension. Assigning it afterwards
  # made every auto-detected file fall through to "text".
  @file_extension = File.extname(file_path).downcase
  @document_type = document_type || determine_document_type
end
|
21
|
+
|
22
|
+
# Runs the extractor that matches @document_type and returns its result.
# Any type without a dedicated extractor is handled as plain text.
def extract
  extractor =
    case @document_type
    when "pdf"              then :extract_from_pdf
    when "docx"             then :extract_from_docx
    when "text", "markdown" then :extract_from_text
    when "html"             then :extract_from_html
    when "csv"              then :extract_from_csv
    when "json"             then :extract_from_json
    when "xml"              then :extract_from_xml
    when "yaml"             then :extract_from_yaml
    else                         :extract_from_text # Default fallback
    end

  send(extractor)
end
|
44
|
+
|
45
|
+
private
|
46
|
+
|
47
|
+
# Maps @file_extension (lower-cased, including the leading dot) to a
# document type name. Unrecognised or missing extensions map to "text".
#
# @return [String]
def determine_document_type
  type_by_extension = {
    ".pdf" => "pdf",
    ".docx" => "docx",
    ".txt" => "text",
    ".md" => "markdown",
    ".markdown" => "markdown",
    ".html" => "html",
    ".htm" => "html",
    ".csv" => "csv",
    ".json" => "json",
    ".xml" => "xml",
    ".yml" => "yaml",
    ".yaml" => "yaml"
  }

  type_by_extension.fetch(@file_extension, "text")
end
|
61
|
+
|
62
|
+
# Concatenates the text of every non-blank PDF page. Each page after the
# first emitted one is preceded by a "--- Page N ---" separator carrying its
# 1-based page number.
#
# @return [String] extracted text with surrounding whitespace removed
# @raise [ExtractionError] when pdf-reader reports a malformed PDF or an
#   unsupported feature
def extract_from_pdf
  buffer = +""

  begin
    PDF::Reader.open(@file_path) do |reader|
      reader.pages.each.with_index(1) do |page, page_number|
        text = page.text.strip
        next if text.empty?

        buffer << "\n\n--- Page #{page_number} ---\n\n" unless buffer.empty?
        buffer << text
      end
    end
  rescue PDF::Reader::MalformedPDFError => e
    raise ExtractionError, "Malformed PDF: #{e.message}"
  rescue PDF::Reader::UnsupportedFeatureError => e
    raise ExtractionError, "Unsupported PDF feature: #{e.message}"
  end

  buffer.strip
end
|
83
|
+
|
84
|
+
# Collects the text of a DOCX file: every non-blank paragraph (each followed
# by a blank line), then every table under a "--- Table N ---" heading with
# one line per row and cells joined by " | ".
#
# @return [String] extracted text with surrounding whitespace removed
# @raise [ExtractionError] when opening or walking the document fails
def extract_from_docx
  buffer = +""

  begin
    document = Docx::Document.open(@file_path)

    # Paragraph text first.
    document.paragraphs.each do |paragraph|
      text = paragraph.text.strip
      buffer << "#{text}\n\n" unless text.empty?
    end

    # Then each table, numbered from 1.
    document.tables.each.with_index(1) do |table, table_number|
      buffer << "\n--- Table #{table_number} ---\n\n"

      table.rows.each do |row|
        row_line = row.cells.map(&:text).join(" | ")
        buffer << "#{row_line}\n" unless row_line.strip.empty?
      end

      buffer << "\n"
    end
  rescue StandardError => e
    raise ExtractionError, "Failed to parse DOCX: #{e.message}"
  end

  buffer.strip
end
|
115
|
+
|
116
|
+
# Reads a plain-text or markdown file as UTF-8 text.
#
# File.read with an +encoding:+ option only tags the string; it does not
# validate the bytes, so invalid UTF-8 never raised and the ISO-8859-1
# fallback was unreachable. The bytes are now validated explicitly and, when
# they are not valid UTF-8, reinterpreted as ISO-8859-1 and transcoded.
#
# For markdown, a leading YAML front-matter block is removed when
# parse_yaml_front_matter recognises one.
#
# @return [String] the file content (markdown: the body without front matter)
# @raise [ExtractionError] when the file is missing or not readable
def extract_from_text
  begin
    content = File.read(@file_path, encoding: "UTF-8")

    unless content.valid_encoding?
      # Try with different encoding
      content = content.force_encoding("ISO-8859-1")
                       .encode("UTF-8", invalid: :replace, undef: :replace, replace: "?")
    end
  rescue Encoding::InvalidByteSequenceError, Encoding::UndefinedConversionError
    # Only reachable when Ruby transcodes on read (Encoding.default_internal set).
    content = File.read(@file_path, encoding: "ISO-8859-1")
                  .encode("UTF-8", invalid: :replace, undef: :replace, replace: "?")
  rescue Errno::ENOENT, Errno::EACCES => e
    raise ExtractionError, "Failed to read file #{@file_path}: #{e.message}"
  end

  # Parse YAML front matter for markdown files
  if @document_type == "markdown" && content.start_with?("---\n")
    front_matter, body_content = parse_yaml_front_matter(content)
    content = body_content if front_matter
  end

  content
end
|
135
|
+
|
136
|
+
# Extracts the visible text of an HTML file by removing <script>/<style>
# elements, replacing every remaining tag with a space and collapsing
# whitespace. HTML entities are left untouched.
#
# Fixes over the previous version:
# * invalid UTF-8 made gsub raise ArgumentError; such content is now
#   reinterpreted as ISO-8859-1 and transcoded, as the sibling extractors intend;
# * a missing/unreadable file is reported as ExtractionError, matching
#   extract_from_text;
# * the script/style patterns use \b so unrelated tags such as <scripture>
#   are not treated as script openers.
#
# @return [String] whitespace-normalised text
# @raise [ExtractionError] when the file is missing or not readable
def extract_from_html
  begin
    content = File.read(@file_path, encoding: "UTF-8")
  rescue Errno::ENOENT, Errno::EACCES => e
    raise ExtractionError, "Failed to read file #{@file_path}: #{e.message}"
  end

  unless content.valid_encoding?
    content = content.force_encoding("ISO-8859-1")
                     .encode("UTF-8", invalid: :replace, undef: :replace, replace: "?")
  end

  # Basic HTML tag stripping
  content
    .gsub(%r{<script\b[^>]*>.*?</script>}mi, "") # Remove script elements
    .gsub(%r{<style\b[^>]*>.*?</style>}mi, "")   # Remove style elements
    .gsub(/<[^>]+>/, " ")                        # Remove all remaining tags
    .gsub(/\s+/, " ")                            # Normalize whitespace
    .strip
end
|
149
|
+
|
150
|
+
# Splits a document into its YAML front matter and body.
#
# The front matter is the block between an opening "---" first line and the
# next line that is exactly "---" (ignoring surrounding whitespace).
#
# Fixes over the previous version:
# * keys without #to_sym (e.g. integer keys) no longer raise NoMethodError;
#   they are kept as they are;
# * an empty front-matter block yields {} instead of nil/false, so callers
#   that test the first element still strip the delimiters;
# * Psych::BadAlias (aliases are not enabled for safe_load) is treated like
#   any other unparsable front matter.
#
# @param content [String] full document text
# @return [Array] [front_matter, body]; front_matter is nil and body is the
#   untouched +content+ when no parsable front matter is found, otherwise the
#   parsed value (Hash keys symbolised where possible) and the stripped body
def parse_yaml_front_matter(content)
  return [nil, content] unless content.start_with?("---\n")

  lines = content.lines
  # Line 0 is the opening delimiter, so look for the closing one after it.
  offset = lines.drop(1).index { |line| line.strip == "---" }
  return [nil, content] unless offset

  closing_index = offset + 1
  yaml_content = lines[1...closing_index].join
  body_content = (lines[(closing_index + 1)..-1] || []).join

  front_matter = YAML.safe_load(yaml_content, permitted_classes: [Time, Date])
  front_matter ||= {} # empty YAML parses to nil (or false on older Psych)

  if front_matter.is_a?(Hash)
    front_matter = front_matter.transform_keys do |key|
      key.respond_to?(:to_sym) ? key.to_sym : key
    end
  end

  [front_matter, body_content.strip]
rescue YAML::SyntaxError, Psych::DisallowedClass, Psych::BadAlias
  [nil, content]
end
|
180
|
+
|
181
|
+
# Renders a CSV file as readable text: the first non-blank line supplies the
# headers and every following row becomes "header: value, header: value",
# skipping empty values and rows without any value.
#
# Parsing is line-based (via parse_csv_line), so quoted fields that span
# several lines are not supported.
#
# Fixes over the previous version:
# * File.readlines with an +encoding:+ option never raised an Encoding
#   error, so the ISO-8859-1 fallback was unreachable; the bytes are now
#   validated and transcoded explicitly;
# * the row formatting existed twice (once per encoding path) and is now
#   written once; the unused loop index is gone.
#
# @return [String] "CSV Data:\n..." or a short placeholder sentence for
#   empty, header-only or value-less files
# @raise [ExtractionError] when the file cannot be read or parsed
def extract_from_csv
  begin
    raw = File.read(@file_path, encoding: "UTF-8")
    unless raw.valid_encoding?
      raw = raw.force_encoding("ISO-8859-1")
               .encode("UTF-8", invalid: :replace, undef: :replace, replace: "?")
    end

    # Simple CSV parsing without using the csv gem
    lines = raw.lines.map(&:strip).reject(&:empty?)
    return "Empty CSV file" if lines.empty?

    # Assume first line is headers
    headers = parse_csv_line(lines.first)
    return "CSV file with only headers" if lines.length == 1

    rows = lines.drop(1).map do |line|
      values = parse_csv_line(line)
      next if values.all?(&:empty?)

      # zip pads missing trailing values with nil, which to_s turns into "".
      pairs = headers.zip(values).map do |header, value|
        "#{header}: #{value}" unless value.to_s.empty?
      end.compact

      pairs.join(", ") unless pairs.empty?
    end.compact
  rescue StandardError => e
    raise ExtractionError, "Failed to read CSV file: #{e.message}"
  end

  return "CSV file with no readable data" if rows.empty?

  "CSV Data:\n#{rows.join("\n")}"
end
|
252
|
+
|
253
|
+
# Splits one CSV line into fields with basic quoting support: commas inside
# double quotes belong to the field, and a doubled quote ("") inside a quoted
# section yields a literal quote. Each field is stripped of surrounding
# whitespace.
#
# @param line [String] a single CSV line
# @return [Array<String>] the fields; empty for a blank line
def parse_csv_line(line)
  return [] if line.strip.empty?

  fields = []
  buffer = +""
  quoted = false
  skip_next = false

  line.each_char.with_index do |char, position|
    # The second quote of an escaped pair was already consumed.
    if skip_next
      skip_next = false
      next
    end

    if char == '"'
      if quoted && line[position + 1] == '"'
        # Escaped quote
        buffer << '"'
        skip_next = true
      else
        # Toggle quote state
        quoted = !quoted
      end
    elsif char == "," && !quoted
      # End of field
      fields << buffer.strip
      buffer = +""
    else
      buffer << char
    end
  end

  # Add the last field
  fields << buffer.strip
end
|
294
|
+
|
295
|
+
# Parses a JSON file and renders it as readable text through
# convert_json_to_text.
#
# File.read with an +encoding:+ option never raised an Encoding error, so the
# previous ISO-8859-1 fallback branch could not run. The bytes are now
# validated explicitly and transcoded from ISO-8859-1 when they are not valid
# UTF-8, and the parse/convert step exists only once.
#
# @return [String] text rendering of the parsed document
# @raise [ExtractionError] on invalid JSON or when the file cannot be read
def extract_from_json
  content = File.read(@file_path, encoding: "UTF-8")
  unless content.valid_encoding?
    content = content.force_encoding("ISO-8859-1")
                     .encode("UTF-8", invalid: :replace, undef: :replace, replace: "?")
  end

  # Convert JSON to readable text format
  convert_json_to_text(JSON.parse(content))
rescue JSON::ParserError => e
  raise ExtractionError, "Invalid JSON: #{e.message}"
rescue StandardError => e
  raise ExtractionError, "Failed to read JSON file: #{e.message}"
end
|
318
|
+
|
319
|
+
# Extracts the text content of an XML file by dropping comments and
# processing instructions, unwrapping CDATA sections, replacing every tag
# with a space and collapsing whitespace. Entities are left untouched.
#
# Fixes over the previous version:
# * invalid UTF-8 made gsub raise ArgumentError, which surfaced as
#   "Failed to read XML file" instead of using the ISO-8859-1 fallback; the
#   bytes are now validated and transcoded explicitly;
# * the stripping pipeline existed twice and is now written once;
# * CDATA sections were removed together with their text by the tag pattern;
#   their content is now kept.
#
# @return [String] "XML Content:\n..." or a placeholder sentence when the
#   document holds no text
# @raise [ExtractionError] when the file cannot be read or processed
def extract_from_xml
  content = File.read(@file_path, encoding: "UTF-8")
  unless content.valid_encoding?
    content = content.force_encoding("ISO-8859-1")
                     .encode("UTF-8", invalid: :replace, undef: :replace, replace: "?")
  end

  # Basic XML text extraction - remove tags and normalize whitespace
  clean_content = content
                  .gsub(%r{<!--.*?-->}m, "")                # Remove comments
                  .gsub(/<\?.*?\?>/m, "")                   # Remove processing instructions
                  .gsub(/<!\[CDATA\[(.*?)\]\]>/m, '\1')     # Keep CDATA text, drop the wrapper
                  .gsub(/<[^>]+>/, " ")                     # Remove all XML tags
                  .gsub(/\s+/, " ")                         # Normalize whitespace
                  .strip

  if clean_content.empty?
    "XML document with no readable text content"
  else
    "XML Content:\n#{clean_content}"
  end
rescue StandardError => e
  raise ExtractionError, "Failed to read XML file: #{e.message}"
end
|
360
|
+
|
361
|
+
# Parses a YAML file with YAML.safe_load (only Time and Date permitted beyond
# the basic types) and renders it as readable text through
# convert_yaml_to_text.
#
# File.read with an +encoding:+ option never raised an Encoding error, so the
# previous ISO-8859-1 fallback branch could not run. The bytes are now
# validated explicitly and transcoded from ISO-8859-1 when they are not valid
# UTF-8, and the parse/convert step exists only once.
#
# @return [String] text rendering of the parsed document
# @raise [ExtractionError] on invalid YAML, a disallowed class, or when the
#   file cannot be read
def extract_from_yaml
  content = File.read(@file_path, encoding: "UTF-8")
  unless content.valid_encoding?
    content = content.force_encoding("ISO-8859-1")
                     .encode("UTF-8", invalid: :replace, undef: :replace, replace: "?")
  end

  # Convert YAML to readable text format
  convert_yaml_to_text(YAML.safe_load(content, permitted_classes: [Time, Date]))
rescue YAML::SyntaxError => e
  raise ExtractionError, "Invalid YAML: #{e.message}"
rescue Psych::DisallowedClass => e
  raise ExtractionError, "YAML contains disallowed class: #{e.message}"
rescue StandardError => e
  raise ExtractionError, "Failed to read YAML file: #{e.message}"
end
|
385
|
+
|
386
|
+
# Recursively renders parsed JSON-like data as text.
#
# Hashes become "key: value" lines and arrays become "- item" lines, each
# prefixed with one space per nesting level; nested values are rendered one
# level deeper and appended directly after the "key: " / "- " marker.
# Strings longer than 100 characters are cut to their first 98 characters
# plus "..."; every other value is rendered with #to_s.
#
# @param obj [Object] parsed value (Hash, Array, String, number, boolean, nil, ...)
# @param indent [Integer] current nesting level
# @return [String]
def convert_json_to_text(obj, indent = 0)
  padding = " " * indent
  deeper = indent + 1

  case obj
  when Hash
    return "Empty object" if obj.empty?

    obj.map { |key, value| "#{padding}#{key}: #{convert_json_to_text(value, deeper)}" }.join("\n")
  when Array
    return "Empty array" if obj.empty?

    obj.map { |item| "#{padding}- #{convert_json_to_text(item, deeper)}" }.join("\n")
  when String
    if obj.length > 100
      "#{obj[0..97]}..."
    else
      obj
    end
  else
    # Numbers, booleans, nil and anything else.
    obj.to_s
  end
end
|
416
|
+
|
417
|
+
# Renders parsed YAML data as text. Parsed YAML has the same shape as parsed
# JSON (hashes, arrays, scalars), so the JSON renderer is reused as is.
#
# @param obj [Object] parsed YAML value
# @param indent [Integer] current nesting level
# @return [String] the result of convert_json_to_text
def convert_yaml_to_text(obj, indent = 0)
  rendered = convert_json_to_text(obj, indent)
  rendered
end
|
421
|
+
end
|
422
|
+
end
|