ragdoll 0.1.11 → 0.1.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,422 @@
1
+ # frozen_string_literal: true
2
+
3
require "date"
require "json"
require "yaml"

require "docx"
require "pdf-reader"
7
+
8
module Ragdoll
  # Extracts readable text from a file on disk.
  #
  # The document type is either supplied by the caller or inferred from the
  # file extension. Types without a dedicated extractor are read as plain text.
  #
  # Recognised types: "pdf", "docx", "text", "markdown", "html", "csv",
  # "json", "xml" and "yaml".
  class TextExtractionService
    # Raised when a file cannot be read or parsed by an extractor.
    class ExtractionError < StandardError; end

    # Lower-cased file extension => document type name.
    EXTENSION_TYPES = {
      ".pdf" => "pdf",
      ".docx" => "docx",
      ".txt" => "text",
      ".md" => "markdown",
      ".markdown" => "markdown",
      ".html" => "html",
      ".htm" => "html",
      ".csv" => "csv",
      ".json" => "json",
      ".xml" => "xml",
      ".yml" => "yaml",
      ".yaml" => "yaml"
    }.freeze
    private_constant :EXTENSION_TYPES

    # Convenience wrapper: builds a service and runs the extraction.
    #
    # @param file_path [String] path of the file to read
    # @param document_type [String, nil] explicit type; inferred from the
    #   extension when nil
    # @return [String] the extracted text
    # @raise [ExtractionError] when the file cannot be read or parsed
    def self.extract(file_path, document_type = nil)
      new(file_path, document_type).extract
    end

    # @param file_path [String] path of the file to read
    # @param document_type [String, nil] explicit type; inferred from the
    #   extension when nil
    def initialize(file_path, document_type = nil)
      @file_path = file_path
      # The extension must be known before the type can be inferred from it.
      @file_extension = File.extname(file_path).downcase
      @document_type = document_type || determine_document_type
    end

    # Runs the extractor matching the document type.
    #
    # @return [String] the extracted text
    # @raise [ExtractionError] when the file cannot be read or parsed
    def extract
      case @document_type
      when "pdf" then extract_from_pdf
      when "docx" then extract_from_docx
      when "html" then extract_from_html
      when "csv" then extract_from_csv
      when "json" then extract_from_json
      when "xml" then extract_from_xml
      when "yaml" then extract_from_yaml
      else extract_from_text # "text", "markdown" and any unrecognised type
      end
    end

    private

    # Infers the document type from the file extension ("text" when unknown).
    def determine_document_type
      EXTENSION_TYPES.fetch(@file_extension, "text")
    end

    # Reads the file and returns its content as a valid UTF-8 string.
    #
    # File.read only tags the bytes as UTF-8 and never validates them, so the
    # content is checked explicitly. Invalid content is reinterpreted as
    # ISO-8859-1 and transcoded to UTF-8.
    #
    # I/O errors (e.g. Errno::ENOENT) are not rescued here; each caller
    # decides how to report them.
    def read_file_as_utf8
      text = File.read(@file_path, encoding: "UTF-8")
      return text if text.valid_encoding?

      text.force_encoding("ISO-8859-1")
          .encode("UTF-8", invalid: :replace, undef: :replace, replace: "?")
    end

    # Concatenates the text of every non-empty page. A "--- Page N ---"
    # separator is inserted before each page after the first one emitted.
    def extract_from_pdf
      content = +""

      PDF::Reader.open(@file_path) do |reader|
        reader.pages.each_with_index do |page, index|
          page_text = page.text.strip
          next if page_text.empty?

          content << "\n\n--- Page #{index + 1} ---\n\n" unless content.empty?
          content << page_text
        end
      end

      content.strip
    rescue PDF::Reader::MalformedPDFError => e
      raise ExtractionError, "Malformed PDF: #{e.message}"
    rescue PDF::Reader::UnsupportedFeatureError => e
      raise ExtractionError, "Unsupported PDF feature: #{e.message}"
    end

    # Emits all non-empty paragraphs, followed by every table rendered as
    # " | "-separated rows under a "--- Table N ---" heading.
    def extract_from_docx
      doc = Docx::Document.open(@file_path)
      content = +""

      doc.paragraphs.each do |paragraph|
        paragraph_text = paragraph.text.strip
        content << "#{paragraph_text}\n\n" unless paragraph_text.empty?
      end

      doc.tables.each_with_index do |table, table_index|
        content << "\n--- Table #{table_index + 1} ---\n\n"

        table.rows.each do |row|
          row_text = row.cells.map(&:text).join(" | ")
          content << "#{row_text}\n" unless row_text.strip.empty?
        end

        content << "\n"
      end

      content.strip
    rescue StandardError => e
      raise ExtractionError, "Failed to parse DOCX: #{e.message}"
    end

    # Returns the file content. For markdown, a parsable YAML front matter
    # block is removed and only the body is returned.
    def extract_from_text
      begin
        content = read_file_as_utf8
      rescue Errno::ENOENT, Errno::EACCES => e
        raise ExtractionError, "Failed to read file #{@file_path}: #{e.message}"
      end

      if @document_type == "markdown" && content.start_with?("---\n")
        front_matter, body_content = parse_yaml_front_matter(content)
        content = body_content if front_matter
      end

      content
    end

    # Strips <script>/<style> elements and all remaining tags, then collapses
    # whitespace.
    #
    # NOTE: I/O errors are deliberately not wrapped in ExtractionError here,
    # so callers keep seeing the same exception types as before.
    def extract_from_html
      read_file_as_utf8
        .gsub(%r{<script[^>]*>.*?</script>}mi, "") # Remove script tags
        .gsub(%r{<style[^>]*>.*?</style>}mi, "")   # Remove style tags
        .gsub(/<[^>]+>/, " ")                      # Remove all HTML tags
        .gsub(/\s+/, " ")                          # Normalize whitespace
        .strip
    end

    # Splits a leading "---" delimited YAML block from the rest of the text.
    #
    # @param content [String] full document text
    # @return [Array] [front_matter, body]. front_matter is the parsed YAML
    #   (Hash keys symbolized where possible) and body is the stripped
    #   remainder. Returns [nil, content] when there is no closed front
    #   matter block or the YAML cannot be loaded safely.
    def parse_yaml_front_matter(content)
      return [nil, content] unless content.start_with?("---\n")

      lines = content.lines
      # Index 0 is the opening delimiter; look for the closing one after it.
      closing_index = (1...lines.length).find { |i| lines[i].strip == "---" }
      return [nil, content] unless closing_index

      yaml_content = lines[1...closing_index].join
      body_content = lines.drop(closing_index + 1).join

      front_matter = YAML.safe_load(yaml_content, permitted_classes: [Time, Date])
      if front_matter.is_a?(Hash)
        # YAML keys may be integers, nil, etc.; leave those untouched.
        front_matter = front_matter.transform_keys { |key| key.respond_to?(:to_sym) ? key.to_sym : key }
      end

      [front_matter, body_content.strip]
    rescue Psych::Exception
      # Covers syntax errors, disallowed classes and aliases: treat the
      # document as having no front matter rather than failing extraction.
      [nil, content]
    end

    # Renders each data row as "header: value" pairs, using the first
    # non-blank line as headers. Empty values and empty rows are skipped.
    def extract_from_csv
      # Simple CSV parsing without using the csv gem
      lines = read_file_as_utf8.lines.map(&:strip).reject(&:empty?)
      return "Empty CSV file" if lines.empty?
      return "CSV file with only headers" if lines.length == 1

      headers = parse_csv_line(lines.first)
      rows = lines.drop(1).map { |line| format_csv_row(headers, parse_csv_line(line)) }.compact
      return "CSV file with no readable data" if rows.empty?

      "CSV Data:\n#{rows.join("\n")}"
    rescue StandardError => e
      raise ExtractionError, "Failed to read CSV file: #{e.message}"
    end

    # Builds the "header: value, header: value" text for one row.
    # Returns nil when the row has no non-empty value under any header.
    def format_csv_row(headers, values)
      pairs = headers.each_with_index.map do |header, col_index|
        value = values[col_index].to_s
        "#{header}: #{value}" unless value.empty?
      end.compact

      pairs.join(", ") unless pairs.empty?
    end

    # Splits one CSV line into stripped fields. Handles double-quoted fields
    # (commas inside quotes are kept) and "" as an escaped quote; quoted
    # fields spanning several lines are not supported.
    def parse_csv_line(line)
      return [] if line.strip.empty?

      fields = []
      current_field = +""
      in_quotes = false

      i = 0
      while i < line.length
        char = line[i]

        if char == '"'
          if in_quotes && line[i + 1] == '"'
            # Escaped quote: emit one quote and skip its twin.
            current_field << '"'
            i += 1
          else
            in_quotes = !in_quotes
          end
        elsif char == "," && !in_quotes
          fields << current_field.strip
          current_field = +""
        else
          current_field << char
        end

        i += 1
      end

      fields << current_field.strip
    end

    # Parses the file as JSON and renders it as indented text.
    def extract_from_json
      convert_json_to_text(JSON.parse(read_file_as_utf8))
    rescue JSON::ParserError => e
      raise ExtractionError, "Invalid JSON: #{e.message}"
    rescue StandardError => e
      raise ExtractionError, "Failed to read JSON file: #{e.message}"
    end

    # Removes comments, processing instructions and tags, then collapses
    # whitespace.
    def extract_from_xml
      clean_content = read_file_as_utf8
                      .gsub(%r{<!--.*?-->}m, "") # Remove comments
                      .gsub(/<\?.*?\?>/m, "")    # Remove processing instructions
                      .gsub(/<[^>]+>/, " ")      # Remove all XML tags
                      .gsub(/\s+/, " ")          # Normalize whitespace
                      .strip

      return "XML document with no readable text content" if clean_content.empty?

      "XML Content:\n#{clean_content}"
    rescue StandardError => e
      raise ExtractionError, "Failed to read XML file: #{e.message}"
    end

    # Safely loads the file as YAML (only Time and Date are permitted beyond
    # the basic types) and renders it as indented text.
    def extract_from_yaml
      parsed_yaml = YAML.safe_load(read_file_as_utf8, permitted_classes: [Time, Date])
      convert_yaml_to_text(parsed_yaml)
    rescue YAML::SyntaxError => e
      raise ExtractionError, "Invalid YAML: #{e.message}"
    rescue Psych::DisallowedClass => e
      raise ExtractionError, "YAML contains disallowed class: #{e.message}"
    rescue StandardError => e
      raise ExtractionError, "Failed to read YAML file: #{e.message}"
    end

    # Recursively renders parsed JSON-like data as text.
    #
    # Hash entries become "key: value" lines and array items "- value" lines,
    # each prefixed with one space per nesting level. Strings longer than 100
    # characters are cut to their first 98 characters plus "...".
    #
    # @param obj [Object] parsed value (Hash, Array, String, scalar)
    # @param indent [Integer] current nesting depth
    # @return [String]
    def convert_json_to_text(obj, indent = 0)
      prefix = " " * indent

      case obj
      when Hash
        return "Empty object" if obj.empty?

        obj.map { |key, value| "#{prefix}#{key}: #{convert_json_to_text(value, indent + 1)}" }.join("\n")
      when Array
        return "Empty array" if obj.empty?

        obj.map { |item| "#{prefix}- #{convert_json_to_text(item, indent + 1)}" }.join("\n")
      when String
        obj.length > 100 ? "#{obj[0..97]}..." : obj
      else
        obj.to_s
      end
    end

    # YAML and JSON parse into the same kinds of structures, so the JSON
    # renderer is reused.
    def convert_yaml_to_text(obj, indent = 0)
      convert_json_to_text(obj, indent)
    end
  end
end