universal_document_processor 1.1.0 → 1.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +122 -1
- data/lib/universal_document_processor/ai_agent.rb +48 -49
- data/lib/universal_document_processor/document.rb +130 -13
- data/lib/universal_document_processor/processors/base_processor.rb +17 -0
- data/lib/universal_document_processor/processors/excel_processor.rb +30 -0
- data/lib/universal_document_processor/processors/pdf_processor.rb +21 -1
- data/lib/universal_document_processor/processors/text_processor.rb +21 -0
- data/lib/universal_document_processor/processors/word_processor.rb +30 -0
- data/lib/universal_document_processor/version.rb +1 -1
- data/lib/universal_document_processor.rb +10 -0
- metadata +1 -1
checksums.yaml
CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: '0612949a026d62fd8fd9c9c1372cfa70cdeb8bdd1677475be639cf35cd684f4c'
+  data.tar.gz: 82780d2c062034be663b3d21275e9d27addc1e44f5705de7dc6b23e70293216e
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 07a4fe1b792226dae8135e6620f455640d0ba137777b238052916d06a0b4f32b113414886479e0ad48b76ead57d9b7d6a577a76748dff926a9575ad124dc7ee5
+  data.tar.gz: fd3b8fb692f87755a657eb1270631c19ca99bcb5fcbb224e4ebcc22c1b19af1eb2aa3b2aed6ff3f94f84a6fdba8d5e255b28575d5fe1b074244e4bc160820f33
data/README.md
CHANGED

@@ -219,6 +219,33 @@ puts "Tables found: #{result[:tables].length}"
 full_text = result[:text_content]
 ```
 
+### Creating PDF Documents
+
+```ruby
+# Install Prawn for PDF creation (optional dependency)
+# gem install prawn
+
+# Create PDF from any supported document format
+pdf_path = UniversalDocumentProcessor.create_pdf('document.docx')
+puts "PDF created at: #{pdf_path}"
+
+# Or use the convert method
+pdf_path = UniversalDocumentProcessor.convert('spreadsheet.xlsx', :pdf)
+
+# Check if PDF creation is available
+if UniversalDocumentProcessor.pdf_creation_available?
+  puts "PDF creation is available!"
+else
+  puts "Install 'prawn' gem to enable PDF creation: gem install prawn"
+end
+
+# The created PDF includes:
+# - Document title and metadata
+# - Full text content with formatting
+# - Tables (if present in original document)
+# - File information and statistics
+```
+
 ### Processing Excel Spreadsheets
 
 ```ruby
@@ -413,6 +440,89 @@ summary = japanese_doc.ai_summarize(length: :medium)
 
 ```ruby
 # Custom AI agent configuration
+## ⚙️ Agentic AI Configuration & Usage
+
+To enable and use the AI-powered features (agentic AI) in your application, follow these steps:
+
+### 1. Install AI Dependency
+
+You need the `ruby-openai` gem for AI features:
+
+```bash
+gem install ruby-openai
+```
+
+Or add to your Gemfile:
+
+```ruby
+gem 'ruby-openai'
+```
+
+Then run:
+
+```bash
+bundle install
+```
+
+### 2. Set Your OpenAI API Key
+
+You must provide your OpenAI API key for agentic AI features to work. You can do this in two ways:
+
+#### a) Environment Variable (Recommended)
+
+Set the API key in your environment (e.g., in `.env`, `application.yml`, or your deployment environment):
+
+```ruby
+ENV['OPENAI_API_KEY'] = 'your-api-key-here'
+```
+
+#### b) Pass Directly When Creating the Agent
+
+```ruby
+agent = UniversalDocumentProcessor.create_ai_agent(api_key: 'your-api-key-here')
+```
+
+### 3. Rails: Where to Configure
+
+If you are using Rails, add your configuration to:
+
+`config/initializers/universal_document_processor.rb`
+
+Example initializer:
+
+```ruby
+# config/initializers/universal_document_processor.rb
+require 'universal_document_processor'
+
+# Set your API key (or use ENV)
+ENV['OPENAI_API_KEY'] ||= 'your-api-key-here' # (or use Rails credentials)
+
+# Optionally, create a default agent with custom options
+UniversalDocumentProcessor.create_ai_agent(
+  model: 'gpt-4',
+  temperature: 0.7,
+  max_history: 10
+)
+
+Rails.logger.info "Universal Document Processor with AI agent loaded" if defined?(Rails)
+```
+
+### 4. Using Agentic AI Features
+
+You can now use the AI-powered methods:
+
+```ruby
+summary = UniversalDocumentProcessor.ai_summarize('document.pdf', length: :short)
+insights = UniversalDocumentProcessor.ai_insights('document.pdf')
+classification = UniversalDocumentProcessor.ai_classify('document.pdf')
+key_info = UniversalDocumentProcessor.ai_extract_info('document.pdf', ['dates', 'names', 'amounts'])
+action_items = UniversalDocumentProcessor.ai_action_items('document.pdf')
+translation = UniversalDocumentProcessor.ai_translate('日本語文書.pdf', 'English')
+```
+
+Or create and use a persistent agent:
+
+```ruby
 agent = UniversalDocumentProcessor.create_ai_agent(
   api_key: 'your-openai-key', # OpenAI API key
   model: 'gpt-4', # Model to use (gpt-4, gpt-3.5-turbo)
@@ -420,6 +530,17 @@ agent = UniversalDocumentProcessor.create_ai_agent(
   max_history: 20, # Conversation memory length
   base_url: 'https://api.openai.com/v1' # Custom API endpoint
 )
+
+# Chat about a document
+response = agent.analyze_document('report.pdf')
+```
+
+---
+
+**Note:**
+- The API key is required for all AI features.
+- You can override the model, temperature, and other options per agent.
+- For more, see the `USER_GUIDE.md` and the examples above.
 ```
 
 ## 📦 Archive Processing (ZIP Creation & Extraction)
@@ -857,7 +978,7 @@ bundle exec rspec
 
 ## 📝 Changelog
 
-### Version 1.
+### Version 1.1.0
 - Initial release
 - Support for PDF, Word, Excel, PowerPoint, images, archives
 - Character validation and cleaning
data/lib/universal_document_processor/ai_agent.rb
CHANGED

@@ -14,16 +14,16 @@ module UniversalDocumentProcessor
       @max_history = options[:max_history] || 10
       @temperature = options[:temperature] || 0.7
       @ai_enabled = false
-
+
       validate_configuration
     end
 
     # Main document analysis with AI
     def analyze_document(document_result, query = nil)
       ensure_ai_available!
-
+
       context = build_document_context(document_result)
-
+
       if query
         # Specific query about the document
         analyze_with_query(context, query)

@@ -67,12 +67,12 @@ Please provide:
     # Ask specific questions about a document
     def ask_document_question(document_result, question)
       ensure_ai_available!
-
+
       context = build_document_context(document_result)
-
+
       prompt = build_question_prompt(context, question)
       response = call_openai_api(prompt)
-
+
       add_to_history(question, response)
       response
     end

@@ -80,19 +80,19 @@ Please provide:
     # Summarize document content
     def summarize_document(document_result, length: :medium)
       ensure_ai_available!
-
+
       context = build_document_context(document_result)
-
+
       length_instruction = case length
       when :short then "in 2-3 sentences"
       when :medium then "in 1-2 paragraphs"
       when :long then "in detail with key points"
       else "concisely"
       end
-
+
       prompt = build_summary_prompt(context, length_instruction)
       response = call_openai_api(prompt)
-
+
       add_to_history("Summarize document #{length_instruction}", response)
       response
     end

@@ -100,13 +100,13 @@ Please provide:
     # Extract key information from document
     def extract_key_information(document_result, categories = nil)
       ensure_ai_available!
-
+
       context = build_document_context(document_result)
       categories ||= ['key_facts', 'important_dates', 'names', 'locations', 'numbers']
-
+
       prompt = build_extraction_prompt(context, categories)
       response = call_openai_api(prompt)
-
+
       add_to_history("Extract key information: #{categories.join(', ')}", response)
       parse_extraction_response(response)
     end

@@ -114,12 +114,12 @@ Please provide:
     # Translate document content
     def translate_document(document_result, target_language)
       ensure_ai_available!
-
+
       context = build_document_context(document_result)
-
+
       prompt = build_translation_prompt(context, target_language)
       response = call_openai_api(prompt)
-
+
       add_to_history("Translate to #{target_language}", response)
       response
     end
@@ -127,12 +127,12 @@ Please provide:
     # Generate document insights and recommendations
     def generate_insights(document_result)
       ensure_ai_available!
-
+
       context = build_document_context(document_result)
-
+
       prompt = build_insights_prompt(context)
       response = call_openai_api(prompt)
-
+
       add_to_history("Generate insights", response)
       parse_insights_response(response)
     end

@@ -140,12 +140,12 @@ Please provide:
     # Compare multiple documents
     def compare_documents(document_results, comparison_type = :content)
       ensure_ai_available!
-
+
       contexts = document_results.map { |doc| build_document_context(doc) }
-
+
       prompt = build_comparison_prompt(contexts, comparison_type)
       response = call_openai_api(prompt)
-
+
       add_to_history("Compare documents (#{comparison_type})", response)
       response
     end

@@ -153,12 +153,12 @@ Please provide:
     # Classify document type and purpose
     def classify_document(document_result)
       ensure_ai_available!
-
+
       context = build_document_context(document_result)
-
+
       prompt = build_classification_prompt(context)
       response = call_openai_api(prompt)
-
+
       add_to_history("Classify document", response)
       parse_classification_response(response)
     end

@@ -166,12 +166,12 @@ Please provide:
    # Generate action items from document
     def extract_action_items(document_result)
       ensure_ai_available!
-
+
       context = build_document_context(document_result)
-
+
       prompt = build_action_items_prompt(context)
       response = call_openai_api(prompt)
-
+
       add_to_history("Extract action items", response)
       parse_action_items_response(response)
     end

@@ -179,14 +179,14 @@ Please provide:
     # Chat about the document
     def chat(message, document_result = nil)
       ensure_ai_available!
-
+
       if document_result
         context = build_document_context(document_result)
         prompt = build_chat_prompt(context, message)
       else
         prompt = build_general_chat_prompt(message)
       end
-
+
       response = call_openai_api(prompt)
       add_to_history(message, response)
       response

@@ -200,15 +200,15 @@ Please provide:
     # Get conversation summary
     def conversation_summary
       return "No conversation history" if @conversation_history.empty?
-
+
       unless @ai_enabled
         return "AI features are disabled. Cannot generate conversation summary."
       end
-
+
       history_text = @conversation_history.map do |entry|
         "Q: #{entry[:question]}\nA: #{entry[:answer]}"
       end.join("\n\n")
-
+
       prompt = "Summarize this conversation:\n\n#{history_text}"
       call_openai_api(prompt)
     end
@@ -247,13 +247,13 @@ Please provide:
         tables_count: document_result[:tables]&.length || 0,
         filename_info: document_result[:filename_info] || {}
       }
-
+
       # Add Japanese-specific information if available
       if context[:filename_info][:contains_japanese]
         context[:japanese_filename] = true
         context[:japanese_parts] = context[:filename_info][:japanese_parts]
       end
-
+
       context
     end
 

@@ -324,8 +324,7 @@ Please provide:
 
     def build_comparison_prompt(contexts, comparison_type)
      comparison_content = contexts.map.with_index do |context, index|
-        "Document #{index + 1}: #{context[:filename]}
-Content: #{truncate_content(context[:text_content], 1500)}"
+        "Document #{index + 1}: #{context[:filename]}\nContent: #{truncate_content(context[:text_content], 1500)}"
       end.join("\n\n---\n\n")
 
       "You are an AI analyst. Compare these documents focusing on #{comparison_type}:

@@ -404,15 +403,15 @@ Please respond helpfully."
 
     def call_openai_api(prompt)
       uri = URI("#{@base_url}/chat/completions")
-
+
       http = Net::HTTP.new(uri.host, uri.port)
       http.use_ssl = true
       http.read_timeout = 60
-
+
       request = Net::HTTP::Post.new(uri)
       request['Content-Type'] = 'application/json'
       request['Authorization'] = "Bearer #{@api_key}"
-
+
       request.body = {
         model: @model,
         messages: [

@@ -421,16 +420,16 @@ Please respond helpfully."
             content: "You are an intelligent document processing assistant with expertise in analyzing, summarizing, and extracting information from various document types. You support multiple languages including Japanese."
           },
           {
-            role: "user", 
+            role: "user",
             content: prompt
           }
         ],
         temperature: @temperature,
         max_tokens: 2000
       }.to_json
-
+
       response = http.request(request)
-
+
       if response.code.to_i == 200
         result = JSON.parse(response.body)
         result.dig('choices', 0, 'message', 'content') || "No response generated"

@@ -446,14 +445,14 @@ Please respond helpfully."
         answer: answer,
         timestamp: Time.now
       }
-
+
       # Keep only the most recent conversations
       @conversation_history = @conversation_history.last(@max_history) if @conversation_history.length > @max_history
     end
 
     def truncate_content(content, max_length)
       return "" unless content.is_a?(String)
-
+
       if content.length > max_length
         "#{content[0...max_length]}...\n\n[Content truncated for analysis]"
       else

@@ -463,16 +462,16 @@ Please respond helpfully."
 
     def format_file_size(bytes)
       return "0 B" if bytes == 0
-
+
       units = ['B', 'KB', 'MB', 'GB']
       size = bytes.to_f
       unit_index = 0
-
+
       while size >= 1024 && unit_index < units.length - 1
         size /= 1024
         unit_index += 1
       end
-
+
       "#{size.round(2)} #{units[unit_index]}"
     end
 

@@ -490,7 +489,7 @@ Please respond helpfully."
       rescue JSON::ParserError
         # Fall back to plain text response
       end
-
+
       response
     end
 
data/lib/universal_document_processor/document.rb
CHANGED

@@ -2,29 +2,62 @@ module UniversalDocumentProcessor
   class Document
     attr_reader :file_path, :content_type, :file_size, :options, :filename_validation
 
+    class LargeFileError < StandardError; end
+    class FileValidationError < StandardError; end
+    MAX_FILE_SIZE = 50 * 1024 * 1024 # 50 MB
+
     def initialize(file_path_or_io, options = {})
       @file_path = file_path_or_io.is_a?(String) ? normalize_file_path(file_path_or_io) : save_temp_file(file_path_or_io)
       @options = options
+      # 1. Check file existence and readability
+      unless File.exist?(@file_path) && File.readable?(@file_path)
+        raise FileValidationError, "File is missing or unreadable: #{@file_path}"
+      end
       @content_type = detect_content_type
       @file_size = File.size(@file_path)
+      # 2. Large file safeguard
+      if @file_size > MAX_FILE_SIZE
+        raise LargeFileError, "File size #{@file_size} exceeds maximum allowed (#{MAX_FILE_SIZE} bytes)"
+      end
      @filename_validation = validate_filename_encoding
+      # 3. Encoding validation and cleaning for text files
+      if @content_type =~ /text|plain/
+        validation = UniversalDocumentProcessor.validate_file(@file_path)
+        unless validation[:valid]
+          @cleaned_text_content = UniversalDocumentProcessor.clean_text(validation[:content], {
+            remove_null_bytes: true,
+            remove_control_chars: true,
+            normalize_whitespace: true
+          })
+        else
+          @cleaned_text_content = nil
+        end
+      end
     end
 
     def process
-
-
-
-
-
-
-
-
-
-
-
+      begin
+        {
+          file_path: @file_path,
+          content_type: @content_type,
+          file_size: @file_size,
+          text_content: extract_text,
+          metadata: metadata,
+          images: extract_images,
+          tables: extract_tables,
+          filename_info: filename_info,
+          processed_at: Time.current
+        }
+      rescue LargeFileError, FileValidationError => e
+        { error: e.class.name, message: e.message, file_path: @file_path }
+      rescue => e
+        { error: 'ProcessingError', message: e.message, file_path: @file_path }
+      end
     end
 
     def extract_text
+      # Use cleaned text if available (from encoding validation)
+      return @cleaned_text_content if defined?(@cleaned_text_content) && @cleaned_text_content
       processor.extract_text
     rescue => e
       fallback_text_extraction
@@ -253,13 +286,97 @@ module UniversalDocumentProcessor
     end
 
     def convert_to_pdf
-
-
+      ensure_prawn_available!
+
+      output_path = @file_path.gsub(File.extname(@file_path), '.pdf')
+
+      Prawn::Document.generate(output_path) do |pdf|
+        # Add title
+        pdf.font_size 18
+        pdf.text "Document: #{File.basename(@file_path)}", style: :bold
+        pdf.move_down 20
+
+        # Add metadata section
+        pdf.font_size 12
+        pdf.text "Document Information", style: :bold
+        pdf.move_down 10
+
+        metadata_info = metadata
+        pdf.text "File Size: #{format_file_size(@file_size)}"
+        pdf.text "Content Type: #{@content_type}"
+        pdf.text "Created: #{metadata_info[:created_at]}" if metadata_info[:created_at]
+        pdf.text "Modified: #{metadata_info[:modified_at]}" if metadata_info[:modified_at]
+        pdf.move_down 20
+
+        # Add content section
+        pdf.text "Content", style: :bold
+        pdf.move_down 10
+
+        text_content = extract_text
+        if text_content && !text_content.strip.empty?
+          pdf.font_size 10
+          pdf.text text_content
+        else
+          pdf.text "No text content available for this document."
+        end
+
+        # Add tables if available
+        tables = extract_tables
+        unless tables.empty?
+          pdf.start_new_page
+          pdf.font_size 12
+          pdf.text "Tables", style: :bold
+          pdf.move_down 10
+
+          tables.each_with_index do |table, index|
+            pdf.text "Table #{index + 1}", style: :bold
+            pdf.move_down 5
+
+            if table[:content] && !table[:content].empty?
+              # Format table data for Prawn
+              table_data = table[:content].first(20) # Limit to first 20 rows
+              pdf.table(table_data, header: true) do
+                row(0).font_style = :bold
+                cells.size = 8
+                cells.padding = 3
+              end
+            end
+            pdf.move_down 15
+          end
+        end
+      end
+
+      output_path
+    rescue => e
+      raise ProcessingError, "Failed to create PDF: #{e.message}"
     end
 
     def convert_to_html
       # Implementation for HTML conversion
       raise NotImplementedError, "HTML conversion not yet implemented"
     end
+
+    private
+
+    def ensure_prawn_available!
+      unless defined?(Prawn)
+        raise DependencyMissingError, "PDF creation requires the 'prawn' gem. Install it with: gem install prawn -v '~> 2.4'"
+      end
+    end
+
+    def format_file_size(bytes)
+      return "0 B" if bytes == 0
+
+      units = ['B', 'KB', 'MB', 'GB']
+      size = bytes.to_f
+      unit_index = 0
+
+      while size >= 1024 && unit_index < units.length - 1
+        size /= 1024
+        unit_index += 1
+      end
+
+      "#{size.round(2)} #{units[unit_index]}"
+    end
   end
 end
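Worth spelling out how the new safeguards in `document.rb` surface to callers. Below is a minimal caller-side sketch, not part of the diff, assuming the gem is loaded normally and `oversized.pdf` is a hypothetical local file; it uses only the classes and hash keys introduced above (`LargeFileError`, `FileValidationError`, and the `:error`/`:message` keys returned by `Document#process`).

```ruby
require 'universal_document_processor'

begin
  # Document.new now checks existence, readability, and the 50 MB cap up front
  doc = UniversalDocumentProcessor::Document.new('oversized.pdf')

  result = doc.process
  if result[:error]
    # Processing failures come back as an error hash instead of raising
    warn "#{result[:error]}: #{result[:message]}"
  else
    puts result[:text_content]
  end
rescue UniversalDocumentProcessor::Document::LargeFileError,
       UniversalDocumentProcessor::Document::FileValidationError => e
  # Raised by Document#initialize before any processing happens
  warn "Rejected: #{e.message}"
end
```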
data/lib/universal_document_processor/processors/base_processor.rb
CHANGED

@@ -3,6 +3,8 @@ module UniversalDocumentProcessor
    class BaseProcessor
      attr_reader :file_path, :options
 
+      MAX_FILE_SIZE = 50 * 1024 * 1024 # 50 MB
+
      def initialize(file_path, options = {})
        @file_path = file_path
        @options = options

@@ -11,6 +13,17 @@ module UniversalDocumentProcessor
      def extract_text
        # Fallback to universal text extraction
        if defined?(Yomu)
+          # Encoding validation for text files
+          if File.extname(@file_path) =~ /\.(txt|csv|tsv|md|json|xml|html|htm)$/i
+            validation = UniversalDocumentProcessor.validate_file(@file_path)
+            unless validation[:valid]
+              return UniversalDocumentProcessor.clean_text(validation[:content], {
+                remove_null_bytes: true,
+                remove_control_chars: true,
+                normalize_whitespace: true
+              })
+            end
+          end
          Yomu.new(@file_path).text
        else
          raise ProcessingError, "Universal text extraction requires the 'yomu' gem. Install it with: gem install yomu -v '~> 0.2'"

@@ -49,6 +62,10 @@ module UniversalDocumentProcessor
      def validate_file
        raise ProcessingError, "File not found: #{@file_path}" unless File.exist?(@file_path)
        raise ProcessingError, "File is empty: #{@file_path}" if File.zero?(@file_path)
+        # Large file safeguard
+        if File.size(@file_path) > MAX_FILE_SIZE
+          raise ProcessingError, "File size #{File.size(@file_path)} exceeds maximum allowed (#{MAX_FILE_SIZE} bytes)"
+        end
      end
 
      def with_error_handling
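The processors enforce the same 50 MB limit through their own `validate_file`, reporting it as a `ProcessingError`. A small sketch of what that looks like from the caller's side, assuming a hypothetical `big.txt` over the limit (the processor class and `ProcessingError` constant are taken from the hunks in this diff):

```ruby
require 'universal_document_processor'

processor = UniversalDocumentProcessor::Processors::TextProcessor.new('big.txt')

begin
  text = processor.extract_text # validate_file now runs before extraction
  puts text[0, 200]
rescue UniversalDocumentProcessor::ProcessingError => e
  # Missing, empty, or over-limit files are all reported here
  warn e.message
end
```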
data/lib/universal_document_processor/processors/excel_processor.rb
CHANGED

@@ -6,11 +6,32 @@ require 'csv'
 module UniversalDocumentProcessor
   module Processors
     class ExcelProcessor < BaseProcessor
+      MAX_FILE_SIZE = 50 * 1024 * 1024 # 50 MB
+
       def extract_text
+        validate_file
         with_error_handling do
           if @file_path.end_with?('.csv')
+            # Encoding validation for CSV
+            validation = UniversalDocumentProcessor.validate_file(@file_path)
+            unless validation[:valid]
+              return UniversalDocumentProcessor.clean_text(validation[:content], {
+                remove_null_bytes: true,
+                remove_control_chars: true,
+                normalize_whitespace: true
+              })
+            end
             extract_csv_text
           elsif @file_path.end_with?('.tsv')
+            # Encoding validation for TSV
+            validation = UniversalDocumentProcessor.validate_file(@file_path)
+            unless validation[:valid]
+              return UniversalDocumentProcessor.clean_text(validation[:content], {
+                remove_null_bytes: true,
+                remove_control_chars: true,
+                normalize_whitespace: true
+              })
+            end
             extract_tsv_text
           elsif @file_path.end_with?('.xlsx')
             extract_xlsx_text_builtin

@@ -208,6 +229,15 @@ module UniversalDocumentProcessor
 
       private
 
+      def validate_file
+        raise ProcessingError, "File not found: #{@file_path}" unless File.exist?(@file_path)
+        raise ProcessingError, "File is empty: #{@file_path}" if File.zero?(@file_path)
+        # Large file safeguard
+        if File.size(@file_path) > MAX_FILE_SIZE
+          raise ProcessingError, "File size #{File.size(@file_path)} exceeds maximum allowed (#{MAX_FILE_SIZE} bytes)"
+        end
+      end
+
       # CSV Processing Methods
       def extract_csv_text
         content = File.read(@file_path, encoding: 'UTF-8')
data/lib/universal_document_processor/processors/pdf_processor.rb
CHANGED

@@ -1,12 +1,23 @@
 module UniversalDocumentProcessor
   module Processors
     class PdfProcessor < BaseProcessor
+      MAX_FILE_SIZE = 50 * 1024 * 1024 # 50 MB
+
       def extract_text
         ensure_pdf_reader_available!
-
+        validate_file
         with_error_handling do
           reader = PDF::Reader.new(@file_path)
           text = reader.pages.map(&:text).join("\n")
+          # Encoding validation for extracted text
+          validation = UniversalDocumentProcessor.validate_file(@file_path)
+          unless validation[:valid]
+            return UniversalDocumentProcessor.clean_text(validation[:content], {
+              remove_null_bytes: true,
+              remove_control_chars: true,
+              normalize_whitespace: true
+            })
+          end
           text.strip.empty? ? "No text content found in PDF" : text
         end
       rescue => e

@@ -104,6 +115,15 @@ module UniversalDocumentProcessor
         end
       end
 
+      def validate_file
+        raise ProcessingError, "File not found: #{@file_path}" unless File.exist?(@file_path)
+        raise ProcessingError, "File is empty: #{@file_path}" if File.zero?(@file_path)
+        # Large file safeguard
+        if File.size(@file_path) > MAX_FILE_SIZE
+          raise ProcessingError, "File size #{File.size(@file_path)} exceeds maximum allowed (#{MAX_FILE_SIZE} bytes)"
+        end
+      end
+
      def extract_form_fields(reader)
        # Extract PDF form fields if present
        []
data/lib/universal_document_processor/processors/text_processor.rb
CHANGED

@@ -1,7 +1,10 @@
 module UniversalDocumentProcessor
   module Processors
     class TextProcessor < BaseProcessor
+      MAX_FILE_SIZE = 50 * 1024 * 1024 # 50 MB
+
       def extract_text
+        validate_file
         with_error_handling do
           case detect_text_format
           when :rtf

@@ -15,6 +18,15 @@ module UniversalDocumentProcessor
           when :json
             extract_json_text
           else
+            # Encoding validation for plain text
+            validation = UniversalDocumentProcessor.validate_file(@file_path)
+            unless validation[:valid]
+              return UniversalDocumentProcessor.clean_text(validation[:content], {
+                remove_null_bytes: true,
+                remove_control_chars: true,
+                normalize_whitespace: true
+              })
+            end
             extract_plain_text
           end
         end

@@ -81,6 +93,15 @@ module UniversalDocumentProcessor
 
       private
 
+      def validate_file
+        raise ProcessingError, "File not found: #{@file_path}" unless File.exist?(@file_path)
+        raise ProcessingError, "File is empty: #{@file_path}" if File.zero?(@file_path)
+        # Large file safeguard
+        if File.size(@file_path) > MAX_FILE_SIZE
+          raise ProcessingError, "File size #{File.size(@file_path)} exceeds maximum allowed (#{MAX_FILE_SIZE} bytes)"
+        end
+      end
+
       def detect_text_format
         extension = File.extname(@file_path).downcase
         case extension
data/lib/universal_document_processor/processors/word_processor.rb
CHANGED

@@ -1,11 +1,32 @@
 module UniversalDocumentProcessor
   module Processors
     class WordProcessor < BaseProcessor
+      MAX_FILE_SIZE = 50 * 1024 * 1024 # 50 MB
+
       def extract_text
+        validate_file
         with_error_handling do
           if @file_path.end_with?('.docx')
+            # Encoding validation for docx (if possible)
+            validation = UniversalDocumentProcessor.validate_file(@file_path)
+            unless validation[:valid]
+              return UniversalDocumentProcessor.clean_text(validation[:content], {
+                remove_null_bytes: true,
+                remove_control_chars: true,
+                normalize_whitespace: true
+              })
+            end
             extract_docx_text
           elsif @file_path.end_with?('.doc')
+            # Encoding validation for doc (if possible)
+            validation = UniversalDocumentProcessor.validate_file(@file_path)
+            unless validation[:valid]
+              return UniversalDocumentProcessor.clean_text(validation[:content], {
+                remove_null_bytes: true,
+                remove_control_chars: true,
+                normalize_whitespace: true
+              })
+            end
             # Built-in .doc file processing
             fallback_text_extraction
           else

@@ -90,6 +111,15 @@ module UniversalDocumentProcessor
 
      private
 
+      def validate_file
+        raise ProcessingError, "File not found: #{@file_path}" unless File.exist?(@file_path)
+        raise ProcessingError, "File is empty: #{@file_path}" if File.zero?(@file_path)
+        # Large file safeguard
+        if File.size(@file_path) > MAX_FILE_SIZE
+          raise ProcessingError, "File size #{File.size(@file_path)} exceeds maximum allowed (#{MAX_FILE_SIZE} bytes)"
+        end
+      end
+
      def ensure_docx_available!
        unless defined?(Docx)
          raise DependencyMissingError, "DOCX processing requires the 'docx' gem. Install it with: gem install docx -v '~> 0.8'"
data/lib/universal_document_processor.rb
CHANGED

@@ -206,6 +206,16 @@ module UniversalDocumentProcessor
     Document.new(file_path_or_io, options).convert_to(target_format)
   end
 
+  # Create PDF from any supported document
+  def self.create_pdf(file_path, options = {})
+    Document.new(file_path, options).convert_to(:pdf)
+  end
+
+  # Check if PDF creation is available
+  def self.pdf_creation_available?
+    defined?(Prawn)
+  end
+
   # Batch process multiple documents
   def self.batch_process(file_paths, options = {})
     file_paths.map do |file_path|