universal_document_processor 1.0.5 → 1.1.1
This diff shows the changes between two publicly released versions of a package, as published to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in the respective public registry.
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -0
- data/README.md +237 -2
- data/lib/universal_document_processor/ai_agent.rb +48 -49
- data/lib/universal_document_processor/document.rb +130 -13
- data/lib/universal_document_processor/processors/archive_processor.rb +26 -0
- data/lib/universal_document_processor/processors/base_processor.rb +17 -0
- data/lib/universal_document_processor/processors/excel_processor.rb +30 -0
- data/lib/universal_document_processor/processors/pdf_processor.rb +21 -1
- data/lib/universal_document_processor/processors/text_processor.rb +21 -0
- data/lib/universal_document_processor/processors/word_processor.rb +30 -0
- data/lib/universal_document_processor/version.rb +1 -1
- data/lib/universal_document_processor.rb +10 -0
- metadata +1 -6
- data/debug_test.rb +0 -35
- data/test_ai_dependency.rb +0 -80
- data/test_core_functionality.rb +0 -280
- data/test_performance_memory.rb +0 -271
- data/test_published_gem.rb +0 -349
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: '0612949a026d62fd8fd9c9c1372cfa70cdeb8bdd1677475be639cf35cd684f4c'
|
|
4
|
+
data.tar.gz: 82780d2c062034be663b3d21275e9d27addc1e44f5705de7dc6b23e70293216e
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 07a4fe1b792226dae8135e6620f455640d0ba137777b238052916d06a0b4f32b113414886479e0ad48b76ead57d9b7d6a577a76748dff926a9575ad124dc7ee5
|
|
7
|
+
data.tar.gz: fd3b8fb692f87755a657eb1270631c19ca99bcb5fcbb224e4ebcc22c1b19af1eb2aa3b2aed6ff3f94f84a6fdba8d5e255b28575d5fe1b074244e4bc160820f33
|
data/CHANGELOG.md
CHANGED
|
@@ -7,6 +7,19 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|
|
7
7
|
|
|
8
8
|
## [Unreleased]
|
|
9
9
|
|
|
10
|
+
## [1.1.0] - 2025-01-XX
|
|
11
|
+
### Added
|
|
12
|
+
- **ZIP File Creation**: New functionality to create ZIP archives programmatically
|
|
13
|
+
- `ArchiveProcessor.create_zip()` class method for creating ZIP files
|
|
14
|
+
- Support for creating archives from individual files or entire directories
|
|
15
|
+
- Recursive directory archiving with proper path structure preservation
|
|
16
|
+
- Comprehensive test coverage with error handling
|
|
17
|
+
- Integration with existing archive processing capabilities
|
|
18
|
+
|
|
19
|
+
### Enhanced
|
|
20
|
+
- **ArchiveProcessor**: Extended with ZIP creation capabilities alongside existing extraction features
|
|
21
|
+
- **Archive Support**: Now supports both reading/extracting and creating ZIP archives
|
|
22
|
+
|
|
10
23
|
## [1.2.0] - 2024-01-15
|
|
11
24
|
### Added
|
|
12
25
|
- **TSV (Tab-Separated Values) File Support**: Complete built-in TSV processing capabilities
|
data/README.md
CHANGED
|
@@ -29,6 +29,7 @@ A comprehensive Ruby gem that provides unified document processing capabilities
|
|
|
29
29
|
- **Table Detection**: Structured data extraction
|
|
30
30
|
- **Character Validation**: Invalid character detection and cleaning
|
|
31
31
|
- **Multi-language Support**: Full Unicode support including Japanese (日本語)
|
|
32
|
+
- **Archive Creation**: Create ZIP files from individual files or directories
|
|
32
33
|
|
|
33
34
|
### **Character & Encoding Support**
|
|
34
35
|
- **Smart encoding detection** (UTF-8, Shift_JIS, EUC-JP, ISO-8859-1)
|
|
@@ -110,7 +111,7 @@ result = UniversalDocumentProcessor.process('document.pdf')
|
|
|
110
111
|
},
|
|
111
112
|
images: [...],
|
|
112
113
|
tables: [...],
|
|
113
|
-
processed_at:
|
|
114
|
+
processed_at: 2025-07-06 10:30:00 UTC
|
|
114
115
|
}
|
|
115
116
|
```
|
|
116
117
|
|
|
@@ -218,6 +219,33 @@ puts "Tables found: #{result[:tables].length}"
|
|
|
218
219
|
full_text = result[:text_content]
|
|
219
220
|
```
|
|
220
221
|
|
|
222
|
+
### Creating PDF Documents
|
|
223
|
+
|
|
224
|
+
```ruby
|
|
225
|
+
# Install Prawn for PDF creation (optional dependency)
|
|
226
|
+
# gem install prawn
|
|
227
|
+
|
|
228
|
+
# Create PDF from any supported document format
|
|
229
|
+
pdf_path = UniversalDocumentProcessor.create_pdf('document.docx')
|
|
230
|
+
puts "PDF created at: #{pdf_path}"
|
|
231
|
+
|
|
232
|
+
# Or use the convert method
|
|
233
|
+
pdf_path = UniversalDocumentProcessor.convert('spreadsheet.xlsx', :pdf)
|
|
234
|
+
|
|
235
|
+
# Check if PDF creation is available
|
|
236
|
+
if UniversalDocumentProcessor.pdf_creation_available?
|
|
237
|
+
puts "PDF creation is available!"
|
|
238
|
+
else
|
|
239
|
+
puts "Install 'prawn' gem to enable PDF creation: gem install prawn"
|
|
240
|
+
end
|
|
241
|
+
|
|
242
|
+
# The created PDF includes:
|
|
243
|
+
# - Document title and metadata
|
|
244
|
+
# - Full text content with formatting
|
|
245
|
+
# - Tables (if present in original document)
|
|
246
|
+
# - File information and statistics
|
|
247
|
+
```
|
|
248
|
+
|
|
221
249
|
### Processing Excel Spreadsheets
|
|
222
250
|
|
|
223
251
|
```ruby
|
|
@@ -412,6 +440,89 @@ summary = japanese_doc.ai_summarize(length: :medium)
|
|
|
412
440
|
|
|
413
441
|
```ruby
|
|
414
442
|
# Custom AI agent configuration
|
|
443
|
+
## ⚙️ Agentic AI Configuration & Usage
|
|
444
|
+
|
|
445
|
+
To enable and use the AI-powered features (agentic AI) in your application, follow these steps:
|
|
446
|
+
|
|
447
|
+
### 1. Install AI Dependency
|
|
448
|
+
|
|
449
|
+
You need the `ruby-openai` gem for AI features:
|
|
450
|
+
|
|
451
|
+
```bash
|
|
452
|
+
gem install ruby-openai
|
|
453
|
+
```
|
|
454
|
+
|
|
455
|
+
Or add to your Gemfile:
|
|
456
|
+
|
|
457
|
+
```ruby
|
|
458
|
+
gem 'ruby-openai'
|
|
459
|
+
```
|
|
460
|
+
|
|
461
|
+
Then run:
|
|
462
|
+
|
|
463
|
+
```bash
|
|
464
|
+
bundle install
|
|
465
|
+
```
|
|
466
|
+
|
|
467
|
+
### 2. Set Your OpenAI API Key
|
|
468
|
+
|
|
469
|
+
You must provide your OpenAI API key for agentic AI features to work. You can do this in two ways:
|
|
470
|
+
|
|
471
|
+
#### a) Environment Variable (Recommended)
|
|
472
|
+
|
|
473
|
+
Set the API key in your environment (e.g., in `.env`, `application.yml`, or your deployment environment):
|
|
474
|
+
|
|
475
|
+
```ruby
|
|
476
|
+
ENV['OPENAI_API_KEY'] = 'your-api-key-here'
|
|
477
|
+
```
|
|
478
|
+
|
|
479
|
+
#### b) Pass Directly When Creating the Agent
|
|
480
|
+
|
|
481
|
+
```ruby
|
|
482
|
+
agent = UniversalDocumentProcessor.create_ai_agent(api_key: 'your-api-key-here')
|
|
483
|
+
```
|
|
484
|
+
|
|
485
|
+
### 3. Rails: Where to Configure
|
|
486
|
+
|
|
487
|
+
If you are using Rails, add your configuration to:
|
|
488
|
+
|
|
489
|
+
`config/initializers/universal_document_processor.rb`
|
|
490
|
+
|
|
491
|
+
Example initializer:
|
|
492
|
+
|
|
493
|
+
```ruby
|
|
494
|
+
# config/initializers/universal_document_processor.rb
|
|
495
|
+
require 'universal_document_processor'
|
|
496
|
+
|
|
497
|
+
# Set your API key (or use ENV)
|
|
498
|
+
ENV['OPENAI_API_KEY'] ||= 'your-api-key-here' # (or use Rails credentials)
|
|
499
|
+
|
|
500
|
+
# Optionally, create a default agent with custom options
|
|
501
|
+
UniversalDocumentProcessor.create_ai_agent(
|
|
502
|
+
model: 'gpt-4',
|
|
503
|
+
temperature: 0.7,
|
|
504
|
+
max_history: 10
|
|
505
|
+
)
|
|
506
|
+
|
|
507
|
+
Rails.logger.info "Universal Document Processor with AI agent loaded" if defined?(Rails)
|
|
508
|
+
```
|
|
509
|
+
|
|
510
|
+
### 4. Using Agentic AI Features
|
|
511
|
+
|
|
512
|
+
You can now use the AI-powered methods:
|
|
513
|
+
|
|
514
|
+
```ruby
|
|
515
|
+
summary = UniversalDocumentProcessor.ai_summarize('document.pdf', length: :short)
|
|
516
|
+
insights = UniversalDocumentProcessor.ai_insights('document.pdf')
|
|
517
|
+
classification = UniversalDocumentProcessor.ai_classify('document.pdf')
|
|
518
|
+
key_info = UniversalDocumentProcessor.ai_extract_info('document.pdf', ['dates', 'names', 'amounts'])
|
|
519
|
+
action_items = UniversalDocumentProcessor.ai_action_items('document.pdf')
|
|
520
|
+
translation = UniversalDocumentProcessor.ai_translate('日本語文書.pdf', 'English')
|
|
521
|
+
```
|
|
522
|
+
|
|
523
|
+
Or create and use a persistent agent:
|
|
524
|
+
|
|
525
|
+
```ruby
|
|
415
526
|
agent = UniversalDocumentProcessor.create_ai_agent(
|
|
416
527
|
api_key: 'your-openai-key', # OpenAI API key
|
|
417
528
|
model: 'gpt-4', # Model to use (gpt-4, gpt-3.5-turbo)
|
|
@@ -419,6 +530,130 @@ agent = UniversalDocumentProcessor.create_ai_agent(
|
|
|
419
530
|
max_history: 20, # Conversation memory length
|
|
420
531
|
base_url: 'https://api.openai.com/v1' # Custom API endpoint
|
|
421
532
|
)
|
|
533
|
+
|
|
534
|
+
# Analyze a document
|
|
535
|
+
response = agent.analyze_document('report.pdf')
|
|
536
|
+
```
|
|
537
|
+
|
|
538
|
+
---
|
|
539
|
+
|
|
540
|
+
**Note:**
|
|
541
|
+
- The API key is required for all AI features.
|
|
542
|
+
- You can override the model, temperature, and other options per agent.
|
|
543
|
+
- For more details, see `USER_GUIDE.md` and the examples above.
|
|
544
|
+
```
|
|
545
|
+
|
|
546
|
+
## 📦 Archive Processing (ZIP Creation & Extraction)
|
|
547
|
+
|
|
548
|
+
The gem provides comprehensive archive processing capabilities, including both extracting from existing archives and creating new ZIP files.
|
|
549
|
+
|
|
550
|
+
### Extracting from Archives
|
|
551
|
+
|
|
552
|
+
```ruby
|
|
553
|
+
# Extract text and metadata from ZIP archives
|
|
554
|
+
result = UniversalDocumentProcessor.process('archive.zip')
|
|
555
|
+
|
|
556
|
+
# Access archive-specific metadata
|
|
557
|
+
metadata = result[:metadata]
|
|
558
|
+
puts "Archive type: #{metadata[:archive_type]}" # => "zip"
|
|
559
|
+
puts "Total files: #{metadata[:total_files]}" # => 15
|
|
560
|
+
puts "Uncompressed size: #{metadata[:total_uncompressed_size]} bytes"
|
|
561
|
+
puts "Compression ratio: #{metadata[:compression_ratio]}%" # => 75%
|
|
562
|
+
puts "Directory structure: #{metadata[:directory_structure]}"
|
|
563
|
+
|
|
564
|
+
# Check for specific file types
|
|
565
|
+
puts "File types: #{metadata[:file_types]}" # => {"txt"=>5, "pdf"=>3, "jpg"=>7}
|
|
566
|
+
puts "Has executables: #{metadata[:has_executable_files]}" # => false
|
|
567
|
+
puts "Largest file: #{metadata[:largest_file][:path]} (#{metadata[:largest_file][:size]} bytes)"
|
|
568
|
+
|
|
569
|
+
# Extract text from text files within the archive
|
|
570
|
+
text_content = result[:text_content]
|
|
571
|
+
puts "Combined text from archive: #{text_content.length} characters"
|
|
572
|
+
```
|
|
573
|
+
|
|
574
|
+
### Creating ZIP Archives
|
|
575
|
+
|
|
576
|
+
```ruby
|
|
577
|
+
# Create ZIP from individual files
|
|
578
|
+
files_to_zip = ['document1.pdf', 'document2.txt', 'image.jpg']
|
|
579
|
+
output_zip = 'my_archive.zip'
|
|
580
|
+
|
|
581
|
+
zip_path = UniversalDocumentProcessor::Processors::ArchiveProcessor.create_zip(
|
|
582
|
+
output_zip,
|
|
583
|
+
files_to_zip
|
|
584
|
+
)
|
|
585
|
+
puts "ZIP created: #{zip_path}"
|
|
586
|
+
|
|
587
|
+
# Create ZIP from entire directory (preserves folder structure)
|
|
588
|
+
directory_to_zip = '/path/to/documents'
|
|
589
|
+
archive_path = UniversalDocumentProcessor::Processors::ArchiveProcessor.create_zip(
|
|
590
|
+
'directory_backup.zip',
|
|
591
|
+
directory_to_zip
|
|
592
|
+
)
|
|
593
|
+
puts "Directory archived: #{archive_path}"
|
|
594
|
+
|
|
595
|
+
# Working with temporary directories
|
|
596
|
+
require 'tmpdir'
|
|
597
|
+
|
|
598
|
+
Dir.mktmpdir do |tmpdir|
|
|
599
|
+
# Create some test files
|
|
600
|
+
File.write(File.join(tmpdir, 'file1.txt'), 'Hello from file 1')
|
|
601
|
+
File.write(File.join(tmpdir, 'file2.txt'), 'Hello from file 2')
|
|
602
|
+
|
|
603
|
+
# Create subdirectory with files
|
|
604
|
+
subdir = File.join(tmpdir, 'subfolder')
|
|
605
|
+
Dir.mkdir(subdir)
|
|
606
|
+
File.write(File.join(subdir, 'file3.txt'), 'Hello from subfolder')
|
|
607
|
+
|
|
608
|
+
# Archive the entire directory structure
|
|
609
|
+
zip_file = File.join(tmpdir, 'complete_backup.zip')
|
|
610
|
+
UniversalDocumentProcessor::Processors::ArchiveProcessor.create_zip(zip_file, tmpdir)
|
|
611
|
+
|
|
612
|
+
puts "Archive size: #{File.size(zip_file)} bytes"
|
|
613
|
+
|
|
614
|
+
# Verify archive contents by processing it
|
|
615
|
+
archive_result = UniversalDocumentProcessor.process(zip_file)
|
|
616
|
+
puts "Files in archive: #{archive_result[:metadata][:total_files]}"
|
|
617
|
+
end
|
|
618
|
+
|
|
619
|
+
# Error handling for ZIP creation
|
|
620
|
+
begin
|
|
621
|
+
UniversalDocumentProcessor::Processors::ArchiveProcessor.create_zip(
|
|
622
|
+
'/invalid/path/archive.zip',
|
|
623
|
+
['file1.txt', 'file2.txt']
|
|
624
|
+
)
|
|
625
|
+
rescue => e
|
|
626
|
+
puts "Error creating ZIP: #{e.message}"
|
|
627
|
+
end
|
|
628
|
+
|
|
629
|
+
# Validate input before creating ZIP
|
|
630
|
+
files = ['doc1.pdf', 'doc2.txt']
|
|
631
|
+
files.each do |file|
|
|
632
|
+
unless File.exist?(file)
|
|
633
|
+
puts "Warning: #{file} does not exist"
|
|
634
|
+
end
|
|
635
|
+
end
|
|
636
|
+
```
|
|
637
|
+
|
|
638
|
+
### Archive Analysis
|
|
639
|
+
|
|
640
|
+
```ruby
|
|
641
|
+
# Analyze archive security and structure
|
|
642
|
+
result = UniversalDocumentProcessor.process('suspicious_archive.zip')
|
|
643
|
+
metadata = result[:metadata]
|
|
644
|
+
|
|
645
|
+
# Security analysis
|
|
646
|
+
if metadata[:has_executable_files]
|
|
647
|
+
puts "⚠️ Archive contains executable files"
|
|
648
|
+
end
|
|
649
|
+
|
|
650
|
+
# Directory structure analysis
|
|
651
|
+
structure = metadata[:directory_structure]
|
|
652
|
+
puts "Top-level directories: #{structure.keys.join(', ')}"
|
|
653
|
+
|
|
654
|
+
# File type distribution
|
|
655
|
+
file_types = metadata[:file_types]
|
|
656
|
+
puts "Most common file type: #{file_types.max_by{|k,v| v}}"
|
|
422
657
|
```
|
|
423
658
|
|
|
424
659
|
## 🎌 Japanese Filename Support
|
|
@@ -743,7 +978,7 @@ bundle exec rspec
|
|
|
743
978
|
|
|
744
979
|
## 📝 Changelog
|
|
745
980
|
|
|
746
|
-
### Version 1.
|
|
981
|
+
### Version 1.1.0
|
|
747
982
|
- Initial release
|
|
748
983
|
- Support for PDF, Word, Excel, PowerPoint, images, archives
|
|
749
984
|
- Character validation and cleaning
|
|
@@ -14,16 +14,16 @@ module UniversalDocumentProcessor
|
|
|
14
14
|
@max_history = options[:max_history] || 10
|
|
15
15
|
@temperature = options[:temperature] || 0.7
|
|
16
16
|
@ai_enabled = false
|
|
17
|
-
|
|
17
|
+
|
|
18
18
|
validate_configuration
|
|
19
19
|
end
|
|
20
20
|
|
|
21
21
|
# Main document analysis with AI
|
|
22
22
|
def analyze_document(document_result, query = nil)
|
|
23
23
|
ensure_ai_available!
|
|
24
|
-
|
|
24
|
+
|
|
25
25
|
context = build_document_context(document_result)
|
|
26
|
-
|
|
26
|
+
|
|
27
27
|
if query
|
|
28
28
|
# Specific query about the document
|
|
29
29
|
analyze_with_query(context, query)
|
|
@@ -67,12 +67,12 @@ Please provide:
|
|
|
67
67
|
# Ask specific questions about a document
|
|
68
68
|
def ask_document_question(document_result, question)
|
|
69
69
|
ensure_ai_available!
|
|
70
|
-
|
|
70
|
+
|
|
71
71
|
context = build_document_context(document_result)
|
|
72
|
-
|
|
72
|
+
|
|
73
73
|
prompt = build_question_prompt(context, question)
|
|
74
74
|
response = call_openai_api(prompt)
|
|
75
|
-
|
|
75
|
+
|
|
76
76
|
add_to_history(question, response)
|
|
77
77
|
response
|
|
78
78
|
end
|
|
@@ -80,19 +80,19 @@ Please provide:
|
|
|
80
80
|
# Summarize document content
|
|
81
81
|
def summarize_document(document_result, length: :medium)
|
|
82
82
|
ensure_ai_available!
|
|
83
|
-
|
|
83
|
+
|
|
84
84
|
context = build_document_context(document_result)
|
|
85
|
-
|
|
85
|
+
|
|
86
86
|
length_instruction = case length
|
|
87
87
|
when :short then "in 2-3 sentences"
|
|
88
88
|
when :medium then "in 1-2 paragraphs"
|
|
89
89
|
when :long then "in detail with key points"
|
|
90
90
|
else "concisely"
|
|
91
91
|
end
|
|
92
|
-
|
|
92
|
+
|
|
93
93
|
prompt = build_summary_prompt(context, length_instruction)
|
|
94
94
|
response = call_openai_api(prompt)
|
|
95
|
-
|
|
95
|
+
|
|
96
96
|
add_to_history("Summarize document #{length_instruction}", response)
|
|
97
97
|
response
|
|
98
98
|
end
|
|
@@ -100,13 +100,13 @@ Please provide:
|
|
|
100
100
|
# Extract key information from document
|
|
101
101
|
def extract_key_information(document_result, categories = nil)
|
|
102
102
|
ensure_ai_available!
|
|
103
|
-
|
|
103
|
+
|
|
104
104
|
context = build_document_context(document_result)
|
|
105
105
|
categories ||= ['key_facts', 'important_dates', 'names', 'locations', 'numbers']
|
|
106
|
-
|
|
106
|
+
|
|
107
107
|
prompt = build_extraction_prompt(context, categories)
|
|
108
108
|
response = call_openai_api(prompt)
|
|
109
|
-
|
|
109
|
+
|
|
110
110
|
add_to_history("Extract key information: #{categories.join(', ')}", response)
|
|
111
111
|
parse_extraction_response(response)
|
|
112
112
|
end
|
|
@@ -114,12 +114,12 @@ Please provide:
|
|
|
114
114
|
# Translate document content
|
|
115
115
|
def translate_document(document_result, target_language)
|
|
116
116
|
ensure_ai_available!
|
|
117
|
-
|
|
117
|
+
|
|
118
118
|
context = build_document_context(document_result)
|
|
119
|
-
|
|
119
|
+
|
|
120
120
|
prompt = build_translation_prompt(context, target_language)
|
|
121
121
|
response = call_openai_api(prompt)
|
|
122
|
-
|
|
122
|
+
|
|
123
123
|
add_to_history("Translate to #{target_language}", response)
|
|
124
124
|
response
|
|
125
125
|
end
|
|
@@ -127,12 +127,12 @@ Please provide:
|
|
|
127
127
|
# Generate document insights and recommendations
|
|
128
128
|
def generate_insights(document_result)
|
|
129
129
|
ensure_ai_available!
|
|
130
|
-
|
|
130
|
+
|
|
131
131
|
context = build_document_context(document_result)
|
|
132
|
-
|
|
132
|
+
|
|
133
133
|
prompt = build_insights_prompt(context)
|
|
134
134
|
response = call_openai_api(prompt)
|
|
135
|
-
|
|
135
|
+
|
|
136
136
|
add_to_history("Generate insights", response)
|
|
137
137
|
parse_insights_response(response)
|
|
138
138
|
end
|
|
@@ -140,12 +140,12 @@ Please provide:
|
|
|
140
140
|
# Compare multiple documents
|
|
141
141
|
def compare_documents(document_results, comparison_type = :content)
|
|
142
142
|
ensure_ai_available!
|
|
143
|
-
|
|
143
|
+
|
|
144
144
|
contexts = document_results.map { |doc| build_document_context(doc) }
|
|
145
|
-
|
|
145
|
+
|
|
146
146
|
prompt = build_comparison_prompt(contexts, comparison_type)
|
|
147
147
|
response = call_openai_api(prompt)
|
|
148
|
-
|
|
148
|
+
|
|
149
149
|
add_to_history("Compare documents (#{comparison_type})", response)
|
|
150
150
|
response
|
|
151
151
|
end
|
|
@@ -153,12 +153,12 @@ Please provide:
|
|
|
153
153
|
# Classify document type and purpose
|
|
154
154
|
def classify_document(document_result)
|
|
155
155
|
ensure_ai_available!
|
|
156
|
-
|
|
156
|
+
|
|
157
157
|
context = build_document_context(document_result)
|
|
158
|
-
|
|
158
|
+
|
|
159
159
|
prompt = build_classification_prompt(context)
|
|
160
160
|
response = call_openai_api(prompt)
|
|
161
|
-
|
|
161
|
+
|
|
162
162
|
add_to_history("Classify document", response)
|
|
163
163
|
parse_classification_response(response)
|
|
164
164
|
end
|
|
@@ -166,12 +166,12 @@ Please provide:
|
|
|
166
166
|
# Generate action items from document
|
|
167
167
|
def extract_action_items(document_result)
|
|
168
168
|
ensure_ai_available!
|
|
169
|
-
|
|
169
|
+
|
|
170
170
|
context = build_document_context(document_result)
|
|
171
|
-
|
|
171
|
+
|
|
172
172
|
prompt = build_action_items_prompt(context)
|
|
173
173
|
response = call_openai_api(prompt)
|
|
174
|
-
|
|
174
|
+
|
|
175
175
|
add_to_history("Extract action items", response)
|
|
176
176
|
parse_action_items_response(response)
|
|
177
177
|
end
|
|
@@ -179,14 +179,14 @@ Please provide:
|
|
|
179
179
|
# Chat about the document
|
|
180
180
|
def chat(message, document_result = nil)
|
|
181
181
|
ensure_ai_available!
|
|
182
|
-
|
|
182
|
+
|
|
183
183
|
if document_result
|
|
184
184
|
context = build_document_context(document_result)
|
|
185
185
|
prompt = build_chat_prompt(context, message)
|
|
186
186
|
else
|
|
187
187
|
prompt = build_general_chat_prompt(message)
|
|
188
188
|
end
|
|
189
|
-
|
|
189
|
+
|
|
190
190
|
response = call_openai_api(prompt)
|
|
191
191
|
add_to_history(message, response)
|
|
192
192
|
response
|
|
@@ -200,15 +200,15 @@ Please provide:
|
|
|
200
200
|
# Get conversation summary
|
|
201
201
|
def conversation_summary
|
|
202
202
|
return "No conversation history" if @conversation_history.empty?
|
|
203
|
-
|
|
203
|
+
|
|
204
204
|
unless @ai_enabled
|
|
205
205
|
return "AI features are disabled. Cannot generate conversation summary."
|
|
206
206
|
end
|
|
207
|
-
|
|
207
|
+
|
|
208
208
|
history_text = @conversation_history.map do |entry|
|
|
209
209
|
"Q: #{entry[:question]}\nA: #{entry[:answer]}"
|
|
210
210
|
end.join("\n\n")
|
|
211
|
-
|
|
211
|
+
|
|
212
212
|
prompt = "Summarize this conversation:\n\n#{history_text}"
|
|
213
213
|
call_openai_api(prompt)
|
|
214
214
|
end
|
|
@@ -247,13 +247,13 @@ Please provide:
|
|
|
247
247
|
tables_count: document_result[:tables]&.length || 0,
|
|
248
248
|
filename_info: document_result[:filename_info] || {}
|
|
249
249
|
}
|
|
250
|
-
|
|
250
|
+
|
|
251
251
|
# Add Japanese-specific information if available
|
|
252
252
|
if context[:filename_info][:contains_japanese]
|
|
253
253
|
context[:japanese_filename] = true
|
|
254
254
|
context[:japanese_parts] = context[:filename_info][:japanese_parts]
|
|
255
255
|
end
|
|
256
|
-
|
|
256
|
+
|
|
257
257
|
context
|
|
258
258
|
end
|
|
259
259
|
|
|
@@ -324,8 +324,7 @@ Please provide:
|
|
|
324
324
|
|
|
325
325
|
def build_comparison_prompt(contexts, comparison_type)
|
|
326
326
|
comparison_content = contexts.map.with_index do |context, index|
|
|
327
|
-
"Document #{index + 1}: #{context[:filename]}
|
|
328
|
-
Content: #{truncate_content(context[:text_content], 1500)}"
|
|
327
|
+
"Document #{index + 1}: #{context[:filename]}\nContent: #{truncate_content(context[:text_content], 1500)}"
|
|
329
328
|
end.join("\n\n---\n\n")
|
|
330
329
|
|
|
331
330
|
"You are an AI analyst. Compare these documents focusing on #{comparison_type}:
|
|
@@ -404,15 +403,15 @@ Please respond helpfully."
|
|
|
404
403
|
|
|
405
404
|
def call_openai_api(prompt)
|
|
406
405
|
uri = URI("#{@base_url}/chat/completions")
|
|
407
|
-
|
|
406
|
+
|
|
408
407
|
http = Net::HTTP.new(uri.host, uri.port)
|
|
409
408
|
http.use_ssl = true
|
|
410
409
|
http.read_timeout = 60
|
|
411
|
-
|
|
410
|
+
|
|
412
411
|
request = Net::HTTP::Post.new(uri)
|
|
413
412
|
request['Content-Type'] = 'application/json'
|
|
414
413
|
request['Authorization'] = "Bearer #{@api_key}"
|
|
415
|
-
|
|
414
|
+
|
|
416
415
|
request.body = {
|
|
417
416
|
model: @model,
|
|
418
417
|
messages: [
|
|
@@ -421,16 +420,16 @@ Please respond helpfully."
|
|
|
421
420
|
content: "You are an intelligent document processing assistant with expertise in analyzing, summarizing, and extracting information from various document types. You support multiple languages including Japanese."
|
|
422
421
|
},
|
|
423
422
|
{
|
|
424
|
-
role: "user",
|
|
423
|
+
role: "user",
|
|
425
424
|
content: prompt
|
|
426
425
|
}
|
|
427
426
|
],
|
|
428
427
|
temperature: @temperature,
|
|
429
428
|
max_tokens: 2000
|
|
430
429
|
}.to_json
|
|
431
|
-
|
|
430
|
+
|
|
432
431
|
response = http.request(request)
|
|
433
|
-
|
|
432
|
+
|
|
434
433
|
if response.code.to_i == 200
|
|
435
434
|
result = JSON.parse(response.body)
|
|
436
435
|
result.dig('choices', 0, 'message', 'content') || "No response generated"
|
|
@@ -446,14 +445,14 @@ Please respond helpfully."
|
|
|
446
445
|
answer: answer,
|
|
447
446
|
timestamp: Time.now
|
|
448
447
|
}
|
|
449
|
-
|
|
448
|
+
|
|
450
449
|
# Keep only the most recent conversations
|
|
451
450
|
@conversation_history = @conversation_history.last(@max_history) if @conversation_history.length > @max_history
|
|
452
451
|
end
|
|
453
452
|
|
|
454
453
|
def truncate_content(content, max_length)
|
|
455
454
|
return "" unless content.is_a?(String)
|
|
456
|
-
|
|
455
|
+
|
|
457
456
|
if content.length > max_length
|
|
458
457
|
"#{content[0...max_length]}...\n\n[Content truncated for analysis]"
|
|
459
458
|
else
|
|
@@ -463,16 +462,16 @@ Please respond helpfully."
|
|
|
463
462
|
|
|
464
463
|
def format_file_size(bytes)
|
|
465
464
|
return "0 B" if bytes == 0
|
|
466
|
-
|
|
465
|
+
|
|
467
466
|
units = ['B', 'KB', 'MB', 'GB']
|
|
468
467
|
size = bytes.to_f
|
|
469
468
|
unit_index = 0
|
|
470
|
-
|
|
469
|
+
|
|
471
470
|
while size >= 1024 && unit_index < units.length - 1
|
|
472
471
|
size /= 1024
|
|
473
472
|
unit_index += 1
|
|
474
473
|
end
|
|
475
|
-
|
|
474
|
+
|
|
476
475
|
"#{size.round(2)} #{units[unit_index]}"
|
|
477
476
|
end
|
|
478
477
|
|
|
@@ -490,7 +489,7 @@ Please respond helpfully."
|
|
|
490
489
|
rescue JSON::ParserError
|
|
491
490
|
# Fall back to plain text response
|
|
492
491
|
end
|
|
493
|
-
|
|
492
|
+
|
|
494
493
|
response
|
|
495
494
|
end
|
|
496
495
|
|