universal_document_processor 1.0.1 → 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +20 -0
- data/README.md +53 -1
- data/Rakefile +100 -17
- data/lib/universal_document_processor/ai_agent.rb +44 -3
- data/lib/universal_document_processor/document.rb +40 -4
- data/lib/universal_document_processor/processors/excel_processor.rb +719 -132
- data/lib/universal_document_processor/processors/pdf_processor.rb +14 -0
- data/lib/universal_document_processor/processors/word_processor.rb +94 -4
- data/lib/universal_document_processor/utils/file_detector.rb +1 -0
- data/lib/universal_document_processor/version.rb +1 -1
- data/lib/universal_document_processor.rb +84 -1
- metadata +48 -6
- data/AI_USAGE_GUIDE.md +0 -404
- data/GEM_RELEASE_GUIDE.md +0 -288
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8ec66decfe8626354f9fe05b757dbdc11921b21fa6b5dccfdb4d8ce5deba2c3f
|
4
|
+
data.tar.gz: 19c2802d337d0517ab91cfe71bdb2b051213e17f0e1a76605c5bce895429eed4
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f2cbb1944e533a4a75d6248dd6df279219e4a4c7b77dac3b0e4d474b5b4375203d188bff5d388af30716b3dc5487fcd293955c8504565c4a1b56d552a8484993
|
7
|
+
data.tar.gz: 2d95c2f173de302d14cdfda6d3357b5d7d9a5cf82cabc2e5622bdb8f6d7e60c56bab75eccf85a312045be5f8e66b743b4344420847192a3b0268cd4a70c5f414
|
data/CHANGELOG.md
CHANGED
@@ -7,6 +7,26 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|
7
7
|
|
8
8
|
## [Unreleased]
|
9
9
|
|
10
|
+
## [1.2.0] - 2024-01-15
|
11
|
+
### Added
|
12
|
+
- **TSV (Tab-Separated Values) File Support**: Complete built-in TSV processing capabilities
|
13
|
+
- Native TSV parsing using Ruby CSV library with tab delimiter
|
14
|
+
- Text extraction with proper formatting
|
15
|
+
- Comprehensive metadata detection (format, delimiter, encoding)
|
16
|
+
- Table structure analysis and header detection
|
17
|
+
- Statistical analysis and data validation
|
18
|
+
- Format conversions: TSV ↔ CSV, TSV → JSON
|
19
|
+
- Cross-format compatibility with existing CSV and Excel features
|
20
|
+
- New `to_tsv()` method for converting other formats to TSV
|
21
|
+
- Enhanced file detector with TSV MIME type mapping
|
22
|
+
- Full integration with existing Document class API
|
23
|
+
|
24
|
+
### Enhanced
|
25
|
+
- **ExcelProcessor**: Extended to handle TSV files alongside CSV and Excel formats
|
26
|
+
- **File Detection**: Added TSV MIME type support (`text/tab-separated-values`)
|
27
|
+
- **Document Class**: Added `to_tsv()` method and TSV format support
|
28
|
+
- **Supported Formats**: Updated to include TSV in format list
|
29
|
+
|
10
30
|
## [1.0.1] - 2025-06-23
|
11
31
|
|
12
32
|
### Fixed
|
data/README.md
CHANGED
@@ -16,7 +16,7 @@ A comprehensive Ruby gem that provides unified document processing capabilities
|
|
16
16
|
|
17
17
|
### **Supported File Formats**
|
18
18
|
- **📄 Documents**: PDF, DOC, DOCX, RTF
|
19
|
-
- **📊 Spreadsheets**: XLS, XLSX, CSV
|
19
|
+
- **📊 Spreadsheets**: XLS, XLSX, CSV, TSV
|
20
20
|
- **📺 Presentations**: PPT, PPTX
|
21
21
|
- **🖼️ Images**: JPG, PNG, GIF, BMP, TIFF
|
22
22
|
- **📁 Archives**: ZIP, RAR, 7Z
|
@@ -236,6 +236,58 @@ tables.each_with_index do |table, index|
|
|
236
236
|
end
|
237
237
|
```
|
238
238
|
|
239
|
+
### Processing TSV (Tab-Separated Values) Files
|
240
|
+
|
241
|
+
```ruby
|
242
|
+
# Process TSV files with built-in support
|
243
|
+
result = UniversalDocumentProcessor.process('data.tsv')
|
244
|
+
|
245
|
+
# TSV-specific metadata
|
246
|
+
metadata = result[:metadata]
|
247
|
+
puts "Format: #{metadata[:format]}" # => "tsv"
|
248
|
+
puts "Delimiter: #{metadata[:delimiter]}" # => "tab"
|
249
|
+
puts "Rows: #{metadata[:total_rows]}"
|
250
|
+
puts "Columns: #{metadata[:total_columns]}"
|
251
|
+
puts "Has headers: #{metadata[:has_headers]}"
|
252
|
+
|
253
|
+
# Extract structured data
|
254
|
+
tables = result[:tables]
|
255
|
+
table = tables.first
|
256
|
+
puts "Headers: #{table[:headers].join(', ')}"
|
257
|
+
puts "Sample row: #{table[:data][1].join(' | ')}"
|
258
|
+
|
259
|
+
# Format conversions
|
260
|
+
document = UniversalDocumentProcessor::Document.new('data.tsv')
|
261
|
+
|
262
|
+
# Convert TSV to CSV
|
263
|
+
csv_output = document.to_csv
|
264
|
+
puts "CSV conversion: #{csv_output.length} characters"
|
265
|
+
|
266
|
+
# Convert TSV to JSON
|
267
|
+
json_output = document.to_json
|
268
|
+
puts "JSON conversion: #{json_output.length} characters"
|
269
|
+
|
270
|
+
# Convert CSV to TSV
|
271
|
+
csv_document = UniversalDocumentProcessor::Document.new('data.csv')
|
272
|
+
tsv_output = csv_document.to_tsv
|
273
|
+
puts "TSV conversion: #{tsv_output.length} characters"
|
274
|
+
|
275
|
+
# Statistical analysis
|
276
|
+
stats = document.extract_statistics
|
277
|
+
sheet_stats = stats['Sheet1']
|
278
|
+
puts "Total cells: #{sheet_stats[:total_cells]}"
|
279
|
+
puts "Numeric cells: #{sheet_stats[:numeric_cells]}"
|
280
|
+
puts "Text cells: #{sheet_stats[:text_cells]}"
|
281
|
+
puts "Average value: #{sheet_stats[:average_value]}"
|
282
|
+
|
283
|
+
# Data validation
|
284
|
+
validation = document.validate_data
|
285
|
+
sheet_validation = validation['Sheet1']
|
286
|
+
puts "Data quality score: #{sheet_validation[:data_quality_score]}%"
|
287
|
+
puts "Empty rows: #{sheet_validation[:empty_rows]}"
|
288
|
+
puts "Duplicate rows: #{sheet_validation[:duplicate_rows]}"
|
289
|
+
```
|
290
|
+
|
239
291
|
### Processing Word Documents
|
240
292
|
|
241
293
|
```ruby
|
data/Rakefile
CHANGED
@@ -1,12 +1,105 @@
|
|
1
|
-
require
|
2
|
-
require
|
3
|
-
require "rubocop/rake_task"
|
1
|
+
require 'rake/testtask'
|
2
|
+
require 'bundler/gem_tasks'
|
4
3
|
|
5
|
-
|
6
|
-
|
4
|
+
# Default task
|
5
|
+
task default: :test
|
7
6
|
|
8
|
-
|
9
|
-
|
7
|
+
# Test task
|
8
|
+
Rake::TestTask.new(:test) do |t|
|
9
|
+
t.libs << 'test'
|
10
|
+
t.libs << 'lib'
|
11
|
+
t.test_files = FileList['test/test_*.rb']
|
12
|
+
t.verbose = true
|
13
|
+
end
|
14
|
+
|
15
|
+
# Individual test tasks
|
16
|
+
Rake::TestTask.new(:test_core) do |t|
|
17
|
+
t.libs << 'test'
|
18
|
+
t.libs << 'lib'
|
19
|
+
t.test_files = FileList['test/test_universal_document_processor.rb']
|
20
|
+
t.verbose = true
|
21
|
+
end
|
22
|
+
|
23
|
+
Rake::TestTask.new(:test_ai) do |t|
|
24
|
+
t.libs << 'test'
|
25
|
+
t.libs << 'lib'
|
26
|
+
t.test_files = FileList['test/test_ai_agent.rb']
|
27
|
+
t.verbose = true
|
28
|
+
end
|
29
|
+
|
30
|
+
Rake::TestTask.new(:test_processors) do |t|
|
31
|
+
t.libs << 'test'
|
32
|
+
t.libs << 'lib'
|
33
|
+
t.test_files = FileList['test/test_processors.rb']
|
34
|
+
t.verbose = true
|
35
|
+
end
|
36
|
+
|
37
|
+
# Coverage task (if simplecov is available)
|
38
|
+
desc "Run tests with coverage"
|
39
|
+
task :coverage do
|
40
|
+
ENV['COVERAGE'] = 'true'
|
41
|
+
Rake::Task[:test].invoke
|
42
|
+
end
|
43
|
+
|
44
|
+
# Lint task (if rubocop is available)
|
45
|
+
desc "Run RuboCop"
|
46
|
+
task :lint do
|
47
|
+
begin
|
48
|
+
require 'rubocop/rake_task'
|
49
|
+
RuboCop::RakeTask.new
|
50
|
+
rescue LoadError
|
51
|
+
puts "RuboCop not available. Install it with: gem install rubocop"
|
52
|
+
end
|
53
|
+
end
|
54
|
+
|
55
|
+
# Documentation task
|
56
|
+
desc "Generate documentation"
|
57
|
+
task :doc do
|
58
|
+
system "yard doc"
|
59
|
+
end
|
60
|
+
|
61
|
+
# Clean task
|
62
|
+
desc "Clean up generated files"
|
63
|
+
task :clean do
|
64
|
+
FileUtils.rm_rf('coverage')
|
65
|
+
FileUtils.rm_rf('doc')
|
66
|
+
FileUtils.rm_rf('pkg')
|
67
|
+
FileUtils.rm_f('Gemfile.lock')
|
68
|
+
end
|
69
|
+
|
70
|
+
# Install dependencies
|
71
|
+
desc "Install dependencies"
|
72
|
+
task :install do
|
73
|
+
system "bundle install"
|
74
|
+
end
|
75
|
+
|
76
|
+
# Quality check task
|
77
|
+
desc "Run all quality checks"
|
78
|
+
task quality: [:test, :lint]
|
79
|
+
|
80
|
+
# CI task
|
81
|
+
desc "Run CI tasks"
|
82
|
+
task ci: [:install, :test]
|
83
|
+
|
84
|
+
# Development setup
|
85
|
+
desc "Setup development environment"
|
86
|
+
task :setup do
|
87
|
+
puts "Setting up development environment..."
|
88
|
+
Rake::Task[:install].invoke
|
89
|
+
puts "Development environment ready!"
|
90
|
+
puts ""
|
91
|
+
puts "Available tasks:"
|
92
|
+
puts " rake test - Run all tests"
|
93
|
+
puts " rake test_core - Run core functionality tests"
|
94
|
+
puts " rake test_ai - Run AI agent tests"
|
95
|
+
puts " rake test_processors - Run processor tests"
|
96
|
+
puts " rake coverage - Run tests with coverage"
|
97
|
+
puts " rake lint - Run RuboCop linting"
|
98
|
+
puts " rake doc - Generate documentation"
|
99
|
+
puts " rake clean - Clean up generated files"
|
100
|
+
puts ""
|
101
|
+
puts "To run tests with AI features, set OPENAI_API_KEY environment variable"
|
102
|
+
end
|
10
103
|
|
11
104
|
desc "Build the gem"
|
12
105
|
task :build do
|
@@ -23,14 +116,4 @@ desc "Install the gem locally"
|
|
23
116
|
task :install do
|
24
117
|
system "gem build universal_document_processor.gemspec"
|
25
118
|
system "gem install universal_document_processor-*.gem"
|
26
|
-
end
|
27
|
-
|
28
|
-
desc "Clean build artifacts"
|
29
|
-
task :clean do
|
30
|
-
system "rm -f *.gem"
|
31
|
-
end
|
32
|
-
|
33
|
-
desc "Generate documentation"
|
34
|
-
task :doc do
|
35
|
-
system "yard doc"
|
36
119
|
end
|
@@ -4,7 +4,7 @@ require 'uri'
|
|
4
4
|
|
5
5
|
module UniversalDocumentProcessor
|
6
6
|
class AIAgent
|
7
|
-
attr_reader :api_key, :model, :base_url, :conversation_history
|
7
|
+
attr_reader :api_key, :model, :base_url, :conversation_history, :ai_enabled
|
8
8
|
|
9
9
|
def initialize(options = {})
|
10
10
|
@api_key = options[:api_key] || ENV['OPENAI_API_KEY']
|
@@ -13,12 +13,15 @@ module UniversalDocumentProcessor
|
|
13
13
|
@conversation_history = []
|
14
14
|
@max_history = options[:max_history] || 10
|
15
15
|
@temperature = options[:temperature] || 0.7
|
16
|
+
@ai_enabled = false
|
16
17
|
|
17
18
|
validate_configuration
|
18
19
|
end
|
19
20
|
|
20
21
|
# Main document analysis with AI
|
21
22
|
def analyze_document(document_result, query = nil)
|
23
|
+
ensure_ai_available!
|
24
|
+
|
22
25
|
context = build_document_context(document_result)
|
23
26
|
|
24
27
|
if query
|
@@ -63,6 +66,8 @@ Please provide:
|
|
63
66
|
|
64
67
|
# Ask specific questions about a document
|
65
68
|
def ask_document_question(document_result, question)
|
69
|
+
ensure_ai_available!
|
70
|
+
|
66
71
|
context = build_document_context(document_result)
|
67
72
|
|
68
73
|
prompt = build_question_prompt(context, question)
|
@@ -74,6 +79,8 @@ Please provide:
|
|
74
79
|
|
75
80
|
# Summarize document content
|
76
81
|
def summarize_document(document_result, length: :medium)
|
82
|
+
ensure_ai_available!
|
83
|
+
|
77
84
|
context = build_document_context(document_result)
|
78
85
|
|
79
86
|
length_instruction = case length
|
@@ -92,6 +99,8 @@ Please provide:
|
|
92
99
|
|
93
100
|
# Extract key information from document
|
94
101
|
def extract_key_information(document_result, categories = nil)
|
102
|
+
ensure_ai_available!
|
103
|
+
|
95
104
|
context = build_document_context(document_result)
|
96
105
|
categories ||= ['key_facts', 'important_dates', 'names', 'locations', 'numbers']
|
97
106
|
|
@@ -104,6 +113,8 @@ Please provide:
|
|
104
113
|
|
105
114
|
# Translate document content
|
106
115
|
def translate_document(document_result, target_language)
|
116
|
+
ensure_ai_available!
|
117
|
+
|
107
118
|
context = build_document_context(document_result)
|
108
119
|
|
109
120
|
prompt = build_translation_prompt(context, target_language)
|
@@ -115,6 +126,8 @@ Please provide:
|
|
115
126
|
|
116
127
|
# Generate document insights and recommendations
|
117
128
|
def generate_insights(document_result)
|
129
|
+
ensure_ai_available!
|
130
|
+
|
118
131
|
context = build_document_context(document_result)
|
119
132
|
|
120
133
|
prompt = build_insights_prompt(context)
|
@@ -126,6 +139,8 @@ Please provide:
|
|
126
139
|
|
127
140
|
# Compare multiple documents
|
128
141
|
def compare_documents(document_results, comparison_type = :content)
|
142
|
+
ensure_ai_available!
|
143
|
+
|
129
144
|
contexts = document_results.map { |doc| build_document_context(doc) }
|
130
145
|
|
131
146
|
prompt = build_comparison_prompt(contexts, comparison_type)
|
@@ -137,6 +152,8 @@ Please provide:
|
|
137
152
|
|
138
153
|
# Classify document type and purpose
|
139
154
|
def classify_document(document_result)
|
155
|
+
ensure_ai_available!
|
156
|
+
|
140
157
|
context = build_document_context(document_result)
|
141
158
|
|
142
159
|
prompt = build_classification_prompt(context)
|
@@ -148,6 +165,8 @@ Please provide:
|
|
148
165
|
|
149
166
|
# Generate action items from document
|
150
167
|
def extract_action_items(document_result)
|
168
|
+
ensure_ai_available!
|
169
|
+
|
151
170
|
context = build_document_context(document_result)
|
152
171
|
|
153
172
|
prompt = build_action_items_prompt(context)
|
@@ -159,6 +178,8 @@ Please provide:
|
|
159
178
|
|
160
179
|
# Chat about the document
|
161
180
|
def chat(message, document_result = nil)
|
181
|
+
ensure_ai_available!
|
182
|
+
|
162
183
|
if document_result
|
163
184
|
context = build_document_context(document_result)
|
164
185
|
prompt = build_chat_prompt(context, message)
|
@@ -180,6 +201,10 @@ Please provide:
|
|
180
201
|
def conversation_summary
|
181
202
|
return "No conversation history" if @conversation_history.empty?
|
182
203
|
|
204
|
+
unless @ai_enabled
|
205
|
+
return "AI features are disabled. Cannot generate conversation summary."
|
206
|
+
end
|
207
|
+
|
183
208
|
history_text = @conversation_history.map do |entry|
|
184
209
|
"Q: #{entry[:question]}\nA: #{entry[:answer]}"
|
185
210
|
end.join("\n\n")
|
@@ -188,11 +213,27 @@ Please provide:
|
|
188
213
|
call_openai_api(prompt)
|
189
214
|
end
|
190
215
|
|
216
|
+
# Check if AI features are available
|
217
|
+
def ai_available?
|
218
|
+
@ai_enabled
|
219
|
+
end
|
220
|
+
|
191
221
|
private
|
192
222
|
|
193
223
|
def validate_configuration
|
194
|
-
|
195
|
-
|
224
|
+
if @api_key && !@api_key.empty?
|
225
|
+
@ai_enabled = true
|
226
|
+
else
|
227
|
+
@ai_enabled = false
|
228
|
+
warn "Warning: OpenAI API key not provided. AI features will be disabled. Set OPENAI_API_KEY environment variable or pass api_key option to enable AI features."
|
229
|
+
end
|
230
|
+
end
|
231
|
+
|
232
|
+
# Ensure AI is available before making API calls
|
233
|
+
def ensure_ai_available!
|
234
|
+
unless @ai_enabled
|
235
|
+
raise DependencyMissingError, "AI features are not available. Please provide an OpenAI API key to use AI functionality."
|
236
|
+
end
|
196
237
|
end
|
197
238
|
|
198
239
|
def build_document_context(document_result)
|
@@ -48,6 +48,42 @@ module UniversalDocumentProcessor
|
|
48
48
|
[]
|
49
49
|
end
|
50
50
|
|
51
|
+
def extract_statistics
|
52
|
+
processor.respond_to?(:extract_statistics) ? processor.extract_statistics : {}
|
53
|
+
rescue => e
|
54
|
+
{}
|
55
|
+
end
|
56
|
+
|
57
|
+
def validate_data
|
58
|
+
processor.respond_to?(:validate_data) ? processor.validate_data : {}
|
59
|
+
rescue => e
|
60
|
+
{}
|
61
|
+
end
|
62
|
+
|
63
|
+
def extract_formulas
|
64
|
+
processor.respond_to?(:extract_formulas) ? processor.extract_formulas : []
|
65
|
+
rescue => e
|
66
|
+
[]
|
67
|
+
end
|
68
|
+
|
69
|
+
def to_json
|
70
|
+
processor.respond_to?(:to_json) ? processor.to_json : process.to_json
|
71
|
+
rescue => e
|
72
|
+
process.to_json
|
73
|
+
end
|
74
|
+
|
75
|
+
def to_csv(sheet_name = nil)
|
76
|
+
processor.respond_to?(:to_csv) ? processor.to_csv(sheet_name) : ""
|
77
|
+
rescue => e
|
78
|
+
""
|
79
|
+
end
|
80
|
+
|
81
|
+
def to_tsv(sheet_name = nil)
|
82
|
+
processor.respond_to?(:to_tsv) ? processor.to_tsv(sheet_name) : ""
|
83
|
+
rescue => e
|
84
|
+
""
|
85
|
+
end
|
86
|
+
|
51
87
|
def convert_to(target_format)
|
52
88
|
case target_format.to_sym
|
53
89
|
when :pdf
|
@@ -64,7 +100,7 @@ module UniversalDocumentProcessor
|
|
64
100
|
end
|
65
101
|
|
66
102
|
def supported_formats
|
67
|
-
%w[pdf docx doc xlsx xls pptx ppt txt rtf html xml csv jpg jpeg png gif bmp tiff zip rar 7z]
|
103
|
+
%w[pdf docx doc xlsx xls pptx ppt txt rtf html xml csv tsv jpg jpeg png gif bmp tiff zip rar 7z]
|
68
104
|
end
|
69
105
|
|
70
106
|
def supported?
|
@@ -139,11 +175,11 @@ module UniversalDocumentProcessor
|
|
139
175
|
case @content_type
|
140
176
|
when /pdf/
|
141
177
|
Processors::PdfProcessor.new(@file_path, @options)
|
142
|
-
when /
|
178
|
+
when /wordprocessingml/, /msword/
|
143
179
|
Processors::WordProcessor.new(@file_path, @options)
|
144
|
-
when /excel/, /
|
180
|
+
when /spreadsheetml/, /ms-excel/, /csv/, /tab-separated/
|
145
181
|
Processors::ExcelProcessor.new(@file_path, @options)
|
146
|
-
when /
|
182
|
+
when /presentationml/, /ms-powerpoint/
|
147
183
|
Processors::PowerpointProcessor.new(@file_path, @options)
|
148
184
|
when /image/
|
149
185
|
Processors::ImageProcessor.new(@file_path, @options)
|