universal_document_processor 1.0.1 → 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +20 -0
- data/README.md +53 -1
- data/Rakefile +100 -17
- data/lib/universal_document_processor/ai_agent.rb +44 -3
- data/lib/universal_document_processor/document.rb +40 -4
- data/lib/universal_document_processor/processors/excel_processor.rb +719 -132
- data/lib/universal_document_processor/processors/pdf_processor.rb +14 -0
- data/lib/universal_document_processor/processors/word_processor.rb +94 -4
- data/lib/universal_document_processor/utils/file_detector.rb +1 -0
- data/lib/universal_document_processor/version.rb +1 -1
- data/lib/universal_document_processor.rb +84 -1
- metadata +48 -6
- data/AI_USAGE_GUIDE.md +0 -404
- data/GEM_RELEASE_GUIDE.md +0 -288
@@ -2,6 +2,8 @@ module UniversalDocumentProcessor
|
|
2
2
|
module Processors
|
3
3
|
class PdfProcessor < BaseProcessor
|
4
4
|
def extract_text
|
5
|
+
ensure_pdf_reader_available!
|
6
|
+
|
5
7
|
with_error_handling do
|
6
8
|
reader = PDF::Reader.new(@file_path)
|
7
9
|
text = reader.pages.map(&:text).join("\n")
|
@@ -10,6 +12,8 @@ module UniversalDocumentProcessor
|
|
10
12
|
end
|
11
13
|
|
12
14
|
def extract_metadata
|
15
|
+
ensure_pdf_reader_available!
|
16
|
+
|
13
17
|
with_error_handling do
|
14
18
|
reader = PDF::Reader.new(@file_path)
|
15
19
|
info = reader.info || {}
|
@@ -32,6 +36,8 @@ module UniversalDocumentProcessor
|
|
32
36
|
end
|
33
37
|
|
34
38
|
def extract_images
|
39
|
+
ensure_pdf_reader_available!
|
40
|
+
|
35
41
|
with_error_handling do
|
36
42
|
# Extract embedded images from PDF
|
37
43
|
images = []
|
@@ -57,6 +63,8 @@ module UniversalDocumentProcessor
|
|
57
63
|
end
|
58
64
|
|
59
65
|
def extract_tables
|
66
|
+
ensure_pdf_reader_available!
|
67
|
+
|
60
68
|
with_error_handling do
|
61
69
|
# Basic table extraction from PDF text
|
62
70
|
tables = []
|
@@ -87,6 +95,12 @@ module UniversalDocumentProcessor
|
|
87
95
|
|
88
96
|
private
|
89
97
|
|
98
|
+
# Dependency guard invoked at the top of the PDF extract_* methods.
# Does nothing when the PDF::Reader constant is loaded.
#
# @raise [DependencyMissingError] when PDF::Reader is not defined
# @return [nil]
def ensure_pdf_reader_available!
  return if defined?(PDF::Reader)

  raise DependencyMissingError,
        "PDF processing requires the 'pdf-reader' gem. Install it with: gem install pdf-reader -v '~> 2.0'"
end
|
103
|
+
|
90
104
|
def extract_form_fields(reader)
|
91
105
|
# Extract PDF form fields if present
|
92
106
|
[]
|
@@ -5,8 +5,11 @@ module UniversalDocumentProcessor
|
|
5
5
|
with_error_handling do
|
6
6
|
if @file_path.end_with?('.docx')
|
7
7
|
extract_docx_text
|
8
|
+
elsif @file_path.end_with?('.doc')
|
9
|
+
# Built-in .doc file processing
|
10
|
+
fallback_text_extraction
|
8
11
|
else
|
9
|
-
#
|
12
|
+
# Handle other Word formats
|
10
13
|
fallback_text_extraction
|
11
14
|
end
|
12
15
|
end
|
@@ -16,6 +19,8 @@ module UniversalDocumentProcessor
|
|
16
19
|
with_error_handling do
|
17
20
|
if @file_path.end_with?('.docx')
|
18
21
|
extract_docx_metadata
|
22
|
+
elsif @file_path.end_with?('.doc')
|
23
|
+
extract_doc_metadata
|
19
24
|
else
|
20
25
|
super
|
21
26
|
end
|
@@ -25,6 +30,7 @@ module UniversalDocumentProcessor
|
|
25
30
|
def extract_images
|
26
31
|
with_error_handling do
|
27
32
|
return [] unless @file_path.end_with?('.docx')
|
33
|
+
ensure_docx_available!
|
28
34
|
|
29
35
|
images = []
|
30
36
|
doc = Docx::Document.open(@file_path)
|
@@ -48,6 +54,7 @@ module UniversalDocumentProcessor
|
|
48
54
|
def extract_tables
|
49
55
|
with_error_handling do
|
50
56
|
return [] unless @file_path.end_with?('.docx')
|
57
|
+
ensure_docx_available!
|
51
58
|
|
52
59
|
tables = []
|
53
60
|
doc = Docx::Document.open(@file_path)
|
@@ -73,12 +80,25 @@ module UniversalDocumentProcessor
|
|
73
80
|
end
|
74
81
|
|
75
82
|
# Lists the operations available for the current file: the inherited set plus
# format-specific extras. Only paths ending in '.docx' get the image, table,
# style and comment operations.
#
# @return [Array<Symbol>]
def supported_operations
  format_specific =
    if @file_path.end_with?('.docx')
      [:extract_images, :extract_tables, :extract_styles, :extract_comments]
    else
      # .doc files support basic text and metadata extraction
      [:extract_basic_formatting]
    end

  super + format_specific
end
|
78
90
|
|
79
91
|
private
|
80
92
|
|
93
|
+
# Dependency guard invoked before any Docx::Document call.
# Does nothing when the Docx constant is loaded.
#
# @raise [DependencyMissingError] when Docx is not defined
# @return [nil]
def ensure_docx_available!
  return if defined?(Docx)

  raise DependencyMissingError,
        "DOCX processing requires the 'docx' gem. Install it with: gem install docx -v '~> 0.8'"
end
|
98
|
+
|
81
99
|
def extract_docx_text
|
100
|
+
ensure_docx_available!
|
101
|
+
|
82
102
|
doc = Docx::Document.open(@file_path)
|
83
103
|
text_content = []
|
84
104
|
|
@@ -99,6 +119,8 @@ module UniversalDocumentProcessor
|
|
99
119
|
end
|
100
120
|
|
101
121
|
def extract_docx_metadata
|
122
|
+
ensure_docx_available!
|
123
|
+
|
102
124
|
doc = Docx::Document.open(@file_path)
|
103
125
|
core_properties = doc.core_properties
|
104
126
|
|
@@ -126,12 +148,80 @@ module UniversalDocumentProcessor
|
|
126
148
|
0
|
127
149
|
end
|
128
150
|
|
151
|
+
# Builds metadata for a legacy .doc file from filesystem stats and the
# built-in text extraction, merged on top of the inherited base metadata.
#
# A bare `super` here resolves to a parent method named
# `extract_doc_metadata`, not to the parent's `extract_metadata`. When no such
# parent method exists, `super` raises NoMethodError, and calling it again
# from the rescue branch raises a second time. The base metadata is therefore
# resolved once, up front: `super` is used when it exists, otherwise the
# implementation of `extract_metadata` that the current one overrides.
# NOTE(review): the fallback assumes the parent class exposes
# `extract_metadata` returning a Hash, and that no further subclass overrides
# `extract_metadata` again — confirm against the base processor.
#
# @return [Hash] base metadata plus :format and either the text/file
#   statistics or :extraction_error when stat or text extraction fails
def extract_doc_metadata
  base_metadata =
    if defined?(super)
      super
    elsif respond_to?(:extract_metadata, true)
      inherited = method(:extract_metadata).super_method
      inherited ? inherited.call : {}
    else
      {}
    end

  begin
    file_stats = File.stat(@file_path)
    extracted_text = extract_doc_text_builtin

    base_metadata.merge(
      format: 'Microsoft Word Document (.doc)',
      word_count: count_words(extracted_text),
      character_count: extracted_text.length,
      # File::Stat#ctime is the status-change time on Unix-like systems,
      # not a true creation time; kept for backward compatibility.
      created_at: file_stats.ctime,
      modified_at: file_stats.mtime,
      file_size: file_stats.size,
      extraction_method: 'Built-in binary parsing'
    )
  rescue => e
    # Best effort: report the failure inside the metadata instead of raising.
    base_metadata.merge(
      format: 'Microsoft Word Document (.doc)',
      extraction_error: e.message
    )
  end
end
|
171
|
+
|
129
172
|
# Text extraction used for Word files that are not .docx. Delegates to the
# built-in .doc extraction; a failure is returned as a message string rather
# than raised.
#
# @return [String] extracted text, or an "Unable to extract..." message
def fallback_text_extraction
  begin
    extract_doc_text_builtin
  rescue => error
    "Unable to extract text from Word document: #{error.message}"
  end
end
|
178
|
+
|
179
|
+
# Extracts readable text from a .doc file by scanning its raw bytes.
#
# Primary pass: collect runs of four or more printable ASCII bytes (plus CR
# and LF). If that yields fewer than 50 characters, the alternative method is
# tried as well, and its result is used only when it recovered more text.
# Previously the primary result was discarded unconditionally, so a short
# document could end up as the placeholder message.
#
# @return [String] extracted text, or a placeholder message when nothing
#   readable was found
def extract_doc_text_builtin
  # File.binread already returns a binary (ASCII-8BIT) string.
  content = File.binread(@file_path)

  text_content = content.scan(/[\x20-\x7E\x0A\x0D]{4,}/n)
                        .map { |run| run.gsub(/[\x00-\x1F\x7F-\xFF]/n, ' ').strip }
                        .select { |run| run.length > 3 }

  primary_length = text_content.join(' ').length
  if primary_length < 50
    alternative = extract_doc_alternative_method(content)
    # Keep whichever pass recovered more text.
    text_content = alternative if alternative.join(' ').length > primary_length
  end

  result = text_content.join("\n").strip
  result.empty? ? "Text extracted from .doc file (content may be limited due to complex formatting)" : result
end
|
203
|
+
|
204
|
+
# Second-chance text extraction for .doc bytes.
#
# First interprets the bytes as UTF-16LE and collects runs of five or more
# printable characters. Only if that finds nothing does it scan the raw bytes
# for runs of ten or more ASCII letters, digits, whitespace and basic
# punctuation.
#
# The argument is never modified: both passes work on copies. The previous
# version called force_encoding on the caller's string, which changed its
# encoding as a side effect and raised on a frozen string.
#
# @param content [String] raw file bytes
# @return [Array<String>] unique text segments in order of first appearance
def extract_doc_alternative_method(content)
  decoded = content.dup
                   .force_encoding('UTF-16LE')
                   .encode('UTF-8', invalid: :replace, undef: :replace)

  # Matched runs contain only printable characters, so no extra control
  # character filtering is needed after stripping.
  text_parts = decoded.scan(/[[:print:]]{5,}/)
                      .map(&:strip)
                      .select { |segment| segment.length > 4 }

  # If UTF-16 doesn't work, try scanning for ASCII patterns
  if text_parts.empty?
    text_parts = content.b
                        .scan(/[a-zA-Z0-9\s\.\,\!\?\;\:]{10,}/n)
                        .map(&:strip)
                        .select { |segment| segment.length > 9 }
  end

  text_parts.uniq
end
|
135
225
|
end
|
136
226
|
end
|
137
227
|
end
|
@@ -122,48 +122,72 @@ module UniversalDocumentProcessor
|
|
122
122
|
# Internal helper shared by the ai_* entry points: builds an AIAgent from the
# given options and returns it only if it reports itself available.
#
# @api private
# @param options [Hash] options forwarded to AIAgent.new
# @return [AIAgent]
# @raise [DependencyMissingError] when the agent is not available
def self.build_available_ai_agent(options = {})
  ai_agent = AIAgent.new(options)
  return ai_agent if ai_agent.ai_available?

  raise DependencyMissingError, "AI features require an OpenAI API key. Set OPENAI_API_KEY environment variable or pass api_key in options."
end

# Each ai_* method below checks AI availability BEFORE processing the
# document, so a missing API key fails fast instead of after the document
# has already been parsed.

# Processes the file and asks the agent to analyze it.
#
# @param file_path [String] document to process
# @param options [Hash] processing/agent options; :query is passed to the agent
# @raise [DependencyMissingError] when AI is not available
def self.ai_analyze(file_path, options = {})
  ai_agent = build_available_ai_agent(options)
  document_result = process(file_path, options)
  ai_agent.analyze_document(document_result, options[:query])
end

# Processes the file and asks the agent for a summary.
#
# @param file_path [String] document to process
# @param length [Symbol] summary length passed to the agent (default :medium)
# @param options [Hash] processing/agent options
# @raise [DependencyMissingError] when AI is not available
def self.ai_summarize(file_path, length: :medium, options: {})
  ai_agent = build_available_ai_agent(options)
  document_result = process(file_path, options)
  ai_agent.summarize_document(document_result, length: length)
end

# Processes the file and asks the agent to extract key information.
#
# @param file_path [String] document to process
# @param categories [Object, nil] passed through to the agent
# @param options [Hash] processing/agent options
# @raise [DependencyMissingError] when AI is not available
def self.ai_extract_info(file_path, categories = nil, options = {})
  ai_agent = build_available_ai_agent(options)
  document_result = process(file_path, options)
  ai_agent.extract_key_information(document_result, categories)
end

# Processes the file and asks the agent to translate it.
#
# @param file_path [String] document to process
# @param target_language [Object] passed through to the agent
# @param options [Hash] processing/agent options
# @raise [DependencyMissingError] when AI is not available
def self.ai_translate(file_path, target_language, options = {})
  ai_agent = build_available_ai_agent(options)
  document_result = process(file_path, options)
  ai_agent.translate_document(document_result, target_language)
end

# Processes the file and asks the agent to classify it.
#
# @param file_path [String] document to process
# @param options [Hash] processing/agent options
# @raise [DependencyMissingError] when AI is not available
def self.ai_classify(file_path, options = {})
  ai_agent = build_available_ai_agent(options)
  document_result = process(file_path, options)
  ai_agent.classify_document(document_result)
end

# Processes the file and asks the agent to generate insights.
#
# @param file_path [String] document to process
# @param options [Hash] processing/agent options
# @raise [DependencyMissingError] when AI is not available
def self.ai_insights(file_path, options = {})
  ai_agent = build_available_ai_agent(options)
  document_result = process(file_path, options)
  ai_agent.generate_insights(document_result)
end

# Processes the file and asks the agent to extract action items.
#
# @param file_path [String] document to process
# @param options [Hash] processing/agent options
# @raise [DependencyMissingError] when AI is not available
def self.ai_action_items(file_path, options = {})
  ai_agent = build_available_ai_agent(options)
  document_result = process(file_path, options)
  ai_agent.extract_action_items(document_result)
end

# Processes every file and asks the agent to compare the results.
#
# @param file_paths [Array<String>] documents to process
# @param comparison_type [Symbol] passed through to the agent (default :content)
# @param options [Hash] processing/agent options
# @raise [DependencyMissingError] when AI is not available
def self.ai_compare(file_paths, comparison_type = :content, options = {})
  ai_agent = build_available_ai_agent(options)
  document_results = file_paths.map { |path| process(path, options) }
  ai_agent.compare_documents(document_results, comparison_type)
end
|
169
193
|
|
@@ -171,6 +195,12 @@ module UniversalDocumentProcessor
|
|
171
195
|
AIAgent.new(options)
|
172
196
|
end
|
173
197
|
|
198
|
+
# Reports whether AI features can be used with the given options, as answered
# by a freshly built AIAgent.
#
# @param options [Hash] options forwarded to AIAgent.new
# @return [Object] whatever AIAgent#ai_available? returns
def self.ai_available?(options = {})
  AIAgent.new(options).ai_available?
end
|
203
|
+
|
174
204
|
# Convert document to different format
|
175
205
|
def self.convert(file_path_or_io, target_format, options = {})
|
176
206
|
Document.new(file_path_or_io, options).convert_to(target_format)
|
@@ -207,9 +237,54 @@ module UniversalDocumentProcessor
|
|
207
237
|
end
|
208
238
|
end
|
209
239
|
|
240
|
+
# Optional gems and the version requirement listed for each.
# A new Hash is built on every call.
#
# @return [Hash{String => String}] gem name => version requirement
def self.optional_dependencies
  [
    ['pdf-reader',  '~> 2.0'],  # PDF text extraction
    ['prawn',       '~> 2.4'],  # PDF generation
    ['docx',        '~> 0.8'],  # Word document processing
    ['roo',         '~> 2.8'],  # Excel/Spreadsheet processing
    ['mini_magick', '~> 4.11'], # Image processing
    ['yomu',        '~> 0.2']   # Universal text extraction fallback
  ].to_h
end
|
251
|
+
|
252
|
+
# Names of the optional gems that are not currently available.
#
# Derived from optional_dependencies so the two lists cannot drift apart.
# Each gem name is mapped to the symbol dependency_available? expects by
# replacing '-' with '_' (e.g. 'pdf-reader' => :pdf_reader).
#
# @return [Array<String>] missing gem names, in optional_dependencies order
def self.missing_dependencies
  optional_dependencies.keys.reject do |gem_name|
    dependency_available?(gem_name.tr('-', '_').to_sym)
  end
end
|
263
|
+
|
264
|
+
# Builds human-readable install instructions for every missing optional gem:
# a `gem install` line per gem, then the equivalent Gemfile entries.
#
# The version table is fetched once instead of once per gem per loop.
# NOTE(review): the leading whitespace of the generated gem lines is kept
# exactly as it appears in the reviewed text — confirm it matches the
# intended indentation.
#
# @return [String] the instructions, or a confirmation message when nothing
#   is missing
def self.installation_instructions
  missing = missing_dependencies
  return "All optional dependencies are installed!" if missing.empty?

  versions = optional_dependencies
  install_lines = missing.map { |gem_name| " gem install #{gem_name} -v '#{versions[gem_name]}'" }
  gemfile_lines = missing.map { |gem_name| " gem '#{gem_name}', '#{versions[gem_name]}'" }

  [
    "To enable additional features, install these optional gems:",
    *install_lines,
    "",
    "Or add to your Gemfile:",
    *gemfile_lines
  ].join("\n")
end
|
284
|
+
|
210
285
|
# Get list of available features based on installed dependencies
|
211
286
|
def self.available_features
|
212
|
-
features = [:text_processing, :html_processing, :xml_processing, :csv_processing, :json_processing, :archive_processing]
|
287
|
+
features = [:text_processing, :html_processing, :xml_processing, :csv_processing, :json_processing, :archive_processing, :tsv_processing]
|
213
288
|
|
214
289
|
features << :pdf_processing if dependency_available?(:pdf_reader)
|
215
290
|
features << :word_processing if dependency_available?(:docx)
|
@@ -218,6 +293,14 @@ module UniversalDocumentProcessor
|
|
218
293
|
features << :universal_text_extraction if dependency_available?(:yomu)
|
219
294
|
features << :pdf_generation if dependency_available?(:prawn)
|
220
295
|
|
296
|
+
# Check AI availability without creating circular dependency
|
297
|
+
begin
|
298
|
+
ai_agent = AIAgent.new
|
299
|
+
features << :ai_processing if ai_agent.ai_enabled
|
300
|
+
rescue
|
301
|
+
# AI not available
|
302
|
+
end
|
303
|
+
|
221
304
|
features
|
222
305
|
end
|
223
306
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: universal_document_processor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.1
|
4
|
+
version: 1.0.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Vikas Patil
|
@@ -66,19 +66,47 @@ dependencies:
|
|
66
66
|
- !ruby/object:Gem::Version
|
67
67
|
version: '2.3'
|
68
68
|
- !ruby/object:Gem::Dependency
|
69
|
-
name:
|
69
|
+
name: rexml
|
70
70
|
requirement: !ruby/object:Gem::Requirement
|
71
71
|
requirements:
|
72
72
|
- - "~>"
|
73
73
|
- !ruby/object:Gem::Version
|
74
|
-
version: '3.
|
74
|
+
version: '3.2'
|
75
|
+
type: :runtime
|
76
|
+
prerelease: false
|
77
|
+
version_requirements: !ruby/object:Gem::Requirement
|
78
|
+
requirements:
|
79
|
+
- - "~>"
|
80
|
+
- !ruby/object:Gem::Version
|
81
|
+
version: '3.2'
|
82
|
+
- !ruby/object:Gem::Dependency
|
83
|
+
name: minitest
|
84
|
+
requirement: !ruby/object:Gem::Requirement
|
85
|
+
requirements:
|
86
|
+
- - "~>"
|
87
|
+
- !ruby/object:Gem::Version
|
88
|
+
version: '5.0'
|
75
89
|
type: :development
|
76
90
|
prerelease: false
|
77
91
|
version_requirements: !ruby/object:Gem::Requirement
|
78
92
|
requirements:
|
79
93
|
- - "~>"
|
80
94
|
- !ruby/object:Gem::Version
|
81
|
-
version: '
|
95
|
+
version: '5.0'
|
96
|
+
- !ruby/object:Gem::Dependency
|
97
|
+
name: minitest-reporters
|
98
|
+
requirement: !ruby/object:Gem::Requirement
|
99
|
+
requirements:
|
100
|
+
- - "~>"
|
101
|
+
- !ruby/object:Gem::Version
|
102
|
+
version: '1.0'
|
103
|
+
type: :development
|
104
|
+
prerelease: false
|
105
|
+
version_requirements: !ruby/object:Gem::Requirement
|
106
|
+
requirements:
|
107
|
+
- - "~>"
|
108
|
+
- !ruby/object:Gem::Version
|
109
|
+
version: '1.0'
|
82
110
|
- !ruby/object:Gem::Dependency
|
83
111
|
name: rake
|
84
112
|
requirement: !ruby/object:Gem::Requirement
|
@@ -135,6 +163,20 @@ dependencies:
|
|
135
163
|
- - "~>"
|
136
164
|
- !ruby/object:Gem::Version
|
137
165
|
version: '2.0'
|
166
|
+
- !ruby/object:Gem::Dependency
|
167
|
+
name: simplecov
|
168
|
+
requirement: !ruby/object:Gem::Requirement
|
169
|
+
requirements:
|
170
|
+
- - "~>"
|
171
|
+
- !ruby/object:Gem::Version
|
172
|
+
version: '0.22'
|
173
|
+
type: :development
|
174
|
+
prerelease: false
|
175
|
+
version_requirements: !ruby/object:Gem::Requirement
|
176
|
+
requirements:
|
177
|
+
- - "~>"
|
178
|
+
- !ruby/object:Gem::Version
|
179
|
+
version: '0.22'
|
138
180
|
description: A comprehensive Ruby gem that handles document processing, text extraction,
|
139
181
|
and AI-powered analysis for PDF, Word, Excel, PowerPoint, images, archives, and
|
140
182
|
more with a unified API. Includes agentic AI features for document analysis, summarization,
|
@@ -145,9 +187,7 @@ executables: []
|
|
145
187
|
extensions: []
|
146
188
|
extra_rdoc_files: []
|
147
189
|
files:
|
148
|
-
- AI_USAGE_GUIDE.md
|
149
190
|
- CHANGELOG.md
|
150
|
-
- GEM_RELEASE_GUIDE.md
|
151
191
|
- Gemfile
|
152
192
|
- LICENSE
|
153
193
|
- README.md
|
@@ -178,6 +218,8 @@ metadata:
|
|
178
218
|
documentation_uri: https://github.com/vpatil160/universal_document_processor/blob/main/README.md
|
179
219
|
funding_uri: https://github.com/sponsors/vpatil160
|
180
220
|
rubygems_mfa_required: 'true'
|
221
|
+
optional_dependencies: pdf-reader ~> 2.0, prawn ~> 2.4, docx ~> 0.8, roo ~> 2.8,
|
222
|
+
mini_magick ~> 4.11, yomu ~> 0.2
|
181
223
|
rdoc_options: []
|
182
224
|
require_paths:
|
183
225
|
- lib
|