universal_document_processor 1.0.1 → 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,6 +2,8 @@ module UniversalDocumentProcessor
2
2
  module Processors
3
3
  class PdfProcessor < BaseProcessor
4
4
  def extract_text
5
+ ensure_pdf_reader_available!
6
+
5
7
  with_error_handling do
6
8
  reader = PDF::Reader.new(@file_path)
7
9
  text = reader.pages.map(&:text).join("\n")
@@ -10,6 +12,8 @@ module UniversalDocumentProcessor
10
12
  end
11
13
 
12
14
  def extract_metadata
15
+ ensure_pdf_reader_available!
16
+
13
17
  with_error_handling do
14
18
  reader = PDF::Reader.new(@file_path)
15
19
  info = reader.info || {}
@@ -32,6 +36,8 @@ module UniversalDocumentProcessor
32
36
  end
33
37
 
34
38
  def extract_images
39
+ ensure_pdf_reader_available!
40
+
35
41
  with_error_handling do
36
42
  # Extract embedded images from PDF
37
43
  images = []
@@ -57,6 +63,8 @@ module UniversalDocumentProcessor
57
63
  end
58
64
 
59
65
  def extract_tables
66
+ ensure_pdf_reader_available!
67
+
60
68
  with_error_handling do
61
69
  # Basic table extraction from PDF text
62
70
  tables = []
@@ -87,6 +95,12 @@ module UniversalDocumentProcessor
87
95
 
88
96
  private
89
97
 
98
# Guard called at the top of the PDF extraction methods: confirms that the
# PDF::Reader constant is defined before any PDF work is attempted.
#
# @return [nil] when PDF::Reader is defined
# @raise [DependencyMissingError] when PDF::Reader is not defined
def ensure_pdf_reader_available!
  return if defined?(PDF::Reader)

  raise DependencyMissingError, "PDF processing requires the 'pdf-reader' gem. Install it with: gem install pdf-reader -v '~> 2.0'"
end
103
+
90
104
  def extract_form_fields(reader)
91
105
  # Extract PDF form fields if present
92
106
  []
@@ -5,8 +5,11 @@ module UniversalDocumentProcessor
5
5
  with_error_handling do
6
6
  if @file_path.end_with?('.docx')
7
7
  extract_docx_text
8
+ elsif @file_path.end_with?('.doc')
9
+ # Built-in .doc file processing
10
+ fallback_text_extraction
8
11
  else
9
- # Fallback for .doc files
12
+ # Handle other Word formats
10
13
  fallback_text_extraction
11
14
  end
12
15
  end
@@ -16,6 +19,8 @@ module UniversalDocumentProcessor
16
19
  with_error_handling do
17
20
  if @file_path.end_with?('.docx')
18
21
  extract_docx_metadata
22
+ elsif @file_path.end_with?('.doc')
23
+ extract_doc_metadata
19
24
  else
20
25
  super
21
26
  end
@@ -25,6 +30,7 @@ module UniversalDocumentProcessor
25
30
  def extract_images
26
31
  with_error_handling do
27
32
  return [] unless @file_path.end_with?('.docx')
33
+ ensure_docx_available!
28
34
 
29
35
  images = []
30
36
  doc = Docx::Document.open(@file_path)
@@ -48,6 +54,7 @@ module UniversalDocumentProcessor
48
54
  def extract_tables
49
55
  with_error_handling do
50
56
  return [] unless @file_path.end_with?('.docx')
57
+ ensure_docx_available!
51
58
 
52
59
  tables = []
53
60
  doc = Docx::Document.open(@file_path)
@@ -73,12 +80,25 @@ module UniversalDocumentProcessor
73
80
  end
74
81
 
75
82
  def supported_operations
76
- super + [:extract_images, :extract_tables, :extract_styles, :extract_comments]
83
+ if @file_path.end_with?('.docx')
84
+ super + [:extract_images, :extract_tables, :extract_styles, :extract_comments]
85
+ else
86
+ # .doc files support basic text and metadata extraction
87
+ super + [:extract_basic_formatting]
88
+ end
77
89
  end
78
90
 
79
91
  private
80
92
 
93
# Guard called before any Docx::Document access: confirms that the Docx
# constant is defined.
#
# @return [nil] when Docx is defined
# @raise [DependencyMissingError] when Docx is not defined
def ensure_docx_available!
  return if defined?(Docx)

  raise DependencyMissingError, "DOCX processing requires the 'docx' gem. Install it with: gem install docx -v '~> 0.8'"
end
98
+
81
99
  def extract_docx_text
100
+ ensure_docx_available!
101
+
82
102
  doc = Docx::Document.open(@file_path)
83
103
  text_content = []
84
104
 
@@ -99,6 +119,8 @@ module UniversalDocumentProcessor
99
119
  end
100
120
 
101
121
  def extract_docx_metadata
122
+ ensure_docx_available!
123
+
102
124
  doc = Docx::Document.open(@file_path)
103
125
  core_properties = doc.core_properties
104
126
 
@@ -126,12 +148,80 @@ module UniversalDocumentProcessor
126
148
  0
127
149
  end
128
150
 
151
# Builds metadata for a legacy .doc file: the parent class's generic metadata
# merged with word/character counts of the built-in text extraction and
# File.stat details.
#
# A bare `super` cannot be used here: inside a method named
# `extract_doc_metadata` it dispatches to a superclass method of that same
# name, not to the parent's `extract_metadata`, and raises NoMethodError when
# no such method exists. The parent's `extract_metadata` is therefore looked
# up explicitly via Method#super_method.
# NOTE(review): assumes the superclass does not define its own
# `extract_doc_metadata`, and that `extract_metadata` is not overridden again
# in a subclass of this processor — confirm against BaseProcessor.
#
# @return [Hash] parent metadata plus :format and either the extraction
#   details or, when stat/extraction raises, :extraction_error
def extract_doc_metadata
  parent_impl = method(:extract_metadata).super_method
  base_metadata = parent_impl ? parent_impl.call : {}

  begin
    file_stats = File.stat(@file_path)
    extracted_text = extract_doc_text_builtin

    base_metadata.merge({
      format: 'Microsoft Word Document (.doc)',
      word_count: count_words(extracted_text),
      character_count: extracted_text.length,
      # File::Stat#ctime is the inode change time on Unix, not a true creation time.
      created_at: file_stats.ctime,
      modified_at: file_stats.mtime,
      file_size: file_stats.size,
      extraction_method: 'Built-in binary parsing'
    })
  rescue => e
    # Best effort: still return the generic metadata and report what failed.
    base_metadata.merge({
      format: 'Microsoft Word Document (.doc)',
      extraction_error: e.message
    })
  end
end
171
+
129
172
  def fallback_text_extraction
130
- # Use Yomu for .doc files or as fallback
131
- Yomu.new(@file_path).text
173
+ # Built-in .doc file text extraction
174
+ extract_doc_text_builtin
132
175
  rescue => e
133
176
  "Unable to extract text from Word document: #{e.message}"
134
177
  end
178
+
179
# Heuristically recovers readable text from a .doc file by scanning its raw
# bytes; this is a simplified extraction, not a parser for the .doc format.
#
# Pass 1 collects runs of four or more printable ASCII bytes (CR/LF allowed).
# When that yields fewer than 50 characters in total, the alternative scan is
# also tried, and its result is used only if it recovered MORE text than
# pass 1. The first-pass text is never thrown away in favour of a shorter or
# empty alternative result.
#
# @return [String] the recovered text joined with newlines, or a fixed
#   placeholder sentence when nothing readable was found
def extract_doc_text_builtin
  # File.binread already returns a binary (ASCII-8BIT) string.
  raw = File.binread(@file_path)

  text_content = raw.scan(/[\x20-\x7E\x0A\x0D]{4,}/)
                    .map { |run| run.tr("\r\n", ' ').strip } # line breaks inside a run become spaces
                    .select { |run| run.length > 3 }

  if text_content.join(' ').length < 50
    alternative = extract_doc_alternative_method(raw)
    text_content = alternative if alternative.join(' ').length > text_content.join(' ').length
  end

  result = text_content.join("\n").strip
  result.empty? ? "Text extracted from .doc file (content may be limited due to complex formatting)" : result
end

# Alternative scan for text that is not stored as plain ASCII runs.
#
# First reinterprets the bytes as UTF-16LE (ASCII text in that encoding has a
# null byte after every character, which the plain ASCII scan cannot see) and
# keeps runs of five or more printable characters. If that finds nothing, it
# falls back to runs of ten or more ASCII letters, digits, whitespace and
# basic punctuation.
#
# The argument is not modified: both scans work on copies, so the caller's
# string keeps its encoding.
#
# @param content [String] raw file bytes
# @return [Array<String>] unique text segments in order of first appearance
def extract_doc_alternative_method(content)
  decoded = content.dup.force_encoding('UTF-16LE').encode('UTF-8', invalid: :replace, undef: :replace)
  text_parts = decoded.scan(/[[:print:]]{5,}/m)
                      .map(&:strip)
                      .select { |segment| segment.length > 4 }

  if text_parts.empty?
    text_parts = content.b.scan(/[a-zA-Z0-9\s\.\,\!\?\;\:]{10,}/n)
                        .map(&:strip)
                        .select { |segment| segment.length > 9 }
  end

  text_parts.uniq
end
135
225
  end
136
226
  end
137
227
  end
@@ -15,6 +15,7 @@ module UniversalDocumentProcessor
15
15
  'htm' => 'text/html',
16
16
  'xml' => 'application/xml',
17
17
  'csv' => 'text/csv',
18
+ 'tsv' => 'text/tab-separated-values',
18
19
  'json' => 'application/json',
19
20
  'jpg' => 'image/jpeg',
20
21
  'jpeg' => 'image/jpeg',
@@ -1,3 +1,3 @@
1
1
  module UniversalDocumentProcessor
2
- VERSION = "1.0.1"
2
+ VERSION = "1.0.3"
3
3
  end
@@ -122,48 +122,72 @@ module UniversalDocumentProcessor
122
122
  def self.ai_analyze(file_path, options = {})
123
123
  document_result = process(file_path, options)
124
124
  ai_agent = AIAgent.new(options)
125
+ unless ai_agent.ai_available?
126
+ raise DependencyMissingError, "AI features require an OpenAI API key. Set OPENAI_API_KEY environment variable or pass api_key in options."
127
+ end
125
128
  ai_agent.analyze_document(document_result, options[:query])
126
129
  end
127
130
 
128
131
  def self.ai_summarize(file_path, length: :medium, options: {})
129
132
  document_result = process(file_path, options)
130
133
  ai_agent = AIAgent.new(options)
134
+ unless ai_agent.ai_available?
135
+ raise DependencyMissingError, "AI features require an OpenAI API key. Set OPENAI_API_KEY environment variable or pass api_key in options."
136
+ end
131
137
  ai_agent.summarize_document(document_result, length: length)
132
138
  end
133
139
 
134
140
  def self.ai_extract_info(file_path, categories = nil, options = {})
135
141
  document_result = process(file_path, options)
136
142
  ai_agent = AIAgent.new(options)
143
+ unless ai_agent.ai_available?
144
+ raise DependencyMissingError, "AI features require an OpenAI API key. Set OPENAI_API_KEY environment variable or pass api_key in options."
145
+ end
137
146
  ai_agent.extract_key_information(document_result, categories)
138
147
  end
139
148
 
140
149
  def self.ai_translate(file_path, target_language, options = {})
141
150
  document_result = process(file_path, options)
142
151
  ai_agent = AIAgent.new(options)
152
+ unless ai_agent.ai_available?
153
+ raise DependencyMissingError, "AI features require an OpenAI API key. Set OPENAI_API_KEY environment variable or pass api_key in options."
154
+ end
143
155
  ai_agent.translate_document(document_result, target_language)
144
156
  end
145
157
 
146
158
  def self.ai_classify(file_path, options = {})
147
159
  document_result = process(file_path, options)
148
160
  ai_agent = AIAgent.new(options)
161
+ unless ai_agent.ai_available?
162
+ raise DependencyMissingError, "AI features require an OpenAI API key. Set OPENAI_API_KEY environment variable or pass api_key in options."
163
+ end
149
164
  ai_agent.classify_document(document_result)
150
165
  end
151
166
 
152
167
  def self.ai_insights(file_path, options = {})
153
168
  document_result = process(file_path, options)
154
169
  ai_agent = AIAgent.new(options)
170
+ unless ai_agent.ai_available?
171
+ raise DependencyMissingError, "AI features require an OpenAI API key. Set OPENAI_API_KEY environment variable or pass api_key in options."
172
+ end
155
173
  ai_agent.generate_insights(document_result)
156
174
  end
157
175
 
158
176
  def self.ai_action_items(file_path, options = {})
159
177
  document_result = process(file_path, options)
160
178
  ai_agent = AIAgent.new(options)
179
+ unless ai_agent.ai_available?
180
+ raise DependencyMissingError, "AI features require an OpenAI API key. Set OPENAI_API_KEY environment variable or pass api_key in options."
181
+ end
161
182
  ai_agent.extract_action_items(document_result)
162
183
  end
163
184
 
164
185
  def self.ai_compare(file_paths, comparison_type = :content, options = {})
165
186
  document_results = file_paths.map { |path| process(path, options) }
166
187
  ai_agent = AIAgent.new(options)
188
+ unless ai_agent.ai_available?
189
+ raise DependencyMissingError, "AI features require an OpenAI API key. Set OPENAI_API_KEY environment variable or pass api_key in options."
190
+ end
167
191
  ai_agent.compare_documents(document_results, comparison_type)
168
192
  end
169
193
 
@@ -171,6 +195,12 @@ module UniversalDocumentProcessor
171
195
  AIAgent.new(options)
172
196
  end
173
197
 
198
+ # Check if AI features are available
199
# Reports whether AI features can be used with the given options.
#
# @param options [Hash] passed to AIAgent.new
# @return whatever the freshly built agent's ai_available? returns
def self.ai_available?(options = {})
  AIAgent.new(options).ai_available?
end
203
+
174
204
  # Convert document to different format
175
205
  def self.convert(file_path_or_io, target_format, options = {})
176
206
  Document.new(file_path_or_io, options).convert_to(target_format)
@@ -207,9 +237,54 @@ module UniversalDocumentProcessor
207
237
  end
208
238
  end
209
239
 
240
+ # Get list of optional dependencies
241
# Optional gems and the version constraint suggested for each.
#
# @return [Hash{String => String}] gem name => version requirement; a new
#   hash is built on every call
def self.optional_dependencies
  [
    ['pdf-reader', '~> 2.0'],   # PDF text extraction
    ['prawn', '~> 2.4'],        # PDF generation
    ['docx', '~> 0.8'],         # Word document processing
    ['roo', '~> 2.8'],          # Excel/Spreadsheet processing
    ['mini_magick', '~> 4.11'], # Image processing
    ['yomu', '~> 0.2']          # Universal text extraction fallback
  ].to_h
end
251
+
252
+ # Check which optional dependencies are missing
253
# Names of the optional gems that dependency_available? reports as absent.
#
# Each gem name is paired with the symbol handed to dependency_available?;
# the gems are checked, and reported, in the order listed here.
#
# @return [Array<String>] gem names, empty when every check succeeds
def self.missing_dependencies
  {
    'pdf-reader' => :pdf_reader,
    'prawn' => :prawn,
    'docx' => :docx,
    'roo' => :roo,
    'mini_magick' => :mini_magick,
    'yomu' => :yomu
  }.each_with_object([]) do |(gem_name, feature), absent|
    absent << gem_name unless dependency_available?(feature)
  end
end
263
+
264
+ # Generate installation instructions for missing dependencies
265
# Builds human-readable install hints for the optional gems reported by
# missing_dependencies, using the version constraints from
# optional_dependencies.
#
# @return [String] a fixed confirmation sentence when nothing is missing;
#   otherwise a newline-joined text with one `gem install` line and one
#   Gemfile line per missing gem
def self.installation_instructions
  absent = missing_dependencies
  return "All optional dependencies are installed!" if absent.empty?

  install_lines = absent.map do |gem_name|
    " gem install #{gem_name} -v '#{optional_dependencies[gem_name]}'"
  end
  gemfile_lines = absent.map do |gem_name|
    " gem '#{gem_name}', '#{optional_dependencies[gem_name]}'"
  end

  [
    "To enable additional features, install these optional gems:",
    *install_lines,
    "",
    "Or add to your Gemfile:",
    *gemfile_lines
  ].join("\n")
end
284
+
210
285
  # Get list of available features based on installed dependencies
211
286
  def self.available_features
212
- features = [:text_processing, :html_processing, :xml_processing, :csv_processing, :json_processing, :archive_processing]
287
+ features = [:text_processing, :html_processing, :xml_processing, :csv_processing, :json_processing, :archive_processing, :tsv_processing]
213
288
 
214
289
  features << :pdf_processing if dependency_available?(:pdf_reader)
215
290
  features << :word_processing if dependency_available?(:docx)
@@ -218,6 +293,14 @@ module UniversalDocumentProcessor
218
293
  features << :universal_text_extraction if dependency_available?(:yomu)
219
294
  features << :pdf_generation if dependency_available?(:prawn)
220
295
 
296
+ # Check AI availability without creating circular dependency
297
+ begin
298
+ ai_agent = AIAgent.new
299
+ features << :ai_processing if ai_agent.ai_enabled
300
+ rescue
301
+ # AI not available
302
+ end
303
+
221
304
  features
222
305
  end
223
306
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: universal_document_processor
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.1
4
+ version: 1.0.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Vikas Patil
@@ -66,19 +66,47 @@ dependencies:
66
66
  - !ruby/object:Gem::Version
67
67
  version: '2.3'
68
68
  - !ruby/object:Gem::Dependency
69
- name: rspec
69
+ name: rexml
70
70
  requirement: !ruby/object:Gem::Requirement
71
71
  requirements:
72
72
  - - "~>"
73
73
  - !ruby/object:Gem::Version
74
- version: '3.12'
74
+ version: '3.2'
75
+ type: :runtime
76
+ prerelease: false
77
+ version_requirements: !ruby/object:Gem::Requirement
78
+ requirements:
79
+ - - "~>"
80
+ - !ruby/object:Gem::Version
81
+ version: '3.2'
82
+ - !ruby/object:Gem::Dependency
83
+ name: minitest
84
+ requirement: !ruby/object:Gem::Requirement
85
+ requirements:
86
+ - - "~>"
87
+ - !ruby/object:Gem::Version
88
+ version: '5.0'
75
89
  type: :development
76
90
  prerelease: false
77
91
  version_requirements: !ruby/object:Gem::Requirement
78
92
  requirements:
79
93
  - - "~>"
80
94
  - !ruby/object:Gem::Version
81
- version: '3.12'
95
+ version: '5.0'
96
+ - !ruby/object:Gem::Dependency
97
+ name: minitest-reporters
98
+ requirement: !ruby/object:Gem::Requirement
99
+ requirements:
100
+ - - "~>"
101
+ - !ruby/object:Gem::Version
102
+ version: '1.0'
103
+ type: :development
104
+ prerelease: false
105
+ version_requirements: !ruby/object:Gem::Requirement
106
+ requirements:
107
+ - - "~>"
108
+ - !ruby/object:Gem::Version
109
+ version: '1.0'
82
110
  - !ruby/object:Gem::Dependency
83
111
  name: rake
84
112
  requirement: !ruby/object:Gem::Requirement
@@ -135,6 +163,20 @@ dependencies:
135
163
  - - "~>"
136
164
  - !ruby/object:Gem::Version
137
165
  version: '2.0'
166
+ - !ruby/object:Gem::Dependency
167
+ name: simplecov
168
+ requirement: !ruby/object:Gem::Requirement
169
+ requirements:
170
+ - - "~>"
171
+ - !ruby/object:Gem::Version
172
+ version: '0.22'
173
+ type: :development
174
+ prerelease: false
175
+ version_requirements: !ruby/object:Gem::Requirement
176
+ requirements:
177
+ - - "~>"
178
+ - !ruby/object:Gem::Version
179
+ version: '0.22'
138
180
  description: A comprehensive Ruby gem that handles document processing, text extraction,
139
181
  and AI-powered analysis for PDF, Word, Excel, PowerPoint, images, archives, and
140
182
  more with a unified API. Includes agentic AI features for document analysis, summarization,
@@ -145,9 +187,7 @@ executables: []
145
187
  extensions: []
146
188
  extra_rdoc_files: []
147
189
  files:
148
- - AI_USAGE_GUIDE.md
149
190
  - CHANGELOG.md
150
- - GEM_RELEASE_GUIDE.md
151
191
  - Gemfile
152
192
  - LICENSE
153
193
  - README.md
@@ -178,6 +218,8 @@ metadata:
178
218
  documentation_uri: https://github.com/vpatil160/universal_document_processor/blob/main/README.md
179
219
  funding_uri: https://github.com/sponsors/vpatil160
180
220
  rubygems_mfa_required: 'true'
221
+ optional_dependencies: pdf-reader ~> 2.0, prawn ~> 2.4, docx ~> 0.8, roo ~> 2.8,
222
+ mini_magick ~> 4.11, yomu ~> 0.2
181
223
  rdoc_options: []
182
224
  require_paths:
183
225
  - lib