universal_document_processor 1.1.0 → 1.1.1

This diff shows the changes between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 8444e9dc03cd125a0a9e62df6370b7dbba4adf4777d89478d1d51c60f5c83d70
4
- data.tar.gz: 3a5fc000774c34683c7b0d95c0ca9a034838cd5e69be51c43c98792070b278aa
3
+ metadata.gz: '0612949a026d62fd8fd9c9c1372cfa70cdeb8bdd1677475be639cf35cd684f4c'
4
+ data.tar.gz: 82780d2c062034be663b3d21275e9d27addc1e44f5705de7dc6b23e70293216e
5
5
  SHA512:
6
- metadata.gz: be66ce6b411fcfa52eaf6353ef1f37785cbe996c8c977bb671c971988c43a500108004741b157ca7636d9e2f20991c43b44884134c3ffd4fde2f6b4a90d27380
7
- data.tar.gz: 94e90e17615093e529db4100d674944150d0cbbcd527df3341b8c025ada3fe389dc8dbc64099b1cfa87473105c57038ca1fb5106281242b8658e583042cc52c9
6
+ metadata.gz: 07a4fe1b792226dae8135e6620f455640d0ba137777b238052916d06a0b4f32b113414886479e0ad48b76ead57d9b7d6a577a76748dff926a9575ad124dc7ee5
7
+ data.tar.gz: fd3b8fb692f87755a657eb1270631c19ca99bcb5fcbb224e4ebcc22c1b19af1eb2aa3b2aed6ff3f94f84a6fdba8d5e255b28575d5fe1b074244e4bc160820f33
data/README.md CHANGED
@@ -219,6 +219,33 @@ puts "Tables found: #{result[:tables].length}"
219
219
  full_text = result[:text_content]
220
220
  ```
221
221
 
222
+ ### Creating PDF Documents
223
+
224
+ ```ruby
225
+ # Install Prawn for PDF creation (optional dependency)
226
+ # gem install prawn
227
+
228
+ # Create PDF from any supported document format
229
+ pdf_path = UniversalDocumentProcessor.create_pdf('document.docx')
230
+ puts "PDF created at: #{pdf_path}"
231
+
232
+ # Or use the convert method
233
+ pdf_path = UniversalDocumentProcessor.convert('spreadsheet.xlsx', :pdf)
234
+
235
+ # Check if PDF creation is available
236
+ if UniversalDocumentProcessor.pdf_creation_available?
237
+ puts "PDF creation is available!"
238
+ else
239
+ puts "Install 'prawn' gem to enable PDF creation: gem install prawn"
240
+ end
241
+
242
+ # The created PDF includes:
243
+ # - Document title and metadata
244
+ # - Full text content with formatting
245
+ # - Tables (if present in original document)
246
+ # - File information and statistics
247
+ ```
248
+
222
249
  ### Processing Excel Spreadsheets
223
250
 
224
251
  ```ruby
@@ -413,6 +440,89 @@ summary = japanese_doc.ai_summarize(length: :medium)
413
440
 
414
441
  ```ruby
415
442
  # Custom AI agent configuration
443
+ ## ⚙️ Agentic AI Configuration & Usage
444
+
445
+ To enable and use the AI-powered features (agentic AI) in your application, follow these steps:
446
+
447
+ ### 1. Install AI Dependency
448
+
449
+ You need the `ruby-openai` gem for AI features:
450
+
451
+ ```bash
452
+ gem install ruby-openai
453
+ ```
454
+
455
+ Or add to your Gemfile:
456
+
457
+ ```ruby
458
+ gem 'ruby-openai'
459
+ ```
460
+
461
+ Then run:
462
+
463
+ ```bash
464
+ bundle install
465
+ ```
466
+
467
+ ### 2. Set Your OpenAI API Key
468
+
469
+ You must provide your OpenAI API key for agentic AI features to work. You can do this in two ways:
470
+
471
+ #### a) Environment Variable (Recommended)
472
+
473
+ Set the API key in your environment (e.g., in `.env`, `application.yml`, or your deployment environment):
474
+
475
+ ```ruby
476
+ ENV['OPENAI_API_KEY'] = 'your-api-key-here'
477
+ ```
478
+
479
+ #### b) Pass Directly When Creating the Agent
480
+
481
+ ```ruby
482
+ agent = UniversalDocumentProcessor.create_ai_agent(api_key: 'your-api-key-here')
483
+ ```
484
+
485
+ ### 3. Rails: Where to Configure
486
+
487
+ If you are using Rails, add your configuration to:
488
+
489
+ `config/initializers/universal_document_processor.rb`
490
+
491
+ Example initializer:
492
+
493
+ ```ruby
494
+ # config/initializers/universal_document_processor.rb
495
+ require 'universal_document_processor'
496
+
497
+ # Set your API key (or use ENV)
498
+ ENV['OPENAI_API_KEY'] ||= 'your-api-key-here' # (or use Rails credentials)
499
+
500
+ # Optionally, create a default agent with custom options
501
+ UniversalDocumentProcessor.create_ai_agent(
502
+ model: 'gpt-4',
503
+ temperature: 0.7,
504
+ max_history: 10
505
+ )
506
+
507
+ Rails.logger.info "Universal Document Processor with AI agent loaded" if defined?(Rails)
508
+ ```
509
+
510
+ ### 4. Using Agentic AI Features
511
+
512
+ You can now use the AI-powered methods:
513
+
514
+ ```ruby
515
+ summary = UniversalDocumentProcessor.ai_summarize('document.pdf', length: :short)
516
+ insights = UniversalDocumentProcessor.ai_insights('document.pdf')
517
+ classification = UniversalDocumentProcessor.ai_classify('document.pdf')
518
+ key_info = UniversalDocumentProcessor.ai_extract_info('document.pdf', ['dates', 'names', 'amounts'])
519
+ action_items = UniversalDocumentProcessor.ai_action_items('document.pdf')
520
+ translation = UniversalDocumentProcessor.ai_translate('日本語文書.pdf', 'English')
521
+ ```
522
+
523
+ Or create and use a persistent agent:
524
+
525
+ ```ruby
416
526
  agent = UniversalDocumentProcessor.create_ai_agent(
417
527
  api_key: 'your-openai-key', # OpenAI API key
418
528
  model: 'gpt-4', # Model to use (gpt-4, gpt-3.5-turbo)
@@ -420,6 +530,17 @@ agent = UniversalDocumentProcessor.create_ai_agent(
420
530
  max_history: 20, # Conversation memory length
421
531
  base_url: 'https://api.openai.com/v1' # Custom API endpoint
422
532
  )
533
+
534
+ # Analyze a document with the agent
535
+ response = agent.analyze_document('report.pdf')
536
+ ```
537
+
538
+ ---
539
+
540
+ **Note:**
541
+ - The API key is required for all AI features.
542
+ - You can override the model, temperature, and other options per agent.
543
+ - For more, see the `USER_GUIDE.md` and the examples above.
423
544
  ```
424
545
 
425
546
  ## 📦 Archive Processing (ZIP Creation & Extraction)
@@ -857,7 +978,7 @@ bundle exec rspec
857
978
 
858
979
  ## 📝 Changelog
859
980
 
860
- ### Version 1.0.0
981
+ ### Version 1.1.0
861
982
  - Initial release
862
983
  - Support for PDF, Word, Excel, PowerPoint, images, archives
863
984
  - Character validation and cleaning
@@ -14,16 +14,16 @@ module UniversalDocumentProcessor
14
14
  @max_history = options[:max_history] || 10
15
15
  @temperature = options[:temperature] || 0.7
16
16
  @ai_enabled = false
17
-
17
+
18
18
  validate_configuration
19
19
  end
20
20
 
21
21
  # Main document analysis with AI
22
22
  def analyze_document(document_result, query = nil)
23
23
  ensure_ai_available!
24
-
24
+
25
25
  context = build_document_context(document_result)
26
-
26
+
27
27
  if query
28
28
  # Specific query about the document
29
29
  analyze_with_query(context, query)
@@ -67,12 +67,12 @@ Please provide:
67
67
  # Ask specific questions about a document
68
68
  def ask_document_question(document_result, question)
69
69
  ensure_ai_available!
70
-
70
+
71
71
  context = build_document_context(document_result)
72
-
72
+
73
73
  prompt = build_question_prompt(context, question)
74
74
  response = call_openai_api(prompt)
75
-
75
+
76
76
  add_to_history(question, response)
77
77
  response
78
78
  end
@@ -80,19 +80,19 @@ Please provide:
80
80
  # Summarize document content
81
81
  def summarize_document(document_result, length: :medium)
82
82
  ensure_ai_available!
83
-
83
+
84
84
  context = build_document_context(document_result)
85
-
85
+
86
86
  length_instruction = case length
87
87
  when :short then "in 2-3 sentences"
88
88
  when :medium then "in 1-2 paragraphs"
89
89
  when :long then "in detail with key points"
90
90
  else "concisely"
91
91
  end
92
-
92
+
93
93
  prompt = build_summary_prompt(context, length_instruction)
94
94
  response = call_openai_api(prompt)
95
-
95
+
96
96
  add_to_history("Summarize document #{length_instruction}", response)
97
97
  response
98
98
  end
@@ -100,13 +100,13 @@ Please provide:
100
100
  # Extract key information from document
101
101
  def extract_key_information(document_result, categories = nil)
102
102
  ensure_ai_available!
103
-
103
+
104
104
  context = build_document_context(document_result)
105
105
  categories ||= ['key_facts', 'important_dates', 'names', 'locations', 'numbers']
106
-
106
+
107
107
  prompt = build_extraction_prompt(context, categories)
108
108
  response = call_openai_api(prompt)
109
-
109
+
110
110
  add_to_history("Extract key information: #{categories.join(', ')}", response)
111
111
  parse_extraction_response(response)
112
112
  end
@@ -114,12 +114,12 @@ Please provide:
114
114
  # Translate document content
115
115
  def translate_document(document_result, target_language)
116
116
  ensure_ai_available!
117
-
117
+
118
118
  context = build_document_context(document_result)
119
-
119
+
120
120
  prompt = build_translation_prompt(context, target_language)
121
121
  response = call_openai_api(prompt)
122
-
122
+
123
123
  add_to_history("Translate to #{target_language}", response)
124
124
  response
125
125
  end
@@ -127,12 +127,12 @@ Please provide:
127
127
  # Generate document insights and recommendations
128
128
  def generate_insights(document_result)
129
129
  ensure_ai_available!
130
-
130
+
131
131
  context = build_document_context(document_result)
132
-
132
+
133
133
  prompt = build_insights_prompt(context)
134
134
  response = call_openai_api(prompt)
135
-
135
+
136
136
  add_to_history("Generate insights", response)
137
137
  parse_insights_response(response)
138
138
  end
@@ -140,12 +140,12 @@ Please provide:
140
140
  # Compare multiple documents
141
141
  def compare_documents(document_results, comparison_type = :content)
142
142
  ensure_ai_available!
143
-
143
+
144
144
  contexts = document_results.map { |doc| build_document_context(doc) }
145
-
145
+
146
146
  prompt = build_comparison_prompt(contexts, comparison_type)
147
147
  response = call_openai_api(prompt)
148
-
148
+
149
149
  add_to_history("Compare documents (#{comparison_type})", response)
150
150
  response
151
151
  end
@@ -153,12 +153,12 @@ Please provide:
153
153
  # Classify document type and purpose
154
154
  def classify_document(document_result)
155
155
  ensure_ai_available!
156
-
156
+
157
157
  context = build_document_context(document_result)
158
-
158
+
159
159
  prompt = build_classification_prompt(context)
160
160
  response = call_openai_api(prompt)
161
-
161
+
162
162
  add_to_history("Classify document", response)
163
163
  parse_classification_response(response)
164
164
  end
@@ -166,12 +166,12 @@ Please provide:
166
166
  # Generate action items from document
167
167
  def extract_action_items(document_result)
168
168
  ensure_ai_available!
169
-
169
+
170
170
  context = build_document_context(document_result)
171
-
171
+
172
172
  prompt = build_action_items_prompt(context)
173
173
  response = call_openai_api(prompt)
174
-
174
+
175
175
  add_to_history("Extract action items", response)
176
176
  parse_action_items_response(response)
177
177
  end
@@ -179,14 +179,14 @@ Please provide:
179
179
  # Chat about the document
180
180
  def chat(message, document_result = nil)
181
181
  ensure_ai_available!
182
-
182
+
183
183
  if document_result
184
184
  context = build_document_context(document_result)
185
185
  prompt = build_chat_prompt(context, message)
186
186
  else
187
187
  prompt = build_general_chat_prompt(message)
188
188
  end
189
-
189
+
190
190
  response = call_openai_api(prompt)
191
191
  add_to_history(message, response)
192
192
  response
@@ -200,15 +200,15 @@ Please provide:
200
200
  # Get conversation summary
201
201
  def conversation_summary
202
202
  return "No conversation history" if @conversation_history.empty?
203
-
203
+
204
204
  unless @ai_enabled
205
205
  return "AI features are disabled. Cannot generate conversation summary."
206
206
  end
207
-
207
+
208
208
  history_text = @conversation_history.map do |entry|
209
209
  "Q: #{entry[:question]}\nA: #{entry[:answer]}"
210
210
  end.join("\n\n")
211
-
211
+
212
212
  prompt = "Summarize this conversation:\n\n#{history_text}"
213
213
  call_openai_api(prompt)
214
214
  end
@@ -247,13 +247,13 @@ Please provide:
247
247
  tables_count: document_result[:tables]&.length || 0,
248
248
  filename_info: document_result[:filename_info] || {}
249
249
  }
250
-
250
+
251
251
  # Add Japanese-specific information if available
252
252
  if context[:filename_info][:contains_japanese]
253
253
  context[:japanese_filename] = true
254
254
  context[:japanese_parts] = context[:filename_info][:japanese_parts]
255
255
  end
256
-
256
+
257
257
  context
258
258
  end
259
259
 
@@ -324,8 +324,7 @@ Please provide:
324
324
 
325
325
  def build_comparison_prompt(contexts, comparison_type)
326
326
  comparison_content = contexts.map.with_index do |context, index|
327
- "Document #{index + 1}: #{context[:filename]}
328
- Content: #{truncate_content(context[:text_content], 1500)}"
327
+ "Document #{index + 1}: #{context[:filename]}\nContent: #{truncate_content(context[:text_content], 1500)}"
329
328
  end.join("\n\n---\n\n")
330
329
 
331
330
  "You are an AI analyst. Compare these documents focusing on #{comparison_type}:
@@ -404,15 +403,15 @@ Please respond helpfully."
404
403
 
405
404
  def call_openai_api(prompt)
406
405
  uri = URI("#{@base_url}/chat/completions")
407
-
406
+
408
407
  http = Net::HTTP.new(uri.host, uri.port)
409
408
  http.use_ssl = true
410
409
  http.read_timeout = 60
411
-
410
+
412
411
  request = Net::HTTP::Post.new(uri)
413
412
  request['Content-Type'] = 'application/json'
414
413
  request['Authorization'] = "Bearer #{@api_key}"
415
-
414
+
416
415
  request.body = {
417
416
  model: @model,
418
417
  messages: [
@@ -421,16 +420,16 @@ Please respond helpfully."
421
420
  content: "You are an intelligent document processing assistant with expertise in analyzing, summarizing, and extracting information from various document types. You support multiple languages including Japanese."
422
421
  },
423
422
  {
424
- role: "user",
423
+ role: "user",
425
424
  content: prompt
426
425
  }
427
426
  ],
428
427
  temperature: @temperature,
429
428
  max_tokens: 2000
430
429
  }.to_json
431
-
430
+
432
431
  response = http.request(request)
433
-
432
+
434
433
  if response.code.to_i == 200
435
434
  result = JSON.parse(response.body)
436
435
  result.dig('choices', 0, 'message', 'content') || "No response generated"
@@ -446,14 +445,14 @@ Please respond helpfully."
446
445
  answer: answer,
447
446
  timestamp: Time.now
448
447
  }
449
-
448
+
450
449
  # Keep only the most recent conversations
451
450
  @conversation_history = @conversation_history.last(@max_history) if @conversation_history.length > @max_history
452
451
  end
453
452
 
454
453
  def truncate_content(content, max_length)
455
454
  return "" unless content.is_a?(String)
456
-
455
+
457
456
  if content.length > max_length
458
457
  "#{content[0...max_length]}...\n\n[Content truncated for analysis]"
459
458
  else
@@ -463,16 +462,16 @@ Please respond helpfully."
463
462
 
464
463
  def format_file_size(bytes)
465
464
  return "0 B" if bytes == 0
466
-
465
+
467
466
  units = ['B', 'KB', 'MB', 'GB']
468
467
  size = bytes.to_f
469
468
  unit_index = 0
470
-
469
+
471
470
  while size >= 1024 && unit_index < units.length - 1
472
471
  size /= 1024
473
472
  unit_index += 1
474
473
  end
475
-
474
+
476
475
  "#{size.round(2)} #{units[unit_index]}"
477
476
  end
478
477
 
@@ -490,7 +489,7 @@ Please respond helpfully."
490
489
  rescue JSON::ParserError
491
490
  # Fall back to plain text response
492
491
  end
493
-
492
+
494
493
  response
495
494
  end
496
495
 
@@ -2,29 +2,62 @@ module UniversalDocumentProcessor
2
2
  class Document
3
3
  attr_reader :file_path, :content_type, :file_size, :options, :filename_validation
4
4
 
5
+ class LargeFileError < StandardError; end
6
+ class FileValidationError < StandardError; end
7
+ MAX_FILE_SIZE = 50 * 1024 * 1024 # 50 MB
8
+
5
9
  def initialize(file_path_or_io, options = {})
6
10
  @file_path = file_path_or_io.is_a?(String) ? normalize_file_path(file_path_or_io) : save_temp_file(file_path_or_io)
7
11
  @options = options
12
+ # 1. Check file existence and readability
13
+ unless File.exist?(@file_path) && File.readable?(@file_path)
14
+ raise FileValidationError, "File is missing or unreadable: #{@file_path}"
15
+ end
8
16
  @content_type = detect_content_type
9
17
  @file_size = File.size(@file_path)
18
+ # 2. Large file safeguard
19
+ if @file_size > MAX_FILE_SIZE
20
+ raise LargeFileError, "File size #{@file_size} exceeds maximum allowed (#{MAX_FILE_SIZE} bytes)"
21
+ end
10
22
  @filename_validation = validate_filename_encoding
23
+ # 3. Encoding validation and cleaning for text files
24
+ if @content_type =~ /text|plain/
25
+ validation = UniversalDocumentProcessor.validate_file(@file_path)
26
+ unless validation[:valid]
27
+ @cleaned_text_content = UniversalDocumentProcessor.clean_text(validation[:content], {
28
+ remove_null_bytes: true,
29
+ remove_control_chars: true,
30
+ normalize_whitespace: true
31
+ })
32
+ else
33
+ @cleaned_text_content = nil
34
+ end
35
+ end
11
36
  end
12
37
 
13
38
  def process
14
- {
15
- file_path: @file_path,
16
- content_type: @content_type,
17
- file_size: @file_size,
18
- text_content: extract_text,
19
- metadata: metadata,
20
- images: extract_images,
21
- tables: extract_tables,
22
- filename_info: filename_info,
23
- processed_at: Time.current
24
- }
39
+ begin
40
+ {
41
+ file_path: @file_path,
42
+ content_type: @content_type,
43
+ file_size: @file_size,
44
+ text_content: extract_text,
45
+ metadata: metadata,
46
+ images: extract_images,
47
+ tables: extract_tables,
48
+ filename_info: filename_info,
49
+ processed_at: Time.current
50
+ }
51
+ rescue LargeFileError, FileValidationError => e
52
+ { error: e.class.name, message: e.message, file_path: @file_path }
53
+ rescue => e
54
+ { error: 'ProcessingError', message: e.message, file_path: @file_path }
55
+ end
25
56
  end
26
57
 
27
58
  def extract_text
59
+ # Use cleaned text if available (from encoding validation)
60
+ return @cleaned_text_content if defined?(@cleaned_text_content) && @cleaned_text_content
28
61
  processor.extract_text
29
62
  rescue => e
30
63
  fallback_text_extraction
@@ -253,13 +286,97 @@ module UniversalDocumentProcessor
253
286
  end
254
287
 
255
288
  def convert_to_pdf
256
- # Implementation for PDF conversion
257
- raise NotImplementedError, "PDF conversion not yet implemented"
289
+ ensure_prawn_available!
290
+
291
+ output_path = @file_path.gsub(File.extname(@file_path), '.pdf')
292
+
293
+ Prawn::Document.generate(output_path) do |pdf|
294
+ # Add title
295
+ pdf.font_size 18
296
+ pdf.text "Document: #{File.basename(@file_path)}", style: :bold
297
+ pdf.move_down 20
298
+
299
+ # Add metadata section
300
+ pdf.font_size 12
301
+ pdf.text "Document Information", style: :bold
302
+ pdf.move_down 10
303
+
304
+ metadata_info = metadata
305
+ pdf.text "File Size: #{format_file_size(@file_size)}"
306
+ pdf.text "Content Type: #{@content_type}"
307
+ pdf.text "Created: #{metadata_info[:created_at]}" if metadata_info[:created_at]
308
+ pdf.text "Modified: #{metadata_info[:modified_at]}" if metadata_info[:modified_at]
309
+ pdf.move_down 20
310
+
311
+ # Add content section
312
+ pdf.text "Content", style: :bold
313
+ pdf.move_down 10
314
+
315
+ text_content = extract_text
316
+ if text_content && !text_content.strip.empty?
317
+ pdf.font_size 10
318
+ pdf.text text_content
319
+ else
320
+ pdf.text "No text content available for this document."
321
+ end
322
+
323
+ # Add tables if available
324
+ tables = extract_tables
325
+ unless tables.empty?
326
+ pdf.start_new_page
327
+ pdf.font_size 12
328
+ pdf.text "Tables", style: :bold
329
+ pdf.move_down 10
330
+
331
+ tables.each_with_index do |table, index|
332
+ pdf.text "Table #{index + 1}", style: :bold
333
+ pdf.move_down 5
334
+
335
+ if table[:content] && !table[:content].empty?
336
+ # Format table data for Prawn
337
+ table_data = table[:content].first(20) # Limit to first 20 rows
338
+ pdf.table(table_data, header: true) do
339
+ row(0).font_style = :bold
340
+ cells.size = 8
341
+ cells.padding = 3
342
+ end
343
+ end
344
+ pdf.move_down 15
345
+ end
346
+ end
347
+ end
348
+
349
+ output_path
350
+ rescue => e
351
+ raise ProcessingError, "Failed to create PDF: #{e.message}"
258
352
  end
259
353
 
260
354
  def convert_to_html
261
355
  # Implementation for HTML conversion
262
356
  raise NotImplementedError, "HTML conversion not yet implemented"
263
357
  end
358
+
359
+ private
360
+
361
+ def ensure_prawn_available!
362
+ unless defined?(Prawn)
363
+ raise DependencyMissingError, "PDF creation requires the 'prawn' gem. Install it with: gem install prawn -v '~> 2.4'"
364
+ end
365
+ end
366
+
367
+ def format_file_size(bytes)
368
+ return "0 B" if bytes == 0
369
+
370
+ units = ['B', 'KB', 'MB', 'GB']
371
+ size = bytes.to_f
372
+ unit_index = 0
373
+
374
+ while size >= 1024 && unit_index < units.length - 1
375
+ size /= 1024
376
+ unit_index += 1
377
+ end
378
+
379
+ "#{size.round(2)} #{units[unit_index]}"
380
+ end
264
381
  end
265
382
  end
@@ -3,6 +3,8 @@ module UniversalDocumentProcessor
3
3
  class BaseProcessor
4
4
  attr_reader :file_path, :options
5
5
 
6
+ MAX_FILE_SIZE = 50 * 1024 * 1024 # 50 MB
7
+
6
8
  def initialize(file_path, options = {})
7
9
  @file_path = file_path
8
10
  @options = options
@@ -11,6 +13,17 @@ module UniversalDocumentProcessor
11
13
  def extract_text
12
14
  # Fallback to universal text extraction
13
15
  if defined?(Yomu)
16
+ # Encoding validation for text files
17
+ if File.extname(@file_path) =~ /\.(txt|csv|tsv|md|json|xml|html|htm)$/i
18
+ validation = UniversalDocumentProcessor.validate_file(@file_path)
19
+ unless validation[:valid]
20
+ return UniversalDocumentProcessor.clean_text(validation[:content], {
21
+ remove_null_bytes: true,
22
+ remove_control_chars: true,
23
+ normalize_whitespace: true
24
+ })
25
+ end
26
+ end
14
27
  Yomu.new(@file_path).text
15
28
  else
16
29
  raise ProcessingError, "Universal text extraction requires the 'yomu' gem. Install it with: gem install yomu -v '~> 0.2'"
@@ -49,6 +62,10 @@ module UniversalDocumentProcessor
49
62
  def validate_file
50
63
  raise ProcessingError, "File not found: #{@file_path}" unless File.exist?(@file_path)
51
64
  raise ProcessingError, "File is empty: #{@file_path}" if File.zero?(@file_path)
65
+ # Large file safeguard
66
+ if File.size(@file_path) > MAX_FILE_SIZE
67
+ raise ProcessingError, "File size #{File.size(@file_path)} exceeds maximum allowed (#{MAX_FILE_SIZE} bytes)"
68
+ end
52
69
  end
53
70
 
54
71
  def with_error_handling
@@ -6,11 +6,32 @@ require 'csv'
6
6
  module UniversalDocumentProcessor
7
7
  module Processors
8
8
  class ExcelProcessor < BaseProcessor
9
+ MAX_FILE_SIZE = 50 * 1024 * 1024 # 50 MB
10
+
9
11
  def extract_text
12
+ validate_file
10
13
  with_error_handling do
11
14
  if @file_path.end_with?('.csv')
15
+ # Encoding validation for CSV
16
+ validation = UniversalDocumentProcessor.validate_file(@file_path)
17
+ unless validation[:valid]
18
+ return UniversalDocumentProcessor.clean_text(validation[:content], {
19
+ remove_null_bytes: true,
20
+ remove_control_chars: true,
21
+ normalize_whitespace: true
22
+ })
23
+ end
12
24
  extract_csv_text
13
25
  elsif @file_path.end_with?('.tsv')
26
+ # Encoding validation for TSV
27
+ validation = UniversalDocumentProcessor.validate_file(@file_path)
28
+ unless validation[:valid]
29
+ return UniversalDocumentProcessor.clean_text(validation[:content], {
30
+ remove_null_bytes: true,
31
+ remove_control_chars: true,
32
+ normalize_whitespace: true
33
+ })
34
+ end
14
35
  extract_tsv_text
15
36
  elsif @file_path.end_with?('.xlsx')
16
37
  extract_xlsx_text_builtin
@@ -208,6 +229,15 @@ module UniversalDocumentProcessor
208
229
 
209
230
  private
210
231
 
232
+ def validate_file
233
+ raise ProcessingError, "File not found: #{@file_path}" unless File.exist?(@file_path)
234
+ raise ProcessingError, "File is empty: #{@file_path}" if File.zero?(@file_path)
235
+ # Large file safeguard
236
+ if File.size(@file_path) > MAX_FILE_SIZE
237
+ raise ProcessingError, "File size #{File.size(@file_path)} exceeds maximum allowed (#{MAX_FILE_SIZE} bytes)"
238
+ end
239
+ end
240
+
211
241
  # CSV Processing Methods
212
242
  def extract_csv_text
213
243
  content = File.read(@file_path, encoding: 'UTF-8')
@@ -1,12 +1,23 @@
1
1
  module UniversalDocumentProcessor
2
2
  module Processors
3
3
  class PdfProcessor < BaseProcessor
4
+ MAX_FILE_SIZE = 50 * 1024 * 1024 # 50 MB
5
+
4
6
  def extract_text
5
7
  ensure_pdf_reader_available!
6
-
8
+ validate_file
7
9
  with_error_handling do
8
10
  reader = PDF::Reader.new(@file_path)
9
11
  text = reader.pages.map(&:text).join("\n")
12
+ # Encoding validation of the source file at @file_path (not of the extracted text above)
13
+ validation = UniversalDocumentProcessor.validate_file(@file_path)
14
+ unless validation[:valid]
15
+ return UniversalDocumentProcessor.clean_text(validation[:content], {
16
+ remove_null_bytes: true,
17
+ remove_control_chars: true,
18
+ normalize_whitespace: true
19
+ })
20
+ end
10
21
  text.strip.empty? ? "No text content found in PDF" : text
11
22
  end
12
23
  rescue => e
@@ -104,6 +115,15 @@ module UniversalDocumentProcessor
104
115
  end
105
116
  end
106
117
 
118
+ def validate_file
119
+ raise ProcessingError, "File not found: #{@file_path}" unless File.exist?(@file_path)
120
+ raise ProcessingError, "File is empty: #{@file_path}" if File.zero?(@file_path)
121
+ # Large file safeguard
122
+ if File.size(@file_path) > MAX_FILE_SIZE
123
+ raise ProcessingError, "File size #{File.size(@file_path)} exceeds maximum allowed (#{MAX_FILE_SIZE} bytes)"
124
+ end
125
+ end
126
+
107
127
  def extract_form_fields(reader)
108
128
  # Extract PDF form fields if present
109
129
  []
@@ -1,7 +1,10 @@
1
1
  module UniversalDocumentProcessor
2
2
  module Processors
3
3
  class TextProcessor < BaseProcessor
4
+ MAX_FILE_SIZE = 50 * 1024 * 1024 # 50 MB
5
+
4
6
  def extract_text
7
+ validate_file
5
8
  with_error_handling do
6
9
  case detect_text_format
7
10
  when :rtf
@@ -15,6 +18,15 @@ module UniversalDocumentProcessor
15
18
  when :json
16
19
  extract_json_text
17
20
  else
21
+ # Encoding validation for plain text
22
+ validation = UniversalDocumentProcessor.validate_file(@file_path)
23
+ unless validation[:valid]
24
+ return UniversalDocumentProcessor.clean_text(validation[:content], {
25
+ remove_null_bytes: true,
26
+ remove_control_chars: true,
27
+ normalize_whitespace: true
28
+ })
29
+ end
18
30
  extract_plain_text
19
31
  end
20
32
  end
@@ -81,6 +93,15 @@ module UniversalDocumentProcessor
81
93
 
82
94
  private
83
95
 
96
+ def validate_file
97
+ raise ProcessingError, "File not found: #{@file_path}" unless File.exist?(@file_path)
98
+ raise ProcessingError, "File is empty: #{@file_path}" if File.zero?(@file_path)
99
+ # Large file safeguard
100
+ if File.size(@file_path) > MAX_FILE_SIZE
101
+ raise ProcessingError, "File size #{File.size(@file_path)} exceeds maximum allowed (#{MAX_FILE_SIZE} bytes)"
102
+ end
103
+ end
104
+
84
105
  def detect_text_format
85
106
  extension = File.extname(@file_path).downcase
86
107
  case extension
@@ -1,11 +1,32 @@
1
1
  module UniversalDocumentProcessor
2
2
  module Processors
3
3
  class WordProcessor < BaseProcessor
4
+ MAX_FILE_SIZE = 50 * 1024 * 1024 # 50 MB
5
+
4
6
  def extract_text
7
+ validate_file
5
8
  with_error_handling do
6
9
  if @file_path.end_with?('.docx')
10
+ # Encoding validation for docx (if possible)
11
+ validation = UniversalDocumentProcessor.validate_file(@file_path)
12
+ unless validation[:valid]
13
+ return UniversalDocumentProcessor.clean_text(validation[:content], {
14
+ remove_null_bytes: true,
15
+ remove_control_chars: true,
16
+ normalize_whitespace: true
17
+ })
18
+ end
7
19
  extract_docx_text
8
20
  elsif @file_path.end_with?('.doc')
21
+ # Encoding validation for doc (if possible)
22
+ validation = UniversalDocumentProcessor.validate_file(@file_path)
23
+ unless validation[:valid]
24
+ return UniversalDocumentProcessor.clean_text(validation[:content], {
25
+ remove_null_bytes: true,
26
+ remove_control_chars: true,
27
+ normalize_whitespace: true
28
+ })
29
+ end
9
30
  # Built-in .doc file processing
10
31
  fallback_text_extraction
11
32
  else
@@ -90,6 +111,15 @@ module UniversalDocumentProcessor
90
111
 
91
112
  private
92
113
 
114
+ def validate_file
115
+ raise ProcessingError, "File not found: #{@file_path}" unless File.exist?(@file_path)
116
+ raise ProcessingError, "File is empty: #{@file_path}" if File.zero?(@file_path)
117
+ # Large file safeguard
118
+ if File.size(@file_path) > MAX_FILE_SIZE
119
+ raise ProcessingError, "File size #{File.size(@file_path)} exceeds maximum allowed (#{MAX_FILE_SIZE} bytes)"
120
+ end
121
+ end
122
+
93
123
  def ensure_docx_available!
94
124
  unless defined?(Docx)
95
125
  raise DependencyMissingError, "DOCX processing requires the 'docx' gem. Install it with: gem install docx -v '~> 0.8'"
@@ -1,3 +1,3 @@
1
1
  module UniversalDocumentProcessor
2
- VERSION = "1.1.0"
2
+ VERSION = "1.1.1"
3
3
  end
@@ -206,6 +206,16 @@ module UniversalDocumentProcessor
206
206
  Document.new(file_path_or_io, options).convert_to(target_format)
207
207
  end
208
208
 
209
+ # Create PDF from any supported document
210
+ def self.create_pdf(file_path, options = {})
211
+ Document.new(file_path, options).convert_to(:pdf)
212
+ end
213
+
214
+ # Check if PDF creation is available
215
+ def self.pdf_creation_available?
216
+ defined?(Prawn)
217
+ end
218
+
209
219
  # Batch process multiple documents
210
220
  def self.batch_process(file_paths, options = {})
211
221
  file_paths.map do |file_path|
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: universal_document_processor
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.0
4
+ version: 1.1.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Vikas Patil