universal_document_processor 1.0.1 → 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: c33ef9db6830ddb62e98966d0bf3dd85106833e260303651df297bb3d46b9529
4
- data.tar.gz: 280fa75bf3d842fc1af11dd95ca49e0526ed22512b5732aed0a7c25d4a57a7d9
3
+ metadata.gz: 8ec66decfe8626354f9fe05b757dbdc11921b21fa6b5dccfdb4d8ce5deba2c3f
4
+ data.tar.gz: 19c2802d337d0517ab91cfe71bdb2b051213e17f0e1a76605c5bce895429eed4
5
5
  SHA512:
6
- metadata.gz: 01d6c5129dc7ec1a7911a77ba69b8dbafd32bd9397e0374175a4cf09cf24f8c06225a0bc927b1e3bba8250e3d1e93d40bbbc1dbf3aa1a6ff0fbc92cb7f5f24a4
7
- data.tar.gz: b445b8d773e2865e6e2c5593c123cc7d0da580d4d16822e4b9e45c851568f077f4c949d921f2f602c4ed3ff941ccf8a1d002eb14401c554b0f8355dc293bff62
6
+ metadata.gz: f2cbb1944e533a4a75d6248dd6df279219e4a4c7b77dac3b0e4d474b5b4375203d188bff5d388af30716b3dc5487fcd293955c8504565c4a1b56d552a8484993
7
+ data.tar.gz: 2d95c2f173de302d14cdfda6d3357b5d7d9a5cf82cabc2e5622bdb8f6d7e60c56bab75eccf85a312045be5f8e66b743b4344420847192a3b0268cd4a70c5f414
data/CHANGELOG.md CHANGED
@@ -7,6 +7,26 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
7
7
 
8
8
  ## [Unreleased]
9
9
 
10
+ ## [1.2.0] - 2024-01-15
11
+ ### Added
12
+ - **TSV (Tab-Separated Values) File Support**: Complete built-in TSV processing capabilities
13
+ - Native TSV parsing using Ruby CSV library with tab delimiter
14
+ - Text extraction with proper formatting
15
+ - Comprehensive metadata detection (format, delimiter, encoding)
16
+ - Table structure analysis and header detection
17
+ - Statistical analysis and data validation
18
+ - Format conversions: TSV ↔ CSV, TSV → JSON
19
+ - Cross-format compatibility with existing CSV and Excel features
20
+ - New `to_tsv()` method for converting other formats to TSV
21
+ - Enhanced file detector with TSV MIME type mapping
22
+ - Full integration with existing Document class API
23
+
24
+ ### Enhanced
25
+ - **ExcelProcessor**: Extended to handle TSV files alongside CSV and Excel formats
26
+ - **File Detection**: Added TSV MIME type support (`text/tab-separated-values`)
27
+ - **Document Class**: Added `to_tsv()` method and TSV format support
28
+ - **Supported Formats**: Updated to include TSV in format list
29
+
10
30
  ## [1.0.1] - 2025-06-23
11
31
 
12
32
  ### Fixed
data/README.md CHANGED
@@ -16,7 +16,7 @@ A comprehensive Ruby gem that provides unified document processing capabilities
16
16
 
17
17
  ### **Supported File Formats**
18
18
  - **📄 Documents**: PDF, DOC, DOCX, RTF
19
- - **📊 Spreadsheets**: XLS, XLSX, CSV
19
+ - **📊 Spreadsheets**: XLS, XLSX, CSV, TSV
20
20
  - **📺 Presentations**: PPT, PPTX
21
21
  - **🖼️ Images**: JPG, PNG, GIF, BMP, TIFF
22
22
  - **📁 Archives**: ZIP, RAR, 7Z
@@ -236,6 +236,58 @@ tables.each_with_index do |table, index|
236
236
  end
237
237
  ```
238
238
 
239
+ ### Processing TSV (Tab-Separated Values) Files
240
+
241
+ ```ruby
242
+ # Process TSV files with built-in support
243
+ result = UniversalDocumentProcessor.process('data.tsv')
244
+
245
+ # TSV-specific metadata
246
+ metadata = result[:metadata]
247
+ puts "Format: #{metadata[:format]}" # => "tsv"
248
+ puts "Delimiter: #{metadata[:delimiter]}" # => "tab"
249
+ puts "Rows: #{metadata[:total_rows]}"
250
+ puts "Columns: #{metadata[:total_columns]}"
251
+ puts "Has headers: #{metadata[:has_headers]}"
252
+
253
+ # Extract structured data
254
+ tables = result[:tables]
255
+ table = tables.first
256
+ puts "Headers: #{table[:headers].join(', ')}"
257
+ puts "Sample row: #{table[:data][1].join(' | ')}"
258
+
259
+ # Format conversions
260
+ document = UniversalDocumentProcessor::Document.new('data.tsv')
261
+
262
+ # Convert TSV to CSV
263
+ csv_output = document.to_csv
264
+ puts "CSV conversion: #{csv_output.length} characters"
265
+
266
+ # Convert TSV to JSON
267
+ json_output = document.to_json
268
+ puts "JSON conversion: #{json_output.length} characters"
269
+
270
+ # Convert CSV to TSV
271
+ csv_document = UniversalDocumentProcessor::Document.new('data.csv')
272
+ tsv_output = csv_document.to_tsv
273
+ puts "TSV conversion: #{tsv_output.length} characters"
274
+
275
+ # Statistical analysis
276
+ stats = document.extract_statistics
277
+ sheet_stats = stats['Sheet1']
278
+ puts "Total cells: #{sheet_stats[:total_cells]}"
279
+ puts "Numeric cells: #{sheet_stats[:numeric_cells]}"
280
+ puts "Text cells: #{sheet_stats[:text_cells]}"
281
+ puts "Average value: #{sheet_stats[:average_value]}"
282
+
283
+ # Data validation
284
+ validation = document.validate_data
285
+ sheet_validation = validation['Sheet1']
286
+ puts "Data quality score: #{sheet_validation[:data_quality_score]}%"
287
+ puts "Empty rows: #{sheet_validation[:empty_rows]}"
288
+ puts "Duplicate rows: #{sheet_validation[:duplicate_rows]}"
289
+ ```
290
+
239
291
  ### Processing Word Documents
240
292
 
241
293
  ```ruby
data/Rakefile CHANGED
@@ -1,12 +1,105 @@
1
- require "bundler/gem_tasks"
2
- require "rspec/core/rake_task"
3
- require "rubocop/rake_task"
1
+ require 'rake/testtask'
2
+ require 'bundler/gem_tasks'
4
3
 
5
- RSpec::Core::RakeTask.new(:spec)
6
- RuboCop::RakeTask.new
4
+ # Default task
5
+ task default: :test
7
6
 
8
- desc "Run tests and linting"
9
- task default: %i[spec rubocop]
7
+ # Test task
8
+ Rake::TestTask.new(:test) do |t|
9
+ t.libs << 'test'
10
+ t.libs << 'lib'
11
+ t.test_files = FileList['test/test_*.rb']
12
+ t.verbose = true
13
+ end
14
+
15
+ # Individual test tasks
16
+ Rake::TestTask.new(:test_core) do |t|
17
+ t.libs << 'test'
18
+ t.libs << 'lib'
19
+ t.test_files = FileList['test/test_universal_document_processor.rb']
20
+ t.verbose = true
21
+ end
22
+
23
+ Rake::TestTask.new(:test_ai) do |t|
24
+ t.libs << 'test'
25
+ t.libs << 'lib'
26
+ t.test_files = FileList['test/test_ai_agent.rb']
27
+ t.verbose = true
28
+ end
29
+
30
+ Rake::TestTask.new(:test_processors) do |t|
31
+ t.libs << 'test'
32
+ t.libs << 'lib'
33
+ t.test_files = FileList['test/test_processors.rb']
34
+ t.verbose = true
35
+ end
36
+
37
+ # Coverage task (if simplecov is available)
38
+ desc "Run tests with coverage"
39
+ task :coverage do
40
+ ENV['COVERAGE'] = 'true'
41
+ Rake::Task[:test].invoke
42
+ end
43
+
44
+ # Lint task (if rubocop is available)
45
+ desc "Run RuboCop"
46
+ task :lint do
47
+ begin
48
+ require 'rubocop/rake_task'
49
+ RuboCop::RakeTask.new
50
+ rescue LoadError
51
+ puts "RuboCop not available. Install it with: gem install rubocop"
52
+ end
53
+ end
54
+
55
+ # Documentation task
56
+ desc "Generate documentation"
57
+ task :doc do
58
+ system "yard doc"
59
+ end
60
+
61
+ # Clean task
62
+ desc "Clean up generated files"
63
+ task :clean do
64
+ FileUtils.rm_rf('coverage')
65
+ FileUtils.rm_rf('doc')
66
+ FileUtils.rm_rf('pkg')
67
+ FileUtils.rm_f('Gemfile.lock')
68
+ end
69
+
70
+ # Install dependencies
71
+ desc "Install dependencies"
72
+ task :install do
73
+ system "bundle install"
74
+ end
75
+
76
+ # Quality check task
77
+ desc "Run all quality checks"
78
+ task quality: [:test, :lint]
79
+
80
+ # CI task
81
+ desc "Run CI tasks"
82
+ task ci: [:install, :test]
83
+
84
+ # Development setup
85
+ desc "Setup development environment"
86
+ task :setup do
87
+ puts "Setting up development environment..."
88
+ Rake::Task[:install].invoke
89
+ puts "Development environment ready!"
90
+ puts ""
91
+ puts "Available tasks:"
92
+ puts " rake test - Run all tests"
93
+ puts " rake test_core - Run core functionality tests"
94
+ puts " rake test_ai - Run AI agent tests"
95
+ puts " rake test_processors - Run processor tests"
96
+ puts " rake coverage - Run tests with coverage"
97
+ puts " rake lint - Run RuboCop linting"
98
+ puts " rake doc - Generate documentation"
99
+ puts " rake clean - Clean up generated files"
100
+ puts ""
101
+ puts "To run tests with AI features, set OPENAI_API_KEY environment variable"
102
+ end
10
103
 
11
104
  desc "Build the gem"
12
105
  task :build do
@@ -23,14 +116,4 @@ desc "Install the gem locally"
23
116
  task :install do
24
117
  system "gem build universal_document_processor.gemspec"
25
118
  system "gem install universal_document_processor-*.gem"
26
- end
27
-
28
- desc "Clean build artifacts"
29
- task :clean do
30
- system "rm -f *.gem"
31
- end
32
-
33
- desc "Generate documentation"
34
- task :doc do
35
- system "yard doc"
36
119
  end
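data/lib/universal_document_processor/ai_agent.rb CHANGED (file header missing from this extract; path inferred from the AIAgent class below)
@@ -4,7 +4,7 @@ require 'uri'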
4
4
 
5
5
  module UniversalDocumentProcessor
6
6
  class AIAgent
7
- attr_reader :api_key, :model, :base_url, :conversation_history
7
+ attr_reader :api_key, :model, :base_url, :conversation_history, :ai_enabled
8
8
 
9
9
  def initialize(options = {})
10
10
  @api_key = options[:api_key] || ENV['OPENAI_API_KEY']
@@ -13,12 +13,15 @@ module UniversalDocumentProcessor
13
13
  @conversation_history = []
14
14
  @max_history = options[:max_history] || 10
15
15
  @temperature = options[:temperature] || 0.7
16
+ @ai_enabled = false
16
17
 
17
18
  validate_configuration
18
19
  end
19
20
 
20
21
  # Main document analysis with AI
21
22
  def analyze_document(document_result, query = nil)
23
+ ensure_ai_available!
24
+
22
25
  context = build_document_context(document_result)
23
26
 
24
27
  if query
@@ -63,6 +66,8 @@ Please provide:
63
66
 
64
67
  # Ask specific questions about a document
65
68
  def ask_document_question(document_result, question)
69
+ ensure_ai_available!
70
+
66
71
  context = build_document_context(document_result)
67
72
 
68
73
  prompt = build_question_prompt(context, question)
@@ -74,6 +79,8 @@ Please provide:
74
79
 
75
80
  # Summarize document content
76
81
  def summarize_document(document_result, length: :medium)
82
+ ensure_ai_available!
83
+
77
84
  context = build_document_context(document_result)
78
85
 
79
86
  length_instruction = case length
@@ -92,6 +99,8 @@ Please provide:
92
99
 
93
100
  # Extract key information from document
94
101
  def extract_key_information(document_result, categories = nil)
102
+ ensure_ai_available!
103
+
95
104
  context = build_document_context(document_result)
96
105
  categories ||= ['key_facts', 'important_dates', 'names', 'locations', 'numbers']
97
106
 
@@ -104,6 +113,8 @@ Please provide:
104
113
 
105
114
  # Translate document content
106
115
  def translate_document(document_result, target_language)
116
+ ensure_ai_available!
117
+
107
118
  context = build_document_context(document_result)
108
119
 
109
120
  prompt = build_translation_prompt(context, target_language)
@@ -115,6 +126,8 @@ Please provide:
115
126
 
116
127
  # Generate document insights and recommendations
117
128
  def generate_insights(document_result)
129
+ ensure_ai_available!
130
+
118
131
  context = build_document_context(document_result)
119
132
 
120
133
  prompt = build_insights_prompt(context)
@@ -126,6 +139,8 @@ Please provide:
126
139
 
127
140
  # Compare multiple documents
128
141
  def compare_documents(document_results, comparison_type = :content)
142
+ ensure_ai_available!
143
+
129
144
  contexts = document_results.map { |doc| build_document_context(doc) }
130
145
 
131
146
  prompt = build_comparison_prompt(contexts, comparison_type)
@@ -137,6 +152,8 @@ Please provide:
137
152
 
138
153
  # Classify document type and purpose
139
154
  def classify_document(document_result)
155
+ ensure_ai_available!
156
+
140
157
  context = build_document_context(document_result)
141
158
 
142
159
  prompt = build_classification_prompt(context)
@@ -148,6 +165,8 @@ Please provide:
148
165
 
149
166
  # Generate action items from document
150
167
  def extract_action_items(document_result)
168
+ ensure_ai_available!
169
+
151
170
  context = build_document_context(document_result)
152
171
 
153
172
  prompt = build_action_items_prompt(context)
@@ -159,6 +178,8 @@ Please provide:
159
178
 
160
179
  # Chat about the document
161
180
  def chat(message, document_result = nil)
181
+ ensure_ai_available!
182
+
162
183
  if document_result
163
184
  context = build_document_context(document_result)
164
185
  prompt = build_chat_prompt(context, message)
@@ -180,6 +201,10 @@ Please provide:
180
201
  def conversation_summary
181
202
  return "No conversation history" if @conversation_history.empty?
182
203
 
204
+ unless @ai_enabled
205
+ return "AI features are disabled. Cannot generate conversation summary."
206
+ end
207
+
183
208
  history_text = @conversation_history.map do |entry|
184
209
  "Q: #{entry[:question]}\nA: #{entry[:answer]}"
185
210
  end.join("\n\n")
@@ -188,11 +213,27 @@ Please provide:
188
213
  call_openai_api(prompt)
189
214
  end
190
215
 
216
+ # Check if AI features are available
217
+ def ai_available?
218
+ @ai_enabled
219
+ end
220
+
191
221
  private
192
222
 
193
223
  def validate_configuration
194
- raise ArgumentError, "OpenAI API key is required" unless @api_key
195
- raise ArgumentError, "OpenAI API key cannot be empty" if @api_key.empty?
224
+ if @api_key && !@api_key.empty?
225
+ @ai_enabled = true
226
+ else
227
+ @ai_enabled = false
228
+ warn "Warning: OpenAI API key not provided. AI features will be disabled. Set OPENAI_API_KEY environment variable or pass api_key option to enable AI features."
229
+ end
230
+ end
231
+
232
+ # Ensure AI is available before making API calls
233
+ def ensure_ai_available!
234
+ unless @ai_enabled
235
+ raise DependencyMissingError, "AI features are not available. Please provide an OpenAI API key to use AI functionality."
236
+ end
196
237
  end
197
238
 
198
239
  def build_document_context(document_result)
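data/lib/universal_document_processor/document.rb CHANGED (file header missing from this extract; path inferred from the Document methods below)
@@ -48,6 +48,42 @@ module UniversalDocumentProcessor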
48
48
  []
49
49
  end
50
50
 
51
+ def extract_statistics
52
+ processor.respond_to?(:extract_statistics) ? processor.extract_statistics : {}
53
+ rescue => e
54
+ {}
55
+ end
56
+
57
+ def validate_data
58
+ processor.respond_to?(:validate_data) ? processor.validate_data : {}
59
+ rescue => e
60
+ {}
61
+ end
62
+
63
+ def extract_formulas
64
+ processor.respond_to?(:extract_formulas) ? processor.extract_formulas : []
65
+ rescue => e
66
+ []
67
+ end
68
+
69
+ def to_json
70
+ processor.respond_to?(:to_json) ? processor.to_json : process.to_json
71
+ rescue => e
72
+ process.to_json
73
+ end
74
+
75
+ def to_csv(sheet_name = nil)
76
+ processor.respond_to?(:to_csv) ? processor.to_csv(sheet_name) : ""
77
+ rescue => e
78
+ ""
79
+ end
80
+
81
+ def to_tsv(sheet_name = nil)
82
+ processor.respond_to?(:to_tsv) ? processor.to_tsv(sheet_name) : ""
83
+ rescue => e
84
+ ""
85
+ end
86
+
51
87
  def convert_to(target_format)
52
88
  case target_format.to_sym
53
89
  when :pdf
@@ -64,7 +100,7 @@ module UniversalDocumentProcessor
64
100
  end
65
101
 
66
102
  def supported_formats
67
- %w[pdf docx doc xlsx xls pptx ppt txt rtf html xml csv jpg jpeg png gif bmp tiff zip rar 7z]
103
+ %w[pdf docx doc xlsx xls pptx ppt txt rtf html xml csv tsv jpg jpeg png gif bmp tiff zip rar 7z]
68
104
  end
69
105
 
70
106
  def supported?
@@ -139,11 +175,11 @@ module UniversalDocumentProcessor
139
175
  case @content_type
140
176
  when /pdf/
141
177
  Processors::PdfProcessor.new(@file_path, @options)
142
- when /word/, /document/
178
+ when /wordprocessingml/, /msword/
143
179
  Processors::WordProcessor.new(@file_path, @options)
144
- when /excel/, /spreadsheet/
180
+ when /spreadsheetml/, /ms-excel/, /csv/, /tab-separated/
145
181
  Processors::ExcelProcessor.new(@file_path, @options)
146
- when /powerpoint/, /presentation/
182
+ when /presentationml/, /ms-powerpoint/
147
183
  Processors::PowerpointProcessor.new(@file_path, @options)
148
184
  when /image/
149
185
  Processors::ImageProcessor.new(@file_path, @options)