legal_summariser 0.1.0 → 0.2.0
This diff shows the changes between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in the public registry.
- checksums.yaml +4 -4
 - data/CHANGELOG.md +25 -0
 - data/exe/legal_summariser +131 -1
 - data/lib/legal_summariser/text_extractor.rb +125 -7
 - data/lib/legal_summariser/version.rb +1 -1
 - data/lib/legal_summariser.rb +191 -38
 - metadata +1 -1
 
    
        checksums.yaml
    CHANGED
    
    | 
         @@ -1,7 +1,7 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            ---
         
     | 
| 
       2 
2 
     | 
    
         
             
            SHA256:
         
     | 
| 
       3 
     | 
    
         
            -
              metadata.gz:  
     | 
| 
       4 
     | 
    
         
            -
              data.tar.gz:  
     | 
| 
      
 3 
     | 
    
         
            +
              metadata.gz: 99da5ab12240efdb658eafc5b3e76ef46834f7a7d76bf86edfe1958ea75c4f58
         
     | 
| 
      
 4 
     | 
    
         
            +
              data.tar.gz: aa0ee6b2406771e99c22af8d5ab00145eeee8666a8ccbda9b96c48ed87e0e408
         
     | 
| 
       5 
5 
     | 
    
         
             
            SHA512:
         
     | 
| 
       6 
     | 
    
         
            -
              metadata.gz:  
     | 
| 
       7 
     | 
    
         
            -
              data.tar.gz:  
     | 
| 
      
 6 
     | 
    
         
            +
              metadata.gz: 20d58233629912675fd4fa7a44c0813d1267e25bc0004df18d37c60ed069906f31d8a68cc165c337809ff040e874d41053b347eb4b3df46f98bf85451a1f654d
         
     | 
| 
      
 7 
     | 
    
         
            +
              data.tar.gz: f7bc3b2feab8929485a5387e93ecc0762b32d18903460ba5e33ea1a7c3dd010102c8cbe10feff018694c0ea2a9641c6919904da574a041a226d1de0f1134122b
         
     | 
    
        data/CHANGELOG.md
    CHANGED
    
    | 
         @@ -5,6 +5,31 @@ All notable changes to this project will be documented in this file. 
     | 
|
| 
       5 
5 
     | 
    
         
             
            The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
         
     | 
| 
       6 
6 
     | 
    
         
             
            and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
         
     | 
| 
       7 
7 
     | 
    
         | 
| 
      
 8 
     | 
    
         
            +
            ## [0.2.0] - 2025-01-09
         
     | 
| 
      
 9 
     | 
    
         
            +
             
     | 
| 
      
 10 
     | 
    
         
            +
            ### Added
         
     | 
| 
      
 11 
     | 
    
         
            +
            - **Configuration System**: Comprehensive configuration management with validation
         
     | 
| 
      
 12 
     | 
    
         
            +
            - **Caching System**: Result caching with TTL and size management
         
     | 
| 
      
 13 
     | 
    
         
            +
            - **Performance Monitoring**: Built-in performance tracking and metrics
         
     | 
| 
      
 14 
     | 
    
         
            +
            - **Enhanced CLI**: New commands for batch processing, statistics, and configuration
         
     | 
| 
      
 15 
     | 
    
         
            +
            - **Batch Processing**: Process multiple documents simultaneously
         
     | 
| 
      
 16 
     | 
    
         
            +
            - **Enhanced Document Support**: Added RTF support and improved text extraction
         
     | 
| 
      
 17 
     | 
    
         
            +
            - **Advanced Error Handling**: Better error messages and recovery mechanisms
         
     | 
| 
      
 18 
     | 
    
         
            +
            - **Comprehensive Testing**: 75 test cases with full coverage
         
     | 
| 
      
 19 
     | 
    
         
            +
            - **Documentation**: Complete examples and contribution guidelines
         
     | 
| 
      
 20 
     | 
    
         
            +
             
     | 
| 
      
 21 
     | 
    
         
            +
            ### Enhanced
         
     | 
| 
      
 22 
     | 
    
         
            +
            - **Text Extraction**: Multiple encoding support, better PDF/DOCX handling
         
     | 
| 
      
 23 
     | 
    
         
            +
            - **Document Type Detection**: Improved scoring system for 9 document types
         
     | 
| 
      
 24 
     | 
    
         
            +
            - **Risk Analysis**: More comprehensive risk patterns and compliance checking
         
     | 
| 
      
 25 
     | 
    
         
            +
            - **Summarization**: Better plain English conversion and key point extraction
         
     | 
| 
      
 26 
     | 
    
         
            +
            - **CLI Interface**: Verbose logging, caching options, and performance stats
         
     | 
| 
      
 27 
     | 
    
         
            +
             
     | 
| 
      
 28 
     | 
    
         
            +
            ### Fixed
         
     | 
| 
      
 29 
     | 
    
         
            +
            - Text cleaning and normalization issues
         
     | 
| 
      
 30 
     | 
    
         
            +
            - Memory leaks in document processing
         
     | 
| 
      
 31 
     | 
    
         
            +
            - Error handling for edge cases
         
     | 
| 
      
 32 
     | 
    
         
            +
             
     | 
| 
       8 
33 
     | 
    
         
             
            ## [0.1.0] - 2024-09-09
         
     | 
| 
       9 
34 
     | 
    
         | 
| 
       10 
35 
     | 
    
         
             
            ### Added
         
     | 
    
        data/exe/legal_summariser
    CHANGED
    
    | 
         @@ -10,18 +10,28 @@ module LegalSummariser 
     | 
|
| 
       10 
10 
     | 
    
         
             
                option :format, aliases: '-f', default: 'text', desc: 'Output format (json, markdown, text)'
         
     | 
| 
       11 
11 
     | 
    
         
             
                option :output, aliases: '-o', desc: 'Output file path (optional)'
         
     | 
| 
       12 
12 
     | 
    
         
             
                option :max_sentences, type: :numeric, default: 5, desc: 'Maximum sentences in summary'
         
     | 
| 
      
 13 
     | 
    
         
            +
                option :verbose, aliases: '-v', type: :boolean, default: false, desc: 'Enable verbose logging'
         
     | 
| 
      
 14 
     | 
    
         
            +
                option :cache, type: :boolean, default: false, desc: 'Enable result caching'
         
     | 
| 
       13 
15 
     | 
    
         
             
                def analyze(file_path)
         
     | 
| 
       14 
16 
     | 
    
         
             
                  begin
         
     | 
| 
      
 17 
     | 
    
         
            +
                    # Configure logging and caching
         
     | 
| 
      
 18 
     | 
    
         
            +
                    configure_gem(options)
         
     | 
| 
      
 19 
     | 
    
         
            +
                    
         
     | 
| 
       15 
20 
     | 
    
         
             
                    puts "Analyzing: #{file_path}"
         
     | 
| 
       16 
21 
     | 
    
         
             
                    puts "Format: #{options[:format]}"
         
     | 
| 
      
 22 
     | 
    
         
            +
                    puts "Caching: #{options[:cache] ? 'enabled' : 'disabled'}"
         
     | 
| 
       17 
23 
     | 
    
         
             
                    puts "-" * 50
         
     | 
| 
       18 
24 
     | 
    
         | 
| 
      
 25 
     | 
    
         
            +
                    start_time = Time.now
         
     | 
| 
      
 26 
     | 
    
         
            +
                    
         
     | 
| 
       19 
27 
     | 
    
         
             
                    # Perform analysis
         
     | 
| 
       20 
28 
     | 
    
         
             
                    results = LegalSummariser.summarise(file_path, {
         
     | 
| 
       21 
29 
     | 
    
         
             
                      format: options[:format],
         
     | 
| 
       22 
30 
     | 
    
         
             
                      max_sentences: options[:max_sentences]
         
     | 
| 
       23 
31 
     | 
    
         
             
                    })
         
     | 
| 
       24 
32 
     | 
    
         | 
| 
      
 33 
     | 
    
         
            +
                    end_time = Time.now
         
     | 
| 
      
 34 
     | 
    
         
            +
                    
         
     | 
| 
       25 
35 
     | 
    
         
             
                    # Output results
         
     | 
| 
       26 
36 
     | 
    
         
             
                    if options[:output]
         
     | 
| 
       27 
37 
     | 
    
         
             
                      File.write(options[:output], results)
         
     | 
| 
         @@ -30,15 +40,24 @@ module LegalSummariser 
     | 
|
| 
       30 
40 
     | 
    
         
             
                      puts results
         
     | 
| 
       31 
41 
     | 
    
         
             
                    end
         
     | 
| 
       32 
42 
     | 
    
         | 
| 
      
 43 
     | 
    
         
            +
                    if options[:verbose]
         
     | 
| 
      
 44 
     | 
    
         
            +
                      puts "\n" + "-" * 50
         
     | 
| 
      
 45 
     | 
    
         
            +
                      puts "Analysis completed in #{(end_time - start_time).round(3)}s"
         
     | 
| 
      
 46 
     | 
    
         
            +
                      puts "Performance stats available via 'legal_summariser stats'"
         
     | 
| 
      
 47 
     | 
    
         
            +
                    end
         
     | 
| 
      
 48 
     | 
    
         
            +
                    
         
     | 
| 
       33 
49 
     | 
    
         
             
                  rescue LegalSummariser::DocumentNotFoundError => e
         
     | 
| 
       34 
50 
     | 
    
         
             
                    puts "Error: #{e.message}"
         
     | 
| 
       35 
51 
     | 
    
         
             
                    exit 1
         
     | 
| 
       36 
52 
     | 
    
         
             
                  rescue LegalSummariser::UnsupportedFormatError => e
         
     | 
| 
       37 
53 
     | 
    
         
             
                    puts "Error: #{e.message}"
         
     | 
| 
       38 
54 
     | 
    
         
             
                    exit 1
         
     | 
| 
      
 55 
     | 
    
         
            +
                  rescue LegalSummariser::Error => e
         
     | 
| 
      
 56 
     | 
    
         
            +
                    puts "Processing error: #{e.message}"
         
     | 
| 
      
 57 
     | 
    
         
            +
                    exit 1
         
     | 
| 
       39 
58 
     | 
    
         
             
                  rescue => e
         
     | 
| 
       40 
59 
     | 
    
         
             
                    puts "Unexpected error: #{e.message}"
         
     | 
| 
       41 
     | 
    
         
            -
                    puts e.backtrace if ENV['DEBUG']
         
     | 
| 
      
 60 
     | 
    
         
            +
                    puts e.backtrace if options[:verbose] || ENV['DEBUG']
         
     | 
| 
       42 
61 
     | 
    
         
             
                    exit 1
         
     | 
| 
       43 
62 
     | 
    
         
             
                  end
         
     | 
| 
       44 
63 
     | 
    
         
             
                end
         
     | 
| 
         @@ -62,6 +81,107 @@ module LegalSummariser 
     | 
|
| 
       62 
81 
     | 
    
         
             
                  puts "- Plain text (text, txt)"
         
     | 
| 
       63 
82 
     | 
    
         
             
                end
         
     | 
| 
       64 
83 
     | 
    
         | 
| 
      
 84 
     | 
    
         
            +
                desc "batch FILES", "Analyze multiple legal documents"
         
     | 
| 
      
 85 
     | 
    
         
            +
                option :format, aliases: '-f', default: 'text', desc: 'Output format (json, markdown, text)'
         
     | 
| 
      
 86 
     | 
    
         
            +
                option :output_dir, aliases: '-d', desc: 'Output directory for results'
         
     | 
| 
      
 87 
     | 
    
         
            +
                option :verbose, aliases: '-v', type: :boolean, default: false, desc: 'Enable verbose logging'
         
     | 
| 
      
 88 
     | 
    
         
            +
                option :cache, type: :boolean, default: true, desc: 'Enable result caching'
         
     | 
| 
      
 89 
     | 
    
         
            +
                def batch(*file_paths)
         
     | 
| 
      
 90 
     | 
    
         
            +
                  if file_paths.empty?
         
     | 
| 
      
 91 
     | 
    
         
            +
                    puts "Error: No files specified"
         
     | 
| 
      
 92 
     | 
    
         
            +
                    puts "Usage: legal_summariser batch file1.pdf file2.docx ..."
         
     | 
| 
      
 93 
     | 
    
         
            +
                    exit 1
         
     | 
| 
      
 94 
     | 
    
         
            +
                  end
         
     | 
| 
      
 95 
     | 
    
         
            +
                  
         
     | 
| 
      
 96 
     | 
    
         
            +
                  configure_gem(options)
         
     | 
| 
      
 97 
     | 
    
         
            +
                  
         
     | 
| 
      
 98 
     | 
    
         
            +
                  puts "Batch processing #{file_paths.length} files..."
         
     | 
| 
      
 99 
     | 
    
         
            +
                  puts "-" * 50
         
     | 
| 
      
 100 
     | 
    
         
            +
                  
         
     | 
| 
      
 101 
     | 
    
         
            +
                  results = LegalSummariser.batch_summarise(file_paths, {
         
     | 
| 
      
 102 
     | 
    
         
            +
                    format: options[:format]
         
     | 
| 
      
 103 
     | 
    
         
            +
                  })
         
     | 
| 
      
 104 
     | 
    
         
            +
                  
         
     | 
| 
      
 105 
     | 
    
         
            +
                  # Process results
         
     | 
| 
      
 106 
     | 
    
         
            +
                  successful = results.count { |r| r[:success] }
         
     | 
| 
      
 107 
     | 
    
         
            +
                  failed = results.count { |r| !r[:success] }
         
     | 
| 
      
 108 
     | 
    
         
            +
                  
         
     | 
| 
      
 109 
     | 
    
         
            +
                  puts "\nBatch processing completed:"
         
     | 
| 
      
 110 
     | 
    
         
            +
                  puts "✓ Successful: #{successful}"
         
     | 
| 
      
 111 
     | 
    
         
            +
                  puts "✗ Failed: #{failed}" if failed > 0
         
     | 
| 
      
 112 
     | 
    
         
            +
                  
         
     | 
| 
      
 113 
     | 
    
         
            +
                  if options[:output_dir]
         
     | 
| 
      
 114 
     | 
    
         
            +
                    FileUtils.mkdir_p(options[:output_dir])
         
     | 
| 
      
 115 
     | 
    
         
            +
                    
         
     | 
| 
      
 116 
     | 
    
         
            +
                    results.each do |result|
         
     | 
| 
      
 117 
     | 
    
         
            +
                      next unless result[:success]
         
     | 
| 
      
 118 
     | 
    
         
            +
                      
         
     | 
| 
      
 119 
     | 
    
         
            +
                      filename = File.basename(result[:file_path], '.*') + '_analysis'
         
     | 
| 
      
 120 
     | 
    
         
            +
                      extension = case options[:format]
         
     | 
| 
      
 121 
     | 
    
         
            +
                                 when 'json' then '.json'
         
     | 
| 
      
 122 
     | 
    
         
            +
                                 when 'markdown', 'md' then '.md'
         
     | 
| 
      
 123 
     | 
    
         
            +
                                 else '.txt'
         
     | 
| 
      
 124 
     | 
    
         
            +
                                 end
         
     | 
| 
      
 125 
     | 
    
         
            +
                      
         
     | 
| 
      
 126 
     | 
    
         
            +
                      output_file = File.join(options[:output_dir], filename + extension)
         
     | 
| 
      
 127 
     | 
    
         
            +
                      File.write(output_file, result[:result])
         
     | 
| 
      
 128 
     | 
    
         
            +
                      puts "Saved: #{output_file}"
         
     | 
| 
      
 129 
     | 
    
         
            +
                    end
         
     | 
| 
      
 130 
     | 
    
         
            +
                  end
         
     | 
| 
      
 131 
     | 
    
         
            +
                end
         
     | 
| 
      
 132 
     | 
    
         
            +
             
     | 
| 
      
 133 
     | 
    
         
            +
                desc "stats", "Show performance and usage statistics"
         
     | 
| 
      
 134 
     | 
    
         
            +
                def stats
         
     | 
| 
      
 135 
     | 
    
         
            +
                  stats = LegalSummariser.stats
         
     | 
| 
      
 136 
     | 
    
         
            +
                  
         
     | 
| 
      
 137 
     | 
    
         
            +
                  puts "Legal Summariser Statistics"
         
     | 
| 
      
 138 
     | 
    
         
            +
                  puts "=" * 50
         
     | 
| 
      
 139 
     | 
    
         
            +
                  
         
     | 
| 
      
 140 
     | 
    
         
            +
                  # Performance stats
         
     | 
| 
      
 141 
     | 
    
         
            +
                  if stats[:performance].any?
         
     | 
| 
      
 142 
     | 
    
         
            +
                    puts "\nPerformance:"
         
     | 
| 
      
 143 
     | 
    
         
            +
                    stats[:performance].each do |metric, data|
         
     | 
| 
      
 144 
     | 
    
         
            +
                      puts "  #{metric.to_s.tr('_', ' ').capitalize}:"
         
     | 
| 
      
 145 
     | 
    
         
            +
                      puts "    Count: #{data[:count]}"
         
     | 
| 
      
 146 
     | 
    
         
            +
                      puts "    Average: #{data[:average]}s"
         
     | 
| 
      
 147 
     | 
    
         
            +
                      puts "    Total: #{data[:total]}s"
         
     | 
| 
      
 148 
     | 
    
         
            +
                    end
         
     | 
| 
      
 149 
     | 
    
         
            +
                  end
         
     | 
| 
      
 150 
     | 
    
         
            +
                  
         
     | 
| 
      
 151 
     | 
    
         
            +
                  # Cache stats
         
     | 
| 
      
 152 
     | 
    
         
            +
                  puts "\nCache:"
         
     | 
| 
      
 153 
     | 
    
         
            +
                  cache_stats = stats[:cache]
         
     | 
| 
      
 154 
     | 
    
         
            +
                  if cache_stats[:enabled]
         
     | 
| 
      
 155 
     | 
    
         
            +
                    puts "  Status: Enabled"
         
     | 
| 
      
 156 
     | 
    
         
            +
                    puts "  Files: #{cache_stats[:file_count]}"
         
     | 
| 
      
 157 
     | 
    
         
            +
                    puts "  Size: #{cache_stats[:total_size_mb]} MB"
         
     | 
| 
      
 158 
     | 
    
         
            +
                  else
         
     | 
| 
      
 159 
     | 
    
         
            +
                    puts "  Status: Disabled"
         
     | 
| 
      
 160 
     | 
    
         
            +
                  end
         
     | 
| 
      
 161 
     | 
    
         
            +
                  
         
     | 
| 
      
 162 
     | 
    
         
            +
                  # Memory stats
         
     | 
| 
      
 163 
     | 
    
         
            +
                  memory = stats[:memory]
         
     | 
| 
      
 164 
     | 
    
         
            +
                  if memory[:available] != false
         
     | 
| 
      
 165 
     | 
    
         
            +
                    puts "\nMemory:"
         
     | 
| 
      
 166 
     | 
    
         
            +
                    puts "  Objects: #{memory[:object_count]}"
         
     | 
| 
      
 167 
     | 
    
         
            +
                    puts "  GC Count: #{memory[:gc_count]}"
         
     | 
| 
      
 168 
     | 
    
         
            +
                    puts "  Estimated Usage: #{memory[:memory_mb]} MB"
         
     | 
| 
      
 169 
     | 
    
         
            +
                  end
         
     | 
| 
      
 170 
     | 
    
         
            +
                end
         
     | 
| 
      
 171 
     | 
    
         
            +
             
     | 
| 
      
 172 
     | 
    
         
            +
                desc "config", "Show current configuration"
         
     | 
| 
      
 173 
     | 
    
         
            +
                def config
         
     | 
| 
      
 174 
     | 
    
         
            +
                  config = LegalSummariser.configuration
         
     | 
| 
      
 175 
     | 
    
         
            +
                  
         
     | 
| 
      
 176 
     | 
    
         
            +
                  puts "Legal Summariser Configuration"
         
     | 
| 
      
 177 
     | 
    
         
            +
                  puts "=" * 50
         
     | 
| 
      
 178 
     | 
    
         
            +
                  puts "Language: #{config.language}"
         
     | 
| 
      
 179 
     | 
    
         
            +
                  puts "Max File Size: #{config.max_file_size / 1024 / 1024} MB"
         
     | 
| 
      
 180 
     | 
    
         
            +
                  puts "Timeout: #{config.timeout}s"
         
     | 
| 
      
 181 
     | 
    
         
            +
                  puts "Caching: #{config.enable_caching ? 'enabled' : 'disabled'}"
         
     | 
| 
      
 182 
     | 
    
         
            +
                  puts "Cache Directory: #{config.cache_dir}"
         
     | 
| 
      
 183 
     | 
    
         
            +
                end
         
     | 
| 
      
 184 
     | 
    
         
            +
             
     | 
| 
       65 
185 
     | 
    
         
             
                desc "demo", "Run demo analysis on sample documents"
         
     | 
| 
       66 
186 
     | 
    
         
             
                def demo
         
     | 
| 
       67 
187 
     | 
    
         
             
                  puts "Legal Summariser Demo"
         
     | 
| 
         @@ -85,6 +205,16 @@ module LegalSummariser 
     | 
|
| 
       85 
205 
     | 
    
         | 
| 
       86 
206 
     | 
    
         
             
                private
         
     | 
| 
       87 
207 
     | 
    
         | 
| 
      
 208 
     | 
    
         
            +
                def configure_gem(options)
         
     | 
| 
      
 209 
     | 
    
         
            +
                  LegalSummariser.configure do |config|
         
     | 
| 
      
 210 
     | 
    
         
            +
                    if options[:verbose]
         
     | 
| 
      
 211 
     | 
    
         
            +
                      require 'logger'
         
     | 
| 
      
 212 
     | 
    
         
            +
                      config.logger = Logger.new(STDOUT, level: Logger::INFO)
         
     | 
| 
      
 213 
     | 
    
         
            +
                    end
         
     | 
| 
      
 214 
     | 
    
         
            +
                    config.enable_caching = options[:cache] if options.key?(:cache)
         
     | 
| 
      
 215 
     | 
    
         
            +
                  end
         
     | 
| 
      
 216 
     | 
    
         
            +
                end
         
     | 
| 
      
 217 
     | 
    
         
            +
             
     | 
| 
       88 
218 
     | 
    
         
             
                def create_sample_nda
         
     | 
| 
       89 
219 
     | 
    
         
             
                  <<~NDA
         
     | 
| 
       90 
220 
     | 
    
         
             
                    NON-DISCLOSURE AGREEMENT
         
     | 
    
        data/lib/legal_summariser/text_extractor.rb
    CHANGED
    
    | 
         @@ -2,22 +2,38 @@ 
     | 
|
| 
       2 
2 
     | 
    
         | 
| 
       3 
3 
     | 
    
         
             
            require 'pdf-reader'
         
     | 
| 
       4 
4 
     | 
    
         
             
            require 'docx'
         
     | 
| 
      
 5 
     | 
    
         
            +
            require 'logger'
         
     | 
| 
       5 
6 
     | 
    
         | 
| 
       6 
7 
     | 
    
         
             
            module LegalSummariser
         
     | 
| 
       7 
8 
     | 
    
         
             
              class TextExtractor
         
     | 
| 
      
 9 
     | 
    
         
            +
                # Logger for debugging and monitoring
         
     | 
| 
      
 10 
     | 
    
         
            +
                def self.logger
         
     | 
| 
      
 11 
     | 
    
         
            +
                  @logger ||= Logger.new(STDOUT, level: Logger::WARN)
         
     | 
| 
      
 12 
     | 
    
         
            +
                end
         
     | 
| 
      
 13 
     | 
    
         
            +
             
     | 
| 
      
 14 
     | 
    
         
            +
                def self.logger=(logger)
         
     | 
| 
      
 15 
     | 
    
         
            +
                  @logger = logger
         
     | 
| 
      
 16 
     | 
    
         
            +
                end
         
     | 
| 
       8 
17 
     | 
    
         
             
                # Extract text from various document formats
         
     | 
| 
       9 
18 
     | 
    
         
             
                # @param file_path [String] Path to the document
         
     | 
| 
       10 
19 
     | 
    
         
             
                # @return [String] Extracted text
         
     | 
| 
       11 
20 
     | 
    
         
             
                def self.extract(file_path)
         
     | 
| 
      
 21 
     | 
    
         
            +
                  raise DocumentNotFoundError, "File not found: #{file_path}" unless File.exist?(file_path)
         
     | 
| 
      
 22 
     | 
    
         
            +
                  raise DocumentNotFoundError, "File is empty: #{file_path}" if File.zero?(file_path)
         
     | 
| 
      
 23 
     | 
    
         
            +
                  
         
     | 
| 
      
 24 
     | 
    
         
            +
                  logger.info "Extracting text from: #{file_path}"
         
     | 
| 
      
 25 
     | 
    
         
            +
                  
         
     | 
| 
       12 
26 
     | 
    
         
             
                  case File.extname(file_path).downcase
         
     | 
| 
       13 
27 
     | 
    
         
             
                  when '.pdf'
         
     | 
| 
       14 
28 
     | 
    
         
             
                    extract_from_pdf(file_path)
         
     | 
| 
       15 
29 
     | 
    
         
             
                  when '.docx'
         
     | 
| 
       16 
30 
     | 
    
         
             
                    extract_from_docx(file_path)
         
     | 
| 
       17 
     | 
    
         
            -
                  when '.txt'
         
     | 
| 
       18 
     | 
    
         
            -
                     
     | 
| 
      
 31 
     | 
    
         
            +
                  when '.txt', '.text'
         
     | 
| 
      
 32 
     | 
    
         
            +
                    extract_from_text(file_path)
         
     | 
| 
      
 33 
     | 
    
         
            +
                  when '.rtf'
         
     | 
| 
      
 34 
     | 
    
         
            +
                    extract_from_rtf(file_path)
         
     | 
| 
       19 
35 
     | 
    
         
             
                  else
         
     | 
| 
       20 
     | 
    
         
            -
                    raise UnsupportedFormatError, "Unsupported file format: #{File.extname(file_path)}"
         
     | 
| 
      
 36 
     | 
    
         
            +
                    raise UnsupportedFormatError, "Unsupported file format: #{File.extname(file_path)}. Supported formats: .pdf, .docx, .txt, .rtf"
         
     | 
| 
       21 
37 
     | 
    
         
             
                  end
         
     | 
| 
       22 
38 
     | 
    
         
             
                end
         
     | 
| 
       23 
39 
     | 
    
         | 
| 
         @@ -27,15 +43,30 @@ module LegalSummariser 
     | 
|
| 
       27 
43 
     | 
    
         
             
                # @param file_path [String] Path to PDF file
                # @return [String] Extracted text, passed through clean_text
                # @raise [Error] if the PDF is malformed, uses unsupported features,
                #   contains no extractable text, or fails to parse for any other reason
                def self.extract_from_pdf(file_path)
                  logger.debug "Processing PDF: #{file_path}"

                  reader = PDF::Reader.new(file_path)
                  text = +""
                  page_count = 0

                  reader.pages.each do |page|
                    page_count += 1
                    page_text = page.text
                    # Skip pages that yield nil or whitespace-only text.
                    next if page_text.nil? || page_text.strip.empty?

                    text << page_text << "\n"
                  end

                  logger.info "Extracted text from #{page_count} PDF pages"

                  if text.strip.empty?
                    logger.warn "No text extracted from PDF - file may be image-based or encrypted"
                    raise Error, "No extractable text found in PDF. File may be image-based or password-protected."
                  end

                  clean_text(text)
                rescue Error
                  # Already one of our own errors (e.g. the "no extractable text" case
                  # above): pass it through untouched instead of letting the generic
                  # handler below wrap its message a second time.
                  raise
                rescue PDF::Reader::MalformedPDFError => e
                  raise Error, "Malformed PDF file: #{e.message}"
                rescue PDF::Reader::UnsupportedFeatureError => e
                  raise Error, "PDF contains unsupported features: #{e.message}"
                rescue => e
                  raise Error, "Failed to extract text from PDF: #{e.message}"
                end
         
     | 
| 
         @@ -44,28 +75,98 @@ module LegalSummariser 
     | 
|
| 
       44 
75 
     | 
    
         
             
                # @param file_path [String] Path to DOCX file
                # @return [String] Extracted text (body paragraphs followed by table
                #   cells), passed through clean_text
                # @raise [Error] if the file is not a valid DOCX archive, contains no
                #   text, or fails to parse for any other reason
                def self.extract_from_docx(file_path)
                  logger.debug "Processing DOCX: #{file_path}"

                  doc = Docx::Document.open(file_path)
                  text = +""
                  paragraph_count = 0

                  doc.paragraphs.each do |paragraph|
                    paragraph_text = paragraph.text
                    next if paragraph_text.nil? || paragraph_text.strip.empty?

                    text << paragraph_text << "\n"
                    paragraph_count += 1
                  end

                  # Also extract text from tables if present: one output line per row,
                  # non-empty cells separated by a space.
                  doc.tables.each do |table|
                    table.rows.each do |row|
                      row.cells.each do |cell|
                        cell_text = cell.text
                        text << cell_text << " " if cell_text && !cell_text.strip.empty?
                      end
                      text << "\n"
                    end
                  end

                  logger.info "Extracted text from #{paragraph_count} DOCX paragraphs"

                  raise Error, "No text content found in DOCX file" if text.strip.empty?

                  clean_text(text)
                rescue Error
                  # Our own error (the "no text content" case above): re-raise as is so
                  # the generic handler below does not wrap the message a second time.
                  raise
                rescue Zip::Error => e
                  raise Error, "Invalid DOCX file format: #{e.message}"
                rescue => e
                  raise Error, "Failed to extract text from DOCX: #{e.message}"
                end
         
     | 
| 
       58 
115 
     | 
    
         | 
| 
      
 116 
     | 
    
         
            +
                # Extract text from plain text files
                #
                # The file's bytes are read once and interpreted with each candidate
                # encoding in turn. The first candidate for which the bytes are valid and
                # convertible is transcoded to UTF-8 and cleaned. Merely passing
                # `encoding:` to File.read never raises on bad bytes, so validity has to
                # be checked explicitly for the fallback list to have any effect.
                #
                # @param file_path [String] Path to text file
                # @return [String] Extracted text as valid UTF-8, passed through clean_text
                # @raise [Error] if the bytes cannot be decoded with any candidate encoding
                def self.extract_from_text(file_path)
                  logger.debug "Processing text file: #{file_path}"

                  raw = File.binread(file_path)

                  # Try different encodings, most specific first.
                  encodings = ['UTF-8', 'ISO-8859-1', 'Windows-1252']

                  encodings.each do |encoding|
                    begin
                      candidate = raw.dup.force_encoding(encoding)
                      unless candidate.valid_encoding?
                        logger.debug "Failed to read with #{encoding} encoding, trying next"
                        next
                      end

                      text = candidate.encode('UTF-8')
                      logger.info "Successfully read text file with #{encoding} encoding"
                      return clean_text(text)
                    rescue Encoding::InvalidByteSequenceError, Encoding::UndefinedConversionError
                      logger.debug "Failed to read with #{encoding} encoding, trying next"
                      next
                    end
                  end

                  raise Error, "Unable to read text file with supported encodings"
                end
         
     | 
| 
      
 138 
     | 
    
         
            +
             
     | 
| 
      
 139 
     | 
    
         
            +
                # Extract text from RTF files (basic support)
                #
                # Walks the RTF token stream once, tracking group nesting:
                # * groups that open with a non-content destination (font/colour/style
                #   tables, document info, pictures, or any `\*` group) are dropped
                #   together with everything nested inside them;
                # * `\par` / `\line` become newlines and `\tab` a tab;
                # * `\'hh` (Windows-1252 byte) and `\uN` (Unicode code point) escapes
                #   are decoded;
                # * `\\`, `\{` and `\}` produce the literal character;
                # * every other control word, and the group braces themselves, are
                #   removed. Raw CR/LF in the file carry no meaning in RTF and are ignored.
                #
                # @param file_path [String] Path to RTF file
                # @return [String] Extracted text, passed through clean_text
                # @raise [Error] if the file cannot be read or processed
                def self.extract_from_rtf(file_path)
                  logger.debug "Processing RTF: #{file_path}"

                  content = File.read(file_path, encoding: 'UTF-8')
                  # Stray non-UTF-8 bytes would make the regex scan raise; drop them.
                  content = content.scrub('') unless content.valid_encoding?

                  ignored_destinations = %w[fonttbl colortbl stylesheet info pict]
                  # Captures, in order: hex escape, unicode escape, control word,
                  # numeric parameter, control symbol, brace, plain text run.
                  token = /\\'(\h\h)|\\u(-?\d+)\??|\\([a-z]+)(-?\d+)? ?|\\(.)|([{}])|([^\\{}]+)/m

                  text = +""
                  depth = 0
                  skip_from = nil        # depth at which the group being discarded opened
                  at_group_start = false # true until the first token after a "{"

                  content.scan(token) do |hex, uni, word, _param, sym, brace, run|
                    if brace == '{'
                      depth += 1
                      at_group_start = true
                      next
                    elsif brace == '}'
                      skip_from = nil if skip_from == depth
                      depth -= 1
                      at_group_start = false
                      next
                    end

                    first_in_group = at_group_start
                    at_group_start = false
                    if first_in_group && skip_from.nil? && (sym == '*' || ignored_destinations.include?(word))
                      skip_from = depth
                    end
                    next if skip_from

                    if hex
                      byte = hex.hex.chr.force_encoding('Windows-1252')
                      text << byte.encode('UTF-8', undef: :replace, replace: '')
                    elsif uni
                      code = uni.to_i
                      code += 65_536 if code.negative? # RTF stores \uN as a signed 16-bit value
                      text << [code].pack('U') unless (0xD800..0xDFFF).cover?(code)
                    elsif word
                      case word
                      when 'par', 'line' then text << "\n"
                      when 'tab' then text << "\t"
                      end
                    elsif sym
                      case sym
                      when '\\', '{', '}' then text << sym
                      when '~' then text << ' '           # non-breaking space
                      when "\n", "\r" then text << "\n"   # backslash-newline acts as \par
                      end
                    elsif run
                      text << run.delete("\r\n")
                    end
                  end

                  clean_text(text)
                rescue => e
                  raise Error, "Failed to extract text from RTF: #{e.message}"
                end
         
     | 
| 
      
 156 
     | 
    
         
            +
             
     | 
| 
       59 
157 
     | 
    
         
             
                # Clean extracted text
         
     | 
| 
       60 
158 
     | 
    
         
             
                # @param text [String] Raw extracted text
         
     | 
| 
       61 
159 
     | 
    
         
             
                # @return [String] Cleaned text
         
     | 
| 
       62 
160 
     | 
    
         
             
                def self.clean_text(text)
         
     | 
| 
      
 161 
     | 
    
         
            +
                  return "" if text.nil? || text.empty?
         
     | 
| 
      
 162 
     | 
    
         
            +
                  
         
     | 
| 
       63 
163 
     | 
    
         
             
                  # Normalize line breaks first
         
     | 
| 
       64 
164 
     | 
    
         
             
                  text = text.gsub(/\r\n?/, "\n")
         
     | 
| 
       65 
165 
     | 
    
         | 
| 
       66 
     | 
    
         
            -
                  # Remove common  
     | 
| 
      
 166 
     | 
    
         
            +
                  # Remove common document artifacts
         
     | 
| 
       67 
167 
     | 
    
         
             
                  text = text.gsub(/\f/, '') # Form feed characters
         
     | 
| 
       68 
168 
     | 
    
         
             
                  text = text.gsub(/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]/, '') # Control characters
         
     | 
| 
      
 169 
     | 
    
         
            +
                  text = text.gsub(/\u00A0/, ' ') # Non-breaking spaces
         
     | 
| 
       69 
170 
     | 
    
         | 
| 
       70 
171 
     | 
    
         
             
                  # Remove excessive whitespace but preserve line breaks
         
     | 
| 
       71 
172 
     | 
    
         
             
                  text = text.gsub(/[ \t]+/, ' ')
         
     | 
| 
         @@ -73,7 +174,24 @@ module LegalSummariser 
     | 
|
| 
       73 
174 
     | 
    
         
             
                  # Remove excessive newlines
         
     | 
| 
       74 
175 
     | 
    
         
             
                  text = text.gsub(/\n{3,}/, "\n\n")
         
     | 
| 
       75 
176 
     | 
    
         | 
| 
      
 177 
     | 
    
         
            +
                  # Remove leading/trailing whitespace from each line
         
     | 
| 
      
 178 
     | 
    
         
            +
                  text = text.split("\n").map(&:strip).join("\n")
         
     | 
| 
      
 179 
     | 
    
         
            +
                  
         
     | 
| 
      
 180 
     | 
    
         
            +
                  # Remove empty lines at start and end
         
     | 
| 
       76 
181 
     | 
    
         
             
                  text.strip
         
     | 
| 
       77 
182 
     | 
    
         
             
                end
         
     | 
| 
      
 183 
     | 
    
         
            +
             
     | 
| 
      
 184 
     | 
    
         
            +
                # Get document statistics
                #
                # Words are whitespace-separated tokens, sentences are runs of text
                # separated by `.`, `!` or `?`, and paragraphs are blocks separated by a
                # blank line. Empty or whitespace-only fragments are not counted, so
                # leading whitespace or trailing punctuation do not inflate the totals.
                #
                # @param text [String, nil] Document text (nil is treated as empty)
                # @return [Hash] Document statistics with keys :character_count,
                #   :word_count, :sentence_count, :paragraph_count (Integers) and
                #   :average_sentence_length (Float, words per sentence; 0.0 when the
                #   text has no sentences)
                def self.get_statistics(text)
                  text = text.to_s

                  word_count = text.split(/\s+/).reject(&:empty?).length
                  sentence_count = text.split(/[.!?]+/).count { |part| !part.strip.empty? }
                  paragraph_count = text.split(/\n\s*\n/).count { |part| !part.strip.empty? }

                  {
                    character_count: text.length,
                    word_count: word_count,
                    sentence_count: sentence_count,
                    paragraph_count: paragraph_count,
                    # Guard the division: 0.0 / 0 would yield NaN for empty documents.
                    average_sentence_length: sentence_count.zero? ? 0.0 : word_count.to_f / sentence_count
                  }
                end
         
     | 
| 
       78 
196 
     | 
    
         
             
              end
         
     | 
| 
       79 
197 
     | 
    
         
             
            end
         
     | 
    
        data/lib/legal_summariser.rb
    CHANGED
    
    | 
         @@ -1,6 +1,9 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            # frozen_string_literal: true
         
     | 
| 
       2 
2 
     | 
    
         | 
| 
       3 
3 
     | 
    
         
             
            require_relative "legal_summariser/version"
         
     | 
| 
      
 4 
     | 
    
         
            +
            require_relative "legal_summariser/configuration"
         
     | 
| 
      
 5 
     | 
    
         
            +
            require_relative "legal_summariser/cache"
         
     | 
| 
      
 6 
     | 
    
         
            +
            require_relative "legal_summariser/performance_monitor"
         
     | 
| 
       4 
7 
     | 
    
         
             
            require_relative "legal_summariser/document_parser"
         
     | 
| 
       5 
8 
     | 
    
         
             
            require_relative "legal_summariser/text_extractor"
         
     | 
| 
       6 
9 
     | 
    
         
             
            require_relative "legal_summariser/summariser"
         
     | 
| 
         @@ -18,34 +21,89 @@ module LegalSummariser 
     | 
|
| 
       18 
21 
     | 
    
         
             
              # @param options [Hash] Configuration options; also used as part of the
              #   cache key. When options[:format] is set, the result is passed through
              #   Formatter.format with that value before being returned.
              # @return [Hash, Object] Summary results: the raw result hash, or whatever
              #   Formatter.format returns when options[:format] is set
              # @raise [DocumentNotFoundError] if file_path does not exist
              # @raise [Error] if the file is larger than configuration.max_file_size
              def self.summarise(file_path, options = {})
                monitor = performance_monitor
                cache = Cache.new

                # Cached and freshly computed results go through the same presentation
                # step, so the return type does not depend on whether the cache was hit.
                # NOTE(review): assumes cache.get returns the value in the same shape it
                # was stored by cache.set - confirm against Cache.
                present = lambda do |result|
                  options[:format] ? Formatter.format(result, options[:format]) : result
                end

                monitor.start_timer(:total_analysis)

                begin
                  # Validate file
                  raise DocumentNotFoundError, "File not found: #{file_path}" unless File.exist?(file_path)

                  file_size = File.size(file_path)
                  raise Error, "File too large: #{file_size} bytes (max: #{configuration.max_file_size})" if file_size > configuration.max_file_size

                  # Check cache first
                  cache_key = cache.cache_key(file_path, options)
                  cached_result = cache.get(cache_key)

                  if cached_result
                    configuration.logger&.info("Using cached result for #{file_path}")
                    monitor.end_timer(:total_analysis)
                    return present.call(cached_result)
                  end

                  # Extract text from document
                  monitor.start_timer(:text_extraction)
                  text = TextExtractor.extract(file_path)
                  extraction_time = monitor.end_timer(:text_extraction)

                  # Record text statistics
                  text_stats = TextExtractor.get_statistics(text)
                  monitor.record(:document_word_count, text_stats[:word_count])
                  monitor.record(:document_character_count, text_stats[:character_count])

                  # Perform analysis components
                  monitor.start_timer(:summarisation)
                  summary = Summariser.new(text, options).generate
                  monitor.end_timer(:summarisation)

                  monitor.start_timer(:clause_detection)
                  clauses = ClauseDetector.new(text).detect
                  monitor.end_timer(:clause_detection)

                  monitor.start_timer(:risk_analysis)
                  risks = RiskAnalyzer.new(text).analyze
                  monitor.end_timer(:risk_analysis)

                  # Format results
                  result = {
                    plain_text: summary[:plain_text],
                    key_points: summary[:key_points],
                    clauses: clauses,
                    risks: risks,
                    metadata: {
                      document_type: detect_document_type(text),
                      word_count: text_stats[:word_count],
                      character_count: text_stats[:character_count],
                      sentence_count: text_stats[:sentence_count],
                      paragraph_count: text_stats[:paragraph_count],
                      file_size_bytes: file_size,
                      extraction_time_seconds: extraction_time.round(3),
                      processed_at: Time.now.strftime("%Y-%m-%dT%H:%M:%S%z"),
                      gem_version: VERSION,
                      language: configuration.language
                    },
                    performance: monitor.stats
                  }

                  # Cache the raw (unformatted) result
                  cache.set(cache_key, result)

                  total_time = monitor.end_timer(:total_analysis)
                  configuration.logger&.info("Analysis completed in #{total_time.round(3)}s")

                  # Apply formatting if requested
                  present.call(result)

                rescue => e
                  # Stop the overall timer and log before propagating the original error.
                  monitor.end_timer(:total_analysis)
                  configuration.logger&.error("Analysis failed: #{e.message}")
                  raise
                end
              end
         
     | 
| 
       51 
109 
     | 
    
         | 
| 
         @@ -53,19 +111,114 @@ module LegalSummariser 
     | 
|
| 
       53 
111 
     | 
    
         
             
              # @param text [String] Document text
         
     | 
| 
       54 
112 
     | 
    
         
             
              # @return [String] Document type
         
     | 
| 
       55 
113 
     | 
    
         
             
              def self.detect_document_type(text)
                # Classifies a document by keyword scoring. Each document type owns a
                # list of [pattern, weight] indicators; every pattern that occurs in
                # the lower-cased text adds its weight once. The highest-scoring type
                # is returned as a String (e.g. "nda", "lease_agreement").
                #
                # nil (or any non-String) input is coerced with #to_s so that missing
                # text falls back to "general_contract" instead of raising.
                text_lower = text.to_s.downcase

                # Insertion order matters: on a tie, max_by keeps the first maximum,
                # so earlier types win over later ones.
                indicators = {
                  nda: [
                    [/non.?disclosure/, 3],
                    [/\bnda\b/, 2],
                    [/confidential/, 2],
                    [/proprietary/, 1]
                  ],
                  service_agreement: [
                    [/service agreement/, 3],
                    [/terms of service/, 2],
                    [/\btos\b/, 2],
                    # NOTE(review): alternation binds loosely, so this matches a bare
                    # "deliver" OR "provide ... service". Kept as-is; confirm intent.
                    [/deliver|provide.*service/, 1]
                  ],
                  employment_contract: [
                    [/employment/, 3],
                    [/employee|employer/, 2],
                    # Leading \b avoids hits inside "disposition", "composition", etc.
                    [/\b(?:job|position)/, 2],
                    [/salary|wage/, 1]
                  ],
                  privacy_policy: [
                    [/privacy policy/, 3],
                    [/data protection/, 2],
                    [/gdpr|kvkk/, 2],
                    [/personal data/, 1]
                  ],
                  license_agreement: [
                    [/license agreement/, 3],
                    [/licensing/, 2],
                    [/intellectual property/, 1]
                  ],
                  terms_of_use: [
                    [/terms of use/, 3],
                    [/user agreement/, 2],
                    [/website|platform/, 1]
                  ],
                  purchase_agreement: [
                    [/purchase agreement/, 3],
                    # Leading \b avoids hits inside words such as "counsellor".
                    [/\b(?:buy|sell)|purchase/, 2],
                    [/price|payment/, 1]
                  ],
                  lease_agreement: [
                    [/lease agreement/, 3],
                    # Leading \b on "rent" avoids hits inside "current", "different",
                    # "parent"; "rental"/"rented" still match.
                    [/\brent|tenant|landlord/, 2],
                    [/property|premises/, 1]
                  ],
                  partnership_agreement: [
                    [/partnership agreement/, 3],
                    [/partner|partnership/, 2],
                    [/joint venture/, 1]
                  ]
                }

                scores = indicators.transform_values do |rules|
                  rules.sum { |pattern, weight| text_lower.match?(pattern) ? weight : 0 }
                end

                # Base score: the generic type wins whenever nothing else matched.
                # Added last so that any specific type with an equal score is preferred.
                scores[:general_contract] = 1

                scores.max_by { |_, score| score }.first.to_s
              end
         
     | 
| 
      
 182 
     | 
    
         
            +
              
         
     | 
| 
      
 183 
     | 
    
         
            +
              # Get analysis statistics
         
     | 
| 
      
 184 
     | 
    
         
            +
              # @return [Hash] Analysis statistics
         
     | 
| 
      
 185 
     | 
    
         
            +
              def self.stats
                # Collects a diagnostic snapshot: performance monitor statistics,
                # statistics reported by a freshly constructed Cache, the monitor's
                # memory usage, and a subset of the active configuration.
                performance = performance_monitor.stats
                cache_stats = Cache.new.stats
                memory = performance_monitor.memory_usage

                settings = {
                  language: configuration.language,
                  max_file_size: configuration.max_file_size,
                  caching_enabled: configuration.enable_caching
                }

                {
                  performance: performance,
                  cache: cache_stats,
                  memory: memory,
                  configuration: settings
                }
              end
         
     | 
| 
      
 197 
     | 
    
         
            +
              
         
     | 
| 
      
 198 
     | 
    
         
            +
              # Reset all statistics and cache
         
     | 
| 
      
 199 
     | 
    
         
            +
              def self.reset!
                # Resets the performance monitor first, then clears a freshly
                # constructed Cache. The return value is whatever Cache#clear! returns.
                monitor = performance_monitor
                monitor.reset!

                cache = Cache.new
                cache.clear!
              end
         
     | 
| 
      
 203 
     | 
    
         
            +
             
     | 
| 
      
 204 
     | 
    
         
            +
              # Batch process multiple documents
         
     | 
| 
      
 205 
     | 
    
         
            +
              # @param file_paths [Array<String>] Array of file paths
         
     | 
| 
      
 206 
     | 
    
         
            +
              # @param options [Hash] Processing options
         
     | 
| 
      
 207 
     | 
    
         
            +
              # @return [Array<Hash>] Array of analysis results
         
     | 
| 
      
 208 
     | 
    
         
            +
              def self.batch_summarise(file_paths, options = {})
                # Runs summarise on each path in order and reports one entry per file.
                # A failure on one file is logged and recorded; it never aborts the batch.
                #
                # file_paths - Array of path Strings. nil is treated as an empty batch
                #              and a single String as a one-element batch.
                # options    - Hash forwarded unchanged to summarise.
                #
                # Returns an Array of Hashes, one per input path, in input order:
                #   { file_path:, success: true,  result: <summarise return value> }
                #   { file_path:, success: false, error: <exception message> }
                paths = Array(file_paths)
                total = paths.length

                paths.each_with_index.map do |file_path, index|
                  begin
                    configuration.logger&.info("Processing file #{index + 1}/#{total}: #{file_path}")
                    { file_path: file_path, success: true, result: summarise(file_path, options) }
                  rescue => e
                    configuration.logger&.error("Failed to process #{file_path}: #{e.message}")
                    { file_path: file_path, success: false, error: e.message }
                  end
                end
              end
         
     | 
| 
       71 
224 
     | 
    
         
             
            end
         
     |