legal_summariser 0.1.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
 - data/CHANGELOG.md +48 -0
 - data/CONTRIBUTING.md +231 -0
 - data/examples/advanced_configuration.rb +195 -0
 - data/examples/basic_usage.rb +101 -0
 - data/examples/batch_processing.rb +123 -0
 - data/exe/legal_summariser +131 -1
 - data/lib/legal_summariser/cache.rb +81 -0
 - data/lib/legal_summariser/configuration.rb +43 -0
 - data/lib/legal_summariser/performance_monitor.rb +108 -0
 - data/lib/legal_summariser/text_extractor.rb +125 -7
 - data/lib/legal_summariser/version.rb +1 -1
 - data/lib/legal_summariser.rb +205 -44
 - metadata +8 -1
 
| 
         @@ -2,22 +2,38 @@ 
     | 
|
| 
       2 
2 
     | 
    
         | 
| 
       3 
3 
     | 
    
         
             
            require 'pdf-reader'
         
     | 
| 
       4 
4 
     | 
    
         
             
            require 'docx'
         
     | 
| 
      
 5 
     | 
    
         
            +
            require 'logger'
         
     | 
| 
       5 
6 
     | 
    
         | 
| 
       6 
7 
     | 
    
         
             
            module LegalSummariser
         
     | 
| 
       7 
8 
     | 
    
         
             
              class TextExtractor
         
     | 
| 
      
 9 
     | 
    
         
            +
                # Logger for debugging and monitoring.
                #
                # The reader lazily builds a WARN-level logger writing to the current
                # `$stdout` (rather than the STDOUT constant, so redirected output is
                # honoured). Assigning a logger replaces it for all later calls.
                class << self
                  # @param value [Logger] logger used by the extraction methods
                  attr_writer :logger

                  # @return [Logger] the assigned logger, or a default WARN-level one
                  def logger
                    @logger ||= Logger.new($stdout, level: Logger::WARN)
                  end
                end
         
     | 
| 
       8 
17 
     | 
    
         
             
                # Extract text from various document formats.
                #
                # The handler is chosen from the file extension, compared
                # case-insensitively.
                #
                # @param file_path [String] Path to the document
                # @return [String] Extracted text
                # @raise [DocumentNotFoundError] if the file does not exist or is empty
                # @raise [UnsupportedFormatError] if no handler exists for the extension
                def self.extract(file_path)
                  raise DocumentNotFoundError, "File not found: #{file_path}" unless File.exist?(file_path)
                  raise DocumentNotFoundError, "File is empty: #{file_path}" if File.zero?(file_path)

                  logger.info "Extracting text from: #{file_path}"

                  extension = File.extname(file_path)
                  case extension.downcase
                  when '.pdf'
                    extract_from_pdf(file_path)
                  when '.docx'
                    extract_from_docx(file_path)
                  when '.txt', '.text'
                    extract_from_text(file_path)
                  when '.rtf'
                    extract_from_rtf(file_path)
                  else
                    # Keep this list in step with the branches above (.text was missing).
                    raise UnsupportedFormatError,
                          "Unsupported file format: #{extension}. " \
                          "Supported formats: .pdf, .docx, .txt, .text, .rtf"
                  end
                end
         
     | 
| 
       23 
39 
     | 
    
         | 
| 
         @@ -27,15 +43,30 @@ module LegalSummariser 
     | 
|
| 
       27 
43 
     | 
    
         
             
                # @param file_path [String] Path to PDF file
                # @return [String] Extracted text
                # @raise [Error] if the PDF is malformed, uses unsupported features,
                #   yields no text, or cannot be read for any other reason
                def self.extract_from_pdf(file_path)
                  logger.debug "Processing PDF: #{file_path}"

                  reader = PDF::Reader.new(file_path)
                  page_count = 0
                  page_texts = []

                  reader.pages.each do |page|
                    page_count += 1
                    page_text = page.text
                    # Pages with no text (nil or whitespace only) contribute nothing.
                    page_texts << page_text if page_text && !page_text.strip.empty?
                  end

                  logger.info "Extracted text from #{page_count} PDF pages"

                  # One newline after every page, assembled in a single pass.
                  text = page_texts.map { |page_text| "#{page_text}\n" }.join

                  if text.strip.empty?
                    logger.warn "No text extracted from PDF - file may be image-based or encrypted"
                    raise Error, "No extractable text found in PDF. File may be image-based or password-protected."
                  end

                  clean_text(text)
                rescue Error
                  # Already one of our own errors (e.g. the empty-text case above):
                  # re-raise untouched instead of wrapping it in the catch-all below.
                  raise
                rescue PDF::Reader::MalformedPDFError => e
                  raise Error, "Malformed PDF file: #{e.message}"
                rescue PDF::Reader::UnsupportedFeatureError => e
                  raise Error, "PDF contains unsupported features: #{e.message}"
                rescue => e
                  raise Error, "Failed to extract text from PDF: #{e.message}"
                end
         
     | 
| 
         @@ -44,28 +75,98 @@ module LegalSummariser 
     | 
|
| 
       44 
75 
     | 
    
         
             
                # @param file_path [String] Path to DOCX file
                # @return [String] Extracted text
                # @raise [Error] if the archive is invalid, contains no text, or cannot
                #   be read for any other reason
                def self.extract_from_docx(file_path)
                  logger.debug "Processing DOCX: #{file_path}"

                  doc = Docx::Document.open(file_path)
                  lines = []
                  paragraph_count = 0

                  doc.paragraphs.each do |paragraph|
                    paragraph_text = paragraph.text
                    next if paragraph_text.nil? || paragraph_text.strip.empty?

                    lines << "#{paragraph_text}\n"
                    paragraph_count += 1
                  end

                  # Also extract text from tables if present: one output line per row,
                  # with every non-blank cell followed by a single space.
                  doc.tables.each do |table|
                    table.rows.each do |row|
                      cells = row.cells.map(&:text).reject { |cell_text| cell_text.nil? || cell_text.strip.empty? }
                      lines << "#{cells.map { |cell_text| "#{cell_text} " }.join}\n"
                    end
                  end

                  logger.info "Extracted text from #{paragraph_count} DOCX paragraphs"

                  text = lines.join
                  raise Error, "No text content found in DOCX file" if text.strip.empty?

                  clean_text(text)
                rescue Error
                  # Already one of our own errors (e.g. the empty-document case above):
                  # re-raise untouched instead of wrapping it in the catch-all below.
                  raise
                rescue Zip::Error => e
                  raise Error, "Invalid DOCX file format: #{e.message}"
                rescue => e
                  raise Error, "Failed to extract text from DOCX: #{e.message}"
                end
         
     | 
| 
       58 
115 
     | 
    
         | 
| 
      
 116 
     | 
    
         
            +
                # Extract text from plain text files.
                #
                # The raw bytes are read once and interpreted with each candidate
                # encoding in turn. A candidate is accepted only if the bytes are valid
                # in it and convert cleanly to UTF-8, so the result is always UTF-8.
                # (Reading with `encoding:` merely labels the string and never raises,
                # so validity has to be checked explicitly.)
                #
                # @param file_path [String] Path to text file
                # @return [String] Extracted text, encoded as UTF-8
                # @raise [Error] if the bytes fit none of the candidate encodings
                def self.extract_from_text(file_path)
                  logger.debug "Processing text file: #{file_path}"

                  raw = File.binread(file_path)

                  # Windows-1252 is tried before ISO-8859-1: ISO-8859-1 accepts every
                  # byte, so anything listed after it could never be reached.
                  encodings = ['UTF-8', 'Windows-1252', 'ISO-8859-1']

                  encodings.each do |encoding|
                    begin
                      candidate = raw.dup.force_encoding(encoding)
                      raise Encoding::InvalidByteSequenceError, "invalid #{encoding}" unless candidate.valid_encoding?

                      text = candidate.encode('UTF-8')
                      logger.info "Successfully read text file with #{encoding} encoding"
                      return clean_text(text)
                    rescue Encoding::InvalidByteSequenceError, Encoding::UndefinedConversionError
                      logger.debug "Failed to read with #{encoding} encoding, trying next"
                      next
                    end
                  end

                  raise Error, "Unable to read text file with supported encodings"
                end
         
     | 
| 
      
 138 
     | 
    
         
            +
             
     | 
| 
      
 139 
     | 
    
         
            +
                # Extract text from RTF files (basic support).
                #
                # The file is read as UTF-8 when its bytes are valid UTF-8, otherwise as
                # Windows-1252, and the RTF markup is then stripped group-aware so that
                # body text inside nested groups is kept.
                #
                # @param file_path [String] Path to RTF file
                # @return [String] Extracted text
                # @raise [Error] if the file cannot be read or processed
                def self.extract_from_rtf(file_path)
                  logger.debug "Processing RTF: #{file_path}"

                  raw = File.binread(file_path)
                  content = raw.dup.force_encoding('UTF-8')
                  unless content.valid_encoding?
                    content = raw.force_encoding('Windows-1252')
                                 .encode('UTF-8', invalid: :replace, undef: :replace, replace: '')
                  end

                  clean_text(strip_rtf_markup(content))
                rescue => e
                  raise Error, "Failed to extract text from RTF: #{e.message}"
                end

                # Reduce RTF markup to plain text.
                #
                # Tokenises the input into braces, control words, `\'hh` escapes, control
                # symbols and plain text. Groups opened by a non-text destination (font
                # table, colour table, stylesheet, info, picture, or any `\*` ignorable
                # destination) are dropped entirely; other control words are removed
                # except those that stand for a character or a line break.
                #
                # @api private
                # @param content [String] RTF source, valid UTF-8
                # @return [String] Text with markup removed (not yet whitespace-cleaned)
                def self.strip_rtf_markup(content)
                  skipped_destinations = %w[fonttbl colortbl stylesheet info pict]
                  replacements = {
                    'par' => "\n", 'line' => "\n", 'sect' => "\n", 'page' => "\n", 'row' => "\n",
                    'tab' => "\t", 'cell' => ' ',
                    'emdash' => "\u2014", 'endash' => "\u2013", 'bullet' => "\u2022",
                    'lquote' => "\u2018", 'rquote' => "\u2019",
                    'ldblquote' => "\u201C", 'rdblquote' => "\u201D"
                  }
                  # Groups: 1 control word, 2 numeric parameter, 3 hex escape,
                  # 4 control symbol, 5 brace, 6 plain text.
                  token = /\\([A-Za-z]+)(-?\d+)?[ ]?|\\'(\h\h)|\\([^A-Za-z])|([{}])|([^\\{}]+)/

                  pieces = []
                  depth = 0
                  skip_depth = nil # depth of the group currently being discarded
                  fallback = 0     # substitute characters still to drop after a \uN

                  content.scan(token) do |word, param, hex, symbol, brace, plain|
                    if brace == '{'
                      depth += 1
                    elsif brace == '}'
                      skip_depth = nil if skip_depth == depth
                      depth -= 1
                    elsif skip_depth
                      next
                    elsif word
                      fallback = 0
                      if skipped_destinations.include?(word)
                        skip_depth = depth
                      elsif word == 'u' && param
                        # \uN carries a signed 16-bit code point followed by one
                        # substitute character for readers without Unicode support.
                        codepoint = param.to_i
                        codepoint += 65_536 if codepoint.negative?
                        pieces << [codepoint].pack('U') unless (0xD800..0xDFFF).cover?(codepoint)
                        fallback = 1
                      elsif replacements.key?(word)
                        pieces << replacements[word]
                      end
                    elsif hex
                      if fallback.positive?
                        fallback -= 1
                      else
                        pieces << [hex.to_i(16)].pack('C').force_encoding('Windows-1252')
                                                .encode('UTF-8', invalid: :replace, undef: :replace, replace: '')
                      end
                    elsif symbol
                      fallback = 0
                      case symbol
                      when '*' then skip_depth = depth      # ignorable destination
                      when '\\', '{', '}' then pieces << symbol # escaped literal
                      when '~' then pieces << ' '           # non-breaking space
                      when '_' then pieces << '-'           # non-breaking hyphen
                      when "\n", "\r" then pieces << "\n"   # backslash-newline = \par
                      end
                    elsif plain
                      # Raw line breaks in RTF source are formatting only, not text.
                      visible = plain.delete("\r\n")
                      if fallback.positive? && !visible.empty?
                        visible = visible[fallback..-1].to_s
                        fallback = 0
                      end
                      pieces << visible
                    end
                  end

                  pieces.join
                end
         
     | 
| 
      
 156 
     | 
    
         
            +
             
     | 
| 
       59 
157 
     | 
    
         
             
                # Clean extracted text
         
     | 
| 
       60 
158 
     | 
    
         
             
                # @param text [String] Raw extracted text
         
     | 
| 
       61 
159 
     | 
    
         
             
                # @return [String] Cleaned text
         
     | 
| 
       62 
160 
     | 
    
         
             
                def self.clean_text(text)
         
     | 
| 
      
 161 
     | 
    
         
            +
                  return "" if text.nil? || text.empty?
         
     | 
| 
      
 162 
     | 
    
         
            +
                  
         
     | 
| 
       63 
163 
     | 
    
         
             
                  # Normalize line breaks first
         
     | 
| 
       64 
164 
     | 
    
         
             
                  text = text.gsub(/\r\n?/, "\n")
         
     | 
| 
       65 
165 
     | 
    
         | 
| 
       66 
     | 
    
         
            -
                  # Remove common  
     | 
| 
      
 166 
     | 
    
         
            +
                  # Remove common document artifacts
         
     | 
| 
       67 
167 
     | 
    
         
             
                  text = text.gsub(/\f/, '') # Form feed characters
         
     | 
| 
       68 
168 
     | 
    
         
             
                  text = text.gsub(/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]/, '') # Control characters
         
     | 
| 
      
 169 
     | 
    
         
            +
                  text = text.gsub(/\u00A0/, ' ') # Non-breaking spaces
         
     | 
| 
       69 
170 
     | 
    
         | 
| 
       70 
171 
     | 
    
         
             
                  # Remove excessive whitespace but preserve line breaks
         
     | 
| 
       71 
172 
     | 
    
         
             
                  text = text.gsub(/[ \t]+/, ' ')
         
     | 
| 
         @@ -73,7 +174,24 @@ module LegalSummariser 
     | 
|
| 
       73 
174 
     | 
    
         
             
                  # Remove excessive newlines
         
     | 
| 
       74 
175 
     | 
    
         
             
                  text = text.gsub(/\n{3,}/, "\n\n")
         
     | 
| 
       75 
176 
     | 
    
         | 
| 
      
 177 
     | 
    
         
            +
                  # Remove leading/trailing whitespace from each line
         
     | 
| 
      
 178 
     | 
    
         
            +
                  text = text.split("\n").map(&:strip).join("\n")
         
     | 
| 
      
 179 
     | 
    
         
            +
                  
         
     | 
| 
      
 180 
     | 
    
         
            +
                  # Remove empty lines at start and end
         
     | 
| 
       76 
181 
     | 
    
         
             
                  text.strip
         
     | 
| 
       77 
182 
     | 
    
         
             
                end
         
     | 
| 
      
 183 
     | 
    
         
            +
             
     | 
| 
      
 184 
     | 
    
         
            +
                # Get document statistics.
                #
                # Whitespace-only fragments are never counted as a word, sentence or
                # paragraph, and a text without sentences reports an average sentence
                # length of 0.0 instead of NaN.
                #
                # @param text [String, nil] Document text (nil is treated as empty)
                # @return [Hash] :character_count, :word_count, :sentence_count,
                #   :paragraph_count (Integers) and :average_sentence_length (Float,
                #   words per sentence)
                def self.get_statistics(text)
                  text = text.to_s

                  # Each split is computed once and reused for the average below.
                  words = text.split(/\s+/).reject(&:empty?)
                  sentences = text.split(/[.!?]+/).reject { |fragment| fragment.strip.empty? }
                  paragraphs = text.split(/\n\s*\n/).reject { |fragment| fragment.strip.empty? }

                  {
                    character_count: text.length,
                    word_count: words.length,
                    sentence_count: sentences.length,
                    paragraph_count: paragraphs.length,
                    average_sentence_length: sentences.empty? ? 0.0 : words.length.to_f / sentences.length
                  }
                end
         
     | 
| 
       78 
196 
     | 
    
         
             
              end
         
     | 
| 
       79 
197 
     | 
    
         
             
            end
         
     | 
    
        data/lib/legal_summariser.rb
    CHANGED
    
    | 
         @@ -1,12 +1,19 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            # frozen_string_literal: true
         
     | 
| 
       2 
2 
     | 
    
         | 
| 
       3 
     | 
    
         
            -
            require_relative  
     | 
| 
       4 
     | 
    
         
            -
            require_relative  
     | 
| 
       5 
     | 
    
         
            -
            require_relative  
     | 
| 
       6 
     | 
    
         
            -
            require_relative  
     | 
| 
       7 
     | 
    
         
            -
            require_relative  
     | 
| 
       8 
     | 
    
         
            -
            require_relative  
     | 
| 
       9 
     | 
    
         
            -
            require_relative  
     | 
| 
      
 3 
     | 
    
         
            +
            require_relative 'legal_summariser/version'
         
     | 
| 
      
 4 
     | 
    
         
            +
            require_relative 'legal_summariser/text_extractor'
         
     | 
| 
      
 5 
     | 
    
         
            +
            require_relative 'legal_summariser/summariser'
         
     | 
| 
      
 6 
     | 
    
         
            +
            require_relative 'legal_summariser/clause_detector'
         
     | 
| 
      
 7 
     | 
    
         
            +
            require_relative 'legal_summariser/risk_analyzer'
         
     | 
| 
      
 8 
     | 
    
         
            +
            require_relative 'legal_summariser/formatter'
         
     | 
| 
      
 9 
     | 
    
         
            +
            require_relative 'legal_summariser/document_parser'
         
     | 
| 
      
 10 
     | 
    
         
            +
            require_relative 'legal_summariser/configuration'
         
     | 
| 
      
 11 
     | 
    
         
            +
            require_relative 'legal_summariser/cache'
         
     | 
| 
      
 12 
     | 
    
         
            +
            require_relative 'legal_summariser/performance_monitor'
         
     | 
| 
      
 13 
     | 
    
         
            +
            require_relative 'legal_summariser/plain_language_generator'
         
     | 
| 
      
 14 
     | 
    
         
            +
            require_relative 'legal_summariser/model_trainer'
         
     | 
| 
      
 15 
     | 
    
         
            +
            require_relative 'legal_summariser/multilingual_processor'
         
     | 
| 
      
 16 
     | 
    
         
            +
            require_relative 'legal_summariser/pdf_annotator'
         
     | 
| 
       10 
17 
     | 
    
         | 
| 
       11 
18 
     | 
    
         
             
            module LegalSummariser
         
     | 
| 
       12 
19 
     | 
    
         
             
              class Error < StandardError; end
         
     | 
| 
         @@ -18,34 +25,93 @@ module LegalSummariser 
     | 
|
| 
       18 
25 
     | 
    
         
             
              # @param options [Hash] Configuration options
         
     | 
| 
       19 
26 
     | 
    
         
             
              # @return [Hash] Summary results
         
     | 
| 
       20 
27 
     | 
    
         
             
              def self.summarise(file_path, options = {})
         
     | 
| 
       21 
     | 
    
         
            -
                 
     | 
| 
       22 
     | 
    
         
            -
             
     | 
| 
       23 
     | 
    
         
            -
                # Extract text from document
         
     | 
| 
       24 
     | 
    
         
            -
                text = TextExtractor.extract(file_path)
         
     | 
| 
      
 28 
     | 
    
         
            +
                monitor = performance_monitor
         
     | 
| 
      
 29 
     | 
    
         
            +
                cache = Cache.new
         
     | 
| 
       25 
30 
     | 
    
         | 
| 
       26 
     | 
    
         
            -
                 
     | 
| 
       27 
     | 
    
         
            -
                summary = Summariser.new(text, options).generate
         
     | 
| 
       28 
     | 
    
         
            -
                clauses = ClauseDetector.new(text).detect
         
     | 
| 
       29 
     | 
    
         
            -
                risks = RiskAnalyzer.new(text).analyze
         
     | 
| 
      
 31 
     | 
    
         
            +
                monitor.start_timer(:total_analysis)
         
     | 
| 
       30 
32 
     | 
    
         | 
| 
       31 
     | 
    
         
            -
                 
     | 
| 
       32 
     | 
    
         
            -
             
     | 
| 
       33 
     | 
    
         
            -
                   
     | 
| 
       34 
     | 
    
         
            -
                   
     | 
| 
       35 
     | 
    
         
            -
                   
     | 
| 
       36 
     | 
    
         
            -
                   
     | 
| 
       37 
     | 
    
         
            -
                   
     | 
| 
      
 33 
     | 
    
         
            +
                begin
         
     | 
| 
      
 34 
     | 
    
         
            +
                  # Validate file
         
     | 
| 
      
 35 
     | 
    
         
            +
                  raise DocumentNotFoundError, "File not found: #{file_path}" unless File.exist?(file_path)
         
     | 
| 
      
 36 
     | 
    
         
            +
                  
         
     | 
| 
      
 37 
     | 
    
         
            +
                  file_size = File.size(file_path)
         
     | 
| 
      
 38 
     | 
    
         
            +
                  raise Error, "File too large: #{file_size} bytes (max: #{configuration.max_file_size})" if file_size > configuration.max_file_size
         
     | 
| 
      
 39 
     | 
    
         
            +
                  
         
     | 
| 
      
 40 
     | 
    
         
            +
                  # Check cache first
         
     | 
| 
      
 41 
     | 
    
         
            +
                  cache_key = cache.cache_key(file_path, options)
         
     | 
| 
      
 42 
     | 
    
         
            +
                  cached_result = cache.get(cache_key)
         
     | 
| 
      
 43 
     | 
    
         
            +
                  
         
     | 
| 
      
 44 
     | 
    
         
            +
                  if cached_result
         
     | 
| 
      
 45 
     | 
    
         
            +
                    configuration.logger&.info("Using cached result for #{file_path}")
         
     | 
| 
      
 46 
     | 
    
         
            +
                    monitor.end_timer(:total_analysis)
         
     | 
| 
      
 47 
     | 
    
         
            +
                    return cached_result
         
     | 
| 
      
 48 
     | 
    
         
            +
                  end
         
     | 
| 
      
 49 
     | 
    
         
            +
                  
         
     | 
| 
      
 50 
     | 
    
         
            +
                  # Extract text from document
         
     | 
| 
      
 51 
     | 
    
         
            +
                  monitor.start_timer(:text_extraction)
         
     | 
| 
      
 52 
     | 
    
         
            +
                  text = TextExtractor.extract(file_path)
         
     | 
| 
      
 53 
     | 
    
         
            +
                  extraction_time = monitor.end_timer(:text_extraction)
         
     | 
| 
      
 54 
     | 
    
         
            +
                  
         
     | 
| 
      
 55 
     | 
    
         
            +
                  # Record text statistics
         
     | 
| 
      
 56 
     | 
    
         
            +
                  text_stats = TextExtractor.get_statistics(text)
         
     | 
| 
      
 57 
     | 
    
         
            +
                  monitor.record(:document_word_count, text_stats[:word_count])
         
     | 
| 
      
 58 
     | 
    
         
            +
                  monitor.record(:document_character_count, text_stats[:character_count])
         
     | 
| 
      
 59 
     | 
    
         
            +
                  
         
     | 
| 
      
 60 
     | 
    
         
            +
                  # Perform analysis components
         
     | 
| 
      
 61 
     | 
    
         
            +
                  monitor.start_timer(:summarisation)
         
     | 
| 
      
 62 
     | 
    
         
            +
                  summary = Summariser.new(text, options).generate
         
     | 
| 
      
 63 
     | 
    
         
            +
                  monitor.end_timer(:summarisation)
         
     | 
| 
      
 64 
     | 
    
         
            +
                  
         
     | 
| 
      
 65 
     | 
    
         
            +
                  monitor.start_timer(:clause_detection)
         
     | 
| 
      
 66 
     | 
    
         
            +
                  clauses = ClauseDetector.new(text).detect
         
     | 
| 
      
 67 
     | 
    
         
            +
                  monitor.end_timer(:clause_detection)
         
     | 
| 
      
 68 
     | 
    
         
            +
                  
         
     | 
| 
      
 69 
     | 
    
         
            +
                  monitor.start_timer(:risk_analysis)
         
     | 
| 
      
 70 
     | 
    
         
            +
                  risks = RiskAnalyzer.new(text).analyze
         
     | 
| 
      
 71 
     | 
    
         
            +
                  monitor.end_timer(:risk_analysis)
         
     | 
| 
      
 72 
     | 
    
         
            +
                  
         
     | 
| 
      
 73 
     | 
    
         
            +
                  # Format results
         
     | 
| 
      
 74 
     | 
    
         
            +
                  result = {
         
     | 
| 
      
 75 
     | 
    
         
            +
                    file_path: file_path,
         
     | 
| 
       38 
76 
     | 
    
         
             
                    document_type: detect_document_type(text),
         
     | 
| 
       39 
     | 
    
         
            -
                     
     | 
| 
       40 
     | 
    
         
            -
                     
     | 
| 
      
 77 
     | 
    
         
            +
                    processing_time: monitor.end_timer(:total_analysis),
         
     | 
| 
      
 78 
     | 
    
         
            +
                    plain_text: summary[:plain_text],
         
     | 
| 
      
 79 
     | 
    
         
            +
                    key_points: summary[:key_points],
         
     | 
| 
      
 80 
     | 
    
         
            +
                    clauses: clauses,
         
     | 
| 
      
 81 
     | 
    
         
            +
                    risks: risks,
         
     | 
| 
      
 82 
     | 
    
         
            +
                    metadata: {
         
     | 
| 
      
 83 
     | 
    
         
            +
                      file_size: File.size(file_path),
         
     | 
| 
      
 84 
     | 
    
         
            +
                      word_count: text_stats[:word_count],
         
     | 
| 
      
 85 
     | 
    
         
            +
                      character_count: text_stats[:character_count],
         
     | 
| 
      
 86 
     | 
    
         
            +
                      sentence_count: text_stats[:sentence_count],
         
     | 
| 
      
 87 
     | 
    
         
            +
                      paragraph_count: text_stats[:paragraph_count],
         
     | 
| 
      
 88 
     | 
    
         
            +
                      file_size_bytes: file_size,
         
     | 
| 
      
 89 
     | 
    
         
            +
                      extraction_time_seconds: extraction_time.round(3),
         
     | 
| 
      
 90 
     | 
    
         
            +
                      processed_at: Time.now.strftime("%Y-%m-%dT%H:%M:%S%z"),
         
     | 
| 
      
 91 
     | 
    
         
            +
                      gem_version: VERSION,
         
     | 
| 
      
 92 
     | 
    
         
            +
                      language: configuration.language,
         
     | 
| 
      
 93 
     | 
    
         
            +
                      document_type: detect_document_type(text)
         
     | 
| 
      
 94 
     | 
    
         
            +
                    },
         
     | 
| 
      
 95 
     | 
    
         
            +
                    performance: monitor.stats
         
     | 
| 
       41 
96 
     | 
    
         
             
                  }
         
     | 
| 
       42 
     | 
    
         
            -
             
     | 
| 
       43 
     | 
    
         
            -
             
     | 
| 
       44 
     | 
    
         
            -
             
     | 
| 
       45 
     | 
    
         
            -
             
     | 
| 
       46 
     | 
    
         
            -
                   
     | 
| 
       47 
     | 
    
         
            -
             
     | 
| 
       48 
     | 
    
         
            -
                   
     | 
| 
      
 97 
     | 
    
         
            +
                  
         
     | 
| 
      
 98 
     | 
    
         
            +
                  # Cache the result
         
     | 
| 
      
 99 
     | 
    
         
            +
                  cache.set(cache_key, result)
         
     | 
| 
      
 100 
     | 
    
         
            +
                  
         
     | 
| 
      
 101 
     | 
    
         
            +
                  total_time = monitor.end_timer(:total_analysis)
         
     | 
| 
      
 102 
     | 
    
         
            +
                  configuration.logger&.info("Analysis completed in #{total_time.round(3)}s")
         
     | 
| 
      
 103 
     | 
    
         
            +
                  
         
     | 
| 
      
 104 
     | 
    
         
            +
                  # Apply formatting if requested
         
     | 
| 
      
 105 
     | 
    
         
            +
                  if options[:format]
         
     | 
| 
      
 106 
     | 
    
         
            +
                    Formatter.format(result, options[:format])
         
     | 
| 
      
 107 
     | 
    
         
            +
                  else
         
     | 
| 
      
 108 
     | 
    
         
            +
                    result
         
     | 
| 
      
 109 
     | 
    
         
            +
                  end
         
     | 
| 
      
 110 
     | 
    
         
            +
                  
         
     | 
| 
      
 111 
     | 
    
         
            +
                rescue => e
         
     | 
| 
      
 112 
     | 
    
         
            +
                  monitor.end_timer(:total_analysis)
         
     | 
| 
      
 113 
     | 
    
         
            +
                  configuration.logger&.error("Analysis failed: #{e.message}")
         
     | 
| 
      
 114 
     | 
    
         
            +
                  raise
         
     | 
| 
       49 
115 
     | 
    
         
             
                end
         
     | 
| 
       50 
116 
     | 
    
         
             
              end
         
     | 
| 
       51 
117 
     | 
    
         | 
| 
         @@ -53,19 +119,114 @@ module LegalSummariser 
     | 
|
| 
       53 
119 
     | 
    
         
             
              # Scores the text against weighted keyword indicators and returns the
              # best-scoring document type. Matching is case-insensitive (the text is
              # downcased first) and each indicator contributes its weight at most once.
              #
              # @param text [String, nil] Document text; nil is treated as empty text
              # @return [String] Document type name, "general_contract" when nothing matches
              def self.detect_document_type(text)
                # to_s keeps a failed/empty extraction (nil) from raising here.
                text_lower = text.to_s.downcase

                scores = DOCUMENT_TYPE_INDICATORS.transform_values do |indicators|
                  indicators.sum { |weight, pattern| text_lower.match?(pattern) ? weight : 0 }
                end
                # Base score: text with no indicators at all resolves to general_contract.
                scores[:general_contract] = 1

                # max_by keeps the first maximum, so on a tie the earlier-listed type wins
                # (general_contract is listed last and only wins when nothing else scores).
                scores.max_by { |_, score| score }.first.to_s
              end

              # Weighted indicators per document type, as [weight, pattern] pairs.
              # Short keywords (job, position, buy, sell, rent, tenant) are anchored at a
              # word start so they no longer fire inside unrelated words such as
              # "current", "parent", "disposition" or "counsellor". Both "license" and
              # "licence" spellings are recognised.
              DOCUMENT_TYPE_INDICATORS = {
                nda: [
                  [3, /non.?disclosure/],
                  [2, /\bnda\b/],
                  [2, /confidential/],
                  [1, /proprietary/]
                ],
                service_agreement: [
                  [3, /service agreement/],
                  [2, /terms of service/],
                  [2, /\btos\b/],
                  [1, /deliver|provide.*service/]
                ],
                employment_contract: [
                  [3, /employment/],
                  [2, /employee|employer/],
                  [2, /\b(?:job|position)/],
                  [1, /salary|wage/]
                ],
                privacy_policy: [
                  [3, /privacy policy/],
                  [2, /data protection/],
                  [2, /gdpr|kvkk/],
                  [1, /personal data/]
                ],
                license_agreement: [
                  [3, /licen[cs]e agreement/],
                  [2, /licen[cs]ing/],
                  [1, /intellectual property/]
                ],
                terms_of_use: [
                  [3, /terms of use/],
                  [2, /user agreement/],
                  [1, /website|platform/]
                ],
                purchase_agreement: [
                  [3, /purchase agreement/],
                  [2, /\b(?:buy|sell)|purchase/],
                  [1, /price|payment/]
                ],
                lease_agreement: [
                  [3, /lease agreement/],
                  [2, /\b(?:rent|tenant)|landlord/],
                  [1, /property|premises/]
                ],
                partnership_agreement: [
                  [3, /partnership agreement/],
                  [2, /partner/],
                  [1, /joint venture/]
                ]
              }.freeze
         
     | 
| 
      
 190 
     | 
    
         
            +
              
         
     | 
| 
      
 191 
     | 
    
         
            +
              # Collect a snapshot of runtime statistics.
              # @return [Hash] Hash with :performance, :cache, :memory and :configuration entries
              def self.stats
                snapshot = {}
                snapshot[:performance] = performance_monitor.stats
                snapshot[:cache] = Cache.new.stats
                snapshot[:memory] = performance_monitor.memory_usage
                snapshot[:configuration] = {
                  language: configuration.language,
                  max_file_size: configuration.max_file_size,
                  caching_enabled: configuration.enable_caching
                }
                snapshot
              end
         
     | 
| 
      
 205 
     | 
    
         
            +
              
         
     | 
| 
      
 206 
     | 
    
         
            +
              # Reset the performance monitor, then clear the cache.
              # Returns whatever Cache#clear! returns.
              def self.reset!
                performance_monitor.reset!
                cache_handle = Cache.new
                cache_handle.clear!
              end
         
     | 
| 
      
 211 
     | 
    
         
            +
             
     | 
| 
      
 212 
     | 
    
         
            +
              # Batch process multiple documents. A failure on one file is logged and
              # recorded in its result entry; it does not stop the remaining files.
              # @param file_paths [Array<String>, String, nil] File paths; a single path or
              #   nil is accepted and coerced with Kernel#Array
              # @param options [Hash] Processing options, passed through to summarise
              # @return [Array<Hash>] One entry per input path, in input order:
              #   { file_path:, success: true, result: } on success, or
              #   { file_path:, success: false, error: } with the exception message on failure
              def self.batch_summarise(file_paths, options = {})
                # Array() turns nil into [] and wraps a lone path, so neither raises.
                paths = Array(file_paths)
                total = paths.length

                paths.each_with_index.map do |file_path, index|
                  begin
                    configuration.logger&.info("Processing file #{index + 1}/#{total}: #{file_path}")
                    { file_path: file_path, success: true, result: summarise(file_path, options) }
                  rescue => e
                    configuration.logger&.error("Failed to process #{file_path}: #{e.message}")
                    { file_path: file_path, success: false, error: e.message }
                  end
                end
              end
         
     | 
| 
       71 
232 
     | 
    
         
             
            end
         
     | 
    
        metadata
    CHANGED
    
    | 
         @@ -1,7 +1,7 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            --- !ruby/object:Gem::Specification
         
     | 
| 
       2 
2 
     | 
    
         
             
            name: legal_summariser
         
     | 
| 
       3 
3 
     | 
    
         
             
            version: !ruby/object:Gem::Version
         
     | 
| 
       4 
     | 
    
         
            -
              version: 0.1.0
     | 
| 
      
 4 
     | 
    
         
            +
              version: 0.3.0
         
     | 
| 
       5 
5 
     | 
    
         
             
            platform: ruby
         
     | 
| 
       6 
6 
     | 
    
         
             
            authors:
         
     | 
| 
       7 
7 
     | 
    
         
             
            - Legal Summariser Team
         
     | 
| 
         @@ -162,14 +162,21 @@ extra_rdoc_files: [] 
     | 
|
| 
       162 
162 
     | 
    
         
             
            files:
         
     | 
| 
       163 
163 
     | 
    
         
             
            - ".rspec"
         
     | 
| 
       164 
164 
     | 
    
         
             
            - CHANGELOG.md
         
     | 
| 
      
 165 
     | 
    
         
            +
            - CONTRIBUTING.md
         
     | 
| 
       165 
166 
     | 
    
         
             
            - Gemfile
         
     | 
| 
       166 
167 
     | 
    
         
             
            - README.md
         
     | 
| 
       167 
168 
     | 
    
         
             
            - Rakefile
         
     | 
| 
      
 169 
     | 
    
         
            +
            - examples/advanced_configuration.rb
         
     | 
| 
      
 170 
     | 
    
         
            +
            - examples/basic_usage.rb
         
     | 
| 
      
 171 
     | 
    
         
            +
            - examples/batch_processing.rb
         
     | 
| 
       168 
172 
     | 
    
         
             
            - exe/legal_summariser
         
     | 
| 
       169 
173 
     | 
    
         
             
            - lib/legal_summariser.rb
         
     | 
| 
      
 174 
     | 
    
         
            +
            - lib/legal_summariser/cache.rb
         
     | 
| 
       170 
175 
     | 
    
         
             
            - lib/legal_summariser/clause_detector.rb
         
     | 
| 
      
 176 
     | 
    
         
            +
            - lib/legal_summariser/configuration.rb
         
     | 
| 
       171 
177 
     | 
    
         
             
            - lib/legal_summariser/document_parser.rb
         
     | 
| 
       172 
178 
     | 
    
         
             
            - lib/legal_summariser/formatter.rb
         
     | 
| 
      
 179 
     | 
    
         
            +
            - lib/legal_summariser/performance_monitor.rb
         
     | 
| 
       173 
180 
     | 
    
         
             
            - lib/legal_summariser/risk_analyzer.rb
         
     | 
| 
       174 
181 
     | 
    
         
             
            - lib/legal_summariser/summariser.rb
         
     | 
| 
       175 
182 
     | 
    
         
             
            - lib/legal_summariser/text_extractor.rb
         
     |