RubyGems - legal_summariser - Versions diffs - 0.1.0 → 0.3.0 - Mend

legal_summariser 0.1.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14)

checksums.yaml +4 -4
data/CHANGELOG.md +48 -0
data/CONTRIBUTING.md +231 -0
data/examples/advanced_configuration.rb +195 -0
data/examples/basic_usage.rb +101 -0
data/examples/batch_processing.rb +123 -0
data/exe/legal_summariser +131 -1
data/lib/legal_summariser/cache.rb +81 -0
data/lib/legal_summariser/configuration.rb +43 -0
data/lib/legal_summariser/performance_monitor.rb +108 -0
data/lib/legal_summariser/text_extractor.rb +125 -7
data/lib/legal_summariser/version.rb +1 -1
data/lib/legal_summariser.rb +205 -44
metadata +8 -1

data/lib/legal_summariser/text_extractor.rb CHANGED Viewed

@@ -2,22 +2,38 @@
 require 'pdf-reader'
 require 'docx'
+require 'logger'
 module LegalSummariser
   class TextExtractor
+    # Logger for debugging and monitoring
+    class << self
+      # Swap in a caller-supplied logger for all extraction methods.
+      attr_writer :logger
+
+      # Logger shared by the extraction methods. Built on first use,
+      # writing to STDOUT at WARN level, unless one was assigned.
+      def logger
+        @logger ||= Logger.new(STDOUT, level: Logger::WARN)
+      end
+    end
     # Extract text from various document formats
     # @param file_path [String] Path to the document
     # @return [String] Extracted text
     def self.extract(file_path)
+      # Fail fast on missing or zero-byte files before any format-specific parser runs.
+      # Both cases raise DocumentNotFoundError.
+      raise DocumentNotFoundError, "File not found: #{file_path}" unless File.exist?(file_path)
+      raise DocumentNotFoundError, "File is empty: #{file_path}" if File.zero?(file_path)
+      logger.info "Extracting text from: #{file_path}"
+      # Dispatch on the lower-cased extension so ".PDF" and ".pdf" are treated alike.
       case File.extname(file_path).downcase
       when '.pdf'
         extract_from_pdf(file_path)
       when '.docx'
         extract_from_docx(file_path)
-      when '.txt'
-        File.read(file_path, encoding: 'UTF-8')
+      when '.txt', '.text'
+        extract_from_text(file_path)
+      when '.rtf'
+        extract_from_rtf(file_path)
       else
-        raise UnsupportedFormatError, "Unsupported file format: #{File.extname(file_path)}"
+        # The list must mirror the `when` branches above, including '.text'.
+        raise UnsupportedFormatError, "Unsupported file format: #{File.extname(file_path)}. Supported formats: .pdf, .docx, .txt, .text, .rtf"
       end
     end
@@ -27,15 +43,30 @@ module LegalSummariser
     # @param file_path [String] Path to PDF file
     # @return [String] Extracted text
     def self.extract_from_pdf(file_path)
+      logger.debug "Processing PDF: #{file_path}"
       reader = PDF::Reader.new(file_path)
       text = ""
+      page_count = 0
       reader.pages.each do |page|
-        text += page.text + "\n"
+        # Every page is counted, but only pages with non-blank text contribute output.
+        page_count += 1
+        page_text = page.text
+        text += page_text + "\n" if page_text && !page_text.strip.empty?
+      end
+      logger.info "Extracted text from #{page_count} PDF pages"
+      if text.strip.empty?
+        logger.warn "No text extracted from PDF - file may be image-based or encrypted"
+        raise Error, "No extractable text found in PDF. File may be image-based or password-protected."
       end
-      # Clean up common PDF artifacts
       clean_text(text)
+    rescue Error
+      # Our own error (the "no extractable text" case above) must reach the caller
+      # unchanged; without this clause the bare rescue below would re-wrap it as
+      # "Failed to extract text from PDF: ...".
+      raise
+    rescue PDF::Reader::MalformedPDFError => e
+      raise Error, "Malformed PDF file: #{e.message}"
+    rescue PDF::Reader::UnsupportedFeatureError => e
+      raise Error, "PDF contains unsupported features: #{e.message}"
     rescue => e
       raise Error, "Failed to extract text from PDF: #{e.message}"
     end
@@ -44,28 +75,98 @@ module LegalSummariser
     # @param file_path [String] Path to DOCX file
     # @return [String] Extracted text
     def self.extract_from_docx(file_path)
+      logger.debug "Processing DOCX: #{file_path}"
       doc = Docx::Document.open(file_path)
       text = ""
+      paragraph_count = 0
       doc.paragraphs.each do |paragraph|
-        text += paragraph.text + "\n"
+        # Blank paragraphs are skipped and not counted.
+        paragraph_text = paragraph.text
+        if paragraph_text && !paragraph_text.strip.empty?
+          text += paragraph_text + "\n"
+          paragraph_count += 1
+        end
+      end
+      # Also extract text from tables if present: cells are joined with a space,
+      # and each row ends with a newline.
+      doc.tables.each do |table|
+        table.rows.each do |row|
+          row.cells.each do |cell|
+            cell_text = cell.text
+            text += cell_text + " " if cell_text && !cell_text.strip.empty?
+          end
+          text += "\n"
+        end
+      end
+      logger.info "Extracted text from #{paragraph_count} DOCX paragraphs"
+      if text.strip.empty?
+        raise Error, "No text content found in DOCX file"
       end
       clean_text(text)
+    rescue Error
+      # Our own error (the "no text content" case above) must reach the caller
+      # unchanged; without this clause the bare rescue below would re-wrap it as
+      # "Failed to extract text from DOCX: ...".
+      raise
+    rescue Zip::Error => e
+      raise Error, "Invalid DOCX file format: #{e.message}"
     rescue => e
       raise Error, "Failed to extract text from DOCX: #{e.message}"
     end
+    # Extract text from plain text files
+    # @param file_path [String] Path to text file
+    # @return [String] Extracted text
+    def self.extract_from_text(file_path)
+      logger.debug "Processing text file: #{file_path}"
+      # Read the raw bytes once. File.read(..., encoding:) only *tags* the string
+      # and never raises on bad bytes, so each candidate encoding has to be
+      # validated explicitly against the same bytes.
+      raw = File.binread(file_path)
+      # Try different encodings, strictest first.
+      # NOTE(review): ISO-8859-1 accepts every byte sequence, so Windows-1252 is
+      # only reached if ISO-8859-1 is removed or reordered - confirm intended order.
+      encodings = ['UTF-8', 'ISO-8859-1', 'Windows-1252']
+      encodings.each do |encoding|
+        begin
+          candidate = raw.dup.force_encoding(encoding)
+          unless candidate.valid_encoding?
+            logger.debug "Failed to read with #{encoding} encoding, trying next"
+            next
+          end
+          # Normalise to UTF-8 so clean_text's patterns always see one encoding.
+          text = candidate.encode('UTF-8')
+          logger.info "Successfully read text file with #{encoding} encoding"
+          return clean_text(text)
+        rescue Encoding::InvalidByteSequenceError, Encoding::UndefinedConversionError
+          logger.debug "Failed to read with #{encoding} encoding, trying next"
+          next
+        end
+      end
+      raise Error, "Unable to read text file with supported encodings"
+    end
+    # Extract text from RTF files (basic support)
+    # @param file_path [String] Path to RTF file
+    # @return [String] Extracted text
+    def self.extract_from_rtf(file_path)
+      logger.debug "Processing RTF: #{file_path}"
+      content = File.read(file_path, encoding: 'UTF-8')
+      # An RTF document is itself one big {...} group, so groups cannot simply be
+      # deleted by regex. Tokenise instead and track brace depth.
+      # One token per match: hex escape, control word (with optional numeric
+      # parameter and delimiter space), control symbol, brace, or run of text.
+      token_pattern = /\\'\h\h|\\[a-z]+(?:-?\d+)? ?|\\[^a-z]|[{}]|[^\\{}]+/
+      # Destination groups that hold no document text.
+      non_text_groups = %w[fonttbl colortbl stylesheet info pict]
+      line_breaks = %w[par line sect page row]
+      pieces = []
+      depth = 0
+      skip_depth = nil      # brace depth of the non-text group being skipped, if any
+      uc_length = 1         # fallback characters following each \uN (set by \ucN)
+      pending_fallback = 0  # fallback characters still to discard
+      content.scan(token_pattern) do |token|
+        if token == '{'
+          depth += 1
+          next
+        elsif token == '}'
+          skip_depth = nil if skip_depth == depth
+          depth -= 1
+          next
+        end
+        next if skip_depth
+        case token
+        when /\A\\'(\h\h)\z/
+          # \'hh is one byte in the document code page (assumed Windows-1252 here).
+          if pending_fallback > 0
+            pending_fallback -= 1
+          else
+            byte = [Regexp.last_match(1)].pack('H2').force_encoding('Windows-1252')
+            pieces << byte.encode('UTF-8', invalid: :replace, undef: :replace, replace: '')
+          end
+        when /\A\\([a-z]+)(-?\d+)? ?\z/
+          word = Regexp.last_match(1)
+          param = Regexp.last_match(2)
+          pending_fallback = 0
+          if non_text_groups.include?(word)
+            skip_depth = depth
+          elsif line_breaks.include?(word)
+            pieces << "\n"
+          elsif %w[tab cell].include?(word)
+            pieces << "\t"
+          elsif word == 'uc'
+            uc_length = param.to_i
+          elsif word == 'u' && param
+            # \uN carries a signed 16-bit code point followed by fallback characters.
+            code = param.to_i
+            code += 65_536 if code < 0
+            pieces << [code].pack('U') unless (0xD800..0xDFFF).cover?(code)
+            pending_fallback = uc_length
+          end
+        when '\\*'
+          # "\*" marks an optional destination that readers may ignore: skip its group.
+          skip_depth = depth
+        when /\A\\(.)\z/m
+          symbol = Regexp.last_match(1)
+          pending_fallback = 0
+          if ['{', '}', '\\'].include?(symbol)
+            pieces << symbol # escaped literal character
+          elsif symbol == '~'
+            pieces << ' '    # non-breaking space
+          elsif symbol == '_'
+            pieces << '-'    # non-breaking hyphen
+          elsif symbol == "\n" || symbol == "\r"
+            pieces << "\n"   # backslash-newline acts like \par
+          end
+        else
+          # Raw line breaks in the RTF source are not part of the text.
+          run = token.delete("\r\n")
+          if pending_fallback > 0
+            dropped = [pending_fallback, run.length].min
+            run = run[dropped..-1]
+            pending_fallback -= dropped
+          end
+          pieces << run
+        end
+      end
+      clean_text(pieces.join)
+    rescue => e
+      raise Error, "Failed to extract text from RTF: #{e.message}"
+    end
     # Clean extracted text
     # @param text [String] Raw extracted text
     # @return [String] Cleaned text
     def self.clean_text(text)
+      return "" if text.nil? || text.empty?
       # Normalize line breaks first
       text = text.gsub(/\r\n?/, "\n")
-      # Remove common PDF artifacts
+      # Remove common document artifacts
       text = text.gsub(/\f/, '') # Form feed characters
       text = text.gsub(/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]/, '') # Control characters
+      text = text.gsub(/\u00A0/, ' ') # Non-breaking spaces
       # Remove excessive whitespace but preserve line breaks
       text = text.gsub(/[ \t]+/, ' ')
@@ -73,7 +174,24 @@ module LegalSummariser
       # Remove excessive newlines
       text = text.gsub(/\n{3,}/, "\n\n")
+      # Remove leading/trailing whitespace from each line
+      text = text.split("\n").map(&:strip).join("\n")
+      # Remove empty lines at start and end
       text.strip
     end
+    # Get document statistics
+    # @param text [String] Document text
+    # @return [Hash] Document statistics
+    def self.get_statistics(text)
+      # nil is treated as an empty document rather than raising NoMethodError.
+      text = text.to_s
+      # String#split without a pattern ignores leading whitespace, so " a b"
+      # counts as 2 words (split(/\s+/) would yield a leading "" and report 3).
+      words = text.split
+      # Whitespace-only fragments (e.g. after a trailing ". ") are not
+      # sentences or paragraphs.
+      sentences = text.split(/[.!?]+/).reject { |sentence| sentence.strip.empty? }
+      paragraphs = text.split(/\n\s*\n/).reject { |paragraph| paragraph.strip.empty? }
+      {
+        character_count: text.length,
+        word_count: words.length,
+        sentence_count: sentences.length,
+        paragraph_count: paragraphs.length,
+        # Guard the division: 0.0 / 0 would otherwise put NaN in the result.
+        average_sentence_length: sentences.empty? ? 0.0 : words.length.to_f / sentences.length
+      }
+    end
   end
 end

data/lib/legal_summariser/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module LegalSummariser
+  # Released gem version; also reported as :gem_version in summarise metadata.
-  VERSION = "0.1.0"
+  VERSION = "0.3.0"
 end

data/lib/legal_summariser.rb CHANGED Viewed

@@ -1,12 +1,19 @@
 # frozen_string_literal: true
-require_relative "legal_summariser/version"
-require_relative "legal_summariser/document_parser"
-require_relative "legal_summariser/text_extractor"
-require_relative "legal_summariser/summariser"
-require_relative "legal_summariser/clause_detector"
-require_relative "legal_summariser/risk_analyzer"
-require_relative "legal_summariser/formatter"
+require_relative 'legal_summariser/version'
+require_relative 'legal_summariser/text_extractor'
+require_relative 'legal_summariser/summariser'
+require_relative 'legal_summariser/clause_detector'
+require_relative 'legal_summariser/risk_analyzer'
+require_relative 'legal_summariser/formatter'
+require_relative 'legal_summariser/document_parser'
+require_relative 'legal_summariser/configuration'
+require_relative 'legal_summariser/cache'
+require_relative 'legal_summariser/performance_monitor'
+require_relative 'legal_summariser/plain_language_generator'
+require_relative 'legal_summariser/model_trainer'
+require_relative 'legal_summariser/multilingual_processor'
+require_relative 'legal_summariser/pdf_annotator'
 module LegalSummariser
   class Error < StandardError; end
@@ -18,34 +25,93 @@ module LegalSummariser
   # @param options [Hash] Configuration options
   # @return [Hash] Summary results
   def self.summarise(file_path, options = {})
-    raise DocumentNotFoundError, "File not found: #{file_path}" unless File.exist?(file_path)
-    # Extract text from document
-    text = TextExtractor.extract(file_path)
+    monitor = performance_monitor
+    cache = Cache.new
-    # Perform analysis
-    summary = Summariser.new(text, options).generate
-    clauses = ClauseDetector.new(text).detect
-    risks = RiskAnalyzer.new(text).analyze
+    monitor.start_timer(:total_analysis)
+    # Tracks whether :total_analysis has been stopped, so neither the success
+    # path nor the rescue below ever ends the same timer twice.
+    timer_stopped = false
-    # Format results
-    result = {
-      plain_text: summary[:plain_text],
-      key_points: summary[:key_points],
-      clauses: clauses,
-      risks: risks,
-      metadata: {
+    begin
+      # Validate file
+      raise DocumentNotFoundError, "File not found: #{file_path}" unless File.exist?(file_path)
+      file_size = File.size(file_path)
+      raise Error, "File too large: #{file_size} bytes (max: #{configuration.max_file_size})" if file_size > configuration.max_file_size
+      # Check cache first
+      cache_key = cache.cache_key(file_path, options)
+      cached_result = cache.get(cache_key)
+      if cached_result
+        configuration.logger&.info("Using cached result for #{file_path}")
+        monitor.end_timer(:total_analysis)
+        timer_stopped = true
+        # cache.set below stores the unformatted result, so honour :format here
+        # exactly as the uncached path does.
+        return options[:format] ? Formatter.format(cached_result, options[:format]) : cached_result
+      end
+      # Extract text from document
+      monitor.start_timer(:text_extraction)
+      text = TextExtractor.extract(file_path)
+      extraction_time = monitor.end_timer(:text_extraction)
+      # Record text statistics
+      text_stats = TextExtractor.get_statistics(text)
+      monitor.record(:document_word_count, text_stats[:word_count])
+      monitor.record(:document_character_count, text_stats[:character_count])
+      # Perform analysis components
+      monitor.start_timer(:summarisation)
+      summary = Summariser.new(text, options).generate
+      monitor.end_timer(:summarisation)
+      monitor.start_timer(:clause_detection)
+      clauses = ClauseDetector.new(text).detect
+      monitor.end_timer(:clause_detection)
+      monitor.start_timer(:risk_analysis)
+      risks = RiskAnalyzer.new(text).analyze
+      monitor.end_timer(:risk_analysis)
+      # Stop the overall timer exactly once; the value is reused for the result
+      # hash and the log line, and monitor.stats below is read after the stop.
+      total_time = monitor.end_timer(:total_analysis)
+      timer_stopped = true
+      # Format results
+      result = {
+        file_path: file_path,
         document_type: detect_document_type(text),
-        word_count: text.split.length,
-        processed_at: Time.now.strftime("%Y-%m-%dT%H:%M:%S%z")
+        processing_time: total_time,
+        plain_text: summary[:plain_text],
+        key_points: summary[:key_points],
+        clauses: clauses,
+        risks: risks,
+        metadata: {
+          file_size: file_size,
+          word_count: text_stats[:word_count],
+          character_count: text_stats[:character_count],
+          sentence_count: text_stats[:sentence_count],
+          paragraph_count: text_stats[:paragraph_count],
+          file_size_bytes: file_size,
+          extraction_time_seconds: extraction_time.round(3),
+          processed_at: Time.now.strftime("%Y-%m-%dT%H:%M:%S%z"),
+          gem_version: VERSION,
+          language: configuration.language,
+          document_type: detect_document_type(text)
+        },
+        performance: monitor.stats
       }
-    }
-    # Apply formatting if requested
-    if options[:format]
-      Formatter.format(result, options[:format])
-    else
-      result
+      # Cache the result
+      cache.set(cache_key, result)
+      configuration.logger&.info("Analysis completed in #{total_time.round(3)}s")
+      # Apply formatting if requested
+      if options[:format]
+        Formatter.format(result, options[:format])
+      else
+        result
+      end
+    rescue => e
+      # A failure after the success path stopped the timer must not stop it again.
+      monitor.end_timer(:total_analysis) unless timer_stopped
+      configuration.logger&.error("Analysis failed: #{e.message}")
+      raise
     end
   end
@@ -53,19 +119,114 @@ module LegalSummariser
   # @param text [String] Document text
   # @return [String] Document type
   def self.detect_document_type(text)
-    case text.downcase
-    when /non.?disclosure|nda|confidentiality/
-      "nda"
-    when /service agreement|terms of service|tos/
-      "service_agreement"
-    when /employment|job|position/
-      "employment_contract"
-    when /privacy policy|data protection|gdpr|kvkk/
-      "privacy_policy"
-    when /license|licensing/
-      "license_agreement"
-    else
-      "general_contract"
+    text_lower = text.downcase
+    # Score different document types. Each indicator adds its weight at most
+    # once, however often it occurs in the text.
+    scores = {
+      nda: 0,
+      service_agreement: 0,
+      employment_contract: 0,
+      privacy_policy: 0,
+      license_agreement: 0,
+      terms_of_use: 0,
+      purchase_agreement: 0,
+      lease_agreement: 0,
+      partnership_agreement: 0,
+      general_contract: 1 # Base score: other types need more than 1 to be sure of winning
+    }
+    # NDA indicators
+    scores[:nda] += 3 if text_lower.match?(/non.?disclosure/)
+    scores[:nda] += 2 if text_lower.match?(/\bnda\b/)
+    scores[:nda] += 2 if text_lower.match?(/confidential/)
+    scores[:nda] += 1 if text_lower.match?(/proprietary/)
+    # Service agreement indicators
+    scores[:service_agreement] += 3 if text_lower.match?(/service agreement/)
+    scores[:service_agreement] += 2 if text_lower.match?(/terms of service/)
+    scores[:service_agreement] += 2 if text_lower.match?(/\btos\b/)
+    scores[:service_agreement] += 1 if text_lower.match?(/deliver|provide.*service/)
+    # Employment indicators
+    scores[:employment_contract] += 3 if text_lower.match?(/employment/)
+    scores[:employment_contract] += 2 if text_lower.match?(/employee|employer/)
+    # Whole words only: a bare /position/ also fires on "disposition" or "composition".
+    scores[:employment_contract] += 2 if text_lower.match?(/\b(?:job|position)s?\b/)
+    scores[:employment_contract] += 1 if text_lower.match?(/salary|wage/)
+    # Privacy policy indicators
+    scores[:privacy_policy] += 3 if text_lower.match?(/privacy policy/)
+    scores[:privacy_policy] += 2 if text_lower.match?(/data protection/)
+    scores[:privacy_policy] += 2 if text_lower.match?(/gdpr|kvkk/)
+    scores[:privacy_policy] += 1 if text_lower.match?(/personal data/)
+    # License agreement indicators
+    scores[:license_agreement] += 3 if text_lower.match?(/license agreement/)
+    scores[:license_agreement] += 2 if text_lower.match?(/licensing/)
+    scores[:license_agreement] += 1 if text_lower.match?(/intellectual property/)
+    # Terms of use indicators
+    scores[:terms_of_use] += 3 if text_lower.match?(/terms of use/)
+    scores[:terms_of_use] += 2 if text_lower.match?(/user agreement/)
+    scores[:terms_of_use] += 1 if text_lower.match?(/website|platform/)
+    # Purchase agreement indicators
+    scores[:purchase_agreement] += 3 if text_lower.match?(/purchase agreement/)
+    scores[:purchase_agreement] += 2 if text_lower.match?(/buy|sell|purchase/)
+    scores[:purchase_agreement] += 1 if text_lower.match?(/price|payment/)
+    # Lease agreement indicators
+    scores[:lease_agreement] += 3 if text_lower.match?(/lease agreement/)
+    # "rent" is anchored to word boundaries: a bare /rent/ also fires on
+    # "current", "different" and "parent", which appear in almost any contract.
+    scores[:lease_agreement] += 2 if text_lower.match?(/\brent(?:s|al|als|ed|ing)?\b|tenant|landlord/)
+    scores[:lease_agreement] += 1 if text_lower.match?(/property|premises/)
+    # Partnership agreement indicators
+    scores[:partnership_agreement] += 3 if text_lower.match?(/partnership agreement/)
+    scores[:partnership_agreement] += 2 if text_lower.match?(/partner|partnership/)
+    scores[:partnership_agreement] += 1 if text_lower.match?(/joint venture/)
+    # Return the type with highest score
+    scores.max_by { |_, score| score }[0].to_s
+  end
+  # Get analysis statistics
+  # @return [Hash] Analysis statistics
+  def self.stats
+    # Collect each section in turn, then assemble the report.
+    performance = performance_monitor.stats
+    cache_stats = Cache.new.stats
+    memory = performance_monitor.memory_usage
+    settings = {
+      language: configuration.language,
+      max_file_size: configuration.max_file_size,
+      caching_enabled: configuration.enable_caching
+    }
+    { performance: performance, cache: cache_stats, memory: memory, configuration: settings }
+  end
+  # Reset all statistics and cache
+  def self.reset!
+    # Clear the metrics held by the performance monitor.
+    performance_monitor.reset!
+    # Clearing goes through a freshly built Cache instance.
+    # NOTE(review): presumably Cache instances share one backing store - confirm in cache.rb.
+    Cache.new.clear!
+  end
+  # Batch process multiple documents
+  # @param file_paths [Array<String>] Array of file paths
+  # @param options [Hash] Processing options
+  # @return [Array<Hash>] Array of analysis results
+  def self.batch_summarise(file_paths, options = {})
+    # One outcome hash per input, in input order. A failure is recorded as
+    # { success: false, error: message } and processing continues.
+    file_paths.each_with_index.map do |file_path, index|
+      begin
+        configuration.logger&.info("Processing file #{index + 1}/#{file_paths.length}: #{file_path}")
+        { file_path: file_path, success: true, result: summarise(file_path, options) }
+      rescue => e
+        configuration.logger&.error("Failed to process #{file_path}: #{e.message}")
+        { file_path: file_path, success: false, error: e.message }
+      end
     end
   end
 end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: legal_summariser
 version: !ruby/object:Gem::Version
-  version: 0.1.0
+  version: 0.3.0
 platform: ruby
 authors:
 - Legal Summariser Team
@@ -162,14 +162,21 @@ extra_rdoc_files: []
 files:
 - ".rspec"
 - CHANGELOG.md
+- CONTRIBUTING.md
 - Gemfile
 - README.md
 - Rakefile
+- examples/advanced_configuration.rb
+- examples/basic_usage.rb
+- examples/batch_processing.rb
 - exe/legal_summariser
 - lib/legal_summariser.rb
+- lib/legal_summariser/cache.rb
 - lib/legal_summariser/clause_detector.rb
+- lib/legal_summariser/configuration.rb
 - lib/legal_summariser/document_parser.rb
 - lib/legal_summariser/formatter.rb
+- lib/legal_summariser/performance_monitor.rb
 - lib/legal_summariser/risk_analyzer.rb
 - lib/legal_summariser/summariser.rb
 - lib/legal_summariser/text_extractor.rb