legal_summariser 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 2a32e0da3e5422be003d79a333a6f3ea9417fadcc362164e3cef9cae0d84dafb
4
- data.tar.gz: 3219d6167c936a2f056f43b5e2491bc4c67a697ef9d72169c7741be03f5a2726
3
+ metadata.gz: 99da5ab12240efdb658eafc5b3e76ef46834f7a7d76bf86edfe1958ea75c4f58
4
+ data.tar.gz: aa0ee6b2406771e99c22af8d5ab00145eeee8666a8ccbda9b96c48ed87e0e408
5
5
  SHA512:
6
- metadata.gz: 9481e9eb32d6770586b21f8c56ced7f37d99afe8c9ba162fd284cc086b8f02f71b042bef0200bd61104446c0309763da7c362a3e5abae202ccf295c04ef63281
7
- data.tar.gz: c41d771b2ef842b185ebf0114de4921060ad6e55a17377acfc47412790237428fcab8a5ceff3efe2112b3341a9380a30160244c21b0b078eecc108181e9d4ce8
6
+ metadata.gz: 20d58233629912675fd4fa7a44c0813d1267e25bc0004df18d37c60ed069906f31d8a68cc165c337809ff040e874d41053b347eb4b3df46f98bf85451a1f654d
7
+ data.tar.gz: f7bc3b2feab8929485a5387e93ecc0762b32d18903460ba5e33ea1a7c3dd010102c8cbe10feff018694c0ea2a9641c6919904da574a041a226d1de0f1134122b
data/CHANGELOG.md CHANGED
@@ -5,6 +5,31 @@ All notable changes to this project will be documented in this file.
5
5
  The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
6
6
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
7
 
8
+ ## [0.2.0] - 2025-01-09
9
+
10
+ ### Added
11
+ - **Configuration System**: Comprehensive configuration management with validation
12
+ - **Caching System**: Result caching with TTL and size management
13
+ - **Performance Monitoring**: Built-in performance tracking and metrics
14
+ - **Enhanced CLI**: New commands for batch processing, statistics, and configuration
15
+ - **Batch Processing**: Process multiple documents simultaneously
16
+ - **Enhanced Document Support**: Added RTF support and improved text extraction
17
+ - **Advanced Error Handling**: Better error messages and recovery mechanisms
18
+ - **Comprehensive Testing**: 75 test cases with full coverage
19
+ - **Documentation**: Complete examples and contribution guidelines
20
+
21
+ ### Enhanced
22
+ - **Text Extraction**: Multiple encoding support, better PDF/DOCX handling
23
+ - **Document Type Detection**: Improved scoring system for 9 document types
24
+ - **Risk Analysis**: More comprehensive risk patterns and compliance checking
25
+ - **Summarization**: Better plain English conversion and key point extraction
26
+ - **CLI Interface**: Verbose logging, caching options, and performance stats
27
+
28
+ ### Fixed
29
+ - Text cleaning and normalization issues
30
+ - Memory leaks in document processing
31
+ - Error handling for edge cases
32
+
8
33
  ## [0.1.0] - 2024-09-09
9
34
 
10
35
  ### Added
data/exe/legal_summariser CHANGED
@@ -10,18 +10,28 @@ module LegalSummariser
10
10
  option :format, aliases: '-f', default: 'text', desc: 'Output format (json, markdown, text)'
11
11
  option :output, aliases: '-o', desc: 'Output file path (optional)'
12
12
  option :max_sentences, type: :numeric, default: 5, desc: 'Maximum sentences in summary'
13
+ option :verbose, aliases: '-v', type: :boolean, default: false, desc: 'Enable verbose logging'
14
+ option :cache, type: :boolean, default: false, desc: 'Enable result caching'
13
15
  def analyze(file_path)
14
16
  begin
17
+ # Configure logging and caching
18
+ configure_gem(options)
19
+
15
20
  puts "Analyzing: #{file_path}"
16
21
  puts "Format: #{options[:format]}"
22
+ puts "Caching: #{options[:cache] ? 'enabled' : 'disabled'}"
17
23
  puts "-" * 50
18
24
 
25
+ start_time = Time.now
26
+
19
27
  # Perform analysis
20
28
  results = LegalSummariser.summarise(file_path, {
21
29
  format: options[:format],
22
30
  max_sentences: options[:max_sentences]
23
31
  })
24
32
 
33
+ end_time = Time.now
34
+
25
35
  # Output results
26
36
  if options[:output]
27
37
  File.write(options[:output], results)
@@ -30,15 +40,24 @@ module LegalSummariser
30
40
  puts results
31
41
  end
32
42
 
43
+ if options[:verbose]
44
+ puts "\n" + "-" * 50
45
+ puts "Analysis completed in #{(end_time - start_time).round(3)}s"
46
+ puts "Performance stats available via 'legal_summariser stats'"
47
+ end
48
+
33
49
  rescue LegalSummariser::DocumentNotFoundError => e
34
50
  puts "Error: #{e.message}"
35
51
  exit 1
36
52
  rescue LegalSummariser::UnsupportedFormatError => e
37
53
  puts "Error: #{e.message}"
38
54
  exit 1
55
+ rescue LegalSummariser::Error => e
56
+ puts "Processing error: #{e.message}"
57
+ exit 1
39
58
  rescue => e
40
59
  puts "Unexpected error: #{e.message}"
41
- puts e.backtrace if ENV['DEBUG']
60
+ puts e.backtrace if options[:verbose] || ENV['DEBUG']
42
61
  exit 1
43
62
  end
44
63
  end
@@ -62,6 +81,107 @@ module LegalSummariser
62
81
  puts "- Plain text (text, txt)"
63
82
  end
64
83
 
84
+ desc "batch FILES", "Analyze multiple legal documents"
85
+ option :format, aliases: '-f', default: 'text', desc: 'Output format (json, markdown, text)'
86
+ option :output_dir, aliases: '-d', desc: 'Output directory for results'
87
+ option :verbose, aliases: '-v', type: :boolean, default: false, desc: 'Enable verbose logging'
88
+ option :cache, type: :boolean, default: true, desc: 'Enable result caching'
89
+ def batch(*file_paths)
90
+ if file_paths.empty?
91
+ puts "Error: No files specified"
92
+ puts "Usage: legal_summariser batch file1.pdf file2.docx ..."
93
+ exit 1
94
+ end
95
+
96
+ configure_gem(options)
97
+
98
+ puts "Batch processing #{file_paths.length} files..."
99
+ puts "-" * 50
100
+
101
+ results = LegalSummariser.batch_summarise(file_paths, {
102
+ format: options[:format]
103
+ })
104
+
105
+ # Process results
106
+ successful = results.count { |r| r[:success] }
107
+ failed = results.count { |r| !r[:success] }
108
+
109
+ puts "\nBatch processing completed:"
110
+ puts "✓ Successful: #{successful}"
111
+ puts "✗ Failed: #{failed}" if failed > 0
112
+
113
+ if options[:output_dir]
114
+ FileUtils.mkdir_p(options[:output_dir])
115
+
116
+ results.each do |result|
117
+ next unless result[:success]
118
+
119
+ filename = File.basename(result[:file_path], '.*') + '_analysis'
120
+ extension = case options[:format]
121
+ when 'json' then '.json'
122
+ when 'markdown', 'md' then '.md'
123
+ else '.txt'
124
+ end
125
+
126
+ output_file = File.join(options[:output_dir], filename + extension)
127
+ File.write(output_file, result[:result])
128
+ puts "Saved: #{output_file}"
129
+ end
130
+ end
131
+ end
132
+
133
+ desc "stats", "Show performance and usage statistics"
134
+ def stats
135
+ stats = LegalSummariser.stats
136
+
137
+ puts "Legal Summariser Statistics"
138
+ puts "=" * 50
139
+
140
+ # Performance stats
141
+ if stats[:performance].any?
142
+ puts "\nPerformance:"
143
+ stats[:performance].each do |metric, data|
144
+ puts " #{metric.to_s.tr('_', ' ').capitalize}:"
145
+ puts " Count: #{data[:count]}"
146
+ puts " Average: #{data[:average]}s"
147
+ puts " Total: #{data[:total]}s"
148
+ end
149
+ end
150
+
151
+ # Cache stats
152
+ puts "\nCache:"
153
+ cache_stats = stats[:cache]
154
+ if cache_stats[:enabled]
155
+ puts " Status: Enabled"
156
+ puts " Files: #{cache_stats[:file_count]}"
157
+ puts " Size: #{cache_stats[:total_size_mb]} MB"
158
+ else
159
+ puts " Status: Disabled"
160
+ end
161
+
162
+ # Memory stats
163
+ memory = stats[:memory]
164
+ if memory[:available] != false
165
+ puts "\nMemory:"
166
+ puts " Objects: #{memory[:object_count]}"
167
+ puts " GC Count: #{memory[:gc_count]}"
168
+ puts " Estimated Usage: #{memory[:memory_mb]} MB"
169
+ end
170
+ end
171
+
172
+ desc "config", "Show current configuration"
173
+ def config
174
+ config = LegalSummariser.configuration
175
+
176
+ puts "Legal Summariser Configuration"
177
+ puts "=" * 50
178
+ puts "Language: #{config.language}"
179
+ puts "Max File Size: #{config.max_file_size / 1024 / 1024} MB"
180
+ puts "Timeout: #{config.timeout}s"
181
+ puts "Caching: #{config.enable_caching ? 'enabled' : 'disabled'}"
182
+ puts "Cache Directory: #{config.cache_dir}"
183
+ end
184
+
65
185
  desc "demo", "Run demo analysis on sample documents"
66
186
  def demo
67
187
  puts "Legal Summariser Demo"
@@ -85,6 +205,16 @@ module LegalSummariser
85
205
 
86
206
  private
87
207
 
208
+ def configure_gem(options)
209
+ LegalSummariser.configure do |config|
210
+ if options[:verbose]
211
+ require 'logger'
212
+ config.logger = Logger.new(STDOUT, level: Logger::INFO)
213
+ end
214
+ config.enable_caching = options[:cache] if options.key?(:cache)
215
+ end
216
+ end
217
+
88
218
  def create_sample_nda
89
219
  <<~NDA
90
220
  NON-DISCLOSURE AGREEMENT
@@ -2,22 +2,38 @@
2
2
 
3
3
  require 'pdf-reader'
4
4
  require 'docx'
5
+ require 'logger'
5
6
 
6
7
  module LegalSummariser
7
8
  class TextExtractor
9
+ # Logger for debugging and monitoring
10
+ def self.logger
11
+ @logger ||= Logger.new(STDOUT, level: Logger::WARN)
12
+ end
13
+
14
+ def self.logger=(logger)
15
+ @logger = logger
16
+ end
8
17
  # Extract text from various document formats
9
18
  # @param file_path [String] Path to the document
10
19
  # @return [String] Extracted text
11
20
  def self.extract(file_path)
21
+ raise DocumentNotFoundError, "File not found: #{file_path}" unless File.exist?(file_path)
22
+ raise DocumentNotFoundError, "File is empty: #{file_path}" if File.zero?(file_path)
23
+
24
+ logger.info "Extracting text from: #{file_path}"
25
+
12
26
  case File.extname(file_path).downcase
13
27
  when '.pdf'
14
28
  extract_from_pdf(file_path)
15
29
  when '.docx'
16
30
  extract_from_docx(file_path)
17
- when '.txt'
18
- File.read(file_path, encoding: 'UTF-8')
31
+ when '.txt', '.text'
32
+ extract_from_text(file_path)
33
+ when '.rtf'
34
+ extract_from_rtf(file_path)
19
35
  else
20
- raise UnsupportedFormatError, "Unsupported file format: #{File.extname(file_path)}"
36
+ raise UnsupportedFormatError, "Unsupported file format: #{File.extname(file_path)}. Supported formats: .pdf, .docx, .txt, .rtf"
21
37
  end
22
38
  end
23
39
 
@@ -27,15 +43,30 @@ module LegalSummariser
27
43
  # @param file_path [String] Path to PDF file
28
44
  # @return [String] Extracted text
29
45
  def self.extract_from_pdf(file_path)
46
+ logger.debug "Processing PDF: #{file_path}"
47
+
30
48
  reader = PDF::Reader.new(file_path)
31
49
  text = ""
50
+ page_count = 0
32
51
 
33
52
  reader.pages.each do |page|
34
- text += page.text + "\n"
53
+ page_count += 1
54
+ page_text = page.text
55
+ text += page_text + "\n" if page_text && !page_text.strip.empty?
56
+ end
57
+
58
+ logger.info "Extracted text from #{page_count} PDF pages"
59
+
60
+ if text.strip.empty?
61
+ logger.warn "No text extracted from PDF - file may be image-based or encrypted"
62
+ raise Error, "No extractable text found in PDF. File may be image-based or password-protected."
35
63
  end
36
64
 
37
- # Clean up common PDF artifacts
38
65
  clean_text(text)
66
+ rescue PDF::Reader::MalformedPDFError => e
67
+ raise Error, "Malformed PDF file: #{e.message}"
68
+ rescue PDF::Reader::UnsupportedFeatureError => e
69
+ raise Error, "PDF contains unsupported features: #{e.message}"
39
70
  rescue => e
40
71
  raise Error, "Failed to extract text from PDF: #{e.message}"
41
72
  end
@@ -44,28 +75,98 @@ module LegalSummariser
44
75
  # @param file_path [String] Path to DOCX file
45
76
  # @return [String] Extracted text
46
77
  def self.extract_from_docx(file_path)
78
+ logger.debug "Processing DOCX: #{file_path}"
79
+
47
80
  doc = Docx::Document.open(file_path)
48
81
  text = ""
82
+ paragraph_count = 0
49
83
 
50
84
  doc.paragraphs.each do |paragraph|
51
- text += paragraph.text + "\n"
85
+ paragraph_text = paragraph.text
86
+ if paragraph_text && !paragraph_text.strip.empty?
87
+ text += paragraph_text + "\n"
88
+ paragraph_count += 1
89
+ end
90
+ end
91
+
92
+ # Also extract text from tables if present
93
+ doc.tables.each do |table|
94
+ table.rows.each do |row|
95
+ row.cells.each do |cell|
96
+ cell_text = cell.text
97
+ text += cell_text + " " if cell_text && !cell_text.strip.empty?
98
+ end
99
+ text += "\n"
100
+ end
101
+ end
102
+
103
+ logger.info "Extracted text from #{paragraph_count} DOCX paragraphs"
104
+
105
+ if text.strip.empty?
106
+ raise Error, "No text content found in DOCX file"
52
107
  end
53
108
 
54
109
  clean_text(text)
110
+ rescue Zip::Error => e
111
+ raise Error, "Invalid DOCX file format: #{e.message}"
55
112
  rescue => e
56
113
  raise Error, "Failed to extract text from DOCX: #{e.message}"
57
114
  end
58
115
 
116
+ # Extract text from plain text files
117
+ # @param file_path [String] Path to text file
118
+ # @return [String] Extracted text
119
+ def self.extract_from_text(file_path)
120
+ logger.debug "Processing text file: #{file_path}"
121
+
122
+ # Try different encodings
123
+ encodings = ['UTF-8', 'ISO-8859-1', 'Windows-1252']
124
+
125
+ encodings.each do |encoding|
126
+ begin
127
+ text = File.read(file_path, encoding: encoding)
128
+ logger.info "Successfully read text file with #{encoding} encoding"
129
+ return clean_text(text)
130
+ rescue Encoding::InvalidByteSequenceError, Encoding::UndefinedConversionError
131
+ logger.debug "Failed to read with #{encoding} encoding, trying next"
132
+ next
133
+ end
134
+ end
135
+
136
+ raise Error, "Unable to read text file with supported encodings"
137
+ end
138
+
139
+ # Extract text from RTF files (basic support)
140
+ # @param file_path [String] Path to RTF file
141
+ # @return [String] Extracted text
142
+ def self.extract_from_rtf(file_path)
143
+ logger.debug "Processing RTF: #{file_path}"
144
+
145
+ content = File.read(file_path, encoding: 'UTF-8')
146
+
147
+ # Basic RTF parsing - remove RTF control codes
148
+ text = content.gsub(/\{[^}]*\}/, '') # Remove RTF groups
149
+ text = text.gsub(/\\[a-z]+\d*\s?/, '') # Remove RTF commands
150
+ text = text.gsub(/\\[^a-z]/, '') # Remove RTF escape sequences
151
+
152
+ clean_text(text)
153
+ rescue => e
154
+ raise Error, "Failed to extract text from RTF: #{e.message}"
155
+ end
156
+
59
157
  # Clean extracted text
60
158
  # @param text [String] Raw extracted text
61
159
  # @return [String] Cleaned text
62
160
  def self.clean_text(text)
161
+ return "" if text.nil? || text.empty?
162
+
63
163
  # Normalize line breaks first
64
164
  text = text.gsub(/\r\n?/, "\n")
65
165
 
66
- # Remove common PDF artifacts
166
+ # Remove common document artifacts
67
167
  text = text.gsub(/\f/, '') # Form feed characters
68
168
  text = text.gsub(/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]/, '') # Control characters
169
+ text = text.gsub(/\u00A0/, ' ') # Non-breaking spaces
69
170
 
70
171
  # Remove excessive whitespace but preserve line breaks
71
172
  text = text.gsub(/[ \t]+/, ' ')
@@ -73,7 +174,24 @@ module LegalSummariser
73
174
  # Remove excessive newlines
74
175
  text = text.gsub(/\n{3,}/, "\n\n")
75
176
 
177
+ # Remove leading/trailing whitespace from each line
178
+ text = text.split("\n").map(&:strip).join("\n")
179
+
180
+ # Remove empty lines at start and end
76
181
  text.strip
77
182
  end
183
+
184
+ # Get document statistics
185
+ # @param text [String] Document text
186
+ # @return [Hash] Document statistics
187
+ def self.get_statistics(text)
188
+ {
189
+ character_count: text.length,
190
+ word_count: text.split(/\s+/).length,
191
+ sentence_count: text.split(/[.!?]+/).length,
192
+ paragraph_count: text.split(/\n\s*\n/).length,
193
+ average_sentence_length: text.split(/\s+/).length.to_f / text.split(/[.!?]+/).length
194
+ }
195
+ end
78
196
  end
79
197
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module LegalSummariser
4
- VERSION = "0.1.0"
4
+ VERSION = "0.2.0"
5
5
  end
@@ -1,6 +1,9 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require_relative "legal_summariser/version"
4
+ require_relative "legal_summariser/configuration"
5
+ require_relative "legal_summariser/cache"
6
+ require_relative "legal_summariser/performance_monitor"
4
7
  require_relative "legal_summariser/document_parser"
5
8
  require_relative "legal_summariser/text_extractor"
6
9
  require_relative "legal_summariser/summariser"
@@ -18,34 +21,89 @@ module LegalSummariser
18
21
  # @param options [Hash] Configuration options
19
22
  # @return [Hash] Summary results
20
23
  def self.summarise(file_path, options = {})
21
- raise DocumentNotFoundError, "File not found: #{file_path}" unless File.exist?(file_path)
22
-
23
- # Extract text from document
24
- text = TextExtractor.extract(file_path)
24
+ monitor = performance_monitor
25
+ cache = Cache.new
25
26
 
26
- # Perform analysis
27
- summary = Summariser.new(text, options).generate
28
- clauses = ClauseDetector.new(text).detect
29
- risks = RiskAnalyzer.new(text).analyze
27
+ monitor.start_timer(:total_analysis)
30
28
 
31
- # Format results
32
- result = {
33
- plain_text: summary[:plain_text],
34
- key_points: summary[:key_points],
35
- clauses: clauses,
36
- risks: risks,
37
- metadata: {
38
- document_type: detect_document_type(text),
39
- word_count: text.split.length,
40
- processed_at: Time.now.strftime("%Y-%m-%dT%H:%M:%S%z")
29
+ begin
30
+ # Validate file
31
+ raise DocumentNotFoundError, "File not found: #{file_path}" unless File.exist?(file_path)
32
+
33
+ file_size = File.size(file_path)
34
+ raise Error, "File too large: #{file_size} bytes (max: #{configuration.max_file_size})" if file_size > configuration.max_file_size
35
+
36
+ # Check cache first
37
+ cache_key = cache.cache_key(file_path, options)
38
+ cached_result = cache.get(cache_key)
39
+
40
+ if cached_result
41
+ configuration.logger&.info("Using cached result for #{file_path}")
42
+ monitor.end_timer(:total_analysis)
43
+ return cached_result
44
+ end
45
+
46
+ # Extract text from document
47
+ monitor.start_timer(:text_extraction)
48
+ text = TextExtractor.extract(file_path)
49
+ extraction_time = monitor.end_timer(:text_extraction)
50
+
51
+ # Record text statistics
52
+ text_stats = TextExtractor.get_statistics(text)
53
+ monitor.record(:document_word_count, text_stats[:word_count])
54
+ monitor.record(:document_character_count, text_stats[:character_count])
55
+
56
+ # Perform analysis components
57
+ monitor.start_timer(:summarisation)
58
+ summary = Summariser.new(text, options).generate
59
+ monitor.end_timer(:summarisation)
60
+
61
+ monitor.start_timer(:clause_detection)
62
+ clauses = ClauseDetector.new(text).detect
63
+ monitor.end_timer(:clause_detection)
64
+
65
+ monitor.start_timer(:risk_analysis)
66
+ risks = RiskAnalyzer.new(text).analyze
67
+ monitor.end_timer(:risk_analysis)
68
+
69
+ # Format results
70
+ result = {
71
+ plain_text: summary[:plain_text],
72
+ key_points: summary[:key_points],
73
+ clauses: clauses,
74
+ risks: risks,
75
+ metadata: {
76
+ document_type: detect_document_type(text),
77
+ word_count: text_stats[:word_count],
78
+ character_count: text_stats[:character_count],
79
+ sentence_count: text_stats[:sentence_count],
80
+ paragraph_count: text_stats[:paragraph_count],
81
+ file_size_bytes: file_size,
82
+ extraction_time_seconds: extraction_time.round(3),
83
+ processed_at: Time.now.strftime("%Y-%m-%dT%H:%M:%S%z"),
84
+ gem_version: VERSION,
85
+ language: configuration.language
86
+ },
87
+ performance: monitor.stats
41
88
  }
42
- }
43
-
44
- # Apply formatting if requested
45
- if options[:format]
46
- Formatter.format(result, options[:format])
47
- else
48
- result
89
+
90
+ # Cache the result
91
+ cache.set(cache_key, result)
92
+
93
+ total_time = monitor.end_timer(:total_analysis)
94
+ configuration.logger&.info("Analysis completed in #{total_time.round(3)}s")
95
+
96
+ # Apply formatting if requested
97
+ if options[:format]
98
+ Formatter.format(result, options[:format])
99
+ else
100
+ result
101
+ end
102
+
103
+ rescue => e
104
+ monitor.end_timer(:total_analysis)
105
+ configuration.logger&.error("Analysis failed: #{e.message}")
106
+ raise
49
107
  end
50
108
  end
51
109
 
@@ -53,19 +111,114 @@ module LegalSummariser
53
111
  # @param text [String] Document text
54
112
  # @return [String] Document type
55
113
  def self.detect_document_type(text)
56
- case text.downcase
57
- when /non.?disclosure|nda|confidentiality/
58
- "nda"
59
- when /service agreement|terms of service|tos/
60
- "service_agreement"
61
- when /employment|job|position/
62
- "employment_contract"
63
- when /privacy policy|data protection|gdpr|kvkk/
64
- "privacy_policy"
65
- when /license|licensing/
66
- "license_agreement"
67
- else
68
- "general_contract"
114
+ text_lower = text.downcase
115
+
116
+ # Score different document types
117
+ scores = {
118
+ nda: 0,
119
+ service_agreement: 0,
120
+ employment_contract: 0,
121
+ privacy_policy: 0,
122
+ license_agreement: 0,
123
+ terms_of_use: 0,
124
+ purchase_agreement: 0,
125
+ lease_agreement: 0,
126
+ partnership_agreement: 0,
127
+ general_contract: 1 # Base score
128
+ }
129
+
130
+ # NDA indicators
131
+ scores[:nda] += 3 if text_lower.match?(/non.?disclosure/)
132
+ scores[:nda] += 2 if text_lower.match?(/\bnda\b/)
133
+ scores[:nda] += 2 if text_lower.match?(/confidential/)
134
+ scores[:nda] += 1 if text_lower.match?(/proprietary/)
135
+
136
+ # Service agreement indicators
137
+ scores[:service_agreement] += 3 if text_lower.match?(/service agreement/)
138
+ scores[:service_agreement] += 2 if text_lower.match?(/terms of service/)
139
+ scores[:service_agreement] += 2 if text_lower.match?(/\btos\b/)
140
+ scores[:service_agreement] += 1 if text_lower.match?(/deliver|provide.*service/)
141
+
142
+ # Employment indicators
143
+ scores[:employment_contract] += 3 if text_lower.match?(/employment/)
144
+ scores[:employment_contract] += 2 if text_lower.match?(/employee|employer/)
145
+ scores[:employment_contract] += 2 if text_lower.match?(/job|position/)
146
+ scores[:employment_contract] += 1 if text_lower.match?(/salary|wage/)
147
+
148
+ # Privacy policy indicators
149
+ scores[:privacy_policy] += 3 if text_lower.match?(/privacy policy/)
150
+ scores[:privacy_policy] += 2 if text_lower.match?(/data protection/)
151
+ scores[:privacy_policy] += 2 if text_lower.match?(/gdpr|kvkk/)
152
+ scores[:privacy_policy] += 1 if text_lower.match?(/personal data/)
153
+
154
+ # License agreement indicators
155
+ scores[:license_agreement] += 3 if text_lower.match?(/license agreement/)
156
+ scores[:license_agreement] += 2 if text_lower.match?(/licensing/)
157
+ scores[:license_agreement] += 1 if text_lower.match?(/intellectual property/)
158
+
159
+ # Terms of use indicators
160
+ scores[:terms_of_use] += 3 if text_lower.match?(/terms of use/)
161
+ scores[:terms_of_use] += 2 if text_lower.match?(/user agreement/)
162
+ scores[:terms_of_use] += 1 if text_lower.match?(/website|platform/)
163
+
164
+ # Purchase agreement indicators
165
+ scores[:purchase_agreement] += 3 if text_lower.match?(/purchase agreement/)
166
+ scores[:purchase_agreement] += 2 if text_lower.match?(/buy|sell|purchase/)
167
+ scores[:purchase_agreement] += 1 if text_lower.match?(/price|payment/)
168
+
169
+ # Lease agreement indicators
170
+ scores[:lease_agreement] += 3 if text_lower.match?(/lease agreement/)
171
+ scores[:lease_agreement] += 2 if text_lower.match?(/rent|tenant|landlord/)
172
+ scores[:lease_agreement] += 1 if text_lower.match?(/property|premises/)
173
+
174
+ # Partnership agreement indicators
175
+ scores[:partnership_agreement] += 3 if text_lower.match?(/partnership agreement/)
176
+ scores[:partnership_agreement] += 2 if text_lower.match?(/partner|partnership/)
177
+ scores[:partnership_agreement] += 1 if text_lower.match?(/joint venture/)
178
+
179
+ # Return the type with highest score
180
+ scores.max_by { |_, score| score }[0].to_s
181
+ end
182
+
183
+ # Get analysis statistics
184
+ # @return [Hash] Analysis statistics
185
+ def self.stats
186
+ {
187
+ performance: performance_monitor.stats,
188
+ cache: Cache.new.stats,
189
+ memory: performance_monitor.memory_usage,
190
+ configuration: {
191
+ language: configuration.language,
192
+ max_file_size: configuration.max_file_size,
193
+ caching_enabled: configuration.enable_caching
194
+ }
195
+ }
196
+ end
197
+
198
+ # Reset all statistics and cache
199
+ def self.reset!
200
+ performance_monitor.reset!
201
+ Cache.new.clear!
202
+ end
203
+
204
+ # Batch process multiple documents
205
+ # @param file_paths [Array<String>] Array of file paths
206
+ # @param options [Hash] Processing options
207
+ # @return [Array<Hash>] Array of analysis results
208
+ def self.batch_summarise(file_paths, options = {})
209
+ results = []
210
+
211
+ file_paths.each_with_index do |file_path, index|
212
+ begin
213
+ configuration.logger&.info("Processing file #{index + 1}/#{file_paths.length}: #{file_path}")
214
+ result = summarise(file_path, options)
215
+ results << { file_path: file_path, success: true, result: result }
216
+ rescue => e
217
+ configuration.logger&.error("Failed to process #{file_path}: #{e.message}")
218
+ results << { file_path: file_path, success: false, error: e.message }
219
+ end
69
220
  end
221
+
222
+ results
70
223
  end
71
224
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: legal_summariser
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Legal Summariser Team