legal_summariser 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +25 -0
- data/exe/legal_summariser +131 -1
- data/lib/legal_summariser/text_extractor.rb +125 -7
- data/lib/legal_summariser/version.rb +1 -1
- data/lib/legal_summariser.rb +191 -38
- metadata +1 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 99da5ab12240efdb658eafc5b3e76ef46834f7a7d76bf86edfe1958ea75c4f58
|
|
4
|
+
data.tar.gz: aa0ee6b2406771e99c22af8d5ab00145eeee8666a8ccbda9b96c48ed87e0e408
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 20d58233629912675fd4fa7a44c0813d1267e25bc0004df18d37c60ed069906f31d8a68cc165c337809ff040e874d41053b347eb4b3df46f98bf85451a1f654d
|
|
7
|
+
data.tar.gz: f7bc3b2feab8929485a5387e93ecc0762b32d18903460ba5e33ea1a7c3dd010102c8cbe10feff018694c0ea2a9641c6919904da574a041a226d1de0f1134122b
|
data/CHANGELOG.md
CHANGED
|
@@ -5,6 +5,31 @@ All notable changes to this project will be documented in this file.
|
|
|
5
5
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
|
6
6
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
7
|
|
|
8
|
+
## [0.2.0] - 2025-01-09
|
|
9
|
+
|
|
10
|
+
### Added
|
|
11
|
+
- **Configuration System**: Comprehensive configuration management with validation
|
|
12
|
+
- **Caching System**: Result caching with TTL and size management
|
|
13
|
+
- **Performance Monitoring**: Built-in performance tracking and metrics
|
|
14
|
+
- **Enhanced CLI**: New commands for batch processing, statistics, and configuration
|
|
15
|
+
- **Batch Processing**: Process multiple documents simultaneously
|
|
16
|
+
- **Enhanced Document Support**: Added RTF support and improved text extraction
|
|
17
|
+
- **Advanced Error Handling**: Better error messages and recovery mechanisms
|
|
18
|
+
- **Comprehensive Testing**: 75 test cases with full coverage
|
|
19
|
+
- **Documentation**: Complete examples and contribution guidelines
|
|
20
|
+
|
|
21
|
+
### Enhanced
|
|
22
|
+
- **Text Extraction**: Multiple encoding support, better PDF/DOCX handling
|
|
23
|
+
- **Document Type Detection**: Improved scoring system for 9 document types
|
|
24
|
+
- **Risk Analysis**: More comprehensive risk patterns and compliance checking
|
|
25
|
+
- **Summarization**: Better plain English conversion and key point extraction
|
|
26
|
+
- **CLI Interface**: Verbose logging, caching options, and performance stats
|
|
27
|
+
|
|
28
|
+
### Fixed
|
|
29
|
+
- Text cleaning and normalization issues
|
|
30
|
+
- Memory leaks in document processing
|
|
31
|
+
- Error handling for edge cases
|
|
32
|
+
|
|
8
33
|
## [0.1.0] - 2024-09-09
|
|
9
34
|
|
|
10
35
|
### Added
|
data/exe/legal_summariser
CHANGED
|
@@ -10,18 +10,28 @@ module LegalSummariser
|
|
|
10
10
|
option :format, aliases: '-f', default: 'text', desc: 'Output format (json, markdown, text)'
|
|
11
11
|
option :output, aliases: '-o', desc: 'Output file path (optional)'
|
|
12
12
|
option :max_sentences, type: :numeric, default: 5, desc: 'Maximum sentences in summary'
|
|
13
|
+
option :verbose, aliases: '-v', type: :boolean, default: false, desc: 'Enable verbose logging'
|
|
14
|
+
option :cache, type: :boolean, default: false, desc: 'Enable result caching'
|
|
13
15
|
def analyze(file_path)
|
|
14
16
|
begin
|
|
17
|
+
# Configure logging and caching
|
|
18
|
+
configure_gem(options)
|
|
19
|
+
|
|
15
20
|
puts "Analyzing: #{file_path}"
|
|
16
21
|
puts "Format: #{options[:format]}"
|
|
22
|
+
puts "Caching: #{options[:cache] ? 'enabled' : 'disabled'}"
|
|
17
23
|
puts "-" * 50
|
|
18
24
|
|
|
25
|
+
start_time = Time.now
|
|
26
|
+
|
|
19
27
|
# Perform analysis
|
|
20
28
|
results = LegalSummariser.summarise(file_path, {
|
|
21
29
|
format: options[:format],
|
|
22
30
|
max_sentences: options[:max_sentences]
|
|
23
31
|
})
|
|
24
32
|
|
|
33
|
+
end_time = Time.now
|
|
34
|
+
|
|
25
35
|
# Output results
|
|
26
36
|
if options[:output]
|
|
27
37
|
File.write(options[:output], results)
|
|
@@ -30,15 +40,24 @@ module LegalSummariser
|
|
|
30
40
|
puts results
|
|
31
41
|
end
|
|
32
42
|
|
|
43
|
+
if options[:verbose]
|
|
44
|
+
puts "\n" + "-" * 50
|
|
45
|
+
puts "Analysis completed in #{(end_time - start_time).round(3)}s"
|
|
46
|
+
puts "Performance stats available via 'legal_summariser stats'"
|
|
47
|
+
end
|
|
48
|
+
|
|
33
49
|
rescue LegalSummariser::DocumentNotFoundError => e
|
|
34
50
|
puts "Error: #{e.message}"
|
|
35
51
|
exit 1
|
|
36
52
|
rescue LegalSummariser::UnsupportedFormatError => e
|
|
37
53
|
puts "Error: #{e.message}"
|
|
38
54
|
exit 1
|
|
55
|
+
rescue LegalSummariser::Error => e
|
|
56
|
+
puts "Processing error: #{e.message}"
|
|
57
|
+
exit 1
|
|
39
58
|
rescue => e
|
|
40
59
|
puts "Unexpected error: #{e.message}"
|
|
41
|
-
puts e.backtrace if ENV['DEBUG']
|
|
60
|
+
puts e.backtrace if options[:verbose] || ENV['DEBUG']
|
|
42
61
|
exit 1
|
|
43
62
|
end
|
|
44
63
|
end
|
|
@@ -62,6 +81,107 @@ module LegalSummariser
|
|
|
62
81
|
puts "- Plain text (text, txt)"
|
|
63
82
|
end
|
|
64
83
|
|
|
84
|
+
desc "batch FILES", "Analyze multiple legal documents"
option :format, aliases: '-f', default: 'text', desc: 'Output format (json, markdown, text)'
option :output_dir, aliases: '-d', desc: 'Output directory for results'
option :verbose, aliases: '-v', type: :boolean, default: false, desc: 'Enable verbose logging'
option :cache, type: :boolean, default: true, desc: 'Enable result caching'
# Analyzes every given file and prints a success/failure summary.
#
# With --output-dir each successful analysis is written to
# "<basename>_analysis.<ext>" inside that directory; without it the
# analyses are printed to stdout so the work is never silently discarded.
# Failed files are listed together with their error message.
#
# @param file_paths [Array<String>] documents to analyze (at least one)
# @return [void] exits with status 1 when no file is given
def batch(*file_paths)
  if file_paths.empty?
    puts "Error: No files specified"
    puts "Usage: legal_summariser batch file1.pdf file2.docx ..."
    exit 1
  end

  configure_gem(options)

  puts "Batch processing #{file_paths.length} files..."
  puts "-" * 50

  results = LegalSummariser.batch_summarise(file_paths, {
    format: options[:format]
  })

  successful, failed = results.partition { |r| r[:success] }

  puts "\nBatch processing completed:"
  puts "✓ Successful: #{successful.length}"
  unless failed.empty?
    puts "✗ Failed: #{failed.length}"
    # Tell the user which files failed and why, not just how many.
    failed.each { |r| puts "  #{r[:file_path]}: #{r[:error]}" }
  end

  if options[:output_dir]
    FileUtils.mkdir_p(options[:output_dir])

    extension = case options[:format]
                when 'json' then '.json'
                when 'markdown', 'md' then '.md'
                else '.txt'
                end

    # Inputs such as a/contract.pdf and b/contract.docx share a basename;
    # number the later ones so they do not overwrite the first result.
    seen = Hash.new(0)

    successful.each do |result|
      base = File.basename(result[:file_path], '.*')
      seen[base] += 1
      base = "#{base}_#{seen[base]}" if seen[base] > 1

      output_file = File.join(options[:output_dir], "#{base}_analysis#{extension}")
      File.write(output_file, result[:result])
      puts "Saved: #{output_file}"
    end
  else
    successful.each do |result|
      puts "\n#{result[:file_path]}"
      puts "-" * 50
      puts result[:result]
    end
  end
end
|
|
132
|
+
|
|
133
|
+
desc "stats", "Show performance and usage statistics"
|
|
134
|
+
def stats
|
|
135
|
+
stats = LegalSummariser.stats
|
|
136
|
+
|
|
137
|
+
puts "Legal Summariser Statistics"
|
|
138
|
+
puts "=" * 50
|
|
139
|
+
|
|
140
|
+
# Performance stats
|
|
141
|
+
if stats[:performance].any?
|
|
142
|
+
puts "\nPerformance:"
|
|
143
|
+
stats[:performance].each do |metric, data|
|
|
144
|
+
puts " #{metric.to_s.tr('_', ' ').capitalize}:"
|
|
145
|
+
puts " Count: #{data[:count]}"
|
|
146
|
+
puts " Average: #{data[:average]}s"
|
|
147
|
+
puts " Total: #{data[:total]}s"
|
|
148
|
+
end
|
|
149
|
+
end
|
|
150
|
+
|
|
151
|
+
# Cache stats
|
|
152
|
+
puts "\nCache:"
|
|
153
|
+
cache_stats = stats[:cache]
|
|
154
|
+
if cache_stats[:enabled]
|
|
155
|
+
puts " Status: Enabled"
|
|
156
|
+
puts " Files: #{cache_stats[:file_count]}"
|
|
157
|
+
puts " Size: #{cache_stats[:total_size_mb]} MB"
|
|
158
|
+
else
|
|
159
|
+
puts " Status: Disabled"
|
|
160
|
+
end
|
|
161
|
+
|
|
162
|
+
# Memory stats
|
|
163
|
+
memory = stats[:memory]
|
|
164
|
+
if memory[:available] != false
|
|
165
|
+
puts "\nMemory:"
|
|
166
|
+
puts " Objects: #{memory[:object_count]}"
|
|
167
|
+
puts " GC Count: #{memory[:gc_count]}"
|
|
168
|
+
puts " Estimated Usage: #{memory[:memory_mb]} MB"
|
|
169
|
+
end
|
|
170
|
+
end
|
|
171
|
+
|
|
172
|
+
desc "config", "Show current configuration"
|
|
173
|
+
def config
|
|
174
|
+
config = LegalSummariser.configuration
|
|
175
|
+
|
|
176
|
+
puts "Legal Summariser Configuration"
|
|
177
|
+
puts "=" * 50
|
|
178
|
+
puts "Language: #{config.language}"
|
|
179
|
+
puts "Max File Size: #{config.max_file_size / 1024 / 1024} MB"
|
|
180
|
+
puts "Timeout: #{config.timeout}s"
|
|
181
|
+
puts "Caching: #{config.enable_caching ? 'enabled' : 'disabled'}"
|
|
182
|
+
puts "Cache Directory: #{config.cache_dir}"
|
|
183
|
+
end
|
|
184
|
+
|
|
65
185
|
desc "demo", "Run demo analysis on sample documents"
|
|
66
186
|
def demo
|
|
67
187
|
puts "Legal Summariser Demo"
|
|
@@ -85,6 +205,16 @@ module LegalSummariser
|
|
|
85
205
|
|
|
86
206
|
private
|
|
87
207
|
|
|
208
|
+
# Applies the CLI flags to the gem configuration.
#
# @param options [Hash] Thor options; reads :verbose and :cache
# @return [void]
def configure_gem(options)
  verbose_logger = nil
  if options[:verbose]
    require 'logger' # lazy load: only needed when verbose output is requested
    verbose_logger = Logger.new(STDOUT, level: Logger::INFO)
  end

  LegalSummariser.configure do |config|
    config.logger = verbose_logger if verbose_logger
    config.enable_caching = options[:cache] if options.key?(:cache)
  end

  # TextExtractor keeps its own logger (WARN level by default), so without
  # this its info-level extraction messages stay hidden under --verbose.
  LegalSummariser::TextExtractor.logger = verbose_logger if verbose_logger
end
|
|
217
|
+
|
|
88
218
|
def create_sample_nda
|
|
89
219
|
<<~NDA
|
|
90
220
|
NON-DISCLOSURE AGREEMENT
|
|
@@ -2,22 +2,38 @@
|
|
|
2
2
|
|
|
3
3
|
require 'pdf-reader'
|
|
4
4
|
require 'docx'
|
|
5
|
+
require 'logger'
|
|
5
6
|
|
|
6
7
|
module LegalSummariser
|
|
7
8
|
class TextExtractor
|
|
9
|
+
# Logger for debugging and monitoring
|
|
10
|
+
def self.logger
|
|
11
|
+
@logger ||= Logger.new(STDOUT, level: Logger::WARN)
|
|
12
|
+
end
|
|
13
|
+
|
|
14
|
+
def self.logger=(logger)
|
|
15
|
+
@logger = logger
|
|
16
|
+
end
|
|
8
17
|
# Extract text from various document formats
|
|
9
18
|
# @param file_path [String] Path to the document
|
|
10
19
|
# @return [String] Extracted text
|
|
11
20
|
def self.extract(file_path)
|
|
21
|
+
raise DocumentNotFoundError, "File not found: #{file_path}" unless File.exist?(file_path)
|
|
22
|
+
raise DocumentNotFoundError, "File is empty: #{file_path}" if File.zero?(file_path)
|
|
23
|
+
|
|
24
|
+
logger.info "Extracting text from: #{file_path}"
|
|
25
|
+
|
|
12
26
|
case File.extname(file_path).downcase
|
|
13
27
|
when '.pdf'
|
|
14
28
|
extract_from_pdf(file_path)
|
|
15
29
|
when '.docx'
|
|
16
30
|
extract_from_docx(file_path)
|
|
17
|
-
when '.txt'
|
|
18
|
-
|
|
31
|
+
when '.txt', '.text'
|
|
32
|
+
extract_from_text(file_path)
|
|
33
|
+
when '.rtf'
|
|
34
|
+
extract_from_rtf(file_path)
|
|
19
35
|
else
|
|
20
|
-
raise UnsupportedFormatError, "Unsupported file format: #{File.extname(file_path)}"
|
|
36
|
+
raise UnsupportedFormatError, "Unsupported file format: #{File.extname(file_path)}. Supported formats: .pdf, .docx, .txt, .rtf"
|
|
21
37
|
end
|
|
22
38
|
end
|
|
23
39
|
|
|
@@ -27,15 +43,30 @@ module LegalSummariser
|
|
|
27
43
|
# @param file_path [String] Path to PDF file
|
|
28
44
|
# @return [String] Extracted text
|
|
29
45
|
def self.extract_from_pdf(file_path)
|
|
46
|
+
logger.debug "Processing PDF: #{file_path}"
|
|
47
|
+
|
|
30
48
|
reader = PDF::Reader.new(file_path)
|
|
31
49
|
text = ""
|
|
50
|
+
page_count = 0
|
|
32
51
|
|
|
33
52
|
reader.pages.each do |page|
|
|
34
|
-
|
|
53
|
+
page_count += 1
|
|
54
|
+
page_text = page.text
|
|
55
|
+
text += page_text + "\n" if page_text && !page_text.strip.empty?
|
|
56
|
+
end
|
|
57
|
+
|
|
58
|
+
logger.info "Extracted text from #{page_count} PDF pages"
|
|
59
|
+
|
|
60
|
+
if text.strip.empty?
|
|
61
|
+
logger.warn "No text extracted from PDF - file may be image-based or encrypted"
|
|
62
|
+
raise Error, "No extractable text found in PDF. File may be image-based or password-protected."
|
|
35
63
|
end
|
|
36
64
|
|
|
37
|
-
# Clean up common PDF artifacts
|
|
38
65
|
clean_text(text)
|
|
66
|
+
rescue PDF::Reader::MalformedPDFError => e
|
|
67
|
+
raise Error, "Malformed PDF file: #{e.message}"
|
|
68
|
+
rescue PDF::Reader::UnsupportedFeatureError => e
|
|
69
|
+
raise Error, "PDF contains unsupported features: #{e.message}"
|
|
39
70
|
rescue => e
|
|
40
71
|
raise Error, "Failed to extract text from PDF: #{e.message}"
|
|
41
72
|
end
|
|
@@ -44,28 +75,98 @@ module LegalSummariser
|
|
|
44
75
|
# @param file_path [String] Path to DOCX file
|
|
45
76
|
# @return [String] Extracted text
|
|
46
77
|
def self.extract_from_docx(file_path)
|
|
78
|
+
logger.debug "Processing DOCX: #{file_path}"
|
|
79
|
+
|
|
47
80
|
doc = Docx::Document.open(file_path)
|
|
48
81
|
text = ""
|
|
82
|
+
paragraph_count = 0
|
|
49
83
|
|
|
50
84
|
doc.paragraphs.each do |paragraph|
|
|
51
|
-
|
|
85
|
+
paragraph_text = paragraph.text
|
|
86
|
+
if paragraph_text && !paragraph_text.strip.empty?
|
|
87
|
+
text += paragraph_text + "\n"
|
|
88
|
+
paragraph_count += 1
|
|
89
|
+
end
|
|
90
|
+
end
|
|
91
|
+
|
|
92
|
+
# Also extract text from tables if present
|
|
93
|
+
doc.tables.each do |table|
|
|
94
|
+
table.rows.each do |row|
|
|
95
|
+
row.cells.each do |cell|
|
|
96
|
+
cell_text = cell.text
|
|
97
|
+
text += cell_text + " " if cell_text && !cell_text.strip.empty?
|
|
98
|
+
end
|
|
99
|
+
text += "\n"
|
|
100
|
+
end
|
|
101
|
+
end
|
|
102
|
+
|
|
103
|
+
logger.info "Extracted text from #{paragraph_count} DOCX paragraphs"
|
|
104
|
+
|
|
105
|
+
if text.strip.empty?
|
|
106
|
+
raise Error, "No text content found in DOCX file"
|
|
52
107
|
end
|
|
53
108
|
|
|
54
109
|
clean_text(text)
|
|
110
|
+
rescue Zip::Error => e
|
|
111
|
+
raise Error, "Invalid DOCX file format: #{e.message}"
|
|
55
112
|
rescue => e
|
|
56
113
|
raise Error, "Failed to extract text from DOCX: #{e.message}"
|
|
57
114
|
end
|
|
58
115
|
|
|
116
|
+
# Extract text from plain text files
|
|
117
|
+
# @param file_path [String] Path to text file
|
|
118
|
+
# @return [String] Extracted text
|
|
119
|
+
def self.extract_from_text(file_path)
  # Reads a plain-text file and returns its cleaned content as UTF-8.
  #
  # File.read(path, encoding: ...) only labels the bytes with an encoding; it
  # does not validate them and never raises, so a fallback loop built on its
  # exceptions cannot work. The bytes are therefore read once and each
  # candidate encoding is checked explicitly before transcoding to UTF-8.
  #
  # Raises Error when no candidate encoding can represent the file.
  logger.debug "Processing text file: #{file_path}"

  raw = File.binread(file_path)

  # Order matters: UTF-8 is strict, Windows-1252 rejects a few undefined
  # bytes, and ISO-8859-1 accepts every byte, so it must come last.
  %w[UTF-8 Windows-1252 ISO-8859-1].each do |encoding|
    candidate = raw.dup.force_encoding(encoding)

    unless candidate.valid_encoding?
      logger.debug "Failed to read with #{encoding} encoding, trying next"
      next
    end

    begin
      text = candidate.encode('UTF-8')
    rescue Encoding::InvalidByteSequenceError, Encoding::UndefinedConversionError
      logger.debug "Failed to read with #{encoding} encoding, trying next"
      next
    end

    logger.info "Successfully read text file with #{encoding} encoding"
    # Drop a leading byte-order mark so it does not end up in the text.
    return clean_text(text.sub(/\A\uFEFF/, ''))
  end

  raise Error, "Unable to read text file with supported encodings"
end
|
|
138
|
+
|
|
139
|
+
# Extract text from RTF files (basic support)
|
|
140
|
+
# @param file_path [String] Path to RTF file
|
|
141
|
+
# @return [String] Extracted text
|
|
142
|
+
def self.extract_from_rtf(file_path)
  # Basic RTF-to-text conversion using a single tokenising pass.
  #
  # Deleting "{...}" with a regex removes the whole of a simple document
  # (an RTF file is itself one group) and leaves stray braces on nested
  # ones. Instead the input is tokenised into control words, escapes, braces
  # and plain text, and group nesting is tracked so that only non-content
  # destinations (font table, colour table, metadata, pictures, ...) are
  # skipped.
  #
  # Raises Error when the file cannot be read or converted.
  logger.debug "Processing RTF: #{file_path}"

  content = File.binread(file_path).force_encoding('UTF-8')
  unless content.valid_encoding?
    # RTF should be 7-bit, but tolerate raw 8-bit bytes as Windows-1252.
    content = content.force_encoding('Windows-1252')
                     .encode('UTF-8', invalid: :replace, undef: :replace, replace: '')
  end

  # Groups whose first control word is one of these carry no body text.
  destinations = %w[
    fonttbl colortbl stylesheet info pict object generator themedata datastore
    header headerl headerr headerf footer footerl footerr footerf footnote
    listtable listoverridetable revtbl rsidtbl filetbl
  ].freeze

  # control word (+ numeric param, one delimiter space) | \'hh | control symbol | brace | text
  token = /\\([a-zA-Z]+)(-?\d+)? ?|\\'([0-9a-fA-F]{2})|\\(.)|([{}])|([^\\{}]+)/m

  output = +''
  skip_stack = []        # saved "skipping" state of each enclosing group
  skipping = false       # true while inside a non-content destination
  at_group_start = false # true until the first token after "{"
  fallback_len = 1       # \ucN: fallback characters that follow each \uN
  pending_fallback = 0   # fallback characters still to be dropped

  content.scan(token) do |word, param, hex, symbol, brace, plain|
    if brace
      if brace == '{'
        skip_stack.push(skipping)
        at_group_start = true
      else
        skipping = skip_stack.pop || false
        at_group_start = false
      end
      next
    end

    first_in_group = at_group_start
    at_group_start = false

    # "{\*\keyword ...}" marks an ignorable destination.
    skipping = true if first_in_group && (symbol == '*' || (word && destinations.include?(word)))
    next if skipping

    if word
      case word
      when 'par', 'line', 'sect', 'page' then output << "\n"
      when 'tab' then output << "\t"
      when 'uc' then fallback_len = param.to_i
      when 'u'
        # \uN is a signed 16-bit value followed by fallback characters.
        output << [param.to_i % 0x10000].pack('U')
        pending_fallback = fallback_len
      end
    elsif hex
      if pending_fallback.positive?
        pending_fallback -= 1
      else
        output << [hex.hex].pack('C').force_encoding('Windows-1252')
                           .encode('UTF-8', invalid: :replace, undef: :replace, replace: '')
      end
    elsif symbol
      case symbol
      when '\\', '{', '}' then output << symbol
      when '~' then output << ' '
      when '_' then output << '-'
      when "\n", "\r" then output << "\n" # backslash-newline is a paragraph break
      end
    elsif plain
      visible = plain.delete("\r\n") # raw line breaks are not content in RTF
      if pending_fallback.positive?
        dropped = [pending_fallback, visible.length].min
        visible = visible[dropped..-1]
        pending_fallback -= dropped
      end
      output << visible
    end
  end

  clean_text(output.scrub(''))
rescue => e
  raise Error, "Failed to extract text from RTF: #{e.message}"
end
|
|
156
|
+
|
|
59
157
|
# Clean extracted text
|
|
60
158
|
# @param text [String] Raw extracted text
|
|
61
159
|
# @return [String] Cleaned text
|
|
62
160
|
def self.clean_text(text)
|
|
161
|
+
return "" if text.nil? || text.empty?
|
|
162
|
+
|
|
63
163
|
# Normalize line breaks first
|
|
64
164
|
text = text.gsub(/\r\n?/, "\n")
|
|
65
165
|
|
|
66
|
-
# Remove common
|
|
166
|
+
# Remove common document artifacts
|
|
67
167
|
text = text.gsub(/\f/, '') # Form feed characters
|
|
68
168
|
text = text.gsub(/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]/, '') # Control characters
|
|
169
|
+
text = text.gsub(/\u00A0/, ' ') # Non-breaking spaces
|
|
69
170
|
|
|
70
171
|
# Remove excessive whitespace but preserve line breaks
|
|
71
172
|
text = text.gsub(/[ \t]+/, ' ')
|
|
@@ -73,7 +174,24 @@ module LegalSummariser
|
|
|
73
174
|
# Remove excessive newlines
|
|
74
175
|
text = text.gsub(/\n{3,}/, "\n\n")
|
|
75
176
|
|
|
177
|
+
# Remove leading/trailing whitespace from each line
|
|
178
|
+
text = text.split("\n").map(&:strip).join("\n")
|
|
179
|
+
|
|
180
|
+
# Remove empty lines at start and end
|
|
76
181
|
text.strip
|
|
77
182
|
end
|
|
183
|
+
|
|
184
|
+
# Get document statistics
|
|
185
|
+
# @param text [String] Document text
|
|
186
|
+
# @return [Hash] Document statistics
|
|
187
|
+
def self.get_statistics(text)
  # Returns size metrics for a document: character, word, sentence and
  # paragraph counts plus the average number of words per sentence.
  #
  # nil is treated as an empty document. Whitespace-only fragments are not
  # counted as words, sentences or paragraphs, and the average is 0.0 (not
  # NaN) when there are no sentences.
  content = text.to_s

  words = content.split # whitespace split that ignores leading/trailing blanks
  sentences = content.split(/[.!?]+/).reject { |fragment| fragment.strip.empty? }
  paragraphs = content.split(/\n\s*\n/).reject { |fragment| fragment.strip.empty? }

  {
    character_count: content.length,
    word_count: words.length,
    sentence_count: sentences.length,
    paragraph_count: paragraphs.length,
    average_sentence_length: sentences.empty? ? 0.0 : words.length.to_f / sentences.length
  }
end
|
|
78
196
|
end
|
|
79
197
|
end
|
data/lib/legal_summariser.rb
CHANGED
|
@@ -1,6 +1,9 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
require_relative "legal_summariser/version"
|
|
4
|
+
require_relative "legal_summariser/configuration"
|
|
5
|
+
require_relative "legal_summariser/cache"
|
|
6
|
+
require_relative "legal_summariser/performance_monitor"
|
|
4
7
|
require_relative "legal_summariser/document_parser"
|
|
5
8
|
require_relative "legal_summariser/text_extractor"
|
|
6
9
|
require_relative "legal_summariser/summariser"
|
|
@@ -18,34 +21,89 @@ module LegalSummariser
|
|
|
18
21
|
# @param options [Hash] Configuration options
|
|
19
22
|
# @return [Hash] Summary results
|
|
20
23
|
def self.summarise(file_path, options = {})
  # Runs the full analysis pipeline for one document: validation, cache
  # lookup, text extraction, summarisation, clause detection and risk
  # analysis. Each stage is timed through the performance monitor.
  #
  # Returns the result hash, or the Formatter output when options[:format]
  # is given. The cached and the freshly computed result go through the same
  # formatting step, so callers get the same shape on a cache hit as on a
  # cache miss.
  #
  # Raises DocumentNotFoundError when the file does not exist and Error when
  # it exceeds configuration.max_file_size; any failure is logged and
  # re-raised unchanged.
  monitor = performance_monitor
  cache = Cache.new

  # NOTE(review): assumes Cache#get returns the hash in the shape Cache#set
  # received - confirm against the Cache implementation.
  present = lambda do |result|
    options[:format] ? Formatter.format(result, options[:format]) : result
  end

  monitor.start_timer(:total_analysis)

  begin
    # Validate file
    raise DocumentNotFoundError, "File not found: #{file_path}" unless File.exist?(file_path)

    file_size = File.size(file_path)
    if file_size > configuration.max_file_size
      raise Error, "File too large: #{file_size} bytes (max: #{configuration.max_file_size})"
    end

    # Check cache first
    cache_key = cache.cache_key(file_path, options)
    cached_result = cache.get(cache_key)

    if cached_result
      configuration.logger&.info("Using cached result for #{file_path}")
      monitor.end_timer(:total_analysis)
      return present.call(cached_result)
    end

    # Extract text from document
    monitor.start_timer(:text_extraction)
    text = TextExtractor.extract(file_path)
    extraction_time = monitor.end_timer(:text_extraction)

    # Record text statistics
    text_stats = TextExtractor.get_statistics(text)
    monitor.record(:document_word_count, text_stats[:word_count])
    monitor.record(:document_character_count, text_stats[:character_count])

    # Perform analysis components
    monitor.start_timer(:summarisation)
    summary = Summariser.new(text, options).generate
    monitor.end_timer(:summarisation)

    monitor.start_timer(:clause_detection)
    clauses = ClauseDetector.new(text).detect
    monitor.end_timer(:clause_detection)

    monitor.start_timer(:risk_analysis)
    risks = RiskAnalyzer.new(text).analyze
    monitor.end_timer(:risk_analysis)

    result = {
      plain_text: summary[:plain_text],
      key_points: summary[:key_points],
      clauses: clauses,
      risks: risks,
      metadata: {
        document_type: detect_document_type(text),
        word_count: text_stats[:word_count],
        character_count: text_stats[:character_count],
        sentence_count: text_stats[:sentence_count],
        paragraph_count: text_stats[:paragraph_count],
        file_size_bytes: file_size,
        extraction_time_seconds: extraction_time.round(3),
        processed_at: Time.now.strftime("%Y-%m-%dT%H:%M:%S%z"),
        gem_version: VERSION,
        language: configuration.language
      },
      performance: monitor.stats
    }

    # Cache the unformatted result; formatting is applied on the way out.
    cache.set(cache_key, result)

    total_time = monitor.end_timer(:total_analysis)
    configuration.logger&.info("Analysis completed in #{total_time.round(3)}s")

    present.call(result)
  rescue => e
    monitor.end_timer(:total_analysis)
    configuration.logger&.error("Analysis failed: #{e.message}")
    raise
  end
end
|
|
51
109
|
|
|
@@ -53,19 +111,114 @@ module LegalSummariser
|
|
|
53
111
|
# @param text [String] Document text
|
|
54
112
|
# @return [String] Document type
|
|
55
113
|
def self.detect_document_type(text)
  # Scores the text against keyword indicators for each known document type
  # and returns the name of the highest-scoring type as a String.
  # "general_contract" has a base score of 1 and is the result when nothing
  # else scores higher; on a tie the type listed first wins. nil is treated
  # as empty text.
  #
  # Short stems carry a leading \b so they only match at the start of a
  # word: an unanchored "rent" also matches "current" and "different", and
  # "position" matches "disposition", which misclassified unrelated
  # documents.
  text_lower = text.to_s.downcase

  indicators = {
    nda: [
      [/non.?disclosure/, 3], [/\bnda\b/, 2], [/confidential/, 2], [/proprietary/, 1]
    ],
    service_agreement: [
      [/service agreement/, 3], [/terms of service/, 2], [/\btos\b/, 2], [/deliver|provide.*service/, 1]
    ],
    employment_contract: [
      [/employment/, 3], [/employee|employer/, 2], [/\bjob|\bposition/, 2], [/salary|\bwage/, 1]
    ],
    privacy_policy: [
      [/privacy policy/, 3], [/data protection/, 2], [/gdpr|kvkk/, 2], [/personal data/, 1]
    ],
    license_agreement: [
      [/license agreement/, 3], [/licensing/, 2], [/intellectual property/, 1]
    ],
    terms_of_use: [
      [/terms of use/, 3], [/user agreement/, 2], [/website|platform/, 1]
    ],
    purchase_agreement: [
      [/purchase agreement/, 3], [/\bbuy|\bsell|purchase/, 2], [/price|payment/, 1]
    ],
    lease_agreement: [
      [/lease agreement/, 3], [/\brent|tenant|landlord/, 2], [/property|premises/, 1]
    ],
    partnership_agreement: [
      [/partnership agreement/, 3], [/partner|partnership/, 2], [/joint venture/, 1]
    ],
    general_contract: []
  }

  scores = indicators.transform_values do |rules|
    rules.sum { |pattern, weight| text_lower.match?(pattern) ? weight : 0 }
  end
  scores[:general_contract] += 1 # base score

  scores.max_by { |_, score| score }.first.to_s
end
|
|
182
|
+
|
|
183
|
+
# Get analysis statistics
|
|
184
|
+
# @return [Hash] Analysis statistics
|
|
185
|
+
def self.stats
|
|
186
|
+
{
|
|
187
|
+
performance: performance_monitor.stats,
|
|
188
|
+
cache: Cache.new.stats,
|
|
189
|
+
memory: performance_monitor.memory_usage,
|
|
190
|
+
configuration: {
|
|
191
|
+
language: configuration.language,
|
|
192
|
+
max_file_size: configuration.max_file_size,
|
|
193
|
+
caching_enabled: configuration.enable_caching
|
|
194
|
+
}
|
|
195
|
+
}
|
|
196
|
+
end
|
|
197
|
+
|
|
198
|
+
# Reset all statistics and cache
|
|
199
|
+
def self.reset!
|
|
200
|
+
performance_monitor.reset!
|
|
201
|
+
Cache.new.clear!
|
|
202
|
+
end
|
|
203
|
+
|
|
204
|
+
# Batch process multiple documents
|
|
205
|
+
# @param file_paths [Array<String>] Array of file paths
|
|
206
|
+
# @param options [Hash] Processing options
|
|
207
|
+
# @return [Array<Hash>] Array of analysis results
|
|
208
|
+
def self.batch_summarise(file_paths, options = {})
  # Analyzes each file in turn and never aborts the batch: a failure is
  # logged and recorded as { file_path:, success: false, error: } while a
  # success is recorded as { file_path:, success: true, result: }. Results
  # are returned in input order.
  #
  # A single path (or nil) is accepted as well as an array.
  paths = Array(file_paths)

  paths.each_with_index.map do |file_path, index|
    begin
      configuration.logger&.info("Processing file #{index + 1}/#{paths.length}: #{file_path}")
      { file_path: file_path, success: true, result: summarise(file_path, options) }
    rescue => e
      configuration.logger&.error("Failed to process #{file_path}: #{e.message}")
      { file_path: file_path, success: false, error: e.message }
    end
  end
end
|
|
71
224
|
end
|