legal_summariser 0.1.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,22 +2,38 @@
2
2
 
3
3
  require 'pdf-reader'
4
4
  require 'docx'
5
+ require 'logger'
5
6
 
6
7
  module LegalSummariser
7
8
  class TextExtractor
9
+ # Logger for debugging and monitoring
10
+ def self.logger
11
+ @logger ||= Logger.new(STDOUT, level: Logger::WARN)
12
+ end
13
+
14
+ def self.logger=(logger)
15
+ @logger = logger
16
+ end
8
17
  # Extract text from various document formats
9
18
  # @param file_path [String] Path to the document
10
19
  # @return [String] Extracted text
11
20
  def self.extract(file_path)
21
+ raise DocumentNotFoundError, "File not found: #{file_path}" unless File.exist?(file_path)
22
+ raise DocumentNotFoundError, "File is empty: #{file_path}" if File.zero?(file_path)
23
+
24
+ logger.info "Extracting text from: #{file_path}"
25
+
12
26
  case File.extname(file_path).downcase
13
27
  when '.pdf'
14
28
  extract_from_pdf(file_path)
15
29
  when '.docx'
16
30
  extract_from_docx(file_path)
17
- when '.txt'
18
- File.read(file_path, encoding: 'UTF-8')
31
+ when '.txt', '.text'
32
+ extract_from_text(file_path)
33
+ when '.rtf'
34
+ extract_from_rtf(file_path)
19
35
  else
20
- raise UnsupportedFormatError, "Unsupported file format: #{File.extname(file_path)}"
36
+ raise UnsupportedFormatError, "Unsupported file format: #{File.extname(file_path)}. Supported formats: .pdf, .docx, .txt, .rtf"
21
37
  end
22
38
  end
23
39
 
@@ -27,15 +43,30 @@ module LegalSummariser
27
43
  # @param file_path [String] Path to PDF file
28
44
  # @return [String] Extracted text
29
45
  def self.extract_from_pdf(file_path)
46
+ logger.debug "Processing PDF: #{file_path}"
47
+
30
48
  reader = PDF::Reader.new(file_path)
31
49
  text = ""
50
+ page_count = 0
32
51
 
33
52
  reader.pages.each do |page|
34
- text += page.text + "\n"
53
+ page_count += 1
54
+ page_text = page.text
55
+ text += page_text + "\n" if page_text && !page_text.strip.empty?
56
+ end
57
+
58
+ logger.info "Extracted text from #{page_count} PDF pages"
59
+
60
+ if text.strip.empty?
61
+ logger.warn "No text extracted from PDF - file may be image-based or encrypted"
62
+ raise Error, "No extractable text found in PDF. File may be image-based or password-protected."
35
63
  end
36
64
 
37
- # Clean up common PDF artifacts
38
65
  clean_text(text)
66
+ rescue PDF::Reader::MalformedPDFError => e
67
+ raise Error, "Malformed PDF file: #{e.message}"
68
+ rescue PDF::Reader::UnsupportedFeatureError => e
69
+ raise Error, "PDF contains unsupported features: #{e.message}"
39
70
  rescue => e
40
71
  raise Error, "Failed to extract text from PDF: #{e.message}"
41
72
  end
@@ -44,28 +75,98 @@ module LegalSummariser
44
75
  # @param file_path [String] Path to DOCX file
45
76
  # @return [String] Extracted text
46
77
  def self.extract_from_docx(file_path)
78
+ logger.debug "Processing DOCX: #{file_path}"
79
+
47
80
  doc = Docx::Document.open(file_path)
48
81
  text = ""
82
+ paragraph_count = 0
49
83
 
50
84
  doc.paragraphs.each do |paragraph|
51
- text += paragraph.text + "\n"
85
+ paragraph_text = paragraph.text
86
+ if paragraph_text && !paragraph_text.strip.empty?
87
+ text += paragraph_text + "\n"
88
+ paragraph_count += 1
89
+ end
90
+ end
91
+
92
+ # Also extract text from tables if present
93
+ doc.tables.each do |table|
94
+ table.rows.each do |row|
95
+ row.cells.each do |cell|
96
+ cell_text = cell.text
97
+ text += cell_text + " " if cell_text && !cell_text.strip.empty?
98
+ end
99
+ text += "\n"
100
+ end
101
+ end
102
+
103
+ logger.info "Extracted text from #{paragraph_count} DOCX paragraphs"
104
+
105
+ if text.strip.empty?
106
+ raise Error, "No text content found in DOCX file"
52
107
  end
53
108
 
54
109
  clean_text(text)
110
+ rescue Zip::Error => e
111
+ raise Error, "Invalid DOCX file format: #{e.message}"
55
112
  rescue => e
56
113
  raise Error, "Failed to extract text from DOCX: #{e.message}"
57
114
  end
58
115
 
116
+ # Extract text from plain text files
117
+ # @param file_path [String] Path to text file
118
+ # @return [String] Extracted text
119
+ def self.extract_from_text(file_path)
120
+ logger.debug "Processing text file: #{file_path}"
121
+
122
+ # Try different encodings
123
+ encodings = ['UTF-8', 'ISO-8859-1', 'Windows-1252']
124
+
125
+ encodings.each do |encoding|
126
+ begin
127
+ text = File.read(file_path, encoding: encoding)
128
+ logger.info "Successfully read text file with #{encoding} encoding"
129
+ return clean_text(text)
130
+ rescue Encoding::InvalidByteSequenceError, Encoding::UndefinedConversionError
131
+ logger.debug "Failed to read with #{encoding} encoding, trying next"
132
+ next
133
+ end
134
+ end
135
+
136
+ raise Error, "Unable to read text file with supported encodings"
137
+ end
138
+
139
+ # Extract text from RTF files (basic support)
140
+ # @param file_path [String] Path to RTF file
141
+ # @return [String] Extracted text
142
+ def self.extract_from_rtf(file_path)
143
+ logger.debug "Processing RTF: #{file_path}"
144
+
145
+ content = File.read(file_path, encoding: 'UTF-8')
146
+
147
+ # Basic RTF parsing - remove RTF control codes
148
+ text = content.gsub(/\{[^}]*\}/, '') # Remove RTF groups
149
+ text = text.gsub(/\\[a-z]+\d*\s?/, '') # Remove RTF commands
150
+ text = text.gsub(/\\[^a-z]/, '') # Remove RTF escape sequences
151
+
152
+ clean_text(text)
153
+ rescue => e
154
+ raise Error, "Failed to extract text from RTF: #{e.message}"
155
+ end
156
+
59
157
  # Clean extracted text
60
158
  # @param text [String] Raw extracted text
61
159
  # @return [String] Cleaned text
62
160
  def self.clean_text(text)
161
+ return "" if text.nil? || text.empty?
162
+
63
163
  # Normalize line breaks first
64
164
  text = text.gsub(/\r\n?/, "\n")
65
165
 
66
- # Remove common PDF artifacts
166
+ # Remove common document artifacts
67
167
  text = text.gsub(/\f/, '') # Form feed characters
68
168
  text = text.gsub(/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]/, '') # Control characters
169
+ text = text.gsub(/\u00A0/, ' ') # Non-breaking spaces
69
170
 
70
171
  # Remove excessive whitespace but preserve line breaks
71
172
  text = text.gsub(/[ \t]+/, ' ')
@@ -73,7 +174,24 @@ module LegalSummariser
73
174
  # Remove excessive newlines
74
175
  text = text.gsub(/\n{3,}/, "\n\n")
75
176
 
177
+ # Remove leading/trailing whitespace from each line
178
+ text = text.split("\n").map(&:strip).join("\n")
179
+
180
+ # Remove empty lines at start and end
76
181
  text.strip
77
182
  end
183
+
184
+ # Get document statistics
185
+ # @param text [String] Document text
186
+ # @return [Hash] Document statistics
187
+ def self.get_statistics(text)
188
+ {
189
+ character_count: text.length,
190
+ word_count: text.split(/\s+/).length,
191
+ sentence_count: text.split(/[.!?]+/).length,
192
+ paragraph_count: text.split(/\n\s*\n/).length,
193
+ average_sentence_length: text.split(/\s+/).length.to_f / text.split(/[.!?]+/).length
194
+ }
195
+ end
78
196
  end
79
197
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module LegalSummariser
4
- VERSION = "0.1.0"
4
+ VERSION = "0.3.0"
5
5
  end
@@ -1,12 +1,19 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require_relative "legal_summariser/version"
4
- require_relative "legal_summariser/document_parser"
5
- require_relative "legal_summariser/text_extractor"
6
- require_relative "legal_summariser/summariser"
7
- require_relative "legal_summariser/clause_detector"
8
- require_relative "legal_summariser/risk_analyzer"
9
- require_relative "legal_summariser/formatter"
3
+ require_relative 'legal_summariser/version'
4
+ require_relative 'legal_summariser/text_extractor'
5
+ require_relative 'legal_summariser/summariser'
6
+ require_relative 'legal_summariser/clause_detector'
7
+ require_relative 'legal_summariser/risk_analyzer'
8
+ require_relative 'legal_summariser/formatter'
9
+ require_relative 'legal_summariser/document_parser'
10
+ require_relative 'legal_summariser/configuration'
11
+ require_relative 'legal_summariser/cache'
12
+ require_relative 'legal_summariser/performance_monitor'
13
+ require_relative 'legal_summariser/plain_language_generator'
14
+ require_relative 'legal_summariser/model_trainer'
15
+ require_relative 'legal_summariser/multilingual_processor'
16
+ require_relative 'legal_summariser/pdf_annotator'
10
17
 
11
18
  module LegalSummariser
12
19
  class Error < StandardError; end
@@ -18,34 +25,93 @@ module LegalSummariser
18
25
  # @param options [Hash] Configuration options
19
26
  # @return [Hash] Summary results
20
27
  def self.summarise(file_path, options = {})
21
- raise DocumentNotFoundError, "File not found: #{file_path}" unless File.exist?(file_path)
22
-
23
- # Extract text from document
24
- text = TextExtractor.extract(file_path)
28
+ monitor = performance_monitor
29
+ cache = Cache.new
25
30
 
26
- # Perform analysis
27
- summary = Summariser.new(text, options).generate
28
- clauses = ClauseDetector.new(text).detect
29
- risks = RiskAnalyzer.new(text).analyze
31
+ monitor.start_timer(:total_analysis)
30
32
 
31
- # Format results
32
- result = {
33
- plain_text: summary[:plain_text],
34
- key_points: summary[:key_points],
35
- clauses: clauses,
36
- risks: risks,
37
- metadata: {
33
+ begin
34
+ # Validate file
35
+ raise DocumentNotFoundError, "File not found: #{file_path}" unless File.exist?(file_path)
36
+
37
+ file_size = File.size(file_path)
38
+ raise Error, "File too large: #{file_size} bytes (max: #{configuration.max_file_size})" if file_size > configuration.max_file_size
39
+
40
+ # Check cache first
41
+ cache_key = cache.cache_key(file_path, options)
42
+ cached_result = cache.get(cache_key)
43
+
44
+ if cached_result
45
+ configuration.logger&.info("Using cached result for #{file_path}")
46
+ monitor.end_timer(:total_analysis)
47
+ return cached_result
48
+ end
49
+
50
+ # Extract text from document
51
+ monitor.start_timer(:text_extraction)
52
+ text = TextExtractor.extract(file_path)
53
+ extraction_time = monitor.end_timer(:text_extraction)
54
+
55
+ # Record text statistics
56
+ text_stats = TextExtractor.get_statistics(text)
57
+ monitor.record(:document_word_count, text_stats[:word_count])
58
+ monitor.record(:document_character_count, text_stats[:character_count])
59
+
60
+ # Perform analysis components
61
+ monitor.start_timer(:summarisation)
62
+ summary = Summariser.new(text, options).generate
63
+ monitor.end_timer(:summarisation)
64
+
65
+ monitor.start_timer(:clause_detection)
66
+ clauses = ClauseDetector.new(text).detect
67
+ monitor.end_timer(:clause_detection)
68
+
69
+ monitor.start_timer(:risk_analysis)
70
+ risks = RiskAnalyzer.new(text).analyze
71
+ monitor.end_timer(:risk_analysis)
72
+
73
+ # Format results
74
+ result = {
75
+ file_path: file_path,
38
76
  document_type: detect_document_type(text),
39
- word_count: text.split.length,
40
- processed_at: Time.now.strftime("%Y-%m-%dT%H:%M:%S%z")
77
+ processing_time: monitor.end_timer(:total_analysis),
78
+ plain_text: summary[:plain_text],
79
+ key_points: summary[:key_points],
80
+ clauses: clauses,
81
+ risks: risks,
82
+ metadata: {
83
+ file_size: File.size(file_path),
84
+ word_count: text_stats[:word_count],
85
+ character_count: text_stats[:character_count],
86
+ sentence_count: text_stats[:sentence_count],
87
+ paragraph_count: text_stats[:paragraph_count],
88
+ file_size_bytes: file_size,
89
+ extraction_time_seconds: extraction_time.round(3),
90
+ processed_at: Time.now.strftime("%Y-%m-%dT%H:%M:%S%z"),
91
+ gem_version: VERSION,
92
+ language: configuration.language,
93
+ document_type: detect_document_type(text)
94
+ },
95
+ performance: monitor.stats
41
96
  }
42
- }
43
-
44
- # Apply formatting if requested
45
- if options[:format]
46
- Formatter.format(result, options[:format])
47
- else
48
- result
97
+
98
+ # Cache the result
99
+ cache.set(cache_key, result)
100
+
101
+ total_time = monitor.end_timer(:total_analysis)
102
+ configuration.logger&.info("Analysis completed in #{total_time.round(3)}s")
103
+
104
+ # Apply formatting if requested
105
+ if options[:format]
106
+ Formatter.format(result, options[:format])
107
+ else
108
+ result
109
+ end
110
+
111
+ rescue => e
112
+ monitor.end_timer(:total_analysis)
113
+ configuration.logger&.error("Analysis failed: #{e.message}")
114
+ raise
49
115
  end
50
116
  end
51
117
 
@@ -53,19 +119,114 @@ module LegalSummariser
53
119
  # @param text [String] Document text
54
120
  # @return [String] Document type
55
121
  def self.detect_document_type(text)
56
- case text.downcase
57
- when /non.?disclosure|nda|confidentiality/
58
- "nda"
59
- when /service agreement|terms of service|tos/
60
- "service_agreement"
61
- when /employment|job|position/
62
- "employment_contract"
63
- when /privacy policy|data protection|gdpr|kvkk/
64
- "privacy_policy"
65
- when /license|licensing/
66
- "license_agreement"
67
- else
68
- "general_contract"
122
+ text_lower = text.downcase
123
+
124
+ # Score different document types
125
+ scores = {
126
+ nda: 0,
127
+ service_agreement: 0,
128
+ employment_contract: 0,
129
+ privacy_policy: 0,
130
+ license_agreement: 0,
131
+ terms_of_use: 0,
132
+ purchase_agreement: 0,
133
+ lease_agreement: 0,
134
+ partnership_agreement: 0,
135
+ general_contract: 1 # Base score
136
+ }
137
+
138
+ # NDA indicators
139
+ scores[:nda] += 3 if text_lower.match?(/non.?disclosure/)
140
+ scores[:nda] += 2 if text_lower.match?(/\bnda\b/)
141
+ scores[:nda] += 2 if text_lower.match?(/confidential/)
142
+ scores[:nda] += 1 if text_lower.match?(/proprietary/)
143
+
144
+ # Service agreement indicators
145
+ scores[:service_agreement] += 3 if text_lower.match?(/service agreement/)
146
+ scores[:service_agreement] += 2 if text_lower.match?(/terms of service/)
147
+ scores[:service_agreement] += 2 if text_lower.match?(/\btos\b/)
148
+ scores[:service_agreement] += 1 if text_lower.match?(/deliver|provide.*service/)
149
+
150
+ # Employment indicators
151
+ scores[:employment_contract] += 3 if text_lower.match?(/employment/)
152
+ scores[:employment_contract] += 2 if text_lower.match?(/employee|employer/)
153
+ scores[:employment_contract] += 2 if text_lower.match?(/job|position/)
154
+ scores[:employment_contract] += 1 if text_lower.match?(/salary|wage/)
155
+
156
+ # Privacy policy indicators
157
+ scores[:privacy_policy] += 3 if text_lower.match?(/privacy policy/)
158
+ scores[:privacy_policy] += 2 if text_lower.match?(/data protection/)
159
+ scores[:privacy_policy] += 2 if text_lower.match?(/gdpr|kvkk/)
160
+ scores[:privacy_policy] += 1 if text_lower.match?(/personal data/)
161
+
162
+ # License agreement indicators
163
+ scores[:license_agreement] += 3 if text_lower.match?(/license agreement/)
164
+ scores[:license_agreement] += 2 if text_lower.match?(/licensing/)
165
+ scores[:license_agreement] += 1 if text_lower.match?(/intellectual property/)
166
+
167
+ # Terms of use indicators
168
+ scores[:terms_of_use] += 3 if text_lower.match?(/terms of use/)
169
+ scores[:terms_of_use] += 2 if text_lower.match?(/user agreement/)
170
+ scores[:terms_of_use] += 1 if text_lower.match?(/website|platform/)
171
+
172
+ # Purchase agreement indicators
173
+ scores[:purchase_agreement] += 3 if text_lower.match?(/purchase agreement/)
174
+ scores[:purchase_agreement] += 2 if text_lower.match?(/buy|sell|purchase/)
175
+ scores[:purchase_agreement] += 1 if text_lower.match?(/price|payment/)
176
+
177
+ # Lease agreement indicators
178
+ scores[:lease_agreement] += 3 if text_lower.match?(/lease agreement/)
179
+ scores[:lease_agreement] += 2 if text_lower.match?(/rent|tenant|landlord/)
180
+ scores[:lease_agreement] += 1 if text_lower.match?(/property|premises/)
181
+
182
+ # Partnership agreement indicators
183
+ scores[:partnership_agreement] += 3 if text_lower.match?(/partnership agreement/)
184
+ scores[:partnership_agreement] += 2 if text_lower.match?(/partner|partnership/)
185
+ scores[:partnership_agreement] += 1 if text_lower.match?(/joint venture/)
186
+
187
+ # Return the type with highest score
188
+ scores.max_by { |_, score| score }[0].to_s
189
+ end
190
+
191
+ # Get analysis statistics
192
+ # @return [Hash] Analysis statistics
193
+ def self.stats
194
+ {
195
+ performance: performance_monitor.stats,
196
+ cache: Cache.new.stats,
197
+ memory: performance_monitor.memory_usage,
198
+ configuration: {
199
+ language: configuration.language,
200
+ max_file_size: configuration.max_file_size,
201
+ caching_enabled: configuration.enable_caching
202
+ }
203
+ }
204
+ end
205
+
206
+ # Reset all statistics and cache
207
+ def self.reset!
208
+ performance_monitor.reset!
209
+ Cache.new.clear!
210
+ end
211
+
212
+ # Batch process multiple documents
213
+ # @param file_paths [Array<String>] Array of file paths
214
+ # @param options [Hash] Processing options
215
+ # @return [Array<Hash>] Array of analysis results
216
+ def self.batch_summarise(file_paths, options = {})
217
+ results = []
218
+
219
+ file_paths.each_with_index do |file_path, index|
220
+ begin
221
+ configuration.logger&.info("Processing file #{index + 1}/#{file_paths.length}: #{file_path}")
222
+ result = summarise(file_path, options)
223
+ results << { file_path: file_path, success: true, result: result }
224
+ rescue => e
225
+ configuration.logger&.error("Failed to process #{file_path}: #{e.message}")
226
+ results << { file_path: file_path, success: false, error: e.message }
227
+ end
69
228
  end
229
+
230
+ results
70
231
  end
71
232
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: legal_summariser
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Legal Summariser Team
@@ -162,14 +162,21 @@ extra_rdoc_files: []
162
162
  files:
163
163
  - ".rspec"
164
164
  - CHANGELOG.md
165
+ - CONTRIBUTING.md
165
166
  - Gemfile
166
167
  - README.md
167
168
  - Rakefile
169
+ - examples/advanced_configuration.rb
170
+ - examples/basic_usage.rb
171
+ - examples/batch_processing.rb
168
172
  - exe/legal_summariser
169
173
  - lib/legal_summariser.rb
174
+ - lib/legal_summariser/cache.rb
170
175
  - lib/legal_summariser/clause_detector.rb
176
+ - lib/legal_summariser/configuration.rb
171
177
  - lib/legal_summariser/document_parser.rb
172
178
  - lib/legal_summariser/formatter.rb
179
+ - lib/legal_summariser/performance_monitor.rb
173
180
  - lib/legal_summariser/risk_analyzer.rb
174
181
  - lib/legal_summariser/summariser.rb
175
182
  - lib/legal_summariser/text_extractor.rb