legal_summariser 0.3.0 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,463 @@
1
+ require 'net/http'
2
+ require 'json'
3
+ require 'uri'
4
+
5
module LegalSummariser
  # Converts legal text into plainer English.
  #
  # The pipeline is rule based: fixed jargon mappings, fixed phrase patterns and
  # optional custom mappings read from the configured cache directory. When the
  # caller opts in (+use_ai_model: true+) and both LEGAL_AI_API_ENDPOINT and
  # LEGAL_AI_API_KEY are set, the rule-based output is additionally sent to a
  # chat-completion style HTTP API.
  class PlainLanguageGenerator
    class ModelError < StandardError; end
    class APIError < StandardError; end
    class ConfigurationError < StandardError; end

    # Legal jargon to plain English mappings
    LEGAL_MAPPINGS = {
      'heretofore' => 'until now',
      'hereinafter' => 'from now on',
      'whereas' => 'since',
      'whereby' => 'by which',
      'pursuant to' => 'according to',
      'notwithstanding' => 'despite',
      'aforementioned' => 'mentioned above',
      'aforestated' => 'stated above',
      'therein' => 'in that',
      'thereof' => 'of that',
      'hereunder' => 'under this',
      'thereunder' => 'under that',
      'herewith' => 'with this',
      'therewith' => 'with that',
      'henceforth' => 'from now on',
      'ipso facto' => 'by the fact itself',
      'inter alia' => 'among other things',
      'prima facie' => 'at first sight',
      'quid pro quo' => 'something for something',
      'vis-à-vis' => 'in relation to',
      'force majeure' => 'unforeseeable circumstances',
      'in perpetuity' => 'forever',
      'ab initio' => 'from the beginning',
      'bona fide' => 'genuine',
      'de facto' => 'in reality',
      'de jure' => 'by law',
      'ex parte' => 'one-sided',
      'pro rata' => 'proportionally',
      'sine qua non' => 'essential requirement'
    }.freeze

    # Complex sentence patterns to simplify
    SENTENCE_PATTERNS = [
      { pattern: /shall be deemed to be/i, replacement: 'is considered' },
      { pattern: /is hereby authorized to/i, replacement: 'can' },
      { pattern: /for the purpose of/i, replacement: 'to' },
      { pattern: /in the event that/i, replacement: 'if' },
      { pattern: /provided that/i, replacement: 'if' },
      { pattern: /subject to the provisions of/i, replacement: 'following the rules in' },
      { pattern: /without prejudice to/i, replacement: 'without affecting' },
      { pattern: /save and except/i, replacement: 'except for' },
      { pattern: /null and void/i, replacement: 'invalid' },
      { pattern: /cease and desist/i, replacement: 'stop' }
    ].freeze

    # Word-bounded, case-insensitive regex for every LEGAL_MAPPINGS entry,
    # compiled once instead of on every call.
    LEGAL_PATTERNS = LEGAL_MAPPINGS.map do |term, plain|
      [/\b#{Regexp.escape(term)}\b/i, plain]
    end.freeze
    private_constant :LEGAL_PATTERNS

    # Characters treated as sentence terminators by all text metrics.
    SENTENCE_DELIMITER = /[.!?]+/.freeze
    private_constant :SENTENCE_DELIMITER

    # File (inside config.cache_dir) that stores mappings written by #fine_tune_model.
    CUSTOM_MAPPINGS_FILENAME = 'custom_legal_mappings.json'
    private_constant :CUSTOM_MAPPINGS_FILENAME

    attr_reader :config, :model_config, :logger

    # @param config [#logger, #cache_dir, nil] configuration object; falls back to
    #   LegalSummariser.configuration when nil
    # @raise [ConfigurationError] when the configuration, its logger or its
    #   cache directory is missing
    def initialize(config = nil)
      @config = config || LegalSummariser.configuration
      # Validate before touching @config so a missing configuration surfaces as
      # ConfigurationError rather than NoMethodError on nil.
      validate_configuration
      @logger = @config.logger
      @model_config = setup_model_configuration
    end

    # Generate a plain language version of legal text.
    #
    # @param text [String, nil] legal text to simplify
    # @param options [Hash] :use_ai_model enables the remote model step,
    #   :model selects the remote model name
    # @return [String, Hash] '' for nil/blank input; otherwise a Hash with
    #   :original_text, :simplified_text, :processing_time, :readability_score,
    #   :complexity_reduction and :metadata
    # @raise [ModelError] when any processing step raises
    def generate(text, options = {})
      return '' if text.nil? || text.strip.empty?

      @logger&.info("Generating plain language for text of length: #{text.length}")

      started_at = monotonic_now

      begin
        simplified_text = process_text_pipeline(text, options)

        duration = monotonic_now - started_at
        @logger&.info("Plain language generation completed in #{duration.round(2)}s")

        {
          original_text: text,
          simplified_text: simplified_text,
          processing_time: duration,
          readability_score: calculate_readability_score(simplified_text),
          complexity_reduction: calculate_complexity_reduction(text, simplified_text),
          metadata: {
            word_count_original: text.split.length,
            word_count_simplified: simplified_text.split.length,
            sentence_count: split_sentences(simplified_text).length,
            avg_sentence_length: calculate_avg_sentence_length(simplified_text)
          }
        }
      rescue StandardError => e
        @logger&.error("Plain language generation failed: #{e.message}")
        raise ModelError, "Failed to generate plain language: #{e.message}"
      end
    end

    # Run #generate over several texts, isolating failures per item.
    #
    # @param texts [Array<String>, nil]
    # @param options [Hash] forwarded to #generate with :batch_index added
    # @return [Array] one entry per input: the #generate result, or a Hash with
    #   :error, :original_text and :batch_index when that item failed
    def generate_batch(texts, options = {})
      return [] if texts.nil? || texts.empty?

      @logger&.info("Processing batch of #{texts.length} texts")

      texts.each_with_index.map do |text, index|
        begin
          generate(text, options.merge(batch_index: index))
        rescue StandardError => e
          @logger&.error("Failed to process text #{index}: #{e.message}")
          { error: e.message, original_text: text, batch_index: index }
        end
      end
    end

    # @return [Hash] :local and :cloud model name lists plus the :recommended model
    def available_models
      {
        local: ['rule_based', 'pattern_matching'],
        cloud: model_config[:available_models] || [],
        recommended: 'rule_based'
      }
    end

    # Derive word mappings from legal/plain text pairs and persist them as JSON
    # in the cache directory (replacing any previously saved file).
    #
    # @param training_data [Array<Hash>] hashes with 'legal'/'plain' (or
    #   :legal/:plain) String entries
    # @param options [Hash] currently unused
    # @return [Boolean] true when the mappings file was written, false on
    #   invalid input or any error while writing
    def fine_tune_model(training_data, options = {})
      return false unless training_data.is_a?(Array) && !training_data.empty?

      @logger&.info("Fine-tuning model with #{training_data.length} training examples")

      # For now, we store custom mappings for rule-based improvement
      mappings_file = custom_mappings_path

      begin
        File.write(mappings_file, JSON.pretty_generate(extract_custom_mappings(training_data)))

        @logger&.info("Custom mappings saved to #{mappings_file}")
        true
      rescue StandardError => e
        @logger&.error("Fine-tuning failed: #{e.message}")
        false
      end
    end

    # Load the mappings saved by #fine_tune_model.
    #
    # @return [Hash] parsed mappings; {} when the file is absent, unreadable,
    #   not valid JSON or not a JSON object
    def load_custom_mappings
      mappings_file = custom_mappings_path
      return {} unless File.exist?(mappings_file)

      parsed = JSON.parse(File.read(mappings_file))
      # Anything but an object cannot be iterated as term => replacement pairs.
      parsed.is_a?(Hash) ? parsed : {}
    rescue StandardError => e
      @logger&.error("Failed to load custom mappings: #{e.message}")
      {}
    end

    private

    # Build the settings used by the optional remote model step.
    def setup_model_configuration
      {
        model_type: 'rule_based', # Default to rule-based for reliability
        api_endpoint: ENV['LEGAL_AI_API_ENDPOINT'],
        api_key: ENV['LEGAL_AI_API_KEY'],
        timeout: 30,
        max_tokens: 2000,
        temperature: 0.3, # Lower temperature for more consistent legal text
        available_models: ['gpt-3.5-turbo', 'claude-3-haiku', 'llama-2-legal']
      }
    end

    # @raise [ConfigurationError] when config, logger or cache_dir is missing
    def validate_configuration
      raise ConfigurationError, "Configuration is required" unless @config
      raise ConfigurationError, "Logger is required" unless @config.logger
      raise ConfigurationError, "Cache directory is required" unless @config.cache_dir
    end

    # Run every simplification step in order and return the cleaned-up text.
    def process_text_pipeline(text, options = {})
      # Step 1: Basic legal jargon replacement
      simplified = replace_legal_jargon(text)

      # Step 2: Sentence pattern simplification
      simplified = simplify_sentence_patterns(simplified)

      # Step 3: Custom mappings from fine-tuning
      simplified = apply_custom_mappings(simplified)

      # Step 4: Advanced AI processing (if available and enabled)
      simplified = process_with_ai_model(simplified, options) if options[:use_ai_model] && model_available?

      # Step 5: Final cleanup and formatting
      cleanup_text(simplified)
    end

    # Replace every LEGAL_MAPPINGS term (whole words, any case) with its plain
    # equivalent, carrying over ALL CAPS / Capitalized form. Returns a new String.
    def replace_legal_jargon(text)
      LEGAL_PATTERNS.reduce(text) do |result, (pattern, plain_term)|
        result.gsub(pattern) { |matched| match_case(matched, plain_term) }
      end
    end

    # Apply SENTENCE_PATTERNS. The case of the matched phrase is carried over so
    # a sentence starting with "In the event that" becomes "If", not "if".
    def simplify_sentence_patterns(text)
      SENTENCE_PATTERNS.reduce(text) do |result, rule|
        result.gsub(rule[:pattern]) { |matched| match_case(matched, rule[:replacement]) }
      end
    end

    # Apply mappings loaded from the custom mappings file (whole words, any case).
    def apply_custom_mappings(text)
      load_custom_mappings.reduce(text.dup) do |result, (legal_term, plain_term)|
        term = legal_term.to_s
        # An empty term would compile to /\b\b/ and match at every word boundary.
        next result if term.strip.empty?

        replacement = plain_term.to_s
        # Block form: the replacement is inserted literally, so "\1" or "\\" in a
        # stored mapping is not interpreted as a back-reference.
        result.gsub(/\b#{Regexp.escape(term)}\b/i) { replacement }
      end
    end

    # Mirror the letter case of +matched+ onto +replacement+: all caps stays all
    # caps, capitalized stays capitalized, anything else uses +replacement+ as is.
    def match_case(matched, replacement)
      if matched == matched.upcase
        replacement.upcase
      elsif matched == matched.capitalize
        replacement.capitalize
      else
        replacement
      end
    end

    # Send +text+ to the remote model; on any failure log a warning and return
    # +text+ unchanged.
    def process_with_ai_model(text, options = {})
      return text unless model_available?

      begin
        extract_simplified_text(call_ai_api(text, options)) || text
      rescue StandardError => e
        @logger&.warn("AI model processing failed, falling back to rule-based: #{e.message}")
        text
      end
    end

    # Pull the simplified text out of a parsed API response. Accepts a top-level
    # 'simplified_text' key or the chat-completion shape
    # (choices[0].message.content) that matches the request #call_ai_api sends.
    #
    # @return [String, nil] nil when no non-blank text is present
    def extract_simplified_text(response)
      return nil unless response.is_a?(Hash)

      candidate = response['simplified_text'] || response.dig('choices', 0, 'message', 'content')
      candidate.is_a?(String) && !candidate.strip.empty? ? candidate : nil
    end

    # POST a chat-completion request to the configured endpoint.
    #
    # @return [Object] the parsed JSON response body
    # @raise [APIError] for a non-HTTP(S) endpoint or a non-200 response
    def call_ai_api(text, options = {})
      uri = URI(model_config[:api_endpoint])
      # URI::HTTPS is a subclass of URI::HTTP, so this admits both schemes only.
      raise APIError, "Unsupported API endpoint: #{model_config[:api_endpoint]}" unless uri.is_a?(URI::HTTP)

      http = Net::HTTP.new(uri.host, uri.port)
      http.use_ssl = uri.scheme == 'https'
      # Bound connection setup as well as the read; only the read was limited before.
      http.open_timeout = model_config[:timeout]
      http.read_timeout = model_config[:timeout]

      request = Net::HTTP::Post.new(uri)
      request['Authorization'] = "Bearer #{model_config[:api_key]}"
      request['Content-Type'] = 'application/json'

      request.body = JSON.generate(
        model: options[:model] || 'gpt-3.5-turbo',
        messages: [
          {
            role: 'system',
            content: 'You are a legal expert specializing in converting complex legal language into plain English while maintaining accuracy and legal meaning.'
          },
          {
            role: 'user',
            content: build_ai_prompt(text, options)
          }
        ],
        max_tokens: model_config[:max_tokens],
        temperature: model_config[:temperature]
      )

      response = http.request(request)

      unless response.code == '200'
        raise APIError, "API request failed with code #{response.code}: #{response.body}"
      end

      JSON.parse(response.body)
    end

    # Build the user prompt sent to the remote model. +_options+ is accepted for
    # call compatibility and not used.
    def build_ai_prompt(text, _options = {})
      <<~PROMPT
        Please convert the following legal text into plain English while maintaining its legal accuracy and meaning:

        Legal Text:
        #{text}

        Requirements:
        - Use simple, everyday language
        - Maintain legal accuracy
        - Keep the same meaning and intent
        - Use shorter sentences where possible
        - Replace legal jargon with common terms
        - Ensure readability for general audience

        Please provide only the simplified version without explanations.
      PROMPT
    end

    # Normalise whitespace and punctuation spacing and make sure the text ends
    # with a sentence terminator.
    def cleanup_text(text)
      # Remove excessive whitespace
      cleaned = text.gsub(/\s+/, ' ').strip

      # Fix punctuation spacing
      cleaned = cleaned.gsub(/\s+([,.;:!?])/, '\1')
      # Insert the missing space in "end.Next", but only when the terminator
      # follows a lowercase letter, closing bracket or quote, so citations such
      # as "U.S.C." are not split into "U. S. C.".
      cleaned = cleaned.gsub(/(?<=[a-z\)\]"'])([.!?])(?=[A-Z])/, '\1 ')

      # Ensure proper sentence endings
      cleaned += '.' unless cleaned.end_with?('.', '!', '?')

      cleaned
    end

    # Simplified Flesch Reading Ease, clamped to 0..100 and rounded to one
    # decimal. Returns 0 when the text has no words or no sentences.
    def calculate_readability_score(text)
      sentence_total = split_sentences(text).length
      word_total = text.split.length
      return 0 if sentence_total.zero? || word_total.zero?

      avg_sentence_length = word_total.to_f / sentence_total
      avg_syllables_per_word = count_syllables(text).to_f / word_total

      score = 206.835 - (1.015 * avg_sentence_length) - (84.6 * avg_syllables_per_word)
      score.clamp(0, 100).round(1)
    end

    # Heuristic syllable count: number of vowel groups. Non-letters are left in
    # place while scanning so a group never spans two words.
    def count_syllables(text)
      text.downcase.scan(/[aeiouy]+/).length
    end

    # Percentage by which the complexity score dropped, never below 0.
    def calculate_complexity_reduction(original, simplified)
      original_complexity = calculate_text_complexity(original)
      return 0 if original_complexity.zero?

      simplified_complexity = calculate_text_complexity(simplified)
      reduction = ((original_complexity - simplified_complexity) / original_complexity.to_f * 100).round(1)
      [0, reduction].max
    end

    # Complexity based on average word length, sentence length and jargon count.
    # Returns 0.0 for text without words.
    def calculate_text_complexity(text)
      words = text.split
      return 0.0 if words.empty?

      # Punctuation-only text has words but no sentences; treating it as one
      # sentence keeps the result finite.
      sentence_total = [split_sentences(text).length, 1].max

      avg_word_length = words.sum(&:length).to_f / words.length
      avg_sentence_length = words.length.to_f / sentence_total

      (avg_word_length * 2) + (avg_sentence_length * 0.5) + (count_legal_jargon(text) * 3)
    end

    # Number of distinct LEGAL_MAPPINGS terms present as whole words, matching
    # the word-boundary rule used by #replace_legal_jargon.
    def count_legal_jargon(text)
      LEGAL_PATTERNS.count { |pattern, _plain| text.match?(pattern) }
    end

    # Average number of words per sentence, rounded to one decimal (0 when the
    # text has no sentences).
    def calculate_avg_sentence_length(text)
      sentences = split_sentences(text)
      return 0 if sentences.empty?

      (sentences.sum { |sentence| sentence.split.length }.to_f / sentences.length).round(1)
    end

    # Split on sentence terminators and drop blank fragments; shared by every
    # metric so they agree on what a sentence is.
    def split_sentences(text)
      text.split(SENTENCE_DELIMITER).map(&:strip).reject(&:empty?)
    end

    # Build a legal-word => plain-word Hash from training examples. Examples that
    # are not Hashes, or lack String 'legal'/'plain' (or :legal/:plain) values,
    # are skipped.
    def extract_custom_mappings(training_data)
      training_data.each_with_object({}) do |example, mappings|
        next unless example.is_a?(Hash)

        legal_text = example['legal'] || example[:legal]
        plain_text = example['plain'] || example[:plain]
        next unless legal_text.is_a?(String) && plain_text.is_a?(String)

        plain_words = plain_text.split

        # This is a simplified extraction - in practice, you'd use more sophisticated NLP
        legal_text.split.each do |legal_word|
          next if legal_word.length < 4 # Skip short words

          plain_words.each do |plain_word|
            next unless similar_context?(legal_word, plain_word, legal_text, plain_text)

            # Later candidates overwrite earlier ones for the same legal word.
            mappings[legal_word.downcase] = plain_word.downcase
          end
        end
      end
    end

    # Rough heuristic: true when both words first occur at relative positions
    # less than 0.2 apart within their respective texts.
    def similar_context?(legal_word, plain_word, legal_text, plain_text)
      legal_index = legal_text.downcase.index(legal_word.downcase)
      plain_index = plain_text.downcase.index(plain_word.downcase)

      return false unless legal_index && plain_index

      legal_position = legal_index.to_f / legal_text.length
      plain_position = plain_index.to_f / plain_text.length

      (legal_position - plain_position).abs < 0.2
    end

    # @return [Boolean] true only when both endpoint and key are set and non-blank
    def model_available?
      %i[api_endpoint api_key].none? { |key| model_config[key].to_s.strip.empty? }
    end

    # Absolute path of the custom mappings file.
    def custom_mappings_path
      File.join(@config.cache_dir, CUSTOM_MAPPINGS_FILENAME)
    end

    # Monotonic clock reading in seconds; unaffected by wall-clock adjustments.
    def monotonic_now
      Process.clock_gettime(Process::CLOCK_MONOTONIC)
    end
  end
end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module LegalSummariser
4
- VERSION = "0.3.0"
4
+ VERSION = "0.3.1"
5
5
  end
metadata CHANGED
@@ -1,10 +1,10 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: legal_summariser
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.3.1
5
5
  platform: ruby
6
6
  authors:
7
- - Legal Summariser Team
7
+ - Ahmet KAHRAMAN
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
@@ -150,11 +150,12 @@ dependencies:
150
150
  - - "~>"
151
151
  - !ruby/object:Gem::Version
152
152
  version: '0.9'
153
- description: A Ruby gem that summarises legal documents, extracts key clauses, flags
154
- risks, and converts legal jargon into plain English. Supports PDF/Word documents
155
- with offline processing capabilities.
153
+ description: Advanced Ruby gem for legal document analysis featuring AI-powered plain
154
+ language generation, multilingual processing (8 languages), PDF annotations, model
155
+ training, risk analysis, and clause detection. Supports PDF, DOCX, RTF formats with
156
+ comprehensive CLI tools.
156
157
  email:
157
- - info@legal-summariser.com
158
+ - ahmetxhero@gmail.com
158
159
  executables:
159
160
  - legal_summariser
160
161
  extensions: []
@@ -176,19 +177,26 @@ files:
176
177
  - lib/legal_summariser/configuration.rb
177
178
  - lib/legal_summariser/document_parser.rb
178
179
  - lib/legal_summariser/formatter.rb
180
+ - lib/legal_summariser/model_trainer.rb
181
+ - lib/legal_summariser/multilingual_processor.rb
182
+ - lib/legal_summariser/pdf_annotator.rb
179
183
  - lib/legal_summariser/performance_monitor.rb
184
+ - lib/legal_summariser/plain_language_generator.rb
180
185
  - lib/legal_summariser/risk_analyzer.rb
181
186
  - lib/legal_summariser/summariser.rb
182
187
  - lib/legal_summariser/text_extractor.rb
183
188
  - lib/legal_summariser/version.rb
184
- homepage: https://github.com/legal-summariser/legal_summariser
189
+ homepage: https://github.com/ahmetxhero/legal-summariser
185
190
  licenses:
186
191
  - MIT
187
192
  metadata:
188
193
  allowed_push_host: https://rubygems.org
189
- homepage_uri: https://github.com/legal-summariser/legal_summariser
190
- source_code_uri: https://github.com/legal-summariser/legal_summariser
191
- changelog_uri: https://github.com/legal-summariser/legal_summariser/blob/main/CHANGELOG.md
194
+ homepage_uri: https://github.com/ahmetxhero/legal-summariser
195
+ source_code_uri: https://github.com/ahmetxhero/legal-summariser
196
+ changelog_uri: https://github.com/ahmetxhero/legal-summariser/blob/main/CHANGELOG.md
197
+ documentation_uri: https://github.com/ahmetxhero/legal-summariser#readme
198
+ bug_tracker_uri: https://github.com/ahmetxhero/legal-summariser/issues
199
+ wiki_uri: https://github.com/ahmetxhero/legal-summariser/wiki
192
200
  post_install_message:
193
201
  rdoc_options: []
194
202
  require_paths:
@@ -207,5 +215,5 @@ requirements: []
207
215
  rubygems_version: 3.0.3.1
208
216
  signing_key:
209
217
  specification_version: 4
210
- summary: AI-powered legal document summarisation and analysis toolkit
218
+ summary: AI-powered legal document analysis with multilingual support and PDF annotations
211
219
  test_files: []