legal_summariser 0.3.0 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,601 @@
require 'cgi'
require 'digest'
require 'fileutils'
require 'json'
require 'time'

module LegalSummariser
  # Builds annotated copies of legal PDFs from analysis output and keeps a JSON
  # sidecar ("metadata") file per PDF inside the configured cache directory.
  #
  # PDF reading/writing is still placeholder code: the "annotated" PDF is a byte
  # copy of the input, text positions are simulated, and every PDF is assumed to
  # have DEFAULT_PAGE_COUNT pages. The annotation data itself lives in the
  # sidecar file, which is what the extraction, report and statistics methods read.
  class PDFAnnotator
    class AnnotationError < StandardError; end
    class PDFError < StandardError; end
    class UnsupportedFormatError < StandardError; end

    # Default colour, opacity and description for each supported annotation type.
    ANNOTATION_TYPES = {
      highlight: {
        color: '#FFFF00',
        opacity: 0.3,
        description: 'Highlighted text'
      },
      note: {
        color: '#FFA500',
        opacity: 0.8,
        description: 'Sticky note annotation'
      },
      warning: {
        color: '#FF6B6B',
        opacity: 0.5,
        description: 'Warning or risk indicator'
      },
      important: {
        color: '#4ECDC4',
        opacity: 0.4,
        description: 'Important clause or section'
      },
      question: {
        color: '#95E1D3',
        opacity: 0.4,
        description: 'Question or clarification needed'
      },
      summary: {
        color: '#A8E6CF',
        opacity: 0.3,
        description: 'Summary or key point'
      }
    }.freeze

    # Colour used for risk annotations, keyed by risk level.
    RISK_COLORS = {
      high: '#FF4757',
      medium: '#FFA502',
      low: '#2ED573',
      info: '#3742FA'
    }.freeze

    # Keys of the risks section mapped to the risk level they represent.
    RISK_BUCKETS = { high_risks: :high, medium_risks: :medium, low_risks: :low }.freeze
    # Annotation fields whose values are symbols in memory but strings in JSON.
    SYMBOL_FIELDS = %i[type risk_level clause_type].freeze
    # Page count assumed for every PDF until real PDF parsing is implemented.
    DEFAULT_PAGE_COUNT = 10
    private_constant :RISK_BUCKETS, :SYMBOL_FIELDS, :DEFAULT_PAGE_COUNT

    attr_reader :config, :logger, :annotations_dir

    # @param config [#logger, #cache_dir, nil] configuration object; falls back
    #   to LegalSummariser.configuration when nil
    def initialize(config = nil)
      @config = config || LegalSummariser.configuration
      @logger = @config.logger
      @annotations_dir = File.join(@config.cache_dir, 'pdf_annotations')

      setup_directories
    end

    # Create an annotated PDF from analysis results and write its metadata file.
    #
    # @param pdf_path [String] existing input file ending in .pdf
    # @param analysis_results [Hash] may contain :summary, :risks, :clauses,
    #   :plain_language sections
    # @param output_path [String] where the annotated PDF is written
    # @param options [Hash] forwarded to the (placeholder) PDF helpers
    # @return [Hash] input/output paths, annotation counts and metadata path
    # @raise [PDFError] if pdf_path is missing or not a .pdf
    # @raise [AnnotationError] if any later step fails
    def create_annotated_pdf(pdf_path, analysis_results, output_path, options = {})
      validate_pdf_path(pdf_path)

      @logger&.info("Creating annotated PDF from #{pdf_path}")

      begin
        text_positions = extract_text_positions(pdf_path)
        annotations = generate_annotations_from_analysis(analysis_results, text_positions, options)
        annotated_pdf_data = create_pdf_with_annotations(pdf_path, annotations, options)

        File.write(output_path, annotated_pdf_data, mode: 'wb')
        save_annotation_metadata(output_path, annotations, analysis_results)

        @logger&.info("Annotated PDF created: #{output_path}")

        {
          input_pdf: pdf_path,
          output_pdf: output_path,
          annotations_count: annotations.length,
          annotation_types: count_by(annotations, :type),
          metadata_file: get_metadata_path(output_path)
        }
      rescue StandardError => e
        @logger&.error("PDF annotation failed: #{e.message}")
        raise AnnotationError, "Failed to create annotated PDF: #{e.message}"
      end
    end

    # Add caller-supplied annotations to a PDF.
    #
    # @param pdf_path [String] existing input file ending in .pdf
    # @param custom_annotations [Array<Hash>] each needs :type, :text and :page
    # @param output_path [String] where the annotated PDF is written
    # @param options [Hash] forwarded to the (placeholder) PDF helpers
    # @return [Hash] input/output paths and number of annotations written
    # @raise [PDFError] if pdf_path is missing or not a .pdf
    # @raise [AnnotationError] if the annotations are invalid or a step fails
    def add_custom_annotations(pdf_path, custom_annotations, output_path, options = {})
      validate_pdf_path(pdf_path)
      validate_annotations(custom_annotations)

      @logger&.info("Adding #{custom_annotations.length} custom annotations to PDF")

      begin
        processed_annotations = process_custom_annotations(custom_annotations, options)
        annotated_pdf_data = create_pdf_with_annotations(pdf_path, processed_annotations, options)

        File.write(output_path, annotated_pdf_data, mode: 'wb')
        save_annotation_metadata(output_path, processed_annotations, { custom: true })

        {
          input_pdf: pdf_path,
          output_pdf: output_path,
          custom_annotations: processed_annotations.length
        }
      rescue StandardError => e
        @logger&.error("Custom annotation failed: #{e.message}")
        raise AnnotationError, "Failed to add custom annotations: #{e.message}"
      end
    end

    # Read the annotations recorded for a PDF.
    #
    # The metadata file is preferred; without one the (placeholder) direct PDF
    # extraction is used. Hashes come back with symbol keys, and the :type,
    # :risk_level and :clause_type values are turned back into symbols so they
    # match what the annotation builders produced before JSON serialisation.
    #
    # @param pdf_path [String] existing file ending in .pdf
    # @return [Array<Hash>]
    # @raise [PDFError] if pdf_path is missing or not a .pdf
    # @raise [AnnotationError] if the metadata cannot be read or parsed
    def extract_annotations(pdf_path)
      validate_pdf_path(pdf_path)

      @logger&.info("Extracting annotations from #{pdf_path}")

      begin
        metadata_path = get_metadata_path(pdf_path)
        return extract_annotations_from_pdf(pdf_path) unless File.exist?(metadata_path)

        metadata = JSON.parse(File.read(metadata_path), symbolize_names: true)
        (metadata[:annotations] || []).map { |annotation| normalize_annotation(annotation) }
      rescue StandardError => e
        @logger&.error("Annotation extraction failed: #{e.message}")
        raise AnnotationError, "Failed to extract annotations: #{e.message}"
      end
    end

    # Render a report of a PDF's annotations.
    #
    # @param pdf_path [String]
    # @param format [Symbol] :json, :markdown or :html
    # @return [String]
    # @raise [UnsupportedFormatError] for any other format
    def generate_annotation_report(pdf_path, format = :json)
      annotations = extract_annotations(pdf_path)

      case format
      when :json
        generate_json_report(annotations)
      when :markdown
        generate_markdown_report(annotations)
      when :html
        generate_html_report(annotations)
      else
        raise UnsupportedFormatError, "Unsupported report format: #{format}"
      end
    end

    # Merge several annotated PDFs, shifting each annotation's page number by
    # the page counts of the PDFs that precede it.
    #
    # @param pdf_paths [Array<String>]
    # @param output_path [String]
    # @param options [Hash] forwarded to the (placeholder) merge helper
    # @return [Hash] output path, number of sources and total annotations
    # @raise [AnnotationError] if validation, extraction or merging fails
    def merge_annotated_pdfs(pdf_paths, output_path, options = {})
      @logger&.info("Merging #{pdf_paths.length} annotated PDFs")

      begin
        merged_annotations = []
        page_offset = 0

        pdf_paths.each do |pdf_path|
          validate_pdf_path(pdf_path)

          extract_annotations(pdf_path).each do |annotation|
            annotation[:page] += page_offset if annotation[:page].is_a?(Numeric)
            annotation[:source_pdf] = File.basename(pdf_path)
            merged_annotations << annotation
          end

          page_offset += get_pdf_page_count(pdf_path)
        end

        create_merged_pdf(pdf_paths, output_path, merged_annotations, options)

        {
          merged_pdf: output_path,
          source_pdfs: pdf_paths.length,
          total_annotations: merged_annotations.length
        }
      rescue StandardError => e
        @logger&.error("PDF merging failed: #{e.message}")
        raise AnnotationError, "Failed to merge annotated PDFs: #{e.message}"
      end
    end

    # Summarise the annotations recorded for a PDF.
    #
    # @param pdf_path [String]
    # @return [Hash] totals, per-type/risk/page counts, coverage and summary
    def get_annotation_statistics(pdf_path)
      annotations = extract_annotations(pdf_path)

      {
        total_annotations: annotations.length,
        by_type: count_by(annotations, :type),
        by_risk_level: count_by(annotations.select { |a| a[:risk_level] }, :risk_level),
        by_page: count_by(annotations, :page),
        coverage: calculate_annotation_coverage(annotations),
        summary: generate_annotation_summary(annotations)
      }
    end

    private

    # mkdir_p is a no-op when the directory already exists.
    def setup_directories
      FileUtils.mkdir_p(@annotations_dir)
    end

    # Raises PDFError unless the path exists and has a .pdf suffix (any case).
    def validate_pdf_path(pdf_path)
      path = pdf_path.to_s # tolerate nil and Pathname
      raise PDFError, "PDF file not found: #{pdf_path}" unless File.exist?(path)
      raise PDFError, "Invalid PDF file: #{pdf_path}" unless path.downcase.end_with?('.pdf')
    end

    # Raises AnnotationError unless given an Array of Hashes that each carry
    # the :type, :text and :page keys.
    def validate_annotations(annotations)
      raise AnnotationError, "Annotations must be an array" unless annotations.is_a?(Array)

      required_keys = %i[type text page]

      annotations.each_with_index do |annotation, index|
        raise AnnotationError, "Annotation #{index} must be a hash" unless annotation.is_a?(Hash)

        missing_keys = required_keys - annotation.keys
        next if missing_keys.empty?

        raise AnnotationError, "Annotation #{index} missing keys: #{missing_keys.join(', ')}"
      end
    end

    # Placeholder: returns one simulated page with one text block. A real
    # implementation would read positions with a PDF library.
    def extract_text_positions(pdf_path)
      @logger&.info("Extracting text positions from PDF (placeholder)")

      {
        pages: [
          {
            page_number: 1,
            width: 612,
            height: 792,
            text_blocks: [
              {
                text: "Sample contract text",
                x: 72,
                y: 720,
                width: 200,
                height: 20
              }
            ]
          }
        ]
      }
    end

    # Collects the annotations for every section present in the analysis, in
    # the order summary, risks, clauses, plain language.
    def generate_annotations_from_analysis(analysis_results, text_positions, options = {})
      annotations = []

      summary = analysis_results[:summary]
      annotations.concat(create_summary_annotations(summary, text_positions)) if summary

      risks = analysis_results[:risks]
      annotations.concat(create_risk_annotations(risks, text_positions)) if risks

      clauses = analysis_results[:clauses]
      annotations.concat(create_clause_annotations(clauses, text_positions)) if clauses

      plain_language = analysis_results[:plain_language]
      annotations.concat(create_plain_language_annotations(plain_language, text_positions)) if plain_language

      annotations
    end

    # One :summary annotation per key point. Page is fixed at 1 until real
    # text matching exists.
    def create_summary_annotations(summary_data, text_positions)
      return [] unless summary_data.is_a?(Hash)

      style = ANNOTATION_TYPES[:summary]

      (summary_data[:key_points] || []).each_with_index.map do |point, index|
        {
          type: :summary,
          text: point,
          note: "Key Point #{index + 1}",
          page: 1,
          color: style[:color],
          opacity: style[:opacity]
        }
      end
    end

    # One :warning annotation per risk. Each risk may be a plain String or a
    # Hash with :text and/or :description.
    def create_risk_annotations(risks_data, text_positions)
      return [] unless risks_data.is_a?(Hash)

      RISK_BUCKETS.flat_map do |bucket, level|
        (risks_data[bucket] || []).map do |risk|
          text, description = entry_text_and_description(risk)

          {
            type: :warning,
            text: text,
            note: "#{level.to_s.capitalize} Risk: #{description}",
            risk_level: level,
            page: 1,
            color: RISK_COLORS[level],
            opacity: 0.6
          }
        end
      end
    end

    # One :important annotation per clause. Each clause may be a plain String
    # or a Hash with :text. Non-array values under a clause type are skipped.
    def create_clause_annotations(clauses_data, text_positions)
      return [] unless clauses_data.is_a?(Hash)

      style = ANNOTATION_TYPES[:important]

      clauses_data.flat_map do |clause_type, clauses|
        next [] unless clauses.is_a?(Array)

        # Plain-Ruby stand-in for ActiveSupport's String#humanize.
        label = clause_type.to_s.tr('_', ' ').capitalize

        clauses.map do |clause|
          text, = entry_text_and_description(clause)

          {
            type: :important,
            text: text,
            note: "#{label} Clause",
            clause_type: clause_type,
            page: 1,
            color: style[:color],
            opacity: style[:opacity]
          }
        end
      end
    end

    # A single :note annotation when a simplified text is available.
    def create_plain_language_annotations(plain_language_data, text_positions)
      return [] unless plain_language_data.is_a?(Hash) && plain_language_data[:simplified_text]

      style = ANNOTATION_TYPES[:note]

      [
        {
          type: :note,
          text: "Plain language version available",
          note: "This document has been converted to plain English. See attached simplified version.",
          page: 1,
          color: style[:color],
          opacity: style[:opacity]
        }
      ]
    end

    # Fills in defaults for caller-supplied annotations. String types become
    # symbols; types not listed in ANNOTATION_TYPES use the :note styling
    # unless the annotation brings its own colour and opacity.
    def process_custom_annotations(custom_annotations, options = {})
      custom_annotations.map do |annotation|
        type = annotation[:type] || :note
        type = type.to_sym if type.is_a?(String)
        defaults = ANNOTATION_TYPES.fetch(type, ANNOTATION_TYPES[:note])

        processed = {
          type: type,
          text: annotation[:text],
          note: annotation[:note] || annotation[:comment],
          page: annotation[:page] || 1,
          color: annotation[:color] || defaults[:color],
          opacity: annotation[:opacity] || defaults[:opacity]
        }
        processed[:position] = annotation[:position] if annotation[:position]
        processed
      end
    end

    # Placeholder: returns the original PDF bytes unchanged. A real
    # implementation would overlay the annotations with a PDF library.
    def create_pdf_with_annotations(pdf_path, annotations, options = {})
      @logger&.info("Creating PDF with #{annotations.length} annotations (placeholder)")

      File.read(pdf_path, mode: 'rb')
    end

    # Writes the JSON sidecar describing the annotations of pdf_path.
    def save_annotation_metadata(pdf_path, annotations, analysis_results)
      metadata = {
        pdf_file: File.basename(pdf_path),
        created_at: Time.now.iso8601,
        annotations: annotations,
        analysis_results: analysis_results,
        annotation_statistics: {
          total: annotations.length,
          by_type: count_by(annotations, :type)
        }
      }

      File.write(get_metadata_path(pdf_path), JSON.pretty_generate(metadata))
    end

    # Sidecar path for a PDF: "<basename>_annotations.json" in annotations_dir.
    # The .pdf suffix is stripped case-insensitively, matching validate_pdf_path.
    def get_metadata_path(pdf_path)
      base_name = File.basename(pdf_path.to_s).sub(/\.pdf\z/i, '')
      File.join(@annotations_dir, "#{base_name}_annotations.json")
    end

    # Placeholder for reading annotations embedded in the PDF itself.
    def extract_annotations_from_pdf(pdf_path)
      []
    end

    def generate_json_report(annotations)
      {
        report_type: 'annotation_report',
        generated_at: Time.now.iso8601,
        total_annotations: annotations.length,
        annotations: annotations,
        statistics: calculate_annotation_statistics(annotations)
      }.to_json
    end

    def generate_markdown_report(annotations)
      report = +"# PDF Annotation Report\n\n"
      report << "Generated: #{Time.now.strftime('%Y-%m-%d %H:%M:%S')}\n\n"
      report << "Total Annotations: #{annotations.length}\n\n"

      annotations.group_by { |a| a[:type] }.each do |type, type_annotations|
        report << "## #{type.to_s.capitalize} Annotations (#{type_annotations.length})\n\n"

        type_annotations.each_with_index do |annotation, index|
          report << "### #{index + 1}. Page #{annotation[:page]}\n"
          report << "**Text:** #{annotation[:text]}\n\n"
          report << "**Note:** #{annotation[:note]}\n\n" if annotation[:note]
          report << "---\n\n"
        end
      end

      report
    end

    # Annotation fields originate from document content, so every interpolated
    # value is HTML-escaped.
    def generate_html_report(annotations)
      parts = [html_report_header(annotations.length)]
      annotations.each_with_index do |annotation, index|
        parts << html_annotation_block(annotation, index + 1)
      end
      parts << "</body></html>"
      parts.join
    end

    # Static head of the HTML report plus title, timestamp and total.
    def html_report_header(total)
      <<~HTML
        <!DOCTYPE html>
        <html>
        <head>
          <title>PDF Annotation Report</title>
          <style>
            body { font-family: Arial, sans-serif; margin: 20px; }
            .annotation { border: 1px solid #ccc; margin: 10px 0; padding: 10px; }
            .type-highlight { border-left: 4px solid #FFFF00; }
            .type-warning { border-left: 4px solid #FF6B6B; }
            .type-note { border-left: 4px solid #FFA500; }
            .type-important { border-left: 4px solid #4ECDC4; }
          </style>
        </head>
        <body>
          <h1>PDF Annotation Report</h1>
          <p>Generated: #{Time.now.strftime('%Y-%m-%d %H:%M:%S')}</p>
          <p>Total Annotations: #{total}</p>
      HTML
    end

    # HTML fragment for one annotation; number is its 1-based position.
    def html_annotation_block(annotation, number)
      type = escape_html(annotation[:type])

      block = <<~HTML
        <div class="annotation type-#{type}">
          <h3>Annotation #{number} - Page #{escape_html(annotation[:page])}</h3>
          <p><strong>Type:</strong> #{type}</p>
          <p><strong>Text:</strong> #{escape_html(annotation[:text])}</p>
      HTML

      note = annotation[:note]
      block += "<p><strong>Note:</strong> #{escape_html(note)}</p>" if note
      block + "</div>"
    end

    def escape_html(value)
      CGI.escapeHTML(value.to_s)
    end

    # Placeholder: copies the first PDF to output_path, then records the merged
    # annotations in the output's metadata file.
    def create_merged_pdf(pdf_paths, output_path, annotations, options = {})
      @logger&.info("Merging PDFs (placeholder implementation)")

      FileUtils.cp(pdf_paths.first, output_path) if pdf_paths.any?

      save_annotation_metadata(output_path, annotations, { merged: true, source_pdfs: pdf_paths })
    end

    # Placeholder: every PDF is assumed to have DEFAULT_PAGE_COUNT pages.
    def get_pdf_page_count(pdf_path)
      DEFAULT_PAGE_COUNT
    end

    # Percentage of pages, from 1 up to the highest annotated page, that carry
    # at least one annotation. Annotations without a positive integer page are
    # ignored; returns 0 when none remain.
    def calculate_annotation_coverage(annotations)
      pages = annotations.map { |a| a[:page] }
                         .select { |page| page.is_a?(Integer) && page.positive? }
                         .uniq
      return 0 if pages.empty?

      (pages.length.to_f / pages.max * 100).round(1)
    end

    def generate_annotation_summary(annotations)
      page_counts = count_by(annotations, :page)
      type_counts = count_by(annotations, :type)
      risk_distribution = count_by(annotations.select { |a| a[:risk_level] }, :risk_level)

      recommendations = []
      if risk_distribution.fetch(:high, 0).positive?
        recommendations << "High-risk items require immediate attention"
      end
      if type_counts.fetch(:warning, 0) > 5
        recommendations << "Multiple warnings detected - consider legal review"
      end

      {
        most_annotated_page: page_counts.max_by { |_, count| count }&.first,
        most_common_type: type_counts.max_by { |_, count| count }&.first,
        risk_distribution: risk_distribution,
        recommendations: recommendations
      }
    end

    def calculate_annotation_statistics(annotations)
      {
        total: annotations.length,
        by_type: count_by(annotations, :type),
        by_page: count_by(annotations, :page),
        with_risks: annotations.count { |a| a[:risk_level] }
      }
    end

    # Number of annotations per distinct value of annotation[key].
    def count_by(annotations, key)
      annotations.group_by { |a| a[key] }.transform_values(&:count)
    end

    # Converts the string values JSON produced for SYMBOL_FIELDS back into
    # symbols. Non-hash entries are returned untouched.
    def normalize_annotation(annotation)
      return annotation unless annotation.is_a?(Hash)

      SYMBOL_FIELDS.each do |field|
        value = annotation[field]
        annotation[field] = value.to_sym if value.is_a?(String)
      end
      annotation
    end

    # Returns [text, description] for a risk/clause entry that is either a
    # plain value or a Hash with :text / :description. The description falls
    # back to the text when missing.
    def entry_text_and_description(entry)
      return [entry, entry] unless entry.is_a?(Hash)

      text = entry[:text] || entry
      [text, entry[:description] || text]
    end
  end
end