legal_summariser 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,79 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'pdf-reader'
4
+ require 'docx'
5
+
6
module LegalSummariser
  # Extracts plain text from PDF, DOCX and TXT documents.
  class TextExtractor
    # Extract text from a document, dispatching on the (case-insensitive)
    # file extension.
    #
    # @param file_path [String] Path to the document
    # @return [String] Extracted text
    # @raise [UnsupportedFormatError] if the extension is not .pdf, .docx or .txt
    # @raise [Error] if PDF or DOCX extraction fails
    def self.extract(file_path)
      extension = File.extname(file_path)

      case extension.downcase
      when '.pdf'
        extract_from_pdf(file_path)
      when '.docx'
        extract_from_docx(file_path)
      when '.txt'
        # Plain text is returned exactly as read; it is not run through clean_text.
        File.read(file_path, encoding: 'UTF-8')
      else
        raise UnsupportedFormatError, "Unsupported file format: #{extension}"
      end
    end

    # NOTE: the helpers below were previously listed under a bare `private`,
    # which has no effect on `def self.` methods. They have therefore always
    # been publicly callable; the ineffective keyword was removed and the
    # helpers are tagged as internal instead, so existing callers keep working.

    # Extract text from PDF files, one line-terminated chunk per page.
    #
    # @api private
    # @param file_path [String] Path to PDF file
    # @return [String] Extracted, cleaned text
    # @raise [Error] wrapping any StandardError raised while reading the PDF
    def self.extract_from_pdf(file_path)
      reader = PDF::Reader.new(file_path)

      # Build the page strings and join once instead of repeated `+=`.
      raw_text = reader.pages.map { |page| page.text + "\n" }.join

      # Clean up common PDF artifacts
      clean_text(raw_text)
    rescue => e
      raise Error, "Failed to extract text from PDF: #{e.message}"
    end

    # Extract text from DOCX files, one line per paragraph.
    #
    # @api private
    # @param file_path [String] Path to DOCX file
    # @return [String] Extracted, cleaned text
    # @raise [Error] wrapping any StandardError raised while reading the DOCX
    def self.extract_from_docx(file_path)
      doc = Docx::Document.open(file_path)
      raw_text = doc.paragraphs.map { |paragraph| paragraph.text + "\n" }.join

      clean_text(raw_text)
    rescue => e
      raise Error, "Failed to extract text from DOCX: #{e.message}"
    end

    # Clean extracted text: normalise line endings, drop control characters,
    # collapse runs of spaces/tabs and limit blank lines to one.
    #
    # @api private
    # @param text [String] Raw extracted text
    # @return [String] Cleaned text with leading/trailing whitespace stripped
    def self.clean_text(text)
      text
        .gsub(/\r\n?/, "\n") # CRLF and lone CR become LF
        .gsub(/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]/, '') # control chars, incl. form feed (\x0C)
        .gsub(/[ \t]+/, ' ') # collapse horizontal whitespace, keep line breaks
        .gsub(/\n{3,}/, "\n\n") # at most one blank line in a row
        .strip
    end
  end
end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
module LegalSummariser
  # Version string of this gem release.
  VERSION = '0.1.0'
end
@@ -0,0 +1,71 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "legal_summariser/version"
4
+ require_relative "legal_summariser/document_parser"
5
+ require_relative "legal_summariser/text_extractor"
6
+ require_relative "legal_summariser/summariser"
7
+ require_relative "legal_summariser/clause_detector"
8
+ require_relative "legal_summariser/risk_analyzer"
9
+ require_relative "legal_summariser/formatter"
10
+
11
module LegalSummariser
  # Base class for all errors raised by this gem.
  class Error < StandardError; end
  # Raised when the document path does not exist.
  class DocumentNotFoundError < Error; end
  # Raised when the document's file extension is not supported.
  class UnsupportedFormatError < Error; end

  # Ordered table of document type => pattern; the first matching entry wins.
  # Patterns are applied to down-cased text. Short tokens (nda, tos, job,
  # position) are anchored on word boundaries so they no longer match inside
  # unrelated words such as "standard", "photos" or "disposition".
  DOCUMENT_TYPE_PATTERNS = {
    "nda" => /non.?disclosure|\bndas?\b|confidentiality/,
    "service_agreement" => /service agreement|terms of service|\btos\b/,
    "employment_contract" => /employment|\bjobs?\b|\bpositions?\b/,
    "privacy_policy" => /privacy policy|data protection|gdpr|kvkk/,
    "license_agreement" => /licen[cs]e|licensing/ # accepts "license" and "licence"
  }.freeze
  private_constant :DOCUMENT_TYPE_PATTERNS

  # Main entry point for the legal summariser
  #
  # @param file_path [String] Path to the legal document
  # @param options [Hash] Configuration options; `:format`, when present, is
  #   passed to Formatter.format together with the result hash
  # @return [Hash, Object] Summary results, or whatever Formatter.format
  #   returns when `options[:format]` is set
  # @raise [DocumentNotFoundError] if nothing exists at file_path
  def self.summarise(file_path, options = {})
    raise DocumentNotFoundError, "File not found: #{file_path}" unless File.exist?(file_path)

    # Extract text from document
    text = TextExtractor.extract(file_path)

    # Perform analysis
    summary = Summariser.new(text, options).generate
    clauses = ClauseDetector.new(text).detect
    risks = RiskAnalyzer.new(text).analyze

    result = {
      plain_text: summary[:plain_text],
      key_points: summary[:key_points],
      clauses: clauses,
      risks: risks,
      metadata: {
        document_type: detect_document_type(text),
        word_count: text.split.length,
        processed_at: Time.now.strftime("%Y-%m-%dT%H:%M:%S%z")
      }
    }

    # Apply formatting only if requested
    return result unless options[:format]

    Formatter.format(result, options[:format])
  end

  # Detect the type of legal document from keywords in its text.
  #
  # @param text [String] Document text
  # @return [String] Document type; "general_contract" when no keyword matches
  def self.detect_document_type(text)
    normalised = text.downcase
    match = DOCUMENT_TYPE_PATTERNS.find { |_type, pattern| pattern.match?(normalised) }
    match ? match.first : "general_contract"
  end
end
metadata ADDED
@@ -0,0 +1,204 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: legal_summariser
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Legal Summariser Team
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2025-09-09 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: pdf-reader
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '2.11'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '2.11'
27
+ - !ruby/object:Gem::Dependency
28
+ name: docx
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '0.8'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '0.8'
41
+ - !ruby/object:Gem::Dependency
42
+ name: nokogiri
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '1.13'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '1.13'
55
+ - !ruby/object:Gem::Dependency
56
+ name: thor
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '1.2'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '1.2'
69
+ - !ruby/object:Gem::Dependency
70
+ name: json
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '2.6'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '2.6'
83
+ - !ruby/object:Gem::Dependency
84
+ name: bundler
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '1.17'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: '1.17'
97
+ - !ruby/object:Gem::Dependency
98
+ name: rake
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - "~>"
102
+ - !ruby/object:Gem::Version
103
+ version: '13.0'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - "~>"
109
+ - !ruby/object:Gem::Version
110
+ version: '13.0'
111
+ - !ruby/object:Gem::Dependency
112
+ name: rspec
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - "~>"
116
+ - !ruby/object:Gem::Version
117
+ version: '3.12'
118
+ type: :development
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - "~>"
123
+ - !ruby/object:Gem::Version
124
+ version: '3.12'
125
+ - !ruby/object:Gem::Dependency
126
+ name: rubocop
127
+ requirement: !ruby/object:Gem::Requirement
128
+ requirements:
129
+ - - "~>"
130
+ - !ruby/object:Gem::Version
131
+ version: '1.50'
132
+ type: :development
133
+ prerelease: false
134
+ version_requirements: !ruby/object:Gem::Requirement
135
+ requirements:
136
+ - - "~>"
137
+ - !ruby/object:Gem::Version
138
+ version: '1.50'
139
+ - !ruby/object:Gem::Dependency
140
+ name: yard
141
+ requirement: !ruby/object:Gem::Requirement
142
+ requirements:
143
+ - - "~>"
144
+ - !ruby/object:Gem::Version
145
+ version: '0.9'
146
+ type: :development
147
+ prerelease: false
148
+ version_requirements: !ruby/object:Gem::Requirement
149
+ requirements:
150
+ - - "~>"
151
+ - !ruby/object:Gem::Version
152
+ version: '0.9'
153
+ description: A Ruby gem that summarises legal documents, extracts key clauses, flags
154
+ risks, and converts legal jargon into plain English. Supports PDF/Word documents
155
+ with offline processing capabilities.
156
+ email:
157
+ - info@legal-summariser.com
158
+ executables:
159
+ - legal_summariser
160
+ extensions: []
161
+ extra_rdoc_files: []
162
+ files:
163
+ - ".rspec"
164
+ - CHANGELOG.md
165
+ - Gemfile
166
+ - README.md
167
+ - Rakefile
168
+ - exe/legal_summariser
169
+ - lib/legal_summariser.rb
170
+ - lib/legal_summariser/clause_detector.rb
171
+ - lib/legal_summariser/document_parser.rb
172
+ - lib/legal_summariser/formatter.rb
173
+ - lib/legal_summariser/risk_analyzer.rb
174
+ - lib/legal_summariser/summariser.rb
175
+ - lib/legal_summariser/text_extractor.rb
176
+ - lib/legal_summariser/version.rb
177
+ homepage: https://github.com/legal-summariser/legal_summariser
178
+ licenses:
179
+ - MIT
180
+ metadata:
181
+ allowed_push_host: https://rubygems.org
182
+ homepage_uri: https://github.com/legal-summariser/legal_summariser
183
+ source_code_uri: https://github.com/legal-summariser/legal_summariser
184
+ changelog_uri: https://github.com/legal-summariser/legal_summariser/blob/main/CHANGELOG.md
185
+ post_install_message:
186
+ rdoc_options: []
187
+ require_paths:
188
+ - lib
189
+ required_ruby_version: !ruby/object:Gem::Requirement
190
+ requirements:
191
+ - - ">="
192
+ - !ruby/object:Gem::Version
193
+ version: 2.6.0
194
+ required_rubygems_version: !ruby/object:Gem::Requirement
195
+ requirements:
196
+ - - ">="
197
+ - !ruby/object:Gem::Version
198
+ version: '0'
199
+ requirements: []
200
+ rubygems_version: 3.0.3.1
201
+ signing_key:
202
+ specification_version: 4
203
+ summary: AI-powered legal document summarisation and analysis toolkit
204
+ test_files: []