sentence 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +22 -0
- data/lib/abbreviations.rb +18 -0
- data/lib/conjunctions.rb +11 -0
- data/lib/sentence.rb +16 -0
- data/lib/sentence_splitter.rb +94 -0
- data/lib/split_pattern.rb +31 -0
- metadata +51 -0
data/README
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
= Read Me
|
|
2
|
+
|
|
3
|
+
by Tomasz Bak
|
|
4
|
+
|
|
5
|
+
== Description
|
|
6
|
+
|
|
7
|
+
Welcome to Sentence.
|
|
8
|
+
|
|
9
|
+
This library helps split text into its constituent sentences, based on regular expressions
|
|
10
|
+
(packed into SplitPattern class) and a list of abbreviations.
|
|
11
|
+
|
|
12
|
+
It was designed to be flexible and clean, and it has three primary goals:
|
|
13
|
+
|
|
14
|
+
1. Be easily applicable in a wide range of usage scenarios (e.g. splitting sentences
|
|
15
|
+
generated by html2text, which required support for understanding simple formatting,
|
|
16
|
+
see examples/example.rb that parses corpora/medicine.txt).
|
|
17
|
+
|
|
18
|
+
2. Be a language-aware library, which is particularly important in building the abbreviations and
|
|
19
|
+
conjunctions base (see lib/conjunctions.rb and lib/abbreviations.rb).
|
|
20
|
+
|
|
21
|
+
3. Use extensive testing so as not to break previously working code with new abbreviations and
|
|
22
|
+
patterns, which hopefully will be extended in future with patches sent by users.
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
module Sentence
  # Per-language abbreviation tables.
  #
  # Every entry is a lower-case *regular-expression fragment* (note "sens?",
  # which covers both "sen" and "sens"), written without its trailing dot.
  # SentenceSplitter interpolates each entry into a case-insensitive regexp
  # to recognise a period that does not end a sentence, so literal dots inside
  # an entry must be escaped.
  module Abbreviations
    # Roman numerals one to ten. Frozen so that the table shared by every
    # language cannot be modified by accident.
    ROMAN = ['i', 'ii', 'iii', 'iv', 'v', 'vi', 'vii', 'viii', 'ix', 'x'].freeze

    # Returns a new Array with the English abbreviation fragments
    # (titles, military ranks, months, miscellaneous, Roman numerals).
    def self.EN
      people = ['jr', 'mr', 'mrs', 'ms', 'dr', 'prof', 'sr', 'sens?', 'reps?', 'gov', 'supt', 'det', 'rev']
      army   = ['col', 'gen', 'lt', 'cmdr', 'adm', 'capt', 'sgt', 'cpl', 'maj']
      month  = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec', 'sept']
      # The inner dots are escaped: an unescaped "e.g" would also match "egg"
      # and an unescaped "i.e" would also match "ice".
      misc   = ['vs', 'etc', 'no', 'esp', 'e\.g', 'cf', 'i\.e']

      people + army + month + misc + ROMAN
    end

    # Returns a new Array with the Polish abbreviation fragments; currently
    # only the Roman numerals. A copy is returned so callers may modify the
    # result without touching ROMAN.
    def self.PL
      ROMAN.dup
    end
  end
end
|
data/lib/conjunctions.rb
ADDED
data/lib/sentence.rb
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
require File.dirname(__FILE__) + '/abbreviations'
|
|
2
|
+
require File.dirname(__FILE__) + '/conjunctions'
|
|
3
|
+
require File.dirname(__FILE__) + '/sentence_splitter'
|
|
4
|
+
require File.dirname(__FILE__) + '/split_pattern'
|
|
5
|
+
|
|
6
|
+
module Sentence
  class << self
    # Builds a new SentenceSplitter configured with the English (:EN) rules.
    def EN
      SentenceSplitter.new(:EN)
    end

    # Builds a new SentenceSplitter configured with the Polish (:PL) rules.
    def PL
      SentenceSplitter.new(:PL)
    end
  end
end
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
module Sentence
  # Marker character inserted wherever a sentence is believed to end; the
  # marked text is finally split on it.
  EOS = "\001" # End Of Sentence

  # Splits free text into sentences.
  #
  # The text is first over-split on every plausible boundary (blank lines,
  # "|", capitalised line starts, list bullets, punctuation followed by
  # whitespace). Boundaries that follow a known abbreviation, a conjunction
  # or a single initial are then removed again, and the remaining pieces are
  # whitespace-normalised and filtered.
  class SentenceSplitter

    include Abbreviations
    include Conjunctions

    # lang - Symbol naming the rule set (:EN or :PL). It is sent as a message
    #        to Abbreviations and Conjunctions to obtain the language tables.
    def initialize(lang)
      @lang = lang
      @discarded_text_as_sentence = []
      @split_patterns = []
    end

    # Short alias for #discard_texts_as_sentence. Returns self.
    def discard(texts)
      discard_texts_as_sentence(texts)
    end

    # Sets the patterns applied to the text before splitting. Each element
    # must respond to #process(text). Returns self so calls can be chained.
    def patterns(paterns)
      @split_patterns = paterns
      self
    end

    # Sets the list of sentences to drop from the result. Candidates are
    # compared after whitespace normalisation and downcasing, so the entries
    # should be given in lower case. Returns self so calls can be chained.
    def discard_texts_as_sentence(texts)
      @discarded_text_as_sentence = texts
      self
    end

    # Takes text input and splits it into sentences.
    #
    # text - String to split (nil is accepted).
    #
    # Returns an Array of sentence Strings; empty for nil or empty input.
    # The caller's string is left untouched: all in-place substitutions run
    # on a private copy, which also makes frozen strings acceptable.
    def get_sentences(text)
      return [] if text.nil? || text.empty?

      text = text.dup

      # Patterns modify the working copy in place.
      @split_patterns.each { |pattern| pattern.process(text) }

      marked_text = first_sentence_breaking(text)
      fixed_marked_text = split_unsplit_stuff(remove_false_end_of_sentence(marked_text))
      clean_sentences(fixed_marked_text.split(EOS))
    end

    # Inserts an EOS marker at every candidate sentence boundary.
    # Modifies +text+ in place and returns it.
    def first_sentence_breaking(text)
      # each match means a different sentence
      text.gsub!(/\n\s*\n/, EOS)                     # double new-line
      text.gsub!(/\|\s*([A-Z])/, EOS + '\1')         # break on | before A-Z
      text.gsub!(/\n\s*([A-Z])/, EOS + '\1')         # new line whose first word starts with A-Z
      text.gsub!(/^\s*[*o#]\s(.*)/, EOS + '\1')      # list items start with *, o or #
      text.gsub!(/([.!?])(\s)/, '\1' + EOS + '\2')   # punctuation followed by whitespace
      text
    end

    # Removes EOS markers placed right after an abbreviation ("Dr.") or a
    # conjunction of the current language.
    # Modifies +text+ in place and returns it.
    def remove_false_end_of_sentence(text)
      Abbreviations.send(@lang).each do |abb|
        # \b keeps short entries such as "x" or "no" from matching the tail
        # of an ordinary word ("box.", "piano."); the non-capturing group
        # keeps the entry's own regexp syntax self-contained.
        text.gsub!(/\b((?:#{abb})\.)#{EOS}/i, '\1')
      end
      Conjunctions.send(@lang).each do |con|
        text.gsub!(/(#{con}\s*)#{EOS}/i, '\1')
      end
      text
    end

    # Re-joins fragments that were split too eagerly.
    # Modifies +text+ in place and returns it.
    def split_unsplit_stuff(text)
      # a marker preceded by a newline and followed by lower-case text
      text.gsub!(/\n#{EOS}(\s*[a-z])/, '\1')
      # a single upper-case letter (an initial) or a digit followed by "."
      # \b stops acronyms such as "NASA." from being treated as an initial.
      text.gsub!(/((?:\b[A-Z]|[0-9])\.)#{EOS}/, '\1')
      text
    end

    # Normalises raw fragments into the final sentence list: collapses runs
    # of whitespace, drops fragments without any word character and drops
    # fragments listed via #discard.
    #
    # sentences - Array of String fragments.
    #
    # Returns a new Array of cleaned sentence Strings.
    def clean_sentences(sentences)
      cleaned_sentences = []

      sentences.each do |sentence|
        # A fragment without any "." may be a run of "*"-separated items.
        fragments = sentence.include?('.') ? [sentence] : sentence.split('*')

        fragments.each do |fragment|
          # Also rejects blank fragments and a bare "...".
          next unless fragment =~ /\w/

          normalized = fragment.split.join(' ')
          next if @discarded_text_as_sentence.include?(normalized.downcase)

          cleaned_sentences << normalized
        end
      end

      cleaned_sentences
    end
  end
end
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
module Sentence
  # A named regular-expression rule that is applied to a text before it is
  # split into sentences. Every match is replaced in the text, and the matched
  # content is itself split into sentences and remembered under the pattern's
  # name.
  class SplitPattern
    attr_reader :name, :pattern, :replace

    # Extracted sentences keyed by pattern name. This is a class variable, so
    # it is shared by all instances and keeps growing for the lifetime of the
    # process.
    @@extracted_patterns = {}

    # name    - key under which extracted sentences are stored.
    # pattern - Regexp (or String) passed to String#scan and String#gsub!.
    # replace - replacement String for every match.
    def initialize(name, pattern, replace)
      @name = name
      @pattern = pattern
      @replace = replace
    end

    # Records the sentences found inside every match of the pattern, then
    # replaces the matches in +text+ in place.
    #
    # Returns the result of String#gsub!, i.e. +text+, or nil when nothing
    # was replaced.
    #
    # NOTE(review): extracted content is always split with the English rules
    # (Sentence.EN), whatever language the surrounding text uses - confirm
    # this is intended.
    def process(text)
      text.scan(@pattern).each do |caption|
        # String#scan yields a String when the pattern has no groups and an
        # Array of captures otherwise; normalise to an Array.
        captures = caption.is_a?(Array) ? caption : [caption]
        extracted = (@@extracted_patterns[@name] ||= [])

        # Joining with EOS makes each capture a separate sentence candidate.
        Sentence.EN.get_sentences(captures.join(Sentence::EOS)).each do |sentence|
          extracted << sentence unless sentence.empty?
        end
      end

      text.gsub!(@pattern, @replace)
    end

    # Returns the Hash of pattern name => Array of extracted sentences.
    def self.get_extracted_patterns
      @@extracted_patterns
    end
  end
end
|
metadata
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
rubygems_version: 0.9.4
|
|
3
|
+
specification_version: 1
|
|
4
|
+
name: sentence
|
|
5
|
+
version: !ruby/object:Gem::Version
|
|
6
|
+
version: 1.0.0
|
|
7
|
+
date: 2008-01-13 00:00:00 +01:00
|
|
8
|
+
summary: Splits text into its constituent sentences
|
|
9
|
+
require_paths:
|
|
10
|
+
- lib
|
|
11
|
+
email: hellolinus@gmail.com
|
|
12
|
+
homepage: http://rubyforge.org/projects/sentence
|
|
13
|
+
rubyforge_project:
|
|
14
|
+
description:
|
|
15
|
+
autorequire:
|
|
16
|
+
default_executable:
|
|
17
|
+
bindir: bin
|
|
18
|
+
has_rdoc: false
|
|
19
|
+
required_ruby_version: !ruby/object:Gem::Version::Requirement
|
|
20
|
+
requirements:
|
|
21
|
+
- - ">"
|
|
22
|
+
- !ruby/object:Gem::Version
|
|
23
|
+
version: 0.0.0
|
|
24
|
+
version:
|
|
25
|
+
platform: ruby
|
|
26
|
+
signing_key:
|
|
27
|
+
cert_chain:
|
|
28
|
+
post_install_message:
|
|
29
|
+
authors:
|
|
30
|
+
- Tomasz Bak
|
|
31
|
+
files:
|
|
32
|
+
- lib/abbreviations.rb
|
|
33
|
+
- lib/conjunctions.rb
|
|
34
|
+
- lib/sentence.rb
|
|
35
|
+
- lib/sentence_splitter.rb
|
|
36
|
+
- lib/split_pattern.rb
|
|
37
|
+
- README
|
|
38
|
+
test_files: []
|
|
39
|
+
|
|
40
|
+
rdoc_options: []
|
|
41
|
+
|
|
42
|
+
extra_rdoc_files:
|
|
43
|
+
- README
|
|
44
|
+
executables: []
|
|
45
|
+
|
|
46
|
+
extensions: []
|
|
47
|
+
|
|
48
|
+
requirements: []
|
|
49
|
+
|
|
50
|
+
dependencies: []
|
|
51
|
+
|