sentence 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README ADDED
@@ -0,0 +1,22 @@
1
+ = Read Me
2
+
3
+ by Tomasz Bak
4
+
5
+ == Description
6
+
7
+ Welcome to Sentence.
8
+
9
+ This library helps split text into its constituent sentences, based on regular expressions
10
+ (packed into SplitPattern class) and a list of abbreviations.
11
+
12
+ It was designed to be flexible and clean, and it has three primary goals:
13
+
14
+ 1. Be easily applicable in a wide range of usage scenarios (e.g. splitting sentences
15
+ generated by html2text, which required support for understanding simple formatting,
16
+ see examples/example.rb that parses corpora/medicine.txt).
17
+
18
+ 2. Be a language-aware library, which is particularly important in building the abbreviations and
19
+ conjunctions base (see lib/conjunctions.rb and lib/abbreviations.rb).
20
+
21
+ 3. Use extensive testing so as not to break previously working code with new abbreviations and
22
+ patterns, which hopefully will be extended in future with patches sent by users.
@@ -0,0 +1,18 @@
1
module Sentence
  # Per-language lists of abbreviations after which a period does NOT end a
  # sentence.
  #
  # Every entry is a regular-expression fragment (note "sens?" / "reps?"), not
  # a plain string: the splitter interpolates each one into a case-insensitive
  # pattern. Literal dots inside an entry must therefore be escaped.
  module Abbreviations
    # Roman numerals i..x. Frozen so that no caller can corrupt the shared list.
    ROMAN = %w[i ii iii iv v vi vii viii ix x].freeze

    # English abbreviations.
    #
    # Returns a newly built Array of regex fragments on every call.
    def self.EN
      people = ['jr', 'mr', 'mrs', 'ms', 'dr', 'prof', 'sr', 'sens?', 'reps?', 'gov', 'supt', 'det', 'rev']
      army   = ['col', 'gen', 'lt', 'cmdr', 'adm', 'capt', 'sgt', 'cpl', 'maj']
      month  = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec', 'sept']
      # The inner dots are escaped: unescaped, "e.g" would also match "egg"
      # and "i.e" would also match "ice".
      misc   = ['vs', 'etc', 'no', 'esp', 'e\.g', 'cf', 'i\.e']

      people + army + month + misc + ROMAN
    end

    # Polish abbreviations (currently only the roman numerals).
    #
    # Returns a copy, so the caller may extend the list without touching ROMAN.
    def self.PL
      ROMAN.dup
    end
  end
end
@@ -0,0 +1,11 @@
1
module Sentence
  # Per-language conjunction lists. A sentence marker that directly follows
  # one of these words is treated as a false break by the splitter.
  module Conjunctions
    # English conjunctions; a new Array is built on every call.
    def self.EN
      %w[and or but so as of]
    end

    # Polish conjunctions; a new Array is built on every call.
    def self.PL
      %w[i lub oraz za]
    end
  end
end
@@ -0,0 +1,16 @@
1
+ require File.dirname(__FILE__) + '/abbreviations'
2
+ require File.dirname(__FILE__) + '/conjunctions'
3
+ require File.dirname(__FILE__) + '/sentence_splitter'
4
+ require File.dirname(__FILE__) + '/split_pattern'
5
+
6
module Sentence
  class << self
    # Builds a new splitter configured with the English language rules.
    def EN
      SentenceSplitter.new(:EN)
    end

    # Builds a new splitter configured with the Polish language rules.
    def PL
      SentenceSplitter.new(:PL)
    end
  end
end
@@ -0,0 +1,94 @@
1
module Sentence
  # Marker character inserted wherever a sentence boundary is detected.
  EOS = "\001" # End Of Sentence

  # Splits text into sentences in three passes: mark every candidate boundary
  # with EOS, remove the markers that are false positives (abbreviations,
  # conjunctions, initials, continuation lines), then split on the remaining
  # markers and normalise the pieces.
  class SentenceSplitter
    # Kept so the class ancestry stays as before; the lists themselves are
    # read through the module-level methods (Abbreviations.EN etc.).
    include Abbreviations
    include Conjunctions

    # lang - Symbol naming the rule set; it is sent as a method name to
    #        Abbreviations and Conjunctions (e.g. :EN or :PL).
    def initialize(lang)
      @lang = lang
      @discarded_text_as_sentence = []
      @split_patterns = []
    end

    # Shorthand for #discard_texts_as_sentence. Returns self.
    def discard(texts)
      discard_texts_as_sentence(texts)
    end

    # Sets the pre-processing patterns; each must respond to #process(text)
    # and is expected to rewrite the text in place. Returns self for chaining.
    def patterns(split_patterns)
      @split_patterns = split_patterns
      self
    end

    # Sets the list of sentences to drop from the result. Candidates are
    # compared after whitespace normalisation and downcasing, so the entries
    # should be lower-case. Returns self for chaining.
    def discard_texts_as_sentence(texts)
      @discarded_text_as_sentence = texts
      self
    end

    # Takes text input and splits it into sentences.
    #
    # text - String to split (nil and "" are accepted).
    #
    # Returns an Array of whitespace-normalised sentence Strings. The input
    # string is left untouched: all in-place rewriting happens on a copy, so
    # frozen strings are accepted as well.
    def get_sentences(text)
      return [] if text.nil? || text.empty?

      working = text.dup
      @split_patterns.each { |pattern| pattern.process(working) }

      marked = first_sentence_breaking(working)
      repaired = split_unsplit_stuff(remove_false_end_of_sentence(marked))
      clean_sentences(repaired.split(EOS))
    end

    # Inserts an EOS marker at every candidate sentence boundary.
    # Modifies +text+ in place and returns it.
    def first_sentence_breaking(text)
      # each match means a different sentence
      text.gsub!(/\n\s*\n/, EOS)                      # double new-line
      text.gsub!(/\|\s*([A-Z])/, EOS + '\1')          # break on | before A-Z
      text.gsub!(/\n\s*([A-Z])/, EOS + '\1')          # new line and new word starting with A-Z
      text.gsub!(/^\s*[\*\o\#]\s(.*)/, EOS + '\1')    # list items starting with [*,o,#]
      text.gsub!(/([\.\!\?])(\s)/, '\1' + EOS + '\2') # punctuation
      text
    end

    # Removes EOS markers that directly follow an abbreviation or a
    # conjunction of the configured language.
    # Modifies +text+ in place and returns it.
    #
    # The leading \b keeps an entry from matching the tail of a longer word
    # ("casino." is not the abbreviation "no.", "has" is not the conjunction
    # "as"). The removed marker is replaced by a single space because the
    # marker may stand where the separating newline used to be; without it
    # "Mr.\nSmith" would become "Mr.Smith". Surplus spaces are squeezed later
    # by #clean_sentences.
    def remove_false_end_of_sentence(text)
      Abbreviations.send(@lang).each do |abb|
        text.gsub!(/(\b(?:#{abb})\.)#{EOS}/i, '\1 ')
      end
      Conjunctions.send(@lang).each do |con|
        text.gsub!(/(\b(?:#{con})\s*)#{EOS}/i, '\1 ')
      end
      text
    end

    # Re-joins fragments that were split although they belong together.
    # Modifies +text+ in place and returns it.
    def split_unsplit_stuff(text)
      # A marker right after a newline followed by a lower-case letter is a
      # continuation; a space replaces the dropped newline.
      text.gsub!(/\n#{EOS}(\s*[a-z])/, ' \1')
      # Upper-case letter or digit followed by "." (initials, numbering).
      # NOTE(review): this also matches the last character of longer tokens
      # ("1990.", "USA."), so such sentences are never split - confirm intent.
      text.gsub!(/([A-Z0-9]\.)#{EOS}/, '\1 ')
      text
    end

    # Normalises the raw fragments produced by splitting on EOS.
    #
    # Fragments without any period are additionally split on '*'. Pieces with
    # no word character are dropped (this covers empty, whitespace-only and
    # "..." pieces), inner whitespace is squeezed to single spaces, and pieces
    # whose downcased form is in the discard list are skipped.
    #
    # Returns a new Array of Strings.
    def clean_sentences(sentences)
      cleaned_sentences = []
      sentences.each do |sentence|
        fragments = sentence.include?('.') ? [sentence] : sentence.split('*')

        fragments.each do |fragment|
          next unless fragment =~ /\w/

          normalized = fragment.split.join(' ')
          next if @discarded_text_as_sentence.include?(normalized.downcase)

          cleaned_sentences << normalized
        end
      end
      cleaned_sentences
    end
  end
end
@@ -0,0 +1,31 @@
1
module Sentence
  # A named pre-processing rule: every match of +pattern+ is pulled out of the
  # text, split into sentences that are collected under +name+, and replaced
  # in the text by +replace+.
  class SplitPattern
    attr_reader :name, :pattern, :replace

    # Sentences extracted so far, keyed by pattern name. This is a class
    # variable, so it is shared by all instances and is never reset here.
    @@extracted_patterns = {}

    # name    - key under which extracted sentences are collected.
    # pattern - Regexp (or String) to look for.
    # replace - replacement passed to String#gsub! for every match.
    def initialize(name, pattern, replace)
      @name = name
      @pattern = pattern
      @replace = replace
    end

    # Extracts every match of the pattern from +text+, stores the sentences
    # found in the matches, then replaces the matches in +text+ IN PLACE.
    #
    # Returns the result of String#gsub!, i.e. +text+ when something was
    # replaced and nil when the pattern did not match.
    def process(text)
      text.scan(@pattern).each do |caption|
        bucket = (@@extracted_patterns[@name] ||= [])
        # scan yields a String for patterns without groups and an Array of
        # captures otherwise; wrapping and flattening handles both shapes.
        pieces = [caption].flatten
        # Extracted text is always split with the English rules.
        Sentence.EN.get_sentences(pieces.join(Sentence::EOS)).each do |sentence|
          bucket << sentence unless sentence.empty?
        end
      end

      text.gsub!(@pattern, @replace)
    end

    # Returns the Hash of pattern name => Array of extracted sentences.
    # A name is present only once its pattern has matched at least once.
    def self.get_extracted_patterns
      @@extracted_patterns
    end
  end
end
+ end
metadata ADDED
@@ -0,0 +1,51 @@
1
+ --- !ruby/object:Gem::Specification
2
+ rubygems_version: 0.9.4
3
+ specification_version: 1
4
+ name: sentence
5
+ version: !ruby/object:Gem::Version
6
+ version: 1.0.0
7
+ date: 2008-01-13 00:00:00 +01:00
8
+ summary: Splits text into its constituent sentences
9
+ require_paths:
10
+ - lib
11
+ email: hellolinus@gmail.com
12
+ homepage: http://rubyforge.org/projects/sentence
13
+ rubyforge_project:
14
+ description:
15
+ autorequire:
16
+ default_executable:
17
+ bindir: bin
18
+ has_rdoc: false
19
+ required_ruby_version: !ruby/object:Gem::Version::Requirement
20
+ requirements:
21
+ - - ">"
22
+ - !ruby/object:Gem::Version
23
+ version: 0.0.0
24
+ version:
25
+ platform: ruby
26
+ signing_key:
27
+ cert_chain:
28
+ post_install_message:
29
+ authors:
30
+ - Tomasz Bak
31
+ files:
32
+ - lib/abbreviations.rb
33
+ - lib/conjunctions.rb
34
+ - lib/sentence.rb
35
+ - lib/sentence_splitter.rb
36
+ - lib/split_pattern.rb
37
+ - README
38
+ test_files: []
39
+
40
+ rdoc_options: []
41
+
42
+ extra_rdoc_files:
43
+ - README
44
+ executables: []
45
+
46
+ extensions: []
47
+
48
+ requirements: []
49
+
50
+ dependencies: []
51
+