tactful_tokenizer 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
data.tar.gz.sig ADDED
@@ -0,0 +1,4 @@
1
+ 1���8����&�֭�D����0�SLI�x�r1e��(|?;j��_y���r���y���@[��z�-��<)�� �zt�����K�<,�����u��Xxᯱ(�V�����!��J�,�
2
+ b����N�!y�B�*{z��=��Jh���D�Z��O������dI�m�F/
3
+ �p�YD~�4h�i�vV*y�!��QR���8ӕ�g���Ό��ʧ�1Hߚ�F
4
+ q��|���� v7G"xE=��)�#��.��
data/Manifest ADDED
@@ -0,0 +1,8 @@
1
+ Manifest
2
+ README.rdoc
3
+ Rakefile
4
+ lib/models/features.mar
5
+ lib/models/lower_words.mar
6
+ lib/models/non_abbrs.mar
7
+ lib/tactful_tokenizer.rb
8
+ lib/word_tokenizer.rb
data/README.rdoc ADDED
@@ -0,0 +1,26 @@
1
+ = TactfulTokenizer
2
+
3
+ TactfulTokenizer is a Ruby library for high quality sentence
4
+ tokenization. It uses a Naive Bayesian statistical model, and
5
+ is based on Splitta[http://code.google.com/p/splitta/], but
6
+ has support for '?' and '!' as well as primitive handling of
7
+ XHTML markup. Better support for XHTML parsing is coming shortly.
8
+
9
+ == Usage
10
+
11
+ require "tactful_tokenizer"
12
+ m = TactfulTokenizer::Model.new
13
+ m.tokenize_text("Here in the U.S. Senate we prefer to eat our friends. Is it easier that way? <em>Yes.</em> <em>Maybe</em>!")
14
+ #=> ["Here in the U.S. Senate we prefer to eat our friends.", "Is it easier that way?", "<em>Yes.</em>", "<em>Maybe</em>!"]
15
+
16
+ The input text is expected to consist of paragraphs delimited
17
+ by line breaks.
18
+
19
+ == Installation
20
+ git clone http://github.com/SlyShy/Tactful_Tokenizer.git
21
+ gem install andand
22
+
23
+ == Author
24
+
25
+ Copyright (c) 2010 Matthew Bunday. All rights reserved.
26
+ Released under the {GNU GPL v3}[http://www.gnu.org/licenses/gpl.html].
data/Rakefile ADDED
@@ -0,0 +1,12 @@
1
require 'rubygems'
require 'rake'
require 'echoe'

# Gem packaging tasks provided by Echoe (`rake manifest`, `rake release`, ...).
Echoe.new('tactful_tokenizer', '0.0.1') do |gem|
  gem.description = "A high accuracy naive bayesian sentence tokenizer based on Splitta."
  gem.url = "http://github.com/SlyShy/Tactful_Tokenizer"
  gem.author = "Matthew Bunday"
  gem.email = "mkbunday @nospam@ gmail.com"
  gem.ignore_pattern = ["tmp/*", "script/*"]
  gem.development_dependencies = []
end
Binary file
Binary file
Binary file
@@ -0,0 +1,210 @@
1
+ # TactfulTokenizer is a Ruby library for high quality sentence
2
+ # tokenization. It uses a Naive Bayesian statistical model, and
3
+ # is based on Splitta[http://code.google.com/p/splitta/]. But
4
+ # has support for '?' and '!' as well as primitive handling of
5
+ # XHTML markup. Better support for XHTML parsing is coming shortly.
6
+ #
7
+ # Example usage:
8
+ #
9
+ # require "tactful_tokenizer"
10
+ # m = TactfulTokenizer::Model.new
11
+ # m.tokenize_text("Here in the U.S. Senate we prefer to eat our friends. Is it easier that way, really? Yes.")
12
+ # #=> ["Here in the U.S. Senate we prefer to eat our friends.", "Is it easier that way, really?", "Yes."]
13
+ #
14
+ # The input text is expected to consist of paragraphs delimited
15
+ # by line breaks.
16
+ #
17
+ # Author:: Matthew Bunday (mailto:mkbunday@gmail.com)
18
+ # License:: GNU General Public License v3
19
+
20
+ require "word_tokenizer.rb"
21
+ include WordTokenizer
22
+
23
+ #--
24
+ ####### Performance TODOs.
25
+ # TODO: Use inline C where necessary?
26
+ # TODO: Use RE2 regexp extension.
27
+ #++
28
+
29
+ module TactfulTokenizer
30
+
31
# Basic String extensions.
String.class_eval do
  # True when the string contains no non-alphabetic characters.
  # NOTE: returns true for the empty string (nothing non-alpha to match).
  def is_alphabetic?
    !/[^[:alpha:]]/.match(self)
  end

  # True when the string is entirely upper case.
  # Fixed to return a real boolean: the old version returned the *strings*
  # 'true'/'false', both of which are truthy, so it was useless as a
  # predicate. Interpolated feature keys such as "w2cap_#{...}" are
  # unaffected, because booleans interpolate to the same "true"/"false".
  def is_upper_case?
    self == upcase
  end
end
46
+
47
# A model stores normalized probabilities of different features occurring.
class Model

  # Initialize the model. feats, lower_words, and non_abbrs
  # indicate the locations of the respective Marshal dumps.
  def initialize(feats = "#{File.dirname(__FILE__)}/models/features.mar",
                 lower_words = "#{File.dirname(__FILE__)}/models/lower_words.mar",
                 non_abbrs = "#{File.dirname(__FILE__)}/models/non_abbrs.mar")
    @feats, @lower_words, @non_abbrs = [feats, lower_words, non_abbrs].map do |file|
      File.open(file) { |f| Marshal.load(f.read) }
    end
    # Prior for a sentence break, raised to the 4th power (matches the
    # original training setup -- TODO confirm against Splitta).
    @p0 = @feats["<prior>"]**4
  end

  # feats       = {feature => normalized probability of feature}
  # lower_words = {token => log count of occurrences in lower case}
  # non_abbrs   = {token => log count of occurrences when not an abbreviation}
  attr_accessor :feats, :lower_words, :non_abbrs

  # This function is the only one that'll end up being used.
  #   m = TactfulTokenizer::Model.new
  #   m.tokenize_text("Hey, are these two sentences? I bet they should be.")
  #   #=> ["Hey, are these two sentences?", "I bet they should be."]
  def tokenize_text(text)
    data = Doc.new(text)
    featurize(data)
    classify(data)
    data.segment
  end

  # Assign a prediction (probability, to be precise) to each sentence fragment.
  # For each feature in each fragment we hunt up the normalized probability and
  # multiply. This is a fairly straightforward Bayesian probabilistic algorithm.
  def classify(doc)
    doc.frags.each do |frag|
      probs = @p0
      frag.features.each { |feat| probs *= @feats[feat] }
      frag.pred = probs / (probs + 1)
    end
  end

  # Get the features of every fragment.
  def featurize(doc)
    doc.frags.each { |frag| get_features(frag, self) }
  end

  # Finds the features in a text fragment of the form:
  #   ... w1. (sb?) w2 ...
  # Features listed in rough order of importance:
  # * w1: a word that includes a period.
  # * w2: the next word, if it exists.
  # * w1length: the number of alphabetic characters in w1.
  # * both: w1 and w2 taken together.
  # * w1abbr: logarithmic count of w1 occurring without a period.
  # * w2lower: logarithmic count of w2 occurring lowercased.
  def get_features(frag, model)
    w1 = frag.cleaned.last || ''
    w2 = frag.next || ''

    frag.features = ["w1_#{w1}", "w2_#{w2}", "both_#{w1}_#{w2}"]

    unless w2.empty?
      if w1.chop.is_alphabetic?
        frag.features.push "w1length_#{[10, w1.length].min}"
        frag.features.push "w1abbr_#{model.non_abbrs[w1.chop]}"
      end

      if w2.chop.is_alphabetic?
        frag.features.push "w2cap_#{w2[0].is_upper_case?}"
        frag.features.push "w2lower_#{model.lower_words[w2.downcase]}"
      end
    end
  end
end
129
+
130
# A document represents the input text. It holds a list of fragments generated
# from the text.
class Doc
  # List of fragments.
  attr_accessor :frags

  # Receives a text, which is then broken into fragments.
  # A fragment ends with a period, question mark, or exclamation mark followed
  # possibly by right handed punctuation like quotation marks or closing braces
  # and trailing whitespace. Failing that, it'll accept something like "I hate cheese\n"
  # No, it doesn't have a period, but that's the end of the paragraph.
  #
  # Input assumption: Paragraphs delimited by line breaks.
  #
  # Fix: removed leftover debug output (`puts "Hey!"` / `puts text.inspect`)
  # that was printed to stdout on every construction.
  def initialize(text)
    @frags = []
    text.each_line do |line|
      next if line.strip.empty?
      # split keeps each sentence-final separator via the capture group;
      # the non-captured pieces between matches are empty and skipped.
      line.split(/(.*?[.!?](?:["')\]}]|(?:<.*>))*[\s])/).each do |piece|
        next if piece.strip.empty?
        frag = Frag.new(piece)
        # Link the previous fragment to this fragment's first word so the
        # classifier can use the following word as a feature.
        @frags.last.next = frag.cleaned.first unless @frags.empty?
        @frags.push frag
      end
    end
  end

  # Segments the text. More precisely, it reassembles the fragments into sentences.
  # We call something a sentence whenever it is more likely to be a sentence than not.
  def segment
    sents, sent = [], []
    thresh = 0.5

    @frags.each do |frag|
      sent.push(frag.orig)
      if frag.pred > thresh
        break if frag.orig.nil?
        sents.push(sent.join('').strip)
        sent = []
      end
    end
    sents
  end
end
179
+
180
# A fragment is a potential sentence, but is based only on the existence of a period.
# The text "Here in the U.S. Senate we prefer to devour our friends." will be split
# into "Here in the U.S." and "Senate we prefer to devour our friends."
class Frag

  # orig     = The original text of the fragment.
  # next     = The next word following the fragment.
  # cleaned  = Array of the fragment's words after cleaning.
  # pred     = Probability that the fragment is a sentence.
  # features = Array of the fragment's features.
  attr_accessor :orig, :next, :cleaned, :pred, :features

  # Create a new fragment, keeping the raw text and a cleaned word list.
  def initialize(orig = '')
    @orig = orig
    clean(orig)
    @next = @pred = @features = nil
  end

  # Normalizes numbers and discards ambiguous punctuation. And then splits into an
  # array, because realistically only the last and first words are ever accessed.
  def clean(s)
    scratch = String.new(s)
    tokenize(scratch)                                 # in-place word tokenization
    scratch.gsub!(/[.,\d]*\d/, '<NUM>')               # collapse numeric tokens
    scratch.gsub!(/[^a-zA-Z0-9,.;:<>\-'\/$% ]/, '')   # drop ambiguous punctuation
    scratch.gsub!('--', ' ')
    @cleaned = scratch.split
  end
end
210
+ end
@@ -0,0 +1,51 @@
1
# In-place word tokenizer: applies an ordered list of regex rewrites to a
# string, Penn-Treebank style, so punctuation is separated from words.
module WordTokenizer
  @@tokenize_regexps = [
    # Uniform Quotes
    [/''|``/, '"'],

    # Separate punctuation (except for periods) from words.
    [/(^|\s)(')/, '\1\2'],
    [/(?=[\("`{\[:;&#*@])(.)/, '\1 '],

    [/(.)(?=[?!\)";}\]*:@'])|(?=[\)}\]])(.)|(.)(?=[({\[])|((^|\s)-)(?=[^-])/, '\1 '],

    # Treat double-hyphen as a single token.
    [/([^-])(--+)([^-])/, '\1 \2 \3'],
    [/(\s|^)(,)(?=(\S))/, '\1\2 '],

    # Only separate a comma if a space follows.
    [/(.)(,)(\s|$)/, '\1 \2\3'],

    # Combine dots separated by whitespace to be a single token.
    [/\.\s\.\s\./, '...'],

    # Separate "No.6"
    [/([a-zA-Z]\.)(\d+)/, '\1 \2'],

    # Separate words from ellipses
    [/([^\.]|^)(\.{2,})(.?)/, '\1 \2 \3'],
    # NOTE(review): the next two rules share one pattern but differ in
    # replacement; after the first rewrites a span the second rarely
    # re-matches it. Kept byte-for-byte to preserve behavior -- verify
    # intent before consolidating.
    [/(^|\s)(\.{2,})([^\.\s])/, '\1\2 \3'],
    [/(^|\s)(\.{2,})([^\.\s])/, '\1 \2\3'],

    ##### Some additional fixes.

    # Fix %, $, &
    [/(\d)%/, '\1 %'],
    [/\$(\.?\d)/, '$ \1'],
    [/(\w)& (\w)/, '\1&\2'],
    [/(\w\w+)&(\w\w+)/, '\1 & \2'],

    # Fix (n 't) -> ( n't)
    [/n 't( |$)/, " n't\\1"],
    [/N 'T( |$)/, " N'T\\1"],

    # Treebank tokenizer special words
    [/([Cc])annot/, '\1an not']
  ]

  # Applies every rule in order, mutating +s+ in place.
  # (Removed the dead `rules = []` local that was shadowed by the block
  # parameter; the rule pairs are now destructured for clarity.)
  def tokenize(s)
    @@tokenize_regexps.each { |pattern, replacement| s.gsub!(pattern, replacement) }
  end
end
@@ -0,0 +1,32 @@
1
# -*- encoding: utf-8 -*-

# Gemspec for tactful_tokenizer. Cleaned up: the generated version-check
# boilerplate contained only empty if/else branches and an unused local,
# and its dead branch referenced Gem::RubyGemsVersion, which no longer
# exists in modern RubyGems.
Gem::Specification.new do |s|
  s.name = %q{tactful_tokenizer}
  s.version = "0.0.1"

  s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
  s.authors = ["Matthew Bunday"]
  s.cert_chain = ["/home/slyshy/.ssh/gem-public_cert.pem"]
  s.date = %q{2010-03-23}
  s.description = %q{A high accuracy naive bayesian sentence tokenizer based on Splitta.}
  s.email = %q{mkbunday @nospam@ gmail.com}
  s.extra_rdoc_files = ["README.rdoc", "lib/models/features.mar", "lib/models/lower_words.mar", "lib/models/non_abbrs.mar", "lib/tactful_tokenizer.rb", "lib/word_tokenizer.rb"]
  s.files = ["Manifest", "README.rdoc", "Rakefile", "lib/models/features.mar", "lib/models/lower_words.mar", "lib/models/non_abbrs.mar", "lib/tactful_tokenizer.rb", "lib/word_tokenizer.rb", "tactful_tokenizer.gemspec"]
  s.homepage = %q{http://github.com/SlyShy/Tactful_Tokenizer}
  s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Tactful_tokenizer", "--main", "README.rdoc"]
  s.require_paths = ["lib"]
  s.rubyforge_project = %q{tactful_tokenizer}
  s.rubygems_version = %q{1.3.6}
  s.signing_key = %q{/home/slyshy/.ssh/gem-private_key.pem}
  s.summary = %q{A high accuracy naive bayesian sentence tokenizer based on Splitta.}

  s.specification_version = 3 if s.respond_to? :specification_version
end
metadata ADDED
@@ -0,0 +1,102 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: tactful_tokenizer
3
+ version: !ruby/object:Gem::Version
4
+ prerelease: false
5
+ segments:
6
+ - 0
7
+ - 0
8
+ - 1
9
+ version: 0.0.1
10
+ platform: ruby
11
+ authors:
12
+ - Matthew Bunday
13
+ autorequire:
14
+ bindir: bin
15
+ cert_chain:
16
+ - |
17
+ -----BEGIN CERTIFICATE-----
18
+ MIIDMjCCAhqgAwIBAgIBADANBgkqhkiG9w0BAQUFADA/MREwDwYDVQQDDAhta2J1
19
+ bmRheTEVMBMGCgmSJomT8ixkARkWBWdtYWlsMRMwEQYKCZImiZPyLGQBGRYDY29t
20
+ MB4XDTEwMDMyMzE2MDkzOVoXDTExMDMyMzE2MDkzOVowPzERMA8GA1UEAwwIbWti
21
+ dW5kYXkxFTATBgoJkiaJk/IsZAEZFgVnbWFpbDETMBEGCgmSJomT8ixkARkWA2Nv
22
+ bTCCASIwDQYJKoZIhvcNAQEBBQADggEPADCCAQoCggEBAMk5+Wsur5ptIGUthPBG
23
+ VHECPqlV7TRgxiEMbH8vxkMVNnqFGDTezd9zsmqfX9kKR4/Jmu1fXKyBswGRxYxD
24
+ qx8nR+DCnWk0gfx2jjpnknPPWTQ6lHiZaPrGb+QuANhebPTwI6cDIz4A3dg2QIRo
25
+ ETdiAdOspNudUHu2Jf/QeNQPr5SURy9vGnSXkDhMcrnR3EjkRAP4suNIlHBNj3Hz
26
+ 7hYjZV5QzeFwVENR5K3zFSkbC3ZK6uZTUwPVngmCqWz3MLsNJiQhAhvn/XQ8OCJ3
27
+ Q8O/nPuIIqFNeT3TMvnfrbx+wyxX6FIBZ12M4lNmU6yoXxzmi/n/cBNLAkQ/hc2g
28
+ n68CAwEAAaM5MDcwCQYDVR0TBAIwADAdBgNVHQ4EFgQUZfQL/a3SzQ017Zj9MUwh
29
+ Y6BtLUgwCwYDVR0PBAQDAgSwMA0GCSqGSIb3DQEBBQUAA4IBAQAjdEGkZbV7tkOq
30
+ N0y3yL5n1JOMsVHsQF7/w2zeET3PyUgKmmobdq3V0rztqVcJ1oP/+fYUO1KYxC90
31
+ b8FOCGGvcKjMn1QJufFp1DTfiGFcz6nHRWmiAMRXbempzA5NDzocQP9jaRkoYEzK
32
+ pwsJwe0dlpJXs8/fqqljNdBe4AToDGLcbzdMmpGxZN63P70yAFL5G7sJy1Izp5ei
33
+ CvIRDtL1PdU1ESVLFJuoCAiCtpBfwwepv4kuuoca9Ykd5ldPCGzMq0n8+KIubb+2
34
+ xz7fp33atnZoMajdCOYKqwo2xVhUuFPZzBFZ3L6T6YLuEVGKHNyUAfcfr+8VSuB5
35
+ 3+l7cSZt
36
+ -----END CERTIFICATE-----
37
+
38
+ date: 2010-03-23 00:00:00 -05:00
39
+ default_executable:
40
+ dependencies: []
41
+
42
+ description: A high accuracy naive bayesian sentence tokenizer based on Splitta.
43
+ email: mkbunday @nospam@ gmail.com
44
+ executables: []
45
+
46
+ extensions: []
47
+
48
+ extra_rdoc_files:
49
+ - README.rdoc
50
+ - lib/models/features.mar
51
+ - lib/models/lower_words.mar
52
+ - lib/models/non_abbrs.mar
53
+ - lib/tactful_tokenizer.rb
54
+ - lib/word_tokenizer.rb
55
+ files:
56
+ - Manifest
57
+ - README.rdoc
58
+ - Rakefile
59
+ - lib/models/features.mar
60
+ - lib/models/lower_words.mar
61
+ - lib/models/non_abbrs.mar
62
+ - lib/tactful_tokenizer.rb
63
+ - lib/word_tokenizer.rb
64
+ - tactful_tokenizer.gemspec
65
+ has_rdoc: true
66
+ homepage: http://github.com/SlyShy/Tactful_Tokenizer
67
+ licenses: []
68
+
69
+ post_install_message:
70
+ rdoc_options:
71
+ - --line-numbers
72
+ - --inline-source
73
+ - --title
74
+ - Tactful_tokenizer
75
+ - --main
76
+ - README.rdoc
77
+ require_paths:
78
+ - lib
79
+ required_ruby_version: !ruby/object:Gem::Requirement
80
+ requirements:
81
+ - - ">="
82
+ - !ruby/object:Gem::Version
83
+ segments:
84
+ - 0
85
+ version: "0"
86
+ required_rubygems_version: !ruby/object:Gem::Requirement
87
+ requirements:
88
+ - - ">="
89
+ - !ruby/object:Gem::Version
90
+ segments:
91
+ - 1
92
+ - 2
93
+ version: "1.2"
94
+ requirements: []
95
+
96
+ rubyforge_project: tactful_tokenizer
97
+ rubygems_version: 1.3.6
98
+ signing_key:
99
+ specification_version: 3
100
+ summary: A high accuracy naive bayesian sentence tokenizer based on Splitta.
101
+ test_files: []
102
+
metadata.gz.sig ADDED
Binary file