tactful_tokenizer 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data.tar.gz.sig ADDED
@@ -0,0 +1,4 @@
1
+ 1���8����&�֭�D����0�SLI�x�r1e��(|?;j��_y���r���y���@[��z�-��<)�� �zt�����K�<,�����u��Xxᯱ(�V�����!��J�,�
2
+ b����N�!y�B�*{z��=��Jh���D�Z��O������dI�m�F/
3
+ �p�YD~�4h�i�vV*y�!��QR���8ӕ�g���Ό��ʧ�1Hߚ�F
4
+ q��|���� v7G"xE=��)�#��.��
data/Manifest ADDED
@@ -0,0 +1,8 @@
1
+ Manifest
2
+ README.rdoc
3
+ Rakefile
4
+ lib/models/features.mar
5
+ lib/models/lower_words.mar
6
+ lib/models/non_abbrs.mar
7
+ lib/tactful_tokenizer.rb
8
+ lib/word_tokenizer.rb
data/README.rdoc ADDED
@@ -0,0 +1,26 @@
1
+ = TactfulTokenizer
2
+
3
+ TactfulTokenizer is a Ruby library for high quality sentence
4
+ tokenization. It uses a Naive Bayesian statistical model, and
5
+ is based on Splitta[http://code.google.com/p/splitta/], but
6
+ has support for '?' and '!' as well as primitive handling of
7
+ XHTML markup. Better support for XHTML parsing is coming shortly.
8
+
9
+ == Usage
10
+
11
+ require "tactful_tokenizer"
12
+ m = TactfulTokenizer::Model.new
13
+ m.tokenize_text("Here in the U.S. Senate we prefer to eat our friends. Is it easier that way? <em>Yes.</em> <em>Maybe</em>!")
14
+ #=> ["Here in the U.S. Senate we prefer to eat our friends.", "Is it easier that way?", "<em>Yes.</em>", "<em>Maybe</em>!"]
15
+
16
+ The input text is expected to consist of paragraphs delimited
17
+ by line breaks.
18
+
19
+ == Installation
20
+ git clone http://github.com/SlyShy/Tactful_Tokenizer.git
21
+ gem install andand
22
+
23
+ == Author
24
+
25
+ Copyright (c) 2010 Matthew Bunday. All rights reserved.
26
+ Released under the {GNU GPL v3}[http://www.gnu.org/licenses/gpl.html].
data/Rakefile ADDED
@@ -0,0 +1,12 @@
1
# Rakefile for building and releasing the gem via the echoe gem.
require 'rubygems'
require 'rake'
require 'echoe'

# Echoe generates gem packaging and release tasks from the metadata below.
Echoe.new('tactful_tokenizer', '0.0.1') do |p|
  p.description = "A high accuracy naive bayesian sentence tokenizer based on Splitta."
  p.url = "http://github.com/SlyShy/Tactful_Tokenizer"
  p.author = "Matthew Bunday"
  # Address intentionally obfuscated with "@nospam@".
  p.email = "mkbunday @nospam@ gmail.com"
  # Files matching these globs are excluded from the generated manifest.
  p.ignore_pattern = ["tmp/*", "script/*"]
  p.development_dependencies = []
end
Binary file
Binary file
Binary file
@@ -0,0 +1,210 @@
1
+ # TactfulTokenizer is a Ruby library for high quality sentence
2
+ # tokenization. It uses a Naive Bayesian statistical model, and
3
+ # is based on Splitta[http://code.google.com/p/splitta/]. But
4
+ # has support for '?' and '!' as well as primitive handling of
5
+ # XHTML markup. Better support for XHTML parsing is coming shortly.
6
+ #
7
+ # Example usage:
8
+ #
9
+ # require "tactful_tokenizer"
10
+ # m = TactfulTokenizer::Model.new
11
+ # m.tokenize_text("Here in the U.S. Senate we prefer to eat our friends. Is it easier that way, really? Yes.")
12
+ # #=> ["Here in the U.S. Senate we prefer to eat our friends.", "Is it easier that way, really?", "Yes."]
13
+ #
14
+ # The input text is expected to consist of paragraphs delimited
15
+ # by line breaks.
16
+ #
17
+ # Author:: Matthew Bunday (mailto:mkbunday@gmail.com)
18
+ # License:: GNU General Public License v3
19
+
20
+ require "word_tokenizer.rb"
21
+ include WordTokenizer
22
+
23
+ #--
24
+ ####### Performance TODOs.
25
+ # TODO: Use inline C where necessary?
26
+ # TODO: Use RE2 regexp extension.
27
+ #++
28
+
29
+ module TactfulTokenizer
30
+
31
+ # Basic String extensions.
32
+ String.class_eval do
33
+
34
+ # Simple regex to check if a string is alphabetic.
35
+ def is_alphabetic?
36
+ return !/[^[:alpha:]]/.match(self)
37
+ end
38
+
39
+ # Check for upper case.
40
+ # Surprisingly, this is faster than a regex in benchmarks.
41
+ # Using the trinary operator is faster than to_s
42
+ def is_upper_case?
43
+ self == self.upcase ? 'true' : 'false'
44
+ end
45
+ end
46
+
47
+ # A model stores normalized probabilities of different features occuring.
48
+ class Model
49
+
50
+ # Initialize the model. feats, lower_words, and non_abbrs
51
+ # indicate the locations of the respective Marshal dumps.
52
+ def initialize(feats="#{File.dirname(__FILE__)}/models/features.mar", lower_words="#{File.dirname(__FILE__)}/models/lower_words.mar", non_abbrs="#{File.dirname(__FILE__)}/models/non_abbrs.mar")
53
+ @feats, @lower_words, @non_abbrs = [feats, lower_words, non_abbrs].map do |file|
54
+ File.open(file) do |f|
55
+ Marshal.load(f.read)
56
+ end
57
+ end
58
+ @p0 = @feats["<prior>"] ** 4
59
+ end
60
+
61
+ # feats = {feature => normalized probability of feature}
62
+ # lower_words = {token => log count of occurences in lower case}
63
+ # non_abbrs = {token => log count of occurences when not an abbrv.}
64
+ attr_accessor :feats, :lower_words, :non_abbrs
65
+
66
+ # This function is the only one that'll end up being used.
67
+ # m = TactfulTokenizer::Model.new
68
+ # m.tokenize_text("Hey, are these two sentences? I bet they should be.")
69
+ # => ["Hey, are these two sentences?", "I bet they should be."]
70
+ def tokenize_text(text)
71
+ data = Doc.new(text)
72
+ featurize(data)
73
+ classify(data)
74
+ return data.segment
75
+ end
76
+
77
+ # Assign a prediction (probability, to be precise) to each sentence fragment.
78
+ # For each feature in each fragment we hunt up the normalized probability and
79
+ # multiply. This is a fairly straightforward Bayesian probabilistic algorithm.
80
+ def classify(doc)
81
+ frag = nil
82
+ probs = 1
83
+ feat = ''
84
+ doc.frags.each do |frag|
85
+ probs = @p0
86
+ frag.features.each do |feat|
87
+ probs *= @feats[feat]
88
+ end
89
+ frag.pred = probs / (probs + 1)
90
+ end
91
+ end
92
+
93
+ # Get the features of every fragment.
94
+ def featurize(doc)
95
+ frag = nil
96
+ doc.frags.each do |frag|
97
+ get_features(frag, self)
98
+ end
99
+ end
100
+
101
+ # Finds the features in a text fragment of the form:
102
+ # ... w1. (sb?) w2 ...
103
+ # Features listed in rough order of importance:
104
+ # * w1: a word that includes a period.
105
+ # * w2: the next word, if it exists.
106
+ # * w1length: the number of alphabetic characters in w1.
107
+ # * both: w1 and w2 taken together.
108
+ # * w1abbr: logarithmic count of w1 occuring without a period.
109
+ # * w2lower: logarithmiccount of w2 occuring lowercased.
110
+ def get_features(frag, model)
111
+ w1 = (frag.cleaned.last or '')
112
+ w2 = (frag.next or '')
113
+
114
+ frag.features = ["w1_#{w1}", "w2_#{w2}", "both_#{w1}_#{w2}"]
115
+
116
+ if not w2.empty?
117
+ if w1.chop.is_alphabetic?
118
+ frag.features.push "w1length_#{[10, w1.length].min}"
119
+ frag.features.push "w1abbr_#{model.non_abbrs[w1.chop]}"
120
+ end
121
+
122
+ if w2.chop.is_alphabetic?
123
+ frag.features.push "w2cap_#{w2[0].is_upper_case?}"
124
+ frag.features.push "w2lower_#{model.lower_words[w2.downcase]}"
125
+ end
126
+ end
127
+ end
128
+ end
129
+
130
+ # A document represents the input text. It holds a list of fragments generated
131
+ # from the text.
132
+ class Doc
133
+ # List of fragments.
134
+ attr_accessor :frags
135
+
136
+ # Receives a text, which is then broken into fragments.
137
+ # A fragment ends with a period, quesetion mark, or exclamation mark followed
138
+ # possibly by right handed punctuation like quotation marks or closing braces
139
+ # and trailing whitespace. Failing that, it'll accept something like "I hate cheese\n"
140
+ # No, it doesn't have a period, but that's the end of paragraph.
141
+ #
142
+ # Input assumption: Paragraphs delimited by line breaks.
143
+ def initialize(text)
144
+ @frags = []
145
+ res = nil
146
+ puts "Hey!"
147
+ puts text.inspect
148
+ text.each_line do |line|
149
+ unless line.strip.empty?
150
+ line.split(/(.*?[.!?](?:["')\]}]|(?:<.*>))*[\s])/).each do |res|
151
+ unless res.strip.empty?
152
+ frag = Frag.new(res)
153
+ @frags.last.next = frag.cleaned.first unless @frags.empty?
154
+ @frags.push frag
155
+ end
156
+ end
157
+ end
158
+ end
159
+ end
160
+
161
+ # Segments the text. More precisely, it reassembles the fragments into sentences.
162
+ # We call something a sentence whenever it is more likely to be a sentence than not.
163
+ def segment
164
+ sents, sent = [], []
165
+ thresh = 0.5
166
+
167
+ frag = nil
168
+ @frags.each do |frag|
169
+ sent.push(frag.orig)
170
+ if frag.pred > thresh
171
+ break if frag.orig.nil?
172
+ sents.push(sent.join('').strip)
173
+ sent = []
174
+ end
175
+ end
176
+ sents
177
+ end
178
+ end
179
+
180
+ # A fragment is a potential sentence, but is based only on the existence of a period.
181
+ # The text "Here in the U.S. Senate we prefer to devour our friends." will be split
182
+ # into "Here in the U.S." and "Senate we prefer to devour our friends."
183
+ class Frag
184
+
185
+ # orig = The original text of the fragment.
186
+ # next = The next word following the fragment.
187
+ # cleaned = Array of the fragment's words after cleaning.
188
+ # pred = Probability that the fragment is a sentence.
189
+ # features = Array of the fragment's features.
190
+ attr_accessor :orig, :next, :cleaned, :pred, :features
191
+
192
+ # Create a new fragment.
193
+ def initialize(orig='')
194
+ @orig = orig
195
+ clean(orig)
196
+ @next, @pred, @features = nil, nil, nil
197
+ end
198
+
199
+ # Normalizes numbers and discards ambiguous punctuation. And then splits into an
200
+ # array, because realistically only the last and first words are ever accessed.
201
+ def clean(s)
202
+ @cleaned = String.new(s)
203
+ tokenize(@cleaned)
204
+ @cleaned.gsub!(/[.,\d]*\d/, '<NUM>')
205
+ @cleaned.gsub!(/[^a-zA-Z0-9,.;:<>\-'\/$% ]/, '')
206
+ @cleaned.gsub!('--', ' ')
207
+ @cleaned = @cleaned.split
208
+ end
209
+ end
210
+ end
@@ -0,0 +1,51 @@
1
# Splits punctuation away from words, Penn-Treebank style, as a
# preprocessing step before sentence-boundary feature extraction.
module WordTokenizer
  # Ordered list of [pattern, replacement] rewrite rules applied in
  # sequence by #tokenize. Order matters: later rules assume earlier
  # ones have already run.
  @@tokenize_regexps = [
    # Uniform Quotes
    [/''|``/, '"'],

    # Separate punctuation (except for periods) from words.
    [/(^|\s)(')/, '\1\2'],
    [/(?=[\("`{\[:;&#*@])(.)/, '\1 '],

    [/(.)(?=[?!\)";}\]*:@'])|(?=[\)}\]])(.)|(.)(?=[({\[])|((^|\s)-)(?=[^-])/, '\1 '],

    # Treat double-hyphen as a single token.
    [/([^-])(--+)([^-])/, '\1 \2 \3'],
    [/(\s|^)(,)(?=(\S))/, '\1\2 '],

    # Only separate a comma if a space follows.
    [/(.)(,)(\s|$)/, '\1 \2\3'],

    # Combine dots separated by whitespace to be a single token.
    [/\.\s\.\s\./, '...'],

    # Separate "No.6"
    [/([a-zA-Z]\.)(\d+)/, '\1 \2'],

    # Separate words from ellipses
    [/([^\.]|^)(\.{2,})(.?)/, '\1 \2 \3'],
    [/(^|\s)(\.{2,})([^\.\s])/, '\1\2 \3'],
    [/(^|\s)(\.{2,})([^\.\s])/, '\1 \2\3'],

    ##### Some additional fixes.

    # Fix %, $, &
    [/(\d)%/, '\1 %'],
    [/\$(\.?\d)/, '$ \1'],
    [/(\w)& (\w)/, '\1&\2'],
    [/(\w\w+)&(\w\w+)/, '\1 & \2'],

    # Fix (n 't) -> ( n't)
    [/n 't( |$)/, " n't\\1"],
    [/N 'T( |$)/, " N'T\\1"],

    # Treebank tokenizer special words
    [/([Cc])annot/, '\1an not']
  ]

  # Applies every rewrite rule to +s+ IN PLACE (the argument is
  # mutated via gsub!); callers rely on the mutation, not on the
  # return value.
  def tokenize(s)
    @@tokenize_regexps.each { |pattern, replacement| s.gsub!(pattern, replacement) }
  end
end
@@ -0,0 +1,32 @@
1
# -*- encoding: utf-8 -*-

# Gem specification for tactful_tokenizer (originally generated by echoe).
Gem::Specification.new do |s|
  s.name = %q{tactful_tokenizer}
  s.version = "0.0.1"

  s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
  s.authors = ["Matthew Bunday"]
  s.cert_chain = ["/home/slyshy/.ssh/gem-public_cert.pem"]
  s.date = %q{2010-03-23}
  s.description = %q{A high accuracy naive bayesian sentence tokenizer based on Splitta.}
  s.email = %q{mkbunday @nospam@ gmail.com}
  s.extra_rdoc_files = ["README.rdoc", "lib/models/features.mar", "lib/models/lower_words.mar", "lib/models/non_abbrs.mar", "lib/tactful_tokenizer.rb", "lib/word_tokenizer.rb"]
  s.files = ["Manifest", "README.rdoc", "Rakefile", "lib/models/features.mar", "lib/models/lower_words.mar", "lib/models/non_abbrs.mar", "lib/tactful_tokenizer.rb", "lib/word_tokenizer.rb", "tactful_tokenizer.gemspec"]
  s.homepage = %q{http://github.com/SlyShy/Tactful_Tokenizer}
  s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Tactful_tokenizer", "--main", "README.rdoc"]
  s.require_paths = ["lib"]
  s.rubyforge_project = %q{tactful_tokenizer}
  s.rubygems_version = %q{1.3.6}
  s.signing_key = %q{/home/slyshy/.ssh/gem-private_key.pem}
  s.summary = %q{A high accuracy naive bayesian sentence tokenizer based on Splitta.}

  # The generated scaffolding here held empty if/else branches, an unused
  # local, and a reference to the deprecated Gem::RubyGemsVersion constant
  # (a NameError on modern RubyGems); collapsed to the single meaningful
  # assignment.
  s.specification_version = 3 if s.respond_to? :specification_version
end
metadata ADDED
@@ -0,0 +1,102 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: tactful_tokenizer
3
+ version: !ruby/object:Gem::Version
4
+ prerelease: false
5
+ segments:
6
+ - 0
7
+ - 0
8
+ - 1
9
+ version: 0.0.1
10
+ platform: ruby
11
+ authors:
12
+ - Matthew Bunday
13
+ autorequire:
14
+ bindir: bin
15
+ cert_chain:
16
+ - |
17
+ -----BEGIN CERTIFICATE-----
18
+ MIIDMjCCAhqgAwIBAgIBADANBgkqhkiG9w0BAQUFADA/MREwDwYDVQQDDAhta2J1
19
+ bmRheTEVMBMGCgmSJomT8ixkARkWBWdtYWlsMRMwEQYKCZImiZPyLGQBGRYDY29t
20
+ MB4XDTEwMDMyMzE2MDkzOVoXDTExMDMyMzE2MDkzOVowPzERMA8GA1UEAwwIbWti
21
+ dW5kYXkxFTATBgoJkiaJk/IsZAEZFgVnbWFpbDETMBEGCgmSJomT8ixkARkWA2Nv
22
+ bTCCASIwDQYJKoZIhvcNAQEBBQADggEPADCCAQoCggEBAMk5+Wsur5ptIGUthPBG
23
+ VHECPqlV7TRgxiEMbH8vxkMVNnqFGDTezd9zsmqfX9kKR4/Jmu1fXKyBswGRxYxD
24
+ qx8nR+DCnWk0gfx2jjpnknPPWTQ6lHiZaPrGb+QuANhebPTwI6cDIz4A3dg2QIRo
25
+ ETdiAdOspNudUHu2Jf/QeNQPr5SURy9vGnSXkDhMcrnR3EjkRAP4suNIlHBNj3Hz
26
+ 7hYjZV5QzeFwVENR5K3zFSkbC3ZK6uZTUwPVngmCqWz3MLsNJiQhAhvn/XQ8OCJ3
27
+ Q8O/nPuIIqFNeT3TMvnfrbx+wyxX6FIBZ12M4lNmU6yoXxzmi/n/cBNLAkQ/hc2g
28
+ n68CAwEAAaM5MDcwCQYDVR0TBAIwADAdBgNVHQ4EFgQUZfQL/a3SzQ017Zj9MUwh
29
+ Y6BtLUgwCwYDVR0PBAQDAgSwMA0GCSqGSIb3DQEBBQUAA4IBAQAjdEGkZbV7tkOq
30
+ N0y3yL5n1JOMsVHsQF7/w2zeET3PyUgKmmobdq3V0rztqVcJ1oP/+fYUO1KYxC90
31
+ b8FOCGGvcKjMn1QJufFp1DTfiGFcz6nHRWmiAMRXbempzA5NDzocQP9jaRkoYEzK
32
+ pwsJwe0dlpJXs8/fqqljNdBe4AToDGLcbzdMmpGxZN63P70yAFL5G7sJy1Izp5ei
33
+ CvIRDtL1PdU1ESVLFJuoCAiCtpBfwwepv4kuuoca9Ykd5ldPCGzMq0n8+KIubb+2
34
+ xz7fp33atnZoMajdCOYKqwo2xVhUuFPZzBFZ3L6T6YLuEVGKHNyUAfcfr+8VSuB5
35
+ 3+l7cSZt
36
+ -----END CERTIFICATE-----
37
+
38
+ date: 2010-03-23 00:00:00 -05:00
39
+ default_executable:
40
+ dependencies: []
41
+
42
+ description: A high accuracy naive bayesian sentence tokenizer based on Splitta.
43
+ email: mkbunday @nospam@ gmail.com
44
+ executables: []
45
+
46
+ extensions: []
47
+
48
+ extra_rdoc_files:
49
+ - README.rdoc
50
+ - lib/models/features.mar
51
+ - lib/models/lower_words.mar
52
+ - lib/models/non_abbrs.mar
53
+ - lib/tactful_tokenizer.rb
54
+ - lib/word_tokenizer.rb
55
+ files:
56
+ - Manifest
57
+ - README.rdoc
58
+ - Rakefile
59
+ - lib/models/features.mar
60
+ - lib/models/lower_words.mar
61
+ - lib/models/non_abbrs.mar
62
+ - lib/tactful_tokenizer.rb
63
+ - lib/word_tokenizer.rb
64
+ - tactful_tokenizer.gemspec
65
+ has_rdoc: true
66
+ homepage: http://github.com/SlyShy/Tactful_Tokenizer
67
+ licenses: []
68
+
69
+ post_install_message:
70
+ rdoc_options:
71
+ - --line-numbers
72
+ - --inline-source
73
+ - --title
74
+ - Tactful_tokenizer
75
+ - --main
76
+ - README.rdoc
77
+ require_paths:
78
+ - lib
79
+ required_ruby_version: !ruby/object:Gem::Requirement
80
+ requirements:
81
+ - - ">="
82
+ - !ruby/object:Gem::Version
83
+ segments:
84
+ - 0
85
+ version: "0"
86
+ required_rubygems_version: !ruby/object:Gem::Requirement
87
+ requirements:
88
+ - - ">="
89
+ - !ruby/object:Gem::Version
90
+ segments:
91
+ - 1
92
+ - 2
93
+ version: "1.2"
94
+ requirements: []
95
+
96
+ rubyforge_project: tactful_tokenizer
97
+ rubygems_version: 1.3.6
98
+ signing_key:
99
+ specification_version: 3
100
+ summary: A high accuracy naive bayesian sentence tokenizer based on Splitta.
101
+ test_files: []
102
+
metadata.gz.sig ADDED
Binary file