tactful_tokenizer 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data.tar.gz.sig +4 -0
- data/Manifest +8 -0
- data/README.rdoc +26 -0
- data/Rakefile +12 -0
- data/lib/models/features.mar +0 -0
- data/lib/models/lower_words.mar +0 -0
- data/lib/models/non_abbrs.mar +0 -0
- data/lib/tactful_tokenizer.rb +210 -0
- data/lib/word_tokenizer.rb +51 -0
- data/tactful_tokenizer.gemspec +32 -0
- metadata +102 -0
- metadata.gz.sig +0 -0
data.tar.gz.sig
ADDED
data/Manifest
ADDED
data/README.rdoc
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
= TactfulTokenizer
|
2
|
+
|
3
|
+
TactfulTokenizer is a Ruby library for high quality sentence
|
4
|
+
tokenization. It uses a Naive Bayesian statistical model, and
|
5
|
+
is based on Splitta[http://code.google.com/p/splitta/], but
|
6
|
+
has support for '?' and '!' as well as primitive handling of
|
7
|
+
XHTML markup. Better support for XHTML parsing is coming shortly.
|
8
|
+
|
9
|
+
== Usage
|
10
|
+
|
11
|
+
require "tactful_tokenizer"
|
12
|
+
m = TactfulTokenizer::Model.new
|
13
|
+
m.tokenize_text("Here in the U.S. Senate we prefer to eat our friends. Is it easier that way? <em>Yes.</em> <em>Maybe</em>!")
|
14
|
+
#=> ["Here in the U.S. Senate we prefer to eat our friends.", "Is it easier that way?", "<em>Yes.</em>", "<em>Maybe</em>!"]
|
15
|
+
|
16
|
+
The input text is expected to consist of paragraphs delimited
|
17
|
+
by line breaks.
|
18
|
+
|
19
|
+
== Installation
|
20
|
+
git clone http://github.com/SlyShy/Tactful_Tokenizer.git
|
21
|
+
gem install andand
|
22
|
+
|
23
|
+
== Author
|
24
|
+
|
25
|
+
Copyright (c) 2010 Matthew Bunday. All rights reserved.
|
26
|
+
Released under the {GNU GPL v3}[http://www.gnu.org/licenses/gpl.html].
|
data/Rakefile
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
# Rakefile: packaging tasks generated via the echoe gem.
require 'rubygems'
require 'rake'
require 'echoe'

# Declaring the gem here lets echoe derive the rake tasks
# (build, package, release, ...) from a single specification.
Echoe.new('tactful_tokenizer', '0.0.1') do |p|
  p.description              = "A high accuracy naive bayesian sentence tokenizer based on Splitta."
  p.url                      = "http://github.com/SlyShy/Tactful_Tokenizer"
  p.author                   = "Matthew Bunday"
  p.email                    = "mkbunday @nospam@ gmail.com"
  p.ignore_pattern           = ["tmp/*", "script/*"]
  p.development_dependencies = []
end
|
Binary file
|
Binary file
|
Binary file
|
@@ -0,0 +1,210 @@
|
|
1
|
+
# TactfulTokenizer is a Ruby library for high quality sentence
|
2
|
+
# tokenization. It uses a Naive Bayesian statistical model, and
|
3
|
+
# is based on Splitta[http://code.google.com/p/splitta/]. But
|
4
|
+
# has support for '?' and '!' as well as primitive handling of
|
5
|
+
# XHTML markup. Better support for XHTML parsing is coming shortly.
|
6
|
+
#
|
7
|
+
# Example usage:
|
8
|
+
#
|
9
|
+
# require "tactful_tokenizer"
|
10
|
+
# m = TactfulTokenizer::Model.new
|
11
|
+
# m.tokenize_text("Here in the U.S. Senate we prefer to eat our friends. Is it easier that way, really? Yes.")
|
12
|
+
# #=> ["Here in the U.S. Senate we prefer to eat our friends.", "Is it easier that way, really?", "Yes."]
|
13
|
+
#
|
14
|
+
# The input text is expected to consist of paragraphs delimited
|
15
|
+
# by line breaks.
|
16
|
+
#
|
17
|
+
# Author:: Matthew Bunday (mailto:mkbunday@gmail.com)
|
18
|
+
# License:: GNU General Public License v3
|
19
|
+
|
20
|
+
require "word_tokenizer.rb"
|
21
|
+
include WordTokenizer
|
22
|
+
|
23
|
+
#--
|
24
|
+
####### Performance TODOs.
|
25
|
+
# TODO: Use inline C where necessary?
|
26
|
+
# TODO: Use RE2 regexp extension.
|
27
|
+
#++
|
28
|
+
|
29
|
+
module TactfulTokenizer
|
30
|
+
|
31
|
+
# Basic String extensions.
|
32
|
+
# Core String extensions used by the tokenizer's feature extraction.
String.class_eval do
  # True when the string contains no non-alphabetic characters.
  # Note: an empty string vacuously counts as alphabetic.
  def is_alphabetic?
    /[^[:alpha:]]/.match(self).nil?
  end

  # Case check that returns the literal strings 'true'/'false'; the result
  # is interpolated directly into feature names, so a string is what callers
  # consume. Benchmarked faster than a regex for this check.
  def is_upper_case?
    if self == upcase
      'true'
    else
      'false'
    end
  end
end
|
46
|
+
|
47
|
+
# A model stores normalized probabilities of different features occuring.
|
48
|
+
# A model stores normalized probabilities of different features occurring.
class Model
  # Initialize the model. feats, lower_words, and non_abbrs
  # indicate the locations of the respective Marshal dumps.
  def initialize(feats="#{File.dirname(__FILE__)}/models/features.mar", lower_words="#{File.dirname(__FILE__)}/models/lower_words.mar", non_abbrs="#{File.dirname(__FILE__)}/models/non_abbrs.mar")
    @feats, @lower_words, @non_abbrs = [feats, lower_words, non_abbrs].map do |file|
      File.open(file) do |f|
        Marshal.load(f.read)
      end
    end
    # Prior probability of a boundary, raised to the 4th power
    # (matches the original Splitta formulation).
    @p0 = @feats["<prior>"] ** 4
  end

  # feats = {feature => normalized probability of feature}
  # lower_words = {token => log count of occurrences in lower case}
  # non_abbrs = {token => log count of occurrences when not an abbreviation}
  attr_accessor :feats, :lower_words, :non_abbrs

  # The main entry point.
  #   m = TactfulTokenizer::Model.new
  #   m.tokenize_text("Hey, are these two sentences? I bet they should be.")
  #   #=> ["Hey, are these two sentences?", "I bet they should be."]
  def tokenize_text(text)
    data = Doc.new(text)
    featurize(data)
    classify(data)
    data.segment
  end

  # Assign a prediction (probability, to be precise) to each sentence fragment.
  # For each feature in each fragment we look up the normalized probability and
  # multiply. This is a straightforward naive Bayesian algorithm.
  def classify(doc)
    doc.frags.each do |frag|
      probs = @p0
      frag.features.each do |feat|
        # Unseen features fall back to the neutral factor 1; a plain Hash
        # lookup returned nil here and raised NoMethodError on `*`.
        probs *= @feats.fetch(feat, 1)
      end
      frag.pred = probs / (probs + 1)
    end
  end

  # Compute the features of every fragment in the document.
  def featurize(doc)
    doc.frags.each do |frag|
      get_features(frag, self)
    end
  end

  # Finds the features in a text fragment of the form:
  #   ... w1. (sb?) w2 ...
  # Features listed in rough order of importance:
  # * w1: a word that includes a period.
  # * w2: the next word, if it exists.
  # * w1length: the number of alphabetic characters in w1.
  # * both: w1 and w2 taken together.
  # * w1abbr: logarithmic count of w1 occurring without a period.
  # * w2lower: logarithmic count of w2 occurring lowercased.
  def get_features(frag, model)
    w1 = frag.cleaned.last || ''
    w2 = frag.next || ''

    frag.features = ["w1_#{w1}", "w2_#{w2}", "both_#{w1}_#{w2}"]

    unless w2.empty?
      if w1.chop.is_alphabetic?
        frag.features.push "w1length_#{[10, w1.length].min}"
        frag.features.push "w1abbr_#{model.non_abbrs[w1.chop]}"
      end

      if w2.chop.is_alphabetic?
        frag.features.push "w2cap_#{w2[0].is_upper_case?}"
        frag.features.push "w2lower_#{model.lower_words[w2.downcase]}"
      end
    end
  end
end
|
129
|
+
|
130
|
+
# A document represents the input text. It holds a list of fragments generated
|
131
|
+
# from the text.
|
132
|
+
# A document represents the input text. It holds the list of fragments
# generated from the text.
class Doc
  # List of fragments.
  attr_accessor :frags

  # Receives a text, which is then broken into fragments.
  # A fragment ends with a period, question mark, or exclamation mark followed
  # possibly by right-handed punctuation like quotation marks or closing braces
  # and trailing whitespace. Failing that, it'll accept something like
  # "I hate cheese\n" -- no trailing period, but it is the end of a paragraph.
  #
  # Input assumption: Paragraphs delimited by line breaks.
  #
  # NOTE: leftover debug output (puts "Hey!" / puts text.inspect) removed.
  def initialize(text)
    @frags = []
    text.each_line do |line|
      next if line.strip.empty?
      # The capture group makes split keep the matched sentence candidates.
      line.split(/(.*?[.!?](?:["')\]}]|(?:<.*>))*[\s])/).each do |piece|
        next if piece.strip.empty?
        frag = Frag.new(piece)
        # Tell the previous fragment which word follows it (the w2 feature).
        @frags.last.next = frag.cleaned.first unless @frags.empty?
        @frags.push frag
      end
    end
  end

  # Segments the text. More precisely, it reassembles the fragments into
  # sentences. We call something a sentence whenever it is more likely to be
  # a sentence than not (pred > 0.5).
  def segment
    sents, sent = [], []
    thresh = 0.5

    @frags.each do |frag|
      sent.push(frag.orig)
      if frag.pred > thresh
        break if frag.orig.nil?
        sents.push(sent.join('').strip)
        sent = []
      end
    end
    sents
  end
end
|
179
|
+
|
180
|
+
# A fragment is a potential sentence, but is based only on the existence of a period.
|
181
|
+
# The text "Here in the U.S. Senate we prefer to devour our friends." will be split
|
182
|
+
# into "Here in the U.S." and "Senate we prefer to devour our friends."
|
183
|
+
# A fragment is a potential sentence, split purely on the existence of a
# period. The text "Here in the U.S. Senate we prefer to devour our friends."
# becomes "Here in the U.S." and "Senate we prefer to devour our friends."
class Frag
  # orig     = the original text of the fragment.
  # next     = the word that follows the fragment, if any.
  # cleaned  = array of the fragment's words after cleaning.
  # pred     = probability that the fragment is a sentence.
  # features = array of the fragment's features.
  attr_accessor :orig, :next, :cleaned, :pred, :features

  # Build a fragment around its original text and pre-compute the cleaned
  # word list. next/pred/features are filled in later by Doc and Model.
  def initialize(orig = '')
    @orig = orig
    @next = nil
    @pred = nil
    @features = nil
    clean(orig)
  end

  # Normalizes numbers and discards ambiguous punctuation, then splits into
  # an array of words -- realistically only the first and last words are
  # ever accessed.
  def clean(text)
    working = String.new(text)
    tokenize(working)
    working.gsub!(/[.,\d]*\d/, '<NUM>')             # collapse numbers
    working.gsub!(/[^a-zA-Z0-9,.;:<>\-'\/$% ]/, '') # drop ambiguous punctuation
    working.gsub!('--', ' ')
    @cleaned = working.split
  end
end
|
210
|
+
end
|
@@ -0,0 +1,51 @@
|
|
1
|
+
# Word-level tokenization rules applied to a fragment before feature
# extraction. Each entry is a [pattern, replacement] pair; the rules run in
# order and mutate the string in place.
module WordTokenizer
  @@tokenize_regexps = [
    # Uniform quotes.
    [/''|``/, '"'],

    # Separate punctuation (except for periods) from words.
    [/(^|\s)(')/, '\1\2'],
    [/(?=[\("`{\[:;&#*@])(.)/, '\1 '],
    [/(.)(?=[?!\)";}\]*:@'])|(?=[\)}\]])(.)|(.)(?=[({\[])|((^|\s)-)(?=[^-])/, '\1 '],

    # Treat double-hyphen as a single token.
    [/([^-])(--+)([^-])/, '\1 \2 \3'],
    [/(\s|^)(,)(?=(\S))/, '\1\2 '],

    # Only separate a comma if a space follows.
    [/(.)(,)(\s|$)/, '\1 \2\3'],

    # Combine dots separated by whitespace into a single token.
    [/\.\s\.\s\./, '...'],

    # Separate "No.6" into "No. 6".
    [/([a-zA-Z]\.)(\d+)/, '\1 \2'],

    # Separate words from ellipses.
    [/([^\.]|^)(\.{2,})(.?)/, '\1 \2 \3'],
    [/(^|\s)(\.{2,})([^\.\s])/, '\1\2 \3'],
    [/(^|\s)(\.{2,})([^\.\s])/, '\1 \2\3'],

    ##### Some additional fixes.

    # Fix %, $, &.
    [/(\d)%/, '\1 %'],
    [/\$(\.?\d)/, '$ \1'],
    [/(\w)& (\w)/, '\1&\2'],
    [/(\w\w+)&(\w\w+)/, '\1 & \2'],

    # Fix (n 't) -> ( n't).
    [/n 't( |$)/, " n't\\1"],
    [/N 'T( |$)/, " N'T\\1"],

    # Treebank tokenizer special words.
    [/([Cc])annot/, '\1an not']
  ].freeze

  # Applies every rule to +s+ in place. The mutation of +s+ is the point;
  # callers ignore the return value.
  # (Removed a dead `rules = []` local that was shadowed by the block
  # parameter, and a stray trailing semicolon after the rule table.)
  def tokenize(s)
    @@tokenize_regexps.each { |pattern, replacement| s.gsub!(pattern, replacement) }
  end
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
|
3
|
+
# -*- encoding: utf-8 -*-
# Gem specification for tactful_tokenizer (originally echoe-generated).

Gem::Specification.new do |s|
  s.name = %q{tactful_tokenizer}
  s.version = "0.0.1"

  s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
  s.authors = ["Matthew Bunday"]
  s.cert_chain = ["/home/slyshy/.ssh/gem-public_cert.pem"]
  s.date = %q{2010-03-23}
  s.description = %q{A high accuracy naive bayesian sentence tokenizer based on Splitta.}
  s.email = %q{mkbunday @nospam@ gmail.com}
  s.extra_rdoc_files = ["README.rdoc", "lib/models/features.mar", "lib/models/lower_words.mar", "lib/models/non_abbrs.mar", "lib/tactful_tokenizer.rb", "lib/word_tokenizer.rb"]
  s.files = ["Manifest", "README.rdoc", "Rakefile", "lib/models/features.mar", "lib/models/lower_words.mar", "lib/models/non_abbrs.mar", "lib/tactful_tokenizer.rb", "lib/word_tokenizer.rb", "tactful_tokenizer.gemspec"]
  s.homepage = %q{http://github.com/SlyShy/Tactful_Tokenizer}
  s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Tactful_tokenizer", "--main", "README.rdoc"]
  s.require_paths = ["lib"]
  s.rubyforge_project = %q{tactful_tokenizer}
  s.rubygems_version = %q{1.3.6}
  s.signing_key = %q{/home/slyshy/.ssh/gem-private_key.pem}
  s.summary = %q{A high accuracy naive bayesian sentence tokenizer based on Splitta.}

  # The generator emitted a version check with empty if/else branches and an
  # unused `current_version` local; that dead code has been removed.
  s.specification_version = 3 if s.respond_to? :specification_version
end
|
metadata
ADDED
@@ -0,0 +1,102 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: tactful_tokenizer
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease: false
|
5
|
+
segments:
|
6
|
+
- 0
|
7
|
+
- 0
|
8
|
+
- 1
|
9
|
+
version: 0.0.1
|
10
|
+
platform: ruby
|
11
|
+
authors:
|
12
|
+
- Matthew Bunday
|
13
|
+
autorequire:
|
14
|
+
bindir: bin
|
15
|
+
cert_chain:
|
16
|
+
- |
|
17
|
+
-----BEGIN CERTIFICATE-----
|
18
|
+
MIIDMjCCAhqgAwIBAgIBADANBgkqhkiG9w0BAQUFADA/MREwDwYDVQQDDAhta2J1
|
19
|
+
bmRheTEVMBMGCgmSJomT8ixkARkWBWdtYWlsMRMwEQYKCZImiZPyLGQBGRYDY29t
|
20
|
+
MB4XDTEwMDMyMzE2MDkzOVoXDTExMDMyMzE2MDkzOVowPzERMA8GA1UEAwwIbWti
|
21
|
+
dW5kYXkxFTATBgoJkiaJk/IsZAEZFgVnbWFpbDETMBEGCgmSJomT8ixkARkWA2Nv
|
22
|
+
bTCCASIwDQYJKoZIhvcNAQEBBQADggEPADCCAQoCggEBAMk5+Wsur5ptIGUthPBG
|
23
|
+
VHECPqlV7TRgxiEMbH8vxkMVNnqFGDTezd9zsmqfX9kKR4/Jmu1fXKyBswGRxYxD
|
24
|
+
qx8nR+DCnWk0gfx2jjpnknPPWTQ6lHiZaPrGb+QuANhebPTwI6cDIz4A3dg2QIRo
|
25
|
+
ETdiAdOspNudUHu2Jf/QeNQPr5SURy9vGnSXkDhMcrnR3EjkRAP4suNIlHBNj3Hz
|
26
|
+
7hYjZV5QzeFwVENR5K3zFSkbC3ZK6uZTUwPVngmCqWz3MLsNJiQhAhvn/XQ8OCJ3
|
27
|
+
Q8O/nPuIIqFNeT3TMvnfrbx+wyxX6FIBZ12M4lNmU6yoXxzmi/n/cBNLAkQ/hc2g
|
28
|
+
n68CAwEAAaM5MDcwCQYDVR0TBAIwADAdBgNVHQ4EFgQUZfQL/a3SzQ017Zj9MUwh
|
29
|
+
Y6BtLUgwCwYDVR0PBAQDAgSwMA0GCSqGSIb3DQEBBQUAA4IBAQAjdEGkZbV7tkOq
|
30
|
+
N0y3yL5n1JOMsVHsQF7/w2zeET3PyUgKmmobdq3V0rztqVcJ1oP/+fYUO1KYxC90
|
31
|
+
b8FOCGGvcKjMn1QJufFp1DTfiGFcz6nHRWmiAMRXbempzA5NDzocQP9jaRkoYEzK
|
32
|
+
pwsJwe0dlpJXs8/fqqljNdBe4AToDGLcbzdMmpGxZN63P70yAFL5G7sJy1Izp5ei
|
33
|
+
CvIRDtL1PdU1ESVLFJuoCAiCtpBfwwepv4kuuoca9Ykd5ldPCGzMq0n8+KIubb+2
|
34
|
+
xz7fp33atnZoMajdCOYKqwo2xVhUuFPZzBFZ3L6T6YLuEVGKHNyUAfcfr+8VSuB5
|
35
|
+
3+l7cSZt
|
36
|
+
-----END CERTIFICATE-----
|
37
|
+
|
38
|
+
date: 2010-03-23 00:00:00 -05:00
|
39
|
+
default_executable:
|
40
|
+
dependencies: []
|
41
|
+
|
42
|
+
description: A high accuracy naive bayesian sentence tokenizer based on Splitta.
|
43
|
+
email: mkbunday @nospam@ gmail.com
|
44
|
+
executables: []
|
45
|
+
|
46
|
+
extensions: []
|
47
|
+
|
48
|
+
extra_rdoc_files:
|
49
|
+
- README.rdoc
|
50
|
+
- lib/models/features.mar
|
51
|
+
- lib/models/lower_words.mar
|
52
|
+
- lib/models/non_abbrs.mar
|
53
|
+
- lib/tactful_tokenizer.rb
|
54
|
+
- lib/word_tokenizer.rb
|
55
|
+
files:
|
56
|
+
- Manifest
|
57
|
+
- README.rdoc
|
58
|
+
- Rakefile
|
59
|
+
- lib/models/features.mar
|
60
|
+
- lib/models/lower_words.mar
|
61
|
+
- lib/models/non_abbrs.mar
|
62
|
+
- lib/tactful_tokenizer.rb
|
63
|
+
- lib/word_tokenizer.rb
|
64
|
+
- tactful_tokenizer.gemspec
|
65
|
+
has_rdoc: true
|
66
|
+
homepage: http://github.com/SlyShy/Tactful_Tokenizer
|
67
|
+
licenses: []
|
68
|
+
|
69
|
+
post_install_message:
|
70
|
+
rdoc_options:
|
71
|
+
- --line-numbers
|
72
|
+
- --inline-source
|
73
|
+
- --title
|
74
|
+
- Tactful_tokenizer
|
75
|
+
- --main
|
76
|
+
- README.rdoc
|
77
|
+
require_paths:
|
78
|
+
- lib
|
79
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
80
|
+
requirements:
|
81
|
+
- - ">="
|
82
|
+
- !ruby/object:Gem::Version
|
83
|
+
segments:
|
84
|
+
- 0
|
85
|
+
version: "0"
|
86
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
87
|
+
requirements:
|
88
|
+
- - ">="
|
89
|
+
- !ruby/object:Gem::Version
|
90
|
+
segments:
|
91
|
+
- 1
|
92
|
+
- 2
|
93
|
+
version: "1.2"
|
94
|
+
requirements: []
|
95
|
+
|
96
|
+
rubyforge_project: tactful_tokenizer
|
97
|
+
rubygems_version: 1.3.6
|
98
|
+
signing_key:
|
99
|
+
specification_version: 3
|
100
|
+
summary: A high accuracy naive bayesian sentence tokenizer based on Splitta.
|
101
|
+
test_files: []
|
102
|
+
|
metadata.gz.sig
ADDED
Binary file
|