tactful_tokenizer 0.0.2 → 0.0.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +5 -0
- data/.travis.yml +9 -0
- data/Gemfile +8 -0
- data/README.rdoc +7 -0
- data/Rakefile +6 -11
- data/lib/tactful_tokenizer.rb +164 -158
- data/lib/tactful_tokenizer/version.rb +3 -0
- data/lib/word_tokenizer.rb +40 -36
- data/{test → spec/files}/sample.txt +1 -0
- data/{test → spec/files}/test_out.txt +5 -0
- data/{test → spec/files}/verification_out.txt +5 -0
- data/spec/spec_helper.rb +7 -0
- data/spec/tactful_tokenizer/tactful_tokenizer_spec.rb +96 -0
- data/tactful_tokenizer.gemspec +18 -25
- metadata +74 -84
- data.tar.gz.sig +0 -0
- data/Manifest +0 -12
- data/test/test.rb +0 -21
- metadata.gz.sig +0 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 12a5db701c483d6d9653b1d9c1a6d1ac242501ff
|
4
|
+
data.tar.gz: 2dafa09df763499694bd2580e442ad42b1fcb304
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 7d60773b82ba93aca28cb79402b73c73408a3466001346d89885f9c7d003339ec44b1a412224ec08692f3ea7d3665cabab7c93369c90a8b330ecc847f3e54ae3
|
7
|
+
data.tar.gz: f53f494ef41b55afb7bda23320f9f1e7743164f187a507c60ee14667290552919adf618e01ca5bd5c2e30bc451a576ead8245f99a43339e81d74a80af89540bd
|
data/.gitignore
ADDED
data/.travis.yml
ADDED
data/Gemfile
ADDED
data/README.rdoc
CHANGED
@@ -1,11 +1,18 @@
|
|
1
1
|
= TactfulTokenizer
|
2
2
|
|
3
|
+
{<img src="https://badge.fury.io/rb/tactful_tokenizer.png" alt="Gem Version" />}[http://badge.fury.io/rb/tactful_tokenizer]
|
4
|
+
{<img src="https://travis-ci.org/zencephalon/Tactful_Tokenizer.png?branch=release" alt="Build Status" />}[https://travis-ci.org/zencephalon/Tactful_Tokenizer]
|
5
|
+
{<img src="https://codeclimate.com/github/zencephalon/Tactful_Tokenizer.png" />}[https://codeclimate.com/github/zencephalon/Tactful_Tokenizer]
|
6
|
+
{<img src="https://coveralls.io/repos/zencephalon/Tactful_Tokenizer/badge.png?branch=release" alt="Coverage Status" />}[https://coveralls.io/r/zencephalon/Tactful_Tokenizer?branch=release]
|
7
|
+
|
3
8
|
TactfulTokenizer is a Ruby library for high quality sentence
|
4
9
|
tokenization. It uses a Naive Bayesian statistical model, and
|
5
10
|
is based on Splitta[http://code.google.com/p/splitta/], but
|
6
11
|
has support for '?' and '!' as well as primitive handling of
|
7
12
|
XHTML markup. Better support for XHTML parsing is coming shortly.
|
8
13
|
|
14
|
+
Additionally, it supports Unicode text tokenization.
|
15
|
+
|
9
16
|
== Usage
|
10
17
|
|
11
18
|
require "tactful_tokenizer"
|
data/Rakefile
CHANGED
@@ -1,12 +1,7 @@
|
|
1
|
-
|
2
|
-
require
|
3
|
-
require '
|
1
|
+
#!/usr/bin/env rake
|
2
|
+
require "bundler/gem_tasks"
|
3
|
+
require 'rspec/core/rake_task'
|
4
4
|
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
p.author = "Matthew Bunday"
|
9
|
-
p.email = "mkbunday @nospam@ gmail.com"
|
10
|
-
p.ignore_pattern = ["tmp/*", "script/*"]
|
11
|
-
p.development_dependencies = []
|
12
|
-
end
|
5
|
+
RSpec::Core::RakeTask.new(:spec)
|
6
|
+
|
7
|
+
task :default => :spec
|
data/lib/tactful_tokenizer.rb
CHANGED
@@ -17,188 +17,194 @@
|
|
17
17
|
# Author:: Matthew Bunday (mailto:mkbunday@gmail.com)
|
18
18
|
# License:: GNU General Public License v3
|
19
19
|
|
20
|
-
|
21
|
-
include WordTokenizer
|
22
|
-
|
23
|
-
#--
|
24
|
-
####### Performance TODOs.
|
20
|
+
# Performance TODOs.
|
25
21
|
# TODO: Use inline C where necessary?
|
26
22
|
# TODO: Use RE2 regexp extension.
|
27
|
-
|
23
|
+
|
24
|
+
# -*- encoding : utf-8 -*-
|
25
|
+
require "word_tokenizer.rb"
|
26
|
+
include WordTokenizer
|
28
27
|
|
29
28
|
module TactfulTokenizer
|
30
29
|
|
31
|
-
|
32
|
-
|
30
|
+
# Basic String extensions.
|
31
|
+
String.class_eval do
|
33
32
|
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
33
|
+
# Simple regex to check if a string is alphabetic.
|
34
|
+
def is_alphabetic?
|
35
|
+
!/[[:lower:][:upper:][:space:]]+/u.match(self).nil?
|
36
|
+
end
|
38
37
|
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
end
|
38
|
+
# Check for upper case.
|
39
|
+
# Surprisingly, this is faster than a regex in benchmarks.
|
40
|
+
# Using the ternary operator is faster than to_s
|
41
|
+
def is_upper_case?
|
42
|
+
self == self.upcase
|
45
43
|
end
|
44
|
+
end
|
46
45
|
|
47
|
-
|
48
|
-
|
46
|
+
# A model stores normalized probabilities of different features occurring.
|
47
|
+
class Model
|
49
48
|
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
end
|
57
|
-
end
|
58
|
-
@p0 = @feats["<prior>"] ** 4
|
49
|
+
# Initialize the model. feats, lower_words, and non_abbrs
|
50
|
+
# indicate the locations of the respective Marshal dumps.
|
51
|
+
def initialize(feats="#{File.dirname(__FILE__)}/models/features.mar", lower_words="#{File.dirname(__FILE__)}/models/lower_words.mar", non_abbrs="#{File.dirname(__FILE__)}/models/non_abbrs.mar")
|
52
|
+
@feats, @lower_words, @non_abbrs = [feats, lower_words, non_abbrs].map do |file|
|
53
|
+
File.open(file) do |f|
|
54
|
+
Marshal.load(f.read)
|
59
55
|
end
|
56
|
+
end
|
57
|
+
@p0 = @feats["<prior>"] ** 4
|
58
|
+
end
|
60
59
|
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
60
|
+
# feats = {feature => normalized probability of feature}
|
61
|
+
# lower_words = {token => log count of occurrences in lower case}
|
62
|
+
# non_abbrs = {token => log count of occurrences when not an abbreviation}
|
63
|
+
attr_accessor :feats, :lower_words, :non_abbrs
|
64
|
+
|
65
|
+
# This function is the only one that'll end up being used.
|
66
|
+
# m = TactfulTokenizer::Model.new
|
67
|
+
# m.tokenize_text("Hey, are these two sentences? I bet they should be.")
|
68
|
+
# => ["Hey, are these two sentences?", "I bet they should be."]
|
69
|
+
def tokenize_text(text)
|
70
|
+
data = Doc.new(text)
|
71
|
+
featurize(data)
|
72
|
+
classify(data)
|
73
|
+
return data.segment
|
74
|
+
end
|
76
75
|
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
end
|
87
|
-
frag.pred = probs / (probs + 1)
|
88
|
-
end
|
76
|
+
# Assign a prediction (probability, to be precise) to each sentence fragment.
|
77
|
+
# For each feature in each fragment we hunt up the normalized probability and
|
78
|
+
# multiply. This is a fairly straightforward Bayesian probabilistic algorithm.
|
79
|
+
def classify(doc)
|
80
|
+
frag, probs, feat = nil, nil, nil
|
81
|
+
doc.frags.each do |frag|
|
82
|
+
probs = @p0
|
83
|
+
frag.features.each do |feat|
|
84
|
+
probs *= @feats[feat]
|
89
85
|
end
|
86
|
+
frag.pred = probs / (probs + 1)
|
87
|
+
end
|
88
|
+
end
|
90
89
|
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
90
|
+
# Get the features of every fragment.
|
91
|
+
def featurize(doc)
|
92
|
+
frag = nil
|
93
|
+
doc.frags.each do |frag|
|
94
|
+
get_features(frag, self)
|
95
|
+
end
|
96
|
+
end
|
98
97
|
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
98
|
+
# Finds the features in a text fragment of the form:
|
99
|
+
# ... w1. (sb?) w2 ...
|
100
|
+
# Features listed in rough order of importance:
|
101
|
+
# * w1: a word that includes a period.
|
102
|
+
# * w2: the next word, if it exists.
|
103
|
+
# * w1length: the number of alphabetic characters in w1.
|
104
|
+
# * both: w1 and w2 taken together.
|
105
|
+
# * w1abbr: logarithmic count of w1 occurring without a period.
|
106
|
+
# * w2lower: logarithmic count of w2 occurring lowercased.
|
107
|
+
def get_features(frag, model)
|
108
|
+
w1 = (frag.cleaned.last or '')
|
109
|
+
w2 = (frag.next or '')
|
110
|
+
|
111
|
+
frag.features = ["w1_#{w1}", "w2_#{w2}", "both_#{w1}_#{w2}"]
|
112
|
+
|
113
|
+
unless w2.empty?
|
114
|
+
frag.push_w1_features(w1, model)
|
115
|
+
frag.push_w2_features(w2, model)
|
116
|
+
end
|
117
|
+
end
|
118
|
+
end
|
119
|
+
|
120
|
+
# A document represents the input text. It holds a list of fragments generated
|
121
|
+
# from the text.
|
122
|
+
class Doc
|
123
|
+
# List of fragments.
|
124
|
+
attr_accessor :frags
|
125
|
+
|
126
|
+
# Receives a text, which is then broken into fragments.
|
127
|
+
# A fragment ends with a period, question mark, or exclamation mark followed
|
128
|
+
# possibly by right handed punctuation like quotation marks or closing braces
|
129
|
+
# and trailing whitespace. Failing that, it'll accept something like "I hate cheese\n"
|
130
|
+
# No, it doesn't have a period, but that's the end of paragraph.
|
131
|
+
#
|
132
|
+
# Input assumption: Paragraphs delimited by line breaks.
|
133
|
+
def initialize(text)
|
134
|
+
@frags = []
|
135
|
+
res = nil
|
136
|
+
text.each_line do |line|
|
137
|
+
unless line.strip.empty?
|
138
|
+
line.split(/(.*?[.!?](?:["')\]}]|(?:<.*>))*[[:space:]])/u).each do |res|
|
139
|
+
unless res.strip.empty?
|
140
|
+
frag = Frag.new(res)
|
141
|
+
@frags.last.next = frag.cleaned.first unless @frags.empty?
|
142
|
+
@frags.push frag
|
122
143
|
end
|
144
|
+
end
|
123
145
|
end
|
146
|
+
end
|
124
147
|
end
|
125
148
|
|
126
|
-
#
|
127
|
-
#
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
def initialize(text)
|
140
|
-
@frags = []
|
141
|
-
res = nil
|
142
|
-
text.each_line do |line|
|
143
|
-
unless line.strip.empty?
|
144
|
-
line.split(/(.*?[.!?](?:["')\]}]|(?:<.*>))*[\s])/).each do |res|
|
145
|
-
unless res.strip.empty?
|
146
|
-
frag = Frag.new(res)
|
147
|
-
@frags.last.next = frag.cleaned.first unless @frags.empty?
|
148
|
-
@frags.push frag
|
149
|
-
end
|
150
|
-
end
|
151
|
-
end
|
152
|
-
end
|
149
|
+
# Segments the text. More precisely, it reassembles the fragments into sentences.
|
150
|
+
# We call something a sentence whenever it is more likely to be a sentence than not.
|
151
|
+
def segment
|
152
|
+
sents, sent = [], []
|
153
|
+
thresh = 0.5
|
154
|
+
|
155
|
+
frag = nil
|
156
|
+
@frags.each do |frag|
|
157
|
+
sent.push(frag.orig)
|
158
|
+
if frag.pred && frag.pred > thresh
|
159
|
+
break if frag.orig.nil?
|
160
|
+
sents.push(sent.join('').strip)
|
161
|
+
sent = []
|
153
162
|
end
|
163
|
+
end
|
164
|
+
sents
|
165
|
+
end
|
166
|
+
end
|
167
|
+
|
168
|
+
# A fragment is a potential sentence, but is based only on the existence of a period.
|
169
|
+
# The text "Here in the U.S. Senate we prefer to devour our friends." will be split
|
170
|
+
# into "Here in the U.S." and "Senate we prefer to devour our friends."
|
171
|
+
class Frag
|
172
|
+
|
173
|
+
# orig = The original text of the fragment.
|
174
|
+
# next = The next word following the fragment.
|
175
|
+
# cleaned = Array of the fragment's words after cleaning.
|
176
|
+
# pred = Probability that the fragment is a sentence.
|
177
|
+
# features = Array of the fragment's features.
|
178
|
+
attr_accessor :orig, :next, :cleaned, :pred, :features
|
179
|
+
|
180
|
+
# Create a new fragment.
|
181
|
+
def initialize(orig='')
|
182
|
+
@orig = orig
|
183
|
+
clean(orig)
|
184
|
+
@next, @pred, @features = nil, nil, nil
|
185
|
+
end
|
154
186
|
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
if frag.pred > thresh
|
165
|
-
break if frag.orig.nil?
|
166
|
-
sents.push(sent.join('').strip)
|
167
|
-
sent = []
|
168
|
-
end
|
169
|
-
end
|
170
|
-
sents
|
171
|
-
end
|
187
|
+
# Normalizes numbers and discards ambiguous punctuation. And then splits into an
|
188
|
+
# array, because realistically only the last and first words are ever accessed.
|
189
|
+
def clean(s)
|
190
|
+
@cleaned = String.new(s)
|
191
|
+
tokenize(@cleaned)
|
192
|
+
@cleaned.gsub!(/[.,\d]*\d/, '<NUM>')
|
193
|
+
@cleaned.gsub!(/[^[[:upper:][:lower:]]\d[:space:],!?.;:<>\-'\/$% ]/u, '')
|
194
|
+
@cleaned.gsub!('--', ' ')
|
195
|
+
@cleaned = @cleaned.split
|
172
196
|
end
|
173
197
|
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
# orig = The original text of the fragment.
|
180
|
-
# next = The next word following the fragment.
|
181
|
-
# cleaned = Array of the fragment's words after cleaning.
|
182
|
-
# pred = Probability that the fragment is a sentence.
|
183
|
-
# features = Array of the fragment's features.
|
184
|
-
attr_accessor :orig, :next, :cleaned, :pred, :features
|
185
|
-
|
186
|
-
# Create a new fragment.
|
187
|
-
def initialize(orig='')
|
188
|
-
@orig = orig
|
189
|
-
clean(orig)
|
190
|
-
@next, @pred, @features = nil, nil, nil
|
191
|
-
end
|
198
|
+
def push_w1_features w1, model
|
199
|
+
if w1.chop.is_alphabetic?
|
200
|
+
features.push "w1length_#{[10, w1.length].min}", "w1abbr_#{model.non_abbrs[w1.chop]}"
|
201
|
+
end
|
202
|
+
end
|
192
203
|
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
tokenize(@cleaned)
|
198
|
-
@cleaned.gsub!(/[.,\d]*\d/, '<NUM>')
|
199
|
-
@cleaned.gsub!(/[^a-zA-Z0-9,.;:<>\-'\/$% ]/, '')
|
200
|
-
@cleaned.gsub!('--', ' ')
|
201
|
-
@cleaned = @cleaned.split
|
202
|
-
end
|
204
|
+
def push_w2_features w2, model
|
205
|
+
if w2.chop.is_alphabetic?
|
206
|
+
features.push "w2cap_#{w2[0,1].is_upper_case?}", "w2lower_#{model.lower_words[w2.downcase]}"
|
207
|
+
end
|
203
208
|
end
|
209
|
+
end
|
204
210
|
end
|
data/lib/word_tokenizer.rb
CHANGED
@@ -1,51 +1,55 @@
|
|
1
|
+
# -*- encoding : utf-8 -*-
|
1
2
|
module WordTokenizer
|
2
|
-
|
3
|
-
|
4
|
-
|
3
|
+
@@tokenize_regexps = [
|
4
|
+
# Uniform Quotes
|
5
|
+
[/''|``/, '"'],
|
5
6
|
|
6
|
-
|
7
|
-
|
8
|
-
|
7
|
+
# Separate punctuation (except for periods) from words.
|
8
|
+
[/(^|[:space:])(')/u, '\1\2'],
|
9
|
+
[/(?=[\("`{\[:;&#*@])(.)/, '\1 '],
|
9
10
|
|
10
|
-
|
11
|
+
[/(.)(?=[?!\)";}\]*:@'])|(?=[\)}\]])(.)|(.)(?=[({\[])|((^|[:space:])-)(?=[^-])/u, '\1 '],
|
11
12
|
|
12
|
-
|
13
|
-
|
14
|
-
|
13
|
+
# Treat double-hyphen as a single token.
|
14
|
+
[/([^-])(--+)([^-])/, '\1 \2 \3'],
|
15
|
+
[/([:space:]|^)(,)(?=(^[:space:]))/u, '\1\2 '],
|
15
16
|
|
16
|
-
|
17
|
-
|
17
|
+
# Only separate a comma if a space follows.
|
18
|
+
[/(.)(,)([:space:]|$)/u, '\1 \2\3'],
|
18
19
|
|
19
|
-
|
20
|
-
|
20
|
+
# Combine dots separated by whitespace to be a single token.
|
21
|
+
[/\.[:space:]\.[:space:]\./u, '...'],
|
21
22
|
|
22
|
-
|
23
|
-
|
23
|
+
# Separate "No.6"
|
24
|
+
[/(^[:upper]^[:lower:]\.)(\d+)/, '\1 \2'],
|
24
25
|
|
25
|
-
|
26
|
-
|
27
|
-
[/(^|\s)(\.{2,})([^\.\s])/, '\1\2 \3'],
|
28
|
-
[/(^|\s)(\.{2,})([^\.\s])/, '\1 \2\3'],
|
26
|
+
# Md. or MD. for Ruby 1.8
|
27
|
+
[/M[d|D]./, '\1'],
|
29
28
|
|
30
|
-
|
29
|
+
# Separate words from ellipses
|
30
|
+
[/([^\.]|^)(\.{2,})(.?)/, '\1 \2 \3'],
|
31
|
+
[/(^|[:space:])(\.{2,})([^\.[:space:]])/u, '\1\2 \3'],
|
32
|
+
[/(^|[:space:])(\.{2,})([^\.[:space:]])/u, '\1 \2\3'],
|
31
33
|
|
32
|
-
|
33
|
-
[/(\d)%/, '\1 %'],
|
34
|
-
[/\$(\.?\d)/, '$ \1'],
|
35
|
-
[/(\w)& (\w)/, '\1&\2'],
|
36
|
-
[/(\w\w+)&(\w\w+)/, '\1 & \2'],
|
34
|
+
##### Some additional fixes.
|
37
35
|
|
38
|
-
|
39
|
-
|
40
|
-
|
36
|
+
# Fix %, $, &
|
37
|
+
[/(\d)%/, '\1 %'],
|
38
|
+
[/\$(\.?\d)/, '$ \1'],
|
39
|
+
[/(^[:lower:]^[:upper:])& (^[:lower:]^[:upper:])/u, '\1&\2'],
|
40
|
+
[/(^[:lower:]^[:upper:]+)&(^[:lower:]^[:upper:]+)/u, '\1 & \2'],
|
41
41
|
|
42
|
-
|
43
|
-
|
42
|
+
# Fix (n 't) -> ( n't)
|
43
|
+
[/n 't( |$)/, " n't\\1"],
|
44
|
+
[/N 'T( |$)/, " N'T\\1"],
|
44
45
|
|
45
|
-
|
46
|
+
# Treebank tokenizer special words
|
47
|
+
[/([Cc])annot/, '\1an not']
|
46
48
|
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
49
|
+
];
|
50
|
+
|
51
|
+
def tokenize(s)
|
52
|
+
rules = []
|
53
|
+
@@tokenize_regexps.each {|rules| s.gsub!(rules[0], rules[1])}
|
54
|
+
end
|
51
55
|
end
|
@@ -96,3 +96,4 @@ The Seattle Mariners on Sunday announced they signed high school catcher Ryan Ch
|
|
96
96
|
|
97
97
|
Christianson, 18, from Arlington High School in Riverside, Calif., will report to the Mariners' rookie league team in Peoria, Ariz. If he plays well there, he could be elevated to Class A Everett of the Northwest League in ``a couple of weeks,'' said Frank Mattox, the team's director of scouting.
|
98
98
|
|
99
|
+
Добавим немного русского текста, чтобы проверить, верно ли все работает. Еще одно предложение. Работай! Будешь? Нет?
|
@@ -92,3 +92,8 @@ It was replay of the scene from last year when David Wells pitched the only othe
|
|
92
92
|
The Seattle Mariners on Sunday announced they signed high school catcher Ryan Christianson, their first-round pick and the 11th overall selection in last month's baseball draft.
|
93
93
|
Christianson, 18, from Arlington High School in Riverside, Calif., will report to the Mariners' rookie league team in Peoria, Ariz.
|
94
94
|
If he plays well there, he could be elevated to Class A Everett of the Northwest League in ``a couple of weeks,'' said Frank Mattox, the team's director of scouting.
|
95
|
+
Добавим немного русского текста, чтобы проверить, верно ли все работает.
|
96
|
+
Еще одно предложение.
|
97
|
+
Работай!
|
98
|
+
Будешь?
|
99
|
+
Нет?
|
@@ -92,3 +92,8 @@ It was replay of the scene from last year when David Wells pitched the only othe
|
|
92
92
|
The Seattle Mariners on Sunday announced they signed high school catcher Ryan Christianson, their first-round pick and the 11th overall selection in last month's baseball draft.
|
93
93
|
Christianson, 18, from Arlington High School in Riverside, Calif., will report to the Mariners' rookie league team in Peoria, Ariz.
|
94
94
|
If he plays well there, he could be elevated to Class A Everett of the Northwest League in ``a couple of weeks,'' said Frank Mattox, the team's director of scouting.
|
95
|
+
Добавим немного русского текста, чтобы проверить, верно ли все работает.
|
96
|
+
Еще одно предложение.
|
97
|
+
Работай!
|
98
|
+
Будешь?
|
99
|
+
Нет?
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,96 @@
|
|
1
|
+
# -*- encoding : utf-8 -*-
|
2
|
+
require 'spec_helper'
|
3
|
+
|
4
|
+
describe String do
|
5
|
+
describe "::is_upper_case?" do
|
6
|
+
it "should be false" do
|
7
|
+
"asdfghjk".is_upper_case?.should == false
|
8
|
+
end
|
9
|
+
|
10
|
+
it "should be true" do
|
11
|
+
"ASDFGHJK".is_upper_case?.should == true
|
12
|
+
end
|
13
|
+
end
|
14
|
+
|
15
|
+
describe "::is_alphabetic?" do
|
16
|
+
it "should be false" do
|
17
|
+
"!^?".is_alphabetic?.should == false
|
18
|
+
end
|
19
|
+
|
20
|
+
it "should be true" do
|
21
|
+
"some text".is_alphabetic?.should == true
|
22
|
+
end
|
23
|
+
|
24
|
+
it "should be true for unicode text" do
|
25
|
+
"русский текст öö üüü".is_alphabetic?.should == true
|
26
|
+
end
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
30
|
+
describe TactfulTokenizer::Doc do
|
31
|
+
describe "::segment" do
|
32
|
+
it "should return array of segments" do
|
33
|
+
model = TactfulTokenizer::Model.new
|
34
|
+
doc = TactfulTokenizer::Doc.new("Hello!\nMy name is Richard Stewart.\nHow are you?\n")
|
35
|
+
model.featurize doc
|
36
|
+
model.classify doc
|
37
|
+
doc.segment.should == ["Hello!", "My name is Richard Stewart.", "How are you?"]
|
38
|
+
end
|
39
|
+
end
|
40
|
+
end
|
41
|
+
|
42
|
+
describe TactfulTokenizer::Frag do
|
43
|
+
describe "::clean" do
|
44
|
+
before :each do
|
45
|
+
@frag = TactfulTokenizer::Frag.new
|
46
|
+
@cleaned = @frag.clean("1 good bad 23 ?!")
|
47
|
+
end
|
48
|
+
|
49
|
+
it "should return an instance of Array" do
|
50
|
+
@cleaned.class.should == Array
|
51
|
+
end
|
52
|
+
|
53
|
+
it "should normalize numbers and discard ambiguous punctuation" do
|
54
|
+
@cleaned.should == ["<NUM>", "good", "bad", "<NUM>", "?", "!"]
|
55
|
+
end
|
56
|
+
end
|
57
|
+
end
|
58
|
+
|
59
|
+
describe TactfulTokenizer::Model do
|
60
|
+
before :each do
|
61
|
+
@m = TactfulTokenizer::Model.new
|
62
|
+
File.open('spec/files/sample.txt') do |f|
|
63
|
+
@text = f.read
|
64
|
+
end
|
65
|
+
end
|
66
|
+
|
67
|
+
describe "::classify" do
|
68
|
+
it "should assign a prediction for frags" do
|
69
|
+
doc = TactfulTokenizer::Doc.new("Hello!\n")
|
70
|
+
@m.featurize(doc)
|
71
|
+
@m.classify(doc).first.pred.should > 0.5
|
72
|
+
end
|
73
|
+
end
|
74
|
+
|
75
|
+
describe "::featurize" do
|
76
|
+
it "should get the features of every fragment" do
|
77
|
+
doc = TactfulTokenizer::Doc.new("Hello!\n")
|
78
|
+
@m.featurize(doc).first.features.should == ["w1_!", "w2_", "both_!_"]
|
79
|
+
end
|
80
|
+
end
|
81
|
+
|
82
|
+
describe "::tokenize_text" do
|
83
|
+
it "should tokenize correctly" do
|
84
|
+
text = @m.tokenize_text(@text)
|
85
|
+
File.open("spec/files/test_out.txt", "w+") do |g|
|
86
|
+
text.each do |line|
|
87
|
+
g.puts line unless line.empty?
|
88
|
+
end
|
89
|
+
g.rewind
|
90
|
+
t2 = g.read
|
91
|
+
t1 = File.open("spec/files/verification_out.txt").read
|
92
|
+
t1.should == t2
|
93
|
+
end
|
94
|
+
end
|
95
|
+
end
|
96
|
+
end
|
data/tactful_tokenizer.gemspec
CHANGED
@@ -1,32 +1,25 @@
|
|
1
1
|
# -*- encoding: utf-8 -*-
|
2
|
+
$:.push File.expand_path("../lib", __FILE__)
|
3
|
+
require "tactful_tokenizer/version"
|
2
4
|
|
3
5
|
Gem::Specification.new do |s|
|
4
|
-
s.name
|
5
|
-
s.version
|
6
|
+
s.name = "tactful_tokenizer"
|
7
|
+
s.version = TactfulTokenizer::VERSION
|
8
|
+
s.platform = Gem::Platform::RUBY
|
9
|
+
s.authors = ["Matthew Bunday", "Sergey Kishenin"]
|
10
|
+
s.email = ["mkbunday@gmail.com"]
|
11
|
+
s.homepage = "http://github.com/zencephalon/Tactful_Tokenizer"
|
12
|
+
s.summary = "High accuracy sentence tokenization for Ruby."
|
13
|
+
s.description = "TactfulTokenizer uses a naive bayesian model train on the Brown and WSJ corpuses to provide high quality sentence tokenization."
|
14
|
+
s.license = "GPL-3"
|
6
15
|
|
7
|
-
s.
|
8
|
-
s.authors = ["Matthew Bunday"]
|
9
|
-
s.cert_chain = ["/home/slyshy/.ssh/gem-public_cert.pem"]
|
10
|
-
s.date = %q{2010-04-04}
|
11
|
-
s.description = %q{A high accuracy naive bayesian sentence tokenizer based on Splitta.}
|
12
|
-
s.email = %q{mkbunday @nospam@ gmail.com}
|
13
|
-
s.extra_rdoc_files = ["README.rdoc", "lib/models/features.mar", "lib/models/lower_words.mar", "lib/models/non_abbrs.mar", "lib/tactful_tokenizer.rb", "lib/word_tokenizer.rb"]
|
14
|
-
s.files = ["Manifest", "README.rdoc", "Rakefile", "lib/models/features.mar", "lib/models/lower_words.mar", "lib/models/non_abbrs.mar", "lib/tactful_tokenizer.rb", "lib/word_tokenizer.rb", "test/sample.txt", "test/test.rb", "test/test_out.txt", "test/verification_out.txt", "tactful_tokenizer.gemspec"]
|
15
|
-
s.homepage = %q{http://github.com/SlyShy/Tactful_Tokenizer}
|
16
|
-
s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Tactful_tokenizer", "--main", "README.rdoc"]
|
17
|
-
s.require_paths = ["lib"]
|
18
|
-
s.rubyforge_project = %q{tactful_tokenizer}
|
19
|
-
s.rubygems_version = %q{1.3.6}
|
20
|
-
s.signing_key = %q{/home/slyshy/.ssh/gem-private_key.pem}
|
21
|
-
s.summary = %q{A high accuracy naive bayesian sentence tokenizer based on Splitta.}
|
16
|
+
s.rubyforge_project = "tactful_tokenizer"
|
22
17
|
|
23
|
-
|
24
|
-
|
25
|
-
|
18
|
+
s.files = `git ls-files`.split($\)
|
19
|
+
s.executables = s.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
|
20
|
+
s.test_files = s.files.grep(%r{^(test|spec|features)/})
|
21
|
+
s.require_paths = ["lib"]
|
26
22
|
|
27
|
-
|
28
|
-
|
29
|
-
end
|
30
|
-
else
|
31
|
-
end
|
23
|
+
s.add_development_dependency "rspec", "~> 0"
|
24
|
+
s.add_development_dependency "rake", "~> 0"
|
32
25
|
end
|
metadata
CHANGED
@@ -1,106 +1,96 @@
|
|
1
|
-
--- !ruby/object:Gem::Specification
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
2
|
name: tactful_tokenizer
|
3
|
-
version: !ruby/object:Gem::Version
|
4
|
-
|
5
|
-
segments:
|
6
|
-
- 0
|
7
|
-
- 0
|
8
|
-
- 2
|
9
|
-
version: 0.0.2
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.3
|
10
5
|
platform: ruby
|
11
|
-
authors:
|
6
|
+
authors:
|
12
7
|
- Matthew Bunday
|
8
|
+
- Sergey Kishenin
|
13
9
|
autorequire:
|
14
10
|
bindir: bin
|
15
|
-
cert_chain:
|
16
|
-
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
11
|
+
cert_chain: []
|
12
|
+
date: 2014-04-25 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: rspec
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
requirements:
|
18
|
+
- - "~>"
|
19
|
+
- !ruby/object:Gem::Version
|
20
|
+
version: '0'
|
21
|
+
type: :development
|
22
|
+
prerelease: false
|
23
|
+
version_requirements: !ruby/object:Gem::Requirement
|
24
|
+
requirements:
|
25
|
+
- - "~>"
|
26
|
+
- !ruby/object:Gem::Version
|
27
|
+
version: '0'
|
28
|
+
- !ruby/object:Gem::Dependency
|
29
|
+
name: rake
|
30
|
+
requirement: !ruby/object:Gem::Requirement
|
31
|
+
requirements:
|
32
|
+
- - "~>"
|
33
|
+
- !ruby/object:Gem::Version
|
34
|
+
version: '0'
|
35
|
+
type: :development
|
36
|
+
prerelease: false
|
37
|
+
version_requirements: !ruby/object:Gem::Requirement
|
38
|
+
requirements:
|
39
|
+
- - "~>"
|
40
|
+
- !ruby/object:Gem::Version
|
41
|
+
version: '0'
|
42
|
+
description: TactfulTokenizer uses a naive bayesian model train on the Brown and WSJ
|
43
|
+
corpuses to provide high quality sentence tokenization.
|
44
|
+
email:
|
45
|
+
- mkbunday@gmail.com
|
44
46
|
executables: []
|
45
|
-
|
46
47
|
extensions: []
|
47
|
-
|
48
|
-
|
49
|
-
-
|
50
|
-
-
|
51
|
-
-
|
52
|
-
- lib/models/non_abbrs.mar
|
53
|
-
- lib/tactful_tokenizer.rb
|
54
|
-
- lib/word_tokenizer.rb
|
55
|
-
files:
|
56
|
-
- Manifest
|
48
|
+
extra_rdoc_files: []
|
49
|
+
files:
|
50
|
+
- ".gitignore"
|
51
|
+
- ".travis.yml"
|
52
|
+
- Gemfile
|
57
53
|
- README.rdoc
|
58
54
|
- Rakefile
|
59
55
|
- lib/models/features.mar
|
60
56
|
- lib/models/lower_words.mar
|
61
57
|
- lib/models/non_abbrs.mar
|
62
58
|
- lib/tactful_tokenizer.rb
|
59
|
+
- lib/tactful_tokenizer/version.rb
|
63
60
|
- lib/word_tokenizer.rb
|
64
|
-
-
|
65
|
-
-
|
66
|
-
-
|
67
|
-
-
|
61
|
+
- spec/files/sample.txt
|
62
|
+
- spec/files/test_out.txt
|
63
|
+
- spec/files/verification_out.txt
|
64
|
+
- spec/spec_helper.rb
|
65
|
+
- spec/tactful_tokenizer/tactful_tokenizer_spec.rb
|
68
66
|
- tactful_tokenizer.gemspec
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
67
|
+
homepage: http://github.com/zencephalon/Tactful_Tokenizer
|
68
|
+
licenses:
|
69
|
+
- GPL-3
|
70
|
+
metadata: {}
|
73
71
|
post_install_message:
|
74
|
-
rdoc_options:
|
75
|
-
|
76
|
-
- --inline-source
|
77
|
-
- --title
|
78
|
-
- Tactful_tokenizer
|
79
|
-
- --main
|
80
|
-
- README.rdoc
|
81
|
-
require_paths:
|
72
|
+
rdoc_options: []
|
73
|
+
require_paths:
|
82
74
|
- lib
|
83
|
-
required_ruby_version: !ruby/object:Gem::Requirement
|
84
|
-
requirements:
|
75
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
76
|
+
requirements:
|
85
77
|
- - ">="
|
86
|
-
- !ruby/object:Gem::Version
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
required_rubygems_version: !ruby/object:Gem::Requirement
|
91
|
-
requirements:
|
78
|
+
- !ruby/object:Gem::Version
|
79
|
+
version: '0'
|
80
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
81
|
+
requirements:
|
92
82
|
- - ">="
|
93
|
-
- !ruby/object:Gem::Version
|
94
|
-
|
95
|
-
- 1
|
96
|
-
- 2
|
97
|
-
version: "1.2"
|
83
|
+
- !ruby/object:Gem::Version
|
84
|
+
version: '0'
|
98
85
|
requirements: []
|
99
|
-
|
100
86
|
rubyforge_project: tactful_tokenizer
|
101
|
-
rubygems_version:
|
87
|
+
rubygems_version: 2.2.2
|
102
88
|
signing_key:
|
103
|
-
specification_version:
|
104
|
-
summary:
|
105
|
-
test_files:
|
106
|
-
|
89
|
+
specification_version: 4
|
90
|
+
summary: High accuracy sentence tokenization for Ruby.
|
91
|
+
test_files:
|
92
|
+
- spec/files/sample.txt
|
93
|
+
- spec/files/test_out.txt
|
94
|
+
- spec/files/verification_out.txt
|
95
|
+
- spec/spec_helper.rb
|
96
|
+
- spec/tactful_tokenizer/tactful_tokenizer_spec.rb
|
data.tar.gz.sig
DELETED
Binary file
|
data/Manifest
DELETED
data/test/test.rb
DELETED
@@ -1,21 +0,0 @@
|
|
1
|
-
require '../lib/tactful_tokenizer'
|
2
|
-
require 'test/unit'
|
3
|
-
|
4
|
-
class TactfulTokenize < Test::Unit::TestCase
|
5
|
-
def test_simple
|
6
|
-
m = TactfulTokenizer::Model.new
|
7
|
-
File.open("sample.txt") do |f|
|
8
|
-
text = f.read
|
9
|
-
text = m.tokenize_text(text)
|
10
|
-
File.open("test_out.txt","w+") do |g|
|
11
|
-
text.each do |line|
|
12
|
-
g.puts line unless line.empty?
|
13
|
-
end
|
14
|
-
g.rewind
|
15
|
-
t2 = g.read
|
16
|
-
t1 = File.open("verification_out.txt").read
|
17
|
-
assert_equal(t1, t2)
|
18
|
-
end
|
19
|
-
end
|
20
|
-
end
|
21
|
-
end
|
metadata.gz.sig
DELETED
Binary file
|