tactful_tokenizer 0.0.2 → 0.0.3
This diff shows the changes between publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
- checksums.yaml +7 -0
- data/.gitignore +5 -0
- data/.travis.yml +9 -0
- data/Gemfile +8 -0
- data/README.rdoc +7 -0
- data/Rakefile +6 -11
- data/lib/tactful_tokenizer.rb +164 -158
- data/lib/tactful_tokenizer/version.rb +3 -0
- data/lib/word_tokenizer.rb +40 -36
- data/{test → spec/files}/sample.txt +1 -0
- data/{test → spec/files}/test_out.txt +5 -0
- data/{test → spec/files}/verification_out.txt +5 -0
- data/spec/spec_helper.rb +7 -0
- data/spec/tactful_tokenizer/tactful_tokenizer_spec.rb +96 -0
- data/tactful_tokenizer.gemspec +18 -25
- metadata +74 -84
- data.tar.gz.sig +0 -0
- data/Manifest +0 -12
- data/test/test.rb +0 -21
- metadata.gz.sig +0 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: 12a5db701c483d6d9653b1d9c1a6d1ac242501ff
+  data.tar.gz: 2dafa09df763499694bd2580e442ad42b1fcb304
+SHA512:
+  metadata.gz: 7d60773b82ba93aca28cb79402b73c73408a3466001346d89885f9c7d003339ec44b1a412224ec08692f3ea7d3665cabab7c93369c90a8b330ecc847f3e54ae3
+  data.tar.gz: f53f494ef41b55afb7bda23320f9f1e7743164f187a507c60ee14667290552919adf618e01ca5bd5c2e30bc451a576ead8245f99a43339e81d74a80af89540bd
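The new checksums.yaml is the standard RubyGems integrity payload: SHA1 and SHA512 digests of the metadata.gz and data.tar.gz members inside the .gem archive. A minimal verification sketch in Ruby, assuming the two members have already been extracted from the .gem (which is a plain tar):

    require 'digest'

    # Hypothetical verification: compare output against checksums.yaml.
    %w[metadata.gz data.tar.gz].each do |member|
      bytes = File.binread(member)
      puts "#{member} SHA1:   #{Digest::SHA1.hexdigest(bytes)}"
      puts "#{member} SHA512: #{Digest::SHA512.hexdigest(bytes)}"
    end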
data/.gitignore
ADDED
data/.travis.yml
ADDED
data/Gemfile
ADDED
data/README.rdoc
CHANGED
@@ -1,11 +1,18 @@
 = TactfulTokenizer

+{<img src="https://badge.fury.io/rb/tactful_tokenizer.png" alt="Gem Version" />}[http://badge.fury.io/rb/tactful_tokenizer]
+{<img src="https://travis-ci.org/zencephalon/Tactful_Tokenizer.png?branch=release" alt="Build Status" />}[https://travis-ci.org/zencephalon/Tactful_Tokenizer]
+{<img src="https://codeclimate.com/github/zencephalon/Tactful_Tokenizer.png" />}[https://codeclimate.com/github/zencephalon/Tactful_Tokenizer]
+{<img src="https://coveralls.io/repos/zencephalon/Tactful_Tokenizer/badge.png?branch=release" alt="Coverage Status" />}[https://coveralls.io/r/zencephalon/Tactful_Tokenizer?branch=release]
+
 TactfulTokenizer is a Ruby library for high quality sentence
 tokenization. It uses a Naive Bayesian statistical model, and
 is based on Splitta[http://code.google.com/p/splitta/], but
 has support for '?' and '!' as well as primitive handling of
 XHTML markup. Better support for XHTML parsing is coming shortly.

+Additionally supports unicode text tokenization.
+
 == Usage

   require "tactful_tokenizer"
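Based on the tokenize_text doc comment visible later in this diff, the README's usage section boils down to the following session (the example string and return value are taken from that comment):

    require "tactful_tokenizer"

    # Instantiating the model loads the Marshal dumps under lib/models/.
    m = TactfulTokenizer::Model.new
    m.tokenize_text("Hey, are these two sentences? I bet they should be.")
    # => ["Hey, are these two sentences?", "I bet they should be."]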
data/Rakefile
CHANGED
@@ -1,12 +1,7 @@
-
-require
-require '
+#!/usr/bin/env rake
+require "bundler/gem_tasks"
+require 'rspec/core/rake_task'

-
-
-
-  p.author = "Matthew Bunday"
-  p.email = "mkbunday @nospam@ gmail.com"
-  p.ignore_pattern = ["tmp/*", "script/*"]
-  p.development_dependencies = []
-end
+RSpec::Core::RakeTask.new(:spec)
+
+task :default => :spec
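The new Rakefile swaps the old Echoe-style setup for Bundler's gem tasks plus an RSpec rake task; task :default => :spec makes a bare rake invocation run the suite. A rough hand-rolled equivalent of what RSpec::Core::RakeTask.new(:spec) generates, for illustration only:

    # Sketch only; the real RSpec rake task also supports pattern and option configuration.
    task :spec do
      sh "rspec spec"   # rake's sh helper fails the task on a non-zero exit status
    end

    task :default => :spec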
data/lib/tactful_tokenizer.rb
CHANGED
@@ -17,188 +17,194 @@
 # Author:: Matthew Bunday (mailto:mkbunday@gmail.com)
 # License:: GNU General Public License v3

-
-include WordTokenizer
-
-#--
-####### Performance TODOs.
+# Performance TODOs.
 # TODO: Use inline C where necessary?
 # TODO: Use RE2 regexp extension.
-
+
+# -*- encoding : utf-8 -*-
+require "word_tokenizer.rb"
+include WordTokenizer

 module TactfulTokenizer

-
-
+  # Basic String extensions.
+  String.class_eval do

-
-
-
-
+    # Simple regex to check if a string is alphabetic.
+    def is_alphabetic?
+      !/[[:lower:][:upper:][:space:]]+/u.match(self).nil?
+    end

-
-
-
-
-
-    end
+    # Check for upper case.
+    # Surprisingly, this is faster than a regex in benchmarks.
+    # Using the trinary operator is faster than to_s
+    def is_upper_case?
+      self == self.upcase
     end
+  end

-
-
+  # A model stores normalized probabilities of different features occuring.
+  class Model

-
-
-
-
-
-
-      end
-    end
-    @p0 = @feats["<prior>"] ** 4
+    # Initialize the model. feats, lower_words, and non_abbrs
+    # indicate the locations of the respective Marshal dumps.
+    def initialize(feats="#{File.dirname(__FILE__)}/models/features.mar", lower_words="#{File.dirname(__FILE__)}/models/lower_words.mar", non_abbrs="#{File.dirname(__FILE__)}/models/non_abbrs.mar")
+      @feats, @lower_words, @non_abbrs = [feats, lower_words, non_abbrs].map do |file|
+        File.open(file) do |f|
+          Marshal.load(f.read)
         end
+      end
+      @p0 = @feats["<prior>"] ** 4
+    end

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    # feats = {feature => normalized probability of feature}
+    # lower_words = {token => log count of occurences in lower case}
+    # non_abbrs = {token => log count of occurences when not an abbrv.}
+    attr_accessor :feats, :lower_words, :non_abbrs
+
+    # This function is the only one that'll end up being used.
+    # m = TactfulTokenizer::Model.new
+    # m.tokenize_text("Hey, are these two sentences? I bet they should be.")
+    # => ["Hey, are these two sentences?", "I bet they should be."]
+    def tokenize_text(text)
+      data = Doc.new(text)
+      featurize(data)
+      classify(data)
+      return data.segment
+    end

-
-
-
-
-
-
-
-
-
-        end
-        frag.pred = probs / (probs + 1)
-      end
+    # Assign a prediction (probability, to be precise) to each sentence fragment.
+    # For each feature in each fragment we hunt up the normalized probability and
+    # multiply. This is a fairly straightforward Bayesian probabilistic algorithm.
+    def classify(doc)
+      frag, probs, feat = nil, nil, nil
+      doc.frags.each do |frag|
+        probs = @p0
+        frag.features.each do |feat|
+          probs *= @feats[feat]
         end
+        frag.pred = probs / (probs + 1)
+      end
+    end

-
-
-
-
-
-
-
+    # Get the features of every fragment.
+    def featurize(doc)
+      frag = nil
+      doc.frags.each do |frag|
+        get_features(frag, self)
+      end
+    end

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    # Finds the features in a text fragment of the form:
+    # ... w1. (sb?) w2 ...
+    # Features listed in rough order of importance:
+    # * w1: a word that includes a period.
+    # * w2: the next word, if it exists.
+    # * w1length: the number of alphabetic characters in w1.
+    # * both: w1 and w2 taken together.
+    # * w1abbr: logarithmic count of w1 occuring without a period.
+    # * w2lower: logarithmiccount of w2 occuring lowercased.
+    def get_features(frag, model)
+      w1 = (frag.cleaned.last or '')
+      w2 = (frag.next or '')
+
+      frag.features = ["w1_#{w1}", "w2_#{w2}", "both_#{w1}_#{w2}"]
+
+      unless w2.empty?
+        frag.push_w1_features(w1, model)
+        frag.push_w2_features(w2, model)
+      end
+    end
+  end
+
+  # A document represents the input text. It holds a list of fragments generated
+  # from the text.
+  class Doc
+    # List of fragments.
+    attr_accessor :frags
+
+    # Receives a text, which is then broken into fragments.
+    # A fragment ends with a period, quesetion mark, or exclamation mark followed
+    # possibly by right handed punctuation like quotation marks or closing braces
+    # and trailing whitespace. Failing that, it'll accept something like "I hate cheese\n"
+    # No, it doesn't have a period, but that's the end of paragraph.
+    #
+    # Input assumption: Paragraphs delimited by line breaks.
+    def initialize(text)
+      @frags = []
+      res = nil
+      text.each_line do |line|
+        unless line.strip.empty?
+          line.split(/(.*?[.!?](?:["')\]}]|(?:<.*>))*[[:space:]])/u).each do |res|
+            unless res.strip.empty?
+              frag = Frag.new(res)
+              @frags.last.next = frag.cleaned.first unless @frags.empty?
+              @frags.push frag
             end
+          end
         end
+      end
     end

-    #
-    #
-
-
-
-
-
-
-
-
-
-
-
-    def initialize(text)
-      @frags = []
-      res = nil
-      text.each_line do |line|
-        unless line.strip.empty?
-          line.split(/(.*?[.!?](?:["')\]}]|(?:<.*>))*[\s])/).each do |res|
-            unless res.strip.empty?
-              frag = Frag.new(res)
-              @frags.last.next = frag.cleaned.first unless @frags.empty?
-              @frags.push frag
-            end
-          end
-        end
-      end
+    # Segments the text. More precisely, it reassembles the fragments into sentences.
+    # We call something a sentence whenever it is more likely to be a sentence than not.
+    def segment
+      sents, sent = [], []
+      thresh = 0.5
+
+      frag = nil
+      @frags.each do |frag|
+        sent.push(frag.orig)
+        if frag.pred && frag.pred > thresh
+          break if frag.orig.nil?
+          sents.push(sent.join('').strip)
+          sent = []
         end
+      end
+      sents
+    end
+  end
+
+  # A fragment is a potential sentence, but is based only on the existence of a period.
+  # The text "Here in the U.S. Senate we prefer to devour our friends." will be split
+  # into "Here in the U.S." and "Senate we prefer to devour our friends."
+  class Frag
+
+    # orig = The original text of the fragment.
+    # next = The next word following the fragment.
+    # cleaned = Array of the fragment's words after cleaning.
+    # pred = Probability that the fragment is a sentence.
+    # features = Array of the fragment's features.
+    attr_accessor :orig, :next, :cleaned, :pred, :features
+
+    # Create a new fragment.
+    def initialize(orig='')
+      @orig = orig
+      clean(orig)
+      @next, @pred, @features = nil, nil, nil
+    end

-
-
-
-
-
-
-
-
-
-        if frag.pred > thresh
-          break if frag.orig.nil?
-          sents.push(sent.join('').strip)
-          sent = []
-        end
-      end
-      sents
-    end
+    # Normalizes numbers and discards ambiguous punctuation. And then splits into an
+    # array, because realistically only the last and first words are ever accessed.
+    def clean(s)
+      @cleaned = String.new(s)
+      tokenize(@cleaned)
+      @cleaned.gsub!(/[.,\d]*\d/, '<NUM>')
+      @cleaned.gsub!(/[^[[:upper:][:lower:]]\d[:space:],!?.;:<>\-'\/$% ]/u, '')
+      @cleaned.gsub!('--', ' ')
+      @cleaned = @cleaned.split
     end

-
-
-
-
-
-    # orig = The original text of the fragment.
-    # next = The next word following the fragment.
-    # cleaned = Array of the fragment's words after cleaning.
-    # pred = Probability that the fragment is a sentence.
-    # features = Array of the fragment's features.
-    attr_accessor :orig, :next, :cleaned, :pred, :features
-
-    # Create a new fragment.
-    def initialize(orig='')
-      @orig = orig
-      clean(orig)
-      @next, @pred, @features = nil, nil, nil
-    end
+    def push_w1_features w1, model
+      if w1.chop.is_alphabetic?
+        features.push "w1length_#{[10, w1.length].min}", "w1abbr_#{model.non_abbrs[w1.chop]}"
+      end
+    end

-
-
-
-
-      tokenize(@cleaned)
-      @cleaned.gsub!(/[.,\d]*\d/, '<NUM>')
-      @cleaned.gsub!(/[^a-zA-Z0-9,.;:<>\-'\/$% ]/, '')
-      @cleaned.gsub!('--', ' ')
-      @cleaned = @cleaned.split
-    end
+    def push_w2_features w2, model
+      if w2.chop.is_alphabetic?
+        features.push "w2cap_#{w2[0,1].is_upper_case?}", "w2lower_#{model.lower_words[w2.downcase]}"
+      end
     end
+  end
 end
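The rewritten classify method above is the heart of the Naive Bayes step: every fragment starts from the prior @p0 = @feats["<prior>"] ** 4, multiplies in the normalized probability of each of its features, and squashes the resulting odds-like score into (0, 1) with probs / (probs + 1). A standalone sketch of that arithmetic with made-up numbers (the feature probabilities below are hypothetical, not values from the shipped model files):

    # Hypothetical feature table standing in for @feats.
    feats = Hash.new(1.0)      # unseen features leave the product unchanged
    feats["<prior>"] = 1.2
    feats["w1_U.S."] = 0.3     # "U.S." rarely ends a sentence
    feats["w2cap_true"] = 2.5  # a capitalized next word suggests a boundary

    probs = feats["<prior>"] ** 4
    ["w1_U.S.", "w2cap_true"].each { |f| probs *= feats[f] }

    pred = probs / (probs + 1) # same squashing used for Frag#pred
    puts pred > 0.5 ? "sentence boundary" : "no boundary"

Exactly as in Doc#segment, a fragment is then treated as a sentence end whenever pred crosses the 0.5 threshold.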
data/lib/word_tokenizer.rb
CHANGED
@@ -1,51 +1,55 @@
+# -*- encoding : utf-8 -*-
 module WordTokenizer
-
-
-
+  @@tokenize_regexps = [
+    # Uniform Quotes
+    [/''|``/, '"'],

-
-
-
+    # Separate punctuation (except for periods) from words.
+    [/(^|[:space:])(')/u, '\1\2'],
+    [/(?=[\("`{\[:;&#*@])(.)/, '\1 '],

-
+    [/(.)(?=[?!\)";}\]*:@'])|(?=[\)}\]])(.)|(.)(?=[({\[])|((^|[:space:])-)(?=[^-])/u, '\1 '],

-
-
-
+    # Treat double-hyphen as a single token.
+    [/([^-])(--+)([^-])/, '\1 \2 \3'],
+    [/([:space:]|^)(,)(?=(^[:space:]))/u, '\1\2 '],

-
-
+    # Only separate a comma if a space follows.
+    [/(.)(,)([:space:]|$)/u, '\1 \2\3'],

-
-
+    # Combine dots separated by whitespace to be a single token.
+    [/\.[:space:]\.[:space:]\./u, '...'],

-
-
+    # Separate "No.6"
+    [/(^[:upper]^[:lower:]\.)(\d+)/, '\1 \2'],

-
-
-    [/(^|\s)(\.{2,})([^\.\s])/, '\1\2 \3'],
-    [/(^|\s)(\.{2,})([^\.\s])/, '\1 \2\3'],
+    # Md. or MD. for Ruby 1.8
+    [/M[d|D]./, '\1'],

-
+    # Separate words from ellipses
+    [/([^\.]|^)(\.{2,})(.?)/, '\1 \2 \3'],
+    [/(^|[:space:])(\.{2,})([^\.[:space:]])/u, '\1\2 \3'],
+    [/(^|[:space:])(\.{2,})([^\.[:space:]])/u, '\1 \2\3'],

-
-    [/(\d)%/, '\1 %'],
-    [/\$(\.?\d)/, '$ \1'],
-    [/(\w)& (\w)/, '\1&\2'],
-    [/(\w\w+)&(\w\w+)/, '\1 & \2'],
+    ##### Some additional fixes.

-
-
-
+    # Fix %, $, &
+    [/(\d)%/, '\1 %'],
+    [/\$(\.?\d)/, '$ \1'],
+    [/(^[:lower:]^[:upper:])& (^[:lower:]^[:upper:])/u, '\1&\2'],
+    [/(^[:lower:]^[:upper:]+)&(^[:lower:]^[:upper:]+)/u, '\1 & \2'],

-
-
+    # Fix (n 't) -> ( n't)
+    [/n 't( |$)/, " n't\\1"],
+    [/N 'T( |$)/, " N'T\\1"],

-
+    # Treebank tokenizer special words
+    [/([Cc])annot/, '\1an not']

-
-
-
-
+  ];
+
+  def tokenize(s)
+    rules = []
+    @@tokenize_regexps.each {|rules| s.gsub!(rules[0], rules[1])}
+  end
 end
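tokenize mutates its argument in place: each [pattern, replacement] pair from @@tokenize_regexps is applied with gsub!, in order. A quick usage sketch (the output is described loosely, since the exact spacing depends on the whole rule cascade):

    include WordTokenizer   # top-level include, as lib/tactful_tokenizer.rb does

    s = String.new("He said ``hello'' and left.")
    tokenize(s)             # destructive: runs every rule's gsub! against s
    puts s                  # ``...'' is uniformed to plain double quotes, which are
                            # then split off as tokens; trailing periods stay attached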
data/{test → spec/files}/sample.txt
CHANGED
@@ -96,3 +96,4 @@ The Seattle Mariners on Sunday announced they signed high school catcher Ryan Ch

 Christianson, 18, from Arlington High School in Riverside, Calif., will report to the Mariners' rookie league team in Peoria, Ariz. If he plays well there, he could be elevated to Class A Everett of the Northwest League in ``a couple of weeks,'' said Frank Mattox, the team's director of scouting.

+Добавим немного русского текста, чтобы проверить, верно ли все работает. Еще одно предложение. Работай! Будешь? Нет?
data/{test → spec/files}/test_out.txt
CHANGED
@@ -92,3 +92,8 @@ It was replay of the scene from last year when David Wells pitched the only othe
 The Seattle Mariners on Sunday announced they signed high school catcher Ryan Christianson, their first-round pick and the 11th overall selection in last month's baseball draft.
 Christianson, 18, from Arlington High School in Riverside, Calif., will report to the Mariners' rookie league team in Peoria, Ariz.
 If he plays well there, he could be elevated to Class A Everett of the Northwest League in ``a couple of weeks,'' said Frank Mattox, the team's director of scouting.
+Добавим немного русского текста, чтобы проверить, верно ли все работает.
+Еще одно предложение.
+Работай!
+Будешь?
+Нет?
data/{test → spec/files}/verification_out.txt
CHANGED
@@ -92,3 +92,8 @@ It was replay of the scene from last year when David Wells pitched the only othe
 The Seattle Mariners on Sunday announced they signed high school catcher Ryan Christianson, their first-round pick and the 11th overall selection in last month's baseball draft.
 Christianson, 18, from Arlington High School in Riverside, Calif., will report to the Mariners' rookie league team in Peoria, Ariz.
 If he plays well there, he could be elevated to Class A Everett of the Northwest League in ``a couple of weeks,'' said Frank Mattox, the team's director of scouting.
+Добавим немного русского текста, чтобы проверить, верно ли все работает.
+Еще одно предложение.
+Работай!
+Будешь?
+Нет?
data/spec/spec_helper.rb
ADDED
data/spec/tactful_tokenizer/tactful_tokenizer_spec.rb
ADDED
@@ -0,0 +1,96 @@
+# -*- encoding : utf-8 -*-
+require 'spec_helper'
+
+describe String do
+  describe "::is_upper_case?" do
+    it "should be false" do
+      "asdfghjk".is_upper_case?.should == false
+    end
+
+    it "should be true" do
+      "ASDFGHJK".is_upper_case?.should == true
+    end
+  end
+
+  describe "::is_alphabetic?" do
+    it "should be false" do
+      "!^?".is_alphabetic?.should == false
+    end
+
+    it "should be true" do
+      "some text".is_alphabetic?.should == true
+    end
+
+    it "should be true for unicode text" do
+      "русский текст öö üüü".is_alphabetic?.should == true
+    end
+  end
+end
+
+describe TactfulTokenizer::Doc do
+  describe "::segment" do
+    it "should return array of segments" do
+      model = TactfulTokenizer::Model.new
+      doc = TactfulTokenizer::Doc.new("Hello!\nMy name is Richard Stewart.\nHow are you?\n")
+      model.featurize doc
+      model.classify doc
+      doc.segment.should == ["Hello!", "My name is Richard Stewart.", "How are you?"]
+    end
+  end
+end
+
+describe TactfulTokenizer::Frag do
+  describe "::clean" do
+    before :each do
+      @frag = TactfulTokenizer::Frag.new
+      @cleaned = @frag.clean("1 good bad 23 ?!")
+    end
+
+    it "should return an instance of Array" do
+      @cleaned.class.should == Array
+    end
+
+    it "should normalize numbers and discard ambiguous punctuation" do
+      @cleaned.should == ["<NUM>", "good", "bad", "<NUM>", "?", "!"]
+    end
+  end
+end
+
+describe TactfulTokenizer::Model do
+  before :each do
+    @m = TactfulTokenizer::Model.new
+    File.open('spec/files/sample.txt') do |f|
+      @text = f.read
+    end
+  end
+
+  describe "::classify" do
+    it "should assign a prediction for frags" do
+      doc = TactfulTokenizer::Doc.new("Hello!\n")
+      @m.featurize(doc)
+      @m.classify(doc).first.pred.should > 0.5
+    end
+  end
+
+  describe "::featurize" do
+    it "should get the features of every fragment" do
+      doc = TactfulTokenizer::Doc.new("Hello!\n")
+      @m.featurize(doc).first.features.should == ["w1_!", "w2_", "both_!_"]
+    end
+  end
+
+  describe "::tokenize_text" do
+    it "should tokenize correctly" do
+      text = @m.tokenize_text(@text)
+      File.open("spec/files/test_out.txt", "w+") do |g|
+        text.each do |line|
+          g.puts line unless line.empty?
+        end
+        g.rewind
+        t2 = g.read
+        t1 = File.open("spec/files/verification_out.txt").read
+        t1.should == t2
+      end
+    end
+  end
+end
data/tactful_tokenizer.gemspec
CHANGED
@@ -1,32 +1,25 @@
 # -*- encoding: utf-8 -*-
+$:.push File.expand_path("../lib", __FILE__)
+require "tactful_tokenizer/version"

 Gem::Specification.new do |s|
-  s.name
-  s.version
+  s.name = "tactful_tokenizer"
+  s.version = TactfulTokenizer::VERSION
+  s.platform = Gem::Platform::RUBY
+  s.authors = ["Matthew Bunday", "Sergey Kishenin"]
+  s.email = ["mkbunday@gmail.com"]
+  s.homepage = "http://github.com/zencephalon/Tactful_Tokenizer"
+  s.summary = "High accuracy sentence tokenization for Ruby."
+  s.description = "TactfulTokenizer uses a naive bayesian model train on the Brown and WSJ corpuses to provide high quality sentence tokenization."
+  s.license = "GPL-3"

-  s.
-  s.authors = ["Matthew Bunday"]
-  s.cert_chain = ["/home/slyshy/.ssh/gem-public_cert.pem"]
-  s.date = %q{2010-04-04}
-  s.description = %q{A high accuracy naive bayesian sentence tokenizer based on Splitta.}
-  s.email = %q{mkbunday @nospam@ gmail.com}
-  s.extra_rdoc_files = ["README.rdoc", "lib/models/features.mar", "lib/models/lower_words.mar", "lib/models/non_abbrs.mar", "lib/tactful_tokenizer.rb", "lib/word_tokenizer.rb"]
-  s.files = ["Manifest", "README.rdoc", "Rakefile", "lib/models/features.mar", "lib/models/lower_words.mar", "lib/models/non_abbrs.mar", "lib/tactful_tokenizer.rb", "lib/word_tokenizer.rb", "test/sample.txt", "test/test.rb", "test/test_out.txt", "test/verification_out.txt", "tactful_tokenizer.gemspec"]
-  s.homepage = %q{http://github.com/SlyShy/Tactful_Tokenizer}
-  s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Tactful_tokenizer", "--main", "README.rdoc"]
-  s.require_paths = ["lib"]
-  s.rubyforge_project = %q{tactful_tokenizer}
-  s.rubygems_version = %q{1.3.6}
-  s.signing_key = %q{/home/slyshy/.ssh/gem-private_key.pem}
-  s.summary = %q{A high accuracy naive bayesian sentence tokenizer based on Splitta.}
+  s.rubyforge_project = "tactful_tokenizer"

-
-
-
+  s.files = `git ls-files`.split($\)
+  s.executables = s.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
+  s.test_files = s.files.grep(%r{^(test|spec|features)/})
+  s.require_paths = ["lib"]

-
-
-  end
-else
-  end
+  s.add_development_dependency "rspec", "~> 0"
+  s.add_development_dependency "rake", "~> 0"
 end
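The rewritten gemspec derives its manifest from git rather than the hand-maintained Manifest file deleted in this release. Note the split($\) idiom: $\ (the output record separator) is nil by default, and String#split(nil) splits on whitespace, so this works as long as no tracked path contains spaces. A runnable sketch of the same idiom inside any git checkout:

    files = `git ls-files`.split($\)   # $\ is nil => whitespace split
    test_files = files.grep(%r{^(test|spec|features)/})
    executables = files.grep(%r{^bin/}).map { |f| File.basename(f) }
    p files.first(5)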
metadata
CHANGED
@@ -1,106 +1,96 @@
---- !ruby/object:Gem::Specification 
+--- !ruby/object:Gem::Specification
 name: tactful_tokenizer
-version: !ruby/object:Gem::Version
-
-  segments:
-  - 0
-  - 0
-  - 2
-  version: 0.0.2
+version: !ruby/object:Gem::Version
+  version: 0.0.3
 platform: ruby
-authors: 
+authors:
 - Matthew Bunday
+- Sergey Kishenin
 autorequire:
 bindir: bin
-cert_chain:
--- 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+cert_chain: []
+date: 2014-04-25 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: rspec
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: rake
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0'
+description: TactfulTokenizer uses a naive bayesian model train on the Brown and WSJ
+  corpuses to provide high quality sentence tokenization.
+email:
+- mkbunday@gmail.com
 executables: []
-
 extensions: []
-
-
--
--
--
-- lib/models/non_abbrs.mar
-- lib/tactful_tokenizer.rb
-- lib/word_tokenizer.rb
-files:
-- Manifest
+extra_rdoc_files: []
+files:
+- ".gitignore"
+- ".travis.yml"
+- Gemfile
 - README.rdoc
 - Rakefile
 - lib/models/features.mar
 - lib/models/lower_words.mar
 - lib/models/non_abbrs.mar
 - lib/tactful_tokenizer.rb
+- lib/tactful_tokenizer/version.rb
 - lib/word_tokenizer.rb
--
--
--
--
+- spec/files/sample.txt
+- spec/files/test_out.txt
+- spec/files/verification_out.txt
+- spec/spec_helper.rb
+- spec/tactful_tokenizer/tactful_tokenizer_spec.rb
 - tactful_tokenizer.gemspec
-
-
-
-
+homepage: http://github.com/zencephalon/Tactful_Tokenizer
+licenses:
+- GPL-3
+metadata: {}
 post_install_message:
-rdoc_options:
-
-- --inline-source
-- --title
-- Tactful_tokenizer
-- --main
-- README.rdoc
-require_paths:
+rdoc_options: []
+require_paths:
 - lib
-required_ruby_version: !ruby/object:Gem::Requirement
-  requirements:
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
   - - ">="
-    - !ruby/object:Gem::Version
-
-
-
-required_rubygems_version: !ruby/object:Gem::Requirement
-  requirements:
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
   - - ">="
-    - !ruby/object:Gem::Version
-
-      - 1
-      - 2
-      version: "1.2"
+    - !ruby/object:Gem::Version
+      version: '0'
 requirements: []
-
 rubyforge_project: tactful_tokenizer
-rubygems_version:
+rubygems_version: 2.2.2
 signing_key:
-specification_version:
-summary:
-test_files:
-
+specification_version: 4
+summary: High accuracy sentence tokenization for Ruby.
+test_files:
+- spec/files/sample.txt
+- spec/files/test_out.txt
+- spec/files/verification_out.txt
+- spec/spec_helper.rb
+- spec/tactful_tokenizer/tactful_tokenizer_spec.rb
data.tar.gz.sig
DELETED
Binary file
data/Manifest
DELETED
data/test/test.rb
DELETED
@@ -1,21 +0,0 @@
-require '../lib/tactful_tokenizer'
-require 'test/unit'
-
-class TactfulTokenize < Test::Unit::TestCase
-  def test_simple
-    m = TactfulTokenizer::Model.new
-    File.open("sample.txt") do |f|
-      text = f.read
-      text = m.tokenize_text(text)
-      File.open("test_out.txt","w+") do |g|
-        text.each do |line|
-          g.puts line unless line.empty?
-        end
-        g.rewind
-        t2 = g.read
-        t1 = File.open("verification_out.txt").read
-        assert_equal(t1, t2)
-      end
-    end
-  end
-end
metadata.gz.sig
DELETED
Binary file