tactful_tokenizer 0.0.2 → 0.0.3

checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+ metadata.gz: 12a5db701c483d6d9653b1d9c1a6d1ac242501ff
+ data.tar.gz: 2dafa09df763499694bd2580e442ad42b1fcb304
+ SHA512:
+ metadata.gz: 7d60773b82ba93aca28cb79402b73c73408a3466001346d89885f9c7d003339ec44b1a412224ec08692f3ea7d3665cabab7c93369c90a8b330ecc847f3e54ae3
+ data.tar.gz: f53f494ef41b55afb7bda23320f9f1e7743164f187a507c60ee14667290552919adf618e01ca5bd5c2e30bc451a576ead8245f99a43339e81d74a80af89540bd
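
These digests let a consumer check a downloaded gem's integrity before installing it. A minimal verification sketch (assuming the .gem archive has been unpacked so checksums.yaml and data.tar.gz sit in the current directory; none of this is part of the gem's own API):

  require "digest"
  require "yaml"

  # Compare the recorded SHA512 digest against a freshly computed one.
  sums   = YAML.load_file("checksums.yaml")
  actual = Digest::SHA512.file("data.tar.gz").hexdigest
  if actual == sums["SHA512"]["data.tar.gz"]
    puts "data.tar.gz verified"
  else
    abort "checksum mismatch for data.tar.gz"
  end
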
data/.gitignore ADDED
@@ -0,0 +1,5 @@
+ *.gem
+ .bundle
+ Gemfile.lock
+ pkg/*
+ /coverage
data/.travis.yml ADDED
@@ -0,0 +1,9 @@
+ language: ruby
+ rvm:
+ - 2.0.0
+ - 1.9.3
+ - 1.9.2
+ - jruby-18mode # JRuby in 1.8 mode
+ - jruby-19mode # JRuby in 1.9 mode
+ - rbx-19mode
+ - 1.8.7
data/Gemfile ADDED
@@ -0,0 +1,8 @@
+ source "http://rubygems.org"
+
+ group :test do
+ gem "coveralls", :require => false
+ end
+
+ # Specify your gem's dependencies in tactful_tokenizer.gemspec
+ gemspec
data/README.rdoc CHANGED
@@ -1,11 +1,18 @@
  = TactfulTokenizer
 
+ {<img src="https://badge.fury.io/rb/tactful_tokenizer.png" alt="Gem Version" />}[http://badge.fury.io/rb/tactful_tokenizer]
+ {<img src="https://travis-ci.org/zencephalon/Tactful_Tokenizer.png?branch=release" alt="Build Status" />}[https://travis-ci.org/zencephalon/Tactful_Tokenizer]
+ {<img src="https://codeclimate.com/github/zencephalon/Tactful_Tokenizer.png" />}[https://codeclimate.com/github/zencephalon/Tactful_Tokenizer]
+ {<img src="https://coveralls.io/repos/zencephalon/Tactful_Tokenizer/badge.png?branch=release" alt="Coverage Status" />}[https://coveralls.io/r/zencephalon/Tactful_Tokenizer?branch=release]
+
  TactfulTokenizer is a Ruby library for high quality sentence
  tokenization. It uses a Naive Bayesian statistical model, and
  is based on Splitta[http://code.google.com/p/splitta/], but
  has support for '?' and '!' as well as primitive handling of
  XHTML markup. Better support for XHTML parsing is coming shortly.
 
+ Additionally supports unicode text tokenization.
+
  == Usage
 
    require "tactful_tokenizer"
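
For readers skimming the diff, the complete usage pattern documented in lib/tactful_tokenizer.rb (shown below) is:

  require "tactful_tokenizer"

  # Load the bundled pretrained model, then split raw text into sentences.
  m = TactfulTokenizer::Model.new
  m.tokenize_text("Hey, are these two sentences? I bet they should be.")
  # => ["Hey, are these two sentences?", "I bet they should be."]
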
data/Rakefile CHANGED
@@ -1,12 +1,7 @@
- require 'rubygems'
- require 'rake'
- require 'echoe'
+ #!/usr/bin/env rake
+ require "bundler/gem_tasks"
+ require 'rspec/core/rake_task'
 
- Echoe.new('tactful_tokenizer', '0.0.2') do |p|
- p.description = "A high accuracy naive bayesian sentence tokenizer based on Splitta."
- p.url = "http://github.com/SlyShy/Tactful_Tokenizer"
- p.author = "Matthew Bunday"
- p.email = "mkbunday @nospam@ gmail.com"
- p.ignore_pattern = ["tmp/*", "script/*"]
- p.development_dependencies = []
- end
+ RSpec::Core::RakeTask.new(:spec)
+
+ task :default => :spec
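
With the Echoe release machinery gone, bundler/gem_tasks supplies the standard build, install, and release tasks, and the :default task means a bare `rake` now just runs the RSpec suite.
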
data/lib/tactful_tokenizer.rb CHANGED
@@ -17,188 +17,194 @@
  # Author:: Matthew Bunday (mailto:mkbunday@gmail.com)
  # License:: GNU General Public License v3
 
- require "word_tokenizer.rb"
- include WordTokenizer
-
- #--
- ####### Performance TODOs.
+ # Performance TODOs.
  # TODO: Use inline C where necessary?
  # TODO: Use RE2 regexp extension.
- #++
+
+ # -*- encoding : utf-8 -*-
+ require "word_tokenizer.rb"
+ include WordTokenizer
 
  module TactfulTokenizer
 
- # Basic String extensions.
- String.class_eval do
+ # Basic String extensions.
+ String.class_eval do
 
- # Simple regex to check if a string is alphabetic.
- def is_alphabetic?
- return !/[^[:alpha:]]/.match(self)
- end
+ # Simple regex to check if a string is alphabetic.
+ def is_alphabetic?
+ !/[[:lower:][:upper:][:space:]]+/u.match(self).nil?
+ end
 
- # Check for upper case.
- # Surprisingly, this is faster than a regex in benchmarks.
- # Using the trinary operator is faster than to_s
- def is_upper_case?
- self == self.upcase ? 'true' : 'false'
- end
+ # Check for upper case.
+ # Surprisingly, this is faster than a regex in benchmarks.
+ # Using the trinary operator is faster than to_s
+ def is_upper_case?
+ self == self.upcase
  end
+ end
 
- # A model stores normalized probabilities of different features occuring.
- class Model
+ # A model stores normalized probabilities of different features occuring.
+ class Model
 
- # Initialize the model. feats, lower_words, and non_abbrs
- # indicate the locations of the respective Marshal dumps.
- def initialize(feats="#{File.dirname(__FILE__)}/models/features.mar", lower_words="#{File.dirname(__FILE__)}/models/lower_words.mar", non_abbrs="#{File.dirname(__FILE__)}/models/non_abbrs.mar")
- @feats, @lower_words, @non_abbrs = [feats, lower_words, non_abbrs].map do |file|
- File.open(file) do |f|
- Marshal.load(f.read)
- end
- end
- @p0 = @feats["<prior>"] ** 4
+ # Initialize the model. feats, lower_words, and non_abbrs
+ # indicate the locations of the respective Marshal dumps.
+ def initialize(feats="#{File.dirname(__FILE__)}/models/features.mar", lower_words="#{File.dirname(__FILE__)}/models/lower_words.mar", non_abbrs="#{File.dirname(__FILE__)}/models/non_abbrs.mar")
+ @feats, @lower_words, @non_abbrs = [feats, lower_words, non_abbrs].map do |file|
+ File.open(file) do |f|
+ Marshal.load(f.read)
  end
+ end
+ @p0 = @feats["<prior>"] ** 4
+ end
 
- # feats = {feature => normalized probability of feature}
- # lower_words = {token => log count of occurences in lower case}
- # non_abbrs = {token => log count of occurences when not an abbrv.}
- attr_accessor :feats, :lower_words, :non_abbrs
-
- # This function is the only one that'll end up being used.
- # m = TactfulTokenizer::Model.new
- # m.tokenize_text("Hey, are these two sentences? I bet they should be.")
- # => ["Hey, are these two sentences?", "I bet they should be."]
- def tokenize_text(text)
- data = Doc.new(text)
- featurize(data)
- classify(data)
- return data.segment
- end
+ # feats = {feature => normalized probability of feature}
+ # lower_words = {token => log count of occurences in lower case}
+ # non_abbrs = {token => log count of occurences when not an abbrv.}
+ attr_accessor :feats, :lower_words, :non_abbrs
+
+ # This function is the only one that'll end up being used.
+ # m = TactfulTokenizer::Model.new
+ # m.tokenize_text("Hey, are these two sentences? I bet they should be.")
+ # => ["Hey, are these two sentences?", "I bet they should be."]
+ def tokenize_text(text)
+ data = Doc.new(text)
+ featurize(data)
+ classify(data)
+ return data.segment
+ end
 
- # Assign a prediction (probability, to be precise) to each sentence fragment.
- # For each feature in each fragment we hunt up the normalized probability and
- # multiply. This is a fairly straightforward Bayesian probabilistic algorithm.
- def classify(doc)
- frag, probs, feat = nil, nil, nil
- doc.frags.each do |frag|
- probs = @p0
- frag.features.each do |feat|
- probs *= @feats[feat]
- end
- frag.pred = probs / (probs + 1)
- end
+ # Assign a prediction (probability, to be precise) to each sentence fragment.
+ # For each feature in each fragment we hunt up the normalized probability and
+ # multiply. This is a fairly straightforward Bayesian probabilistic algorithm.
+ def classify(doc)
+ frag, probs, feat = nil, nil, nil
+ doc.frags.each do |frag|
+ probs = @p0
+ frag.features.each do |feat|
+ probs *= @feats[feat]
  end
+ frag.pred = probs / (probs + 1)
+ end
+ end
 
- # Get the features of every fragment.
- def featurize(doc)
- frag = nil
- doc.frags.each do |frag|
- get_features(frag, self)
- end
- end
+ # Get the features of every fragment.
+ def featurize(doc)
+ frag = nil
+ doc.frags.each do |frag|
+ get_features(frag, self)
+ end
+ end
 
- # Finds the features in a text fragment of the form:
- # ... w1. (sb?) w2 ...
- # Features listed in rough order of importance:
- # * w1: a word that includes a period.
- # * w2: the next word, if it exists.
- # * w1length: the number of alphabetic characters in w1.
- # * both: w1 and w2 taken together.
- # * w1abbr: logarithmic count of w1 occuring without a period.
- # * w2lower: logarithmiccount of w2 occuring lowercased.
- def get_features(frag, model)
- w1 = (frag.cleaned.last or '')
- w2 = (frag.next or '')
-
- frag.features = ["w1_#{w1}", "w2_#{w2}", "both_#{w1}_#{w2}"]
-
- if not w2.empty?
- if w1.chop.is_alphabetic?
- frag.features.push "w1length_#{[10, w1.length].min}", "w1abbr_#{model.non_abbrs[w1.chop]}"
- end
-
- if w2.chop.is_alphabetic?
- frag.features.push "w2cap_#{w2[0].is_upper_case?}", "w2lower_#{model.lower_words[w2.downcase]}"
- end
+ # Finds the features in a text fragment of the form:
+ # ... w1. (sb?) w2 ...
+ # Features listed in rough order of importance:
+ # * w1: a word that includes a period.
+ # * w2: the next word, if it exists.
+ # * w1length: the number of alphabetic characters in w1.
+ # * both: w1 and w2 taken together.
+ # * w1abbr: logarithmic count of w1 occuring without a period.
+ # * w2lower: logarithmiccount of w2 occuring lowercased.
+ def get_features(frag, model)
+ w1 = (frag.cleaned.last or '')
+ w2 = (frag.next or '')
+
+ frag.features = ["w1_#{w1}", "w2_#{w2}", "both_#{w1}_#{w2}"]
+
+ unless w2.empty?
+ frag.push_w1_features(w1, model)
+ frag.push_w2_features(w2, model)
+ end
+ end
+ end
+
+ # A document represents the input text. It holds a list of fragments generated
+ # from the text.
+ class Doc
+ # List of fragments.
+ attr_accessor :frags
+
+ # Receives a text, which is then broken into fragments.
+ # A fragment ends with a period, quesetion mark, or exclamation mark followed
+ # possibly by right handed punctuation like quotation marks or closing braces
+ # and trailing whitespace. Failing that, it'll accept something like "I hate cheese\n"
+ # No, it doesn't have a period, but that's the end of paragraph.
+ #
+ # Input assumption: Paragraphs delimited by line breaks.
+ def initialize(text)
+ @frags = []
+ res = nil
+ text.each_line do |line|
+ unless line.strip.empty?
+ line.split(/(.*?[.!?](?:["')\]}]|(?:<.*>))*[[:space:]])/u).each do |res|
+ unless res.strip.empty?
+ frag = Frag.new(res)
+ @frags.last.next = frag.cleaned.first unless @frags.empty?
+ @frags.push frag
  end
+ end
  end
+ end
  end
 
- # A document represents the input text. It holds a list of fragments generated
- # from the text.
- class Doc
- # List of fragments.
- attr_accessor :frags
-
- # Receives a text, which is then broken into fragments.
- # A fragment ends with a period, quesetion mark, or exclamation mark followed
- # possibly by right handed punctuation like quotation marks or closing braces
- # and trailing whitespace. Failing that, it'll accept something like "I hate cheese\n"
- # No, it doesn't have a period, but that's the end of paragraph.
- #
- # Input assumption: Paragraphs delimited by line breaks.
- def initialize(text)
- @frags = []
- res = nil
- text.each_line do |line|
- unless line.strip.empty?
- line.split(/(.*?[.!?](?:["')\]}]|(?:<.*>))*[\s])/).each do |res|
- unless res.strip.empty?
- frag = Frag.new(res)
- @frags.last.next = frag.cleaned.first unless @frags.empty?
- @frags.push frag
- end
- end
- end
- end
+ # Segments the text. More precisely, it reassembles the fragments into sentences.
+ # We call something a sentence whenever it is more likely to be a sentence than not.
+ def segment
+ sents, sent = [], []
+ thresh = 0.5
+
+ frag = nil
+ @frags.each do |frag|
+ sent.push(frag.orig)
+ if frag.pred && frag.pred > thresh
+ break if frag.orig.nil?
+ sents.push(sent.join('').strip)
+ sent = []
  end
+ end
+ sents
+ end
+ end
+
+ # A fragment is a potential sentence, but is based only on the existence of a period.
+ # The text "Here in the U.S. Senate we prefer to devour our friends." will be split
+ # into "Here in the U.S." and "Senate we prefer to devour our friends."
+ class Frag
+
+ # orig = The original text of the fragment.
+ # next = The next word following the fragment.
+ # cleaned = Array of the fragment's words after cleaning.
+ # pred = Probability that the fragment is a sentence.
+ # features = Array of the fragment's features.
+ attr_accessor :orig, :next, :cleaned, :pred, :features
+
+ # Create a new fragment.
+ def initialize(orig='')
+ @orig = orig
+ clean(orig)
+ @next, @pred, @features = nil, nil, nil
+ end
 
- # Segments the text. More precisely, it reassembles the fragments into sentences.
- # We call something a sentence whenever it is more likely to be a sentence than not.
- def segment
- sents, sent = [], []
- thresh = 0.5
-
- frag = nil
- @frags.each do |frag|
- sent.push(frag.orig)
- if frag.pred > thresh
- break if frag.orig.nil?
- sents.push(sent.join('').strip)
- sent = []
- end
- end
- sents
- end
+ # Normalizes numbers and discards ambiguous punctuation. And then splits into an
+ # array, because realistically only the last and first words are ever accessed.
+ def clean(s)
+ @cleaned = String.new(s)
+ tokenize(@cleaned)
+ @cleaned.gsub!(/[.,\d]*\d/, '<NUM>')
+ @cleaned.gsub!(/[^[[:upper:][:lower:]]\d[:space:],!?.;:<>\-'\/$% ]/u, '')
+ @cleaned.gsub!('--', ' ')
+ @cleaned = @cleaned.split
  end
 
- # A fragment is a potential sentence, but is based only on the existence of a period.
- # The text "Here in the U.S. Senate we prefer to devour our friends." will be split
- # into "Here in the U.S." and "Senate we prefer to devour our friends."
- class Frag
-
- # orig = The original text of the fragment.
- # next = The next word following the fragment.
- # cleaned = Array of the fragment's words after cleaning.
- # pred = Probability that the fragment is a sentence.
- # features = Array of the fragment's features.
- attr_accessor :orig, :next, :cleaned, :pred, :features
-
- # Create a new fragment.
- def initialize(orig='')
- @orig = orig
- clean(orig)
- @next, @pred, @features = nil, nil, nil
- end
+ def push_w1_features w1, model
+ if w1.chop.is_alphabetic?
+ features.push "w1length_#{[10, w1.length].min}", "w1abbr_#{model.non_abbrs[w1.chop]}"
+ end
+ end
 
- # Normalizes numbers and discards ambiguous punctuation. And then splits into an
- # array, because realistically only the last and first words are ever accessed.
- def clean(s)
- @cleaned = String.new(s)
- tokenize(@cleaned)
- @cleaned.gsub!(/[.,\d]*\d/, '<NUM>')
- @cleaned.gsub!(/[^a-zA-Z0-9,.;:<>\-'\/$% ]/, '')
- @cleaned.gsub!('--', ' ')
- @cleaned = @cleaned.split
- end
+ def push_w2_features w2, model
+ if w2.chop.is_alphabetic?
+ features.push "w2cap_#{w2[0,1].is_upper_case?}", "w2lower_#{model.lower_words[w2.downcase]}"
+ end
  end
+ end
  end
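
The arithmetic in Model#classify is plain naive Bayes: start from the prior raised to the fourth power, multiply in each feature's normalized probability, and squash the product with p / (p + 1); Doc#segment then ends a sentence wherever the result exceeds 0.5. A self-contained sketch of that scoring with invented numbers (the real probabilities come from the marshaled tables in lib/models/):

  # Hypothetical feature table. Hash.new(1.0) makes unseen features a
  # no-op under multiplication; how the real model treats unseen
  # features may differ.
  feats = Hash.new(1.0)
  feats["<prior>"]   = 0.3   # invented prior probability
  feats["w1_U.S."]   = 0.05  # "U.S." rarely ends a sentence
  feats["w2_Senate"] = 0.4

  p0 = feats["<prior>"] ** 4

  probs = p0
  ["w1_U.S.", "w2_Senate"].each { |feat| probs *= feats[feat] }
  pred = probs / (probs + 1)  # squash into (0, 1)

  puts pred > 0.5 ? "sentence boundary" : "not a boundary"  # here: not a boundary
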
data/lib/tactful_tokenizer/version.rb ADDED
@@ -0,0 +1,3 @@
+ module TactfulTokenizer
+ VERSION = "0.0.3"
+ end
data/lib/word_tokenizer.rb CHANGED
@@ -1,51 +1,55 @@
+ # -*- encoding : utf-8 -*-
  module WordTokenizer
- @@tokenize_regexps = [
- # Uniform Quotes
- [/''|``/, '"'],
+ @@tokenize_regexps = [
+ # Uniform Quotes
+ [/''|``/, '"'],
 
- # Separate punctuation (except for periods) from words.
- [/(^|\s)(')/, '\1\2'],
- [/(?=[\("`{\[:;&#*@])(.)/, '\1 '],
+ # Separate punctuation (except for periods) from words.
+ [/(^|[:space:])(')/u, '\1\2'],
+ [/(?=[\("`{\[:;&#*@])(.)/, '\1 '],
 
- [/(.)(?=[?!\)";}\]*:@'])|(?=[\)}\]])(.)|(.)(?=[({\[])|((^|\s)-)(?=[^-])/, '\1 '],
+ [/(.)(?=[?!\)";}\]*:@'])|(?=[\)}\]])(.)|(.)(?=[({\[])|((^|[:space:])-)(?=[^-])/u, '\1 '],
 
- # Treat double-hyphen as a single token.
- [/([^-])(--+)([^-])/, '\1 \2 \3'],
- [/(\s|^)(,)(?=(\S))/, '\1\2 '],
+ # Treat double-hyphen as a single token.
+ [/([^-])(--+)([^-])/, '\1 \2 \3'],
+ [/([:space:]|^)(,)(?=(^[:space:]))/u, '\1\2 '],
 
- # Only separate a comma if a space follows.
- [/(.)(,)(\s|$)/, '\1 \2\3'],
+ # Only separate a comma if a space follows.
+ [/(.)(,)([:space:]|$)/u, '\1 \2\3'],
 
- # Combine dots separated by whitespace to be a single token.
- [/\.\s\.\s\./, '...'],
+ # Combine dots separated by whitespace to be a single token.
+ [/\.[:space:]\.[:space:]\./u, '...'],
 
- # Separate "No.6"
- [/([a-zA-Z]\.)(\d+)/, '\1 \2'],
+ # Separate "No.6"
+ [/(^[:upper]^[:lower:]\.)(\d+)/, '\1 \2'],
 
- # Separate words from ellipses
- [/([^\.]|^)(\.{2,})(.?)/, '\1 \2 \3'],
- [/(^|\s)(\.{2,})([^\.\s])/, '\1\2 \3'],
- [/(^|\s)(\.{2,})([^\.\s])/, '\1 \2\3'],
+ # Md. or MD. for Ruby 1.8
+ [/M[d|D]./, '\1'],
 
- ##### Some additional fixes.
+ # Separate words from ellipses
+ [/([^\.]|^)(\.{2,})(.?)/, '\1 \2 \3'],
+ [/(^|[:space:])(\.{2,})([^\.[:space:]])/u, '\1\2 \3'],
+ [/(^|[:space:])(\.{2,})([^\.[:space:]])/u, '\1 \2\3'],
 
- # Fix %, $, &
- [/(\d)%/, '\1 %'],
- [/\$(\.?\d)/, '$ \1'],
- [/(\w)& (\w)/, '\1&\2'],
- [/(\w\w+)&(\w\w+)/, '\1 & \2'],
+ ##### Some additional fixes.
 
- # Fix (n 't) -> ( n't)
- [/n 't( |$)/, " n't\\1"],
- [/N 'T( |$)/, " N'T\\1"],
+ # Fix %, $, &
+ [/(\d)%/, '\1 %'],
+ [/\$(\.?\d)/, '$ \1'],
+ [/(^[:lower:]^[:upper:])& (^[:lower:]^[:upper:])/u, '\1&\2'],
+ [/(^[:lower:]^[:upper:]+)&(^[:lower:]^[:upper:]+)/u, '\1 & \2'],
 
- # Treebank tokenizer special words
- [/([Cc])annot/, '\1an not']
+ # Fix (n 't) -> ( n't)
+ [/n 't( |$)/, " n't\\1"],
+ [/N 'T( |$)/, " N'T\\1"],
 
- ];
+ # Treebank tokenizer special words
+ [/([Cc])annot/, '\1an not']
 
- def tokenize(s)
- rules = []
- @@tokenize_regexps.each {|rules| s.gsub!(rules[0], rules[1])}
- end
+ ];
+
+ def tokenize(s)
+ rules = []
+ @@tokenize_regexps.each {|rules| s.gsub!(rules[0], rules[1])}
+ end
  end
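
WordTokenizer#tokenize runs each [pattern, replacement] pair above as an in-place gsub! on the string, so rule order matters (quotes are normalized first, for instance). A trimmed-down illustration using three of the rules verbatim:

  rules = [
    [/''|``/, '"'],               # uniform quotes
    [/(\d)%/, '\1 %'],            # detach % from a number
    [/([Cc])annot/, '\1an not']   # Treebank special word
  ]

  s = "She said ``I cannot pay 50%''"
  rules.each { |pattern, replacement| s.gsub!(pattern, replacement) }
  puts s  # => She said "I can not pay 50 %"
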
data/spec/files/sample.txt CHANGED
@@ -96,3 +96,4 @@ The Seattle Mariners on Sunday announced they signed high school catcher Ryan Ch
 
  Christianson, 18, from Arlington High School in Riverside, Calif., will report to the Mariners' rookie league team in Peoria, Ariz. If he plays well there, he could be elevated to Class A Everett of the Northwest League in ``a couple of weeks,'' said Frank Mattox, the team's director of scouting.
 
+ Добавим немного русского текста, чтобы проверить, верно ли все работает. Еще одно предложение. Работай! Будешь? Нет?
data/spec/files/test_out.txt CHANGED
@@ -92,3 +92,8 @@ It was replay of the scene from last year when David Wells pitched the only othe
  The Seattle Mariners on Sunday announced they signed high school catcher Ryan Christianson, their first-round pick and the 11th overall selection in last month's baseball draft.
  Christianson, 18, from Arlington High School in Riverside, Calif., will report to the Mariners' rookie league team in Peoria, Ariz.
  If he plays well there, he could be elevated to Class A Everett of the Northwest League in ``a couple of weeks,'' said Frank Mattox, the team's director of scouting.
+ Добавим немного русского текста, чтобы проверить, верно ли все работает.
+ Еще одно предложение.
+ Работай!
+ Будешь?
+ Нет?
data/spec/files/verification_out.txt CHANGED
@@ -92,3 +92,8 @@ It was replay of the scene from last year when David Wells pitched the only othe
  The Seattle Mariners on Sunday announced they signed high school catcher Ryan Christianson, their first-round pick and the 11th overall selection in last month's baseball draft.
  Christianson, 18, from Arlington High School in Riverside, Calif., will report to the Mariners' rookie league team in Peoria, Ariz.
  If he plays well there, he could be elevated to Class A Everett of the Northwest League in ``a couple of weeks,'' said Frank Mattox, the team's director of scouting.
+ Добавим немного русского текста, чтобы проверить, верно ли все работает.
+ Еще одно предложение.
+ Работай!
+ Будешь?
+ Нет?
data/spec/spec_helper.rb ADDED
@@ -0,0 +1,7 @@
+ $:.unshift File.expand_path('..', __FILE__)
+ $:.unshift File.expand_path('../../lib', __FILE__)
+
+ require 'rspec'
+ require 'tactful_tokenizer'
+ require 'coveralls'
+ Coveralls.wear!
data/spec/tactful_tokenizer/tactful_tokenizer_spec.rb ADDED
@@ -0,0 +1,96 @@
+ # -*- encoding : utf-8 -*-
+ require 'spec_helper'
+
+ describe String do
+ describe "::is_upper_case?" do
+ it "should be false" do
+ "asdfghjk".is_upper_case?.should == false
+ end
+
+ it "should be true" do
+ "ASDFGHJK".is_upper_case?.should == true
+ end
+ end
+
+ describe "::is_alphabetic?" do
+ it "should be false" do
+ "!^?".is_alphabetic?.should == false
+ end
+
+ it "should be true" do
+ "some text".is_alphabetic?.should == true
+ end
+
+ it "should be true for unicode text" do
+ "русский текст öö üüü".is_alphabetic?.should == true
+ end
+ end
+ end
+
+ describe TactfulTokenizer::Doc do
+ describe "::segment" do
+ it "should return array of segments" do
+ model = TactfulTokenizer::Model.new
+ doc = TactfulTokenizer::Doc.new("Hello!\nMy name is Richard Stewart.\nHow are you?\n")
+ model.featurize doc
+ model.classify doc
+ doc.segment.should == ["Hello!", "My name is Richard Stewart.", "How are you?"]
+ end
+ end
+ end
+
+ describe TactfulTokenizer::Frag do
+ describe "::clean" do
+ before :each do
+ @frag = TactfulTokenizer::Frag.new
+ @cleaned = @frag.clean("1 good bad 23 ?!")
+ end
+
+ it "should return an instance of Array" do
+ @cleaned.class.should == Array
+ end
+
+ it "should normalize numbers and discard ambiguous punctuation" do
+ @cleaned.should == ["<NUM>", "good", "bad", "<NUM>", "?", "!"]
+ end
+ end
+ end
+
+ describe TactfulTokenizer::Model do
+ before :each do
+ @m = TactfulTokenizer::Model.new
+ File.open('spec/files/sample.txt') do |f|
+ @text = f.read
+ end
+ end
+
+ describe "::classify" do
+ it "should assign a prediction for frags" do
+ doc = TactfulTokenizer::Doc.new("Hello!\n")
+ @m.featurize(doc)
+ @m.classify(doc).first.pred.should > 0.5
+ end
+ end
+
+ describe "::featurize" do
+ it "should get the features of every fragment" do
+ doc = TactfulTokenizer::Doc.new("Hello!\n")
+ @m.featurize(doc).first.features.should == ["w1_!", "w2_", "both_!_"]
+ end
+ end
+
+ describe "::tokenize_text" do
+ it "should tokenize correctly" do
+ text = @m.tokenize_text(@text)
+ File.open("spec/files/test_out.txt", "w+") do |g|
+ text.each do |line|
+ g.puts line unless line.empty?
+ end
+ g.rewind
+ t2 = g.read
+ t1 = File.open("spec/files/verification_out.txt").read
+ t1.should == t2
+ end
+ end
+ end
+ end
data/tactful_tokenizer.gemspec CHANGED
@@ -1,32 +1,25 @@
  # -*- encoding: utf-8 -*-
+ $:.push File.expand_path("../lib", __FILE__)
+ require "tactful_tokenizer/version"
 
  Gem::Specification.new do |s|
- s.name = %q{tactful_tokenizer}
- s.version = "0.0.2"
+ s.name = "tactful_tokenizer"
+ s.version = TactfulTokenizer::VERSION
+ s.platform = Gem::Platform::RUBY
+ s.authors = ["Matthew Bunday", "Sergey Kishenin"]
+ s.email = ["mkbunday@gmail.com"]
+ s.homepage = "http://github.com/zencephalon/Tactful_Tokenizer"
+ s.summary = "High accuracy sentence tokenization for Ruby."
+ s.description = "TactfulTokenizer uses a naive bayesian model train on the Brown and WSJ corpuses to provide high quality sentence tokenization."
+ s.license = "GPL-3"
 
- s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
- s.authors = ["Matthew Bunday"]
- s.cert_chain = ["/home/slyshy/.ssh/gem-public_cert.pem"]
- s.date = %q{2010-04-04}
- s.description = %q{A high accuracy naive bayesian sentence tokenizer based on Splitta.}
- s.email = %q{mkbunday @nospam@ gmail.com}
- s.extra_rdoc_files = ["README.rdoc", "lib/models/features.mar", "lib/models/lower_words.mar", "lib/models/non_abbrs.mar", "lib/tactful_tokenizer.rb", "lib/word_tokenizer.rb"]
- s.files = ["Manifest", "README.rdoc", "Rakefile", "lib/models/features.mar", "lib/models/lower_words.mar", "lib/models/non_abbrs.mar", "lib/tactful_tokenizer.rb", "lib/word_tokenizer.rb", "test/sample.txt", "test/test.rb", "test/test_out.txt", "test/verification_out.txt", "tactful_tokenizer.gemspec"]
- s.homepage = %q{http://github.com/SlyShy/Tactful_Tokenizer}
- s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Tactful_tokenizer", "--main", "README.rdoc"]
- s.require_paths = ["lib"]
- s.rubyforge_project = %q{tactful_tokenizer}
- s.rubygems_version = %q{1.3.6}
- s.signing_key = %q{/home/slyshy/.ssh/gem-private_key.pem}
- s.summary = %q{A high accuracy naive bayesian sentence tokenizer based on Splitta.}
+ s.rubyforge_project = "tactful_tokenizer"
 
- if s.respond_to? :specification_version then
- current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
- s.specification_version = 3
+ s.files = `git ls-files`.split($\)
+ s.executables = s.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
+ s.test_files = s.files.grep(%r{^(test|spec|features)/})
+ s.require_paths = ["lib"]
 
- if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
- else
- end
- else
- end
+ s.add_development_dependency "rspec", "~> 0"
+ s.add_development_dependency "rake", "~> 0"
  end
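
Since s.files now comes from `git ls-files`, anything committed to the repository ships with the gem, and the executables and test_files lists are derived from that same listing rather than maintained by hand.
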
metadata CHANGED
@@ -1,106 +1,96 @@
- --- !ruby/object:Gem::Specification
+ --- !ruby/object:Gem::Specification
  name: tactful_tokenizer
- version: !ruby/object:Gem::Version
- prerelease: false
- segments:
- - 0
- - 0
- - 2
- version: 0.0.2
+ version: !ruby/object:Gem::Version
+ version: 0.0.3
  platform: ruby
- authors:
+ authors:
  - Matthew Bunday
+ - Sergey Kishenin
  autorequire:
  bindir: bin
- cert_chain:
- - |
- -----BEGIN CERTIFICATE-----
- MIIDMjCCAhqgAwIBAgIBADANBgkqhkiG9w0BAQUFADA/MREwDwYDVQQDDAhta2J1
- bmRheTEVMBMGCgmSJomT8ixkARkWBWdtYWlsMRMwEQYKCZImiZPyLGQBGRYDY29t
- MB4XDTEwMDMyMzE2MDkzOVoXDTExMDMyMzE2MDkzOVowPzERMA8GA1UEAwwIbWti
- dW5kYXkxFTATBgoJkiaJk/IsZAEZFgVnbWFpbDETMBEGCgmSJomT8ixkARkWA2Nv
- bTCCASIwDQYJKoZIhvcNAQEBBQADggEPADCCAQoCggEBAMk5+Wsur5ptIGUthPBG
- VHECPqlV7TRgxiEMbH8vxkMVNnqFGDTezd9zsmqfX9kKR4/Jmu1fXKyBswGRxYxD
- qx8nR+DCnWk0gfx2jjpnknPPWTQ6lHiZaPrGb+QuANhebPTwI6cDIz4A3dg2QIRo
- ETdiAdOspNudUHu2Jf/QeNQPr5SURy9vGnSXkDhMcrnR3EjkRAP4suNIlHBNj3Hz
- 7hYjZV5QzeFwVENR5K3zFSkbC3ZK6uZTUwPVngmCqWz3MLsNJiQhAhvn/XQ8OCJ3
- Q8O/nPuIIqFNeT3TMvnfrbx+wyxX6FIBZ12M4lNmU6yoXxzmi/n/cBNLAkQ/hc2g
- n68CAwEAAaM5MDcwCQYDVR0TBAIwADAdBgNVHQ4EFgQUZfQL/a3SzQ017Zj9MUwh
- Y6BtLUgwCwYDVR0PBAQDAgSwMA0GCSqGSIb3DQEBBQUAA4IBAQAjdEGkZbV7tkOq
- N0y3yL5n1JOMsVHsQF7/w2zeET3PyUgKmmobdq3V0rztqVcJ1oP/+fYUO1KYxC90
- b8FOCGGvcKjMn1QJufFp1DTfiGFcz6nHRWmiAMRXbempzA5NDzocQP9jaRkoYEzK
- pwsJwe0dlpJXs8/fqqljNdBe4AToDGLcbzdMmpGxZN63P70yAFL5G7sJy1Izp5ei
- CvIRDtL1PdU1ESVLFJuoCAiCtpBfwwepv4kuuoca9Ykd5ldPCGzMq0n8+KIubb+2
- xz7fp33atnZoMajdCOYKqwo2xVhUuFPZzBFZ3L6T6YLuEVGKHNyUAfcfr+8VSuB5
- 3+l7cSZt
- -----END CERTIFICATE-----
-
- date: 2010-04-04 00:00:00 -05:00
- default_executable:
- dependencies: []
-
- description: A high accuracy naive bayesian sentence tokenizer based on Splitta.
- email: mkbunday @nospam@ gmail.com
+ cert_chain: []
+ date: 2014-04-25 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+ name: rspec
+ requirement: !ruby/object:Gem::Requirement
+ requirements:
+ - - "~>"
+ - !ruby/object:Gem::Version
+ version: '0'
+ type: :development
+ prerelease: false
+ version_requirements: !ruby/object:Gem::Requirement
+ requirements:
+ - - "~>"
+ - !ruby/object:Gem::Version
+ version: '0'
+ - !ruby/object:Gem::Dependency
+ name: rake
+ requirement: !ruby/object:Gem::Requirement
+ requirements:
+ - - "~>"
+ - !ruby/object:Gem::Version
+ version: '0'
+ type: :development
+ prerelease: false
+ version_requirements: !ruby/object:Gem::Requirement
+ requirements:
+ - - "~>"
+ - !ruby/object:Gem::Version
+ version: '0'
+ description: TactfulTokenizer uses a naive bayesian model train on the Brown and WSJ
+ corpuses to provide high quality sentence tokenization.
+ email:
+ - mkbunday@gmail.com
  executables: []
-
  extensions: []
-
- extra_rdoc_files:
- - README.rdoc
- - lib/models/features.mar
- - lib/models/lower_words.mar
- - lib/models/non_abbrs.mar
- - lib/tactful_tokenizer.rb
- - lib/word_tokenizer.rb
- files:
- - Manifest
+ extra_rdoc_files: []
+ files:
+ - ".gitignore"
+ - ".travis.yml"
+ - Gemfile
  - README.rdoc
  - Rakefile
  - lib/models/features.mar
  - lib/models/lower_words.mar
  - lib/models/non_abbrs.mar
  - lib/tactful_tokenizer.rb
+ - lib/tactful_tokenizer/version.rb
  - lib/word_tokenizer.rb
- - test/sample.txt
- - test/test.rb
- - test/test_out.txt
- - test/verification_out.txt
+ - spec/files/sample.txt
+ - spec/files/test_out.txt
+ - spec/files/verification_out.txt
+ - spec/spec_helper.rb
+ - spec/tactful_tokenizer/tactful_tokenizer_spec.rb
  - tactful_tokenizer.gemspec
- has_rdoc: true
- homepage: http://github.com/SlyShy/Tactful_Tokenizer
- licenses: []
-
+ homepage: http://github.com/zencephalon/Tactful_Tokenizer
+ licenses:
+ - GPL-3
+ metadata: {}
  post_install_message:
- rdoc_options:
- - --line-numbers
- - --inline-source
- - --title
- - Tactful_tokenizer
- - --main
- - README.rdoc
- require_paths:
+ rdoc_options: []
+ require_paths:
  - lib
- required_ruby_version: !ruby/object:Gem::Requirement
- requirements:
+ required_ruby_version: !ruby/object:Gem::Requirement
+ requirements:
  - - ">="
- - !ruby/object:Gem::Version
- segments:
- - 0
- version: "0"
- required_rubygems_version: !ruby/object:Gem::Requirement
- requirements:
+ - !ruby/object:Gem::Version
+ version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+ requirements:
  - - ">="
- - !ruby/object:Gem::Version
- segments:
- - 1
- - 2
- version: "1.2"
+ - !ruby/object:Gem::Version
+ version: '0'
  requirements: []
-
  rubyforge_project: tactful_tokenizer
- rubygems_version: 1.3.6
+ rubygems_version: 2.2.2
  signing_key:
- specification_version: 3
- summary: A high accuracy naive bayesian sentence tokenizer based on Splitta.
- test_files: []
-
+ specification_version: 4
+ summary: High accuracy sentence tokenization for Ruby.
+ test_files:
+ - spec/files/sample.txt
+ - spec/files/test_out.txt
+ - spec/files/verification_out.txt
+ - spec/spec_helper.rb
+ - spec/tactful_tokenizer/tactful_tokenizer_spec.rb
data.tar.gz.sig DELETED
Binary file
data/Manifest DELETED
@@ -1,12 +0,0 @@
- Manifest
- README.rdoc
- Rakefile
- lib/models/features.mar
- lib/models/lower_words.mar
- lib/models/non_abbrs.mar
- lib/tactful_tokenizer.rb
- lib/word_tokenizer.rb
- test/sample.txt
- test/test.rb
- test/test_out.txt
- test/verification_out.txt
data/test/test.rb DELETED
@@ -1,21 +0,0 @@
- require '../lib/tactful_tokenizer'
- require 'test/unit'
-
- class TactfulTokenize < Test::Unit::TestCase
- def test_simple
- m = TactfulTokenizer::Model.new
- File.open("sample.txt") do |f|
- text = f.read
- text = m.tokenize_text(text)
- File.open("test_out.txt","w+") do |g|
- text.each do |line|
- g.puts line unless line.empty?
- end
- g.rewind
- t2 = g.read
- t1 = File.open("verification_out.txt").read
- assert_equal(t1, t2)
- end
- end
- end
- end
metadata.gz.sig DELETED
Binary file