tactful_tokenizer 0.0.3 → 0.0.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.travis.yml +7 -2
- data/lib/tactful_tokenizer.rb +2 -1
- data/lib/tactful_tokenizer/version.rb +1 -1
- data/lib/word_tokenizer.rb +1 -1
- data/spec/files/sample.txt +3 -1
- data/spec/files/verification_out.txt +2 -0
- data/tactful_tokenizer.gemspec +3 -3
- metadata +7 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 69094f450e0e7dac588b0402d7480070df7503b1
|
4
|
+
data.tar.gz: 5b70295f51ce23cfcc53ff252f710c31c759c64f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7a338080ea08773e1057561819f9d737a6ec3a12c1a9e56ddee97e307360f4f04ebdbe8b656bc0e5a9952d10fc606a83154ba6c7922a846178e61f1138b8b898
|
7
|
+
data.tar.gz: dde829e0be9b0745ac7c7d5fd76bd5d0027b01a831304f8b9e6fac5fe04a12e5808449d80db9ae1ca76257297b36233284cc929e512c05bb97b154f380b75c3e
|
data/.travis.yml
CHANGED
@@ -1,9 +1,14 @@
|
|
1
1
|
language: ruby
|
2
2
|
rvm:
|
3
|
+
- 2.1.0
|
3
4
|
- 2.0.0
|
4
5
|
- 1.9.3
|
5
6
|
- 1.9.2
|
6
7
|
- jruby-18mode # JRuby in 1.8 mode
|
7
8
|
- jruby-19mode # JRuby in 1.9 mode
|
8
|
-
- rbx-
|
9
|
-
- 1.8.7
|
9
|
+
- rbx-2.2.6
|
10
|
+
- 1.8.7
|
11
|
+
matrix:
|
12
|
+
allow_failures:
|
13
|
+
- rvm: 1.8.7
|
14
|
+
- rvm: jruby-18mode
|
data/lib/tactful_tokenizer.rb
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
# -*- encoding : utf-8 -*-
|
1
2
|
# TactfulTokenizer is a Ruby library for high quality sentence
|
2
3
|
# tokenization. It uses a Naive Bayesian statistical model, and
|
3
4
|
# is based on Splitta[http://code.google.com/p/splitta/]. But
|
@@ -135,7 +136,7 @@ module TactfulTokenizer
|
|
135
136
|
res = nil
|
136
137
|
text.each_line do |line|
|
137
138
|
unless line.strip.empty?
|
138
|
-
line.split(/(.*?[.!?](?:["')\]}]|(?:<.*>))*[[:space:]])/u).each do |res|
|
139
|
+
line.split(/(.*?[.!?](?:[”"')\]}]|(?:<.*>))*[[:space:]])/u).each do |res|
|
139
140
|
unless res.strip.empty?
|
140
141
|
frag = Frag.new(res)
|
141
142
|
@frags.last.next = frag.cleaned.first unless @frags.empty?
|
data/lib/word_tokenizer.rb
CHANGED
data/spec/files/sample.txt
CHANGED
@@ -96,4 +96,6 @@ The Seattle Mariners on Sunday announced they signed high school catcher Ryan Ch
|
|
96
96
|
|
97
97
|
Christianson, 18, from Arlington High School in Riverside, Calif., will report to the Mariners' rookie league team in Peoria, Ariz. If he plays well there, he could be elevated to Class A Everett of the Northwest League in ``a couple of weeks,'' said Frank Mattox, the team's director of scouting.
|
98
98
|
|
99
|
-
Добавим немного русского текста, чтобы проверить, верно ли все работает. Еще одно предложение. Работай! Будешь? Нет?
|
99
|
+
Добавим немного русского текста, чтобы проверить, верно ли все работает. Еще одно предложение. Работай! Будешь? Нет?
|
100
|
+
|
101
|
+
“But the point of writing something down is so it stops bothering you—that's why it seems less haunting after you've written it down.” A breeze blows through the cattail stalks and rolls over us, and her scent mixes with the musk of earth.
|
@@ -97,3 +97,5 @@ If he plays well there, he could be elevated to Class A Everett of the Northwest
|
|
97
97
|
Работай!
|
98
98
|
Будешь?
|
99
99
|
Нет?
|
100
|
+
“But the point of writing something down is so it stops bothering you—that's why it seems less haunting after you've written it down.”
|
101
|
+
A breeze blows through the cattail stalks and rolls over us, and her scent mixes with the musk of earth.
|
data/tactful_tokenizer.gemspec
CHANGED
@@ -7,7 +7,7 @@ Gem::Specification.new do |s|
|
|
7
7
|
s.version = TactfulTokenizer::VERSION
|
8
8
|
s.platform = Gem::Platform::RUBY
|
9
9
|
s.authors = ["Matthew Bunday", "Sergey Kishenin"]
|
10
|
-
s.email = ["mkbunday@gmail.com"]
|
10
|
+
s.email = ["mkbunday@gmail.com", "sergey.kishenin@gmail.com"]
|
11
11
|
s.homepage = "http://github.com/zencephalon/Tactful_Tokenizer"
|
12
12
|
s.summary = "High accuracy sentence tokenization for Ruby."
|
13
13
|
s.description = "TactfulTokenizer uses a naive bayesian model train on the Brown and WSJ corpuses to provide high quality sentence tokenization."
|
@@ -20,6 +20,6 @@ Gem::Specification.new do |s|
|
|
20
20
|
s.test_files = s.files.grep(%r{^(test|spec|features)/})
|
21
21
|
s.require_paths = ["lib"]
|
22
22
|
|
23
|
-
s.add_development_dependency "rspec", "~>
|
24
|
-
s.add_development_dependency "rake", "~>
|
23
|
+
s.add_development_dependency "rspec", "~> 2.14.1"
|
24
|
+
s.add_development_dependency "rake", "~> 10.3.1"
|
25
25
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tactful_tokenizer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Matthew Bunday
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2014-04-
|
12
|
+
date: 2014-04-27 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rspec
|
@@ -17,32 +17,33 @@ dependencies:
|
|
17
17
|
requirements:
|
18
18
|
- - "~>"
|
19
19
|
- !ruby/object:Gem::Version
|
20
|
-
version:
|
20
|
+
version: 2.14.1
|
21
21
|
type: :development
|
22
22
|
prerelease: false
|
23
23
|
version_requirements: !ruby/object:Gem::Requirement
|
24
24
|
requirements:
|
25
25
|
- - "~>"
|
26
26
|
- !ruby/object:Gem::Version
|
27
|
-
version:
|
27
|
+
version: 2.14.1
|
28
28
|
- !ruby/object:Gem::Dependency
|
29
29
|
name: rake
|
30
30
|
requirement: !ruby/object:Gem::Requirement
|
31
31
|
requirements:
|
32
32
|
- - "~>"
|
33
33
|
- !ruby/object:Gem::Version
|
34
|
-
version:
|
34
|
+
version: 10.3.1
|
35
35
|
type: :development
|
36
36
|
prerelease: false
|
37
37
|
version_requirements: !ruby/object:Gem::Requirement
|
38
38
|
requirements:
|
39
39
|
- - "~>"
|
40
40
|
- !ruby/object:Gem::Version
|
41
|
-
version:
|
41
|
+
version: 10.3.1
|
42
42
|
description: TactfulTokenizer uses a naive bayesian model train on the Brown and WSJ
|
43
43
|
corpuses to provide high quality sentence tokenization.
|
44
44
|
email:
|
45
45
|
- mkbunday@gmail.com
|
46
|
+
- sergey.kishenin@gmail.com
|
46
47
|
executables: []
|
47
48
|
extensions: []
|
48
49
|
extra_rdoc_files: []
|