tactful_tokenizer 0.0.3 → 0.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +7 -2
- data/lib/tactful_tokenizer.rb +2 -1
- data/lib/tactful_tokenizer/version.rb +1 -1
- data/lib/word_tokenizer.rb +1 -1
- data/spec/files/sample.txt +3 -1
- data/spec/files/verification_out.txt +2 -0
- data/tactful_tokenizer.gemspec +3 -3
- metadata +7 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 69094f450e0e7dac588b0402d7480070df7503b1
|
4
|
+
data.tar.gz: 5b70295f51ce23cfcc53ff252f710c31c759c64f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7a338080ea08773e1057561819f9d737a6ec3a12c1a9e56ddee97e307360f4f04ebdbe8b656bc0e5a9952d10fc606a83154ba6c7922a846178e61f1138b8b898
|
7
|
+
data.tar.gz: dde829e0be9b0745ac7c7d5fd76bd5d0027b01a831304f8b9e6fac5fe04a12e5808449d80db9ae1ca76257297b36233284cc929e512c05bb97b154f380b75c3e
|
data/.travis.yml
CHANGED
@@ -1,9 +1,14 @@
|
|
1
1
|
language: ruby
|
2
2
|
rvm:
|
3
|
+
- 2.1.0
|
3
4
|
- 2.0.0
|
4
5
|
- 1.9.3
|
5
6
|
- 1.9.2
|
6
7
|
- jruby-18mode # JRuby in 1.8 mode
|
7
8
|
- jruby-19mode # JRuby in 1.9 mode
|
8
|
-
- rbx-
|
9
|
-
- 1.8.7
|
9
|
+
- rbx-2.2.6
|
10
|
+
- 1.8.7
|
11
|
+
matrix:
|
12
|
+
allow_failures:
|
13
|
+
- rvm: 1.8.7
|
14
|
+
- rvm: jruby-18mode
|
data/lib/tactful_tokenizer.rb
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
# -*- encoding : utf-8 -*-
|
1
2
|
# TactfulTokenizer is a Ruby library for high quality sentence
|
2
3
|
# tokenization. It uses a Naive Bayesian statistical model, and
|
3
4
|
# is based on Splitta[http://code.google.com/p/splitta/]. But
|
@@ -135,7 +136,7 @@ module TactfulTokenizer
|
|
135
136
|
res = nil
|
136
137
|
text.each_line do |line|
|
137
138
|
unless line.strip.empty?
|
138
|
-
line.split(/(.*?[.!?](?:["')\]}]|(?:<.*>))*[[:space:]])/u).each do |res|
|
139
|
+
line.split(/(.*?[.!?](?:[”"')\]}]|(?:<.*>))*[[:space:]])/u).each do |res|
|
139
140
|
unless res.strip.empty?
|
140
141
|
frag = Frag.new(res)
|
141
142
|
@frags.last.next = frag.cleaned.first unless @frags.empty?
|
data/lib/word_tokenizer.rb
CHANGED
data/spec/files/sample.txt
CHANGED
@@ -96,4 +96,6 @@ The Seattle Mariners on Sunday announced they signed high school catcher Ryan Ch
|
|
96
96
|
|
97
97
|
Christianson, 18, from Arlington High School in Riverside, Calif., will report to the Mariners' rookie league team in Peoria, Ariz. If he plays well there, he could be elevated to Class A Everett of the Northwest League in ``a couple of weeks,'' said Frank Mattox, the team's director of scouting.
|
98
98
|
|
99
|
-
Добавим немного русского текста, чтобы проверить, верно ли все работает. Еще одно предложение. Работай! Будешь? Нет?
|
99
|
+
Добавим немного русского текста, чтобы проверить, верно ли все работает. Еще одно предложение. Работай! Будешь? Нет?
|
100
|
+
|
101
|
+
“But the point of writing something down is so it stops bothering you—that's why it seems less haunting after you've written it down.” A breeze blows through the cattail stalks and rolls over us, and her scent mixes with the musk of earth.
|
@@ -97,3 +97,5 @@ If he plays well there, he could be elevated to Class A Everett of the Northwest
|
|
97
97
|
Работай!
|
98
98
|
Будешь?
|
99
99
|
Нет?
|
100
|
+
“But the point of writing something down is so it stops bothering you—that's why it seems less haunting after you've written it down.”
|
101
|
+
A breeze blows through the cattail stalks and rolls over us, and her scent mixes with the musk of earth.
|
data/tactful_tokenizer.gemspec
CHANGED
@@ -7,7 +7,7 @@ Gem::Specification.new do |s|
|
|
7
7
|
s.version = TactfulTokenizer::VERSION
|
8
8
|
s.platform = Gem::Platform::RUBY
|
9
9
|
s.authors = ["Matthew Bunday", "Sergey Kishenin"]
|
10
|
-
s.email = ["mkbunday@gmail.com"]
|
10
|
+
s.email = ["mkbunday@gmail.com", "sergey.kishenin@gmail.com"]
|
11
11
|
s.homepage = "http://github.com/zencephalon/Tactful_Tokenizer"
|
12
12
|
s.summary = "High accuracy sentence tokenization for Ruby."
|
13
13
|
s.description = "TactfulTokenizer uses a naive bayesian model train on the Brown and WSJ corpuses to provide high quality sentence tokenization."
|
@@ -20,6 +20,6 @@ Gem::Specification.new do |s|
|
|
20
20
|
s.test_files = s.files.grep(%r{^(test|spec|features)/})
|
21
21
|
s.require_paths = ["lib"]
|
22
22
|
|
23
|
-
s.add_development_dependency "rspec", "~>
|
24
|
-
s.add_development_dependency "rake", "~>
|
23
|
+
s.add_development_dependency "rspec", "~> 2.14.1"
|
24
|
+
s.add_development_dependency "rake", "~> 10.3.1"
|
25
25
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tactful_tokenizer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.3
|
4
|
+
version: 0.0.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Matthew Bunday
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2014-04-
|
12
|
+
date: 2014-04-27 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rspec
|
@@ -17,32 +17,33 @@ dependencies:
|
|
17
17
|
requirements:
|
18
18
|
- - "~>"
|
19
19
|
- !ruby/object:Gem::Version
|
20
|
-
version:
|
20
|
+
version: 2.14.1
|
21
21
|
type: :development
|
22
22
|
prerelease: false
|
23
23
|
version_requirements: !ruby/object:Gem::Requirement
|
24
24
|
requirements:
|
25
25
|
- - "~>"
|
26
26
|
- !ruby/object:Gem::Version
|
27
|
-
version:
|
27
|
+
version: 2.14.1
|
28
28
|
- !ruby/object:Gem::Dependency
|
29
29
|
name: rake
|
30
30
|
requirement: !ruby/object:Gem::Requirement
|
31
31
|
requirements:
|
32
32
|
- - "~>"
|
33
33
|
- !ruby/object:Gem::Version
|
34
|
-
version:
|
34
|
+
version: 10.3.1
|
35
35
|
type: :development
|
36
36
|
prerelease: false
|
37
37
|
version_requirements: !ruby/object:Gem::Requirement
|
38
38
|
requirements:
|
39
39
|
- - "~>"
|
40
40
|
- !ruby/object:Gem::Version
|
41
|
-
version:
|
41
|
+
version: 10.3.1
|
42
42
|
description: TactfulTokenizer uses a naive bayesian model train on the Brown and WSJ
|
43
43
|
corpuses to provide high quality sentence tokenization.
|
44
44
|
email:
|
45
45
|
- mkbunday@gmail.com
|
46
|
+
- sergey.kishenin@gmail.com
|
46
47
|
executables: []
|
47
48
|
extensions: []
|
48
49
|
extra_rdoc_files: []
|