tactful_tokenizer 0.0.3 → 0.0.5

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 12a5db701c483d6d9653b1d9c1a6d1ac242501ff
4
- data.tar.gz: 2dafa09df763499694bd2580e442ad42b1fcb304
3
+ metadata.gz: 69094f450e0e7dac588b0402d7480070df7503b1
4
+ data.tar.gz: 5b70295f51ce23cfcc53ff252f710c31c759c64f
5
5
  SHA512:
6
- metadata.gz: 7d60773b82ba93aca28cb79402b73c73408a3466001346d89885f9c7d003339ec44b1a412224ec08692f3ea7d3665cabab7c93369c90a8b330ecc847f3e54ae3
7
- data.tar.gz: f53f494ef41b55afb7bda23320f9f1e7743164f187a507c60ee14667290552919adf618e01ca5bd5c2e30bc451a576ead8245f99a43339e81d74a80af89540bd
6
+ metadata.gz: 7a338080ea08773e1057561819f9d737a6ec3a12c1a9e56ddee97e307360f4f04ebdbe8b656bc0e5a9952d10fc606a83154ba6c7922a846178e61f1138b8b898
7
+ data.tar.gz: dde829e0be9b0745ac7c7d5fd76bd5d0027b01a831304f8b9e6fac5fe04a12e5808449d80db9ae1ca76257297b36233284cc929e512c05bb97b154f380b75c3e
@@ -1,9 +1,14 @@
1
1
  language: ruby
2
2
  rvm:
3
+ - 2.1.0
3
4
  - 2.0.0
4
5
  - 1.9.3
5
6
  - 1.9.2
6
7
  - jruby-18mode # JRuby in 1.8 mode
7
8
  - jruby-19mode # JRuby in 1.9 mode
8
- - rbx-19mode
9
- - 1.8.7
9
+ - rbx-2.2.6
10
+ - 1.8.7
11
+ matrix:
12
+ allow_failures:
13
+ - rvm: 1.8.7
14
+ - rvm: jruby-18mode
@@ -1,3 +1,4 @@
1
+ # -*- encoding : utf-8 -*-
1
2
  # TactfulTokenizer is a Ruby library for high quality sentence
2
3
  # tokenization. It uses a Naive Bayesian statistical model, and
3
4
  # is based on Splitta[http://code.google.com/p/splitta/]. But
@@ -135,7 +136,7 @@ module TactfulTokenizer
135
136
  res = nil
136
137
  text.each_line do |line|
137
138
  unless line.strip.empty?
138
- line.split(/(.*?[.!?](?:["')\]}]|(?:<.*>))*[[:space:]])/u).each do |res|
139
+ line.split(/(.*?[.!?](?:["')\]}]|(?:<.*>))*[[:space:]])/u).each do |res|
139
140
  unless res.strip.empty?
140
141
  frag = Frag.new(res)
141
142
  @frags.last.next = frag.cleaned.first unless @frags.empty?
@@ -1,3 +1,3 @@
1
1
  module TactfulTokenizer
2
- VERSION = "0.0.3"
2
+ VERSION = "0.0.5"
3
3
  end
@@ -2,7 +2,7 @@
2
2
  module WordTokenizer
3
3
  @@tokenize_regexps = [
4
4
  # Uniform Quotes
5
- [/''|``/, '"'],
5
+ [/''|``|“|”/, '"'],
6
6
 
7
7
  # Separate punctuation (except for periods) from words.
8
8
  [/(^|[:space:])(')/u, '\1\2'],
@@ -96,4 +96,6 @@ The Seattle Mariners on Sunday announced they signed high school catcher Ryan Ch
96
96
 
97
97
  Christianson, 18, from Arlington High School in Riverside, Calif., will report to the Mariners' rookie league team in Peoria, Ariz. If he plays well there, he could be elevated to Class A Everett of the Northwest League in ``a couple of weeks,'' said Frank Mattox, the team's director of scouting.
98
98
 
99
- Добавим немного русского текста, чтобы проверить, верно ли все работает. Еще одно предложение. Работай! Будешь? Нет?
99
+ Добавим немного русского текста, чтобы проверить, верно ли все работает. Еще одно предложение. Работай! Будешь? Нет?
100
+
101
+ “But the point of writing something down is so it stops bothering you—that's why it seems less haunting after you've written it down.” A breeze blows through the cattail stalks and rolls over us, and her scent mixes with the musk of earth.
@@ -97,3 +97,5 @@ If he plays well there, he could be elevated to Class A Everett of the Northwest
97
97
  Работай!
98
98
  Будешь?
99
99
  Нет?
100
+ “But the point of writing something down is so it stops bothering you—that's why it seems less haunting after you've written it down.”
101
+ A breeze blows through the cattail stalks and rolls over us, and her scent mixes with the musk of earth.
@@ -7,7 +7,7 @@ Gem::Specification.new do |s|
7
7
  s.version = TactfulTokenizer::VERSION
8
8
  s.platform = Gem::Platform::RUBY
9
9
  s.authors = ["Matthew Bunday", "Sergey Kishenin"]
10
- s.email = ["mkbunday@gmail.com"]
10
+ s.email = ["mkbunday@gmail.com", "sergey.kishenin@gmail.com"]
11
11
  s.homepage = "http://github.com/zencephalon/Tactful_Tokenizer"
12
12
  s.summary = "High accuracy sentence tokenization for Ruby."
13
13
  s.description = "TactfulTokenizer uses a naive bayesian model train on the Brown and WSJ corpuses to provide high quality sentence tokenization."
@@ -20,6 +20,6 @@ Gem::Specification.new do |s|
20
20
  s.test_files = s.files.grep(%r{^(test|spec|features)/})
21
21
  s.require_paths = ["lib"]
22
22
 
23
- s.add_development_dependency "rspec", "~> 0"
24
- s.add_development_dependency "rake", "~> 0"
23
+ s.add_development_dependency "rspec", "~> 2.14.1"
24
+ s.add_development_dependency "rake", "~> 10.3.1"
25
25
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tactful_tokenizer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.3
4
+ version: 0.0.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Matthew Bunday
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2014-04-25 00:00:00.000000000 Z
12
+ date: 2014-04-27 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rspec
@@ -17,32 +17,33 @@ dependencies:
17
17
  requirements:
18
18
  - - "~>"
19
19
  - !ruby/object:Gem::Version
20
- version: '0'
20
+ version: 2.14.1
21
21
  type: :development
22
22
  prerelease: false
23
23
  version_requirements: !ruby/object:Gem::Requirement
24
24
  requirements:
25
25
  - - "~>"
26
26
  - !ruby/object:Gem::Version
27
- version: '0'
27
+ version: 2.14.1
28
28
  - !ruby/object:Gem::Dependency
29
29
  name: rake
30
30
  requirement: !ruby/object:Gem::Requirement
31
31
  requirements:
32
32
  - - "~>"
33
33
  - !ruby/object:Gem::Version
34
- version: '0'
34
+ version: 10.3.1
35
35
  type: :development
36
36
  prerelease: false
37
37
  version_requirements: !ruby/object:Gem::Requirement
38
38
  requirements:
39
39
  - - "~>"
40
40
  - !ruby/object:Gem::Version
41
- version: '0'
41
+ version: 10.3.1
42
42
  description: TactfulTokenizer uses a naive bayesian model train on the Brown and WSJ
43
43
  corpuses to provide high quality sentence tokenization.
44
44
  email:
45
45
  - mkbunday@gmail.com
46
+ - sergey.kishenin@gmail.com
46
47
  executables: []
47
48
  extensions: []
48
49
  extra_rdoc_files: []