RubyGems - tactful_tokenizer - Versions diffs - 0.0.3 → 0.0.5 - Mend

tactful_tokenizer 0.0.3 → 0.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9)

checksums.yaml +4 -4
data/.travis.yml +7 -2
data/lib/tactful_tokenizer.rb +2 -1
data/lib/tactful_tokenizer/version.rb +1 -1
data/lib/word_tokenizer.rb +1 -1
data/spec/files/sample.txt +3 -1
data/spec/files/verification_out.txt +2 -0
data/tactful_tokenizer.gemspec +3 -3
metadata +7 -6

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 12a5db701c483d6d9653b1d9c1a6d1ac242501ff
-  data.tar.gz: 2dafa09df763499694bd2580e442ad42b1fcb304
+  metadata.gz: 69094f450e0e7dac588b0402d7480070df7503b1
+  data.tar.gz: 5b70295f51ce23cfcc53ff252f710c31c759c64f
 SHA512:
-  metadata.gz: 7d60773b82ba93aca28cb79402b73c73408a3466001346d89885f9c7d003339ec44b1a412224ec08692f3ea7d3665cabab7c93369c90a8b330ecc847f3e54ae3
-  data.tar.gz: f53f494ef41b55afb7bda23320f9f1e7743164f187a507c60ee14667290552919adf618e01ca5bd5c2e30bc451a576ead8245f99a43339e81d74a80af89540bd
+  metadata.gz: 7a338080ea08773e1057561819f9d737a6ec3a12c1a9e56ddee97e307360f4f04ebdbe8b656bc0e5a9952d10fc606a83154ba6c7922a846178e61f1138b8b898
+  data.tar.gz: dde829e0be9b0745ac7c7d5fd76bd5d0027b01a831304f8b9e6fac5fe04a12e5808449d80db9ae1ca76257297b36233284cc929e512c05bb97b154f380b75c3e

data/.travis.yml CHANGED

@@ -1,9 +1,14 @@
 language: ruby
 rvm:
+  - 2.1.0
   - 2.0.0
   - 1.9.3
   - 1.9.2
   - jruby-18mode # JRuby in 1.8 mode
   - jruby-19mode # JRuby in 1.9 mode
-  - rbx-19mode
-  - 1.8.7
+  - rbx-2.2.6
+  - 1.8.7
+matrix:
+  allow_failures:
+    - rvm: 1.8.7
+    - rvm: jruby-18mode

data/lib/tactful_tokenizer.rb CHANGED

@@ -1,3 +1,4 @@
+# -*- encoding : utf-8 -*-
 # TactfulTokenizer is a Ruby library for high quality sentence
 # tokenization. It uses a Naive Bayesian statistical model, and
 # is based on Splitta[http://code.google.com/p/splitta/]. But
@@ -135,7 +136,7 @@ module TactfulTokenizer
       res = nil
       text.each_line do |line|
         unless line.strip.empty?
-          line.split(/(.*?[.!?](?:["')\]}]|(?:<.*>))*[[:space:]])/u).each do |res|
+          line.split(/(.*?[.!?](?:[”"')\]}]|(?:<.*>))*[[:space:]])/u).each do |res|
             unless res.strip.empty?
               frag = Frag.new(res)
               @frags.last.next = frag.cleaned.first unless @frags.empty?

data/lib/tactful_tokenizer/version.rb CHANGED

@@ -1,3 +1,3 @@
 module TactfulTokenizer
-  VERSION = "0.0.3"
+  VERSION = "0.0.5"
 end

data/lib/word_tokenizer.rb CHANGED

@@ -2,7 +2,7 @@
 module WordTokenizer
   @@tokenize_regexps = [
     # Uniform Quotes
-    [/''|``/, '"'],
+    [/''|``|“|”/, '"'],
     # Separate punctuation (except for periods) from words.
     [/(^|[:space:])(')/u, '\1\2'],

data/spec/files/sample.txt CHANGED

@@ -96,4 +96,6 @@ The Seattle Mariners on Sunday announced they signed high school catcher Ryan Ch
 Christianson, 18, from Arlington High School in Riverside, Calif., will report to the Mariners' rookie league team in Peoria, Ariz. If he plays well there, he could be elevated to Class A Everett of the Northwest League in ``a couple of weeks,'' said Frank Mattox, the team's director of scouting.
-Добавим немного русского текста, чтобы проверить, верно ли все работает. Еще одно предложение. Работай! Будешь? Нет?
+Добавим немного русского текста, чтобы проверить, верно ли все работает. Еще одно предложение. Работай! Будешь? Нет?
+“But the point of writing something down is so it stops bothering you—that's why it seems less haunting after you've written it down.” A breeze blows through the cattail stalks and rolls over us, and her scent mixes with the musk of earth.

data/spec/files/verification_out.txt CHANGED

@@ -97,3 +97,5 @@ If he plays well there, he could be elevated to Class A Everett of the Northwest
 Работай!
 Будешь?
 Нет?
+“But the point of writing something down is so it stops bothering you—that's why it seems less haunting after you've written it down.”
+A breeze blows through the cattail stalks and rolls over us, and her scent mixes with the musk of earth.

data/tactful_tokenizer.gemspec CHANGED

@@ -7,7 +7,7 @@ Gem::Specification.new do |s|
   s.version     = TactfulTokenizer::VERSION
   s.platform    = Gem::Platform::RUBY
   s.authors     = ["Matthew Bunday", "Sergey Kishenin"]
-  s.email       = ["mkbunday@gmail.com"]
+  s.email       = ["mkbunday@gmail.com", "sergey.kishenin@gmail.com"]
   s.homepage    = "http://github.com/zencephalon/Tactful_Tokenizer"
   s.summary     = "High accuracy sentence tokenization for Ruby."
   s.description = "TactfulTokenizer uses a naive bayesian model train on the Brown and WSJ corpuses to provide high quality sentence tokenization."
@@ -20,6 +20,6 @@ Gem::Specification.new do |s|
   s.test_files    = s.files.grep(%r{^(test|spec|features)/})
   s.require_paths = ["lib"]
-  s.add_development_dependency "rspec", "~> 0"
-  s.add_development_dependency "rake", "~> 0"
+  s.add_development_dependency "rspec", "~> 2.14.1"
+  s.add_development_dependency "rake", "~> 10.3.1"
 end

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: tactful_tokenizer
 version: !ruby/object:Gem::Version
-  version: 0.0.3
+  version: 0.0.5
 platform: ruby
 authors:
 - Matthew Bunday
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-04-25 00:00:00.000000000 Z
+date: 2014-04-27 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rspec
@@ -17,32 +17,33 @@ dependencies:
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '0'
+        version: 2.14.1
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '0'
+        version: 2.14.1
 - !ruby/object:Gem::Dependency
   name: rake
   requirement: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '0'
+        version: 10.3.1
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '0'
+        version: 10.3.1
 description: TactfulTokenizer uses a naive bayesian model train on the Brown and WSJ
   corpuses to provide high quality sentence tokenization.
 email:
 - mkbunday@gmail.com
+- sergey.kishenin@gmail.com
 executables: []
 extensions: []
 extra_rdoc_files: []