RubyGems - tactful_tokenizer - Versions diffs - 0.0.2 → 0.0.3 - Mend

tactful_tokenizer 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20) hide show

checksums.yaml +7 -0
data/.gitignore +5 -0
data/.travis.yml +9 -0
data/Gemfile +8 -0
data/README.rdoc +7 -0
data/Rakefile +6 -11
data/lib/tactful_tokenizer.rb +164 -158
data/lib/tactful_tokenizer/version.rb +3 -0
data/lib/word_tokenizer.rb +40 -36
data/{test → spec/files}/sample.txt +1 -0
data/{test → spec/files}/test_out.txt +5 -0
data/{test → spec/files}/verification_out.txt +5 -0
data/spec/spec_helper.rb +7 -0
data/spec/tactful_tokenizer/tactful_tokenizer_spec.rb +96 -0
data/tactful_tokenizer.gemspec +18 -25
metadata +74 -84
data.tar.gz.sig +0 -0
data/Manifest +0 -12
data/test/test.rb +0 -21
metadata.gz.sig +0 -0

checksums.yaml ADDED Viewed

@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: 12a5db701c483d6d9653b1d9c1a6d1ac242501ff
+  data.tar.gz: 2dafa09df763499694bd2580e442ad42b1fcb304
+SHA512:
+  metadata.gz: 7d60773b82ba93aca28cb79402b73c73408a3466001346d89885f9c7d003339ec44b1a412224ec08692f3ea7d3665cabab7c93369c90a8b330ecc847f3e54ae3
+  data.tar.gz: f53f494ef41b55afb7bda23320f9f1e7743164f187a507c60ee14667290552919adf618e01ca5bd5c2e30bc451a576ead8245f99a43339e81d74a80af89540bd

data/.gitignore ADDED Viewed

@@ -0,0 +1,5 @@
+*.gem
+.bundle
+Gemfile.lock
+pkg/*
+/coverage

data/.travis.yml ADDED Viewed

@@ -0,0 +1,9 @@
+language: ruby
+rvm:
+  - 2.0.0
+  - 1.9.3
+  - 1.9.2
+  - jruby-18mode # JRuby in 1.8 mode
+  - jruby-19mode # JRuby in 1.9 mode
+  - rbx-19mode
+  - 1.8.7

data/Gemfile ADDED Viewed

@@ -0,0 +1,8 @@
+source "http://rubygems.org"
+group :test do
+  gem "coveralls", :require => false
+end
+# Specify your gem's dependencies in tactful_tokenizer.gemspec
+gemspec

data/README.rdoc CHANGED Viewed

@@ -1,11 +1,18 @@
 = TactfulTokenizer
+{<img src="https://badge.fury.io/rb/tactful_tokenizer.png" alt="Gem Version" />}[http://badge.fury.io/rb/tactful_tokenizer]
+{<img src="https://travis-ci.org/zencephalon/Tactful_Tokenizer.png?branch=release" alt="Build Status" />}[https://travis-ci.org/zencephalon/Tactful_Tokenizer]
+{<img src="https://codeclimate.com/github/zencephalon/Tactful_Tokenizer.png" />}[https://codeclimate.com/github/zencephalon/Tactful_Tokenizer]
+{<img src="https://coveralls.io/repos/zencephalon/Tactful_Tokenizer/badge.png?branch=release" alt="Coverage Status" />}[https://coveralls.io/r/zencephalon/Tactful_Tokenizer?branch=release]
 TactfulTokenizer is a Ruby library for high quality sentence
 tokenization. It uses a Naive Bayesian statistical model, and
 is based on Splitta[http://code.google.com/p/splitta/], but
 has support for '?' and '!' as well as primitive handling of
 XHTML markup. Better support for XHTML parsing is coming shortly.
+Additionally supports unicode text tokenization.
 == Usage
  require "tactful_tokenizer"

data/Rakefile CHANGED Viewed

@@ -1,12 +1,7 @@
-require 'rubygems'
-require 'rake'
-require 'echoe'
+#!/usr/bin/env rake
+require "bundler/gem_tasks"
+require 'rspec/core/rake_task'
-Echoe.new('tactful_tokenizer', '0.0.2') do |p|
-    p.description    = "A high accuracy naive bayesian sentence tokenizer based on Splitta."
-    p.url            = "http://github.com/SlyShy/Tactful_Tokenizer"
-    p.author         = "Matthew Bunday"
-    p.email          = "mkbunday @nospam@ gmail.com"
-    p.ignore_pattern = ["tmp/*", "script/*"]
-    p.development_dependencies = []
-end
+RSpec::Core::RakeTask.new(:spec)
+task :default => :spec

data/lib/tactful_tokenizer.rb CHANGED Viewed

@@ -17,188 +17,194 @@
 # Author:: Matthew Bunday (mailto:mkbunday@gmail.com)
 # License:: GNU General Public License v3
-require "word_tokenizer.rb"
-include WordTokenizer
-#--
-####### Performance TODOs.
+# Performance TODOs.
 # TODO: Use inline C where necessary?
 # TODO: Use RE2 regexp extension.
-#++
+# -*- encoding : utf-8 -*-
+require "word_tokenizer.rb"
+include WordTokenizer
 module TactfulTokenizer
-    # Basic String extensions.
-    String.class_eval do
+  # Basic String extensions.
+  String.class_eval do
-        # Simple regex to check if a string is alphabetic.
-        def is_alphabetic?
-            return !/[^[:alpha:]]/.match(self)
-        end
+    # Simple regex to check if a string is alphabetic.
+    def is_alphabetic?
+      !/[[:lower:][:upper:][:space:]]+/u.match(self).nil?
+    end
-        # Check for upper case.
-        # Surprisingly, this is faster than a regex in benchmarks.
-        # Using the trinary operator is faster than to_s
-        def is_upper_case?
-            self == self.upcase ? 'true' : 'false'
-        end
+    # Check for upper case.
+    # Surprisingly, this is faster than a regex in benchmarks.
+    # Using the trinary operator is faster than to_s
+    def is_upper_case?
+      self == self.upcase
     end
+  end
-    # A model stores normalized probabilities of different features occuring.
-    class Model
+  # A model stores normalized probabilities of different features occurring.
+  class Model
-        # Initialize the model. feats, lower_words, and non_abbrs
-        # indicate the locations of the respective Marshal dumps.
-        def initialize(feats="#{File.dirname(__FILE__)}/models/features.mar", lower_words="#{File.dirname(__FILE__)}/models/lower_words.mar", non_abbrs="#{File.dirname(__FILE__)}/models/non_abbrs.mar")
-            @feats, @lower_words, @non_abbrs = [feats, lower_words, non_abbrs].map do |file|
-                File.open(file) do |f|
-                    Marshal.load(f.read)
-                end
-            end
-            @p0 = @feats["<prior>"] ** 4
+    # Initialize the model. feats, lower_words, and non_abbrs
+    # indicate the locations of the respective Marshal dumps.
+    def initialize(feats="#{File.dirname(__FILE__)}/models/features.mar", lower_words="#{File.dirname(__FILE__)}/models/lower_words.mar", non_abbrs="#{File.dirname(__FILE__)}/models/non_abbrs.mar")
+      @feats, @lower_words, @non_abbrs = [feats, lower_words, non_abbrs].map do |file|
+        File.open(file) do |f|
+          Marshal.load(f.read)
         end
+      end
+      @p0 = @feats["<prior>"] ** 4
+    end
-        # feats = {feature => normalized probability of feature}
-        # lower_words = {token => log count of occurences in lower case}
-        # non_abbrs = {token => log count of occurences when not an abbrv.}
-        attr_accessor :feats, :lower_words, :non_abbrs
-        # This function is the only one that'll end up being used.
-        # m = TactfulTokenizer::Model.new
-        # m.tokenize_text("Hey, are these two sentences? I bet they should be.")
-        # => ["Hey, are these two sentences?", "I bet they should be."]
-        def tokenize_text(text)
-            data = Doc.new(text)
-            featurize(data)
-            classify(data)
-            return data.segment
-        end
+    # feats = {feature => normalized probability of feature}
+    # lower_words = {token => log count of occurrences in lower case}
+    # non_abbrs = {token => log count of occurrences when not an abbrv.}
+    attr_accessor :feats, :lower_words, :non_abbrs
+    # This function is the only one that'll end up being used.
+    # m = TactfulTokenizer::Model.new
+    # m.tokenize_text("Hey, are these two sentences? I bet they should be.")
+    # => ["Hey, are these two sentences?", "I bet they should be."]
+    def tokenize_text(text)
+      data = Doc.new(text)
+      featurize(data)
+      classify(data)
+      return data.segment
+    end
-        # Assign a prediction (probability, to be precise) to each sentence fragment.
-        # For each feature in each fragment we hunt up the normalized probability and
-        # multiply. This is a fairly straightforward Bayesian probabilistic algorithm.
-        def classify(doc)
-            frag, probs, feat = nil, nil, nil
-            doc.frags.each do |frag|
-                probs = @p0
-                frag.features.each do |feat|
-                    probs *= @feats[feat]
-                end
-                frag.pred = probs / (probs + 1)
-            end
+    # Assign a prediction (probability, to be precise) to each sentence fragment.
+    # For each feature in each fragment we hunt up the normalized probability and
+    # multiply. This is a fairly straightforward Bayesian probabilistic algorithm.
+    def classify(doc)
+      frag, probs, feat = nil, nil, nil
+      doc.frags.each do |frag|
+        probs = @p0
+        frag.features.each do |feat|
+          probs *= @feats[feat]
         end
+        frag.pred = probs / (probs + 1)
+      end
+    end
-        # Get the features of every fragment.
-        def featurize(doc)
-            frag = nil
-            doc.frags.each do |frag|
-                get_features(frag, self)
-            end
-        end
+    # Get the features of every fragment.
+    def featurize(doc)
+      frag = nil
+      doc.frags.each do |frag|
+        get_features(frag, self)
+      end
+    end
-        # Finds the features in a text fragment of the form:
-        # ... w1. (sb?) w2 ...
-        # Features listed in rough order of importance:
-        # * w1: a word that includes a period.
-        # * w2: the next word, if it exists.
-        # * w1length: the number of alphabetic characters in w1.
-        # * both: w1 and w2 taken together.
-        # * w1abbr: logarithmic count of w1 occuring without a period.
-        # * w2lower: logarithmiccount of w2 occuring lowercased.
-        def get_features(frag, model)
-            w1 = (frag.cleaned.last or '')
-            w2 = (frag.next or '')
-            frag.features = ["w1_#{w1}", "w2_#{w2}", "both_#{w1}_#{w2}"]
-            if not w2.empty?
-                if w1.chop.is_alphabetic?
-                    frag.features.push "w1length_#{[10, w1.length].min}", "w1abbr_#{model.non_abbrs[w1.chop]}"
-                end
-                if w2.chop.is_alphabetic?
-                    frag.features.push "w2cap_#{w2[0].is_upper_case?}", "w2lower_#{model.lower_words[w2.downcase]}"
-                end
+    # Finds the features in a text fragment of the form:
+    # ... w1. (sb?) w2 ...
+    # Features listed in rough order of importance:
+    # * w1: a word that includes a period.
+    # * w2: the next word, if it exists.
+    # * w1length: the number of alphabetic characters in w1.
+    # * both: w1 and w2 taken together.
+    # * w1abbr: logarithmic count of w1 occurring without a period.
+    # * w2lower: logarithmic count of w2 occurring lowercased.
+    def get_features(frag, model)
+      w1 = (frag.cleaned.last or '')
+      w2 = (frag.next or '')
+      frag.features = ["w1_#{w1}", "w2_#{w2}", "both_#{w1}_#{w2}"]
+      unless w2.empty?
+        frag.push_w1_features(w1, model)
+        frag.push_w2_features(w2, model)
+      end
+    end
+  end
+  # A document represents the input text. It holds a list of fragments generated
+  # from the text.
+  class Doc
+    # List of fragments.
+    attr_accessor :frags
+    # Receives a text, which is then broken into fragments.
+    # A fragment ends with a period, question mark, or exclamation mark followed
+    # possibly by right handed punctuation like quotation marks or closing braces
+    # and trailing whitespace. Failing that, it'll accept something like "I hate cheese\n"
+    # No, it doesn't have a period, but that's the end of paragraph.
+    #
+    # Input assumption: Paragraphs delimited by line breaks.
+    def initialize(text)
+      @frags = []
+      res = nil
+      text.each_line do |line|
+        unless line.strip.empty?
+          line.split(/(.*?[.!?](?:["')\]}]|(?:<.*>))*[[:space:]])/u).each do |res|
+            unless res.strip.empty?
+              frag = Frag.new(res)
+              @frags.last.next = frag.cleaned.first unless @frags.empty?
+              @frags.push frag
             end
+          end
         end
+      end
     end
-    # A document represents the input text. It holds a list of fragments generated
-    # from the text.
-    class Doc
-        # List of fragments.
-        attr_accessor :frags
-        # Receives a text, which is then broken into fragments.
-        # A fragment ends with a period, quesetion mark, or exclamation mark followed
-        # possibly by right handed punctuation like quotation marks or closing braces
-        # and trailing whitespace. Failing that, it'll accept something like "I hate cheese\n"
-        # No, it doesn't have a period, but that's the end of paragraph.
-        #
-        # Input assumption: Paragraphs delimited by line breaks.
-        def initialize(text)
-            @frags = []
-            res = nil
-            text.each_line do |line|
-                unless line.strip.empty?
-                    line.split(/(.*?[.!?](?:["')\]}]|(?:<.*>))*[\s])/).each do |res|
-                        unless res.strip.empty?
-                            frag = Frag.new(res)
-                            @frags.last.next = frag.cleaned.first unless @frags.empty?
-                            @frags.push frag
-                        end
-                    end
-                end
-            end
+    # Segments the text. More precisely, it reassembles the fragments into sentences.
+    # We call something a sentence whenever it is more likely to be a sentence than not.
+    def segment
+      sents, sent = [], []
+      thresh = 0.5
+      frag = nil
+      @frags.each do |frag|
+        sent.push(frag.orig)
+        if frag.pred && frag.pred > thresh
+          break if frag.orig.nil?
+          sents.push(sent.join('').strip)
+          sent = []
         end
+      end
+      sents
+    end
+  end
+  # A fragment is a potential sentence, but is based only on the existence of a period.
+  # The text "Here in the U.S. Senate we prefer to devour our friends." will be split
+  # into "Here in the U.S." and "Senate we prefer to devour our friends."
+  class Frag
+    # orig = The original text of the fragment.
+    # next = The next word following the fragment.
+    # cleaned = Array of the fragment's words after cleaning.
+    # pred = Probability that the fragment is a sentence.
+    # features = Array of the fragment's features.
+    attr_accessor :orig, :next, :cleaned, :pred, :features
+    # Create a new fragment.
+    def initialize(orig='')
+      @orig = orig
+      clean(orig)
+      @next, @pred, @features = nil, nil, nil
+    end
-        # Segments the text. More precisely, it reassembles the fragments into sentences.
-        # We call something a sentence whenever it is more likely to be a sentence than not.
-        def segment
-            sents, sent = [], []
-            thresh = 0.5
-            frag = nil
-            @frags.each do |frag|
-                sent.push(frag.orig)
-                if frag.pred > thresh
-                    break if frag.orig.nil?
-                    sents.push(sent.join('').strip)
-                    sent = []
-                end
-            end
-            sents
-        end
+    # Normalizes numbers and discards ambiguous punctuation. And then splits into an
+    # array, because realistically only the last and first words are ever accessed.
+    def clean(s)
+      @cleaned = String.new(s)
+      tokenize(@cleaned)
+      @cleaned.gsub!(/[.,\d]*\d/, '<NUM>')
+      @cleaned.gsub!(/[^[[:upper:][:lower:]]\d[:space:],!?.;:<>\-'\/$% ]/u, '')
+      @cleaned.gsub!('--', ' ')
+      @cleaned = @cleaned.split
     end
-    # A fragment is a potential sentence, but is based only on the existence of a period.
-    # The text "Here in the U.S. Senate we prefer to devour our friends." will be split
-    # into "Here in the U.S." and "Senate we prefer to devour our friends."
-    class Frag
-        # orig = The original text of the fragment.
-        # next = The next word following the fragment.
-        # cleaned = Array of the fragment's words after cleaning.
-        # pred = Probability that the fragment is a sentence.
-        # features = Array of the fragment's features.
-        attr_accessor :orig, :next, :cleaned, :pred, :features
-        # Create a new fragment.
-        def initialize(orig='')
-            @orig = orig
-            clean(orig)
-            @next, @pred, @features = nil, nil, nil
-        end
+    def push_w1_features w1, model
+      if w1.chop.is_alphabetic?
+        features.push "w1length_#{[10, w1.length].min}", "w1abbr_#{model.non_abbrs[w1.chop]}"
+      end
+    end
-        # Normalizes numbers and discards ambiguous punctuation. And then splits into an
-        # array, because realistically only the last and first words are ever accessed.
-        def clean(s)
-            @cleaned = String.new(s)
-            tokenize(@cleaned)
-            @cleaned.gsub!(/[.,\d]*\d/, '<NUM>')
-            @cleaned.gsub!(/[^a-zA-Z0-9,.;:<>\-'\/$% ]/, '')
-            @cleaned.gsub!('--', ' ')
-            @cleaned = @cleaned.split
-        end
+    def push_w2_features w2, model
+      if w2.chop.is_alphabetic?
+        features.push "w2cap_#{w2[0,1].is_upper_case?}", "w2lower_#{model.lower_words[w2.downcase]}"
+      end
     end
+  end
 end

data/lib/tactful_tokenizer/version.rb ADDED Viewed

@@ -0,0 +1,3 @@
+module TactfulTokenizer
+  VERSION = "0.0.3"
+end

data/lib/word_tokenizer.rb CHANGED Viewed

@@ -1,51 +1,55 @@
+# -*- encoding : utf-8 -*-
 module WordTokenizer
-    @@tokenize_regexps = [
-        # Uniform Quotes
-        [/''|``/, '"'],
+  @@tokenize_regexps = [
+    # Uniform Quotes
+    [/''|``/, '"'],
-        # Separate punctuation (except for periods) from words.
-        [/(^|\s)(')/, '\1\2'],
-        [/(?=[\("`{\[:;&#*@])(.)/, '\1 '],
+    # Separate punctuation (except for periods) from words.
+    [/(^|[:space:])(')/u, '\1\2'],
+    [/(?=[\("`{\[:;&#*@])(.)/, '\1 '],
-        [/(.)(?=[?!\)";}\]*:@'])|(?=[\)}\]])(.)|(.)(?=[({\[])|((^|\s)-)(?=[^-])/, '\1 '],
+    [/(.)(?=[?!\)";}\]*:@'])|(?=[\)}\]])(.)|(.)(?=[({\[])|((^|[:space:])-)(?=[^-])/u, '\1 '],
-        # Treat double-hyphen as a single token.
-        [/([^-])(--+)([^-])/, '\1 \2 \3'],
-        [/(\s|^)(,)(?=(\S))/, '\1\2 '],
+    # Treat double-hyphen as a single token.
+    [/([^-])(--+)([^-])/, '\1 \2 \3'],
+    [/([:space:]|^)(,)(?=(^[:space:]))/u, '\1\2 '],
-        # Only separate a comma if a space follows.
-        [/(.)(,)(\s|$)/, '\1 \2\3'],
+    # Only separate a comma if a space follows.
+    [/(.)(,)([:space:]|$)/u, '\1 \2\3'],
-        # Combine dots separated by whitespace to be a single token.
-        [/\.\s\.\s\./, '...'],
+    # Combine dots separated by whitespace to be a single token.
+    [/\.[:space:]\.[:space:]\./u, '...'],
-        # Separate "No.6"
-        [/([a-zA-Z]\.)(\d+)/, '\1 \2'],
+    # Separate "No.6"
+    [/(^[:upper]^[:lower:]\.)(\d+)/, '\1 \2'],
-        # Separate words from ellipses
-        [/([^\.]|^)(\.{2,})(.?)/, '\1 \2 \3'],
-        [/(^|\s)(\.{2,})([^\.\s])/, '\1\2 \3'],
-        [/(^|\s)(\.{2,})([^\.\s])/, '\1 \2\3'],
+    # Md. or MD. for Ruby 1.8
+    [/M[d|D]./, '\1'],
-        ##### Some additional fixes.
+    # Separate words from ellipses
+    [/([^\.]|^)(\.{2,})(.?)/, '\1 \2 \3'],
+    [/(^|[:space:])(\.{2,})([^\.[:space:]])/u, '\1\2 \3'],
+    [/(^|[:space:])(\.{2,})([^\.[:space:]])/u, '\1 \2\3'],
-        # Fix %, $, &
-        [/(\d)%/, '\1 %'],
-        [/\$(\.?\d)/, '$ \1'],
-        [/(\w)& (\w)/, '\1&\2'],
-        [/(\w\w+)&(\w\w+)/, '\1 & \2'],
+    ##### Some additional fixes.
-        # Fix (n 't) -> ( n't)
-        [/n 't( |$)/, " n't\\1"],
-        [/N 'T( |$)/, " N'T\\1"],
+    # Fix %, $, &
+    [/(\d)%/, '\1 %'],
+    [/\$(\.?\d)/, '$ \1'],
+    [/(^[:lower:]^[:upper:])& (^[:lower:]^[:upper:])/u, '\1&\2'],
+    [/(^[:lower:]^[:upper:]+)&(^[:lower:]^[:upper:]+)/u, '\1 & \2'],
-        # Treebank tokenizer special words
-        [/([Cc])annot/, '\1an not']
+    # Fix (n 't) -> ( n't)
+    [/n 't( |$)/, " n't\\1"],
+    [/N 'T( |$)/, " N'T\\1"],
-    ];
+    # Treebank tokenizer special words
+    [/([Cc])annot/, '\1an not']
-    def tokenize(s)
-        rules = []
-        @@tokenize_regexps.each {|rules| s.gsub!(rules[0], rules[1])}
-    end
+  ];
+  def tokenize(s)
+    rules = []
+    @@tokenize_regexps.each {|rules| s.gsub!(rules[0], rules[1])}
+  end
 end

data/{test → spec/files}/sample.txt RENAMED Viewed

@@ -96,3 +96,4 @@ The Seattle Mariners on Sunday announced they signed high school catcher Ryan Ch
 Christianson, 18, from Arlington High School in Riverside, Calif., will report to the Mariners' rookie league team in Peoria, Ariz. If he plays well there, he could be elevated to Class A Everett of the Northwest League in ``a couple of weeks,'' said Frank Mattox, the team's director of scouting.
+Добавим немного русского текста, чтобы проверить, верно ли все работает. Еще одно предложение. Работай! Будешь? Нет?

data/{test → spec/files}/test_out.txt RENAMED Viewed

@@ -92,3 +92,8 @@ It was replay of the scene from last year when David Wells pitched the only othe
 The Seattle Mariners on Sunday announced they signed high school catcher Ryan Christianson, their first-round pick and the 11th overall selection in last month's baseball draft.
 Christianson, 18, from Arlington High School in Riverside, Calif., will report to the Mariners' rookie league team in Peoria, Ariz.
 If he plays well there, he could be elevated to Class A Everett of the Northwest League in ``a couple of weeks,'' said Frank Mattox, the team's director of scouting.
+Добавим немного русского текста, чтобы проверить, верно ли все работает.
+Еще одно предложение.
+Работай!
+Будешь?
+Нет?

data/{test → spec/files}/verification_out.txt RENAMED Viewed

@@ -92,3 +92,8 @@ It was replay of the scene from last year when David Wells pitched the only othe
 The Seattle Mariners on Sunday announced they signed high school catcher Ryan Christianson, their first-round pick and the 11th overall selection in last month's baseball draft.
 Christianson, 18, from Arlington High School in Riverside, Calif., will report to the Mariners' rookie league team in Peoria, Ariz.
 If he plays well there, he could be elevated to Class A Everett of the Northwest League in ``a couple of weeks,'' said Frank Mattox, the team's director of scouting.
+Добавим немного русского текста, чтобы проверить, верно ли все работает.
+Еще одно предложение.
+Работай!
+Будешь?
+Нет?

data/spec/spec_helper.rb ADDED Viewed

@@ -0,0 +1,7 @@
+$:.unshift File.expand_path('..', __FILE__)
+$:.unshift File.expand_path('../../lib', __FILE__)
+require 'rspec'
+require 'tactful_tokenizer'
+require 'coveralls'
+Coveralls.wear!

data/spec/tactful_tokenizer/tactful_tokenizer_spec.rb ADDED Viewed

@@ -0,0 +1,96 @@
+# -*- encoding : utf-8 -*-
+require 'spec_helper'
+describe String do
+  describe "::is_upper_case?" do
+    it "should be false" do
+      "asdfghjk".is_upper_case?.should == false
+    end
+    it "should be true" do
+      "ASDFGHJK".is_upper_case?.should == true
+    end
+  end
+  describe "::is_alphabetic?" do
+    it "should be false" do
+      "!^?".is_alphabetic?.should == false
+    end
+    it "should be true" do
+      "some text".is_alphabetic?.should == true
+    end
+    it "should be true for unicode text" do
+      "русский текст öö üüü".is_alphabetic?.should == true
+    end
+  end
+end
+describe TactfulTokenizer::Doc do
+  describe "::segment" do
+    it "should return array of segments" do
+      model = TactfulTokenizer::Model.new
+      doc = TactfulTokenizer::Doc.new("Hello!\nMy name is Richard Stewart.\nHow are you?\n")
+      model.featurize doc
+      model.classify doc
+      doc.segment.should == ["Hello!", "My name is Richard Stewart.", "How are you?"]
+    end
+  end
+end
+describe TactfulTokenizer::Frag do
+  describe "::clean" do
+    before :each do
+      @frag = TactfulTokenizer::Frag.new
+      @cleaned = @frag.clean("1 good bad 23 ?!")
+    end
+    it "should return an instance of Array" do
+      @cleaned.class.should == Array
+    end
+    it "should normalize numbers and discard ambiguous punctuation" do
+      @cleaned.should == ["<NUM>", "good", "bad", "<NUM>", "?", "!"]
+    end
+  end
+end
+describe TactfulTokenizer::Model do
+  before :each do
+    @m = TactfulTokenizer::Model.new
+    File.open('spec/files/sample.txt') do |f|
+      @text = f.read
+    end
+  end
+  describe "::classify" do
+    it "should assign a prediction for frags" do
+      doc = TactfulTokenizer::Doc.new("Hello!\n")
+      @m.featurize(doc)
+      @m.classify(doc).first.pred.should > 0.5
+    end
+  end
+  describe "::featurize" do
+    it "should get the features of every fragment" do
+      doc = TactfulTokenizer::Doc.new("Hello!\n")
+      @m.featurize(doc).first.features.should == ["w1_!", "w2_", "both_!_"]
+    end
+  end
+  describe "::tokenize_text" do
+    it "should tokenize correctly" do
+      text = @m.tokenize_text(@text)
+      File.open("spec/files/test_out.txt", "w+") do |g|
+        text.each do |line|
+          g.puts line unless line.empty?
+        end
+        g.rewind
+        t2 = g.read
+        t1 = File.open("spec/files/verification_out.txt").read
+        t1.should == t2
+      end
+    end
+  end
+end

data/tactful_tokenizer.gemspec CHANGED Viewed

@@ -1,32 +1,25 @@
 # -*- encoding: utf-8 -*-
+$:.push File.expand_path("../lib", __FILE__)
+require "tactful_tokenizer/version"
 Gem::Specification.new do |s|
-  s.name = %q{tactful_tokenizer}
-  s.version = "0.0.2"
+  s.name        = "tactful_tokenizer"
+  s.version     = TactfulTokenizer::VERSION
+  s.platform    = Gem::Platform::RUBY
+  s.authors     = ["Matthew Bunday", "Sergey Kishenin"]
+  s.email       = ["mkbunday@gmail.com"]
+  s.homepage    = "http://github.com/zencephalon/Tactful_Tokenizer"
+  s.summary     = "High accuracy sentence tokenization for Ruby."
+  s.description = "TactfulTokenizer uses a naive bayesian model train on the Brown and WSJ corpuses to provide high quality sentence tokenization."
+  s.license     = "GPL-3"
-  s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
-  s.authors = ["Matthew Bunday"]
-  s.cert_chain = ["/home/slyshy/.ssh/gem-public_cert.pem"]
-  s.date = %q{2010-04-04}
-  s.description = %q{A high accuracy naive bayesian sentence tokenizer based on Splitta.}
-  s.email = %q{mkbunday @nospam@ gmail.com}
-  s.extra_rdoc_files = ["README.rdoc", "lib/models/features.mar", "lib/models/lower_words.mar", "lib/models/non_abbrs.mar", "lib/tactful_tokenizer.rb", "lib/word_tokenizer.rb"]
-  s.files = ["Manifest", "README.rdoc", "Rakefile", "lib/models/features.mar", "lib/models/lower_words.mar", "lib/models/non_abbrs.mar", "lib/tactful_tokenizer.rb", "lib/word_tokenizer.rb", "test/sample.txt", "test/test.rb", "test/test_out.txt", "test/verification_out.txt", "tactful_tokenizer.gemspec"]
-  s.homepage = %q{http://github.com/SlyShy/Tactful_Tokenizer}
-  s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Tactful_tokenizer", "--main", "README.rdoc"]
-  s.require_paths = ["lib"]
-  s.rubyforge_project = %q{tactful_tokenizer}
-  s.rubygems_version = %q{1.3.6}
-  s.signing_key = %q{/home/slyshy/.ssh/gem-private_key.pem}
-  s.summary = %q{A high accuracy naive bayesian sentence tokenizer based on Splitta.}
+  s.rubyforge_project = "tactful_tokenizer"
-  if s.respond_to? :specification_version then
-    current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
-    s.specification_version = 3
+  s.files         = `git ls-files`.split($\)
+  s.executables   = s.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
+  s.test_files    = s.files.grep(%r{^(test|spec|features)/})
+  s.require_paths = ["lib"]
-    if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
-    else
-    end
-  else
-  end
+  s.add_development_dependency "rspec", "~> 0"
+  s.add_development_dependency "rake", "~> 0"
 end

metadata CHANGED Viewed

@@ -1,106 +1,96 @@
---- !ruby/object:Gem::Specification
+--- !ruby/object:Gem::Specification
 name: tactful_tokenizer
-version: !ruby/object:Gem::Version
-  prerelease: false
-  segments:
-  - 0
-  - 0
-  - 2
-  version: 0.0.2
+version: !ruby/object:Gem::Version
+  version: 0.0.3
 platform: ruby
-authors:
+authors:
 - Matthew Bunday
+- Sergey Kishenin
 autorequire:
 bindir: bin
-cert_chain:
-- |
-  -----BEGIN CERTIFICATE-----
-  MIIDMjCCAhqgAwIBAgIBADANBgkqhkiG9w0BAQUFADA/MREwDwYDVQQDDAhta2J1
-  bmRheTEVMBMGCgmSJomT8ixkARkWBWdtYWlsMRMwEQYKCZImiZPyLGQBGRYDY29t
-  MB4XDTEwMDMyMzE2MDkzOVoXDTExMDMyMzE2MDkzOVowPzERMA8GA1UEAwwIbWti
-  dW5kYXkxFTATBgoJkiaJk/IsZAEZFgVnbWFpbDETMBEGCgmSJomT8ixkARkWA2Nv
-  bTCCASIwDQYJKoZIhvcNAQEBBQADggEPADCCAQoCggEBAMk5+Wsur5ptIGUthPBG
-  VHECPqlV7TRgxiEMbH8vxkMVNnqFGDTezd9zsmqfX9kKR4/Jmu1fXKyBswGRxYxD
-  qx8nR+DCnWk0gfx2jjpnknPPWTQ6lHiZaPrGb+QuANhebPTwI6cDIz4A3dg2QIRo
-  ETdiAdOspNudUHu2Jf/QeNQPr5SURy9vGnSXkDhMcrnR3EjkRAP4suNIlHBNj3Hz
-  7hYjZV5QzeFwVENR5K3zFSkbC3ZK6uZTUwPVngmCqWz3MLsNJiQhAhvn/XQ8OCJ3
-  Q8O/nPuIIqFNeT3TMvnfrbx+wyxX6FIBZ12M4lNmU6yoXxzmi/n/cBNLAkQ/hc2g
-  n68CAwEAAaM5MDcwCQYDVR0TBAIwADAdBgNVHQ4EFgQUZfQL/a3SzQ017Zj9MUwh
-  Y6BtLUgwCwYDVR0PBAQDAgSwMA0GCSqGSIb3DQEBBQUAA4IBAQAjdEGkZbV7tkOq
-  N0y3yL5n1JOMsVHsQF7/w2zeET3PyUgKmmobdq3V0rztqVcJ1oP/+fYUO1KYxC90
-  b8FOCGGvcKjMn1QJufFp1DTfiGFcz6nHRWmiAMRXbempzA5NDzocQP9jaRkoYEzK
-  pwsJwe0dlpJXs8/fqqljNdBe4AToDGLcbzdMmpGxZN63P70yAFL5G7sJy1Izp5ei
-  CvIRDtL1PdU1ESVLFJuoCAiCtpBfwwepv4kuuoca9Ykd5ldPCGzMq0n8+KIubb+2
-  xz7fp33atnZoMajdCOYKqwo2xVhUuFPZzBFZ3L6T6YLuEVGKHNyUAfcfr+8VSuB5
-  3+l7cSZt
-  -----END CERTIFICATE-----
-date: 2010-04-04 00:00:00 -05:00
-default_executable:
-dependencies: []
-description: A high accuracy naive bayesian sentence tokenizer based on Splitta.
-email: mkbunday @nospam@ gmail.com
+cert_chain: []
+date: 2014-04-25 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: rspec
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: rake
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0'
+description: TactfulTokenizer uses a naive bayesian model train on the Brown and WSJ
+  corpuses to provide high quality sentence tokenization.
+email:
+- mkbunday@gmail.com
 executables: []
 extensions: []
-extra_rdoc_files:
-- README.rdoc
-- lib/models/features.mar
-- lib/models/lower_words.mar
-- lib/models/non_abbrs.mar
-- lib/tactful_tokenizer.rb
-- lib/word_tokenizer.rb
-files:
-- Manifest
+extra_rdoc_files: []
+files:
+- ".gitignore"
+- ".travis.yml"
+- Gemfile
 - README.rdoc
 - Rakefile
 - lib/models/features.mar
 - lib/models/lower_words.mar
 - lib/models/non_abbrs.mar
 - lib/tactful_tokenizer.rb
+- lib/tactful_tokenizer/version.rb
 - lib/word_tokenizer.rb
-- test/sample.txt
-- test/test.rb
-- test/test_out.txt
-- test/verification_out.txt
+- spec/files/sample.txt
+- spec/files/test_out.txt
+- spec/files/verification_out.txt
+- spec/spec_helper.rb
+- spec/tactful_tokenizer/tactful_tokenizer_spec.rb
 - tactful_tokenizer.gemspec
-has_rdoc: true
-homepage: http://github.com/SlyShy/Tactful_Tokenizer
-licenses: []
+homepage: http://github.com/zencephalon/Tactful_Tokenizer
+licenses:
+- GPL-3
+metadata: {}
 post_install_message:
-rdoc_options:
-- --line-numbers
-- --inline-source
-- --title
-- Tactful_tokenizer
-- --main
-- README.rdoc
-require_paths:
+rdoc_options: []
+require_paths:
 - lib
-required_ruby_version: !ruby/object:Gem::Requirement
-  requirements:
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
   - - ">="
-    - !ruby/object:Gem::Version
-      segments:
-      - 0
-      version: "0"
-required_rubygems_version: !ruby/object:Gem::Requirement
-  requirements:
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
   - - ">="
-    - !ruby/object:Gem::Version
-      segments:
-      - 1
-      - 2
-      version: "1.2"
+    - !ruby/object:Gem::Version
+      version: '0'
 requirements: []
 rubyforge_project: tactful_tokenizer
-rubygems_version: 1.3.6
+rubygems_version: 2.2.2
 signing_key:
-specification_version: 3
-summary: A high accuracy naive bayesian sentence tokenizer based on Splitta.
-test_files: []
+specification_version: 4
+summary: High accuracy sentence tokenization for Ruby.
+test_files:
+- spec/files/sample.txt
+- spec/files/test_out.txt
+- spec/files/verification_out.txt
+- spec/spec_helper.rb
+- spec/tactful_tokenizer/tactful_tokenizer_spec.rb

data.tar.gz.sig DELETED Viewed

Binary file

data/Manifest DELETED Viewed

@@ -1,12 +0,0 @@
-Manifest
-README.rdoc
-Rakefile
-lib/models/features.mar
-lib/models/lower_words.mar
-lib/models/non_abbrs.mar
-lib/tactful_tokenizer.rb
-lib/word_tokenizer.rb
-test/sample.txt
-test/test.rb
-test/test_out.txt
-test/verification_out.txt

data/test/test.rb DELETED Viewed

@@ -1,21 +0,0 @@
-require '../lib/tactful_tokenizer'
-require 'test/unit'
-class TactfulTokenize < Test::Unit::TestCase
-    def test_simple
-        m = TactfulTokenizer::Model.new
-        File.open("sample.txt") do |f|
-            text = f.read
-            text = m.tokenize_text(text)
-            File.open("test_out.txt","w+") do |g|
-                text.each do |line|
-                    g.puts line unless line.empty?
-                end
-                g.rewind
-                t2 = g.read
-                t1 = File.open("verification_out.txt").read
-                assert_equal(t1, t2)
-            end
-        end
-    end
-end

metadata.gz.sig DELETED Viewed

Binary file