RubyGems - punkt-segmenter - Versions diffs - 0.9.0 - Mend

punkt-segmenter 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (21) hide show

data/LICENSE.txt +13 -0
data/README.md +79 -0
data/Rakefile +16 -0
data/lib/punkt-segmenter.rb +13 -0
data/lib/punkt-segmenter/frequency_distribution.rb +121 -0
data/lib/punkt-segmenter/punkt.rb +51 -0
data/lib/punkt-segmenter/punkt/base.rb +65 -0
data/lib/punkt-segmenter/punkt/language_vars.rb +34 -0
data/lib/punkt-segmenter/punkt/parameters.rb +37 -0
data/lib/punkt-segmenter/punkt/sentence_tokenizer.rb +180 -0
data/lib/punkt-segmenter/punkt/token.rb +81 -0
data/lib/punkt-segmenter/punkt/trainer.rb +304 -0
data/punkt-segmenter.gemspec +17 -0
data/script/console +7 -0
data/test/punkt-segmenter/frequency_distribution_test.rb +118 -0
data/test/punkt-segmenter/punkt/language_vars_test.rb +21 -0
data/test/punkt-segmenter/punkt/token_test.rb +121 -0
data/test/punkt-segmenter/punkt/trainer_test.rb +32 -0
data/test/punkt-segmenter/punkt_test.rb +67 -0
data/test/test_helper.rb +16 -0
metadata +129 -0

data/lib/punkt-segmenter/punkt/token.rb ADDED

@@ -0,0 +1,81 @@
+module Punkt
+  class Token
+    attr_accessor :token, :type, :period_final
+    attr_accessor :paragraph_start, :line_start
+    attr_accessor :sentence_break, :abbr, :ellipsis
+    def initialize(token, options = {})
+      valid_options = [:paragraph_start, :line_start, :sentence_break, :abbr, :ellipsis]
+      @token        = token
+      @type         = UnicodeUtils.downcase(token).gsub(/^-?[\.,]?\d[\d,\.-]*\.?$/, '##number##') # numeric
+      @period_final = token.end_with?('.')
+      valid_options.each do |item|
+        self.instance_variable_set(("@"+item.to_s).to_sym, nil)
+      end
+      options.each do |key, value|
+        self.instance_variable_set(("@"+key.to_s).to_sym, value) if valid_options.include?(key)
+      end
+    end
+    def type_without_period
+      @type.size > 1 && @type.end_with?('.') ? @type.chop : @type
+    end
+    def type_without_sentence_period
+      @sentence_break ? type_without_period : @type
+    end
+    def first_upper?
+      UnicodeUtils.uppercase_char?(@token[0])
+    end
+    def first_lower?
+      UnicodeUtils.lowercase_char?(@token[0])
+    end
+    def first_case
+      return :lower if first_lower?
+      return :upper if first_upper?
+      return :none
+    end
+    def ends_with_period?
+      @period_final
+    end
+    def is_ellipsis?
+      !(@token =~ /^\.\.+$/).nil?
+    end
+    def is_number?
+      @type.start_with?("##number##")
+    end
+    def is_initial?
+      !(@token =~ /^[^\W\d]\.$/).nil?
+    end
+    def is_alpha?
+      !(@token =~ /^[^\W\d]+$/).nil?
+    end
+    def is_non_punctuation?
+      !(@type =~ /[^\W\d]/).nil?
+    end
+    def to_s
+      result = @token
+      result += '<A>' if @abbr
+      result += '<E>' if @ellipsis
+      result += '<S>' if @sentence_break
+      result
+    end
+    def inspect
+      "<#{to_s}>"
+    end
+  end
+end

data/lib/punkt-segmenter/punkt/trainer.rb ADDED

@@ -0,0 +1,304 @@
+require "punkt-segmenter/frequency_distribution"
+module Punkt
+  class Trainer < Base
+    # cut-off score at or above which a 'token' is treated as an abbreviation
+    ABBREV = 0.3
+    # allows the disabling of the abbreviation penalty heuristic, which
+    # exponentially disadvantages words that are found at times without a
+    # final period.
+    IGNORE_ABBREV_PENALTY = false
+    # upper cut-off for Mikheev's (2002) abbreviation detection algorithm
+    ABBREV_BACKOFF = 5
+    # minimal log-likelihood value that two tokens need to be considered
+    # as a collocation
+    COLLOCATION = 7.88
+    # minimal log-likelihood value that a token requires to be considered
+    # as a frequent sentence starter
+    SENT_STARTER = 30
+    # this includes as potential collocations all word pairs where the first
+    # word ends in a period. It may be useful in corpora where there is a lot
+    # of variation that makes abbreviations like Mr difficult to identify.
+    INCLUDE_ALL_COLLOCS = true # TODO: set to false
+    # this includes as potential collocations all word pairs where the first
+    # word is an abbreviation. Such collocations override the orthographic
+    # heuristic, but not the sentence starter heuristic. This is overridden by
+    # INCLUDE_ALL_COLLOCS, and if both are false, only collocations with initials
+    # and ordinals are considered.
+    INCLUDE_ABBREV_COLLOCS = false
+    # this sets a minimum bound on the number of times a bigram needs to
+    # appear before it can be considered a collocation, in addition to log
+    # likelihood statistics. This is useful when INCLUDE_ALL_COLLOCS is true.
+    MIN_COLLOC_FREQ = 1
+    def initialize(language_vars = Punkt::LanguageVars.new,
+                   token_class   = Punkt::Token)
+      super(language_vars, token_class)
+      @type_fdist             = Probability::FrequencyDistribution.new
+      @collocation_fdist      = Probability::FrequencyDistribution.new
+      @sentence_starter_fdist = Probability::FrequencyDistribution.new
+      @period_tokens_count    = 0
+      @sentence_break_count   = 0
+      @finalized              = false
+    end
+    def train(text_or_tokens)
+      if text_or_tokens.kind_of?(String)
+        tokens = tokenize_words(text_or_tokens)
+      elsif text_or_tokens.kind_of?(Array)
+        tokens = text_or_tokens.map { |t| @token_class.new(t) }
+      end
+      train_tokens(tokens)
+    end
+    def parameters
+      finalize_training unless @finalized
+      return @parameters
+    end
+    def finalize_training
+      @parameters.clear_sentence_starters
+      find_sentence_starters do |type, ll|
+        @parameters.sentence_starters << type
+      end
+      @parameters.clear_collocations
+      find_collocations do |types, ll|
+        @parameters.collocations << [types[0], types[1]]
+      end
+      @finalized = true
+    end
+  private
+    def train_tokens(tokens)
+      tokens.each do |token|
+        @type_fdist << token.type
+        @period_tokens_count += 1 if token.ends_with_period?
+      end
+      unique_types = Set.new(tokens.map { |t| t.type })
+      reclassify_abbreviation_types(unique_types) do |abbr, score, is_add|
+        if score >= ABBREV
+          @parameters.abbreviation_types << abbr if is_add
+        else
+          @parameters.abbreviation_types.delete(abbr) unless is_add
+        end
+      end
+      tokens = annotate_first_pass(tokens)
+      get_orthography_data(tokens)
+      tokens.each { |token| @sentence_break_count += 1 if token.sentence_break }
+      pair_each(tokens) do |tok1, tok2|
+        next if !tok1.ends_with_period? || !tok2
+        if is_rare_abbreviation_type?(tok1, tok2)
+          @parameters.abbreviation_types << tok1.type_without_period
+        end
+        if is_potential_sentence_starter?(tok2, tok1)
+          @sentence_starter_fdist << tok2.type
+        end
+        if is_potential_collocation?(tok1, tok2)
+          @collocation_fdist << [tok1.type_without_period, tok2.type_without_sentence_period]
+        end
+      end
+    end
+    def reclassify_abbreviation_types(types, &block)
+      types.each do |type|
+        # if there is punctuation or is a number, continue. This will be processed later
+        next if (type =~ /[^\W\d]/).nil? || type == "##number##"
+        if type.end_with?(".")
+          next if @parameters.abbreviation_types.include?(type)
+          type = type.chop
+          is_add = true
+        else
+          next unless @parameters.abbreviation_types.include?(type)
+          is_add = false
+        end
+        periods_count = type.count(".") + 1
+        non_periods_count = type.size - periods_count + 1
+        with_periods_count     = @type_fdist[type + "."]
+        without_periods_count  = @type_fdist[type]
+        ll = dunning_log_likelihood(with_periods_count + without_periods_count,
+                                    @period_tokens_count,
+                                    with_periods_count,
+                                    @type_fdist.N)
+        f_length  = Math.exp(-non_periods_count)
+        f_periods = periods_count
+        f_penalty = IGNORE_ABBREV_PENALTY ? 0 : non_periods_count**(-without_periods_count).to_f
+        score = ll * f_length * f_periods * f_penalty
+        yield(type, score, is_add)
+      end
+    end
+    def dunning_log_likelihood(count_a, count_b, count_ab, n)
+      p1 = count_b.to_f / n
+      p2 = 0.99
+      null_hypo = (count_ab.to_f * Math.log(p1) +
+                   (count_a - count_ab) * Math.log(1.0 - p1))
+      alt_hypo  = (count_ab.to_f * Math.log(p2) +
+                   (count_a - count_ab) * Math.log(1.0 - p2))
+      likelihood = null_hypo - alt_hypo
+      return (-2.0 * likelihood)
+    end
+    def get_orthography_data(tokens)
+      context = :internal
+      tokens.each do |aug_token|
+        context = :initial if aug_token.paragraph_start && context != :unknown
+        context = :unknown if aug_token.line_start && context == :internal
+        type = aug_token.type_without_sentence_period
+        flag = Punkt::ORTHO_MAP[[context, aug_token.first_case]] || 0
+        @parameters.add_orthographic_context(type, flag) if flag
+        if aug_token.sentence_break
+          context = !(aug_token.is_number? || aug_token.is_initial?) ? :initial : :unknown
+        elsif aug_token.ellipsis || aug_token.abbr
+          context = :unknown
+        else
+          context = :internal
+        end
+      end
+    end
+    def is_rare_abbreviation_type?(current_token, next_token)
+      return false if current_token.abbr || !current_token.sentence_break
+      type = current_token.type_without_sentence_period
+      count = @type_fdist[type] + @type_fdist[type.chop]
+      return false if (@parameters.abbreviation_types.include?(type) || count >= ABBREV_BACKOFF)
+      if @language_vars.internal_punctuation.include?(next_token.token[0])
+        return true
+      elsif next_token.first_lower?
+        type2 = next_token.type_without_sentence_period
+        type2_orthographic_context = @parameters.orthographic_context[type2]
+        return true if (type2_orthographic_context & Punkt::ORTHO_BEG_UC != 0) && (type2_orthographic_context & Punkt::ORTHO_MID_UC != 0)
+      end
+    end
+    def is_potential_sentence_starter?(current_token, previous_token)
+      return (previous_token.sentence_break &&
+              !(previous_token.is_number? || previous_token.is_initial?) &&
+              current_token.is_alpha?)
+    end
+    def is_potential_collocation?(tok1, tok2)
+      return ((INCLUDE_ALL_COLLOCS ||
+                  (INCLUDE_ABBREV_COLLOCS && tok1.abbr) ||
+                  (tok1.sentence_break &&
+                    (tok1.is_number? || tok2.is_initial?))) &&
+                tok1.is_non_punctuation? &&
+                tok2.is_non_punctuation?)
+    end
+    def find_sentence_starters(&block)
+      @sentence_starter_fdist.each do |type, type_at_break_count|
+        next if !type
+        type_count = @type_fdist[type] + @type_fdist[type + "."]
+        next if type_count < type_at_break_count
+        ll = col_log_likelihood(@sentence_break_count,
+                                type_count,
+                                type_at_break_count,
+                                @type_fdist.N)
+        if (ll >= SENT_STARTER &&
+           @type_fdist.N.to_f/@sentence_break_count > type_count.to_f/type_at_break_count)
+          yield(type, ll)
+        end
+      end
+    end
+    def col_log_likelihood(count_a, count_b, count_ab, n)
+      p = 1.0 * count_b / n
+      p1 = 1.0 * count_ab / count_a
+      p2 = 1.0 * (count_b - count_ab) / (n - count_a)
+      summand1 = (count_ab * Math.log(p) +
+                  (count_a - count_ab) * Math.log(1.0 - p))
+      summand2 = ((count_b - count_ab) * Math.log(p) +
+                  (n - count_a - count_b + count_ab) * Math.log(1.0 - p))
+      if count_a == count_ab
+          summand3 = 0
+      else
+          summand3 = (count_ab * Math.log(p1) +
+                      (count_a - count_ab) * Math.log(1.0 - p1))
+      end
+      if count_b == count_ab
+          summand4 = 0
+      else
+          summand4 = ((count_b - count_ab) * Math.log(p2) +
+                      (n - count_a - count_b + count_ab) * Math.log(1.0 - p2))
+      end
+      likelihood = summand1 + summand2 - summand3 - summand4
+      return (-2.0 * likelihood)
+    end
+    def find_collocations(&block)
+      @collocation_fdist.each do |types, col_count|
+        type1, type2 = types
+        next if type1.nil? || type2.nil?
+        next if @parameters.sentence_starters.include?(type2)
+        type1_count = @type_fdist[type1] + @type_fdist[type1 + "."]
+        type2_count = @type_fdist[type2] + @type_fdist[type2 + "."]
+        if (type1_count > 1 && type2_count > 1 &&
+            MIN_COLLOC_FREQ < col_count &&
+            col_count <= [type1_count, type2_count].min)
+          ll = col_log_likelihood(type1_count, type2_count,
+                                  col_count, @type_fdist.N)
+          if (ll >= COLLOCATION &&
+              @type_fdist.N.to_f/type1_count > type2_count.to_f/col_count)
+            yield([type1, type2], ll)
+          end
+        end
+      end
+    end
+  end
+end

data/punkt-segmenter.gemspec ADDED

@@ -0,0 +1,17 @@
+Gem::Specification.new do |s|
+  s.name          = "punkt-segmenter"
+  s.version       = "0.9.0"
+  s.platform      = Gem::Platform::RUBY
+  s.summary       = "Ruby port of the NLTK Punkt sentence segmentation algorithm"
+  s.require_paths = ['lib']
+  s.files         = Dir["{lib/**/*.rb,README.md,LICENSE.txt,test/**/*.rb,Rakefile,*.gemspec,script/*}"]
+  s.author        = "Luis Cipriani"
+  s.email         = "lfcipriani@talleye.com"
+  s.homepage      = "http://github.com/lfcipriani/punkt-segmenter"
+  s.add_dependency('unicode_utils', '>= 1.0.0')
+  s.add_development_dependency('cover_me')
+  s.add_development_dependency('ruby-debug19')
+end

data/script/console ADDED

@@ -0,0 +1,7 @@
+#!/usr/bin/env ruby
+# File: script/console
+irb = RUBY_PLATFORM =~ /(:?mswin|mingw)/ ? 'irb.bat' : 'irb'
+libs = " -r #{File.dirname(__FILE__) + '/../lib/punkt-segmenter.rb'}"
+puts "Loading punkt-segmenter env"
+exec "#{irb} #{libs} --simple-prompt #{ARGV.join("")}"

data/test/punkt-segmenter/frequency_distribution_test.rb ADDED

@@ -0,0 +1,118 @@
+require File.expand_path(File.dirname(__FILE__) + '/../test_helper')
+class Probability::FrequencyDistributionTest < Test::Unit::TestCase
+  def setup
+    @words = %w(two one three one one three two one two)
+    @freq_dist = Probability::FrequencyDistribution.new
+  end
+  def test_increment_count_on_given_sample
+    @words.each { |word| @freq_dist << word }
+    assert_equal @freq_dist["one"]  , 4
+    assert_equal @freq_dist["two"]  , 3
+    assert_equal @freq_dist["three"], 2
+    assert_equal @freq_dist.N       , 9
+  end
+  def test_increment_count_on_given_sample_for_count_different_than_1
+    @words.each { |word| @freq_dist.inc(word, 2) }
+    assert_equal @freq_dist["one"]  , 8
+    assert_equal @freq_dist["two"]  , 6
+    assert_equal @freq_dist["three"], 4
+    assert_equal @freq_dist.N       , 18
+  end
+  def test_direct_count_attribution
+    @freq_dist["one"] = 10
+    @freq_dist["two"] = 20
+    @freq_dist["three"] = 30
+    assert_equal @freq_dist["one"]  , 10
+    assert_equal @freq_dist["two"]  , 20
+    assert_equal @freq_dist["three"], 30
+    assert_equal @freq_dist.N       , 60
+  end
+  def test_get_sample_frequencies
+    @words.each { |word| @freq_dist << word }
+    assert_equal((@freq_dist.frequency_of("one") +
+                 @freq_dist.frequency_of("two") +
+                 @freq_dist.frequency_of("three")).round, 1)
+  end
+  def test_get_sample_with_maximum_ocurrences
+    @words.each { |word| @freq_dist << word }
+    assert_equal(@freq_dist.max, "one")
+  end
+  def test_merge_frequency_distribution
+    @words.each { |word| @freq_dist << word }
+    @new_freq = @freq_dist.merge(@freq_dist)
+    assert_equal @new_freq["one"]  , 8
+    assert_equal @new_freq["two"]  , 6
+    assert_equal @new_freq["three"], 4
+    assert_equal @new_freq.N       , 18
+    assert_equal @freq_dist.merge!(@new_freq).N, 27
+  end
+  def test_get_keys_ordered_by_frequency_desc
+    @words.each { |word| @freq_dist << word }
+    assert_equal @freq_dist.keys.first, "one"
+    assert_equal @freq_dist.keys[1]   , "two"
+    assert_equal @freq_dist.keys.last , "three"
+ end
+  def test_get_values_ordered_by_frequency_desc
+    @words.each { |word| @freq_dist << word }
+    assert_equal @freq_dist.values.first, 4
+    assert_equal @freq_dist.values[1]   , 3
+    assert_equal @freq_dist.values.last , 2
+  end
+  def test_iterators_must_order_by_frequency_desc
+    @words.each { |word| @freq_dist << word }
+    ordered = []
+    @freq_dist.each do |sample, value|
+      ordered << [sample, value]
+    end
+    assert_equal ordered, @freq_dist.items
+    ordered = []
+    @freq_dist.each_key do |keys|
+      ordered << keys
+    end
+    assert_equal ordered, @freq_dist.keys
+    ordered = []
+    @freq_dist.each_value do |value|
+      ordered << value
+    end
+    assert_equal ordered, @freq_dist.values
+  end
+  def test_removing_samples
+    @words.each { |word| @freq_dist << word }
+    assert_equal @freq_dist.delete("one"), 4
+    assert_equal @freq_dist.N            , 5
+    assert_raise(RuntimeError) { @freq_dist.delete_if { |sample, value| value == 2 } }
+  end
+  def test_features_with_empty_distribution
+    assert_equal @freq_dist["a sample"]             , 0
+    assert_equal @freq_dist.N                       , 0
+    assert_equal @freq_dist.frequency_of("a sample"), 0
+    assert_equal @freq_dist.max                     , nil
+  end
+end