punkt-segmenter 0.9.0 → 0.9.1

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the contents of those package versions as they appear in their respective public registries.
data/README.md CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  This code is a ruby 1.9.x port of the Punkt sentence tokenizer algorithm implemented by the NLTK Project ([http://www.nltk.org/]). Punkt is a **language-independent**, unsupervised approach to **sentence boundary detection**. It is based on the assumption that a large number of ambiguities in the determination of sentence boundaries can be eliminated once abbreviations have been identified.
4
4
 
5
- The description of the algorithm is presented in the following academic paper:
5
+ The full description of the algorithm is presented in the following academic paper:
6
6
 
7
7
  > Kiss, Tibor and Strunk, Jan (2006): Unsupervised Multilingual Sentence Boundary Detection.
8
8
  > Computational Linguistics 32: 485-525.
@@ -28,21 +28,18 @@ module Probability
28
28
  end
29
29
 
30
30
  def keys
31
- result = @cache[:ordered_by_frequency_desc] || order_by_frequency_desc
32
- result.map { |item| item[0] }
31
+ items.map { |item| item[0] }
33
32
  end
34
33
 
35
34
  def values
36
- result = @cache[:ordered_by_frequency_desc] || order_by_frequency_desc
37
- result.map { |item| item[1] }
35
+ items.map { |item| item[1] }
38
36
  end
39
37
 
40
38
  def items
41
- @cache[:ordered_by_frequency_desc] || order_by_frequency_desc
39
+ @cache[:ordered_by_frequency_desc] ||= self.to_a.sort {|x,y| y[1] <=> x[1] }
42
40
  end
43
41
 
44
42
  def each(&block)
45
- items = @cache[:ordered_by_frequency_desc] || order_by_frequency_desc
46
43
  items.each { |item| yield(item[0], item[1]) }
47
44
  end
48
45
 
@@ -111,11 +108,5 @@ module Probability
111
108
  self
112
109
  end
113
110
 
114
- private
115
-
116
- def order_by_frequency_desc
117
- @cache[:ordered_by_frequency_desc] = self.to_a.sort {|x,y| y[1] <=> x[1] }
118
- end
119
-
120
111
  end
121
112
  end
@@ -39,7 +39,7 @@ module Punkt
39
39
  if @language_vars.sent_end_chars.include?(tok)
40
40
  aug_token.sentence_break = true
41
41
  elsif aug_token.is_ellipsis?
42
- aug_token.is_ellipsis = true
42
+ aug_token.ellipsis = true
43
43
  elsif aug_token.ends_with_period? && !tok.end_with?("..")
44
44
  tok_low = UnicodeUtils.downcase(tok.chop)
45
45
  if @parameters.abbreviation_types.include?(tok_low) || @parameters.abbreviation_types.include?(tok_low.split("-")[-1])
@@ -1,6 +1,6 @@
1
1
  Gem::Specification.new do |s|
2
2
  s.name = "punkt-segmenter"
3
- s.version = "0.9.0"
3
+ s.version = "0.9.1"
4
4
  s.platform = Gem::Platform::RUBY
5
5
  s.summary = "Ruby port of the NLTK Punkt sentence segmentation algorithm"
6
6
  s.require_paths = ['lib']
@@ -8,7 +8,7 @@ Gem::Specification.new do |s|
8
8
 
9
9
  s.author = "Luis Cipriani"
10
10
  s.email = "lfcipriani@talleye.com"
11
- s.homepage = "http://github.com/lfcipriani/punkt-segmenter"
11
+ s.homepage = "http://blog.talleye.com"
12
12
 
13
13
  s.add_dependency('unicode_utils', '>= 1.0.0')
14
14
 
@@ -117,5 +117,11 @@ class PunktTokenTest < Test::Unit::TestCase
117
117
  assert !token.is_non_punctuation?
118
118
  end
119
119
 
120
+ def test_to_s_and_inspect
121
+ token = Punkt::Token.new("foo", :abbr => true, :sentence_break => true, :ellipsis => true)
122
+
123
+ assert_equal "<foo<A><E><S>>", token.inspect
124
+ end
125
+
120
126
  end
121
127
 
@@ -13,4 +13,4 @@ require 'test/unit'
13
13
  require 'rubygems'
14
14
  require 'ruby-debug'
15
15
 
16
- require 'punkt-segmenter'
16
+ require File.expand_path(File.dirname(__FILE__) + '/../lib/punkt-segmenter')
metadata CHANGED
@@ -1,13 +1,12 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: punkt-segmenter
3
3
  version: !ruby/object:Gem::Version
4
- hash: 59
5
4
  prerelease: false
6
5
  segments:
7
6
  - 0
8
7
  - 9
9
- - 0
10
- version: 0.9.0
8
+ - 1
9
+ version: 0.9.1
11
10
  platform: ruby
12
11
  authors:
13
12
  - Luis Cipriani
@@ -15,7 +14,7 @@ autorequire:
15
14
  bindir: bin
16
15
  cert_chain: []
17
16
 
18
- date: 2010-08-17 00:00:00 -03:00
17
+ date: 2010-08-26 00:00:00 -03:00
19
18
  default_executable:
20
19
  dependencies:
21
20
  - !ruby/object:Gem::Dependency
@@ -26,7 +25,6 @@ dependencies:
26
25
  requirements:
27
26
  - - ">="
28
27
  - !ruby/object:Gem::Version
29
- hash: 23
30
28
  segments:
31
29
  - 1
32
30
  - 0
@@ -42,7 +40,6 @@ dependencies:
42
40
  requirements:
43
41
  - - ">="
44
42
  - !ruby/object:Gem::Version
45
- hash: 3
46
43
  segments:
47
44
  - 0
48
45
  version: "0"
@@ -56,7 +53,6 @@ dependencies:
56
53
  requirements:
57
54
  - - ">="
58
55
  - !ruby/object:Gem::Version
59
- hash: 3
60
56
  segments:
61
57
  - 0
62
58
  version: "0"
@@ -92,7 +88,7 @@ files:
92
88
  - punkt-segmenter.gemspec
93
89
  - script/console
94
90
  has_rdoc: true
95
- homepage: http://github.com/lfcipriani/punkt-segmenter
91
+ homepage: http://blog.talleye.com
96
92
  licenses: []
97
93
 
98
94
  post_install_message:
@@ -105,7 +101,6 @@ required_ruby_version: !ruby/object:Gem::Requirement
105
101
  requirements:
106
102
  - - ">="
107
103
  - !ruby/object:Gem::Version
108
- hash: 3
109
104
  segments:
110
105
  - 0
111
106
  version: "0"
@@ -114,7 +109,6 @@ required_rubygems_version: !ruby/object:Gem::Requirement
114
109
  requirements:
115
110
  - - ">="
116
111
  - !ruby/object:Gem::Version
117
- hash: 3
118
112
  segments:
119
113
  - 0
120
114
  version: "0"