punkt-segmenter 0.9.0 → 0.9.1

Sign up to get free protection for your applications and to get access to all the features.
data/README.md CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  This code is a Ruby 1.9.x port of the Punkt sentence tokenizer algorithm implemented by the NLTK Project (<http://www.nltk.org/>). Punkt is a **language-independent**, unsupervised approach to **sentence boundary detection**. It is based on the assumption that a large number of ambiguities in the determination of sentence boundaries can be eliminated once abbreviations have been identified.
4
4
 
5
- The description of the algorithm is presented in the following academic paper:
5
+ The full description of the algorithm is presented in the following academic paper:
6
6
 
7
7
  > Kiss, Tibor and Strunk, Jan (2006): Unsupervised Multilingual Sentence Boundary Detection.
8
8
  > Computational Linguistics 32: 485-525.
@@ -28,21 +28,18 @@ module Probability
28
28
  end
29
29
 
30
30
  def keys
31
- result = @cache[:ordered_by_frequency_desc] || order_by_frequency_desc
32
- result.map { |item| item[0] }
31
+ items.map { |item| item[0] }
33
32
  end
34
33
 
35
34
  def values
36
- result = @cache[:ordered_by_frequency_desc] || order_by_frequency_desc
37
- result.map { |item| item[1] }
35
+ items.map { |item| item[1] }
38
36
  end
39
37
 
40
38
  def items
41
- @cache[:ordered_by_frequency_desc] || order_by_frequency_desc
39
+ @cache[:ordered_by_frequency_desc] ||= self.to_a.sort {|x,y| y[1] <=> x[1] }
42
40
  end
43
41
 
44
42
  def each(&block)
45
- items = @cache[:ordered_by_frequency_desc] || order_by_frequency_desc
46
43
  items.each { |item| yield(item[0], item[1]) }
47
44
  end
48
45
 
@@ -111,11 +108,5 @@ module Probability
111
108
  self
112
109
  end
113
110
 
114
- private
115
-
116
- def order_by_frequency_desc
117
- @cache[:ordered_by_frequency_desc] = self.to_a.sort {|x,y| y[1] <=> x[1] }
118
- end
119
-
120
111
  end
121
112
  end
@@ -39,7 +39,7 @@ module Punkt
39
39
  if @language_vars.sent_end_chars.include?(tok)
40
40
  aug_token.sentence_break = true
41
41
  elsif aug_token.is_ellipsis?
42
- aug_token.is_ellipsis = true
42
+ aug_token.ellipsis = true
43
43
  elsif aug_token.ends_with_period? && !tok.end_with?("..")
44
44
  tok_low = UnicodeUtils.downcase(tok.chop)
45
45
  if @parameters.abbreviation_types.include?(tok_low) || @parameters.abbreviation_types.include?(tok_low.split("-")[-1])
@@ -1,6 +1,6 @@
1
1
  Gem::Specification.new do |s|
2
2
  s.name = "punkt-segmenter"
3
- s.version = "0.9.0"
3
+ s.version = "0.9.1"
4
4
  s.platform = Gem::Platform::RUBY
5
5
  s.summary = "Ruby port of the NLTK Punkt sentence segmentation algorithm"
6
6
  s.require_paths = ['lib']
@@ -8,7 +8,7 @@ Gem::Specification.new do |s|
8
8
 
9
9
  s.author = "Luis Cipriani"
10
10
  s.email = "lfcipriani@talleye.com"
11
- s.homepage = "http://github.com/lfcipriani/punkt-segmenter"
11
+ s.homepage = "http://blog.talleye.com"
12
12
 
13
13
  s.add_dependency('unicode_utils', '>= 1.0.0')
14
14
 
@@ -117,5 +117,11 @@ class PunktTokenTest < Test::Unit::TestCase
117
117
  assert !token.is_non_punctuation?
118
118
  end
119
119
 
120
+ def test_to_s_and_inspect
121
+ token = Punkt::Token.new("foo", :abbr => true, :sentence_break => true, :ellipsis => true)
122
+
123
+ assert_equal "<foo<A><E><S>>", token.inspect
124
+ end
125
+
120
126
  end
121
127
 
@@ -13,4 +13,4 @@ require 'test/unit'
13
13
  require 'rubygems'
14
14
  require 'ruby-debug'
15
15
 
16
- require 'punkt-segmenter'
16
+ require File.expand_path(File.dirname(__FILE__) + '/../lib/punkt-segmenter')
metadata CHANGED
@@ -1,13 +1,12 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: punkt-segmenter
3
3
  version: !ruby/object:Gem::Version
4
- hash: 59
5
4
  prerelease: false
6
5
  segments:
7
6
  - 0
8
7
  - 9
9
- - 0
10
- version: 0.9.0
8
+ - 1
9
+ version: 0.9.1
11
10
  platform: ruby
12
11
  authors:
13
12
  - Luis Cipriani
@@ -15,7 +14,7 @@ autorequire:
15
14
  bindir: bin
16
15
  cert_chain: []
17
16
 
18
- date: 2010-08-17 00:00:00 -03:00
17
+ date: 2010-08-26 00:00:00 -03:00
19
18
  default_executable:
20
19
  dependencies:
21
20
  - !ruby/object:Gem::Dependency
@@ -26,7 +25,6 @@ dependencies:
26
25
  requirements:
27
26
  - - ">="
28
27
  - !ruby/object:Gem::Version
29
- hash: 23
30
28
  segments:
31
29
  - 1
32
30
  - 0
@@ -42,7 +40,6 @@ dependencies:
42
40
  requirements:
43
41
  - - ">="
44
42
  - !ruby/object:Gem::Version
45
- hash: 3
46
43
  segments:
47
44
  - 0
48
45
  version: "0"
@@ -56,7 +53,6 @@ dependencies:
56
53
  requirements:
57
54
  - - ">="
58
55
  - !ruby/object:Gem::Version
59
- hash: 3
60
56
  segments:
61
57
  - 0
62
58
  version: "0"
@@ -92,7 +88,7 @@ files:
92
88
  - punkt-segmenter.gemspec
93
89
  - script/console
94
90
  has_rdoc: true
95
- homepage: http://github.com/lfcipriani/punkt-segmenter
91
+ homepage: http://blog.talleye.com
96
92
  licenses: []
97
93
 
98
94
  post_install_message:
@@ -105,7 +101,6 @@ required_ruby_version: !ruby/object:Gem::Requirement
105
101
  requirements:
106
102
  - - ">="
107
103
  - !ruby/object:Gem::Version
108
- hash: 3
109
104
  segments:
110
105
  - 0
111
106
  version: "0"
@@ -114,7 +109,6 @@ required_rubygems_version: !ruby/object:Gem::Requirement
114
109
  requirements:
115
110
  - - ">="
116
111
  - !ruby/object:Gem::Version
117
- hash: 3
118
112
  segments:
119
113
  - 0
120
114
  version: "0"