punkt-segmenter 0.9.0 → 0.9.1
Sign up to get free protection for your applications and to get access to all the features.
data/README.md
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
|
3
3
|
This code is a ruby 1.9.x port of the Punkt sentence tokenizer algorithm implemented by the NLTK Project ([http://www.nltk.org/]). Punkt is a **language-independent**, unsupervised approach to **sentence boundary detection**. It is based on the assumption that a large number of ambiguities in the determination of sentence boundaries can be eliminated once abbreviations have been identified.
|
4
4
|
|
5
|
-
The description of the algorithm is presented in the following academic paper:
|
5
|
+
The full description of the algorithm is presented in the following academic paper:
|
6
6
|
|
7
7
|
> Kiss, Tibor and Strunk, Jan (2006): Unsupervised Multilingual Sentence Boundary Detection.
|
8
8
|
> Computational Linguistics 32: 485-525.
|
@@ -28,21 +28,18 @@ module Probability
|
|
28
28
|
end
|
29
29
|
|
30
30
|
def keys
|
31
|
-
|
32
|
-
result.map { |item| item[0] }
|
31
|
+
items.map { |item| item[0] }
|
33
32
|
end
|
34
33
|
|
35
34
|
def values
|
36
|
-
|
37
|
-
result.map { |item| item[1] }
|
35
|
+
items.map { |item| item[1] }
|
38
36
|
end
|
39
37
|
|
40
38
|
def items
|
41
|
-
@cache[:ordered_by_frequency_desc]
|
39
|
+
@cache[:ordered_by_frequency_desc] ||= self.to_a.sort {|x,y| y[1] <=> x[1] }
|
42
40
|
end
|
43
41
|
|
44
42
|
def each(&block)
|
45
|
-
items = @cache[:ordered_by_frequency_desc] || order_by_frequency_desc
|
46
43
|
items.each { |item| yield(item[0], item[1]) }
|
47
44
|
end
|
48
45
|
|
@@ -111,11 +108,5 @@ module Probability
|
|
111
108
|
self
|
112
109
|
end
|
113
110
|
|
114
|
-
private
|
115
|
-
|
116
|
-
def order_by_frequency_desc
|
117
|
-
@cache[:ordered_by_frequency_desc] = self.to_a.sort {|x,y| y[1] <=> x[1] }
|
118
|
-
end
|
119
|
-
|
120
111
|
end
|
121
112
|
end
|
@@ -39,7 +39,7 @@ module Punkt
|
|
39
39
|
if @language_vars.sent_end_chars.include?(tok)
|
40
40
|
aug_token.sentence_break = true
|
41
41
|
elsif aug_token.is_ellipsis?
|
42
|
-
aug_token.
|
42
|
+
aug_token.ellipsis = true
|
43
43
|
elsif aug_token.ends_with_period? && !tok.end_with?("..")
|
44
44
|
tok_low = UnicodeUtils.downcase(tok.chop)
|
45
45
|
if @parameters.abbreviation_types.include?(tok_low) || @parameters.abbreviation_types.include?(tok_low.split("-")[-1])
|
data/punkt-segmenter.gemspec
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
Gem::Specification.new do |s|
|
2
2
|
s.name = "punkt-segmenter"
|
3
|
-
s.version = "0.9.0"
|
3
|
+
s.version = "0.9.1"
|
4
4
|
s.platform = Gem::Platform::RUBY
|
5
5
|
s.summary = "Ruby port of the NLTK Punkt sentence segmentation algorithm"
|
6
6
|
s.require_paths = ['lib']
|
@@ -8,7 +8,7 @@ Gem::Specification.new do |s|
|
|
8
8
|
|
9
9
|
s.author = "Luis Cipriani"
|
10
10
|
s.email = "lfcipriani@talleye.com"
|
11
|
-
s.homepage = "http://
|
11
|
+
s.homepage = "http://blog.talleye.com"
|
12
12
|
|
13
13
|
s.add_dependency('unicode_utils', '>= 1.0.0')
|
14
14
|
|
@@ -117,5 +117,11 @@ class PunktTokenTest < Test::Unit::TestCase
|
|
117
117
|
assert !token.is_non_punctuation?
|
118
118
|
end
|
119
119
|
|
120
|
+
def test_to_s_and_inspect
|
121
|
+
token = Punkt::Token.new("foo", :abbr => true, :sentence_break => true, :ellipsis => true)
|
122
|
+
|
123
|
+
assert_equal "<foo<A><E><S>>", token.inspect
|
124
|
+
end
|
125
|
+
|
120
126
|
end
|
121
127
|
|
data/test/test_helper.rb
CHANGED
metadata
CHANGED
@@ -1,13 +1,12 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: punkt-segmenter
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash: 59
|
5
4
|
prerelease: false
|
6
5
|
segments:
|
7
6
|
- 0
|
8
7
|
- 9
|
9
|
-
- 0
|
10
|
-
version: 0.9.0
|
8
|
+
- 1
|
9
|
+
version: 0.9.1
|
11
10
|
platform: ruby
|
12
11
|
authors:
|
13
12
|
- Luis Cipriani
|
@@ -15,7 +14,7 @@ autorequire:
|
|
15
14
|
bindir: bin
|
16
15
|
cert_chain: []
|
17
16
|
|
18
|
-
date: 2010-08-
|
17
|
+
date: 2010-08-26 00:00:00 -03:00
|
19
18
|
default_executable:
|
20
19
|
dependencies:
|
21
20
|
- !ruby/object:Gem::Dependency
|
@@ -26,7 +25,6 @@ dependencies:
|
|
26
25
|
requirements:
|
27
26
|
- - ">="
|
28
27
|
- !ruby/object:Gem::Version
|
29
|
-
hash: 23
|
30
28
|
segments:
|
31
29
|
- 1
|
32
30
|
- 0
|
@@ -42,7 +40,6 @@ dependencies:
|
|
42
40
|
requirements:
|
43
41
|
- - ">="
|
44
42
|
- !ruby/object:Gem::Version
|
45
|
-
hash: 3
|
46
43
|
segments:
|
47
44
|
- 0
|
48
45
|
version: "0"
|
@@ -56,7 +53,6 @@ dependencies:
|
|
56
53
|
requirements:
|
57
54
|
- - ">="
|
58
55
|
- !ruby/object:Gem::Version
|
59
|
-
hash: 3
|
60
56
|
segments:
|
61
57
|
- 0
|
62
58
|
version: "0"
|
@@ -92,7 +88,7 @@ files:
|
|
92
88
|
- punkt-segmenter.gemspec
|
93
89
|
- script/console
|
94
90
|
has_rdoc: true
|
95
|
-
homepage: http://
|
91
|
+
homepage: http://blog.talleye.com
|
96
92
|
licenses: []
|
97
93
|
|
98
94
|
post_install_message:
|
@@ -105,7 +101,6 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
105
101
|
requirements:
|
106
102
|
- - ">="
|
107
103
|
- !ruby/object:Gem::Version
|
108
|
-
hash: 3
|
109
104
|
segments:
|
110
105
|
- 0
|
111
106
|
version: "0"
|
@@ -114,7 +109,6 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
114
109
|
requirements:
|
115
110
|
- - ">="
|
116
111
|
- !ruby/object:Gem::Version
|
117
|
-
hash: 3
|
118
112
|
segments:
|
119
113
|
- 0
|
120
114
|
version: "0"
|