punkt-segmenter 0.9.0 → 0.9.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.md
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
|
3
3
|
This code is a ruby 1.9.x port of the Punkt sentence tokenizer algorithm implemented by the NLTK Project ([http://www.nltk.org/]). Punkt is a **language-independent**, unsupervised approach to **sentence boundary detection**. It is based on the assumption that a large number of ambiguities in the determination of sentence boundaries can be eliminated once abbreviations have been identified.
|
4
4
|
|
5
|
-
The description of the algorithm is presented in the following academic paper:
|
5
|
+
The full description of the algorithm is presented in the following academic paper:
|
6
6
|
|
7
7
|
> Kiss, Tibor and Strunk, Jan (2006): Unsupervised Multilingual Sentence Boundary Detection.
|
8
8
|
> Computational Linguistics 32: 485-525.
|
@@ -28,21 +28,18 @@ module Probability
|
|
28
28
|
end
|
29
29
|
|
30
30
|
def keys
|
31
|
-
|
32
|
-
result.map { |item| item[0] }
|
31
|
+
items.map { |item| item[0] }
|
33
32
|
end
|
34
33
|
|
35
34
|
def values
|
36
|
-
|
37
|
-
result.map { |item| item[1] }
|
35
|
+
items.map { |item| item[1] }
|
38
36
|
end
|
39
37
|
|
40
38
|
def items
|
41
|
-
@cache[:ordered_by_frequency_desc]
|
39
|
+
@cache[:ordered_by_frequency_desc] ||= self.to_a.sort {|x,y| y[1] <=> x[1] }
|
42
40
|
end
|
43
41
|
|
44
42
|
def each(&block)
|
45
|
-
items = @cache[:ordered_by_frequency_desc] || order_by_frequency_desc
|
46
43
|
items.each { |item| yield(item[0], item[1]) }
|
47
44
|
end
|
48
45
|
|
@@ -111,11 +108,5 @@ module Probability
|
|
111
108
|
self
|
112
109
|
end
|
113
110
|
|
114
|
-
private
|
115
|
-
|
116
|
-
def order_by_frequency_desc
|
117
|
-
@cache[:ordered_by_frequency_desc] = self.to_a.sort {|x,y| y[1] <=> x[1] }
|
118
|
-
end
|
119
|
-
|
120
111
|
end
|
121
112
|
end
|
@@ -39,7 +39,7 @@ module Punkt
|
|
39
39
|
if @language_vars.sent_end_chars.include?(tok)
|
40
40
|
aug_token.sentence_break = true
|
41
41
|
elsif aug_token.is_ellipsis?
|
42
|
-
aug_token.
|
42
|
+
aug_token.ellipsis = true
|
43
43
|
elsif aug_token.ends_with_period? && !tok.end_with?("..")
|
44
44
|
tok_low = UnicodeUtils.downcase(tok.chop)
|
45
45
|
if @parameters.abbreviation_types.include?(tok_low) || @parameters.abbreviation_types.include?(tok_low.split("-")[-1])
|
data/punkt-segmenter.gemspec
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
Gem::Specification.new do |s|
|
2
2
|
s.name = "punkt-segmenter"
|
3
|
-
s.version = "0.9.0"
|
3
|
+
s.version = "0.9.1"
|
4
4
|
s.platform = Gem::Platform::RUBY
|
5
5
|
s.summary = "Ruby port of the NLTK Punkt sentence segmentation algorithm"
|
6
6
|
s.require_paths = ['lib']
|
@@ -8,7 +8,7 @@ Gem::Specification.new do |s|
|
|
8
8
|
|
9
9
|
s.author = "Luis Cipriani"
|
10
10
|
s.email = "lfcipriani@talleye.com"
|
11
|
-
s.homepage = "http://
|
11
|
+
s.homepage = "http://blog.talleye.com"
|
12
12
|
|
13
13
|
s.add_dependency('unicode_utils', '>= 1.0.0')
|
14
14
|
|
@@ -117,5 +117,11 @@ class PunktTokenTest < Test::Unit::TestCase
|
|
117
117
|
assert !token.is_non_punctuation?
|
118
118
|
end
|
119
119
|
|
120
|
+
def test_to_s_and_inspect
|
121
|
+
token = Punkt::Token.new("foo", :abbr => true, :sentence_break => true, :ellipsis => true)
|
122
|
+
|
123
|
+
assert_equal "<foo<A><E><S>>", token.inspect
|
124
|
+
end
|
125
|
+
|
120
126
|
end
|
121
127
|
|
data/test/test_helper.rb
CHANGED
metadata
CHANGED
@@ -1,13 +1,12 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: punkt-segmenter
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash: 59
|
5
4
|
prerelease: false
|
6
5
|
segments:
|
7
6
|
- 0
|
8
7
|
- 9
|
9
|
-
- 0
|
10
|
-
version: 0.9.0
|
8
|
+
- 1
|
9
|
+
version: 0.9.1
|
11
10
|
platform: ruby
|
12
11
|
authors:
|
13
12
|
- Luis Cipriani
|
@@ -15,7 +14,7 @@ autorequire:
|
|
15
14
|
bindir: bin
|
16
15
|
cert_chain: []
|
17
16
|
|
18
|
-
date: 2010-08-
|
17
|
+
date: 2010-08-26 00:00:00 -03:00
|
19
18
|
default_executable:
|
20
19
|
dependencies:
|
21
20
|
- !ruby/object:Gem::Dependency
|
@@ -26,7 +25,6 @@ dependencies:
|
|
26
25
|
requirements:
|
27
26
|
- - ">="
|
28
27
|
- !ruby/object:Gem::Version
|
29
|
-
hash: 23
|
30
28
|
segments:
|
31
29
|
- 1
|
32
30
|
- 0
|
@@ -42,7 +40,6 @@ dependencies:
|
|
42
40
|
requirements:
|
43
41
|
- - ">="
|
44
42
|
- !ruby/object:Gem::Version
|
45
|
-
hash: 3
|
46
43
|
segments:
|
47
44
|
- 0
|
48
45
|
version: "0"
|
@@ -56,7 +53,6 @@ dependencies:
|
|
56
53
|
requirements:
|
57
54
|
- - ">="
|
58
55
|
- !ruby/object:Gem::Version
|
59
|
-
hash: 3
|
60
56
|
segments:
|
61
57
|
- 0
|
62
58
|
version: "0"
|
@@ -92,7 +88,7 @@ files:
|
|
92
88
|
- punkt-segmenter.gemspec
|
93
89
|
- script/console
|
94
90
|
has_rdoc: true
|
95
|
-
homepage: http://
|
91
|
+
homepage: http://blog.talleye.com
|
96
92
|
licenses: []
|
97
93
|
|
98
94
|
post_install_message:
|
@@ -105,7 +101,6 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
105
101
|
requirements:
|
106
102
|
- - ">="
|
107
103
|
- !ruby/object:Gem::Version
|
108
|
-
hash: 3
|
109
104
|
segments:
|
110
105
|
- 0
|
111
106
|
version: "0"
|
@@ -114,7 +109,6 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
114
109
|
requirements:
|
115
110
|
- - ">="
|
116
111
|
- !ruby/object:Gem::Version
|
117
|
-
hash: 3
|
118
112
|
segments:
|
119
113
|
- 0
|
120
114
|
version: "0"
|