punkt-segmenter 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE.txt +13 -0
- data/README.md +79 -0
- data/Rakefile +16 -0
- data/lib/punkt-segmenter.rb +13 -0
- data/lib/punkt-segmenter/frequency_distribution.rb +121 -0
- data/lib/punkt-segmenter/punkt.rb +51 -0
- data/lib/punkt-segmenter/punkt/base.rb +65 -0
- data/lib/punkt-segmenter/punkt/language_vars.rb +34 -0
- data/lib/punkt-segmenter/punkt/parameters.rb +37 -0
- data/lib/punkt-segmenter/punkt/sentence_tokenizer.rb +180 -0
- data/lib/punkt-segmenter/punkt/token.rb +81 -0
- data/lib/punkt-segmenter/punkt/trainer.rb +304 -0
- data/punkt-segmenter.gemspec +17 -0
- data/script/console +7 -0
- data/test/punkt-segmenter/frequency_distribution_test.rb +118 -0
- data/test/punkt-segmenter/punkt/language_vars_test.rb +21 -0
- data/test/punkt-segmenter/punkt/token_test.rb +121 -0
- data/test/punkt-segmenter/punkt/trainer_test.rb +32 -0
- data/test/punkt-segmenter/punkt_test.rb +67 -0
- data/test/test_helper.rb +16 -0
- metadata +129 -0
require File.expand_path(File.dirname(__FILE__) + '/../../test_helper')

# Exercises Punkt::LanguageVars#word_tokenize: punctuation must be split
# into its own tokens while trailing-period abbreviations stay intact.
class PunktLanguageVarsTest < Test::Unit::TestCase

  def setup
    @lang_var = Punkt::LanguageVars.new
    @sample = %Q{For example, the word "abbreviation" can itself be represented by the abbreviation abbr., abbrv. or abbrev.}
  end

  def test_word_tokenize
    words = @lang_var.word_tokenize(@sample)

    # Total token count for the sample sentence.
    assert_equal 20, words.size
    # Abbreviations keep their trailing period as one token.
    assert words.include?("abbr.")
    # Quotes and commas are emitted as standalone tokens.
    assert words.include?("\"")
    assert words.include?(",")
    # Ordinary words come through unchanged.
    assert words.include?("itself")
  end

end
# encoding: utf-8
require File.expand_path(File.dirname(__FILE__) + '/../../test_helper')

# Unit tests for Punkt::Token: option-hash initialization, derived type
# strings (lowercased, numbers collapsed), case predicates, and the
# ellipsis/initial/alpha/punctuation classifiers.
class PunktTokenTest < Test::Unit::TestCase

  def test_token_properties_initialization
    token = Punkt::Token.new("Test", :paragraph_start => true,
                                     :line_start => true,
                                     :sentence_break => true,
                                     :abbr => false)

    assert_equal true, token.paragraph_start
    assert_equal true, token.line_start
    assert_equal true, token.sentence_break
    assert_equal false, token.abbr
    # Properties not passed in default to nil. assert_equal with an
    # expected nil is deprecated (warns in test-unit, an error in newer
    # frameworks) — assert_nil states the intent directly.
    assert_nil token.ellipsis
  end

  def test_main_attributes
    token = Punkt::Token.new("Test")
    assert_equal "test", token.type

    token = Punkt::Token.new("Test.")
    assert_equal "test.", token.type

    # Downcasing must be Unicode-aware (accented initial capital).
    token = Punkt::Token.new("Índico")
    assert_equal "índico", token.type
  end

  def test_type_without_period
    token = Punkt::Token.new("Test")
    assert_equal "test", token.type_without_period

    token = Punkt::Token.new("Test.")
    assert_equal "test", token.type_without_period

    # Numeric tokens collapse to the ##number## placeholder type.
    token = Punkt::Token.new("123.")
    assert_equal "##number##", token.type_without_period
  end

  def test_type_without_sentence_period
    # Without a sentence break the period (if any) is kept by this
    # accessor; "Test" has none, so the plain type is returned.
    token = Punkt::Token.new("Test", :sentence_break => false)
    assert_equal "test", token.type_without_sentence_period

    # With a sentence break the trailing period is stripped.
    token = Punkt::Token.new("test.", :sentence_break => true)
    assert_equal "test", token.type_without_sentence_period
  end

  def test_first_upper?
    token = Punkt::Token.new("Test")
    assert token.first_upper?

    # Accented uppercase letters count as upper too.
    token = Punkt::Token.new("Índico")
    assert token.first_upper?

    token = Punkt::Token.new("test.")
    assert !token.first_upper?
  end

  def test_first_lower?
    token = Punkt::Token.new("Test")
    assert !token.first_lower?

    token = Punkt::Token.new("Índico")
    assert !token.first_lower?

    token = Punkt::Token.new("test.")
    assert token.first_lower?
  end

  def test_first_case
    token = Punkt::Token.new("Test")
    assert_equal :upper, token.first_case

    token = Punkt::Token.new("Índico")
    assert_equal :upper, token.first_case

    token = Punkt::Token.new("test.")
    assert_equal :lower, token.first_case

    # Non-letter first characters have no case.
    token = Punkt::Token.new("@")
    assert_equal :none, token.first_case
  end

  def test_is_ellipsis?
    token = Punkt::Token.new("...")
    assert token.is_ellipsis?

    # Two dots are also accepted as an ellipsis.
    token = Punkt::Token.new("..")
    assert token.is_ellipsis?

    # Dots followed by text are not an ellipsis.
    token = Punkt::Token.new("..foo")
    assert !token.is_ellipsis?
  end

  def test_is_initial?
    # A single letter plus period looks like a personal initial.
    token = Punkt::Token.new("C.")
    assert token.is_initial?

    # Multi-letter dotted sequences do not qualify.
    token = Punkt::Token.new("B.M.")
    assert !token.is_initial?
  end

  def test_is_alpha?
    token = Punkt::Token.new("foo")
    assert token.is_alpha?

    token = Punkt::Token.new("!")
    assert !token.is_alpha?
  end

  def test_is_non_punctuation?
    token = Punkt::Token.new("foo")
    assert token.is_non_punctuation?

    token = Punkt::Token.new("!")
    assert !token.is_non_punctuation?
  end

end
# encoding: utf-8
require File.expand_path(File.dirname(__FILE__) + '/../../test_helper')

# Exercises Punkt::Trainer: training on a small, skewed Portuguese corpus
# misclassifies the word "gol" as an abbreviation; adding a second corpus
# where "gol" appears as a plain word corrects the parameters.
class PunktTrainerTest < Test::Unit::TestCase

  def test_train_basic_portuguese_text_with_error
    weak_corpus = read_fixture("canudos.txt")

    trainer = Punkt::Trainer.new
    trainer.train(weak_corpus)

    # 'gol' is a word, not an abbreviation — the training text isn't
    # good enough, so the trainer wrongly learns it as one.
    assert trainer.parameters.abbreviation_types.include?("gol")
  end

  def test_improve_trainning_of_portuguese_text
    weak_corpus       = read_fixture("canudos.txt")
    corrective_corpus = read_fixture("gripe.txt")

    trainer = Punkt::Trainer.new
    trainer.train(weak_corpus)
    trainer.train(corrective_corpus)

    # With the extra corpus 'gol' is recognized as a word — the
    # incremental training improved the parameters.
    assert !trainer.parameters.abbreviation_types.include?("gol")
  end

  private

  # Reads a training corpus from test/data relative to this file.
  def read_fixture(name)
    File.read(File.expand_path(File.dirname(__FILE__) + "/../../data/" + name))
  end

end
# encoding: utf-8
require File.expand_path(File.dirname(__FILE__) + '/../test_helper')

# End-to-end tests for Punkt::SentenceTokenizer over a fixed Wikipedia
# excerpt: index output, string output, tokenized output, segmenting a
# pre-tokenized list, and the :realign_boundaries option.
class PunktTest < Test::Unit::TestCase

  def setup
    @text = File.read(File.expand_path(File.dirname(__FILE__) + "/../data/wikipedia_minute.txt"))
    @tokenizer = Punkt::SentenceTokenizer.new(@text)
  end

  def test_sentences_as_string_indexes
    # Default output is a list of [start, end] character indexes.
    result = @tokenizer.sentences_from_text(@text)

    assert_equal 7, result.size # number of sentences
    assert_equal [0, 53], result.first
    assert_equal [478, 595], result.last
  end

  def test_sentences_as_list_of_strings
    result = @tokenizer.sentences_from_text(@text, :output => :sentences_text)

    assert_equal 7, result.size # number of sentences
    # assert_equal takes the EXPECTED value first; the original had the
    # arguments reversed, which yields misleading failure messages
    # ("expected <actual> but was <expected>").
    assert_equal "A minute is a unit of measurement of time or of angle.", result[0]
    assert_equal "The minute is a unit of time equal to 1/60th of an hour or 60 seconds by 1.", result[1]
    assert_equal "In the UTC time scale, a minute occasionally has 59 or 61 seconds; see leap second.", result[2]
    assert_equal "The minute is not an SI unit; however, it is accepted for use with SI units.", result[3]
    assert_equal "The symbol for minute or minutes is min.", result[4]
    assert_equal "The fact that an hour contains 60 minutes is probably due to influences from the Babylonians, who used a base-60 or sexagesimal counting system.", result[5]
    assert_equal "Colloquially, a min. may also refer to an indefinite amount of time substantially longer than the standardized length.", result[6]
  end

  def test_sentences_as_list_of_tokens
    # Each sentence comes back as a list of token strings.
    result = @tokenizer.sentences_from_text(@text, :output => :tokenized_sentences)

    assert_equal 7, result.size # number of sentences
    assert_equal "angle.", result.first.last
    assert_equal 18, result[1].size
    assert_equal String, result.last.first.class
  end

  def test_segment_list_of_tokens
    # Segmenting an externally tokenized word list must match the
    # tokenized-sentences output above.
    list_of_tokens = Punkt::Base.new().tokenize_words(@text, :output => :string)
    @tokenizer = Punkt::SentenceTokenizer.new(@text)
    result = @tokenizer.sentences_from_tokens(list_of_tokens)

    assert_equal 7, result.size # number of sentences
    assert_equal "angle.", result.first.last
    assert_equal 18, result[1].size
    assert_equal String, result.last.first.class
  end

  def test_realign_boundaries
    text = File.read(File.expand_path(File.dirname(__FILE__) + "/../data/canudos.txt"))
    tokenizer = Punkt::SentenceTokenizer.new(text)
    result = tokenizer.sentences_from_text(text, :output => :sentences_text)

    # Without realignment a closing parenthesis after a sentence-final
    # period leaks into the start of the next sentence.
    assert result[7].end_with?("(que vem bem ao nosso caso.")
    assert result[8].start_with?(") \n\nDizem que durante")

    result = tokenizer.sentences_from_text(text, :output => :sentences_text, :realign_boundaries => true)

    # With :realign_boundaries the ")" is pulled back into sentence 7.
    assert result[7].end_with?("(que vem bem ao nosso caso.)")
    assert result[8].start_with?("Dizem que durante")
  end

end
data/test/test_helper.rb
ADDED
# Common test bootstrap: optional cover_me coverage, Test::Unit, the
# debugger, and the library under test.
if ENV["coverage"]
  require 'cover_me'

  CoverMe.config do |conf|
    # Project root is one directory above test/.
    conf.project.root = File.expand_path(File.dirname(__FILE__) + '/..')

    # Collect coverage only for the library sources under lib/.
    conf.file_pattern = /(#{CoverMe.config.project.root}\/lib\/.+\.rb)/ix
  end
end

require 'test/unit'
require 'rubygems'
require 'ruby-debug'

require 'punkt-segmenter'
metadata
ADDED
@@ -0,0 +1,129 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: punkt-segmenter
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
hash: 59
|
5
|
+
prerelease: false
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 9
|
9
|
+
- 0
|
10
|
+
version: 0.9.0
|
11
|
+
platform: ruby
|
12
|
+
authors:
|
13
|
+
- Luis Cipriani
|
14
|
+
autorequire:
|
15
|
+
bindir: bin
|
16
|
+
cert_chain: []
|
17
|
+
|
18
|
+
date: 2010-08-17 00:00:00 -03:00
|
19
|
+
default_executable:
|
20
|
+
dependencies:
|
21
|
+
- !ruby/object:Gem::Dependency
|
22
|
+
name: unicode_utils
|
23
|
+
prerelease: false
|
24
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ">="
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
hash: 23
|
30
|
+
segments:
|
31
|
+
- 1
|
32
|
+
- 0
|
33
|
+
- 0
|
34
|
+
version: 1.0.0
|
35
|
+
type: :runtime
|
36
|
+
version_requirements: *id001
|
37
|
+
- !ruby/object:Gem::Dependency
|
38
|
+
name: cover_me
|
39
|
+
prerelease: false
|
40
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ">="
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
hash: 3
|
46
|
+
segments:
|
47
|
+
- 0
|
48
|
+
version: "0"
|
49
|
+
type: :development
|
50
|
+
version_requirements: *id002
|
51
|
+
- !ruby/object:Gem::Dependency
|
52
|
+
name: ruby-debug19
|
53
|
+
prerelease: false
|
54
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
55
|
+
none: false
|
56
|
+
requirements:
|
57
|
+
- - ">="
|
58
|
+
- !ruby/object:Gem::Version
|
59
|
+
hash: 3
|
60
|
+
segments:
|
61
|
+
- 0
|
62
|
+
version: "0"
|
63
|
+
type: :development
|
64
|
+
version_requirements: *id003
|
65
|
+
description:
|
66
|
+
email: lfcipriani@talleye.com
|
67
|
+
executables: []
|
68
|
+
|
69
|
+
extensions: []
|
70
|
+
|
71
|
+
extra_rdoc_files: []
|
72
|
+
|
73
|
+
files:
|
74
|
+
- lib/punkt-segmenter/frequency_distribution.rb
|
75
|
+
- lib/punkt-segmenter/punkt/base.rb
|
76
|
+
- lib/punkt-segmenter/punkt/language_vars.rb
|
77
|
+
- lib/punkt-segmenter/punkt/parameters.rb
|
78
|
+
- lib/punkt-segmenter/punkt/sentence_tokenizer.rb
|
79
|
+
- lib/punkt-segmenter/punkt/token.rb
|
80
|
+
- lib/punkt-segmenter/punkt/trainer.rb
|
81
|
+
- lib/punkt-segmenter/punkt.rb
|
82
|
+
- lib/punkt-segmenter.rb
|
83
|
+
- README.md
|
84
|
+
- LICENSE.txt
|
85
|
+
- test/punkt-segmenter/frequency_distribution_test.rb
|
86
|
+
- test/punkt-segmenter/punkt/language_vars_test.rb
|
87
|
+
- test/punkt-segmenter/punkt/token_test.rb
|
88
|
+
- test/punkt-segmenter/punkt/trainer_test.rb
|
89
|
+
- test/punkt-segmenter/punkt_test.rb
|
90
|
+
- test/test_helper.rb
|
91
|
+
- Rakefile
|
92
|
+
- punkt-segmenter.gemspec
|
93
|
+
- script/console
|
94
|
+
has_rdoc: true
|
95
|
+
homepage: http://github.com/lfcipriani/punkt-segmenter
|
96
|
+
licenses: []
|
97
|
+
|
98
|
+
post_install_message:
|
99
|
+
rdoc_options: []
|
100
|
+
|
101
|
+
require_paths:
|
102
|
+
- lib
|
103
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
104
|
+
none: false
|
105
|
+
requirements:
|
106
|
+
- - ">="
|
107
|
+
- !ruby/object:Gem::Version
|
108
|
+
hash: 3
|
109
|
+
segments:
|
110
|
+
- 0
|
111
|
+
version: "0"
|
112
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
113
|
+
none: false
|
114
|
+
requirements:
|
115
|
+
- - ">="
|
116
|
+
- !ruby/object:Gem::Version
|
117
|
+
hash: 3
|
118
|
+
segments:
|
119
|
+
- 0
|
120
|
+
version: "0"
|
121
|
+
requirements: []
|
122
|
+
|
123
|
+
rubyforge_project:
|
124
|
+
rubygems_version: 1.3.7
|
125
|
+
signing_key:
|
126
|
+
specification_version: 3
|
127
|
+
summary: Ruby port of the NLTK Punkt sentence segmentation algorithm
|
128
|
+
test_files: []
|
129
|
+
|