punkt-segmenter 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
+ require File.expand_path(File.dirname(__FILE__) + '/../../test_helper')
+
+ class PunktLanguageVarsTest < Test::Unit::TestCase
+
+   def setup
+     @lang_var = Punkt::LanguageVars.new
+     @sample = %Q{For example, the word "abbreviation" can itself be represented by the abbreviation abbr., abbrv. or abbrev.}
+   end
+
+   def test_word_tokenize
+     tokens = @lang_var.word_tokenize(@sample)
+
+     assert_equal 20,   tokens.size
+     assert_equal true, tokens.include?("abbr.")
+     assert_equal true, tokens.include?("\"")
+     assert_equal true, tokens.include?(",")
+     assert_equal true, tokens.include?("itself")
+   end
+
+ end
+
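
The test above pins down Punkt::LanguageVars#word_tokenize: punctuation marks come back as standalone tokens, while abbreviation-like words keep their trailing period. A minimal usage sketch based only on the calls exercised above (the input string is made up, and the exact token list is illustrative, not verified against this version):

    require 'punkt-segmenter'

    lang_vars = Punkt::LanguageVars.new
    tokens    = lang_vars.word_tokenize('See the abbreviation abbr., for example.')
    # Commas, quotes and similar punctuation arrive as their own entries,
    # while "abbr." keeps its trailing period, as the assertions above show.
    p tokens
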
@@ -0,0 +1,121 @@
+ # encoding: utf-8
+ require File.expand_path(File.dirname(__FILE__) + '/../../test_helper')
+
+ class PunktTokenTest < Test::Unit::TestCase
+
+   def test_token_properties_initialization
+     token = Punkt::Token.new("Test", :paragraph_start => true,
+                                      :line_start => true,
+                                      :sentence_break => true,
+                                      :abbr => false)
+
+     assert_equal true,  token.paragraph_start
+     assert_equal true,  token.line_start
+     assert_equal true,  token.sentence_break
+     assert_equal false, token.abbr
+     assert_equal nil,   token.ellipsis
+   end
+
+   def test_main_attributes
+     token = Punkt::Token.new("Test")
+     assert_equal "test", token.type
+
+     token = Punkt::Token.new("Test.")
+     assert_equal "test.", token.type
+
+     token = Punkt::Token.new("Índico")
+     assert_equal "índico", token.type
+   end
+
+   def test_type_without_period
+     token = Punkt::Token.new("Test")
+     assert_equal "test", token.type_without_period
+
+     token = Punkt::Token.new("Test.")
+     assert_equal "test", token.type_without_period
+
+     token = Punkt::Token.new("123.")
+     assert_equal "##number##", token.type_without_period
+   end
+
+   def test_type_without_sentence_period
+     token = Punkt::Token.new("Test", :sentence_break => false)
+     assert_equal "test", token.type_without_sentence_period
+
+     token = Punkt::Token.new("test.", :sentence_break => true)
+     assert_equal "test", token.type_without_sentence_period
+   end
+
+   def test_first_upper?
+     token = Punkt::Token.new("Test")
+     assert token.first_upper?
+
+     token = Punkt::Token.new("Índico")
+     assert token.first_upper?
+
+     token = Punkt::Token.new("test.")
+     assert !token.first_upper?
+   end
+
+   def test_first_lower?
+     token = Punkt::Token.new("Test")
+     assert !token.first_lower?
+
+     token = Punkt::Token.new("Índico")
+     assert !token.first_lower?
+
+     token = Punkt::Token.new("test.")
+     assert token.first_lower?
+   end
+
+   def test_first_case
+     token = Punkt::Token.new("Test")
+     assert_equal :upper, token.first_case
+
+     token = Punkt::Token.new("Índico")
+     assert_equal :upper, token.first_case
+
+     token = Punkt::Token.new("test.")
+     assert_equal :lower, token.first_case
+
+     token = Punkt::Token.new("@")
+     assert_equal :none, token.first_case
+   end
+
+   def test_is_ellipsis?
+     token = Punkt::Token.new("...")
+     assert token.is_ellipsis?
+
+     token = Punkt::Token.new("..")
+     assert token.is_ellipsis?
+
+     token = Punkt::Token.new("..foo")
+     assert !token.is_ellipsis?
+   end
+
+   def test_is_initial?
+     token = Punkt::Token.new("C.")
+     assert token.is_initial?
+
+     token = Punkt::Token.new("B.M.")
+     assert !token.is_initial?
+   end
+
+   def test_is_alpha?
+     token = Punkt::Token.new("foo")
+     assert token.is_alpha?
+
+     token = Punkt::Token.new("!")
+     assert !token.is_alpha?
+   end
+
+   def test_is_non_punctuation?
+     token = Punkt::Token.new("foo")
+     assert token.is_non_punctuation?
+
+     token = Punkt::Token.new("!")
+     assert !token.is_non_punctuation?
+   end
+
+ end
+
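
Taken together, these assertions document the Punkt::Token surface: type is the lower-cased token text, type_without_period strips a trailing period (with numbers collapsed to "##number##"), and the predicate methods classify case, ellipses, initials and punctuation. A short sketch using only methods asserted above, with expected values taken from those assertions:

    require 'punkt-segmenter'

    token = Punkt::Token.new("Abbrev.")
    token.type                 # => "abbrev."  (lower-cased, period kept)
    token.type_without_period  # => "abbrev"
    token.first_case           # => :upper

    Punkt::Token.new("C.").is_initial?    # => true, single letter + period
    Punkt::Token.new("...").is_ellipsis?  # => true
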
@@ -0,0 +1,32 @@
+ # encoding: utf-8
+ require File.expand_path(File.dirname(__FILE__) + '/../../test_helper')
+
+ class PunktTrainerTest < Test::Unit::TestCase
+
+   def test_train_basic_portuguese_text_with_error
+     not_so_good_training_data = File.read(File.expand_path(File.dirname(__FILE__) + "/../../data/canudos.txt"))
+
+     trainer = Punkt::Trainer.new
+     trainer.train(not_so_good_training_data)
+
+     parameters = trainer.parameters
+
+     # 'gol' is a word, not an abbreviation; the training text isn't good enough
+     assert parameters.abbreviation_types.include?("gol")
+   end
+
+   def test_improve_training_of_portuguese_text
+     not_so_good_training_data = File.read(File.expand_path(File.dirname(__FILE__) + "/../../data/canudos.txt"))
+     text_with_gol_as_a_word   = File.read(File.expand_path(File.dirname(__FILE__) + "/../../data/gripe.txt"))
+
+     trainer = Punkt::Trainer.new
+     trainer.train(not_so_good_training_data)
+     trainer.train(text_with_gol_as_a_word)
+
+     parameters = trainer.parameters
+
+     # 'gol' is recognized as a word now; the training was better
+     assert !parameters.abbreviation_types.include?("gol")
+   end
+
+ end
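
The two tests suggest the training workflow: a Punkt::Trainer accumulates evidence across successive train calls, and its parameters object exposes the learned abbreviation_types set, which improves as more representative text is added. A usage sketch under that reading (the corpus file names are hypothetical, and how the trained parameters are then handed to a tokenizer is not shown in this diff):

    require 'punkt-segmenter'

    trainer = Punkt::Trainer.new
    trainer.train(File.read("corpus_part1.txt"))  # hypothetical corpus file
    trainer.train(File.read("corpus_part2.txt"))  # more text refines the model

    # Inspect which token types the trainer currently treats as abbreviations.
    p trainer.parameters.abbreviation_types
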
@@ -0,0 +1,67 @@
+ # encoding: utf-8
+ require File.expand_path(File.dirname(__FILE__) + '/../test_helper')
+
+ class PunktTest < Test::Unit::TestCase
+
+   def setup
+     @text = File.read(File.expand_path(File.dirname(__FILE__) + "/../data/wikipedia_minute.txt"))
+     @tokenizer = Punkt::SentenceTokenizer.new(@text)
+   end
+
+   def test_sentences_as_string_indexes
+     result = @tokenizer.sentences_from_text(@text)
+
+     assert_equal 7, result.size # number of sentences
+     assert_equal [0, 53], result.first
+     assert_equal [478, 595], result.last
+   end
+
+   def test_sentences_as_list_of_strings
+     result = @tokenizer.sentences_from_text(@text, :output => :sentences_text)
+
+     assert_equal 7, result.size # number of sentences
+     assert_equal "A minute is a unit of measurement of time or of angle.", result[0]
+     assert_equal "The minute is a unit of time equal to 1/60th of an hour or 60 seconds by 1.", result[1]
+     assert_equal "In the UTC time scale, a minute occasionally has 59 or 61 seconds; see leap second.", result[2]
+     assert_equal "The minute is not an SI unit; however, it is accepted for use with SI units.", result[3]
+     assert_equal "The symbol for minute or minutes is min.", result[4]
+     assert_equal "The fact that an hour contains 60 minutes is probably due to influences from the Babylonians, who used a base-60 or sexagesimal counting system.", result[5]
+     assert_equal "Colloquially, a min. may also refer to an indefinite amount of time substantially longer than the standardized length.", result[6]
+   end
+
+   def test_sentences_as_list_of_tokens
+     result = @tokenizer.sentences_from_text(@text, :output => :tokenized_sentences)
+
+     assert_equal 7, result.size # number of sentences
+     assert_equal "angle.", result.first.last
+     assert_equal 18, result[1].size
+     assert_equal String, result.last.first.class
+   end
+
+   def test_segment_list_of_tokens
+     list_of_tokens = Punkt::Base.new.tokenize_words(@text, :output => :string)
+     @tokenizer = Punkt::SentenceTokenizer.new(@text)
+     result = @tokenizer.sentences_from_tokens(list_of_tokens)
+
+     assert_equal 7, result.size # number of sentences
+     assert_equal "angle.", result.first.last
+     assert_equal 18, result[1].size
+     assert_equal String, result.last.first.class
+   end
+
+   def test_realign_boundaries
+     text = File.read(File.expand_path(File.dirname(__FILE__) + "/../data/canudos.txt"))
+     tokenizer = Punkt::SentenceTokenizer.new(text)
+     result = tokenizer.sentences_from_text(text, :output => :sentences_text)
+
+     assert result[7].end_with?("(que vem bem ao nosso caso.")
+     assert result[8].start_with?(") \n\nDizem que durante")
+
+     result = tokenizer.sentences_from_text(text, :output => :sentences_text, :realign_boundaries => true)
+
+     assert result[7].end_with?("(que vem bem ao nosso caso.)")
+     assert result[8].start_with?("Dizem que durante")
+   end
+
+ end
+
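
These tests cover the three output modes of Punkt::SentenceTokenizer#sentences_from_text (start/end index pairs by default, plain sentence strings with :output => :sentences_text, token arrays with :output => :tokenized_sentences) plus the :realign_boundaries option, which pulls a trailing close-parenthesis back into its sentence. A condensed sketch of the same calls (the sample text is made up, and the exact splits depend on the parameters the tokenizer learns from the text it is given):

    require 'punkt-segmenter'

    text      = 'A min. is short. It is 1/60th of an hour.'
    tokenizer = Punkt::SentenceTokenizer.new(text)  # trains on the text itself

    tokenizer.sentences_from_text(text)
    # => array of [start, end] index pairs into the original string

    tokenizer.sentences_from_text(text, :output => :sentences_text,
                                        :realign_boundaries => true)
    # => array of sentence strings, e.g. ["A min. is short.", "It is ..."]
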
@@ -0,0 +1,16 @@
+ if ENV["coverage"]
+   require 'cover_me'
+   CoverMe.config do |c|
+     # where is your project's root:
+     c.project.root = File.expand_path(File.dirname(__FILE__) + '/..')
+
+     # what files are you interested in coverage for:
+     c.file_pattern = /(#{CoverMe.config.project.root}\/lib\/.+\.rb)/ix
+   end
+ end
+
+ require 'test/unit'
+ require 'rubygems'
+ require 'ruby-debug'
+
+ require 'punkt-segmenter'
metadata ADDED
@@ -0,0 +1,129 @@
+ --- !ruby/object:Gem::Specification
+ name: punkt-segmenter
+ version: !ruby/object:Gem::Version
+   hash: 59
+   prerelease: false
+   segments:
+   - 0
+   - 9
+   - 0
+   version: 0.9.0
+ platform: ruby
+ authors:
+ - Luis Cipriani
+ autorequire:
+ bindir: bin
+ cert_chain: []
+
+ date: 2010-08-17 00:00:00 -03:00
+ default_executable:
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: unicode_utils
+   prerelease: false
+   requirement: &id001 !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         hash: 23
+         segments:
+         - 1
+         - 0
+         - 0
+         version: 1.0.0
+   type: :runtime
+   version_requirements: *id001
+ - !ruby/object:Gem::Dependency
+   name: cover_me
+   prerelease: false
+   requirement: &id002 !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         hash: 3
+         segments:
+         - 0
+         version: "0"
+   type: :development
+   version_requirements: *id002
+ - !ruby/object:Gem::Dependency
+   name: ruby-debug19
+   prerelease: false
+   requirement: &id003 !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         hash: 3
+         segments:
+         - 0
+         version: "0"
+   type: :development
+   version_requirements: *id003
+ description:
+ email: lfcipriani@talleye.com
+ executables: []
+
+ extensions: []
+
+ extra_rdoc_files: []
+
+ files:
+ - lib/punkt-segmenter/frequency_distribution.rb
+ - lib/punkt-segmenter/punkt/base.rb
+ - lib/punkt-segmenter/punkt/language_vars.rb
+ - lib/punkt-segmenter/punkt/parameters.rb
+ - lib/punkt-segmenter/punkt/sentence_tokenizer.rb
+ - lib/punkt-segmenter/punkt/token.rb
+ - lib/punkt-segmenter/punkt/trainer.rb
+ - lib/punkt-segmenter/punkt.rb
+ - lib/punkt-segmenter.rb
+ - README.md
+ - LICENSE.txt
+ - test/punkt-segmenter/frequency_distribution_test.rb
+ - test/punkt-segmenter/punkt/language_vars_test.rb
+ - test/punkt-segmenter/punkt/token_test.rb
+ - test/punkt-segmenter/punkt/trainer_test.rb
+ - test/punkt-segmenter/punkt_test.rb
+ - test/test_helper.rb
+ - Rakefile
+ - punkt-segmenter.gemspec
+ - script/console
+ has_rdoc: true
+ homepage: http://github.com/lfcipriani/punkt-segmenter
+ licenses: []
+
+ post_install_message:
+ rdoc_options: []
+
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   none: false
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       hash: 3
+       segments:
+       - 0
+       version: "0"
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   none: false
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       hash: 3
+       segments:
+       - 0
+       version: "0"
+ requirements: []
+
+ rubyforge_project:
+ rubygems_version: 1.3.7
+ signing_key:
+ specification_version: 3
+ summary: Ruby port of the NLTK Punkt sentence segmentation algorithm
+ test_files: []
+