porter2stemmer 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,307 @@
1
+ # coding: utf-8
2
+ # Porter 2 stemmer test file
3
+ #
4
+ # This file tests each stage of the stemmer individually.
5
+
6
+ $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
7
+
8
+ require 'test/unit'
9
+ require 'porter2stemmer'
10
+
11
# Unit tests for the individual stages of the Porter 2 stemmer.
#
# Each test method drives one of the porter2_* helper methods that the
# 'porter2stemmer' library adds to String, and checks its output for a small
# set of hand-picked words. Several stages take an optional boolean argument
# that switches on the english-gb behaviour (handling of -ise/-isation
# spellings); those stages are exercised with the default, with +true+ and
# with +false+.
class TestPorter2 < Test::Unit::TestCase
  # Tidying: surrounding whitespace is dropped, the word is lower-cased and
  # typographic apostrophes are replaced by the plain ASCII apostrophe.
  def test_tidy
    assert_equal "abacde", "abacde".porter2_tidy
    assert_equal "abacde", " abacde ".porter2_tidy
    assert_equal "abacde", "ABACDE".porter2_tidy
    assert_equal "ab'cde", "ab‘cde".porter2_tidy
    assert_equal "ab'cde", "ab’cde".porter2_tidy
    assert_equal "ab'c'de", "ab’c’de".porter2_tidy
    assert_equal "ab'c'de", "ab‘c‘de".porter2_tidy
    assert_equal "''abacde", "’‘abacde".porter2_tidy
  end

  # Preprocessing: leading apostrophes are handled and some occurrences of
  # 'y' are upper-cased to 'Y' (see the expected values below).
  def test_preprocess
    assert_equal "abacde", "abacde".porter2_preprocess
    assert_equal "abacde", "''abacde".porter2_preprocess
    assert_equal "ab'c'de", "'ab'c'de".porter2_preprocess
    assert_equal "ab'c'de", "''ab'c'de".porter2_preprocess
    assert_equal "Yabac", "yabac".porter2_preprocess
    assert_equal "aYbc", "aybc".porter2_preprocess
    assert_equal "abacdeY", "abacdey".porter2_preprocess
    assert_equal "abaYde", "abayde".porter2_preprocess
    assert_equal "kabaYde", "kabayde".porter2_preprocess
    assert_equal "kabyaYde", "kabyayde".porter2_preprocess
    assert_equal "'", "'''".porter2_preprocess
  end

  # R1 region of a word, including the words with specially-defined R1.
  def test_R1
    assert_equal "iful", "beautiful".porter2_r1
    assert_equal "y", "beauty".porter2_r1
    assert_equal "", "beau".porter2_r1
    assert_equal "imadversion", "animadversion".porter2_r1
    assert_equal "kled", "sprinkled".porter2_r1
    assert_equal "harist", "eucharist".porter2_r1

    # special cases
    assert_equal "ate", "generate".porter2_r1
    assert_equal "ates", "generates".porter2_r1
    assert_equal "ated", "generated".porter2_r1
    assert_equal "al", "general".porter2_r1
    assert_equal "ally", "generally".porter2_r1
    assert_equal "ic", "generic".porter2_r1
    assert_equal "ically", "generically".porter2_r1
    assert_equal "ous", "generous".porter2_r1
    assert_equal "ously", "generously".porter2_r1

    assert_equal "al", "communal".porter2_r1
    assert_equal "ity", "community".porter2_r1
    assert_equal "e", "commune".porter2_r1

    assert_equal "ic", "arsenic".porter2_r1
    assert_equal "al", "arsenal".porter2_r1
  end

  # R2 region of a word.
  def test_R2
    assert_equal "ul", "beautiful".porter2_r2
    assert_equal "", "beauty".porter2_r2
    assert_equal "", "beau".porter2_r2
    assert_equal "adversion", "animadversion".porter2_r2
    assert_equal "", "sprinkled".porter2_r2
    assert_equal "ist", "eucharist".porter2_r2
  end

  # Detection of a word ending in a short syllable.
  def test_ends_with_short_syllable?
    assert_equal true, "rap".porter2_ends_with_short_syllable?
    assert_equal true, "trap".porter2_ends_with_short_syllable?
    assert_equal true, "entrap".porter2_ends_with_short_syllable?
    assert_equal true, "ow".porter2_ends_with_short_syllable?
    assert_equal true, "on".porter2_ends_with_short_syllable?
    assert_equal true, "at".porter2_ends_with_short_syllable?
    assert_equal false, "uproot".porter2_ends_with_short_syllable?
    assert_equal false, "bestow".porter2_ends_with_short_syllable?
    assert_equal false, "disturb".porter2_ends_with_short_syllable?
  end

  # Classification of whole words as short or long.
  def test_is_short_word?
    short_words = %w[bed shed shred hop]
    long_words = %w[bead embed beds]

    short_words.each do |w|
      assert_equal true, w.porter2_is_short_word?,
                   "#{w} should be short but classified as long"
    end
    long_words.each do |w|
      assert_equal false, w.porter2_is_short_word?,
                   "#{w} should be long but classified as short"
    end
  end

  # Step 0: handling of trailing apostrophe suffixes (', 's, 's'), while
  # apostrophes in the middle of the word are left alone.
  def test_step_0
    assert_equal "abac", "abac".porter2_step0
    assert_equal "abac", "abac'".porter2_step0
    assert_equal "abac", "abac's".porter2_step0
    assert_equal "abac", "abac's'".porter2_step0
    assert_equal "ab'c", "ab'c".porter2_step0
    assert_equal "ab'sc", "ab'sc".porter2_step0
    assert_equal "ab's'c", "ab's'c".porter2_step0
    assert_equal "ab'sc", "ab'sc's".porter2_step0
    assert_equal "'", "'".porter2_step0
    assert_equal "'s", "'s".porter2_step0
    assert_equal "'s", "'s'".porter2_step0
  end

  # Step 1a: plural-style endings (-sses, -ies/-ied, -s, -us, -ss).
  def test_step_1a
    assert_equal "abacde", "abacde".porter2_step1a
    assert_equal "abacess", "abacesses".porter2_step1a
    assert_equal "tie", "ties".porter2_step1a
    assert_equal "tie", "tied".porter2_step1a
    assert_equal "cri", "cries".porter2_step1a
    assert_equal "cri", "cried".porter2_step1a
    assert_equal "gas", "gas".porter2_step1a
    assert_equal "this", "this".porter2_step1a
    assert_equal "gap", "gaps".porter2_step1a
    assert_equal "kiwi", "kiwis".porter2_step1a
    assert_equal "abacus", "abacus".porter2_step1a
    assert_equal "abacess", "abacess".porter2_step1a
  end

  # Step 1b: -ed / -ing style endings, in default and english-gb modes.
  #
  # The two modes are expected to agree on every word in +common+ and to
  # differ only for "atomised".
  def test_step_1b
    assert_equal "abacde", "abacde".porter2_step1b

    common = { "luxuriated" => "luxuriate", "luxuriating" => "luxuriate",
               "hopping" => "hop", "hopped" => "hop",
               "hoped" => "hope", "hoping" => "hope",
               "atomized" => "atomize",
               "addicted" => "addict", "bleed" => "bleed" }
    words_non_gb = common.merge("atomised" => "atomis")
    words_gb = common.merge("atomised" => "atomise")

    # The stem is computed once so that the failure message reports exactly
    # the value that was compared, obtained in the same mode as the assertion.
    words_non_gb.each do |original, stemmed|
      actual = original.porter2_step1b
      assert_equal stemmed, actual,
                   "#{original} should have stemmed to #{stemmed} but got #{actual} instead"
    end
    words_gb.each do |original, stemmed|
      actual = original.porter2_step1b(true)
      assert_equal stemmed, actual,
                   "#{original} should have stemmed to #{stemmed} but got #{actual} instead"
    end
  end

  # Step 1c: final y / Y handling.
  def test_step_1c
    assert_equal "cri", "cry".porter2_step1c
    assert_equal "by", "by".porter2_step1c
    assert_equal "saY", "saY".porter2_step1c
    assert_equal "abbeY", "abbeY".porter2_step1c
  end

  # Step 2: suffix rewriting, in default and english-gb modes.
  def test_step_2
    assert_equal "abac", "abac".porter2_step2

    assert_equal "nationalize", "nationalization".porter2_step2
    assert_equal "nationalisate", "nationalisation".porter2_step2
    assert_equal "nationalize", "nationalization".porter2_step2(true)
    assert_equal "nationalise", "nationalisation".porter2_step2(true)
    # Repeat the steps to ensure that the english-gb behaviour isn't sticky
    assert_equal "nationalize", "nationalization".porter2_step2(false)
    assert_equal "nationalisate", "nationalisation".porter2_step2(false)
    assert_equal "nationalize", "nationalization".porter2_step2
    assert_equal "nationalisate", "nationalisation".porter2_step2

    assert_equal "nationalize", "nationalizer".porter2_step2
    assert_equal "nationaliser", "nationaliser".porter2_step2
    assert_equal "nationalize", "nationalizer".porter2_step2(true)
    assert_equal "nationalise", "nationaliser".porter2_step2(true)

    assert_equal "abaction", "abactional".porter2_step2
    assert_equal "abacence", "abacenci".porter2_step2
    assert_equal "abacance", "abacanci".porter2_step2
    assert_equal "abacable", "abacabli".porter2_step2
    assert_equal "abacent", "abacentli".porter2_step2
    assert_equal "abacize", "abacizer".porter2_step2
    assert_equal "abacize", "abacization".porter2_step2
    assert_equal "abacate", "abacational".porter2_step2
    assert_equal "abacate", "abacation".porter2_step2
    assert_equal "abacate", "abacator".porter2_step2
    assert_equal "abacal", "abacalism".porter2_step2
    assert_equal "abacal", "abacaliti".porter2_step2
    assert_equal "abacal", "abacalli".porter2_step2
    assert_equal "abacful", "abacfulness".porter2_step2
    assert_equal "abacous", "abacousli".porter2_step2
    assert_equal "abacous", "abacousness".porter2_step2
    assert_equal "abacive", "abaciveness".porter2_step2
    assert_equal "abacive", "abaciviti".porter2_step2
    assert_equal "abiliti", "abiliti".porter2_step2
    assert_equal "abacble", "abacbiliti".porter2_step2
    assert_equal "abacble", "abacbli".porter2_step2
    assert_equal "abacful", "abacfulli".porter2_step2
    assert_equal "abacless", "abaclessli".porter2_step2
    assert_equal "abaclog", "abaclogi".porter2_step2

    # -li is expected to be removed after c, d, e, g, h, k, m, n, r, t
    # but kept after a.
    assert_equal "abac", "abacli".porter2_step2
    assert_equal "abd", "abdli".porter2_step2
    assert_equal "abe", "abeli".porter2_step2
    assert_equal "abg", "abgli".porter2_step2
    assert_equal "abh", "abhli".porter2_step2
    assert_equal "abk", "abkli".porter2_step2
    assert_equal "abm", "abmli".porter2_step2
    assert_equal "abn", "abnli".porter2_step2
    assert_equal "abr", "abrli".porter2_step2
    assert_equal "abt", "abtli".porter2_step2
    assert_equal "abali", "abali".porter2_step2

    assert_equal "bad", "badli".porter2_step2
    assert_equal "fluentli", "fluentli".porter2_step2
    assert_equal "geolog", "geologi".porter2_step2
  end

  # Step 3: suffix rewriting, in default and english-gb modes.
  def test_step_3
    # Called with the default mode. The optional argument appears to be the
    # english-gb flag (see the true/false calls below), and an empty String
    # would be truthy in Ruby, so it must not be used as a "no argument" value.
    assert_equal "abac", "abac".porter2_step3

    assert_equal "national", "nationalize".porter2_step3
    assert_equal "nationalise", "nationalise".porter2_step3
    assert_equal "national", "nationalise".porter2_step3(true)
    # Repeat the steps to ensure that the english-gb behaviour isn't sticky
    assert_equal "national", "nationalize".porter2_step3(false)
    assert_equal "nationalise", "nationalise".porter2_step3(false)
    assert_equal "national", "nationalize".porter2_step3
    assert_equal "nationalise", "nationalise".porter2_step3

    assert_equal "abaction", "abactional".porter2_step3
    assert_equal "abacate", "abacational".porter2_step3
    assert_equal "abacic", "abacicate".porter2_step3
    assert_equal "abacic", "abaciciti".porter2_step3
    assert_equal "abacic", "abacical".porter2_step3
    assert_equal "abac", "abacful".porter2_step3
    assert_equal "abac", "abacness".porter2_step3

    assert_equal "abacabac", "abacabacative".porter2_step3

    assert_equal "dryness", "dryness".porter2_step3
  end

  # Step 4: suffix removal, in default and english-gb modes.
  def test_step_4
    # Default mode; see the note in test_step_3 about not passing "" here.
    assert_equal "abac", "abac".porter2_step4

    assert_equal "nation", "nationize".porter2_step4
    assert_equal "nationise", "nationise".porter2_step4
    assert_equal "nation", "nationize".porter2_step4(true)
    assert_equal "nation", "nationise".porter2_step4(true)
    # Repeat in default mode to check the english-gb behaviour isn't sticky
    assert_equal "nation", "nationize".porter2_step4(false)
    assert_equal "nationise", "nationise".porter2_step4(false)
    assert_equal "nation", "nationize".porter2_step4
    assert_equal "nationise", "nationise".porter2_step4

    assert_equal "abac", "abacal".porter2_step4
    assert_equal "abac", "abacance".porter2_step4
    assert_equal "abac", "abacence".porter2_step4
    assert_equal "abac", "abacer".porter2_step4
    assert_equal "abac", "abacic".porter2_step4
    assert_equal "abacer", "abacerable".porter2_step4
    assert_equal "abac", "abacible".porter2_step4
    assert_equal "abac", "abacant".porter2_step4
    assert_equal "abac", "abacement".porter2_step4 # Check we handle overlapping suffixes properly
    assert_equal "abacac", "abacacement".porter2_step4
    assert_equal "abacac", "abacacment".porter2_step4
    assert_equal "abac", "abacment".porter2_step4
    assert_equal "abac", "abacent".porter2_step4
    assert_equal "abac", "abacism".porter2_step4
    assert_equal "abac", "abacate".porter2_step4
    assert_equal "abac", "abaciti".porter2_step4
    assert_equal "abac", "abacous".porter2_step4
    assert_equal "abac", "abacive".porter2_step4
    assert_equal "abac", "abacize".porter2_step4
    assert_equal "abacion", "abacion".porter2_step4
    assert_equal "abacs", "abacsion".porter2_step4
    assert_equal "abact", "abaction".porter2_step4
    assert_equal "abction", "abction".porter2_step4
    assert_equal "ablut", "ablution".porter2_step4
    assert_equal "agreement", "agreement".porter2_step4

    assert_equal "abcal", "abcal".porter2_step4 # No removal if suffix isn't in R2
  end

  # Step 5: handling of a final -e and a final -ll.
  def test_step_5
    assert_equal "abac", "abac".porter2_step5

    assert_equal "abacl", "abacll".porter2_step5
    assert_equal "abcll", "abcll".porter2_step5

    assert_equal "abc", "abc".porter2_step5
    assert_equal "abl", "able".porter2_step5
    assert_equal "abe", "abe".porter2_step5
    assert_equal "abac", "abace".porter2_step5
    assert_equal "bawac", "bawace".porter2_step5
  end

  # Postprocessing: any upper-case 'Y' is turned back into 'y'.
  def test_porter2_postprocess
    assert_equal "abac", "abac".porter2_postprocess
    assert_equal "abacy", "abacy".porter2_postprocess
    assert_equal "abacy", "abacY".porter2_postprocess
    assert_equal "aybcy", "aYbcY".porter2_postprocess
    assert_equal "aybcy", "aYbcy".porter2_postprocess
  end
end
metadata ADDED
@@ -0,0 +1,127 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: porter2stemmer
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Neil Smith
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2011-06-30 00:00:00.000000000 +01:00
13
+ default_executable:
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: shoulda
17
+ requirement: &79323000 !ruby/object:Gem::Requirement
18
+ none: false
19
+ requirements:
20
+ - - ! '>='
21
+ - !ruby/object:Gem::Version
22
+ version: '0'
23
+ type: :development
24
+ prerelease: false
25
+ version_requirements: *79323000
26
+ - !ruby/object:Gem::Dependency
27
+ name: bundler
28
+ requirement: &79322760 !ruby/object:Gem::Requirement
29
+ none: false
30
+ requirements:
31
+ - - ~>
32
+ - !ruby/object:Gem::Version
33
+ version: 1.0.0
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: *79322760
37
+ - !ruby/object:Gem::Dependency
38
+ name: jeweler
39
+ requirement: &79322520 !ruby/object:Gem::Requirement
40
+ none: false
41
+ requirements:
42
+ - - ~>
43
+ - !ruby/object:Gem::Version
44
+ version: 1.5.2
45
+ type: :development
46
+ prerelease: false
47
+ version_requirements: *79322520
48
+ - !ruby/object:Gem::Dependency
49
+ name: rcov
50
+ requirement: &79322280 !ruby/object:Gem::Requirement
51
+ none: false
52
+ requirements:
53
+ - - ! '>='
54
+ - !ruby/object:Gem::Version
55
+ version: '0'
56
+ type: :development
57
+ prerelease: false
58
+ version_requirements: *79322280
59
+ description: ! "This is the Porter 2 stemming algorithm, as described at \nhttp://snowball.tartarus.org/algorithms/english/stemmer.html
60
+ It removes suffixes from English words, revealing something closer to the stem of
61
+ the word. For instance \"acknowledgments\".stem => \"acknowledg\"."
62
+ email: neil.github@njae.me.uk
63
+ executables: []
64
+ extensions: []
65
+ extra_rdoc_files:
66
+ - LICENSE.txt
67
+ - README.md
68
+ - README.rdoc
69
+ files:
70
+ - Gemfile
71
+ - Gemfile.lock
72
+ - LICENSE.txt
73
+ - README.md
74
+ - README.rdoc
75
+ - Rakefile
76
+ - VERSION
77
+ - lib/porter2stemmer.rb
78
+ - lib/porter2stemmer/constants.rb
79
+ - lib/porter2stemmer/implementation.rb
80
+ - pkg/porter2stemmer-1.0.0.gem
81
+ - rdoc/Porter2.html
82
+ - rdoc/README_rdoc.html
83
+ - rdoc/String.html
84
+ - rdoc/created.rid
85
+ - rdoc/index.html
86
+ - rdoc/lib/porter2stemmer/constants_rb.html
87
+ - rdoc/lib/porter2stemmer/implementation_rb.html
88
+ - rdoc/lib/porter2stemmer_rb.html
89
+ - rdoc/rdoc.css
90
+ - test/helper.rb
91
+ - test/test_porter2stemmer.rb
92
+ - test/test_porter2stemmer_full.rb
93
+ - test/test_porter2stemmer_parts.rb
94
+ has_rdoc: true
95
+ homepage: http://github.com/NeilNjae/porter2stemmer
96
+ licenses:
97
+ - MIT
98
+ post_install_message:
99
+ rdoc_options: []
100
+ require_paths:
101
+ - lib
102
+ required_ruby_version: !ruby/object:Gem::Requirement
103
+ none: false
104
+ requirements:
105
+ - - ! '>='
106
+ - !ruby/object:Gem::Version
107
+ version: '0'
108
+ segments:
109
+ - 0
110
+ hash: -893266339
111
+ required_rubygems_version: !ruby/object:Gem::Requirement
112
+ none: false
113
+ requirements:
114
+ - - ! '>='
115
+ - !ruby/object:Gem::Version
116
+ version: '0'
117
+ requirements: []
118
+ rubyforge_project:
119
+ rubygems_version: 1.6.2
120
+ signing_key:
121
+ specification_version: 3
122
+ summary: A pure Ruby implementation of the Porter 2 stemmer
123
+ test_files:
124
+ - test/helper.rb
125
+ - test/test_porter2stemmer.rb
126
+ - test/test_porter2stemmer_full.rb
127
+ - test/test_porter2stemmer_parts.rb