porter2stemmer 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,307 @@
1
+ # coding: utf-8
2
+ # Porter 2 stemmer test file
3
+ #
4
+ # This file tests each stage of the stemmer individually.
5
+
6
+ $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
7
+
8
+ require 'test/unit'
9
+ require 'porter2stemmer'
10
+
11
+ class TestPorter2 < Test::Unit::TestCase
12
+
13
+ def test_tidy
14
+ assert_equal "abacde", "abacde".porter2_tidy
15
+ assert_equal "abacde", " abacde ".porter2_tidy
16
+ assert_equal "abacde", "ABACDE".porter2_tidy
17
+ assert_equal "ab'cde", "ab‘cde".porter2_tidy
18
+ assert_equal "ab'cde", "ab’cde".porter2_tidy
19
+ assert_equal "ab'c'de", "ab’c’de".porter2_tidy
20
+ assert_equal "ab'c'de", "ab‘c‘de".porter2_tidy
21
+ assert_equal "''abacde", "’‘abacde".porter2_tidy
22
+ end
23
+
24
+ def test_preprocess
25
+ assert_equal "abacde", "abacde".porter2_preprocess
26
+ assert_equal "abacde", "''abacde".porter2_preprocess
27
+ assert_equal "ab'c'de", "'ab'c'de".porter2_preprocess
28
+ assert_equal "ab'c'de", "''ab'c'de".porter2_preprocess
29
+ assert_equal "Yabac", "yabac".porter2_preprocess
30
+ assert_equal "aYbc", "aybc".porter2_preprocess
31
+ assert_equal "abacdeY", "abacdey".porter2_preprocess
32
+ assert_equal "abaYde", "abayde".porter2_preprocess
33
+ assert_equal "kabaYde", "kabayde".porter2_preprocess
34
+ assert_equal "kabyaYde", "kabyayde".porter2_preprocess
35
+ assert_equal "'", "'''".porter2_preprocess
36
+ end
37
+
38
+ def test_R1
39
+ assert_equal "iful", "beautiful".porter2_r1
40
+ assert_equal "y", "beauty".porter2_r1
41
+ assert_equal "", "beau".porter2_r1
42
+ assert_equal "imadversion", "animadversion".porter2_r1
43
+ assert_equal "kled", "sprinkled".porter2_r1
44
+ assert_equal "harist", "eucharist".porter2_r1
45
+
46
+ # special cases
47
+ assert_equal "ate", "generate".porter2_r1
48
+ assert_equal "ates", "generates".porter2_r1
49
+ assert_equal "ated", "generated".porter2_r1
50
+ assert_equal "al", "general".porter2_r1
51
+ assert_equal "ally", "generally".porter2_r1
52
+ assert_equal "ic", "generic".porter2_r1
53
+ assert_equal "ically", "generically".porter2_r1
54
+ assert_equal "ous", "generous".porter2_r1
55
+ assert_equal "ously", "generously".porter2_r1
56
+
57
+ assert_equal "al", "communal".porter2_r1
58
+ assert_equal "ity", "community".porter2_r1
59
+ assert_equal "e", "commune".porter2_r1
60
+
61
+ assert_equal "ic", "arsenic".porter2_r1
62
+ assert_equal "al", "arsenal".porter2_r1
63
+ end
64
+
65
+ def test_R2
66
+ assert_equal "ul", "beautiful".porter2_r2
67
+ assert_equal "", "beauty".porter2_r2
68
+ assert_equal "", "beau".porter2_r2
69
+ assert_equal "adversion", "animadversion".porter2_r2
70
+ assert_equal "", "sprinkled".porter2_r2
71
+ assert_equal "ist", "eucharist".porter2_r2
72
+ end
73
+
74
+ def test_ends_with_short_syllable?
75
+ assert_equal true, "rap".porter2_ends_with_short_syllable?
76
+ assert_equal true, "trap".porter2_ends_with_short_syllable?
77
+ assert_equal true, "entrap".porter2_ends_with_short_syllable?
78
+ assert_equal true, "ow".porter2_ends_with_short_syllable?
79
+ assert_equal true, "on".porter2_ends_with_short_syllable?
80
+ assert_equal true, "at".porter2_ends_with_short_syllable?
81
+ assert_equal false, "uproot".porter2_ends_with_short_syllable?
82
+ assert_equal false, "bestow".porter2_ends_with_short_syllable?
83
+ assert_equal false, "disturb".porter2_ends_with_short_syllable?
84
+ end
85
+
86
+ def test_is_short_word?
87
+ short_words = %w[ bed shed shred hop ]
88
+ long_words = %w[ bead embed beds ]
89
+ short_words.each do |w|
90
+ r1 = w.porter2_r1
91
+ assert_equal true, w.porter2_is_short_word?,
92
+ "#{w} should be short but classified as long"
93
+ end
94
+ long_words.each do |w|
95
+ r1 = w.porter2_r1
96
+ assert_equal false, w.porter2_is_short_word?,
97
+ "#{w} should be long but classified as short"
98
+ end
99
+ end
100
+
101
+ def test_step_0
102
+ assert_equal "abac", "abac".porter2_step0
103
+ assert_equal "abac", "abac'".porter2_step0
104
+ assert_equal "abac", "abac's".porter2_step0
105
+ assert_equal "abac", "abac's'".porter2_step0
106
+ assert_equal "ab'c", "ab'c".porter2_step0
107
+ assert_equal "ab'sc", "ab'sc".porter2_step0
108
+ assert_equal "ab's'c", "ab's'c".porter2_step0
109
+ assert_equal "ab'sc", "ab'sc's".porter2_step0
110
+ assert_equal "'", "'".porter2_step0
111
+ assert_equal "'s", "'s".porter2_step0
112
+ assert_equal "'s", "'s'".porter2_step0
113
+ end
114
+
115
+ def test_step_1a
116
+ assert_equal "abacde", "abacde".porter2_step1a
117
+ assert_equal "abacess", "abacesses".porter2_step1a
118
+ assert_equal "tie", "ties".porter2_step1a
119
+ assert_equal "tie", "tied".porter2_step1a
120
+ assert_equal "cri", "cries".porter2_step1a
121
+ assert_equal "cri", "cried".porter2_step1a
122
+ assert_equal "gas", "gas".porter2_step1a
123
+ assert_equal "this", "this".porter2_step1a
124
+ assert_equal "gap", "gaps".porter2_step1a
125
+ assert_equal "kiwi", "kiwis".porter2_step1a
126
+ assert_equal "abacus", "abacus".porter2_step1a
127
+ assert_equal "abacess", "abacess".porter2_step1a
128
+ end
129
+
130
+ def test_step_1b
131
+ assert_equal "abacde", "abacde".porter2_step1b
132
+ words_non_gb = {"luxuriated" => "luxuriate", "luxuriating" => "luxuriate",
133
+ "hopping" => "hop", "hopped" => "hop",
134
+ "hoped" => "hope", "hoping" => "hope",
135
+ "atomized" => "atomize", "atomised" => "atomis",
136
+ "addicted" => "addict", "bleed" => "bleed" }
137
+ words_non_gb.each do |original, stemmed|
138
+ assert_equal stemmed, original.porter2_step1b,
139
+ "#{original} should have stemmed to #{stemmed} but got #{original.porter2_step1b(original.porter2_r1)} instead"
140
+ end
141
+ words_gb = {"luxuriated" => "luxuriate", "luxuriating" => "luxuriate",
142
+ "hopping" => "hop", "hopped" => "hop",
143
+ "hoped" => "hope", "hoping" => "hope",
144
+ "atomized" => "atomize", "atomised" => "atomise",
145
+ "addicted" => "addict", "bleed" => "bleed" }
146
+ words_gb.each do |original, stemmed|
147
+ assert_equal stemmed, original.porter2_step1b(true),
148
+ "#{original} should have stemmed to #{stemmed} but got #{original.porter2_step1b(original.porter2_r1)} instead"
149
+ end
150
+ end
151
+
152
+ def test_step_1c
153
+ assert_equal "cri", "cry".porter2_step1c
154
+ assert_equal "by", "by".porter2_step1c
155
+ assert_equal "saY", "saY".porter2_step1c
156
+ assert_equal "abbeY", "abbeY".porter2_step1c
157
+ end
158
+
159
+ def test_step_2
160
+ assert_equal "abac", "abac".porter2_step2
161
+
162
+ assert_equal "nationalize", "nationalization".porter2_step2
163
+ assert_equal "nationalisate", "nationalisation".porter2_step2
164
+ assert_equal "nationalize", "nationalization".porter2_step2(true)
165
+ assert_equal "nationalise", "nationalisation".porter2_step2(true)
166
+ # Repeat the steps to ensure that the english-gb behaviour isn't sticky
167
+ assert_equal "nationalize", "nationalization".porter2_step2(false)
168
+ assert_equal "nationalisate", "nationalisation".porter2_step2(false)
169
+ assert_equal "nationalize", "nationalization".porter2_step2
170
+ assert_equal "nationalisate", "nationalisation".porter2_step2
171
+
172
+ assert_equal "nationalize", "nationalizer".porter2_step2
173
+ assert_equal "nationaliser", "nationaliser".porter2_step2
174
+ assert_equal "nationalize", "nationalizer".porter2_step2(true)
175
+ assert_equal "nationalise", "nationaliser".porter2_step2(true)
176
+
177
+ assert_equal "abaction", "abactional".porter2_step2
178
+ assert_equal "abacence", "abacenci".porter2_step2
179
+ assert_equal "abacance", "abacanci".porter2_step2
180
+ assert_equal "abacable", "abacabli".porter2_step2
181
+ assert_equal "abacent", "abacentli".porter2_step2
182
+ assert_equal "abacize", "abacizer".porter2_step2
183
+ assert_equal "abacize", "abacization".porter2_step2
184
+ assert_equal "abacate", "abacational".porter2_step2
185
+ assert_equal "abacate", "abacation".porter2_step2
186
+ assert_equal "abacate", "abacator".porter2_step2
187
+ assert_equal "abacal", "abacalism".porter2_step2
188
+ assert_equal "abacal", "abacaliti".porter2_step2
189
+ assert_equal "abacal", "abacalli".porter2_step2
190
+ assert_equal "abacful", "abacfulness".porter2_step2
191
+ assert_equal "abacous", "abacousli".porter2_step2
192
+ assert_equal "abacous", "abacousness".porter2_step2
193
+ assert_equal "abacive", "abaciveness".porter2_step2
194
+ assert_equal "abacive", "abaciviti".porter2_step2
195
+ assert_equal "abiliti", "abiliti".porter2_step2
196
+ assert_equal "abacble", "abacbiliti".porter2_step2
197
+ assert_equal "abacble", "abacbli".porter2_step2
198
+ assert_equal "abacful", "abacfulli".porter2_step2
199
+ assert_equal "abacless", "abaclessli".porter2_step2
200
+ assert_equal "abaclog", "abaclogi".porter2_step2
201
+
202
+ assert_equal "abac", "abacli".porter2_step2
203
+ assert_equal "abd", "abdli".porter2_step2
204
+ assert_equal "abe", "abeli".porter2_step2
205
+ assert_equal "abg", "abgli".porter2_step2
206
+ assert_equal "abh", "abhli".porter2_step2
207
+ assert_equal "abk", "abkli".porter2_step2
208
+ assert_equal "abm", "abmli".porter2_step2
209
+ assert_equal "abn", "abnli".porter2_step2
210
+ assert_equal "abr", "abrli".porter2_step2
211
+ assert_equal "abt", "abtli".porter2_step2
212
+ assert_equal "abali", "abali".porter2_step2
213
+
214
+ assert_equal "bad", "badli".porter2_step2
215
+ assert_equal "fluentli", "fluentli".porter2_step2
216
+ assert_equal "geolog", "geologi".porter2_step2
217
+ end
218
+
219
+ def test_step_3
220
+ assert_equal "abac", "abac".porter2_step3("")
221
+
222
+ assert_equal "national", "nationalize".porter2_step3
223
+ assert_equal "nationalise", "nationalise".porter2_step3
224
+ assert_equal "national", "nationalise".porter2_step3(true)
225
+ # Repeat the steps to ensure that the english-gb behaviour isn't sticky
226
+ assert_equal "national", "nationalize".porter2_step3(false)
227
+ assert_equal "nationalise", "nationalise".porter2_step3(false)
228
+ assert_equal "national", "nationalize".porter2_step3
229
+ assert_equal "nationalise", "nationalise".porter2_step3
230
+
231
+ assert_equal "abaction", "abactional".porter2_step3
232
+ assert_equal "abacate", "abacational".porter2_step3
233
+ assert_equal "abacic", "abacicate".porter2_step3
234
+ assert_equal "abacic", "abaciciti".porter2_step3
235
+ assert_equal "abacic", "abacical".porter2_step3
236
+ assert_equal "abac", "abacful".porter2_step3
237
+ assert_equal "abac", "abacness".porter2_step3
238
+
239
+ assert_equal "abacabac", "abacabacative".porter2_step3
240
+ assert_equal "abacabac", "abacabacative".porter2_step3
241
+
242
+ assert_equal "dryness", "dryness".porter2_step3
243
+ end
244
+
245
+ def test_step_4
246
+ assert_equal "abac", "abac".porter2_step4("")
247
+
248
+ assert_equal "nation", "nationize".porter2_step4
249
+ assert_equal "nationise", "nationise".porter2_step4
250
+ assert_equal "nation", "nationize".porter2_step4(true)
251
+ assert_equal "nation", "nationise".porter2_step4(true)
252
+ assert_equal "nation", "nationize".porter2_step4(false)
253
+ assert_equal "nationise", "nationise".porter2_step4(false)
254
+ assert_equal "nation", "nationize".porter2_step4()
255
+ assert_equal "nationise", "nationise".porter2_step4()
256
+
257
+ assert_equal "abac", "abacal".porter2_step4
258
+ assert_equal "abac", "abacance".porter2_step4
259
+ assert_equal "abac", "abacence".porter2_step4
260
+ assert_equal "abac", "abacer".porter2_step4
261
+ assert_equal "abac", "abacic".porter2_step4
262
+ assert_equal "abacer", "abacerable".porter2_step4
263
+ assert_equal "abac", "abacible".porter2_step4
264
+ assert_equal "abac", "abacant".porter2_step4
265
+ assert_equal "abac", "abacement".porter2_step4 # Check we handle overlapping suffixes properly
266
+ assert_equal "abacac", "abacacement".porter2_step4
267
+ assert_equal "abacac", "abacacment".porter2_step4
268
+ assert_equal "abac", "abacment".porter2_step4
269
+ assert_equal "abac", "abacent".porter2_step4
270
+ assert_equal "abac", "abacism".porter2_step4
271
+ assert_equal "abac", "abacate".porter2_step4
272
+ assert_equal "abac", "abaciti".porter2_step4
273
+ assert_equal "abac", "abacous".porter2_step4
274
+ assert_equal "abac", "abacive".porter2_step4
275
+ assert_equal "abac", "abacize".porter2_step4
276
+ assert_equal "abacion", "abacion".porter2_step4
277
+ assert_equal "abacs", "abacsion".porter2_step4
278
+ assert_equal "abact", "abaction".porter2_step4
279
+ assert_equal "abction", "abction".porter2_step4
280
+ assert_equal "ablut", "ablution".porter2_step4
281
+ assert_equal "agreement", "agreement".porter2_step4
282
+
283
+ assert_equal "abcal", "abcal".porter2_step4 # No removal if suffix isn't in R2
284
+ end
285
+
286
+ def test_step_5
287
+ assert_equal "abac", "abac".porter2_step5
288
+
289
+ assert_equal "abacl", "abacll".porter2_step5
290
+ assert_equal "abcll", "abcll".porter2_step5
291
+
292
+ assert_equal "abc", "abc".porter2_step5
293
+ assert_equal "abl", "able".porter2_step5
294
+ assert_equal "abe", "abe".porter2_step5
295
+ assert_equal "abac", "abace".porter2_step5
296
+ assert_equal "bawac", "bawace".porter2_step5
297
+ end
298
+
299
+ def test_porter2_postprocess
300
+ assert_equal "abac", "abac".porter2_postprocess
301
+ assert_equal "abacy", "abacy".porter2_postprocess
302
+ assert_equal "abacy", "abacY".porter2_postprocess
303
+ assert_equal "aybcy", "aYbcY".porter2_postprocess
304
+ assert_equal "aybcy", "aYbcy".porter2_postprocess
305
+ end
306
+
307
+ end
metadata ADDED
@@ -0,0 +1,127 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: porter2stemmer
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Neil Smith
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2011-06-30 00:00:00.000000000 +01:00
13
+ default_executable:
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: shoulda
17
+ requirement: &79323000 !ruby/object:Gem::Requirement
18
+ none: false
19
+ requirements:
20
+ - - ! '>='
21
+ - !ruby/object:Gem::Version
22
+ version: '0'
23
+ type: :development
24
+ prerelease: false
25
+ version_requirements: *79323000
26
+ - !ruby/object:Gem::Dependency
27
+ name: bundler
28
+ requirement: &79322760 !ruby/object:Gem::Requirement
29
+ none: false
30
+ requirements:
31
+ - - ~>
32
+ - !ruby/object:Gem::Version
33
+ version: 1.0.0
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: *79322760
37
+ - !ruby/object:Gem::Dependency
38
+ name: jeweler
39
+ requirement: &79322520 !ruby/object:Gem::Requirement
40
+ none: false
41
+ requirements:
42
+ - - ~>
43
+ - !ruby/object:Gem::Version
44
+ version: 1.5.2
45
+ type: :development
46
+ prerelease: false
47
+ version_requirements: *79322520
48
+ - !ruby/object:Gem::Dependency
49
+ name: rcov
50
+ requirement: &79322280 !ruby/object:Gem::Requirement
51
+ none: false
52
+ requirements:
53
+ - - ! '>='
54
+ - !ruby/object:Gem::Version
55
+ version: '0'
56
+ type: :development
57
+ prerelease: false
58
+ version_requirements: *79322280
59
+ description: ! "This is the Porter 2 stemming algorithm, as described at \nhttp://snowball.tartarus.org/algorithms/english/stemmer.html
60
+ It removes suffixes from English words, revealing something closer to the stem of
61
+ the word. For instance \"acknowledgments\".stem => \"acknowledg\"."
62
+ email: neil.github@njae.me.uk
63
+ executables: []
64
+ extensions: []
65
+ extra_rdoc_files:
66
+ - LICENSE.txt
67
+ - README.md
68
+ - README.rdoc
69
+ files:
70
+ - Gemfile
71
+ - Gemfile.lock
72
+ - LICENSE.txt
73
+ - README.md
74
+ - README.rdoc
75
+ - Rakefile
76
+ - VERSION
77
+ - lib/porter2stemmer.rb
78
+ - lib/porter2stemmer/constants.rb
79
+ - lib/porter2stemmer/implementation.rb
80
+ - pkg/porter2stemmer-1.0.0.gem
81
+ - rdoc/Porter2.html
82
+ - rdoc/README_rdoc.html
83
+ - rdoc/String.html
84
+ - rdoc/created.rid
85
+ - rdoc/index.html
86
+ - rdoc/lib/porter2stemmer/constants_rb.html
87
+ - rdoc/lib/porter2stemmer/implementation_rb.html
88
+ - rdoc/lib/porter2stemmer_rb.html
89
+ - rdoc/rdoc.css
90
+ - test/helper.rb
91
+ - test/test_porter2stemmer.rb
92
+ - test/test_porter2stemmer_full.rb
93
+ - test/test_porter2stemmer_parts.rb
94
+ has_rdoc: true
95
+ homepage: http://github.com/NeilNjae/porter2stemmer
96
+ licenses:
97
+ - MIT
98
+ post_install_message:
99
+ rdoc_options: []
100
+ require_paths:
101
+ - lib
102
+ required_ruby_version: !ruby/object:Gem::Requirement
103
+ none: false
104
+ requirements:
105
+ - - ! '>='
106
+ - !ruby/object:Gem::Version
107
+ version: '0'
108
+ segments:
109
+ - 0
110
+ hash: -893266339
111
+ required_rubygems_version: !ruby/object:Gem::Requirement
112
+ none: false
113
+ requirements:
114
+ - - ! '>='
115
+ - !ruby/object:Gem::Version
116
+ version: '0'
117
+ requirements: []
118
+ rubyforge_project:
119
+ rubygems_version: 1.6.2
120
+ signing_key:
121
+ specification_version: 3
122
+ summary: A pure Ruby implementation of the Porter 2 stemmer
123
+ test_files:
124
+ - test/helper.rb
125
+ - test/test_porter2stemmer.rb
126
+ - test/test_porter2stemmer_full.rb
127
+ - test/test_porter2stemmer_parts.rb