porter2stemmer 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +13 -0
- data/Gemfile.lock +20 -0
- data/LICENSE.txt +20 -0
- data/README.md +58 -0
- data/README.rdoc +67 -0
- data/Rakefile +54 -0
- data/VERSION +1 -0
- data/lib/porter2stemmer/constants.rb +114 -0
- data/lib/porter2stemmer/implementation.rb +326 -0
- data/lib/porter2stemmer.rb +7 -0
- data/pkg/porter2stemmer-1.0.0.gem +0 -0
- data/rdoc/Porter2.html +249 -0
- data/rdoc/README_rdoc.html +204 -0
- data/rdoc/String.html +1142 -0
- data/rdoc/created.rid +6 -0
- data/rdoc/index.html +94 -0
- data/rdoc/lib/porter2stemmer/constants_rb.html +55 -0
- data/rdoc/lib/porter2stemmer/implementation_rb.html +55 -0
- data/rdoc/lib/porter2stemmer_rb.html +59 -0
- data/rdoc/rdoc.css +706 -0
- data/test/helper.rb +7 -0
- data/test/test_porter2stemmer.rb +10 -0
- data/test/test_porter2stemmer_full.rb +29533 -0
- data/test/test_porter2stemmer_parts.rb +307 -0
- metadata +127 -0
data/rdoc/Porter2.html
ADDED
@@ -0,0 +1,249 @@
|
|
1
|
+
<?xml version="1.0" encoding="utf-8"?>
|
2
|
+
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
|
3
|
+
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
|
4
|
+
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
|
5
|
+
<head>
|
6
|
+
<meta content="text/html; charset=utf-8" http-equiv="Content-Type" />
|
7
|
+
|
8
|
+
<title>Module: Porter2</title>
|
9
|
+
|
10
|
+
<link rel="stylesheet" href="./rdoc.css" type="text/css" media="screen" />
|
11
|
+
|
12
|
+
<script src="./js/jquery.js" type="text/javascript"
|
13
|
+
charset="utf-8"></script>
|
14
|
+
<script src="./js/thickbox-compressed.js" type="text/javascript"
|
15
|
+
charset="utf-8"></script>
|
16
|
+
<script src="./js/quicksearch.js" type="text/javascript"
|
17
|
+
charset="utf-8"></script>
|
18
|
+
<script src="./js/darkfish.js" type="text/javascript"
|
19
|
+
charset="utf-8"></script>
|
20
|
+
|
21
|
+
</head>
|
22
|
+
<body class="module">
|
23
|
+
|
24
|
+
<div id="metadata">
|
25
|
+
<div id="home-metadata">
|
26
|
+
<div id="home-section" class="section">
|
27
|
+
<h3 class="section-header">
|
28
|
+
<a href="./index.html">Home</a>
|
29
|
+
<a href="./index.html#classes">Classes</a>
|
30
|
+
<a href="./index.html#methods">Methods</a>
|
31
|
+
</h3>
|
32
|
+
</div>
|
33
|
+
</div>
|
34
|
+
|
35
|
+
<div id="file-metadata">
|
36
|
+
<div id="file-list-section" class="section">
|
37
|
+
<h3 class="section-header">In Files</h3>
|
38
|
+
<div class="section-body">
|
39
|
+
<ul>
|
40
|
+
|
41
|
+
<li><a href="./lib/porter2stemmer/constants_rb.html?TB_iframe=true&amp;height=550&amp;width=785"
|
42
|
+
class="thickbox" title="lib/porter2stemmer/constants.rb">lib/porter2stemmer/constants.rb</a></li>
|
43
|
+
|
44
|
+
</ul>
|
45
|
+
</div>
|
46
|
+
</div>
|
47
|
+
|
48
|
+
|
49
|
+
</div>
|
50
|
+
|
51
|
+
<div id="class-metadata">
|
52
|
+
|
53
|
+
<!-- Parent Class -->
|
54
|
+
|
55
|
+
|
56
|
+
<!-- Namespace Contents -->
|
57
|
+
|
58
|
+
|
59
|
+
<!-- Method Quickref -->
|
60
|
+
|
61
|
+
|
62
|
+
<!-- Included Modules -->
|
63
|
+
|
64
|
+
</div>
|
65
|
+
|
66
|
+
<div id="project-metadata">
|
67
|
+
|
68
|
+
|
69
|
+
<div id="fileindex-section" class="section project-section">
|
70
|
+
<h3 class="section-header">Files</h3>
|
71
|
+
<ul>
|
72
|
+
|
73
|
+
<li class="file"><a href="./README_rdoc.html">README.rdoc</a></li>
|
74
|
+
|
75
|
+
</ul>
|
76
|
+
</div>
|
77
|
+
|
78
|
+
|
79
|
+
<div id="classindex-section" class="section project-section">
|
80
|
+
<h3 class="section-header">Class Index
|
81
|
+
<span class="search-toggle"><img src="./images/find.png"
|
82
|
+
height="16" width="16" alt="[+]"
|
83
|
+
title="show/hide quicksearch" /></span></h3>
|
84
|
+
<form action="#" method="get" accept-charset="utf-8" class="initially-hidden">
|
85
|
+
<fieldset>
|
86
|
+
<legend>Quicksearch</legend>
|
87
|
+
<input type="text" name="quicksearch" value=""
|
88
|
+
class="quicksearch-field" />
|
89
|
+
</fieldset>
|
90
|
+
</form>
|
91
|
+
|
92
|
+
<ul class="link-list">
|
93
|
+
|
94
|
+
<li><a href="./Porter2.html">Porter2</a></li>
|
95
|
+
|
96
|
+
<li><a href="./String.html">String</a></li>
|
97
|
+
|
98
|
+
</ul>
|
99
|
+
<div id="no-class-search-results" style="display: none;">No matching classes.</div>
|
100
|
+
</div>
|
101
|
+
|
102
|
+
|
103
|
+
</div>
|
104
|
+
</div>
|
105
|
+
|
106
|
+
<div id="documentation">
|
107
|
+
<h1 class="module">Porter2</h1>
|
108
|
+
|
109
|
+
<div id="description">
|
110
|
+
<p>
|
111
|
+
Constants for the Porter 2 stemmer
|
112
|
+
</p>
|
113
|
+
|
114
|
+
</div>
|
115
|
+
|
116
|
+
<!-- Constants -->
|
117
|
+
|
118
|
+
<div id="constants-list" class="section">
|
119
|
+
<h3 class="section-header">Constants</h3>
|
120
|
+
<dl>
|
121
|
+
|
122
|
+
<dt><a name="C">C</a></dt>
|
123
|
+
|
124
|
+
<dd class="description"><p>
|
125
|
+
A non-vowel
|
126
|
+
</p></dd>
|
127
|
+
|
128
|
+
|
129
|
+
<dt><a name="V">V</a></dt>
|
130
|
+
|
131
|
+
<dd class="description"><p>
|
132
|
+
A vowel: a e i o u y
|
133
|
+
</p></dd>
|
134
|
+
|
135
|
+
|
136
|
+
<dt><a name="CW">CW</a></dt>
|
137
|
+
|
138
|
+
<dd class="description"><p>
|
139
|
+
A non-vowel other than w, x, or Y
|
140
|
+
</p></dd>
|
141
|
+
|
142
|
+
|
143
|
+
<dt><a name="Double">Double</a></dt>
|
144
|
+
|
145
|
+
<dd class="description"><p>
|
146
|
+
Doubles created when adding a suffix: these are undoubled when stemmed
|
147
|
+
</p></dd>
|
148
|
+
|
149
|
+
|
150
|
+
<dt><a name="Valid_LI">Valid_LI</a></dt>
|
151
|
+
|
152
|
+
<dd class="description"><p>
|
153
|
+
A valid letter that can come before ‘li’ (or ‘ly’)
|
154
|
+
</p></dd>
|
155
|
+
|
156
|
+
|
157
|
+
<dt><a name="SHORT_SYLLABLE">SHORT_SYLLABLE</a></dt>
|
158
|
+
|
159
|
+
<dd class="description"><p>
|
160
|
+
A specification for a short syllable.
|
161
|
+
</p>
|
162
|
+
<p>
|
163
|
+
A short syllable in a word is either:
|
164
|
+
</p>
|
165
|
+
<ol>
|
166
|
+
<li><p>
|
167
|
+
a vowel followed by a non-vowel other than w, x or Y and preceded by a
|
168
|
+
non-vowel, or
|
169
|
+
</p>
|
170
|
+
</li>
|
171
|
+
<li><p>
|
172
|
+
a vowel at the beginning of the word followed by a non-vowel.
|
173
|
+
</p>
|
174
|
+
</li>
|
175
|
+
</ol>
|
176
|
+
<p>
|
177
|
+
(The original document is silent on whether sequences of two or more
|
178
|
+
non-vowels make a
|
179
|
syllable long. But as this specification is only used to
|
180
|
+
find sequences of non-vowel -
|
1
181
|
vowel - non-vowel - end-of-word, this
|
182
|
+
ambiguity does not have an effect.)
|
183
|
+
</p></dd>
|
184
|
+
|
185
|
+
|
186
|
+
<dt><a name="STEP_2_MAPS">STEP_2_MAPS</a></dt>
|
187
|
+
|
188
|
+
<dd class="description"><p>
|
189
|
+
Suffix transformations used in porter2_step2.
|
2
190
|
(ogi, li endings dealt with
|
191
|
+
in procedure)
|
192
|
+
</p></dd>
|
193
|
+
|
194
|
+
|
195
|
+
<dt><a name="STEP_3_MAPS">STEP_3_MAPS</a></dt>
|
196
|
+
|
197
|
+
<dd class="description"><p>
|
198
|
+
Suffix transformations used in porter2_step3.
|
3
199
|
(ative ending dealt with in
|
200
|
+
procedure)
|
201
|
+
</p></dd>
|
202
|
+
|
203
|
+
|
204
|
+
<dt><a name="STEP_4_MAPS">STEP_4_MAPS</a></dt>
|
205
|
+
|
206
|
+
<dd class="description"><p>
|
207
|
+
Suffix transformations used in porter2_step4.
|
4
208
|
(ion ending dealt with in
|
209
|
+
procedure)
|
210
|
+
</p></dd>
|
211
|
+
|
212
|
+
|
213
|
+
<dt><a name="SPECIAL_CASES">SPECIAL_CASES</a></dt>
|
214
|
+
|
215
|
+
<dd class="description"><p>
|
216
|
+
Special-case stemmings
|
217
|
+
</p></dd>
|
218
|
+
|
219
|
+
|
220
|
+
<dt><a name="STEP_1A_SPECIAL_CASES">STEP_1A_SPECIAL_CASES</a></dt>
|
221
|
+
|
222
|
+
<dd class="description"><p>
|
223
|
+
Special case words to stop processing after step 1a.
|
224
|
+
</p></dd>
|
225
|
+
|
226
|
+
|
227
|
+
</dl>
|
228
|
+
</div>
|
229
|
+
|
230
|
+
|
231
|
+
<!-- Attributes -->
|
232
|
+
|
233
|
+
|
234
|
+
<!-- Methods -->
|
235
|
+
|
236
|
+
|
237
|
+
</div>
|
238
|
+
|
239
|
+
|
240
|
+
<div id="rdoc-debugging-section-dump" class="debugging-section">
|
241
|
+
|
242
|
+
<p>Disabled; run with --debug to generate this.</p>
|
243
|
+
|
244
|
+
</div>
|
245
|
+
|
246
|
+
<div id="validator-badges">
|
247
|
+
<p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p>
|
248
|
+
<p><small>Generated with the <a href="http://deveiate.org/projects/Darkfish-Rdoc/">Darkfish
|
249
|
+
Rdoc Generator</a> 1.1.6</small>.</p>
|
250
|
+
</div>
|
251
|
+
|
252
|
+
</body>
|
253
|
+
</html>
|
254
|
+
|
data/rdoc/README_rdoc.html
ADDED
@@ -0,0 +1,204 @@
|
|
1
|
+
<?xml version="1.0" encoding="utf-8"?>
|
2
|
+
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
|
3
|
+
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
|
4
|
+
|
5
|
+
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
|
6
|
+
<head>
|
7
|
+
<meta content="text/html; charset=utf-8" http-equiv="Content-Type" />
|
8
|
+
|
9
|
+
<title>File: README.rdoc [porter2stemmer 1.0.0]</title>
|
10
|
+
|
11
|
+
<link type="text/css" media="screen" href="./rdoc.css" rel="stylesheet" />
|
12
|
+
|
13
|
+
<script src="./js/jquery.js" type="text/javascript"
|
14
|
+
charset="utf-8"></script>
|
15
|
+
<script src="./js/thickbox-compressed.js" type="text/javascript"
|
16
|
+
charset="utf-8"></script>
|
17
|
+
<script src="./js/quicksearch.js" type="text/javascript"
|
18
|
+
charset="utf-8"></script>
|
19
|
+
<script src="./js/darkfish.js" type="text/javascript"
|
20
|
+
charset="utf-8"></script>
|
21
|
+
</head>
|
22
|
+
|
23
|
+
<body class="file">
|
24
|
+
<div id="metadata">
|
25
|
+
<div id="home-metadata">
|
26
|
+
<div id="home-section" class="section">
|
27
|
+
<h3 class="section-header">
|
28
|
+
<a href="./index.html">Home</a>
|
29
|
+
<a href="./index.html#classes">Classes</a>
|
30
|
+
<a href="./index.html#methods">Methods</a>
|
31
|
+
</h3>
|
32
|
+
</div>
|
33
|
+
</div>
|
34
|
+
|
35
|
+
<div id="project-metadata">
|
36
|
+
|
37
|
+
|
38
|
+
<div id="fileindex-section" class="section project-section">
|
39
|
+
<h3 class="section-header">Files</h3>
|
40
|
+
<ul>
|
41
|
+
|
42
|
+
<li class="file"><a href="./README_rdoc.html">README.rdoc</a></li>
|
43
|
+
|
44
|
+
</ul>
|
45
|
+
</div>
|
46
|
+
|
47
|
+
|
48
|
+
<div id="classindex-section" class="section project-section">
|
49
|
+
<h3 class="section-header">Class Index
|
50
|
+
<span class="search-toggle"><img src="./images/find.png"
|
51
|
+
height="16" width="16" alt="[+]"
|
52
|
+
title="show/hide quicksearch" /></span></h3>
|
53
|
+
<form action="#" method="get" accept-charset="utf-8" class="initially-hidden">
|
54
|
+
<fieldset>
|
55
|
+
<legend>Quicksearch</legend>
|
56
|
+
<input type="text" name="quicksearch" value=""
|
57
|
+
class="quicksearch-field" />
|
58
|
+
</fieldset>
|
59
|
+
</form>
|
60
|
+
|
61
|
+
<ul class="link-list">
|
62
|
+
|
63
|
+
<li><a href="./Porter2.html">Porter2</a></li>
|
64
|
+
|
65
|
+
<li><a href="./String.html">String</a></li>
|
66
|
+
|
67
|
+
</ul>
|
68
|
+
<div id="no-class-search-results" style="display: none;">No matching classes.</div>
|
69
|
+
</div>
|
70
|
+
|
71
|
+
|
72
|
+
</div>
|
73
|
+
</div>
|
74
|
+
|
75
|
+
<div id="documentation">
|
76
|
+
<h1>porter2stemmer</h1>
|
77
|
+
<h2>The Porter 2 stemmer</h2>
|
78
|
+
<p>
|
79
|
+
This is the Porter 2 stemming algorithm, as described at <a
|
80
|
+
href="http://snowball.tartarus.org/algorithms/english/stemmer.html">snowball.tartarus.org/algorithms/english/stemmer.html</a>.
|
81
|
+
The original paper is:
|
82
|
+
</p>
|
83
|
+
<p>
|
84
|
+
Porter, 1980, “An algorithm for suffix stripping”,
|
85
|
+
<em>Program</em>, Vol. 14, no. 3, pp 130-137
|
86
|
+
</p>
|
87
|
+
<h2>Features of this implementation</h2>
|
88
|
+
<p>
|
89
|
+
This stemmer is written in pure Ruby, making it easy to modify for language
|
90
|
+
variants. For instance, the original Porter stemmer only works for
|
91
|
+
American English and does not recognise British English’s
|
92
|
+
’-ise’ as an alternate spelling of ’-ize’. This
|
93
|
+
implementation has been extended to handle British English correctly.
|
94
|
+
</p>
|
95
|
+
<p>
|
96
|
+
This stemmer also features a comprehensive test set of over 29,000 words,
|
97
|
+
taken from the <a
|
98
|
+
href="http://snowball.tartarus.org/algorithms/english/stemmer.html">Porter
|
99
|
+
2 stemmer website</a>.
|
100
|
+
</p>
|
101
|
+
<h2>Files</h2>
|
102
|
+
<p>
|
103
|
+
Constants for the stemmer are in the <a href="Porter2.html">Porter2</a>
|
104
|
+
module.
|
105
|
+
</p>
|
106
|
+
<p>
|
107
|
+
Procedures that implement the stemmer are added to the <a
|
108
|
+
href="String.html">String</a> class.
|
109
|
+
</p>
|
110
|
+
<p>
|
111
|
+
The stemmer algorithm is implemented in the <a
|
112
|
+
href="String.html#method-i-porter2_stem">String#porter2_stem</a> procedure.
|
113
|
+
</p>
|
114
|
+
<h2>Internationalisation</h2>
|
115
|
+
<p>
|
116
|
+
There isn’t much, as this is a stemmer that only works for English.
|
117
|
+
</p>
|
118
|
+
<p>
|
119
|
+
The <tt>gb_english</tt> flag to the various procedures allows the stemmer
|
120
|
+
to treat the British English ’-ise’ the same as the American
|
121
|
+
English ’-ize’.
|
122
|
+
</p>
|
123
|
+
<h2>Longest suffixes</h2>
|
124
|
+
<p>
|
125
|
+
Several places in the algorithm require matching the longest suffix of a
|
126
|
+
word. The regexp engine in Ruby 1.9 seems to handle alternatives in regexps
|
127
|
+
by finding the alternative that matches at the first position in the
|
128
|
+
string. As we’re only talking about suffixes, that first match is
|
129
|
+
also the longest suffix. If the regexp engine changes, this behaviour may
|
130
|
+
change and break the stemmer.
|
131
|
+
</p>
|
132
|
+
<h2>Usage</h2>
|
133
|
+
<p>
|
134
|
+
Call the <a
|
135
|
+
href="String.html#method-i-porter2_stem">String#porter2_stem</a> or <a
|
136
|
+
href="String.html#method-i-stem">String#stem</a> methods on a string to
|
137
|
+
return its stem:
|
138
|
+
</p>
|
139
|
+
<pre>
|
140
|
+
"consistency".stem # => "consist"
|
141
|
+
"knitting".stem # => "knit"
|
142
|
+
"articulated".stem # => "articul"
|
143
|
+
"nationalize".stem # => "nation"
|
144
|
+
"nationalise".stem # => "nationalis"
|
145
|
+
"nationalise".stem(true) # => "nation"
|
146
|
+
</pre>
|
147
|
+
<h2>Author</h2>
|
148
|
+
<p>
|
149
|
+
The Porter 2 stemming algorithm was developed by <a
|
150
|
+
href="http://snowball.tartarus.org/algorithms/english/stemmer.html">Martin
|
151
|
+
Porter</a>. This implementation is by <a href="http://www.njae.me.uk">Neil
|
152
|
+
Smith</a>.
|
153
|
+
</p>
|
154
|
+
<h2>Contributing to porter2stemmer</h2>
|
155
|
+
<ul>
|
156
|
+
<li><p>
|
157
|
+
Check out the latest master to make sure the feature hasn’t been
|
158
|
+
implemented or the bug hasn’t been fixed yet
|
159
|
+
</p>
|
160
|
+
</li>
|
161
|
+
<li><p>
|
162
|
+
Check out the issue tracker to make sure someone hasn’t already
|
163
|
+
requested it and/or contributed it
|
164
|
+
</p>
|
165
|
+
</li>
|
166
|
+
<li><p>
|
167
|
+
Fork the project
|
168
|
+
</p>
|
169
|
+
</li>
|
170
|
+
<li><p>
|
171
|
+
Start a feature/bugfix branch
|
172
|
+
</p>
|
173
|
+
</li>
|
174
|
+
<li><p>
|
175
|
+
Commit and push until you are happy with your contribution
|
176
|
+
</p>
|
177
|
+
</li>
|
178
|
+
<li><p>
|
179
|
+
Make sure to add tests for it. This is important so I don’t break it
|
180
|
+
in a future version unintentionally.
|
181
|
+
</p>
|
182
|
+
</li>
|
183
|
+
<li><p>
|
184
|
+
Please try not to mess with the Rakefile, version, or history. If you want
|
185
|
+
to have your own version, or it is otherwise necessary, that is fine, but
|
186
|
+
please isolate to its own commit so I can cherry-pick around it.
|
187
|
+
</p>
|
188
|
+
</li>
|
189
|
+
</ul>
|
190
|
+
<h2>Copyright</h2>
|
191
|
+
<p>
|
192
|
+
Copyright © 2011 Neil Smith. See LICENSE.txt for further details.
|
193
|
+
</p>
|
194
|
+
|
195
|
+
</div>
|
196
|
+
|
197
|
+
<div id="validator-badges">
|
198
|
+
<p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p>
|
199
|
+
<p><small>Generated with the <a href="http://deveiate.org/projects/Darkfish-Rdoc/">Darkfish
|
200
|
+
Rdoc Generator</a> 1.1.6</small>.</p>
|
201
|
+
</div>
|
202
|
+
</body>
|
203
|
+
</html>
|
204
|
+
|