porter-stemmer 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/lib/porter-stemmer.rb +235 -0
- metadata +44 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: df3836142bca8ef6ddbde1a7e25524a40270f26e
|
4
|
+
data.tar.gz: 0801b8628b4a11b025aaaa7734d1fb1853463d58
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 2be923cafdedab0085e864b93b56063a403f6149c25d3023fa815f06ebcd4258d590e8723ce44583011e6258c54d804df048d3966b33516876f975c42165ae7d
|
7
|
+
data.tar.gz: d3acc5afd0e1e3dd15f4a8f052d866d781c078422f3ada8650294984709e593c0cf70d84cacc2d247b7be82d38248e2ae0aeea989ab75fbfbcf6800ef2021917
|
@@ -0,0 +1,235 @@
|
|
1
|
+
# Porter stemming for English words.
#
# The module doubles as a mixin: when included into a String-like class it
# contributes #stem and #stem_as_array, both of which delegate to
# Porter::Stemmer.
module Porter
  # Strips common English suffixes from a single word, following the five
  # steps of the Porter algorithm. The patterns only know lower-case letters,
  # so callers are expected to pass lower-case words.
  class Stemmer
    # Step 2: double suffix => the single suffix that replaces it.
    STEP_2_SUFFIX_MAPPING = {
      'ational' => 'ate',
      'tional'  => 'tion',
      'enci'    => 'ence',
      'anci'    => 'ance',
      'izer'    => 'ize',
      'bli'     => 'ble',
      'alli'    => 'al',
      'entli'   => 'ent',
      'eli'     => 'e',
      'ousli'   => 'ous',
      'ization' => 'ize',
      'ation'   => 'ate',
      'ator'    => 'ate',
      'alism'   => 'al',
      'iveness' => 'ive',
      'fulness' => 'ful',
      'ousness' => 'ous',
      'aliti'   => 'al',
      'iviti'   => 'ive',
      'biliti'  => 'ble',
      'logi'    => 'log'
    }.freeze

    # Built from the mapping keys so the pattern and the table cannot drift
    # apart. The pattern is anchored at the end of the word, so the regexp
    # engine settles on the earliest start position, i.e. the longest suffix,
    # regardless of the order of the alternatives.
    STEP_2_SUFFIX_REGEXP = /(#{Regexp.union(STEP_2_SUFFIX_MAPPING.keys).source})$/

    # Step 3: suffix => replacement (often the empty string).
    STEP_3_SUFFIX_MAPPING = {
      'icate' => 'ic',
      'ative' => '',
      'alize' => 'al',
      'iciti' => 'ic',
      'ical'  => 'ic',
      'ful'   => '',
      'ness'  => ''
    }.freeze

    STEP_3_SUFFIX_REGEXP = /(#{Regexp.union(STEP_3_SUFFIX_MAPPING.keys).source})$/

    # Step 4: suffixes that are removed outright when the remaining stem has m > 1.
    STEP_4_SUFFIX_REGEXP = /(
      al    |
      ance  |
      ence  |
      er    |
      ic    |
      able  |
      ible  |
      ant   |
      ement |
      ment  |
      ent   |
      ou    |
      ism   |
      ate   |
      iti   |
      ous   |
      ive   |
      ize
    )$/x

    CONSONANT = '[^aeiou]'.freeze                                    # consonant
    VOWEL = '[aeiouy]'.freeze                                        # vowel
    CONSONANT_SEQUENCE = "#{CONSONANT}(?>[^aeiouy]*)".freeze         # consonant sequence
    VOWEL_SEQUENCE = "#{VOWEL}(?>[aeiou]*)".freeze                   # vowel sequence

    # Measure (m) tests: m counts the vowel-sequence/consonant-sequence pairs.
    MGR0 = /^(#{CONSONANT_SEQUENCE})?#{VOWEL_SEQUENCE}#{CONSONANT_SEQUENCE}/ # [cc]vvcc... is m>0
    MEQ1 = /^(#{CONSONANT_SEQUENCE})?#{VOWEL_SEQUENCE}#{CONSONANT_SEQUENCE}(#{VOWEL_SEQUENCE})?$/ # [cc]vvcc[vv] is m=1
    MGR1 = /^(#{CONSONANT_SEQUENCE})?#{VOWEL_SEQUENCE}#{CONSONANT_SEQUENCE}#{VOWEL_SEQUENCE}#{CONSONANT_SEQUENCE}/ # [cc]vvccvvcc... is m>1
    VOWEL_IN_STEM = /^(#{CONSONANT_SEQUENCE})?#{VOWEL}/ # vowel in stem

    # Consonant sequence, one vowel, then a final consonant other than w, x or y.
    # Shared by step 1b (restore a trailing "e") and step 5 (keep a trailing "e").
    CVC_ENDING = /^#{CONSONANT_SEQUENCE}#{VOWEL}[^aeiouwxy]$/

    # Returns the stem of +word+.
    #
    # word - a String holding one word. It is never modified, so frozen
    #        strings are accepted.
    #
    # Words shorter than three characters are handed back as-is (the very same
    # object); every other call returns a new String.
    def stem(word)
      return word if word.length < 3

      # Work on a private copy so the caller's string is left untouched.
      word = word.dup

      # Map an initial y to Y so that the patterns never treat it as a vowel.
      # Remember that we did so: a capital Y that was already present in the
      # input must not be lower-cased on the way out.
      initial_y = word[0] == 'y'
      word[0] = 'Y' if initial_y

      word = step1(word)
      word = step2(word)
      word = step3(word)
      word = step4(word)
      word = step5(word)

      # Turn the mapped initial Y back into y.
      word[0] = 'y' if initial_y

      word
    end

    private

    # Gets rid of plurals and -ed or -ing.
    def step1(word)
      step1c(step1b(step1a(word)))
    end

    # -sses => -ss, -ies => -i, and drops a single trailing -s.
    def step1a(word)
      match = /(ss|i)es$/.match(word) || /([^s])s$/.match(word)
      match ? match.pre_match + match[1] : word
    end

    # -eed => -ee when the stem has m > 0; otherwise removes -ed / -ing when
    # the stem contains a vowel and then tidies the new ending.
    def step1b(word)
      eed = /eed$/.match(word)
      # A word ending in -eed never falls through to the -ed rule below.
      return(eed.pre_match =~ MGR0 ? word.chop : word) if eed

      match = /(ed|ing)$/.match(word)
      return word unless match

      stem = match.pre_match
      return word unless stem =~ VOWEL_IN_STEM

      case stem
      when /(at|bl|iz)$/, CVC_ENDING then stem + 'e' # e.g. conflat(ed) => conflate
      when /([^aeiouylsz])\1$/ then stem.chop        # e.g. hopp(ing)   => hop
      else stem
      end
    end

    # Turns terminal y to i when there is another vowel in the stem.
    def step1c(word)
      match = /y$/.match(word)
      return word unless match && match.pre_match =~ VOWEL_IN_STEM

      match.pre_match + 'i'
    end

    # Maps double suffixes to single ones, so -ization (-ize plus -ation) maps to -ize.
    def step2(word)
      map_suffices(word, STEP_2_SUFFIX_REGEXP, STEP_2_SUFFIX_MAPPING)
    end

    # Deals with -ic-, -ful, -ness, etc.
    def step3(word)
      map_suffices(word, STEP_3_SUFFIX_REGEXP, STEP_3_SUFFIX_MAPPING)
    end

    # Replaces the suffix captured by +regexp+ with its entry in
    # +suffix_mapping+, provided the remaining stem has m > 0.
    def map_suffices(word, regexp, suffix_mapping)
      match = regexp.match(word)
      return word unless match && match.pre_match =~ MGR0

      match.pre_match + suffix_mapping.fetch(match[1])
    end

    # Takes off -ant, -ence etc., in context <c>vcvc<v>.
    def step4(word)
      suffix = STEP_4_SUFFIX_REGEXP.match(word)
      stem =
        if suffix
          suffix.pre_match
        elsif (ion = /(s|t)(ion)$/.match(word))
          # -ion is only removed after s or t, and that letter stays.
          ion.pre_match + ion[1]
        end

      stem && stem =~ MGR1 ? stem : word
    end

    # Removes a final -e if the number of consonant sequences is greater than 1
    # (or equal to 1 and the stem does not end consonant-vowel-consonant), then
    # reduces a final -ll to -l when m > 1.
    def step5(word)
      final_e = /e$/.match(word)
      if final_e
        stem = final_e.pre_match
        word = stem if stem =~ MGR1 || (stem =~ MEQ1 && stem !~ CVC_ENDING)
      end

      word =~ /ll$/ && word =~ MGR1 ? word.chop : word
    end
  end

  # Stems every word of the receiver and returns the stems as an Array, in
  # order of appearance. Words are maximal runs of \w characters, so leading,
  # trailing and repeated separators never produce empty entries.
  def stem_as_array
    stemmer = Stemmer.new
    scan(/\w+/).map { |word| stemmer.stem(word) }
  end

  # Returns the stem of the receiver, treated as a single word. The result is
  # always a new String; the receiver is left untouched.
  def stem
    Stemmer.new.stem(dup)
  end
end
|
232
|
+
|
233
|
+
# Mix the Porter helpers into every String, which gives strings the
# #stem and #stem_as_array methods defined by the Porter module.
String.send(:include, Porter)
|
metadata
ADDED
@@ -0,0 +1,44 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: porter-stemmer
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- jcarlosgarcia
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2014-07-28 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: Applies the Porter Stemmer algorithm to a word or a whole text
|
14
|
+
email:
|
15
|
+
- jcarlosgarcia@hotmail.com
|
16
|
+
executables: []
|
17
|
+
extensions: []
|
18
|
+
extra_rdoc_files: []
|
19
|
+
files:
|
20
|
+
- lib/porter-stemmer.rb
|
21
|
+
homepage:
|
22
|
+
licenses: []
|
23
|
+
metadata: {}
|
24
|
+
post_install_message:
|
25
|
+
rdoc_options: []
|
26
|
+
require_paths:
|
27
|
+
- lib
|
28
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
29
|
+
requirements:
|
30
|
+
- - ">="
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: '0'
|
33
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
34
|
+
requirements:
|
35
|
+
- - ">="
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '0'
|
38
|
+
requirements: []
|
39
|
+
rubyforge_project:
|
40
|
+
rubygems_version: 2.2.2
|
41
|
+
signing_key:
|
42
|
+
specification_version: 4
|
43
|
+
summary: Porter Stemmer
|
44
|
+
test_files: []
|