porter-stemmer 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/lib/porter-stemmer.rb +235 -0
- metadata +44 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: df3836142bca8ef6ddbde1a7e25524a40270f26e
|
4
|
+
data.tar.gz: 0801b8628b4a11b025aaaa7734d1fb1853463d58
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 2be923cafdedab0085e864b93b56063a403f6149c25d3023fa815f06ebcd4258d590e8723ce44583011e6258c54d804df048d3966b33516876f975c42165ae7d
|
7
|
+
data.tar.gz: d3acc5afd0e1e3dd15f4a8f052d866d781c078422f3ada8650294984709e593c0cf70d84cacc2d247b7be82d38248e2ae0aeea989ab75fbfbcf6800ef2021917
|
@@ -0,0 +1,235 @@
|
|
1
|
+
# Porter stemming for English words. Mixing this module into String (as the
# bottom of this file does) adds String#stem and String#stem_as_array.
module Porter
  # Step-by-step implementation of the Porter suffix-stripping algorithm.
  #
  # The stemmer expects lower-case input; it does not change the case of the
  # word itself, apart from the temporary y/Y marker described in #stem.
  class Stemmer
    # Step 2: double suffix => single suffix it collapses to.
    STEP_2_SUFFIX_MAPPING = {
      'ational' => 'ate',
      'tional' => 'tion',
      'enci' => 'ence',
      'anci' => 'ance',
      'izer' => 'ize',
      'bli' => 'ble',
      'alli' => 'al',
      'entli' => 'ent',
      'eli' => 'e',
      'ousli' => 'ous',
      'ization' => 'ize',
      'ation' => 'ate',
      'ator' => 'ate',
      'alism' => 'al',
      'iveness' => 'ive',
      'fulness' => 'ful',
      'ousness' => 'ous',
      'aliti' => 'al',
      'iviti' => 'ive',
      'biliti' => 'ble',
      'logi' => 'log'
    }.freeze

    # Built from the mapping keys so the two can never drift apart. Because the
    # pattern is anchored at the end, the leftmost match is always the longest
    # suffix, so the order of the alternatives does not matter.
    STEP_2_SUFFIX_REGEXP = /(#{STEP_2_SUFFIX_MAPPING.keys.join('|')})$/

    # Step 3: suffix => replacement (an empty string removes the suffix).
    STEP_3_SUFFIX_MAPPING = {
      'icate' => 'ic',
      'ative' => '',
      'alize' => 'al',
      'iciti' => 'ic',
      'ical' => 'ic',
      'ful' => '',
      'ness' => ''
    }.freeze

    STEP_3_SUFFIX_REGEXP = /(#{STEP_3_SUFFIX_MAPPING.keys.join('|')})$/

    # Step 4: suffixes that are dropped outright when the stem has m > 1.
    STEP_4_SUFFIX_REGEXP = /(
      al    |
      ance  |
      ence  |
      er    |
      ic    |
      able  |
      ible  |
      ant   |
      ement |
      ment  |
      ent   |
      ou    |
      ism   |
      ate   |
      iti   |
      ous   |
      ive   |
      ize)$/x

    CONSONANT = '[^aeiou]'.freeze                                # consonant
    VOWEL = '[aeiouy]'.freeze                                    # vowel
    CONSONANT_SEQUENCE = "#{CONSONANT}(?>[^aeiouy]*)".freeze     # consonant sequence
    VOWEL_SEQUENCE = "#{VOWEL}(?>[aeiou]*)".freeze               # vowel sequence

    # Measure (m) tests: m counts the vowel-consonant sequences in a stem.
    MGR0 = /^(#{CONSONANT_SEQUENCE})?#{VOWEL_SEQUENCE}#{CONSONANT_SEQUENCE}/ # [cc]vvcc... is m>0
    MEQ1 = /^(#{CONSONANT_SEQUENCE})?#{VOWEL_SEQUENCE}#{CONSONANT_SEQUENCE}(#{VOWEL_SEQUENCE})?$/ # [cc]vvcc[vv] is m=1
    MGR1 = /^(#{CONSONANT_SEQUENCE})?#{VOWEL_SEQUENCE}#{CONSONANT_SEQUENCE}#{VOWEL_SEQUENCE}#{CONSONANT_SEQUENCE}/ # [cc]vvccvvcc... is m>1
    VOWEL_IN_STEM = /^(#{CONSONANT_SEQUENCE})?#{VOWEL}/ # vowel in stem

    # Whole stem is consonant(s) + vowel + consonant, where the final
    # consonant is not w, x or y (e.g. "hop", "fil").
    CONSONANT_VOWEL_CONSONANT = /^#{CONSONANT_SEQUENCE}#{VOWEL}[^aeiouwxy]$/

    # Returns the stem of +word+.
    #
    # @param word [String] a single lower-case word
    # @return [String] the stemmed word; words shorter than three characters
    #   are returned as they are
    #
    # The argument is never modified, so frozen strings are accepted.
    def stem(word)
      return word if word.length < 3

      # Work on a private copy: the y/Y marker below is written in place and
      # must not leak into the caller's string.
      word = word.dup

      # Map initial y to Y so that the patterns never treat it as vowel
      word[0] = 'Y' if word[0] == 'y'

      word = step1(word)
      word = step2(word)
      word = step3(word)
      word = step4(word)
      word = step5(word)

      # Turn initial Y back to y
      word[0] = 'y' if word[0] == 'Y'

      word
    end

    private

    # Gets rid of plurals and -ed or -ing, e.g. "caresses" -> "caress",
    # "running" -> "run", "happy" -> "happi".
    def step1(word)
      step1c(step1b(step1a(word)))
    end

    # Plurals: -sses -> -ss, -ies -> -i, and a final -s not preceded by s is dropped.
    def step1a(word)
      match = /(ss|i)es$/.match(word) || /([^s])s$/.match(word)
      match ? match.pre_match + match[1] : word
    end

    # -eed -> -ee when the stem has m > 0; otherwise strips -ed / -ing when the
    # stem contains a vowel and then tidies up the stem ending.
    def step1b(word)
      if (match = /eed$/.match(word))
        return match.pre_match =~ MGR0 ? word.chop : word
      end

      match = /(ed|ing)$/.match(word)
      return word unless match

      stemmed_word = match.pre_match
      return word unless stemmed_word =~ VOWEL_IN_STEM

      case stemmed_word
      when /(at|bl|iz)$/, CONSONANT_VOWEL_CONSONANT
        stemmed_word + 'e'  # restore the e: "filing" -> "fil" -> "file"
      when /([^aeiouylsz])\1$/
        stemmed_word.chop   # undouble the consonant: "runn" -> "run"
      else
        stemmed_word
      end
    end

    # Turns terminal y to i when there is another vowel in the stem
    def step1c(word)
      match = /y$/.match(word)
      return word unless match && match.pre_match =~ VOWEL_IN_STEM

      match.pre_match + 'i'
    end

    # Maps double suffices to single ones, so -ization (-ize plus -ation) maps to -ize
    def step2(word)
      map_suffices word, STEP_2_SUFFIX_REGEXP, STEP_2_SUFFIX_MAPPING
    end

    # Deals with -ic-, -ful, -ness, etc.
    def step3(word)
      map_suffices word, STEP_3_SUFFIX_REGEXP, STEP_3_SUFFIX_MAPPING
    end

    # Replaces the suffix captured by +regexp+ with its entry in
    # +suffix_mapping+, provided the remaining stem has m > 0.
    def map_suffices(word, regexp, suffix_mapping)
      match = regexp.match(word)
      return word unless match && match.pre_match =~ MGR0

      match.pre_match + suffix_mapping[match[1]]
    end

    # Takes off -ant, -ence etc., in context <c>vcvc<v>
    def step4(word)
      stemmed_word =
        if (match = STEP_4_SUFFIX_REGEXP.match(word))
          match.pre_match
        elsif (match = /(s|t)(ion)$/.match(word))
          # -ion is only removed after s or t, and that letter stays in the stem
          match.pre_match + match[1]
        end

      # stemmed_word is nil when neither pattern matched.
      stemmed_word && stemmed_word =~ MGR1 ? stemmed_word : word
    end

    # Removes a final -e if the number of consonant sequences is greater than 1
    # (or equal to 1 and the stem is not consonant-vowel-consonant), then
    # reduces a final -ll to -l when m > 1.
    def step5(word)
      if (match = /e$/.match(word))
        stemmed_word = match.pre_match
        if stemmed_word =~ MGR1 ||
           (stemmed_word =~ MEQ1 && stemmed_word !~ CONSONANT_VOWEL_CONSONANT)
          word = stemmed_word
        end
      end

      word = word.chop if word =~ /ll$/ && word =~ MGR1

      word
    end
  end

  # Splits the receiver into words (runs of \w characters) and stems each one.
  #
  # @return [Array<String>] the stemmed words, in order of appearance; leading
  #   punctuation or whitespace does not produce an empty entry
  def stem_as_array
    stemmer = Stemmer.new
    scan(/\w+/).map { |token| stemmer.stem(token) }
  end

  # Stems the receiver as a single word.
  #
  # @return [String] a new string; the receiver is left untouched
  def stem
    Stemmer.new.stem(dup)
  end
end
|
232
|
+
|
233
|
+
# Mix the Porter methods into the core String class so that every string
# responds to #stem and #stem_as_array.
String.send(:include, Porter)
|
metadata
ADDED
@@ -0,0 +1,44 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: porter-stemmer
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- jcarlosgarcia
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2014-07-28 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: Applies the Porter Stemmer algorithm to a word or a whole text
|
14
|
+
email:
|
15
|
+
- jcarlosgarcia@hotmail.com
|
16
|
+
executables: []
|
17
|
+
extensions: []
|
18
|
+
extra_rdoc_files: []
|
19
|
+
files:
|
20
|
+
- lib/porter-stemmer.rb
|
21
|
+
homepage:
|
22
|
+
licenses: []
|
23
|
+
metadata: {}
|
24
|
+
post_install_message:
|
25
|
+
rdoc_options: []
|
26
|
+
require_paths:
|
27
|
+
- lib
|
28
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
29
|
+
requirements:
|
30
|
+
- - ">="
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: '0'
|
33
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
34
|
+
requirements:
|
35
|
+
- - ">="
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '0'
|
38
|
+
requirements: []
|
39
|
+
rubyforge_project:
|
40
|
+
rubygems_version: 2.2.2
|
41
|
+
signing_key:
|
42
|
+
specification_version: 4
|
43
|
+
summary: Porter Stemmer
|
44
|
+
test_files: []
|