porter-stemmer 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3) hide show
  1. checksums.yaml +7 -0
  2. data/lib/porter-stemmer.rb +235 -0
  3. metadata +44 -0
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: df3836142bca8ef6ddbde1a7e25524a40270f26e
4
+ data.tar.gz: 0801b8628b4a11b025aaaa7734d1fb1853463d58
5
+ SHA512:
6
+ metadata.gz: 2be923cafdedab0085e864b93b56063a403f6149c25d3023fa815f06ebcd4258d590e8723ce44583011e6258c54d804df048d3966b33516876f975c42165ae7d
7
+ data.tar.gz: d3acc5afd0e1e3dd15f4a8f052d866d781c078422f3ada8650294984709e593c0cf70d84cacc2d247b7be82d38248e2ae0aeea989ab75fbfbcf6800ef2021917
@@ -0,0 +1,235 @@
1
# Porter stemming for English words.
#
# Porter::Stemmer does the actual work; the two instance methods defined
# directly on the module (#stem and #stem_as_array) are meant to be mixed
# into String.
module Porter
  # Reduces a single word to its stem by applying the five steps of the
  # Porter algorithm in order. Instances hold no state.
  class Stemmer
    # Step 2: double suffix => single suffix it collapses to.
    STEP_2_SUFFIX_MAPPING = {
      'ational' => 'ate',
      'tional'  => 'tion',
      'enci'    => 'ence',
      'anci'    => 'ance',
      'izer'    => 'ize',
      'bli'     => 'ble',
      'alli'    => 'al',
      'entli'   => 'ent',
      'eli'     => 'e',
      'ousli'   => 'ous',
      'ization' => 'ize',
      'ation'   => 'ate',
      'ator'    => 'ate',
      'alism'   => 'al',
      'iveness' => 'ive',
      'fulness' => 'ful',
      'ousness' => 'ous',
      'aliti'   => 'al',
      'iviti'   => 'ive',
      'biliti'  => 'ble',
      'logi'    => 'log'
    }.freeze

    # Built from the mapping keys so the two can never drift apart. The order
    # of the alternatives is irrelevant: the match is anchored at the end, so
    # the leftmost (i.e. longest) suffix always wins.
    STEP_2_SUFFIX_REGEXP = /(#{Regexp.union(STEP_2_SUFFIX_MAPPING.keys).source})$/

    # Step 3: suffix => replacement (possibly empty).
    STEP_3_SUFFIX_MAPPING = {
      'icate' => 'ic',
      'ative' => '',
      'alize' => 'al',
      'iciti' => 'ic',
      'ical'  => 'ic',
      'ful'   => '',
      'ness'  => ''
    }.freeze

    STEP_3_SUFFIX_REGEXP = /(#{Regexp.union(STEP_3_SUFFIX_MAPPING.keys).source})$/

    # Step 4: suffixes that are simply dropped.
    STEP_4_SUFFIX_REGEXP = /(
      al    |
      ance  |
      ence  |
      er    |
      ic    |
      able  |
      ible  |
      ant   |
      ement |
      ment  |
      ent   |
      ou    |
      ism   |
      ate   |
      iti   |
      ous   |
      ive   |
      ize)$/x

    CONSONANT          = '[^aeiou]'.freeze                        # consonant
    VOWEL              = '[aeiouy]'.freeze                        # vowel
    CONSONANT_SEQUENCE = "#{CONSONANT}(?>[^aeiouy]*)".freeze      # consonant sequence
    VOWEL_SEQUENCE     = "#{VOWEL}(?>[aeiou]*)".freeze            # vowel sequence

    # Measure (m) tests: m counts the vowel-sequence/consonant-sequence pairs.
    MGR0 = /^(#{CONSONANT_SEQUENCE})?#{VOWEL_SEQUENCE}#{CONSONANT_SEQUENCE}/                                     # [cc]vvcc... is m>0
    MEQ1 = /^(#{CONSONANT_SEQUENCE})?#{VOWEL_SEQUENCE}#{CONSONANT_SEQUENCE}(#{VOWEL_SEQUENCE})?$/                # [cc]vvcc[vv] is m=1
    MGR1 = /^(#{CONSONANT_SEQUENCE})?#{VOWEL_SEQUENCE}#{CONSONANT_SEQUENCE}#{VOWEL_SEQUENCE}#{CONSONANT_SEQUENCE}/ # [cc]vvccvvcc... is m>1
    VOWEL_IN_STEM = /^(#{CONSONANT_SEQUENCE})?#{VOWEL}/                                                          # vowel in stem

    # Whole string is consonant sequence + one vowel + one consonant other
    # than w, x or y. Used by steps 1b and 5.
    CVC = /^#{CONSONANT_SEQUENCE}#{VOWEL}[^aeiouwxy]$/

    # Returns the stem of +word+.
    #
    # The argument is never modified (so frozen strings are accepted); the
    # result is always a new String. Words shorter than three characters are
    # returned as an unchanged copy. The patterns only know lower-case
    # letters, so callers should downcase beforehand if they need that.
    #
    # @param word [String] the word to stem
    # @return [String] the stemmed word
    def stem(word)
      # Work on a private copy: the initial-y marking below writes in place.
      word = word.dup
      return word if word.length < 3

      # Map initial y to Y so that the patterns never treat it as a vowel.
      initial_y = word[0] == 'y'
      word[0] = 'Y' if initial_y

      word = step5(step4(step3(step2(step1(word)))))

      # Undo the marking, but only if we applied it: an initial capital Y
      # that was part of the input is left alone.
      word[0] = 'y' if initial_y && word[0] == 'Y'

      word
    end

    private

    # Gets rid of plurals and -ed or -ing.
    def step1(word)
      step1c(step1b(step1a(word)))
    end

    # Plurals: -sses => -ss, -ies => -i, and a trailing -s not preceded by s.
    def step1a(word)
      match = word.match(/(ss|i)es$/) || word.match(/([^s])s$/)
      match ? match.pre_match + match[1] : word
    end

    # -eed => -ee when the stem has m>0; otherwise strips -ed / -ing when the
    # stem contains a vowel and then tidies up the exposed ending.
    def step1b(word)
      match = word.match(/eed$/)
      # An -eed word never falls through to the -ed rule, even when unchanged.
      return((match.pre_match =~ MGR0) ? word.chop : word) if match

      match = word.match(/(ed|ing)$/)
      return word unless match && match.pre_match =~ VOWEL_IN_STEM

      stem = match.pre_match
      case stem
      when /(at|bl|iz)$/, CVC then stem + 'e'  # restore the e that -ed/-ing replaced
      when /([^aeiouylsz])\1$/ then stem.chop  # undouble the final consonant
      else stem
      end
    end

    # Turns terminal y to i when there is another vowel in the stem.
    def step1c(word)
      match = word.match(/y$/)
      return word unless match && match.pre_match =~ VOWEL_IN_STEM

      match.pre_match + 'i'
    end

    # Maps double suffixes to single ones, so -ization (-ize plus -ation) maps to -ize.
    def step2(word)
      map_suffices(word, STEP_2_SUFFIX_REGEXP, STEP_2_SUFFIX_MAPPING)
    end

    # Deals with -ic-, -full, -ness, etc.
    def step3(word)
      map_suffices(word, STEP_3_SUFFIX_REGEXP, STEP_3_SUFFIX_MAPPING)
    end

    # Replaces the suffix captured by +regexp+ (group 1) with its entry in
    # +suffix_mapping+, provided the remaining stem has m>0. Returns +word+
    # itself when nothing applies.
    def map_suffices(word, regexp, suffix_mapping)
      match = word.match(regexp)
      return word unless match && match.pre_match =~ MGR0

      match.pre_match + suffix_mapping[match[1]]
    end

    # Takes off -ant, -ence etc., in context <c>vcvc<v>.
    def step4(word)
      stem =
        if (match = word.match(STEP_4_SUFFIX_REGEXP))
          match.pre_match
        elsif (match = word.match(/(s|t)(ion)$/))
          # -ion is only dropped after s or t, and that letter stays.
          match.pre_match + match[1]
        end

      (stem && stem =~ MGR1) ? stem : word
    end

    # Removes a final -e if m>1, or if m=1 and the stem does not end in a
    # consonant-vowel-consonant syllable; then reduces a final -ll to -l if m>1.
    def step5(word)
      match = word.match(/e$/)
      if match
        stem = match.pre_match
        word = stem if stem =~ MGR1 || (stem =~ MEQ1 && stem !~ CVC)
      end

      (word =~ /ll$/ && word =~ MGR1) ? word.chop : word
    end
  end

  # Stems every word of the receiver (expected to be a String).
  #
  # Words are the maximal runs of \w characters, so text that begins with
  # punctuation or whitespace does not yield an empty leading entry.
  #
  # @return [Array<String>] the stemmed words, in order of appearance
  def stem_as_array
    stemmer = Stemmer.new
    scan(/\w+/).map { |token| stemmer.stem(token) }
  end

  # Stems the receiver (expected to be a String) as a single word.
  # The receiver itself is left untouched.
  #
  # @return [String] the stemmed word
  def stem
    Stemmer.new.stem(self)
  end
end
232
+
233
# Mix Porter into String so every string responds to #stem and #stem_as_array.
String.send(:include, Porter)
metadata ADDED
@@ -0,0 +1,44 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: porter-stemmer
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - jcarlosgarcia
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-07-28 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: Applies the Porter Stemmer algorithm to a word or a whole text
14
+ email:
15
+ - jcarlosgarcia@hotmail.com
16
+ executables: []
17
+ extensions: []
18
+ extra_rdoc_files: []
19
+ files:
20
+ - lib/porter-stemmer.rb
21
+ homepage:
22
+ licenses: []
23
+ metadata: {}
24
+ post_install_message:
25
+ rdoc_options: []
26
+ require_paths:
27
+ - lib
28
+ required_ruby_version: !ruby/object:Gem::Requirement
29
+ requirements:
30
+ - - ">="
31
+ - !ruby/object:Gem::Version
32
+ version: '0'
33
+ required_rubygems_version: !ruby/object:Gem::Requirement
34
+ requirements:
35
+ - - ">="
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ requirements: []
39
+ rubyforge_project:
40
+ rubygems_version: 2.2.2
41
+ signing_key:
42
+ specification_version: 4
43
+ summary: Porter Stemmer
44
+ test_files: []