porter-stemmer 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3) hide show
  1. checksums.yaml +7 -0
  2. data/lib/porter-stemmer.rb +235 -0
  3. metadata +44 -0
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: df3836142bca8ef6ddbde1a7e25524a40270f26e
4
+ data.tar.gz: 0801b8628b4a11b025aaaa7734d1fb1853463d58
5
+ SHA512:
6
+ metadata.gz: 2be923cafdedab0085e864b93b56063a403f6149c25d3023fa815f06ebcd4258d590e8723ce44583011e6258c54d804df048d3966b33516876f975c42165ae7d
7
+ data.tar.gz: d3acc5afd0e1e3dd15f4a8f052d866d781c078422f3ada8650294984709e593c0cf70d84cacc2d247b7be82d38248e2ae0aeea989ab75fbfbcf6800ef2021917
@@ -0,0 +1,235 @@
1
# Porter stemming for English words.
#
# Mixing this module into String (done at the bottom of this file) adds
# String#stem and String#stem_as_array; the algorithm itself lives in
# Porter::Stemmer.
module Porter
  # Implements the steps of the Porter suffix-stripping algorithm with regular
  # expressions. The patterns only list lowercase vowels, so uppercase letters
  # are treated as consonants; callers that want case-insensitive stemming
  # should downcase their input first.
  class Stemmer

    # Step 2: double suffix => single suffix, applied when the remaining stem
    # has measure m > 0.
    STEP_2_SUFFIX_MAPPING = {
      'ational' => 'ate',
      'tional'  => 'tion',
      'enci'    => 'ence',
      'anci'    => 'ance',
      'izer'    => 'ize',
      'bli'     => 'ble',
      'alli'    => 'al',
      'entli'   => 'ent',
      'eli'     => 'e',
      'ousli'   => 'ous',
      'ization' => 'ize',
      'ation'   => 'ate',
      'ator'    => 'ate',
      'alism'   => 'al',
      'iveness' => 'ive',
      'fulness' => 'ful',
      'ousness' => 'ous',
      'aliti'   => 'al',
      'iviti'   => 'ive',
      'biliti'  => 'ble',
      'logi'    => 'log'
    }.freeze

    # Built from the mapping keys so the two lists cannot drift apart. The
    # pattern is anchored at the end, and the regexp engine reports the
    # leftmost match, so the longest listed suffix always wins regardless of
    # the order of the alternatives.
    STEP_2_SUFFIX_REGEXP = /(#{Regexp.union(STEP_2_SUFFIX_MAPPING.keys).source})$/

    # Step 3: suffix => replacement, applied when the remaining stem has
    # measure m > 0.
    STEP_3_SUFFIX_MAPPING = {
      'icate' => 'ic',
      'ative' => '',
      'alize' => 'al',
      'iciti' => 'ic',
      'ical'  => 'ic',
      'ful'   => '',
      'ness'  => ''
    }.freeze

    STEP_3_SUFFIX_REGEXP = /(#{Regexp.union(STEP_3_SUFFIX_MAPPING.keys).source})$/

    # Step 4: suffixes that are dropped entirely when the remaining stem has
    # measure m > 1.
    STEP_4_SUFFIX_REGEXP = /(
      al    |
      ance  |
      ence  |
      er    |
      ic    |
      able  |
      ible  |
      ant   |
      ement |
      ment  |
      ent   |
      ou    |
      ism   |
      ate   |
      iti   |
      ous   |
      ive   |
      ize)$/x

    CONSONANT = "[^aeiou]".freeze                                      # consonant
    VOWEL = "[aeiouy]".freeze                                          # vowel
    CONSONANT_SEQUENCE = "#{CONSONANT}(?>[^aeiouy]*)".freeze           # consonant sequence
    VOWEL_SEQUENCE = "#{VOWEL}(?>[aeiou]*)".freeze                     # vowel sequence

    # Measure (m) tests: m counts the vowel-sequence/consonant-sequence pairs.
    MGR0 = /^(#{CONSONANT_SEQUENCE})?#{VOWEL_SEQUENCE}#{CONSONANT_SEQUENCE}/                                       # [cc]vvcc... is m>0
    MEQ1 = /^(#{CONSONANT_SEQUENCE})?#{VOWEL_SEQUENCE}#{CONSONANT_SEQUENCE}(#{VOWEL_SEQUENCE})?$/                  # [cc]vvcc[vv] is m=1
    MGR1 = /^(#{CONSONANT_SEQUENCE})?#{VOWEL_SEQUENCE}#{CONSONANT_SEQUENCE}#{VOWEL_SEQUENCE}#{CONSONANT_SEQUENCE}/ # [cc]vvccvvcc... is m>1
    VOWEL_IN_STEM = /^(#{CONSONANT_SEQUENCE})?#{VOWEL}/                                                            # vowel in stem

    # A whole stem of the form consonant-sequence, vowel, consonant where the
    # final consonant is not w, x or y (used by steps 1b and 5).
    CVC_STEM = /^#{CONSONANT_SEQUENCE}#{VOWEL}[^aeiouwxy]$/

    # Returns the Porter stem of +word+.
    #
    # The argument is never modified: all work happens on a copy, so frozen
    # strings are accepted and the caller's string keeps its original content.
    # Words shorter than three characters are returned (as a copy) unchanged.
    #
    # @param word [String] the word to stem
    # @return [String] a new string holding the stem
    def stem(word)
      # Several steps edit the string in place (`[]=`, `chop!`, `<<`), so
      # detach from the caller's object before doing anything else.
      word = word.dup
      return word if word.length < 3

      # Map initial y to Y so that the patterns never treat it as vowel
      word[0] = 'Y' if word[0] == 'y'

      word = step1(word)
      word = step2(word)
      word = step3(word)
      word = step4(word)
      word = step5(word)

      # Turn initial Y back to y
      word[0] = 'y' if word[0] == 'Y'

      word
    end

    private

    # Gets rid of plurals and -ed or -ing, then turns a terminal y into i.
    def step1(word)
      step1c(step1b(step1a(word)))
    end

    # Plurals: -sses => -ss, -ies => -i, and a final -s not preceded by
    # another s is dropped.
    def step1a(word)
      word = $` + $1 if word =~ /(ss|i)es$/ || word =~ /([^s])s$/
      word
    end

    # -eed => -ee when the stem has m > 0; otherwise strips -ed / -ing when
    # the stem contains a vowel and then tidies up the exposed ending.
    def step1b(word)
      if word =~ /eed$/
        word.chop! if $` =~ MGR0
      elsif word =~ /(ed|ing)$/
        stemmed_word = $`
        if stemmed_word =~ VOWEL_IN_STEM
          word = stemmed_word
          case word
          when /(at|bl|iz)$/, CVC_STEM
            # Restore the e that the suffix replaced
            word << "e"
          when /([^aeiouylsz])\1$/
            # Undo consonant doubling, except for l, s and z
            word.chop!
          end
        end
      end

      word
    end

    # Turns terminal y to i when there is another vowel in the stem
    def step1c(word)
      if word =~ /y$/
        stemmed_word = $`
        word = stemmed_word + "i" if stemmed_word =~ VOWEL_IN_STEM
      end

      word
    end

    # Maps double suffixes to single ones, so -ization (-ize plus -ation) maps to -ize
    def step2(word)
      map_suffices word, STEP_2_SUFFIX_REGEXP, STEP_2_SUFFIX_MAPPING
    end

    # Deals with -ic-, -ful, -ness, etc.
    def step3(word)
      map_suffices word, STEP_3_SUFFIX_REGEXP, STEP_3_SUFFIX_MAPPING
    end

    # Replaces the suffix captured by +regexp+ with its entry in
    # +suffix_mapping+, provided the stem left of the suffix has m > 0.
    # Returns +word+ unchanged otherwise.
    def map_suffices(word, regexp, suffix_mapping)
      if word =~ regexp
        stemmed_word = $`
        suffix = $1
        word = stemmed_word + suffix_mapping[suffix] if stemmed_word =~ MGR0
      end

      word
    end

    # Takes off -ant, -ence etc., in context <c>vcvc<v>
    def step4(word)
      # nil when no step 4 suffix is present. For -sion / -tion only the
      # "ion" is removed, so the s or t is put back on the candidate stem.
      stemmed_word =
        if word =~ STEP_4_SUFFIX_REGEXP
          $`
        elsif word =~ /(s|t)(ion)$/
          $` + $1
        end

      return word if stemmed_word.nil?

      stemmed_word =~ MGR1 ? stemmed_word : word
    end

    # Removes a final -e if the number of consonant sequences is greater than 1
    # (or equal to 1 when the stem is not of CVC_STEM form), then reduces a
    # final -ll to -l for words with m > 1.
    def step5(word)
      if word =~ /e$/
        stemmed_word = $`
        if (stemmed_word =~ MGR1) ||
           (stemmed_word =~ MEQ1 && stemmed_word !~ CVC_STEM)
          word = stemmed_word
        end
      end

      word.chop! if word =~ /ll$/ && word =~ MGR1

      word
    end

  end

  # Splits the receiver on runs of non-word characters and stems every token.
  #
  # Empty tokens (produced by `split` when the text starts with a non-word
  # character) are discarded, so the result only contains stems of real words.
  #
  # @return [Array<String>] the stems, in order of appearance
  def stem_as_array
    stemmer = Stemmer.new
    split(/\W+/).reject(&:empty?).map { |word| stemmer.stem(word) }
  end

  # Stems the receiver as a single word. The receiver is left untouched.
  #
  # @return [String] the stem
  def stem
    Stemmer.new.stem(self)
  end

end
232
+
233
# Mix the Porter helpers (#stem and #stem_as_array) into every String.
# `send` is used because Module#include is private on older Rubies.
String.send(:include, Porter)
metadata ADDED
@@ -0,0 +1,44 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: porter-stemmer
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - jcarlosgarcia
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-07-28 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: Applies the Porter Stemmer algorithm to a word or a whole text
14
+ email:
15
+ - jcarlosgarcia@hotmail.com
16
+ executables: []
17
+ extensions: []
18
+ extra_rdoc_files: []
19
+ files:
20
+ - lib/porter-stemmer.rb
21
+ homepage:
22
+ licenses: []
23
+ metadata: {}
24
+ post_install_message:
25
+ rdoc_options: []
26
+ require_paths:
27
+ - lib
28
+ required_ruby_version: !ruby/object:Gem::Requirement
29
+ requirements:
30
+ - - ">="
31
+ - !ruby/object:Gem::Version
32
+ version: '0'
33
+ required_rubygems_version: !ruby/object:Gem::Requirement
34
+ requirements:
35
+ - - ">="
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ requirements: []
39
+ rubyforge_project:
40
+ rubygems_version: 2.2.2
41
+ signing_key:
42
+ specification_version: 4
43
+ summary: Porter Stemmer
44
+ test_files: []