stemmer 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3) hide show
  1. data/lib/porter/stemmable.rb +196 -0
  2. data/lib/stemmer.rb +1 -0
  3. metadata +39 -0
@@ -0,0 +1,196 @@
1
+ #! /local/ruby/bin/ruby
2
+ #
3
+ # $Id: stemmable.rb,v 1.2 2003/02/01 02:07:30 condit Exp $
4
+ #
5
+ # See example usage at the end of this file.
6
+ #
7
+
8
+ module Stemmable
9
+ # Step 2 table: suffix => replacement, indexed by the suffix that SUFFIX_1_REGEXP captures.
10
+ STEP_2_LIST = {
11
+ 'ational'=>'ate', 'tional'=>'tion', 'enci'=>'ence', 'anci'=>'ance',
12
+ 'izer'=>'ize', 'bli'=>'ble',
13
+ 'alli'=>'al', 'entli'=>'ent', 'eli'=>'e', 'ousli'=>'ous',
14
+ 'ization'=>'ize', 'ation'=>'ate',
15
+ 'ator'=>'ate', 'alism'=>'al', 'iveness'=>'ive', 'fulness'=>'ful',
16
+ 'ousness'=>'ous', 'aliti'=>'al',
17
+ 'iviti'=>'ive', 'biliti'=>'ble', 'logi'=>'log'
18
+ }
19
+ # Step 3 table: suffix => replacement; an empty string means the suffix is simply removed.
20
+ STEP_3_LIST = {
21
+ 'icate'=>'ic', 'ative'=>'', 'alize'=>'al', 'iciti'=>'ic',
22
+ 'ical'=>'ic', 'ful'=>'', 'ness'=>''
23
+ }
24
+
25
+ # Step 2 suffix matcher: one alternative per STEP_2_LIST key, anchored with $ (the x flag ignores the layout whitespace).
26
+ SUFFIX_1_REGEXP = /(
27
+ ational |
28
+ tional |
29
+ enci |
30
+ anci |
31
+ izer |
32
+ bli |
33
+ alli |
34
+ entli |
35
+ eli |
36
+ ousli |
37
+ ization |
38
+ ation |
39
+ ator |
40
+ alism |
41
+ iveness |
42
+ fulness |
43
+ ousness |
44
+ aliti |
45
+ iviti |
46
+ biliti |
47
+ logi)$/x
48
+
49
+ # Step 4 suffix matcher: these suffixes are dropped outright (no replacement table) when the stem matches MGR1.
50
+ SUFFIX_2_REGEXP = /(
51
+ al |
52
+ ance |
53
+ ence |
54
+ er |
55
+ ic |
56
+ able |
57
+ ible |
58
+ ant |
59
+ ement |
60
+ ment |
61
+ ent |
62
+ ou |
63
+ ism |
64
+ ate |
65
+ iti |
66
+ ous |
67
+ ive |
68
+ ize)$/x
69
+
70
+ # Pattern fragments (plain strings) that are interpolated into the measure regexps below.
71
+ C = "[^aeiou]" # consonant: anything except a, e, i, o, u (so y also counts)
72
+ V = "[aeiouy]" # vowel: a, e, i, o, u or lowercase y
73
+ CC = "#{C}(?>[^aeiouy]*)" # consonant sequence: one C, then an atomic run excluding a, e, i, o, u, y
74
+ VV = "#{V}(?>[aeiou]*)" # vowel sequence: one V, then an atomic run of a, e, i, o, u
75
+
76
+ MGR0 = /^(#{CC})?#{VV}#{CC}/o # [cc]vvcc... is m>0
77
+ MEQ1 = /^(#{CC})?#{VV}#{CC}(#{VV})?$/o # [cc]vvcc[vv] is m=1
78
+ MGR1 = /^(#{CC})?#{VV}#{CC}#{VV}#{CC}/o # [cc]vvccvvcc... is m>1
79
+ VOWEL_IN_STEM = /^(#{CC})?#{V}/o # vowel in stem
80
+
81
+ #
82
+ # Porter stemmer in Ruby.
83
+ #
84
+ # This is the Porter stemming algorithm, ported to Ruby from the
85
+ # version coded up in Perl. It's easy to follow against the rules
86
+ # in the original paper:
87
+ #
88
+ # Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
89
+ # no. 3, pp 130-137.
90
+ #
91
+ # See also http://www.tartarus.org/~martin/PorterStemmer
92
+ #
93
+ # Send comments to raypereda@hotmail.com
94
+ # Returns a new String holding the stem; the receiver is dup'ed first and is not modified.
95
+ # NOTE(review): every pattern is lowercase-only (no i flag), so callers presumably downcase first -- confirm.
96
+ def stem_porter
97
+
98
+ # Work on a copy so the receiver is left untouched; to_str yields the String to operate on.
99
+ w = self.dup.to_str
100
+ # Words shorter than three characters are returned as they are.
101
+ return w if w.length < 3
102
+
103
+ # Map an initial y to Y so that the patterns never treat it as a vowel (V only lists lowercase y).
104
+ w[0] = 'Y' if w[0] == ?y
105
+
106
+ # Step 1a: sses -> ss, ies -> i, and drop a final s that does not follow another s.
107
+ if w =~ /(ss|i)es$/
108
+ w = $` + $1
109
+ elsif w =~ /([^s])s$/
110
+ w = $` + $1
111
+ end
112
+
113
+ # Step 1b: eed -> ee when the stem matches MGR0; otherwise strip ed/ing if the stem has a vowel, then repair the ending.
114
+ if w =~ /eed$/
115
+ w.chop! if $` =~ MGR0
116
+ elsif w =~ /(ed|ing)$/
117
+ stem = $`
118
+ if stem =~ VOWEL_IN_STEM
119
+ w = stem
120
+ case w
121
+ when /(at|bl|iz)$/ then w << "e"
122
+ when /([^aeiouylsz])\1$/ then w.chop!
123
+ when /^#{CC}#{V}[^aeiouwxy]$/o then w << "e"
124
+ end
125
+ end
126
+ end
127
+ # A final y becomes i when the rest of the word contains a vowel (Step 1c in Porter's paper).
128
+ if w =~ /y$/
129
+ stem = $`
130
+ w = stem + "i" if stem =~ VOWEL_IN_STEM
131
+ end
132
+
133
+ # Step 2: replace a SUFFIX_1_REGEXP suffix via STEP_2_LIST when the stem matches MGR0.
134
+ if w =~ SUFFIX_1_REGEXP
135
+ stem = $`
136
+ suffix = $1
137
+ # print "stem= " + stem + "\n" + "suffix=" + suffix + "\n"
138
+ if stem =~ MGR0
139
+ w = stem + STEP_2_LIST[suffix]
140
+ end
141
+ end
142
+
143
+ # Step 3: same scheme as Step 2, using the STEP_3_LIST suffixes and replacements.
144
+ if w =~ /(icate|ative|alize|iciti|ical|ful|ness)$/
145
+ stem = $`
146
+ suffix = $1
147
+ if stem =~ MGR0
148
+ w = stem + STEP_3_LIST[suffix]
149
+ end
150
+ end
151
+
152
+ # Step 4: drop a SUFFIX_2_REGEXP suffix, or the ion of sion/tion, when the remaining stem matches MGR1.
153
+ if w =~ SUFFIX_2_REGEXP
154
+ stem = $`
155
+ if stem =~ MGR1
156
+ w = stem
157
+ end
158
+ elsif w =~ /(s|t)(ion)$/
159
+ stem = $` + $1
160
+ if stem =~ MGR1
161
+ w = stem
162
+ end
163
+ end
164
+
165
+ # Step 5: drop a final e when the stem matches MGR1, or matches MEQ1 without being a short consonant-vowel-consonant form.
166
+ if w =~ /e$/
167
+ stem = $`
168
+ if (stem =~ MGR1) ||
169
+ (stem =~ MEQ1 && stem !~ /^#{CC}#{V}[^aeiouwxy]$/o)
170
+ w = stem
171
+ end
172
+ end
173
+ # Reduce a final ll to a single l when the word matches MGR1.
174
+ if w =~ /ll$/ && w =~ MGR1
175
+ w.chop!
176
+ end
177
+
178
+ # Turn the initial Y back into y.
179
+ w[0] = 'y' if w[0] == ?Y
180
+
181
+ w
182
+ end
183
+
184
+
185
+ #
186
+ # Make stem_porter the default stem method, just in case we
187
+ # feel like having multiple stemmers available later.
188
+ #
189
+ alias stem stem_porter
190
+
191
+ end
192
+
193
+ # Mix Stemmable into the core String class so every String gains stem_porter and its alias stem.
194
+ class String
195
+ include Stemmable
196
+ end
data/lib/stemmer.rb ADDED
@@ -0,0 +1 @@
1
+ require 'porter/stemmable'
metadata ADDED
@@ -0,0 +1,39 @@
1
+ --- !ruby/object:Gem::Specification
2
+ rubygems_version: 0.8.10
3
+ specification_version: 1
4
+ name: stemmer
5
+ version: !ruby/object:Gem::Version
6
+ version: 1.0.0
7
+ date: 2005-04-19
8
+ summary: Library of word stemming algorithms (currently Porter).
9
+ require_paths:
10
+ - lib
11
+ email: self@mattmower.com
12
+ homepage: http://rubyforge.org/projects/stemmer/
13
+ rubyforge_project:
14
+ description:
15
+ autorequire: stemmer
16
+ default_executable:
17
+ bindir: bin
18
+ has_rdoc: false
19
+ required_ruby_version: !ruby/object:Gem::Version::Requirement
20
+ requirements:
21
+ -
22
+ - ">"
23
+ - !ruby/object:Gem::Version
24
+ version: 0.0.0
25
+ version:
26
+ platform: ruby
27
+ authors:
28
+ - Matt Mower
29
+ files:
30
+ - lib/porter
31
+ - lib/stemmer.rb
32
+ - lib/porter/stemmable.rb
33
+ test_files: []
34
+ rdoc_options: []
35
+ extra_rdoc_files: []
36
+ executables: []
37
+ extensions: []
38
+ requirements: []
39
+ dependencies: []