stemmer 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3) hide show
  1. data/lib/porter/stemmable.rb +196 -0
  2. data/lib/stemmer.rb +1 -0
  3. metadata +39 -0
@@ -0,0 +1,196 @@
1
+ #! /local/ruby/bin/ruby
2
+ #
3
+ # $Id: stemmable.rb,v 1.2 2003/02/01 02:07:30 condit Exp $
4
+ #
5
+ # Usage: after requiring this file, call "running".stem (an alias of stem_porter).
6
+ #
7
+
8
#
# Porter stemmer in Ruby.
#
# This is the Porter stemming algorithm, ported to Ruby from the
# version coded up in Perl.  It's easy to follow against the rules
# in the original paper in:
#
#   Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
#   no. 3, pp 130-137,
#
# See also http://www.tartarus.org/~martin/PorterStemmer
#
# Send comments to raypereda@hotmail.com
#
# The module is meant to be mixed into String (or anything that responds
# to +dup+ and +to_str+).  All patterns use lowercase letters only, so
# callers are expected to pass lowercase single words; the patterns use
# Ruby's line anchors (^ and $), which is only meaningful for one-line input.
#
module Stemmable

  # Step 2: suffix => replacement, applied when the remaining stem matches MGR0.
  STEP_2_LIST = {
    'ational' => 'ate', 'tional' => 'tion', 'enci' => 'ence', 'anci' => 'ance',
    'izer' => 'ize', 'bli' => 'ble',
    'alli' => 'al', 'entli' => 'ent', 'eli' => 'e', 'ousli' => 'ous',
    'ization' => 'ize', 'ation' => 'ate',
    'ator' => 'ate', 'alism' => 'al', 'iveness' => 'ive', 'fulness' => 'ful',
    'ousness' => 'ous', 'aliti' => 'al',
    'iviti' => 'ive', 'biliti' => 'ble', 'logi' => 'log'
  }.freeze

  # Step 3: suffix => replacement, applied when the remaining stem matches MGR0.
  STEP_3_LIST = {
    'icate' => 'ic', 'ative' => '', 'alize' => 'al', 'iciti' => 'ic',
    'ical' => 'ic', 'ful' => '', 'ness' => ''
  }.freeze

  # Suffixes recognised by step 2 (the keys of STEP_2_LIST).
  SUFFIX_1_REGEXP = /(
                    ational  |
                    tional   |
                    enci     |
                    anci     |
                    izer     |
                    bli      |
                    alli     |
                    entli    |
                    eli      |
                    ousli    |
                    ization  |
                    ation    |
                    ator     |
                    alism    |
                    iveness  |
                    fulness  |
                    ousness  |
                    aliti    |
                    iviti    |
                    biliti   |
                    logi)$/x

  # Suffixes removed outright by step 4 when the stem matches MGR1.
  SUFFIX_2_REGEXP = /(
                      al       |
                      ance     |
                      ence     |
                      er       |
                      ic       |
                      able     |
                      ible     |
                      ant      |
                      ement    |
                      ment     |
                      ent      |
                      ou       |
                      ism      |
                      ate      |
                      iti      |
                      ous      |
                      ive      |
                      ize)$/x

  C = "[^aeiou]".freeze               # consonant
  V = "[aeiouy]".freeze               # vowel
  CC = "#{C}(?>[^aeiouy]*)".freeze    # consonant sequence
  VV = "#{V}(?>[aeiou]*)".freeze      # vowel sequence

  MGR0 = /^(#{CC})?#{VV}#{CC}/o                # [cc]vvcc... is m>0
  MEQ1 = /^(#{CC})?#{VV}#{CC}(#{VV})?$/o       # [cc]vvcc[vv] is m=1
  MGR1 = /^(#{CC})?#{VV}#{CC}#{VV}#{CC}/o      # [cc]vvccvvcc... is m>1
  VOWEL_IN_STEM = /^(#{CC})?#{V}/o             # vowel in stem

  # consonant-sequence, vowel, then a final consonant other than w, x or y;
  # shared by step 1b (restore a trailing "e") and step 5 (keep a trailing "e").
  CVC_ENDING = /^#{CC}#{V}[^aeiouwxy]$/o

  #
  # Returns the Porter stem of the receiver as a new String; the receiver
  # itself is never modified.  Inputs shorter than three characters are
  # returned (as a copy) unchanged.
  #
  def stem_porter

    # make a copy of the given object and convert it to a string.
    w = dup.to_str

    return w if w.length < 3

    # now map initial y to Y so that the patterns never treat it as vowel.
    # Remember whether we did so: a word that already started with a capital
    # "Y" must keep it, just like any other capitalised initial letter.
    mapped_initial_y = (w[0] == ?y)
    w[0] = 'Y' if mapped_initial_y

    # Step 1a
    if w =~ /(ss|i)es$/
      w = $` + $1
    elsif w =~ /([^s])s$/
      w = $` + $1
    end

    # Step 1b
    if w =~ /eed$/
      w.chop! if $` =~ MGR0
    elsif w =~ /(ed|ing)$/
      stem = $`
      if stem =~ VOWEL_IN_STEM
        w = stem
        case w
        when /(at|bl|iz)$/             then w << "e"
        when /([^aeiouylsz])\1$/       then w.chop!  # undouble the final consonant
        when CVC_ENDING                then w << "e"
        end
      end
    end

    # Step 1c: terminal y becomes i when the stem contains a vowel
    if w =~ /y$/
      stem = $`
      w = stem + "i" if stem =~ VOWEL_IN_STEM
    end

    # Step 2
    if w =~ SUFFIX_1_REGEXP
      stem = $`
      suffix = $1
      w = stem + STEP_2_LIST[suffix] if stem =~ MGR0
    end

    # Step 3
    if w =~ /(icate|ative|alize|iciti|ical|ful|ness)$/
      stem = $`
      suffix = $1
      w = stem + STEP_3_LIST[suffix] if stem =~ MGR0
    end

    # Step 4
    if w =~ SUFFIX_2_REGEXP
      stem = $`
      w = stem if stem =~ MGR1
    elsif w =~ /(s|t)(ion)$/
      # "ion" is only dropped after s or t, and that letter stays in the stem
      stem = $` + $1
      w = stem if stem =~ MGR1
    end

    # Step 5
    if w =~ /e$/
      stem = $`
      if (stem =~ MGR1) || (stem =~ MEQ1 && stem !~ CVC_ENDING)
        w = stem
      end
    end

    if w =~ /ll$/ && w =~ MGR1
      w.chop!
    end

    # and turn the initial Y back to y, but only if we put it there
    w[0] = 'y' if mapped_initial_y

    w
  end


  #
  # make the stem_porter the default stem method, just in case we
  # feel like having multiple stemmers available later.
  #
  alias stem stem_porter

end
192
+
193
# Mix Stemmable into String so that every String gains its instance methods.
String.class_eval do
  include Stemmable
end
data/lib/stemmer.rb ADDED
@@ -0,0 +1 @@
1
+ require 'porter/stemmable'
metadata ADDED
@@ -0,0 +1,39 @@
1
+ --- !ruby/object:Gem::Specification
2
+ rubygems_version: 0.8.10
3
+ specification_version: 1
4
+ name: stemmer
5
+ version: !ruby/object:Gem::Version
6
+ version: 1.0.0
7
+ date: 2005-04-19
8
+ summary: Library of word stemming algorithms (currently Porter).
9
+ require_paths:
10
+ - lib
11
+ email: self@mattmower.com
12
+ homepage: http://rubyforge.org/projects/stemmer/
13
+ rubyforge_project:
14
+ description:
15
+ autorequire: stemmer
16
+ default_executable:
17
+ bindir: bin
18
+ has_rdoc: false
19
+ required_ruby_version: !ruby/object:Gem::Version::Requirement
20
+ requirements:
21
+ -
22
+ - ">"
23
+ - !ruby/object:Gem::Version
24
+ version: 0.0.0
25
+ version:
26
+ platform: ruby
27
+ authors:
28
+ - Matt Mower
29
+ files:
30
+ - lib/porter
31
+ - lib/stemmer.rb
32
+ - lib/porter/stemmable.rb
33
+ test_files: []
34
+ rdoc_options: []
35
+ extra_rdoc_files: []
36
+ executables: []
37
+ extensions: []
38
+ requirements: []
39
+ dependencies: []