estem 0.2.2 → 0.2.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (4) hide show
  1. data/Rakefile +1 -1
  2. data/lib/estem.rb +39 -2
  3. data/lib/estem.rb~ +233 -0
  4. metadata +2 -1
data/Rakefile CHANGED
@@ -13,6 +13,6 @@ RDoc::Task.new do |rdoc|
13
13
  rdoc.rdoc_files.include('README.rdoc',
14
14
  'lib/**/*',
15
15
  'bin/**/*')
16
- rdoc.options = ['--main', 'README.rdoc']
16
+ rdoc.options = ['--main', 'README.rdoc', '--charset', 'UTF-8']
17
17
  rdoc.rdoc_dir = 'rdoc'
18
18
  end
data/lib/estem.rb CHANGED
@@ -1,10 +1,45 @@
1
1
  # encoding: UTF-8
2
2
  #
3
- # Porter, Spanish stemmer in Ruby.
3
+ # :title: Spanish Stemming
4
+ # = Description
5
+ # This gem is for reducing Spanish words to their roots. It uses an algorithm
6
+ # based on Martin Porter's specifications.
7
+ #
8
+ # For more information, visit:
9
+ # http://snowball.tartarus.org/algorithms/spanish/stemmer.html
10
+ #
11
+ # = Descripción
12
+ # Esta gema está para reducir las palabras del Español en sus respectivas raíces,
13
+ # para ello utiliza un algoritmo basado en las especificaciones de Martin Porter
14
+ #
15
+ # Para más información, visite:
16
+ # http://snowball.tartarus.org/algorithms/spanish/stemmer.html
17
+ #
18
+ # = License -- Licencia
19
+ # This code is provided under the terms of the {MIT License.}[http://www.opensource.org/licenses/mit-license.php]
20
+ #
21
+ # = Authors
22
+ # * Manuel A. Güílamo
4
23
  #
5
- # :title: EStem - Ruby based Porter Spanish Stemmer
6
24
 
7
25
  module EStem
26
+ ##
27
+ # :method: estem
28
+ # For more information, please see <b>String#es_stem</b> method, also <b>EStem</b>.
29
+
30
+
31
+ ##
32
+ #This method stems Spanish words.
33
+ #
34
+ # "albergues".es_stem # ==> "alberg"
35
+ # "habitaciones".es_stem # ==> "habit"
36
+ # "ALbeRGues".es_stem # ==> "ALbeRG"
37
+ # "HaBiTaCiOnEs".es_stem # ==> "HaBiT"
38
+ # "Hacinamiento".es_stem # ==> "Hacin"
39
+ #
40
+ #:call-seq:
41
+ # str.es_stem => "new_str"
42
+
8
43
  def es_stem
9
44
  str = self.dup
10
45
  return remove_accent(str) if str.length == 1
@@ -24,6 +59,8 @@ module EStem
24
59
  remove_accent(str)
25
60
  end
26
61
 
62
+ # :stopdoc:
63
+
27
64
  private
28
65
 
29
66
  def vowel?(c)
data/lib/estem.rb~ ADDED
@@ -0,0 +1,233 @@
1
+ # encoding: UTF-8
2
+ #
3
+ # :title: Spanish Stemming
4
+ # = Description
5
+ # This gem is for reducing Spanish words to their roots. It uses an algorithm
6
+ # based on Martin Porter's specifications.
7
+ #
8
+ # For more information, visit:
9
+ # http://snowball.tartarus.org/algorithms/spanish/stemmer.html
10
+ #
11
+ # = Descripción
12
+ # Esta gema está para reducir las palabras del Español en sus respectivas raíces,
13
+ # para ello utiliza un algoritmo basado en las especificaciones de Martin Porter
14
+ #
15
+ # Para más información, visite:
16
+ # http://snowball.tartarus.org/algorithms/spanish/stemmer.html
17
+ #
18
+ # = License -- Licencia
19
+ # This code is provided under the terms of the {MIT License.}[http://www.opensource.org/licenses/mit-license.php]
20
+ #
21
+ # = Authors
22
+ # * Manuel A. Güílamo
23
+ #
24
+
25
+ module EStem
26
+ ##
27
+ # :method: estem
28
+ # For more information, please see <b>String#es_stem</b> method, also <b>EStem</b>.
29
+
30
+
31
+ ##
32
+ #This method reduces Spanish words to their root.
33
+ #
34
+ # "albergues".es_stem # ==> "alberg"
35
+ # "habitaciones".es_stem # ==> "habit"
36
+ # "ALbeRGues".es_stem # ==> "ALbeRG"
37
+ # "HaBiTaCiOnEs".es_stem # ==> "HaBiT"
38
+ # "Hacinamiento".es_stem # ==> "Hacin"
39
+ #
40
+ #:call-seq:
41
+ # str.es_stem => "new_str"
42
+
43
+ def es_stem
44
+ str = self.dup
45
+ return remove_accent(str) if str.length == 1
46
+ tmp = step0(str)
47
+ str = tmp ? tmp : str
48
+
49
+ unless tmp = step1(str)
50
+ unless tmp = step2a(str)
51
+ tmp = step2b(str)
52
+ str = tmp ? tmp : str
53
+ else
54
+ str = tmp
55
+ end
56
+ end
57
+ tmp = step3(str)
58
+ str = tmp.nil? ? str : tmp
59
+ remove_accent(str)
60
+ end
61
+
62
+ # :stopdoc:
63
+
64
+ private
65
+
66
+ def vowel?(c)
67
+ VOWEL.include?(c)
68
+ end
69
+
70
+ def consonant?(c)
71
+ CONSONANT.include?(c)
72
+ end
73
+
74
+ def remove_accent(str)
75
+ str.tr('áéíóúÁÉÍÓÚ','aeiouAEIOU')
76
+ end
77
+
78
+ def rv(str)
79
+ if consonant? str[1]
80
+ i=2
81
+ i+=1 while str[i] and consonant? str[i]
82
+ return str.nil? ? str.length-1 : i+1
83
+ end
84
+
85
+ if vowel? str[0] and vowel? str[1]
86
+ i=2
87
+ i+=1 while str[i] and vowel? str[i]
88
+ return str.nil? ? str.length-1 : i+1
89
+ end
90
+
91
+ return 3 if consonant? str[0] and vowel? str[1]
92
+
93
+ str.length - 1
94
+ end
95
+
96
+ def r(str, i=0)
97
+ i+=1 while str[i] and consonant?(str[i])
98
+ i+=1
99
+ i+=1 while str[i] and vowel? str[i]
100
+ str[i].nil? ? str.length : i+1
101
+ end
102
+
103
+ def r12(str)
104
+ r1 = r(str)
105
+ r2 = r(str,r1)
106
+ [r1,r2]
107
+ end
108
+
109
+ def step0(str)
110
+ return nil unless str =~ /(se(l[ao]s?)?|l([aeo]s?)|me|nos)$/i
111
+
112
+ suffix = $&
113
+ rv_text = str[rv(str)..-1]
114
+
115
+ case rv_text
116
+ when %r{((?<=i[éÉ]ndo|[áÁ]ndo|[áéíÁÉÍ]r)#{suffix})$}ui
117
+ str[%r{#$&$}]=''
118
+ str = remove_accent(str)
119
+ return str
120
+ when %r{((?<=iendo|ando|[aei]r)#{suffix})$}i
121
+ str[%r{#$&$}]=''
122
+ return str
123
+ end
124
+
125
+ if rv_text =~ /yendo/i and str =~ /uyendo/i
126
+ str[suffix]=''
127
+ return str
128
+ end
129
+ nil
130
+ end
131
+
132
+ #=> new_str or nil
133
+ def step1(str)
134
+ r1,r2 = r12(str)
135
+ r1_text = str[r1..-1]
136
+ r2_text = str[r2..-1]
137
+
138
+ case r2_text
139
+ when /(anzas?|ic[oa]s?|ismos?|[ai]bles?|istas?|os[oa]s?|[ai]mientos?)$/i
140
+ str[%r{#$&$}]=''
141
+ return str
142
+ when /(ic)?(ador([ae]s?)?|aci[óÓ]n|aciones|antes?|ancias?)$/ui
143
+ str[%r{#$&$}]=''
144
+ return str
145
+ when /log[íÍ]as?/ui
146
+ str[%r{#$&$}]='log'
147
+ return str
148
+ when /(uci([óÓ]n|ones))$/ui
149
+ str[%r{#$&$}]='u'
150
+ return str
151
+ when /(encias?)$/i
152
+ str[%r{#$&$}]='ente'
153
+ return str
154
+ end
155
+
156
+ if r2_text =~ /(ativ|iv|os|ic|ad)amente$/i or r1_text =~ /amente$/i
157
+ str[%r{#$&$}]=''
158
+ return str
159
+ end
160
+
161
+ case r2_text
162
+ when /((ante|[ai]ble)?mente)$/i, /((abil|i[cv])?idad(es)?)$/i, /((at)?iv[ao]s?)$/i
163
+ str[%r{#$&$}]=''
164
+ return str
165
+ end
166
+ nil
167
+ end
168
+
169
+ #=> nil or new_str
170
+ def step2a(str)
171
+ rv_pos = rv(str)
172
+ idx = str[rv_pos..-1] =~ /(y[oóÓ]|ye(ron|ndo)|y[ae][ns]?|ya(is|mos))$/ui
173
+
174
+ return nil unless idx
175
+
176
+ if 'u' == str[rv_pos+idx-1].downcase
177
+ str[%r{#$&$}] = ''
178
+ return str
179
+ end
180
+ nil
181
+ end
182
+
183
+ STEP2B_REGEXP = /(
184
+ ar([áÁ][ns]?|a(n|s|is)?|on)? | ar([éÉ]is|emos|é|É) | ar[íÍ]a(n|s|is|mos)? |
185
+ er([áÁ][sn]?|[éÉ](is)?|emos|[íÍ]a(n|s|is|mos)?)? |
186
+ ir([íÍ]a(s|n|is|mos)?|[áÁ][ns]?|emos|[éÉ]|éis)? | aba(s|n|is)? |
187
+ ad([ao]s?)? | ed | id(a|as|o|os)? | [íÍ]a(n|s|is|mos)? | [íÍ]s |
188
+ as(e[ns]?|te|eis|teis)? | [áÁ](is|bamos|semos|ramos) | a(n|ndo|mos) |
189
+ ie(ra|se|ran|sen|ron|ndo|ras|ses|rais|seis) | i(ste|steis|[óÓ]|mos|[éÉ]ramos|[éÉ]semos) |
190
+ en|es|[éÉ]is|emos
191
+ )$/xiu
192
+
193
+ def step2b(str)
194
+ rv_pos = rv(str)
195
+
196
+ if idx = str[rv_pos..-1] =~ STEP2B_REGEXP
197
+ suffix = $&
198
+ if suffix =~ /^(en|es|[éÉ]is|emos)$/ui
199
+ str[%r{#{suffix}$}]=''
200
+ str[rv_pos+idx-1]='' if str[rv_pos+idx-2] =~ /g/i and str[rv_pos+idx-1] =~ /u/i
201
+ else
202
+ str[%r{#{suffix}$}]=''
203
+ end
204
+ return str
205
+ end
206
+ nil
207
+ end
208
+
209
+ def step3(str)
210
+ rv_pos = rv(str)
211
+ rv_text = str[rv_pos..-1]
212
+
213
+ if rv_text =~ /(os|[aoáíóÁÍÓ])$/ui
214
+ str[%r{#$&$}]=''
215
+ return str
216
+ elsif idx = rv_text =~ /(u?[eéÉ])$/i
217
+ if $&[0].downcase == 'u' and str[rv_pos+idx-1].downcase == 'g'
218
+ str[%r{#$&$}]=''
219
+ else
220
+ str.chop!
221
+ end
222
+ return str
223
+ end
224
+ nil
225
+ end
226
+
227
+ VOWEL = 'aeiouáéíóúüAEIOUÁÉÍÓÚÜ'
228
+ CONSONANT = "bcdfghjklmnñpqrstvwxyzABCDEFGHIJKLMNÑOPQRSTUVWXYZ"
229
+ end
230
+
231
+ class String
232
+ include EStem
233
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: estem
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.2
4
+ version: 0.2.3
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -21,6 +21,7 @@ files:
21
21
  - Rakefile
22
22
  - bin/es_stem.rb
23
23
  - lib/estem.rb
24
+ - lib/estem.rb~
24
25
  - test/diffs.txt
25
26
  - test/test_estem.rb
26
27
  homepage: https://github.com/MaG21/estem