estem 0.2.2 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (4) hide show
  1. data/Rakefile +1 -1
  2. data/lib/estem.rb +39 -2
  3. data/lib/estem.rb~ +233 -0
  4. metadata +2 -1
data/Rakefile CHANGED
@@ -13,6 +13,6 @@ RDoc::Task.new do |rdoc|
13
13
  rdoc.rdoc_files.include('README.rdoc',
14
14
  'lib/**/*',
15
15
  'bin/**/*')
16
- rdoc.options = ['--main', 'README.rdoc']
16
+ rdoc.options = ['--main', 'README.rdoc', '--charset', 'UTF-8']
17
17
  rdoc.rdoc_dir = 'rdoc'
18
18
  end
data/lib/estem.rb CHANGED
@@ -1,10 +1,45 @@
1
1
  # encoding: UTF-8
2
2
  #
3
- # Porter, Spanish stemmer in Ruby.
3
+ # :title: Spanish Stemming
4
+ # = Description
5
+ # This gem is for reducing Spanish words to their roots. It uses an algorithm
6
+ # based on Martin Porter's specifications.
7
+ #
8
+ # For more information, visit:
9
+ # http://snowball.tartarus.org/algorithms/spanish/stemmer.html
10
+ #
11
+ # = Descripción
12
+ # Esta gema está para reducir las palabras del Español en sus respectivas raíces,
13
+ # para ello utiliza un algoritmo basado en las especificaciones de Martin Porter
14
+ #
15
+ # Para más información, visite:
16
+ # http://snowball.tartarus.org/algorithms/spanish/stemmer.html
17
+ #
18
+ # = License -- Licencia
19
+ # This code is provided under the terms of the {MIT License.}[http://www.opensource.org/licenses/mit-license.php]
20
+ #
21
+ # = Authors
22
+ # * Manuel A. Güílamo
4
23
  #
5
- # :title: EStem - Ruby based Porter Spanish Stemmer
6
24
 
7
25
  module EStem
26
+ ##
27
+ # :method: estem
28
+ # For more information, please see <b>String#es_stem</b> method, also <b>EStem</b>.
29
+
30
+
31
+ ##
32
+ # This method stems Spanish words.
33
+ #
34
+ # "albergues".es_stem # ==> "alberg"
35
+ # "habitaciones".es_stem # ==> "habit"
36
+ # "ALbeRGues".es_stem # ==> "ALbeRG"
37
+ # "HaBiTaCiOnEs".es_stem # ==> "HaBiT"
38
+ # "Hacinamiento".es_stem # ==> "Hacin"
39
+ #
40
+ #:call-seq:
41
+ # str.es_stem => "new_str"
42
+
8
43
  def es_stem
9
44
  str = self.dup
10
45
  return remove_accent(str) if str.length == 1
@@ -24,6 +59,8 @@ module EStem
24
59
  remove_accent(str)
25
60
  end
26
61
 
62
+ # :stopdoc:
63
+
27
64
  private
28
65
 
29
66
  def vowel?(c)
data/lib/estem.rb~ ADDED
@@ -0,0 +1,233 @@
1
+ # encoding: UTF-8
2
+ #
3
+ # :title: Spanish Stemming
4
+ # = Description
5
+ # This gem is for reducing Spanish words to their roots. It uses an algorithm
6
+ # based on Martin Porter's specifications.
7
+ #
8
+ # For more information, visit:
9
+ # http://snowball.tartarus.org/algorithms/spanish/stemmer.html
10
+ #
11
+ # = Descripción
12
+ # Esta gema está para reducir las palabras del Español en sus respectivas raíces,
13
+ # para ello utiliza un algoritmo basado en las especificaciones de Martin Porter
14
+ #
15
+ # Para más información, visite:
16
+ # http://snowball.tartarus.org/algorithms/spanish/stemmer.html
17
+ #
18
+ # = License -- Licencia
19
+ # This code is provided under the terms of the {MIT License.}[http://www.opensource.org/licenses/mit-license.php]
20
+ #
21
+ # = Authors
22
+ # * Manuel A. Güílamo
23
+ #
24
+
25
module EStem
  ##
  # :method: estem
  # For more information, please see <b>String#es_stem</b> method, also <b>EStem</b>.


  ##
  # Reduces a Spanish word to its root.
  #
  # The receiver is never modified: the work is done on a copy, and the
  # acute accents (á é í ó ú, either case) are stripped from the result.
  # The letter case of the input is preserved. Strings of zero or one
  # character are returned as-is (minus the accent).
  #
  #    "albergues".es_stem       # ==> "alberg"
  #    "habitaciones".es_stem    # ==> "habit"
  #    "ALbeRGues".es_stem       # ==> "ALbeRG"
  #    "HaBiTaCiOnEs".es_stem    # ==> "HaBiT"
  #    "Hacinamiento".es_stem    # ==> "Hacin"
  #
  # :call-seq:
  #   str.es_stem    => "new_str"

  def es_stem
    str = dup
    # Words shorter than two characters have no regions to work on; the
    # empty string used to reach rv() and raise TypeError.
    return remove_accent(str) if str.length <= 1

    # Every step returns the (possibly mutated) string, or nil when it did
    # not apply. Step 2a is only tried when step 1 did nothing, and step 2b
    # only when step 2a did nothing either.
    str = step0(str) || str
    str = step1(str) || step2a(str) || step2b(str) || str
    str = step3(str) || str
    remove_accent(str)
  end

  # :stopdoc:

  private

  # True when +c+ is one of the characters listed in VOWEL.
  # +nil+ (an index past the end of the word) is never a vowel.
  def vowel?(c)
    c ? VOWEL.include?(c) : false
  end

  # True when +c+ is one of the characters listed in CONSONANT.
  # +nil+ (an index past the end of the word) is never a consonant.
  def consonant?(c)
    c ? CONSONANT.include?(c) : false
  end

  # Returns a copy of +str+ with acute-accented vowels replaced by the
  # plain vowel of the same case.
  def remove_accent(str)
    str.tr('áéíóúÁÉÍÓÚ', 'aeiouAEIOU')
  end

  # Returns the index at which the RV region of +str+ starts.
  #
  # * second letter is a consonant: RV starts after the next vowel;
  # * first two letters are vowels: RV starts after the next consonant;
  # * consonant followed by vowel: RV starts at index 3;
  # * anything else: the index of the last character.
  #
  # When the scan runs off the end of the word the region is empty and
  # +str.length+ is returned.
  def rv(str)
    if consonant?(str[1])
      i = 2
      i += 1 while str[i] && consonant?(str[i])
      return str[i].nil? ? str.length : i + 1
    end

    if vowel?(str[0]) && vowel?(str[1])
      i = 2
      i += 1 while str[i] && vowel?(str[i])
      return str[i].nil? ? str.length : i + 1
    end

    return 3 if consonant?(str[0]) && vowel?(str[1])

    str.length - 1
  end

  # Scanning from index +i+: skips consonants, steps over one character,
  # skips vowels, and returns the index just after the character it
  # stopped on, or +str.length+ when the scan ran past the end.
  # Used to locate the R1 and R2 regions.
  def r(str, i=0)
    i += 1 while str[i] && consonant?(str[i])
    i += 1
    i += 1 while str[i] && vowel?(str[i])
    str[i].nil? ? str.length : i + 1
  end

  # Returns <tt>[r1, r2]</tt>: the start index of R1, and the start index
  # of R2 (the same scan restarted from R1).
  def r12(str)
    r1 = r(str)
    r2 = r(str, r1)
    [r1, r2]
  end

  # Step 0: attached pronoun.
  # Removes a trailing pronoun (me, se, sela, selo, selas, selos, la, le,
  # lo, las, les, los, nos) when, inside RV, it follows a gerund or
  # infinitive ending; the accent of that ending is dropped as well.
  #=> new_str or nil
  def step0(str)
    return nil unless str =~ /(se(l[ao]s?)?|l([aeo]s?)|me|nos)$/i

    suffix = $&
    rv_text = str[rv(str)..-1]

    case rv_text
    when %r{((?<=i[éÉ]ndo|[áÁ]ndo|[áéíÁÉÍ]r)#{suffix})$}ui
      str[%r{#{$&}$}] = ''
      return remove_accent(str)
    when %r{((?<=iendo|ando|[aei]r)#{suffix})$}i
      str[%r{#{$&}$}] = ''
      return str
    end

    if rv_text =~ /yendo/i && str =~ /uyendo/i
      # Strip the pronoun at the END of the word; str[suffix]='' would
      # delete its first occurrence, which may sit earlier in the word.
      str[%r{#{suffix}$}] = ''
      return str
    end
    nil
  end

  # Step 1: standard suffix removal, driven by the R1 and R2 regions.
  #=> new_str or nil
  def step1(str)
    r1, r2 = r12(str)
    r1_text = str[r1..-1]
    r2_text = str[r2..-1]

    case r2_text
    when /(anzas?|ic[oa]s?|ismos?|[ai]bles?|istas?|os[oa]s?|[ai]mientos?)$/i,
         /(ic)?(ador([ae]s?)?|aci[óÓ]n|aciones|antes?|ancias?)$/ui
      str[%r{#{$&}$}] = ''
      return str
    when /log[íÍ]as?$/ui
      # Anchored at the end: an unanchored match in the middle of R2 made
      # the anchored replacement below raise IndexError.
      str[%r{#{$&}$}] = 'log'
      return str
    when /(uci([óÓ]n|ones))$/ui
      str[%r{#{$&}$}] = 'u'
      return str
    when /(encias?)$/i
      str[%r{#{$&}$}] = 'ente'
      return str
    end

    # $& holds whichever of the two matches succeeded.
    if r2_text =~ /(ativ|iv|os|ic|ad)amente$/i || r1_text =~ /amente$/i
      str[%r{#{$&}$}] = ''
      return str
    end

    case r2_text
    when /((ante|[ai]ble)?mente)$/i, /((abil|i[cv])?idad(es)?)$/i, /((at)?iv[ao]s?)$/i
      str[%r{#{$&}$}] = ''
      return str
    end
    nil
  end

  # Step 2a: verb suffixes beginning with "y".
  # The suffix is removed only when the character right before it is a "u".
  #=> nil or new_str
  def step2a(str)
    rv_pos = rv(str)
    idx = str[rv_pos..-1] =~ /(y[oóÓ]|ye(ron|ndo)|y[ae][ns]?|ya(is|mos))$/ui
    return nil unless idx
    return nil unless str[rv_pos + idx - 1].downcase == 'u'

    str[%r{#{$&}$}] = ''
    str
  end

  STEP2B_REGEXP = /(
    ar([áÁ][ns]?|a(n|s|is)?|on)? | ar([éÉ]is|emos|é|É) | ar[íÍ]a(n|s|is|mos)? |
    er([áÁ][sn]?|[éÉ](is)?|emos|[íÍ]a(n|s|is|mos)?)? |
    ir([íÍ]a(s|n|is|mos)?|[áÁ][ns]?|emos|[éÉ]|éis)? | aba(s|n|is)? |
    ad([ao]s?)? | ed | id(a|as|o|os)? | [íÍ]a(n|s|is|mos)? | [íÍ]s |
    as(e[ns]?|te|eis|teis)? | [áÁ](is|bamos|semos|ramos) | a(n|ndo|mos) |
    ie(ra|se|ran|sen|ron|ndo|ras|ses|rais|seis) | i(ste|steis|[óÓ]|mos|[éÉ]ramos|[éÉ]semos) |
    en|es|[éÉ]is|emos
  )$/xiu

  # Step 2b: other verb suffixes found in RV.
  # For en, es, éis and emos, a "u" left behind a "g" ("gu") is removed too.
  #=> nil or new_str
  def step2b(str)
    rv_pos = rv(str)
    idx = str[rv_pos..-1] =~ STEP2B_REGEXP
    return nil unless idx

    suffix = $&
    drops_gu = suffix =~ /^(en|es|[éÉ]is|emos)$/ui
    str[%r{#{suffix}$}] = ''
    if drops_gu
      # After the removal the last character sits just before the old suffix.
      u_pos = rv_pos + idx - 1
      str[u_pos] = '' if str[u_pos - 1] =~ /g/i && str[u_pos] =~ /u/i
    end
    str
  end

  # Step 3: residual suffix.
  # Removes a final os, a, o, á, í or ó found in RV; otherwise removes a
  # final e / é, together with a preceding "u" when that "u" follows a "g".
  #=> nil or new_str
  def step3(str)
    rv_pos = rv(str)
    rv_text = str[rv_pos..-1]

    if rv_text =~ /(os|[aoáíóÁÍÓ])$/ui
      str[%r{#{$&}$}] = ''
      return str
    end

    idx = rv_text =~ /(u?[eéÉ])$/i
    return nil unless idx

    if $&[0].downcase == 'u' && str[rv_pos + idx - 1].downcase == 'g'
      str[%r{#{$&}$}] = ''
    else
      str.chop!
    end
    str
  end

  # Letters treated as vowels, both cases.
  VOWEL = 'aeiouáéíóúüAEIOUÁÉÍÓÚÜ'.freeze
  # Letters treated as consonants, both cases. The upper-case half must
  # not contain A, E, I, O or U, otherwise upper-case words are
  # mis-segmented.
  CONSONANT = "bcdfghjklmnñpqrstvwxyzBCDFGHJKLMNÑPQRSTVWXYZ".freeze
end
230
+
231
# Mix the stemmer into String so that every string instance responds
# to +es_stem+.
String.class_eval do
  include EStem
end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: estem
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.2
4
+ version: 0.2.3
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -21,6 +21,7 @@ files:
21
21
  - Rakefile
22
22
  - bin/es_stem.rb
23
23
  - lib/estem.rb
24
+ - lib/estem.rb~
24
25
  - test/diffs.txt
25
26
  - test/test_estem.rb
26
27
  homepage: https://github.com/MaG21/estem