estem 0.2.2 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Rakefile +1 -1
- data/lib/estem.rb +39 -2
- data/lib/estem.rb~ +233 -0
- metadata +2 -1
data/Rakefile
CHANGED
data/lib/estem.rb
CHANGED
@@ -1,10 +1,45 @@
|
|
1
1
|
# encoding: UTF-8
|
2
2
|
#
|
3
|
-
#
|
3
|
+
# :title: Spanish Stemming
|
4
|
+
# = Description
|
5
|
+
# This gem is for reducing Spanish words to their roots. It uses an algorithm
|
6
|
+
# based on Martin Porter's specifications.
|
7
|
+
#
|
8
|
+
# For more information, visit:
|
9
|
+
# http://snowball.tartarus.org/algorithms/spanish/stemmer.html
|
10
|
+
#
|
11
|
+
# = Descripción
|
12
|
+
# Esta gema está para reducir las palabras del Español en sus respectivas raíces,
|
13
|
+
# para ello utiliza un algoritmo basado en las especificaciones de Martin Porter.
|
14
|
+
#
|
15
|
+
# Para más información, visite:
|
16
|
+
# http://snowball.tartarus.org/algorithms/spanish/stemmer.html
|
17
|
+
#
|
18
|
+
# = License -- Licencia
|
19
|
+
# This code is provided under the terms of the {MIT License.}[http://www.opensource.org/licenses/mit-license.php]
|
20
|
+
#
|
21
|
+
# = Authors
|
22
|
+
# * Manuel A. Güílamo
|
4
23
|
#
|
5
|
-
# :title: EStem - Ruby based Porter Spanish Stemmer
|
6
24
|
|
7
25
|
module EStem
|
26
|
+
##
|
27
|
+
# :method: estem
|
28
|
+
# For more information, please see <b>String#es_stem</b> method, also <b>EStem</b>.
|
29
|
+
|
30
|
+
|
31
|
+
##
|
32
|
+
# This method stems Spanish words.
|
33
|
+
#
|
34
|
+
# "albergues".es_stem # ==> "alberg"
|
35
|
+
# "habitaciones".es_stem # ==> "habit"
|
36
|
+
# "ALbeRGues".es_stem # ==> "ALbeRG"
|
37
|
+
# "HaBiTaCiOnEs".es_stem # ==> "HaBiT"
|
38
|
+
# "Hacinamiento".es_stem # ==> "Hacin"
|
39
|
+
#
|
40
|
+
#:call-seq:
|
41
|
+
# str.es_stem => "new_str"
|
42
|
+
|
8
43
|
def es_stem
|
9
44
|
str = self.dup
|
10
45
|
return remove_accent(str) if str.length == 1
|
@@ -24,6 +59,8 @@ module EStem
|
|
24
59
|
remove_accent(str)
|
25
60
|
end
|
26
61
|
|
62
|
+
# :stopdoc:
|
63
|
+
|
27
64
|
private
|
28
65
|
|
29
66
|
def vowel?(c)
|
data/lib/estem.rb~
ADDED
@@ -0,0 +1,233 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
#
|
3
|
+
# :title: Spanish Stemming
|
4
|
+
# = Description
|
5
|
+
# This gem is for reducing Spanish words to their roots. It uses an algorithm
|
6
|
+
# based on Martin Porter's specifications.
|
7
|
+
#
|
8
|
+
# For more information, visit:
|
9
|
+
# http://snowball.tartarus.org/algorithms/spanish/stemmer.html
|
10
|
+
#
|
11
|
+
# = Descripción
|
12
|
+
# Esta gema está para reducir las palabras del Español en sus respectivas raíces,
|
13
|
+
# para ello utiliza un algoritmo basado en las especificaciones de Martin Porter.
|
14
|
+
#
|
15
|
+
# Para más información, visite:
|
16
|
+
# http://snowball.tartarus.org/algorithms/spanish/stemmer.html
|
17
|
+
#
|
18
|
+
# = License -- Licencia
|
19
|
+
# This code is provided under the terms of the {MIT License.}[http://www.opensource.org/licenses/mit-license.php]
|
20
|
+
#
|
21
|
+
# = Authors
|
22
|
+
# * Manuel A. Güílamo
|
23
|
+
#
|
24
|
+
|
25
|
+
module EStem
|
26
|
+
##
|
27
|
+
# :method: estem
|
28
|
+
# For more information, please see <b>String#es_stem</b> method, also <b>EStem</b>.
|
29
|
+
|
30
|
+
|
31
|
+
##
|
32
|
+
#This method reduces Spanish words to their root.
|
33
|
+
#
|
34
|
+
# "albergues".es_stem # ==> "alberg"
|
35
|
+
# "habitaciones".es_stem # ==> "habit"
|
36
|
+
# "ALbeRGues".es_stem # ==> "ALbeRG"
|
37
|
+
# "HaBiTaCiOnEs".es_stem # ==> "HaBiT"
|
38
|
+
# "Hacinamiento".es_stem # ==> "Hacin"
|
39
|
+
#
|
40
|
+
#:call-seq:
|
41
|
+
# str.es_stem => "new_str"
|
42
|
+
|
43
|
+
def es_stem
|
44
|
+
str = self.dup
|
45
|
+
return remove_accent(str) if str.length == 1
|
46
|
+
tmp = step0(str)
|
47
|
+
str = tmp ? tmp : str
|
48
|
+
|
49
|
+
unless tmp = step1(str)
|
50
|
+
unless tmp = step2a(str)
|
51
|
+
tmp = step2b(str)
|
52
|
+
str = tmp ? tmp : str
|
53
|
+
else
|
54
|
+
str = tmp
|
55
|
+
end
|
56
|
+
end
|
57
|
+
tmp = step3(str)
|
58
|
+
str = tmp.nil? ? str : tmp
|
59
|
+
remove_accent(str)
|
60
|
+
end
|
61
|
+
|
62
|
+
# :stopdoc:
|
63
|
+
|
64
|
+
private
|
65
|
+
|
66
|
+
def vowel?(c)
|
67
|
+
VOWEL.include?(c)
|
68
|
+
end
|
69
|
+
|
70
|
+
def consonant?(c)
|
71
|
+
CONSONANT.include?(c)
|
72
|
+
end
|
73
|
+
|
74
|
+
def remove_accent(str)
|
75
|
+
str.tr('áéíóúÁÉÍÓÚ','aeiouAEIOU')
|
76
|
+
end
|
77
|
+
|
78
|
+
def rv(str)
|
79
|
+
if consonant? str[1]
|
80
|
+
i=2
|
81
|
+
i+=1 while str[i] and consonant? str[i]
|
82
|
+
return str.nil? ? str.length-1 : i+1
|
83
|
+
end
|
84
|
+
|
85
|
+
if vowel? str[0] and vowel? str[1]
|
86
|
+
i=2
|
87
|
+
i+=1 while str[i] and vowel? str[i]
|
88
|
+
return str.nil? ? str.length-1 : i+1
|
89
|
+
end
|
90
|
+
|
91
|
+
return 3 if consonant? str[0] and vowel? str[1]
|
92
|
+
|
93
|
+
str.length - 1
|
94
|
+
end
|
95
|
+
|
96
|
+
def r(str, i=0)
|
97
|
+
i+=1 while str[i] and consonant?(str[i])
|
98
|
+
i+=1
|
99
|
+
i+=1 while str[i] and vowel? str[i]
|
100
|
+
str[i].nil? ? str.length : i+1
|
101
|
+
end
|
102
|
+
|
103
|
+
def r12(str)
|
104
|
+
r1 = r(str)
|
105
|
+
r2 = r(str,r1)
|
106
|
+
[r1,r2]
|
107
|
+
end
|
108
|
+
|
109
|
+
def step0(str)
|
110
|
+
return nil unless str =~ /(se(l[ao]s?)?|l([aeo]s?)|me|nos)$/i
|
111
|
+
|
112
|
+
suffix = $&
|
113
|
+
rv_text = str[rv(str)..-1]
|
114
|
+
|
115
|
+
case rv_text
|
116
|
+
when %r{((?<=i[éÉ]ndo|[áÁ]ndo|[áéíÁÉÍ]r)#{suffix})$}ui
|
117
|
+
str[%r{#$&$}]=''
|
118
|
+
str = remove_accent(str)
|
119
|
+
return str
|
120
|
+
when %r{((?<=iendo|ando|[aei]r)#{suffix})$}i
|
121
|
+
str[%r{#$&$}]=''
|
122
|
+
return str
|
123
|
+
end
|
124
|
+
|
125
|
+
if rv_text =~ /yendo/i and str =~ /uyendo/i
|
126
|
+
str[suffix]=''
|
127
|
+
return str
|
128
|
+
end
|
129
|
+
nil
|
130
|
+
end
|
131
|
+
|
132
|
+
#=> new_str or nil
|
133
|
+
def step1(str)
|
134
|
+
r1,r2 = r12(str)
|
135
|
+
r1_text = str[r1..-1]
|
136
|
+
r2_text = str[r2..-1]
|
137
|
+
|
138
|
+
case r2_text
|
139
|
+
when /(anzas?|ic[oa]s?|ismos?|[ai]bles?|istas?|os[oa]s?|[ai]mientos?)$/i
|
140
|
+
str[%r{#$&$}]=''
|
141
|
+
return str
|
142
|
+
when /(ic)?(ador([ae]s?)?|aci[óÓ]n|aciones|antes?|ancias?)$/ui
|
143
|
+
str[%r{#$&$}]=''
|
144
|
+
return str
|
145
|
+
when /log[íÍ]as?/ui
|
146
|
+
str[%r{#$&$}]='log'
|
147
|
+
return str
|
148
|
+
when /(uci([óÓ]n|ones))$/ui
|
149
|
+
str[%r{#$&$}]='u'
|
150
|
+
return str
|
151
|
+
when /(encias?)$/i
|
152
|
+
str[%r{#$&$}]='ente'
|
153
|
+
return str
|
154
|
+
end
|
155
|
+
|
156
|
+
if r2_text =~ /(ativ|iv|os|ic|ad)amente$/i or r1_text =~ /amente$/i
|
157
|
+
str[%r{#$&$}]=''
|
158
|
+
return str
|
159
|
+
end
|
160
|
+
|
161
|
+
case r2_text
|
162
|
+
when /((ante|[ai]ble)?mente)$/i, /((abil|i[cv])?idad(es)?)$/i, /((at)?iv[ao]s?)$/i
|
163
|
+
str[%r{#$&$}]=''
|
164
|
+
return str
|
165
|
+
end
|
166
|
+
nil
|
167
|
+
end
|
168
|
+
|
169
|
+
#=> nil or new_str
|
170
|
+
def step2a(str)
|
171
|
+
rv_pos = rv(str)
|
172
|
+
idx = str[rv_pos..-1] =~ /(y[oóÓ]|ye(ron|ndo)|y[ae][ns]?|ya(is|mos))$/ui
|
173
|
+
|
174
|
+
return nil unless idx
|
175
|
+
|
176
|
+
if 'u' == str[rv_pos+idx-1].downcase
|
177
|
+
str[%r{#$&$}] = ''
|
178
|
+
return str
|
179
|
+
end
|
180
|
+
nil
|
181
|
+
end
|
182
|
+
|
183
|
+
STEP2B_REGEXP = /(
|
184
|
+
ar([áÁ][ns]?|a(n|s|is)?|on)? | ar([éÉ]is|emos|é|É) | ar[íÍ]a(n|s|is|mos)? |
|
185
|
+
er([áÁ][sn]?|[éÉ](is)?|emos|[íÍ]a(n|s|is|mos)?)? |
|
186
|
+
ir([íÍ]a(s|n|is|mos)?|[áÁ][ns]?|emos|[éÉ]|éis)? | aba(s|n|is)? |
|
187
|
+
ad([ao]s?)? | ed | id(a|as|o|os)? | [íÍ]a(n|s|is|mos)? | [íÍ]s |
|
188
|
+
as(e[ns]?|te|eis|teis)? | [áÁ](is|bamos|semos|ramos) | a(n|ndo|mos) |
|
189
|
+
ie(ra|se|ran|sen|ron|ndo|ras|ses|rais|seis) | i(ste|steis|[óÓ]|mos|[éÉ]ramos|[éÉ]semos) |
|
190
|
+
en|es|[éÉ]is|emos
|
191
|
+
)$/xiu
|
192
|
+
|
193
|
+
def step2b(str)
|
194
|
+
rv_pos = rv(str)
|
195
|
+
|
196
|
+
if idx = str[rv_pos..-1] =~ STEP2B_REGEXP
|
197
|
+
suffix = $&
|
198
|
+
if suffix =~ /^(en|es|[éÉ]is|emos)$/ui
|
199
|
+
str[%r{#{suffix}$}]=''
|
200
|
+
str[rv_pos+idx-1]='' if str[rv_pos+idx-2] =~ /g/i and str[rv_pos+idx-1] =~ /u/i
|
201
|
+
else
|
202
|
+
str[%r{#{suffix}$}]=''
|
203
|
+
end
|
204
|
+
return str
|
205
|
+
end
|
206
|
+
nil
|
207
|
+
end
|
208
|
+
|
209
|
+
def step3(str)
|
210
|
+
rv_pos = rv(str)
|
211
|
+
rv_text = str[rv_pos..-1]
|
212
|
+
|
213
|
+
if rv_text =~ /(os|[aoáíóÁÍÓ])$/ui
|
214
|
+
str[%r{#$&$}]=''
|
215
|
+
return str
|
216
|
+
elsif idx = rv_text =~ /(u?[eéÉ])$/i
|
217
|
+
if $&[0].downcase == 'u' and str[rv_pos+idx-1].downcase == 'g'
|
218
|
+
str[%r{#$&$}]=''
|
219
|
+
else
|
220
|
+
str.chop!
|
221
|
+
end
|
222
|
+
return str
|
223
|
+
end
|
224
|
+
nil
|
225
|
+
end
|
226
|
+
|
227
|
+
# Characters that vowel? accepts: plain and acute-accented vowels plus "ü", in both cases.
VOWEL = 'aeiouáéíóúüAEIOUÁÉÍÓÚÜ'
|
228
|
+
# Characters that consonant? accepts, in both cases. The upper-case half must
# mirror the lower-case half exactly: listing A, E, I, O or U here makes
# upper-case vowels count as consonants and shifts the regions computed for
# upper-case words.
CONSONANT = "bcdfghjklmnñpqrstvwxyzBCDFGHJKLMNÑPQRSTVWXYZ"
|
229
|
+
end
|
230
|
+
|
231
|
+
class String
|
232
|
+
include EStem
|
233
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: estem
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.2
|
4
|
+
version: 0.2.3
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -21,6 +21,7 @@ files:
|
|
21
21
|
- Rakefile
|
22
22
|
- bin/es_stem.rb
|
23
23
|
- lib/estem.rb
|
24
|
+
- lib/estem.rb~
|
24
25
|
- test/diffs.txt
|
25
26
|
- test/test_estem.rb
|
26
27
|
homepage: https://github.com/MaG21/estem
|