estem 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Rakefile +18 -0
- data/bin/es_stem.rb +178 -0
- data/lib/estem.rb +196 -0
- data/test/diffs.txt +28390 -0
- data/test/test_estem.rb +23 -0
- metadata +54 -0
data/Rakefile
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
require 'rake/testtask'

# Test task: puts the `test` directory on the load path used by the test run.
Rake::TestTask.new { |test_task| test_task.libs << 'test' }

# Running plain `rake` runs the test task.
desc "Run tests"
task :default => :test

# Documentation task: builds RDoc output into `rdoc/`, with README.rdoc as the
# main page and everything under lib/ and bin/ as input.
require 'rdoc/task'
RDoc::Task.new do |doc_task|
  doc_task.rdoc_files.include('README.rdoc', 'lib/**/*', 'bin/**/*')
  doc_task.options = ['--main', 'README.rdoc']
  doc_task.rdoc_dir = 'rdoc'
end
|
data/bin/es_stem.rb
ADDED
@@ -0,0 +1,178 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# encoding: UTF-8
|
3
|
+
|
4
|
+
# Copyright (c) 2012 Manuel A. Güílamo
|
5
|
+
#
|
6
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
7
|
+
# of this software and associated documentation files (the "Software"), to deal
|
8
|
+
# in the Software without restriction, including without limitation the rights
|
9
|
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
10
|
+
# copies of the Software, and to permit persons to whom the Software is
|
11
|
+
# furnished to do so, subject to the following conditions:
|
12
|
+
#
|
13
|
+
# The above copyright notice and this permission notice shall be included in
|
14
|
+
# all copies or substantial portions of the Software.
|
15
|
+
#
|
16
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
17
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
18
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
19
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
20
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
21
|
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
22
|
+
# SOFTWARE.
|
23
|
+
|
24
|
+
require 'estem.rb'
|
25
|
+
require 'getoptlong'
|
26
|
+
require 'iconv'
|
27
|
+
|
28
|
+
$version = "0.1.9"
|
29
|
+
|
30
|
+
# Prints the help text embedded after __END__ (exposed by Ruby as DATA).
#
# error - when truthy the text goes to $stderr, otherwise to $stdout.
#
# DATA is consumed by the read, so only the first call prints the text.
def usage(error=false)
  stream = if error then $stderr else $stdout end
  stream.puts(DATA.read)
end
|
34
|
+
|
35
|
+
# Command-line front end for the stemmer.
#
# Reads words from --file (or standard input when no file is given), stems
# every distinct word once with String#es_stem and prints one "word => stem"
# line per word. --in-enc names the input encoding and --out-enc the encoding
# used for the printed lines.
#
# Every failure is reported on $stderr and ends the process with exit status 1;
# --help and --version exit with status 0.
opts = GetoptLong.new(
  ['--help',    '-h', GetoptLong::NO_ARGUMENT],
  ['--version', '-v', GetoptLong::NO_ARGUMENT],
  ['--file',    '-f', GetoptLong::REQUIRED_ARGUMENT],
  ['--in-enc',  '-i', GetoptLong::REQUIRED_ARGUMENT],
  ['--out-enc', '-o', GetoptLong::REQUIRED_ARGUMENT])

# GetoptLong's own diagnostics are silenced; the rescue clauses below print
# the bilingual messages instead.
opts.quiet = true

filename = nil
ienc = nil
oenc = nil

begin
  opts.each do |op, arg|
    case op
    when '--help'
      usage
      exit
    when '--version'
      puts "EStem\nSpanish stemmer // lexemador\nVer: #{$version}"
      exit
    when '--file'
      filename = arg
    when '--in-enc'
      ienc = arg
    when '--out-enc'
      oenc = arg
    end
  end
rescue GetoptLong::MissingArgument
  $stderr.puts 'Option requires an argument // La opción requiere un argumento'
  exit 1
rescue GetoptLong::InvalidOption
  $stderr.puts 'Unknown option // Opción desconocida.'
  usage(true)
  exit 1
rescue
  $stderr.puts $!
  exit 1
end

# word => stem, filled in first-seen order.
hsh = {}

# Splits a line on every run of characters that are not Spanish letters and
# records the stem of each word not seen before. The split yields an empty
# first token when the line starts with a separator; those are skipped.
add_stems = lambda do |line|
  line.split(/[^a-záéíóúüñÁÉÍÓÚÜÑ]+/ui).each do |word|
    next if word.empty?
    hsh[word] ||= word.es_stem
  end
end

if filename
  begin
    # Transcode to UTF-8 while reading when another input encoding is given.
    mode = (ienc && ienc != 'UTF-8') ? "r:#{ienc}:UTF-8" : 'r:UTF-8'
    file = File.open(filename, mode)
  rescue
    $stderr.puts $!
    exit 1
  end

  begin
    file.each_line { |line| add_stems.call(line) }
  rescue
    $stderr.puts $!
    exit 1
  ensure
    file.close
  end
else
  $stdin.each_line do |line|
    if ienc
      # NOTE(review): Iconv is not part of the standard library on recent
      # Rubies; String#encode would be the replacement. Confirm target versions.
      line = Iconv.conv('UTF-8', ienc, line)
    else
      # Just in case the terminal messes with the encoding name.
      line.force_encoding('UTF-8')
    end

    begin
      add_stems.call(line)
    rescue Encoding::CompatibilityError
      if ienc
        msg = "incompatible encoding, please use option " +
              "`--in-enc' correctly. //\n" +
              "codificación incompatible, por favor use la " +
              "opción `--in-enc' correctamente."
      else
        msg = "incompatible encoding, please use option `--in-enc'." +
              " //\ncodificación incompatible, por favor use la " +
              "opción `--in-enc'."
      end

      msg = Iconv.conv(oenc, 'UTF-8', msg) if oenc

      $stderr.puts msg
      exit 1
    rescue
      $stderr.puts $!
      exit 1
    end
  end
end

if oenc
  begin
    hsh.each_pair do |k, v|
      puts Iconv.conv(oenc, 'UTF-8', "#{k} => #{v}")
    end
  rescue
    $stderr.puts $!
    exit 1
  end
else
  hsh.each_pair { |k, v| puts "#{k} => #{v}" }
end
|
154
|
+
|
155
|
+
__END__
|
156
|
+
Use: es_stem [OPTION]...
|
157
|
+
|
158
|
+
Options:
|
159
|
+
--help, -h display this help and exit. // Presenta esta ayuda y termina.
|
160
|
+
--version, -v output version information and exit //
|
161
|
+
Muestra la versión y termina.
|
162
|
+
--file, -f file of words. // fichero de palabras.
|
163
|
+
--in-enc, -i encoding of the file. // codificación del fichero.
|
164
|
+
--out-enc, -o output encoding // codificación de salida.
|
165
|
+
|
166
|
+
By default UTF-8 is used as input encoding, and if no file is specified,
|
167
|
+
standard input will be used instead.
|
168
|
+
|
169
|
+
You should set the option `--out-enc' if you are experiencing problems
|
170
|
+
visualizing the output text.
|
171
|
+
|
172
|
+
//
|
173
|
+
|
174
|
+
Por defecto se usará UTF-8 como codificación de entrada, si no se especifica un
|
175
|
+
fichero, la entrada estándar se usará en su lugar.
|
176
|
+
|
177
|
+
Debería establecer la opción `--out-enc' si está experimentando problemas para
|
178
|
+
visualizar el texto de salida.
|
data/lib/estem.rb
ADDED
@@ -0,0 +1,196 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
#
|
3
|
+
# Porter, Spanish stemmer in Ruby.
|
4
|
+
#
|
5
|
+
# :title: EStem - Ruby based Porter Spanish Stemmer
|
6
|
+
|
7
|
+
# Porter Spanish stemmer, written as a mixin.
#
# The including class must behave like String: the methods here index,
# slice and mutate a copy of +self+. The only public method is #es_stem.
module EStem
  # Letters treated as vowels (accented forms and ü included).
  VOWEL = 'aeiouáéíóúüAEIOUÁÉÍÓÚÜ'.freeze

  # Letters treated as consonants. The uppercase half must not contain
  # A, E, I, O or U, otherwise uppercase vowels test as consonants.
  CONSONANT = 'bcdfghjklmnñpqrstvwxyzBCDFGHJKLMNÑPQRSTVWXYZ'.freeze

  # Verb suffixes removed by step2b when they end the RV region.
  STEP2B_REGEXP = /(
    ar([áÁ][ns]?|a(n|s|is)?|on)? | ar([éÉ]is|emos|é|É) | ar[íÍ]a(n|s|is|mos)? |
    er([áÁ][sn]?|[éÉ](is)?|emos|[íÍ]a(n|s|is|mos)?)? |
    ir([íÍ]a(s|n|is|mos)?|[áÁ][ns]?|emos|[éÉ]|éis)? | aba(s|n|is)? |
    ad([ao]s?)? | ed | id(a|as|o|os)? | [íÍ]a(n|s|is|mos)? | [íÍ]s |
    as(e[ns]?|te|eis|teis)? | [áÁ](is|bamos|semos|ramos) | a(n|ndo|mos) |
    ie(ra|se|ran|sen|ron|ndo|ras|ses|rais|seis) | i(ste|steis|[óÓ]|mos|[éÉ]ramos|[éÉ]semos) |
    en|es|[éÉ]is|emos
  )$/xiu

  # Returns the stem of the receiver as a new string; the receiver itself is
  # left untouched. Acute accents are stripped from the result.
  #
  # Empty and one-character strings are returned with accents removed and no
  # further processing.
  def es_stem
    str = self.dup
    return remove_accent(str) if str.length <= 1

    # Step 0 (attached pronouns) always runs.
    str = step0(str) || str
    # Steps 1, 2a and 2b are tried in order; the first one that removes a
    # suffix wins and the remaining ones are skipped.
    str = step1(str) || step2a(str) || step2b(str) || str
    # Step 3 (residual suffix) always runs.
    str = step3(str) || str

    remove_accent(str)
  end

  private

  # True when +c+ is one of the characters in VOWEL; false for nil.
  def vowel?(c)
    !c.nil? && VOWEL.include?(c)
  end

  # True when +c+ is one of the characters in CONSONANT; false for nil.
  def consonant?(c)
    !c.nil? && CONSONANT.include?(c)
  end

  # Returns a copy of +str+ with acute-accented vowels replaced by plain ones.
  def remove_accent(str)
    str.tr('áéíóúÁÉÍÓÚ','aeiouAEIOU')
  end

  # Returns the index at which the RV region of +str+ starts.
  #
  # * second letter is a consonant: RV starts after the next non-consonant;
  # * first two letters are vowels: RV starts after the next non-vowel;
  # * consonant followed by vowel: RV starts after the third letter.
  #
  # When the position looked for does not exist the region is empty, i.e. the
  # returned index is the length of the word.
  def rv(str)
    if consonant? str[1]
      i = 2
      i += 1 while str[i] && consonant?(str[i])
      return str[i] ? i + 1 : str.length
    end

    if vowel?(str[0]) && vowel?(str[1])
      i = 2
      i += 1 while str[i] && vowel?(str[i])
      return str[i] ? i + 1 : str.length
    end

    # Clamp so that two-letter words get an empty region instead of an
    # out-of-range start index.
    return [3, str.length].min if consonant?(str[0]) && vowel?(str[1])

    # Reached only when one of the first two characters is neither in VOWEL
    # nor in CONSONANT.
    str.length - 1
  end

  # Scans +str+ from index +i+: skips consonants, one further character, then
  # vowels, and returns the index just after the character it stops on (or the
  # length of +str+ when the scan runs off the end). Used for R1 and R2.
  def r(str, i=0)
    i += 1 while str[i] && consonant?(str[i])
    i += 1
    i += 1 while str[i] && vowel?(str[i])
    str[i].nil? ? str.length : i + 1
  end

  # Returns [r1, r2]: the start of R1, and the start of R2 obtained by
  # repeating the same scan from r1.
  def r12(str)
    r1 = r(str)
    r2 = r(str, r1)
    [r1, r2]
  end

  # Step 0: removes an attached pronoun (me, se, sela, selo, selas, selos, la,
  # le, lo, las, les, los, nos) when, inside RV, it follows a gerund or
  # infinitive ending. Accented endings also lose their accent.
  #
  #=> new_str or nil
  def step0(str)
    return nil unless str =~ /(se(l[ao]s?)?|l([aeo]s?)|me|nos)$/i

    suffix = $&
    rv_text = str[rv(str)..-1]

    case rv_text
    when %r{((?<=i[éÉ]ndo|[áÁ]ndo|[áéíÁÉÍ]r)#{suffix})$}ui
      str[%r{#$&$}] = ''
      str = remove_accent(str)
      return str
    when %r{((?<=iendo|ando|[aei]r)#{suffix})$}i
      str[%r{#$&$}] = ''
      return str
    end

    # "yendo" + pronoun inside RV, preceded by "u" anywhere in the word.
    # Anchored so that only the trailing pronoun is ever removed.
    if rv_text =~ /yendo#{suffix}$/i && str =~ /uyendo#{suffix}$/i
      str[%r{#{suffix}$}] = ''
      return str
    end
    nil
  end

  # Step 1: standard suffix removal, decided on the R2 text (and on R1 for a
  # bare "amente"). Mutates and returns +str+ when a suffix was handled.
  #
  #=> new_str or nil
  def step1(str)
    r1, r2 = r12(str)
    r1_text = str[r1..-1]
    r2_text = str[r2..-1]

    case r2_text
    when /(anzas?|ic[oa]s?|ismos?|[ai]bles?|istas?|os[oa]s?|[ai]mientos?)$/i
      str[%r{#$&$}] = ''
      return str
    when /(ic)?(ador([ae]s?)?|aci[óÓ]n|aciones|antes?|ancias?)$/ui
      str[%r{#$&$}] = ''
      return str
    when /log[íÍ]as?$/ui
      # Anchored at the end: a match elsewhere in R2 would make the
      # replacement below raise IndexError.
      str[%r{#$&$}] = 'log'
      return str
    when /(uci([óÓ]n|ones))$/ui
      str[%r{#$&$}] = 'u'
      return str
    when /(encias?)$/i
      str[%r{#$&$}] = 'ente'
      return str
    end

    if r2_text =~ /(ativ|iv|os|ic|ad)amente$/i || r1_text =~ /amente$/i
      str[%r{#$&$}] = ''
      return str
    end

    case r2_text
    when /((ante|[ai]ble)?mente)$/i, /((abil|i[cv])?idad(es)?)$/i, /((at)?iv[ao]s?)$/i
      str[%r{#$&$}] = ''
      return str
    end
    nil
  end

  # Step 2a: removes a verb suffix beginning with "y" found at the end of RV,
  # but only when the character right before it is "u" (either case).
  #
  #=> nil or new_str
  def step2a(str)
    rv_pos = rv(str)
    idx = str[rv_pos..-1] =~ /(y[oóÓ]|ye(ron|ndo)|y[ae][ns]?|ya(is|mos))$/ui

    return nil unless idx

    if 'u' == str[rv_pos + idx - 1].downcase
      str[%r{#$&$}] = ''
      return str
    end
    nil
  end

  # Step 2b: removes any other verb suffix (STEP2B_REGEXP) found at the end of
  # RV. For en/es/éis/emos a "u" left behind after "g" is removed as well.
  #
  #=> nil or new_str
  def step2b(str)
    rv_pos = rv(str)

    if idx = str[rv_pos..-1] =~ STEP2B_REGEXP
      suffix = $&
      if suffix =~ /^(en|es|[éÉ]is|emos)$/ui
        str[%r{#{suffix}$}] = ''
        str[rv_pos + idx - 1] = '' if str[rv_pos + idx - 2] =~ /g/i && str[rv_pos + idx - 1] =~ /u/i
      else
        str[%r{#{suffix}$}] = ''
      end
      return str
    end
    nil
  end

  # Step 3: removes a residual final os/a/o/á/í/ó, or a final e/é (together
  # with a preceding "u" when that "u" follows "g"), when found in RV.
  #
  #=> nil or new_str
  def step3(str)
    rv_pos = rv(str)
    rv_text = str[rv_pos..-1]

    if rv_text =~ /(os|[aoáíóÁÍÓ])$/ui
      str[%r{#$&$}] = ''
      return str
    elsif idx = rv_text =~ /(u?[eéÉ])$/i
      if $&[0].downcase == 'u' && str[rv_pos + idx - 1].downcase == 'g'
        str[%r{#$&$}] = ''
      else
        # Only the final vowel goes; a "u" not preceded by "g" is kept.
        str.chop!
      end
      return str
    end
    nil
  end
end
|
193
|
+
|
194
|
+
# Mix the stemmer into String so that every string responds to #es_stem.
# `send` is used because Module#include is private on older Rubies.
String.send(:include, EStem)
|