estem 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (6) hide show
  1. data/Rakefile +18 -0
  2. data/bin/es_stem.rb +178 -0
  3. data/lib/estem.rb +196 -0
  4. data/test/diffs.txt +28390 -0
  5. data/test/test_estem.rb +23 -0
  6. metadata +54 -0
data/Rakefile ADDED
@@ -0,0 +1,18 @@
1
require 'rake/testtask'
require 'rdoc/task'

# `rake test` runs the suite; the 'test' directory is put on the load path.
Rake::TestTask.new { |t| t.libs << 'test' }

desc "Run tests"
task :default => :test

# `rake rdoc` generates the API documentation into ./rdoc, using
# README.rdoc as the landing page.
RDoc::Task.new do |rdoc|
  rdoc.rdoc_dir = 'rdoc'
  rdoc.options = ['--main', 'README.rdoc']
  rdoc.rdoc_files.include('README.rdoc', 'lib/**/*', 'bin/**/*')
end
data/bin/es_stem.rb ADDED
@@ -0,0 +1,178 @@
1
+ #!/usr/bin/env ruby
2
+ # encoding: UTF-8
3
+
4
+ # Copyright (c) 2012 Manuel A. Güílamo
5
+ #
6
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
7
+ # of this software and associated documentation files (the "Software"), to deal
8
+ # in the Software without restriction, including without limitation the rights
9
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ # copies of the Software, and to permit persons to whom the Software is
11
+ # furnished to do so, subject to the following conditions:
12
+ #
13
+ # The above copyright notice and this permission notice shall be included in
14
+ # all copies or substantial portions of the Software.
15
+ #
16
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22
+ # SOFTWARE.
23
+
24
require 'estem.rb'
require 'getoptlong'
# NOTE: 'iconv' is intentionally no longer required. It was removed from the
# standard library in Ruby 2.0; String#encode performs the same conversions.

# Version string printed by `--version`.
# NOTE(review): the package is published as 0.2.0 while this still reads
# 0.1.9 -- confirm which value is intended.
$version = "0.1.9"

# Any run of characters that are not Spanish letters separates two words.
WORD_SEPARATOR = /[^a-záéíóúüñÁÉÍÓÚÜÑ]+/ui

# Prints the help text stored after __END__.
#
# error:: when true the text goes to standard error instead of standard output.
def usage(error=false)
  out = error ? $stderr : $stdout
  out.puts DATA.read()
end

# Builds the bilingual hint shown when the input cannot be read as text.
#
# ienc:: the encoding given with `--in-enc`, or nil when none was given.
def encoding_hint(ienc)
  if ienc
    "incompatible encoding, please use option `--in-enc' correctly. //\n" \
    "codificación incompatible, por favor use la opción `--in-enc' correctamente."
  else
    "incompatible encoding, please use option `--in-enc'. //\n" \
    "codificación incompatible, por favor use la opción `--in-enc'."
  end
end

# Converts UTF-8 +text+ to the encoding requested with `--out-enc`;
# returns +text+ untouched when no output encoding was requested.
def transcode_output(text, oenc)
  oenc ? text.encode(oenc, 'UTF-8') : text
end

# Splits +line+ into words and records the stem of every word not seen yet
# in +stems+ (word => stem).
def collect_stems(line, stems)
  line.split(WORD_SEPARATOR).each do |word|
    # A line that starts with a separator yields a leading empty token.
    next if word.empty?
    stems[word] ||= word.es_stem
  end
end

opts = GetoptLong.new(
  ['--help', '-h', GetoptLong::NO_ARGUMENT],
  ['--version', '-v', GetoptLong::NO_ARGUMENT],
  ['--file', '-f', GetoptLong::REQUIRED_ARGUMENT],
  ['--in-enc', '-i', GetoptLong::REQUIRED_ARGUMENT],
  ['--out-enc', '-o', GetoptLong::REQUIRED_ARGUMENT])

# Errors are reported by the rescue clauses below, not by GetoptLong itself.
opts.quiet = true

filename = nil
ienc = nil
oenc = nil

begin
  opts.each do |op, arg|
    case op
    when '--help'
      usage
      exit
    when '--version'
      puts "EStem\nSpanish stemmer // lexemador\nVer: #{$version}"
      exit
    when '--file'
      filename = arg
    when '--in-enc'
      ienc = arg
    when '--out-enc'
      oenc = arg
    end
  end
rescue GetoptLong::MissingArgument
  # Kernel#abort writes to standard error and exits with a failure status.
  abort 'Option requires an argument // La opción requiere un argumento'
rescue GetoptLong::InvalidOption
  $stderr.puts 'Unknown option // Opción desconocida.'
  usage(true)
  exit 1
rescue => e
  abort e.message
end

stems = {}

if filename
  # Let the IO layer transcode to UTF-8 when another input encoding is given.
  mode = (ienc && ienc != 'UTF-8') ? "r:#{ienc}:UTF-8" : 'r:UTF-8'
  begin
    # The block form closes the file even when stemming raises.
    File.open(filename, mode) do |file|
      file.each_line { |line| collect_stems(line, stems) }
    end
  rescue => e
    abort e.message
  end
else
  $stdin.each_line do |line|
    begin
      if ienc
        line = line.encode('UTF-8', ienc)
      else
        # Just in case the terminal messes with the encoding name.
        line.force_encoding('UTF-8')
      end
      raise EncodingError, 'invalid byte sequence' unless line.valid_encoding?

      collect_stems(line, stems)
    rescue EncodingError
      msg = encoding_hint(ienc)
      begin
        msg = transcode_output(msg, oenc)
      rescue EncodingError
        # Keep the UTF-8 message if it cannot be converted to the output encoding.
      end
      abort msg
    rescue => e
      abort e.message
    end
  end
end

begin
  stems.each_pair do |word, stem|
    puts transcode_output("#{word} => #{stem}", oenc)
  end
rescue => e
  abort e.message
end
154
+
155
+ __END__
156
+ Use: es_stem [OPTION]...
157
+
158
+ Options:
159
+ --help, -h display this help and exit. // Presenta esta ayuda y termina.
160
+ --version, -v output version information and exit //
161
+ Muestra la versión y termina.
162
+ --file, -f file of words. // fichero de palabras.
163
+ --in-enc, -i encoding of the file. // codificación del fichero.
164
+ --out-enc, -o output encoding // codificación de salida.
165
+
166
+ By default UTF-8 is used as input encoding, and if no file is specified,
167
+ standard input will be used instead.
168
+
169
+ You should set the option `--out-enc' if you are experiencing problems
170
+ visualizing the output text.
171
+
172
+ //
173
+
174
+ Por defecto se usará UTF-8 como codificación de entrada, si no se especifica un
175
+ fichero, la entrada estándar se usará en su lugar.
176
+
177
+ Debería establecer la opción `--out-enc' si está experimentando problemas para
178
+ visualizar el texto de salida.
data/lib/estem.rb ADDED
@@ -0,0 +1,196 @@
1
+ # encoding: UTF-8
2
+ #
3
+ # Porter, Spanish stemmer in Ruby.
4
+ #
5
+ # :title: EStem - Ruby based Porter Spanish Stemmer
6
+
7
# Porter-style stemmer for Spanish words. Mixed into String it provides
# String#es_stem; every other method is a private helper.
module EStem
  # Letters treated as vowels (both cases, accented forms and ü included).
  VOWEL = 'aeiouáéíóúüAEIOUÁÉÍÓÚÜ'.freeze

  # Letters treated as consonants. The upper-case half deliberately omits
  # A, E, I, O and U so that capitalised vowels are not taken for consonants.
  CONSONANT = 'bcdfghjklmnñpqrstvwxyzBCDFGHJKLMNÑPQRSTVWXYZ'.freeze

  # Verb endings removed by step2b.
  STEP2B_REGEXP = /(
      ar([áÁ][ns]?|a(n|s|is)?|on)? | ar([éÉ]is|emos|é|É) | ar[íÍ]a(n|s|is|mos)? |
      er([áÁ][sn]?|[éÉ](is)?|emos|[íÍ]a(n|s|is|mos)?)? |
      ir([íÍ]a(s|n|is|mos)?|[áÁ][ns]?|emos|[éÉ]|éis)? | aba(s|n|is)? |
      ad([ao]s?)? | ed | id(a|as|o|os)? | [íÍ]a(n|s|is|mos)? | [íÍ]s |
      as(e[ns]?|te|eis|teis)? | [áÁ](is|bamos|semos|ramos) | a(n|ndo|mos) |
      ie(ra|se|ran|sen|ron|ndo|ras|ses|rais|seis) | i(ste|steis|[óÓ]|mos|[éÉ]ramos|[éÉ]semos) |
      en|es|[éÉ]is|emos
    )$/xiu

  # Returns the stem of the receiver as a new String; the receiver itself is
  # never modified. Acute accents are removed from the result. Strings of
  # length zero or one are returned with only their accent removed.
  def es_stem
    str = self.dup
    return remove_accent(str) if str.length <= 1

    str = step0(str) || str
    # step2a is tried only when step1 removed nothing, step2b only when
    # step2a removed nothing either.
    changed = step1(str) || step2a(str) || step2b(str)
    str = changed if changed
    str = step3(str) || str
    remove_accent(str)
  end

  private

  # True when +c+ is a one-letter String listed in VOWEL; false for nil.
  def vowel?(c)
    !c.nil? && VOWEL.include?(c)
  end

  # True when +c+ is a one-letter String listed in CONSONANT; false for nil.
  def consonant?(c)
    !c.nil? && CONSONANT.include?(c)
  end

  # Returns a copy of +str+ with acute accents stripped from its vowels.
  def remove_accent(str)
    str.tr('áéíóúÁÉÍÓÚ','aeiouAEIOU')
  end

  # Replaces the trailing +suffix+ of +str+ in place with +replacement+
  # and returns +str+.
  def replace_suffix(str, suffix, replacement = '')
    str[/#{Regexp.escape(suffix)}$/] = replacement
    str
  end

  # Index at which the RV region of +str+ starts. The result never exceeds
  # str.length, so str[rv(str)..-1] is always a String (possibly empty).
  def rv(str)
    if consonant?(str[1])
      # Second letter is a consonant: RV starts after the next non-consonant.
      i = 2
      i += 1 while consonant?(str[i])
      return [i + 1, str.length].min
    end

    if vowel?(str[0]) && vowel?(str[1])
      # First two letters are vowels: RV starts after the next non-vowel.
      i = 2
      i += 1 while vowel?(str[i])
      return [i + 1, str.length].min
    end

    return [3, str.length].min if consonant?(str[0]) && vowel?(str[1])

    str.length - 1
  end

  # Scanning from index +i+: skips consonants, one further character and any
  # vowels that follow, and returns the index just past the character reached
  # (str.length when the scan runs off the end).
  def r(str, i=0)
    i += 1 while consonant?(str[i])
    i += 1
    i += 1 while vowel?(str[i])
    str[i].nil? ? str.length : i + 1
  end

  # Start indexes of the R1 and R2 regions, as [r1, r2].
  def r12(str)
    r1 = r(str)
    r2 = r(str, r1)
    [r1, r2]
  end

  # Step 0: strips an attached pronoun (me, se, sela, lo, les, nos, ...) when
  # it follows a gerund or infinitive ending inside RV.
  #=> new_str or nil
  def step0(str)
    return nil unless str =~ /(se(l[ao]s?)?|l([aeo]s?)|me|nos)$/i

    suffix = $&
    rv_text = str[rv(str)..-1]

    case rv_text
    when /(?:i[éÉ]ndo|[áÁ]ndo|[áéíÁÉÍ]r)#{suffix}$/ui
      # Accented ending: drop the pronoun and the accent as well.
      replace_suffix(str, suffix)
      return remove_accent(str)
    when /(?:iendo|ando|[aei]r)#{suffix}$/i
      return replace_suffix(str, suffix)
    end

    # "yendo" counts only when it directly precedes the pronoun and follows
    # a "u"; the trailing pronoun (not its first occurrence) is removed.
    if rv_text =~ /yendo#{suffix}$/i && str =~ /uyendo#{suffix}$/i
      return replace_suffix(str, suffix)
    end
    nil
  end

  # Step 1: standard suffix removal, driven by the R1 and R2 regions.
  #=> new_str or nil
  def step1(str)
    r1, r2 = r12(str)
    r1_text = str[r1..-1]
    r2_text = str[r2..-1]

    case r2_text
    when /(anzas?|ic[oa]s?|ismos?|[ai]bles?|istas?|os[oa]s?|[ai]mientos?)$/i,
         /(ic)?(ador([ae]s?)?|aci[óÓ]n|aciones|antes?|ancias?)$/ui
      return replace_suffix(str, $&)
    when /log[íÍ]as?$/ui
      # Anchored at the end so a match is always replaceable.
      return replace_suffix(str, $&, 'log')
    when /(uci([óÓ]n|ones))$/ui
      return replace_suffix(str, $&, 'u')
    when /(encias?)$/i
      return replace_suffix(str, $&, 'ente')
    end

    # $& holds the text matched by whichever of the two tests succeeded.
    if r2_text =~ /(ativ|iv|os|ic|ad)amente$/i || r1_text =~ /amente$/i
      return replace_suffix(str, $&)
    end

    case r2_text
    when /((ante|[ai]ble)?mente)$/i, /((abil|i[cv])?idad(es)?)$/i, /((at)?iv[ao]s?)$/i
      return replace_suffix(str, $&)
    end
    nil
  end

  # Step 2a: removes a verb ending that begins with "y" when it is found in
  # RV and preceded by "u".
  #=> nil or new_str
  def step2a(str)
    rv_pos = rv(str)
    idx = str[rv_pos..-1] =~ /(y[oóÓ]|ye(ron|ndo)|y[ae][ns]?|ya(is|mos))$/ui
    return nil unless idx

    suffix = $&
    return nil unless str[rv_pos + idx - 1].downcase == 'u'

    replace_suffix(str, suffix)
  end

  # Step 2b: removes any other verb ending found in RV. After removing
  # en/es/éis/emos a directly preceding "gu" loses its "u".
  #=> nil or new_str
  def step2b(str)
    rv_pos = rv(str)
    idx = str[rv_pos..-1] =~ STEP2B_REGEXP
    return nil unless idx

    suffix = $&
    replace_suffix(str, suffix)
    if suffix =~ /^(en|es|[éÉ]is|emos)$/ui
      cut = rv_pos + idx # index at which the removed ending started
      str[cut - 1] = '' if str[cut - 2] =~ /g/i and str[cut - 1] =~ /u/i
    end
    str
  end

  # Step 3: removes a residual final vowel (or "os") found in RV. A final
  # "ue" preceded by "g" is removed whole, otherwise only the "e" is dropped.
  #=> nil or new_str
  def step3(str)
    rv_pos = rv(str)
    rv_text = str[rv_pos..-1]

    return replace_suffix(str, $&) if rv_text =~ /(os|[aoáíóÁÍÓ])$/ui

    idx = rv_text =~ /(u?[eéÉ])$/i
    return nil unless idx

    suffix = $&
    if suffix[0].downcase == 'u' && str[rv_pos + idx - 1].downcase == 'g'
      replace_suffix(str, suffix)
    else
      str.chop!
    end
    str
  end
end
193
+
194
# Give every String the EStem methods, most notably String#es_stem.
String.send(:include, EStem)