estem 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (6):
  1. data/Rakefile +18 -0
  2. data/bin/es_stem.rb +178 -0
  3. data/lib/estem.rb +196 -0
  4. data/test/diffs.txt +28390 -0
  5. data/test/test_estem.rb +23 -0
  6. metadata +54 -0
data/Rakefile ADDED
@@ -0,0 +1,18 @@
1
# Rake build file: registers a test task, makes it the default task, and
# registers an RDoc documentation task.
require 'rake/testtask'
require 'rdoc/task'

# Test task; 'test' is appended to the task's load-path list.
Rake::TestTask.new { |test_task| test_task.libs << 'test' }

desc "Run tests"
task default: :test

# Generate documentation.
RDoc::Task.new do |doc|
  doc.rdoc_dir = 'rdoc'
  doc.options = ['--main', 'README.rdoc']
  doc.rdoc_files.include('README.rdoc', 'lib/**/*', 'bin/**/*')
end
data/bin/es_stem.rb ADDED
@@ -0,0 +1,178 @@
1
#!/usr/bin/env ruby
# encoding: UTF-8

# Copyright (c) 2012 Manuel A. Güílamo
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

# Command-line front end for the stemmer: reads words from a file (--file) or
# from standard input, stems each distinct word with String#es_stem and prints
# one "word => stem" line per word.
#
# Transcoding is done with String#encode; the Iconv library this script used
# to require is no longer shipped with Ruby (removed in 2.0).

require 'estem.rb'
require 'getoptlong'

# Version reported by --version.
# NOTE(review): set to the package version shown for this release; the script
# previously reported 0.1.9 - confirm against the gem specification.
ES_STEM_VERSION = '0.2.0'

# A maximal run of Spanish letters. Scanning for this (instead of splitting on
# its complement) never yields empty tokens.
WORD_PATTERN = /[a-záéíóúüñÁÉÍÓÚÜÑ]+/ui

# Prints the help text stored after __END__.
#
# error:: when true the text goes to standard error, otherwise to standard
#         output.
def usage(error=false)
  out = error ? $stderr : $stdout
  out.puts DATA.read()
end

# Writes +message+ to standard error and terminates with exit status 1.
def die(message)
  $stderr.puts message
  exit 1
end

# Returns +text+ (UTF-8) transcoded to +oenc+, or +text+ itself when no output
# encoding was requested.
def to_output(text, oenc)
  oenc ? text.encode(oenc, 'UTF-8') : text
end

# Returns +line+ as a valid UTF-8 string: transcoded from +ienc+ when given,
# otherwise re-tagged as UTF-8. Raises EncodingError when the result is not
# valid UTF-8.
def decode_line(line, ienc)
  text = ienc ? line.encode('UTF-8', ienc) : line.force_encoding('UTF-8')
  raise EncodingError, 'invalid byte sequence' unless text.valid_encoding?
  text
end

# Stems every word of +line+ that is not yet a key of +stems+ and stores the
# result under that word.
def stem_words(line, stems)
  line.scan(WORD_PATTERN) { |word| stems[word] ||= word.es_stem }
end

# Bilingual message shown when the input cannot be read in the expected
# encoding; the wording depends on whether --in-enc was supplied.
def encoding_error_message(ienc_given)
  if ienc_given
    "incompatible encoding, please use option `--in-enc' correctly. //\n" \
    "codificación incompatible, por favor use la opción `--in-enc' correctamente."
  else
    "incompatible encoding, please use option `--in-enc'. //\n" \
    "codificación incompatible, por favor use la opción `--in-enc'."
  end
end

opts = GetoptLong.new(
  ['--help', '-h', GetoptLong::NO_ARGUMENT],
  ['--version', '-v', GetoptLong::NO_ARGUMENT],
  ['--file', '-f', GetoptLong::REQUIRED_ARGUMENT],
  ['--in-enc', '-i', GetoptLong::REQUIRED_ARGUMENT],
  ['--out-enc', '-o', GetoptLong::REQUIRED_ARGUMENT])

# Suppress GetoptLong's own error output; the rescue clauses below report.
opts.quiet = true

filename = nil
ienc = nil
oenc = nil

begin
  opts.each do |op, arg|
    case op
    when '--help'
      usage()
      exit
    when '--version'
      puts "EStem\nSpanish stemmer // lexemador\nVer: #{ES_STEM_VERSION}"
      exit
    when '--file'
      filename = arg
    when '--in-enc'
      ienc = arg
    when '--out-enc'
      oenc = arg
    end
  end
rescue GetoptLong::MissingArgument
  die('Option requires an argument // La opción requiere un argumento')
rescue GetoptLong::InvalidOption
  $stderr.puts 'Unknown option // Opción desconocida.'
  usage(true)
  exit 1
rescue StandardError => e
  die(e.message)
end

stems = {}

if filename
  # Let the IO layer transcode to UTF-8 when a different input encoding is given.
  mode = (ienc && ienc != 'UTF-8') ? "r:#{ienc}:UTF-8" : 'r:UTF-8'
  begin
    # Block form closes the file even when stemming raises.
    File.open(filename, mode) do |file|
      file.each_line { |line| stem_words(line, stems) }
    end
  rescue StandardError => e
    die(e.message)
  end
else
  $stdin.each_line do |line|
    begin
      # Without --in-enc the bytes are taken as UTF-8, just in case the
      # terminal reports a different encoding name.
      stem_words(decode_line(line, ienc), stems)
    rescue EncodingError
      msg = encoding_error_message(!ienc.nil?)
      # Best effort: show the message in the requested output encoding, but
      # never let that conversion hide the original problem.
      msg = (to_output(msg, oenc) rescue msg)
      die(msg)
    rescue StandardError => e
      die(e.message)
    end
  end
end

begin
  stems.each_pair { |word, stem| puts to_output("#{word} => #{stem}", oenc) }
rescue StandardError => e
  die(e.message)
end

__END__
Use: es_stem [OPTION]...

Options:
--help, -h display this help and exit. // Presenta esta ayuda y termina.
--version, -v output version information and exit //
Muestra la versión y termina.
--file, -f file of words. // fichero de palabras.
--in-enc, -i encoding of the file. // codificación del fichero.
--out-enc, -o output encoding // codificación de salida.

By default UTF-8 is used as input encoding, and if no file is specified,
standard input will be used instead.

You should set the option `--out-enc' if you are experimenting problems
visualizing the output text.

//

Por defecto se usará UTF-8 como codificación de entrada, si no se especifica un
fichero, la entrada estándard se usará en su lugar.

Debería establecer la opción `--out-enc' si está experimentando problemas para
visualizar el texto de salida.
data/lib/estem.rb ADDED
@@ -0,0 +1,196 @@
1
# encoding: UTF-8
#
# Porter, Spanish stemmer in Ruby.
#
# :title: EStem - Ruby based Porter Spanish Stemmer

# Mixin that adds #es_stem, a Porter-style stemmer for Spanish words, to
# string-like objects. All helper methods are private; none of them modifies
# the string it is given.
module EStem
  # Letters treated as vowels (both cases, accented forms and ü).
  VOWEL = 'aeiouáéíóúüAEIOUÁÉÍÓÚÜ'.freeze

  # Letters treated as consonants (both cases). Vowels must not appear here:
  # region detection relies on the two sets being disjoint.
  CONSONANT = 'bcdfghjklmnñpqrstvwxyzBCDFGHJKLMNÑPQRSTVWXYZ'.freeze

  # Verb endings removed by step 2b when they end the RV region.
  STEP2B_REGEXP = /(
    ar([áÁ][ns]?|a(n|s|is)?|on)? | ar([éÉ]is|emos|é|É) | ar[íÍ]a(n|s|is|mos)? |
    er([áÁ][sn]?|[éÉ](is)?|emos|[íÍ]a(n|s|is|mos)?)? |
    ir([íÍ]a(s|n|is|mos)?|[áÁ][ns]?|emos|[éÉ]|éis)? | aba(s|n|is)? |
    ad([ao]s?)? | ed | id(a|as|o|os)? | [íÍ]a(n|s|is|mos)? | [íÍ]s |
    as(e[ns]?|te|eis|teis)? | [áÁ](is|bamos|semos|ramos) | a(n|ndo|mos) |
    ie(ra|se|ran|sen|ron|ndo|ras|ses|rais|seis) | i(ste|steis|[óÓ]|mos|[éÉ]ramos|[éÉ]semos) |
    en|es|[éÉ]is|emos
  )$/xiu

  # Returns the stem of the receiver as a new string; the receiver is left
  # untouched. Acute accents are removed from the result.
  #
  # Words of zero or one character are returned with accents removed and no
  # further processing.
  def es_stem
    str = dup
    return remove_accent(str) if str.length <= 1

    # Step 0: attached pronouns.
    str = step0(str) || str
    # Step 1, and only when it removes nothing, step 2a, then step 2b.
    str = step1(str) || step2a(str) || step2b(str) || str
    # Step 3: residual suffix.
    str = step3(str) || str
    remove_accent(str)
  end

  private

  # True when +c+ is a one-character string listed in VOWEL; false for nil.
  def vowel?(c)
    !c.nil? && VOWEL.include?(c)
  end

  # True when +c+ is a one-character string listed in CONSONANT; false for nil.
  def consonant?(c)
    !c.nil? && CONSONANT.include?(c)
  end

  # Returns a copy of +str+ with acute-accented vowels replaced by plain ones.
  def remove_accent(str)
    str.tr('áéíóúÁÉÍÓÚ','aeiouAEIOU')
  end

  # Returns the index in +str+ at which the RV region starts.
  #
  # * second letter is a consonant: RV starts after the next non-consonant;
  # * first two letters are vowels: RV starts after the next non-vowel;
  # * consonant followed by vowel: RV starts after the third letter.
  #
  # When that position lies beyond the word the word length is returned, so
  # the region is empty. Expects +str+ to have at least two characters.
  def rv(str)
    if consonant?(str[1])
      i = 2
      i += 1 while str[i] && consonant?(str[i])
      return str[i].nil? ? str.length : i + 1
    end

    if vowel?(str[0]) && vowel?(str[1])
      i = 2
      i += 1 while str[i] && vowel?(str[i])
      return str[i].nil? ? str.length : i + 1
    end

    return [3, str.length].min if consonant?(str[0]) && vowel?(str[1])

    # NOTE(review): reached only when one of the first two characters is
    # neither in VOWEL nor in CONSONANT; the original fallback is kept as is.
    str.length - 1
  end

  # Returns the index at which an R region starts when scanning from +i+:
  # leading consonants are skipped, then one character plus any following
  # vowels, and the region begins after the character found there. Returns
  # the word length when the scan runs off the end.
  def r(str, i=0)
    i += 1 while str[i] && consonant?(str[i])
    i += 1
    i += 1 while str[i] && vowel?(str[i])
    str[i].nil? ? str.length : i + 1
  end

  # Returns [r1, r2]: the start of R1, and the start of R2 obtained by
  # applying the same scan from r1.
  def r12(str)
    r1 = r(str)
    r2 = r(str, r1)
    [r1, r2]
  end

  # Step 0 - attached pronoun.
  #
  # Removes a trailing pronoun (se, sela, selo, selas, selos, la, le, lo, las,
  # les, los, me, nos) when, inside RV, it follows a gerund or infinitive
  # ending. For the accented endings the accent is removed from the result.
  # A pronoun after "yendo" is removed only when the word has "uyendo" right
  # before it.
  #
  # Returns the new string, or nil when nothing was removed.
  def step0(str)
    return nil unless str =~ /(se(l[ao]s?)?|l([aeo]s?)|me|nos)$/i

    suffix = Regexp.last_match(0)
    rv_text = str[rv(str)..-1]
    without_pronoun = str.sub(/#{Regexp.escape(suffix)}$/, '')

    case rv_text
    when %r{((?<=i[éÉ]ndo|[áÁ]ndo|[áéíÁÉÍ]r)#{suffix})$}ui
      return remove_accent(without_pronoun)
    when %r{((?<=iendo|ando|[aei]r)#{suffix})$}i
      return without_pronoun
    end

    # Anchored to the end so that only a pronoun directly after "uyendo"
    # qualifies, and only that trailing pronoun is removed.
    yendo = /yendo#{Regexp.escape(suffix)}$/i
    uyendo = /uyendo#{Regexp.escape(suffix)}$/i
    return without_pronoun if rv_text =~ yendo && str =~ uyendo

    nil
  end

  # Step 1 - standard suffix removal, decided on the R1/R2 regions.
  #
  # Returns the new string, or nil when no rule applied.
  def step1(str)
    r1, r2 = r12(str)
    r1_text = str[r1..-1]
    r2_text = str[r2..-1]

    case r2_text
    when /(anzas?|ic[oa]s?|ismos?|[ai]bles?|istas?|os[oa]s?|[ai]mientos?)$/i,
         /(ic)?(ador([ae]s?)?|aci[óÓ]n|aciones|antes?|ancias?)$/ui
      # Plain deletion.
      return str.sub(/#{Regexp.escape(Regexp.last_match(0))}$/, '')
    when /log[íÍ]as?$/ui
      # Must end the word: an unanchored match in the middle of R2 cannot be
      # replaced at the end of the string.
      return str.sub(/#{Regexp.escape(Regexp.last_match(0))}$/, 'log')
    when /(uci([óÓ]n|ones))$/ui
      return str.sub(/#{Regexp.escape(Regexp.last_match(0))}$/, 'u')
    when /(encias?)$/i
      return str.sub(/#{Regexp.escape(Regexp.last_match(0))}$/, 'ente')
    end

    # "-amente": with one of the listed prefixes in R2, or bare in R1.
    if r2_text =~ /(ativ|iv|os|ic|ad)amente$/i || r1_text =~ /amente$/i
      return str.sub(/#{Regexp.escape(Regexp.last_match(0))}$/, '')
    end

    case r2_text
    when /((ante|[ai]ble)?mente)$/i, /((abil|i[cv])?idad(es)?)$/i, /((at)?iv[ao]s?)$/i
      return str.sub(/#{Regexp.escape(Regexp.last_match(0))}$/, '')
    end
    nil
  end

  # Step 2a - verb suffixes beginning with "y".
  #
  # Removes the suffix when it ends RV and the character right before it in
  # the word is "u" (either case). Returns the new string or nil.
  def step2a(str)
    rv_pos = rv(str)
    idx = str[rv_pos..-1] =~ /(y[oóÓ]|ye(ron|ndo)|y[ae][ns]?|ya(is|mos))$/ui
    return nil unless idx

    suffix = Regexp.last_match(0)
    return nil unless 'u' == str[rv_pos + idx - 1].downcase

    str.sub(/#{Regexp.escape(suffix)}$/, '')
  end

  # Step 2b - other verb suffixes.
  #
  # Removes the ending matched by STEP2B_REGEXP at the end of RV. After
  # en/es/éis/emos, a "u" left directly after "g" is removed as well.
  # Returns the new string or nil.
  def step2b(str)
    rv_pos = rv(str)
    return nil unless str[rv_pos..-1] =~ STEP2B_REGEXP

    suffix = Regexp.last_match(0)
    stem = str.sub(/#{Regexp.escape(suffix)}$/, '')
    stem = stem.sub(/(g)u$/i, '\1') if suffix =~ /^(en|es|[éÉ]is|emos)$/ui
    stem
  end

  # Step 3 - residual suffix.
  #
  # Removes a final os/a/o/á/í/ó found in RV. Otherwise, for a final e/é in
  # RV: drops "ue"/"ué" when the match includes the "u" and a "g" precedes it,
  # else drops only the last character. Returns the new string or nil.
  def step3(str)
    rv_pos = rv(str)
    rv_text = str[rv_pos..-1]

    if rv_text =~ /(os|[aoáíóÁÍÓ])$/ui
      return str.sub(/#{Regexp.escape(Regexp.last_match(0))}$/, '')
    end

    idx = rv_text =~ /(u?[eéÉ])$/i
    return nil unless idx

    ending = Regexp.last_match(0)
    if ending[0].downcase == 'u' && str[rv_pos + idx - 1].downcase == 'g'
      str.sub(/#{Regexp.escape(ending)}$/, '')
    else
      str.chop
    end
  end
end

# Makes #es_stem available on every String.
class String
  include EStem
end