estem 0.2.4 → 0.2.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/ChangeLog +37 -10
- data/README.rdoc +7 -38
- data/examples/usage.rb +0 -2
- data/examples/usage.rb~ +11 -0
- data/lib/estem.rb +59 -60
- data/lib/estem.rb~ +271 -0
- data/test/diffs_ISO88591.txt +28390 -0
- data/test/{diffs.txt → diffs_UTF8.txt} +0 -0
- data/test/test_estem.rb +15 -10
- data/test/test_estem.rb~ +27 -0
- metadata +10 -5
- data/bin/es_stem.rb +0 -179
File without changes
|
data/test/test_estem.rb
CHANGED
@@ -1,23 +1,28 @@
|
|
1
|
-
# encoding: UTF-8
|
2
|
-
|
3
1
|
require 'test/unit'
|
4
2
|
require 'estem'
|
5
3
|
|
4
|
+
# NOTE:
|
5
|
+
# assuming we will run the test from the root directory of the project
|
6
|
+
# using "rake test" from the command-line
|
7
|
+
|
6
8
|
class EStemTest < Test::Unit::TestCase
|
7
|
-
def get_content(filename)
|
9
|
+
def get_content(filename, encoding='UTF-8')
|
8
10
|
content = nil
|
9
|
-
File.open(filename,
|
11
|
+
File.open(filename, "r:#{encoding}") do |f|
|
10
12
|
content = f.read()
|
11
13
|
end
|
12
14
|
content.scan(/(\S+)(?:\s+)(\S+)/)
|
13
15
|
end
|
14
16
|
|
15
|
-
def
|
16
|
-
|
17
|
-
# using "rake test" from the command-line
|
18
|
-
for word, good in get_content('test/diffs.txt')
|
17
|
+
def test_estem
|
18
|
+
for word, good in get_content('test/diffs_UTF8.txt')
|
19
19
|
assert_equal(good, word.es_stem, "input: " + word)
|
20
|
-
end
|
20
|
+
end
|
21
|
+
|
22
|
+
for word, good in get_content('test/diffs_ISO88591.txt', 'ISO-8859-1')
|
23
|
+
ret = word.safe_es_stem
|
24
|
+
assert_equal(good, ret, "input: " + word)
|
25
|
+
assert_equal('ISO-8859-1', ret.encoding.name.upcase)
|
26
|
+
end
|
21
27
|
end
|
22
|
-
|
23
28
|
end
|
data/test/test_estem.rb~
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
require 'test/unit'
|
2
|
+
require 'estem'
|
3
|
+
|
4
|
+
# NOTE:
|
5
|
+
# assuming we will run the test from the root directory of the project
|
6
|
+
# using "rake test" from the command-line
|
7
|
+
|
8
|
+
class EStemTest < Test::Unit::TestCase
|
9
|
+
def get_content(filename, encoding='UTF-8')
|
10
|
+
content = nil
|
11
|
+
File.open(filename, "r:#{encoding}") do |f|
|
12
|
+
content = f.read()
|
13
|
+
end
|
14
|
+
content.scan(/(\S+)(?:\s+)(\S+)/)
|
15
|
+
end
|
16
|
+
|
17
|
+
def test_estem
|
18
|
+
for word, good in get_content('test/diffs_UTF8.txt')
|
19
|
+
assert_equal(good, word.es_stem, "input: " + word)
|
20
|
+
end
|
21
|
+
|
22
|
+
for word, good in get_content('test/diffs_ISO88591.txt', 'ISO-8859-1')
|
23
|
+
assert_equal(good, word.safe_es_stem, "input: " + word)
|
24
|
+
assert_equal(2,3)
|
25
|
+
end
|
26
|
+
end
|
27
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: estem
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.4
|
4
|
+
version: 0.2.5
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-
|
12
|
+
date: 2012-08-02 00:00:00.000000000 Z
|
13
13
|
dependencies: []
|
14
14
|
description: Spanish stemming. Based on Martin Porter's specifications. See README
|
15
15
|
file for more information.
|
@@ -19,14 +19,17 @@ extensions: []
|
|
19
19
|
extra_rdoc_files: []
|
20
20
|
files:
|
21
21
|
- Rakefile
|
22
|
-
- bin/es_stem.rb
|
23
22
|
- lib/estem.rb
|
23
|
+
- lib/estem.rb~
|
24
24
|
- examples/usage.rb
|
25
|
+
- examples/usage.rb~
|
25
26
|
- COPYRIGHT
|
26
27
|
- README.rdoc
|
27
28
|
- ChangeLog
|
28
|
-
- test/diffs.txt
|
29
|
+
- test/diffs_ISO88591.txt
|
30
|
+
- test/diffs_UTF8.txt
|
29
31
|
- test/test_estem.rb
|
32
|
+
- test/test_estem.rb~
|
30
33
|
homepage: https://github.com/MaG21/estem
|
31
34
|
licenses: []
|
32
35
|
post_install_message:
|
@@ -52,5 +55,7 @@ signing_key:
|
|
52
55
|
specification_version: 3
|
53
56
|
summary: Spanish stemming. Based on Martin Porter's specifications.
|
54
57
|
test_files:
|
55
|
-
- test/diffs.txt
|
58
|
+
- test/diffs_ISO88591.txt
|
59
|
+
- test/diffs_UTF8.txt
|
56
60
|
- test/test_estem.rb
|
61
|
+
- test/test_estem.rb~
|
data/bin/es_stem.rb
DELETED
@@ -1,179 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
# encoding: UTF-8
|
3
|
-
# :stopdoc:
|
4
|
-
|
5
|
-
# Copyright (c) 2012 Manuel A. Güílamo
|
6
|
-
#
|
7
|
-
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
8
|
-
# of this software and associated documentation files (the "Software"), to deal
|
9
|
-
# in the Software without restriction, including without limitation the rights
|
10
|
-
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
11
|
-
# copies of the Software, and to permit persons to whom the Software is
|
12
|
-
# furnished to do so, subject to the following conditions:
|
13
|
-
#
|
14
|
-
# The above copyright notice and this permission notice shall be included in
|
15
|
-
# all copies or substantial portions of the Software.
|
16
|
-
#
|
17
|
-
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
18
|
-
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
19
|
-
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
20
|
-
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
21
|
-
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
22
|
-
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
23
|
-
# SOFTWARE.
|
24
|
-
|
25
|
-
require 'estem'
|
26
|
-
require 'getoptlong'
|
27
|
-
require 'iconv'
|
28
|
-
|
29
|
-
$version = "0.1.10"
|
30
|
-
|
31
|
-
def usage(error=false)
|
32
|
-
out = error ? $stderr : $stdout
|
33
|
-
out.puts DATA.read()
|
34
|
-
end
|
35
|
-
|
36
|
-
opts = GetoptLong.new(
|
37
|
-
['--help', '-h', GetoptLong::NO_ARGUMENT],
|
38
|
-
['--version', '-v', GetoptLong::NO_ARGUMENT],
|
39
|
-
['--file', '-f', GetoptLong::REQUIRED_ARGUMENT],
|
40
|
-
['--in-enc', '-i', GetoptLong::REQUIRED_ARGUMENT],
|
41
|
-
['--out-enc', '-o', GetoptLong::REQUIRED_ARGUMENT])
|
42
|
-
|
43
|
-
opts.quiet = true
|
44
|
-
|
45
|
-
filename = nil
|
46
|
-
ienc = nil
|
47
|
-
oenc = nil
|
48
|
-
|
49
|
-
begin
|
50
|
-
opts.each do |op, arg|
|
51
|
-
case op
|
52
|
-
when '--help'
|
53
|
-
usage()
|
54
|
-
exit
|
55
|
-
when '--version'
|
56
|
-
puts "EStem\nSpanish stemmer // lexemador\nVer: #{$version}"
|
57
|
-
exit
|
58
|
-
when '--file'
|
59
|
-
filename = arg
|
60
|
-
when '--in-enc'
|
61
|
-
ienc = arg
|
62
|
-
when '--out-enc'
|
63
|
-
oenc = arg
|
64
|
-
end
|
65
|
-
end
|
66
|
-
rescue GetoptLong::MissingArgument
|
67
|
-
$stderr.puts 'Option requires an argument // La opción requiere un argumento'
|
68
|
-
exit
|
69
|
-
rescue GetoptLong::InvalidOption
|
70
|
-
$stderr.puts 'Unknown option // Opción desconocida.'
|
71
|
-
usage(true)
|
72
|
-
exit
|
73
|
-
rescue
|
74
|
-
puts $!
|
75
|
-
exit
|
76
|
-
end
|
77
|
-
|
78
|
-
if filename
|
79
|
-
begin
|
80
|
-
if ienc and ienc.upcase !='UTF-8'
|
81
|
-
file = File.open(filename, "r:#{ienc}:UTF-8")
|
82
|
-
else
|
83
|
-
file = File.open(filename, 'r:UTF-8')
|
84
|
-
end
|
85
|
-
rescue
|
86
|
-
$stderr.puts $!
|
87
|
-
exit
|
88
|
-
end
|
89
|
-
|
90
|
-
begin
|
91
|
-
hsh = {}
|
92
|
-
file.each_line do|line|
|
93
|
-
line.split(/[^a-záéíóúüñÁÉÍÓÚÜÑ]+/ui).each do|word|
|
94
|
-
hsh[word] = word.es_stem unless hsh[word]
|
95
|
-
end
|
96
|
-
end
|
97
|
-
rescue
|
98
|
-
puts $!
|
99
|
-
exit
|
100
|
-
ensure
|
101
|
-
file.close
|
102
|
-
end
|
103
|
-
else
|
104
|
-
hsh = {}
|
105
|
-
$stdin.each_line do|line|
|
106
|
-
if ienc
|
107
|
-
line = Iconv.conv('UTF-8', ienc, line)
|
108
|
-
else
|
109
|
-
# Just in case the terminal mess with the encoding name.
|
110
|
-
# Por si la terminal juega con el nombre de la codificación.
|
111
|
-
line.force_encoding('UTF-8')
|
112
|
-
end
|
113
|
-
|
114
|
-
begin
|
115
|
-
line.split(/[^a-záéíóúüñÁÉÍÓÚÜÑ]+/ui).each do|word|
|
116
|
-
hsh[word] = word.es_stem unless hsh[word]
|
117
|
-
end
|
118
|
-
rescue Encoding::CompatibilityError
|
119
|
-
if ienc
|
120
|
-
msg = "incompatible encoding, please use option " +
|
121
|
-
"`--in-inc' correctly. //\n" +
|
122
|
-
"codificación incompatible, por favor use la " +
|
123
|
-
"opción `--in-inc' correctamente."
|
124
|
-
else
|
125
|
-
msg="incompatible encoding, please use option `--in-inc'."+
|
126
|
-
" //\ncodificación incompatible, por favor use la " +
|
127
|
-
"opción `--in-inc'."
|
128
|
-
end
|
129
|
-
|
130
|
-
if oenc
|
131
|
-
msg = Iconv.conv(oenc, 'UTF-8', msg)
|
132
|
-
end
|
133
|
-
|
134
|
-
$stderr.puts msg
|
135
|
-
exit
|
136
|
-
rescue
|
137
|
-
puts $!
|
138
|
-
exit
|
139
|
-
end
|
140
|
-
end
|
141
|
-
end
|
142
|
-
|
143
|
-
if oenc
|
144
|
-
begin
|
145
|
-
hsh.each_pair do |k,v|
|
146
|
-
puts Iconv.conv(oenc, 'UTF-8', "#{k} => #{v}")
|
147
|
-
end
|
148
|
-
rescue
|
149
|
-
puts $!
|
150
|
-
exit
|
151
|
-
end
|
152
|
-
else
|
153
|
-
hsh.each_pair{ |k,v| puts "#{k} => #{v}" }
|
154
|
-
end
|
155
|
-
|
156
|
-
__END__
|
157
|
-
Use: es_stem [OPTION]...
|
158
|
-
|
159
|
-
Options:
|
160
|
-
--help, -h display this help and exit. // Presenta esta ayuda y termina.
|
161
|
-
--version, -v output version information and exit //
|
162
|
-
Muestra la versión y termina.
|
163
|
-
--file, -f file of words. // fichero de palabras.
|
164
|
-
--in-enc, -i encoding of the file. // codificación del fichero.
|
165
|
-
--out-enc, -o output encoding // codificación de salida.
|
166
|
-
|
167
|
-
By default UTF-8 is used as input encoding, and if no file is specified,
|
168
|
-
standard input will be used instead.
|
169
|
-
|
170
|
-
You should set the option `--out-enc' if you are experimenting problems
|
171
|
-
visualizing the output text.
|
172
|
-
|
173
|
-
//
|
174
|
-
|
175
|
-
Por defecto se usará UTF-8 como codificación de entrada, si no se especifica un
|
176
|
-
fichero, la entrada estándard se usará en su lugar.
|
177
|
-
|
178
|
-
Debería establecer la opción `--out-enc' si está experimentando problemas para
|
179
|
-
visualizar el texto de salida.
|