estem 0.2.4 → 0.2.5

Sign up to get free protection for your applications and to get access to all the features.
File without changes
data/test/test_estem.rb CHANGED
@@ -1,23 +1,28 @@
1
- # encoding: UTF-8
2
-
3
1
  require 'test/unit'
4
2
  require 'estem'
5
3
 
4
+ # NOTE:
5
+ # assuming we will run the test from the root directory of the project
6
+ # using "rake test" from the command-line
7
+
6
8
  class EStemTest < Test::Unit::TestCase
7
- def get_content(filename)
9
+ def get_content(filename, encoding='UTF-8')
8
10
  content = nil
9
- File.open(filename, 'r:UTF-8') do |f|
11
+ File.open(filename, "r:#{encoding}") do |f|
10
12
  content = f.read()
11
13
  end
12
14
  content.scan(/(\S+)(?:\s+)(\S+)/)
13
15
  end
14
16
 
15
- def test_stem
16
- # assuming we will run the test from the root directory of the project
17
- # using "rake test" from the command-line
18
- for word, good in get_content('test/diffs.txt')
17
+ def test_estem
18
+ for word, good in get_content('test/diffs_UTF8.txt')
19
19
  assert_equal(good, word.es_stem, "input: " + word)
20
- end
20
+ end
21
+
22
+ for word, good in get_content('test/diffs_ISO88591.txt', 'ISO-8859-1')
23
+ ret = word.safe_es_stem
24
+ assert_equal(good, ret, "input: " + word)
25
+ assert_equal('ISO-8859-1', ret.encoding.name.upcase)
26
+ end
21
27
  end
22
-
23
28
  end
@@ -0,0 +1,27 @@
1
+ require 'test/unit'
2
+ require 'estem'
3
+
4
+ # NOTE:
5
+ # assuming we will run the test from the root directory of the project
6
+ # using "rake test" from the command-line
7
+
8
+ class EStemTest < Test::Unit::TestCase
9
+ def get_content(filename, encoding='UTF-8')
10
+ content = nil
11
+ File.open(filename, "r:#{encoding}") do |f|
12
+ content = f.read()
13
+ end
14
+ content.scan(/(\S+)(?:\s+)(\S+)/)
15
+ end
16
+
17
+ def test_estem
18
+ for word, good in get_content('test/diffs_UTF8.txt')
19
+ assert_equal(good, word.es_stem, "input: " + word)
20
+ end
21
+
22
+ for word, good in get_content('test/diffs_ISO88591.txt', 'ISO-8859-1')
23
+ assert_equal(good, word.safe_es_stem, "input: " + word)
24
+ assert_equal(2,3)
25
+ end
26
+ end
27
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: estem
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.4
4
+ version: 0.2.5
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-06-25 00:00:00.000000000 Z
12
+ date: 2012-08-02 00:00:00.000000000 Z
13
13
  dependencies: []
14
14
  description: Spanish stemming. Based on Martin Porter's specifications. See README
15
15
  file for more information.
@@ -19,14 +19,17 @@ extensions: []
19
19
  extra_rdoc_files: []
20
20
  files:
21
21
  - Rakefile
22
- - bin/es_stem.rb
23
22
  - lib/estem.rb
23
+ - lib/estem.rb~
24
24
  - examples/usage.rb
25
+ - examples/usage.rb~
25
26
  - COPYRIGHT
26
27
  - README.rdoc
27
28
  - ChangeLog
28
- - test/diffs.txt
29
+ - test/diffs_ISO88591.txt
30
+ - test/diffs_UTF8.txt
29
31
  - test/test_estem.rb
32
+ - test/test_estem.rb~
30
33
  homepage: https://github.com/MaG21/estem
31
34
  licenses: []
32
35
  post_install_message:
@@ -52,5 +55,7 @@ signing_key:
52
55
  specification_version: 3
53
56
  summary: Spanish stemming. Based on Martin Porter's specifications.
54
57
  test_files:
55
- - test/diffs.txt
58
+ - test/diffs_ISO88591.txt
59
+ - test/diffs_UTF8.txt
56
60
  - test/test_estem.rb
61
+ - test/test_estem.rb~
data/bin/es_stem.rb DELETED
@@ -1,179 +0,0 @@
1
- #!/usr/bin/env ruby
2
- # encoding: UTF-8
3
- # :stopdoc:
4
-
5
- # Copyright (c) 2012 Manuel A. Güílamo
6
- #
7
- # Permission is hereby granted, free of charge, to any person obtaining a copy
8
- # of this software and associated documentation files (the "Software"), to deal
9
- # in the Software without restriction, including without limitation the rights
10
- # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11
- # copies of the Software, and to permit persons to whom the Software is
12
- # furnished to do so, subject to the following conditions:
13
- #
14
- # The above copyright notice and this permission notice shall be included in
15
- # all copies or substantial portions of the Software.
16
- #
17
- # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18
- # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19
- # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20
- # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21
- # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22
- # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23
- # SOFTWARE.
24
-
25
- require 'estem'
26
- require 'getoptlong'
27
- require 'iconv'
28
-
29
- $version = "0.1.10"
30
-
31
- def usage(error=false)
32
- out = error ? $stderr : $stdout
33
- out.puts DATA.read()
34
- end
35
-
36
- opts = GetoptLong.new(
37
- ['--help', '-h', GetoptLong::NO_ARGUMENT],
38
- ['--version', '-v', GetoptLong::NO_ARGUMENT],
39
- ['--file', '-f', GetoptLong::REQUIRED_ARGUMENT],
40
- ['--in-enc', '-i', GetoptLong::REQUIRED_ARGUMENT],
41
- ['--out-enc', '-o', GetoptLong::REQUIRED_ARGUMENT])
42
-
43
- opts.quiet = true
44
-
45
- filename = nil
46
- ienc = nil
47
- oenc = nil
48
-
49
- begin
50
- opts.each do |op, arg|
51
- case op
52
- when '--help'
53
- usage()
54
- exit
55
- when '--version'
56
- puts "EStem\nSpanish stemmer // lexemador\nVer: #{$version}"
57
- exit
58
- when '--file'
59
- filename = arg
60
- when '--in-enc'
61
- ienc = arg
62
- when '--out-enc'
63
- oenc = arg
64
- end
65
- end
66
- rescue GetoptLong::MissingArgument
67
- $stderr.puts 'Option requires an argument // La opción requiere un argumento'
68
- exit
69
- rescue GetoptLong::InvalidOption
70
- $stderr.puts 'Unknown option // Opción desconocida.'
71
- usage(true)
72
- exit
73
- rescue
74
- puts $!
75
- exit
76
- end
77
-
78
- if filename
79
- begin
80
- if ienc and ienc.upcase !='UTF-8'
81
- file = File.open(filename, "r:#{ienc}:UTF-8")
82
- else
83
- file = File.open(filename, 'r:UTF-8')
84
- end
85
- rescue
86
- $stderr.puts $!
87
- exit
88
- end
89
-
90
- begin
91
- hsh = {}
92
- file.each_line do|line|
93
- line.split(/[^a-záéíóúüñÁÉÍÓÚÜÑ]+/ui).each do|word|
94
- hsh[word] = word.es_stem unless hsh[word]
95
- end
96
- end
97
- rescue
98
- puts $!
99
- exit
100
- ensure
101
- file.close
102
- end
103
- else
104
- hsh = {}
105
- $stdin.each_line do|line|
106
- if ienc
107
- line = Iconv.conv('UTF-8', ienc, line)
108
- else
109
- # Just in case the terminal messes with the encoding name.
110
- # Por si la terminal juega con el nombre de la codificación.
111
- line.force_encoding('UTF-8')
112
- end
113
-
114
- begin
115
- line.split(/[^a-záéíóúüñÁÉÍÓÚÜÑ]+/ui).each do|word|
116
- hsh[word] = word.es_stem unless hsh[word]
117
- end
118
- rescue Encoding::CompatibilityError
119
- if ienc
120
- msg = "incompatible encoding, please use option " +
121
- "`--in-inc' correctly. //\n" +
122
- "codificación incompatible, por favor use la " +
123
- "opción `--in-inc' correctamente."
124
- else
125
- msg="incompatible encoding, please use option `--in-inc'."+
126
- " //\ncodificación incompatible, por favor use la " +
127
- "opción `--in-inc'."
128
- end
129
-
130
- if oenc
131
- msg = Iconv.conv(oenc, 'UTF-8', msg)
132
- end
133
-
134
- $stderr.puts msg
135
- exit
136
- rescue
137
- puts $!
138
- exit
139
- end
140
- end
141
- end
142
-
143
- if oenc
144
- begin
145
- hsh.each_pair do |k,v|
146
- puts Iconv.conv(oenc, 'UTF-8', "#{k} => #{v}")
147
- end
148
- rescue
149
- puts $!
150
- exit
151
- end
152
- else
153
- hsh.each_pair{ |k,v| puts "#{k} => #{v}" }
154
- end
155
-
156
- __END__
157
- Use: es_stem [OPTION]...
158
-
159
- Options:
160
- --help, -h display this help and exit. // Presenta esta ayuda y termina.
161
- --version, -v output version information and exit //
162
- Muestra la versión y termina.
163
- --file, -f file of words. // fichero de palabras.
164
- --in-enc, -i encoding of the file. // codificación del fichero.
165
- --out-enc, -o output encoding // codificación de salida.
166
-
167
- By default UTF-8 is used as input encoding, and if no file is specified,
168
- standard input will be used instead.
169
-
170
- You should set the option `--out-enc' if you are experiencing problems
171
- visualizing the output text.
172
-
173
- //
174
-
175
- Por defecto se usará UTF-8 como codificación de entrada, si no se especifica un
176
- fichero, la entrada estándar se usará en su lugar.
177
-
178
- Debería establecer la opción `--out-enc' si está experimentando problemas para
179
- visualizar el texto de salida.