estem 0.2.4 → 0.2.5

This diff compares the contents of two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
File without changes
data/test/test_estem.rb CHANGED
@@ -1,23 +1,28 @@
1
- # encoding: UTF-8
2
-
3
1
  require 'test/unit'
4
2
  require 'estem'
5
3
 
4
+ # NOTE:
5
+ # assuming we will run the test from the root directory of the project
6
+ # using "rake test" from the command-line
7
+
6
8
  class EStemTest < Test::Unit::TestCase
7
- def get_content(filename)
9
+ def get_content(filename, encoding='UTF-8')
8
10
  content = nil
9
- File.open(filename, 'r:UTF-8') do |f|
11
+ File.open(filename, "r:#{encoding}") do |f|
10
12
  content = f.read()
11
13
  end
12
14
  content.scan(/(\S+)(?:\s+)(\S+)/)
13
15
  end
14
16
 
15
- def test_stem
16
- # assuming we will run the test from the root directory of the project
17
- # using "rake test" from the command-line
18
- for word, good in get_content('test/diffs.txt')
17
+ def test_estem
18
+ for word, good in get_content('test/diffs_UTF8.txt')
19
19
  assert_equal(good, word.es_stem, "input: " + word)
20
- end
20
+ end
21
+
22
+ for word, good in get_content('test/diffs_ISO88591.txt', 'ISO-8859-1')
23
+ ret = word.safe_es_stem
24
+ assert_equal(good, ret, "input: " + word)
25
+ assert_equal('ISO-8859-1', ret.encoding.name.upcase)
26
+ end
21
27
  end
22
-
23
28
  end
@@ -0,0 +1,27 @@
1
+ require 'test/unit'
2
+ require 'estem'
3
+
4
+ # NOTE:
5
+ # assuming we will run the test from the root directory of the project
6
+ # using "rake test" from the command-line
7
+
8
+ class EStemTest < Test::Unit::TestCase
9
+ def get_content(filename, encoding='UTF-8')
10
+ content = nil
11
+ File.open(filename, "r:#{encoding}") do |f|
12
+ content = f.read()
13
+ end
14
+ content.scan(/(\S+)(?:\s+)(\S+)/)
15
+ end
16
+
17
+ def test_estem
18
+ for word, good in get_content('test/diffs_UTF8.txt')
19
+ assert_equal(good, word.es_stem, "input: " + word)
20
+ end
21
+
22
+ for word, good in get_content('test/diffs_ISO88591.txt', 'ISO-8859-1')
23
+ assert_equal(good, word.safe_es_stem, "input: " + word)
24
+ assert_equal(2,3)
25
+ end
26
+ end
27
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: estem
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.4
4
+ version: 0.2.5
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-06-25 00:00:00.000000000 Z
12
+ date: 2012-08-02 00:00:00.000000000 Z
13
13
  dependencies: []
14
14
  description: Spanish stemming. Based on Martin Porter's specifications. See README
15
15
  file for more information.
@@ -19,14 +19,17 @@ extensions: []
19
19
  extra_rdoc_files: []
20
20
  files:
21
21
  - Rakefile
22
- - bin/es_stem.rb
23
22
  - lib/estem.rb
23
+ - lib/estem.rb~
24
24
  - examples/usage.rb
25
+ - examples/usage.rb~
25
26
  - COPYRIGHT
26
27
  - README.rdoc
27
28
  - ChangeLog
28
- - test/diffs.txt
29
+ - test/diffs_ISO88591.txt
30
+ - test/diffs_UTF8.txt
29
31
  - test/test_estem.rb
32
+ - test/test_estem.rb~
30
33
  homepage: https://github.com/MaG21/estem
31
34
  licenses: []
32
35
  post_install_message:
@@ -52,5 +55,7 @@ signing_key:
52
55
  specification_version: 3
53
56
  summary: Spanish stemming. Based on Martin Porter's specifications.
54
57
  test_files:
55
- - test/diffs.txt
58
+ - test/diffs_ISO88591.txt
59
+ - test/diffs_UTF8.txt
56
60
  - test/test_estem.rb
61
+ - test/test_estem.rb~
data/bin/es_stem.rb DELETED
@@ -1,179 +0,0 @@
1
- #!/usr/bin/env ruby
2
- # encoding: UTF-8
3
- # :stopdoc:
4
-
5
- # Copyright (c) 2012 Manuel A. Güílamo
6
- #
7
- # Permission is hereby granted, free of charge, to any person obtaining a copy
8
- # of this software and associated documentation files (the "Software"), to deal
9
- # in the Software without restriction, including without limitation the rights
10
- # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11
- # copies of the Software, and to permit persons to whom the Software is
12
- # furnished to do so, subject to the following conditions:
13
- #
14
- # The above copyright notice and this permission notice shall be included in
15
- # all copies or substantial portions of the Software.
16
- #
17
- # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18
- # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19
- # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20
- # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21
- # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22
- # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23
- # SOFTWARE.
24
-
25
- require 'estem'
26
- require 'getoptlong'
27
- require 'iconv'
28
-
29
- $version = "0.1.10"
30
-
31
- def usage(error=false)
32
- out = error ? $stderr : $stdout
33
- out.puts DATA.read()
34
- end
35
-
36
- opts = GetoptLong.new(
37
- ['--help', '-h', GetoptLong::NO_ARGUMENT],
38
- ['--version', '-v', GetoptLong::NO_ARGUMENT],
39
- ['--file', '-f', GetoptLong::REQUIRED_ARGUMENT],
40
- ['--in-enc', '-i', GetoptLong::REQUIRED_ARGUMENT],
41
- ['--out-enc', '-o', GetoptLong::REQUIRED_ARGUMENT])
42
-
43
- opts.quiet = true
44
-
45
- filename = nil
46
- ienc = nil
47
- oenc = nil
48
-
49
- begin
50
- opts.each do |op, arg|
51
- case op
52
- when '--help'
53
- usage()
54
- exit
55
- when '--version'
56
- puts "EStem\nSpanish stemmer // lexemador\nVer: #{$version}"
57
- exit
58
- when '--file'
59
- filename = arg
60
- when '--in-enc'
61
- ienc = arg
62
- when '--out-enc'
63
- oenc = arg
64
- end
65
- end
66
- rescue GetoptLong::MissingArgument
67
- $stderr.puts 'Option requires an argument // La opción requiere un argumento'
68
- exit
69
- rescue GetoptLong::InvalidOption
70
- $stderr.puts 'Unknown option // Opción desconocida.'
71
- usage(true)
72
- exit
73
- rescue
74
- puts $!
75
- exit
76
- end
77
-
78
- if filename
79
- begin
80
- if ienc and ienc.upcase !='UTF-8'
81
- file = File.open(filename, "r:#{ienc}:UTF-8")
82
- else
83
- file = File.open(filename, 'r:UTF-8')
84
- end
85
- rescue
86
- $stderr.puts $!
87
- exit
88
- end
89
-
90
- begin
91
- hsh = {}
92
- file.each_line do|line|
93
- line.split(/[^a-záéíóúüñÁÉÍÓÚÜÑ]+/ui).each do|word|
94
- hsh[word] = word.es_stem unless hsh[word]
95
- end
96
- end
97
- rescue
98
- puts $!
99
- exit
100
- ensure
101
- file.close
102
- end
103
- else
104
- hsh = {}
105
- $stdin.each_line do|line|
106
- if ienc
107
- line = Iconv.conv('UTF-8', ienc, line)
108
- else
109
- # Just in case the terminal mess with the encoding name.
110
- # Por si la terminal juega con el nombre de la codificación.
111
- line.force_encoding('UTF-8')
112
- end
113
-
114
- begin
115
- line.split(/[^a-záéíóúüñÁÉÍÓÚÜÑ]+/ui).each do|word|
116
- hsh[word] = word.es_stem unless hsh[word]
117
- end
118
- rescue Encoding::CompatibilityError
119
- if ienc
120
- msg = "incompatible encoding, please use option " +
121
- "`--in-inc' correctly. //\n" +
122
- "codificación incompatible, por favor use la " +
123
- "opción `--in-inc' correctamente."
124
- else
125
- msg="incompatible encoding, please use option `--in-inc'."+
126
- " //\ncodificación incompatible, por favor use la " +
127
- "opción `--in-inc'."
128
- end
129
-
130
- if oenc
131
- msg = Iconv.conv(oenc, 'UTF-8', msg)
132
- end
133
-
134
- $stderr.puts msg
135
- exit
136
- rescue
137
- puts $!
138
- exit
139
- end
140
- end
141
- end
142
-
143
- if oenc
144
- begin
145
- hsh.each_pair do |k,v|
146
- puts Iconv.conv(oenc, 'UTF-8', "#{k} => #{v}")
147
- end
148
- rescue
149
- puts $!
150
- exit
151
- end
152
- else
153
- hsh.each_pair{ |k,v| puts "#{k} => #{v}" }
154
- end
155
-
156
- __END__
157
- Use: es_stem [OPTION]...
158
-
159
- Options:
160
- --help, -h display this help and exit. // Presenta esta ayuda y termina.
161
- --version, -v output version information and exit //
162
- Muestra la versión y termina.
163
- --file, -f file of words. // fichero de palabras.
164
- --in-enc, -i encoding of the file. // codificación del fichero.
165
- --out-enc, -o output encoding // codificación de salida.
166
-
167
- By default UTF-8 is used as input encoding, and if no file is specified,
168
- standard input will be used instead.
169
-
170
- You should set the option `--out-enc' if you are experimenting problems
171
- visualizing the output text.
172
-
173
- //
174
-
175
- Por defecto se usará UTF-8 como codificación de entrada, si no se especifica un
176
- fichero, la entrada estándard se usará en su lugar.
177
-
178
- Debería establecer la opción `--out-enc' si está experimentando problemas para
179
- visualizar el texto de salida.