stemmer4r 0.5 → 0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/ext/stemmer4r/stemmer4r.c +22 -6
- data/lib/stemmer.rb +70 -0
- data/lib/stemmer_utf8.rb +66 -0
- data/stemmer4r.gemspec +3 -3
- data/test/test.rb +73 -3
- metadata +6 -3
data/ext/stemmer4r/stemmer4r.c
CHANGED
@@ -82,13 +82,25 @@ stemmer_free(struct sb_stemmer* stemmer)
|
|
82
82
|
* You can stem:
|
83
83
|
*
|
84
84
|
* * a single word:
|
85
|
+
*
|
85
86
|
* puts stemmer.stem('été')
|
86
87
|
*
|
87
88
|
* * an array:
|
89
|
+
*
|
88
90
|
* puts stemmer.stem(%w{télévision chapeau ordinateur})
|
89
91
|
*
|
90
|
-
* * a
|
91
|
-
*
|
92
|
+
* * a string of words:
|
93
|
+
*
|
94
|
+
* puts stemmer.stem("Une télévision sur un chapeau d ordinateur")
|
95
|
+
*
|
96
|
+
* The fastest way to stem is to pass a string of words separated by spaces. You have to clean your string before stemming:
|
97
|
+
* remove all punctuation characters (! , ; : ! ? ...). All stemmed words in the string will be separated with one space.
|
98
|
+
*
|
99
|
+
* puts stemmer.stem("Une   télévision  sur un    chapeau d ordinateur")
|
100
|
+
*
|
101
|
+
* will return the same stemmed string as
|
102
|
+
*
|
103
|
+
* puts stemmer.stem("Une télévision sur un chapeau d ordinateur")
|
92
104
|
*
|
93
105
|
*/
|
94
106
|
static VALUE
|
@@ -119,12 +131,16 @@ stemmer_allocate(VALUE klass, VALUE algorithm)
|
|
119
131
|
* Document-method: stem
|
120
132
|
* call-seq: stem(obj)
|
121
133
|
*
|
122
|
-
* Stems a word, a
|
134
|
+
* Stems a word, a string of words separated by spaces or an array of words and returns the result (always in lowercase).
|
123
135
|
* +obj+ is always converted to lowercase before stemming (mandatory for snowball algorithms to work).
|
124
136
|
*
|
125
137
|
* === Parameters
|
126
138
|
*
|
127
|
-
* +obj+:: word,
|
139
|
+
* +obj+:: word, string of words or array of words to stem.
|
140
|
+
*
|
141
|
+
* All strings must be encoded the 'right' way (iso-8859-1 for French, for example).
|
142
|
+
* If you want to be able to 'transparently' stem UTF-8 characters, see the Stemmable_utf8 module.
|
143
|
+
*
|
128
144
|
*/
|
129
145
|
static VALUE
|
130
146
|
stemmer_stem(VALUE self, VALUE obj)
|
@@ -144,7 +160,7 @@ stemmer_stem(VALUE self, VALUE obj)
|
|
144
160
|
case T_STRING:
|
145
161
|
word_lowercase = rb_funcall2(obj, rb_intern("downcase"), 0, 0);
|
146
162
|
cword = strdup(STR2CSTR(word_lowercase));
|
147
|
-
//
|
163
|
+
// String of words
|
148
164
|
if (strchr(cword, ' '))
|
149
165
|
{
|
150
166
|
ret = rb_str_new2("");
|
@@ -172,7 +188,7 @@ stemmer_stem(VALUE self, VALUE obj)
|
|
172
188
|
cword = strdup(STR2CSTR(word_lowercase));
|
173
189
|
rb_ary_push(ret, rb_str_new2(sb_stemmer_stem(stemmer, cword, RSTRING(RARRAY(obj)->ptr[i])->len)));
|
174
190
|
}
|
175
|
-
free(cword);
|
191
|
+
if (RARRAY(obj)->len) free(cword);
|
176
192
|
break;
|
177
193
|
default:
|
178
194
|
rb_raise(rb_eTypeError, "not valid value");
|
data/lib/stemmer.rb
ADDED
@@ -0,0 +1,70 @@
|
|
1
|
+
#
|
2
|
+
# == Stemmable module
|
3
|
+
#
|
4
|
+
# This module is automatically added to the String and Array classes when you:
|
5
|
+
#
|
6
|
+
# require 'stemmer'
|
7
|
+
#
|
8
|
+
# It adds a +stem+ method to String and Array.
|
9
|
+
#
|
10
|
+
# str = 'this is a string'
|
11
|
+
#
|
12
|
+
# stemmed_str = str.stem
|
13
|
+
#
|
14
|
+
# array = %w{this is an array}
|
15
|
+
#
|
16
|
+
# stemmed_array = array.stem
|
17
|
+
#
|
18
|
+
# By default, stemming occurs in english. If you want to stem in another language, just give it as a parameter:
|
19
|
+
#
|
20
|
+
# str = 'Chaîne de caractères française'
|
21
|
+
#
|
22
|
+
# stemmed_str = str.stem('fr')
|
23
|
+
#
|
24
|
+
# Or you can change the default configuration:
|
25
|
+
#
|
26
|
+
# Stemmable::stemmer_default_language = 'fr'
|
27
|
+
#
|
28
|
+
# stemmed_str = str.stem
|
29
|
+
#
|
30
|
+
module Stemmable
|
31
|
+
begin
|
32
|
+
require 'rubygems'
|
33
|
+
require_gem 'stemmer4r'
|
34
|
+
rescue LoadError
|
35
|
+
require 'stemmer4r'
|
36
|
+
end
|
37
|
+
|
38
|
+
@@stemmer_default_language = 'en'
|
39
|
+
@@stemmer = Stemmer.new('en')
|
40
|
+
@@UTF8_MAP = {
|
41
|
+
'fr' => 'iso-8859-1'
|
42
|
+
}
|
43
|
+
|
44
|
+
def Stemmable.stemmer_default_language=(language)
|
45
|
+
@@stemmer_default_language = language
|
46
|
+
@@stemmer = Stemmer.new(language)
|
47
|
+
language
|
48
|
+
end
|
49
|
+
|
50
|
+
def Stemmable.stemmer_default_language
|
51
|
+
return @@stemmer_default_language
|
52
|
+
end
|
53
|
+
|
54
|
+
def stem(language = nil)
|
55
|
+
if (language.nil?)
|
56
|
+
@@stemmer.stem(self)
|
57
|
+
else
|
58
|
+
stemmer = Stemmer.new(language)
|
59
|
+
stemmer.stem(self)
|
60
|
+
end
|
61
|
+
end
|
62
|
+
end
|
63
|
+
|
64
|
+
class String
|
65
|
+
include Stemmable
|
66
|
+
end
|
67
|
+
|
68
|
+
class Array
|
69
|
+
include Stemmable
|
70
|
+
end
|
data/lib/stemmer_utf8.rb
ADDED
@@ -0,0 +1,66 @@
|
|
1
|
+
#
|
2
|
+
# == Stemmable_utf8 module
|
3
|
+
#
|
4
|
+
# This module is automatically added to the String and Array classes when you:
|
5
|
+
#
|
6
|
+
# require 'stemmer_utf8'
|
7
|
+
#
|
8
|
+
# It adds a +stem_utf8+ method to String and Array.
|
9
|
+
#
|
10
|
+
# str_utf8 = 'this is a UTF-8 encoded string'
|
11
|
+
#
|
12
|
+
# stemmed_str_utf8 = str_utf8.stem_utf8
|
13
|
+
#
|
14
|
+
# array_utf8 = %w{this is an array with utf8 characters}
|
15
|
+
#
|
16
|
+
# stemmed_array_utf8 = array_utf8.stem_utf8
|
17
|
+
#
|
18
|
+
# By default, stemming occurs in english. If you want to stem in another language, just give it as a parameter:
|
19
|
+
#
|
20
|
+
# str_utf8 = 'Chaîne de caractères française en UTF-8'
|
21
|
+
#
|
22
|
+
# stemmed_str_utf8 = str_utf8.stem_utf8('fr')
|
23
|
+
#
|
24
|
+
# Or you can change the default configuration:
|
25
|
+
#
|
26
|
+
# Stemmable::stemmer_default_language = 'fr'
|
27
|
+
#
|
28
|
+
# stemmed_str_utf8 = str_utf8.stem_utf8
|
29
|
+
#
|
30
|
+
module Stemmable_utf8
|
31
|
+
include Stemmable
|
32
|
+
|
33
|
+
def stem_utf8(language = nil)
|
34
|
+
require 'iconv'
|
35
|
+
if (language.nil?)
|
36
|
+
language = @@stemmer_default_language
|
37
|
+
stemmer = @@stemmer
|
38
|
+
else
|
39
|
+
stemmer = Stemmer.new(language)
|
40
|
+
end
|
41
|
+
language_encoding = @@UTF8_MAP[language] || 'iso-8859-1'
|
42
|
+
if self.is_a?(String)
|
43
|
+
Iconv.new('utf-8', language_encoding).iconv(stemmer.stem(Iconv.new(language_encoding, 'utf-8').iconv(self)))
|
44
|
+
elsif self.is_a?(Array)
|
45
|
+
temp = []
|
46
|
+
output = []
|
47
|
+
Iconv.open(language_encoding, 'utf-8') do |cd|
|
48
|
+
self.each { |s| temp << cd.iconv(s) + cd.iconv(nil) }
|
49
|
+
end
|
50
|
+
Iconv.open('utf-8', language_encoding) do |cd|
|
51
|
+
stemmer.stem(temp).each { |s| output << cd.iconv(s) + cd.iconv(nil) }
|
52
|
+
end
|
53
|
+
output
|
54
|
+
else
|
55
|
+
raise 'no valid type'
|
56
|
+
end
|
57
|
+
end
|
58
|
+
end
|
59
|
+
|
60
|
+
class String
|
61
|
+
include Stemmable_utf8
|
62
|
+
end
|
63
|
+
|
64
|
+
class Array
|
65
|
+
include Stemmable_utf8
|
66
|
+
end
|
data/stemmer4r.gemspec
CHANGED
@@ -3,7 +3,7 @@ require 'rake'
|
|
3
3
|
|
4
4
|
spec = Gem::Specification.new do |s|
|
5
5
|
s.name = 'stemmer4r'
|
6
|
-
s.version = '0.
|
6
|
+
s.version = '0.6'
|
7
7
|
s.author = "Fabien POTENCIER"
|
8
8
|
s.email = "fabien.potencier@gmail.com"
|
9
9
|
s.homepage = "http://stemmer4r.rubyforge.org"
|
@@ -14,9 +14,9 @@ spec = Gem::Specification.new do |s|
|
|
14
14
|
s.description = <<-EOF
|
15
15
|
Stemmer4r is a Ruby extension that wraps the snowball stemmer library (libstemmer).
|
16
16
|
EOF
|
17
|
-
s.files = FileList['ext/**/*', 'test/**/*', 'stemmer4r.gemspec', 'README', 'LICENSE'].to_a
|
17
|
+
s.files = FileList['lib/**/*', 'ext/**/*', 'test/**/*', 'stemmer4r.gemspec', 'README', 'LICENSE'].to_a
|
18
18
|
s.extensions << "ext/stemmer4r/extconf.rb"
|
19
|
-
s.
|
19
|
+
s.require_paths << 'ext'
|
20
20
|
s.autorequire = 'stemmer4r'
|
21
21
|
s.has_rdoc = true
|
22
22
|
s.rdoc_options = [
|
data/test/test.rb
CHANGED
@@ -7,18 +7,65 @@ rescue LoadError
|
|
7
7
|
require 'stemmer4r'
|
8
8
|
end
|
9
9
|
|
10
|
+
require 'stemmer'
|
10
11
|
require 'benchmark'
|
11
12
|
require 'test/unit'
|
12
13
|
|
13
14
|
class StemmerTest < Test::Unit::TestCase
|
15
|
+
begin
|
16
|
+
require 'iconv'
|
17
|
+
require 'stemmer_utf8'
|
18
|
+
@@has_iconv = true
|
19
|
+
rescue
|
20
|
+
@@has_iconv = false
|
21
|
+
end
|
22
|
+
|
23
|
+
def test_utf8
|
24
|
+
return unless @@has_iconv
|
25
|
+
|
26
|
+
utf8_str_input = Iconv.new('utf-8', 'iso-8859-1').iconv('aimera t�l�vision')
|
27
|
+
utf8_str_output = Iconv.new('utf-8', 'iso-8859-1').iconv('aim t�l�vis')
|
28
|
+
assert_equal(utf8_str_output, utf8_str_input.stem_utf8('fr'))
|
29
|
+
|
30
|
+
utf8_str_input = Iconv.new('utf-8', 'iso-8859-1').iconv('love independant')
|
31
|
+
utf8_str_output = Iconv.new('utf-8', 'iso-8859-1').iconv('love independ')
|
32
|
+
assert_equal(utf8_str_output, utf8_str_input.stem_utf8)
|
33
|
+
|
34
|
+
utf8_ary_input = []
|
35
|
+
%w{aimera t�l�vision grandiose}.each { |k|
|
36
|
+
utf8_ary_input << Iconv.new('utf-8', 'iso-8859-1').iconv(k)
|
37
|
+
}
|
38
|
+
utf8_ary_output = []
|
39
|
+
%w{aim t�l�vis grandios}.each { |k|
|
40
|
+
utf8_ary_output << Iconv.new('utf-8', 'iso-8859-1').iconv(k)
|
41
|
+
}
|
42
|
+
assert_equal(utf8_ary_output, utf8_ary_input.stem_utf8('fr'))
|
43
|
+
end
|
44
|
+
|
45
|
+
def test_stemmable
|
46
|
+
assert_equal('il aim utilis le ordin dan le avion', 'il aime utiliser les ordinateurs dans les avions'.stem('fr'))
|
47
|
+
assert_equal('aim ordin', 'aime ordinateurs'.stem('fr'))
|
48
|
+
assert_equal('ordin jeux avion aim', %w{ordinateur jeux avion aimer}.stem('fr').join(' '))
|
49
|
+
assert_equal('ordin', %w{ordinateur}.stem('fr').join(' '))
|
50
|
+
assert_equal('aimera'.stem('fr'), 'AIMERA'.stem('fr'))
|
51
|
+
assert_equal('believable'.stem('en'), 'believable'.stem)
|
52
|
+
assert_equal('believ', 'believable'.stem)
|
53
|
+
assert_raise(ArgumentError) { 'test'.stem('notavalidlanguage') }
|
54
|
+
assert_equal('', ''.stem)
|
55
|
+
assert_equal([], [].stem)
|
56
|
+
assert_equal(['', ' '], ['', ' '].stem)
|
57
|
+
end
|
58
|
+
|
14
59
|
def test_sentence
|
15
60
|
s = Stemmer.new('fr')
|
16
|
-
assert_equal('il aim utilis le ordin dan le avion', s.stem('il aime utiliser
|
61
|
+
assert_equal('il aim utilis le ordin dan le avion', s.stem('il aime utiliser les ordinateurs dans les avions'))
|
62
|
+
assert_equal('aim ordin', s.stem('aime ordinateurs'))
|
17
63
|
end
|
18
64
|
|
19
65
|
def test_array
|
20
66
|
s = Stemmer.new('fr')
|
21
67
|
assert_equal('ordin jeux avion aim', s.stem(%w{ordinateur jeux avion aimer}).join(' '))
|
68
|
+
assert_equal('ordin', s.stem(%w{ordinateur}).join(' '))
|
22
69
|
end
|
23
70
|
|
24
71
|
def test_lowercase
|
@@ -52,17 +99,40 @@ class StemmerTest < Test::Unit::TestCase
|
|
52
99
|
}
|
53
100
|
}
|
54
101
|
x.report {
|
55
|
-
%w{
|
102
|
+
%w{da de nl en es fi fr it no pt ru sv}.each { |language|
|
56
103
|
puts "Testing '#{language}' algorithm (a single array)..."
|
57
104
|
assert_equal(output[language], s[language].stem(input[language]))
|
58
105
|
}
|
59
106
|
}
|
60
107
|
x.report {
|
61
|
-
%w{
|
108
|
+
%w{da de nl en es fi fr it no pt ru sv}.each { |language|
|
62
109
|
puts "Testing '#{language}' algorithm (a single sentence)..."
|
63
110
|
assert_equal(output[language].join(' '), s[language].stem(input[language].join(' ')))
|
64
111
|
}
|
65
112
|
}
|
113
|
+
x.report {
|
114
|
+
%w{da de nl en es fi fr it no pt ru sv}.each { |language|
|
115
|
+
puts "Testing '#{language}' algorithm (a single array - Array module)..."
|
116
|
+
assert_equal(output[language], input[language].stem(language))
|
117
|
+
}
|
118
|
+
}
|
119
|
+
x.report {
|
120
|
+
%w{da de nl en es fi fr it no pt ru sv}.each { |language|
|
121
|
+
puts "Testing '#{language}' algorithm (a single sentence - String module)..."
|
122
|
+
assert_equal(output[language].join(' '), input[language].join(' ').stem(language))
|
123
|
+
}
|
124
|
+
}
|
125
|
+
|
126
|
+
return unless @@has_iconv
|
127
|
+
|
128
|
+
x.report {
|
129
|
+
%w{da de nl en es fi fr it no pt ru sv}.each { |language|
|
130
|
+
utf8_input = Iconv.new('utf-8', 'iso-8859-1').iconv(input[language].join(' '))
|
131
|
+
utf8_output = Iconv.new('utf-8', 'iso-8859-1').iconv(output[language].join(' '))
|
132
|
+
puts "Testing '#{language}' algorithm (a single sentence - String module - utf8)..."
|
133
|
+
assert_equal(utf8_output, utf8_input.stem_utf8(language))
|
134
|
+
}
|
135
|
+
}
|
66
136
|
}
|
67
137
|
end
|
68
138
|
end
|
metadata
CHANGED
@@ -3,11 +3,12 @@ rubygems_version: 0.8.10
|
|
3
3
|
specification_version: 1
|
4
4
|
name: stemmer4r
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: "0.
|
7
|
-
date: 2005-05-
|
6
|
+
version: "0.6"
|
7
|
+
date: 2005-05-13
|
8
8
|
summary: Stemmer4r is a Ruby extension that wraps the snowball stemmer library (libstemmer).
|
9
9
|
require_paths:
|
10
|
-
-
|
10
|
+
- lib
|
11
|
+
- ext
|
11
12
|
email: fabien.potencier@gmail.com
|
12
13
|
homepage: http://stemmer4r.rubyforge.org
|
13
14
|
rubyforge_project: stemmer4r
|
@@ -27,6 +28,8 @@ platform: ruby
|
|
27
28
|
authors:
|
28
29
|
- Fabien POTENCIER
|
29
30
|
files:
|
31
|
+
- lib/stemmer.rb
|
32
|
+
- lib/stemmer_utf8.rb
|
30
33
|
- ext/stemmer4r
|
31
34
|
- ext/stemmer4r/stemmer4r.c
|
32
35
|
- ext/stemmer4r/libstemmer_c
|