stemmer4r 0.5 → 0.6
Sign up to get free protection for your applications and to get access to all the features.
- data/ext/stemmer4r/stemmer4r.c +22 -6
- data/lib/stemmer.rb +70 -0
- data/lib/stemmer_utf8.rb +66 -0
- data/stemmer4r.gemspec +3 -3
- data/test/test.rb +73 -3
- metadata +6 -3
data/ext/stemmer4r/stemmer4r.c
CHANGED
@@ -82,13 +82,25 @@ stemmer_free(struct sb_stemmer* stemmer)
|
|
82
82
|
* You can stem:
|
83
83
|
*
|
84
84
|
* * a single word:
|
85
|
+
*
|
85
86
|
* puts stemmer.stem('été')
|
86
87
|
*
|
87
88
|
* * an array:
|
89
|
+
*
|
88
90
|
* puts stemmer.stem(%w{télévision chapeau ordinateur})
|
89
91
|
*
|
90
|
-
* * a
|
91
|
-
*
|
92
|
+
* * a string of words:
|
93
|
+
*
|
94
|
+
* puts stemmer.stem("Une télévision sur un chapeau d ordinateur")
|
95
|
+
*
|
96
|
+
* The fastest way to stem is to pass a string of words separated by spaces. You have to clean your string before stemming:
|
97
|
+
* remove all punctuation characters (! , ; : ? ...). All stemmed words in the string will be separated with one space.
|
98
|
+
*
|
99
|
+
* puts stemmer.stem("Une télévision sur un chapeau d ordinateur")
|
100
|
+
*
|
101
|
+
* will return the same stemmed string as
|
102
|
+
*
|
103
|
+
* puts stemmer.stem("Une télévision sur un chapeau d ordinateur")
|
92
104
|
*
|
93
105
|
*/
|
94
106
|
static VALUE
|
@@ -119,12 +131,16 @@ stemmer_allocate(VALUE klass, VALUE algorithm)
|
|
119
131
|
* Document-method: stem
|
120
132
|
* call-seq: stem(obj)
|
121
133
|
*
|
122
|
-
* Stems a word, a
|
134
|
+
* Stems a word, a string of words separated by spaces or an array of words and returns the result (always in lowercase).
|
123
135
|
* +obj+ is always converted to lowercase before stemming (mandatory for snowball algorithms to work).
|
124
136
|
*
|
125
137
|
* === Parameters
|
126
138
|
*
|
127
|
-
* +obj+:: word,
|
139
|
+
* +obj+:: word, string of words or array of words to stem.
|
140
|
+
*
|
141
|
+
* All strings must be encoded the 'right' way (iso-8859-1 for French, for example).
|
142
|
+
* If you want to be able to 'transparently' stem UTF-8 characters, see the Stemmable_utf8 module.
|
143
|
+
*
|
128
144
|
*/
|
129
145
|
static VALUE
|
130
146
|
stemmer_stem(VALUE self, VALUE obj)
|
@@ -144,7 +160,7 @@ stemmer_stem(VALUE self, VALUE obj)
|
|
144
160
|
case T_STRING:
|
145
161
|
word_lowercase = rb_funcall2(obj, rb_intern("downcase"), 0, 0);
|
146
162
|
cword = strdup(STR2CSTR(word_lowercase));
|
147
|
-
//
|
163
|
+
// String of words
|
148
164
|
if (strchr(cword, ' '))
|
149
165
|
{
|
150
166
|
ret = rb_str_new2("");
|
@@ -172,7 +188,7 @@ stemmer_stem(VALUE self, VALUE obj)
|
|
172
188
|
cword = strdup(STR2CSTR(word_lowercase));
|
173
189
|
rb_ary_push(ret, rb_str_new2(sb_stemmer_stem(stemmer, cword, RSTRING(RARRAY(obj)->ptr[i])->len)));
|
174
190
|
}
|
175
|
-
free(cword);
|
191
|
+
if (RARRAY(obj)->len) free(cword);
|
176
192
|
break;
|
177
193
|
default:
|
178
194
|
rb_raise(rb_eTypeError, "not valid value");
|
data/lib/stemmer.rb
ADDED
@@ -0,0 +1,70 @@
|
|
1
|
+
#
|
2
|
+
# == Stemmable module
|
3
|
+
#
|
4
|
+
# This module is automatically added to the String and Array classes when you:
|
5
|
+
#
|
6
|
+
# require 'stemmer'
|
7
|
+
#
|
8
|
+
# It adds a +stem+ method to String and Array.
|
9
|
+
#
|
10
|
+
# str = 'this is a string'
|
11
|
+
#
|
12
|
+
# stemmed_str = str.stem
|
13
|
+
#
|
14
|
+
# array = %w{this is an array}
|
15
|
+
#
|
16
|
+
# stemmed_array = array.stem
|
17
|
+
#
|
18
|
+
# By default, stemming occurs in English. If you want to stem in another language, just give it as a parameter:
|
19
|
+
#
|
20
|
+
# str = 'Chaîne de caractères française'
|
21
|
+
#
|
22
|
+
# stemmed_str = str.stem('fr')
|
23
|
+
#
|
24
|
+
# Or you can change the default configuration:
|
25
|
+
#
|
26
|
+
# Stemmable::stemmer_default_language = 'fr'
|
27
|
+
#
|
28
|
+
# stemmed_str = str.stem
|
29
|
+
#
|
30
|
+
module Stemmable
|
31
|
+
begin
|
32
|
+
require 'rubygems'
|
33
|
+
require_gem 'stemmer4r'
|
34
|
+
rescue LoadError
|
35
|
+
require 'stemmer4r'
|
36
|
+
end
|
37
|
+
|
38
|
+
@@stemmer_default_language = 'en'
|
39
|
+
@@stemmer = Stemmer.new('en')
|
40
|
+
@@UTF8_MAP = {
|
41
|
+
'fr' => 'iso-8859-1'
|
42
|
+
}
|
43
|
+
|
44
|
+
def Stemmable.stemmer_default_language=(language)
|
45
|
+
@@stemmer_default_language = language
|
46
|
+
@@stemmer = Stemmer.new(language)
|
47
|
+
language
|
48
|
+
end
|
49
|
+
|
50
|
+
def Stemmable.stemmer_default_language
|
51
|
+
return @@stemmer_default_language
|
52
|
+
end
|
53
|
+
|
54
|
+
def stem(language = nil)
|
55
|
+
if (language.nil?)
|
56
|
+
@@stemmer.stem(self)
|
57
|
+
else
|
58
|
+
stemmer = Stemmer.new(language)
|
59
|
+
stemmer.stem(self)
|
60
|
+
end
|
61
|
+
end
|
62
|
+
end
|
63
|
+
|
64
|
+
class String
|
65
|
+
include Stemmable
|
66
|
+
end
|
67
|
+
|
68
|
+
class Array
|
69
|
+
include Stemmable
|
70
|
+
end
|
data/lib/stemmer_utf8.rb
ADDED
@@ -0,0 +1,66 @@
|
|
1
|
+
#
|
2
|
+
# == Stemmable_utf8 module
|
3
|
+
#
|
4
|
+
# This module is automatically added to the String and Array classes when you:
|
5
|
+
#
|
6
|
+
# require 'stemmer_utf8'
|
7
|
+
#
|
8
|
+
# It adds a +stem_utf8+ method to String and Array.
|
9
|
+
#
|
10
|
+
# str_utf8 = 'this is a UTF-8 encoded string'
|
11
|
+
#
|
12
|
+
# stemmed_str_utf8 = str_utf8.stem_utf8
|
13
|
+
#
|
14
|
+
# array_utf8 = %w{this is an array with utf8 characters}
|
15
|
+
#
|
16
|
+
# stemmed_array_utf8 = array_utf8.stem_utf8
|
17
|
+
#
|
18
|
+
# By default, stemming occurs in English. If you want to stem in another language, just give it as a parameter:
|
19
|
+
#
|
20
|
+
# str_utf8 = 'Chaîne de caractères française en UTF-8'
|
21
|
+
#
|
22
|
+
# stemmed_str_utf8 = str_utf8.stem_utf8('fr')
|
23
|
+
#
|
24
|
+
# Or you can change the default configuration:
|
25
|
+
#
|
26
|
+
# Stemmable::stemmer_default_language = 'fr'
|
27
|
+
#
|
28
|
+
# stemmed_str_utf8 = str_utf8.stem_utf8
|
29
|
+
#
|
30
|
+
module Stemmable_utf8
|
31
|
+
include Stemmable
|
32
|
+
|
33
|
+
def stem_utf8(language = nil)
|
34
|
+
require 'iconv'
|
35
|
+
if (language.nil?)
|
36
|
+
language = @@stemmer_default_language
|
37
|
+
stemmer = @@stemmer
|
38
|
+
else
|
39
|
+
stemmer = Stemmer.new(language)
|
40
|
+
end
|
41
|
+
language_encoding = @@UTF8_MAP[language] || 'iso-8859-1'
|
42
|
+
if self.is_a?(String)
|
43
|
+
Iconv.new('utf-8', language_encoding).iconv(stemmer.stem(Iconv.new(language_encoding, 'utf-8').iconv(self)))
|
44
|
+
elsif self.is_a?(Array)
|
45
|
+
temp = []
|
46
|
+
output = []
|
47
|
+
Iconv.open(language_encoding, 'utf-8') do |cd|
|
48
|
+
self.each { |s| temp << cd.iconv(s) + cd.iconv(nil) }
|
49
|
+
end
|
50
|
+
Iconv.open('utf-8', language_encoding) do |cd|
|
51
|
+
stemmer.stem(temp).each { |s| output << cd.iconv(s) + cd.iconv(nil) }
|
52
|
+
end
|
53
|
+
output
|
54
|
+
else
|
55
|
+
raise 'no valid type'
|
56
|
+
end
|
57
|
+
end
|
58
|
+
end
|
59
|
+
|
60
|
+
class String
|
61
|
+
include Stemmable_utf8
|
62
|
+
end
|
63
|
+
|
64
|
+
class Array
|
65
|
+
include Stemmable_utf8
|
66
|
+
end
|
data/stemmer4r.gemspec
CHANGED
@@ -3,7 +3,7 @@ require 'rake'
|
|
3
3
|
|
4
4
|
spec = Gem::Specification.new do |s|
|
5
5
|
s.name = 'stemmer4r'
|
6
|
-
s.version = '0.
|
6
|
+
s.version = '0.6'
|
7
7
|
s.author = "Fabien POTENCIER"
|
8
8
|
s.email = "fabien.potencier@gmail.com"
|
9
9
|
s.homepage = "http://stemmer4r.rubyforge.org"
|
@@ -14,9 +14,9 @@ spec = Gem::Specification.new do |s|
|
|
14
14
|
s.description = <<-EOF
|
15
15
|
Stemmer4r is a Ruby extension that wraps the snowball stemmer library (libstemmer).
|
16
16
|
EOF
|
17
|
-
s.files = FileList['ext/**/*', 'test/**/*', 'stemmer4r.gemspec', 'README', 'LICENSE'].to_a
|
17
|
+
s.files = FileList['lib/**/*', 'ext/**/*', 'test/**/*', 'stemmer4r.gemspec', 'README', 'LICENSE'].to_a
|
18
18
|
s.extensions << "ext/stemmer4r/extconf.rb"
|
19
|
-
s.
|
19
|
+
s.require_paths << 'ext'
|
20
20
|
s.autorequire = 'stemmer4r'
|
21
21
|
s.has_rdoc = true
|
22
22
|
s.rdoc_options = [
|
data/test/test.rb
CHANGED
@@ -7,18 +7,65 @@ rescue LoadError
|
|
7
7
|
require 'stemmer4r'
|
8
8
|
end
|
9
9
|
|
10
|
+
require 'stemmer'
|
10
11
|
require 'benchmark'
|
11
12
|
require 'test/unit'
|
12
13
|
|
13
14
|
class StemmerTest < Test::Unit::TestCase
|
15
|
+
begin
|
16
|
+
require 'iconv'
|
17
|
+
require 'stemmer_utf8'
|
18
|
+
@@has_iconv = true
|
19
|
+
rescue
|
20
|
+
@@has_iconv = false
|
21
|
+
end
|
22
|
+
|
23
|
+
def test_utf8
|
24
|
+
return unless @@has_iconv
|
25
|
+
|
26
|
+
utf8_str_input = Iconv.new('utf-8', 'iso-8859-1').iconv('aimera t�l�vision')
|
27
|
+
utf8_str_output = Iconv.new('utf-8', 'iso-8859-1').iconv('aim t�l�vis')
|
28
|
+
assert_equal(utf8_str_output, utf8_str_input.stem_utf8('fr'))
|
29
|
+
|
30
|
+
utf8_str_input = Iconv.new('utf-8', 'iso-8859-1').iconv('love independant')
|
31
|
+
utf8_str_output = Iconv.new('utf-8', 'iso-8859-1').iconv('love independ')
|
32
|
+
assert_equal(utf8_str_output, utf8_str_input.stem_utf8)
|
33
|
+
|
34
|
+
utf8_ary_input = []
|
35
|
+
%w{aimera t�l�vision grandiose}.each { |k|
|
36
|
+
utf8_ary_input << Iconv.new('utf-8', 'iso-8859-1').iconv(k)
|
37
|
+
}
|
38
|
+
utf8_ary_output = []
|
39
|
+
%w{aim t�l�vis grandios}.each { |k|
|
40
|
+
utf8_ary_output << Iconv.new('utf-8', 'iso-8859-1').iconv(k)
|
41
|
+
}
|
42
|
+
assert_equal(utf8_ary_output, utf8_ary_input.stem_utf8('fr'))
|
43
|
+
end
|
44
|
+
|
45
|
+
def test_stemmable
|
46
|
+
assert_equal('il aim utilis le ordin dan le avion', 'il aime utiliser les ordinateurs dans les avions'.stem('fr'))
|
47
|
+
assert_equal('aim ordin', 'aime ordinateurs'.stem('fr'))
|
48
|
+
assert_equal('ordin jeux avion aim', %w{ordinateur jeux avion aimer}.stem('fr').join(' '))
|
49
|
+
assert_equal('ordin', %w{ordinateur}.stem('fr').join(' '))
|
50
|
+
assert_equal('aimera'.stem('fr'), 'AIMERA'.stem('fr'))
|
51
|
+
assert_equal('believable'.stem('en'), 'believable'.stem)
|
52
|
+
assert_equal('believ', 'believable'.stem)
|
53
|
+
assert_raise(ArgumentError) { 'test'.stem('notavalidlanguage') }
|
54
|
+
assert_equal('', ''.stem)
|
55
|
+
assert_equal([], [].stem)
|
56
|
+
assert_equal(['', ' '], ['', ' '].stem)
|
57
|
+
end
|
58
|
+
|
14
59
|
def test_sentence
|
15
60
|
s = Stemmer.new('fr')
|
16
|
-
assert_equal('il aim utilis le ordin dan le avion', s.stem('il aime utiliser
|
61
|
+
assert_equal('il aim utilis le ordin dan le avion', s.stem('il aime utiliser les ordinateurs dans les avions'))
|
62
|
+
assert_equal('aim ordin', s.stem('aime ordinateurs'))
|
17
63
|
end
|
18
64
|
|
19
65
|
def test_array
|
20
66
|
s = Stemmer.new('fr')
|
21
67
|
assert_equal('ordin jeux avion aim', s.stem(%w{ordinateur jeux avion aimer}).join(' '))
|
68
|
+
assert_equal('ordin', s.stem(%w{ordinateur}).join(' '))
|
22
69
|
end
|
23
70
|
|
24
71
|
def test_lowercase
|
@@ -52,17 +99,40 @@ class StemmerTest < Test::Unit::TestCase
|
|
52
99
|
}
|
53
100
|
}
|
54
101
|
x.report {
|
55
|
-
%w{
|
102
|
+
%w{da de nl en es fi fr it no pt ru sv}.each { |language|
|
56
103
|
puts "Testing '#{language}' algorithm (a single array)..."
|
57
104
|
assert_equal(output[language], s[language].stem(input[language]))
|
58
105
|
}
|
59
106
|
}
|
60
107
|
x.report {
|
61
|
-
%w{
|
108
|
+
%w{da de nl en es fi fr it no pt ru sv}.each { |language|
|
62
109
|
puts "Testing '#{language}' algorithm (a single sentence)..."
|
63
110
|
assert_equal(output[language].join(' '), s[language].stem(input[language].join(' ')))
|
64
111
|
}
|
65
112
|
}
|
113
|
+
x.report {
|
114
|
+
%w{da de nl en es fi fr it no pt ru sv}.each { |language|
|
115
|
+
puts "Testing '#{language}' algorithm (a single array - Array module)..."
|
116
|
+
assert_equal(output[language], input[language].stem(language))
|
117
|
+
}
|
118
|
+
}
|
119
|
+
x.report {
|
120
|
+
%w{da de nl en es fi fr it no pt ru sv}.each { |language|
|
121
|
+
puts "Testing '#{language}' algorithm (a single sentence - String module)..."
|
122
|
+
assert_equal(output[language].join(' '), input[language].join(' ').stem(language))
|
123
|
+
}
|
124
|
+
}
|
125
|
+
|
126
|
+
return unless @@has_iconv
|
127
|
+
|
128
|
+
x.report {
|
129
|
+
%w{da de nl en es fi fr it no pt ru sv}.each { |language|
|
130
|
+
utf8_input = Iconv.new('utf-8', 'iso-8859-1').iconv(input[language].join(' '))
|
131
|
+
utf8_output = Iconv.new('utf-8', 'iso-8859-1').iconv(output[language].join(' '))
|
132
|
+
puts "Testing '#{language}' algorithm (a single sentence - String module - utf8)..."
|
133
|
+
assert_equal(utf8_output, utf8_input.stem_utf8(language))
|
134
|
+
}
|
135
|
+
}
|
66
136
|
}
|
67
137
|
end
|
68
138
|
end
|
metadata
CHANGED
@@ -3,11 +3,12 @@ rubygems_version: 0.8.10
|
|
3
3
|
specification_version: 1
|
4
4
|
name: stemmer4r
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: "0.
|
7
|
-
date: 2005-05-
|
6
|
+
version: "0.6"
|
7
|
+
date: 2005-05-13
|
8
8
|
summary: Stemmer4r is a Ruby extension that wraps the snowball stemmer library (libstemmer).
|
9
9
|
require_paths:
|
10
|
-
-
|
10
|
+
- lib
|
11
|
+
- ext
|
11
12
|
email: fabien.potencier@gmail.com
|
12
13
|
homepage: http://stemmer4r.rubyforge.org
|
13
14
|
rubyforge_project: stemmer4r
|
@@ -27,6 +28,8 @@ platform: ruby
|
|
27
28
|
authors:
|
28
29
|
- Fabien POTENCIER
|
29
30
|
files:
|
31
|
+
- lib/stemmer.rb
|
32
|
+
- lib/stemmer_utf8.rb
|
30
33
|
- ext/stemmer4r
|
31
34
|
- ext/stemmer4r/stemmer4r.c
|
32
35
|
- ext/stemmer4r/libstemmer_c
|