stemmer4r 0.5 → 0.6

Sign up to get free protection for your applications and to get access to all the features.
@@ -82,13 +82,25 @@ stemmer_free(struct sb_stemmer* stemmer)
82
82
  * You can stem:
83
83
  *
84
84
  * * a single word:
85
+ *
85
86
  * puts stemmer.stem('été')
86
87
  *
87
88
  * * an array:
89
+ *
88
90
  * puts stemmer.stem(%w{télévision chapeau ordinateur})
89
91
  *
90
- * * a sentence:
91
- * puts stemmer.stem("Une télévision sur un chapeau d'ordinateur")
92
+ * * a string of words:
93
+ *
94
+ * puts stemmer.stem("Une télévision sur un chapeau d ordinateur")
95
+ *
96
+ * The fastest way to stem is to pass a string of words separated by spaces. You have to clean your string before stemming:
97
+ * remove all punctuation characters (! , ; : ? ...). All stemmed words in the returned string will be separated by a single space.
98
+ *
99
+ * puts stemmer.stem("Une télévision sur un chapeau d ordinateur")
100
+ *
101
+ * will return the same stemmed string as
102
+ *
103
+ * puts stemmer.stem("Une télévision sur un chapeau d ordinateur")
92
104
  *
93
105
  */
94
106
  static VALUE
@@ -119,12 +131,16 @@ stemmer_allocate(VALUE klass, VALUE algorithm)
119
131
  * Document-method: stem
120
132
  * call-seq: stem(obj)
121
133
  *
122
- * Stems a word, a sentence or an array of words and returns the result (always in lowercase).
134
+ * Stems a word, a string of words separated by spaces or an array of words and returns the result (always in lowercase).
123
135
  * +obj+ is always converted to lowercase before stemming (mandatory for snowball algorithms to work).
124
136
  *
125
137
  * === Parameters
126
138
  *
127
- * +obj+:: word, sentence (string) or array of words to stem. All strings must be encoding the 'right' way (iso-8859-1 for french for example).
139
+ * +obj+:: word, string of words or array of words to stem.
140
+ *
141
+ * All strings must be encoded the 'right' way (iso-8859-1 for French, for example).
142
+ * If you want to be able to 'transparently' stem UTF-8 characters, see the Stemmable_utf8 module.
143
+ *
128
144
  */
129
145
  static VALUE
130
146
  stemmer_stem(VALUE self, VALUE obj)
@@ -144,7 +160,7 @@ stemmer_stem(VALUE self, VALUE obj)
144
160
  case T_STRING:
145
161
  word_lowercase = rb_funcall2(obj, rb_intern("downcase"), 0, 0);
146
162
  cword = strdup(STR2CSTR(word_lowercase));
147
- // Sentence
163
+ // String of words
148
164
  if (strchr(cword, ' '))
149
165
  {
150
166
  ret = rb_str_new2("");
@@ -172,7 +188,7 @@ stemmer_stem(VALUE self, VALUE obj)
172
188
  cword = strdup(STR2CSTR(word_lowercase));
173
189
  rb_ary_push(ret, rb_str_new2(sb_stemmer_stem(stemmer, cword, RSTRING(RARRAY(obj)->ptr[i])->len)));
174
190
  }
175
- free(cword);
191
+ if (RARRAY(obj)->len) free(cword);
176
192
  break;
177
193
  default:
178
194
  rb_raise(rb_eTypeError, "not valid value");
@@ -0,0 +1,70 @@
1
+ #
2
+ # == Stemmable module
3
+ #
4
+ # This module is automatically added to the String and Array classes when you:
5
+ #
6
+ # require 'stemmer'
7
+ #
8
+ # It adds a +stem+ method to String and Array.
9
+ #
10
+ # str = 'this is a string'
11
+ #
12
+ # stemmed_str = str.stem
13
+ #
14
+ # array = %w{this is an array}
15
+ #
16
+ # stemmed_array = array.stem
17
+ #
18
+ # By default, stemming is done in English. To stem in another language, pass it as a parameter:
19
+ #
20
+ # str = 'Chaîne de caractères française'
21
+ #
22
+ # stemmed_str = str.stem('fr')
23
+ #
24
+ # Or you can change the default configuration:
25
+ #
26
+ # Stemmable::stemmer_default_language = 'fr'
27
+ #
28
+ # stemmed_str = str.stem
29
+ #
30
+ module Stemmable
31
+ begin
32
+ require 'rubygems'
33
+ require_gem 'stemmer4r'
34
+ rescue LoadError
35
+ require 'stemmer4r'
36
+ end
37
+
38
+ @@stemmer_default_language = 'en'
39
+ @@stemmer = Stemmer.new('en')
40
+ @@UTF8_MAP = {
41
+ 'fr' => 'iso-8859-1'
42
+ }
43
+
44
+ def Stemmable.stemmer_default_language=(language)
45
+ @@stemmer_default_language = language
46
+ @@stemmer = Stemmer.new(language)
47
+ language
48
+ end
49
+
50
+ def Stemmable.stemmer_default_language
51
+ return @@stemmer_default_language
52
+ end
53
+
54
+ def stem(language = nil)
55
+ if (language.nil?)
56
+ @@stemmer.stem(self)
57
+ else
58
+ stemmer = Stemmer.new(language)
59
+ stemmer.stem(self)
60
+ end
61
+ end
62
+ end
63
+
64
+ class String
65
+ include Stemmable
66
+ end
67
+
68
+ class Array
69
+ include Stemmable
70
+ end
@@ -0,0 +1,66 @@
1
+ #
2
+ # == Stemmable_utf8 module
3
+ #
4
+ # This module is automatically added to the String and Array classes when you:
5
+ #
6
+ # require 'stemmer_utf8'
7
+ #
8
+ # It adds a +stem_utf8+ method to String and Array.
9
+ #
10
+ # str_utf8 = 'this is a UTF-8 encoded string'
11
+ #
12
+ # stemmed_str_utf8 = str_utf8.stem_utf8
13
+ #
14
+ # array_utf8 = %w{this is an array with utf8 characters}
15
+ #
16
+ # stemmed_array_utf8 = array_utf8.stem_utf8
17
+ #
18
+ # By default, stemming is done in English. To stem in another language, pass it as a parameter:
19
+ #
20
+ # str_utf8 = 'Chaîne de caractères française en UTF-8'
21
+ #
22
+ # stemmed_str_utf8 = str_utf8.stem_utf8('fr')
23
+ #
24
+ # Or you can change the default configuration:
25
+ #
26
+ # Stemmable::stemmer_default_language = 'fr'
27
+ #
28
+ # stemmed_str_utf8 = str_utf8.stem_utf8
29
+ #
30
+ module Stemmable_utf8
31
+ include Stemmable
32
+
33
+ def stem_utf8(language = nil)
34
+ require 'iconv'
35
+ if (language.nil?)
36
+ language = @@stemmer_default_language
37
+ stemmer = @@stemmer
38
+ else
39
+ stemmer = Stemmer.new(language)
40
+ end
41
+ language_encoding = @@UTF8_MAP[language] || 'iso-8859-1'
42
+ if self.is_a?(String)
43
+ Iconv.new('utf-8', language_encoding).iconv(stemmer.stem(Iconv.new(language_encoding, 'utf-8').iconv(self)))
44
+ elsif self.is_a?(Array)
45
+ temp = []
46
+ output = []
47
+ Iconv.open(language_encoding, 'utf-8') do |cd|
48
+ self.each { |s| temp << cd.iconv(s) + cd.iconv(nil) }
49
+ end
50
+ Iconv.open('utf-8', language_encoding) do |cd|
51
+ stemmer.stem(temp).each { |s| output << cd.iconv(s) + cd.iconv(nil) }
52
+ end
53
+ output
54
+ else
55
+ raise 'no valid type'
56
+ end
57
+ end
58
+ end
59
+
60
+ class String
61
+ include Stemmable_utf8
62
+ end
63
+
64
+ class Array
65
+ include Stemmable_utf8
66
+ end
@@ -3,7 +3,7 @@ require 'rake'
3
3
 
4
4
  spec = Gem::Specification.new do |s|
5
5
  s.name = 'stemmer4r'
6
- s.version = '0.5'
6
+ s.version = '0.6'
7
7
  s.author = "Fabien POTENCIER"
8
8
  s.email = "fabien.potencier@gmail.com"
9
9
  s.homepage = "http://stemmer4r.rubyforge.org"
@@ -14,9 +14,9 @@ spec = Gem::Specification.new do |s|
14
14
  s.description = <<-EOF
15
15
  Stemmer4r is a Ruby extension that wraps the snowball stemmer library (libstemmer).
16
16
  EOF
17
- s.files = FileList['ext/**/*', 'test/**/*', 'stemmer4r.gemspec', 'README', 'LICENSE'].to_a
17
+ s.files = FileList['lib/**/*', 'ext/**/*', 'test/**/*', 'stemmer4r.gemspec', 'README', 'LICENSE'].to_a
18
18
  s.extensions << "ext/stemmer4r/extconf.rb"
19
- s.require_path = '.'
19
+ s.require_paths << 'ext'
20
20
  s.autorequire = 'stemmer4r'
21
21
  s.has_rdoc = true
22
22
  s.rdoc_options = [
@@ -7,18 +7,65 @@ rescue LoadError
7
7
  require 'stemmer4r'
8
8
  end
9
9
 
10
+ require 'stemmer'
10
11
  require 'benchmark'
11
12
  require 'test/unit'
12
13
 
13
14
  class StemmerTest < Test::Unit::TestCase
15
+ begin
16
+ require 'iconv'
17
+ require 'stemmer_utf8'
18
+ @@has_iconv = true
19
+ rescue
20
+ @@has_iconv = false
21
+ end
22
+
23
+ def test_utf8
24
+ return unless @@has_iconv
25
+
26
+ utf8_str_input = Iconv.new('utf-8', 'iso-8859-1').iconv('aimera t�l�vision')
27
+ utf8_str_output = Iconv.new('utf-8', 'iso-8859-1').iconv('aim t�l�vis')
28
+ assert_equal(utf8_str_output, utf8_str_input.stem_utf8('fr'))
29
+
30
+ utf8_str_input = Iconv.new('utf-8', 'iso-8859-1').iconv('love independant')
31
+ utf8_str_output = Iconv.new('utf-8', 'iso-8859-1').iconv('love independ')
32
+ assert_equal(utf8_str_output, utf8_str_input.stem_utf8)
33
+
34
+ utf8_ary_input = []
35
+ %w{aimera t�l�vision grandiose}.each { |k|
36
+ utf8_ary_input << Iconv.new('utf-8', 'iso-8859-1').iconv(k)
37
+ }
38
+ utf8_ary_output = []
39
+ %w{aim t�l�vis grandios}.each { |k|
40
+ utf8_ary_output << Iconv.new('utf-8', 'iso-8859-1').iconv(k)
41
+ }
42
+ assert_equal(utf8_ary_output, utf8_ary_input.stem_utf8('fr'))
43
+ end
44
+
45
+ def test_stemmable
46
+ assert_equal('il aim utilis le ordin dan le avion', 'il aime utiliser les ordinateurs dans les avions'.stem('fr'))
47
+ assert_equal('aim ordin', 'aime ordinateurs'.stem('fr'))
48
+ assert_equal('ordin jeux avion aim', %w{ordinateur jeux avion aimer}.stem('fr').join(' '))
49
+ assert_equal('ordin', %w{ordinateur}.stem('fr').join(' '))
50
+ assert_equal('aimera'.stem('fr'), 'AIMERA'.stem('fr'))
51
+ assert_equal('believable'.stem('en'), 'believable'.stem)
52
+ assert_equal('believ', 'believable'.stem)
53
+ assert_raise(ArgumentError) { 'test'.stem('notavalidlanguage') }
54
+ assert_equal('', ''.stem)
55
+ assert_equal([], [].stem)
56
+ assert_equal(['', ' '], ['', ' '].stem)
57
+ end
58
+
14
59
  def test_sentence
15
60
  s = Stemmer.new('fr')
16
- assert_equal('il aim utilis le ordin dan le avion', s.stem('il aime utiliser les ordinateurs dans les avions'))
61
+ assert_equal('il aim utilis le ordin dan le avion', s.stem('il aime utiliser les ordinateurs dans les avions'))
62
+ assert_equal('aim ordin', s.stem('aime ordinateurs'))
17
63
  end
18
64
 
19
65
  def test_array
20
66
  s = Stemmer.new('fr')
21
67
  assert_equal('ordin jeux avion aim', s.stem(%w{ordinateur jeux avion aimer}).join(' '))
68
+ assert_equal('ordin', s.stem(%w{ordinateur}).join(' '))
22
69
  end
23
70
 
24
71
  def test_lowercase
@@ -52,17 +99,40 @@ class StemmerTest < Test::Unit::TestCase
52
99
  }
53
100
  }
54
101
  x.report {
55
- %w{fr da de nl en es fi fr it no pt ru sv}.each { |language|
102
+ %w{da de nl en es fi fr it no pt ru sv}.each { |language|
56
103
  puts "Testing '#{language}' algorithm (a single array)..."
57
104
  assert_equal(output[language], s[language].stem(input[language]))
58
105
  }
59
106
  }
60
107
  x.report {
61
- %w{fr da de nl en es fi fr it no pt ru sv}.each { |language|
108
+ %w{da de nl en es fi fr it no pt ru sv}.each { |language|
62
109
  puts "Testing '#{language}' algorithm (a single sentence)..."
63
110
  assert_equal(output[language].join(' '), s[language].stem(input[language].join(' ')))
64
111
  }
65
112
  }
113
+ x.report {
114
+ %w{da de nl en es fi fr it no pt ru sv}.each { |language|
115
+ puts "Testing '#{language}' algorithm (a single array - Array module)..."
116
+ assert_equal(output[language], input[language].stem(language))
117
+ }
118
+ }
119
+ x.report {
120
+ %w{da de nl en es fi fr it no pt ru sv}.each { |language|
121
+ puts "Testing '#{language}' algorithm (a single sentence - String module)..."
122
+ assert_equal(output[language].join(' '), input[language].join(' ').stem(language))
123
+ }
124
+ }
125
+
126
+ return unless @@has_iconv
127
+
128
+ x.report {
129
+ %w{da de nl en es fi fr it no pt ru sv}.each { |language|
130
+ utf8_input = Iconv.new('utf-8', 'iso-8859-1').iconv(input[language].join(' '))
131
+ utf8_output = Iconv.new('utf-8', 'iso-8859-1').iconv(output[language].join(' '))
132
+ puts "Testing '#{language}' algorithm (a single sentence - String module - utf8)..."
133
+ assert_equal(utf8_output, utf8_input.stem_utf8(language))
134
+ }
135
+ }
66
136
  }
67
137
  end
68
138
  end
metadata CHANGED
@@ -3,11 +3,12 @@ rubygems_version: 0.8.10
3
3
  specification_version: 1
4
4
  name: stemmer4r
5
5
  version: !ruby/object:Gem::Version
6
- version: "0.5"
7
- date: 2005-05-12
6
+ version: "0.6"
7
+ date: 2005-05-13
8
8
  summary: Stemmer4r is a Ruby extension that wraps the snowball stemmer library (libstemmer).
9
9
  require_paths:
10
- - "."
10
+ - lib
11
+ - ext
11
12
  email: fabien.potencier@gmail.com
12
13
  homepage: http://stemmer4r.rubyforge.org
13
14
  rubyforge_project: stemmer4r
@@ -27,6 +28,8 @@ platform: ruby
27
28
  authors:
28
29
  - Fabien POTENCIER
29
30
  files:
31
+ - lib/stemmer.rb
32
+ - lib/stemmer_utf8.rb
30
33
  - ext/stemmer4r
31
34
  - ext/stemmer4r/stemmer4r.c
32
35
  - ext/stemmer4r/libstemmer_c