stemmer4r 0.5 → 0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -82,13 +82,25 @@ stemmer_free(struct sb_stemmer* stemmer)
82
82
  * You can stem:
83
83
  *
84
84
  * * a single word:
85
+ *
85
86
  * puts stemmer.stem('été')
86
87
  *
87
88
  * * an array:
89
+ *
88
90
  * puts stemmer.stem(%w{télévision chapeau ordinateur})
89
91
  *
90
- * * a sentence:
91
- * puts stemmer.stem("Une télévision sur un chapeau d'ordinateur")
92
+ * * a string of words:
93
+ *
94
+ * puts stemmer.stem("Une télévision sur un chapeau d ordinateur")
95
+ *
96
+ * The fastest way to stem is to pass a string of words separated by spaces. You have to clean your string before stemming:
97
+ * remove all punctuation characters (! , ; : ? ...). All stemmed words in the string will be separated with one space.
98
+ *
99
+ * puts stemmer.stem("Une télévision sur un chapeau d ordinateur")
100
+ *
101
+ * will return the same stemmed string as
102
+ *
103
+ * puts stemmer.stem("Une télévision sur un chapeau d ordinateur")
92
104
  *
93
105
  */
94
106
  static VALUE
@@ -119,12 +131,16 @@ stemmer_allocate(VALUE klass, VALUE algorithm)
119
131
  * Document-method: stem
120
132
  * call-seq: stem(obj)
121
133
  *
122
- * Stems a word, a sentence or an array of words and returns the result (always in lowercase).
134
+ * Stems a word, a string of words separated by spaces or an array of words and returns the result (always in lowercase).
123
135
  * +obj+ is always converted to lowercase before stemming (mandatory for snowball algorithms to work).
124
136
  *
125
137
  * === Parameters
126
138
  *
127
- * +obj+:: word, sentence (string) or array of words to stem. All strings must be encoding the 'right' way (iso-8859-1 for french for example).
139
+ * +obj+:: word, string of words or array of words to stem.
140
+ *
141
+ * All strings must be encoded the 'right' way (iso-8859-1 for French, for example).
142
+ * If you want to be able to 'transparently' stem UTF-8 characters, see the Stemmable_utf8 module.
143
+ *
128
144
  */
129
145
  static VALUE
130
146
  stemmer_stem(VALUE self, VALUE obj)
@@ -144,7 +160,7 @@ stemmer_stem(VALUE self, VALUE obj)
144
160
  case T_STRING:
145
161
  word_lowercase = rb_funcall2(obj, rb_intern("downcase"), 0, 0);
146
162
  cword = strdup(STR2CSTR(word_lowercase));
147
- // Sentence
163
+ // String of words
148
164
  if (strchr(cword, ' '))
149
165
  {
150
166
  ret = rb_str_new2("");
@@ -172,7 +188,7 @@ stemmer_stem(VALUE self, VALUE obj)
172
188
  cword = strdup(STR2CSTR(word_lowercase));
173
189
  rb_ary_push(ret, rb_str_new2(sb_stemmer_stem(stemmer, cword, RSTRING(RARRAY(obj)->ptr[i])->len)));
174
190
  }
175
- free(cword);
191
+ if (RARRAY(obj)->len) free(cword);
176
192
  break;
177
193
  default:
178
194
  rb_raise(rb_eTypeError, "not valid value");
@@ -0,0 +1,70 @@
1
+ #
2
+ # == Stemmable module
3
+ #
4
+ # This module is automatically added to the String and Array classes when you:
5
+ #
6
+ # require 'stemmer'
7
+ #
8
+ # It adds a +stem+ method to String and Array.
9
+ #
10
+ # str = 'this is a string'
11
+ #
12
+ # stemmed_str = str.stem
13
+ #
14
+ # array = %w{this is an array}
15
+ #
16
+ # stemmed_array = array.stem
17
+ #
18
+ # By default, stemming occurs in english. If you want to stem in another language, just give it as a parameter:
19
+ #
20
+ # str = 'Chaîne de caractères française'
21
+ #
22
+ # stemmed_str = str.stem('fr')
23
+ #
24
+ # Or you can change the default configuration:
25
+ #
26
+ # Stemmable::stemmer_default_language = 'fr'
27
+ #
28
+ # stemmed_str = str.stem
29
+ #
30
+ module Stemmable
31
+ begin
32
+ require 'rubygems'
33
+ require_gem 'stemmer4r'
34
+ rescue LoadError
35
+ require 'stemmer4r'
36
+ end
37
+
38
+ @@stemmer_default_language = 'en'
39
+ @@stemmer = Stemmer.new('en')
40
+ @@UTF8_MAP = {
41
+ 'fr' => 'iso-8859-1'
42
+ }
43
+
44
+ def Stemmable.stemmer_default_language=(language)
45
+ @@stemmer_default_language = language
46
+ @@stemmer = Stemmer.new(language)
47
+ language
48
+ end
49
+
50
+ def Stemmable.stemmer_default_language
51
+ return @@stemmer_default_language
52
+ end
53
+
54
+ def stem(language = nil)
55
+ if (language.nil?)
56
+ @@stemmer.stem(self)
57
+ else
58
+ stemmer = Stemmer.new(language)
59
+ stemmer.stem(self)
60
+ end
61
+ end
62
+ end
63
+
64
+ class String
65
+ include Stemmable
66
+ end
67
+
68
+ class Array
69
+ include Stemmable
70
+ end
@@ -0,0 +1,66 @@
1
+ #
2
+ # == Stemmable_utf8 module
3
+ #
4
+ # This module is automatically added to the String and Array classes when you:
5
+ #
6
+ # require 'stemmer_utf8'
7
+ #
8
+ # It adds a +stem_utf8+ method to String and Array.
9
+ #
10
+ # str_utf8 = 'this is a UTF-8 encoded string'
11
+ #
12
+ # stemmed_str_utf8 = str_utf8.stem_utf8
13
+ #
14
+ # array_utf8 = %w{this is an array with utf8 characters}
15
+ #
16
+ # stemmed_array_utf8 = array_utf8.stem_utf8
17
+ #
18
+ # By default, stemming occurs in english. If you want to stem in another language, just give it as a parameter:
19
+ #
20
+ # str_utf8 = 'Chaîne de caractères française en UTF-8'
21
+ #
22
+ # stemmed_str_utf8 = str_utf8.stem_utf8('fr')
23
+ #
24
+ # Or you can change the default configuration:
25
+ #
26
+ # Stemmable::stemmer_default_language = 'fr'
27
+ #
28
+ # stemmed_str_utf8 = str_utf8.stem_utf8
29
+ #
30
+ module Stemmable_utf8
31
+ include Stemmable
32
+
33
+ def stem_utf8(language = nil)
34
+ require 'iconv'
35
+ if (language.nil?)
36
+ language = @@stemmer_default_language
37
+ stemmer = @@stemmer
38
+ else
39
+ stemmer = Stemmer.new(language)
40
+ end
41
+ language_encoding = @@UTF8_MAP[language] || 'iso-8859-1'
42
+ if self.is_a?(String)
43
+ Iconv.new('utf-8', language_encoding).iconv(stemmer.stem(Iconv.new(language_encoding, 'utf-8').iconv(self)))
44
+ elsif self.is_a?(Array)
45
+ temp = []
46
+ output = []
47
+ Iconv.open(language_encoding, 'utf-8') do |cd|
48
+ self.each { |s| temp << cd.iconv(s) + cd.iconv(nil) }
49
+ end
50
+ Iconv.open('utf-8', language_encoding) do |cd|
51
+ stemmer.stem(temp).each { |s| output << cd.iconv(s) + cd.iconv(nil) }
52
+ end
53
+ output
54
+ else
55
+ raise 'no valid type'
56
+ end
57
+ end
58
+ end
59
+
60
+ class String
61
+ include Stemmable_utf8
62
+ end
63
+
64
+ class Array
65
+ include Stemmable_utf8
66
+ end
@@ -3,7 +3,7 @@ require 'rake'
3
3
 
4
4
  spec = Gem::Specification.new do |s|
5
5
  s.name = 'stemmer4r'
6
- s.version = '0.5'
6
+ s.version = '0.6'
7
7
  s.author = "Fabien POTENCIER"
8
8
  s.email = "fabien.potencier@gmail.com"
9
9
  s.homepage = "http://stemmer4r.rubyforge.org"
@@ -14,9 +14,9 @@ spec = Gem::Specification.new do |s|
14
14
  s.description = <<-EOF
15
15
  Stemmer4r is a Ruby extension that wraps the snowball stemmer library (libstemmer).
16
16
  EOF
17
- s.files = FileList['ext/**/*', 'test/**/*', 'stemmer4r.gemspec', 'README', 'LICENSE'].to_a
17
+ s.files = FileList['lib/**/*', 'ext/**/*', 'test/**/*', 'stemmer4r.gemspec', 'README', 'LICENSE'].to_a
18
18
  s.extensions << "ext/stemmer4r/extconf.rb"
19
- s.require_path = '.'
19
+ s.require_paths << 'ext'
20
20
  s.autorequire = 'stemmer4r'
21
21
  s.has_rdoc = true
22
22
  s.rdoc_options = [
@@ -7,18 +7,65 @@ rescue LoadError
7
7
  require 'stemmer4r'
8
8
  end
9
9
 
10
+ require 'stemmer'
10
11
  require 'benchmark'
11
12
  require 'test/unit'
12
13
 
13
14
  class StemmerTest < Test::Unit::TestCase
15
+ begin
16
+ require 'iconv'
17
+ require 'stemmer_utf8'
18
+ @@has_iconv = true
19
+ rescue
20
+ @@has_iconv = false
21
+ end
22
+
23
+ def test_utf8
24
+ return unless @@has_iconv
25
+
26
+ utf8_str_input = Iconv.new('utf-8', 'iso-8859-1').iconv('aimera t�l�vision')
27
+ utf8_str_output = Iconv.new('utf-8', 'iso-8859-1').iconv('aim t�l�vis')
28
+ assert_equal(utf8_str_output, utf8_str_input.stem_utf8('fr'))
29
+
30
+ utf8_str_input = Iconv.new('utf-8', 'iso-8859-1').iconv('love independant')
31
+ utf8_str_output = Iconv.new('utf-8', 'iso-8859-1').iconv('love independ')
32
+ assert_equal(utf8_str_output, utf8_str_input.stem_utf8)
33
+
34
+ utf8_ary_input = []
35
+ %w{aimera t�l�vision grandiose}.each { |k|
36
+ utf8_ary_input << Iconv.new('utf-8', 'iso-8859-1').iconv(k)
37
+ }
38
+ utf8_ary_output = []
39
+ %w{aim t�l�vis grandios}.each { |k|
40
+ utf8_ary_output << Iconv.new('utf-8', 'iso-8859-1').iconv(k)
41
+ }
42
+ assert_equal(utf8_ary_output, utf8_ary_input.stem_utf8('fr'))
43
+ end
44
+
45
+ def test_stemmable
46
+ assert_equal('il aim utilis le ordin dan le avion', 'il aime utiliser les ordinateurs dans les avions'.stem('fr'))
47
+ assert_equal('aim ordin', 'aime ordinateurs'.stem('fr'))
48
+ assert_equal('ordin jeux avion aim', %w{ordinateur jeux avion aimer}.stem('fr').join(' '))
49
+ assert_equal('ordin', %w{ordinateur}.stem('fr').join(' '))
50
+ assert_equal('aimera'.stem('fr'), 'AIMERA'.stem('fr'))
51
+ assert_equal('believable'.stem('en'), 'believable'.stem)
52
+ assert_equal('believ', 'believable'.stem)
53
+ assert_raise(ArgumentError) { 'test'.stem('notavalidlanguage') }
54
+ assert_equal('', ''.stem)
55
+ assert_equal([], [].stem)
56
+ assert_equal(['', ' '], ['', ' '].stem)
57
+ end
58
+
14
59
  def test_sentence
15
60
  s = Stemmer.new('fr')
16
- assert_equal('il aim utilis le ordin dan le avion', s.stem('il aime utiliser les ordinateurs dans les avions'))
61
+ assert_equal('il aim utilis le ordin dan le avion', s.stem('il aime utiliser les ordinateurs dans les avions'))
62
+ assert_equal('aim ordin', s.stem('aime ordinateurs'))
17
63
  end
18
64
 
19
65
  def test_array
20
66
  s = Stemmer.new('fr')
21
67
  assert_equal('ordin jeux avion aim', s.stem(%w{ordinateur jeux avion aimer}).join(' '))
68
+ assert_equal('ordin', s.stem(%w{ordinateur}).join(' '))
22
69
  end
23
70
 
24
71
  def test_lowercase
@@ -52,17 +99,40 @@ class StemmerTest < Test::Unit::TestCase
52
99
  }
53
100
  }
54
101
  x.report {
55
- %w{fr da de nl en es fi fr it no pt ru sv}.each { |language|
102
+ %w{da de nl en es fi fr it no pt ru sv}.each { |language|
56
103
  puts "Testing '#{language}' algorithm (a single array)..."
57
104
  assert_equal(output[language], s[language].stem(input[language]))
58
105
  }
59
106
  }
60
107
  x.report {
61
- %w{fr da de nl en es fi fr it no pt ru sv}.each { |language|
108
+ %w{da de nl en es fi fr it no pt ru sv}.each { |language|
62
109
  puts "Testing '#{language}' algorithm (a single sentence)..."
63
110
  assert_equal(output[language].join(' '), s[language].stem(input[language].join(' ')))
64
111
  }
65
112
  }
113
+ x.report {
114
+ %w{da de nl en es fi fr it no pt ru sv}.each { |language|
115
+ puts "Testing '#{language}' algorithm (a single array - Array module)..."
116
+ assert_equal(output[language], input[language].stem(language))
117
+ }
118
+ }
119
+ x.report {
120
+ %w{da de nl en es fi fr it no pt ru sv}.each { |language|
121
+ puts "Testing '#{language}' algorithm (a single sentence - String module)..."
122
+ assert_equal(output[language].join(' '), input[language].join(' ').stem(language))
123
+ }
124
+ }
125
+
126
+ return unless @@has_iconv
127
+
128
+ x.report {
129
+ %w{da de nl en es fi fr it no pt ru sv}.each { |language|
130
+ utf8_input = Iconv.new('utf-8', 'iso-8859-1').iconv(input[language].join(' '))
131
+ utf8_output = Iconv.new('utf-8', 'iso-8859-1').iconv(output[language].join(' '))
132
+ puts "Testing '#{language}' algorithm (a single sentence - String module - utf8)..."
133
+ assert_equal(utf8_output, utf8_input.stem_utf8(language))
134
+ }
135
+ }
66
136
  }
67
137
  end
68
138
  end
metadata CHANGED
@@ -3,11 +3,12 @@ rubygems_version: 0.8.10
3
3
  specification_version: 1
4
4
  name: stemmer4r
5
5
  version: !ruby/object:Gem::Version
6
- version: "0.5"
7
- date: 2005-05-12
6
+ version: "0.6"
7
+ date: 2005-05-13
8
8
  summary: Stemmer4r is a Ruby extension that wraps the snowball stemmer library (libstemmer).
9
9
  require_paths:
10
- - "."
10
+ - lib
11
+ - ext
11
12
  email: fabien.potencier@gmail.com
12
13
  homepage: http://stemmer4r.rubyforge.org
13
14
  rubyforge_project: stemmer4r
@@ -27,6 +28,8 @@ platform: ruby
27
28
  authors:
28
29
  - Fabien POTENCIER
29
30
  files:
31
+ - lib/stemmer.rb
32
+ - lib/stemmer_utf8.rb
30
33
  - ext/stemmer4r
31
34
  - ext/stemmer4r/stemmer4r.c
32
35
  - ext/stemmer4r/libstemmer_c