stemmer4r 0.4 → 0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/ext/stemmer4r/stemmer4r.c +61 -14
 - data/stemmer4r.gemspec +1 -1
 - data/test/test.rb +43 -6
 - metadata +2 -2
 
    
        data/ext/stemmer4r/stemmer4r.c
    CHANGED
    
    | 
         @@ -74,12 +74,22 @@ stemmer_free(struct sb_stemmer* stemmer) 
     | 
|
| 
       74 
74 
     | 
    
         
             
             *
         
     | 
| 
       75 
75 
     | 
    
         
             
             * === Examples
         
     | 
| 
       76 
76 
     | 
    
         
             
             *
         
     | 
| 
       77 
     | 
    
         
            -
             * 
     | 
| 
       78 
     | 
    
         
            -
             * 
     | 
| 
      
 77 
     | 
    
         
            +
             *   stemmer = Stemmer.new('en')
         
     | 
| 
      
 78 
     | 
    
         
            +
             *   puts stemmer.stem('testing')
         
     | 
| 
       79 
79 
     | 
    
         
             
             *
         
     | 
| 
       80 
     | 
    
         
            -
             * 
     | 
| 
       81 
     | 
    
         
            -
             * puts stemmer.stem('été')
         
     | 
| 
      
 80 
     | 
    
         
            +
             *   stemmer = Stemmer.new('fr')
         
     | 
| 
       82 
81 
     | 
    
         
             
             *
         
     | 
| 
      
 82 
     | 
    
         
            +
             * You can stem:
         
     | 
| 
      
 83 
     | 
    
         
            +
             *
         
     | 
| 
      
 84 
     | 
    
         
            +
             * * a single word:
         
     | 
| 
      
 85 
     | 
    
         
            +
             *   puts stemmer.stem('été')
         
     | 
| 
      
 86 
     | 
    
         
            +
             *
         
     | 
| 
      
 87 
     | 
    
         
            +
             * * an array:
         
     | 
| 
      
 88 
     | 
    
         
            +
             *   puts stemmer.stem(%w{télévision chapeau ordinateur})
         
     | 
| 
      
 89 
     | 
    
         
            +
             *
         
     | 
| 
      
 90 
     | 
    
         
            +
             * * a sentence:
         
     | 
| 
      
 91 
     | 
    
         
            +
             *   puts stemmer.stem("Une télévision sur un chapeau d'ordinateur")
         
     | 
| 
      
 92 
     | 
    
         
            +
             * 
         
     | 
| 
       83 
93 
     | 
    
         
             
             */
         
     | 
| 
       84 
94 
     | 
    
         
             
            static VALUE
         
     | 
| 
       85 
95 
     | 
    
         
             
            stemmer_allocate(VALUE klass, VALUE algorithm)
         
     | 
| 
         @@ -107,30 +117,67 @@ stemmer_allocate(VALUE klass, VALUE algorithm) 
     | 
|
| 
       107 
117 
     | 
    
         | 
| 
       108 
118 
     | 
    
         
             
            /*
         
     | 
| 
       109 
119 
     | 
    
         
             
             * Document-method: stem
         
     | 
| 
       110 
     | 
    
         
            -
             * call-seq: stem( 
     | 
| 
      
 120 
     | 
    
         
            +
             * call-seq: stem(obj)
         
     | 
| 
       111 
121 
     | 
    
         
             
             *
         
     | 
| 
       112 
     | 
    
         
            -
             * Stems  
     | 
| 
      
 122 
     | 
    
         
            +
             * Stems a word, a sentence or an array of words and returns the result (always in lowercase).
         
     | 
| 
      
 123 
     | 
    
         
            +
             * +obj+ is always converted to lowercase before stemming (mandatory for snowball algorithms to work).
         
     | 
| 
       113 
124 
     | 
    
         
             
             *
         
     | 
| 
       114 
125 
     | 
    
         
             
             * === Parameters
         
     | 
| 
       115 
126 
     | 
    
         
             
             *
         
     | 
| 
       116 
     | 
    
         
            -
             * + 
     | 
| 
      
 127 
     | 
    
         
            +
             * +obj+:: word, sentence (string) or array of words to stem. All strings must be encoding the 'right' way (iso-8859-1 for french for example).
         
     | 
| 
       117 
128 
     | 
    
         
             
             */
         
     | 
| 
       118 
129 
     | 
    
         
             
            static VALUE
         
     | 
| 
       119 
     | 
    
         
            -
            stemmer_stem(VALUE self, VALUE  
     | 
| 
      
 130 
     | 
    
         
            +
            stemmer_stem(VALUE self, VALUE obj)
         
     | 
| 
       120 
131 
     | 
    
         
             
            {
         
     | 
| 
       121 
132 
     | 
    
         
             
              VALUE ret;
         
     | 
| 
      
 133 
     | 
    
         
            +
              VALUE word_lowercase;
         
     | 
| 
       122 
134 
     | 
    
         
             
              sb_symbol *cword;
         
     | 
| 
      
 135 
     | 
    
         
            +
              sb_symbol *t1;
         
     | 
| 
       123 
136 
     | 
    
         
             
              const sb_symbol *rcword;
         
     | 
| 
       124 
137 
     | 
    
         
             
              struct sb_stemmer *stemmer;
         
     | 
| 
      
 138 
     | 
    
         
            +
              int i;
         
     | 
| 
       125 
139 
     | 
    
         | 
| 
       126 
     | 
    
         
            -
              Check_Type(word, T_STRING);
         
     | 
| 
       127 
     | 
    
         
            -
             
     | 
| 
       128 
     | 
    
         
            -
              cword = strdup(STR2CSTR(word));
         
     | 
| 
       129 
140 
     | 
    
         
             
              Data_Get_Struct(self, struct sb_stemmer, stemmer);
         
     | 
| 
       130 
     | 
    
         
            -
              rcword = sb_stemmer_stem(stemmer, cword, RSTRING(word)->len);
         
     | 
| 
       131 
141 
     | 
    
         | 
| 
       132 
     | 
    
         
            -
               
     | 
| 
       133 
     | 
    
         
            -
               
     | 
| 
      
 142 
     | 
    
         
            +
              switch (TYPE(obj))
         
     | 
| 
      
 143 
     | 
    
         
            +
              {
         
     | 
| 
      
 144 
     | 
    
         
            +
                case T_STRING:
         
     | 
| 
      
 145 
     | 
    
         
            +
                  word_lowercase = rb_funcall2(obj, rb_intern("downcase"), 0, 0);
         
     | 
| 
      
 146 
     | 
    
         
            +
                  cword = strdup(STR2CSTR(word_lowercase));
         
     | 
| 
      
 147 
     | 
    
         
            +
                  // Sentence
         
     | 
| 
      
 148 
     | 
    
         
            +
                  if (strchr(cword, ' '))
         
     | 
| 
      
 149 
     | 
    
         
            +
                  {
         
     | 
| 
      
 150 
     | 
    
         
            +
                    ret = rb_str_new2("");
         
     | 
| 
      
 151 
     | 
    
         
            +
                    for (t1 = strtok(cword, " "); t1 != NULL; t1 = strtok(NULL, " "))
         
     | 
| 
      
 152 
     | 
    
         
            +
                    {
         
     | 
| 
      
 153 
     | 
    
         
            +
                      rb_str_cat2(ret, sb_stemmer_stem(stemmer, t1, strlen(t1)));
         
     | 
| 
      
 154 
     | 
    
         
            +
                      rb_str_cat2(ret, " ");
         
     | 
| 
      
 155 
     | 
    
         
            +
                    }
         
     | 
| 
      
 156 
     | 
    
         
            +
                    RSTRING(ret)->len--;
         
     | 
| 
      
 157 
     | 
    
         
            +
                    RSTRING(ret)->ptr[RSTRING(ret)->len] = '\0';
         
     | 
| 
      
 158 
     | 
    
         
            +
                  }
         
     | 
| 
      
 159 
     | 
    
         
            +
                  else
         
     | 
| 
      
 160 
     | 
    
         
            +
                  {
         
     | 
| 
      
 161 
     | 
    
         
            +
                    // A word
         
     | 
| 
      
 162 
     | 
    
         
            +
                    rcword = sb_stemmer_stem(stemmer, cword, RSTRING(obj)->len);
         
     | 
| 
      
 163 
     | 
    
         
            +
                    ret = rb_str_new2(rcword);
         
     | 
| 
      
 164 
     | 
    
         
            +
                  }
         
     | 
| 
      
 165 
     | 
    
         
            +
                  free(cword);
         
     | 
| 
      
 166 
     | 
    
         
            +
                  break;
         
     | 
| 
      
 167 
     | 
    
         
            +
                case T_ARRAY:
         
     | 
| 
      
 168 
     | 
    
         
            +
                  ret = rb_ary_new2(RARRAY(obj)->len);
         
     | 
| 
      
 169 
     | 
    
         
            +
                  for (i = 0; i < RARRAY(obj)->len; i++)
         
     | 
| 
      
 170 
     | 
    
         
            +
                  {
         
     | 
| 
      
 171 
     | 
    
         
            +
                    word_lowercase = rb_funcall2(RARRAY(obj)->ptr[i], rb_intern("downcase"), 0, 0);
         
     | 
| 
      
 172 
     | 
    
         
            +
                    cword = strdup(STR2CSTR(word_lowercase));
         
     | 
| 
      
 173 
     | 
    
         
            +
                    rb_ary_push(ret, rb_str_new2(sb_stemmer_stem(stemmer, cword, RSTRING(RARRAY(obj)->ptr[i])->len)));
         
     | 
| 
      
 174 
     | 
    
         
            +
                  }
         
     | 
| 
      
 175 
     | 
    
         
            +
                  free(cword);
         
     | 
| 
      
 176 
     | 
    
         
            +
                  break;
         
     | 
| 
      
 177 
     | 
    
         
            +
                default:
         
     | 
| 
      
 178 
     | 
    
         
            +
                  rb_raise(rb_eTypeError, "not valid value");
         
     | 
| 
      
 179 
     | 
    
         
            +
                  break;
         
     | 
| 
      
 180 
     | 
    
         
            +
              }
         
     | 
| 
       134 
181 
     | 
    
         | 
| 
       135 
182 
     | 
    
         
             
              return ret;
         
     | 
| 
       136 
183 
     | 
    
         
             
            }
         
     | 
    
        data/stemmer4r.gemspec
    CHANGED
    
    
    
        data/test/test.rb
    CHANGED
    
    | 
         @@ -7,9 +7,25 @@ rescue LoadError 
     | 
|
| 
       7 
7 
     | 
    
         
             
              require 'stemmer4r'
         
     | 
| 
       8 
8 
     | 
    
         
             
            end
         
     | 
| 
       9 
9 
     | 
    
         | 
| 
      
 10 
     | 
    
         
            +
            require 'benchmark'
         
     | 
| 
       10 
11 
     | 
    
         
             
            require 'test/unit'
         
     | 
| 
       11 
12 
     | 
    
         | 
| 
       12 
13 
     | 
    
         
             
            class StemmerTest < Test::Unit::TestCase
         
     | 
| 
      
 14 
     | 
    
         
            +
              def test_sentence
         
     | 
| 
      
 15 
     | 
    
         
            +
                s = Stemmer.new('fr')
         
     | 
| 
      
 16 
     | 
    
         
            +
                assert_equal('il aim utilis le ordin dan le avion', s.stem('il aime utiliser les ordinateurs dans les avions'))
         
     | 
| 
      
 17 
     | 
    
         
            +
              end
         
     | 
| 
      
 18 
     | 
    
         
            +
             
     | 
| 
      
 19 
     | 
    
         
            +
              def test_array
         
     | 
| 
      
 20 
     | 
    
         
            +
                s = Stemmer.new('fr')
         
     | 
| 
      
 21 
     | 
    
         
            +
                assert_equal('ordin jeux avion aim', s.stem(%w{ordinateur jeux avion aimer}).join(' '))
         
     | 
| 
      
 22 
     | 
    
         
            +
              end
         
     | 
| 
      
 23 
     | 
    
         
            +
             
     | 
| 
      
 24 
     | 
    
         
            +
              def test_lowercase
         
     | 
| 
      
 25 
     | 
    
         
            +
                s = Stemmer.new('fr')
         
     | 
| 
      
 26 
     | 
    
         
            +
                assert_equal(s.stem('aimera'), s.stem('AIMERA'))
         
     | 
| 
      
 27 
     | 
    
         
            +
              end
         
     | 
| 
      
 28 
     | 
    
         
            +
             
     | 
| 
       13 
29 
     | 
    
         
             
              def test_valid_language
         
     | 
| 
       14 
30 
     | 
    
         
             
                assert_not_nil(Stemmer.new('fr'))
         
     | 
| 
       15 
31 
     | 
    
         
             
                assert_nothing_raised() { Stemmer.new('fr') }
         
     | 
| 
         @@ -18,13 +34,34 @@ class StemmerTest < Test::Unit::TestCase 
     | 
|
| 
       18 
34 
     | 
    
         | 
| 
       19 
35 
     | 
    
         
             
              def test_stemmer
         
     | 
| 
       20 
36 
     | 
    
         
             
                dir = File.dirname(__FILE__)
         
     | 
| 
      
 37 
     | 
    
         
            +
                input = {}
         
     | 
| 
      
 38 
     | 
    
         
            +
                output = {}
         
     | 
| 
      
 39 
     | 
    
         
            +
                s = {}
         
     | 
| 
       21 
40 
     | 
    
         
             
                %w{da de nl en es fi fr it no pt ru sv}.each { |language|
         
     | 
| 
       22 
     | 
    
         
            -
                   
     | 
| 
       23 
     | 
    
         
            -
                   
     | 
| 
       24 
     | 
    
         
            -
                   
     | 
| 
       25 
     | 
    
         
            -
             
     | 
| 
       26 
     | 
    
         
            -
             
     | 
| 
       27 
     | 
    
         
            -
             
     | 
| 
      
 41 
     | 
    
         
            +
                  input[language] = IO.read("#{dir}/tests/#{language}/voc.txt").split(/\s+/)
         
     | 
| 
      
 42 
     | 
    
         
            +
                  output[language] = IO.read("#{dir}/tests/#{language}/output.txt").split(/\s+/)
         
     | 
| 
      
 43 
     | 
    
         
            +
                  s[language] = Stemmer.new(language)
         
     | 
| 
      
 44 
     | 
    
         
            +
                }
         
     | 
| 
      
 45 
     | 
    
         
            +
                Benchmark.bm { |x|
         
     | 
| 
      
 46 
     | 
    
         
            +
                  x.report {
         
     | 
| 
      
 47 
     | 
    
         
            +
                    %w{da de nl en es fi fr it no pt ru sv}.each { |language|
         
     | 
| 
      
 48 
     | 
    
         
            +
                      puts "Testing '#{language}' algorithm (one word at a time)..."
         
     | 
| 
      
 49 
     | 
    
         
            +
                      input[language].length.times { |i|
         
     | 
| 
      
 50 
     | 
    
         
            +
                        assert_equal(output[language][i], s[language].stem(input[language][i]))
         
     | 
| 
      
 51 
     | 
    
         
            +
                      }
         
     | 
| 
      
 52 
     | 
    
         
            +
                    }
         
     | 
| 
      
 53 
     | 
    
         
            +
                  }
         
     | 
| 
      
 54 
     | 
    
         
            +
                  x.report {
         
     | 
| 
      
 55 
     | 
    
         
            +
                    %w{fr da de nl en es fi fr it no pt ru sv}.each { |language|
         
     | 
| 
      
 56 
     | 
    
         
            +
                      puts "Testing '#{language}' algorithm (a single array)..."
         
     | 
| 
      
 57 
     | 
    
         
            +
                      assert_equal(output[language], s[language].stem(input[language]))
         
     | 
| 
      
 58 
     | 
    
         
            +
                    }
         
     | 
| 
      
 59 
     | 
    
         
            +
                  }
         
     | 
| 
      
 60 
     | 
    
         
            +
                  x.report {
         
     | 
| 
      
 61 
     | 
    
         
            +
                    %w{fr da de nl en es fi fr it no pt ru sv}.each { |language|
         
     | 
| 
      
 62 
     | 
    
         
            +
                      puts "Testing '#{language}' algorithm (a single sentence)..."
         
     | 
| 
      
 63 
     | 
    
         
            +
                      assert_equal(output[language].join(' '), s[language].stem(input[language].join(' ')))
         
     | 
| 
      
 64 
     | 
    
         
            +
                    }
         
     | 
| 
       28 
65 
     | 
    
         
             
                  }
         
     | 
| 
       29 
66 
     | 
    
         
             
                }
         
     | 
| 
       30 
67 
     | 
    
         
             
              end
         
     | 
    
        metadata
    CHANGED
    
    | 
         @@ -3,8 +3,8 @@ rubygems_version: 0.8.10 
     | 
|
| 
       3 
3 
     | 
    
         
             
            specification_version: 1
         
     | 
| 
       4 
4 
     | 
    
         
             
            name: stemmer4r
         
     | 
| 
       5 
5 
     | 
    
         
             
            version: !ruby/object:Gem::Version 
         
     | 
| 
       6 
     | 
    
         
            -
              version: "0. 
     | 
| 
       7 
     | 
    
         
            -
            date: 2005-05- 
     | 
| 
      
 6 
     | 
    
         
            +
              version: "0.5"
         
     | 
| 
      
 7 
     | 
    
         
            +
            date: 2005-05-12
         
     | 
| 
       8 
8 
     | 
    
         
             
            summary: Stemmer4r is a Ruby extension that wraps the snowball stemmer library (libstemmer).
         
     | 
| 
       9 
9 
     | 
    
         
             
            require_paths: 
         
     | 
| 
       10 
10 
     | 
    
         
             
              - "."
         
     |