ruby-stemmer 0.8.5 → 0.9.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +10 -8
- data/VERSION +1 -1
- data/ext/lingua/stemmer.c +25 -2
- data/lib/lingua/stemmer.rb +16 -2
- data/test/lingua/test_stemmer.rb +46 -1
- metadata +4 -12
data/README.rdoc
CHANGED
@@ -1,9 +1,10 @@
|
|
1
|
-
= Ruby-Stemmer
|
1
|
+
= Ruby-Stemmer {Project Status}[http://stillmaintained.com/aurelian/ruby-stemmer.png]
|
2
2
|
|
3
3
|
Ruby-Stemmer exposes SnowBall API to Ruby.
|
4
4
|
|
5
5
|
This package includes libstemmer_c library released under BSD licence
|
6
|
-
and available for free
|
6
|
+
and available for free {here}[http://snowball.tartarus.org/dist/libstemmer_c.tgz].
|
7
|
+
|
7
8
|
Support for the Latin language is also included and it has been generated with the Snowball compiler using
|
8
9
|
{schinke contribution}[http://snowball.tartarus.org/otherapps/schinke/intro.html]
|
9
10
|
|
@@ -88,7 +89,7 @@ For further reference on stem vs. root, please check wikipedia articles on the t
|
|
88
89
|
* {fast-stemmer}[http://github.com/romanbsd/fast-stemmer] (ext)
|
89
90
|
* {uea-stemmer}[http://github.com/ealdent/uea-stemmer] (ext)
|
90
91
|
* {stemmer}[http://rubyforge.org/projects/stemmer] (pure ruby)
|
91
|
-
*
|
92
|
+
* add yours
|
92
93
|
|
93
94
|
== Copyright
|
94
95
|
|
@@ -96,11 +97,12 @@ Copyright (c) 2008-2011 {Aurelian Oancea}[http://locknet.ro]. See MIT-LICENSE fo
|
|
96
97
|
|
97
98
|
== Contributors
|
98
99
|
|
99
|
-
* Aurelian Oancea
|
100
|
-
* Yury Korolev - various bug fixes
|
101
|
-
* Aaron Patterson - rake compiler (windows support), code cleanup
|
100
|
+
* {Aurelian Oancea}[https://github.com/aurelian]
|
101
|
+
* {Yury Korolev}[https://github.com/yury] - various bug fixes
|
102
|
+
* {Aaron Patterson}[https://github.com/tenderlove] - rake compiler (windows support), code cleanup
|
103
|
+
* {Damián Silvani}[https://github.com/munshkr] - Ruby 1.9 encoding
|
102
104
|
|
103
105
|
== Real life usage
|
104
|
-
|
105
106
|
* http://planet33.ru is using Ruby-Stemmer together with {Classifier}[http://github.com/yury/classifier] to automatically rate places based on users comments.
|
106
|
-
|
107
|
+
|
108
|
+
# encoding: utf-8
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.8.5
|
1
|
+
0.9.0
|
data/ext/lingua/stemmer.c
CHANGED
@@ -1,6 +1,27 @@
|
|
1
1
|
#include "ruby.h"
|
2
2
|
#include <libstemmer.h>
|
3
3
|
|
4
|
+
|
5
|
+
#ifdef HAVE_RUBY_ENCODING_H
|
6
|
+
|
7
|
+
#include <ruby/encoding.h>
|
8
|
+
|
9
|
+
#define ENCODED_STR_NEW2(str, encoding) \
|
10
|
+
({ \
|
11
|
+
VALUE _string = rb_str_new2((const char *)str); \
|
12
|
+
int _enc = rb_enc_get_index(encoding); \
|
13
|
+
rb_enc_associate_index(_string, _enc); \
|
14
|
+
_string; \
|
15
|
+
})
|
16
|
+
|
17
|
+
#else
|
18
|
+
|
19
|
+
#define ENCODED_STR_NEW2(str, encoding) \
|
20
|
+
rb_str_new2((const char *)str)
|
21
|
+
|
22
|
+
#endif
|
23
|
+
|
24
|
+
|
4
25
|
VALUE rb_mLingua;
|
5
26
|
VALUE rb_cStemmer;
|
6
27
|
VALUE rb_eStemmerError;
|
@@ -63,7 +84,9 @@ rb_stemmer_stem(VALUE self, VALUE word) {
|
|
63
84
|
(sb_symbol *)RSTRING_PTR(s_word),
|
64
85
|
RSTRING_LEN(s_word)
|
65
86
|
);
|
66
|
-
|
87
|
+
|
88
|
+
VALUE rb_enc = rb_iv_get(self, "@encoding");
|
89
|
+
return ENCODED_STR_NEW2((char *)stemmed, rb_enc);
|
67
90
|
}
|
68
91
|
|
69
92
|
static void
|
@@ -85,7 +108,7 @@ void Init_stemmer_native() {
|
|
85
108
|
rb_mLingua = rb_define_module("Lingua");
|
86
109
|
rb_cStemmer = rb_define_class_under(rb_mLingua, "Stemmer", rb_cObject);
|
87
110
|
rb_define_alloc_func(rb_cStemmer, sb_stemmer_alloc);
|
88
|
-
rb_eStemmerError = rb_define_class_under(rb_mLingua, "StemmerError", rb_eException);
|
111
|
+
rb_eStemmerError = rb_define_class_under(rb_mLingua, "StemmerError", rb_eException);
|
89
112
|
rb_define_private_method(rb_cStemmer, "native_init", rb_stemmer_init, 2);
|
90
113
|
rb_define_method(rb_cStemmer, "stem", rb_stemmer_stem, 1);
|
91
114
|
}
|
data/lib/lingua/stemmer.rb
CHANGED
@@ -37,10 +37,24 @@ module Lingua
|
|
37
37
|
# require 'lingua/stemmer'
|
38
38
|
# s = Lingua::Stemmer.new :language => 'fr'
|
39
39
|
#
|
40
|
-
def initialize
|
40
|
+
def initialize(options={})
|
41
41
|
@language = (options[:language] || 'en').to_s
|
42
42
|
@encoding = (options[:encoding] || 'UTF_8').to_s
|
43
|
-
|
43
|
+
|
44
|
+
if RUBY_VERSION >= "1.9"
|
45
|
+
if not @encoding.is_a?(Encoding)
|
46
|
+
@encoding = Encoding.find(@encoding.gsub("_", "-"))
|
47
|
+
end
|
48
|
+
else
|
49
|
+
@encoding = @encoding.upcase.gsub("-", "_")
|
50
|
+
end
|
51
|
+
|
52
|
+
native_init(@language, native_encoding(@encoding))
|
53
|
+
end
|
54
|
+
|
55
|
+
private
|
56
|
+
def native_encoding(enc)
|
57
|
+
RUBY_VERSION >= "1.9" ? enc.name.gsub('-', '_') : enc
|
44
58
|
end
|
45
59
|
end
|
46
60
|
end
|
data/test/lingua/test_stemmer.rb
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
# encoding: utf-8
|
1
2
|
require 'helper'
|
2
3
|
|
3
4
|
class TestStemmer < Test::Unit::TestCase
|
@@ -35,7 +36,12 @@ class TestStemmer < Test::Unit::TestCase
|
|
35
36
|
assert_equal word, "install"
|
36
37
|
end
|
37
38
|
assert_kind_of ::Lingua::Stemmer, stemmer
|
38
|
-
|
39
|
+
|
40
|
+
if RUBY_VERSION >= '1.9'
|
41
|
+
assert_equal stemmer.encoding, Encoding::UTF_8
|
42
|
+
else
|
43
|
+
assert_equal stemmer.encoding, "UTF_8"
|
44
|
+
end
|
39
45
|
end
|
40
46
|
|
41
47
|
def test_array_stemmer
|
@@ -51,4 +57,43 @@ class TestStemmer < Test::Unit::TestCase
|
|
51
57
|
}.new.stem('cow')
|
52
58
|
end
|
53
59
|
end
|
60
|
+
|
61
|
+
def test_default_encoding_option
|
62
|
+
if RUBY_VERSION >= '1.9'
|
63
|
+
assert_equal ::Lingua::Stemmer.new.encoding, Encoding::UTF_8
|
64
|
+
else
|
65
|
+
assert_equal ::Lingua::Stemmer.new.encoding, "UTF_8"
|
66
|
+
end
|
67
|
+
end
|
68
|
+
|
69
|
+
def test_different_encoding_options
|
70
|
+
if RUBY_VERSION >= '1.9'
|
71
|
+
assert_equal ::Lingua::Stemmer.new(:encoding => "ISO_8859_1").encoding, Encoding::ISO_8859_1
|
72
|
+
assert_equal ::Lingua::Stemmer.new(:encoding => "UTF-8").encoding, Encoding::UTF_8
|
73
|
+
assert_equal ::Lingua::Stemmer.new(:encoding => "utf-8").encoding, Encoding::UTF_8
|
74
|
+
assert_equal ::Lingua::Stemmer.new(:encoding => :ISO_8859_1).encoding, Encoding::ISO_8859_1
|
75
|
+
assert_equal ::Lingua::Stemmer.new(:encoding => Encoding::UTF_8).encoding, Encoding::UTF_8
|
76
|
+
else
|
77
|
+
assert_equal ::Lingua::Stemmer.new(:encoding => "ISO_8859_1").encoding, "ISO_8859_1"
|
78
|
+
assert_equal ::Lingua::Stemmer.new(:encoding => "UTF-8").encoding, "UTF_8"
|
79
|
+
assert_equal ::Lingua::Stemmer.new(:encoding => "utf-8").encoding, "UTF_8"
|
80
|
+
assert_equal ::Lingua::Stemmer.new(:encoding => :ISO_8859_1).encoding, "ISO_8859_1"
|
81
|
+
end
|
82
|
+
end
|
83
|
+
|
84
|
+
if RUBY_VERSION >= '1.9'
|
85
|
+
def test_string_encoding
|
86
|
+
word = "așezare"
|
87
|
+
|
88
|
+
stem = ::Lingua.stemmer(word, :language => "ro", :encoding => "UTF_8")
|
89
|
+
assert_equal word.encoding, stem.encoding
|
90
|
+
|
91
|
+
s = ::Lingua::Stemmer.new(:language => "ro", :encoding => "UTF_8")
|
92
|
+
assert_equal s.stem(word).encoding, word.encoding
|
93
|
+
|
94
|
+
stem = ::Lingua.stemmer("installation", :language => "fr", :encoding => "ISO-8859-1")
|
95
|
+
assert_equal stem.encoding, Encoding::ISO_8859_1
|
96
|
+
end
|
97
|
+
end
|
98
|
+
|
54
99
|
end
|
metadata
CHANGED
@@ -1,12 +1,8 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ruby-stemmer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
prerelease:
|
5
|
-
|
6
|
-
- 0
|
7
|
-
- 8
|
8
|
-
- 5
|
9
|
-
version: 0.8.5
|
4
|
+
prerelease:
|
5
|
+
version: 0.9.0
|
10
6
|
platform: ruby
|
11
7
|
authors:
|
12
8
|
- Aurelian Oancea
|
@@ -15,7 +11,7 @@ autorequire:
|
|
15
11
|
bindir: bin
|
16
12
|
cert_chain: []
|
17
13
|
|
18
|
-
date: 2011-03-
|
14
|
+
date: 2011-03-12 00:00:00 +01:00
|
19
15
|
default_executable:
|
20
16
|
dependencies: []
|
21
17
|
|
@@ -135,21 +131,17 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
135
131
|
requirements:
|
136
132
|
- - ">="
|
137
133
|
- !ruby/object:Gem::Version
|
138
|
-
segments:
|
139
|
-
- 0
|
140
134
|
version: "0"
|
141
135
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
142
136
|
none: false
|
143
137
|
requirements:
|
144
138
|
- - ">="
|
145
139
|
- !ruby/object:Gem::Version
|
146
|
-
segments:
|
147
|
-
- 0
|
148
140
|
version: "0"
|
149
141
|
requirements: []
|
150
142
|
|
151
143
|
rubyforge_project: ruby-stemmer
|
152
|
-
rubygems_version: 1.
|
144
|
+
rubygems_version: 1.6.2
|
153
145
|
signing_key:
|
154
146
|
specification_version: 3
|
155
147
|
summary: Expose libstemmer_c to Ruby.
|