ruby-stemmer 0.8.5 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +10 -8
- data/VERSION +1 -1
- data/ext/lingua/stemmer.c +25 -2
- data/lib/lingua/stemmer.rb +16 -2
- data/test/lingua/test_stemmer.rb +46 -1
- metadata +4 -12
data/README.rdoc
CHANGED
@@ -1,9 +1,10 @@
|
|
1
|
-
= Ruby-Stemmer
|
1
|
+
= Ruby-Stemmer {Project Status}[http://stillmaintained.com/aurelian/ruby-stemmer.png]
|
2
2
|
|
3
3
|
Ruby-Stemmer exposes SnowBall API to Ruby.
|
4
4
|
|
5
5
|
This package includes libstemmer_c library released under BSD licence
|
6
|
-
and available for free
|
6
|
+
and available for free {here}[http://snowball.tartarus.org/dist/libstemmer_c.tgz].
|
7
|
+
|
7
8
|
Support for latin language is also included and it has been generated with the snowball compiler using
|
8
9
|
{schinke contribution}[http://snowball.tartarus.org/otherapps/schinke/intro.html]
|
9
10
|
|
@@ -88,7 +89,7 @@ For further reference on stem vs. root, please check wikipedia articles on the t
|
|
88
89
|
* {fast-stemmer}[http://github.com/romanbsd/fast-stemmer] (ext)
|
89
90
|
* {uea-stemmer}[http://github.com/ealdent/uea-stemmer] (ext)
|
90
91
|
* {stemmer}[http://rubyforge.org/projects/stemmer] (pure ruby)
|
91
|
-
*
|
92
|
+
* add yours
|
92
93
|
|
93
94
|
== Copyright
|
94
95
|
|
@@ -96,11 +97,12 @@ Copyright (c) 2008-2011 {Aurelian Oancea}[http://locknet.ro]. See MIT-LICENSE fo
|
|
96
97
|
|
97
98
|
== Contributors
|
98
99
|
|
99
|
-
* Aurelian Oancea
|
100
|
-
* Yury Korolev - various bug fixes
|
101
|
-
* Aaron Patterson - rake compiler (windows support), code cleanup
|
100
|
+
* {Aurelian Oancea}[https://github.com/aurelian]
|
101
|
+
* {Yury Korolev}[https://github.com/yury] - various bug fixes
|
102
|
+
* {Aaron Patterson}[https://github.com/tenderlove] - rake compiler (windows support), code cleanup
|
103
|
+
* {Damián Silvani}[https://github.com/munshkr] - Ruby 1.9 encoding
|
102
104
|
|
103
105
|
== Real life usage
|
104
|
-
|
105
106
|
* http://planet33.ru is using Ruby-Stemmer together with {Classifier}[http://github.com/yury/classifier] to automatically rate places based on users' comments.
|
106
|
-
|
107
|
+
|
108
|
+
# encoding: utf-8
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.8.5
|
1
|
+
0.9.0
|
data/ext/lingua/stemmer.c
CHANGED
@@ -1,6 +1,27 @@
|
|
1
1
|
#include "ruby.h"
|
2
2
|
#include <libstemmer.h>
|
3
3
|
|
4
|
+
|
5
|
+
#ifdef HAVE_RUBY_ENCODING_H
|
6
|
+
|
7
|
+
#include <ruby/encoding.h>
|
8
|
+
|
9
|
+
#define ENCODED_STR_NEW2(str, encoding) \
|
10
|
+
({ \
|
11
|
+
VALUE _string = rb_str_new2((const char *)str); \
|
12
|
+
int _enc = rb_enc_get_index(encoding); \
|
13
|
+
rb_enc_associate_index(_string, _enc); \
|
14
|
+
_string; \
|
15
|
+
})
|
16
|
+
|
17
|
+
#else
|
18
|
+
|
19
|
+
#define ENCODED_STR_NEW2(str, encoding) \
|
20
|
+
rb_str_new2((const char *)str)
|
21
|
+
|
22
|
+
#endif
|
23
|
+
|
24
|
+
|
4
25
|
VALUE rb_mLingua;
|
5
26
|
VALUE rb_cStemmer;
|
6
27
|
VALUE rb_eStemmerError;
|
@@ -63,7 +84,9 @@ rb_stemmer_stem(VALUE self, VALUE word) {
|
|
63
84
|
(sb_symbol *)RSTRING_PTR(s_word),
|
64
85
|
RSTRING_LEN(s_word)
|
65
86
|
);
|
66
|
-
|
87
|
+
|
88
|
+
VALUE rb_enc = rb_iv_get(self, "@encoding");
|
89
|
+
return ENCODED_STR_NEW2((char *)stemmed, rb_enc);
|
67
90
|
}
|
68
91
|
|
69
92
|
static void
|
@@ -85,7 +108,7 @@ void Init_stemmer_native() {
|
|
85
108
|
rb_mLingua = rb_define_module("Lingua");
|
86
109
|
rb_cStemmer = rb_define_class_under(rb_mLingua, "Stemmer", rb_cObject);
|
87
110
|
rb_define_alloc_func(rb_cStemmer, sb_stemmer_alloc);
|
88
|
-
rb_eStemmerError = rb_define_class_under(rb_mLingua, "StemmerError", rb_eException);
|
111
|
+
rb_eStemmerError = rb_define_class_under(rb_mLingua, "StemmerError", rb_eException);
|
89
112
|
rb_define_private_method(rb_cStemmer, "native_init", rb_stemmer_init, 2);
|
90
113
|
rb_define_method(rb_cStemmer, "stem", rb_stemmer_stem, 1);
|
91
114
|
}
|
data/lib/lingua/stemmer.rb
CHANGED
@@ -37,10 +37,24 @@ module Lingua
|
|
37
37
|
# require 'lingua/stemmer'
|
38
38
|
# s = Lingua::Stemmer.new :language => 'fr'
|
39
39
|
#
|
40
|
-
def initialize
|
40
|
+
def initialize(options={})
|
41
41
|
@language = (options[:language] || 'en').to_s
|
42
42
|
@encoding = (options[:encoding] || 'UTF_8').to_s
|
43
|
-
|
43
|
+
|
44
|
+
if RUBY_VERSION >= "1.9"
|
45
|
+
if not @encoding.is_a?(Encoding)
|
46
|
+
@encoding = Encoding.find(@encoding.gsub("_", "-"))
|
47
|
+
end
|
48
|
+
else
|
49
|
+
@encoding = @encoding.upcase.gsub("-", "_")
|
50
|
+
end
|
51
|
+
|
52
|
+
native_init(@language, native_encoding(@encoding))
|
53
|
+
end
|
54
|
+
|
55
|
+
private
|
56
|
+
def native_encoding(enc)
|
57
|
+
RUBY_VERSION >= "1.9" ? enc.name.gsub('-', '_') : enc
|
44
58
|
end
|
45
59
|
end
|
46
60
|
end
|
data/test/lingua/test_stemmer.rb
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
# encoding: utf-8
|
1
2
|
require 'helper'
|
2
3
|
|
3
4
|
class TestStemmer < Test::Unit::TestCase
|
@@ -35,7 +36,12 @@ class TestStemmer < Test::Unit::TestCase
|
|
35
36
|
assert_equal word, "install"
|
36
37
|
end
|
37
38
|
assert_kind_of ::Lingua::Stemmer, stemmer
|
38
|
-
|
39
|
+
|
40
|
+
if RUBY_VERSION >= '1.9'
|
41
|
+
assert_equal stemmer.encoding, Encoding::UTF_8
|
42
|
+
else
|
43
|
+
assert_equal stemmer.encoding, "UTF_8"
|
44
|
+
end
|
39
45
|
end
|
40
46
|
|
41
47
|
def test_array_stemmer
|
@@ -51,4 +57,43 @@ class TestStemmer < Test::Unit::TestCase
|
|
51
57
|
}.new.stem('cow')
|
52
58
|
end
|
53
59
|
end
|
60
|
+
|
61
|
+
def test_default_encoding_option
|
62
|
+
if RUBY_VERSION >= '1.9'
|
63
|
+
assert_equal ::Lingua::Stemmer.new.encoding, Encoding::UTF_8
|
64
|
+
else
|
65
|
+
assert_equal ::Lingua::Stemmer.new.encoding, "UTF_8"
|
66
|
+
end
|
67
|
+
end
|
68
|
+
|
69
|
+
def test_different_encoding_options
|
70
|
+
if RUBY_VERSION >= '1.9'
|
71
|
+
assert_equal ::Lingua::Stemmer.new(:encoding => "ISO_8859_1").encoding, Encoding::ISO_8859_1
|
72
|
+
assert_equal ::Lingua::Stemmer.new(:encoding => "UTF-8").encoding, Encoding::UTF_8
|
73
|
+
assert_equal ::Lingua::Stemmer.new(:encoding => "utf-8").encoding, Encoding::UTF_8
|
74
|
+
assert_equal ::Lingua::Stemmer.new(:encoding => :ISO_8859_1).encoding, Encoding::ISO_8859_1
|
75
|
+
assert_equal ::Lingua::Stemmer.new(:encoding => Encoding::UTF_8).encoding, Encoding::UTF_8
|
76
|
+
else
|
77
|
+
assert_equal ::Lingua::Stemmer.new(:encoding => "ISO_8859_1").encoding, "ISO_8859_1"
|
78
|
+
assert_equal ::Lingua::Stemmer.new(:encoding => "UTF-8").encoding, "UTF_8"
|
79
|
+
assert_equal ::Lingua::Stemmer.new(:encoding => "utf-8").encoding, "UTF_8"
|
80
|
+
assert_equal ::Lingua::Stemmer.new(:encoding => :ISO_8859_1).encoding, "ISO_8859_1"
|
81
|
+
end
|
82
|
+
end
|
83
|
+
|
84
|
+
if RUBY_VERSION >= '1.9'
|
85
|
+
def test_string_encoding
|
86
|
+
word = "așezare"
|
87
|
+
|
88
|
+
stem = ::Lingua.stemmer(word, :language => "ro", :encoding => "UTF_8")
|
89
|
+
assert_equal word.encoding, stem.encoding
|
90
|
+
|
91
|
+
s = ::Lingua::Stemmer.new(:language => "ro", :encoding => "UTF_8")
|
92
|
+
assert_equal s.stem(word).encoding, word.encoding
|
93
|
+
|
94
|
+
stem = ::Lingua.stemmer("installation", :language => "fr", :encoding => "ISO-8859-1")
|
95
|
+
assert_equal stem.encoding, Encoding::ISO_8859_1
|
96
|
+
end
|
97
|
+
end
|
98
|
+
|
54
99
|
end
|
metadata
CHANGED
@@ -1,12 +1,8 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ruby-stemmer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
prerelease:
|
5
|
-
|
6
|
-
- 0
|
7
|
-
- 8
|
8
|
-
- 5
|
9
|
-
version: 0.8.5
|
4
|
+
prerelease:
|
5
|
+
version: 0.9.0
|
10
6
|
platform: ruby
|
11
7
|
authors:
|
12
8
|
- Aurelian Oancea
|
@@ -15,7 +11,7 @@ autorequire:
|
|
15
11
|
bindir: bin
|
16
12
|
cert_chain: []
|
17
13
|
|
18
|
-
date: 2011-03-
|
14
|
+
date: 2011-03-12 00:00:00 +01:00
|
19
15
|
default_executable:
|
20
16
|
dependencies: []
|
21
17
|
|
@@ -135,21 +131,17 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
135
131
|
requirements:
|
136
132
|
- - ">="
|
137
133
|
- !ruby/object:Gem::Version
|
138
|
-
segments:
|
139
|
-
- 0
|
140
134
|
version: "0"
|
141
135
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
142
136
|
none: false
|
143
137
|
requirements:
|
144
138
|
- - ">="
|
145
139
|
- !ruby/object:Gem::Version
|
146
|
-
segments:
|
147
|
-
- 0
|
148
140
|
version: "0"
|
149
141
|
requirements: []
|
150
142
|
|
151
143
|
rubyforge_project: ruby-stemmer
|
152
|
-
rubygems_version: 1.
|
144
|
+
rubygems_version: 1.6.2
|
153
145
|
signing_key:
|
154
146
|
specification_version: 3
|
155
147
|
summary: Expose libstemmer_c to Ruby.
|