ruby-stemmer 0.7.0 → 0.8.1
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +5 -4
- data/Rakefile +14 -12
- data/VERSION +1 -1
- data/ext/lingua/extconf.rb +8 -5
- data/ext/lingua/stemmer.c +25 -87
- data/lib/lingua/stemmer.rb +31 -9
- data/libstemmer_c/Makefile.windows +14 -0
- data/test/helper.rb +0 -8
- data/test/lingua/test_stemmer.rb +7 -0
- metadata +3 -2
data/README.rdoc
CHANGED
@@ -49,7 +49,7 @@ Please note that Windows is not supported at this time.
|
|
49
49
|
$ git clone git://github.com/aurelian/ruby-stemmer.git
|
50
50
|
$ cd ruby-stemmer
|
51
51
|
$ rake -T #<== see what we've got
|
52
|
-
$ rake
|
52
|
+
$ rake compile #<== builds the extension, d'oh
|
53
53
|
$ rake test
|
54
54
|
|
55
55
|
== NOT A BUG
|
@@ -63,7 +63,7 @@ For further reference on stem vs. root, please check wikipedia articles on the t
|
|
63
63
|
== TODO
|
64
64
|
|
65
65
|
* {Open issues}[http://github.com/aurelian/ruby-stemmer/issues]
|
66
|
-
* Windows
|
66
|
+
* Release Windows Gem
|
67
67
|
|
68
68
|
== Note on Patches/Pull Requests
|
69
69
|
|
@@ -92,9 +92,10 @@ Copyright (c) 2008,2009 {Aurelian Oancea}[http://locknet.ro]. See MIT-LICENSE fo
|
|
92
92
|
== Contributors
|
93
93
|
|
94
94
|
* Aurelian Oancea
|
95
|
-
* Yury Korolev
|
95
|
+
* Yury Korolev - various bug fixes
|
96
|
+
* Aaron Patterson - rake compiler (windows support), code cleanup
|
96
97
|
|
97
98
|
== Real life usage
|
98
99
|
|
99
|
-
* http://planet33.ru is using Ruby-Stemmer together with {Classifier}[http://github.com/yury/classifier] to automatically rate places based on users comments
|
100
|
+
* http://planet33.ru is using Ruby-Stemmer together with {Classifier}[http://github.com/yury/classifier] to automatically rate places based on users comments.
|
100
101
|
|
data/Rakefile
CHANGED
@@ -3,8 +3,9 @@ require 'rake'
|
|
3
3
|
|
4
4
|
begin
|
5
5
|
require 'jeweler'
|
6
|
-
Jeweler::Tasks.new do |gem|
|
6
|
+
JEWLER = Jeweler::Tasks.new do |gem|
|
7
7
|
gem.name = "ruby-stemmer"
|
8
|
+
gem.version = File.read(File.expand_path(File.join(File.dirname(__FILE__),"VERSION"))).strip!
|
8
9
|
gem.summary = %Q{Expose libstemmer_c to Ruby.}
|
9
10
|
gem.description = %Q{Expose the bundled libstemmer_c library to Ruby.}
|
10
11
|
gem.email = "oancea@gmail.com"
|
@@ -13,14 +14,14 @@ begin
|
|
13
14
|
gem.extensions = ["ext/lingua/extconf.rb"]
|
14
15
|
gem.rubyforge_project = "ruby-stemmer"
|
15
16
|
gem.files = FileList['lib/**/*.rb', 'README.rdoc', 'MIT-LICENSE', 'VERSION', 'Rakefile', 'libstemmer_c/**/*', 'ext/**/*', 'test/**/*']
|
16
|
-
%w(ext/lingua/*.so ext/lingua/*.bundle ext/lingua/Makefile ext/lingua/mkmf.log ext/lingua/*.o libstemmer_c/**/*.o).each do | f |
|
17
|
+
%w(ext/lingua/*.so ext/lingua/*.bundle ext/lingua/Makefile ext/lingua/mkmf.log ext/lingua/*.o libstemmer_c/**/*.o libstemmer_c/stemwords).each do | f |
|
17
18
|
gem.files.exclude f
|
18
19
|
end
|
19
20
|
end
|
20
21
|
Jeweler::GemcutterTasks.new
|
21
22
|
Jeweler::RubyforgeTasks.new do |rubyforge|
|
22
23
|
rubyforge.doc_task = "rdoc"
|
23
|
-
end
|
24
|
+
end
|
24
25
|
rescue LoadError
|
25
26
|
puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
|
26
27
|
end
|
@@ -49,22 +50,23 @@ task :test => :check_dependencies
|
|
49
50
|
|
50
51
|
task :default => :test
|
51
52
|
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
end
|
53
|
+
gem 'rake-compiler', '>= 0.4.1'
|
54
|
+
require "rake/extensiontask"
|
55
|
+
|
56
|
+
CLOBBER.include("libstemmer_c/**/*.o")
|
57
57
|
|
58
|
-
|
59
|
-
|
60
|
-
|
58
|
+
Rake::ExtensionTask.new(JEWLER.gemspec.name, JEWLER.gemspec) do |ext|
|
59
|
+
ext.lib_dir = File.join(*['lib', 'lingua', ENV['FAT_DIR']].compact)
|
60
|
+
ext.ext_dir = File.join 'ext', 'lingua'
|
61
|
+
ext.cross_compile = true
|
62
|
+
ext.name = 'stemmer_native'
|
61
63
|
end
|
62
64
|
|
63
65
|
require 'rake/rdoctask'
|
64
66
|
Rake::RDocTask.new do |rdoc|
|
65
67
|
version = File.exist?('VERSION') ? File.read('VERSION') : ""
|
66
68
|
rdoc.rdoc_dir = 'rdoc'
|
67
|
-
rdoc.options << '--charset' << 'utf-8'
|
69
|
+
rdoc.options << '--charset' << 'utf-8'
|
68
70
|
rdoc.title = "Ruby-Stemmer #{version}"
|
69
71
|
rdoc.rdoc_files.include('README*')
|
70
72
|
rdoc.rdoc_files.include('lib/**/*.rb')
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.
|
1
|
+
0.8.1
|
data/ext/lingua/extconf.rb
CHANGED
@@ -17,12 +17,15 @@ if RUBY_PLATFORM =~ /darwin/
|
|
17
17
|
exit
|
18
18
|
end
|
19
19
|
end
|
20
|
-
# make this stuff
|
21
|
-
system "cd #{LIBSTEMMER}; #{make} libstemmer.o; cd #{ROOT};"
|
22
|
-
exit unless $? == 0
|
23
20
|
|
24
|
-
|
25
|
-
|
21
|
+
# make libstemmer_c, unless we're cross-compiling.
|
22
|
+
unless RUBY_PLATFORM =~ /i386-mingw32/
|
23
|
+
system "cd #{LIBSTEMMER}; #{make} libstemmer.o; cd #{ROOT};"
|
24
|
+
exit unless $? == 0
|
25
|
+
end
|
26
|
+
|
27
|
+
$CFLAGS += " -I#{File.expand_path(File.join(LIBSTEMMER, 'include'))} "
|
28
|
+
$libs += " -L#{LIBSTEMMER} #{File.expand_path(File.join(LIBSTEMMER, 'libstemmer.o'))} "
|
26
29
|
|
27
30
|
if have_header("libstemmer.h")
|
28
31
|
create_makefile("lingua/stemmer_native")
|
data/ext/lingua/stemmer.c
CHANGED
@@ -1,20 +1,10 @@
|
|
1
1
|
#include "ruby.h"
|
2
2
|
#include <libstemmer.h>
|
3
3
|
|
4
|
-
#define GetStemmer(obj, sb_data) {\
|
5
|
-
Data_Get_Struct(obj, struct sb_stemmer_data, sb_data);\
|
6
|
-
}
|
7
|
-
|
8
4
|
VALUE rb_mLingua;
|
9
5
|
VALUE rb_cStemmer;
|
10
6
|
VALUE rb_eStemmerError;
|
11
7
|
|
12
|
-
struct sb_stemmer_data {
|
13
|
-
struct sb_stemmer * stemmer;
|
14
|
-
const char * lang;
|
15
|
-
const char * enc;
|
16
|
-
};
|
17
|
-
|
18
8
|
/*
|
19
9
|
* Document-method: new
|
20
10
|
* call-seq: Lingua::Stemmer.new
|
@@ -26,48 +16,27 @@ struct sb_stemmer_data {
|
|
26
16
|
* s = Lingua::Stemmer.new :language => 'fr'
|
27
17
|
*/
|
28
18
|
static VALUE
|
29
|
-
rb_stemmer_init(
|
30
|
-
VALUE roptions, rlang, renc;
|
31
|
-
|
19
|
+
rb_stemmer_init(VALUE self, VALUE rlang, VALUE renc) {
|
32
20
|
struct sb_stemmer * stemmer;
|
33
|
-
struct sb_stemmer_data *sb_data;
|
34
21
|
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
Check_Type(rlang, T_STRING);
|
41
|
-
} else {
|
42
|
-
rlang = rb_str_new2("en");
|
43
|
-
}
|
44
|
-
if((renc = rb_hash_aref(roptions, ID2SYM(rb_intern("encoding")))) != Qnil) {
|
45
|
-
Check_Type(renc, T_STRING);
|
46
|
-
} else {
|
47
|
-
renc = rb_str_new2("UTF_8");
|
48
|
-
}
|
49
|
-
} else {
|
50
|
-
rlang = rb_str_new2("en");
|
51
|
-
renc = rb_str_new2("UTF_8");
|
52
|
-
}
|
53
|
-
|
22
|
+
Data_Get_Struct(self, struct sb_stemmer, stemmer);
|
23
|
+
|
24
|
+
// In case someone sends() this method, free up the old one
|
25
|
+
if(stemmer) sb_stemmer_delete(stemmer);
|
26
|
+
|
54
27
|
stemmer = sb_stemmer_new( RSTRING_PTR(rlang), RSTRING_PTR(renc) );
|
55
|
-
if (stemmer
|
56
|
-
if (renc
|
57
|
-
rb_raise(rb_eStemmerError,
|
58
|
-
|
28
|
+
if (!stemmer) {
|
29
|
+
if (!RTEST(renc)) {
|
30
|
+
rb_raise(rb_eStemmerError,
|
31
|
+
"Language %s not available for stemming", RSTRING_PTR(rlang));
|
59
32
|
} else {
|
60
|
-
rb_raise(rb_eStemmerError,
|
33
|
+
rb_raise(rb_eStemmerError,
|
34
|
+
"Language %s not available for stemming in encoding %s",
|
61
35
|
RSTRING_PTR(rlang), RSTRING_PTR(renc));
|
62
|
-
exit(1);
|
63
36
|
}
|
64
37
|
}
|
65
38
|
|
66
|
-
|
67
|
-
DATA_PTR(self) = sb_data;
|
68
|
-
sb_data->stemmer= stemmer;
|
69
|
-
sb_data->lang = RSTRING_PTR(rlang);
|
70
|
-
sb_data->enc = RSTRING_PTR(renc);
|
39
|
+
DATA_PTR(self) = stemmer;
|
71
40
|
|
72
41
|
return self;
|
73
42
|
}
|
@@ -84,52 +53,23 @@ rb_stemmer_init(int argc, VALUE *argv, VALUE self) {
|
|
84
53
|
*/
|
85
54
|
static VALUE
|
86
55
|
rb_stemmer_stem(VALUE self, VALUE word) {
|
87
|
-
struct
|
88
|
-
const sb_symbol * stemmed;
|
89
|
-
VALUE s_word = rb_String(word);
|
90
|
-
GetStemmer(self, sb_data);
|
91
|
-
stemmed = sb_stemmer_stem(sb_data->stemmer, (sb_symbol *)RSTRING_PTR(s_word), RSTRING_LEN(s_word));
|
92
|
-
return rb_str_new2((char *)stemmed);
|
93
|
-
}
|
56
|
+
struct sb_stemmer * stemmer;
|
94
57
|
|
95
|
-
|
96
|
-
|
97
|
-
* call-seq: language
|
98
|
-
*
|
99
|
-
* Gets the language for this stemmer
|
100
|
-
*
|
101
|
-
* require 'lingua/stemmer'
|
102
|
-
* s = Lingua::Stemmer.new(:language => "fr")
|
103
|
-
* s.language #=> "fr"
|
104
|
-
*/
|
105
|
-
static VALUE
|
106
|
-
rb_stemmer_language(VALUE self) {
|
107
|
-
struct sb_stemmer_data * sb_data;
|
108
|
-
GetStemmer(self, sb_data);
|
109
|
-
return rb_str_new2(sb_data->lang);
|
110
|
-
}
|
58
|
+
Data_Get_Struct(self, struct sb_stemmer, stemmer);
|
59
|
+
if(!stemmer) rb_raise(rb_eRuntimeError, "Stemmer is not initialized");
|
111
60
|
|
112
|
-
|
113
|
-
*
|
114
|
-
*
|
115
|
-
|
116
|
-
|
117
|
-
*
|
118
|
-
* require 'lingua/stemmer'
|
119
|
-
* s = Lingua::Stemmer.new(:language => "UTF_8")
|
120
|
-
* s.encoding #=> "UTF_8"
|
121
|
-
*/
|
122
|
-
static VALUE
|
123
|
-
rb_stemmer_encoding(VALUE self) {
|
124
|
-
struct sb_stemmer_data * sb_data;
|
125
|
-
GetStemmer(self, sb_data);
|
126
|
-
return rb_str_new2(sb_data->enc);
|
61
|
+
VALUE s_word = rb_String(word);
|
62
|
+
const sb_symbol * stemmed = sb_stemmer_stem(stemmer,
|
63
|
+
(sb_symbol *)RSTRING_PTR(s_word),
|
64
|
+
RSTRING_LEN(s_word)
|
65
|
+
);
|
66
|
+
return rb_str_new2((char *)stemmed);
|
127
67
|
}
|
128
68
|
|
129
69
|
static void
|
130
|
-
sb_stemmer_free(struct
|
70
|
+
sb_stemmer_free(struct sb_stemmer * stemmer)
|
131
71
|
{
|
132
|
-
sb_stemmer_delete(
|
72
|
+
if(stemmer) sb_stemmer_delete(stemmer);
|
133
73
|
}
|
134
74
|
|
135
75
|
static VALUE
|
@@ -146,9 +86,7 @@ void Init_stemmer_native() {
|
|
146
86
|
rb_cStemmer = rb_define_class_under(rb_mLingua, "Stemmer", rb_cObject);
|
147
87
|
rb_define_alloc_func(rb_cStemmer, sb_stemmer_alloc);
|
148
88
|
rb_eStemmerError = rb_define_class_under(rb_mLingua, "StemmerError", rb_eException);
|
149
|
-
|
89
|
+
rb_define_private_method(rb_cStemmer, "native_init", rb_stemmer_init, 2);
|
150
90
|
rb_define_method(rb_cStemmer, "stem", rb_stemmer_stem, 1);
|
151
|
-
rb_define_method(rb_cStemmer, "language", rb_stemmer_language, 0);
|
152
|
-
rb_define_method(rb_cStemmer, "encoding", rb_stemmer_encoding, 0);
|
153
91
|
}
|
154
92
|
|
data/lib/lingua/stemmer.rb
CHANGED
@@ -1,24 +1,46 @@
|
|
1
|
-
|
1
|
+
if RUBY_PLATFORM =~/(mswin|mingw)/i
|
2
|
+
require "lingua/#{RUBY_VERSION.sub(/\.\d+$/, '')}/stemmer_native"
|
3
|
+
else
|
4
|
+
require 'lingua/stemmer_native'
|
5
|
+
end
|
2
6
|
|
3
7
|
module Lingua
|
4
|
-
|
5
8
|
def self.stemmer(o, options={})
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
9
|
+
stemmer = Stemmer.new(options)
|
10
|
+
|
11
|
+
words = Array(o).map { |e| e.to_s }
|
12
|
+
|
13
|
+
results = []
|
14
|
+
words.each do |word|
|
15
|
+
result = stemmer.stem(word)
|
11
16
|
if block_given?
|
12
17
|
yield result
|
13
18
|
else
|
14
19
|
results << result
|
15
20
|
end
|
16
21
|
end
|
17
|
-
|
18
|
-
|
22
|
+
|
23
|
+
return stemmer if block_given?
|
24
|
+
results.length == 1 ? results[0] : results
|
19
25
|
end
|
20
26
|
|
21
27
|
class Stemmer
|
22
28
|
VERSION = File.read(File.expand_path(File.join(File.dirname(__FILE__), "..", "..", "VERSION"))).strip!
|
29
|
+
|
30
|
+
attr_reader :language
|
31
|
+
attr_reader :encoding
|
32
|
+
|
33
|
+
# Creates a new Stemmer, pass <tt>:language</tt> and <tt>:encoding</tt>
|
34
|
+
# as arguments to change encoding or language, otherwise English with UTF_8
|
35
|
+
# will be used
|
36
|
+
#
|
37
|
+
# require 'lingua/stemmer'
|
38
|
+
# s = Lingua::Stemmer.new :language => 'fr'
|
39
|
+
#
|
40
|
+
def initialize options = {}
|
41
|
+
@language = (options[:language] || 'en').to_s
|
42
|
+
@encoding = (options[:encoding] || 'UTF_8').to_s
|
43
|
+
native_init @language, @encoding
|
44
|
+
end
|
23
45
|
end
|
24
46
|
end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
include mkinc.mak
|
2
|
+
|
3
|
+
AR=/usr/local/i386-mingw32-4.3.0/bin/i386-mingw32-ar
|
4
|
+
CC=/usr/local/i386-mingw32-4.3.0/bin/i386-mingw32-gcc
|
5
|
+
|
6
|
+
CFLAGS=-Iinclude
|
7
|
+
|
8
|
+
all: libstemmer.o stemwords
|
9
|
+
libstemmer.o: $(snowball_sources:.c=.o)
|
10
|
+
$(AR) -cru $@ $^
|
11
|
+
stemwords: examples/stemwords.o libstemmer.o
|
12
|
+
$(CC) -o $@ $^
|
13
|
+
clean:
|
14
|
+
rm -f stemwords *.o src_c/*.o runtime/*.o libstemmer/*.o
|
data/test/helper.rb
CHANGED
data/test/lingua/test_stemmer.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ruby-stemmer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.8.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Aurelian Oancea
|
@@ -10,7 +10,7 @@ autorequire:
|
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
12
|
|
13
|
-
date: 2009-
|
13
|
+
date: 2009-11-06 00:00:00 +01:00
|
14
14
|
default_executable:
|
15
15
|
dependencies: []
|
16
16
|
|
@@ -32,6 +32,7 @@ files:
|
|
32
32
|
- lib/lingua/stemmer.rb
|
33
33
|
- libstemmer_c/MANIFEST
|
34
34
|
- libstemmer_c/Makefile
|
35
|
+
- libstemmer_c/Makefile.windows
|
35
36
|
- libstemmer_c/README
|
36
37
|
- libstemmer_c/examples/stemwords.c
|
37
38
|
- libstemmer_c/include/libstemmer.h
|