ruby-stemmer 0.7.0 → 0.8.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +5 -4
- data/Rakefile +14 -12
- data/VERSION +1 -1
- data/ext/lingua/extconf.rb +8 -5
- data/ext/lingua/stemmer.c +25 -87
- data/lib/lingua/stemmer.rb +31 -9
- data/libstemmer_c/Makefile.windows +14 -0
- data/test/helper.rb +0 -8
- data/test/lingua/test_stemmer.rb +7 -0
- metadata +3 -2
data/README.rdoc
CHANGED
@@ -49,7 +49,7 @@ Please not that Windows is not supported at this time.
|
|
49
49
|
$ git clone git://github.com/aurelian/ruby-stemmer.git
|
50
50
|
$ cd ruby-stemmer
|
51
51
|
$ rake -T #<== see what we've got
|
52
|
-
$ rake
|
52
|
+
$ rake compile #<== builds the extension (d'oh)
|
53
53
|
$ rake test
|
54
54
|
|
55
55
|
== NOT A BUG
|
@@ -63,7 +63,7 @@ For further reference on stem vs. root, please check wikipedia articles on the t
|
|
63
63
|
== TODO
|
64
64
|
|
65
65
|
* {Open issues}[http://github.com/aurelian/ruby-stemmer/issues]
|
66
|
-
* Windows
|
66
|
+
* Release Windows Gem
|
67
67
|
|
68
68
|
== Note on Patches/Pull Requests
|
69
69
|
|
@@ -92,9 +92,10 @@ Copyright (c) 2008,2009 {Aurelian Oancea}[http://locknet.ro]. See MIT-LICENSE fo
|
|
92
92
|
== Contributors
|
93
93
|
|
94
94
|
* Aurelian Oancea
|
95
|
-
* Yury Korolev
|
95
|
+
* Yury Korolev - various bug fixes
|
96
|
+
* Aaron Patterson - rake-compiler (Windows support), code cleanup
|
96
97
|
|
97
98
|
== Real life usage
|
98
99
|
|
99
|
-
* http://planet33.ru is using Ruby-Stemmer together with {Classifier}[http://github.com/yury/classifier] to automatically rate places based on users comments
|
100
|
+
* http://planet33.ru is using Ruby-Stemmer together with {Classifier}[http://github.com/yury/classifier] to automatically rate places based on users' comments.
|
100
101
|
|
data/Rakefile
CHANGED
@@ -3,8 +3,9 @@ require 'rake'
|
|
3
3
|
|
4
4
|
begin
|
5
5
|
require 'jeweler'
|
6
|
-
Jeweler::Tasks.new do |gem|
|
6
|
+
JEWLER = Jeweler::Tasks.new do |gem|
|
7
7
|
gem.name = "ruby-stemmer"
|
8
|
+
gem.version = File.read(File.expand_path(File.join(File.dirname(__FILE__),"VERSION"))).strip!
|
8
9
|
gem.summary = %Q{Expose libstemmer_c to Ruby.}
|
9
10
|
gem.description = %Q{Expose the bundled libstemmer_c library to Ruby.}
|
10
11
|
gem.email = "oancea@gmail.com"
|
@@ -13,14 +14,14 @@ begin
|
|
13
14
|
gem.extensions = ["ext/lingua/extconf.rb"]
|
14
15
|
gem.rubyforge_project = "ruby-stemmer"
|
15
16
|
gem.files = FileList['lib/**/*.rb', 'README.rdoc', 'MIT-LICENSE', 'VERSION', 'Rakefile', 'libstemmer_c/**/*', 'ext/**/*', 'test/**/*']
|
16
|
-
%w(ext/lingua/*.so ext/lingua/*.bundle ext/lingua/Makefile ext/lingua/mkmf.log ext/lingua/*.o libstemmer_c/**/*.o).each do | f |
|
17
|
+
%w(ext/lingua/*.so ext/lingua/*.bundle ext/lingua/Makefile ext/lingua/mkmf.log ext/lingua/*.o libstemmer_c/**/*.o libstemmer_c/stemwords).each do | f |
|
17
18
|
gem.files.exclude f
|
18
19
|
end
|
19
20
|
end
|
20
21
|
Jeweler::GemcutterTasks.new
|
21
22
|
Jeweler::RubyforgeTasks.new do |rubyforge|
|
22
23
|
rubyforge.doc_task = "rdoc"
|
23
|
-
end
|
24
|
+
end
|
24
25
|
rescue LoadError
|
25
26
|
puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
|
26
27
|
end
|
@@ -49,22 +50,23 @@ task :test => :check_dependencies
|
|
49
50
|
|
50
51
|
task :default => :test
|
51
52
|
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
end
|
53
|
+
gem 'rake-compiler', '>= 0.4.1'
|
54
|
+
require "rake/extensiontask"
|
55
|
+
|
56
|
+
CLOBBER.include("libstemmer_c/**/*.o")
|
57
57
|
|
58
|
-
|
59
|
-
|
60
|
-
|
58
|
+
Rake::ExtensionTask.new(JEWLER.gemspec.name, JEWLER.gemspec) do |ext|
|
59
|
+
ext.lib_dir = File.join(*['lib', 'lingua', ENV['FAT_DIR']].compact)
|
60
|
+
ext.ext_dir = File.join 'ext', 'lingua'
|
61
|
+
ext.cross_compile = true
|
62
|
+
ext.name = 'stemmer_native'
|
61
63
|
end
|
62
64
|
|
63
65
|
require 'rake/rdoctask'
|
64
66
|
Rake::RDocTask.new do |rdoc|
|
65
67
|
version = File.exist?('VERSION') ? File.read('VERSION') : ""
|
66
68
|
rdoc.rdoc_dir = 'rdoc'
|
67
|
-
rdoc.options << '--charset' << 'utf-8'
|
69
|
+
rdoc.options << '--charset' << 'utf-8'
|
68
70
|
rdoc.title = "Ruby-Stemmer #{version}"
|
69
71
|
rdoc.rdoc_files.include('README*')
|
70
72
|
rdoc.rdoc_files.include('lib/**/*.rb')
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.
|
1
|
+
0.8.1
|
data/ext/lingua/extconf.rb
CHANGED
@@ -17,12 +17,15 @@ if RUBY_PLATFORM =~ /darwin/
|
|
17
17
|
exit
|
18
18
|
end
|
19
19
|
end
|
20
|
-
# make this stuff
|
21
|
-
system "cd #{LIBSTEMMER}; #{make} libstemmer.o; cd #{ROOT};"
|
22
|
-
exit unless $? == 0
|
23
20
|
|
24
|
-
|
25
|
-
|
21
|
+
# make libstemmer_c, unless we're cross-compiling.
|
22
|
+
unless RUBY_PLATFORM =~ /i386-mingw32/
|
23
|
+
system "cd #{LIBSTEMMER}; #{make} libstemmer.o; cd #{ROOT};"
|
24
|
+
exit unless $? == 0
|
25
|
+
end
|
26
|
+
|
27
|
+
$CFLAGS += " -I#{File.expand_path(File.join(LIBSTEMMER, 'include'))} "
|
28
|
+
$libs += " -L#{LIBSTEMMER} #{File.expand_path(File.join(LIBSTEMMER, 'libstemmer.o'))} "
|
26
29
|
|
27
30
|
if have_header("libstemmer.h")
|
28
31
|
create_makefile("lingua/stemmer_native")
|
data/ext/lingua/stemmer.c
CHANGED
@@ -1,20 +1,10 @@
|
|
1
1
|
#include "ruby.h"
|
2
2
|
#include <libstemmer.h>
|
3
3
|
|
4
|
-
#define GetStemmer(obj, sb_data) {\
|
5
|
-
Data_Get_Struct(obj, struct sb_stemmer_data, sb_data);\
|
6
|
-
}
|
7
|
-
|
8
4
|
VALUE rb_mLingua;
|
9
5
|
VALUE rb_cStemmer;
|
10
6
|
VALUE rb_eStemmerError;
|
11
7
|
|
12
|
-
struct sb_stemmer_data {
|
13
|
-
struct sb_stemmer * stemmer;
|
14
|
-
const char * lang;
|
15
|
-
const char * enc;
|
16
|
-
};
|
17
|
-
|
18
8
|
/*
|
19
9
|
* Document-method: new
|
20
10
|
* call-seq: Lingua::Stemmer.new
|
@@ -26,48 +16,27 @@ struct sb_stemmer_data {
|
|
26
16
|
* s = Lingua::Stemmer.new :language => 'fr'
|
27
17
|
*/
|
28
18
|
static VALUE
|
29
|
-
rb_stemmer_init(
|
30
|
-
VALUE roptions, rlang, renc;
|
31
|
-
|
19
|
+
rb_stemmer_init(VALUE self, VALUE rlang, VALUE renc) {
|
32
20
|
struct sb_stemmer * stemmer;
|
33
|
-
struct sb_stemmer_data *sb_data;
|
34
21
|
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
Check_Type(rlang, T_STRING);
|
41
|
-
} else {
|
42
|
-
rlang = rb_str_new2("en");
|
43
|
-
}
|
44
|
-
if((renc = rb_hash_aref(roptions, ID2SYM(rb_intern("encoding")))) != Qnil) {
|
45
|
-
Check_Type(renc, T_STRING);
|
46
|
-
} else {
|
47
|
-
renc = rb_str_new2("UTF_8");
|
48
|
-
}
|
49
|
-
} else {
|
50
|
-
rlang = rb_str_new2("en");
|
51
|
-
renc = rb_str_new2("UTF_8");
|
52
|
-
}
|
53
|
-
|
22
|
+
Data_Get_Struct(self, struct sb_stemmer, stemmer);
|
23
|
+
|
24
|
+
// In case someone sends() this method, free up the old one
|
25
|
+
if(stemmer) sb_stemmer_delete(stemmer);
|
26
|
+
|
54
27
|
stemmer = sb_stemmer_new( RSTRING_PTR(rlang), RSTRING_PTR(renc) );
|
55
|
-
if (stemmer
|
56
|
-
if (renc
|
57
|
-
rb_raise(rb_eStemmerError,
|
58
|
-
|
28
|
+
if (!stemmer) {
|
29
|
+
if (!RTEST(renc)) {
|
30
|
+
rb_raise(rb_eStemmerError,
|
31
|
+
"Language %s not available for stemming", RSTRING_PTR(rlang));
|
59
32
|
} else {
|
60
|
-
rb_raise(rb_eStemmerError,
|
33
|
+
rb_raise(rb_eStemmerError,
|
34
|
+
"Language %s not available for stemming in encoding %s",
|
61
35
|
RSTRING_PTR(rlang), RSTRING_PTR(renc));
|
62
|
-
exit(1);
|
63
36
|
}
|
64
37
|
}
|
65
38
|
|
66
|
-
|
67
|
-
DATA_PTR(self) = sb_data;
|
68
|
-
sb_data->stemmer= stemmer;
|
69
|
-
sb_data->lang = RSTRING_PTR(rlang);
|
70
|
-
sb_data->enc = RSTRING_PTR(renc);
|
39
|
+
DATA_PTR(self) = stemmer;
|
71
40
|
|
72
41
|
return self;
|
73
42
|
}
|
@@ -84,52 +53,23 @@ rb_stemmer_init(int argc, VALUE *argv, VALUE self) {
|
|
84
53
|
*/
|
85
54
|
static VALUE
|
86
55
|
rb_stemmer_stem(VALUE self, VALUE word) {
|
87
|
-
struct
|
88
|
-
const sb_symbol * stemmed;
|
89
|
-
VALUE s_word = rb_String(word);
|
90
|
-
GetStemmer(self, sb_data);
|
91
|
-
stemmed = sb_stemmer_stem(sb_data->stemmer, (sb_symbol *)RSTRING_PTR(s_word), RSTRING_LEN(s_word));
|
92
|
-
return rb_str_new2((char *)stemmed);
|
93
|
-
}
|
56
|
+
struct sb_stemmer * stemmer;
|
94
57
|
|
95
|
-
|
96
|
-
|
97
|
-
* call-seq: language
|
98
|
-
*
|
99
|
-
* Gets the language for this stemmer
|
100
|
-
*
|
101
|
-
* require 'lingua/stemmer'
|
102
|
-
* s = Lingua::Stemmer.new(:language => "fr")
|
103
|
-
* s.language #=> "fr"
|
104
|
-
*/
|
105
|
-
static VALUE
|
106
|
-
rb_stemmer_language(VALUE self) {
|
107
|
-
struct sb_stemmer_data * sb_data;
|
108
|
-
GetStemmer(self, sb_data);
|
109
|
-
return rb_str_new2(sb_data->lang);
|
110
|
-
}
|
58
|
+
Data_Get_Struct(self, struct sb_stemmer, stemmer);
|
59
|
+
if(!stemmer) rb_raise(rb_eRuntimeError, "Stemmer is not initialized");
|
111
60
|
|
112
|
-
|
113
|
-
*
|
114
|
-
*
|
115
|
-
|
116
|
-
|
117
|
-
*
|
118
|
-
* require 'lingua/stemmer'
|
119
|
-
* s = Lingua::Stemmer.new(:language => "UTF_8")
|
120
|
-
* s.encoding #=> "UTF_8"
|
121
|
-
*/
|
122
|
-
static VALUE
|
123
|
-
rb_stemmer_encoding(VALUE self) {
|
124
|
-
struct sb_stemmer_data * sb_data;
|
125
|
-
GetStemmer(self, sb_data);
|
126
|
-
return rb_str_new2(sb_data->enc);
|
61
|
+
VALUE s_word = rb_String(word);
|
62
|
+
const sb_symbol * stemmed = sb_stemmer_stem(stemmer,
|
63
|
+
(sb_symbol *)RSTRING_PTR(s_word),
|
64
|
+
RSTRING_LEN(s_word)
|
65
|
+
);
|
66
|
+
return rb_str_new2((char *)stemmed);
|
127
67
|
}
|
128
68
|
|
129
69
|
static void
|
130
|
-
sb_stemmer_free(struct
|
70
|
+
sb_stemmer_free(struct sb_stemmer * stemmer)
|
131
71
|
{
|
132
|
-
sb_stemmer_delete(
|
72
|
+
if(stemmer) sb_stemmer_delete(stemmer);
|
133
73
|
}
|
134
74
|
|
135
75
|
static VALUE
|
@@ -146,9 +86,7 @@ void Init_stemmer_native() {
|
|
146
86
|
rb_cStemmer = rb_define_class_under(rb_mLingua, "Stemmer", rb_cObject);
|
147
87
|
rb_define_alloc_func(rb_cStemmer, sb_stemmer_alloc);
|
148
88
|
rb_eStemmerError = rb_define_class_under(rb_mLingua, "StemmerError", rb_eException);
|
149
|
-
|
89
|
+
rb_define_private_method(rb_cStemmer, "native_init", rb_stemmer_init, 2);
|
150
90
|
rb_define_method(rb_cStemmer, "stem", rb_stemmer_stem, 1);
|
151
|
-
rb_define_method(rb_cStemmer, "language", rb_stemmer_language, 0);
|
152
|
-
rb_define_method(rb_cStemmer, "encoding", rb_stemmer_encoding, 0);
|
153
91
|
}
|
154
92
|
|
data/lib/lingua/stemmer.rb
CHANGED
@@ -1,24 +1,46 @@
|
|
1
|
-
|
1
|
+
if RUBY_PLATFORM =~/(mswin|mingw)/i
|
2
|
+
require "lingua/#{RUBY_VERSION.sub(/\.\d+$/, '')}/stemmer_native"
|
3
|
+
else
|
4
|
+
require 'lingua/stemmer_native'
|
5
|
+
end
|
2
6
|
|
3
7
|
module Lingua
|
4
|
-
|
5
8
|
def self.stemmer(o, options={})
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
9
|
+
stemmer = Stemmer.new(options)
|
10
|
+
|
11
|
+
words = Array(o).map { |e| e.to_s }
|
12
|
+
|
13
|
+
results = []
|
14
|
+
words.each do |word|
|
15
|
+
result = stemmer.stem(word)
|
11
16
|
if block_given?
|
12
17
|
yield result
|
13
18
|
else
|
14
19
|
results << result
|
15
20
|
end
|
16
21
|
end
|
17
|
-
|
18
|
-
|
22
|
+
|
23
|
+
return stemmer if block_given?
|
24
|
+
results.length == 1 ? results[0] : results
|
19
25
|
end
|
20
26
|
|
21
27
|
class Stemmer
|
22
28
|
VERSION = File.read(File.expand_path(File.join(File.dirname(__FILE__), "..", "..", "VERSION"))).strip!
|
29
|
+
|
30
|
+
attr_reader :language
|
31
|
+
attr_reader :encoding
|
32
|
+
|
33
|
+
# Creates a new Stemmer, pass <tt>:language</tt> and <tt>:encoding</tt>
|
34
|
+
# as arguments to change encoding or language; otherwise English with UTF_8
|
35
|
+
# will be used
|
36
|
+
#
|
37
|
+
# require 'lingua/stemmer'
|
38
|
+
# s = Lingua::Stemmer.new :language => 'fr'
|
39
|
+
#
|
40
|
+
def initialize options = {}
|
41
|
+
@language = (options[:language] || 'en').to_s
|
42
|
+
@encoding = (options[:encoding] || 'UTF_8').to_s
|
43
|
+
native_init @language, @encoding
|
44
|
+
end
|
23
45
|
end
|
24
46
|
end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
include mkinc.mak
|
2
|
+
|
3
|
+
AR=/usr/local/i386-mingw32-4.3.0/bin/i386-mingw32-ar
|
4
|
+
CC=/usr/local/i386-mingw32-4.3.0/bin/i386-mingw32-gcc
|
5
|
+
|
6
|
+
CFLAGS=-Iinclude
|
7
|
+
|
8
|
+
all: libstemmer.o stemwords
|
9
|
+
libstemmer.o: $(snowball_sources:.c=.o)
|
10
|
+
$(AR) -cru $@ $^
|
11
|
+
stemwords: examples/stemwords.o libstemmer.o
|
12
|
+
$(CC) -o $@ $^
|
13
|
+
clean:
|
14
|
+
rm -f stemwords *.o src_c/*.o runtime/*.o libstemmer/*.o
|
data/test/helper.rb
CHANGED
data/test/lingua/test_stemmer.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ruby-stemmer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.8.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Aurelian Oancea
|
@@ -10,7 +10,7 @@ autorequire:
|
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
12
|
|
13
|
-
date: 2009-
|
13
|
+
date: 2009-11-06 00:00:00 +01:00
|
14
14
|
default_executable:
|
15
15
|
dependencies: []
|
16
16
|
|
@@ -32,6 +32,7 @@ files:
|
|
32
32
|
- lib/lingua/stemmer.rb
|
33
33
|
- libstemmer_c/MANIFEST
|
34
34
|
- libstemmer_c/Makefile
|
35
|
+
- libstemmer_c/Makefile.windows
|
35
36
|
- libstemmer_c/README
|
36
37
|
- libstemmer_c/examples/stemwords.c
|
37
38
|
- libstemmer_c/include/libstemmer.h
|