ruby-stemmer 0.7.0 → 0.8.1

Sign up to get free protection for your applications and to get access to all the features.
data/README.rdoc CHANGED
@@ -49,7 +49,7 @@ Please note that Windows is not supported at this time.
49
49
  $ git clone git://github.com/aurelian/ruby-stemmer.git
50
50
  $ cd ruby-stemmer
51
51
  $ rake -T #<== see what we've got
52
- $ rake ext #<== builds the extension do'h
52
+ $ rake compile #<== builds the extension do'h
53
53
  $ rake test
54
54
 
55
55
  == NOT A BUG
@@ -63,7 +63,7 @@ For further reference on stem vs. root, please check wikipedia articles on the t
63
63
  == TODO
64
64
 
65
65
  * {Open issues}[http://github.com/aurelian/ruby-stemmer/issues]
66
- * Windows Support -> see rake-compiler to cross-compile a dll (with the gem and nokogiri Rakefile).
66
+ * Release Windows Gem
67
67
 
68
68
  == Note on Patches/Pull Requests
69
69
 
@@ -92,9 +92,10 @@ Copyright (c) 2008,2009 {Aurelian Oancea}[http://locknet.ro]. See MIT-LICENSE fo
92
92
  == Contributors
93
93
 
94
94
  * Aurelian Oancea
95
- * Yury Korolev
95
+ * Yury Korolev - various bug fixes
96
+ * Aaron Patterson - rake compiler (windows support), code cleanup
96
97
 
97
98
  == Real life usage
98
99
 
99
- * http://planet33.ru is using Ruby-Stemmer together with {Classifier}[http://github.com/yury/classifier] to automatically rate places based on users comments
100
+ * http://planet33.ru is using Ruby-Stemmer together with {Classifier}[http://github.com/yury/classifier] to automatically rate places based on users' comments.
100
101
 
data/Rakefile CHANGED
@@ -3,8 +3,9 @@ require 'rake'
3
3
 
4
4
  begin
5
5
  require 'jeweler'
6
- Jeweler::Tasks.new do |gem|
6
+ JEWLER = Jeweler::Tasks.new do |gem|
7
7
  gem.name = "ruby-stemmer"
8
+ gem.version = File.read(File.expand_path(File.join(File.dirname(__FILE__),"VERSION"))).strip!
8
9
  gem.summary = %Q{Expose libstemmer_c to Ruby.}
9
10
  gem.description = %Q{Expose the bundled libstemmer_c library to Ruby.}
10
11
  gem.email = "oancea@gmail.com"
@@ -13,14 +14,14 @@ begin
13
14
  gem.extensions = ["ext/lingua/extconf.rb"]
14
15
  gem.rubyforge_project = "ruby-stemmer"
15
16
  gem.files = FileList['lib/**/*.rb', 'README.rdoc', 'MIT-LICENSE', 'VERSION', 'Rakefile', 'libstemmer_c/**/*', 'ext/**/*', 'test/**/*']
16
- %w(ext/lingua/*.so ext/lingua/*.bundle ext/lingua/Makefile ext/lingua/mkmf.log ext/lingua/*.o libstemmer_c/**/*.o).each do | f |
17
+ %w(ext/lingua/*.so ext/lingua/*.bundle ext/lingua/Makefile ext/lingua/mkmf.log ext/lingua/*.o libstemmer_c/**/*.o libstemmer_c/stemwords).each do | f |
17
18
  gem.files.exclude f
18
19
  end
19
20
  end
20
21
  Jeweler::GemcutterTasks.new
21
22
  Jeweler::RubyforgeTasks.new do |rubyforge|
22
23
  rubyforge.doc_task = "rdoc"
23
- end
24
+ end
24
25
  rescue LoadError
25
26
  puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
26
27
  end
@@ -49,22 +50,23 @@ task :test => :check_dependencies
49
50
 
50
51
  task :default => :test
51
52
 
52
- desc "Cleans the project"
53
- task :clean do
54
- `cd ext/lingua && rm -rf Makefile mkmf.log stemmer.o stemmer_native.bundle stemmer.so; cd ../../`
55
- `cd libstemmer_c && make clean && cd ../`
56
- end
53
+ gem 'rake-compiler', '>= 0.4.1'
54
+ require "rake/extensiontask"
55
+
56
+ CLOBBER.include("libstemmer_c/**/*.o")
57
57
 
58
- desc "Builds the extension"
59
- task :ext => :clean do
60
- `cd ext/lingua/ && ruby extconf.rb && make && cd ../../`
58
+ Rake::ExtensionTask.new(JEWLER.gemspec.name, JEWLER.gemspec) do |ext|
59
+ ext.lib_dir = File.join(*['lib', 'lingua', ENV['FAT_DIR']].compact)
60
+ ext.ext_dir = File.join 'ext', 'lingua'
61
+ ext.cross_compile = true
62
+ ext.name = 'stemmer_native'
61
63
  end
62
64
 
63
65
  require 'rake/rdoctask'
64
66
  Rake::RDocTask.new do |rdoc|
65
67
  version = File.exist?('VERSION') ? File.read('VERSION') : ""
66
68
  rdoc.rdoc_dir = 'rdoc'
67
- rdoc.options << '--charset' << 'utf-8'
69
+ rdoc.options << '--charset' << 'utf-8'
68
70
  rdoc.title = "Ruby-Stemmer #{version}"
69
71
  rdoc.rdoc_files.include('README*')
70
72
  rdoc.rdoc_files.include('lib/**/*.rb')
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.7.0
1
+ 0.8.1
@@ -17,12 +17,15 @@ if RUBY_PLATFORM =~ /darwin/
17
17
  exit
18
18
  end
19
19
  end
20
- # make this stuff
21
- system "cd #{LIBSTEMMER}; #{make} libstemmer.o; cd #{ROOT};"
22
- exit unless $? == 0
23
20
 
24
- $CFLAGS += " -I#{File.join(LIBSTEMMER, 'include')} "
25
- $libs += " -L#{LIBSTEMMER} #{File.join(LIBSTEMMER, 'libstemmer.o')} "
21
+ # make libstemmer_c, unless we're cross-compiling.
22
+ unless RUBY_PLATFORM =~ /i386-mingw32/
23
+ system "cd #{LIBSTEMMER}; #{make} libstemmer.o; cd #{ROOT};"
24
+ exit unless $? == 0
25
+ end
26
+
27
+ $CFLAGS += " -I#{File.expand_path(File.join(LIBSTEMMER, 'include'))} "
28
+ $libs += " -L#{LIBSTEMMER} #{File.expand_path(File.join(LIBSTEMMER, 'libstemmer.o'))} "
26
29
 
27
30
  if have_header("libstemmer.h")
28
31
  create_makefile("lingua/stemmer_native")
data/ext/lingua/stemmer.c CHANGED
@@ -1,20 +1,10 @@
1
1
  #include "ruby.h"
2
2
  #include <libstemmer.h>
3
3
 
4
- #define GetStemmer(obj, sb_data) {\
5
- Data_Get_Struct(obj, struct sb_stemmer_data, sb_data);\
6
- }
7
-
8
4
  VALUE rb_mLingua;
9
5
  VALUE rb_cStemmer;
10
6
  VALUE rb_eStemmerError;
11
7
 
12
- struct sb_stemmer_data {
13
- struct sb_stemmer * stemmer;
14
- const char * lang;
15
- const char * enc;
16
- };
17
-
18
8
  /*
19
9
  * Document-method: new
20
10
  * call-seq: Lingua::Stemmer.new
@@ -26,48 +16,27 @@ struct sb_stemmer_data {
26
16
  * s = Lingua::Stemmer.new :language => 'fr'
27
17
  */
28
18
  static VALUE
29
- rb_stemmer_init(int argc, VALUE *argv, VALUE self) {
30
- VALUE roptions, rlang, renc;
31
-
19
+ rb_stemmer_init(VALUE self, VALUE rlang, VALUE renc) {
32
20
  struct sb_stemmer * stemmer;
33
- struct sb_stemmer_data *sb_data;
34
21
 
35
- rb_scan_args(argc, argv, "01", &roptions);
36
-
37
- if(argc > 0) {
38
- Check_Type(roptions, T_HASH);
39
- if((rlang = rb_hash_aref(roptions, ID2SYM(rb_intern("language")))) != Qnil) {
40
- Check_Type(rlang, T_STRING);
41
- } else {
42
- rlang = rb_str_new2("en");
43
- }
44
- if((renc = rb_hash_aref(roptions, ID2SYM(rb_intern("encoding")))) != Qnil) {
45
- Check_Type(renc, T_STRING);
46
- } else {
47
- renc = rb_str_new2("UTF_8");
48
- }
49
- } else {
50
- rlang = rb_str_new2("en");
51
- renc = rb_str_new2("UTF_8");
52
- }
53
-
22
+ Data_Get_Struct(self, struct sb_stemmer, stemmer);
23
+
24
+ // In case someone sends() this method, free up the old one
25
+ if(stemmer) sb_stemmer_delete(stemmer);
26
+
54
27
  stemmer = sb_stemmer_new( RSTRING_PTR(rlang), RSTRING_PTR(renc) );
55
- if (stemmer == 0) {
56
- if (renc == 0 ) {
57
- rb_raise(rb_eStemmerError, "Language %s not available for stemming", RSTRING_PTR(rlang));
58
- exit(1);
28
+ if (!stemmer) {
29
+ if (!RTEST(renc)) {
30
+ rb_raise(rb_eStemmerError,
31
+ "Language %s not available for stemming", RSTRING_PTR(rlang));
59
32
  } else {
60
- rb_raise(rb_eStemmerError, "Language %s not available for stemming in encoding %s",
33
+ rb_raise(rb_eStemmerError,
34
+ "Language %s not available for stemming in encoding %s",
61
35
  RSTRING_PTR(rlang), RSTRING_PTR(renc));
62
- exit(1);
63
36
  }
64
37
  }
65
38
 
66
- sb_data = ALLOC(struct sb_stemmer_data);
67
- DATA_PTR(self) = sb_data;
68
- sb_data->stemmer= stemmer;
69
- sb_data->lang = RSTRING_PTR(rlang);
70
- sb_data->enc = RSTRING_PTR(renc);
39
+ DATA_PTR(self) = stemmer;
71
40
 
72
41
  return self;
73
42
  }
@@ -84,52 +53,23 @@ rb_stemmer_init(int argc, VALUE *argv, VALUE self) {
84
53
  */
85
54
  static VALUE
86
55
  rb_stemmer_stem(VALUE self, VALUE word) {
87
- struct sb_stemmer_data * sb_data;
88
- const sb_symbol * stemmed;
89
- VALUE s_word = rb_String(word);
90
- GetStemmer(self, sb_data);
91
- stemmed = sb_stemmer_stem(sb_data->stemmer, (sb_symbol *)RSTRING_PTR(s_word), RSTRING_LEN(s_word));
92
- return rb_str_new2((char *)stemmed);
93
- }
56
+ struct sb_stemmer * stemmer;
94
57
 
95
- /*
96
- * Document-method: language
97
- * call-seq: language
98
- *
99
- * Gets the language for this stemmer
100
- *
101
- * require 'lingua/stemmer'
102
- * s = Lingua::Stemmer.new(:language => "fr")
103
- * s.language #=> "fr"
104
- */
105
- static VALUE
106
- rb_stemmer_language(VALUE self) {
107
- struct sb_stemmer_data * sb_data;
108
- GetStemmer(self, sb_data);
109
- return rb_str_new2(sb_data->lang);
110
- }
58
+ Data_Get_Struct(self, struct sb_stemmer, stemmer);
59
+ if(!stemmer) rb_raise(rb_eRuntimeError, "Stemmer is not initialized");
111
60
 
112
- /*
113
- * Document-method: encoding
114
- * call-seq: encoding
115
- *
116
- * Gets the encoding for this stemmer
117
- *
118
- * require 'lingua/stemmer'
119
- * s = Lingua::Stemmer.new(:language => "UTF_8")
120
- * s.encoding #=> "UTF_8"
121
- */
122
- static VALUE
123
- rb_stemmer_encoding(VALUE self) {
124
- struct sb_stemmer_data * sb_data;
125
- GetStemmer(self, sb_data);
126
- return rb_str_new2(sb_data->enc);
61
+ VALUE s_word = rb_String(word);
62
+ const sb_symbol * stemmed = sb_stemmer_stem(stemmer,
63
+ (sb_symbol *)RSTRING_PTR(s_word),
64
+ RSTRING_LEN(s_word)
65
+ );
66
+ return rb_str_new2((char *)stemmed);
127
67
  }
128
68
 
129
69
  static void
130
- sb_stemmer_free(struct sb_stemmer_data * sb_data)
70
+ sb_stemmer_free(struct sb_stemmer * stemmer)
131
71
  {
132
- sb_stemmer_delete(sb_data->stemmer);
72
+ if(stemmer) sb_stemmer_delete(stemmer);
133
73
  }
134
74
 
135
75
  static VALUE
@@ -146,9 +86,7 @@ void Init_stemmer_native() {
146
86
  rb_cStemmer = rb_define_class_under(rb_mLingua, "Stemmer", rb_cObject);
147
87
  rb_define_alloc_func(rb_cStemmer, sb_stemmer_alloc);
148
88
  rb_eStemmerError = rb_define_class_under(rb_mLingua, "StemmerError", rb_eException);
149
- rb_define_method(rb_cStemmer, "initialize", rb_stemmer_init, -1);
89
+ rb_define_private_method(rb_cStemmer, "native_init", rb_stemmer_init, 2);
150
90
  rb_define_method(rb_cStemmer, "stem", rb_stemmer_stem, 1);
151
- rb_define_method(rb_cStemmer, "language", rb_stemmer_language, 0);
152
- rb_define_method(rb_cStemmer, "encoding", rb_stemmer_encoding, 0);
153
91
  }
154
92
 
@@ -1,24 +1,46 @@
1
- require 'lingua/stemmer_native'
1
+ if RUBY_PLATFORM =~/(mswin|mingw)/i
2
+ require "lingua/#{RUBY_VERSION.sub(/\.\d+$/, '')}/stemmer_native"
3
+ else
4
+ require 'lingua/stemmer_native'
5
+ end
2
6
 
3
7
  module Lingua
4
-
5
8
  def self.stemmer(o, options={})
6
- _stemmer= Stemmer.new({:language => "en", :encoding => "UTF_8"}.merge(options))
7
- words= o.kind_of?(Array)? o.map{|e|e.to_s} : [o.to_s]
8
- results = [] unless block_given?
9
- words.each do | word |
10
- result = _stemmer.stem(word)
9
+ stemmer = Stemmer.new(options)
10
+
11
+ words = Array(o).map { |e| e.to_s }
12
+
13
+ results = []
14
+ words.each do |word|
15
+ result = stemmer.stem(word)
11
16
  if block_given?
12
17
  yield result
13
18
  else
14
19
  results << result
15
20
  end
16
21
  end
17
- return (results.length == 1)? results[0] : results unless block_given?
18
- _stemmer
22
+
23
+ return stemmer if block_given?
24
+ results.length == 1 ? results[0] : results
19
25
  end
20
26
 
21
27
  class Stemmer
22
28
  VERSION = File.read(File.expand_path(File.join(File.dirname(__FILE__), "..", "..", "VERSION"))).strip!
29
+
30
+ attr_reader :language
31
+ attr_reader :encoding
32
+
33
+ # Creates a new Stemmer. Pass <tt>:language</tt> and <tt>:encoding</tt>
34
+ # as options to change the language or encoding; otherwise English with UTF_8
35
+ # will be used.
36
+ #
37
+ # require 'lingua/stemmer'
38
+ # s = Lingua::Stemmer.new :language => 'fr'
39
+ #
40
+ def initialize options = {}
41
+ @language = (options[:language] || 'en').to_s
42
+ @encoding = (options[:encoding] || 'UTF_8').to_s
43
+ native_init @language, @encoding
44
+ end
23
45
  end
24
46
  end
@@ -0,0 +1,14 @@
1
+ include mkinc.mak
2
+
3
+ AR=/usr/local/i386-mingw32-4.3.0/bin/i386-mingw32-ar
4
+ CC=/usr/local/i386-mingw32-4.3.0/bin/i386-mingw32-gcc
5
+
6
+ CFLAGS=-Iinclude
7
+
8
+ all: libstemmer.o stemwords
9
+ libstemmer.o: $(snowball_sources:.c=.o)
10
+ $(AR) -cru $@ $^
11
+ stemwords: examples/stemwords.o libstemmer.o
12
+ $(CC) -o $@ $^
13
+ clean:
14
+ rm -f stemwords *.o src_c/*.o runtime/*.o libstemmer/*.o
data/test/helper.rb CHANGED
@@ -1,11 +1,3 @@
1
1
  require 'rubygems'
2
2
  require 'test/unit'
3
-
4
- %w(. ../lib ../ext).each do |path|
5
- $LOAD_PATH.unshift(File.expand_path(File.join(File.dirname(__FILE__), path)))
6
- end
7
-
8
3
  require 'lingua/stemmer'
9
-
10
- class Test::Unit::TestCase
11
- end
@@ -38,4 +38,11 @@ class TestStemmer < Test::Unit::TestCase
38
38
  assert_kind_of Array, results
39
39
  end
40
40
 
41
+ def test_stemmer_subclass
42
+ assert_raises(RuntimeError) do
43
+ Class.new(Lingua::Stemmer) {
44
+ def native_init a, b; end
45
+ }.new.stem('cow')
46
+ end
47
+ end
41
48
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ruby-stemmer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.7.0
4
+ version: 0.8.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Aurelian Oancea
@@ -10,7 +10,7 @@ autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
12
 
13
- date: 2009-10-29 00:00:00 +01:00
13
+ date: 2009-11-06 00:00:00 +01:00
14
14
  default_executable:
15
15
  dependencies: []
16
16
 
@@ -32,6 +32,7 @@ files:
32
32
  - lib/lingua/stemmer.rb
33
33
  - libstemmer_c/MANIFEST
34
34
  - libstemmer_c/Makefile
35
+ - libstemmer_c/Makefile.windows
35
36
  - libstemmer_c/README
36
37
  - libstemmer_c/examples/stemwords.c
37
38
  - libstemmer_c/include/libstemmer.h