ruby-stemmer 0.7.0 → 0.8.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.rdoc CHANGED
@@ -49,7 +49,7 @@ Please not that Windows is not supported at this time.
49
49
  $ git clone git://github.com/aurelian/ruby-stemmer.git
50
50
  $ cd ruby-stemmer
51
51
  $ rake -T #<== see what we've got
52
- $ rake ext #<== builds the extension do'h
52
+ $ rake compile #<== builds the extension do'h
53
53
  $ rake test
54
54
 
55
55
  == NOT A BUG
@@ -63,7 +63,7 @@ For further reference on stem vs. root, please check wikipedia articles on the t
63
63
  == TODO
64
64
 
65
65
  * {Open issues}[http://github.com/aurelian/ruby-stemmer/issues]
66
- * Windows Support -> see rake-compiler to cross-compile a dll (with the gem and nokogiri Rakefile).
66
+ * Release Windows Gem
67
67
 
68
68
  == Note on Patches/Pull Requests
69
69
 
@@ -92,9 +92,10 @@ Copyright (c) 2008,2009 {Aurelian Oancea}[http://locknet.ro]. See MIT-LICENSE fo
92
92
  == Contributors
93
93
 
94
94
  * Aurelian Oancea
95
- * Yury Korolev
95
+ * Yury Korolev - various bug fixes
96
+ * Aaron Patterson - rake compiler (windows support), code cleanup
96
97
 
97
98
  == Real life usage
98
99
 
99
- * http://planet33.ru is using Ruby-Stemmer together with {Classifier}[http://github.com/yury/classifier] to automatically rate places based on users comments
100
+ * http://planet33.ru is using Ruby-Stemmer together with {Classifier}[http://github.com/yury/classifier] to automatically rate places based on users comments.
100
101
 
data/Rakefile CHANGED
@@ -3,8 +3,9 @@ require 'rake'
3
3
 
4
4
  begin
5
5
  require 'jeweler'
6
- Jeweler::Tasks.new do |gem|
6
+ JEWLER = Jeweler::Tasks.new do |gem|
7
7
  gem.name = "ruby-stemmer"
8
+ gem.version = File.read(File.expand_path(File.join(File.dirname(__FILE__),"VERSION"))).strip!
8
9
  gem.summary = %Q{Expose libstemmer_c to Ruby.}
9
10
  gem.description = %Q{Expose the bundled libstemmer_c library to Ruby.}
10
11
  gem.email = "oancea@gmail.com"
@@ -13,14 +14,14 @@ begin
13
14
  gem.extensions = ["ext/lingua/extconf.rb"]
14
15
  gem.rubyforge_project = "ruby-stemmer"
15
16
  gem.files = FileList['lib/**/*.rb', 'README.rdoc', 'MIT-LICENSE', 'VERSION', 'Rakefile', 'libstemmer_c/**/*', 'ext/**/*', 'test/**/*']
16
- %w(ext/lingua/*.so ext/lingua/*.bundle ext/lingua/Makefile ext/lingua/mkmf.log ext/lingua/*.o libstemmer_c/**/*.o).each do | f |
17
+ %w(ext/lingua/*.so ext/lingua/*.bundle ext/lingua/Makefile ext/lingua/mkmf.log ext/lingua/*.o libstemmer_c/**/*.o libstemmer_c/stemwords).each do | f |
17
18
  gem.files.exclude f
18
19
  end
19
20
  end
20
21
  Jeweler::GemcutterTasks.new
21
22
  Jeweler::RubyforgeTasks.new do |rubyforge|
22
23
  rubyforge.doc_task = "rdoc"
23
- end
24
+ end
24
25
  rescue LoadError
25
26
  puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
26
27
  end
@@ -49,22 +50,23 @@ task :test => :check_dependencies
49
50
 
50
51
  task :default => :test
51
52
 
52
- desc "Cleans the project"
53
- task :clean do
54
- `cd ext/lingua && rm -rf Makefile mkmf.log stemmer.o stemmer_native.bundle stemmer.so; cd ../../`
55
- `cd libstemmer_c && make clean && cd ../`
56
- end
53
+ gem 'rake-compiler', '>= 0.4.1'
54
+ require "rake/extensiontask"
55
+
56
+ CLOBBER.include("libstemmer_c/**/*.o")
57
57
 
58
- desc "Builds the extension"
59
- task :ext => :clean do
60
- `cd ext/lingua/ && ruby extconf.rb && make && cd ../../`
58
+ Rake::ExtensionTask.new(JEWLER.gemspec.name, JEWLER.gemspec) do |ext|
59
+ ext.lib_dir = File.join(*['lib', 'lingua', ENV['FAT_DIR']].compact)
60
+ ext.ext_dir = File.join 'ext', 'lingua'
61
+ ext.cross_compile = true
62
+ ext.name = 'stemmer_native'
61
63
  end
62
64
 
63
65
  require 'rake/rdoctask'
64
66
  Rake::RDocTask.new do |rdoc|
65
67
  version = File.exist?('VERSION') ? File.read('VERSION') : ""
66
68
  rdoc.rdoc_dir = 'rdoc'
67
- rdoc.options << '--charset' << 'utf-8'
69
+ rdoc.options << '--charset' << 'utf-8'
68
70
  rdoc.title = "Ruby-Stemmer #{version}"
69
71
  rdoc.rdoc_files.include('README*')
70
72
  rdoc.rdoc_files.include('lib/**/*.rb')
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.7.0
1
+ 0.8.1
data/ext/lingua/extconf.rb CHANGED
@@ -17,12 +17,15 @@ if RUBY_PLATFORM =~ /darwin/
17
17
  exit
18
18
  end
19
19
  end
20
- # make this stuff
21
- system "cd #{LIBSTEMMER}; #{make} libstemmer.o; cd #{ROOT};"
22
- exit unless $? == 0
23
20
 
24
- $CFLAGS += " -I#{File.join(LIBSTEMMER, 'include')} "
25
- $libs += " -L#{LIBSTEMMER} #{File.join(LIBSTEMMER, 'libstemmer.o')} "
21
+ # make libstemmer_c. unless we're cross-compiling.
22
+ unless RUBY_PLATFORM =~ /i386-mingw32/
23
+ system "cd #{LIBSTEMMER}; #{make} libstemmer.o; cd #{ROOT};"
24
+ exit unless $? == 0
25
+ end
26
+
27
+ $CFLAGS += " -I#{File.expand_path(File.join(LIBSTEMMER, 'include'))} "
28
+ $libs += " -L#{LIBSTEMMER} #{File.expand_path(File.join(LIBSTEMMER, 'libstemmer.o'))} "
26
29
 
27
30
  if have_header("libstemmer.h")
28
31
  create_makefile("lingua/stemmer_native")
data/ext/lingua/stemmer.c CHANGED
@@ -1,20 +1,10 @@
1
1
  #include "ruby.h"
2
2
  #include <libstemmer.h>
3
3
 
4
- #define GetStemmer(obj, sb_data) {\
5
- Data_Get_Struct(obj, struct sb_stemmer_data, sb_data);\
6
- }
7
-
8
4
  VALUE rb_mLingua;
9
5
  VALUE rb_cStemmer;
10
6
  VALUE rb_eStemmerError;
11
7
 
12
- struct sb_stemmer_data {
13
- struct sb_stemmer * stemmer;
14
- const char * lang;
15
- const char * enc;
16
- };
17
-
18
8
  /*
19
9
  * Document-method: new
20
10
  * call-seq: Lingua::Stemmer.new
@@ -26,48 +16,27 @@ struct sb_stemmer_data {
26
16
  * s = Lingua::Stemmer.new :language => 'fr'
27
17
  */
28
18
  static VALUE
29
- rb_stemmer_init(int argc, VALUE *argv, VALUE self) {
30
- VALUE roptions, rlang, renc;
31
-
19
+ rb_stemmer_init(VALUE self, VALUE rlang, VALUE renc) {
32
20
  struct sb_stemmer * stemmer;
33
- struct sb_stemmer_data *sb_data;
34
21
 
35
- rb_scan_args(argc, argv, "01", &roptions);
36
-
37
- if(argc > 0) {
38
- Check_Type(roptions, T_HASH);
39
- if((rlang = rb_hash_aref(roptions, ID2SYM(rb_intern("language")))) != Qnil) {
40
- Check_Type(rlang, T_STRING);
41
- } else {
42
- rlang = rb_str_new2("en");
43
- }
44
- if((renc = rb_hash_aref(roptions, ID2SYM(rb_intern("encoding")))) != Qnil) {
45
- Check_Type(renc, T_STRING);
46
- } else {
47
- renc = rb_str_new2("UTF_8");
48
- }
49
- } else {
50
- rlang = rb_str_new2("en");
51
- renc = rb_str_new2("UTF_8");
52
- }
53
-
22
+ Data_Get_Struct(self, struct sb_stemmer, stemmer);
23
+
24
+ // In case someone sends() this method, free up the old one
25
+ if(stemmer) sb_stemmer_delete(stemmer);
26
+
54
27
  stemmer = sb_stemmer_new( RSTRING_PTR(rlang), RSTRING_PTR(renc) );
55
- if (stemmer == 0) {
56
- if (renc == 0 ) {
57
- rb_raise(rb_eStemmerError, "Language %s not available for stemming", RSTRING_PTR(rlang));
58
- exit(1);
28
+ if (!stemmer) {
29
+ if (!RTEST(renc)) {
30
+ rb_raise(rb_eStemmerError,
31
+ "Language %s not available for stemming", RSTRING_PTR(rlang));
59
32
  } else {
60
- rb_raise(rb_eStemmerError, "Language %s not available for stemming in encoding %s",
33
+ rb_raise(rb_eStemmerError,
34
+ "Language %s not available for stemming in encoding %s",
61
35
  RSTRING_PTR(rlang), RSTRING_PTR(renc));
62
- exit(1);
63
36
  }
64
37
  }
65
38
 
66
- sb_data = ALLOC(struct sb_stemmer_data);
67
- DATA_PTR(self) = sb_data;
68
- sb_data->stemmer= stemmer;
69
- sb_data->lang = RSTRING_PTR(rlang);
70
- sb_data->enc = RSTRING_PTR(renc);
39
+ DATA_PTR(self) = stemmer;
71
40
 
72
41
  return self;
73
42
  }
@@ -84,52 +53,23 @@ rb_stemmer_init(int argc, VALUE *argv, VALUE self) {
84
53
  */
85
54
  static VALUE
86
55
  rb_stemmer_stem(VALUE self, VALUE word) {
87
- struct sb_stemmer_data * sb_data;
88
- const sb_symbol * stemmed;
89
- VALUE s_word = rb_String(word);
90
- GetStemmer(self, sb_data);
91
- stemmed = sb_stemmer_stem(sb_data->stemmer, (sb_symbol *)RSTRING_PTR(s_word), RSTRING_LEN(s_word));
92
- return rb_str_new2((char *)stemmed);
93
- }
56
+ struct sb_stemmer * stemmer;
94
57
 
95
- /*
96
- * Document-method: language
97
- * call-seq: language
98
- *
99
- * Gets the language for this stemmer
100
- *
101
- * require 'lingua/stemmer'
102
- * s = Lingua::Stemmer.new(:language => "fr")
103
- * s.language #=> "fr"
104
- */
105
- static VALUE
106
- rb_stemmer_language(VALUE self) {
107
- struct sb_stemmer_data * sb_data;
108
- GetStemmer(self, sb_data);
109
- return rb_str_new2(sb_data->lang);
110
- }
58
+ Data_Get_Struct(self, struct sb_stemmer, stemmer);
59
+ if(!stemmer) rb_raise(rb_eRuntimeError, "Stemmer is not initialized");
111
60
 
112
- /*
113
- * Document-method: encoding
114
- * call-seq: encoding
115
- *
116
- * Gets the encoding for this stemmer
117
- *
118
- * require 'lingua/stemmer'
119
- * s = Lingua::Stemmer.new(:language => "UTF_8")
120
- * s.encoding #=> "UTF_8"
121
- */
122
- static VALUE
123
- rb_stemmer_encoding(VALUE self) {
124
- struct sb_stemmer_data * sb_data;
125
- GetStemmer(self, sb_data);
126
- return rb_str_new2(sb_data->enc);
61
+ VALUE s_word = rb_String(word);
62
+ const sb_symbol * stemmed = sb_stemmer_stem(stemmer,
63
+ (sb_symbol *)RSTRING_PTR(s_word),
64
+ RSTRING_LEN(s_word)
65
+ );
66
+ return rb_str_new2((char *)stemmed);
127
67
  }
128
68
 
129
69
  static void
130
- sb_stemmer_free(struct sb_stemmer_data * sb_data)
70
+ sb_stemmer_free(struct sb_stemmer * stemmer)
131
71
  {
132
- sb_stemmer_delete(sb_data->stemmer);
72
+ if(stemmer) sb_stemmer_delete(stemmer);
133
73
  }
134
74
 
135
75
  static VALUE
@@ -146,9 +86,7 @@ void Init_stemmer_native() {
146
86
  rb_cStemmer = rb_define_class_under(rb_mLingua, "Stemmer", rb_cObject);
147
87
  rb_define_alloc_func(rb_cStemmer, sb_stemmer_alloc);
148
88
  rb_eStemmerError = rb_define_class_under(rb_mLingua, "StemmerError", rb_eException);
149
- rb_define_method(rb_cStemmer, "initialize", rb_stemmer_init, -1);
89
+ rb_define_private_method(rb_cStemmer, "native_init", rb_stemmer_init, 2);
150
90
  rb_define_method(rb_cStemmer, "stem", rb_stemmer_stem, 1);
151
- rb_define_method(rb_cStemmer, "language", rb_stemmer_language, 0);
152
- rb_define_method(rb_cStemmer, "encoding", rb_stemmer_encoding, 0);
153
91
  }
154
92
 
data/lib/lingua/stemmer.rb CHANGED
@@ -1,24 +1,46 @@
1
- require 'lingua/stemmer_native'
1
+ if RUBY_PLATFORM =~/(mswin|mingw)/i
2
+ require "lingua/#{RUBY_VERSION.sub(/\.\d+$/, '')}/stemmer_native"
3
+ else
4
+ require 'lingua/stemmer_native'
5
+ end
2
6
 
3
7
  module Lingua
4
-
5
8
  def self.stemmer(o, options={})
6
- _stemmer= Stemmer.new({:language => "en", :encoding => "UTF_8"}.merge(options))
7
- words= o.kind_of?(Array)? o.map{|e|e.to_s} : [o.to_s]
8
- results = [] unless block_given?
9
- words.each do | word |
10
- result = _stemmer.stem(word)
9
+ stemmer = Stemmer.new(options)
10
+
11
+ words = Array(o).map { |e| e.to_s }
12
+
13
+ results = []
14
+ words.each do |word|
15
+ result = stemmer.stem(word)
11
16
  if block_given?
12
17
  yield result
13
18
  else
14
19
  results << result
15
20
  end
16
21
  end
17
- return (results.length == 1)? results[0] : results unless block_given?
18
- _stemmer
22
+
23
+ return stemmer if block_given?
24
+ results.length == 1 ? results[0] : results
19
25
  end
20
26
 
21
27
  class Stemmer
22
28
  VERSION = File.read(File.expand_path(File.join(File.dirname(__FILE__), "..", "..", "VERSION"))).strip!
29
+
30
+ attr_reader :language
31
+ attr_reader :encoding
32
+
33
+ # Creates a new Stemmer, pass <tt>:language</tt> and <tt>:encoding</tt>
34
+ # as arguments to change encoding or language, otherwise english with UTF_8
35
+ # will be used
36
+ #
37
+ # require 'lingua/stemmer'
38
+ # s = Lingua::Stemmer.new :language => 'fr'
39
+ #
40
+ def initialize options = {}
41
+ @language = (options[:language] || 'en').to_s
42
+ @encoding = (options[:encoding] || 'UTF_8').to_s
43
+ native_init @language, @encoding
44
+ end
23
45
  end
24
46
  end
data/libstemmer_c/Makefile.windows ADDED
@@ -0,0 +1,14 @@
1
+ include mkinc.mak
2
+
3
+ AR=/usr/local/i386-mingw32-4.3.0/bin/i386-mingw32-ar
4
+ CC=/usr/local/i386-mingw32-4.3.0/bin/i386-mingw32-gcc
5
+
6
+ CFLAGS=-Iinclude
7
+
8
+ all: libstemmer.o stemwords
9
+ libstemmer.o: $(snowball_sources:.c=.o)
10
+ $(AR) -cru $@ $^
11
+ stemwords: examples/stemwords.o libstemmer.o
12
+ $(CC) -o $@ $^
13
+ clean:
14
+ rm -f stemwords *.o src_c/*.o runtime/*.o libstemmer/*.o
data/test/helper.rb CHANGED
@@ -1,11 +1,3 @@
1
1
  require 'rubygems'
2
2
  require 'test/unit'
3
-
4
- %w(. ../lib ../ext).each do |path|
5
- $LOAD_PATH.unshift(File.expand_path(File.join(File.dirname(__FILE__), path)))
6
- end
7
-
8
3
  require 'lingua/stemmer'
9
-
10
- class Test::Unit::TestCase
11
- end
@@ -38,4 +38,11 @@ class TestStemmer < Test::Unit::TestCase
38
38
  assert_kind_of Array, results
39
39
  end
40
40
 
41
+ def test_stemmer_subclass
42
+ assert_raises(RuntimeError) do
43
+ Class.new(Lingua::Stemmer) {
44
+ def native_init a, b; end
45
+ }.new.stem('cow')
46
+ end
47
+ end
41
48
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ruby-stemmer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.7.0
4
+ version: 0.8.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Aurelian Oancea
@@ -10,7 +10,7 @@ autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
12
 
13
- date: 2009-10-29 00:00:00 +01:00
13
+ date: 2009-11-06 00:00:00 +01:00
14
14
  default_executable:
15
15
  dependencies: []
16
16
 
@@ -32,6 +32,7 @@ files:
32
32
  - lib/lingua/stemmer.rb
33
33
  - libstemmer_c/MANIFEST
34
34
  - libstemmer_c/Makefile
35
+ - libstemmer_c/Makefile.windows
35
36
  - libstemmer_c/README
36
37
  - libstemmer_c/examples/stemwords.c
37
38
  - libstemmer_c/include/libstemmer.h