ruby-stemmer 0.9.3-x86-mswin32-60

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (97) hide show
  1. data/MIT-LICENSE +21 -0
  2. data/README.rdoc +113 -0
  3. data/Rakefile +70 -0
  4. data/TODO +0 -0
  5. data/VERSION +1 -0
  6. data/ext/lingua/extconf.rb +40 -0
  7. data/ext/lingua/stemmer.c +115 -0
  8. data/lib/lingua/1.8/stemmer_native.so +0 -0
  9. data/lib/lingua/1.9/stemmer_native.so +0 -0
  10. data/lib/lingua/stemmer.rb +60 -0
  11. data/libstemmer_c/MANIFEST +72 -0
  12. data/libstemmer_c/Makefile +9 -0
  13. data/libstemmer_c/Makefile.windows +15 -0
  14. data/libstemmer_c/README +125 -0
  15. data/libstemmer_c/examples/stemwords.c +209 -0
  16. data/libstemmer_c/include/libstemmer.h +79 -0
  17. data/libstemmer_c/libstemmer/libstemmer.c +93 -0
  18. data/libstemmer_c/libstemmer/libstemmer_utf8.c +93 -0
  19. data/libstemmer_c/libstemmer/modules.h +195 -0
  20. data/libstemmer_c/libstemmer/modules.txt +51 -0
  21. data/libstemmer_c/libstemmer/modules_utf8.h +123 -0
  22. data/libstemmer_c/libstemmer/modules_utf8.txt +50 -0
  23. data/libstemmer_c/mkinc.mak +86 -0
  24. data/libstemmer_c/mkinc_utf8.mak +54 -0
  25. data/libstemmer_c/runtime/api.c +66 -0
  26. data/libstemmer_c/runtime/api.h +26 -0
  27. data/libstemmer_c/runtime/header.h +58 -0
  28. data/libstemmer_c/runtime/utilities.c +478 -0
  29. data/libstemmer_c/src_c/stem_ISO_8859_1_danish.c +337 -0
  30. data/libstemmer_c/src_c/stem_ISO_8859_1_danish.h +16 -0
  31. data/libstemmer_c/src_c/stem_ISO_8859_1_dutch.c +624 -0
  32. data/libstemmer_c/src_c/stem_ISO_8859_1_dutch.h +16 -0
  33. data/libstemmer_c/src_c/stem_ISO_8859_1_english.c +1117 -0
  34. data/libstemmer_c/src_c/stem_ISO_8859_1_english.h +16 -0
  35. data/libstemmer_c/src_c/stem_ISO_8859_1_finnish.c +762 -0
  36. data/libstemmer_c/src_c/stem_ISO_8859_1_finnish.h +16 -0
  37. data/libstemmer_c/src_c/stem_ISO_8859_1_french.c +1246 -0
  38. data/libstemmer_c/src_c/stem_ISO_8859_1_french.h +16 -0
  39. data/libstemmer_c/src_c/stem_ISO_8859_1_german.c +503 -0
  40. data/libstemmer_c/src_c/stem_ISO_8859_1_german.h +16 -0
  41. data/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.c +1230 -0
  42. data/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.h +16 -0
  43. data/libstemmer_c/src_c/stem_ISO_8859_1_italian.c +1065 -0
  44. data/libstemmer_c/src_c/stem_ISO_8859_1_italian.h +16 -0
  45. data/libstemmer_c/src_c/stem_ISO_8859_1_latin.c +443 -0
  46. data/libstemmer_c/src_c/stem_ISO_8859_1_latin.h +16 -0
  47. data/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.c +297 -0
  48. data/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.h +16 -0
  49. data/libstemmer_c/src_c/stem_ISO_8859_1_porter.c +749 -0
  50. data/libstemmer_c/src_c/stem_ISO_8859_1_porter.h +16 -0
  51. data/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.c +1017 -0
  52. data/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.h +16 -0
  53. data/libstemmer_c/src_c/stem_ISO_8859_1_spanish.c +1093 -0
  54. data/libstemmer_c/src_c/stem_ISO_8859_1_spanish.h +16 -0
  55. data/libstemmer_c/src_c/stem_ISO_8859_1_swedish.c +307 -0
  56. data/libstemmer_c/src_c/stem_ISO_8859_1_swedish.h +16 -0
  57. data/libstemmer_c/src_c/stem_ISO_8859_2_romanian.c +998 -0
  58. data/libstemmer_c/src_c/stem_ISO_8859_2_romanian.h +16 -0
  59. data/libstemmer_c/src_c/stem_KOI8_R_russian.c +700 -0
  60. data/libstemmer_c/src_c/stem_KOI8_R_russian.h +16 -0
  61. data/libstemmer_c/src_c/stem_UTF_8_danish.c +339 -0
  62. data/libstemmer_c/src_c/stem_UTF_8_danish.h +16 -0
  63. data/libstemmer_c/src_c/stem_UTF_8_dutch.c +634 -0
  64. data/libstemmer_c/src_c/stem_UTF_8_dutch.h +16 -0
  65. data/libstemmer_c/src_c/stem_UTF_8_english.c +1125 -0
  66. data/libstemmer_c/src_c/stem_UTF_8_english.h +16 -0
  67. data/libstemmer_c/src_c/stem_UTF_8_finnish.c +768 -0
  68. data/libstemmer_c/src_c/stem_UTF_8_finnish.h +16 -0
  69. data/libstemmer_c/src_c/stem_UTF_8_french.c +1256 -0
  70. data/libstemmer_c/src_c/stem_UTF_8_french.h +16 -0
  71. data/libstemmer_c/src_c/stem_UTF_8_german.c +509 -0
  72. data/libstemmer_c/src_c/stem_UTF_8_german.h +16 -0
  73. data/libstemmer_c/src_c/stem_UTF_8_hungarian.c +1234 -0
  74. data/libstemmer_c/src_c/stem_UTF_8_hungarian.h +16 -0
  75. data/libstemmer_c/src_c/stem_UTF_8_italian.c +1073 -0
  76. data/libstemmer_c/src_c/stem_UTF_8_italian.h +16 -0
  77. data/libstemmer_c/src_c/stem_UTF_8_latin.c +443 -0
  78. data/libstemmer_c/src_c/stem_UTF_8_latin.h +16 -0
  79. data/libstemmer_c/src_c/stem_UTF_8_norwegian.c +299 -0
  80. data/libstemmer_c/src_c/stem_UTF_8_norwegian.h +16 -0
  81. data/libstemmer_c/src_c/stem_UTF_8_porter.c +755 -0
  82. data/libstemmer_c/src_c/stem_UTF_8_porter.h +16 -0
  83. data/libstemmer_c/src_c/stem_UTF_8_portuguese.c +1023 -0
  84. data/libstemmer_c/src_c/stem_UTF_8_portuguese.h +16 -0
  85. data/libstemmer_c/src_c/stem_UTF_8_romanian.c +1004 -0
  86. data/libstemmer_c/src_c/stem_UTF_8_romanian.h +16 -0
  87. data/libstemmer_c/src_c/stem_UTF_8_russian.c +694 -0
  88. data/libstemmer_c/src_c/stem_UTF_8_russian.h +16 -0
  89. data/libstemmer_c/src_c/stem_UTF_8_spanish.c +1097 -0
  90. data/libstemmer_c/src_c/stem_UTF_8_spanish.h +16 -0
  91. data/libstemmer_c/src_c/stem_UTF_8_swedish.c +309 -0
  92. data/libstemmer_c/src_c/stem_UTF_8_swedish.h +16 -0
  93. data/libstemmer_c/src_c/stem_UTF_8_turkish.c +2205 -0
  94. data/libstemmer_c/src_c/stem_UTF_8_turkish.h +16 -0
  95. data/test/helper.rb +3 -0
  96. data/test/lingua/test_stemmer.rb +99 -0
  97. metadata +162 -0
data/MIT-LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ Copyright (c) 2008-2011 Aurelian Oancea
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21
+
data/README.rdoc ADDED
@@ -0,0 +1,113 @@
1
+ = Ruby-Stemmer {Project Status}[http://stillmaintained.com/aurelian/ruby-stemmer.png]
2
+
3
+ Ruby-Stemmer exposes SnowBall API to Ruby.
4
+
5
+ This package includes libstemmer_c library released under BSD licence
6
+ and available for free {here}[http://snowball.tartarus.org/dist/libstemmer_c.tgz].
7
+
8
+ Support for the Latin language is also included; it was generated with the Snowball compiler using the
9
+ {schinke contribution}[http://snowball.tartarus.org/otherapps/schinke/intro.html].
10
+
11
+ For more details about libstemmer_c please visit the {SnowBall website}[http://snowball.tartarus.org].
12
+
13
+ == Usage
14
+
15
+ require 'rubygems'
16
+ require 'lingua/stemmer'
17
+
18
+ stemmer= Lingua::Stemmer.new(:language => "ro")
19
+ stemmer.stem("netăgăduit") #=> netăgădu
20
+
21
+ === Alternative
22
+
23
+ require 'rubygems'
24
+ require 'lingua/stemmer'
25
+
26
+ Lingua.stemmer( %w(incontestabil neîndoielnic), :language => "ro" ) #=> ["incontest", "neîndoieln"]
27
+ Lingua.stemmer("installation") #=> "instal"
28
+ Lingua.stemmer("installation", :language => "fr", :encoding => "ISO_8859_1") do | word |
29
+ puts "~> #{word}" #=> "instal"
30
+ end # => #<Lingua::Stemmer:0x102501e48>
31
+
32
+ === Rails
33
+
34
+ # Rails2: -- config/environment.rb:
35
+ config.gem 'ruby-stemmer', :version => '>=0.6.2', :lib => 'lingua/stemmer'
36
+
37
+ # Rails3: -- Gemfile
38
+ gem 'ruby-stemmer', '>=0.8.3', :require => 'lingua/stemmer'
39
+
40
+ === More details
41
+
42
+ * Complete API in {RDoc format}[http://rdoc.info/github/aurelian/ruby-stemmer/master/frames]
43
+ * More usage on the {test file}[http://github.com/aurelian/ruby-stemmer/blob/master/test/lingua/test_stemmer.rb]
44
+
45
+ == Install
46
+
47
+ === Standard install with:
48
+
49
+ gem install ruby-stemmer
50
+
51
+ ==== Windows
52
+
53
+ There's also a Windows fat binary gem, compiled against Ruby 1.9.3 and Ruby 1.8.7.
54
+
55
+ gem install ruby-stemmer --platform=x86-mswin32
56
+
57
+ {It's known}[http://cl.ly/BX9o] to work under Windows XP.
58
+
59
+ === Development version
60
+
61
+ $ git clone git://github.com/aurelian/ruby-stemmer.git
62
+ $ cd ruby-stemmer
63
+ $ rake -T #<== see what we've got
64
+ $ rake compile #<== builds the extension do'h
65
+ $ rake test
66
+
67
+ == NOT A BUG
68
+
69
+ The stemming process is an algorithm that finds the stem of a word (not its root).
70
+ For further reference on stem vs. root, please check wikipedia articles on the topic:
71
+
72
+ * http://en.wikipedia.org/wiki/Stem_%28linguistics%29
73
+ * http://en.wikipedia.org/wiki/Root_%28linguistics%29
74
+
75
+ == TODO
76
+
77
+ * {Open issues}[http://github.com/aurelian/ruby-stemmer/issues]
78
+
79
+ == Note on Patches/Pull Requests
80
+
81
+ * Fork the project from {github}[http://github.com/aurelian/ruby-stemmer]
82
+ * Make your feature addition or {bug fix}[http://github.com/aurelian/ruby-stemmer/issues]
83
+ * Add tests for it. This is important so I don't break it in a
84
+ future version unintentionally.
85
+ * Commit, do not mess with rakefile, version, or history.
86
+
87
+ If you want to have your own version, that is fine, but
88
+ bump the version in a separate commit that I can ignore when I pull.
89
+ * Send me a pull request. Bonus points for topic branches.
90
+
91
+ == Alternative Stemmers for Ruby
92
+
93
+ * {stemmer4r}[http://rubyforge.org/projects/stemmer4r] (ext)
94
+ * {fast-stemmer}[http://github.com/romanbsd/fast-stemmer] (ext)
95
+ * {uea-stemmer}[http://github.com/ealdent/uea-stemmer] (ext)
96
+ * {stemmer}[http://rubyforge.org/projects/stemmer] (pure ruby)
97
+ * add yours
98
+
99
+ == Copyright
100
+
101
+ Copyright (c) 2008-2011 {Aurelian Oancea}[http://locknet.ro]. See MIT-LICENSE for details.
102
+
103
+ == Contributors
104
+
105
+ * {Aurelian Oancea}[https://github.com/aurelian]
106
+ * {Yury Korolev}[https://github.com/yury] - various bug fixes
107
+ * {Aaron Patterson}[https://github.com/tenderlove] - rake compiler (windows support), code cleanup
108
+ * {Damián Silvani}[https://github.com/munshkr] - Ruby 1.9 encoding
109
+
110
+ == Real life usage
111
+ * http://planet33.ru is using Ruby-Stemmer together with {Classifier}[http://github.com/yury/classifier] to automatically rate places based on users comments.
112
+
113
+ # encoding: utf-8
data/Rakefile ADDED
@@ -0,0 +1,70 @@
1
+ require 'rubygems'
2
+ require 'rake'
3
+
4
+ gem 'rake-compiler', '~>0.7'
5
+ require 'rake/extensiontask'
6
+
7
+ require 'jeweler'
8
+ $jeweler = Jeweler::Tasks.new do |gem|
9
+ gem.name = "ruby-stemmer"
10
+ gem.version = File.read(File.expand_path(File.join(File.dirname(__FILE__),"VERSION"))).strip!
11
+ gem.summary = %Q{Expose libstemmer_c to Ruby.}
12
+ gem.description = %Q{Expose the bundled libstemmer_c library to Ruby.}
13
+ gem.email = "oancea@gmail.com"
14
+ gem.homepage = "http://github.com/aurelian/ruby-stemmer"
15
+ gem.authors = ["Aurelian Oancea", "Yury Korolev"]
16
+ gem.extensions = ["ext/lingua/extconf.rb"]
17
+ gem.rubyforge_project = "ruby-stemmer"
18
+ gem.files = FileList['lib/**/*.rb', 'README.rdoc', 'MIT-LICENSE', 'VERSION', 'Rakefile', 'libstemmer_c/**/*', 'ext/**/*', 'test/**/*']
19
+ %w(ext/lingua/*.so ext/lingua/*.bundle ext/lingua/Makefile ext/lingua/mkmf.log ext/lingua/*.o libstemmer_c/**/*.o libstemmer_c/stemwords).each do | f |
20
+ gem.files.exclude f
21
+ end
22
+ end
23
+
24
+ Jeweler::GemcutterTasks.new
25
+
26
+ require 'rake/testtask'
27
+ Rake::TestTask.new(:test) do |test|
28
+ test.libs << 'lib' << 'test'
29
+ test.pattern = 'test/**/test_*.rb'
30
+ test.verbose = true
31
+ end
32
+
33
+ begin
34
+ require 'rcov/rcovtask'
35
+ Rcov::RcovTask.new do |test|
36
+ test.libs << 'test'
37
+ test.pattern = 'test/**/test_*.rb'
38
+ test.verbose = true
39
+ end
40
+ rescue LoadError
41
+ task :rcov do
42
+ abort "RCov is not available. In order to run rcov, you must: gem install rcov"
43
+ end
44
+ end
45
+
46
+ task :test => :check_dependencies
47
+
48
+ task :default => :test
49
+
50
+ CLOBBER.include("libstemmer_c/**/*.o")
51
+
52
+ Rake::ExtensionTask.new('ruby-stemmer', $jeweler.jeweler.gemspec) do |ext|
53
+ ext.lib_dir = File.join(*['lib', 'lingua', ENV['FAT_DIR']].compact)
54
+ ext.ext_dir = File.join 'ext', 'lingua'
55
+ ext.cross_compile = true
56
+ ext.cross_platform = ['i386-mswin32-60', 'i386-mingw32']
57
+ ext.name = 'stemmer_native'
58
+ end
59
+
60
+ require 'rdoc/task'
61
+ Rake::RDocTask.new do |rdoc|
62
+ version = File.exist?('VERSION') ? File.read('VERSION') : ""
63
+ rdoc.rdoc_dir = 'rdoc'
64
+ rdoc.options << '--charset' << 'utf-8'
65
+ rdoc.title = "Ruby-Stemmer #{version}"
66
+ rdoc.rdoc_files.include('README*')
67
+ rdoc.rdoc_files.include('lib/**/*.rb')
68
+ rdoc.rdoc_files.include('ext/lingua/stemmer.c')
69
+ rdoc.rdoc_files.include('MIT-LICENSE')
70
+ end
data/TODO ADDED
File without changes
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.9.3
data/ext/lingua/extconf.rb ADDED
@@ -0,0 +1,40 @@
1
+ ENV['RC_ARCHS'] = '' if RUBY_PLATFORM =~ /darwin/
2
+ require "mkmf"
3
+
4
+ ROOT = File.expand_path(File.join(File.dirname(__FILE__), '..', '..'))
5
+ LIBSTEMMER = File.join(ROOT, 'libstemmer_c')
6
+
7
+ # build libstemmer_c
8
+ # FreeBSD make is gmake
9
+ make= (RUBY_PLATFORM =~ /freebsd/)? 'gmake' : 'make'
10
+
11
+ # MacOS architecture mess up
12
+ if RUBY_PLATFORM =~ /darwin/
13
+ # see: #issue/3, #issue/5
14
+ begin
15
+ ENV['ARCHFLAGS']= "-arch " + %x[file #{File.expand_path(File.join(Config::CONFIG['bindir'], Config::CONFIG['RUBY_INSTALL_NAME']))}].strip!.match(/executable (.+)$/)[1] unless ENV['ARCHFLAGS'].nil?
16
+ rescue
17
+ $stderr << "Failed to get your ruby executable architecture.\n"
18
+ $stderr << "Please specify one using $ARCHFLAGS environment variable.\n"
19
+ exit
20
+ end
21
+ # see: #issue/9, #issue/6
22
+ # see: man compat
23
+ if ENV['COMMAND_MODE'] == 'legacy'
24
+ $stdout << "Setting compat mode to unix2003\n."
25
+ ENV['COMMAND_MODE']= 'unix2003'
26
+ end
27
+ end
28
+
29
+ # make libstemmer_c. unless we're cross-compiling.
30
+ unless RUBY_PLATFORM =~ /i386-mingw32/
31
+ system "cd #{LIBSTEMMER}; #{make} libstemmer.o; cd #{ROOT};"
32
+ exit unless $? == 0
33
+ end
34
+
35
+ $CFLAGS += " -I#{File.expand_path(File.join(LIBSTEMMER, 'include'))} "
36
+ $libs += " -L#{LIBSTEMMER} #{File.expand_path(File.join(LIBSTEMMER, 'libstemmer.o'))} "
37
+
38
+ if have_header("libstemmer.h")
39
+ create_makefile("lingua/stemmer_native")
40
+ end
data/ext/lingua/stemmer.c ADDED
@@ -0,0 +1,115 @@
1
+ #include "ruby.h"
2
+ #include <libstemmer.h>
3
+
4
+
5
+ #ifdef HAVE_RUBY_ENCODING_H
6
+
7
+ #include <ruby/encoding.h>
8
+
9
+ /* Builds a Ruby String from the NUL-terminated C string `str` and associates
10
+  * it with the encoding index looked up from `encoding`. Written as one plain
11
+  * expression (no GNU statement-expression) so non-GCC compilers accept it;
12
+  * rb_enc_associate_index() returns the object it was given. */
13
+ #define ENCODED_STR_NEW2(str, encoding) \
14
+   rb_enc_associate_index(rb_str_new2((const char *)(str)), rb_enc_get_index(encoding))
15
+
16
+
17
+ #else
18
+ /* No <ruby/encoding.h>: return a plain String, `encoding` is ignored. */
19
+ #define ENCODED_STR_NEW2(str, encoding) \
20
+   rb_str_new2((const char *)(str))
21
+
22
+ #endif
23
+
24
+
25
+ VALUE rb_mLingua;
26
+ VALUE rb_cStemmer;
27
+ VALUE rb_eStemmerError;
28
+
29
+ /*
30
+  * Document-method: new
31
+  * call-seq: Lingua::Stemmer.new
32
+  *
33
+  * Creates a new Stemmer, pass <tt>:language</tt> and <tt>:encoding</tt> as arguments
34
+  * to change encoding or language, otherwise english with UTF_8 will be used
35
+  *
36
+  *   require 'lingua/stemmer'
37
+  *   s = Lingua::Stemmer.new :language => 'fr'
38
+  */
39
+ static VALUE
40
+ rb_stemmer_init(VALUE self, VALUE rlang, VALUE renc) {
41
+   struct sb_stemmer * stemmer;
42
+   const char * lang = StringValueCStr(rlang); /* type-checked, unlike a bare RSTRING_PTR */
43
+   const char * enc = NIL_P(renc) ? NULL : StringValueCStr(renc); /* nil => NULL; libstemmer.h documents NULL as "use the default encoding" */
44
+   Data_Get_Struct(self, struct sb_stemmer, stemmer);
45
+   /* In case someone sends() this method, free up the old one and clear the slot, */
46
+   /* so that a failure below cannot leave a dangling pointer for the GC to free again. */
47
+   if(stemmer) { sb_stemmer_delete(stemmer); DATA_PTR(self) = NULL; }
48
+   stemmer = sb_stemmer_new(lang, enc);
49
+   if (!stemmer) {
50
+     if (!enc) {
51
+       rb_raise(rb_eStemmerError,
52
+                "Language %s not available for stemming", lang);
53
+     } else {
54
+       rb_raise(rb_eStemmerError,
55
+                "Language %s not available for stemming in encoding %s",
56
+                lang, enc);
57
+     }
58
+   }
59
+
60
+   DATA_PTR(self) = stemmer;
61
+
62
+   return self;
63
+ }
64
+
65
+ /*
66
+  * Document-method: stem
67
+  * call-seq: stem
68
+  *
69
+  * Stems a word
70
+  *
71
+  *   require 'lingua/stemmer'
72
+  *   s = Lingua::Stemmer.new
73
+  *   s.stem "installation" # ==> instal
74
+  */
75
+ static VALUE
76
+ rb_stemmer_stem(VALUE self, VALUE word) {
77
+   struct sb_stemmer * stemmer;
78
+
79
+   Data_Get_Struct(self, struct sb_stemmer, stemmer);
80
+   if(!stemmer) rb_raise(rb_eRuntimeError, "Stemmer is not initialized");
81
+
82
+   VALUE s_word = rb_String(word);
83
+   const sb_symbol * stemmed = sb_stemmer_stem(stemmer,
84
+       (sb_symbol *)RSTRING_PTR(s_word),
85
+       (int)RSTRING_LEN(s_word)
86
+   );
87
+   if(!stemmed) rb_raise(rb_eNoMemError, "Out of memory while stemming"); /* libstemmer.h documents NULL on out-of-memory; rb_str_new2(NULL) would crash */
88
+   VALUE rb_enc = rb_iv_get(self, "@encoding");
89
+   return ENCODED_STR_NEW2((char *)stemmed, rb_enc);
90
+ }
91
+ /* Free callback for wrapped stemmers: releases the native stemmer, if one was ever created. */
92
+ static void
93
+ sb_stemmer_free(struct sb_stemmer * handle) {
94
+   if (handle == NULL) return;
95
+   sb_stemmer_delete(handle);
96
+ }
97
+ /* Allocator: wraps an empty (NULL) stemmer slot with no mark function and sb_stemmer_free as the free callback. */
98
+ static VALUE
99
+ sb_stemmer_alloc(VALUE klass) {
100
+   VALUE wrapped = Data_Wrap_Struct(klass, 0, sb_stemmer_free, 0);
101
+   return wrapped;
102
+ }
103
+
104
+ /*
105
+  * Ruby-Stemmer, Ruby extension to SnowBall API using libstemmer_c
106
+  */
107
+ void Init_stemmer_native(void) {
108
+   rb_mLingua = rb_define_module("Lingua");
109
+   rb_cStemmer = rb_define_class_under(rb_mLingua, "Stemmer", rb_cObject);
110
+   rb_define_alloc_func(rb_cStemmer, sb_stemmer_alloc);
111
+   rb_eStemmerError = rb_define_class_under(rb_mLingua, "StemmerError", rb_eStandardError); /* StandardError, so a bare `rescue` catches it */
112
+   rb_define_private_method(rb_cStemmer, "native_init", rb_stemmer_init, 2);
113
+   rb_define_method(rb_cStemmer, "stem", rb_stemmer_stem, 1);
114
+ }
115
+
data/lib/lingua/1.8/stemmer_native.so ADDED
Binary file
data/lib/lingua/1.9/stemmer_native.so ADDED
Binary file
data/lib/lingua/stemmer.rb ADDED
@@ -0,0 +1,60 @@
1
+ if RUBY_PLATFORM =~/(mswin|mingw)/i
2
+ require "lingua/#{RUBY_VERSION.sub(/\.\d+$/, '')}/stemmer_native"
3
+ else
4
+ require 'lingua/stemmer_native'
5
+ end
6
+
7
+ module Lingua
8
+ def self.stemmer(o, options={})
9
+ stemmer = Stemmer.new(options)
10
+
11
+ words = Array(o).map { |e| e.to_s }
12
+
13
+ results = []
14
+ words.each do |word|
15
+ result = stemmer.stem(word)
16
+ if block_given?
17
+ yield result
18
+ else
19
+ results << result
20
+ end
21
+ end
22
+
23
+ return stemmer if block_given?
24
+ results.length == 1 ? results[0] : results
25
+ end
26
+
27
+ class Stemmer
28
+ VERSION = File.read(File.expand_path(File.join(File.dirname(__FILE__), "..", "..", "VERSION"))).strip!
29
+
30
+ attr_reader :language
31
+ attr_reader :encoding
32
+
33
+ # Creates a new Stemmer, pass <tt>:language</tt> and <tt>:encoding</tt>
34
+ # as arguments to change encoding or language, otherwise english with UTF_8
35
+ # will be used
36
+ #
37
+ # require 'lingua/stemmer'
38
+ # s = Lingua::Stemmer.new :language => 'fr'
39
+ #
40
+ def initialize(options={})
41
+ @language = (options[:language] || 'en').to_s
42
+ @encoding = (options[:encoding] || 'UTF_8').to_s
43
+
44
+ if RUBY_VERSION >= "1.9"
45
+ if not @encoding.is_a?(Encoding)
46
+ @encoding = Encoding.find(@encoding.gsub("_", "-"))
47
+ end
48
+ else
49
+ @encoding = @encoding.upcase.gsub("-", "_")
50
+ end
51
+
52
+ native_init(@language, native_encoding(@encoding))
53
+ end
54
+
55
+ private
56
+ def native_encoding(enc)
57
+ RUBY_VERSION >= "1.9" ? enc.name.gsub('-', '_') : enc
58
+ end
59
+ end
60
+ end
data/libstemmer_c/MANIFEST ADDED
@@ -0,0 +1,72 @@
1
+ README
2
+ src_c/stem_ISO_8859_1_danish.c
3
+ src_c/stem_ISO_8859_1_danish.h
4
+ src_c/stem_ISO_8859_1_dutch.c
5
+ src_c/stem_ISO_8859_1_dutch.h
6
+ src_c/stem_ISO_8859_1_english.c
7
+ src_c/stem_ISO_8859_1_english.h
8
+ src_c/stem_ISO_8859_1_finnish.c
9
+ src_c/stem_ISO_8859_1_finnish.h
10
+ src_c/stem_ISO_8859_1_french.c
11
+ src_c/stem_ISO_8859_1_french.h
12
+ src_c/stem_ISO_8859_1_german.c
13
+ src_c/stem_ISO_8859_1_german.h
14
+ src_c/stem_ISO_8859_1_hungarian.c
15
+ src_c/stem_ISO_8859_1_hungarian.h
16
+ src_c/stem_ISO_8859_1_italian.c
17
+ src_c/stem_ISO_8859_1_italian.h
18
+ src_c/stem_ISO_8859_1_norwegian.c
19
+ src_c/stem_ISO_8859_1_norwegian.h
20
+ src_c/stem_ISO_8859_1_porter.c
21
+ src_c/stem_ISO_8859_1_porter.h
22
+ src_c/stem_ISO_8859_1_portuguese.c
23
+ src_c/stem_ISO_8859_1_portuguese.h
24
+ src_c/stem_ISO_8859_1_spanish.c
25
+ src_c/stem_ISO_8859_1_spanish.h
26
+ src_c/stem_ISO_8859_1_swedish.c
27
+ src_c/stem_ISO_8859_1_swedish.h
28
+ src_c/stem_ISO_8859_2_romanian.c
29
+ src_c/stem_ISO_8859_2_romanian.h
30
+ src_c/stem_KOI8_R_russian.c
31
+ src_c/stem_KOI8_R_russian.h
32
+ src_c/stem_UTF_8_danish.c
33
+ src_c/stem_UTF_8_danish.h
34
+ src_c/stem_UTF_8_dutch.c
35
+ src_c/stem_UTF_8_dutch.h
36
+ src_c/stem_UTF_8_english.c
37
+ src_c/stem_UTF_8_english.h
38
+ src_c/stem_UTF_8_finnish.c
39
+ src_c/stem_UTF_8_finnish.h
40
+ src_c/stem_UTF_8_french.c
41
+ src_c/stem_UTF_8_french.h
42
+ src_c/stem_UTF_8_german.c
43
+ src_c/stem_UTF_8_german.h
44
+ src_c/stem_UTF_8_hungarian.c
45
+ src_c/stem_UTF_8_hungarian.h
46
+ src_c/stem_UTF_8_italian.c
47
+ src_c/stem_UTF_8_italian.h
48
+ src_c/stem_UTF_8_norwegian.c
49
+ src_c/stem_UTF_8_norwegian.h
50
+ src_c/stem_UTF_8_porter.c
51
+ src_c/stem_UTF_8_porter.h
52
+ src_c/stem_UTF_8_portuguese.c
53
+ src_c/stem_UTF_8_portuguese.h
54
+ src_c/stem_UTF_8_romanian.c
55
+ src_c/stem_UTF_8_romanian.h
56
+ src_c/stem_UTF_8_russian.c
57
+ src_c/stem_UTF_8_russian.h
58
+ src_c/stem_UTF_8_spanish.c
59
+ src_c/stem_UTF_8_spanish.h
60
+ src_c/stem_UTF_8_swedish.c
61
+ src_c/stem_UTF_8_swedish.h
62
+ src_c/stem_UTF_8_turkish.c
63
+ src_c/stem_UTF_8_turkish.h
64
+ runtime/api.c
65
+ runtime/api.h
66
+ runtime/header.h
67
+ runtime/utilities.c
68
+ libstemmer/libstemmer.c
69
+ libstemmer/libstemmer_utf8.c
70
+ libstemmer/modules.h
71
+ libstemmer/modules_utf8.h
72
+ include/libstemmer.h
data/libstemmer_c/Makefile ADDED
@@ -0,0 +1,9 @@
1
+ include mkinc.mak
2
+ CFLAGS=-Iinclude -fPIC $(ARCHFLAGS)
3
+ all: libstemmer.o stemwords
4
+ libstemmer.o: $(snowball_sources:.c=.o)
5
+ $(AR) -cru $@ $^
6
+ stemwords: examples/stemwords.o libstemmer.o
7
+ $(CC) -o $@ $^
8
+ clean:
9
+ rm -f stemwords *.o src_c/*.o runtime/*.o libstemmer/*.o
data/libstemmer_c/Makefile.windows ADDED
@@ -0,0 +1,15 @@
1
+ include mkinc.mak
2
+
3
+ AR=i686-w64-mingw32-ar
4
+ CC=i686-w64-mingw32-gcc
5
+ LD=i686-w64-mingw32-ld
6
+
7
+ CFLAGS=-Iinclude
8
+
9
+ all: libstemmer.o stemwords
10
+ libstemmer.o: $(snowball_sources:.c=.o)
11
+ $(AR) -cru $@ $^
12
+ stemwords: examples/stemwords.o libstemmer.o
13
+ $(CC) -o $@ $^
14
+ clean:
15
+ rm -f stemwords *.o src_c/*.o runtime/*.o libstemmer/*.o
data/libstemmer_c/README ADDED
@@ -0,0 +1,125 @@
1
+ libstemmer_c
2
+ ============
3
+
4
+ This document pertains to the C version of the libstemmer distribution,
5
+ available for download from:
6
+
7
+ http://snowball.tartarus.org/dist/libstemmer_c.tgz
8
+
9
+
10
+ Compiling the library
11
+ =====================
12
+
13
+ A simple makefile is provided for Unix style systems. On such systems, it
14
+ should be possible simply to run "make", and the file "libstemmer.o"
15
+ and the example program "stemwords" will be generated.
16
+
17
+ If this doesn't work on your system, you need to write your own build
18
+ system (or call the compiler directly). The files to compile are
19
+ all contained in the "libstemmer", "runtime" and "src_c" directories,
20
+ and the public header file is contained in the "include" directory.
21
+
22
+ The library comes in two flavours; UTF-8 only, and UTF-8 plus other character
23
+ sets. To use the utf-8 only flavour, compile "libstemmer_utf8.c" instead of
24
+ "libstemmer.c".
25
+
26
+ For convenience "mkinc.mak" is a makefile fragment listing the source files and
27
+ header files used to compile the standard version of the library.
28
+ "mkinc_utf8.mak" is a comparable makefile fragment listing just the source
29
+ files for the UTF-8 only version of the library.
30
+
31
+
32
+ Using the library
33
+ =================
34
+
35
+ The library provides a simple C API. Essentially, a new stemmer can
36
+ be obtained by using "sb_stemmer_new". "sb_stemmer_stem" is then
37
+ used to stem a word, "sb_stemmer_length" returns the stemmed
38
+ length of the last word processed, and "sb_stemmer_delete" is
39
+ used to delete a stemmer.
40
+
41
+ Creating a stemmer is a relatively expensive operation - the expected
42
+ usage pattern is that a new stemmer is created when needed, used
43
+ to stem many words, and deleted after some time.
44
+
45
+ Stemmers are re-entrant, but not threadsafe. In other words, if
46
+ you wish to access the same stemmer object from multiple threads,
47
+ you must ensure that all access is protected by a mutex or similar
48
+ device.
49
+
50
+ libstemmer does not currently incorporate any mechanism for caching the results
51
+ of stemming operations. Such caching can greatly increase the performance of a
52
+ stemmer under certain situations, so suitable patches will be considered for
53
+ inclusion.
54
+
55
+ The standard libstemmer sources contain an algorithm for each of the supported
56
+ languages. The algorithm may be selected using the english name of the
57
+ language, or using the 2 or 3 letter ISO 639 language codes. In addition,
58
+ the traditional "Porter" stemming algorithm for english is included for
59
+ backwards compatibility purposes, but we recommend use of the "English"
60
+ stemmer in preference for new projects.
61
+
62
+ (Some minor algorithms which are included only as curiosities in the snowball
63
+ website, such as the Lovins stemmer and the Kraaij Pohlmann stemmer, are not
64
+ included in the standard libstemmer sources. These are not really supported by
65
+ the snowball project, but it would be possible to compile a modified libstemmer
66
+ library containing these if desired.)
67
+
68
+
69
+ The stemwords example
70
+ =====================
71
+
72
+ The stemwords example program allows you to run any of the stemmers
73
+ compiled into the libstemmer library on a sample vocabulary. For
74
+ details on how to use it, run it with the "-h" command line option.
75
+
76
+
77
+ Using the library in a larger system
78
+ ====================================
79
+
80
+ If you are incorporating the library into the build system of a larger
81
+ program, I recommend copying the unpacked tarball without modification into
82
+ a subdirectory of the sources of your program. Future versions of the
83
+ library are intended to keep the same structure, so this will keep the
84
+ work required to move to a new version of the library to a minimum.
85
+
86
+ As an additional convenience, the list of source and header files used
87
+ in the library is detailed in mkinc.mak - a file which is in a suitable
88
+ format for inclusion by a Makefile. By including this file in your build
89
+ system, you can link the snowball system into your program with a few
90
+ extra rules.
91
+
92
+ Using the library in a system using GNU autotools
93
+ =================================================
94
+
95
+ The libstemmer_c library can be integrated into a larger system which uses the
96
+ GNU autotool framework (and in particular, automake and autoconf) as follows:
97
+
98
+ 1) Unpack libstemmer_c.tgz in the top level project directory so that there is
99
+ a libstemmer_c subdirectory of the top level directory of the project.
100
+
101
+ 2) Add a file "Makefile.am" to the unpacked libstemmer_c folder, containing:
102
+
103
+ noinst_LTLIBRARIES = libstemmer.la
104
+ include $(srcdir)/mkinc.mak
105
+ noinst_HEADERS = $(snowball_headers)
106
+ libstemmer_la_SOURCES = $(snowball_sources)
107
+
108
+ (You may also need to add other lines to this, for example, if you are using
109
+ compiler options which are not compatible with compiling the libstemmer
110
+ library.)
111
+
112
+ 3) Add libstemmer_c to the AC_CONFIG_FILES declaration in the project's
113
+ configure.ac file.
114
+
115
+ 4) Add to the top level makefile the following lines (or modify existing
116
+ assignments to these variables appropriately):
117
+
118
+ AUTOMAKE_OPTIONS = subdir-objects
119
+ AM_CPPFLAGS = -I$(top_srcdir)/libstemmer_c/include
120
+ SUBDIRS=libstemmer_c
121
+ <name>_LIBADD = libstemmer_c/libstemmer.la
122
+
123
+ (Where <name> is the name of the library or executable which links against
124
+ libstemmer.)
125
+