ruby-stemmer-dimelo 0.9.3.dimelo1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (95) hide show
  1. checksums.yaml +7 -0
  2. data/MIT-LICENSE +21 -0
  3. data/README.rdoc +117 -0
  4. data/Rakefile +70 -0
  5. data/VERSION +1 -0
  6. data/ext/lingua/extconf.rb +40 -0
  7. data/ext/lingua/stemmer.c +115 -0
  8. data/lib/lingua/stemmer.rb +60 -0
  9. data/libstemmer_c/MANIFEST +72 -0
  10. data/libstemmer_c/Makefile +9 -0
  11. data/libstemmer_c/Makefile.windows +15 -0
  12. data/libstemmer_c/README +125 -0
  13. data/libstemmer_c/examples/stemwords.c +209 -0
  14. data/libstemmer_c/include/libstemmer.h +79 -0
  15. data/libstemmer_c/libstemmer/libstemmer.c +93 -0
  16. data/libstemmer_c/libstemmer/libstemmer_utf8.c +93 -0
  17. data/libstemmer_c/libstemmer/modules.h +195 -0
  18. data/libstemmer_c/libstemmer/modules.txt +51 -0
  19. data/libstemmer_c/libstemmer/modules_utf8.h +123 -0
  20. data/libstemmer_c/libstemmer/modules_utf8.txt +50 -0
  21. data/libstemmer_c/mkinc.mak +86 -0
  22. data/libstemmer_c/mkinc_utf8.mak +54 -0
  23. data/libstemmer_c/runtime/api.c +66 -0
  24. data/libstemmer_c/runtime/api.h +26 -0
  25. data/libstemmer_c/runtime/header.h +58 -0
  26. data/libstemmer_c/runtime/utilities.c +478 -0
  27. data/libstemmer_c/src_c/stem_ISO_8859_1_danish.c +337 -0
  28. data/libstemmer_c/src_c/stem_ISO_8859_1_danish.h +16 -0
  29. data/libstemmer_c/src_c/stem_ISO_8859_1_dutch.c +624 -0
  30. data/libstemmer_c/src_c/stem_ISO_8859_1_dutch.h +16 -0
  31. data/libstemmer_c/src_c/stem_ISO_8859_1_english.c +1117 -0
  32. data/libstemmer_c/src_c/stem_ISO_8859_1_english.h +16 -0
  33. data/libstemmer_c/src_c/stem_ISO_8859_1_finnish.c +762 -0
  34. data/libstemmer_c/src_c/stem_ISO_8859_1_finnish.h +16 -0
  35. data/libstemmer_c/src_c/stem_ISO_8859_1_french.c +1230 -0
  36. data/libstemmer_c/src_c/stem_ISO_8859_1_french.h +16 -0
  37. data/libstemmer_c/src_c/stem_ISO_8859_1_german.c +503 -0
  38. data/libstemmer_c/src_c/stem_ISO_8859_1_german.h +16 -0
  39. data/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.c +1230 -0
  40. data/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.h +16 -0
  41. data/libstemmer_c/src_c/stem_ISO_8859_1_italian.c +1065 -0
  42. data/libstemmer_c/src_c/stem_ISO_8859_1_italian.h +16 -0
  43. data/libstemmer_c/src_c/stem_ISO_8859_1_latin.c +443 -0
  44. data/libstemmer_c/src_c/stem_ISO_8859_1_latin.h +16 -0
  45. data/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.c +297 -0
  46. data/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.h +16 -0
  47. data/libstemmer_c/src_c/stem_ISO_8859_1_porter.c +749 -0
  48. data/libstemmer_c/src_c/stem_ISO_8859_1_porter.h +16 -0
  49. data/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.c +1017 -0
  50. data/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.h +16 -0
  51. data/libstemmer_c/src_c/stem_ISO_8859_1_spanish.c +1093 -0
  52. data/libstemmer_c/src_c/stem_ISO_8859_1_spanish.h +16 -0
  53. data/libstemmer_c/src_c/stem_ISO_8859_1_swedish.c +307 -0
  54. data/libstemmer_c/src_c/stem_ISO_8859_1_swedish.h +16 -0
  55. data/libstemmer_c/src_c/stem_ISO_8859_2_romanian.c +998 -0
  56. data/libstemmer_c/src_c/stem_ISO_8859_2_romanian.h +16 -0
  57. data/libstemmer_c/src_c/stem_KOI8_R_russian.c +700 -0
  58. data/libstemmer_c/src_c/stem_KOI8_R_russian.h +16 -0
  59. data/libstemmer_c/src_c/stem_UTF_8_danish.c +339 -0
  60. data/libstemmer_c/src_c/stem_UTF_8_danish.h +16 -0
  61. data/libstemmer_c/src_c/stem_UTF_8_dutch.c +634 -0
  62. data/libstemmer_c/src_c/stem_UTF_8_dutch.h +16 -0
  63. data/libstemmer_c/src_c/stem_UTF_8_english.c +1125 -0
  64. data/libstemmer_c/src_c/stem_UTF_8_english.h +16 -0
  65. data/libstemmer_c/src_c/stem_UTF_8_finnish.c +768 -0
  66. data/libstemmer_c/src_c/stem_UTF_8_finnish.h +16 -0
  67. data/libstemmer_c/src_c/stem_UTF_8_french.c +1230 -0
  68. data/libstemmer_c/src_c/stem_UTF_8_french.h +16 -0
  69. data/libstemmer_c/src_c/stem_UTF_8_german.c +509 -0
  70. data/libstemmer_c/src_c/stem_UTF_8_german.h +16 -0
  71. data/libstemmer_c/src_c/stem_UTF_8_hungarian.c +1234 -0
  72. data/libstemmer_c/src_c/stem_UTF_8_hungarian.h +16 -0
  73. data/libstemmer_c/src_c/stem_UTF_8_italian.c +1073 -0
  74. data/libstemmer_c/src_c/stem_UTF_8_italian.h +16 -0
  75. data/libstemmer_c/src_c/stem_UTF_8_latin.c +443 -0
  76. data/libstemmer_c/src_c/stem_UTF_8_latin.h +16 -0
  77. data/libstemmer_c/src_c/stem_UTF_8_norwegian.c +299 -0
  78. data/libstemmer_c/src_c/stem_UTF_8_norwegian.h +16 -0
  79. data/libstemmer_c/src_c/stem_UTF_8_porter.c +755 -0
  80. data/libstemmer_c/src_c/stem_UTF_8_porter.h +16 -0
  81. data/libstemmer_c/src_c/stem_UTF_8_portuguese.c +1023 -0
  82. data/libstemmer_c/src_c/stem_UTF_8_portuguese.h +16 -0
  83. data/libstemmer_c/src_c/stem_UTF_8_romanian.c +1004 -0
  84. data/libstemmer_c/src_c/stem_UTF_8_romanian.h +16 -0
  85. data/libstemmer_c/src_c/stem_UTF_8_russian.c +694 -0
  86. data/libstemmer_c/src_c/stem_UTF_8_russian.h +16 -0
  87. data/libstemmer_c/src_c/stem_UTF_8_spanish.c +1097 -0
  88. data/libstemmer_c/src_c/stem_UTF_8_spanish.h +16 -0
  89. data/libstemmer_c/src_c/stem_UTF_8_swedish.c +309 -0
  90. data/libstemmer_c/src_c/stem_UTF_8_swedish.h +16 -0
  91. data/libstemmer_c/src_c/stem_UTF_8_turkish.c +2205 -0
  92. data/libstemmer_c/src_c/stem_UTF_8_turkish.h +16 -0
  93. data/test/helper.rb +3 -0
  94. data/test/lingua/test_stemmer.rb +99 -0
  95. metadata +141 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 210c16040fce7de4293c411dd0139091debd99e8
4
+ data.tar.gz: 9a64fdd53a0c383fd286974a6cf57dbd06497e6a
5
+ SHA512:
6
+ metadata.gz: 464276d630c2540b6db3f1e2c12f30a0e0a6d08195940673fbc01f8e6ec643bf5bc32778b081afcd1d2a3a950d94bacc5ff589865e156dddce619930aa1b5cea
7
+ data.tar.gz: d72d91388301b5835a92add0b298d9648082fe4d44a97e0a3360f1af828c31cc49f84586be7348dc9872dd6bcfec2fa4835697fa4428d5535f9d8e3725f59876
data/MIT-LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ Copyright (c) 2008-2011 Aurelian Oancea
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21
+
data/README.rdoc ADDED
@@ -0,0 +1,117 @@
1
+ = Ruby-Stemmer {Project Status}[http://stillmaintained.com/aurelian/ruby-stemmer.png]
2
+
3
+ Ruby-Stemmer exposes SnowBall API to Ruby.
4
+
5
+ This package includes libstemmer_c library released under BSD licence
6
+ and available for free {here}[http://snowball.tartarus.org/dist/libstemmer_c.tgz].
7
+
8
+ Support for the Latin language is also included; it was generated with the Snowball compiler using
9
+ {schinke contribution}[http://snowball.tartarus.org/otherapps/schinke/intro.html].
10
+
11
+ For more details about libstemmer_c please visit the {SnowBall website}[http://snowball.tartarus.org].
12
+
13
+ == Usage
14
+
15
+ require 'rubygems'
16
+ require 'lingua/stemmer'
17
+
18
+ stemmer= Lingua::Stemmer.new(:language => "ro")
19
+ stemmer.stem("netăgăduit") #=> netăgădu
20
+
21
+ === Alternative
22
+
23
+ require 'rubygems'
24
+ require 'lingua/stemmer'
25
+
26
+ Lingua.stemmer( %w(incontestabil neîndoielnic), :language => "ro" ) #=> ["incontest", "neîndoieln"]
27
+ Lingua.stemmer("installation") #=> "instal"
28
+ Lingua.stemmer("installation", :language => "fr", :encoding => "ISO_8859_1") do | word |
29
+ puts "~> #{word}" #=> "instal"
30
+ end # => #<Lingua::Stemmer:0x102501e48>
31
+
32
+ === Rails
33
+
34
+ # Rails2: -- config/environment.rb:
35
+ config.gem 'ruby-stemmer', :version => '>=0.6.2', :lib => 'lingua/stemmer'
36
+
37
+ # Rails3: -- Gemfile
38
+ gem 'ruby-stemmer', '>=0.8.3', :require => 'lingua/stemmer'
39
+
40
+ === More details
41
+
42
+ * Complete API in {RDoc format}[http://rdoc.info/github/aurelian/ruby-stemmer/master/frames]
43
+ * More usage on the {test file}[http://github.com/aurelian/ruby-stemmer/blob/master/test/lingua/test_stemmer.rb]
44
+
45
+ == Install
46
+
47
+ === Standard install with:
48
+
49
+ gem install ruby-stemmer
50
+
51
+ ==== Windows
52
+
53
+ There's also a Windows build (fat binary) compiled against Ruby 1.9.3 and Ruby 1.8.7.
54
+
55
+ gem install ruby-stemmer --platform=x86-mingw32
56
+
57
+ As far as I know the above should work with {rubyinstaller}[http://rubyinstaller.org/]. If it fails, you could try with:
58
+
59
+ gem install ruby-stemmer --platform=x86-mswin32
60
+
61
+ {It's known}[http://cl.ly/BX9o] to work under Windows XP.
62
+
63
+ === Development version
64
+
65
+ $ git clone git://github.com/aurelian/ruby-stemmer.git
66
+ $ cd ruby-stemmer
67
+ $ rake -T #<== see what we've got
68
+ $ rake compile #<== builds the extension do'h
69
+ $ rake test
70
+
71
+ == NOT A BUG
72
+
73
+ The stemming process is an algorithm that finds the stem of a word (not the root of it).
74
+ For further reference on stem vs. root, please check wikipedia articles on the topic:
75
+
76
+ * http://en.wikipedia.org/wiki/Stem_%28linguistics%29
77
+ * http://en.wikipedia.org/wiki/Root_%28linguistics%29
78
+
79
+ == TODO
80
+
81
+ * {Open issues}[http://github.com/aurelian/ruby-stemmer/issues]
82
+
83
+ == Note on Patches/Pull Requests
84
+
85
+ * Fork the project from {github}[http://github.com/aurelian/ruby-stemmer]
86
+ * Make your feature addition or {bug fix}[http://github.com/aurelian/ruby-stemmer/issues]
87
+ * Add tests for it. This is important so I don't break it in a
88
+ future version unintentionally.
89
+ * Commit, do not mess with rakefile, version, or history.
90
+
91
+ if you want to have your own version, that is fine but
92
+ bump version in a commit by itself I can ignore when I pull
93
+ * Send me a pull request. Bonus points for topic branches.
94
+
95
+ == Alternative Stemmers for Ruby
96
+
97
+ * {stemmer4r}[http://rubyforge.org/projects/stemmer4r] (ext)
98
+ * {fast-stemmer}[http://github.com/romanbsd/fast-stemmer] (ext)
99
+ * {uea-stemmer}[http://github.com/ealdent/uea-stemmer] (ext)
100
+ * {stemmer}[http://rubyforge.org/projects/stemmer] (pure ruby)
101
+ * add yours
102
+
103
+ == Copyright
104
+
105
+ Copyright (c) 2008-2011 {Aurelian Oancea}[http://locknet.ro]. See MIT-LICENSE for details.
106
+
107
+ == Contributors
108
+
109
+ * {Aurelian Oancea}[https://github.com/aurelian]
110
+ * {Yury Korolev}[https://github.com/yury] - various bug fixes
111
+ * {Aaron Patterson}[https://github.com/tenderlove] - rake compiler (windows support), code cleanup
112
+ * {Damián Silvani}[https://github.com/munshkr] - Ruby 1.9 encoding
113
+
114
+ == Real life usage
115
+ * http://planet33.ru is using Ruby-Stemmer together with {Classifier}[http://github.com/yury/classifier] to automatically rate places based on users comments.
116
+
117
+ # encoding: utf-8
data/Rakefile ADDED
@@ -0,0 +1,70 @@
1
+ require 'rubygems'
2
+ require 'rake'
3
+
4
+ gem 'rake-compiler', '~>0.7'
5
+ require 'rake/extensiontask'
6
+
7
+ require 'jeweler'
8
+ $jeweler = Jeweler::Tasks.new do |gem|
9
+ gem.name = "ruby-stemmer"
10
+ gem.version = File.read(File.expand_path(File.join(File.dirname(__FILE__),"VERSION"))).strip!
11
+ gem.summary = %Q{Expose libstemmer_c to Ruby.}
12
+ gem.description = %Q{Expose the bundled libstemmer_c library to Ruby.}
13
+ gem.email = "oancea@gmail.com"
14
+ gem.homepage = "http://github.com/aurelian/ruby-stemmer"
15
+ gem.authors = ["Aurelian Oancea", "Yury Korolev"]
16
+ gem.extensions = ["ext/lingua/extconf.rb"]
17
+ gem.rubyforge_project = "ruby-stemmer"
18
+ gem.files = FileList['lib/**/*.rb', 'README.rdoc', 'MIT-LICENSE', 'VERSION', 'Rakefile', 'libstemmer_c/**/*', 'ext/**/*', 'test/**/*']
19
+ %w(ext/lingua/*.so ext/lingua/*.bundle ext/lingua/Makefile ext/lingua/mkmf.log ext/lingua/*.o libstemmer_c/**/*.o libstemmer_c/stemwords).each do | f |
20
+ gem.files.exclude f
21
+ end
22
+ end
23
+
24
+ Jeweler::GemcutterTasks.new
25
+
26
+ require 'rake/testtask'
27
+ Rake::TestTask.new(:test) do |test|
28
+ test.libs << 'lib' << 'test'
29
+ test.pattern = 'test/**/test_*.rb'
30
+ test.verbose = true
31
+ end
32
+
33
+ begin
34
+ require 'rcov/rcovtask'
35
+ Rcov::RcovTask.new do |test|
36
+ test.libs << 'test'
37
+ test.pattern = 'test/**/test_*.rb'
38
+ test.verbose = true
39
+ end
40
+ rescue LoadError
41
+ task :rcov do
42
+ abort "RCov is not available. In order to run rcov, you must: gem install rcov"
43
+ end
44
+ end
45
+
46
+ task :test => :check_dependencies
47
+
48
+ task :default => :test
49
+
50
+ CLOBBER.include("libstemmer_c/**/*.o")
51
+
52
+ Rake::ExtensionTask.new('ruby-stemmer', $jeweler.jeweler.gemspec) do |ext|
53
+ ext.lib_dir = File.join(*['lib', 'lingua', ENV['FAT_DIR']].compact)
54
+ ext.ext_dir = File.join 'ext', 'lingua'
55
+ ext.cross_compile = true
56
+ ext.cross_platform = ['i386-mswin32-60', 'i386-mingw32']
57
+ ext.name = 'stemmer_native'
58
+ end
59
+
60
+ require 'rdoc/task'
61
+ Rake::RDocTask.new do |rdoc|
62
+ version = File.exist?('VERSION') ? File.read('VERSION') : ""
63
+ rdoc.rdoc_dir = 'rdoc'
64
+ rdoc.options << '--charset' << 'utf-8'
65
+ rdoc.title = "Ruby-Stemmer #{version}"
66
+ rdoc.rdoc_files.include('README*')
67
+ rdoc.rdoc_files.include('lib/**/*.rb')
68
+ rdoc.rdoc_files.include('ext/lingua/stemmer.c')
69
+ rdoc.rdoc_files.include('MIT-LICENSE')
70
+ end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.9.3
@@ -0,0 +1,40 @@
1
+ ENV['RC_ARCHS'] = '' if RUBY_PLATFORM =~ /darwin/
2
+ require "mkmf"
3
+
4
+ ROOT = File.expand_path(File.join(File.dirname(__FILE__), '..', '..'))
5
+ LIBSTEMMER = File.join(ROOT, 'libstemmer_c')
6
+
7
+ # build libstemmer_c
8
+ # FreeBSD make is gmake
9
+ make= (RUBY_PLATFORM =~ /freebsd/)? 'gmake' : 'make'
10
+
11
+ # MacOS architecture mess up
12
+ if RUBY_PLATFORM =~ /darwin/
13
+ # see: #issue/3, #issue/5
14
+ begin
15
+ ENV['ARCHFLAGS']= "-arch " + %x[file #{File.expand_path(File.join(Config::CONFIG['bindir'], Config::CONFIG['RUBY_INSTALL_NAME']))}].strip!.match(/executable (.+)$/)[1] unless ENV['ARCHFLAGS'].nil?
16
+ rescue
17
+ $stderr << "Failed to get your ruby executable architecture.\n"
18
+ $stderr << "Please specify one using $ARCHFLAGS environment variable.\n"
19
+ exit
20
+ end
21
+ # see: #issue/9, #issue/6
22
+ # see: man compat
23
+ if ENV['COMMAND_MODE'] == 'legacy'
24
+ $stdout << "Setting compat mode to unix2003\n."
25
+ ENV['COMMAND_MODE']= 'unix2003'
26
+ end
27
+ end
28
+
29
+ # make libstemmer_c. unless we're cross-compiling.
30
+ unless RUBY_PLATFORM =~ /i386-mingw32/
31
+ system "cd #{LIBSTEMMER}; #{make} libstemmer.o; cd #{ROOT};"
32
+ exit unless $? == 0
33
+ end
34
+
35
+ $CFLAGS += " -I#{File.expand_path(File.join(LIBSTEMMER, 'include'))} "
36
+ $libs += " -L#{LIBSTEMMER} #{File.expand_path(File.join(LIBSTEMMER, 'libstemmer.o'))} "
37
+
38
+ if have_header("libstemmer.h")
39
+ create_makefile("lingua/stemmer_native")
40
+ end
@@ -0,0 +1,115 @@
1
+ #include "ruby.h"
2
+ #include <libstemmer.h>
3
+
4
+
5
+ #ifdef HAVE_RUBY_ENCODING_H
6
+ /* Encoding-aware build: ENCODED_STR_NEW2 copies a C string into a new Ruby String and tags it with an encoding. */
7
+ #include <ruby/encoding.h>
8
+ /* Uses a GNU statement expression ({ ... }); `encoding` is handed to rb_enc_get_index to obtain the index to associate. */
9
+ #define ENCODED_STR_NEW2(str, encoding) \
10
+ ({ \
11
+ VALUE _string = rb_str_new2((const char *)str); \
12
+ int _enc = rb_enc_get_index(encoding); \
13
+ rb_enc_associate_index(_string, _enc); \
14
+ _string; \
15
+ })
16
+
17
+ #else
18
+ /* Build without ruby/encoding.h: the `encoding` argument is ignored and a plain String is returned. */
19
+ #define ENCODED_STR_NEW2(str, encoding) \
20
+ rb_str_new2((const char *)str)
21
+
22
+ #endif
23
+
24
+
25
+ VALUE rb_mLingua;
26
+ VALUE rb_cStemmer;
27
+ VALUE rb_eStemmerError;
28
+
29
+ /*
30
+ * Document-method: new
31
+ * call-seq: Lingua::Stemmer.new
32
+ *
33
+ * Creates a new Stemmer, pass <tt>:language</tt> and <tt>:encoding</tt> as arguments
34
+ * to change encoding or language, otherwise english with UTF_8 will be used
35
+ *
36
+ * require 'lingua/stemmer'
37
+ * s = Lingua::Stemmer.new :language => 'fr'
38
+ */
39
+ static VALUE
40
+ rb_stemmer_init(VALUE self, VALUE rlang, VALUE renc) { /* bound to the private Ruby method native_init(language, encoding) */
41
+ struct sb_stemmer * stemmer;
42
+ /* StringValueCStr type-checks its argument and raises TypeError for non-strings. */
43
+ const char * lang = StringValueCStr(rlang);
44
+ /* A nil/false encoding is passed on as NULL, which libstemmer treats as UTF-8. */
45
+ const char * enc = RTEST(renc) ? StringValueCStr(renc) : NULL;
46
+
47
+ Data_Get_Struct(self, struct sb_stemmer, stemmer);
48
+ /* Re-initialisation: free the old stemmer and clear the slot first, so that a
49
+ * failing sb_stemmer_new() below cannot leave a dangling pointer for the GC to free again. */
50
+ if (stemmer) { sb_stemmer_delete(stemmer); DATA_PTR(self) = NULL; }
51
+
52
+ stemmer = sb_stemmer_new(lang, enc);
53
+ if (!stemmer) {
54
+ if (!enc) {
55
+ rb_raise(rb_eStemmerError, "Language %s not available for stemming", lang);
56
+ } else {
57
+ rb_raise(rb_eStemmerError,
58
+ "Language %s not available for stemming in encoding %s", lang, enc);
59
+ }
60
+ }
61
+ DATA_PTR(self) = stemmer;
62
+ return self;
63
+ }
64
+
65
+ /*
66
+ * Document-method: stem
67
+ * call-seq: stem
68
+ *
69
+ * Stems a word
70
+ *
71
+ * require 'lingua/stemmer'
72
+ * s = Lingua::Stemmer.new
73
+ * s.stem "installation" # ==> install
74
+ */
75
+ static VALUE
76
+ rb_stemmer_stem(VALUE self, VALUE word) { /* Lingua::Stemmer#stem(word): returns the stem as a new String */
77
+ struct sb_stemmer * stemmer;
78
+
79
+ Data_Get_Struct(self, struct sb_stemmer, stemmer);
80
+ if(!stemmer) rb_raise(rb_eRuntimeError, "Stemmer is not initialized");
81
+ /* rb_String converts a non-String argument to a String before stemming. */
82
+ VALUE s_word = rb_String(word);
83
+ const sb_symbol * stemmed = sb_stemmer_stem(stemmer,
84
+ (sb_symbol *)RSTRING_PTR(s_word),
85
+ RSTRING_LEN(s_word)
86
+ );
87
+ if (!stemmed) rb_memerror(); /* libstemmer returns NULL when it runs out of memory */
88
+ VALUE rb_enc = rb_iv_get(self, "@encoding");
89
+ return ENCODED_STR_NEW2((char *)stemmed, rb_enc); /* copies the stemmer-owned buffer into a Ruby String */
90
+ }
91
+
92
+ static void /* free callback passed to Data_Wrap_Struct in sb_stemmer_alloc */
93
+ sb_stemmer_free(struct sb_stemmer * stemmer)
94
+ {
95
+ if(stemmer) sb_stemmer_delete(stemmer); /* the wrapped pointer is 0 until native_init stores a stemmer */
96
+ }
97
+
98
+ static VALUE /* allocator registered for Lingua::Stemmer in Init_stemmer_native */
99
+ sb_stemmer_alloc(VALUE klass)
100
+ {
101
+ return Data_Wrap_Struct(klass, 0, sb_stemmer_free, 0); /* no mark function; data pointer starts as 0 and is filled in by rb_stemmer_init */
102
+ }
103
+
104
+ /*
105
+ * Ruby-Stemmer, Ruby extension to SnowBall API using libstemmer_c
106
+ */
107
+ void Init_stemmer_native() { /* extension entry point: defines Lingua, Lingua::Stemmer and Lingua::StemmerError */
108
+ rb_mLingua = rb_define_module("Lingua");
109
+ rb_cStemmer = rb_define_class_under(rb_mLingua, "Stemmer", rb_cObject);
110
+ rb_define_alloc_func(rb_cStemmer, sb_stemmer_alloc);
111
+ rb_eStemmerError = rb_define_class_under(rb_mLingua, "StemmerError", rb_eException); /* NOTE(review): subclass of Exception, so a bare `rescue` will not catch it -- confirm this is intended */
112
+ rb_define_private_method(rb_cStemmer, "native_init", rb_stemmer_init, 2); /* two arguments: language, encoding */
113
+ rb_define_method(rb_cStemmer, "stem", rb_stemmer_stem, 1);
114
+ }
115
+
@@ -0,0 +1,60 @@
1
+ if RUBY_PLATFORM =~/(mswin|mingw)/i
2
+ require "lingua/#{RUBY_VERSION.sub(/\.\d+$/, '')}/stemmer_native"
3
+ else
4
+ require 'lingua/stemmer_native'
5
+ end
6
+
7
+ module Lingua
8
+ def self.stemmer(o, options={})
9
+ stemmer = Stemmer.new(options)
10
+
11
+ words = Array(o).map { |e| e.to_s }
12
+
13
+ results = []
14
+ words.each do |word|
15
+ result = stemmer.stem(word)
16
+ if block_given?
17
+ yield result
18
+ else
19
+ results << result
20
+ end
21
+ end
22
+
23
+ return stemmer if block_given?
24
+ results.length == 1 ? results[0] : results
25
+ end
26
+
27
+ class Stemmer
28
+ VERSION = File.read(File.expand_path(File.join(File.dirname(__FILE__), "..", "..", "VERSION"))).strip!
29
+
30
+ attr_reader :language
31
+ attr_reader :encoding
32
+
33
+ # Creates a new Stemmer, pass <tt>:language</tt> and <tt>:encoding</tt>
34
+ # as arguments to change encoding or language, otherwise english with UTF_8
35
+ # will be used
36
+ #
37
+ # require 'lingua/stemmer'
38
+ # s = Lingua::Stemmer.new :language => 'fr'
39
+ #
40
+ def initialize(options={})
41
+ @language = (options[:language] || 'en').to_s
42
+ @encoding = (options[:encoding] || 'UTF_8').to_s
43
+
44
+ if RUBY_VERSION >= "1.9"
45
+ if not @encoding.is_a?(Encoding)
46
+ @encoding = Encoding.find(@encoding.gsub("_", "-"))
47
+ end
48
+ else
49
+ @encoding = @encoding.upcase.gsub("-", "_")
50
+ end
51
+
52
+ native_init(@language, native_encoding(@encoding))
53
+ end
54
+
55
+ private
56
+ def native_encoding(enc)
57
+ RUBY_VERSION >= "1.9" ? enc.name.gsub('-', '_') : enc
58
+ end
59
+ end
60
+ end
@@ -0,0 +1,72 @@
1
+ README
2
+ src_c/stem_ISO_8859_1_danish.c
3
+ src_c/stem_ISO_8859_1_danish.h
4
+ src_c/stem_ISO_8859_1_dutch.c
5
+ src_c/stem_ISO_8859_1_dutch.h
6
+ src_c/stem_ISO_8859_1_english.c
7
+ src_c/stem_ISO_8859_1_english.h
8
+ src_c/stem_ISO_8859_1_finnish.c
9
+ src_c/stem_ISO_8859_1_finnish.h
10
+ src_c/stem_ISO_8859_1_french.c
11
+ src_c/stem_ISO_8859_1_french.h
12
+ src_c/stem_ISO_8859_1_german.c
13
+ src_c/stem_ISO_8859_1_german.h
14
+ src_c/stem_ISO_8859_1_hungarian.c
15
+ src_c/stem_ISO_8859_1_hungarian.h
16
+ src_c/stem_ISO_8859_1_italian.c
17
+ src_c/stem_ISO_8859_1_italian.h
18
+ src_c/stem_ISO_8859_1_norwegian.c
19
+ src_c/stem_ISO_8859_1_norwegian.h
20
+ src_c/stem_ISO_8859_1_porter.c
21
+ src_c/stem_ISO_8859_1_porter.h
22
+ src_c/stem_ISO_8859_1_portuguese.c
23
+ src_c/stem_ISO_8859_1_portuguese.h
24
+ src_c/stem_ISO_8859_1_spanish.c
25
+ src_c/stem_ISO_8859_1_spanish.h
26
+ src_c/stem_ISO_8859_1_swedish.c
27
+ src_c/stem_ISO_8859_1_swedish.h
28
+ src_c/stem_ISO_8859_2_romanian.c
29
+ src_c/stem_ISO_8859_2_romanian.h
30
+ src_c/stem_KOI8_R_russian.c
31
+ src_c/stem_KOI8_R_russian.h
32
+ src_c/stem_UTF_8_danish.c
33
+ src_c/stem_UTF_8_danish.h
34
+ src_c/stem_UTF_8_dutch.c
35
+ src_c/stem_UTF_8_dutch.h
36
+ src_c/stem_UTF_8_english.c
37
+ src_c/stem_UTF_8_english.h
38
+ src_c/stem_UTF_8_finnish.c
39
+ src_c/stem_UTF_8_finnish.h
40
+ src_c/stem_UTF_8_french.c
41
+ src_c/stem_UTF_8_french.h
42
+ src_c/stem_UTF_8_german.c
43
+ src_c/stem_UTF_8_german.h
44
+ src_c/stem_UTF_8_hungarian.c
45
+ src_c/stem_UTF_8_hungarian.h
46
+ src_c/stem_UTF_8_italian.c
47
+ src_c/stem_UTF_8_italian.h
48
+ src_c/stem_UTF_8_norwegian.c
49
+ src_c/stem_UTF_8_norwegian.h
50
+ src_c/stem_UTF_8_porter.c
51
+ src_c/stem_UTF_8_porter.h
52
+ src_c/stem_UTF_8_portuguese.c
53
+ src_c/stem_UTF_8_portuguese.h
54
+ src_c/stem_UTF_8_romanian.c
55
+ src_c/stem_UTF_8_romanian.h
56
+ src_c/stem_UTF_8_russian.c
57
+ src_c/stem_UTF_8_russian.h
58
+ src_c/stem_UTF_8_spanish.c
59
+ src_c/stem_UTF_8_spanish.h
60
+ src_c/stem_UTF_8_swedish.c
61
+ src_c/stem_UTF_8_swedish.h
62
+ src_c/stem_UTF_8_turkish.c
63
+ src_c/stem_UTF_8_turkish.h
64
+ runtime/api.c
65
+ runtime/api.h
66
+ runtime/header.h
67
+ runtime/utilities.c
68
+ libstemmer/libstemmer.c
69
+ libstemmer/libstemmer_utf8.c
70
+ libstemmer/modules.h
71
+ libstemmer/modules_utf8.h
72
+ include/libstemmer.h