ruby-stemmer 0.9.3-x86-mingw32
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/MIT-LICENSE +21 -0
- data/README.rdoc +113 -0
- data/Rakefile +70 -0
- data/TODO +0 -0
- data/VERSION +1 -0
- data/ext/lingua/extconf.rb +40 -0
- data/ext/lingua/stemmer.c +115 -0
- data/lib/lingua/1.8/stemmer_native.so +0 -0
- data/lib/lingua/1.9/stemmer_native.so +0 -0
- data/lib/lingua/stemmer.rb +60 -0
- data/libstemmer_c/MANIFEST +72 -0
- data/libstemmer_c/Makefile +9 -0
- data/libstemmer_c/Makefile.windows +15 -0
- data/libstemmer_c/README +125 -0
- data/libstemmer_c/examples/stemwords.c +209 -0
- data/libstemmer_c/include/libstemmer.h +79 -0
- data/libstemmer_c/libstemmer/libstemmer.c +93 -0
- data/libstemmer_c/libstemmer/libstemmer_utf8.c +93 -0
- data/libstemmer_c/libstemmer/modules.h +195 -0
- data/libstemmer_c/libstemmer/modules.txt +51 -0
- data/libstemmer_c/libstemmer/modules_utf8.h +123 -0
- data/libstemmer_c/libstemmer/modules_utf8.txt +50 -0
- data/libstemmer_c/mkinc.mak +86 -0
- data/libstemmer_c/mkinc_utf8.mak +54 -0
- data/libstemmer_c/runtime/api.c +66 -0
- data/libstemmer_c/runtime/api.h +26 -0
- data/libstemmer_c/runtime/header.h +58 -0
- data/libstemmer_c/runtime/utilities.c +478 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_danish.c +337 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_danish.h +16 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_dutch.c +624 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_dutch.h +16 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_english.c +1117 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_english.h +16 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_finnish.c +762 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_finnish.h +16 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_french.c +1246 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_french.h +16 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_german.c +503 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_german.h +16 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.c +1230 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.h +16 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_italian.c +1065 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_italian.h +16 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_latin.c +443 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_latin.h +16 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.c +297 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.h +16 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_porter.c +749 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_porter.h +16 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.c +1017 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.h +16 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_spanish.c +1093 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_spanish.h +16 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_swedish.c +307 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_swedish.h +16 -0
- data/libstemmer_c/src_c/stem_ISO_8859_2_romanian.c +998 -0
- data/libstemmer_c/src_c/stem_ISO_8859_2_romanian.h +16 -0
- data/libstemmer_c/src_c/stem_KOI8_R_russian.c +700 -0
- data/libstemmer_c/src_c/stem_KOI8_R_russian.h +16 -0
- data/libstemmer_c/src_c/stem_UTF_8_danish.c +339 -0
- data/libstemmer_c/src_c/stem_UTF_8_danish.h +16 -0
- data/libstemmer_c/src_c/stem_UTF_8_dutch.c +634 -0
- data/libstemmer_c/src_c/stem_UTF_8_dutch.h +16 -0
- data/libstemmer_c/src_c/stem_UTF_8_english.c +1125 -0
- data/libstemmer_c/src_c/stem_UTF_8_english.h +16 -0
- data/libstemmer_c/src_c/stem_UTF_8_finnish.c +768 -0
- data/libstemmer_c/src_c/stem_UTF_8_finnish.h +16 -0
- data/libstemmer_c/src_c/stem_UTF_8_french.c +1256 -0
- data/libstemmer_c/src_c/stem_UTF_8_french.h +16 -0
- data/libstemmer_c/src_c/stem_UTF_8_german.c +509 -0
- data/libstemmer_c/src_c/stem_UTF_8_german.h +16 -0
- data/libstemmer_c/src_c/stem_UTF_8_hungarian.c +1234 -0
- data/libstemmer_c/src_c/stem_UTF_8_hungarian.h +16 -0
- data/libstemmer_c/src_c/stem_UTF_8_italian.c +1073 -0
- data/libstemmer_c/src_c/stem_UTF_8_italian.h +16 -0
- data/libstemmer_c/src_c/stem_UTF_8_latin.c +443 -0
- data/libstemmer_c/src_c/stem_UTF_8_latin.h +16 -0
- data/libstemmer_c/src_c/stem_UTF_8_norwegian.c +299 -0
- data/libstemmer_c/src_c/stem_UTF_8_norwegian.h +16 -0
- data/libstemmer_c/src_c/stem_UTF_8_porter.c +755 -0
- data/libstemmer_c/src_c/stem_UTF_8_porter.h +16 -0
- data/libstemmer_c/src_c/stem_UTF_8_portuguese.c +1023 -0
- data/libstemmer_c/src_c/stem_UTF_8_portuguese.h +16 -0
- data/libstemmer_c/src_c/stem_UTF_8_romanian.c +1004 -0
- data/libstemmer_c/src_c/stem_UTF_8_romanian.h +16 -0
- data/libstemmer_c/src_c/stem_UTF_8_russian.c +694 -0
- data/libstemmer_c/src_c/stem_UTF_8_russian.h +16 -0
- data/libstemmer_c/src_c/stem_UTF_8_spanish.c +1097 -0
- data/libstemmer_c/src_c/stem_UTF_8_spanish.h +16 -0
- data/libstemmer_c/src_c/stem_UTF_8_swedish.c +309 -0
- data/libstemmer_c/src_c/stem_UTF_8_swedish.h +16 -0
- data/libstemmer_c/src_c/stem_UTF_8_turkish.c +2205 -0
- data/libstemmer_c/src_c/stem_UTF_8_turkish.h +16 -0
- data/test/helper.rb +3 -0
- data/test/lingua/test_stemmer.rb +99 -0
- metadata +162 -0
data/MIT-LICENSE
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
Copyright (c) 2008-2011 Aurelian Oancea
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
21
|
+
|
data/README.rdoc
ADDED
@@ -0,0 +1,113 @@
|
|
1
|
+
= Ruby-Stemmer {Project Status}[http://stillmaintained.com/aurelian/ruby-stemmer.png]
|
2
|
+
|
3
|
+
Ruby-Stemmer exposes SnowBall API to Ruby.
|
4
|
+
|
5
|
+
This package includes the libstemmer_c library, released under the BSD licence
|
6
|
+
and available for free {here}[http://snowball.tartarus.org/dist/libstemmer_c.tgz].
|
7
|
+
|
8
|
+
Support for the Latin language is also included; it has been generated with the Snowball compiler using
|
9
|
+
{schinke contribution}[http://snowball.tartarus.org/otherapps/schinke/intro.html].
|
10
|
+
|
11
|
+
For more details about libstemmer_c please visit the {SnowBall website}[http://snowball.tartarus.org].
|
12
|
+
|
13
|
+
== Usage
|
14
|
+
|
15
|
+
require 'rubygems'
|
16
|
+
require 'lingua/stemmer'
|
17
|
+
|
18
|
+
stemmer= Lingua::Stemmer.new(:language => "ro")
|
19
|
+
stemmer.stem("netăgăduit") #=> netăgădu
|
20
|
+
|
21
|
+
=== Alternative
|
22
|
+
|
23
|
+
require 'rubygems'
|
24
|
+
require 'lingua/stemmer'
|
25
|
+
|
26
|
+
Lingua.stemmer( %w(incontestabil neîndoielnic), :language => "ro" ) #=> ["incontest", "neîndoieln"]
|
27
|
+
Lingua.stemmer("installation") #=> "instal"
|
28
|
+
Lingua.stemmer("installation", :language => "fr", :encoding => "ISO_8859_1") do | word |
|
29
|
+
puts "~> #{word}" #=> "instal"
|
30
|
+
end # => #<Lingua::Stemmer:0x102501e48>
|
31
|
+
|
32
|
+
=== Rails
|
33
|
+
|
34
|
+
# Rails2: -- config/environment.rb:
|
35
|
+
config.gem 'ruby-stemmer', :version => '>=0.6.2', :lib => 'lingua/stemmer'
|
36
|
+
|
37
|
+
# Rails3: -- Gemfile
|
38
|
+
gem 'ruby-stemmer', '>=0.8.3', :require => 'lingua/stemmer'
|
39
|
+
|
40
|
+
=== More details
|
41
|
+
|
42
|
+
* Complete API in {RDoc format}[http://rdoc.info/github/aurelian/ruby-stemmer/master/frames]
|
43
|
+
* More usage on the {test file}[http://github.com/aurelian/ruby-stemmer/blob/master/test/lingua/test_stemmer.rb]
|
44
|
+
|
45
|
+
== Install
|
46
|
+
|
47
|
+
=== Standard install with:
|
48
|
+
|
49
|
+
gem install ruby-stemmer
|
50
|
+
|
51
|
+
==== Windows
|
52
|
+
|
53
|
+
There's also a Windows fat binary gem compiled against Ruby 1.9.3 and Ruby 1.8.7.
|
54
|
+
|
55
|
+
gem install ruby-stemmer --platform=x86-mswin32
|
56
|
+
|
57
|
+
{It's known}[http://cl.ly/BX9o] to work under Windows XP.
|
58
|
+
|
59
|
+
=== Development version
|
60
|
+
|
61
|
+
$ git clone git://github.com/aurelian/ruby-stemmer.git
|
62
|
+
$ cd ruby-stemmer
|
63
|
+
$ rake -T #<== see what we've got
|
64
|
+
$ rake compile #<== builds the extension do'h
|
65
|
+
$ rake test
|
66
|
+
|
67
|
+
== NOT A BUG
|
68
|
+
|
69
|
+
The stemming process is an algorithm that finds the stem of a word (not its root).
|
70
|
+
For further reference on stem vs. root, please check the Wikipedia articles on the topic:
|
71
|
+
|
72
|
+
* http://en.wikipedia.org/wiki/Stem_%28linguistics%29
|
73
|
+
* http://en.wikipedia.org/wiki/Root_%28linguistics%29
|
74
|
+
|
75
|
+
== TODO
|
76
|
+
|
77
|
+
* {Open issues}[http://github.com/aurelian/ruby-stemmer/issues]
|
78
|
+
|
79
|
+
== Note on Patches/Pull Requests
|
80
|
+
|
81
|
+
* Fork the project from {github}[http://github.com/aurelian/ruby-stemmer]
|
82
|
+
* Make your feature addition or {bug fix}[http://github.com/aurelian/ruby-stemmer/issues]
|
83
|
+
* Add tests for it. This is important so I don't break it in a
|
84
|
+
future version unintentionally.
|
85
|
+
* Commit, do not mess with rakefile, version, or history.
|
86
|
+
|
87
|
+
if you want to have your own version, that is fine but
|
88
|
+
bump version in a commit by itself I can ignore when I pull
|
89
|
+
* Send me a pull request. Bonus points for topic branches.
|
90
|
+
|
91
|
+
== Alternative Stemmers for Ruby
|
92
|
+
|
93
|
+
* {stemmer4r}[http://rubyforge.org/projects/stemmer4r] (ext)
|
94
|
+
* {fast-stemmer}[http://github.com/romanbsd/fast-stemmer] (ext)
|
95
|
+
* {uea-stemmer}[http://github.com/ealdent/uea-stemmer] (ext)
|
96
|
+
* {stemmer}[http://rubyforge.org/projects/stemmer] (pure ruby)
|
97
|
+
* add yours
|
98
|
+
|
99
|
+
== Copyright
|
100
|
+
|
101
|
+
Copyright (c) 2008-2011 {Aurelian Oancea}[http://locknet.ro]. See MIT-LICENSE for details.
|
102
|
+
|
103
|
+
== Contributors
|
104
|
+
|
105
|
+
* {Aurelian Oancea}[https://github.com/aurelian]
|
106
|
+
* {Yury Korolev}[https://github.com/yury] - various bug fixes
|
107
|
+
* {Aaron Patterson}[https://github.com/tenderlove] - rake compiler (windows support), code cleanup
|
108
|
+
* {Damián Silvani}[https://github.com/munshkr] - Ruby 1.9 encoding
|
109
|
+
|
110
|
+
== Real life usage
|
111
|
+
* http://planet33.ru is using Ruby-Stemmer together with {Classifier}[http://github.com/yury/classifier] to automatically rate places based on users comments.
|
112
|
+
|
113
|
+
# encoding: utf-8
|
data/Rakefile
ADDED
@@ -0,0 +1,70 @@
|
|
1
|
+
require 'rubygems'
require 'rake'

gem 'rake-compiler', '~>0.7'
require 'rake/extensiontask'

require 'jeweler'

# Gem specification. The version is read from the VERSION file that sits next
# to this Rakefile.
$jeweler = Jeweler::Tasks.new do |gem|
  gem.name = "ruby-stemmer"
  # String#strip is used instead of strip!: strip! returns nil when the string
  # has no surrounding whitespace, which would set the version to nil.
  gem.version = File.read(File.expand_path(File.join(File.dirname(__FILE__),"VERSION"))).strip
  gem.summary = %Q{Expose libstemmer_c to Ruby.}
  gem.description = %Q{Expose the bundled libstemmer_c library to Ruby.}
  gem.email = "oancea@gmail.com"
  gem.homepage = "http://github.com/aurelian/ruby-stemmer"
  gem.authors = ["Aurelian Oancea", "Yury Korolev"]
  gem.extensions = ["ext/lingua/extconf.rb"]
  gem.rubyforge_project = "ruby-stemmer"
  gem.files = FileList['lib/**/*.rb', 'README.rdoc', 'MIT-LICENSE', 'VERSION', 'Rakefile', 'libstemmer_c/**/*', 'ext/**/*', 'test/**/*']
  # Build artefacts must not be shipped inside the gem.
  %w(ext/lingua/*.so ext/lingua/*.bundle ext/lingua/Makefile ext/lingua/mkmf.log ext/lingua/*.o libstemmer_c/**/*.o libstemmer_c/stemwords).each do | f |
    gem.files.exclude f
  end
end

Jeweler::GemcutterTasks.new

require 'rake/testtask'
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  test.pattern = 'test/**/test_*.rb'
  test.verbose = true
end

# rcov is optional: when it is not installed, the :rcov task only explains
# how to install it.
begin
  require 'rcov/rcovtask'
  Rcov::RcovTask.new do |test|
    test.libs << 'test'
    test.pattern = 'test/**/test_*.rb'
    test.verbose = true
  end
rescue LoadError
  task :rcov do
    abort "RCov is not available. In order to run rcov, you must: gem install rcov"
  end
end

task :test => :check_dependencies

task :default => :test

CLOBBER.include("libstemmer_c/**/*.o")

# Native extension build (including the Windows cross-compilation targets).
# FAT_DIR, when set, selects a per-Ruby-version sub-directory of lib/lingua.
Rake::ExtensionTask.new('ruby-stemmer', $jeweler.jeweler.gemspec) do |ext|
  ext.lib_dir = File.join(*['lib', 'lingua', ENV['FAT_DIR']].compact)
  ext.ext_dir = File.join 'ext', 'lingua'
  ext.cross_compile = true
  ext.cross_platform = ['i386-mswin32-60', 'i386-mingw32']
  ext.name = 'stemmer_native'
end

require 'rdoc/task'
Rake::RDocTask.new do |rdoc|
  # Stripped so the trailing newline of VERSION does not end up in the title.
  version = File.exist?('VERSION') ? File.read('VERSION').strip : ""
  rdoc.rdoc_dir = 'rdoc'
  rdoc.options << '--charset' << 'utf-8'
  rdoc.title = "Ruby-Stemmer #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
  rdoc.rdoc_files.include('ext/lingua/stemmer.c')
  rdoc.rdoc_files.include('MIT-LICENSE')
end
|
data/TODO
ADDED
File without changes
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.9.3
|
@@ -0,0 +1,40 @@
|
|
1
|
+
# Build configuration for the stemmer_native extension: builds the bundled
# libstemmer_c object file and generates the extension Makefile.
ENV['RC_ARCHS'] = '' if RUBY_PLATFORM =~ /darwin/
require "mkmf"

ROOT = File.expand_path(File.join(File.dirname(__FILE__), '..', '..'))
LIBSTEMMER = File.join(ROOT, 'libstemmer_c')

# build libstemmer_c
# FreeBSD make is gmake
make= (RUBY_PLATFORM =~ /freebsd/)? 'gmake' : 'make'

# MacOS architecture mess up
if RUBY_PLATFORM =~ /darwin/
  # see: #issue/3, #issue/5
  begin
    # NOTE(review): detection only runs when ARCHFLAGS is already set, which
    # looks inverted relative to the error message below -- behaviour kept
    # as-is, confirm the intent.
    unless ENV['ARCHFLAGS'].nil?
      # RbConfig replaces the deprecated Config constant.
      ruby_bin = File.expand_path(File.join(RbConfig::CONFIG['bindir'], RbConfig::CONFIG['RUBY_INSTALL_NAME']))
      # strip (not strip!) so an already-trimmed string does not become nil.
      ENV['ARCHFLAGS']= "-arch " + %x[file #{ruby_bin}].strip.match(/executable (.+)$/)[1]
    end
  rescue
    $stderr << "Failed to get your ruby executable architecture.\n"
    $stderr << "Please specify one using $ARCHFLAGS environment variable.\n"
    # Non-zero status so the failure is visible to the caller.
    exit 1
  end
  # see: #issue/9, #issue/6
  # see: man compat
  if ENV['COMMAND_MODE'] == 'legacy'
    $stdout << "Setting compat mode to unix2003.\n"
    ENV['COMMAND_MODE']= 'unix2003'
  end
end

# make libstemmer_c. unless we're cross-compiling.
unless RUBY_PLATFORM =~ /i386-mingw32/
  # Run make inside the libstemmer_c directory and check *its* result; the
  # previous "cd ...; make ...; cd ...;" shell line reported the status of the
  # trailing cd, hiding build failures. Dir.chdir with a block restores the
  # working directory afterwards.
  built = Dir.chdir(LIBSTEMMER) { system(make, 'libstemmer.o') }
  exit 1 unless built
end

$CFLAGS += " -I#{File.expand_path(File.join(LIBSTEMMER, 'include'))} "
$libs += " -L#{LIBSTEMMER} #{File.expand_path(File.join(LIBSTEMMER, 'libstemmer.o'))} "

if have_header("libstemmer.h")
  create_makefile("lingua/stemmer_native")
end
|
@@ -0,0 +1,115 @@
|
|
1
|
+
#include "ruby.h"
|
2
|
+
#include <libstemmer.h>
|
3
|
+
|
4
|
+
|
5
|
+
#ifdef HAVE_RUBY_ENCODING_H

#include <ruby/encoding.h>

/*
 * Builds a Ruby String from the NUL-terminated C string `str` and associates
 * it with the encoding index obtained from `encoding` through
 * rb_enc_get_index(). Written as a GNU statement expression so it yields a
 * value; both arguments are parenthesized and expanded exactly once.
 */
#define ENCODED_STR_NEW2(str, encoding) \
  ({ \
    VALUE _string = rb_str_new2((const char *)(str)); \
    int _enc = rb_enc_get_index((encoding)); \
    rb_enc_associate_index(_string, _enc); \
    _string; \
  })

#else

/* ruby/encoding.h is unavailable: `encoding` is ignored. */
#define ENCODED_STR_NEW2(str, encoding) \
  rb_str_new2((const char *)(str))

#endif
|
23
|
+
|
24
|
+
|
25
|
+
VALUE rb_mLingua;
|
26
|
+
VALUE rb_cStemmer;
|
27
|
+
VALUE rb_eStemmerError;
|
28
|
+
|
29
|
+
/*
|
30
|
+
* Document-method: new
|
31
|
+
* call-seq: Lingua::Stemmer.new
|
32
|
+
*
|
33
|
+
* Creates a new Stemmer, pass <tt>:language</tt> and <tt>:encoding</tt> as arguments
|
34
|
+
* to change encoding or language, otherwise english with UTF_8 will be used
|
35
|
+
*
|
36
|
+
* require 'lingua/stemmer'
|
37
|
+
* s = Lingua::Stemmer.new :language => 'fr'
|
38
|
+
*/
|
39
|
+
static VALUE
|
40
|
+
rb_stemmer_init(VALUE self, VALUE rlang, VALUE renc) {
|
41
|
+
struct sb_stemmer * stemmer;
|
42
|
+
|
43
|
+
Data_Get_Struct(self, struct sb_stemmer, stemmer);
|
44
|
+
|
45
|
+
// In case someone sends() this method, free up the old one
|
46
|
+
if(stemmer) sb_stemmer_delete(stemmer);
|
47
|
+
|
48
|
+
stemmer = sb_stemmer_new( RSTRING_PTR(rlang), RSTRING_PTR(renc) );
|
49
|
+
if (!stemmer) {
|
50
|
+
if (!RTEST(renc)) {
|
51
|
+
rb_raise(rb_eStemmerError,
|
52
|
+
"Language %s not available for stemming", RSTRING_PTR(rlang));
|
53
|
+
} else {
|
54
|
+
rb_raise(rb_eStemmerError,
|
55
|
+
"Language %s not available for stemming in encoding %s",
|
56
|
+
RSTRING_PTR(rlang), RSTRING_PTR(renc));
|
57
|
+
}
|
58
|
+
}
|
59
|
+
|
60
|
+
DATA_PTR(self) = stemmer;
|
61
|
+
|
62
|
+
return self;
|
63
|
+
}
|
64
|
+
|
65
|
+
/*
|
66
|
+
* Document-method: stem
|
67
|
+
* call-seq: stem
|
68
|
+
*
|
69
|
+
* Stems a word
|
70
|
+
*
|
71
|
+
* require 'lingua/stemmer'
|
72
|
+
* s = Lingua::Stemmer.new
|
73
|
+
* s.stem "installation" # ==> install
|
74
|
+
*/
|
75
|
+
static VALUE
|
76
|
+
rb_stemmer_stem(VALUE self, VALUE word) {
|
77
|
+
struct sb_stemmer * stemmer;
|
78
|
+
|
79
|
+
Data_Get_Struct(self, struct sb_stemmer, stemmer);
|
80
|
+
if(!stemmer) rb_raise(rb_eRuntimeError, "Stemmer is not initialized");
|
81
|
+
|
82
|
+
VALUE s_word = rb_String(word);
|
83
|
+
const sb_symbol * stemmed = sb_stemmer_stem(stemmer,
|
84
|
+
(sb_symbol *)RSTRING_PTR(s_word),
|
85
|
+
RSTRING_LEN(s_word)
|
86
|
+
);
|
87
|
+
|
88
|
+
VALUE rb_enc = rb_iv_get(self, "@encoding");
|
89
|
+
return ENCODED_STR_NEW2((char *)stemmed, rb_enc);
|
90
|
+
}
|
91
|
+
|
92
|
+
/*
 * Free callback handed to Data_Wrap_Struct: releases the wrapped libstemmer
 * handle, if there is one.
 */
static void
sb_stemmer_free(struct sb_stemmer * stemmer)
{
  if (!stemmer) {
    return;
  }

  sb_stemmer_delete(stemmer);
}
|
97
|
+
|
98
|
+
static VALUE
|
99
|
+
sb_stemmer_alloc(VALUE klass)
|
100
|
+
{
|
101
|
+
return Data_Wrap_Struct(klass, 0, sb_stemmer_free, 0);
|
102
|
+
}
|
103
|
+
|
104
|
+
/*
|
105
|
+
* Ruby-Stemmer, Ruby extension to SnowBall API using libstemmer_c
|
106
|
+
*/
|
107
|
+
void Init_stemmer_native() {
|
108
|
+
rb_mLingua = rb_define_module("Lingua");
|
109
|
+
rb_cStemmer = rb_define_class_under(rb_mLingua, "Stemmer", rb_cObject);
|
110
|
+
rb_define_alloc_func(rb_cStemmer, sb_stemmer_alloc);
|
111
|
+
rb_eStemmerError = rb_define_class_under(rb_mLingua, "StemmerError", rb_eException);
|
112
|
+
rb_define_private_method(rb_cStemmer, "native_init", rb_stemmer_init, 2);
|
113
|
+
rb_define_method(rb_cStemmer, "stem", rb_stemmer_stem, 1);
|
114
|
+
}
|
115
|
+
|
Binary file
|
Binary file
|
@@ -0,0 +1,60 @@
|
|
1
|
+
# Load the native extension. On Windows the binary lives in a directory named
# after the Ruby "major.minor" version (e.g. lingua/1.9/stemmer_native); when
# that version-specific binary is not present, fall back to the plain
# location used by every other platform.
if RUBY_PLATFORM =~/(mswin|mingw)/i
  begin
    require "lingua/#{RUBY_VERSION.sub(/\.\d+$/, '')}/stemmer_native"
  rescue LoadError
    require 'lingua/stemmer_native'
  end
else
  require 'lingua/stemmer_native'
end
|
6
|
+
|
7
|
+
module Lingua
  # Stems +o+ (a single word or a list of words; every element is converted
  # with +to_s+) using a Stemmer built from +options+.
  #
  # With a block, each stemmed word is yielded and the Stemmer instance is
  # returned. Without a block, the stemmed word is returned when there is
  # exactly one, otherwise an Array of stemmed words.
  #
  #   Lingua.stemmer("installation")                     #=> "instal"
  #   Lingua.stemmer(%w(one two), :language => "en")     #=> [..., ...]
  #
  def self.stemmer(o, options={})
    stemmer = Stemmer.new(options)

    words = Array(o).map { |e| e.to_s }

    results = []
    words.each do |word|
      result = stemmer.stem(word)
      if block_given?
        yield result
      else
        results << result
      end
    end

    return stemmer if block_given?
    results.length == 1 ? results[0] : results
  end

  class Stemmer
    # Gem version, read from the VERSION file two directories above this one.
    # String#strip is used because strip! returns nil when there is nothing
    # to strip, which would leave VERSION set to nil.
    VERSION = File.read(File.expand_path(File.join(File.dirname(__FILE__), "..", "..", "VERSION"))).strip

    attr_reader :language
    attr_reader :encoding

    # Creates a new Stemmer, pass <tt>:language</tt> and <tt>:encoding</tt>
    # as arguments to change encoding or language, otherwise english with UTF_8
    # will be used
    #
    #    require 'lingua/stemmer'
    #    s = Lingua::Stemmer.new :language => 'fr'
    #
    def initialize(options={})
      @language = (options[:language] || 'en').to_s
      @encoding = (options[:encoding] || 'UTF_8').to_s

      if RUBY_VERSION >= "1.9"
        # @encoding is always a String at this point (see to_s above), so it
        # is always resolved to an Encoding object; Ruby spells encoding
        # names with dashes.
        @encoding = Encoding.find(@encoding.gsub("_", "-"))
      else
        @encoding = @encoding.upcase.gsub("-", "_")
      end

      native_init(@language, native_encoding(@encoding))
    end

    private
    # Name of +enc+ in the underscore form passed to native_init
    # (e.g. "UTF_8"): derived from Encoding#name on Ruby >= 1.9, and +enc+
    # itself (already a String) on older Rubies.
    def native_encoding(enc)
      RUBY_VERSION >= "1.9" ? enc.name.gsub('-', '_') : enc
    end
  end
end
|
@@ -0,0 +1,72 @@
|
|
1
|
+
README
|
2
|
+
src_c/stem_ISO_8859_1_danish.c
|
3
|
+
src_c/stem_ISO_8859_1_danish.h
|
4
|
+
src_c/stem_ISO_8859_1_dutch.c
|
5
|
+
src_c/stem_ISO_8859_1_dutch.h
|
6
|
+
src_c/stem_ISO_8859_1_english.c
|
7
|
+
src_c/stem_ISO_8859_1_english.h
|
8
|
+
src_c/stem_ISO_8859_1_finnish.c
|
9
|
+
src_c/stem_ISO_8859_1_finnish.h
|
10
|
+
src_c/stem_ISO_8859_1_french.c
|
11
|
+
src_c/stem_ISO_8859_1_french.h
|
12
|
+
src_c/stem_ISO_8859_1_german.c
|
13
|
+
src_c/stem_ISO_8859_1_german.h
|
14
|
+
src_c/stem_ISO_8859_1_hungarian.c
|
15
|
+
src_c/stem_ISO_8859_1_hungarian.h
|
16
|
+
src_c/stem_ISO_8859_1_italian.c
|
17
|
+
src_c/stem_ISO_8859_1_italian.h
|
18
|
+
src_c/stem_ISO_8859_1_norwegian.c
|
19
|
+
src_c/stem_ISO_8859_1_norwegian.h
|
20
|
+
src_c/stem_ISO_8859_1_porter.c
|
21
|
+
src_c/stem_ISO_8859_1_porter.h
|
22
|
+
src_c/stem_ISO_8859_1_portuguese.c
|
23
|
+
src_c/stem_ISO_8859_1_portuguese.h
|
24
|
+
src_c/stem_ISO_8859_1_spanish.c
|
25
|
+
src_c/stem_ISO_8859_1_spanish.h
|
26
|
+
src_c/stem_ISO_8859_1_swedish.c
|
27
|
+
src_c/stem_ISO_8859_1_swedish.h
|
28
|
+
src_c/stem_ISO_8859_2_romanian.c
|
29
|
+
src_c/stem_ISO_8859_2_romanian.h
|
30
|
+
src_c/stem_KOI8_R_russian.c
|
31
|
+
src_c/stem_KOI8_R_russian.h
|
32
|
+
src_c/stem_UTF_8_danish.c
|
33
|
+
src_c/stem_UTF_8_danish.h
|
34
|
+
src_c/stem_UTF_8_dutch.c
|
35
|
+
src_c/stem_UTF_8_dutch.h
|
36
|
+
src_c/stem_UTF_8_english.c
|
37
|
+
src_c/stem_UTF_8_english.h
|
38
|
+
src_c/stem_UTF_8_finnish.c
|
39
|
+
src_c/stem_UTF_8_finnish.h
|
40
|
+
src_c/stem_UTF_8_french.c
|
41
|
+
src_c/stem_UTF_8_french.h
|
42
|
+
src_c/stem_UTF_8_german.c
|
43
|
+
src_c/stem_UTF_8_german.h
|
44
|
+
src_c/stem_UTF_8_hungarian.c
|
45
|
+
src_c/stem_UTF_8_hungarian.h
|
46
|
+
src_c/stem_UTF_8_italian.c
|
47
|
+
src_c/stem_UTF_8_italian.h
|
48
|
+
src_c/stem_UTF_8_norwegian.c
|
49
|
+
src_c/stem_UTF_8_norwegian.h
|
50
|
+
src_c/stem_UTF_8_porter.c
|
51
|
+
src_c/stem_UTF_8_porter.h
|
52
|
+
src_c/stem_UTF_8_portuguese.c
|
53
|
+
src_c/stem_UTF_8_portuguese.h
|
54
|
+
src_c/stem_UTF_8_romanian.c
|
55
|
+
src_c/stem_UTF_8_romanian.h
|
56
|
+
src_c/stem_UTF_8_russian.c
|
57
|
+
src_c/stem_UTF_8_russian.h
|
58
|
+
src_c/stem_UTF_8_spanish.c
|
59
|
+
src_c/stem_UTF_8_spanish.h
|
60
|
+
src_c/stem_UTF_8_swedish.c
|
61
|
+
src_c/stem_UTF_8_swedish.h
|
62
|
+
src_c/stem_UTF_8_turkish.c
|
63
|
+
src_c/stem_UTF_8_turkish.h
|
64
|
+
runtime/api.c
|
65
|
+
runtime/api.h
|
66
|
+
runtime/header.h
|
67
|
+
runtime/utilities.c
|
68
|
+
libstemmer/libstemmer.c
|
69
|
+
libstemmer/libstemmer_utf8.c
|
70
|
+
libstemmer/modules.h
|
71
|
+
libstemmer/modules_utf8.h
|
72
|
+
include/libstemmer.h
|