ruby-stemmer-dimelo 0.9.3.dimelo1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/MIT-LICENSE +21 -0
- data/README.rdoc +117 -0
- data/Rakefile +70 -0
- data/VERSION +1 -0
- data/ext/lingua/extconf.rb +40 -0
- data/ext/lingua/stemmer.c +115 -0
- data/lib/lingua/stemmer.rb +60 -0
- data/libstemmer_c/MANIFEST +72 -0
- data/libstemmer_c/Makefile +9 -0
- data/libstemmer_c/Makefile.windows +15 -0
- data/libstemmer_c/README +125 -0
- data/libstemmer_c/examples/stemwords.c +209 -0
- data/libstemmer_c/include/libstemmer.h +79 -0
- data/libstemmer_c/libstemmer/libstemmer.c +93 -0
- data/libstemmer_c/libstemmer/libstemmer_utf8.c +93 -0
- data/libstemmer_c/libstemmer/modules.h +195 -0
- data/libstemmer_c/libstemmer/modules.txt +51 -0
- data/libstemmer_c/libstemmer/modules_utf8.h +123 -0
- data/libstemmer_c/libstemmer/modules_utf8.txt +50 -0
- data/libstemmer_c/mkinc.mak +86 -0
- data/libstemmer_c/mkinc_utf8.mak +54 -0
- data/libstemmer_c/runtime/api.c +66 -0
- data/libstemmer_c/runtime/api.h +26 -0
- data/libstemmer_c/runtime/header.h +58 -0
- data/libstemmer_c/runtime/utilities.c +478 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_danish.c +337 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_danish.h +16 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_dutch.c +624 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_dutch.h +16 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_english.c +1117 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_english.h +16 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_finnish.c +762 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_finnish.h +16 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_french.c +1230 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_french.h +16 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_german.c +503 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_german.h +16 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.c +1230 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.h +16 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_italian.c +1065 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_italian.h +16 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_latin.c +443 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_latin.h +16 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.c +297 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.h +16 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_porter.c +749 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_porter.h +16 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.c +1017 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.h +16 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_spanish.c +1093 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_spanish.h +16 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_swedish.c +307 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_swedish.h +16 -0
- data/libstemmer_c/src_c/stem_ISO_8859_2_romanian.c +998 -0
- data/libstemmer_c/src_c/stem_ISO_8859_2_romanian.h +16 -0
- data/libstemmer_c/src_c/stem_KOI8_R_russian.c +700 -0
- data/libstemmer_c/src_c/stem_KOI8_R_russian.h +16 -0
- data/libstemmer_c/src_c/stem_UTF_8_danish.c +339 -0
- data/libstemmer_c/src_c/stem_UTF_8_danish.h +16 -0
- data/libstemmer_c/src_c/stem_UTF_8_dutch.c +634 -0
- data/libstemmer_c/src_c/stem_UTF_8_dutch.h +16 -0
- data/libstemmer_c/src_c/stem_UTF_8_english.c +1125 -0
- data/libstemmer_c/src_c/stem_UTF_8_english.h +16 -0
- data/libstemmer_c/src_c/stem_UTF_8_finnish.c +768 -0
- data/libstemmer_c/src_c/stem_UTF_8_finnish.h +16 -0
- data/libstemmer_c/src_c/stem_UTF_8_french.c +1230 -0
- data/libstemmer_c/src_c/stem_UTF_8_french.h +16 -0
- data/libstemmer_c/src_c/stem_UTF_8_german.c +509 -0
- data/libstemmer_c/src_c/stem_UTF_8_german.h +16 -0
- data/libstemmer_c/src_c/stem_UTF_8_hungarian.c +1234 -0
- data/libstemmer_c/src_c/stem_UTF_8_hungarian.h +16 -0
- data/libstemmer_c/src_c/stem_UTF_8_italian.c +1073 -0
- data/libstemmer_c/src_c/stem_UTF_8_italian.h +16 -0
- data/libstemmer_c/src_c/stem_UTF_8_latin.c +443 -0
- data/libstemmer_c/src_c/stem_UTF_8_latin.h +16 -0
- data/libstemmer_c/src_c/stem_UTF_8_norwegian.c +299 -0
- data/libstemmer_c/src_c/stem_UTF_8_norwegian.h +16 -0
- data/libstemmer_c/src_c/stem_UTF_8_porter.c +755 -0
- data/libstemmer_c/src_c/stem_UTF_8_porter.h +16 -0
- data/libstemmer_c/src_c/stem_UTF_8_portuguese.c +1023 -0
- data/libstemmer_c/src_c/stem_UTF_8_portuguese.h +16 -0
- data/libstemmer_c/src_c/stem_UTF_8_romanian.c +1004 -0
- data/libstemmer_c/src_c/stem_UTF_8_romanian.h +16 -0
- data/libstemmer_c/src_c/stem_UTF_8_russian.c +694 -0
- data/libstemmer_c/src_c/stem_UTF_8_russian.h +16 -0
- data/libstemmer_c/src_c/stem_UTF_8_spanish.c +1097 -0
- data/libstemmer_c/src_c/stem_UTF_8_spanish.h +16 -0
- data/libstemmer_c/src_c/stem_UTF_8_swedish.c +309 -0
- data/libstemmer_c/src_c/stem_UTF_8_swedish.h +16 -0
- data/libstemmer_c/src_c/stem_UTF_8_turkish.c +2205 -0
- data/libstemmer_c/src_c/stem_UTF_8_turkish.h +16 -0
- data/test/helper.rb +3 -0
- data/test/lingua/test_stemmer.rb +99 -0
- metadata +141 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 210c16040fce7de4293c411dd0139091debd99e8
|
4
|
+
data.tar.gz: 9a64fdd53a0c383fd286974a6cf57dbd06497e6a
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 464276d630c2540b6db3f1e2c12f30a0e0a6d08195940673fbc01f8e6ec643bf5bc32778b081afcd1d2a3a950d94bacc5ff589865e156dddce619930aa1b5cea
|
7
|
+
data.tar.gz: d72d91388301b5835a92add0b298d9648082fe4d44a97e0a3360f1af828c31cc49f84586be7348dc9872dd6bcfec2fa4835697fa4428d5535f9d8e3725f59876
|
data/MIT-LICENSE
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
Copyright (c) 2008-2011 Aurelian Oancea
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
21
|
+
|
data/README.rdoc
ADDED
@@ -0,0 +1,117 @@
|
|
1
|
+
= Ruby-Stemmer {Project Status}[http://stillmaintained.com/aurelian/ruby-stemmer.png]
|
2
|
+
|
3
|
+
Ruby-Stemmer exposes SnowBall API to Ruby.
|
4
|
+
|
5
|
+
This package includes libstemmer_c library released under BSD licence
|
6
|
+
and available for free {here}[http://snowball.tartarus.org/dist/libstemmer_c.tgz].
|
7
|
+
|
8
|
+
Support for the Latin language is also included; it has been generated with the Snowball compiler using
|
9
|
+
{schinke contribution}[http://snowball.tartarus.org/otherapps/schinke/intro.html].
|
10
|
+
|
11
|
+
For more details about libstemmer_c please visit the {SnowBall website}[http://snowball.tartarus.org].
|
12
|
+
|
13
|
+
== Usage
|
14
|
+
|
15
|
+
require 'rubygems'
|
16
|
+
require 'lingua/stemmer'
|
17
|
+
|
18
|
+
stemmer= Lingua::Stemmer.new(:language => "ro")
|
19
|
+
stemmer.stem("netăgăduit") #=> netăgădu
|
20
|
+
|
21
|
+
=== Alternative
|
22
|
+
|
23
|
+
require 'rubygems'
|
24
|
+
require 'lingua/stemmer'
|
25
|
+
|
26
|
+
Lingua.stemmer( %w(incontestabil neîndoielnic), :language => "ro" ) #=> ["incontest", "neîndoieln"]
|
27
|
+
Lingua.stemmer("installation") #=> "instal"
|
28
|
+
Lingua.stemmer("installation", :language => "fr", :encoding => "ISO_8859_1") do | word |
|
29
|
+
puts "~> #{word}" #=> "instal"
|
30
|
+
end # => #<Lingua::Stemmer:0x102501e48>
|
31
|
+
|
32
|
+
=== Rails
|
33
|
+
|
34
|
+
# Rails2: -- config/environment.rb:
|
35
|
+
config.gem 'ruby-stemmer', :version => '>=0.6.2', :lib => 'lingua/stemmer'
|
36
|
+
|
37
|
+
# Rails3: -- Gemfile
|
38
|
+
gem 'ruby-stemmer', '>=0.8.3', :require => 'lingua/stemmer'
|
39
|
+
|
40
|
+
=== More details
|
41
|
+
|
42
|
+
* Complete API in {RDoc format}[http://rdoc.info/github/aurelian/ruby-stemmer/master/frames]
|
43
|
+
* More usage on the {test file}[http://github.com/aurelian/ruby-stemmer/blob/master/test/lingua/test_stemmer.rb]
|
44
|
+
|
45
|
+
== Install
|
46
|
+
|
47
|
+
=== Standard install with:
|
48
|
+
|
49
|
+
gem install ruby-stemmer
|
50
|
+
|
51
|
+
==== Windows
|
52
|
+
|
53
|
+
There's also a Windows (Fat bin) compiled against ruby 1.9.3 and ruby 1.8.7.
|
54
|
+
|
55
|
+
gem install ruby-stemmer --platform=x86-mingw32
|
56
|
+
|
57
|
+
As far as I know the above should work with {rubyinstaller}[http://rubyinstaller.org/]. If it fails, you could try with:
|
58
|
+
|
59
|
+
gem install ruby-stemmer --platform=x86-mswin32
|
60
|
+
|
61
|
+
{It's known}[http://cl.ly/BX9o] to work under Windows XP.
|
62
|
+
|
63
|
+
=== Development version
|
64
|
+
|
65
|
+
$ git clone git://github.com/aurelian/ruby-stemmer.git
|
66
|
+
$ cd ruby-stemmer
|
67
|
+
$ rake -T #<== see what we've got
|
68
|
+
$ rake compile #<== builds the extension do'h
|
69
|
+
$ rake test
|
70
|
+
|
71
|
+
== NOT A BUG
|
72
|
+
|
73
|
+
The stemming process is an algorithm that finds the stem of a word (not its root).
|
74
|
+
For further reference on stem vs. root, please check wikipedia articles on the topic:
|
75
|
+
|
76
|
+
* http://en.wikipedia.org/wiki/Stem_%28linguistics%29
|
77
|
+
* http://en.wikipedia.org/wiki/Root_%28linguistics%29
|
78
|
+
|
79
|
+
== TODO
|
80
|
+
|
81
|
+
* {Open issues}[http://github.com/aurelian/ruby-stemmer/issues]
|
82
|
+
|
83
|
+
== Note on Patches/Pull Requests
|
84
|
+
|
85
|
+
* Fork the project from {github}[http://github.com/aurelian/ruby-stemmer]
|
86
|
+
* Make your feature addition or {bug fix}[http://github.com/aurelian/ruby-stemmer/issues]
|
87
|
+
* Add tests for it. This is important so I don't break it in a
|
88
|
+
future version unintentionally.
|
89
|
+
* Commit, do not mess with rakefile, version, or history.
|
90
|
+
|
91
|
+
if you want to have your own version, that is fine but
|
92
|
+
bump version in a commit by itself I can ignore when I pull
|
93
|
+
* Send me a pull request. Bonus points for topic branches.
|
94
|
+
|
95
|
+
== Alternative Stemmers for Ruby
|
96
|
+
|
97
|
+
* {stemmer4r}[http://rubyforge.org/projects/stemmer4r] (ext)
|
98
|
+
* {fast-stemmer}[http://github.com/romanbsd/fast-stemmer] (ext)
|
99
|
+
* {uea-stemmer}[http://github.com/ealdent/uea-stemmer] (ext)
|
100
|
+
* {stemmer}[http://rubyforge.org/projects/stemmer] (pure ruby)
|
101
|
+
* add yours
|
102
|
+
|
103
|
+
== Copyright
|
104
|
+
|
105
|
+
Copyright (c) 2008-2011 {Aurelian Oancea}[http://locknet.ro]. See MIT-LICENSE for details.
|
106
|
+
|
107
|
+
== Contributors
|
108
|
+
|
109
|
+
* {Aurelian Oancea}[https://github.com/aurelian]
|
110
|
+
* {Yury Korolev}[https://github.com/yury] - various bug fixes
|
111
|
+
* {Aaron Patterson}[https://github.com/tenderlove] - rake compiler (windows support), code cleanup
|
112
|
+
* {Damián Silvani}[https://github.com/munshkr] - Ruby 1.9 encoding
|
113
|
+
|
114
|
+
== Real life usage
|
115
|
+
* http://planet33.ru is using Ruby-Stemmer together with {Classifier}[http://github.com/yury/classifier] to automatically rate places based on users' comments.
|
116
|
+
|
117
|
+
# encoding: utf-8
|
data/Rakefile
ADDED
@@ -0,0 +1,70 @@
|
|
1
|
+
# Rakefile for ruby-stemmer: gem packaging (jeweler), tests, coverage (rcov),
# native extension compilation (rake-compiler) and RDoc generation.
require 'rubygems'
require 'rake'

gem 'rake-compiler', '~>0.7'
require 'rake/extensiontask'

require 'jeweler'
$jeweler = Jeweler::Tasks.new do |gem|
  gem.name = "ruby-stemmer"
  # Use String#strip, not strip!: strip! returns nil when there is nothing to
  # strip (e.g. a VERSION file without a trailing newline), which would leave
  # the gem without a version.
  gem.version = File.read(File.expand_path(File.join(File.dirname(__FILE__), "VERSION"))).strip
  gem.summary = %Q{Expose libstemmer_c to Ruby.}
  gem.description = %Q{Expose the bundled libstemmer_c library to Ruby.}
  gem.email = "oancea@gmail.com"
  gem.homepage = "http://github.com/aurelian/ruby-stemmer"
  gem.authors = ["Aurelian Oancea", "Yury Korolev"]
  gem.extensions = ["ext/lingua/extconf.rb"]
  gem.rubyforge_project = "ruby-stemmer"
  gem.files = FileList['lib/**/*.rb', 'README.rdoc', 'MIT-LICENSE', 'VERSION', 'Rakefile', 'libstemmer_c/**/*', 'ext/**/*', 'test/**/*']
  # Keep build artefacts out of the packaged gem.
  %w(ext/lingua/*.so ext/lingua/*.bundle ext/lingua/Makefile ext/lingua/mkmf.log ext/lingua/*.o libstemmer_c/**/*.o libstemmer_c/stemwords).each do | f |
    gem.files.exclude f
  end
end

Jeweler::GemcutterTasks.new

require 'rake/testtask'
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  test.pattern = 'test/**/test_*.rb'
  test.verbose = true
end

# rcov is optional: when it is missing, define a task that explains how to get it.
begin
  require 'rcov/rcovtask'
  Rcov::RcovTask.new do |test|
    test.libs << 'test'
    test.pattern = 'test/**/test_*.rb'
    test.verbose = true
  end
rescue LoadError
  task :rcov do
    abort "RCov is not available. In order to run rcov, you must: gem install rcov"
  end
end

task :test => :check_dependencies

task :default => :test

CLOBBER.include("libstemmer_c/**/*.o")

# Builds ext/lingua into lib/lingua (or lib/lingua/<FAT_DIR> when FAT_DIR is set).
Rake::ExtensionTask.new('ruby-stemmer', $jeweler.jeweler.gemspec) do |ext|
  ext.lib_dir = File.join(*['lib', 'lingua', ENV['FAT_DIR']].compact)
  ext.ext_dir = File.join 'ext', 'lingua'
  ext.cross_compile = true
  ext.cross_platform = ['i386-mswin32-60', 'i386-mingw32']
  ext.name = 'stemmer_native'
end

require 'rdoc/task'
Rake::RDocTask.new do |rdoc|
  # strip so the trailing newline of VERSION does not end up in the title
  version = File.exist?('VERSION') ? File.read('VERSION').strip : ""
  rdoc.rdoc_dir = 'rdoc'
  rdoc.options << '--charset' << 'utf-8'
  rdoc.title = "Ruby-Stemmer #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
  rdoc.rdoc_files.include('ext/lingua/stemmer.c')
  rdoc.rdoc_files.include('MIT-LICENSE')
end
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.9.3
|
@@ -0,0 +1,40 @@
|
|
1
|
+
# Builds the bundled libstemmer_c library and generates the Makefile for the
# lingua/stemmer_native extension.
ENV['RC_ARCHS'] = '' if RUBY_PLATFORM =~ /darwin/
require "mkmf"

ROOT = File.expand_path(File.join(File.dirname(__FILE__), '..', '..'))
LIBSTEMMER = File.join(ROOT, 'libstemmer_c')

# build libstemmer_c
# FreeBSD make is gmake
make = (RUBY_PLATFORM =~ /freebsd/) ? 'gmake' : 'make'

# MacOS architecture mess up
if RUBY_PLATFORM =~ /darwin/
  # see: #issue/3, #issue/5
  begin
    # NOTE(review): the architecture is only (re)computed when ARCHFLAGS is
    # already set; this guard is kept as-is but looks inverted -- confirm intent.
    unless ENV['ARCHFLAGS'].nil?
      # RbConfig replaces the long-deprecated Config constant (removed from
      # modern Ruby); mkmf already loads rbconfig.
      ruby_bin = File.expand_path(File.join(RbConfig::CONFIG['bindir'], RbConfig::CONFIG['RUBY_INSTALL_NAME']))
      # strip (not strip!) so unchanged output does not yield nil
      ENV['ARCHFLAGS'] = "-arch " + %x[file #{ruby_bin}].strip.match(/executable (.+)$/)[1]
    end
  rescue
    $stderr << "Failed to get your ruby executable architecture.\n"
    $stderr << "Please specify one using $ARCHFLAGS environment variable.\n"
    # non-zero status so the failure is visible to the installer
    exit 1
  end
  # see: #issue/9, #issue/6
  # see: man compat
  if ENV['COMMAND_MODE'] == 'legacy'
    $stdout << "Setting compat mode to unix2003.\n"
    ENV['COMMAND_MODE'] = 'unix2003'
  end
end

# make libstemmer_c. unless we're cross-compiling.
unless RUBY_PLATFORM =~ /i386-mingw32/
  # Run make directly inside LIBSTEMMER. The previous "cd; make; cd;" shell
  # line reported the exit status of the trailing `cd`, hiding make failures.
  built = Dir.chdir(LIBSTEMMER) { system(make, 'libstemmer.o') }
  exit 1 unless built
end

$CFLAGS += " -I#{File.expand_path(File.join(LIBSTEMMER, 'include'))} "
$libs += " -L#{LIBSTEMMER} #{File.expand_path(File.join(LIBSTEMMER, 'libstemmer.o'))} "

if have_header("libstemmer.h")
  create_makefile("lingua/stemmer_native")
end
|
@@ -0,0 +1,115 @@
|
|
1
|
+
#include "ruby.h"
|
2
|
+
#include <libstemmer.h>
|
3
|
+
|
4
|
+
|
5
|
+
#ifdef HAVE_RUBY_ENCODING_H
|
6
|
+
|
7
|
+
#include <ruby/encoding.h>
|
8
|
+
|
9
|
+
#define ENCODED_STR_NEW2(str, encoding) \
|
10
|
+
({ \
|
11
|
+
VALUE _string = rb_str_new2((const char *)str); \
|
12
|
+
int _enc = rb_enc_get_index(encoding); \
|
13
|
+
rb_enc_associate_index(_string, _enc); \
|
14
|
+
_string; \
|
15
|
+
})
|
16
|
+
|
17
|
+
#else
|
18
|
+
|
19
|
+
#define ENCODED_STR_NEW2(str, encoding) \
|
20
|
+
rb_str_new2((const char *)str)
|
21
|
+
|
22
|
+
#endif
|
23
|
+
|
24
|
+
|
25
|
+
VALUE rb_mLingua;
|
26
|
+
VALUE rb_cStemmer;
|
27
|
+
VALUE rb_eStemmerError;
|
28
|
+
|
29
|
+
/*
|
30
|
+
* Document-method: new
|
31
|
+
* call-seq: Lingua::Stemmer.new
|
32
|
+
*
|
33
|
+
* Creates a new Stemmer, pass <tt>:language</tt> and <tt>:encoding</tt> as arguments
|
34
|
+
* to change encoding or language, otherwise english with UTF_8 will be used
|
35
|
+
*
|
36
|
+
* require 'lingua/stemmer'
|
37
|
+
* s = Lingua::Stemmer.new :language => 'fr'
|
38
|
+
*/
|
39
|
+
static VALUE
|
40
|
+
rb_stemmer_init(VALUE self, VALUE rlang, VALUE renc) {
|
41
|
+
struct sb_stemmer * stemmer;
|
42
|
+
|
43
|
+
Data_Get_Struct(self, struct sb_stemmer, stemmer);
|
44
|
+
|
45
|
+
// In case someone sends() this method, free up the old one
|
46
|
+
if(stemmer) sb_stemmer_delete(stemmer);
|
47
|
+
|
48
|
+
stemmer = sb_stemmer_new( RSTRING_PTR(rlang), RSTRING_PTR(renc) );
|
49
|
+
if (!stemmer) {
|
50
|
+
if (!RTEST(renc)) {
|
51
|
+
rb_raise(rb_eStemmerError,
|
52
|
+
"Language %s not available for stemming", RSTRING_PTR(rlang));
|
53
|
+
} else {
|
54
|
+
rb_raise(rb_eStemmerError,
|
55
|
+
"Language %s not available for stemming in encoding %s",
|
56
|
+
RSTRING_PTR(rlang), RSTRING_PTR(renc));
|
57
|
+
}
|
58
|
+
}
|
59
|
+
|
60
|
+
DATA_PTR(self) = stemmer;
|
61
|
+
|
62
|
+
return self;
|
63
|
+
}
|
64
|
+
|
65
|
+
/*
|
66
|
+
* Document-method: stem
|
67
|
+
* call-seq: stem
|
68
|
+
*
|
69
|
+
* Stems a word
|
70
|
+
*
|
71
|
+
* require 'lingua/stemmer'
|
72
|
+
* s = Lingua::Stemmer.new
|
73
|
+
* s.stem "installation" # ==> install
|
74
|
+
*/
|
75
|
+
static VALUE
|
76
|
+
rb_stemmer_stem(VALUE self, VALUE word) {
|
77
|
+
struct sb_stemmer * stemmer;
|
78
|
+
|
79
|
+
Data_Get_Struct(self, struct sb_stemmer, stemmer);
|
80
|
+
if(!stemmer) rb_raise(rb_eRuntimeError, "Stemmer is not initialized");
|
81
|
+
|
82
|
+
VALUE s_word = rb_String(word);
|
83
|
+
const sb_symbol * stemmed = sb_stemmer_stem(stemmer,
|
84
|
+
(sb_symbol *)RSTRING_PTR(s_word),
|
85
|
+
RSTRING_LEN(s_word)
|
86
|
+
);
|
87
|
+
|
88
|
+
VALUE rb_enc = rb_iv_get(self, "@encoding");
|
89
|
+
return ENCODED_STR_NEW2((char *)stemmed, rb_enc);
|
90
|
+
}
|
91
|
+
|
92
|
+
static void
|
93
|
+
sb_stemmer_free(struct sb_stemmer * stemmer)
|
94
|
+
{
|
95
|
+
if(stemmer) sb_stemmer_delete(stemmer);
|
96
|
+
}
|
97
|
+
|
98
|
+
static VALUE
|
99
|
+
sb_stemmer_alloc(VALUE klass)
|
100
|
+
{
|
101
|
+
return Data_Wrap_Struct(klass, 0, sb_stemmer_free, 0);
|
102
|
+
}
|
103
|
+
|
104
|
+
/*
|
105
|
+
* Ruby-Stemmer, Ruby extension to SnowBall API using libstemmer_c
|
106
|
+
*/
|
107
|
+
void Init_stemmer_native() {
|
108
|
+
rb_mLingua = rb_define_module("Lingua");
|
109
|
+
rb_cStemmer = rb_define_class_under(rb_mLingua, "Stemmer", rb_cObject);
|
110
|
+
rb_define_alloc_func(rb_cStemmer, sb_stemmer_alloc);
|
111
|
+
rb_eStemmerError = rb_define_class_under(rb_mLingua, "StemmerError", rb_eException);
|
112
|
+
rb_define_private_method(rb_cStemmer, "native_init", rb_stemmer_init, 2);
|
113
|
+
rb_define_method(rb_cStemmer, "stem", rb_stemmer_stem, 1);
|
114
|
+
}
|
115
|
+
|
@@ -0,0 +1,60 @@
|
|
1
|
+
# Load the native extension. On Windows (mswin/mingw) the binary lives in a
# directory named after the major.minor Ruby version, e.g. lingua/1.9/.
case RUBY_PLATFORM
when /(mswin|mingw)/i
  require "lingua/#{RUBY_VERSION.sub(/\.\d+$/, '')}/stemmer_native"
else
  require 'lingua/stemmer_native'
end
|
6
|
+
|
7
|
+
module Lingua
  # Stems one word or a list of words with a freshly created Stemmer.
  #
  # +o+ is a single object or an Array; every element is converted with to_s.
  # +options+ is passed to Stemmer.new (<tt>:language</tt>, <tt>:encoding</tt>).
  #
  # With a block, each stemmed word is yielded and the Stemmer is returned.
  # Without a block, returns the stemmed word when exactly one word was given,
  # otherwise an Array of stemmed words.
  def self.stemmer(o, options={})
    stemmer = Stemmer.new(options)

    words = Array(o).map { |e| e.to_s }

    results = []
    words.each do |word|
      result = stemmer.stem(word)
      if block_given?
        yield result
      else
        results << result
      end
    end

    return stemmer if block_given?
    results.length == 1 ? results[0] : results
  end

  class Stemmer
    # Gem version read from the VERSION file. Uses strip (not strip!) because
    # strip! returns nil when the file has no surrounding whitespace.
    VERSION = File.read(File.expand_path(File.join(File.dirname(__FILE__), "..", "..", "VERSION"))).strip

    attr_reader :language
    attr_reader :encoding

    # Creates a new Stemmer, pass <tt>:language</tt> and <tt>:encoding</tt>
    # as arguments to change encoding or language, otherwise english with UTF_8
    # will be used
    #
    #    require 'lingua/stemmer'
    #    s = Lingua::Stemmer.new :language => 'fr'
    #
    # On Ruby >= 1.9 <tt>:encoding</tt> may also be an Encoding object.
    def initialize(options={})
      @language = (options[:language] || 'en').to_s
      # Do not call to_s yet: an Encoding object must reach the is_a? check
      # below untouched, otherwise Encoding.find receives "#<Encoding:...>".
      @encoding = options[:encoding] || 'UTF_8'

      if RUBY_VERSION >= "1.9"
        unless @encoding.is_a?(Encoding)
          # libstemmer names use underscores (UTF_8), Ruby names use dashes (UTF-8)
          @encoding = Encoding.find(@encoding.to_s.gsub("_", "-"))
        end
      else
        @encoding = @encoding.to_s.upcase.gsub("-", "_")
      end

      native_init(@language, native_encoding(@encoding))
    end

    private

    # Converts +enc+ to the underscore-separated name passed to native_init.
    def native_encoding(enc)
      RUBY_VERSION >= "1.9" ? enc.name.gsub('-', '_') : enc
    end
  end
end
|
@@ -0,0 +1,72 @@
|
|
1
|
+
README
|
2
|
+
src_c/stem_ISO_8859_1_danish.c
|
3
|
+
src_c/stem_ISO_8859_1_danish.h
|
4
|
+
src_c/stem_ISO_8859_1_dutch.c
|
5
|
+
src_c/stem_ISO_8859_1_dutch.h
|
6
|
+
src_c/stem_ISO_8859_1_english.c
|
7
|
+
src_c/stem_ISO_8859_1_english.h
|
8
|
+
src_c/stem_ISO_8859_1_finnish.c
|
9
|
+
src_c/stem_ISO_8859_1_finnish.h
|
10
|
+
src_c/stem_ISO_8859_1_french.c
|
11
|
+
src_c/stem_ISO_8859_1_french.h
|
12
|
+
src_c/stem_ISO_8859_1_german.c
|
13
|
+
src_c/stem_ISO_8859_1_german.h
|
14
|
+
src_c/stem_ISO_8859_1_hungarian.c
|
15
|
+
src_c/stem_ISO_8859_1_hungarian.h
|
16
|
+
src_c/stem_ISO_8859_1_italian.c
|
17
|
+
src_c/stem_ISO_8859_1_italian.h
|
18
|
+
src_c/stem_ISO_8859_1_norwegian.c
|
19
|
+
src_c/stem_ISO_8859_1_norwegian.h
|
20
|
+
src_c/stem_ISO_8859_1_porter.c
|
21
|
+
src_c/stem_ISO_8859_1_porter.h
|
22
|
+
src_c/stem_ISO_8859_1_portuguese.c
|
23
|
+
src_c/stem_ISO_8859_1_portuguese.h
|
24
|
+
src_c/stem_ISO_8859_1_spanish.c
|
25
|
+
src_c/stem_ISO_8859_1_spanish.h
|
26
|
+
src_c/stem_ISO_8859_1_swedish.c
|
27
|
+
src_c/stem_ISO_8859_1_swedish.h
|
28
|
+
src_c/stem_ISO_8859_2_romanian.c
|
29
|
+
src_c/stem_ISO_8859_2_romanian.h
|
30
|
+
src_c/stem_KOI8_R_russian.c
|
31
|
+
src_c/stem_KOI8_R_russian.h
|
32
|
+
src_c/stem_UTF_8_danish.c
|
33
|
+
src_c/stem_UTF_8_danish.h
|
34
|
+
src_c/stem_UTF_8_dutch.c
|
35
|
+
src_c/stem_UTF_8_dutch.h
|
36
|
+
src_c/stem_UTF_8_english.c
|
37
|
+
src_c/stem_UTF_8_english.h
|
38
|
+
src_c/stem_UTF_8_finnish.c
|
39
|
+
src_c/stem_UTF_8_finnish.h
|
40
|
+
src_c/stem_UTF_8_french.c
|
41
|
+
src_c/stem_UTF_8_french.h
|
42
|
+
src_c/stem_UTF_8_german.c
|
43
|
+
src_c/stem_UTF_8_german.h
|
44
|
+
src_c/stem_UTF_8_hungarian.c
|
45
|
+
src_c/stem_UTF_8_hungarian.h
|
46
|
+
src_c/stem_UTF_8_italian.c
|
47
|
+
src_c/stem_UTF_8_italian.h
|
48
|
+
src_c/stem_UTF_8_norwegian.c
|
49
|
+
src_c/stem_UTF_8_norwegian.h
|
50
|
+
src_c/stem_UTF_8_porter.c
|
51
|
+
src_c/stem_UTF_8_porter.h
|
52
|
+
src_c/stem_UTF_8_portuguese.c
|
53
|
+
src_c/stem_UTF_8_portuguese.h
|
54
|
+
src_c/stem_UTF_8_romanian.c
|
55
|
+
src_c/stem_UTF_8_romanian.h
|
56
|
+
src_c/stem_UTF_8_russian.c
|
57
|
+
src_c/stem_UTF_8_russian.h
|
58
|
+
src_c/stem_UTF_8_spanish.c
|
59
|
+
src_c/stem_UTF_8_spanish.h
|
60
|
+
src_c/stem_UTF_8_swedish.c
|
61
|
+
src_c/stem_UTF_8_swedish.h
|
62
|
+
src_c/stem_UTF_8_turkish.c
|
63
|
+
src_c/stem_UTF_8_turkish.h
|
64
|
+
runtime/api.c
|
65
|
+
runtime/api.h
|
66
|
+
runtime/header.h
|
67
|
+
runtime/utilities.c
|
68
|
+
libstemmer/libstemmer.c
|
69
|
+
libstemmer/libstemmer_utf8.c
|
70
|
+
libstemmer/modules.h
|
71
|
+
libstemmer/modules_utf8.h
|
72
|
+
include/libstemmer.h
|