aurelian-ruby-stemmer 0.5.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/MIT-LICENSE +21 -0
- data/Manifest +87 -0
- data/README.textile +66 -0
- data/Rakefile +39 -0
- data/extconf.rb +13 -0
- data/libstemmer_c/MANIFEST +72 -0
- data/libstemmer_c/Makefile +9 -0
- data/libstemmer_c/README +125 -0
- data/libstemmer_c/examples/stemwords.c +209 -0
- data/libstemmer_c/include/libstemmer.h +79 -0
- data/libstemmer_c/libstemmer/libstemmer.c +93 -0
- data/libstemmer_c/libstemmer/libstemmer_utf8.c +93 -0
- data/libstemmer_c/libstemmer/modules.h +190 -0
- data/libstemmer_c/libstemmer/modules.txt +50 -0
- data/libstemmer_c/libstemmer/modules_utf8.h +121 -0
- data/libstemmer_c/libstemmer/modules_utf8.txt +49 -0
- data/libstemmer_c/mkinc.mak +82 -0
- data/libstemmer_c/mkinc_utf8.mak +52 -0
- data/libstemmer_c/runtime/api.c +66 -0
- data/libstemmer_c/runtime/api.h +26 -0
- data/libstemmer_c/runtime/header.h +58 -0
- data/libstemmer_c/runtime/utilities.c +478 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_danish.c +337 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_danish.h +16 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_dutch.c +624 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_dutch.h +16 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_english.c +1117 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_english.h +16 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_finnish.c +762 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_finnish.h +16 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_french.c +1246 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_french.h +16 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_german.c +503 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_german.h +16 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.c +1230 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.h +16 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_italian.c +1065 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_italian.h +16 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.c +297 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.h +16 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_porter.c +749 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_porter.h +16 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.c +1017 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.h +16 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_spanish.c +1093 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_spanish.h +16 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_swedish.c +307 -0
- data/libstemmer_c/src_c/stem_ISO_8859_1_swedish.h +16 -0
- data/libstemmer_c/src_c/stem_ISO_8859_2_romanian.c +998 -0
- data/libstemmer_c/src_c/stem_ISO_8859_2_romanian.h +16 -0
- data/libstemmer_c/src_c/stem_KOI8_R_russian.c +700 -0
- data/libstemmer_c/src_c/stem_KOI8_R_russian.h +16 -0
- data/libstemmer_c/src_c/stem_UTF_8_danish.c +339 -0
- data/libstemmer_c/src_c/stem_UTF_8_danish.h +16 -0
- data/libstemmer_c/src_c/stem_UTF_8_dutch.c +634 -0
- data/libstemmer_c/src_c/stem_UTF_8_dutch.h +16 -0
- data/libstemmer_c/src_c/stem_UTF_8_english.c +1125 -0
- data/libstemmer_c/src_c/stem_UTF_8_english.h +16 -0
- data/libstemmer_c/src_c/stem_UTF_8_finnish.c +768 -0
- data/libstemmer_c/src_c/stem_UTF_8_finnish.h +16 -0
- data/libstemmer_c/src_c/stem_UTF_8_french.c +1256 -0
- data/libstemmer_c/src_c/stem_UTF_8_french.h +16 -0
- data/libstemmer_c/src_c/stem_UTF_8_german.c +509 -0
- data/libstemmer_c/src_c/stem_UTF_8_german.h +16 -0
- data/libstemmer_c/src_c/stem_UTF_8_hungarian.c +1234 -0
- data/libstemmer_c/src_c/stem_UTF_8_hungarian.h +16 -0
- data/libstemmer_c/src_c/stem_UTF_8_italian.c +1073 -0
- data/libstemmer_c/src_c/stem_UTF_8_italian.h +16 -0
- data/libstemmer_c/src_c/stem_UTF_8_norwegian.c +299 -0
- data/libstemmer_c/src_c/stem_UTF_8_norwegian.h +16 -0
- data/libstemmer_c/src_c/stem_UTF_8_porter.c +755 -0
- data/libstemmer_c/src_c/stem_UTF_8_porter.h +16 -0
- data/libstemmer_c/src_c/stem_UTF_8_portuguese.c +1023 -0
- data/libstemmer_c/src_c/stem_UTF_8_portuguese.h +16 -0
- data/libstemmer_c/src_c/stem_UTF_8_romanian.c +1004 -0
- data/libstemmer_c/src_c/stem_UTF_8_romanian.h +16 -0
- data/libstemmer_c/src_c/stem_UTF_8_russian.c +694 -0
- data/libstemmer_c/src_c/stem_UTF_8_russian.h +16 -0
- data/libstemmer_c/src_c/stem_UTF_8_spanish.c +1097 -0
- data/libstemmer_c/src_c/stem_UTF_8_spanish.h +16 -0
- data/libstemmer_c/src_c/stem_UTF_8_swedish.c +309 -0
- data/libstemmer_c/src_c/stem_UTF_8_swedish.h +16 -0
- data/libstemmer_c/src_c/stem_UTF_8_turkish.c +2205 -0
- data/libstemmer_c/src_c/stem_UTF_8_turkish.h +16 -0
- data/ruby-stemmer.c +142 -0
- data/ruby-stemmer.gemspec +31 -0
- data/test.rb +26 -0
- metadata +224 -0
data/MIT-LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
Copyright (c) 2008,2009 Aurelian Oancea
|
|
2
|
+
|
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
|
4
|
+
a copy of this software and associated documentation files (the
|
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
|
9
|
+
the following conditions:
|
|
10
|
+
|
|
11
|
+
The above copyright notice and this permission notice shall be
|
|
12
|
+
included in all copies or substantial portions of the Software.
|
|
13
|
+
|
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
|
21
|
+
|
data/Manifest
ADDED
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
extconf.rb
|
|
2
|
+
libstemmer_c/examples/stemwords.c
|
|
3
|
+
libstemmer_c/include/libstemmer.h
|
|
4
|
+
libstemmer_c/libstemmer/libstemmer.c
|
|
5
|
+
libstemmer_c/libstemmer/libstemmer_utf8.c
|
|
6
|
+
libstemmer_c/libstemmer/modules.h
|
|
7
|
+
libstemmer_c/libstemmer/modules.txt
|
|
8
|
+
libstemmer_c/libstemmer/modules_utf8.h
|
|
9
|
+
libstemmer_c/libstemmer/modules_utf8.txt
|
|
10
|
+
libstemmer_c/Makefile
|
|
11
|
+
libstemmer_c/MANIFEST
|
|
12
|
+
libstemmer_c/mkinc.mak
|
|
13
|
+
libstemmer_c/mkinc_utf8.mak
|
|
14
|
+
libstemmer_c/README
|
|
15
|
+
libstemmer_c/runtime/api.c
|
|
16
|
+
libstemmer_c/runtime/api.h
|
|
17
|
+
libstemmer_c/runtime/header.h
|
|
18
|
+
libstemmer_c/runtime/utilities.c
|
|
19
|
+
libstemmer_c/src_c/stem_ISO_8859_1_danish.c
|
|
20
|
+
libstemmer_c/src_c/stem_ISO_8859_1_danish.h
|
|
21
|
+
libstemmer_c/src_c/stem_ISO_8859_1_dutch.c
|
|
22
|
+
libstemmer_c/src_c/stem_ISO_8859_1_dutch.h
|
|
23
|
+
libstemmer_c/src_c/stem_ISO_8859_1_english.c
|
|
24
|
+
libstemmer_c/src_c/stem_ISO_8859_1_english.h
|
|
25
|
+
libstemmer_c/src_c/stem_ISO_8859_1_finnish.c
|
|
26
|
+
libstemmer_c/src_c/stem_ISO_8859_1_finnish.h
|
|
27
|
+
libstemmer_c/src_c/stem_ISO_8859_1_french.c
|
|
28
|
+
libstemmer_c/src_c/stem_ISO_8859_1_french.h
|
|
29
|
+
libstemmer_c/src_c/stem_ISO_8859_1_german.c
|
|
30
|
+
libstemmer_c/src_c/stem_ISO_8859_1_german.h
|
|
31
|
+
libstemmer_c/src_c/stem_ISO_8859_1_hungarian.c
|
|
32
|
+
libstemmer_c/src_c/stem_ISO_8859_1_hungarian.h
|
|
33
|
+
libstemmer_c/src_c/stem_ISO_8859_1_italian.c
|
|
34
|
+
libstemmer_c/src_c/stem_ISO_8859_1_italian.h
|
|
35
|
+
libstemmer_c/src_c/stem_ISO_8859_1_norwegian.c
|
|
36
|
+
libstemmer_c/src_c/stem_ISO_8859_1_norwegian.h
|
|
37
|
+
libstemmer_c/src_c/stem_ISO_8859_1_porter.c
|
|
38
|
+
libstemmer_c/src_c/stem_ISO_8859_1_porter.h
|
|
39
|
+
libstemmer_c/src_c/stem_ISO_8859_1_portuguese.c
|
|
40
|
+
libstemmer_c/src_c/stem_ISO_8859_1_portuguese.h
|
|
41
|
+
libstemmer_c/src_c/stem_ISO_8859_1_spanish.c
|
|
42
|
+
libstemmer_c/src_c/stem_ISO_8859_1_spanish.h
|
|
43
|
+
libstemmer_c/src_c/stem_ISO_8859_1_swedish.c
|
|
44
|
+
libstemmer_c/src_c/stem_ISO_8859_1_swedish.h
|
|
45
|
+
libstemmer_c/src_c/stem_ISO_8859_2_romanian.c
|
|
46
|
+
libstemmer_c/src_c/stem_ISO_8859_2_romanian.h
|
|
47
|
+
libstemmer_c/src_c/stem_KOI8_R_russian.c
|
|
48
|
+
libstemmer_c/src_c/stem_KOI8_R_russian.h
|
|
49
|
+
libstemmer_c/src_c/stem_UTF_8_danish.c
|
|
50
|
+
libstemmer_c/src_c/stem_UTF_8_danish.h
|
|
51
|
+
libstemmer_c/src_c/stem_UTF_8_dutch.c
|
|
52
|
+
libstemmer_c/src_c/stem_UTF_8_dutch.h
|
|
53
|
+
libstemmer_c/src_c/stem_UTF_8_english.c
|
|
54
|
+
libstemmer_c/src_c/stem_UTF_8_english.h
|
|
55
|
+
libstemmer_c/src_c/stem_UTF_8_finnish.c
|
|
56
|
+
libstemmer_c/src_c/stem_UTF_8_finnish.h
|
|
57
|
+
libstemmer_c/src_c/stem_UTF_8_french.c
|
|
58
|
+
libstemmer_c/src_c/stem_UTF_8_french.h
|
|
59
|
+
libstemmer_c/src_c/stem_UTF_8_german.c
|
|
60
|
+
libstemmer_c/src_c/stem_UTF_8_german.h
|
|
61
|
+
libstemmer_c/src_c/stem_UTF_8_hungarian.c
|
|
62
|
+
libstemmer_c/src_c/stem_UTF_8_hungarian.h
|
|
63
|
+
libstemmer_c/src_c/stem_UTF_8_italian.c
|
|
64
|
+
libstemmer_c/src_c/stem_UTF_8_italian.h
|
|
65
|
+
libstemmer_c/src_c/stem_UTF_8_norwegian.c
|
|
66
|
+
libstemmer_c/src_c/stem_UTF_8_norwegian.h
|
|
67
|
+
libstemmer_c/src_c/stem_UTF_8_porter.c
|
|
68
|
+
libstemmer_c/src_c/stem_UTF_8_porter.h
|
|
69
|
+
libstemmer_c/src_c/stem_UTF_8_portuguese.c
|
|
70
|
+
libstemmer_c/src_c/stem_UTF_8_portuguese.h
|
|
71
|
+
libstemmer_c/src_c/stem_UTF_8_romanian.c
|
|
72
|
+
libstemmer_c/src_c/stem_UTF_8_romanian.h
|
|
73
|
+
libstemmer_c/src_c/stem_UTF_8_russian.c
|
|
74
|
+
libstemmer_c/src_c/stem_UTF_8_russian.h
|
|
75
|
+
libstemmer_c/src_c/stem_UTF_8_spanish.c
|
|
76
|
+
libstemmer_c/src_c/stem_UTF_8_spanish.h
|
|
77
|
+
libstemmer_c/src_c/stem_UTF_8_swedish.c
|
|
78
|
+
libstemmer_c/src_c/stem_UTF_8_swedish.h
|
|
79
|
+
libstemmer_c/src_c/stem_UTF_8_turkish.c
|
|
80
|
+
libstemmer_c/src_c/stem_UTF_8_turkish.h
|
|
81
|
+
Manifest
|
|
82
|
+
MIT-LICENSE
|
|
83
|
+
Rakefile
|
|
84
|
+
README.textile
|
|
85
|
+
ruby-stemmer.c
|
|
86
|
+
ruby-stemmer.gemspec
|
|
87
|
+
test.rb
|
data/README.textile
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
h3. About
|
|
2
|
+
|
|
3
|
+
*ruby-stemmer* exposes the Snowball stemmer API to Ruby.
|
|
4
|
+
|
|
5
|
+
This package includes *libstemmer_c* library - released under BSD licence and available for download at: "http://snowball.tartarus.org/dist/libstemmer_c.tgz":http://snowball.tartarus.org/dist/libstemmer_c.tgz.
|
|
6
|
+
|
|
7
|
+
For details about *libstemmer_c* please check libstemmer_c/README file or "http://snowball.tartarus.org":http://snowball.tartarus.org.
|
|
8
|
+
|
|
9
|
+
author: Aurelian Oancea, oancea at gmail.com
|
|
10
|
+
|
|
11
|
+
licence: MIT, see MIT-LICENSE file for details
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
h4. Install
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
Please use sudo or run as root if you run into _Permission Denied_ issues.
|
|
18
|
+
|
|
19
|
+
h5. Stable version - with rubygems:
|
|
20
|
+
|
|
21
|
+
@$ gem install ruby-stemmer@
|
|
22
|
+
|
|
23
|
+
h5. Development version - from source
|
|
24
|
+
|
|
25
|
+
$ ruby extconf.rb # => compile libstemmer_c and generate a Makefile
|
|
26
|
+
$ make # => compile the library
|
|
27
|
+
$ ./test.rb # => test it :)
|
|
28
|
+
$ make install # => to install
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
h4. Usage
|
|
32
|
+
|
|
33
|
+
Please refer to @test.rb@.
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
h4. API
|
|
37
|
+
|
|
38
|
+
<pre>
|
|
39
|
+
module Lingua
|
|
40
|
+
class Stemmer
|
|
41
|
+
|
|
42
|
+
# creates a new Stemmer,
|
|
43
|
+
# defaults: language => en, encoding => UTF_8
|
|
44
|
+
# pass :language or :encoding to change them
|
|
45
|
+
def initialize
|
|
46
|
+
end
|
|
47
|
+
|
|
48
|
+
# stems the word
|
|
49
|
+
def stem(word)
|
|
50
|
+
end
|
|
51
|
+
|
|
52
|
+
# gets the length of the last stemmed word
|
|
53
|
+
# same as:
|
|
54
|
+
# word = Lingua::Stemmer.new.stem("installation") # ==> instal (string)
|
|
55
|
+
# word.length # ==> 6 (int)
|
|
56
|
+
def length
|
|
57
|
+
end
|
|
58
|
+
end
|
|
59
|
+
end
|
|
60
|
+
</pre>
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
h4. Todo
|
|
64
|
+
|
|
65
|
+
* Add (Array of Hashes) Lingua::Stemmer.list to list available languages/encodings
|
|
66
|
+
* Windows?
|
data/Rakefile
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
require 'rubygems'
|
|
2
|
+
require 'rake'
|
|
3
|
+
require 'echoe'
|
|
4
|
+
|
|
5
|
+
# Gem specification, declared through Echoe.
Echoe.new('ruby-stemmer', '0.5.4') do |gem|
  gem.description = "Stemmer implementation to ruby using libstemmer_c. Working with ruby 1.9.1"
  gem.url = "http://github.com/aurelian/ruby-stemmer"
  gem.author = "Aurelian Oancea, Yury Korolev"
  gem.email = "oancea@gmail.com, yury.korolev@gmail.com"
  gem.extensions = ["extconf.rb"]
  # Build products that must never end up in the packaged gem.
  gem.ignore_pattern = ["*.o", "**/*.o", "stemwords", "*.bundle", "*.a", "*.so"]
  gem.development_dependencies = []
  gem.runtime_dependencies = []
  gem.has_rdoc = true
end

# Runs every time this Rakefile is loaded: drop any generated Makefile.
rm_rf('Makefile')

# Files that make up the package: the extension sources, the test script,
# the capitalised top-level files and the bundled libstemmer_c tree.
PKG_FILES = FileList.new(
  'extconf.rb',
  'ruby-stemmer.c',
  'test.rb',
  '[A-Z]*',
  'libstemmer_c/**/*'
)

# Leave compiled artefacts out of the file list.
['*.o', '**/*.o', 'stemwords', '*.bundle', '*.a', '*.so'].each do |pattern|
  PKG_FILES.exclude(pattern)
end

desc "Cleans the workspace"
task :clean do
  # Output of both commands is captured and discarded, as with backticks.
  %x(rm -rf Makefile mkmf.log ruby-stemmer.o stemmer.bundle stemmer.so)
  %x(cd libstemmer_c && make clean && cd ../)
end
|
|
39
|
+
|
data/extconf.rb
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
require "mkmf"
|
|
2
|
+
|
|
3
|
+
# Directory holding this script; the bundled Snowball sources live in its
# libstemmer_c subdirectory.
base_dir       = File.dirname(__FILE__)
libstemmer_dir = File.join(base_dir, 'libstemmer_c')

# Build the bundled library before generating the extension Makefile.
# The directory is derived from this file's location rather than the
# caller's working directory, and a failed (or missing) make stops the
# configuration here instead of surfacing later as a link error.
built = Dir.chdir(libstemmer_dir) { system("make", "libstemmer.o") }
abort "could not build libstemmer_c (make libstemmer.o failed)" unless built

# Make the public header visible and link the freshly built archive.
$CFLAGS += " -I#{File.join(libstemmer_dir, 'include')} "
$libs += " -L#{libstemmer_dir} #{File.join(libstemmer_dir, 'libstemmer.o')} "

# dir_config("libstemmer")

# Only generate the Makefile when the header can actually be found.
if have_header("libstemmer.h") # && have_library('libstemmer')
  create_makefile("lingua/stemmer")
end
|
|
13
|
+
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
README
|
|
2
|
+
src_c/stem_ISO_8859_1_danish.c
|
|
3
|
+
src_c/stem_ISO_8859_1_danish.h
|
|
4
|
+
src_c/stem_ISO_8859_1_dutch.c
|
|
5
|
+
src_c/stem_ISO_8859_1_dutch.h
|
|
6
|
+
src_c/stem_ISO_8859_1_english.c
|
|
7
|
+
src_c/stem_ISO_8859_1_english.h
|
|
8
|
+
src_c/stem_ISO_8859_1_finnish.c
|
|
9
|
+
src_c/stem_ISO_8859_1_finnish.h
|
|
10
|
+
src_c/stem_ISO_8859_1_french.c
|
|
11
|
+
src_c/stem_ISO_8859_1_french.h
|
|
12
|
+
src_c/stem_ISO_8859_1_german.c
|
|
13
|
+
src_c/stem_ISO_8859_1_german.h
|
|
14
|
+
src_c/stem_ISO_8859_1_hungarian.c
|
|
15
|
+
src_c/stem_ISO_8859_1_hungarian.h
|
|
16
|
+
src_c/stem_ISO_8859_1_italian.c
|
|
17
|
+
src_c/stem_ISO_8859_1_italian.h
|
|
18
|
+
src_c/stem_ISO_8859_1_norwegian.c
|
|
19
|
+
src_c/stem_ISO_8859_1_norwegian.h
|
|
20
|
+
src_c/stem_ISO_8859_1_porter.c
|
|
21
|
+
src_c/stem_ISO_8859_1_porter.h
|
|
22
|
+
src_c/stem_ISO_8859_1_portuguese.c
|
|
23
|
+
src_c/stem_ISO_8859_1_portuguese.h
|
|
24
|
+
src_c/stem_ISO_8859_1_spanish.c
|
|
25
|
+
src_c/stem_ISO_8859_1_spanish.h
|
|
26
|
+
src_c/stem_ISO_8859_1_swedish.c
|
|
27
|
+
src_c/stem_ISO_8859_1_swedish.h
|
|
28
|
+
src_c/stem_ISO_8859_2_romanian.c
|
|
29
|
+
src_c/stem_ISO_8859_2_romanian.h
|
|
30
|
+
src_c/stem_KOI8_R_russian.c
|
|
31
|
+
src_c/stem_KOI8_R_russian.h
|
|
32
|
+
src_c/stem_UTF_8_danish.c
|
|
33
|
+
src_c/stem_UTF_8_danish.h
|
|
34
|
+
src_c/stem_UTF_8_dutch.c
|
|
35
|
+
src_c/stem_UTF_8_dutch.h
|
|
36
|
+
src_c/stem_UTF_8_english.c
|
|
37
|
+
src_c/stem_UTF_8_english.h
|
|
38
|
+
src_c/stem_UTF_8_finnish.c
|
|
39
|
+
src_c/stem_UTF_8_finnish.h
|
|
40
|
+
src_c/stem_UTF_8_french.c
|
|
41
|
+
src_c/stem_UTF_8_french.h
|
|
42
|
+
src_c/stem_UTF_8_german.c
|
|
43
|
+
src_c/stem_UTF_8_german.h
|
|
44
|
+
src_c/stem_UTF_8_hungarian.c
|
|
45
|
+
src_c/stem_UTF_8_hungarian.h
|
|
46
|
+
src_c/stem_UTF_8_italian.c
|
|
47
|
+
src_c/stem_UTF_8_italian.h
|
|
48
|
+
src_c/stem_UTF_8_norwegian.c
|
|
49
|
+
src_c/stem_UTF_8_norwegian.h
|
|
50
|
+
src_c/stem_UTF_8_porter.c
|
|
51
|
+
src_c/stem_UTF_8_porter.h
|
|
52
|
+
src_c/stem_UTF_8_portuguese.c
|
|
53
|
+
src_c/stem_UTF_8_portuguese.h
|
|
54
|
+
src_c/stem_UTF_8_romanian.c
|
|
55
|
+
src_c/stem_UTF_8_romanian.h
|
|
56
|
+
src_c/stem_UTF_8_russian.c
|
|
57
|
+
src_c/stem_UTF_8_russian.h
|
|
58
|
+
src_c/stem_UTF_8_spanish.c
|
|
59
|
+
src_c/stem_UTF_8_spanish.h
|
|
60
|
+
src_c/stem_UTF_8_swedish.c
|
|
61
|
+
src_c/stem_UTF_8_swedish.h
|
|
62
|
+
src_c/stem_UTF_8_turkish.c
|
|
63
|
+
src_c/stem_UTF_8_turkish.h
|
|
64
|
+
runtime/api.c
|
|
65
|
+
runtime/api.h
|
|
66
|
+
runtime/header.h
|
|
67
|
+
runtime/utilities.c
|
|
68
|
+
libstemmer/libstemmer.c
|
|
69
|
+
libstemmer/libstemmer_utf8.c
|
|
70
|
+
libstemmer/modules.h
|
|
71
|
+
libstemmer/modules_utf8.h
|
|
72
|
+
include/libstemmer.h
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
# Build rules for the bundled libstemmer_c sources.
# mkinc.mak supplies the snowball_sources list used below.
include mkinc.mak
CFLAGS=-Iinclude -fPIC

all: libstemmer.o stemwords

# `all` and `clean` name actions, not files; declaring them phony keeps a
# stray file with either name from suppressing the rule.
.PHONY: all clean

# Despite the .o suffix this target is an ar archive of every object
# compiled from snowball_sources.
libstemmer.o: $(snowball_sources:.c=.o)
	$(AR) -cru $@ $^

# Example command line program linked against the archive.
stemwords: examples/stemwords.o libstemmer.o
	$(CC) -o $@ $^

# examples/*.o is included so the object built for stemwords is removed too.
clean:
	rm -f stemwords *.o src_c/*.o runtime/*.o libstemmer/*.o examples/*.o
|
data/libstemmer_c/README
ADDED
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
libstemmer_c
|
|
2
|
+
============
|
|
3
|
+
|
|
4
|
+
This document pertains to the C version of the libstemmer distribution,
|
|
5
|
+
available for download from:
|
|
6
|
+
|
|
7
|
+
http://snowball.tartarus.org/dist/libstemmer_c.tgz
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
Compiling the library
|
|
11
|
+
=====================
|
|
12
|
+
|
|
13
|
+
A simple makefile is provided for Unix style systems. On such systems, it
|
|
14
|
+
should be possible simply to run "make", and the file "libstemmer.o"
|
|
15
|
+
and the example program "stemwords" will be generated.
|
|
16
|
+
|
|
17
|
+
If this doesn't work on your system, you need to write your own build
|
|
18
|
+
system (or call the compiler directly). The files to compile are
|
|
19
|
+
all contained in the "libstemmer", "runtime" and "src_c" directories,
|
|
20
|
+
and the public header file is contained in the "include" directory.
|
|
21
|
+
|
|
22
|
+
The library comes in two flavours; UTF-8 only, and UTF-8 plus other character
|
|
23
|
+
sets. To use the utf-8 only flavour, compile "libstemmer_utf8.c" instead of
|
|
24
|
+
"libstemmer.c".
|
|
25
|
+
|
|
26
|
+
For convenience "mkinc.mak" is a makefile fragment listing the source files and
|
|
27
|
+
header files used to compile the standard version of the library.
|
|
28
|
+
"mkinc_utf8.mak" is a comparable makefile fragment listing just the source
|
|
29
|
+
files for the UTF-8 only version of the library.
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
Using the library
|
|
33
|
+
=================
|
|
34
|
+
|
|
35
|
+
The library provides a simple C API. Essentially, a new stemmer can
|
|
36
|
+
be obtained by using "sb_stemmer_new". "sb_stemmer_stem" is then
|
|
37
|
+
used to stem a word, "sb_stemmer_length" returns the stemmed
|
|
38
|
+
length of the last word processed, and "sb_stemmer_delete" is
|
|
39
|
+
used to delete a stemmer.
|
|
40
|
+
|
|
41
|
+
Creating a stemmer is a relatively expensive operation - the expected
|
|
42
|
+
usage pattern is that a new stemmer is created when needed, used
|
|
43
|
+
to stem many words, and deleted after some time.
|
|
44
|
+
|
|
45
|
+
Stemmers are re-entrant, but not threadsafe. In other words, if
|
|
46
|
+
you wish to access the same stemmer object from multiple threads,
|
|
47
|
+
you must ensure that all access is protected by a mutex or similar
|
|
48
|
+
device.
|
|
49
|
+
|
|
50
|
+
libstemmer does not currently incorporate any mechanism for caching the results
|
|
51
|
+
of stemming operations. Such caching can greatly increase the performance of a
|
|
52
|
+
stemmer under certain situations, so suitable patches will be considered for
|
|
53
|
+
inclusion.
|
|
54
|
+
|
|
55
|
+
The standard libstemmer sources contain an algorithm for each of the supported
|
|
56
|
+
languages. The algorithm may be selected using the english name of the
|
|
57
|
+
language, or using the 2 or 3 letter ISO 639 language codes. In addition,
|
|
58
|
+
the traditional "Porter" stemming algorithm for english is included for
|
|
59
|
+
backwards compatibility purposes, but we recommend use of the "English"
|
|
60
|
+
stemmer in preference for new projects.
|
|
61
|
+
|
|
62
|
+
(Some minor algorithms which are included only as curiosities in the snowball
|
|
63
|
+
website, such as the Lovins stemmer and the Kraaij Pohlmann stemmer, are not
|
|
64
|
+
included in the standard libstemmer sources. These are not really supported by
|
|
65
|
+
the snowball project, but it would be possible to compile a modified libstemmer
|
|
66
|
+
library containing these if desired.)
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
The stemwords example
|
|
70
|
+
=====================
|
|
71
|
+
|
|
72
|
+
The stemwords example program allows you to run any of the stemmers
|
|
73
|
+
compiled into the libstemmer library on a sample vocabulary. For
|
|
74
|
+
details on how to use it, run it with the "-h" command line option.
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
Using the library in a larger system
|
|
78
|
+
====================================
|
|
79
|
+
|
|
80
|
+
If you are incorporating the library into the build system of a larger
|
|
81
|
+
program, I recommend copying the unpacked tarball without modification into
|
|
82
|
+
a subdirectory of the sources of your program. Future versions of the
|
|
83
|
+
library are intended to keep the same structure, so this will keep the
|
|
84
|
+
work required to move to a new version of the library to a minimum.
|
|
85
|
+
|
|
86
|
+
As an additional convenience, the list of source and header files used
|
|
87
|
+
in the library is detailed in mkinc.mak - a file which is in a suitable
|
|
88
|
+
format for inclusion by a Makefile. By including this file in your build
|
|
89
|
+
system, you can link the snowball system into your program with a few
|
|
90
|
+
extra rules.
|
|
91
|
+
|
|
92
|
+
Using the library in a system using GNU autotools
|
|
93
|
+
=================================================
|
|
94
|
+
|
|
95
|
+
The libstemmer_c library can be integrated into a larger system which uses the
|
|
96
|
+
GNU autotool framework (and in particular, automake and autoconf) as follows:
|
|
97
|
+
|
|
98
|
+
1) Unpack libstemmer_c.tgz in the top level project directory so that there is
|
|
99
|
+
a libstemmer_c subdirectory of the top level directory of the project.
|
|
100
|
+
|
|
101
|
+
2) Add a file "Makefile.am" to the unpacked libstemmer_c folder, containing:
|
|
102
|
+
|
|
103
|
+
noinst_LTLIBRARIES = libstemmer.la
|
|
104
|
+
include $(srcdir)/mkinc.mak
|
|
105
|
+
noinst_HEADERS = $(snowball_headers)
|
|
106
|
+
libstemmer_la_SOURCES = $(snowball_sources)
|
|
107
|
+
|
|
108
|
+
(You may also need to add other lines to this, for example, if you are using
|
|
109
|
+
compiler options which are not compatible with compiling the libstemmer
|
|
110
|
+
library.)
|
|
111
|
+
|
|
112
|
+
3) Add libstemmer_c to the AC_CONFIG_FILES declaration in the project's
|
|
113
|
+
configure.ac file.
|
|
114
|
+
|
|
115
|
+
4) Add to the top level makefile the following lines (or modify existing
|
|
116
|
+
assignments to these variables appropriately):
|
|
117
|
+
|
|
118
|
+
AUTOMAKE_OPTIONS = subdir-objects
|
|
119
|
+
AM_CPPFLAGS = -I$(top_srcdir)/libstemmer_c/include
|
|
120
|
+
SUBDIRS=libstemmer_c
|
|
121
|
+
<name>_LIBADD = libstemmer_c/libstemmer.la
|
|
122
|
+
|
|
123
|
+
(Where <name> is the name of the library or executable which links against
|
|
124
|
+
libstemmer.)
|
|
125
|
+
|
|
@@ -0,0 +1,209 @@
|
|
|
1
|
+
/* This is a simple program which uses libstemmer to provide a command
|
|
2
|
+
* line interface for stemming using any of the algorithms provided.
|
|
3
|
+
*/
|
|
4
|
+
|
|
5
|
+
#include <stdio.h>
|
|
6
|
+
#include <stdlib.h> /* for malloc, free */
|
|
7
|
+
#include <string.h> /* for memmove */
|
|
8
|
+
#include <ctype.h> /* for isupper, tolower */
|
|
9
|
+
|
|
10
|
+
#include "libstemmer.h"
|
|
11
|
+
|
|
12
|
+
const char * progname; /* Program name printed in the usage text; main sets it from argv[0]. */
|
|
13
|
+
static int pretty = 1; /* Output layout read by stem_file: main resets it to 0, "-p" selects 1, "-p2" selects 2. */
|
|
14
|
+
|
|
15
|
+
static void
|
|
16
|
+
stem_file(struct sb_stemmer * stemmer, FILE * f_in, FILE * f_out)
|
|
17
|
+
{
|
|
18
|
+
#define INC 10
|
|
19
|
+
int lim = INC;
|
|
20
|
+
sb_symbol * b = (sb_symbol *) malloc(lim * sizeof(sb_symbol));
|
|
21
|
+
|
|
22
|
+
while(1) {
|
|
23
|
+
int ch = getc(f_in);
|
|
24
|
+
if (ch == EOF) {
|
|
25
|
+
free(b); return;
|
|
26
|
+
}
|
|
27
|
+
{
|
|
28
|
+
int i = 0;
|
|
29
|
+
int inlen = 0;
|
|
30
|
+
while(1) {
|
|
31
|
+
if (ch == '\n' || ch == EOF) break;
|
|
32
|
+
if (i == lim) {
|
|
33
|
+
sb_symbol * newb;
|
|
34
|
+
newb = (sb_symbol *)
|
|
35
|
+
realloc(b, (lim + INC) * sizeof(sb_symbol));
|
|
36
|
+
if (newb == 0) goto error;
|
|
37
|
+
b = newb;
|
|
38
|
+
lim = lim + INC;
|
|
39
|
+
}
|
|
40
|
+
/* Update count of utf-8 characters. */
|
|
41
|
+
if (ch < 0x80 || ch > 0xBF) inlen += 1;
|
|
42
|
+
/* force lower case: */
|
|
43
|
+
if (isupper(ch)) ch = tolower(ch);
|
|
44
|
+
|
|
45
|
+
b[i] = ch;
|
|
46
|
+
i++;
|
|
47
|
+
ch = getc(f_in);
|
|
48
|
+
}
|
|
49
|
+
|
|
50
|
+
{
|
|
51
|
+
const sb_symbol * stemmed = sb_stemmer_stem(stemmer, b, i);
|
|
52
|
+
if (stemmed == NULL)
|
|
53
|
+
{
|
|
54
|
+
fprintf(stderr, "Out of memory");
|
|
55
|
+
exit(1);
|
|
56
|
+
}
|
|
57
|
+
else
|
|
58
|
+
{
|
|
59
|
+
if (pretty == 1) {
|
|
60
|
+
fwrite(b, i, 1, f_out);
|
|
61
|
+
fputs(" -> ", f_out);
|
|
62
|
+
} else if (pretty == 2) {
|
|
63
|
+
fwrite(b, i, 1, f_out);
|
|
64
|
+
if (sb_stemmer_length(stemmer) > 0) {
|
|
65
|
+
int j;
|
|
66
|
+
if (inlen < 30) {
|
|
67
|
+
for (j = 30 - inlen; j > 0; j--)
|
|
68
|
+
fputs(" ", f_out);
|
|
69
|
+
} else {
|
|
70
|
+
fputs("\n", f_out);
|
|
71
|
+
for (j = 30; j > 0; j--)
|
|
72
|
+
fputs(" ", f_out);
|
|
73
|
+
}
|
|
74
|
+
}
|
|
75
|
+
}
|
|
76
|
+
|
|
77
|
+
fputs((char *)stemmed, f_out);
|
|
78
|
+
putc('\n', f_out);
|
|
79
|
+
}
|
|
80
|
+
}
|
|
81
|
+
}
|
|
82
|
+
}
|
|
83
|
+
error:
|
|
84
|
+
if (b != 0) free(b);
|
|
85
|
+
return;
|
|
86
|
+
}
|
|
87
|
+
|
|
88
|
+
/** Display the command line syntax, and then exit.
|
|
89
|
+
* @param n The value to exit with.
|
|
90
|
+
*/
|
|
91
|
+
static void
|
|
92
|
+
usage(int n)
|
|
93
|
+
{
|
|
94
|
+
printf("usage: %s [-l <language>] [-i <input file>] [-o <output file>] [-c <character encoding>] [-p[2]] [-h]\n"
|
|
95
|
+
"\n"
|
|
96
|
+
"The input file consists of a list of words to be stemmed, one per\n"
|
|
97
|
+
"line. Words should be in lower case, but (for English) A-Z letters\n"
|
|
98
|
+
"are mapped to their a-z equivalents anyway. If omitted, stdin is\n"
|
|
99
|
+
"used.\n"
|
|
100
|
+
"\n"
|
|
101
|
+
"If -c is given, the argument is the character encoding of the input\n"
|
|
102
|
+
"and output files. If it is omitted, the UTF-8 encoding is used.\n"
|
|
103
|
+
"\n"
|
|
104
|
+
"If -p is given the output file consists of each word of the input\n"
|
|
105
|
+
"file followed by \"->\" followed by its stemmed equivalent.\n"
|
|
106
|
+
"If -p2 is given the output file is a two column layout containing\n"
|
|
107
|
+
"the input words in the first column and the stemmed eqivalents in\n"
|
|
108
|
+
"the second column.\n"
|
|
109
|
+
"Otherwise, the output file consists of the stemmed words, one per\n"
|
|
110
|
+
"line.\n"
|
|
111
|
+
"\n"
|
|
112
|
+
"-h displays this help\n",
|
|
113
|
+
progname);
|
|
114
|
+
exit(n);
|
|
115
|
+
}
|
|
116
|
+
|
|
117
|
+
int
|
|
118
|
+
main(int argc, char * argv[])
|
|
119
|
+
{
|
|
120
|
+
char * in = 0;
|
|
121
|
+
char * out = 0;
|
|
122
|
+
FILE * f_in;
|
|
123
|
+
FILE * f_out;
|
|
124
|
+
struct sb_stemmer * stemmer;
|
|
125
|
+
|
|
126
|
+
char * language = "english";
|
|
127
|
+
char * charenc = NULL;
|
|
128
|
+
|
|
129
|
+
char * s;
|
|
130
|
+
int i = 1;
|
|
131
|
+
pretty = 0;
|
|
132
|
+
|
|
133
|
+
progname = argv[0];
|
|
134
|
+
|
|
135
|
+
while(i < argc) {
|
|
136
|
+
s = argv[i++];
|
|
137
|
+
if (s[0] == '-') {
|
|
138
|
+
if (strcmp(s, "-o") == 0) {
|
|
139
|
+
if (i >= argc) {
|
|
140
|
+
fprintf(stderr, "%s requires an argument\n", s);
|
|
141
|
+
exit(1);
|
|
142
|
+
}
|
|
143
|
+
out = argv[i++];
|
|
144
|
+
} else if (strcmp(s, "-i") == 0) {
|
|
145
|
+
if (i >= argc) {
|
|
146
|
+
fprintf(stderr, "%s requires an argument\n", s);
|
|
147
|
+
exit(1);
|
|
148
|
+
}
|
|
149
|
+
in = argv[i++];
|
|
150
|
+
} else if (strcmp(s, "-l") == 0) {
|
|
151
|
+
if (i >= argc) {
|
|
152
|
+
fprintf(stderr, "%s requires an argument\n", s);
|
|
153
|
+
exit(1);
|
|
154
|
+
}
|
|
155
|
+
language = argv[i++];
|
|
156
|
+
} else if (strcmp(s, "-c") == 0) {
|
|
157
|
+
if (i >= argc) {
|
|
158
|
+
fprintf(stderr, "%s requires an argument\n", s);
|
|
159
|
+
exit(1);
|
|
160
|
+
}
|
|
161
|
+
charenc = argv[i++];
|
|
162
|
+
} else if (strcmp(s, "-p2") == 0) {
|
|
163
|
+
pretty = 2;
|
|
164
|
+
} else if (strcmp(s, "-p") == 0) {
|
|
165
|
+
pretty = 1;
|
|
166
|
+
} else if (strcmp(s, "-h") == 0) {
|
|
167
|
+
usage(0);
|
|
168
|
+
} else {
|
|
169
|
+
fprintf(stderr, "option %s unknown\n", s);
|
|
170
|
+
usage(1);
|
|
171
|
+
}
|
|
172
|
+
} else {
|
|
173
|
+
fprintf(stderr, "unexpected parameter %s\n", s);
|
|
174
|
+
usage(1);
|
|
175
|
+
}
|
|
176
|
+
}
|
|
177
|
+
|
|
178
|
+
/* prepare the files */
|
|
179
|
+
f_in = (in == 0) ? stdin : fopen(in, "r");
|
|
180
|
+
if (f_in == 0) {
|
|
181
|
+
fprintf(stderr, "file %s not found\n", in);
|
|
182
|
+
exit(1);
|
|
183
|
+
}
|
|
184
|
+
f_out = (out == 0) ? stdout : fopen(out, "w");
|
|
185
|
+
if (f_out == 0) {
|
|
186
|
+
fprintf(stderr, "file %s cannot be opened\n", out);
|
|
187
|
+
exit(1);
|
|
188
|
+
}
|
|
189
|
+
|
|
190
|
+
/* do the stemming process: */
|
|
191
|
+
stemmer = sb_stemmer_new(language, charenc);
|
|
192
|
+
if (stemmer == 0) {
|
|
193
|
+
if (charenc == NULL) {
|
|
194
|
+
fprintf(stderr, "language `%s' not available for stemming\n", language);
|
|
195
|
+
exit(1);
|
|
196
|
+
} else {
|
|
197
|
+
fprintf(stderr, "language `%s' not available for stemming in encoding `%s'\n", language, charenc);
|
|
198
|
+
exit(1);
|
|
199
|
+
}
|
|
200
|
+
}
|
|
201
|
+
stem_file(stemmer, f_in, f_out);
|
|
202
|
+
sb_stemmer_delete(stemmer);
|
|
203
|
+
|
|
204
|
+
if (in != 0) (void) fclose(f_in);
|
|
205
|
+
if (out != 0) (void) fclose(f_out);
|
|
206
|
+
|
|
207
|
+
return 0;
|
|
208
|
+
}
|
|
209
|
+
|