ruby-stemmer 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85) hide show
  1. data/MIT-LICENSE +21 -0
  2. data/README +79 -0
  3. data/Rakefile +52 -0
  4. data/extconf.rb +14 -0
  5. data/libstemmer_c/MANIFEST +72 -0
  6. data/libstemmer_c/Makefile +9 -0
  7. data/libstemmer_c/README +125 -0
  8. data/libstemmer_c/include/libstemmer.h +79 -0
  9. data/libstemmer_c/libstemmer/libstemmer.c +93 -0
  10. data/libstemmer_c/libstemmer/libstemmer_utf8.c +93 -0
  11. data/libstemmer_c/libstemmer/modules.h +190 -0
  12. data/libstemmer_c/libstemmer/modules.txt +50 -0
  13. data/libstemmer_c/libstemmer/modules_utf8.h +121 -0
  14. data/libstemmer_c/libstemmer/modules_utf8.txt +49 -0
  15. data/libstemmer_c/mkinc.mak +82 -0
  16. data/libstemmer_c/mkinc_utf8.mak +52 -0
  17. data/libstemmer_c/runtime/api.c +66 -0
  18. data/libstemmer_c/runtime/api.h +26 -0
  19. data/libstemmer_c/runtime/header.h +58 -0
  20. data/libstemmer_c/runtime/utilities.c +478 -0
  21. data/libstemmer_c/src_c/stem_ISO_8859_1_danish.c +337 -0
  22. data/libstemmer_c/src_c/stem_ISO_8859_1_danish.h +16 -0
  23. data/libstemmer_c/src_c/stem_ISO_8859_1_dutch.c +624 -0
  24. data/libstemmer_c/src_c/stem_ISO_8859_1_dutch.h +16 -0
  25. data/libstemmer_c/src_c/stem_ISO_8859_1_english.c +1117 -0
  26. data/libstemmer_c/src_c/stem_ISO_8859_1_english.h +16 -0
  27. data/libstemmer_c/src_c/stem_ISO_8859_1_finnish.c +762 -0
  28. data/libstemmer_c/src_c/stem_ISO_8859_1_finnish.h +16 -0
  29. data/libstemmer_c/src_c/stem_ISO_8859_1_french.c +1246 -0
  30. data/libstemmer_c/src_c/stem_ISO_8859_1_french.h +16 -0
  31. data/libstemmer_c/src_c/stem_ISO_8859_1_german.c +503 -0
  32. data/libstemmer_c/src_c/stem_ISO_8859_1_german.h +16 -0
  33. data/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.c +1230 -0
  34. data/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.h +16 -0
  35. data/libstemmer_c/src_c/stem_ISO_8859_1_italian.c +1065 -0
  36. data/libstemmer_c/src_c/stem_ISO_8859_1_italian.h +16 -0
  37. data/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.c +297 -0
  38. data/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.h +16 -0
  39. data/libstemmer_c/src_c/stem_ISO_8859_1_porter.c +749 -0
  40. data/libstemmer_c/src_c/stem_ISO_8859_1_porter.h +16 -0
  41. data/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.c +1017 -0
  42. data/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.h +16 -0
  43. data/libstemmer_c/src_c/stem_ISO_8859_1_spanish.c +1093 -0
  44. data/libstemmer_c/src_c/stem_ISO_8859_1_spanish.h +16 -0
  45. data/libstemmer_c/src_c/stem_ISO_8859_1_swedish.c +307 -0
  46. data/libstemmer_c/src_c/stem_ISO_8859_1_swedish.h +16 -0
  47. data/libstemmer_c/src_c/stem_ISO_8859_2_romanian.c +998 -0
  48. data/libstemmer_c/src_c/stem_ISO_8859_2_romanian.h +16 -0
  49. data/libstemmer_c/src_c/stem_KOI8_R_russian.c +700 -0
  50. data/libstemmer_c/src_c/stem_KOI8_R_russian.h +16 -0
  51. data/libstemmer_c/src_c/stem_UTF_8_danish.c +339 -0
  52. data/libstemmer_c/src_c/stem_UTF_8_danish.h +16 -0
  53. data/libstemmer_c/src_c/stem_UTF_8_dutch.c +634 -0
  54. data/libstemmer_c/src_c/stem_UTF_8_dutch.h +16 -0
  55. data/libstemmer_c/src_c/stem_UTF_8_english.c +1125 -0
  56. data/libstemmer_c/src_c/stem_UTF_8_english.h +16 -0
  57. data/libstemmer_c/src_c/stem_UTF_8_finnish.c +768 -0
  58. data/libstemmer_c/src_c/stem_UTF_8_finnish.h +16 -0
  59. data/libstemmer_c/src_c/stem_UTF_8_french.c +1256 -0
  60. data/libstemmer_c/src_c/stem_UTF_8_french.h +16 -0
  61. data/libstemmer_c/src_c/stem_UTF_8_german.c +509 -0
  62. data/libstemmer_c/src_c/stem_UTF_8_german.h +16 -0
  63. data/libstemmer_c/src_c/stem_UTF_8_hungarian.c +1234 -0
  64. data/libstemmer_c/src_c/stem_UTF_8_hungarian.h +16 -0
  65. data/libstemmer_c/src_c/stem_UTF_8_italian.c +1073 -0
  66. data/libstemmer_c/src_c/stem_UTF_8_italian.h +16 -0
  67. data/libstemmer_c/src_c/stem_UTF_8_norwegian.c +299 -0
  68. data/libstemmer_c/src_c/stem_UTF_8_norwegian.h +16 -0
  69. data/libstemmer_c/src_c/stem_UTF_8_porter.c +755 -0
  70. data/libstemmer_c/src_c/stem_UTF_8_porter.h +16 -0
  71. data/libstemmer_c/src_c/stem_UTF_8_portuguese.c +1023 -0
  72. data/libstemmer_c/src_c/stem_UTF_8_portuguese.h +16 -0
  73. data/libstemmer_c/src_c/stem_UTF_8_romanian.c +1004 -0
  74. data/libstemmer_c/src_c/stem_UTF_8_romanian.h +16 -0
  75. data/libstemmer_c/src_c/stem_UTF_8_russian.c +694 -0
  76. data/libstemmer_c/src_c/stem_UTF_8_russian.h +16 -0
  77. data/libstemmer_c/src_c/stem_UTF_8_spanish.c +1097 -0
  78. data/libstemmer_c/src_c/stem_UTF_8_spanish.h +16 -0
  79. data/libstemmer_c/src_c/stem_UTF_8_swedish.c +309 -0
  80. data/libstemmer_c/src_c/stem_UTF_8_swedish.h +16 -0
  81. data/libstemmer_c/src_c/stem_UTF_8_turkish.c +2205 -0
  82. data/libstemmer_c/src_c/stem_UTF_8_turkish.h +16 -0
  83. data/ruby-stemmer.c +108 -0
  84. data/test.rb +31 -0
  85. metadata +141 -0
data/MIT-LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ Copyright (c) 2008 Aurelian Oancea
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21
+
data/README ADDED
@@ -0,0 +1,79 @@
1
+ $Id: README 17 2008-01-07 16:59:10Z aurelian $
2
+
3
+ == About
4
+
5
+ ruby-stemmer is a Ruby extension that wraps a stemmer implementation written in C.
6
+
7
+ This package includes libstemmer_c library from http://snowball.tartarus.org/dist/libstemmer_c.tgz
8
+
9
+ For details about libstemmer_c read libstemmer_c/README or http://snowball.tartarus.org.
10
+
11
+ author: Aurelian Oancea, aurelian at locknet dot ro
12
+
13
+ licence: MIT, see MIT-LICENSE for details
14
+
15
+ == Install
16
+
17
+ I) Using RubyGems
18
+
19
+ $ gem install ruby-stemmer
20
+
21
+ II) From tarball
22
+
23
+ Use sudo or run as root if you get "Permission denied" errors
24
+
25
+ Compile libstemmer_c and generate the Makefile:
26
+
27
+ $ ruby extconf.rb
28
+
29
+ Compile the library:
30
+
31
+ $ make
32
+
33
+ Test:
34
+
35
+ $ ./test.rb
36
+
37
+ Install it:
38
+
39
+ $ make install
40
+
41
+ Run the last command as root or with sudo if you get "permission denied" errors
42
+
43
+ == Usage
44
+
45
+ see test.rb
46
+
47
+ == API
48
+
49
+ module Lingua
50
+
51
+ class Stemmer
52
+
53
+ # creates a new Stemmer,
54
+ # defaults: language => en, encoding => UTF_8
55
+ # pass :language or :encoding to change them
56
+ def initialize
57
+ end
58
+
59
+ # stems the word
60
+ def stem(word)
61
+ end
62
+
63
+ # gets the length of the last stemmed word
64
+ # same as:
65
+ # word = Lingua::Stemmer.new.stem("installation") # ==> instal (string)
66
+ # word.length # ==> 6 (int)
67
+ def length
68
+ end
69
+
70
+ end
71
+
72
+ end
73
+
74
+ == Todo
75
+
76
+ # RDoc
77
+ # Add (Array of Hashes) Lingua::Stemmer.list to list available languages/encodings
78
+ # Windows?
79
+
data/Rakefile ADDED
@@ -0,0 +1,52 @@
1
+ #
2
+ # $Id: Rakefile 17 2008-01-07 16:59:10Z aurelian $
3
+ #
4
+
5
# Packaging script for the ruby-stemmer gem: describes the gem and registers
# the gem/tar/zip package tasks.
require 'rubygems'
require 'rake'
require 'rake/gempackagetask'

# Executed every time this Rakefile is loaded: removes a Makefile left in the
# current directory.
rm_rf 'Makefile'

# Candidate files for the package: the extension sources, the test script,
# every top-level entry starting with a capital letter, and the bundled
# libstemmer_c tree.
PKG_FILES = FileList['extconf.rb', 'ruby-stemmer.c', 'test.rb', '[A-Z]*', 'libstemmer_c/**/*']

# Patterns filtered out of the package list (objects, archives, shared
# libraries and the stemwords binary).
['*.o', '**/*.o', 'stemwords', '*.bundle', '*.a', '*.so'].each do |pattern|
  PKG_FILES.exclude(pattern)
end

gem_spec = Gem::Specification.new do |gem|
  gem.name    = 'ruby-stemmer'
  gem.version = '0.0.1'
  gem.summary = "Stemmer implementation to ruby using libstemmer_c."
  # NOTE(review): heredoc body kept exactly as it appears in the reviewed
  # text; confirm its leading whitespace against the packaged file.
  gem.description = <<-EOF
Stemmer implementation to ruby using libstemmer_c.
  EOF

  gem.files = PKG_FILES.to_a
  gem.extensions << "extconf.rb"
  gem.has_rdoc = false

  gem.author            = "Aurelian Oancea"
  gem.email             = "aurelian@locknet.ro"
  gem.homepage          = "http://nrr.rubyforge.org"
  gem.rubyforge_project = "nrr"
end

# Besides the .gem file, also request tar and zip archives.
package_task = Rake::GemPackageTask.new(gem_spec) do |pkg|
  pkg.need_tar = true
  pkg.need_zip = true
end

# The default task only prints a confirmation.
task :default do
  puts "Ok"
end
52
+
data/extconf.rb ADDED
@@ -0,0 +1,14 @@
1
+ #
2
+ # $Id: extconf.rb 15 2008-01-05 11:56:57Z aurelian $
3
+ #
4
+
5
require "mkmf"

# Absolute location of the bundled libstemmer_c tree, so the build does not
# depend on the directory this script is started from.
libstemmer_dir = File.join(File.expand_path(File.dirname(__FILE__)), "libstemmer_c")
libstemmer_lib = File.join(libstemmer_dir, "libstemmer.o")

# Build only the library target. The Makefile's default target also links the
# "stemwords" example, which needs examples/stemwords.o; no examples/ source is
# listed among the packaged files, so the default target cannot complete.
Dir.chdir(libstemmer_dir) { system("make", "libstemmer.o") }
abort "ruby-stemmer: failed to build #{libstemmer_lib}" unless File.exist?(libstemmer_lib)

$CFLAGS += " -I#{File.join(libstemmer_dir, 'include')} "
# Link the archive by its path. "-llibstemmer.o" is only resolved by linkers
# that special-case "-l" names ending in ".o".
$libs += " -L#{libstemmer_dir} #{libstemmer_lib} "

have_header("libstemmer.h")
create_makefile("stemmer")
14
+
@@ -0,0 +1,72 @@
1
+ README
2
+ src_c/stem_ISO_8859_1_danish.c
3
+ src_c/stem_ISO_8859_1_danish.h
4
+ src_c/stem_ISO_8859_1_dutch.c
5
+ src_c/stem_ISO_8859_1_dutch.h
6
+ src_c/stem_ISO_8859_1_english.c
7
+ src_c/stem_ISO_8859_1_english.h
8
+ src_c/stem_ISO_8859_1_finnish.c
9
+ src_c/stem_ISO_8859_1_finnish.h
10
+ src_c/stem_ISO_8859_1_french.c
11
+ src_c/stem_ISO_8859_1_french.h
12
+ src_c/stem_ISO_8859_1_german.c
13
+ src_c/stem_ISO_8859_1_german.h
14
+ src_c/stem_ISO_8859_1_hungarian.c
15
+ src_c/stem_ISO_8859_1_hungarian.h
16
+ src_c/stem_ISO_8859_1_italian.c
17
+ src_c/stem_ISO_8859_1_italian.h
18
+ src_c/stem_ISO_8859_1_norwegian.c
19
+ src_c/stem_ISO_8859_1_norwegian.h
20
+ src_c/stem_ISO_8859_1_porter.c
21
+ src_c/stem_ISO_8859_1_porter.h
22
+ src_c/stem_ISO_8859_1_portuguese.c
23
+ src_c/stem_ISO_8859_1_portuguese.h
24
+ src_c/stem_ISO_8859_1_spanish.c
25
+ src_c/stem_ISO_8859_1_spanish.h
26
+ src_c/stem_ISO_8859_1_swedish.c
27
+ src_c/stem_ISO_8859_1_swedish.h
28
+ src_c/stem_ISO_8859_2_romanian.c
29
+ src_c/stem_ISO_8859_2_romanian.h
30
+ src_c/stem_KOI8_R_russian.c
31
+ src_c/stem_KOI8_R_russian.h
32
+ src_c/stem_UTF_8_danish.c
33
+ src_c/stem_UTF_8_danish.h
34
+ src_c/stem_UTF_8_dutch.c
35
+ src_c/stem_UTF_8_dutch.h
36
+ src_c/stem_UTF_8_english.c
37
+ src_c/stem_UTF_8_english.h
38
+ src_c/stem_UTF_8_finnish.c
39
+ src_c/stem_UTF_8_finnish.h
40
+ src_c/stem_UTF_8_french.c
41
+ src_c/stem_UTF_8_french.h
42
+ src_c/stem_UTF_8_german.c
43
+ src_c/stem_UTF_8_german.h
44
+ src_c/stem_UTF_8_hungarian.c
45
+ src_c/stem_UTF_8_hungarian.h
46
+ src_c/stem_UTF_8_italian.c
47
+ src_c/stem_UTF_8_italian.h
48
+ src_c/stem_UTF_8_norwegian.c
49
+ src_c/stem_UTF_8_norwegian.h
50
+ src_c/stem_UTF_8_porter.c
51
+ src_c/stem_UTF_8_porter.h
52
+ src_c/stem_UTF_8_portuguese.c
53
+ src_c/stem_UTF_8_portuguese.h
54
+ src_c/stem_UTF_8_romanian.c
55
+ src_c/stem_UTF_8_romanian.h
56
+ src_c/stem_UTF_8_russian.c
57
+ src_c/stem_UTF_8_russian.h
58
+ src_c/stem_UTF_8_spanish.c
59
+ src_c/stem_UTF_8_spanish.h
60
+ src_c/stem_UTF_8_swedish.c
61
+ src_c/stem_UTF_8_swedish.h
62
+ src_c/stem_UTF_8_turkish.c
63
+ src_c/stem_UTF_8_turkish.h
64
+ runtime/api.c
65
+ runtime/api.h
66
+ runtime/header.h
67
+ runtime/utilities.c
68
+ libstemmer/libstemmer.c
69
+ libstemmer/libstemmer_utf8.c
70
+ libstemmer/modules.h
71
+ libstemmer/modules_utf8.h
72
+ include/libstemmer.h
@@ -0,0 +1,9 @@
1
# Build rules for the bundled libstemmer_c sources.
# mkinc.mak is the makefile fragment that lists the library's source files
# in $(snowball_sources).
include mkinc.mak
CFLAGS=-Iinclude

# "all" and "clean" name actions rather than files; declaring them phony keeps
# them working even if a file with one of those names appears.
.PHONY: all clean

all: libstemmer.o stemwords

# Despite the .o suffix, this target is an ar archive of every compiled source.
libstemmer.o: $(snowball_sources:.c=.o)
	$(AR) -cru $@ $^

stemwords: examples/stemwords.o libstemmer.o
	$(CC) -o $@ $^

clean:
	rm -f stemwords *.o src_c/*.o runtime/*.o libstemmer/*.o
@@ -0,0 +1,125 @@
1
+ libstemmer_c
2
+ ============
3
+
4
+ This document pertains to the C version of the libstemmer distribution,
5
+ available for download from:
6
+
7
+ http://snowball.tartarus.org/dist/libstemmer_c.tgz
8
+
9
+
10
+ Compiling the library
11
+ =====================
12
+
13
+ A simple makefile is provided for Unix style systems. On such systems, it
14
+ should be possible simply to run "make", and the file "libstemmer.o"
15
+ and the example program "stemwords" will be generated.
16
+
17
+ If this doesn't work on your system, you need to write your own build
18
+ system (or call the compiler directly). The files to compile are
19
+ all contained in the "libstemmer", "runtime" and "src_c" directories,
20
+ and the public header file is contained in the "include" directory.
21
+
22
+ The library comes in two flavours; UTF-8 only, and UTF-8 plus other character
23
+ sets. To use the utf-8 only flavour, compile "libstemmer_utf8.c" instead of
24
+ "libstemmer.c".
25
+
26
+ For convenience "mkinc.mak" is a makefile fragment listing the source files and
27
+ header files used to compile the standard version of the library.
28
+ "mkinc_utf8.mak" is a comparable makefile fragment listing just the source
29
+ files for the UTF-8 only version of the library.
30
+
31
+
32
+ Using the library
33
+ =================
34
+
35
+ The library provides a simple C API. Essentially, a new stemmer can
36
+ be obtained by using "sb_stemmer_new". "sb_stemmer_stem" is then
37
+ used to stem a word, "sb_stemmer_length" returns the stemmed
38
+ length of the last word processed, and "sb_stemmer_delete" is
39
+ used to delete a stemmer.
40
+
41
+ Creating a stemmer is a relatively expensive operation - the expected
42
+ usage pattern is that a new stemmer is created when needed, used
43
+ to stem many words, and deleted after some time.
44
+
45
+ Stemmers are re-entrant, but not threadsafe. In other words, if
46
+ you wish to access the same stemmer object from multiple threads,
47
+ you must ensure that all access is protected by a mutex or similar
48
+ device.
49
+
50
+ libstemmer does not currently incorporate any mechanism for caching the results
51
+ of stemming operations. Such caching can greatly increase the performance of a
52
+ stemmer under certain situations, so suitable patches will be considered for
53
+ inclusion.
54
+
55
+ The standard libstemmer sources contain an algorithm for each of the supported
56
+ languages. The algorithm may be selected using the english name of the
57
+ language, or using the 2 or 3 letter ISO 639 language codes. In addition,
58
+ the traditional "Porter" stemming algorithm for english is included for
59
+ backwards compatibility purposes, but we recommend use of the "English"
60
+ stemmer in preference for new projects.
61
+
62
+ (Some minor algorithms which are included only as curiosities in the snowball
63
+ website, such as the Lovins stemmer and the Kraaij Pohlmann stemmer, are not
64
+ included in the standard libstemmer sources. These are not really supported by
65
+ the snowball project, but it would be possible to compile a modified libstemmer
66
+ library containing these if desired.)
67
+
68
+
69
+ The stemwords example
70
+ =====================
71
+
72
+ The stemwords example program allows you to run any of the stemmers
73
+ compiled into the libstemmer library on a sample vocabulary. For
74
+ details on how to use it, run it with the "-h" command line option.
75
+
76
+
77
+ Using the library in a larger system
78
+ ====================================
79
+
80
+ If you are incorporating the library into the build system of a larger
81
+ program, I recommend copying the unpacked tarball without modification into
82
+ a subdirectory of the sources of your program. Future versions of the
83
+ library are intended to keep the same structure, so this will keep the
84
+ work required to move to a new version of the library to a minimum.
85
+
86
+ As an additional convenience, the list of source and header files used
87
+ in the library is detailed in mkinc.mak - a file which is in a suitable
88
+ format for inclusion by a Makefile. By including this file in your build
89
+ system, you can link the snowball system into your program with a few
90
+ extra rules.
91
+
92
+ Using the library in a system using GNU autotools
93
+ =================================================
94
+
95
+ The libstemmer_c library can be integrated into a larger system which uses the
96
+ GNU autotool framework (and in particular, automake and autoconf) as follows:
97
+
98
+ 1) Unpack libstemmer_c.tgz in the top level project directory so that there is
99
+ a libstemmer_c subdirectory of the top level directory of the project.
100
+
101
+ 2) Add a file "Makefile.am" to the unpacked libstemmer_c folder, containing:
102
+
103
+ noinst_LTLIBRARIES = libstemmer.la
104
+ include $(srcdir)/mkinc.mak
105
+ noinst_HEADERS = $(snowball_headers)
106
+ libstemmer_la_SOURCES = $(snowball_sources)
107
+
108
+ (You may also need to add other lines to this, for example, if you are using
109
+ compiler options which are not compatible with compiling the libstemmer
110
+ library.)
111
+
112
+ 3) Add libstemmer_c to the AC_CONFIG_FILES declaration in the project's
113
+ configure.ac file.
114
+
115
+ 4) Add to the top level makefile the following lines (or modify existing
116
+ assignments to these variables appropriately):
117
+
118
+ AUTOMAKE_OPTIONS = subdir-objects
119
+ AM_CPPFLAGS = -I$(top_srcdir)/libstemmer_c/include
120
+ SUBDIRS=libstemmer_c
121
+ <name>_LIBADD = libstemmer_c/libstemmer.la
122
+
123
+ (Where <name> is the name of the library or executable which links against
124
+ libstemmer.)
125
+
@@ -0,0 +1,79 @@
1
+
2
+ /* Make header file work when included from C++ */
3
+ #ifdef __cplusplus
4
+ extern "C" {
5
+ #endif
6
+
7
+ struct sb_stemmer;
8
+ typedef unsigned char sb_symbol;
9
+
10
+ /* FIXME - should be able to get a version number for each stemming
11
+ * algorithm (which will be incremented each time the output changes). */
12
+
13
+ /** Returns an array of the names of the available stemming algorithms.
14
+ * Note that these are the canonical names - aliases (ie, other names for
15
+ * the same algorithm) will not be included in the list.
16
+ * The list is terminated with a null pointer.
17
+ *
18
+ * The list must not be modified in any way.
19
+ */
20
+ const char ** sb_stemmer_list(void);
21
+
22
+ /** Create a new stemmer object, using the specified algorithm, for the
23
+ * specified character encoding.
24
+ *
25
+ * All algorithms will usually be available in UTF-8, but may also be
26
+ * available in other character encodings.
27
+ *
28
+ * @param algorithm The algorithm name. This is either the english
29
+ * name of the algorithm, or the 2 or 3 letter ISO 639 codes for the
30
+ * language. Note that case is significant in this parameter - the
31
+ * value should be supplied in lower case.
32
+ *
33
+ * @param charenc The character encoding. NULL may be passed as
34
+ * this value, in which case UTF-8 encoding will be assumed. Otherwise,
35
+ * the argument may be one of "UTF_8", "ISO_8859_1" (ie, Latin 1),
36
+ * "CP850" (ie, MS-DOS Latin 1) or "KOI8_R" (Russian). Note that
37
+ * case is significant in this parameter.
38
+ *
39
+ * @return NULL if the specified algorithm is not recognised, or the
40
+ * algorithm is not available for the requested encoding. Otherwise,
41
+ * returns a pointer to a newly created stemmer for the requested algorithm.
42
+ * The returned pointer must be deleted by calling sb_stemmer_delete().
43
+ *
44
+ * @note NULL will also be returned if an out of memory error occurs.
45
+ */
46
+ struct sb_stemmer * sb_stemmer_new(const char * algorithm, const char * charenc);
47
+
48
+ /** Delete a stemmer object.
49
+ *
50
+ * This frees all resources allocated for the stemmer. After calling
51
+ * this function, the supplied stemmer may no longer be used in any way.
52
+ *
53
+ * It is safe to pass a null pointer to this function - this will have
54
+ * no effect.
55
+ */
56
+ void sb_stemmer_delete(struct sb_stemmer * stemmer);
57
+
58
+ /** Stem a word.
59
+ *
60
+ * The return value is owned by the stemmer - it must not be freed or
61
+ * modified, and it will become invalid when the stemmer is called again,
62
+ * or if the stemmer is freed.
63
+ *
64
+ * The length of the return value can be obtained using sb_stemmer_length().
65
+ *
66
+ * If an out-of-memory error occurs, this will return NULL.
67
+ */
68
+ const sb_symbol * sb_stemmer_stem(struct sb_stemmer * stemmer,
69
+ const sb_symbol * word, int size);
70
+
71
+ /** Get the length of the result of the last stemmed word.
72
+ * This should not be called before sb_stemmer_stem() has been called.
73
+ */
74
+ int sb_stemmer_length(struct sb_stemmer * stemmer);
75
+
76
+ #ifdef __cplusplus
77
+ }
78
+ #endif
79
+
@@ -0,0 +1,93 @@
1
+
2
+ #include <stdlib.h>
3
+ #include <string.h>
4
+ #include "../include/libstemmer.h"
5
+ #include "../runtime/api.h"
6
+ #include "modules.h"
7
+
8
/* Definition of the stemmer handle that libstemmer.h only forward-declares.
 * It stores the three entry points copied from the selected module and the
 * Snowball environment they are called with. */
struct sb_stemmer {
    /* Invoked by sb_stemmer_new() to produce env. */
    struct SN_env * (*create)(void);
    /* Invoked by sb_stemmer_delete() on env. */
    void (*close)(struct SN_env *);
    /* Invoked by sb_stemmer_stem(); a negative result is reported as failure. */
    int (*stem)(struct SN_env *);

    /* Environment returned by create(); its p and l members hold the stem
     * buffer and its length. */
    struct SN_env * env;
};
15
+
16
+ extern const char **
17
+ sb_stemmer_list(void)
18
+ {
19
+ return algorithm_names;
20
+ }
21
+
22
+ static stemmer_encoding_t
23
+ sb_getenc(const char * charenc)
24
+ {
25
+ struct stemmer_encoding * encoding;
26
+ if (charenc == NULL) return ENC_UTF_8;
27
+ for (encoding = encodings; encoding->name != 0; encoding++) {
28
+ if (strcmp(encoding->name, charenc) == 0) break;
29
+ }
30
+ if (encoding->name == NULL) return ENC_UNKNOWN;
31
+ return encoding->enc;
32
+ }
33
+
34
+ extern struct sb_stemmer *
35
+ sb_stemmer_new(const char * algorithm, const char * charenc)
36
+ {
37
+ stemmer_encoding_t enc;
38
+ struct stemmer_modules * module;
39
+ struct sb_stemmer * stemmer =
40
+ (struct sb_stemmer *) malloc(sizeof(struct sb_stemmer));
41
+ if (stemmer == NULL) return NULL;
42
+ enc = sb_getenc(charenc);
43
+ if (enc == ENC_UNKNOWN) return NULL;
44
+
45
+ for (module = modules; module->name != 0; module++) {
46
+ if (strcmp(module->name, algorithm) == 0 && module->enc == enc) break;
47
+ }
48
+ if (module->name == NULL) return NULL;
49
+
50
+ stemmer->create = module->create;
51
+ stemmer->close = module->close;
52
+ stemmer->stem = module->stem;
53
+
54
+ stemmer->env = stemmer->create();
55
+ if (stemmer->env == NULL)
56
+ {
57
+ sb_stemmer_delete(stemmer);
58
+ return NULL;
59
+ }
60
+
61
+ return stemmer;
62
+ }
63
+
64
+ void
65
+ sb_stemmer_delete(struct sb_stemmer * stemmer)
66
+ {
67
+ if (stemmer == 0) return;
68
+ if (stemmer->close == 0) return;
69
+ stemmer->close(stemmer->env);
70
+ stemmer->close = 0;
71
+ free(stemmer);
72
+ }
73
+
74
+ const sb_symbol *
75
+ sb_stemmer_stem(struct sb_stemmer * stemmer, const sb_symbol * word, int size)
76
+ {
77
+ int ret;
78
+ if (SN_set_current(stemmer->env, size, (const symbol *)(word)))
79
+ {
80
+ stemmer->env->l = 0;
81
+ return NULL;
82
+ }
83
+ ret = stemmer->stem(stemmer->env);
84
+ if (ret < 0) return NULL;
85
+ stemmer->env->p[stemmer->env->l] = 0;
86
+ return (const sb_symbol *)(stemmer->env->p);
87
+ }
88
+
89
+ int
90
+ sb_stemmer_length(struct sb_stemmer * stemmer)
91
+ {
92
+ return stemmer->env->l;
93
+ }
@@ -0,0 +1,93 @@
1
+
2
+ #include <stdlib.h>
3
+ #include <string.h>
4
+ #include "../include/libstemmer.h"
5
+ #include "../runtime/api.h"
6
+ #include "modules_utf8.h"
7
+
8
/* Definition of the stemmer handle that libstemmer.h only forward-declares.
 * It stores the three entry points copied from the selected module and the
 * Snowball environment they are called with. */
struct sb_stemmer {
    /* Invoked by sb_stemmer_new() to produce env. */
    struct SN_env * (*create)(void);
    /* Invoked by sb_stemmer_delete() on env. */
    void (*close)(struct SN_env *);
    /* Invoked by sb_stemmer_stem(); a negative result is reported as failure. */
    int (*stem)(struct SN_env *);

    /* Environment returned by create(); its p and l members hold the stem
     * buffer and its length. */
    struct SN_env * env;
};
15
+
16
+ extern const char **
17
+ sb_stemmer_list(void)
18
+ {
19
+ return algorithm_names;
20
+ }
21
+
22
+ static stemmer_encoding_t
23
+ sb_getenc(const char * charenc)
24
+ {
25
+ struct stemmer_encoding * encoding;
26
+ if (charenc == NULL) return ENC_UTF_8;
27
+ for (encoding = encodings; encoding->name != 0; encoding++) {
28
+ if (strcmp(encoding->name, charenc) == 0) break;
29
+ }
30
+ if (encoding->name == NULL) return ENC_UNKNOWN;
31
+ return encoding->enc;
32
+ }
33
+
34
+ extern struct sb_stemmer *
35
+ sb_stemmer_new(const char * algorithm, const char * charenc)
36
+ {
37
+ stemmer_encoding_t enc;
38
+ struct stemmer_modules * module;
39
+ struct sb_stemmer * stemmer =
40
+ (struct sb_stemmer *) malloc(sizeof(struct sb_stemmer));
41
+ if (stemmer == NULL) return NULL;
42
+ enc = sb_getenc(charenc);
43
+ if (enc == ENC_UNKNOWN) return NULL;
44
+
45
+ for (module = modules; module->name != 0; module++) {
46
+ if (strcmp(module->name, algorithm) == 0 && module->enc == enc) break;
47
+ }
48
+ if (module->name == NULL) return NULL;
49
+
50
+ stemmer->create = module->create;
51
+ stemmer->close = module->close;
52
+ stemmer->stem = module->stem;
53
+
54
+ stemmer->env = stemmer->create();
55
+ if (stemmer->env == NULL)
56
+ {
57
+ sb_stemmer_delete(stemmer);
58
+ return NULL;
59
+ }
60
+
61
+ return stemmer;
62
+ }
63
+
64
+ void
65
+ sb_stemmer_delete(struct sb_stemmer * stemmer)
66
+ {
67
+ if (stemmer == 0) return;
68
+ if (stemmer->close == 0) return;
69
+ stemmer->close(stemmer->env);
70
+ stemmer->close = 0;
71
+ free(stemmer);
72
+ }
73
+
74
+ const sb_symbol *
75
+ sb_stemmer_stem(struct sb_stemmer * stemmer, const sb_symbol * word, int size)
76
+ {
77
+ int ret;
78
+ if (SN_set_current(stemmer->env, size, (const symbol *)(word)))
79
+ {
80
+ stemmer->env->l = 0;
81
+ return NULL;
82
+ }
83
+ ret = stemmer->stem(stemmer->env);
84
+ if (ret < 0) return NULL;
85
+ stemmer->env->p[stemmer->env->l] = 0;
86
+ return (const sb_symbol *)(stemmer->env->p);
87
+ }
88
+
89
+ int
90
+ sb_stemmer_length(struct sb_stemmer * stemmer)
91
+ {
92
+ return stemmer->env->l;
93
+ }