mittens 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (137) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +3 -0
  3. data/Gemfile +7 -0
  4. data/LICENSE.txt +30 -0
  5. data/README.md +62 -0
  6. data/Rakefile +21 -0
  7. data/ext/mittens/ext.c +96 -0
  8. data/ext/mittens/extconf.rb +12 -0
  9. data/lib/mittens/version.rb +3 -0
  10. data/lib/mittens.rb +7 -0
  11. data/mittens.gemspec +22 -0
  12. data/vendor/snowball/.gitignore +26 -0
  13. data/vendor/snowball/.travis.yml +112 -0
  14. data/vendor/snowball/AUTHORS +27 -0
  15. data/vendor/snowball/CONTRIBUTING.rst +216 -0
  16. data/vendor/snowball/COPYING +29 -0
  17. data/vendor/snowball/GNUmakefile +742 -0
  18. data/vendor/snowball/NEWS +754 -0
  19. data/vendor/snowball/README.rst +37 -0
  20. data/vendor/snowball/ada/README.md +74 -0
  21. data/vendor/snowball/ada/generate/generate.adb +83 -0
  22. data/vendor/snowball/ada/generate.gpr +21 -0
  23. data/vendor/snowball/ada/src/stemmer.adb +620 -0
  24. data/vendor/snowball/ada/src/stemmer.ads +219 -0
  25. data/vendor/snowball/ada/src/stemwords.adb +70 -0
  26. data/vendor/snowball/ada/stemmer_config.gpr +83 -0
  27. data/vendor/snowball/ada/stemwords.gpr +21 -0
  28. data/vendor/snowball/algorithms/arabic.sbl +558 -0
  29. data/vendor/snowball/algorithms/armenian.sbl +301 -0
  30. data/vendor/snowball/algorithms/basque.sbl +149 -0
  31. data/vendor/snowball/algorithms/catalan.sbl +202 -0
  32. data/vendor/snowball/algorithms/danish.sbl +93 -0
  33. data/vendor/snowball/algorithms/dutch.sbl +164 -0
  34. data/vendor/snowball/algorithms/english.sbl +229 -0
  35. data/vendor/snowball/algorithms/finnish.sbl +197 -0
  36. data/vendor/snowball/algorithms/french.sbl +254 -0
  37. data/vendor/snowball/algorithms/german.sbl +139 -0
  38. data/vendor/snowball/algorithms/german2.sbl +145 -0
  39. data/vendor/snowball/algorithms/greek.sbl +701 -0
  40. data/vendor/snowball/algorithms/hindi.sbl +323 -0
  41. data/vendor/snowball/algorithms/hungarian.sbl +241 -0
  42. data/vendor/snowball/algorithms/indonesian.sbl +192 -0
  43. data/vendor/snowball/algorithms/irish.sbl +149 -0
  44. data/vendor/snowball/algorithms/italian.sbl +202 -0
  45. data/vendor/snowball/algorithms/kraaij_pohlmann.sbl +240 -0
  46. data/vendor/snowball/algorithms/lithuanian.sbl +373 -0
  47. data/vendor/snowball/algorithms/lovins.sbl +208 -0
  48. data/vendor/snowball/algorithms/nepali.sbl +92 -0
  49. data/vendor/snowball/algorithms/norwegian.sbl +80 -0
  50. data/vendor/snowball/algorithms/porter.sbl +139 -0
  51. data/vendor/snowball/algorithms/portuguese.sbl +218 -0
  52. data/vendor/snowball/algorithms/romanian.sbl +236 -0
  53. data/vendor/snowball/algorithms/russian.sbl +221 -0
  54. data/vendor/snowball/algorithms/serbian.sbl +2379 -0
  55. data/vendor/snowball/algorithms/spanish.sbl +230 -0
  56. data/vendor/snowball/algorithms/swedish.sbl +72 -0
  57. data/vendor/snowball/algorithms/tamil.sbl +405 -0
  58. data/vendor/snowball/algorithms/turkish.sbl +470 -0
  59. data/vendor/snowball/algorithms/yiddish.sbl +460 -0
  60. data/vendor/snowball/charsets/ISO-8859-2.sbl +98 -0
  61. data/vendor/snowball/charsets/KOI8-R.sbl +74 -0
  62. data/vendor/snowball/charsets/cp850.sbl +130 -0
  63. data/vendor/snowball/compiler/analyser.c +1547 -0
  64. data/vendor/snowball/compiler/driver.c +615 -0
  65. data/vendor/snowball/compiler/generator.c +1748 -0
  66. data/vendor/snowball/compiler/generator_ada.c +1702 -0
  67. data/vendor/snowball/compiler/generator_csharp.c +1322 -0
  68. data/vendor/snowball/compiler/generator_go.c +1278 -0
  69. data/vendor/snowball/compiler/generator_java.c +1313 -0
  70. data/vendor/snowball/compiler/generator_js.c +1316 -0
  71. data/vendor/snowball/compiler/generator_pascal.c +1387 -0
  72. data/vendor/snowball/compiler/generator_python.c +1337 -0
  73. data/vendor/snowball/compiler/generator_rust.c +1295 -0
  74. data/vendor/snowball/compiler/header.h +418 -0
  75. data/vendor/snowball/compiler/space.c +286 -0
  76. data/vendor/snowball/compiler/syswords.h +86 -0
  77. data/vendor/snowball/compiler/syswords2.h +13 -0
  78. data/vendor/snowball/compiler/tokeniser.c +567 -0
  79. data/vendor/snowball/csharp/.gitignore +8 -0
  80. data/vendor/snowball/csharp/Snowball/Algorithms/.gitignore +1 -0
  81. data/vendor/snowball/csharp/Snowball/Among.cs +108 -0
  82. data/vendor/snowball/csharp/Snowball/AssemblyInfo.cs +36 -0
  83. data/vendor/snowball/csharp/Snowball/Stemmer.cs +660 -0
  84. data/vendor/snowball/csharp/Stemwords/App.config +6 -0
  85. data/vendor/snowball/csharp/Stemwords/Program.cs +114 -0
  86. data/vendor/snowball/doc/TODO +12 -0
  87. data/vendor/snowball/doc/libstemmer_c_README +148 -0
  88. data/vendor/snowball/doc/libstemmer_csharp_README +53 -0
  89. data/vendor/snowball/doc/libstemmer_java_README +67 -0
  90. data/vendor/snowball/doc/libstemmer_js_README +48 -0
  91. data/vendor/snowball/doc/libstemmer_python_README +113 -0
  92. data/vendor/snowball/examples/stemwords.c +204 -0
  93. data/vendor/snowball/go/README.md +55 -0
  94. data/vendor/snowball/go/among.go +16 -0
  95. data/vendor/snowball/go/env.go +403 -0
  96. data/vendor/snowball/go/stemwords/generate.go +68 -0
  97. data/vendor/snowball/go/stemwords/main.go +68 -0
  98. data/vendor/snowball/go/util.go +34 -0
  99. data/vendor/snowball/iconv.py +50 -0
  100. data/vendor/snowball/include/libstemmer.h +78 -0
  101. data/vendor/snowball/java/org/tartarus/snowball/Among.java +29 -0
  102. data/vendor/snowball/java/org/tartarus/snowball/SnowballProgram.java +381 -0
  103. data/vendor/snowball/java/org/tartarus/snowball/SnowballStemmer.java +8 -0
  104. data/vendor/snowball/java/org/tartarus/snowball/TestApp.java +75 -0
  105. data/vendor/snowball/javascript/base-stemmer.js +294 -0
  106. data/vendor/snowball/javascript/stemwords.js +106 -0
  107. data/vendor/snowball/libstemmer/libstemmer_c.in +96 -0
  108. data/vendor/snowball/libstemmer/mkalgorithms.pl +90 -0
  109. data/vendor/snowball/libstemmer/mkmodules.pl +267 -0
  110. data/vendor/snowball/libstemmer/modules.txt +63 -0
  111. data/vendor/snowball/libstemmer/test.c +34 -0
  112. data/vendor/snowball/pascal/.gitignore +4 -0
  113. data/vendor/snowball/pascal/SnowballProgram.pas +430 -0
  114. data/vendor/snowball/pascal/generate.pl +23 -0
  115. data/vendor/snowball/pascal/stemwords-template.dpr +78 -0
  116. data/vendor/snowball/python/MANIFEST.in +7 -0
  117. data/vendor/snowball/python/create_init.py +54 -0
  118. data/vendor/snowball/python/setup.cfg +6 -0
  119. data/vendor/snowball/python/setup.py +81 -0
  120. data/vendor/snowball/python/snowballstemmer/among.py +13 -0
  121. data/vendor/snowball/python/snowballstemmer/basestemmer.py +323 -0
  122. data/vendor/snowball/python/stemwords.py +101 -0
  123. data/vendor/snowball/python/testapp.py +28 -0
  124. data/vendor/snowball/runtime/api.c +58 -0
  125. data/vendor/snowball/runtime/api.h +32 -0
  126. data/vendor/snowball/runtime/header.h +61 -0
  127. data/vendor/snowball/runtime/utilities.c +513 -0
  128. data/vendor/snowball/rust/Cargo.toml +7 -0
  129. data/vendor/snowball/rust/build.rs +55 -0
  130. data/vendor/snowball/rust/rust-pre-1.27-compat.patch +30 -0
  131. data/vendor/snowball/rust/src/main.rs +102 -0
  132. data/vendor/snowball/rust/src/snowball/algorithms/mod.rs +2 -0
  133. data/vendor/snowball/rust/src/snowball/among.rs +6 -0
  134. data/vendor/snowball/rust/src/snowball/mod.rs +6 -0
  135. data/vendor/snowball/rust/src/snowball/snowball_env.rs +421 -0
  136. data/vendor/snowball/tests/stemtest.c +95 -0
  137. metadata +178 -0
@@ -0,0 +1,114 @@
1
+ // Copyright (c) 2001, Dr Martin Porter
2
+ // Copyright (c) 2002, Richard Boulton
3
+ // Copyright (c) 2015, Cesar Souza
4
+ // All rights reserved.
5
+ //
6
+ // Redistribution and use in source and binary forms, with or without
7
+ // modification, are permitted provided that the following conditions are met:
8
+ //
9
+ // * Redistributions of source code must retain the above copyright notice,
10
+ // * this list of conditions and the following disclaimer.
11
+ // * Redistributions in binary form must reproduce the above copyright
12
+ // * notice, this list of conditions and the following disclaimer in the
13
+ // * documentation and/or other materials provided with the distribution.
14
+ // * Neither the name of the copyright holders nor the names of its contributors
15
+ // * may be used to endorse or promote products derived from this software
16
+ // * without specific prior written permission.
17
+ //
18
+ // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19
+ // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20
+ // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21
+ // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
22
+ // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23
+ // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
24
+ // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25
+ // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
26
+ // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27
+ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28
+
29
+ namespace Snowball
30
+ {
31
+ using System;
32
+ using System.IO;
33
+ using System.Reflection;
34
+ using System.Linq;
35
+ using System.Text;
36
+
37
+ /// <summary>
38
+ /// Snowball's Stemmer program.
39
+ /// </summary>
40
+ ///
41
+ public static class Program
42
+ {
43
+
44
+ private static void usage()
45
+ {
46
+ Console.Out.WriteLine("Usage: stemwords.exe -l <language> -i <input file> [-o <output file>]"); // command-line synopsis on standard output
47
+ }
48
+
49
+ /// <summary>
50
+ /// Main program entrypoint: stems each line of the input file (-i) with the
51
+ /// stemmer selected by -l and writes the results to the -o file or standard output.
52
+ /// </summary>
53
+ public static void Main(String[] args)
54
+ {
55
+ string language = null;
56
+ string inputName = null;
57
+ string outputName = null;
58
+
59
+ // Every option takes a value, so stop one short: a trailing flag cannot read past the array.
60
+ for (int i = 0; i + 1 < args.Length; i++)
61
+ {
62
+ if (args[i] == "-l")
63
+ language = args[i + 1];
64
+ else if (args[i] == "-i")
65
+ inputName = args[i + 1];
66
+ else if (args[i] == "-o")
67
+ outputName = args[i + 1];
68
+ }
69
+
70
+ if (language == null || inputName == null)
71
+ {
72
+ usage();
73
+ return;
74
+ }
75
+
76
+ // Instantiate the first concrete Stemmer subclass in this assembly whose name matches.
77
+ Stemmer stemmer =
78
+ typeof(Stemmer).Assembly.GetTypes()
79
+ .Where(t => t.IsSubclassOf(typeof(Stemmer)) && !t.IsAbstract)
80
+ .Where(t => match(t.Name, language))
81
+ .Select(t => (Stemmer)Activator.CreateInstance(t)).FirstOrDefault();
82
+
83
+ if (stemmer == null)
84
+ {
85
+ Console.WriteLine("Language not found.");
86
+ return;
87
+ }
88
+ Console.WriteLine("Using " + stemmer.GetType());
89
+
90
+ TextWriter output = outputName != null ? new StreamWriter(outputName) : Console.Out;
91
+ try
92
+ {
93
+ foreach (var line in File.ReadAllLines(inputName))
94
+ output.WriteLine(stemmer.Stem(line));
95
+ }
96
+ finally
97
+ {
98
+ // Dispose flushes and closes a file we opened; Console.Out is only flushed.
99
+ if (outputName != null)
100
+ output.Dispose();
101
+ else
102
+ output.Flush();
103
+ }
104
+ }
105
+
106
+ private static bool match(string stemmerName, string language)
107
+ {
108
+ // True when stemmerName begins with "<language>Stemmer" (underscores removed, case ignored).
109
+ // Ordinal, not culture-aware: under e.g. tr-TR 'i' and 'I' are different letters, which would break matching.
110
+ string expectedName = language.Replace("_", "") + "Stemmer";
111
+ return stemmerName.StartsWith(expectedName, StringComparison.OrdinalIgnoreCase);
112
+ }
113
+ }
114
+ }
@@ -0,0 +1,12 @@
1
+ Things to do:
2
+
3
+ - Write documentation for how to use libstemmer (as opposed to how stemming
4
+ algorithms themselves work).
5
+ Currently, the documentation in the include/libstemmer.h header file is
6
+ pretty clear and comprehensive, but an overview document wouldn't go amiss.
7
+
8
+ Things that would be nice to include at some point.
9
+
10
+ - Add version numbers to each stemming algorithm, and allow the interface to
11
+ request a specific version of the stemming algorithms. Default to providing
12
+ the latest version of the algorithm.
@@ -0,0 +1,148 @@
1
+ libstemmer_c
2
+ ============
3
+
4
+ This document pertains to the C version of the libstemmer distribution,
5
+ available for download from:
6
+
7
+ https://snowballstem.org/download.html
8
+
9
+
10
+ What is Stemming?
11
+ -----------------
12
+
13
+ Stemming maps different forms of the same word to a common "stem" - for
14
+ example, the English stemmer maps *connection*, *connections*, *connective*,
15
+ *connected*, and *connecting* to *connect*. So searching for *connected*
16
+ would also find documents which only have the other forms.
17
+
18
+ This stem form is often a word itself, but this is not always the case as this
19
+ is not a requirement for text search systems, which are the intended field of
20
+ use. We also aim to conflate words with the same meaning, rather than all
21
+ words with a common linguistic root (so *awe* and *awful* don't have the same
22
+ stem), and over-stemming is more problematic than under-stemming so we tend not
23
+ to stem in cases that are hard to resolve. If you want to always reduce words
24
+ to a root form and/or get a root form which is itself a word then Snowball's
25
+ stemming algorithms likely aren't the right answer.
26
+
27
+
28
+ Compiling the library
29
+ =====================
30
+
31
+ A simple makefile is provided for Unix style systems. On such systems, it
32
+ should be possible simply to run "make", and the file "libstemmer.o"
33
+ and the example program "stemwords" will be generated.
34
+
35
+ If this doesn't work on your system, you need to write your own build
36
+ system (or call the compiler directly). The files to compile are
37
+ all contained in the "libstemmer", "runtime" and "src_c" directories,
38
+ and the public header file is contained in the "include" directory.
39
+
40
+ The library comes in two flavours; UTF-8 only, and UTF-8 plus other character
41
+ sets. To use the UTF-8 only flavour, compile "libstemmer_utf8.c" instead of
42
+ "libstemmer.c".
43
+
44
+ For convenience "mkinc.mak" is a makefile fragment listing the source files and
45
+ header files used to compile the standard version of the library.
46
+ "mkinc_utf8.mak" is a comparable makefile fragment listing just the source
47
+ files for the UTF-8 only version of the library.
48
+
49
+
50
+ Using the library
51
+ =================
52
+
53
+ The library provides a simple C API. Essentially, a new stemmer can
54
+ be obtained by using "sb_stemmer_new". "sb_stemmer_stem" is then
55
+ used to stem a word, "sb_stemmer_length" returns the stemmed
56
+ length of the last word processed, and "sb_stemmer_delete" is
57
+ used to delete a stemmer.
58
+
59
+ Generally you should create a stemmer object and reuse it rather than creating
60
+ a fresh object for each word stemmed (since there's some cost to creating and
61
+ destroying the object).
62
+
63
+ The stemmer code is re-entrant, but not thread-safe if the same stemmer object
64
+ is used concurrently in different threads.
65
+
66
+ If you want to perform stemming concurrently in different threads, we suggest
67
+ creating a new stemmer object for each thread. The alternative is to share
68
+ stemmer objects between threads and protect access using a mutex or similar
69
+ but that's liable to slow your program down as threads can end up waiting for
70
+ the lock.
71
+
72
+ libstemmer does not currently incorporate any mechanism for caching the results
73
+ of stemming operations. Such caching can greatly increase the performance of a
74
+ stemmer under certain situations, so suitable patches will be considered for
75
+ inclusion.
76
+
77
+ The standard libstemmer sources contain an algorithm for each of the supported
78
+ languages. The algorithm may be selected using the English name of the
79
+ language, or using the 2 or 3 letter ISO 639 language codes. In addition,
80
+ the traditional "Porter" stemming algorithm for English is included for
81
+ backwards compatibility purposes, but we recommend use of the "English"
82
+ stemmer in preference for new projects.
83
+
84
+ (Some minor algorithms which are included only as curiosities in the snowball
85
+ website, such as the Lovins stemmer and the Kraaij Pohlmann stemmer, are not
86
+ included in the standard libstemmer sources. These are not really supported by
87
+ the snowball project, but it would be possible to compile a modified libstemmer
88
+ library containing these if desired.)
89
+
90
+
91
+ The stemwords example
92
+ =====================
93
+
94
+ The stemwords example program allows you to run any of the stemmers
95
+ compiled into the libstemmer library on a sample vocabulary. For
96
+ details on how to use it, run it with the "-h" command line option.
97
+
98
+
99
+ Using the library in a larger system
100
+ ====================================
101
+
102
+ If you are incorporating the library into the build system of a larger
103
+ program, I recommend copying the unpacked tarball without modification into
104
+ a subdirectory of the sources of your program. Future versions of the
105
+ library are intended to keep the same structure, so this will keep the
106
+ work required to move to a new version of the library to a minimum.
107
+
108
+ As an additional convenience, the list of source and header files used
109
+ in the library is detailed in mkinc.mak - a file which is in a suitable
110
+ format for inclusion by a Makefile. By including this file in your build
111
+ system, you can link the snowball system into your program with a few
112
+ extra rules.
113
+
114
+ Using the library in a system using GNU autotools
115
+ =================================================
116
+
117
+ The libstemmer_c library can be integrated into a larger system which uses the
118
+ GNU autotool framework (and in particular, automake and autoconf) as follows:
119
+
120
+ 1) Unpack libstemmer_c-*.tar.gz in the top level project directory and rename
121
+ the resulting directory to remove the version number so that there is a
122
+ libstemmer_c subdirectory of the top level directory of the project.
123
+
124
+ 2) Add a file "Makefile.am" to the unpacked libstemmer_c folder, containing:
125
+
126
+ noinst_LTLIBRARIES = libstemmer.la
127
+ include $(srcdir)/mkinc.mak
128
+ noinst_HEADERS = $(snowball_headers)
129
+ libstemmer_la_SOURCES = $(snowball_sources)
130
+
131
+ (You may also need to add other lines to this, for example, if you are using
132
+ compiler options which are not compatible with compiling the libstemmer
133
+ library.)
134
+
135
+ 3) Add libstemmer_c to the AC_CONFIG_FILES declaration in the project's
136
+ configure.ac file.
137
+
138
+ 4) Add to the top level makefile the following lines (or modify existing
139
+ assignments to these variables appropriately):
140
+
141
+ AUTOMAKE_OPTIONS = subdir-objects
142
+ AM_CPPFLAGS = -I$(top_srcdir)/libstemmer_c/include
143
+ SUBDIRS=libstemmer_c
144
+ <name>_LIBADD = libstemmer_c/libstemmer.la
145
+
146
+ (Where <name> is the name of the library or executable which links against
147
+ libstemmer.)
148
+
@@ -0,0 +1,53 @@
1
+ libstemmer_csharp
2
+ =================
3
+
4
+ This document pertains to the C# version of the libstemmer distribution,
5
+ available for download from:
6
+
7
+ https://snowballstem.org/download.html
8
+
9
+
10
+ What is Stemming?
11
+ -----------------
12
+
13
+ Stemming maps different forms of the same word to a common "stem" - for
14
+ example, the English stemmer maps *connection*, *connections*, *connective*,
15
+ *connected*, and *connecting* to *connect*. So searching for *connected*
16
+ would also find documents which only have the other forms.
17
+
18
+ This stem form is often a word itself, but this is not always the case as this
19
+ is not a requirement for text search systems, which are the intended field of
20
+ use. We also aim to conflate words with the same meaning, rather than all
21
+ words with a common linguistic root (so *awe* and *awful* don't have the same
22
+ stem), and over-stemming is more problematic than under-stemming so we tend not
23
+ to stem in cases that are hard to resolve. If you want to always reduce words
24
+ to a root form and/or get a root form which is itself a word then Snowball's
25
+ stemming algorithms likely aren't the right answer.
26
+
27
+
28
+ Compiling the library
29
+ =====================
30
+
31
+ To build a library::
32
+
33
+ mcs -target:library -out:snowballstemmer.dll csharp/Snowball/*.cs csharp/Snowball/Algorithms/*.cs
34
+
35
+ And to build the example program using that library::
36
+
37
+ mcs -target:exe -out:stemwords.exe -r:snowballstemmer.dll csharp/Stemwords/Program.cs
38
+
39
+ Using the library
40
+ =================
41
+
42
+ There is currently no formal documentation on the use of the C# version
43
+ of the library. Additionally, its interface is not guaranteed to be
44
+ stable.
45
+
46
+ The stemmer code is re-entrant, but not thread-safe if the same stemmer object
47
+ is used concurrently in different threads.
48
+
49
+ If you want to perform stemming concurrently in different threads, we suggest
50
+ creating a new stemmer object for each thread. The alternative is to share
51
+ stemmer objects between threads and protect access using a mutex or similar
52
+ but that's liable to slow your program down as threads can end up waiting for
53
+ the lock.
@@ -0,0 +1,67 @@
1
+ libstemmer_java
2
+ ===============
3
+
4
+ This document pertains to the Java version of the libstemmer distribution,
5
+ available for download from:
6
+
7
+ https://snowballstem.org/download.html
8
+
9
+
10
+ What is Stemming?
11
+ -----------------
12
+
13
+ Stemming maps different forms of the same word to a common "stem" - for
14
+ example, the English stemmer maps *connection*, *connections*, *connective*,
15
+ *connected*, and *connecting* to *connect*. So searching for *connected*
16
+ would also find documents which only have the other forms.
17
+
18
+ This stem form is often a word itself, but this is not always the case as this
19
+ is not a requirement for text search systems, which are the intended field of
20
+ use. We also aim to conflate words with the same meaning, rather than all
21
+ words with a common linguistic root (so *awe* and *awful* don't have the same
22
+ stem), and over-stemming is more problematic than under-stemming so we tend not
23
+ to stem in cases that are hard to resolve. If you want to always reduce words
24
+ to a root form and/or get a root form which is itself a word then Snowball's
25
+ stemming algorithms likely aren't the right answer.
26
+
27
+
28
+ Compiling the library
29
+ =====================
30
+
31
+ Simply run the java compiler on all the java source files under the java
32
+ directory. For example, this can be done under unix by changing directory into
33
+ the java directory, and running:
34
+
35
+ javac org/tartarus/snowball/*.java org/tartarus/snowball/ext/*.java
36
+
37
+ This will compile the library and also an example program "TestApp" which
38
+ provides a command line interface to the library.
39
+
40
+
41
+ Using the library
42
+ =================
43
+
44
+ There is currently no formal documentation on the use of the Java version
45
+ of the library. Additionally, its interface is not guaranteed to be
46
+ stable.
47
+
48
+ The best documentation of the library is the source of the TestApp example
49
+ program.
50
+
51
+ The stemmer code is re-entrant, but not thread-safe if the same stemmer object
52
+ is used concurrently in different threads.
53
+
54
+ If you want to perform stemming concurrently in different threads, we suggest
55
+ creating a new stemmer object for each thread. The alternative is to share
56
+ stemmer objects between threads and protect access using a mutex or similar
57
+ but that's liable to slow your program down as threads can end up waiting for
58
+ the lock.
59
+
60
+
61
+ The TestApp example
62
+ ===================
63
+
64
+ The TestApp example program allows you to run any of the stemmers
65
+ compiled into the libstemmer library on a sample vocabulary. For
66
+ details on how to use it, run it with no command line parameters.
67
+
@@ -0,0 +1,48 @@
1
+ Snowball stemming library collection for Javascript
2
+ ===================================================
3
+
4
+ What is Stemming?
5
+ -----------------
6
+
7
+ Stemming maps different forms of the same word to a common "stem" - for
8
+ example, the English stemmer maps *connection*, *connections*, *connective*,
9
+ *connected*, and *connecting* to *connect*. So searching for *connected*
10
+ would also find documents which only have the other forms.
11
+
12
+ This stem form is often a word itself, but this is not always the case as this
13
+ is not a requirement for text search systems, which are the intended field of
14
+ use. We also aim to conflate words with the same meaning, rather than all
15
+ words with a common linguistic root (so *awe* and *awful* don't have the same
16
+ stem), and over-stemming is more problematic than under-stemming so we tend not
17
+ to stem in cases that are hard to resolve. If you want to always reduce words
18
+ to a root form and/or get a root form which is itself a word then Snowball's
19
+ stemming algorithms likely aren't the right answer.
20
+
21
+
22
+ How to use library
23
+ ------------------
24
+
25
+ You can use each stemming module from JavaScript code - e.g. to use them
26
+ with node:
27
+
28
+ .. code-block:: javascript
29
+
30
+ const stemmer = require('base-stemmer.js');
31
+ const english_stemmer = require('english-stemmer.js');
32
+
33
+ var stemmer = new EnglishStemmer();
34
+ alert(stemmer.stemWord("testing"));
35
+
36
+ You'll need to bundle ``base-stemmer.js`` and whichever languages you want
37
+ stemmers for (e.g. ``english-stemmer.js`` for English).
38
+
39
+ FIXME: Document how to use in a web browser.
40
+
41
+ The stemmer code is re-entrant, but not thread-safe if the same stemmer object
42
+ is used concurrently in different threads.
43
+
44
+ If you want to perform stemming concurrently in different threads, we suggest
45
+ creating a new stemmer object for each thread. The alternative is to share
46
+ stemmer objects between threads and protect access using a mutex or similar
47
+ but that's liable to slow your program down as threads can end up waiting for
48
+ the lock.
@@ -0,0 +1,113 @@
1
+ Snowball stemming library collection for Python
2
+ ===============================================
3
+
4
+ Python 3 (>= 3.3) is supported. We no longer actively support Python 2 as
5
+ the Python developers stopped supporting it at the start of 2020. Snowball
6
+ 2.1.0 was the last release to officially support Python 2.
7
+
8
+ What is Stemming?
9
+ -----------------
10
+
11
+ Stemming maps different forms of the same word to a common "stem" - for
12
+ example, the English stemmer maps *connection*, *connections*, *connective*,
13
+ *connected*, and *connecting* to *connect*. So searching for *connected*
14
+ would also find documents which only have the other forms.
15
+
16
+ This stem form is often a word itself, but this is not always the case as this
17
+ is not a requirement for text search systems, which are the intended field of
18
+ use. We also aim to conflate words with the same meaning, rather than all
19
+ words with a common linguistic root (so *awe* and *awful* don't have the same
20
+ stem), and over-stemming is more problematic than under-stemming so we tend not
21
+ to stem in cases that are hard to resolve. If you want to always reduce words
22
+ to a root form and/or get a root form which is itself a word then Snowball's
23
+ stemming algorithms likely aren't the right answer.
24
+
25
+ How to use library
26
+ ------------------
27
+
28
+ The ``snowballstemmer`` module has two functions.
29
+
30
+ The ``snowballstemmer.algorithms`` function returns a list of available
31
+ algorithm names.
32
+
33
+ The ``snowballstemmer.stemmer`` function takes an algorithm name and returns a
34
+ ``Stemmer`` object.
35
+
36
+ ``Stemmer`` objects have a ``Stemmer.stemWord(word)`` method and a
37
+ ``Stemmer.stemWords(word[])`` method.
38
+
39
+ .. code-block:: python
40
+
41
+ import snowballstemmer
42
+
43
+ stemmer = snowballstemmer.stemmer('english');
44
+ print(stemmer.stemWords("We are the world".split()));
45
+
46
+ Generally you should create a stemmer object and reuse it rather than creating
47
+ a fresh object for each word stemmed (since there's some cost to creating and
48
+ destroying the object).
49
+
50
+ The stemmer code is re-entrant, but not thread-safe if the same stemmer object
51
+ is used concurrently in different threads.
52
+
53
+ If you want to perform stemming concurrently in different threads, we suggest
54
+ creating a new stemmer object for each thread. The alternative is to share
55
+ stemmer objects between threads and protect access using a mutex or similar
56
+ (e.g. `threading.Lock` in Python) but that's liable to slow your program down
57
+ as threads can end up waiting for the lock.
58
+
59
+ Automatic Acceleration
60
+ ----------------------
61
+
62
+ `PyStemmer <https://pypi.org/project/PyStemmer/>`_ is a wrapper module for
63
+ Snowball's ``libstemmer_c`` and should provide results 100% compatible with
64
+ **snowballstemmer**.
65
+
66
+ **PyStemmer** is faster because it wraps generated C versions of the stemmers;
67
+ **snowballstemmer** uses generated Python code and is slower but offers a pure
68
+ Python solution.
69
+
70
+ If PyStemmer is installed, ``snowballstemmer.stemmer`` returns a ``PyStemmer``
71
+ ``Stemmer`` object which provides the same ``Stemmer.stemWord()`` and
72
+ ``Stemmer.stemWords()`` methods.
73
+
74
+ Benchmark
75
+ ~~~~~~~~~
76
+
77
+ This is a crude benchmark which measures the time for running each stemmer on
78
+ every word in its sample vocabulary (10,787,583 words over 26 languages). It's
79
+ not a realistic test of normal use as a real application would do much more
80
+ than just stemming. It's also skewed towards the stemmers which do more work
81
+ per word and towards those with larger sample vocabularies.
82
+
83
+ * Python 2.7 + **snowballstemmer** : 13m00s (15.0 * PyStemmer)
84
+ * Python 3.7 + **snowballstemmer** : 12m19s (14.2 * PyStemmer)
85
+ * PyPy 7.1.1 (Python 2.7.13) + **snowballstemmer** : 2m14s (2.6 * PyStemmer)
86
+ * PyPy 7.1.1 (Python 3.6.1) + **snowballstemmer** : 1m46s (2.0 * PyStemmer)
87
+ * Python 2.7 + **PyStemmer** : 52s
88
+
89
+ For reference the equivalent test for C runs in 9 seconds.
90
+
91
+ These results are for Snowball 2.0.0. They're likely to evolve over time as
92
+ the code Snowball generates for both Python and C continues to improve (for
93
+ a much older test over a different set of stemmers using Python 2.7,
94
+ **snowballstemmer** was 30 times slower than **PyStemmer**, or 9 times slower
95
+ with **PyPy**).
96
+
97
+ The message to take away is that if you're stemming a lot of words you should
98
+ either install **PyStemmer** (which **snowballstemmer** will then automatically
99
+ use for you as described above) or use PyPy.
100
+
101
+ The TestApp example
102
+ -------------------
103
+
104
+ The ``testapp.py`` example program allows you to run any of the stemmers
105
+ on a sample vocabulary.
106
+
107
+ Usage::
108
+
109
+ testapp.py <algorithm> "sentences ... "
110
+
111
+ .. code-block:: bash
112
+
113
+ $ python testapp.py English "sentences... "