mittens 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +3 -0
- data/Gemfile +7 -0
- data/LICENSE.txt +30 -0
- data/README.md +62 -0
- data/Rakefile +21 -0
- data/ext/mittens/ext.c +96 -0
- data/ext/mittens/extconf.rb +12 -0
- data/lib/mittens/version.rb +3 -0
- data/lib/mittens.rb +7 -0
- data/mittens.gemspec +22 -0
- data/vendor/snowball/.gitignore +26 -0
- data/vendor/snowball/.travis.yml +112 -0
- data/vendor/snowball/AUTHORS +27 -0
- data/vendor/snowball/CONTRIBUTING.rst +216 -0
- data/vendor/snowball/COPYING +29 -0
- data/vendor/snowball/GNUmakefile +742 -0
- data/vendor/snowball/NEWS +754 -0
- data/vendor/snowball/README.rst +37 -0
- data/vendor/snowball/ada/README.md +74 -0
- data/vendor/snowball/ada/generate/generate.adb +83 -0
- data/vendor/snowball/ada/generate.gpr +21 -0
- data/vendor/snowball/ada/src/stemmer.adb +620 -0
- data/vendor/snowball/ada/src/stemmer.ads +219 -0
- data/vendor/snowball/ada/src/stemwords.adb +70 -0
- data/vendor/snowball/ada/stemmer_config.gpr +83 -0
- data/vendor/snowball/ada/stemwords.gpr +21 -0
- data/vendor/snowball/algorithms/arabic.sbl +558 -0
- data/vendor/snowball/algorithms/armenian.sbl +301 -0
- data/vendor/snowball/algorithms/basque.sbl +149 -0
- data/vendor/snowball/algorithms/catalan.sbl +202 -0
- data/vendor/snowball/algorithms/danish.sbl +93 -0
- data/vendor/snowball/algorithms/dutch.sbl +164 -0
- data/vendor/snowball/algorithms/english.sbl +229 -0
- data/vendor/snowball/algorithms/finnish.sbl +197 -0
- data/vendor/snowball/algorithms/french.sbl +254 -0
- data/vendor/snowball/algorithms/german.sbl +139 -0
- data/vendor/snowball/algorithms/german2.sbl +145 -0
- data/vendor/snowball/algorithms/greek.sbl +701 -0
- data/vendor/snowball/algorithms/hindi.sbl +323 -0
- data/vendor/snowball/algorithms/hungarian.sbl +241 -0
- data/vendor/snowball/algorithms/indonesian.sbl +192 -0
- data/vendor/snowball/algorithms/irish.sbl +149 -0
- data/vendor/snowball/algorithms/italian.sbl +202 -0
- data/vendor/snowball/algorithms/kraaij_pohlmann.sbl +240 -0
- data/vendor/snowball/algorithms/lithuanian.sbl +373 -0
- data/vendor/snowball/algorithms/lovins.sbl +208 -0
- data/vendor/snowball/algorithms/nepali.sbl +92 -0
- data/vendor/snowball/algorithms/norwegian.sbl +80 -0
- data/vendor/snowball/algorithms/porter.sbl +139 -0
- data/vendor/snowball/algorithms/portuguese.sbl +218 -0
- data/vendor/snowball/algorithms/romanian.sbl +236 -0
- data/vendor/snowball/algorithms/russian.sbl +221 -0
- data/vendor/snowball/algorithms/serbian.sbl +2379 -0
- data/vendor/snowball/algorithms/spanish.sbl +230 -0
- data/vendor/snowball/algorithms/swedish.sbl +72 -0
- data/vendor/snowball/algorithms/tamil.sbl +405 -0
- data/vendor/snowball/algorithms/turkish.sbl +470 -0
- data/vendor/snowball/algorithms/yiddish.sbl +460 -0
- data/vendor/snowball/charsets/ISO-8859-2.sbl +98 -0
- data/vendor/snowball/charsets/KOI8-R.sbl +74 -0
- data/vendor/snowball/charsets/cp850.sbl +130 -0
- data/vendor/snowball/compiler/analyser.c +1547 -0
- data/vendor/snowball/compiler/driver.c +615 -0
- data/vendor/snowball/compiler/generator.c +1748 -0
- data/vendor/snowball/compiler/generator_ada.c +1702 -0
- data/vendor/snowball/compiler/generator_csharp.c +1322 -0
- data/vendor/snowball/compiler/generator_go.c +1278 -0
- data/vendor/snowball/compiler/generator_java.c +1313 -0
- data/vendor/snowball/compiler/generator_js.c +1316 -0
- data/vendor/snowball/compiler/generator_pascal.c +1387 -0
- data/vendor/snowball/compiler/generator_python.c +1337 -0
- data/vendor/snowball/compiler/generator_rust.c +1295 -0
- data/vendor/snowball/compiler/header.h +418 -0
- data/vendor/snowball/compiler/space.c +286 -0
- data/vendor/snowball/compiler/syswords.h +86 -0
- data/vendor/snowball/compiler/syswords2.h +13 -0
- data/vendor/snowball/compiler/tokeniser.c +567 -0
- data/vendor/snowball/csharp/.gitignore +8 -0
- data/vendor/snowball/csharp/Snowball/Algorithms/.gitignore +1 -0
- data/vendor/snowball/csharp/Snowball/Among.cs +108 -0
- data/vendor/snowball/csharp/Snowball/AssemblyInfo.cs +36 -0
- data/vendor/snowball/csharp/Snowball/Stemmer.cs +660 -0
- data/vendor/snowball/csharp/Stemwords/App.config +6 -0
- data/vendor/snowball/csharp/Stemwords/Program.cs +114 -0
- data/vendor/snowball/doc/TODO +12 -0
- data/vendor/snowball/doc/libstemmer_c_README +148 -0
- data/vendor/snowball/doc/libstemmer_csharp_README +53 -0
- data/vendor/snowball/doc/libstemmer_java_README +67 -0
- data/vendor/snowball/doc/libstemmer_js_README +48 -0
- data/vendor/snowball/doc/libstemmer_python_README +113 -0
- data/vendor/snowball/examples/stemwords.c +204 -0
- data/vendor/snowball/go/README.md +55 -0
- data/vendor/snowball/go/among.go +16 -0
- data/vendor/snowball/go/env.go +403 -0
- data/vendor/snowball/go/stemwords/generate.go +68 -0
- data/vendor/snowball/go/stemwords/main.go +68 -0
- data/vendor/snowball/go/util.go +34 -0
- data/vendor/snowball/iconv.py +50 -0
- data/vendor/snowball/include/libstemmer.h +78 -0
- data/vendor/snowball/java/org/tartarus/snowball/Among.java +29 -0
- data/vendor/snowball/java/org/tartarus/snowball/SnowballProgram.java +381 -0
- data/vendor/snowball/java/org/tartarus/snowball/SnowballStemmer.java +8 -0
- data/vendor/snowball/java/org/tartarus/snowball/TestApp.java +75 -0
- data/vendor/snowball/javascript/base-stemmer.js +294 -0
- data/vendor/snowball/javascript/stemwords.js +106 -0
- data/vendor/snowball/libstemmer/libstemmer_c.in +96 -0
- data/vendor/snowball/libstemmer/mkalgorithms.pl +90 -0
- data/vendor/snowball/libstemmer/mkmodules.pl +267 -0
- data/vendor/snowball/libstemmer/modules.txt +63 -0
- data/vendor/snowball/libstemmer/test.c +34 -0
- data/vendor/snowball/pascal/.gitignore +4 -0
- data/vendor/snowball/pascal/SnowballProgram.pas +430 -0
- data/vendor/snowball/pascal/generate.pl +23 -0
- data/vendor/snowball/pascal/stemwords-template.dpr +78 -0
- data/vendor/snowball/python/MANIFEST.in +7 -0
- data/vendor/snowball/python/create_init.py +54 -0
- data/vendor/snowball/python/setup.cfg +6 -0
- data/vendor/snowball/python/setup.py +81 -0
- data/vendor/snowball/python/snowballstemmer/among.py +13 -0
- data/vendor/snowball/python/snowballstemmer/basestemmer.py +323 -0
- data/vendor/snowball/python/stemwords.py +101 -0
- data/vendor/snowball/python/testapp.py +28 -0
- data/vendor/snowball/runtime/api.c +58 -0
- data/vendor/snowball/runtime/api.h +32 -0
- data/vendor/snowball/runtime/header.h +61 -0
- data/vendor/snowball/runtime/utilities.c +513 -0
- data/vendor/snowball/rust/Cargo.toml +7 -0
- data/vendor/snowball/rust/build.rs +55 -0
- data/vendor/snowball/rust/rust-pre-1.27-compat.patch +30 -0
- data/vendor/snowball/rust/src/main.rs +102 -0
- data/vendor/snowball/rust/src/snowball/algorithms/mod.rs +2 -0
- data/vendor/snowball/rust/src/snowball/among.rs +6 -0
- data/vendor/snowball/rust/src/snowball/mod.rs +6 -0
- data/vendor/snowball/rust/src/snowball/snowball_env.rs +421 -0
- data/vendor/snowball/tests/stemtest.c +95 -0
- metadata +178 -0
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
// Copyright (c) 2001, Dr Martin Porter
|
|
2
|
+
// Copyright (c) 2002, Richard Boulton
|
|
3
|
+
// Copyright (c) 2015, Cesar Souza
|
|
4
|
+
// All rights reserved.
|
|
5
|
+
//
|
|
6
|
+
// Redistribution and use in source and binary forms, with or without
|
|
7
|
+
// modification, are permitted provided that the following conditions are met:
|
|
8
|
+
//
|
|
9
|
+
// * Redistributions of source code must retain the above copyright notice,
|
|
10
|
+
// * this list of conditions and the following disclaimer.
|
|
11
|
+
// * Redistributions in binary form must reproduce the above copyright
|
|
12
|
+
// * notice, this list of conditions and the following disclaimer in the
|
|
13
|
+
// * documentation and/or other materials provided with the distribution.
|
|
14
|
+
// * Neither the name of the copyright holders nor the names of its contributors
|
|
15
|
+
// * may be used to endorse or promote products derived from this software
|
|
16
|
+
// * without specific prior written permission.
|
|
17
|
+
//
|
|
18
|
+
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
19
|
+
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
20
|
+
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
21
|
+
// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
|
|
22
|
+
// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
23
|
+
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
|
24
|
+
// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|
25
|
+
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
|
26
|
+
// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
27
|
+
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
28
|
+
|
|
29
|
+
namespace Snowball
|
|
30
|
+
{
|
|
31
|
+
using System;
|
|
32
|
+
using System.IO;
|
|
33
|
+
using System.Reflection;
|
|
34
|
+
using System.Linq;
|
|
35
|
+
using System.Text;
|
|
36
|
+
|
|
37
|
+
/// <summary>
///   Snowball's Stemmer program: a small command-line driver that stems
///   every line of an input file with the stemmer selected by language name.
/// </summary>
///
public static class Program
{
    /// <summary>
    ///   Prints the command-line synopsis to standard output.
    /// </summary>
    ///
    private static void usage()
    {
        Console.WriteLine("Usage: stemwords.exe -l <language> -i <input file> [-o <output file>]");
    }

    /// <summary>
    ///   Main program entrypoint.
    /// </summary>
    ///
    /// <param name="args">
    ///   Command-line arguments. Recognised flags are <c>-l</c> (language),
    ///   <c>-i</c> (input file) and the optional <c>-o</c> (output file); each
    ///   takes the following argument as its value. When <c>-l</c> or
    ///   <c>-i</c> is missing, or a flag has no value after it, the usage
    ///   text is printed and the program returns.
    /// </param>
    ///
    public static void Main(String[] args)
    {
        string language = null;
        string inputName = null;
        string outputName = null;

        for (int i = 0; i < args.Length; i++)
        {
            string flag = args[i];
            if (flag != "-l" && flag != "-i" && flag != "-o")
                continue;

            // A flag given as the very last argument has no value; report
            // the usage instead of indexing past the end of the array.
            if (i + 1 >= args.Length)
            {
                usage();
                return;
            }

            string value = args[i + 1];
            if (flag == "-l")
                language = value;
            else if (flag == "-i")
                inputName = value;
            else
                outputName = value;
        }

        if (language == null || inputName == null)
        {
            usage();
            return;
        }

        // Pick the first concrete Stemmer subclass, from the assembly that
        // declares Stemmer, whose type name matches the requested language.
        Stemmer stemmer =
            typeof(Stemmer).Assembly.GetTypes()
                .Where(t => t.IsSubclassOf(typeof(Stemmer)) && !t.IsAbstract)
                .Where(t => match(t.Name, language))
                .Select(t => (Stemmer)Activator.CreateInstance(t)).FirstOrDefault();

        if (stemmer == null)
        {
            Console.WriteLine("Language not found.");
            return;
        }

        Console.WriteLine("Using " + stemmer.GetType());

        TextWriter output = System.Console.Out;

        // Only a writer created here is owned (and disposed) here; the
        // console stream must stay open.
        StreamWriter fileOutput = null;

        try
        {
            if (outputName != null)
            {
                fileOutput = new StreamWriter(outputName);
                output = fileOutput;
            }

            foreach (var line in File.ReadAllLines(inputName))
            {
                var o = stemmer.Stem(line);
                output.WriteLine(o);
            }

            output.Flush();
        }
        finally
        {
            // Release the output file handle even if reading or stemming throws.
            if (fileOutput != null)
                fileOutput.Dispose();
        }
    }

    /// <summary>
    ///   Tells whether a stemmer type name corresponds to a language name.
    /// </summary>
    ///
    /// <param name="stemmerName">Type name of a candidate stemmer class.</param>
    /// <param name="language">
    ///   Language name from the command line; underscores are removed before
    ///   the comparison.
    /// </param>
    ///
    /// <returns>
    ///   True when <paramref name="stemmerName"/> starts with the language
    ///   name followed by "Stemmer", ignoring case.
    /// </returns>
    ///
    private static bool match(string stemmerName, string language)
    {
        string expectedName = language.Replace("_", "") + "Stemmer";

        // Ordinal comparison keeps the lookup independent of the current
        // culture; culture-sensitive case folding (e.g. Turkish i/I) would
        // otherwise make some language names fail to match their type.
        return stemmerName.StartsWith(expectedName,
            StringComparison.OrdinalIgnoreCase);
    }
}
|
|
114
|
+
}
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
Things to do:
|
|
2
|
+
|
|
3
|
+
- Write documentation for how to use libstemmer (as opposed to how stemming
|
|
4
|
+
algorithms themselves work).
|
|
5
|
+
Currently, the documentation in the include/libstemmer.h header file is
|
|
6
|
+
pretty clear and comprehensive, but an overview document wouldn't go amiss.
|
|
7
|
+
|
|
8
|
+
Things that would be nice to include at some point.
|
|
9
|
+
|
|
10
|
+
- Add version numbers to each stemming algorithm, and allow the interface to
|
|
11
|
+
request a specific version of the stemming algorithms. Default to providing
|
|
12
|
+
the latest version of the algorithm.
|
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
libstemmer_c
|
|
2
|
+
============
|
|
3
|
+
|
|
4
|
+
This document pertains to the C version of the libstemmer distribution,
|
|
5
|
+
available for download from:
|
|
6
|
+
|
|
7
|
+
https://snowballstem.org/download.html
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
What is Stemming?
|
|
11
|
+
-----------------
|
|
12
|
+
|
|
13
|
+
Stemming maps different forms of the same word to a common "stem" - for
|
|
14
|
+
example, the English stemmer maps *connection*, *connections*, *connective*,
|
|
15
|
+
*connected*, and *connecting* to *connect*. So a search for *connected*
|
|
16
|
+
would also find documents which only have the other forms.
|
|
17
|
+
|
|
18
|
+
This stem form is often a word itself, but this is not always the case as this
|
|
19
|
+
is not a requirement for text search systems, which are the intended field of
|
|
20
|
+
use. We also aim to conflate words with the same meaning, rather than all
|
|
21
|
+
words with a common linguistic root (so *awe* and *awful* don't have the same
|
|
22
|
+
stem), and over-stemming is more problematic than under-stemming so we tend not
|
|
23
|
+
to stem in cases that are hard to resolve. If you want to always reduce words
|
|
24
|
+
to a root form and/or get a root form which is itself a word then Snowball's
|
|
25
|
+
stemming algorithms likely aren't the right answer.
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
Compiling the library
|
|
29
|
+
=====================
|
|
30
|
+
|
|
31
|
+
A simple makefile is provided for Unix style systems. On such systems, it
|
|
32
|
+
should be possible simply to run "make", and the file "libstemmer.o"
|
|
33
|
+
and the example program "stemwords" will be generated.
|
|
34
|
+
|
|
35
|
+
If this doesn't work on your system, you need to write your own build
|
|
36
|
+
system (or call the compiler directly). The files to compile are
|
|
37
|
+
all contained in the "libstemmer", "runtime" and "src_c" directories,
|
|
38
|
+
and the public header file is contained in the "include" directory.
|
|
39
|
+
|
|
40
|
+
The library comes in two flavours; UTF-8 only, and UTF-8 plus other character
|
|
41
|
+
sets. To use the utf-8 only flavour, compile "libstemmer_utf8.c" instead of
|
|
42
|
+
"libstemmer.c".
|
|
43
|
+
|
|
44
|
+
For convenience "mkinc.mak" is a makefile fragment listing the source files and
|
|
45
|
+
header files used to compile the standard version of the library.
|
|
46
|
+
"mkinc_utf8.mak" is a comparable makefile fragment listing just the source
|
|
47
|
+
files for the UTF-8 only version of the library.
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
Using the library
|
|
51
|
+
=================
|
|
52
|
+
|
|
53
|
+
The library provides a simple C API. Essentially, a new stemmer can
|
|
54
|
+
be obtained by using "sb_stemmer_new". "sb_stemmer_stem" is then
|
|
55
|
+
used to stem a word, "sb_stemmer_length" returns the stemmed
|
|
56
|
+
length of the last word processed, and "sb_stemmer_delete" is
|
|
57
|
+
used to delete a stemmer.
|
|
58
|
+
|
|
59
|
+
Generally you should create a stemmer object and reuse it rather than creating
|
|
60
|
+
a fresh object for each word stemmed (since there's some cost to creating and
|
|
61
|
+
destroying the object).
|
|
62
|
+
|
|
63
|
+
The stemmer code is re-entrant, but not thread-safe if the same stemmer object
|
|
64
|
+
is used concurrently in different threads.
|
|
65
|
+
|
|
66
|
+
If you want to perform stemming concurrently in different threads, we suggest
|
|
67
|
+
creating a new stemmer object for each thread. The alternative is to share
|
|
68
|
+
stemmer objects between threads and protect access using a mutex or similar
|
|
69
|
+
but that's liable to slow your program down as threads can end up waiting for
|
|
70
|
+
the lock.
|
|
71
|
+
|
|
72
|
+
libstemmer does not currently incorporate any mechanism for caching the results
|
|
73
|
+
of stemming operations. Such caching can greatly increase the performance of a
|
|
74
|
+
stemmer under certain situations, so suitable patches will be considered for
|
|
75
|
+
inclusion.
|
|
76
|
+
|
|
77
|
+
The standard libstemmer sources contain an algorithm for each of the supported
|
|
78
|
+
languages. The algorithm may be selected using the English name of the
|
|
79
|
+
language, or using the 2 or 3 letter ISO 639 language codes. In addition,
|
|
80
|
+
the traditional "Porter" stemming algorithm for English is included for
|
|
81
|
+
backwards compatibility purposes, but we recommend use of the "English"
|
|
82
|
+
stemmer in preference for new projects.
|
|
83
|
+
|
|
84
|
+
(Some minor algorithms which are included only as curiosities in the snowball
|
|
85
|
+
website, such as the Lovins stemmer and the Kraaij Pohlmann stemmer, are not
|
|
86
|
+
included in the standard libstemmer sources. These are not really supported by
|
|
87
|
+
the snowball project, but it would be possible to compile a modified libstemmer
|
|
88
|
+
library containing these if desired.)
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
The stemwords example
|
|
92
|
+
=====================
|
|
93
|
+
|
|
94
|
+
The stemwords example program allows you to run any of the stemmers
|
|
95
|
+
compiled into the libstemmer library on a sample vocabulary. For
|
|
96
|
+
details on how to use it, run it with the "-h" command line option.
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
Using the library in a larger system
|
|
100
|
+
====================================
|
|
101
|
+
|
|
102
|
+
If you are incorporating the library into the build system of a larger
|
|
103
|
+
program, I recommend copying the unpacked tarball without modification into
|
|
104
|
+
a subdirectory of the sources of your program. Future versions of the
|
|
105
|
+
library are intended to keep the same structure, so this will keep the
|
|
106
|
+
work required to move to a new version of the library to a minimum.
|
|
107
|
+
|
|
108
|
+
As an additional convenience, the list of source and header files used
|
|
109
|
+
in the library is detailed in mkinc.mak - a file which is in a suitable
|
|
110
|
+
format for inclusion by a Makefile. By including this file in your build
|
|
111
|
+
system, you can link the snowball system into your program with a few
|
|
112
|
+
extra rules.
|
|
113
|
+
|
|
114
|
+
Using the library in a system using GNU autotools
|
|
115
|
+
=================================================
|
|
116
|
+
|
|
117
|
+
The libstemmer_c library can be integrated into a larger system which uses the
|
|
118
|
+
GNU autotool framework (and in particular, automake and autoconf) as follows:
|
|
119
|
+
|
|
120
|
+
1) Unpack libstemmer_c-*.tar.gz in the top level project directory and rename
|
|
121
|
+
the resulting directory to remove the version number so that there is a
|
|
122
|
+
libstemmer_c subdirectory of the top level directory of the project.
|
|
123
|
+
|
|
124
|
+
2) Add a file "Makefile.am" to the unpacked libstemmer_c folder, containing:
|
|
125
|
+
|
|
126
|
+
noinst_LTLIBRARIES = libstemmer.la
|
|
127
|
+
include $(srcdir)/mkinc.mak
|
|
128
|
+
noinst_HEADERS = $(snowball_headers)
|
|
129
|
+
libstemmer_la_SOURCES = $(snowball_sources)
|
|
130
|
+
|
|
131
|
+
(You may also need to add other lines to this, for example, if you are using
|
|
132
|
+
compiler options which are not compatible with compiling the libstemmer
|
|
133
|
+
library.)
|
|
134
|
+
|
|
135
|
+
3) Add libstemmer_c to the AC_CONFIG_FILES declaration in the project's
|
|
136
|
+
configure.ac file.
|
|
137
|
+
|
|
138
|
+
4) Add to the top level makefile the following lines (or modify existing
|
|
139
|
+
assignments to these variables appropriately):
|
|
140
|
+
|
|
141
|
+
AUTOMAKE_OPTIONS = subdir-objects
|
|
142
|
+
AM_CPPFLAGS = -I$(top_srcdir)/libstemmer_c/include
|
|
143
|
+
SUBDIRS=libstemmer_c
|
|
144
|
+
<name>_LIBADD = libstemmer_c/libstemmer.la
|
|
145
|
+
|
|
146
|
+
(Where <name> is the name of the library or executable which links against
|
|
147
|
+
libstemmer.)
|
|
148
|
+
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
libstemmer_csharp
|
|
2
|
+
=================
|
|
3
|
+
|
|
4
|
+
This document pertains to the C# version of the libstemmer distribution,
|
|
5
|
+
available for download from:
|
|
6
|
+
|
|
7
|
+
https://snowballstem.org/download.html
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
What is Stemming?
|
|
11
|
+
-----------------
|
|
12
|
+
|
|
13
|
+
Stemming maps different forms of the same word to a common "stem" - for
|
|
14
|
+
example, the English stemmer maps *connection*, *connections*, *connective*,
|
|
15
|
+
*connected*, and *connecting* to *connect*. So a search for *connected*
|
|
16
|
+
would also find documents which only have the other forms.
|
|
17
|
+
|
|
18
|
+
This stem form is often a word itself, but this is not always the case as this
|
|
19
|
+
is not a requirement for text search systems, which are the intended field of
|
|
20
|
+
use. We also aim to conflate words with the same meaning, rather than all
|
|
21
|
+
words with a common linguistic root (so *awe* and *awful* don't have the same
|
|
22
|
+
stem), and over-stemming is more problematic than under-stemming so we tend not
|
|
23
|
+
to stem in cases that are hard to resolve. If you want to always reduce words
|
|
24
|
+
to a root form and/or get a root form which is itself a word then Snowball's
|
|
25
|
+
stemming algorithms likely aren't the right answer.
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
Compiling the library
|
|
29
|
+
=====================
|
|
30
|
+
|
|
31
|
+
To build a library::
|
|
32
|
+
|
|
33
|
+
mcs -target:library -out:snowballstemmer.dll csharp/Snowball/*.cs csharp/Snowball/Algorithms/*cs
|
|
34
|
+
|
|
35
|
+
And to build the example program using that library::
|
|
36
|
+
|
|
37
|
+
mcs -target:exe -out:stemwords.exe -r:snowballstemmer.dll csharp/Stemwords/Program.cs
|
|
38
|
+
|
|
39
|
+
Using the library
|
|
40
|
+
=================
|
|
41
|
+
|
|
42
|
+
There is currently no formal documentation on the use of the C# version
|
|
43
|
+
of the library. Additionally, its interface is not guaranteed to be
|
|
44
|
+
stable.
|
|
45
|
+
|
|
46
|
+
The stemmer code is re-entrant, but not thread-safe if the same stemmer object
|
|
47
|
+
is used concurrently in different threads.
|
|
48
|
+
|
|
49
|
+
If you want to perform stemming concurrently in different threads, we suggest
|
|
50
|
+
creating a new stemmer object for each thread. The alternative is to share
|
|
51
|
+
stemmer objects between threads and protect access using a mutex or similar
|
|
52
|
+
but that's liable to slow your program down as threads can end up waiting for
|
|
53
|
+
the lock.
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
libstemmer_java
|
|
2
|
+
===============
|
|
3
|
+
|
|
4
|
+
This document pertains to the Java version of the libstemmer distribution,
|
|
5
|
+
available for download from:
|
|
6
|
+
|
|
7
|
+
https://snowballstem.org/download.html
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
What is Stemming?
|
|
11
|
+
-----------------
|
|
12
|
+
|
|
13
|
+
Stemming maps different forms of the same word to a common "stem" - for
|
|
14
|
+
example, the English stemmer maps *connection*, *connections*, *connective*,
|
|
15
|
+
*connected*, and *connecting* to *connect*. So a search for *connected*
|
|
16
|
+
would also find documents which only have the other forms.
|
|
17
|
+
|
|
18
|
+
This stem form is often a word itself, but this is not always the case as this
|
|
19
|
+
is not a requirement for text search systems, which are the intended field of
|
|
20
|
+
use. We also aim to conflate words with the same meaning, rather than all
|
|
21
|
+
words with a common linguistic root (so *awe* and *awful* don't have the same
|
|
22
|
+
stem), and over-stemming is more problematic than under-stemming so we tend not
|
|
23
|
+
to stem in cases that are hard to resolve. If you want to always reduce words
|
|
24
|
+
to a root form and/or get a root form which is itself a word then Snowball's
|
|
25
|
+
stemming algorithms likely aren't the right answer.
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
Compiling the library
|
|
29
|
+
=====================
|
|
30
|
+
|
|
31
|
+
Simply run the java compiler on all the java source files under the java
|
|
32
|
+
directory. For example, this can be done under unix by changing directory into
|
|
33
|
+
the java directory, and running:
|
|
34
|
+
|
|
35
|
+
javac org/tartarus/snowball/*.java org/tartarus/snowball/ext/*.java
|
|
36
|
+
|
|
37
|
+
This will compile the library and also an example program "TestApp" which
|
|
38
|
+
provides a command line interface to the library.
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
Using the library
|
|
42
|
+
=================
|
|
43
|
+
|
|
44
|
+
There is currently no formal documentation on the use of the Java version
|
|
45
|
+
of the library. Additionally, its interface is not guaranteed to be
|
|
46
|
+
stable.
|
|
47
|
+
|
|
48
|
+
The best documentation of the library is the source of the TestApp example
|
|
49
|
+
program.
|
|
50
|
+
|
|
51
|
+
The stemmer code is re-entrant, but not thread-safe if the same stemmer object
|
|
52
|
+
is used concurrently in different threads.
|
|
53
|
+
|
|
54
|
+
If you want to perform stemming concurrently in different threads, we suggest
|
|
55
|
+
creating a new stemmer object for each thread. The alternative is to share
|
|
56
|
+
stemmer objects between threads and protect access using a mutex or similar
|
|
57
|
+
but that's liable to slow your program down as threads can end up waiting for
|
|
58
|
+
the lock.
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
The TestApp example
|
|
62
|
+
===================
|
|
63
|
+
|
|
64
|
+
The TestApp example program allows you to run any of the stemmers
|
|
65
|
+
compiled into the libstemmer library on a sample vocabulary. For
|
|
66
|
+
details on how to use it, run it with no command line parameters.
|
|
67
|
+
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
Snowball stemming library collection for Javascript
|
|
2
|
+
===================================================
|
|
3
|
+
|
|
4
|
+
What is Stemming?
|
|
5
|
+
-----------------
|
|
6
|
+
|
|
7
|
+
Stemming maps different forms of the same word to a common "stem" - for
|
|
8
|
+
example, the English stemmer maps *connection*, *connections*, *connective*,
|
|
9
|
+
*connected*, and *connecting* to *connect*. So a search for *connected*
|
|
10
|
+
would also find documents which only have the other forms.
|
|
11
|
+
|
|
12
|
+
This stem form is often a word itself, but this is not always the case as this
|
|
13
|
+
is not a requirement for text search systems, which are the intended field of
|
|
14
|
+
use. We also aim to conflate words with the same meaning, rather than all
|
|
15
|
+
words with a common linguistic root (so *awe* and *awful* don't have the same
|
|
16
|
+
stem), and over-stemming is more problematic than under-stemming so we tend not
|
|
17
|
+
to stem in cases that are hard to resolve. If you want to always reduce words
|
|
18
|
+
to a root form and/or get a root form which is itself a word then Snowball's
|
|
19
|
+
stemming algorithms likely aren't the right answer.
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
How to use library
|
|
23
|
+
------------------
|
|
24
|
+
|
|
25
|
+
You can use each stemming module from JavaScript code - e.g. to use them
|
|
26
|
+
with node:
|
|
27
|
+
|
|
28
|
+
.. code-block:: javascript
|
|
29
|
+
|
|
30
|
+
const stemmer = require('base-stemmer.js');
|
|
31
|
+
const english_stemmer = require('english-stemmer.js');
|
|
32
|
+
|
|
33
|
+
var stemmer = new EnglishStemmer();
|
|
34
|
+
alert(stemmer.stemWord("testing"));
|
|
35
|
+
|
|
36
|
+
You'll need to bundle ``base-stemmer.js`` and whichever languages you want
|
|
37
|
+
stemmers for (e.g. ``english-stemmer.js`` for English).
|
|
38
|
+
|
|
39
|
+
FIXME: Document how to use in a web browser.
|
|
40
|
+
|
|
41
|
+
The stemmer code is re-entrant, but not thread-safe if the same stemmer object
|
|
42
|
+
is used concurrently in different threads.
|
|
43
|
+
|
|
44
|
+
If you want to perform stemming concurrently in different threads, we suggest
|
|
45
|
+
creating a new stemmer object for each thread. The alternative is to share
|
|
46
|
+
stemmer objects between threads and protect access using a mutex or similar
|
|
47
|
+
but that's liable to slow your program down as threads can end up waiting for
|
|
48
|
+
the lock.
|
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
Snowball stemming library collection for Python
|
|
2
|
+
===============================================
|
|
3
|
+
|
|
4
|
+
Python 3 (>= 3.3) is supported. We no longer actively support Python 2 as
|
|
5
|
+
the Python developers stopped supporting it at the start of 2020. Snowball
|
|
6
|
+
2.1.0 was the last release to officially support Python 2.
|
|
7
|
+
|
|
8
|
+
What is Stemming?
|
|
9
|
+
-----------------
|
|
10
|
+
|
|
11
|
+
Stemming maps different forms of the same word to a common "stem" - for
|
|
12
|
+
example, the English stemmer maps *connection*, *connections*, *connective*,
|
|
13
|
+
*connected*, and *connecting* to *connect*. So a search for *connected*
|
|
14
|
+
would also find documents which only have the other forms.
|
|
15
|
+
|
|
16
|
+
This stem form is often a word itself, but this is not always the case as this
|
|
17
|
+
is not a requirement for text search systems, which are the intended field of
|
|
18
|
+
use. We also aim to conflate words with the same meaning, rather than all
|
|
19
|
+
words with a common linguistic root (so *awe* and *awful* don't have the same
|
|
20
|
+
stem), and over-stemming is more problematic than under-stemming so we tend not
|
|
21
|
+
to stem in cases that are hard to resolve. If you want to always reduce words
|
|
22
|
+
to a root form and/or get a root form which is itself a word then Snowball's
|
|
23
|
+
stemming algorithms likely aren't the right answer.
|
|
24
|
+
|
|
25
|
+
How to use library
|
|
26
|
+
------------------
|
|
27
|
+
|
|
28
|
+
The ``snowballstemmer`` module has two functions.
|
|
29
|
+
|
|
30
|
+
The ``snowballstemmer.algorithms`` function returns a list of available
|
|
31
|
+
algorithm names.
|
|
32
|
+
|
|
33
|
+
The ``snowballstemmer.stemmer`` function takes an algorithm name and returns a
|
|
34
|
+
``Stemmer`` object.
|
|
35
|
+
|
|
36
|
+
``Stemmer`` objects have a ``Stemmer.stemWord(word)`` method and a
|
|
37
|
+
``Stemmer.stemWords(word[])`` method.
|
|
38
|
+
|
|
39
|
+
.. code-block:: python
|
|
40
|
+
|
|
41
|
+
import snowballstemmer
|
|
42
|
+
|
|
43
|
+
stemmer = snowballstemmer.stemmer('english');
|
|
44
|
+
print(stemmer.stemWords("We are the world".split()));
|
|
45
|
+
|
|
46
|
+
Generally you should create a stemmer object and reuse it rather than creating
|
|
47
|
+
a fresh object for each word stemmed (since there's some cost to creating and
|
|
48
|
+
destroying the object).
|
|
49
|
+
|
|
50
|
+
The stemmer code is re-entrant, but not thread-safe if the same stemmer object
|
|
51
|
+
is used concurrently in different threads.
|
|
52
|
+
|
|
53
|
+
If you want to perform stemming concurrently in different threads, we suggest
|
|
54
|
+
creating a new stemmer object for each thread. The alternative is to share
|
|
55
|
+
stemmer objects between threads and protect access using a mutex or similar
|
|
56
|
+
(e.g. `threading.Lock` in Python) but that's liable to slow your program down
|
|
57
|
+
as threads can end up waiting for the lock.
|
|
58
|
+
|
|
59
|
+
Automatic Acceleration
|
|
60
|
+
----------------------
|
|
61
|
+
|
|
62
|
+
`PyStemmer <https://pypi.org/project/PyStemmer/>`_ is a wrapper module for
|
|
63
|
+
Snowball's ``libstemmer_c`` and should provide results 100% compatible with
|
|
64
|
+
**snowballstemmer**.
|
|
65
|
+
|
|
66
|
+
**PyStemmer** is faster because it wraps generated C versions of the stemmers;
|
|
67
|
+
**snowballstemmer** uses generated Python code and is slower but offers a pure
|
|
68
|
+
Python solution.
|
|
69
|
+
|
|
70
|
+
If PyStemmer is installed, ``snowballstemmer.stemmer`` returns a ``PyStemmer``
|
|
71
|
+
``Stemmer`` object which provides the same ``Stemmer.stemWord()`` and
|
|
72
|
+
``Stemmer.stemWords()`` methods.
|
|
73
|
+
|
|
74
|
+
Benchmark
|
|
75
|
+
~~~~~~~~~
|
|
76
|
+
|
|
77
|
+
This is a crude benchmark which measures the time for running each stemmer on
|
|
78
|
+
every word in its sample vocabulary (10,787,583 words over 26 languages). It's
|
|
79
|
+
not a realistic test of normal use as a real application would do much more
|
|
80
|
+
than just stemming. It's also skewed towards the stemmers which do more work
|
|
81
|
+
per word and towards those with larger sample vocabularies.
|
|
82
|
+
|
|
83
|
+
* Python 2.7 + **snowballstemmer** : 13m00s (15.0 * PyStemmer)
|
|
84
|
+
* Python 3.7 + **snowballstemmer** : 12m19s (14.2 * PyStemmer)
|
|
85
|
+
* PyPy 7.1.1 (Python 2.7.13) + **snowballstemmer** : 2m14s (2.6 * PyStemmer)
|
|
86
|
+
* PyPy 7.1.1 (Python 3.6.1) + **snowballstemmer** : 1m46s (2.0 * PyStemmer)
|
|
87
|
+
* Python 2.7 + **PyStemmer** : 52s
|
|
88
|
+
|
|
89
|
+
For reference the equivalent test for C runs in 9 seconds.
|
|
90
|
+
|
|
91
|
+
These results are for Snowball 2.0.0. They're likely to evolve over time as
|
|
92
|
+
the code Snowball generates for both Python and C continues to improve (for
|
|
93
|
+
a much older test over a different set of stemmers using Python 2.7,
|
|
94
|
+
**snowballstemmer** was 30 times slower than **PyStemmer**, or 9 times slower
|
|
95
|
+
with **PyPy**).
|
|
96
|
+
|
|
97
|
+
The message to take away is that if you're stemming a lot of words you should
|
|
98
|
+
either install **PyStemmer** (which **snowballstemmer** will then automatically
|
|
99
|
+
use for you as described above) or use PyPy.
|
|
100
|
+
|
|
101
|
+
The TestApp example
|
|
102
|
+
-------------------
|
|
103
|
+
|
|
104
|
+
The ``testapp.py`` example program allows you to run any of the stemmers
|
|
105
|
+
on a sample vocabulary.
|
|
106
|
+
|
|
107
|
+
Usage::
|
|
108
|
+
|
|
109
|
+
testapp.py <algorithm> "sentences ... "
|
|
110
|
+
|
|
111
|
+
.. code-block:: bash
|
|
112
|
+
|
|
113
|
+
$ python testapp.py English "sentences... "
|