mittens 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/CHANGELOG.md +3 -0
- data/Gemfile +7 -0
- data/LICENSE.txt +30 -0
- data/README.md +62 -0
- data/Rakefile +21 -0
- data/ext/mittens/ext.c +96 -0
- data/ext/mittens/extconf.rb +12 -0
- data/lib/mittens/version.rb +3 -0
- data/lib/mittens.rb +7 -0
- data/mittens.gemspec +22 -0
- data/vendor/snowball/.gitignore +26 -0
- data/vendor/snowball/.travis.yml +112 -0
- data/vendor/snowball/AUTHORS +27 -0
- data/vendor/snowball/CONTRIBUTING.rst +216 -0
- data/vendor/snowball/COPYING +29 -0
- data/vendor/snowball/GNUmakefile +742 -0
- data/vendor/snowball/NEWS +754 -0
- data/vendor/snowball/README.rst +37 -0
- data/vendor/snowball/ada/README.md +74 -0
- data/vendor/snowball/ada/generate/generate.adb +83 -0
- data/vendor/snowball/ada/generate.gpr +21 -0
- data/vendor/snowball/ada/src/stemmer.adb +620 -0
- data/vendor/snowball/ada/src/stemmer.ads +219 -0
- data/vendor/snowball/ada/src/stemwords.adb +70 -0
- data/vendor/snowball/ada/stemmer_config.gpr +83 -0
- data/vendor/snowball/ada/stemwords.gpr +21 -0
- data/vendor/snowball/algorithms/arabic.sbl +558 -0
- data/vendor/snowball/algorithms/armenian.sbl +301 -0
- data/vendor/snowball/algorithms/basque.sbl +149 -0
- data/vendor/snowball/algorithms/catalan.sbl +202 -0
- data/vendor/snowball/algorithms/danish.sbl +93 -0
- data/vendor/snowball/algorithms/dutch.sbl +164 -0
- data/vendor/snowball/algorithms/english.sbl +229 -0
- data/vendor/snowball/algorithms/finnish.sbl +197 -0
- data/vendor/snowball/algorithms/french.sbl +254 -0
- data/vendor/snowball/algorithms/german.sbl +139 -0
- data/vendor/snowball/algorithms/german2.sbl +145 -0
- data/vendor/snowball/algorithms/greek.sbl +701 -0
- data/vendor/snowball/algorithms/hindi.sbl +323 -0
- data/vendor/snowball/algorithms/hungarian.sbl +241 -0
- data/vendor/snowball/algorithms/indonesian.sbl +192 -0
- data/vendor/snowball/algorithms/irish.sbl +149 -0
- data/vendor/snowball/algorithms/italian.sbl +202 -0
- data/vendor/snowball/algorithms/kraaij_pohlmann.sbl +240 -0
- data/vendor/snowball/algorithms/lithuanian.sbl +373 -0
- data/vendor/snowball/algorithms/lovins.sbl +208 -0
- data/vendor/snowball/algorithms/nepali.sbl +92 -0
- data/vendor/snowball/algorithms/norwegian.sbl +80 -0
- data/vendor/snowball/algorithms/porter.sbl +139 -0
- data/vendor/snowball/algorithms/portuguese.sbl +218 -0
- data/vendor/snowball/algorithms/romanian.sbl +236 -0
- data/vendor/snowball/algorithms/russian.sbl +221 -0
- data/vendor/snowball/algorithms/serbian.sbl +2379 -0
- data/vendor/snowball/algorithms/spanish.sbl +230 -0
- data/vendor/snowball/algorithms/swedish.sbl +72 -0
- data/vendor/snowball/algorithms/tamil.sbl +405 -0
- data/vendor/snowball/algorithms/turkish.sbl +470 -0
- data/vendor/snowball/algorithms/yiddish.sbl +460 -0
- data/vendor/snowball/charsets/ISO-8859-2.sbl +98 -0
- data/vendor/snowball/charsets/KOI8-R.sbl +74 -0
- data/vendor/snowball/charsets/cp850.sbl +130 -0
- data/vendor/snowball/compiler/analyser.c +1547 -0
- data/vendor/snowball/compiler/driver.c +615 -0
- data/vendor/snowball/compiler/generator.c +1748 -0
- data/vendor/snowball/compiler/generator_ada.c +1702 -0
- data/vendor/snowball/compiler/generator_csharp.c +1322 -0
- data/vendor/snowball/compiler/generator_go.c +1278 -0
- data/vendor/snowball/compiler/generator_java.c +1313 -0
- data/vendor/snowball/compiler/generator_js.c +1316 -0
- data/vendor/snowball/compiler/generator_pascal.c +1387 -0
- data/vendor/snowball/compiler/generator_python.c +1337 -0
- data/vendor/snowball/compiler/generator_rust.c +1295 -0
- data/vendor/snowball/compiler/header.h +418 -0
- data/vendor/snowball/compiler/space.c +286 -0
- data/vendor/snowball/compiler/syswords.h +86 -0
- data/vendor/snowball/compiler/syswords2.h +13 -0
- data/vendor/snowball/compiler/tokeniser.c +567 -0
- data/vendor/snowball/csharp/.gitignore +8 -0
- data/vendor/snowball/csharp/Snowball/Algorithms/.gitignore +1 -0
- data/vendor/snowball/csharp/Snowball/Among.cs +108 -0
- data/vendor/snowball/csharp/Snowball/AssemblyInfo.cs +36 -0
- data/vendor/snowball/csharp/Snowball/Stemmer.cs +660 -0
- data/vendor/snowball/csharp/Stemwords/App.config +6 -0
- data/vendor/snowball/csharp/Stemwords/Program.cs +114 -0
- data/vendor/snowball/doc/TODO +12 -0
- data/vendor/snowball/doc/libstemmer_c_README +148 -0
- data/vendor/snowball/doc/libstemmer_csharp_README +53 -0
- data/vendor/snowball/doc/libstemmer_java_README +67 -0
- data/vendor/snowball/doc/libstemmer_js_README +48 -0
- data/vendor/snowball/doc/libstemmer_python_README +113 -0
- data/vendor/snowball/examples/stemwords.c +204 -0
- data/vendor/snowball/go/README.md +55 -0
- data/vendor/snowball/go/among.go +16 -0
- data/vendor/snowball/go/env.go +403 -0
- data/vendor/snowball/go/stemwords/generate.go +68 -0
- data/vendor/snowball/go/stemwords/main.go +68 -0
- data/vendor/snowball/go/util.go +34 -0
- data/vendor/snowball/iconv.py +50 -0
- data/vendor/snowball/include/libstemmer.h +78 -0
- data/vendor/snowball/java/org/tartarus/snowball/Among.java +29 -0
- data/vendor/snowball/java/org/tartarus/snowball/SnowballProgram.java +381 -0
- data/vendor/snowball/java/org/tartarus/snowball/SnowballStemmer.java +8 -0
- data/vendor/snowball/java/org/tartarus/snowball/TestApp.java +75 -0
- data/vendor/snowball/javascript/base-stemmer.js +294 -0
- data/vendor/snowball/javascript/stemwords.js +106 -0
- data/vendor/snowball/libstemmer/libstemmer_c.in +96 -0
- data/vendor/snowball/libstemmer/mkalgorithms.pl +90 -0
- data/vendor/snowball/libstemmer/mkmodules.pl +267 -0
- data/vendor/snowball/libstemmer/modules.txt +63 -0
- data/vendor/snowball/libstemmer/test.c +34 -0
- data/vendor/snowball/pascal/.gitignore +4 -0
- data/vendor/snowball/pascal/SnowballProgram.pas +430 -0
- data/vendor/snowball/pascal/generate.pl +23 -0
- data/vendor/snowball/pascal/stemwords-template.dpr +78 -0
- data/vendor/snowball/python/MANIFEST.in +7 -0
- data/vendor/snowball/python/create_init.py +54 -0
- data/vendor/snowball/python/setup.cfg +6 -0
- data/vendor/snowball/python/setup.py +81 -0
- data/vendor/snowball/python/snowballstemmer/among.py +13 -0
- data/vendor/snowball/python/snowballstemmer/basestemmer.py +323 -0
- data/vendor/snowball/python/stemwords.py +101 -0
- data/vendor/snowball/python/testapp.py +28 -0
- data/vendor/snowball/runtime/api.c +58 -0
- data/vendor/snowball/runtime/api.h +32 -0
- data/vendor/snowball/runtime/header.h +61 -0
- data/vendor/snowball/runtime/utilities.c +513 -0
- data/vendor/snowball/rust/Cargo.toml +7 -0
- data/vendor/snowball/rust/build.rs +55 -0
- data/vendor/snowball/rust/rust-pre-1.27-compat.patch +30 -0
- data/vendor/snowball/rust/src/main.rs +102 -0
- data/vendor/snowball/rust/src/snowball/algorithms/mod.rs +2 -0
- data/vendor/snowball/rust/src/snowball/among.rs +6 -0
- data/vendor/snowball/rust/src/snowball/mod.rs +6 -0
- data/vendor/snowball/rust/src/snowball/snowball_env.rs +421 -0
- data/vendor/snowball/tests/stemtest.c +95 -0
- metadata +178 -0
@@ -0,0 +1,37 @@
|
|
1
|
+
Snowball is a small string processing language for creating stemming algorithms
|
2
|
+
for use in Information Retrieval, plus a collection of stemming algorithms
|
3
|
+
implemented using it.
|
4
|
+
|
5
|
+
Snowball was originally designed and built by Martin Porter. Martin retired
|
6
|
+
from development in 2014 and Snowball is now maintained as a community project.
|
7
|
+
Martin originally chose the name Snowball as a tribute to SNOBOL, the excellent
|
8
|
+
string handling language from the 1960s. It now also serves as a metaphor for
|
9
|
+
how the project grows by gathering contributions over time.
|
10
|
+
|
11
|
+
The Snowball compiler translates a Snowball program into source code in another
|
12
|
+
language - currently Ada, ISO C, C#, Go, Java, Javascript, Object Pascal,
|
13
|
+
Python and Rust are supported.
|
14
|
+
|
15
|
+
This repository contains the source code for the snowball compiler and the
|
16
|
+
stemming algorithms. The snowball compiler is written in ISO C - you'll need
|
17
|
+
a C compiler which supports C99 to build it (but the C code it generates should
|
18
|
+
work with any ISO C compiler).
|
19
|
+
|
20
|
+
See https://snowballstem.org/ for more information about Snowball.
|
21
|
+
|
22
|
+
What is Stemming?
|
23
|
+
=================
|
24
|
+
|
25
|
+
Stemming maps different forms of the same word to a common "stem" - for
|
26
|
+
example, the English stemmer maps *connection*, *connections*, *connective*,
|
27
|
+
*connected*, and *connecting* to *connect*. So a search for *connected*
|
28
|
+
would also find documents which only have the other forms.
|
29
|
+
|
30
|
+
This stem form is often a word itself, but this is not always the case as this
|
31
|
+
is not a requirement for text search systems, which are the intended field of
|
32
|
+
use. We also aim to conflate words with the same meaning, rather than all
|
33
|
+
words with a common linguistic root (so *awe* and *awful* don't have the same
|
34
|
+
stem), and over-stemming is more problematic than under-stemming so we tend not
|
35
|
+
to stem in cases that are hard to resolve. If you want to always reduce words
|
36
|
+
to a root form and/or get a root form which is itself a word then Snowball's
|
37
|
+
stemming algorithms likely aren't the right answer.
|
@@ -0,0 +1,74 @@
|
|
1
|
+
# Ada Target for Snowball
|
2
|
+
|
3
|
+
The Ada Snowball generator generates an Ada child package for each Snowball algorithm.
|
4
|
+
The parent package is named `Stemmer` and it provides various operations used by the generated code.
|
5
|
+
The `Stemmer` package contains the Ada Snowball runtime available either in `ada/src` directory
|
6
|
+
or from https://github.com/stcarrez/ada-stemmer.
|
7
|
+
|
8
|
+
The generated child package declares the `Context_Type` tagged type and the `Stem` procedure:
|
9
|
+
|
10
|
+
```Ada
|
11
|
+
package Stemmer.<Algorithm-name> is
|
12
|
+
type Context_Type is new Stemmer.Context_Type with private;
|
13
|
+
procedure Stem (Z : in out Context_Type; Result : out Boolean);
|
14
|
+
private
|
15
|
+
type Context_Type is new Stemmer.Context_Type with record
|
16
|
+
...
|
17
|
+
end record;
|
18
|
+
end Stemmer.<Algorithm-name>;
|
19
|
+
```
|
20
|
+
|
21
|
+
It is possible to use the generated operation directly or to use it through the `Stemmer.Factory` package.
|
22
|
+
|
23
|
+
## Usage
|
24
|
+
|
25
|
+
To generate Ada source for a Snowball algorithm:
|
26
|
+
```
|
27
|
+
$ snowball path/to/algorithm.sbl -ada -P <algorithm-name> -o src/stemmer-<algorithm>
|
28
|
+
```
|
29
|
+
|
30
|
+
### Ada specific options
|
31
|
+
|
32
|
+
`-P <Algorithm-name>` the child package name used in the generated Ada file (defaults to `snowball`).
|
33
|
+
It must be a valid Ada identifier.
|
34
|
+
|
35
|
+
## Code Organization
|
36
|
+
|
37
|
+
`compiler/generator_ada.c` has the Ada code generation logic
|
38
|
+
|
39
|
+
`ada/src` contains the default Ada Snowball runtime support which is also
|
40
|
+
available at https://github.com/stcarrez/ada-stemmer
|
41
|
+
|
42
|
+
`ada/algorithms` is the location where the makefile-generated code will end up
|
43
|
+
|
44
|
+
## Using the Generated Stemmers
|
45
|
+
|
46
|
+
To use the generated stemmer, import the Ada generated package, declare an instance
|
47
|
+
of the generated `Context_Type` and call the `Stem_Word` procedure.
|
48
|
+
|
49
|
+
```
|
50
|
+
with Stemmer.English;
|
51
|
+
|
52
|
+
Ctx : Stemmer.English.Context_Type;
|
53
|
+
Result : Boolean;
|
54
|
+
|
55
|
+
Ctx.Stem_Word ("zealously", Result);
|
56
|
+
if Result then
|
57
|
+
Ada.Text_IO.Put_Line (Ctx.Get_Result);
|
58
|
+
end if;
|
59
|
+
```
|
60
|
+
|
61
|
+
You can use the context as many times as you want.
|
62
|
+
|
63
|
+
## Testing
|
64
|
+
|
65
|
+
To run the tests, you will need an Ada compiler such as GNAT as well as the `gprbuild` build tool.
|
66
|
+
|
67
|
+
Only the existing Snowball algorithms have been used for testing. This does not exercise all features of the language.
|
68
|
+
|
69
|
+
Run:
|
70
|
+
|
71
|
+
```
|
72
|
+
$ make check_ada
|
73
|
+
```
|
74
|
+
|
@@ -0,0 +1,83 @@
|
|
1
|
+
with Ada.Characters.Handling;
|
2
|
+
with Ada.Text_IO;
|
3
|
+
with Ada.Command_Line;
|
4
|
+
with Ada.Containers.Indefinite_Vectors;
|
5
|
+
procedure Generate is
|
6
|
+
|
7
|
+
use Ada.Characters.Handling;
|
8
|
+
use Ada.Text_IO;
|
9
|
+
|
10
|
+
package String_Vectors is new Ada.Containers.Indefinite_Vectors
|
11
|
+
(Element_Type => String,
|
12
|
+
Index_Type => Positive);
|
13
|
+
|
14
|
+
Languages : String_Vectors.Vector;
|
15
|
+
|
16
|
+
--  Return S with its first character converted to upper case; the
--  remaining characters are returned unchanged.
--
--  An empty string is returned as is: indexing S (S'First) on an empty
--  string would otherwise raise Constraint_Error.
function Capitalize (S : in String) return String is
  (if S'Length = 0 then S
   else To_Upper (S (S'First)) & S (S'First + 1 .. S'Last));
|
18
|
+
|
19
|
+
--  Create the file "stemmer-factory.ads" and write the specification of
--  package Stemmer.Factory into it: the Language_Type enumeration, with one
--  "L_<NAME>" literal per entry of Languages (name in upper case), followed
--  by the declaration of the Stem function.
procedure Write_Spec is
   Spec : File_Type;
begin
   Create (Spec, Out_File, "stemmer-factory.ads");
   Put_Line (Spec, "package Stemmer.Factory with SPARK_Mode is");
   New_Line (Spec);

   --  Enumeration literals, one per line, separated by commas.
   Put (Spec, " type Language_Type is (");
   for Pos in Languages.First_Index .. Languages.Last_Index loop
      Put (Spec, "L_" & To_Upper (Languages.Element (Pos)));

      --  Every literal except the last one is followed by a separator
      --  and the lead-in of the next line.
      if Pos < Languages.Last_Index then
         Put_Line (Spec, ",");
         Put (Spec, " ");
      end if;
   end loop;
   Put_Line (Spec, ");");
   New_Line (Spec);

   Put_Line (Spec, " function Stem (Language : in Language_Type;");
   Put_Line (Spec, " Word : in String) return String;");
   New_Line (Spec);
   Put_Line (Spec, "end Stemmer.Factory;");
   Close (Spec);
end Write_Spec;
|
43
|
+
|
44
|
+
--  Create the file "stemmer-factory.adb" and write the body of package
--  Stemmer.Factory into it: one "with Stemmer.<Name>;" clause per entry of
--  Languages, then a Stem function whose case statement has one branch per
--  language.  Each branch declares a context of the matching child package,
--  calls Stem_Word on it and returns Get_Result.
procedure Write_Body is
   Output : File_Type;
begin
   Create (Output, Out_File, "stemmer-factory.adb");

   --  Context clauses for every language package.
   for Name of Languages loop
      Put_Line (Output, "with Stemmer." & Capitalize (Name) & ";");
   end loop;

   --  Opening of the package body and of the Stem function.
   Put_Line (Output, "package body Stemmer.Factory with SPARK_Mode is");
   New_Line (Output);
   Put_Line (Output, " function Stem (Language : in Language_Type;");
   Put_Line (Output, " Word : in String) return String is");
   Put_Line (Output, " Result : Boolean := False;");
   Put_Line (Output, " begin");
   Put_Line (Output, " case Language is");

   --  One case alternative per language.
   for Name of Languages loop
      declare
         Literal : constant String := "L_" & To_Upper (Name);
         Child   : constant String := "Stemmer." & Capitalize (Name);
      begin
         Put_Line (Output, " when " & Literal & " =>");
         Put_Line (Output, " declare");
         Put_Line (Output, " C : " & Child & ".Context_Type;");
         Put_Line (Output, " begin");
         Put_Line (Output, " C.Stem_Word (Word, Result);");
         Put_Line (Output, " return Get_Result (C);");
         Put_Line (Output, " end;");
         New_Line (Output);
      end;
   end loop;

   --  Closing of the case statement, the function and the package.
   Put_Line (Output, " end case;");
   Put_Line (Output, " end Stem;");
   New_Line (Output);
   Put_Line (Output, "end Stemmer.Factory;");
   Close (Output);
end Write_Body;
|
74
|
+
|
75
|
+
--  Number of language names passed on the command line.
Count : constant Natural := Ada.Command_Line.Argument_Count;

begin
   --  Without at least one language the generated enumeration type and
   --  case statement would be empty, which is not legal Ada.  Report the
   --  misuse instead of writing files that cannot be compiled.
   if Count = 0 then
      Put_Line (Standard_Error, "Usage: generate <language> {<language>}");
      Ada.Command_Line.Set_Exit_Status (Ada.Command_Line.Failure);
      return;
   end if;

   --  Each argument is stored in lower case; Write_Spec and Write_Body
   --  derive the upper-case and capitalized forms from it.
   for I in 1 .. Count loop
      Languages.Append (To_Lower (Ada.Command_Line.Argument (I)));
   end loop;

   Write_Spec;
   Write_Body;
end Generate;
|
@@ -0,0 +1,21 @@
|
|
1
|
+
with "stemmer_config";
|
2
|
+
project Generate is
|
3
|
+
|
4
|
+
Mains := ("generate.adb");
|
5
|
+
|
6
|
+
for Main use Mains;
|
7
|
+
|
8
|
+
for Source_Dirs use ("generate");
|
9
|
+
|
10
|
+
for Object_Dir use "./" & Stemmer_Config'Object_Dir & "/obj";
|
11
|
+
for Exec_Dir use "./" & Stemmer_Config'Exec_Dir & "/bin";
|
12
|
+
|
13
|
+
package Binder renames Stemmer_Config.Binder;
|
14
|
+
|
15
|
+
package Builder renames Stemmer_Config.Builder;
|
16
|
+
|
17
|
+
package Compiler renames Stemmer_Config.Compiler;
|
18
|
+
|
19
|
+
package Linker renames Stemmer_Config.Linker;
|
20
|
+
|
21
|
+
end Generate;
|