mittens 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +3 -0
- data/Gemfile +7 -0
- data/LICENSE.txt +30 -0
- data/README.md +62 -0
- data/Rakefile +21 -0
- data/ext/mittens/ext.c +96 -0
- data/ext/mittens/extconf.rb +12 -0
- data/lib/mittens/version.rb +3 -0
- data/lib/mittens.rb +7 -0
- data/mittens.gemspec +22 -0
- data/vendor/snowball/.gitignore +26 -0
- data/vendor/snowball/.travis.yml +112 -0
- data/vendor/snowball/AUTHORS +27 -0
- data/vendor/snowball/CONTRIBUTING.rst +216 -0
- data/vendor/snowball/COPYING +29 -0
- data/vendor/snowball/GNUmakefile +742 -0
- data/vendor/snowball/NEWS +754 -0
- data/vendor/snowball/README.rst +37 -0
- data/vendor/snowball/ada/README.md +74 -0
- data/vendor/snowball/ada/generate/generate.adb +83 -0
- data/vendor/snowball/ada/generate.gpr +21 -0
- data/vendor/snowball/ada/src/stemmer.adb +620 -0
- data/vendor/snowball/ada/src/stemmer.ads +219 -0
- data/vendor/snowball/ada/src/stemwords.adb +70 -0
- data/vendor/snowball/ada/stemmer_config.gpr +83 -0
- data/vendor/snowball/ada/stemwords.gpr +21 -0
- data/vendor/snowball/algorithms/arabic.sbl +558 -0
- data/vendor/snowball/algorithms/armenian.sbl +301 -0
- data/vendor/snowball/algorithms/basque.sbl +149 -0
- data/vendor/snowball/algorithms/catalan.sbl +202 -0
- data/vendor/snowball/algorithms/danish.sbl +93 -0
- data/vendor/snowball/algorithms/dutch.sbl +164 -0
- data/vendor/snowball/algorithms/english.sbl +229 -0
- data/vendor/snowball/algorithms/finnish.sbl +197 -0
- data/vendor/snowball/algorithms/french.sbl +254 -0
- data/vendor/snowball/algorithms/german.sbl +139 -0
- data/vendor/snowball/algorithms/german2.sbl +145 -0
- data/vendor/snowball/algorithms/greek.sbl +701 -0
- data/vendor/snowball/algorithms/hindi.sbl +323 -0
- data/vendor/snowball/algorithms/hungarian.sbl +241 -0
- data/vendor/snowball/algorithms/indonesian.sbl +192 -0
- data/vendor/snowball/algorithms/irish.sbl +149 -0
- data/vendor/snowball/algorithms/italian.sbl +202 -0
- data/vendor/snowball/algorithms/kraaij_pohlmann.sbl +240 -0
- data/vendor/snowball/algorithms/lithuanian.sbl +373 -0
- data/vendor/snowball/algorithms/lovins.sbl +208 -0
- data/vendor/snowball/algorithms/nepali.sbl +92 -0
- data/vendor/snowball/algorithms/norwegian.sbl +80 -0
- data/vendor/snowball/algorithms/porter.sbl +139 -0
- data/vendor/snowball/algorithms/portuguese.sbl +218 -0
- data/vendor/snowball/algorithms/romanian.sbl +236 -0
- data/vendor/snowball/algorithms/russian.sbl +221 -0
- data/vendor/snowball/algorithms/serbian.sbl +2379 -0
- data/vendor/snowball/algorithms/spanish.sbl +230 -0
- data/vendor/snowball/algorithms/swedish.sbl +72 -0
- data/vendor/snowball/algorithms/tamil.sbl +405 -0
- data/vendor/snowball/algorithms/turkish.sbl +470 -0
- data/vendor/snowball/algorithms/yiddish.sbl +460 -0
- data/vendor/snowball/charsets/ISO-8859-2.sbl +98 -0
- data/vendor/snowball/charsets/KOI8-R.sbl +74 -0
- data/vendor/snowball/charsets/cp850.sbl +130 -0
- data/vendor/snowball/compiler/analyser.c +1547 -0
- data/vendor/snowball/compiler/driver.c +615 -0
- data/vendor/snowball/compiler/generator.c +1748 -0
- data/vendor/snowball/compiler/generator_ada.c +1702 -0
- data/vendor/snowball/compiler/generator_csharp.c +1322 -0
- data/vendor/snowball/compiler/generator_go.c +1278 -0
- data/vendor/snowball/compiler/generator_java.c +1313 -0
- data/vendor/snowball/compiler/generator_js.c +1316 -0
- data/vendor/snowball/compiler/generator_pascal.c +1387 -0
- data/vendor/snowball/compiler/generator_python.c +1337 -0
- data/vendor/snowball/compiler/generator_rust.c +1295 -0
- data/vendor/snowball/compiler/header.h +418 -0
- data/vendor/snowball/compiler/space.c +286 -0
- data/vendor/snowball/compiler/syswords.h +86 -0
- data/vendor/snowball/compiler/syswords2.h +13 -0
- data/vendor/snowball/compiler/tokeniser.c +567 -0
- data/vendor/snowball/csharp/.gitignore +8 -0
- data/vendor/snowball/csharp/Snowball/Algorithms/.gitignore +1 -0
- data/vendor/snowball/csharp/Snowball/Among.cs +108 -0
- data/vendor/snowball/csharp/Snowball/AssemblyInfo.cs +36 -0
- data/vendor/snowball/csharp/Snowball/Stemmer.cs +660 -0
- data/vendor/snowball/csharp/Stemwords/App.config +6 -0
- data/vendor/snowball/csharp/Stemwords/Program.cs +114 -0
- data/vendor/snowball/doc/TODO +12 -0
- data/vendor/snowball/doc/libstemmer_c_README +148 -0
- data/vendor/snowball/doc/libstemmer_csharp_README +53 -0
- data/vendor/snowball/doc/libstemmer_java_README +67 -0
- data/vendor/snowball/doc/libstemmer_js_README +48 -0
- data/vendor/snowball/doc/libstemmer_python_README +113 -0
- data/vendor/snowball/examples/stemwords.c +204 -0
- data/vendor/snowball/go/README.md +55 -0
- data/vendor/snowball/go/among.go +16 -0
- data/vendor/snowball/go/env.go +403 -0
- data/vendor/snowball/go/stemwords/generate.go +68 -0
- data/vendor/snowball/go/stemwords/main.go +68 -0
- data/vendor/snowball/go/util.go +34 -0
- data/vendor/snowball/iconv.py +50 -0
- data/vendor/snowball/include/libstemmer.h +78 -0
- data/vendor/snowball/java/org/tartarus/snowball/Among.java +29 -0
- data/vendor/snowball/java/org/tartarus/snowball/SnowballProgram.java +381 -0
- data/vendor/snowball/java/org/tartarus/snowball/SnowballStemmer.java +8 -0
- data/vendor/snowball/java/org/tartarus/snowball/TestApp.java +75 -0
- data/vendor/snowball/javascript/base-stemmer.js +294 -0
- data/vendor/snowball/javascript/stemwords.js +106 -0
- data/vendor/snowball/libstemmer/libstemmer_c.in +96 -0
- data/vendor/snowball/libstemmer/mkalgorithms.pl +90 -0
- data/vendor/snowball/libstemmer/mkmodules.pl +267 -0
- data/vendor/snowball/libstemmer/modules.txt +63 -0
- data/vendor/snowball/libstemmer/test.c +34 -0
- data/vendor/snowball/pascal/.gitignore +4 -0
- data/vendor/snowball/pascal/SnowballProgram.pas +430 -0
- data/vendor/snowball/pascal/generate.pl +23 -0
- data/vendor/snowball/pascal/stemwords-template.dpr +78 -0
- data/vendor/snowball/python/MANIFEST.in +7 -0
- data/vendor/snowball/python/create_init.py +54 -0
- data/vendor/snowball/python/setup.cfg +6 -0
- data/vendor/snowball/python/setup.py +81 -0
- data/vendor/snowball/python/snowballstemmer/among.py +13 -0
- data/vendor/snowball/python/snowballstemmer/basestemmer.py +323 -0
- data/vendor/snowball/python/stemwords.py +101 -0
- data/vendor/snowball/python/testapp.py +28 -0
- data/vendor/snowball/runtime/api.c +58 -0
- data/vendor/snowball/runtime/api.h +32 -0
- data/vendor/snowball/runtime/header.h +61 -0
- data/vendor/snowball/runtime/utilities.c +513 -0
- data/vendor/snowball/rust/Cargo.toml +7 -0
- data/vendor/snowball/rust/build.rs +55 -0
- data/vendor/snowball/rust/rust-pre-1.27-compat.patch +30 -0
- data/vendor/snowball/rust/src/main.rs +102 -0
- data/vendor/snowball/rust/src/snowball/algorithms/mod.rs +2 -0
- data/vendor/snowball/rust/src/snowball/among.rs +6 -0
- data/vendor/snowball/rust/src/snowball/mod.rs +6 -0
- data/vendor/snowball/rust/src/snowball/snowball_env.rs +421 -0
- data/vendor/snowball/tests/stemtest.c +95 -0
- metadata +178 -0
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
Snowball is a small string processing language for creating stemming algorithms
|
|
2
|
+
for use in Information Retrieval, plus a collection of stemming algorithms
|
|
3
|
+
implemented using it.
|
|
4
|
+
|
|
5
|
+
Snowball was originally designed and built by Martin Porter. Martin retired
|
|
6
|
+
from development in 2014 and Snowball is now maintained as a community project.
|
|
7
|
+
Martin originally chose the name Snowball as a tribute to SNOBOL, the excellent
|
|
8
|
+
string handling language from the 1960s. It now also serves as a metaphor for
|
|
9
|
+
how the project grows by gathering contributions over time.
|
|
10
|
+
|
|
11
|
+
The Snowball compiler translates a Snowball program into source code in another
|
|
12
|
+
language - currently Ada, ISO C, C#, Go, Java, JavaScript, Object Pascal,
|
|
13
|
+
Python and Rust are supported.
|
|
14
|
+
|
|
15
|
+
This repository contains the source code for the snowball compiler and the
|
|
16
|
+
stemming algorithms. The snowball compiler is written in ISO C - you'll need
|
|
17
|
+
a C compiler which supports C99 to build it (but the C code it generates should
|
|
18
|
+
work with any ISO C compiler).
|
|
19
|
+
|
|
20
|
+
See https://snowballstem.org/ for more information about Snowball.
|
|
21
|
+
|
|
22
|
+
What is Stemming?
|
|
23
|
+
=================
|
|
24
|
+
|
|
25
|
+
Stemming maps different forms of the same word to a common "stem" - for
|
|
26
|
+
example, the English stemmer maps *connection*, *connections*, *connective*,
|
|
27
|
+
*connected*, and *connecting* to *connect*. So searching for *connected*
|
|
28
|
+
would also find documents which only have the other forms.
|
|
29
|
+
|
|
30
|
+
This stem form is often a word itself, but this is not always the case as this
|
|
31
|
+
is not a requirement for text search systems, which are the intended field of
|
|
32
|
+
use. We also aim to conflate words with the same meaning, rather than all
|
|
33
|
+
words with a common linguistic root (so *awe* and *awful* don't have the same
|
|
34
|
+
stem), and over-stemming is more problematic than under-stemming so we tend not
|
|
35
|
+
to stem in cases that are hard to resolve. If you want to always reduce words
|
|
36
|
+
to a root form and/or get a root form which is itself a word then Snowball's
|
|
37
|
+
stemming algorithms likely aren't the right answer.
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
# Ada Target for Snowball
|
|
2
|
+
|
|
3
|
+
The Ada Snowball generator generates an Ada child package for each Snowball algorithm.
|
|
4
|
+
The parent package is named `Stemmer` and it provides various operations used by the generated code.
|
|
5
|
+
The `Stemmer` package contains the Ada Snowball runtime available either in the `ada/src` directory
|
|
6
|
+
or from https://github.com/stcarrez/ada-stemmer.
|
|
7
|
+
|
|
8
|
+
The generated child package declares the `Context_Type` tagged type and the `Stem` procedure:
|
|
9
|
+
|
|
10
|
+
```Ada
|
|
11
|
+
package Stemmer.<Algorithm-name> is
|
|
12
|
+
type Context_Type is new Stemmer.Context_Type with private;
|
|
13
|
+
procedure Stem (Z : in out Context_Type; Result : out Boolean);
|
|
14
|
+
private
|
|
15
|
+
type Context_Type is new Stemmer.Context_Type with record
|
|
16
|
+
...
|
|
17
|
+
end record;
|
|
18
|
+
end Stemmer.<Algorithm-name>;
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
It is possible to use the generated operation directly or to use it through the `Stemmer.Factory` package.
|
|
22
|
+
|
|
23
|
+
## Usage
|
|
24
|
+
|
|
25
|
+
To generate Ada source for a Snowball algorithm:
|
|
26
|
+
```
|
|
27
|
+
$ snowball path/to/algorithm.sbl -ada -P <algorithm-name> -o src/stemmer-<algorithm>
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
### Ada specific options
|
|
31
|
+
|
|
32
|
+
`-P <Algorithm-name>` sets the child package name used in the generated Ada file (defaults to `snowball`).
|
|
33
|
+
It must be a valid Ada identifier.
|
|
34
|
+
|
|
35
|
+
## Code Organization
|
|
36
|
+
|
|
37
|
+
`compiler/generator_ada.c` has the Ada code generation logic
|
|
38
|
+
|
|
39
|
+
`ada/src` contains the default Ada Snowball runtime support which is also
|
|
40
|
+
available at https://github.com/stcarrez/ada-stemmer
|
|
41
|
+
|
|
42
|
+
`ada/algorithms` is the location where the makefile-generated code will end up
|
|
43
|
+
|
|
44
|
+
## Using the Generated Stemmers
|
|
45
|
+
|
|
46
|
+
To use the generated stemmer, import the Ada generated package, declare an instance
|
|
47
|
+
of the generated `Context_Type` and call the `Stem_Word` procedure.
|
|
48
|
+
|
|
49
|
+
```
|
|
50
|
+
with Stemmer.English;
|
|
51
|
+
|
|
52
|
+
Ctx : Stemmer.English.Context_Type;
|
|
53
|
+
Result : Boolean;
|
|
54
|
+
|
|
55
|
+
Ctx.Stem_Word ("zealously", Result);
|
|
56
|
+
if Result then
|
|
57
|
+
Ada.Text_IO.Put_Line (Ctx.Get_Result);
|
|
58
|
+
end if;
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
You can use the context as many times as you want.
|
|
62
|
+
|
|
63
|
+
## Testing
|
|
64
|
+
|
|
65
|
+
To run the tests, you will need an Ada compiler such as GNAT as well as the `gprbuild` build tool.
|
|
66
|
+
|
|
67
|
+
Only the existing Snowball algorithms have been used for testing. This does not exercise all features of the language.
|
|
68
|
+
|
|
69
|
+
Run:
|
|
70
|
+
|
|
71
|
+
```
|
|
72
|
+
$ make check_ada
|
|
73
|
+
```
|
|
74
|
+
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
with Ada.Characters.Handling;
|
|
2
|
+
with Ada.Text_IO;
|
|
3
|
+
with Ada.Command_Line;
|
|
4
|
+
with Ada.Containers.Indefinite_Vectors;
|
|
5
|
+
procedure Generate is
|
|
6
|
+
|
|
7
|
+
use Ada.Characters.Handling;
|
|
8
|
+
use Ada.Text_IO;
|
|
9
|
+
|
|
10
|
+
-- Vector of variable-length strings indexed from 1; used to hold the
-- language names collected from the command line.
package String_Vectors is new Ada.Containers.Indefinite_Vectors
|
|
11
|
+
(Element_Type => String,
|
|
12
|
+
Index_Type => Positive);
|
|
13
|
+
|
|
14
|
+
-- Language names shared by Write_Spec and Write_Body; filled by the main
-- body of Generate before either writer is called.
Languages : String_Vectors.Vector;
|
|
15
|
+
|
|
16
|
+
-- Returns S with its first character converted to upper case and the
-- remaining characters unchanged.
-- NOTE(review): S (S'First) is indexed unconditionally, so an empty S would
-- raise Constraint_Error; confirm callers never pass an empty name.
function Capitalize (S : in String) return String is
|
|
17
|
+
(To_Upper (S (S'First)) & S (S'First + 1 .. S'Last));
|
|
18
|
+
|
|
19
|
+
-- Writes the package specification "stemmer-factory.ads".
-- The emitted spec declares the enumeration Language_Type with one literal
-- "L_<UPPER-CASE NAME>" per entry of Languages, and the function Stem
-- taking a Language_Type and a String and returning a String.
-- NOTE(review): when Languages is empty the emitted enumeration has no
-- literals, which is not valid Ada; confirm at least one language is
-- always supplied.
procedure Write_Spec is
|
|
20
|
+
File : File_Type;
|
|
21
|
+
-- Number of literals written so far; used to detect the last one.
I : Natural := 0;
|
|
22
|
+
begin
|
|
23
|
+
Create (File, Out_File, "stemmer-factory.ads");
|
|
24
|
+
Put_Line (File, "package Stemmer.Factory with SPARK_Mode is");
|
|
25
|
+
New_Line (File);
|
|
26
|
+
Put (File, " type Language_Type is (");
|
|
27
|
+
for Lang of Languages loop
|
|
28
|
+
Put (File, "L_" & To_Upper (Lang));
|
|
29
|
+
I := I + 1;
|
|
30
|
+
-- Emit a separating comma and line break after every literal except
-- the last one.
if I < Natural (Languages.Length) then
|
|
31
|
+
Put_Line (File, ",");
|
|
32
|
+
Put (File, " ");
|
|
33
|
+
end if;
|
|
34
|
+
end loop;
|
|
35
|
+
Put_Line (File, ");");
|
|
36
|
+
New_Line (File);
|
|
37
|
+
Put_Line (File, " function Stem (Language : in Language_Type;");
|
|
38
|
+
Put_Line (File, " Word : in String) return String;");
|
|
39
|
+
New_Line (File);
|
|
40
|
+
Put_Line (File, "end Stemmer.Factory;");
|
|
41
|
+
Close (File);
|
|
42
|
+
end Write_Spec;
|
|
43
|
+
|
|
44
|
+
-- Writes the package body "stemmer-factory.adb".
-- The emitted body starts with one "with Stemmer.<Name>;" clause per entry
-- of Languages (name capitalized by Capitalize), then implements Stem as a
-- case statement over Language_Type. Each branch declares a context of the
-- matching Stemmer.<Name>.Context_Type, calls Stem_Word on it and returns
-- Get_Result of that context.
procedure Write_Body is
|
|
45
|
+
File : File_Type;
|
|
46
|
+
begin
|
|
47
|
+
Create (File, Out_File, "stemmer-factory.adb");
|
|
48
|
+
-- Context clauses for every generated stemmer package.
for Lang of Languages loop
|
|
49
|
+
Put_Line (File, "with Stemmer." & Capitalize (Lang) & ";");
|
|
50
|
+
end loop;
|
|
51
|
+
Put_Line (File, "package body Stemmer.Factory with SPARK_Mode is");
|
|
52
|
+
New_Line (File);
|
|
53
|
+
Put_Line (File, " function Stem (Language : in Language_Type;");
|
|
54
|
+
Put_Line (File, " Word : in String) return String is");
|
|
55
|
+
-- The emitted Result variable receives the Stem_Word status, but the
-- emitted branches return Get_Result (C) without testing it.
Put_Line (File, " Result : Boolean := False;");
|
|
56
|
+
Put_Line (File, " begin");
|
|
57
|
+
Put_Line (File, " case Language is");
|
|
58
|
+
-- One "when" branch per language; the literal name must match the one
-- written by Write_Spec ("L_" followed by the upper-cased name).
for Lang of Languages loop
|
|
59
|
+
Put_Line (File, " when L_" & To_Upper (Lang) & " =>");
|
|
60
|
+
Put_Line (File, " declare");
|
|
61
|
+
Put_Line (File, " C : Stemmer." & Capitalize (Lang) & ".Context_Type;");
|
|
62
|
+
Put_Line (File, " begin");
|
|
63
|
+
Put_Line (File, " C.Stem_Word (Word, Result);");
|
|
64
|
+
Put_Line (File, " return Get_Result (C);");
|
|
65
|
+
Put_Line (File, " end;");
|
|
66
|
+
New_Line (File);
|
|
67
|
+
end loop;
|
|
68
|
+
Put_Line (File, " end case;");
|
|
69
|
+
Put_Line (File, " end Stem;");
|
|
70
|
+
New_Line (File);
|
|
71
|
+
Put_Line (File, "end Stemmer.Factory;");
|
|
72
|
+
Close (File);
|
|
73
|
+
end Write_Body;
|
|
74
|
+
|
|
75
|
+
-- Number of command-line arguments; each argument is taken as one
-- language name.
Count : constant Natural := Ada.Command_Line.Argument_Count;
|
|
76
|
+
|
|
77
|
+
-- Main body of Generate: collect the lower-cased language names from the
-- command line, then write the Stemmer.Factory spec and body files.
begin
|
|
78
|
+
for I in 1 .. Count loop
|
|
79
|
+
-- Names are stored lower-cased; the writers re-case them with
-- To_Upper and Capitalize as needed.
Languages.Append (To_Lower (Ada.Command_Line.Argument (I)));
|
|
80
|
+
end loop;
|
|
81
|
+
Write_Spec;
|
|
82
|
+
Write_Body;
|
|
83
|
+
end Generate;
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
with "stemmer_config";
|
|
2
|
+
-- GPR project that builds the "generate.adb" main program from the
-- "generate" source directory, reusing the directories and tool packages
-- defined by the Stemmer_Config project.
project Generate is
|
|
3
|
+
|
|
4
|
+
Mains := ("generate.adb");
|
|
5
|
+
|
|
6
|
+
for Main use Mains;
|
|
7
|
+
|
|
8
|
+
for Source_Dirs use ("generate");
|
|
9
|
+
|
|
10
|
+
-- Object and executable directories are derived from the corresponding
-- Stemmer_Config attributes, with "/obj" and "/bin" appended.
for Object_Dir use "./" & Stemmer_Config'Object_Dir & "/obj";
|
|
11
|
+
for Exec_Dir use "./" & Stemmer_Config'Exec_Dir & "/bin";
|
|
12
|
+
|
|
13
|
+
-- Tool packages are taken unchanged from Stemmer_Config.
package Binder renames Stemmer_Config.Binder;
|
|
14
|
+
|
|
15
|
+
package Builder renames Stemmer_Config.Builder;
|
|
16
|
+
|
|
17
|
+
package Compiler renames Stemmer_Config.Compiler;
|
|
18
|
+
|
|
19
|
+
package Linker renames Stemmer_Config.Linker;
|
|
20
|
+
|
|
21
|
+
end Generate;
|