mittens 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/README.md +3 -3
- data/lib/mittens/version.rb +1 -1
- data/vendor/snowball/.github/workflows/ci.yml +216 -0
- data/vendor/snowball/CONTRIBUTING.rst +111 -62
- data/vendor/snowball/GNUmakefile +194 -136
- data/vendor/snowball/NEWS +798 -3
- data/vendor/snowball/README.rst +50 -1
- data/vendor/snowball/ada/src/stemmer.adb +25 -13
- data/vendor/snowball/ada/src/stemmer.ads +9 -9
- data/vendor/snowball/ada/stemmer_config.gpr +7 -7
- data/vendor/snowball/algorithms/basque.sbl +4 -19
- data/vendor/snowball/algorithms/catalan.sbl +2 -9
- data/vendor/snowball/algorithms/danish.sbl +1 -1
- data/vendor/snowball/algorithms/dutch.sbl +284 -122
- data/vendor/snowball/algorithms/dutch_porter.sbl +178 -0
- data/vendor/snowball/algorithms/english.sbl +52 -37
- data/vendor/snowball/algorithms/esperanto.sbl +157 -0
- data/vendor/snowball/algorithms/estonian.sbl +269 -0
- data/vendor/snowball/algorithms/finnish.sbl +2 -3
- data/vendor/snowball/algorithms/french.sbl +42 -16
- data/vendor/snowball/algorithms/german.sbl +35 -14
- data/vendor/snowball/algorithms/greek.sbl +76 -76
- data/vendor/snowball/algorithms/hungarian.sbl +8 -6
- data/vendor/snowball/algorithms/indonesian.sbl +14 -8
- data/vendor/snowball/algorithms/italian.sbl +11 -21
- data/vendor/snowball/algorithms/lithuanian.sbl +36 -37
- data/vendor/snowball/algorithms/lovins.sbl +0 -1
- data/vendor/snowball/algorithms/nepali.sbl +138 -37
- data/vendor/snowball/algorithms/norwegian.sbl +19 -5
- data/vendor/snowball/algorithms/porter.sbl +2 -2
- data/vendor/snowball/algorithms/portuguese.sbl +9 -13
- data/vendor/snowball/algorithms/romanian.sbl +17 -4
- data/vendor/snowball/algorithms/serbian.sbl +467 -468
- data/vendor/snowball/algorithms/spanish.sbl +5 -7
- data/vendor/snowball/algorithms/swedish.sbl +60 -6
- data/vendor/snowball/algorithms/tamil.sbl +207 -176
- data/vendor/snowball/algorithms/turkish.sbl +461 -445
- data/vendor/snowball/algorithms/yiddish.sbl +36 -38
- data/vendor/snowball/compiler/analyser.c +445 -192
- data/vendor/snowball/compiler/driver.c +109 -101
- data/vendor/snowball/compiler/generator.c +853 -464
- data/vendor/snowball/compiler/generator_ada.c +404 -366
- data/vendor/snowball/compiler/generator_csharp.c +297 -260
- data/vendor/snowball/compiler/generator_go.c +323 -254
- data/vendor/snowball/compiler/generator_java.c +326 -252
- data/vendor/snowball/compiler/generator_js.c +362 -252
- data/vendor/snowball/compiler/generator_pascal.c +349 -197
- data/vendor/snowball/compiler/generator_python.c +257 -240
- data/vendor/snowball/compiler/generator_rust.c +423 -251
- data/vendor/snowball/compiler/header.h +117 -71
- data/vendor/snowball/compiler/space.c +137 -68
- data/vendor/snowball/compiler/syswords.h +2 -2
- data/vendor/snowball/compiler/tokeniser.c +125 -107
- data/vendor/snowball/csharp/Snowball/Among.cs +14 -14
- data/vendor/snowball/csharp/Snowball/AssemblyInfo.cs +7 -7
- data/vendor/snowball/csharp/Snowball/Stemmer.cs +57 -37
- data/vendor/snowball/csharp/Stemwords/App.config +2 -2
- data/vendor/snowball/csharp/Stemwords/Program.cs +16 -12
- data/vendor/snowball/doc/libstemmer_c_README +7 -4
- data/vendor/snowball/doc/libstemmer_csharp_README +4 -1
- data/vendor/snowball/doc/libstemmer_java_README +12 -1
- data/vendor/snowball/doc/libstemmer_js_README +6 -4
- data/vendor/snowball/doc/libstemmer_python_README +9 -4
- data/vendor/snowball/examples/stemwords.c +12 -12
- data/vendor/snowball/go/env.go +107 -31
- data/vendor/snowball/go/util.go +0 -4
- data/vendor/snowball/include/libstemmer.h +4 -0
- data/vendor/snowball/java/org/tartarus/snowball/Among.java +32 -15
- data/vendor/snowball/java/org/tartarus/snowball/SnowballProgram.java +347 -261
- data/vendor/snowball/java/org/tartarus/snowball/SnowballStemmer.java +3 -0
- data/vendor/snowball/java/org/tartarus/snowball/TestApp.java +52 -37
- data/vendor/snowball/javascript/base-stemmer.js +186 -2
- data/vendor/snowball/javascript/stemwords.js +3 -6
- data/vendor/snowball/libstemmer/libstemmer_c.in +1 -1
- data/vendor/snowball/libstemmer/mkalgorithms.pl +6 -6
- data/vendor/snowball/libstemmer/mkmodules.pl +2 -2
- data/vendor/snowball/libstemmer/modules.txt +13 -10
- data/vendor/snowball/libstemmer/test.c +1 -1
- data/vendor/snowball/pascal/SnowballProgram.pas +84 -2
- data/vendor/snowball/pascal/generate.pl +13 -13
- data/vendor/snowball/python/create_init.py +4 -1
- data/vendor/snowball/python/setup.cfg +0 -3
- data/vendor/snowball/python/setup.py +8 -3
- data/vendor/snowball/python/snowballstemmer/basestemmer.py +20 -54
- data/vendor/snowball/python/stemwords.py +8 -12
- data/vendor/snowball/runtime/api.c +10 -5
- data/vendor/snowball/runtime/header.h +10 -9
- data/vendor/snowball/runtime/utilities.c +9 -9
- data/vendor/snowball/rust/build.rs +1 -1
- data/vendor/snowball/rust/src/snowball/snowball_env.rs +83 -5
- data/vendor/snowball/tests/stemtest.c +7 -4
- metadata +7 -7
- data/vendor/snowball/.travis.yml +0 -112
- data/vendor/snowball/algorithms/german2.sbl +0 -145
- data/vendor/snowball/algorithms/kraaij_pohlmann.sbl +0 -240
- data/vendor/snowball/compiler/syswords2.h +0 -13
data/vendor/snowball/README.rst
CHANGED
@@ -24,7 +24,7 @@ What is Stemming?
|
|
24
24
|
|
25
25
|
Stemming maps different forms of the same word to a common "stem" - for
|
26
26
|
example, the English stemmer maps *connection*, *connections*, *connective*,
|
27
|
-
*connected*, and *connecting* to *connect*. So a
|
27
|
+
*connected*, and *connecting* to *connect*. So a search for *connected*
|
28
28
|
would also find documents which only have the other forms.
|
29
29
|
|
30
30
|
This stem form is often a word itself, but this is not always the case as this
|
@@ -35,3 +35,52 @@ stem), and over-stemming is more problematic than under-stemming so we tend not
|
|
35
35
|
to stem in cases that are hard to resolve. If you want to always reduce words
|
36
36
|
to a root form and/or get a root form which is itself a word then Snowball's
|
37
37
|
stemming algorithms likely aren't the right answer.
|
38
|
+
|
39
|
+
Building Snowball
|
40
|
+
=================
|
41
|
+
|
42
|
+
GNU make is required to build Snowball.
|
43
|
+
|
44
|
+
The build system is currently structured as two separate stages for many of the
|
45
|
+
target languages.
|
46
|
+
|
47
|
+
The first stage builds the Snowball compiler and runs it to create target
|
48
|
+
language code (and it can also run tests on each stemmer). The expectation is
|
49
|
+
that you then create "distribution" tarballs of this code with ``make dist``
|
50
|
+
(or to create one for a specific target language, e.g. ``make
|
51
|
+
dist_libstemmer_c`` for C). These tarballs are created in the ``dist/``
|
52
|
+
subdirectory.
|
53
|
+
|
54
|
+
To actually build the libstemmer library you then unpack and build the
|
55
|
+
distribution tarball, e.g. for C::
|
56
|
+
|
57
|
+
tar xf dist/libstemmer_c-3.0.0.tar.gz
|
58
|
+
cd libstemmer_c-3.0.0
|
59
|
+
make
|
60
|
+
|
61
|
+
Cross-compiling
|
62
|
+
---------------
|
63
|
+
|
64
|
+
If cross-compiling starting from the git repo, the Snowball compiler needs to
|
65
|
+
be built with a native compiler then libstemmer with the cross-compiler. For
|
66
|
+
example::
|
67
|
+
|
68
|
+
make CC=cc dist_libstemmer_c
|
69
|
+
tar xf dist/libstemmer_c-3.0.0.tar.gz
|
70
|
+
cd libstemmer_c-3.0.0
|
71
|
+
make CC=riscv64-unknown-linux-gnu-gcc
|
72
|
+
|
73
|
+
If you are cross-compiling to or from Microsoft Windows, you'll need to also
|
74
|
+
work around an assumption in libstemmer's ``Makefile`` which sets ``EXEEXT``
|
75
|
+
based on the OS you are building on::
|
76
|
+
|
77
|
+
ifeq ($(OS),Windows_NT)
|
78
|
+
EXEEXT=.exe
|
79
|
+
endif
|
80
|
+
|
81
|
+
For example, if cross-compiling from Linux to Microsoft Windows, use something
|
82
|
+
like this for the libstemmer build::
|
83
|
+
|
84
|
+
make CC=x86_64-w64-mingw32-gcc EXEEXT=.exe
|
85
|
+
|
86
|
+
When going the other way, you'll need to use ``EXEEXT=``.
|
@@ -38,6 +38,7 @@ package body Stemmer with SPARK_Mode is
|
|
38
38
|
Result : out Boolean) is
|
39
39
|
begin
|
40
40
|
Context.P (1 .. Word'Length) := Word;
|
41
|
+
Context.Len := Word'Length;
|
41
42
|
Context.C := 0;
|
42
43
|
Context.L := Word'Length;
|
43
44
|
Context.Lb := 0;
|
@@ -46,7 +47,7 @@ package body Stemmer with SPARK_Mode is
|
|
46
47
|
|
47
48
|
function Get_Result (Context : in Context_Type'Class) return String is
|
48
49
|
begin
|
49
|
-
return Context.P (1 .. Context.
|
50
|
+
return Context.P (1 .. Context.Len);
|
50
51
|
end Get_Result;
|
51
52
|
|
52
53
|
function Eq_S (Context : in Context_Type'Class;
|
@@ -73,17 +74,12 @@ package body Stemmer with SPARK_Mode is
|
|
73
74
|
return S'Length;
|
74
75
|
end Eq_S_Backward;
|
75
76
|
|
76
|
-
function Length (Context : in Context_Type'Class) return Natural is
|
77
|
-
begin
|
78
|
-
return Context.L - Context.Lb;
|
79
|
-
end Length;
|
80
|
-
|
81
77
|
function Length_Utf8 (Context : in Context_Type'Class) return Natural is
|
82
78
|
Count : Natural := 0;
|
83
79
|
Pos : Positive := 1;
|
84
80
|
Val : Byte;
|
85
81
|
begin
|
86
|
-
while Pos <= Context.
|
82
|
+
while Pos <= Context.Len loop
|
87
83
|
Val := Character'Pos (Context.P (Pos));
|
88
84
|
Pos := Pos + 1;
|
89
85
|
if Val >= 16#C0# or Val < 16#80# then
|
@@ -93,6 +89,21 @@ package body Stemmer with SPARK_Mode is
|
|
93
89
|
return Count;
|
94
90
|
end Length_Utf8;
|
95
91
|
|
92
|
+
function Length_Utf8 (S : in String) return Natural is
|
93
|
+
Count : Natural := 0;
|
94
|
+
Pos : Positive := 1;
|
95
|
+
Val : Byte;
|
96
|
+
begin
|
97
|
+
while Pos <= S'Length loop
|
98
|
+
Val := Character'Pos (S (Pos));
|
99
|
+
Pos := Pos + 1;
|
100
|
+
if Val >= 16#C0# or Val < 16#80# then
|
101
|
+
Count := Count + 1;
|
102
|
+
end if;
|
103
|
+
end loop;
|
104
|
+
return Count;
|
105
|
+
end Length_Utf8;
|
106
|
+
|
96
107
|
function Check_Among (Context : in Context_Type'Class;
|
97
108
|
Pos : in Char_Index;
|
98
109
|
Shift : in Natural;
|
@@ -464,7 +475,7 @@ package body Stemmer with SPARK_Mode is
|
|
464
475
|
Ch : Utf8_Type;
|
465
476
|
Count : Natural;
|
466
477
|
begin
|
467
|
-
if Context.C
|
478
|
+
if Context.C <= Context.Lb then
|
468
479
|
Result := -1;
|
469
480
|
return;
|
470
481
|
end if;
|
@@ -532,7 +543,7 @@ package body Stemmer with SPARK_Mode is
|
|
532
543
|
Ch : Utf8_Type;
|
533
544
|
Count : Natural;
|
534
545
|
begin
|
535
|
-
if Context.C
|
546
|
+
if Context.C <= Context.Lb then
|
536
547
|
Result := -1;
|
537
548
|
return;
|
538
549
|
end if;
|
@@ -566,16 +577,17 @@ package body Stemmer with SPARK_Mode is
|
|
566
577
|
begin
|
567
578
|
Adjustment := S'Length - (C_Ket - C_Bra);
|
568
579
|
if Adjustment > 0 then
|
569
|
-
Context.P (C_Bra + S'Length + 1 .. Context.
|
570
|
-
:= Context.P (C_Ket + 1 .. Context.
|
580
|
+
Context.P (C_Bra + S'Length + 1 .. Context.Len + Adjustment + 1)
|
581
|
+
:= Context.P (C_Ket + 1 .. Context.Len + 1);
|
571
582
|
end if;
|
572
583
|
if S'Length > 0 then
|
573
584
|
Context.P (C_Bra + 1 .. C_Bra + S'Length) := S;
|
574
585
|
end if;
|
575
586
|
if Adjustment < 0 then
|
576
|
-
Context.P (C_Bra + S'Length + 1 .. Context.
|
577
|
-
:= Context.P (C_Ket + 1 .. Context.
|
587
|
+
Context.P (C_Bra + S'Length + 1 .. Context.Len + Adjustment + 1)
|
588
|
+
:= Context.P (C_Ket + 1 .. Context.Len + 1);
|
578
589
|
end if;
|
590
|
+
Context.Len := Context.Len + Adjustment;
|
579
591
|
Context.L := Context.L + Adjustment;
|
580
592
|
if Context.C >= C_Ket then
|
581
593
|
Context.C := Context.C + Adjustment;
|
@@ -138,10 +138,10 @@ private
|
|
138
138
|
Value : out Utf8_Type;
|
139
139
|
Count : out Natural);
|
140
140
|
|
141
|
-
function Length (Context : in Context_Type'Class) return Natural;
|
142
|
-
|
143
141
|
function Length_Utf8 (Context : in Context_Type'Class) return Natural;
|
144
142
|
|
143
|
+
function Length_Utf8 (S : in String) return Natural;
|
144
|
+
|
145
145
|
function Check_Among (Context : in Context_Type'Class;
|
146
146
|
Pos : in Char_Index;
|
147
147
|
Shift : in Natural;
|
@@ -181,19 +181,17 @@ private
|
|
181
181
|
S : in String;
|
182
182
|
Adjustment : out Integer) with
|
183
183
|
Global => null,
|
184
|
-
Pre =>
|
184
|
+
Pre => C_Ket >= C_Bra;
|
185
185
|
|
186
186
|
procedure Slice_Del (Context : in out Context_Type'Class) with
|
187
187
|
Global => null,
|
188
|
-
Pre => Context.
|
189
|
-
and Context.Ket <= Context.L;
|
188
|
+
Pre => Context.Ket >= Context.Bra;
|
190
189
|
|
191
190
|
procedure Slice_From (Context : in out Context_Type'Class;
|
192
191
|
Text : in String) with
|
193
192
|
Global => null,
|
194
|
-
Pre => Context.
|
195
|
-
and Context.Ket
|
196
|
-
and Context.L - Context.Lb + Text'Length + Context.Ket - Context.Bra < Context.P'Length;
|
193
|
+
Pre => Context.Ket >= Context.Bra
|
194
|
+
and Context.Len - (Context.Ket - Context.Bra) + Text'Length < Context.P'Length;
|
197
195
|
|
198
196
|
function Slice_To (Context : in Context_Type'Class) return String;
|
199
197
|
|
@@ -202,13 +200,15 @@ private
|
|
202
200
|
C_Ket : in Char_Index;
|
203
201
|
S : in String) with
|
204
202
|
Global => null,
|
205
|
-
Pre =>
|
203
|
+
Pre => C_Ket >= C_Bra
|
204
|
+
and Context.Len - (C_Ket - C_Bra) + S'Length < Context.P'Length;
|
206
205
|
|
207
206
|
-- The context indexes follow the C paradigm: they start at 0 for the first character.
|
208
207
|
-- This is necessary because several algorithms rely on this when they compare the
|
209
208
|
-- cursor position ('C') or set up some markers from the cursor.
|
210
209
|
type Context_Type is abstract tagged record
|
211
210
|
C : Char_Index := 0;
|
211
|
+
Len : Char_Index := 0;
|
212
212
|
L : Char_Index := 0;
|
213
213
|
Lb : Char_Index := 0;
|
214
214
|
Bra : Char_Index := 0;
|
@@ -12,7 +12,7 @@ abstract project Stemmer_Config is
|
|
12
12
|
|
13
13
|
package Builder is
|
14
14
|
case Mode is
|
15
|
-
when "debug" =>
|
15
|
+
when "debug" =>
|
16
16
|
for Default_Switches ("Ada") use ("-g", "-j" & Processors);
|
17
17
|
when others =>
|
18
18
|
for Default_Switches ("Ada") use ("-g", "-O3", "-j" & Processors);
|
@@ -50,7 +50,7 @@ abstract project Stemmer_Config is
|
|
50
50
|
when "debug" =>
|
51
51
|
for Default_Switches ("Ada") use ("-E");
|
52
52
|
|
53
|
-
|
53
|
+
when others =>
|
54
54
|
for Default_Switches ("Ada") use ("-E");
|
55
55
|
|
56
56
|
end case;
|
@@ -69,13 +69,13 @@ abstract project Stemmer_Config is
|
|
69
69
|
|
70
70
|
when "coverage" =>
|
71
71
|
for Default_Switches ("ada") use ("-fprofile-arcs");
|
72
|
-
|
73
|
-
|
74
|
-
|
72
|
+
|
73
|
+
when others =>
|
74
|
+
null;
|
75
75
|
end case;
|
76
76
|
|
77
|
-
end linker;
|
78
|
-
|
77
|
+
end linker;
|
78
|
+
|
79
79
|
package Ide is
|
80
80
|
for VCS_Kind use "git";
|
81
81
|
end Ide;
|
@@ -59,13 +59,8 @@ backwardmode (
|
|
59
59
|
( RV delete )
|
60
60
|
'garri' 'garria' 'tza'
|
61
61
|
(R2 delete)
|
62
|
-
'atseden'
|
63
|
-
(
|
64
|
-
'arabera'
|
65
|
-
(<- 'arabera')
|
66
|
-
'baditu'
|
67
|
-
(<- 'baditu')
|
68
|
-
|
62
|
+
'atseden' 'arabera' 'baditu'
|
63
|
+
( )
|
69
64
|
)
|
70
65
|
)
|
71
66
|
|
@@ -111,14 +106,8 @@ backwardmode (
|
|
111
106
|
(<- 'tra')
|
112
107
|
'minutuko'
|
113
108
|
(<- 'minutu')
|
114
|
-
'zehar'
|
115
|
-
(
|
116
|
-
'geldi'
|
117
|
-
(<- 'geldi')
|
118
|
-
'igaro'
|
119
|
-
(<- 'igaro')
|
120
|
-
'aurka'
|
121
|
-
(<- 'aurka')
|
109
|
+
'zehar' 'geldi' 'igaro' 'aurka'
|
110
|
+
( )
|
122
111
|
)
|
123
112
|
)
|
124
113
|
|
@@ -143,7 +132,3 @@ define stem as (
|
|
143
132
|
)
|
144
133
|
|
145
134
|
)
|
146
|
-
|
147
|
-
/*
|
148
|
-
Note 1: additions of 21 Jul 2010
|
149
|
-
*/
|
@@ -19,7 +19,7 @@ stringescapes {}
|
|
19
19
|
|
20
20
|
stringdef a' '{U+00E1}' // a-acute
|
21
21
|
stringdef a` '{U+00E0}' // a-grave
|
22
|
-
stringdef
|
22
|
+
stringdef cc '{U+00E7}' // c-cedilla
|
23
23
|
stringdef e' '{U+00E9}' // e-acute
|
24
24
|
stringdef e` '{U+00E8}' // e-grave
|
25
25
|
stringdef i' '{U+00ED}' // i-acute
|
@@ -90,7 +90,7 @@ backwardmode (
|
|
90
90
|
'i{o'}' 'itat' 'itats' 'itzar' 'iva' 'ives' 'ivisme' 'ius'
|
91
91
|
'fer' 'ment' 'amen' 'ament' 'aments' 'ments' 'ot' 'sfera' 'al' 'als' 'era' 'ana' 'iste'
|
92
92
|
'aire' 'eria' 'esa' 'eses' 'esos' 'or' '{i'}cia' '{i'}cies' 'icis' 'ici' '{i'}ci' '{i'}cis'
|
93
|
-
'{a`}ria' '{a`}ries' 'alla' 'ci{o'}' 'cions' 'n{
|
93
|
+
'{a`}ria' '{a`}ries' 'alla' 'ci{o'}' 'cions' 'n{cc}a' 'nces' '{o'}' 'dor' 'all'
|
94
94
|
'il' '{i'}stic' 'enc' 'enca' '{i'}s' 'issa' 'issos' '{i'}ssem' '{i'}ssiu' 'issem' 'isseu' '{i'}sseu'
|
95
95
|
'{o'}s' 'osa' 'dora' 'dores' 'dors' 'adura' 'ble' 'bles' '{i'}vol' '{i'}vola' 'd{i'}s' 'egar' 'ejar' 'ificar'
|
96
96
|
'itar' 'ables' 'adors' 'idores' 'idors'
|
@@ -193,10 +193,3 @@ define stem as (
|
|
193
193
|
)
|
194
194
|
do cleaning
|
195
195
|
)
|
196
|
-
|
197
|
-
/*
|
198
|
-
First works 2010/07/19
|
199
|
-
First Grammatical Reviews: https://ca.wikipedia.org/wiki/Gram%C3%A0tica_del_catal%C3%A0
|
200
|
-
Suffix list: https://ca.wikipedia.org/wiki/Llista_de_sufixos
|
201
|
-
Irregular Verbs: https://ca.wikipedia.org/wiki/Flexi%C3%B3_verbal_del_catal%C3%A0
|
202
|
-
*/
|