mittens 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (98)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/README.md +3 -3
  4. data/lib/mittens/version.rb +1 -1
  5. data/vendor/snowball/.github/workflows/ci.yml +216 -0
  6. data/vendor/snowball/CONTRIBUTING.rst +111 -62
  7. data/vendor/snowball/GNUmakefile +194 -136
  8. data/vendor/snowball/NEWS +798 -3
  9. data/vendor/snowball/README.rst +50 -1
  10. data/vendor/snowball/ada/src/stemmer.adb +25 -13
  11. data/vendor/snowball/ada/src/stemmer.ads +9 -9
  12. data/vendor/snowball/ada/stemmer_config.gpr +7 -7
  13. data/vendor/snowball/algorithms/basque.sbl +4 -19
  14. data/vendor/snowball/algorithms/catalan.sbl +2 -9
  15. data/vendor/snowball/algorithms/danish.sbl +1 -1
  16. data/vendor/snowball/algorithms/dutch.sbl +284 -122
  17. data/vendor/snowball/algorithms/dutch_porter.sbl +178 -0
  18. data/vendor/snowball/algorithms/english.sbl +52 -37
  19. data/vendor/snowball/algorithms/esperanto.sbl +157 -0
  20. data/vendor/snowball/algorithms/estonian.sbl +269 -0
  21. data/vendor/snowball/algorithms/finnish.sbl +2 -3
  22. data/vendor/snowball/algorithms/french.sbl +42 -16
  23. data/vendor/snowball/algorithms/german.sbl +35 -14
  24. data/vendor/snowball/algorithms/greek.sbl +76 -76
  25. data/vendor/snowball/algorithms/hungarian.sbl +8 -6
  26. data/vendor/snowball/algorithms/indonesian.sbl +14 -8
  27. data/vendor/snowball/algorithms/italian.sbl +11 -21
  28. data/vendor/snowball/algorithms/lithuanian.sbl +36 -37
  29. data/vendor/snowball/algorithms/lovins.sbl +0 -1
  30. data/vendor/snowball/algorithms/nepali.sbl +138 -37
  31. data/vendor/snowball/algorithms/norwegian.sbl +19 -5
  32. data/vendor/snowball/algorithms/porter.sbl +2 -2
  33. data/vendor/snowball/algorithms/portuguese.sbl +9 -13
  34. data/vendor/snowball/algorithms/romanian.sbl +17 -4
  35. data/vendor/snowball/algorithms/serbian.sbl +467 -468
  36. data/vendor/snowball/algorithms/spanish.sbl +5 -7
  37. data/vendor/snowball/algorithms/swedish.sbl +60 -6
  38. data/vendor/snowball/algorithms/tamil.sbl +207 -176
  39. data/vendor/snowball/algorithms/turkish.sbl +461 -445
  40. data/vendor/snowball/algorithms/yiddish.sbl +36 -38
  41. data/vendor/snowball/compiler/analyser.c +445 -192
  42. data/vendor/snowball/compiler/driver.c +109 -101
  43. data/vendor/snowball/compiler/generator.c +853 -464
  44. data/vendor/snowball/compiler/generator_ada.c +404 -366
  45. data/vendor/snowball/compiler/generator_csharp.c +297 -260
  46. data/vendor/snowball/compiler/generator_go.c +323 -254
  47. data/vendor/snowball/compiler/generator_java.c +326 -252
  48. data/vendor/snowball/compiler/generator_js.c +362 -252
  49. data/vendor/snowball/compiler/generator_pascal.c +349 -197
  50. data/vendor/snowball/compiler/generator_python.c +257 -240
  51. data/vendor/snowball/compiler/generator_rust.c +423 -251
  52. data/vendor/snowball/compiler/header.h +117 -71
  53. data/vendor/snowball/compiler/space.c +137 -68
  54. data/vendor/snowball/compiler/syswords.h +2 -2
  55. data/vendor/snowball/compiler/tokeniser.c +125 -107
  56. data/vendor/snowball/csharp/Snowball/Among.cs +14 -14
  57. data/vendor/snowball/csharp/Snowball/AssemblyInfo.cs +7 -7
  58. data/vendor/snowball/csharp/Snowball/Stemmer.cs +57 -37
  59. data/vendor/snowball/csharp/Stemwords/App.config +2 -2
  60. data/vendor/snowball/csharp/Stemwords/Program.cs +16 -12
  61. data/vendor/snowball/doc/libstemmer_c_README +7 -4
  62. data/vendor/snowball/doc/libstemmer_csharp_README +4 -1
  63. data/vendor/snowball/doc/libstemmer_java_README +12 -1
  64. data/vendor/snowball/doc/libstemmer_js_README +6 -4
  65. data/vendor/snowball/doc/libstemmer_python_README +9 -4
  66. data/vendor/snowball/examples/stemwords.c +12 -12
  67. data/vendor/snowball/go/env.go +107 -31
  68. data/vendor/snowball/go/util.go +0 -4
  69. data/vendor/snowball/include/libstemmer.h +4 -0
  70. data/vendor/snowball/java/org/tartarus/snowball/Among.java +32 -15
  71. data/vendor/snowball/java/org/tartarus/snowball/SnowballProgram.java +347 -261
  72. data/vendor/snowball/java/org/tartarus/snowball/SnowballStemmer.java +3 -0
  73. data/vendor/snowball/java/org/tartarus/snowball/TestApp.java +52 -37
  74. data/vendor/snowball/javascript/base-stemmer.js +186 -2
  75. data/vendor/snowball/javascript/stemwords.js +3 -6
  76. data/vendor/snowball/libstemmer/libstemmer_c.in +1 -1
  77. data/vendor/snowball/libstemmer/mkalgorithms.pl +6 -6
  78. data/vendor/snowball/libstemmer/mkmodules.pl +2 -2
  79. data/vendor/snowball/libstemmer/modules.txt +13 -10
  80. data/vendor/snowball/libstemmer/test.c +1 -1
  81. data/vendor/snowball/pascal/SnowballProgram.pas +84 -2
  82. data/vendor/snowball/pascal/generate.pl +13 -13
  83. data/vendor/snowball/python/create_init.py +4 -1
  84. data/vendor/snowball/python/setup.cfg +0 -3
  85. data/vendor/snowball/python/setup.py +8 -3
  86. data/vendor/snowball/python/snowballstemmer/basestemmer.py +20 -54
  87. data/vendor/snowball/python/stemwords.py +8 -12
  88. data/vendor/snowball/runtime/api.c +10 -5
  89. data/vendor/snowball/runtime/header.h +10 -9
  90. data/vendor/snowball/runtime/utilities.c +9 -9
  91. data/vendor/snowball/rust/build.rs +1 -1
  92. data/vendor/snowball/rust/src/snowball/snowball_env.rs +83 -5
  93. data/vendor/snowball/tests/stemtest.c +7 -4
  94. metadata +7 -7
  95. data/vendor/snowball/.travis.yml +0 -112
  96. data/vendor/snowball/algorithms/german2.sbl +0 -145
  97. data/vendor/snowball/algorithms/kraaij_pohlmann.sbl +0 -240
  98. data/vendor/snowball/compiler/syswords2.h +0 -13
@@ -24,7 +24,7 @@ What is Stemming?
24
24
 
25
25
  Stemming maps different forms of the same word to a common "stem" - for
26
26
  example, the English stemmer maps *connection*, *connections*, *connective*,
27
- *connected*, and *connecting* to *connect*. So a searching for *connected*
27
+ *connected*, and *connecting* to *connect*. So a search for *connected*
28
28
  would also find documents which only have the other forms.
29
29
 
30
30
  This stem form is often a word itself, but this is not always the case as this
@@ -35,3 +35,52 @@ stem), and over-stemming is more problematic than under-stemming so we tend not
35
35
  to stem in cases that are hard to resolve. If you want to always reduce words
36
36
  to a root form and/or get a root form which is itself a word then Snowball's
37
37
  stemming algorithms likely aren't the right answer.
38
+
39
+ Building Snowball
40
+ =================
41
+
42
+ GNU make is required to build Snowball.
43
+
44
+ The build system is currently structured as two separate stages for many of the
45
+ target languages.
46
+
47
+ The first stage builds the Snowball compiler and runs it to create target
48
+ language code (and it can also run tests on each stemmer). The expectation is
49
+ that you then create "distribution" tarballs of this code with ``make dist``
50
+ (or to create one for a specific target language, e.g. ``make
51
+ dist_libstemmer_c`` for C). These tarballs are created in the ``dist/``
52
+ subdirectory.
53
+
54
+ To actually build the libstemmer library you then unpack and build the
55
+ distribution tarball, e.g. for C::
56
+
57
+ tar xf dist/libstemmer_c-3.0.0.tar.gz
58
+ cd libstemmer_c-3.0.0
59
+ make
60
+
61
+ Cross-compiling
62
+ ---------------
63
+
64
+ If cross-compiling starting from the git repo, the Snowball compiler needs to
65
+ be built with a native compiler then libstemmer with the cross-compiler. For
66
+ example::
67
+
68
+ make CC=cc dist_libstemmer_c
69
+ tar xf dist/libstemmer_c-3.0.0.tar.gz
70
+ cd libstemmer_c-3.0.0
71
+ make CC=riscv64-unknown-linux-gnu-gcc
72
+
73
+ If you are cross-compiling to or from Microsoft Windows, you'll need to also
74
+ work around an assumption in libstemmer's ``Makefile`` which sets ``EXEEXT``
75
+ based on the OS you are building on::
76
+
77
+ ifeq ($(OS),Windows_NT)
78
+ EXEEXT=.exe
79
+ endif
80
+
81
+ For example, if cross-compiling from Linux to Microsoft Windows, use something
82
+ like this for the libstemmer build::
83
+
84
+ make CC=x86_64-w64-mingw32-gcc EXEEXT=.exe
85
+
86
+ When going the other way, you'll need to use ``EXEEXT=``.
@@ -38,6 +38,7 @@ package body Stemmer with SPARK_Mode is
38
38
  Result : out Boolean) is
39
39
  begin
40
40
  Context.P (1 .. Word'Length) := Word;
41
+ Context.Len := Word'Length;
41
42
  Context.C := 0;
42
43
  Context.L := Word'Length;
43
44
  Context.Lb := 0;
@@ -46,7 +47,7 @@ package body Stemmer with SPARK_Mode is
46
47
 
47
48
  function Get_Result (Context : in Context_Type'Class) return String is
48
49
  begin
49
- return Context.P (1 .. Context.L);
50
+ return Context.P (1 .. Context.Len);
50
51
  end Get_Result;
51
52
 
52
53
  function Eq_S (Context : in Context_Type'Class;
@@ -73,17 +74,12 @@ package body Stemmer with SPARK_Mode is
73
74
  return S'Length;
74
75
  end Eq_S_Backward;
75
76
 
76
- function Length (Context : in Context_Type'Class) return Natural is
77
- begin
78
- return Context.L - Context.Lb;
79
- end Length;
80
-
81
77
  function Length_Utf8 (Context : in Context_Type'Class) return Natural is
82
78
  Count : Natural := 0;
83
79
  Pos : Positive := 1;
84
80
  Val : Byte;
85
81
  begin
86
- while Pos <= Context.L loop
82
+ while Pos <= Context.Len loop
87
83
  Val := Character'Pos (Context.P (Pos));
88
84
  Pos := Pos + 1;
89
85
  if Val >= 16#C0# or Val < 16#80# then
@@ -93,6 +89,21 @@ package body Stemmer with SPARK_Mode is
93
89
  return Count;
94
90
  end Length_Utf8;
95
91
 
92
+ function Length_Utf8 (S : in String) return Natural is
93
+ Count : Natural := 0;
94
+ Pos : Positive := 1;
95
+ Val : Byte;
96
+ begin
97
+ while Pos <= S'Length loop
98
+ Val := Character'Pos (S (Pos));
99
+ Pos := Pos + 1;
100
+ if Val >= 16#C0# or Val < 16#80# then
101
+ Count := Count + 1;
102
+ end if;
103
+ end loop;
104
+ return Count;
105
+ end Length_Utf8;
106
+
96
107
  function Check_Among (Context : in Context_Type'Class;
97
108
  Pos : in Char_Index;
98
109
  Shift : in Natural;
@@ -464,7 +475,7 @@ package body Stemmer with SPARK_Mode is
464
475
  Ch : Utf8_Type;
465
476
  Count : Natural;
466
477
  begin
467
- if Context.C = 0 then
478
+ if Context.C <= Context.Lb then
468
479
  Result := -1;
469
480
  return;
470
481
  end if;
@@ -532,7 +543,7 @@ package body Stemmer with SPARK_Mode is
532
543
  Ch : Utf8_Type;
533
544
  Count : Natural;
534
545
  begin
535
- if Context.C = 0 then
546
+ if Context.C <= Context.Lb then
536
547
  Result := -1;
537
548
  return;
538
549
  end if;
@@ -566,16 +577,17 @@ package body Stemmer with SPARK_Mode is
566
577
  begin
567
578
  Adjustment := S'Length - (C_Ket - C_Bra);
568
579
  if Adjustment > 0 then
569
- Context.P (C_Bra + S'Length + 1 .. Context.Lb + Adjustment + 1)
570
- := Context.P (C_Ket + 1 .. Context.Lb + 1);
580
+ Context.P (C_Bra + S'Length + 1 .. Context.Len + Adjustment + 1)
581
+ := Context.P (C_Ket + 1 .. Context.Len + 1);
571
582
  end if;
572
583
  if S'Length > 0 then
573
584
  Context.P (C_Bra + 1 .. C_Bra + S'Length) := S;
574
585
  end if;
575
586
  if Adjustment < 0 then
576
- Context.P (C_Bra + S'Length + 1 .. Context.L + Adjustment + 1)
577
- := Context.P (C_Ket + 1 .. Context.L + 1);
587
+ Context.P (C_Bra + S'Length + 1 .. Context.Len + Adjustment + 1)
588
+ := Context.P (C_Ket + 1 .. Context.Len + 1);
578
589
  end if;
590
+ Context.Len := Context.Len + Adjustment;
579
591
  Context.L := Context.L + Adjustment;
580
592
  if Context.C >= C_Ket then
581
593
  Context.C := Context.C + Adjustment;
@@ -138,10 +138,10 @@ private
138
138
  Value : out Utf8_Type;
139
139
  Count : out Natural);
140
140
 
141
- function Length (Context : in Context_Type'Class) return Natural;
142
-
143
141
  function Length_Utf8 (Context : in Context_Type'Class) return Natural;
144
142
 
143
+ function Length_Utf8 (S : in String) return Natural;
144
+
145
145
  function Check_Among (Context : in Context_Type'Class;
146
146
  Pos : in Char_Index;
147
147
  Shift : in Natural;
@@ -181,19 +181,17 @@ private
181
181
  S : in String;
182
182
  Adjustment : out Integer) with
183
183
  Global => null,
184
- Pre => C_Bra >= Context.Lb and C_Ket >= C_Bra and C_Ket <= Context.L;
184
+ Pre => C_Ket >= C_Bra;
185
185
 
186
186
  procedure Slice_Del (Context : in out Context_Type'Class) with
187
187
  Global => null,
188
- Pre => Context.Bra >= Context.Lb and Context.Ket >= Context.Bra
189
- and Context.Ket <= Context.L;
188
+ Pre => Context.Ket >= Context.Bra;
190
189
 
191
190
  procedure Slice_From (Context : in out Context_Type'Class;
192
191
  Text : in String) with
193
192
  Global => null,
194
- Pre => Context.Bra >= Context.Lb and Context.Ket >= Context.Bra
195
- and Context.Ket <= Context.L
196
- and Context.L - Context.Lb + Text'Length + Context.Ket - Context.Bra < Context.P'Length;
193
+ Pre => Context.Ket >= Context.Bra
194
+ and Context.Len - (Context.Ket - Context.Bra) + Text'Length < Context.P'Length;
197
195
 
198
196
  function Slice_To (Context : in Context_Type'Class) return String;
199
197
 
@@ -202,13 +200,15 @@ private
202
200
  C_Ket : in Char_Index;
203
201
  S : in String) with
204
202
  Global => null,
205
- Pre => C_Bra >= Context.Lb and C_Ket >= C_Bra and C_Ket <= Context.L;
203
+ Pre => C_Ket >= C_Bra
204
+ and Context.Len - (C_Ket - C_Bra) + S'Length < Context.P'Length;
206
205
 
207
206
  -- The context indexes follow the C paradigm: they start at 0 for the first character.
208
207
  -- This is necessary because several algorithms rely on this when they compare the
209
208
  -- cursor position ('C') or setup some markers from the cursor.
210
209
  type Context_Type is abstract tagged record
211
210
  C : Char_Index := 0;
211
+ Len : Char_Index := 0;
212
212
  L : Char_Index := 0;
213
213
  Lb : Char_Index := 0;
214
214
  Bra : Char_Index := 0;
@@ -12,7 +12,7 @@ abstract project Stemmer_Config is
12
12
 
13
13
  package Builder is
14
14
  case Mode is
15
- when "debug" =>
15
+ when "debug" =>
16
16
  for Default_Switches ("Ada") use ("-g", "-j" & Processors);
17
17
  when others =>
18
18
  for Default_Switches ("Ada") use ("-g", "-O3", "-j" & Processors);
@@ -50,7 +50,7 @@ abstract project Stemmer_Config is
50
50
  when "debug" =>
51
51
  for Default_Switches ("Ada") use ("-E");
52
52
 
53
- when others =>
53
+ when others =>
54
54
  for Default_Switches ("Ada") use ("-E");
55
55
 
56
56
  end case;
@@ -69,13 +69,13 @@ abstract project Stemmer_Config is
69
69
 
70
70
  when "coverage" =>
71
71
  for Default_Switches ("ada") use ("-fprofile-arcs");
72
-
73
- when others =>
74
- null;
72
+
73
+ when others =>
74
+ null;
75
75
  end case;
76
76
 
77
- end linker;
78
-
77
+ end linker;
78
+
79
79
  package Ide is
80
80
  for VCS_Kind use "git";
81
81
  end Ide;
@@ -59,13 +59,8 @@ backwardmode (
59
59
  ( RV delete )
60
60
  'garri' 'garria' 'tza'
61
61
  (R2 delete)
62
- 'atseden'
63
- (<- 'atseden')
64
- 'arabera'
65
- (<- 'arabera')
66
- 'baditu'
67
- (<- 'baditu')
68
-
62
+ 'atseden' 'arabera' 'baditu'
63
+ ( )
69
64
  )
70
65
  )
71
66
 
@@ -111,14 +106,8 @@ backwardmode (
111
106
  (<- 'tra')
112
107
  'minutuko'
113
108
  (<- 'minutu')
114
- 'zehar'
115
- (<- 'zehar')
116
- 'geldi'
117
- (<- 'geldi')
118
- 'igaro'
119
- (<- 'igaro')
120
- 'aurka'
121
- (<- 'aurka')
109
+ 'zehar' 'geldi' 'igaro' 'aurka'
110
+ ( )
122
111
  )
123
112
  )
124
113
 
@@ -143,7 +132,3 @@ define stem as (
143
132
  )
144
133
 
145
134
  )
146
-
147
- /*
148
- Note 1: additions of 21 Jul 2010
149
- */
@@ -19,7 +19,7 @@ stringescapes {}
19
19
 
20
20
  stringdef a' '{U+00E1}' // a-acute
21
21
  stringdef a` '{U+00E0}' // a-grave
22
- stringdef c, '{U+00E7}' // c-cedilla
22
+ stringdef cc '{U+00E7}' // c-cedilla
23
23
  stringdef e' '{U+00E9}' // e-acute
24
24
  stringdef e` '{U+00E8}' // e-grave
25
25
  stringdef i' '{U+00ED}' // i-acute
@@ -90,7 +90,7 @@ backwardmode (
90
90
  'i{o'}' 'itat' 'itats' 'itzar' 'iva' 'ives' 'ivisme' 'ius'
91
91
  'fer' 'ment' 'amen' 'ament' 'aments' 'ments' 'ot' 'sfera' 'al' 'als' 'era' 'ana' 'iste'
92
92
  'aire' 'eria' 'esa' 'eses' 'esos' 'or' '{i'}cia' '{i'}cies' 'icis' 'ici' '{i'}ci' '{i'}cis'
93
- '{a`}ria' '{a`}ries' 'alla' 'ci{o'}' 'cions' 'n{c,}a' 'nces' '{o'}' 'dor' 'all'
93
+ '{a`}ria' '{a`}ries' 'alla' 'ci{o'}' 'cions' 'n{cc}a' 'nces' '{o'}' 'dor' 'all'
94
94
  'il' '{i'}stic' 'enc' 'enca' '{i'}s' 'issa' 'issos' '{i'}ssem' '{i'}ssiu' 'issem' 'isseu' '{i'}sseu'
95
95
  '{o'}s' 'osa' 'dora' 'dores' 'dors' 'adura' 'ble' 'bles' '{i'}vol' '{i'}vola' 'd{i'}s' 'egar' 'ejar' 'ificar'
96
96
  'itar' 'ables' 'adors' 'idores' 'idors'
@@ -193,10 +193,3 @@ define stem as (
193
193
  )
194
194
  do cleaning
195
195
  )
196
-
197
- /*
198
- First works 2010/07/19
199
- First Grammatical Reviews: https://ca.wikipedia.org/wiki/Gram%C3%A0tica_del_catal%C3%A0
200
- Suffix list: https://ca.wikipedia.org/wiki/Llista_de_sufixos
201
- Irregular Verbs: https://ca.wikipedia.org/wiki/Flexi%C3%B3_verbal_del_catal%C3%A0
202
- */
@@ -33,7 +33,7 @@ define mark_regions as (
33
33
  $p1 = limit
34
34
 
35
35
  test ( hop 3 setmark x )
36
- goto v gopast non-v setmark p1
36
+ gopast v gopast non-v setmark p1
37
37
  try ( $p1 < x $p1 = x )
38
38
  )
39
39