mittens 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (137) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +3 -0
  3. data/Gemfile +7 -0
  4. data/LICENSE.txt +30 -0
  5. data/README.md +62 -0
  6. data/Rakefile +21 -0
  7. data/ext/mittens/ext.c +96 -0
  8. data/ext/mittens/extconf.rb +12 -0
  9. data/lib/mittens/version.rb +3 -0
  10. data/lib/mittens.rb +7 -0
  11. data/mittens.gemspec +22 -0
  12. data/vendor/snowball/.gitignore +26 -0
  13. data/vendor/snowball/.travis.yml +112 -0
  14. data/vendor/snowball/AUTHORS +27 -0
  15. data/vendor/snowball/CONTRIBUTING.rst +216 -0
  16. data/vendor/snowball/COPYING +29 -0
  17. data/vendor/snowball/GNUmakefile +742 -0
  18. data/vendor/snowball/NEWS +754 -0
  19. data/vendor/snowball/README.rst +37 -0
  20. data/vendor/snowball/ada/README.md +74 -0
  21. data/vendor/snowball/ada/generate/generate.adb +83 -0
  22. data/vendor/snowball/ada/generate.gpr +21 -0
  23. data/vendor/snowball/ada/src/stemmer.adb +620 -0
  24. data/vendor/snowball/ada/src/stemmer.ads +219 -0
  25. data/vendor/snowball/ada/src/stemwords.adb +70 -0
  26. data/vendor/snowball/ada/stemmer_config.gpr +83 -0
  27. data/vendor/snowball/ada/stemwords.gpr +21 -0
  28. data/vendor/snowball/algorithms/arabic.sbl +558 -0
  29. data/vendor/snowball/algorithms/armenian.sbl +301 -0
  30. data/vendor/snowball/algorithms/basque.sbl +149 -0
  31. data/vendor/snowball/algorithms/catalan.sbl +202 -0
  32. data/vendor/snowball/algorithms/danish.sbl +93 -0
  33. data/vendor/snowball/algorithms/dutch.sbl +164 -0
  34. data/vendor/snowball/algorithms/english.sbl +229 -0
  35. data/vendor/snowball/algorithms/finnish.sbl +197 -0
  36. data/vendor/snowball/algorithms/french.sbl +254 -0
  37. data/vendor/snowball/algorithms/german.sbl +139 -0
  38. data/vendor/snowball/algorithms/german2.sbl +145 -0
  39. data/vendor/snowball/algorithms/greek.sbl +701 -0
  40. data/vendor/snowball/algorithms/hindi.sbl +323 -0
  41. data/vendor/snowball/algorithms/hungarian.sbl +241 -0
  42. data/vendor/snowball/algorithms/indonesian.sbl +192 -0
  43. data/vendor/snowball/algorithms/irish.sbl +149 -0
  44. data/vendor/snowball/algorithms/italian.sbl +202 -0
  45. data/vendor/snowball/algorithms/kraaij_pohlmann.sbl +240 -0
  46. data/vendor/snowball/algorithms/lithuanian.sbl +373 -0
  47. data/vendor/snowball/algorithms/lovins.sbl +208 -0
  48. data/vendor/snowball/algorithms/nepali.sbl +92 -0
  49. data/vendor/snowball/algorithms/norwegian.sbl +80 -0
  50. data/vendor/snowball/algorithms/porter.sbl +139 -0
  51. data/vendor/snowball/algorithms/portuguese.sbl +218 -0
  52. data/vendor/snowball/algorithms/romanian.sbl +236 -0
  53. data/vendor/snowball/algorithms/russian.sbl +221 -0
  54. data/vendor/snowball/algorithms/serbian.sbl +2379 -0
  55. data/vendor/snowball/algorithms/spanish.sbl +230 -0
  56. data/vendor/snowball/algorithms/swedish.sbl +72 -0
  57. data/vendor/snowball/algorithms/tamil.sbl +405 -0
  58. data/vendor/snowball/algorithms/turkish.sbl +470 -0
  59. data/vendor/snowball/algorithms/yiddish.sbl +460 -0
  60. data/vendor/snowball/charsets/ISO-8859-2.sbl +98 -0
  61. data/vendor/snowball/charsets/KOI8-R.sbl +74 -0
  62. data/vendor/snowball/charsets/cp850.sbl +130 -0
  63. data/vendor/snowball/compiler/analyser.c +1547 -0
  64. data/vendor/snowball/compiler/driver.c +615 -0
  65. data/vendor/snowball/compiler/generator.c +1748 -0
  66. data/vendor/snowball/compiler/generator_ada.c +1702 -0
  67. data/vendor/snowball/compiler/generator_csharp.c +1322 -0
  68. data/vendor/snowball/compiler/generator_go.c +1278 -0
  69. data/vendor/snowball/compiler/generator_java.c +1313 -0
  70. data/vendor/snowball/compiler/generator_js.c +1316 -0
  71. data/vendor/snowball/compiler/generator_pascal.c +1387 -0
  72. data/vendor/snowball/compiler/generator_python.c +1337 -0
  73. data/vendor/snowball/compiler/generator_rust.c +1295 -0
  74. data/vendor/snowball/compiler/header.h +418 -0
  75. data/vendor/snowball/compiler/space.c +286 -0
  76. data/vendor/snowball/compiler/syswords.h +86 -0
  77. data/vendor/snowball/compiler/syswords2.h +13 -0
  78. data/vendor/snowball/compiler/tokeniser.c +567 -0
  79. data/vendor/snowball/csharp/.gitignore +8 -0
  80. data/vendor/snowball/csharp/Snowball/Algorithms/.gitignore +1 -0
  81. data/vendor/snowball/csharp/Snowball/Among.cs +108 -0
  82. data/vendor/snowball/csharp/Snowball/AssemblyInfo.cs +36 -0
  83. data/vendor/snowball/csharp/Snowball/Stemmer.cs +660 -0
  84. data/vendor/snowball/csharp/Stemwords/App.config +6 -0
  85. data/vendor/snowball/csharp/Stemwords/Program.cs +114 -0
  86. data/vendor/snowball/doc/TODO +12 -0
  87. data/vendor/snowball/doc/libstemmer_c_README +148 -0
  88. data/vendor/snowball/doc/libstemmer_csharp_README +53 -0
  89. data/vendor/snowball/doc/libstemmer_java_README +67 -0
  90. data/vendor/snowball/doc/libstemmer_js_README +48 -0
  91. data/vendor/snowball/doc/libstemmer_python_README +113 -0
  92. data/vendor/snowball/examples/stemwords.c +204 -0
  93. data/vendor/snowball/go/README.md +55 -0
  94. data/vendor/snowball/go/among.go +16 -0
  95. data/vendor/snowball/go/env.go +403 -0
  96. data/vendor/snowball/go/stemwords/generate.go +68 -0
  97. data/vendor/snowball/go/stemwords/main.go +68 -0
  98. data/vendor/snowball/go/util.go +34 -0
  99. data/vendor/snowball/iconv.py +50 -0
  100. data/vendor/snowball/include/libstemmer.h +78 -0
  101. data/vendor/snowball/java/org/tartarus/snowball/Among.java +29 -0
  102. data/vendor/snowball/java/org/tartarus/snowball/SnowballProgram.java +381 -0
  103. data/vendor/snowball/java/org/tartarus/snowball/SnowballStemmer.java +8 -0
  104. data/vendor/snowball/java/org/tartarus/snowball/TestApp.java +75 -0
  105. data/vendor/snowball/javascript/base-stemmer.js +294 -0
  106. data/vendor/snowball/javascript/stemwords.js +106 -0
  107. data/vendor/snowball/libstemmer/libstemmer_c.in +96 -0
  108. data/vendor/snowball/libstemmer/mkalgorithms.pl +90 -0
  109. data/vendor/snowball/libstemmer/mkmodules.pl +267 -0
  110. data/vendor/snowball/libstemmer/modules.txt +63 -0
  111. data/vendor/snowball/libstemmer/test.c +34 -0
  112. data/vendor/snowball/pascal/.gitignore +4 -0
  113. data/vendor/snowball/pascal/SnowballProgram.pas +430 -0
  114. data/vendor/snowball/pascal/generate.pl +23 -0
  115. data/vendor/snowball/pascal/stemwords-template.dpr +78 -0
  116. data/vendor/snowball/python/MANIFEST.in +7 -0
  117. data/vendor/snowball/python/create_init.py +54 -0
  118. data/vendor/snowball/python/setup.cfg +6 -0
  119. data/vendor/snowball/python/setup.py +81 -0
  120. data/vendor/snowball/python/snowballstemmer/among.py +13 -0
  121. data/vendor/snowball/python/snowballstemmer/basestemmer.py +323 -0
  122. data/vendor/snowball/python/stemwords.py +101 -0
  123. data/vendor/snowball/python/testapp.py +28 -0
  124. data/vendor/snowball/runtime/api.c +58 -0
  125. data/vendor/snowball/runtime/api.h +32 -0
  126. data/vendor/snowball/runtime/header.h +61 -0
  127. data/vendor/snowball/runtime/utilities.c +513 -0
  128. data/vendor/snowball/rust/Cargo.toml +7 -0
  129. data/vendor/snowball/rust/build.rs +55 -0
  130. data/vendor/snowball/rust/rust-pre-1.27-compat.patch +30 -0
  131. data/vendor/snowball/rust/src/main.rs +102 -0
  132. data/vendor/snowball/rust/src/snowball/algorithms/mod.rs +2 -0
  133. data/vendor/snowball/rust/src/snowball/among.rs +6 -0
  134. data/vendor/snowball/rust/src/snowball/mod.rs +6 -0
  135. data/vendor/snowball/rust/src/snowball/snowball_env.rs +421 -0
  136. data/vendor/snowball/tests/stemtest.c +95 -0
  137. metadata +178 -0
@@ -0,0 +1,37 @@
1
+ Snowball is a small string processing language for creating stemming algorithms
2
+ for use in Information Retrieval, plus a collection of stemming algorithms
3
+ implemented using it.
4
+
5
+ Snowball was originally designed and built by Martin Porter. Martin retired
6
+ from development in 2014 and Snowball is now maintained as a community project.
7
+ Martin originally chose the name Snowball as a tribute to SNOBOL, the excellent
8
+ string handling language from the 1960s. It now also serves as a metaphor for
9
+ how the project grows by gathering contributions over time.
10
+
11
+ The Snowball compiler translates a Snowball program into source code in another
12
+ language - currently Ada, ISO C, C#, Go, Java, Javascript, Object Pascal,
13
+ Python and Rust are supported.
14
+
15
+ This repository contains the source code for the snowball compiler and the
16
+ stemming algorithms. The snowball compiler is written in ISO C - you'll need
17
+ a C compiler which supports C99 to build it (but the C code it generates should
18
+ work with any ISO C compiler).
19
+
20
+ See https://snowballstem.org/ for more information about Snowball.
21
+
22
+ What is Stemming?
23
+ =================
24
+
25
+ Stemming maps different forms of the same word to a common "stem" - for
26
+ example, the English stemmer maps *connection*, *connections*, *connective*,
27
+ *connected*, and *connecting* to *connect*. So a search for *connected*
28
+ would also find documents which only have the other forms.
29
+
30
+ This stem form is often a word itself, but this is not always the case as this
31
+ is not a requirement for text search systems, which are the intended field of
32
+ use. We also aim to conflate words with the same meaning, rather than all
33
+ words with a common linguistic root (so *awe* and *awful* don't have the same
34
+ stem), and over-stemming is more problematic than under-stemming so we tend not
35
+ to stem in cases that are hard to resolve. If you want to always reduce words
36
+ to a root form and/or get a root form which is itself a word then Snowball's
37
+ stemming algorithms likely aren't the right answer.
@@ -0,0 +1,74 @@
1
+ # Ada Target for Snowball
2
+
3
+ The Ada Snowball generator generates an Ada child package for each Snowball algorithm.
4
+ The parent package is named `Stemmer` and it provides various operations used by the generated code.
5
+ The `Stemmer` package contains the Ada Snowball runtime available either in the `ada/src` directory
6
+ or from https://github.com/stcarrez/ada-stemmer.
7
+
8
+ The generated child package declares the `Context_Type` tagged type and the `Stem` procedure:
9
+
10
+ ```Ada
11
+ package Stemmer.<Algorithm-name> is
12
+ type Context_Type is new Stemmer.Context_Type with private;
13
+ procedure Stem (Z : in out Context_Type; Result : out Boolean);
14
+ private
15
+ type Context_Type is new Stemmer.Context_Type with record
16
+ ...
17
+ end record;
18
+ end Stemmer.<Algorithm-name>;
19
+ ```
20
+
21
+ The generated operation can be used directly or through the `Stemmer.Factory` package.
22
+
23
+ ## Usage
24
+
25
+ To generate Ada source for a Snowball algorithm:
26
+ ```
27
+ $ snowball path/to/algorithm.sbl -ada -P <algorithm-name> -o src/stemmer-<algorithm>
28
+ ```
29
+
30
+ ### Ada specific options
31
+
32
+ `-P <Algorithm-name>` the child package name used in the generated Ada file (defaults to `snowball`).
33
+ It must be a valid Ada identifier.
34
+
35
+ ## Code Organization
36
+
37
+ `compiler/generator_ada.c` has the Ada code generation logic
38
+
39
+ `ada/src` contains the default Ada Snowball runtime support which is also
40
+ available at https://github.com/stcarrez/ada-stemmer
41
+
42
+ `ada/algorithms` is the location where the makefile-generated code will end up
43
+
44
+ ## Using the Generated Stemmers
45
+
46
+ To use the generated stemmer, import the Ada generated package, declare an instance
47
+ of the generated `Context_Type` and call the `Stem_Word` procedure.
48
+
49
+ ```
50
+ with Stemmer.English;
51
+
52
+ Ctx : Stemmer.English.Context_Type;
53
+ Result : Boolean;
54
+
55
+ Ctx.Stem_Word ("zealously", Result);
56
+ if Result then
57
+ Ada.Text_IO.Put_Line (Ctx.Get_Result);
58
+ end if;
59
+ ```
60
+
61
+ You can use the context as many times as you want.
62
+
63
+ ## Testing
64
+
65
+ To run the tests, you will need an Ada compiler such as GNAT as well as the `gprbuild` build tool.
66
+
67
+ Only the existing Snowball algorithms have been used for testing. This does not exercise all features of the language.
68
+
69
+ Run:
70
+
71
+ ```
72
+ $ make check_ada
73
+ ```
74
+
@@ -0,0 +1,83 @@
1
+ with Ada.Characters.Handling;
2
+ with Ada.Text_IO;
3
+ with Ada.Command_Line;
4
+ with Ada.Containers.Indefinite_Vectors;
5
+ procedure Generate is
6
+    --  Writes the Ada sources of the Stemmer.Factory package to the files
7
+    --  "stemmer-factory.ads" and "stemmer-factory.adb".  Every command line
8
+    --  argument names one language: it is lower-cased, becomes the enumeration
9
+    --  literal L_<NAME> and selects the Stemmer.<Name> package used by the
10
+    --  generated Stem function.  With no argument, or with an empty one,
11
+    --  nothing is written: a message goes to standard error and the exit
12
+    --  status is set to Failure.
13
+
14
+    use Ada.Characters.Handling;
15
+    use Ada.Text_IO;
16
+
17
+    package String_Vectors is new Ada.Containers.Indefinite_Vectors
18
+      (Element_Type => String,
19
+       Index_Type => Positive);
20
+
21
+    --  Lower-cased language names, in command line order.
22
+    Languages : String_Vectors.Vector;
23
+
24
+    --  Return S with its first character upper-cased.  An empty S is returned
25
+    --  unchanged rather than raising Constraint_Error on S (S'First).
26
+    function Capitalize (S : in String) return String is
27
+      (if S'Length = 0 then S
28
+       else To_Upper (S (S'First)) & S (S'First + 1 .. S'Last));
29
+
30
+    --  Write the package specification: the Language_Type enumeration with
31
+    --  one literal per language and the declaration of Stem.
32
+    procedure Write_Spec is
33
+       File : File_Type;
34
+       I : Natural := 0;  --  Number of literals written so far.
35
+    begin
36
+       Create (File, Out_File, "stemmer-factory.ads");
37
+       Put_Line (File, "package Stemmer.Factory with SPARK_Mode is");
38
+       New_Line (File);
39
+       Put (File, " type Language_Type is (");
40
+       for Lang of Languages loop
41
+          Put (File, "L_" & To_Upper (Lang));
42
+          I := I + 1;
43
+          --  Separate literals with a comma; none after the last one.
44
+          if I < Natural (Languages.Length) then
45
+             Put_Line (File, ",");
46
+             Put (File, " ");
47
+          end if;
48
+       end loop;
49
+       Put_Line (File, ");");
50
+       New_Line (File);
51
+       Put_Line (File, " function Stem (Language : in Language_Type;");
52
+       Put_Line (File, " Word : in String) return String;");
53
+       New_Line (File);
54
+       Put_Line (File, "end Stemmer.Factory;");
55
+       Close (File);
56
+    end Write_Spec;
57
+
58
+    --  Write the package body: one "with" clause per language and a Stem
59
+    --  function whose case statement has one branch per language, each
60
+    --  declaring a context of the matching Stemmer.<Name> package.
61
+    procedure Write_Body is
62
+       File : File_Type;
63
+    begin
64
+       Create (File, Out_File, "stemmer-factory.adb");
65
+       for Lang of Languages loop
66
+          Put_Line (File, "with Stemmer." & Capitalize (Lang) & ";");
67
+       end loop;
68
+       Put_Line (File, "package body Stemmer.Factory with SPARK_Mode is");
69
+       New_Line (File);
70
+       Put_Line (File, " function Stem (Language : in Language_Type;");
71
+       Put_Line (File, " Word : in String) return String is");
72
+       Put_Line (File, " Result : Boolean := False;");
73
+       Put_Line (File, " begin");
74
+       Put_Line (File, " case Language is");
75
+       for Lang of Languages loop
76
+          Put_Line (File, " when L_" & To_Upper (Lang) & " =>");
77
+          Put_Line (File, " declare");
78
+          Put_Line (File, " C : Stemmer." & Capitalize (Lang) & ".Context_Type;");
79
+          Put_Line (File, " begin");
80
+          Put_Line (File, " C.Stem_Word (Word, Result);");
81
+          Put_Line (File, " return Get_Result (C);");
82
+          Put_Line (File, " end;");
83
+          New_Line (File);
84
+       end loop;
85
+       Put_Line (File, " end case;");
86
+       Put_Line (File, " end Stem;");
87
+       New_Line (File);
88
+       Put_Line (File, "end Stemmer.Factory;");
89
+       Close (File);
90
+    end Write_Body;
91
+
92
+    Count : constant Natural := Ada.Command_Line.Argument_Count;
93
+
94
+ begin
95
+    --  Without any language the generated enumeration and case statement
96
+    --  would be empty, which is not valid Ada, so refuse to generate anything.
97
+    if Count = 0 then
98
+       Put_Line (Standard_Error, "Usage: generate LANGUAGE...");
99
+       Ada.Command_Line.Set_Exit_Status (Ada.Command_Line.Failure);
100
+       return;
101
+    end if;
102
+
103
+    --  Validate every argument before any file is created, so that a bad
104
+    --  argument never leaves a partially written source file behind.
105
+    for I in 1 .. Count loop
106
+       --  An empty name cannot form an enumeration literal or a package name.
107
+       if Ada.Command_Line.Argument (I) = "" then
108
+          Put_Line (Standard_Error, "generate: empty language name");
109
+          Ada.Command_Line.Set_Exit_Status (Ada.Command_Line.Failure);
110
+          return;
111
+       end if;
112
+       Languages.Append (To_Lower (Ada.Command_Line.Argument (I)));
113
+    end loop;
114
+    Write_Spec;
115
+    Write_Body;
116
+ end Generate;
@@ -0,0 +1,21 @@
1
+ with "stemmer_config";
2
+ project Generate is
3
+ --  Project for the "generate" tool; its only main unit is generate.adb.
4
+ Mains := ("generate.adb");
5
+
6
+ for Main use Mains;
7
+ --  Sources are taken from the "generate" directory.
8
+ for Source_Dirs use ("generate");
9
+ --  Objects go to "obj" and executables to "bin" under the Stemmer_Config directories.
10
+ for Object_Dir use "./" & Stemmer_Config'Object_Dir & "/obj";
11
+ for Exec_Dir use "./" & Stemmer_Config'Exec_Dir & "/bin";
12
+ --  The tool packages below are reused as declared in Stemmer_Config.
13
+ package Binder renames Stemmer_Config.Binder;
14
+
15
+ package Builder renames Stemmer_Config.Builder;
16
+
17
+ package Compiler renames Stemmer_Config.Compiler;
18
+
19
+ package Linker renames Stemmer_Config.Linker;
20
+
21
+ end Generate;