mittens 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (137) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +3 -0
  3. data/Gemfile +7 -0
  4. data/LICENSE.txt +30 -0
  5. data/README.md +62 -0
  6. data/Rakefile +21 -0
  7. data/ext/mittens/ext.c +96 -0
  8. data/ext/mittens/extconf.rb +12 -0
  9. data/lib/mittens/version.rb +3 -0
  10. data/lib/mittens.rb +7 -0
  11. data/mittens.gemspec +22 -0
  12. data/vendor/snowball/.gitignore +26 -0
  13. data/vendor/snowball/.travis.yml +112 -0
  14. data/vendor/snowball/AUTHORS +27 -0
  15. data/vendor/snowball/CONTRIBUTING.rst +216 -0
  16. data/vendor/snowball/COPYING +29 -0
  17. data/vendor/snowball/GNUmakefile +742 -0
  18. data/vendor/snowball/NEWS +754 -0
  19. data/vendor/snowball/README.rst +37 -0
  20. data/vendor/snowball/ada/README.md +74 -0
  21. data/vendor/snowball/ada/generate/generate.adb +83 -0
  22. data/vendor/snowball/ada/generate.gpr +21 -0
  23. data/vendor/snowball/ada/src/stemmer.adb +620 -0
  24. data/vendor/snowball/ada/src/stemmer.ads +219 -0
  25. data/vendor/snowball/ada/src/stemwords.adb +70 -0
  26. data/vendor/snowball/ada/stemmer_config.gpr +83 -0
  27. data/vendor/snowball/ada/stemwords.gpr +21 -0
  28. data/vendor/snowball/algorithms/arabic.sbl +558 -0
  29. data/vendor/snowball/algorithms/armenian.sbl +301 -0
  30. data/vendor/snowball/algorithms/basque.sbl +149 -0
  31. data/vendor/snowball/algorithms/catalan.sbl +202 -0
  32. data/vendor/snowball/algorithms/danish.sbl +93 -0
  33. data/vendor/snowball/algorithms/dutch.sbl +164 -0
  34. data/vendor/snowball/algorithms/english.sbl +229 -0
  35. data/vendor/snowball/algorithms/finnish.sbl +197 -0
  36. data/vendor/snowball/algorithms/french.sbl +254 -0
  37. data/vendor/snowball/algorithms/german.sbl +139 -0
  38. data/vendor/snowball/algorithms/german2.sbl +145 -0
  39. data/vendor/snowball/algorithms/greek.sbl +701 -0
  40. data/vendor/snowball/algorithms/hindi.sbl +323 -0
  41. data/vendor/snowball/algorithms/hungarian.sbl +241 -0
  42. data/vendor/snowball/algorithms/indonesian.sbl +192 -0
  43. data/vendor/snowball/algorithms/irish.sbl +149 -0
  44. data/vendor/snowball/algorithms/italian.sbl +202 -0
  45. data/vendor/snowball/algorithms/kraaij_pohlmann.sbl +240 -0
  46. data/vendor/snowball/algorithms/lithuanian.sbl +373 -0
  47. data/vendor/snowball/algorithms/lovins.sbl +208 -0
  48. data/vendor/snowball/algorithms/nepali.sbl +92 -0
  49. data/vendor/snowball/algorithms/norwegian.sbl +80 -0
  50. data/vendor/snowball/algorithms/porter.sbl +139 -0
  51. data/vendor/snowball/algorithms/portuguese.sbl +218 -0
  52. data/vendor/snowball/algorithms/romanian.sbl +236 -0
  53. data/vendor/snowball/algorithms/russian.sbl +221 -0
  54. data/vendor/snowball/algorithms/serbian.sbl +2379 -0
  55. data/vendor/snowball/algorithms/spanish.sbl +230 -0
  56. data/vendor/snowball/algorithms/swedish.sbl +72 -0
  57. data/vendor/snowball/algorithms/tamil.sbl +405 -0
  58. data/vendor/snowball/algorithms/turkish.sbl +470 -0
  59. data/vendor/snowball/algorithms/yiddish.sbl +460 -0
  60. data/vendor/snowball/charsets/ISO-8859-2.sbl +98 -0
  61. data/vendor/snowball/charsets/KOI8-R.sbl +74 -0
  62. data/vendor/snowball/charsets/cp850.sbl +130 -0
  63. data/vendor/snowball/compiler/analyser.c +1547 -0
  64. data/vendor/snowball/compiler/driver.c +615 -0
  65. data/vendor/snowball/compiler/generator.c +1748 -0
  66. data/vendor/snowball/compiler/generator_ada.c +1702 -0
  67. data/vendor/snowball/compiler/generator_csharp.c +1322 -0
  68. data/vendor/snowball/compiler/generator_go.c +1278 -0
  69. data/vendor/snowball/compiler/generator_java.c +1313 -0
  70. data/vendor/snowball/compiler/generator_js.c +1316 -0
  71. data/vendor/snowball/compiler/generator_pascal.c +1387 -0
  72. data/vendor/snowball/compiler/generator_python.c +1337 -0
  73. data/vendor/snowball/compiler/generator_rust.c +1295 -0
  74. data/vendor/snowball/compiler/header.h +418 -0
  75. data/vendor/snowball/compiler/space.c +286 -0
  76. data/vendor/snowball/compiler/syswords.h +86 -0
  77. data/vendor/snowball/compiler/syswords2.h +13 -0
  78. data/vendor/snowball/compiler/tokeniser.c +567 -0
  79. data/vendor/snowball/csharp/.gitignore +8 -0
  80. data/vendor/snowball/csharp/Snowball/Algorithms/.gitignore +1 -0
  81. data/vendor/snowball/csharp/Snowball/Among.cs +108 -0
  82. data/vendor/snowball/csharp/Snowball/AssemblyInfo.cs +36 -0
  83. data/vendor/snowball/csharp/Snowball/Stemmer.cs +660 -0
  84. data/vendor/snowball/csharp/Stemwords/App.config +6 -0
  85. data/vendor/snowball/csharp/Stemwords/Program.cs +114 -0
  86. data/vendor/snowball/doc/TODO +12 -0
  87. data/vendor/snowball/doc/libstemmer_c_README +148 -0
  88. data/vendor/snowball/doc/libstemmer_csharp_README +53 -0
  89. data/vendor/snowball/doc/libstemmer_java_README +67 -0
  90. data/vendor/snowball/doc/libstemmer_js_README +48 -0
  91. data/vendor/snowball/doc/libstemmer_python_README +113 -0
  92. data/vendor/snowball/examples/stemwords.c +204 -0
  93. data/vendor/snowball/go/README.md +55 -0
  94. data/vendor/snowball/go/among.go +16 -0
  95. data/vendor/snowball/go/env.go +403 -0
  96. data/vendor/snowball/go/stemwords/generate.go +68 -0
  97. data/vendor/snowball/go/stemwords/main.go +68 -0
  98. data/vendor/snowball/go/util.go +34 -0
  99. data/vendor/snowball/iconv.py +50 -0
  100. data/vendor/snowball/include/libstemmer.h +78 -0
  101. data/vendor/snowball/java/org/tartarus/snowball/Among.java +29 -0
  102. data/vendor/snowball/java/org/tartarus/snowball/SnowballProgram.java +381 -0
  103. data/vendor/snowball/java/org/tartarus/snowball/SnowballStemmer.java +8 -0
  104. data/vendor/snowball/java/org/tartarus/snowball/TestApp.java +75 -0
  105. data/vendor/snowball/javascript/base-stemmer.js +294 -0
  106. data/vendor/snowball/javascript/stemwords.js +106 -0
  107. data/vendor/snowball/libstemmer/libstemmer_c.in +96 -0
  108. data/vendor/snowball/libstemmer/mkalgorithms.pl +90 -0
  109. data/vendor/snowball/libstemmer/mkmodules.pl +267 -0
  110. data/vendor/snowball/libstemmer/modules.txt +63 -0
  111. data/vendor/snowball/libstemmer/test.c +34 -0
  112. data/vendor/snowball/pascal/.gitignore +4 -0
  113. data/vendor/snowball/pascal/SnowballProgram.pas +430 -0
  114. data/vendor/snowball/pascal/generate.pl +23 -0
  115. data/vendor/snowball/pascal/stemwords-template.dpr +78 -0
  116. data/vendor/snowball/python/MANIFEST.in +7 -0
  117. data/vendor/snowball/python/create_init.py +54 -0
  118. data/vendor/snowball/python/setup.cfg +6 -0
  119. data/vendor/snowball/python/setup.py +81 -0
  120. data/vendor/snowball/python/snowballstemmer/among.py +13 -0
  121. data/vendor/snowball/python/snowballstemmer/basestemmer.py +323 -0
  122. data/vendor/snowball/python/stemwords.py +101 -0
  123. data/vendor/snowball/python/testapp.py +28 -0
  124. data/vendor/snowball/runtime/api.c +58 -0
  125. data/vendor/snowball/runtime/api.h +32 -0
  126. data/vendor/snowball/runtime/header.h +61 -0
  127. data/vendor/snowball/runtime/utilities.c +513 -0
  128. data/vendor/snowball/rust/Cargo.toml +7 -0
  129. data/vendor/snowball/rust/build.rs +55 -0
  130. data/vendor/snowball/rust/rust-pre-1.27-compat.patch +30 -0
  131. data/vendor/snowball/rust/src/main.rs +102 -0
  132. data/vendor/snowball/rust/src/snowball/algorithms/mod.rs +2 -0
  133. data/vendor/snowball/rust/src/snowball/among.rs +6 -0
  134. data/vendor/snowball/rust/src/snowball/mod.rs +6 -0
  135. data/vendor/snowball/rust/src/snowball/snowball_env.rs +421 -0
  136. data/vendor/snowball/tests/stemtest.c +95 -0
  137. metadata +178 -0
@@ -0,0 +1,37 @@
1
+ Snowball is a small string processing language for creating stemming algorithms
2
+ for use in Information Retrieval, plus a collection of stemming algorithms
3
+ implemented using it.
4
+
5
+ Snowball was originally designed and built by Martin Porter. Martin retired
6
+ from development in 2014 and Snowball is now maintained as a community project.
7
+ Martin originally chose the name Snowball as a tribute to SNOBOL, the excellent
8
+ string handling language from the 1960s. It now also serves as a metaphor for
9
+ how the project grows by gathering contributions over time.
10
+
11
+ The Snowball compiler translates a Snowball program into source code in another
12
+ language - currently Ada, ISO C, C#, Go, Java, Javascript, Object Pascal,
13
+ Python and Rust are supported.
14
+
15
+ This repository contains the source code for the snowball compiler and the
16
+ stemming algorithms. The snowball compiler is written in ISO C - you'll need
17
+ a C compiler which supports C99 to build it (but the C code it generates should
18
+ work with any ISO C compiler).
19
+
20
+ See https://snowballstem.org/ for more information about Snowball.
21
+
22
+ What is Stemming?
23
+ =================
24
+
25
+ Stemming maps different forms of the same word to a common "stem" - for
26
+ example, the English stemmer maps *connection*, *connections*, *connective*,
27
+ *connected*, and *connecting* to *connect*. So a search for *connected*
28
+ would also find documents which only have the other forms.
29
+
30
+ This stem form is often a word itself, but this is not always the case as this
31
+ is not a requirement for text search systems, which are the intended field of
32
+ use. We also aim to conflate words with the same meaning, rather than all
33
+ words with a common linguistic root (so *awe* and *awful* don't have the same
34
+ stem), and over-stemming is more problematic than under-stemming so we tend not
35
+ to stem in cases that are hard to resolve. If you want to always reduce words
36
+ to a root form and/or get a root form which is itself a word then Snowball's
37
+ stemming algorithms likely aren't the right answer.
@@ -0,0 +1,74 @@
1
+ # Ada Target for Snowball
2
+
3
+ The Ada Snowball generator generates an Ada child package for each Snowball algorithm.
4
+ The parent package is named `Stemmer` and it provides various operations used by the generated code.
5
+ The `Stemmer` package contains the Ada Snowball runtime available either in `ada/src` directory
6
+ or from https://github.com/stcarrez/ada-stemmer.
7
+
8
+ The generated child package declares the `Context_Type` tagged type and the `Stem` procedure:
9
+
10
+ ```Ada
11
+ package Stemmer.<Algorithm-name> is
12
+ type Context_Type is new Stemmer.Context_Type with private;
13
+ procedure Stem (Z : in out Context_Type; Result : out Boolean);
14
+ private
15
+ type Context_Type is new Stemmer.Context_Type with record
16
+ ...
17
+ end record;
18
+ end Stemmer.<Algorithm-name>;
19
+ ```
20
+
21
+ It is possible to use the generated operation directly or to use it through the `Stemmer.Factory` package.
22
+
23
+ ## Usage
24
+
25
+ To generate Ada source for a Snowball algorithm:
26
+ ```
27
+ $ snowball path/to/algorithm.sbl -ada -P <algorithm-name> -o src/stemmer-<algorithm>
28
+ ```
29
+
30
+ ### Ada-specific options
31
+
32
+ `-P <Algorithm-name>` the child package name used in the generated Ada file (defaults to `snowball`).
33
+ It must be a valid Ada identifier.
34
+
35
+ ## Code Organization
36
+
37
+ `compiler/generator_ada.c` has the Ada code generation logic
38
+
39
+ `ada/src` contains the default Ada Snowball runtime support which is also
40
+ available at https://github.com/stcarrez/ada-stemmer
41
+
42
+ `ada/algorithms` is the location where the makefile-generated code will end up
43
+
44
+ ## Using the Generated Stemmers
45
+
46
+ To use the generated stemmer, import the Ada generated package, declare an instance
47
+ of the generated `Context_Type` and call the `Stem_Word` procedure.
48
+
49
+ ```
50
+ with Stemmer.English;
51
+
52
+ Ctx : Stemmer.English.Context_Type;
53
+ Result : Boolean;
54
+
55
+ Ctx.Stem_Word ("zealously", Result);
56
+ if Result then
57
+ Ada.Text_IO.Put_Line (Ctx.Get_Result);
58
+ end if;
59
+ ```
60
+
61
+ You can use the context as many times as you want.
62
+
63
+ ## Testing
64
+
65
+ To run the tests, you will need an Ada compiler such as GNAT as well as the `gprbuild` build tool.
66
+
67
+ Only the existing Snowball algorithms have been used for testing. This does not exercise all features of the language.
68
+
69
+ Run:
70
+
71
+ ```
72
+ $ make check_ada
73
+ ```
74
+
@@ -0,0 +1,83 @@
1
+ with Ada.Characters.Handling;
2
+ with Ada.Text_IO;
3
+ with Ada.Command_Line;
4
+ with Ada.Containers.Indefinite_Vectors;
5
+ procedure Generate is
6
+ -- Writes stemmer-factory.ads and stemmer-factory.adb for the languages named by the command-line arguments.
7
+ use Ada.Characters.Handling;
8
+ use Ada.Text_IO;
9
+
10
+ package String_Vectors is new Ada.Containers.Indefinite_Vectors
11
+ (Element_Type => String,
12
+ Index_Type => Positive);
13
+ -- Language names, stored lower-cased by the main body at the end of this procedure.
14
+ Languages : String_Vectors.Vector;
15
+ -- Returns S with its first character upper-cased. NOTE(review): S (S'First) is indexed unconditionally; confirm S is never empty.
16
+ function Capitalize (S : in String) return String is
17
+ (To_Upper (S (S'First)) & S (S'First + 1 .. S'Last));
18
+ -- Creates stemmer-factory.ads: the Language_Type enumeration (one L_<NAME> literal per language) and the Stem declaration.
19
+ procedure Write_Spec is
20
+ File : File_Type;
21
+ I : Natural := 0;  -- number of enumeration literals written so far
22
+ begin
23
+ Create (File, Out_File, "stemmer-factory.ads");
24
+ Put_Line (File, "package Stemmer.Factory with SPARK_Mode is");
25
+ New_Line (File);
26
+ Put (File, " type Language_Type is (");
27
+ for Lang of Languages loop
28
+ Put (File, "L_" & To_Upper (Lang));
29
+ I := I + 1;
30
+ if I < Natural (Languages.Length) then  -- not the last literal: emit a separator and start a new line
31
+ Put_Line (File, ",");
32
+ Put (File, " ");
33
+ end if;
34
+ end loop;
35
+ Put_Line (File, ");");  -- NOTE(review): with no languages the emitted literal list is empty "()"; confirm at least one argument is always given
36
+ New_Line (File);
37
+ Put_Line (File, " function Stem (Language : in Language_Type;");
38
+ Put_Line (File, " Word : in String) return String;");
39
+ New_Line (File);
40
+ Put_Line (File, "end Stemmer.Factory;");
41
+ Close (File);
42
+ end Write_Spec;
43
+ -- Creates stemmer-factory.adb: one "with Stemmer.<Lang>;" clause per language and a Stem body with one case branch per language.
44
+ procedure Write_Body is
45
+ File : File_Type;
46
+ begin
47
+ Create (File, Out_File, "stemmer-factory.adb");
48
+ for Lang of Languages loop  -- context clauses for every generated stemmer package
49
+ Put_Line (File, "with Stemmer." & Capitalize (Lang) & ";");
50
+ end loop;
51
+ Put_Line (File, "package body Stemmer.Factory with SPARK_Mode is");
52
+ New_Line (File);
53
+ Put_Line (File, " function Stem (Language : in Language_Type;");
54
+ Put_Line (File, " Word : in String) return String is");
55
+ Put_Line (File, " Result : Boolean := False;");
56
+ Put_Line (File, " begin");
57
+ Put_Line (File, " case Language is");
58
+ for Lang of Languages loop  -- one "when" branch per language, each using its own Context_Type
59
+ Put_Line (File, " when L_" & To_Upper (Lang) & " =>");
60
+ Put_Line (File, " declare");
61
+ Put_Line (File, " C : Stemmer." & Capitalize (Lang) & ".Context_Type;");
62
+ Put_Line (File, " begin");
63
+ Put_Line (File, " C.Stem_Word (Word, Result);");
64
+ Put_Line (File, " return Get_Result (C);");
65
+ Put_Line (File, " end;");
66
+ New_Line (File);
67
+ end loop;
68
+ Put_Line (File, " end case;");
69
+ Put_Line (File, " end Stem;");
70
+ New_Line (File);
71
+ Put_Line (File, "end Stemmer.Factory;");
72
+ Close (File);
73
+ end Write_Body;
74
+ -- Number of command-line arguments; each argument is taken as one language name.
75
+ Count : constant Natural := Ada.Command_Line.Argument_Count;
76
+ -- Main body: collect the arguments lower-cased, then write the spec followed by the body.
77
+ begin
78
+ for I in 1 .. Count loop
79
+ Languages.Append (To_Lower (Ada.Command_Line.Argument (I)));
80
+ end loop;
81
+ Write_Spec;
82
+ Write_Body;
83
+ end Generate;
@@ -0,0 +1,21 @@
1
+ with "stemmer_config";
2
+ project Generate is
3
+ -- Project for the generate.adb main program; its sources are taken from the "generate" directory.
4
+ Mains := ("generate.adb");
5
+
6
+ for Main use Mains;
7
+
8
+ for Source_Dirs use ("generate");
9
+ -- Object and executable directories are built from the corresponding Stemmer_Config attributes.
10
+ for Object_Dir use "./" & Stemmer_Config'Object_Dir & "/obj";
11
+ for Exec_Dir use "./" & Stemmer_Config'Exec_Dir & "/bin";
12
+ -- The Binder, Builder, Compiler and Linker packages are renamings of those in Stemmer_Config.
13
+ package Binder renames Stemmer_Config.Binder;
14
+
15
+ package Builder renames Stemmer_Config.Builder;
16
+
17
+ package Compiler renames Stemmer_Config.Compiler;
18
+
19
+ package Linker renames Stemmer_Config.Linker;
20
+
21
+ end Generate;