mittens 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (137) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +3 -0
  3. data/Gemfile +7 -0
  4. data/LICENSE.txt +30 -0
  5. data/README.md +62 -0
  6. data/Rakefile +21 -0
  7. data/ext/mittens/ext.c +96 -0
  8. data/ext/mittens/extconf.rb +12 -0
  9. data/lib/mittens/version.rb +3 -0
  10. data/lib/mittens.rb +7 -0
  11. data/mittens.gemspec +22 -0
  12. data/vendor/snowball/.gitignore +26 -0
  13. data/vendor/snowball/.travis.yml +112 -0
  14. data/vendor/snowball/AUTHORS +27 -0
  15. data/vendor/snowball/CONTRIBUTING.rst +216 -0
  16. data/vendor/snowball/COPYING +29 -0
  17. data/vendor/snowball/GNUmakefile +742 -0
  18. data/vendor/snowball/NEWS +754 -0
  19. data/vendor/snowball/README.rst +37 -0
  20. data/vendor/snowball/ada/README.md +74 -0
  21. data/vendor/snowball/ada/generate/generate.adb +83 -0
  22. data/vendor/snowball/ada/generate.gpr +21 -0
  23. data/vendor/snowball/ada/src/stemmer.adb +620 -0
  24. data/vendor/snowball/ada/src/stemmer.ads +219 -0
  25. data/vendor/snowball/ada/src/stemwords.adb +70 -0
  26. data/vendor/snowball/ada/stemmer_config.gpr +83 -0
  27. data/vendor/snowball/ada/stemwords.gpr +21 -0
  28. data/vendor/snowball/algorithms/arabic.sbl +558 -0
  29. data/vendor/snowball/algorithms/armenian.sbl +301 -0
  30. data/vendor/snowball/algorithms/basque.sbl +149 -0
  31. data/vendor/snowball/algorithms/catalan.sbl +202 -0
  32. data/vendor/snowball/algorithms/danish.sbl +93 -0
  33. data/vendor/snowball/algorithms/dutch.sbl +164 -0
  34. data/vendor/snowball/algorithms/english.sbl +229 -0
  35. data/vendor/snowball/algorithms/finnish.sbl +197 -0
  36. data/vendor/snowball/algorithms/french.sbl +254 -0
  37. data/vendor/snowball/algorithms/german.sbl +139 -0
  38. data/vendor/snowball/algorithms/german2.sbl +145 -0
  39. data/vendor/snowball/algorithms/greek.sbl +701 -0
  40. data/vendor/snowball/algorithms/hindi.sbl +323 -0
  41. data/vendor/snowball/algorithms/hungarian.sbl +241 -0
  42. data/vendor/snowball/algorithms/indonesian.sbl +192 -0
  43. data/vendor/snowball/algorithms/irish.sbl +149 -0
  44. data/vendor/snowball/algorithms/italian.sbl +202 -0
  45. data/vendor/snowball/algorithms/kraaij_pohlmann.sbl +240 -0
  46. data/vendor/snowball/algorithms/lithuanian.sbl +373 -0
  47. data/vendor/snowball/algorithms/lovins.sbl +208 -0
  48. data/vendor/snowball/algorithms/nepali.sbl +92 -0
  49. data/vendor/snowball/algorithms/norwegian.sbl +80 -0
  50. data/vendor/snowball/algorithms/porter.sbl +139 -0
  51. data/vendor/snowball/algorithms/portuguese.sbl +218 -0
  52. data/vendor/snowball/algorithms/romanian.sbl +236 -0
  53. data/vendor/snowball/algorithms/russian.sbl +221 -0
  54. data/vendor/snowball/algorithms/serbian.sbl +2379 -0
  55. data/vendor/snowball/algorithms/spanish.sbl +230 -0
  56. data/vendor/snowball/algorithms/swedish.sbl +72 -0
  57. data/vendor/snowball/algorithms/tamil.sbl +405 -0
  58. data/vendor/snowball/algorithms/turkish.sbl +470 -0
  59. data/vendor/snowball/algorithms/yiddish.sbl +460 -0
  60. data/vendor/snowball/charsets/ISO-8859-2.sbl +98 -0
  61. data/vendor/snowball/charsets/KOI8-R.sbl +74 -0
  62. data/vendor/snowball/charsets/cp850.sbl +130 -0
  63. data/vendor/snowball/compiler/analyser.c +1547 -0
  64. data/vendor/snowball/compiler/driver.c +615 -0
  65. data/vendor/snowball/compiler/generator.c +1748 -0
  66. data/vendor/snowball/compiler/generator_ada.c +1702 -0
  67. data/vendor/snowball/compiler/generator_csharp.c +1322 -0
  68. data/vendor/snowball/compiler/generator_go.c +1278 -0
  69. data/vendor/snowball/compiler/generator_java.c +1313 -0
  70. data/vendor/snowball/compiler/generator_js.c +1316 -0
  71. data/vendor/snowball/compiler/generator_pascal.c +1387 -0
  72. data/vendor/snowball/compiler/generator_python.c +1337 -0
  73. data/vendor/snowball/compiler/generator_rust.c +1295 -0
  74. data/vendor/snowball/compiler/header.h +418 -0
  75. data/vendor/snowball/compiler/space.c +286 -0
  76. data/vendor/snowball/compiler/syswords.h +86 -0
  77. data/vendor/snowball/compiler/syswords2.h +13 -0
  78. data/vendor/snowball/compiler/tokeniser.c +567 -0
  79. data/vendor/snowball/csharp/.gitignore +8 -0
  80. data/vendor/snowball/csharp/Snowball/Algorithms/.gitignore +1 -0
  81. data/vendor/snowball/csharp/Snowball/Among.cs +108 -0
  82. data/vendor/snowball/csharp/Snowball/AssemblyInfo.cs +36 -0
  83. data/vendor/snowball/csharp/Snowball/Stemmer.cs +660 -0
  84. data/vendor/snowball/csharp/Stemwords/App.config +6 -0
  85. data/vendor/snowball/csharp/Stemwords/Program.cs +114 -0
  86. data/vendor/snowball/doc/TODO +12 -0
  87. data/vendor/snowball/doc/libstemmer_c_README +148 -0
  88. data/vendor/snowball/doc/libstemmer_csharp_README +53 -0
  89. data/vendor/snowball/doc/libstemmer_java_README +67 -0
  90. data/vendor/snowball/doc/libstemmer_js_README +48 -0
  91. data/vendor/snowball/doc/libstemmer_python_README +113 -0
  92. data/vendor/snowball/examples/stemwords.c +204 -0
  93. data/vendor/snowball/go/README.md +55 -0
  94. data/vendor/snowball/go/among.go +16 -0
  95. data/vendor/snowball/go/env.go +403 -0
  96. data/vendor/snowball/go/stemwords/generate.go +68 -0
  97. data/vendor/snowball/go/stemwords/main.go +68 -0
  98. data/vendor/snowball/go/util.go +34 -0
  99. data/vendor/snowball/iconv.py +50 -0
  100. data/vendor/snowball/include/libstemmer.h +78 -0
  101. data/vendor/snowball/java/org/tartarus/snowball/Among.java +29 -0
  102. data/vendor/snowball/java/org/tartarus/snowball/SnowballProgram.java +381 -0
  103. data/vendor/snowball/java/org/tartarus/snowball/SnowballStemmer.java +8 -0
  104. data/vendor/snowball/java/org/tartarus/snowball/TestApp.java +75 -0
  105. data/vendor/snowball/javascript/base-stemmer.js +294 -0
  106. data/vendor/snowball/javascript/stemwords.js +106 -0
  107. data/vendor/snowball/libstemmer/libstemmer_c.in +96 -0
  108. data/vendor/snowball/libstemmer/mkalgorithms.pl +90 -0
  109. data/vendor/snowball/libstemmer/mkmodules.pl +267 -0
  110. data/vendor/snowball/libstemmer/modules.txt +63 -0
  111. data/vendor/snowball/libstemmer/test.c +34 -0
  112. data/vendor/snowball/pascal/.gitignore +4 -0
  113. data/vendor/snowball/pascal/SnowballProgram.pas +430 -0
  114. data/vendor/snowball/pascal/generate.pl +23 -0
  115. data/vendor/snowball/pascal/stemwords-template.dpr +78 -0
  116. data/vendor/snowball/python/MANIFEST.in +7 -0
  117. data/vendor/snowball/python/create_init.py +54 -0
  118. data/vendor/snowball/python/setup.cfg +6 -0
  119. data/vendor/snowball/python/setup.py +81 -0
  120. data/vendor/snowball/python/snowballstemmer/among.py +13 -0
  121. data/vendor/snowball/python/snowballstemmer/basestemmer.py +323 -0
  122. data/vendor/snowball/python/stemwords.py +101 -0
  123. data/vendor/snowball/python/testapp.py +28 -0
  124. data/vendor/snowball/runtime/api.c +58 -0
  125. data/vendor/snowball/runtime/api.h +32 -0
  126. data/vendor/snowball/runtime/header.h +61 -0
  127. data/vendor/snowball/runtime/utilities.c +513 -0
  128. data/vendor/snowball/rust/Cargo.toml +7 -0
  129. data/vendor/snowball/rust/build.rs +55 -0
  130. data/vendor/snowball/rust/rust-pre-1.27-compat.patch +30 -0
  131. data/vendor/snowball/rust/src/main.rs +102 -0
  132. data/vendor/snowball/rust/src/snowball/algorithms/mod.rs +2 -0
  133. data/vendor/snowball/rust/src/snowball/among.rs +6 -0
  134. data/vendor/snowball/rust/src/snowball/mod.rs +6 -0
  135. data/vendor/snowball/rust/src/snowball/snowball_env.rs +421 -0
  136. data/vendor/snowball/tests/stemtest.c +95 -0
  137. metadata +178 -0
@@ -0,0 +1,267 @@
1
+ #!/usr/bin/env perl
2
+ use strict;
3
+ use 5.006;
4
+ use warnings;
5
+
6
# Name this script was invoked as; used only in the usage message.
my $progname = $0;

# Four mandatory arguments, plus at most one optional encoding filter.
my $argc = scalar @ARGV;
unless ($argc == 4 || $argc == 5) {
    print "Usage: $progname <outfile> <C source directory> <modules description file> <source list file> [<enc>]\n";
    exit 1;
}

# Consume the mandatory arguments in order; an optional fifth one stays
# behind in @ARGV.
my ($outname, $c_src_dir, $descfile, $srclistfile) = splice(@ARGV, 0, 4);

# Optional encoding filter.  addalgenc() compares it against the lower-cased,
# underscore-free form of each encoding name, and "_<enc>" is appended to the
# libstemmer file names listed by printsrclist().
my $enc_only;
my $extn = '';
if (@ARGV) {
    $enc_only = shift @ARGV;
    $extn = "_$enc_only";
}

# alias name => algorithm name
my %aliases;
# algorithm name => 1
my %algorithms;
# algorithm name => reference to { encoding name => 1 }
my %algorithm_encs;

# encoding name => 1, for every encoding accepted by addalgenc()
my %encs;
29
+
30
# Record that algorithm $alg is available in encoding $enc.
#
# When an encoding filter ($enc_only) is active, the pair is ignored unless
# the lower-cased encoding name with underscores removed equals the filter.
# Otherwise the encoding is added to the algorithm's set in %algorithm_encs
# and to the global set %encs.
sub addalgenc($$) {
    my ($alg, $enc) = @_;

    if (defined $enc_only) {
        # Normalise e.g. "UTF_8" to "utf8" before comparing with the filter.
        (my $normalised = lc $enc) =~ tr/_//d;
        return if $normalised ne $enc_only;
    }

    # Create the per-algorithm encoding set on first use.
    my $enc_set = ($algorithm_encs{$alg} ||= {});
    $enc_set->{$enc} = 1;

    $encs{$enc} = 1;
}
52
+
53
# Parse the modules description file named by $descfile.
#
# Lines that are blank or whose first non-space character is '#' are
# skipped.  Every other line holds whitespace-separated fields:
#
#   <algorithm> <comma-separated encodings> <comma-separated aliases>
#
# Only the first three fields are used; anything after them is ignored.
# For each line the algorithm is recorded in %algorithms, every alias is
# mapped to the algorithm in %aliases, and every encoding is passed to
# addalgenc().
#
# Dies if the file cannot be opened or closed, or if a line has fewer than
# three fields.
sub readinput()
{
    # Three-argument open with a lexical handle: the path is never parsed for
    # mode characters, and a missing file is reported instead of silently
    # producing empty output.
    open(my $desc_fh, '<', $descfile)
        or die "Can't open modules description file `$descfile': $!\n";

    while (my $line = <$desc_fh>) {
        next if $line =~ m/^\s*#/;
        next if $line =~ m/^\s*$/;

        # split ' ' ignores leading whitespace, so an indented line does not
        # yield an empty first field.
        my ($alg, $encstr, $aliases) = split(' ', $line);
        defined $aliases
            or die "$descfile:$.: expected `<algorithm> <encodings> <aliases>'\n";

        $algorithms{$alg} = 1;
        foreach my $alias (split(/,/, $aliases)) {
            $aliases{$alias} = $alg;
            foreach my $enc (split(/,/, $encstr)) {
                addalgenc($alg, $enc);
            }
        }
    }

    close($desc_fh) or die "Can't close ${descfile}: $!\n";
}
75
+
76
# Write the generated C header to $outname.
#
# The file contains, in order:
#   - a comment naming every algorithm in %algorithms (sorted, wrapped so
#     that lines stay within 77 columns),
#   - one #include per (algorithm, encoding) pair in %algorithm_encs,
#   - the stemmer_encoding_t enum and the encodings[] name table built from
#     %encs,
#   - the modules[] table with one row per (alias, encoding) pair,
#   - the algorithm_names[] list.
#
# Dies if the output file cannot be opened or closed.
sub printoutput()
{
    # Three-argument open: $outname is taken verbatim as a path rather than
    # being parsed together with the mode characters.
    open(my $out, '>', $outname)
        or die "Can't open output file `$outname': $!\n";

    print {$out} <<EOS;
/* $outname: List of stemming modules.
*
* This file is generated by mkmodules.pl from a list of module names.
* Do not edit manually.
*
EOS

    my $line = " * Modules included by this file are: ";
    print {$out} $line;
    my $linelen = length($line);

    my $need_sep = 0;
    my @algorithms = sort keys(%algorithms);
    foreach my $lang (@algorithms) {
        if ($need_sep) {
            if (($linelen + 2 + length($lang)) > 77) {
                # Name would overflow the line: continue the comment on a
                # fresh line and restart the column count.
                print {$out} ",\n * ";
                $linelen = 3;
            } else {
                print {$out} ', ';
                $linelen += 2;
            }
        }
        print {$out} $lang;
        $linelen += length($lang);
        $need_sep = 1;
    }
    print {$out} "\n */\n\n";

    foreach my $lang (@algorithms) {
        # An algorithm has no entry here when the encoding filter rejected
        # all of its encodings; treat that as an empty set.
        my $enc_set = $algorithm_encs{$lang} || {};
        foreach my $enc (sort keys %$enc_set) {
            print {$out} "#include \"../$c_src_dir/stem_${enc}_${lang}.h\"\n";
        }
    }

    print {$out} <<EOS;

typedef enum {
ENC_UNKNOWN=0,
EOS

    # Enumerators are comma-separated with no trailing newline; the blank
    # first line of the next here-document terminates the last one.
    print {$out} join(",\n", map(" ENC_$_", sort keys %encs));

    print {$out} <<EOS;

} stemmer_encoding_t;

struct stemmer_encoding {
const char * name;
stemmer_encoding_t enc;
};
static const struct stemmer_encoding encodings[] = {
EOS

    foreach my $enc (sort keys %encs) {
        print {$out} " {\"${enc}\", ENC_${enc}},\n";
    }

    print {$out} <<EOS;
{0,ENC_UNKNOWN}
};

struct stemmer_modules {
const char * name;
stemmer_encoding_t enc;
struct SN_env * (*create)(void);
void (*close)(struct SN_env *);
int (*stem)(struct SN_env *);
};
static const struct stemmer_modules modules[] = {
EOS

    # One row per alias and encoding; the function names are derived from
    # the algorithm the alias points at, not from the alias itself.
    foreach my $alias (sort keys %aliases) {
        my $alg = $aliases{$alias};
        my $enc_set = $algorithm_encs{$alg} || {};
        foreach my $enc (sort keys %$enc_set) {
            my $p = "${alg}_${enc}";
            print {$out} " {\"$alias\", ENC_$enc, ${p}_create_env, ${p}_close_env, ${p}_stem},\n";
        }
    }

    print {$out} <<EOS;
{0,ENC_UNKNOWN,0,0,0}
};
static const char * algorithm_names[] = {
EOS

    foreach my $lang (@algorithms) {
        print {$out} " \"$lang\", \n";
    }

    print {$out} <<EOS;
0
};
EOS

    close($out) or die "Can't close ${outname}: $!\n";
}
186
+
187
# Write the make-style source list to $srclistfile.
#
# The file contains a comment naming every algorithm in %algorithms (sorted,
# wrapped so that lines stay within 77 columns), followed by two variable
# assignments:
#   snowball_sources - one stem_<enc>_<algorithm>.c per (algorithm, encoding)
#                      pair, then the runtime and libstemmer C files;
#   snowball_headers - the matching stem_<enc>_<algorithm>.h files, then the
#                      public, module-table and runtime headers.
#
# Generated stemmer files are keyed by algorithm name, so the lists are built
# from the algorithm names.  Iterating over alias names instead would drop
# any algorithm that has no alias spelled exactly like its own name.
#
# NOTE(review): the "src_c" directory is hard-coded here, while the generated
# header written by printoutput() uses $c_src_dir - confirm whether the two
# are meant to agree.
#
# Dies if the output file cannot be opened or closed.
sub printsrclist()
{
    # Three-argument open: $srclistfile is taken verbatim as a path.
    open(my $out, '>', $srclistfile)
        or die "Can't open output file `$srclistfile': $!\n";

    print {$out} <<EOS;
# $srclistfile: List of stemming module source files
#
# This file is generated by mkmodules.pl from a list of module names.
# Do not edit manually.
#
EOS

    my $line = "# Modules included by this file are: ";
    print {$out} $line;
    my $linelen = length($line);

    my $need_sep = 0;
    my @algorithms = sort keys(%algorithms);
    foreach my $lang (@algorithms) {
        if ($need_sep) {
            if (($linelen + 2 + length($lang)) > 77) {
                # Name would overflow the line: continue the comment on a
                # fresh line and restart the column count.
                print {$out} ",\n# ";
                $linelen = 3;
            } else {
                print {$out} ', ';
                $linelen += 2;
            }
        }
        print {$out} $lang;
        $linelen += length($lang);
        $need_sep = 1;
    }

    print {$out} "\n\nsnowball_sources= \\\n";
    foreach my $lang (@algorithms) {
        # No entry means the encoding filter rejected every encoding of this
        # algorithm; treat that as an empty set.
        my $enc_set = $algorithm_encs{$lang} || {};
        foreach my $enc (sort keys %$enc_set) {
            print {$out} " src_c/stem_${enc}_${lang}.c \\\n";
        }
    }

    # Fixed files: continuation backslashes go between entries only, so the
    # last entry ends the assignment.
    print {$out} join(" \\\n", map(" $_",
        'runtime/api.c',
        'runtime/utilities.c',
        "libstemmer/libstemmer${extn}.c"));

    print {$out} "\n\nsnowball_headers= \\\n";
    foreach my $lang (@algorithms) {
        my $enc_set = $algorithm_encs{$lang} || {};
        foreach my $enc (sort keys %$enc_set) {
            print {$out} " src_c/stem_${enc}_${lang}.h \\\n";
        }
    }

    print {$out} join(" \\\n", map(" $_",
        'include/libstemmer.h',
        "libstemmer/modules${extn}.h",
        'runtime/api.h',
        'runtime/header.h'));

    print {$out} "\n\n";
    close($out) or die "Can't close ${srclistfile}: $!\n";
}
264
+
265
+ readinput();
266
+ printoutput();
267
+ printsrclist();
@@ -0,0 +1,63 @@
1
+ # This file contains a list of stemmers to include in the distribution.
2
+ # The format is one stemmer per line, as space-separated items - on each line:
3
+ # First item is name of stemmer.
4
+ # Second item is comma separated list of character sets.
5
+ # Third item is comma separated list of names to refer to the stemmer by.
6
+ #
7
+ # Lines starting with a #, or blank lines, are ignored.
8
+
9
+ # List all the main algorithms for each language, in UTF-8, and also with
10
+ # the most commonly used encoding.
11
+
12
+ arabic UTF_8 arabic,ar,ara
13
+ armenian UTF_8 armenian,hy,hye,arm
14
+ basque UTF_8,ISO_8859_1 basque,eu,eus,baq
15
+ catalan UTF_8,ISO_8859_1 catalan,ca,cat
16
+ danish UTF_8,ISO_8859_1 danish,da,dan
17
+ dutch UTF_8,ISO_8859_1 dutch,nl,dut,nld
18
+ english UTF_8,ISO_8859_1 english,en,eng
19
+ finnish UTF_8,ISO_8859_1 finnish,fi,fin
20
+ french UTF_8,ISO_8859_1 french,fr,fre,fra
21
+ german UTF_8,ISO_8859_1 german,de,ger,deu
22
+ greek UTF_8 greek,el,gre,ell
23
+ hindi UTF_8 hindi,hi,hin
24
+ hungarian UTF_8,ISO_8859_2 hungarian,hu,hun
25
+ indonesian UTF_8,ISO_8859_1 indonesian,id,ind
26
+ irish UTF_8,ISO_8859_1 irish,ga,gle
27
+ italian UTF_8,ISO_8859_1 italian,it,ita
28
+ lithuanian UTF_8 lithuanian,lt,lit
29
+ nepali UTF_8 nepali,ne,nep
30
+ norwegian UTF_8,ISO_8859_1 norwegian,no,nor
31
+ portuguese UTF_8,ISO_8859_1 portuguese,pt,por
32
+ romanian UTF_8,ISO_8859_2 romanian,ro,rum,ron
33
+ russian UTF_8,KOI8_R russian,ru,rus
34
+ serbian UTF_8 serbian,sr,srp
35
+ spanish UTF_8,ISO_8859_1 spanish,es,esl,spa
36
+ swedish UTF_8,ISO_8859_1 swedish,sv,swe
37
+ tamil UTF_8 tamil,ta,tam
38
+ turkish UTF_8 turkish,tr,tur
39
+ yiddish UTF_8 yiddish,yi,yid
40
+
41
+ # Also include the traditional porter algorithm for english.
42
+ # The porter algorithm is included in the libstemmer distribution to assist
43
+ # with backwards compatibility, but for new systems the english algorithm
44
+ # should be used in preference.
45
+ porter UTF_8,ISO_8859_1 porter english
46
+
47
+ # Some other stemmers in the snowball project are not included in the standard
48
+ # distribution. To compile a libstemmer with them in, add them to this list,
49
+ # and regenerate the distribution. (You will need a full source checkout for
50
+ # this.) They are included in the snowball website as curiosities, but are not
51
+ # intended for general use, and use of them is not fully supported. These
52
+ # algorithms are:
53
+ #
54
+ # german2 - This is a slight modification of the german stemmer.
55
+ #german2 UTF_8,ISO_8859_1 german2 german
56
+ #
57
+ # kraaij_pohlmann - This is a different dutch stemmer.
58
+ #kraaij_pohlmann UTF_8,ISO_8859_1 kraaij_pohlmann dutch
59
+ #
60
+ # lovins - This is an english stemmer, but fairly outdated, and
61
+ # only really applicable to a restricted type of input text
62
+ # (keywords in academic publications).
63
+ #lovins UTF_8,ISO_8859_1 lovins english
@@ -0,0 +1,34 @@
1
+
2
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "libstemmer.h"
3
+
4
+ /* test code */
5
/* Write the failure message followed by a newline to stdout, then terminate
 * the process with exit status 1. */
void error(const char *err)
{
    puts(err);
    exit(1);
}
9
+
10
+ int main () {
11
+ const char * stemmed;
12
+ const char * unstemmed;
13
+ struct sb_stemmer * s;
14
+ const char ** list = sb_stemmer_list();
15
+ if (*list == 0) error("TEST FAIL: empty list of stemmers");
16
+
17
+ s = sb_stemmer_new("e");
18
+ if (s != 0) error("TEST FAIL: non zero return for unrecognised language");
19
+ s = sb_stemmer_new("english");
20
+ if (s == 0) error("TEST FAIL: zero return for recognised language");
21
+ sb_stemmer_delete(s);
22
+ s = sb_stemmer_new("en");
23
+ if (s == 0) error("TEST FAIL: zero return for recognised language");
24
+ unstemmed = "recognised";
25
+ stemmed = sb_stemmer_stem(s, unstemmed, 10);
26
+ printf("%s -> %s\n", unstemmed, stemmed);
27
+ if (sb_stemmer_length(s) != strlen(stemmed))
28
+ error("TEST FAIL: length not correct");
29
+ unstemmed = "recognized";
30
+ printf("%s -> %s\n", unstemmed, stemmed);
31
+ sb_stemmer_delete(s);
32
+ printf("Success\n");
33
+ return 0;
34
+ }
@@ -0,0 +1,4 @@
1
+ /*.ppu
2
+ /*Stemmer.pas
3
+ /stemwords.dpr
4
+ /stemwords