mittens 0.1.0

Files changed (137)
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +3 -0
  3. data/Gemfile +7 -0
  4. data/LICENSE.txt +30 -0
  5. data/README.md +62 -0
  6. data/Rakefile +21 -0
  7. data/ext/mittens/ext.c +96 -0
  8. data/ext/mittens/extconf.rb +12 -0
  9. data/lib/mittens/version.rb +3 -0
  10. data/lib/mittens.rb +7 -0
  11. data/mittens.gemspec +22 -0
  12. data/vendor/snowball/.gitignore +26 -0
  13. data/vendor/snowball/.travis.yml +112 -0
  14. data/vendor/snowball/AUTHORS +27 -0
  15. data/vendor/snowball/CONTRIBUTING.rst +216 -0
  16. data/vendor/snowball/COPYING +29 -0
  17. data/vendor/snowball/GNUmakefile +742 -0
  18. data/vendor/snowball/NEWS +754 -0
  19. data/vendor/snowball/README.rst +37 -0
  20. data/vendor/snowball/ada/README.md +74 -0
  21. data/vendor/snowball/ada/generate/generate.adb +83 -0
  22. data/vendor/snowball/ada/generate.gpr +21 -0
  23. data/vendor/snowball/ada/src/stemmer.adb +620 -0
  24. data/vendor/snowball/ada/src/stemmer.ads +219 -0
  25. data/vendor/snowball/ada/src/stemwords.adb +70 -0
  26. data/vendor/snowball/ada/stemmer_config.gpr +83 -0
  27. data/vendor/snowball/ada/stemwords.gpr +21 -0
  28. data/vendor/snowball/algorithms/arabic.sbl +558 -0
  29. data/vendor/snowball/algorithms/armenian.sbl +301 -0
  30. data/vendor/snowball/algorithms/basque.sbl +149 -0
  31. data/vendor/snowball/algorithms/catalan.sbl +202 -0
  32. data/vendor/snowball/algorithms/danish.sbl +93 -0
  33. data/vendor/snowball/algorithms/dutch.sbl +164 -0
  34. data/vendor/snowball/algorithms/english.sbl +229 -0
  35. data/vendor/snowball/algorithms/finnish.sbl +197 -0
  36. data/vendor/snowball/algorithms/french.sbl +254 -0
  37. data/vendor/snowball/algorithms/german.sbl +139 -0
  38. data/vendor/snowball/algorithms/german2.sbl +145 -0
  39. data/vendor/snowball/algorithms/greek.sbl +701 -0
  40. data/vendor/snowball/algorithms/hindi.sbl +323 -0
  41. data/vendor/snowball/algorithms/hungarian.sbl +241 -0
  42. data/vendor/snowball/algorithms/indonesian.sbl +192 -0
  43. data/vendor/snowball/algorithms/irish.sbl +149 -0
  44. data/vendor/snowball/algorithms/italian.sbl +202 -0
  45. data/vendor/snowball/algorithms/kraaij_pohlmann.sbl +240 -0
  46. data/vendor/snowball/algorithms/lithuanian.sbl +373 -0
  47. data/vendor/snowball/algorithms/lovins.sbl +208 -0
  48. data/vendor/snowball/algorithms/nepali.sbl +92 -0
  49. data/vendor/snowball/algorithms/norwegian.sbl +80 -0
  50. data/vendor/snowball/algorithms/porter.sbl +139 -0
  51. data/vendor/snowball/algorithms/portuguese.sbl +218 -0
  52. data/vendor/snowball/algorithms/romanian.sbl +236 -0
  53. data/vendor/snowball/algorithms/russian.sbl +221 -0
  54. data/vendor/snowball/algorithms/serbian.sbl +2379 -0
  55. data/vendor/snowball/algorithms/spanish.sbl +230 -0
  56. data/vendor/snowball/algorithms/swedish.sbl +72 -0
  57. data/vendor/snowball/algorithms/tamil.sbl +405 -0
  58. data/vendor/snowball/algorithms/turkish.sbl +470 -0
  59. data/vendor/snowball/algorithms/yiddish.sbl +460 -0
  60. data/vendor/snowball/charsets/ISO-8859-2.sbl +98 -0
  61. data/vendor/snowball/charsets/KOI8-R.sbl +74 -0
  62. data/vendor/snowball/charsets/cp850.sbl +130 -0
  63. data/vendor/snowball/compiler/analyser.c +1547 -0
  64. data/vendor/snowball/compiler/driver.c +615 -0
  65. data/vendor/snowball/compiler/generator.c +1748 -0
  66. data/vendor/snowball/compiler/generator_ada.c +1702 -0
  67. data/vendor/snowball/compiler/generator_csharp.c +1322 -0
  68. data/vendor/snowball/compiler/generator_go.c +1278 -0
  69. data/vendor/snowball/compiler/generator_java.c +1313 -0
  70. data/vendor/snowball/compiler/generator_js.c +1316 -0
  71. data/vendor/snowball/compiler/generator_pascal.c +1387 -0
  72. data/vendor/snowball/compiler/generator_python.c +1337 -0
  73. data/vendor/snowball/compiler/generator_rust.c +1295 -0
  74. data/vendor/snowball/compiler/header.h +418 -0
  75. data/vendor/snowball/compiler/space.c +286 -0
  76. data/vendor/snowball/compiler/syswords.h +86 -0
  77. data/vendor/snowball/compiler/syswords2.h +13 -0
  78. data/vendor/snowball/compiler/tokeniser.c +567 -0
  79. data/vendor/snowball/csharp/.gitignore +8 -0
  80. data/vendor/snowball/csharp/Snowball/Algorithms/.gitignore +1 -0
  81. data/vendor/snowball/csharp/Snowball/Among.cs +108 -0
  82. data/vendor/snowball/csharp/Snowball/AssemblyInfo.cs +36 -0
  83. data/vendor/snowball/csharp/Snowball/Stemmer.cs +660 -0
  84. data/vendor/snowball/csharp/Stemwords/App.config +6 -0
  85. data/vendor/snowball/csharp/Stemwords/Program.cs +114 -0
  86. data/vendor/snowball/doc/TODO +12 -0
  87. data/vendor/snowball/doc/libstemmer_c_README +148 -0
  88. data/vendor/snowball/doc/libstemmer_csharp_README +53 -0
  89. data/vendor/snowball/doc/libstemmer_java_README +67 -0
  90. data/vendor/snowball/doc/libstemmer_js_README +48 -0
  91. data/vendor/snowball/doc/libstemmer_python_README +113 -0
  92. data/vendor/snowball/examples/stemwords.c +204 -0
  93. data/vendor/snowball/go/README.md +55 -0
  94. data/vendor/snowball/go/among.go +16 -0
  95. data/vendor/snowball/go/env.go +403 -0
  96. data/vendor/snowball/go/stemwords/generate.go +68 -0
  97. data/vendor/snowball/go/stemwords/main.go +68 -0
  98. data/vendor/snowball/go/util.go +34 -0
  99. data/vendor/snowball/iconv.py +50 -0
  100. data/vendor/snowball/include/libstemmer.h +78 -0
  101. data/vendor/snowball/java/org/tartarus/snowball/Among.java +29 -0
  102. data/vendor/snowball/java/org/tartarus/snowball/SnowballProgram.java +381 -0
  103. data/vendor/snowball/java/org/tartarus/snowball/SnowballStemmer.java +8 -0
  104. data/vendor/snowball/java/org/tartarus/snowball/TestApp.java +75 -0
  105. data/vendor/snowball/javascript/base-stemmer.js +294 -0
  106. data/vendor/snowball/javascript/stemwords.js +106 -0
  107. data/vendor/snowball/libstemmer/libstemmer_c.in +96 -0
  108. data/vendor/snowball/libstemmer/mkalgorithms.pl +90 -0
  109. data/vendor/snowball/libstemmer/mkmodules.pl +267 -0
  110. data/vendor/snowball/libstemmer/modules.txt +63 -0
  111. data/vendor/snowball/libstemmer/test.c +34 -0
  112. data/vendor/snowball/pascal/.gitignore +4 -0
  113. data/vendor/snowball/pascal/SnowballProgram.pas +430 -0
  114. data/vendor/snowball/pascal/generate.pl +23 -0
  115. data/vendor/snowball/pascal/stemwords-template.dpr +78 -0
  116. data/vendor/snowball/python/MANIFEST.in +7 -0
  117. data/vendor/snowball/python/create_init.py +54 -0
  118. data/vendor/snowball/python/setup.cfg +6 -0
  119. data/vendor/snowball/python/setup.py +81 -0
  120. data/vendor/snowball/python/snowballstemmer/among.py +13 -0
  121. data/vendor/snowball/python/snowballstemmer/basestemmer.py +323 -0
  122. data/vendor/snowball/python/stemwords.py +101 -0
  123. data/vendor/snowball/python/testapp.py +28 -0
  124. data/vendor/snowball/runtime/api.c +58 -0
  125. data/vendor/snowball/runtime/api.h +32 -0
  126. data/vendor/snowball/runtime/header.h +61 -0
  127. data/vendor/snowball/runtime/utilities.c +513 -0
  128. data/vendor/snowball/rust/Cargo.toml +7 -0
  129. data/vendor/snowball/rust/build.rs +55 -0
  130. data/vendor/snowball/rust/rust-pre-1.27-compat.patch +30 -0
  131. data/vendor/snowball/rust/src/main.rs +102 -0
  132. data/vendor/snowball/rust/src/snowball/algorithms/mod.rs +2 -0
  133. data/vendor/snowball/rust/src/snowball/among.rs +6 -0
  134. data/vendor/snowball/rust/src/snowball/mod.rs +6 -0
  135. data/vendor/snowball/rust/src/snowball/snowball_env.rs +421 -0
  136. data/vendor/snowball/tests/stemtest.c +95 -0
  137. metadata +178 -0
data/vendor/snowball/compiler/syswords.h
@@ -0,0 +1,86 @@
+ static const struct system_word vocab[82+1] = {
+ { 0, (const byte *)"", 82+1},
+
+ { 1, (const byte *)"$", c_dollar },
+ { 1, (const byte *)"(", c_bra },
+ { 1, (const byte *)")", c_ket },
+ { 1, (const byte *)"*", c_multiply },
+ { 1, (const byte *)"+", c_plus },
+ { 1, (const byte *)"-", c_minus },
+ { 1, (const byte *)"/", c_divide },
+ { 1, (const byte *)"<", c_ls },
+ { 1, (const byte *)"=", c_assign },
+ { 1, (const byte *)">", c_gr },
+ { 1, (const byte *)"?", c_debug },
+ { 1, (const byte *)"[", c_leftslice },
+ { 1, (const byte *)"]", c_rightslice },
+ { 2, (const byte *)"!=", c_ne },
+ { 2, (const byte *)"*=", c_multiplyassign },
+ { 2, (const byte *)"+=", c_plusassign },
+ { 2, (const byte *)"-=", c_minusassign },
+ { 2, (const byte *)"->", c_sliceto },
+ { 2, (const byte *)"/*", c_comment2 },
+ { 2, (const byte *)"//", c_comment1 },
+ { 2, (const byte *)"/=", c_divideassign },
+ { 2, (const byte *)"<+", c_insert },
+ { 2, (const byte *)"<-", c_slicefrom },
+ { 2, (const byte *)"<=", c_le },
+ { 2, (const byte *)"==", c_eq },
+ { 2, (const byte *)"=>", c_assignto },
+ { 2, (const byte *)">=", c_ge },
+ { 2, (const byte *)"as", c_as },
+ { 2, (const byte *)"do", c_do },
+ { 2, (const byte *)"or", c_or },
+ { 3, (const byte *)"and", c_and },
+ { 3, (const byte *)"for", c_for },
+ { 3, (const byte *)"get", c_get },
+ { 3, (const byte *)"hex", c_hex },
+ { 3, (const byte *)"hop", c_hop },
+ { 3, (const byte *)"len", c_len },
+ { 3, (const byte *)"non", c_non },
+ { 3, (const byte *)"not", c_not },
+ { 3, (const byte *)"set", c_set },
+ { 3, (const byte *)"try", c_try },
+ { 4, (const byte *)"fail", c_fail },
+ { 4, (const byte *)"goto", c_goto },
+ { 4, (const byte *)"loop", c_loop },
+ { 4, (const byte *)"next", c_next },
+ { 4, (const byte *)"size", c_size },
+ { 4, (const byte *)"test", c_test },
+ { 4, (const byte *)"true", c_true },
+ { 5, (const byte *)"among", c_among },
+ { 5, (const byte *)"false", c_false },
+ { 5, (const byte *)"lenof", c_lenof },
+ { 5, (const byte *)"limit", c_limit },
+ { 5, (const byte *)"unset", c_unset },
+ { 6, (const byte *)"atmark", c_atmark },
+ { 6, (const byte *)"attach", c_attach },
+ { 6, (const byte *)"cursor", c_cursor },
+ { 6, (const byte *)"define", c_define },
+ { 6, (const byte *)"delete", c_delete },
+ { 6, (const byte *)"gopast", c_gopast },
+ { 6, (const byte *)"insert", c_insert },
+ { 6, (const byte *)"maxint", c_maxint },
+ { 6, (const byte *)"minint", c_minint },
+ { 6, (const byte *)"repeat", c_repeat },
+ { 6, (const byte *)"sizeof", c_sizeof },
+ { 6, (const byte *)"tomark", c_tomark },
+ { 7, (const byte *)"atleast", c_atleast },
+ { 7, (const byte *)"atlimit", c_atlimit },
+ { 7, (const byte *)"decimal", c_decimal },
+ { 7, (const byte *)"reverse", c_reverse },
+ { 7, (const byte *)"setmark", c_setmark },
+ { 7, (const byte *)"strings", c_strings },
+ { 7, (const byte *)"tolimit", c_tolimit },
+ { 8, (const byte *)"booleans", c_booleans },
+ { 8, (const byte *)"integers", c_integers },
+ { 8, (const byte *)"routines", c_routines },
+ { 8, (const byte *)"setlimit", c_setlimit },
+ { 9, (const byte *)"backwards", c_backwards },
+ { 9, (const byte *)"externals", c_externals },
+ { 9, (const byte *)"groupings", c_groupings },
+ { 9, (const byte *)"stringdef", c_stringdef },
+ { 9, (const byte *)"substring", c_substring },
+ { 12, (const byte *)"backwardmode", c_backwardmode },
+ { 13, (const byte *)"stringescapes", c_stringescapes }
+ };
data/vendor/snowball/compiler/syswords2.h
@@ -0,0 +1,13 @@
+ c_among = 4, c_and, c_as, c_assign, c_assignto, c_atleast,
+ c_atlimit, c_atmark, c_attach, c_backwardmode, c_backwards,
+ c_booleans, c_bra, c_comment1, c_comment2, c_cursor, c_debug,
+ c_decimal, c_define, c_delete, c_divide, c_divideassign, c_do,
+ c_dollar, c_eq, c_externals, c_fail, c_false, c_for, c_ge, c_get,
+ c_gopast, c_goto, c_gr, c_groupings, c_hex, c_hop, c_insert,
+ c_integers, c_ket, c_le, c_leftslice, c_len, c_lenof, c_limit, c_loop,
+ c_ls, c_maxint, c_minint, c_minus, c_minusassign, c_multiply,
+ c_multiplyassign, c_ne, c_next, c_non, c_not, c_or, c_plus,
+ c_plusassign, c_repeat, c_reverse, c_rightslice, c_routines,
+ c_set, c_setlimit, c_setmark, c_size, c_sizeof, c_slicefrom,
+ c_sliceto, c_stringdef, c_stringescapes, c_strings, c_substring,
+ c_test, c_tolimit, c_tomark, c_true, c_try, c_unset,
data/vendor/snowball/compiler/tokeniser.c
@@ -0,0 +1,567 @@
+
+ #include <stdio.h> /* stderr etc */
+ #include <stdlib.h> /* malloc free */
+ #include <string.h> /* strlen */
+ #include <ctype.h> /* isalpha etc */
+ #include "header.h"
+
+ struct system_word {
+ int s_size; /* size of system word */
+ const byte * s; /* pointer to the system word */
+ int code; /* its internal code */
+ };
+
+
+ /* ASCII collating assumed in syswords.c */
+
+ #include "syswords.h"
+
+ #define INITIAL_INPUT_BUFFER_SIZE 8192
+
+ static int hex_to_num(int ch);
+
+ static int smaller(int a, int b) { return a < b ? a : b; }
+
+ extern symbol * get_input(const char * filename) {
+ FILE * input = fopen(filename, "r");
+ if (input == 0) { return 0; }
+ {
+ symbol * u = create_b(INITIAL_INPUT_BUFFER_SIZE);
+ int size = 0;
+ while (true) {
+ int ch = getc(input);
+ if (ch == EOF) break;
+ if (size >= CAPACITY(u)) u = increase_capacity(u, size);
+ u[size++] = ch;
+ }
+ fclose(input);
+ SIZE(u) = size;
+ return u;
+ }
+ }
+
+ static void error(struct tokeniser * t, const char * s1, int n, symbol * p, const char * s2) {
+ if (t->error_count == 20) { fprintf(stderr, "... etc\n"); exit(1); }
+ fprintf(stderr, "%s:%d: ", t->file, t->line_number);
+ if (s1) fprintf(stderr, "%s", s1);
+ if (p) {
+ int i;
+ for (i = 0; i < n; i++) fprintf(stderr, "%c", p[i]);
+ }
+ if (s2) fprintf(stderr, "%s", s2);
+ fprintf(stderr, "\n");
+ t->error_count++;
+ }
+
+ static void error1(struct tokeniser * t, const char * s) {
+ error(t, s, 0,0, 0);
+ }
+
+ static void error2(struct tokeniser * t, const char * s) {
+ error(t, "unexpected end of text after ", 0,0, s);
+ }
+
+ static int compare_words(int m, symbol * p, int n, const byte * q) {
+ if (m != n) return m - n;
+ {
+ int i; for (i = 0; i < n; i++) {
+ int diff = p[i] - q[i];
+ if (diff) return diff;
+ }
+ }
+ return 0;
+ }
+
+ static int find_word(int n, symbol * p) {
+ int i = 0; int j = vocab->code;
+ do {
+ int k = i + (j - i)/2;
+ const struct system_word * w = vocab + k;
+ int diff = compare_words(n, p, w->s_size, w->s);
+ if (diff == 0) return w->code;
+ if (diff < 0) j = k; else i = k;
+ } while (j - i != 1);
+ return -1;
+ }
+
+ static int get_number(int n, symbol * p) {
+ int x = 0;
+ int i; for (i = 0; i < n; i++) x = 10*x + p[i] - '0';
+ return x;
+ }
+
+ static int eq_s(struct tokeniser * t, const char * s) {
+ int l = strlen(s);
+ if (SIZE(t->p) - t->c < l) return false;
+ {
+ int i;
+ for (i = 0; i < l; i++) if (t->p[t->c + i] != s[i]) return false;
+ }
+ t->c += l; return true;
+ }
+
+ static int white_space(struct tokeniser * t, int ch) {
+ switch (ch) {
+ case '\n':
+ t->line_number++;
+ /* fall through */
+ case '\r':
+ case '\t':
+ case ' ':
+ return true;
+ }
+ return false;
+ }
+
+ static symbol * find_in_m(struct tokeniser * t, int n, symbol * p) {
+ struct m_pair * q;
+ for (q = t->m_pairs; q; q = q->next) {
+ symbol * name = q->name;
+ if (n == SIZE(name) && memcmp(name, p, n * sizeof(symbol)) == 0) return q->value;
+ }
+ return 0;
+ }
+
+ static int read_literal_string(struct tokeniser * t, int c) {
+ symbol * p = t->p;
+ int ch;
+ SIZE(t->b) = 0;
+ while (true) {
+ if (c >= SIZE(p)) { error2(t, "'"); return c; }
+ ch = p[c];
+ if (ch == '\n') { error1(t, "string not terminated"); return c; }
+ c++;
+ if (ch == t->m_start) {
+ /* Inside insert characters. */
+ int c0 = c;
+ int newlines = false; /* no newlines as yet */
+ int black_found = false; /* no printing chars as yet */
+ while (true) {
+ if (c >= SIZE(p)) { error2(t, "'"); return c; }
+ ch = p[c]; c++;
+ if (ch == t->m_end) break;
+ if (!white_space(t, ch)) black_found = true;
+ if (ch == '\n') newlines = true;
+ if (newlines && black_found) {
+ error1(t, "string not terminated");
+ return c;
+ }
+ }
+ if (!newlines) {
+ int n = c - c0 - 1; /* macro size */
+ int firstch = p[c0];
+ symbol * q = find_in_m(t, n, p + c0);
+ if (q == 0) {
+ if (n == 1 && (firstch == '\'' || firstch == t->m_start))
+ t->b = add_to_b(t->b, 1, p + c0);
+ else if (n >= 3 && firstch == 'U' && p[c0 + 1] == '+') {
+ int codepoint = 0;
+ int x;
+ if (t->uplusmode == UPLUS_DEFINED) {
+ /* See if found with xxxx upper-cased. */
+ symbol * uc = create_b(n);
+ int i;
+ for (i = 0; i != n; ++i) {
+ uc[i] = toupper(p[c0 + i]);
+ }
+ q = find_in_m(t, n, uc);
+ lose_b(uc);
+ if (q != 0) {
+ t->b = add_to_b(t->b, SIZE(q), q);
+ continue;
+ }
+ error1(t, "Some U+xxxx stringdefs seen but not this one");
+ } else {
+ t->uplusmode = UPLUS_UNICODE;
+ }
+ for (x = c0 + 2; x != c - 1; ++x) {
+ int hex = hex_to_num(p[x]);
+ if (hex < 0) {
+ error1(t, "Bad hex digit following U+");
+ break;
+ }
+ codepoint = (codepoint << 4) | hex;
+ }
+ if (t->encoding == ENC_UTF8) {
+ if (codepoint < 0 || codepoint > 0x01ffff) {
+ error1(t, "character values exceed 0x01ffff");
+ }
+ /* Ensure there's enough space for a max length
+ * UTF-8 sequence. */
+ if (CAPACITY(t->b) < SIZE(t->b) + 3) {
+ t->b = increase_capacity(t->b, 3);
+ }
+ SIZE(t->b) += put_utf8(codepoint, t->b + SIZE(t->b));
+ } else {
+ symbol sym;
+ if (t->encoding == ENC_SINGLEBYTE) {
+ /* Only ISO-8859-1 is handled this way - for
+ * other single-byte character sets you need
+ * stringdef all the U+xxxx codes you use
+ * like - e.g.:
+ *
+ * stringdef U+0171 hex 'FB'
+ */
+ if (codepoint < 0 || codepoint > 0xff) {
+ error1(t, "character values exceed 256");
+ }
+ } else {
+ if (codepoint < 0 || codepoint > 0xffff) {
+ error1(t, "character values exceed 64K");
+ }
+ }
+ sym = codepoint;
+ t->b = add_to_b(t->b, 1, &sym);
+ }
+ } else
+ error(t, "string macro '", n, p + c0, "' undeclared");
+ } else
+ t->b = add_to_b(t->b, SIZE(q), q);
+ }
+ } else {
+ if (ch == '\'') return c;
+ if (ch < 0 || ch >= 0x80) {
+ if (t->encoding != ENC_WIDECHARS) {
+ /* We don't really want people using non-ASCII literal
+ * strings, but historically it's worked for single-byte
+ * and UTF-8 if the source encoding matches what the
+ * generated stemmer works in and it seems unfair to just
+ * suddenly make this a hard error.
+ */
+ fprintf(stderr,
+ "%s:%d: warning: Non-ASCII literal strings aren't "
+ "portable - use stringdef instead\n",
+ t->file, t->line_number);
+ } else {
+ error1(t, "Non-ASCII literal strings aren't "
+ "portable - use stringdef instead");
+ }
+ }
+ t->b = add_to_b(t->b, 1, p + c - 1);
+ }
+ }
+ }
+
+ static int next_token(struct tokeniser * t) {
+ symbol * p = t->p;
+ int c = t->c;
+ int ch;
+ int code = -1;
+ while (true) {
+ if (c >= SIZE(p)) { t->c = c; return -1; }
+ ch = p[c];
+ if (white_space(t, ch)) { c++; continue; }
+ if (isalpha(ch)) {
+ int c0 = c;
+ while (c < SIZE(p) && (isalnum(p[c]) || p[c] == '_')) c++;
+ code = find_word(c - c0, p + c0);
+ if (code < 0 || t->token_disabled[code]) {
+ t->b = move_to_b(t->b, c - c0, p + c0);
+ code = c_name;
+ }
+ } else
+ if (isdigit(ch)) {
+ int c0 = c;
+ while (c < SIZE(p) && isdigit(p[c])) c++;
+ t->number = get_number(c - c0, p + c0);
+ code = c_number;
+ } else
+ if (ch == '\'') {
+ c = read_literal_string(t, c + 1);
+ code = c_literalstring;
+ } else
+ {
+ int lim = smaller(2, SIZE(p) - c);
+ int i;
+ for (i = lim; i > 0; i--) {
+ code = find_word(i, p + c);
+ if (code >= 0) { c += i; break; }
+ }
+ }
+ if (code >= 0) {
+ t->c = c;
+ return code;
+ }
+ error(t, "'", 1, p + c, "' unknown");
+ c++;
+ continue;
+ }
+ }
+
+ static int next_char(struct tokeniser * t) {
+ if (t->c >= SIZE(t->p)) return -1;
+ return t->p[t->c++];
+ }
+
+ static int next_real_char(struct tokeniser * t) {
+ while (true) {
+ int ch = next_char(t);
+ if (!white_space(t, ch)) return ch;
+ }
+ }
+
+ static void read_chars(struct tokeniser * t) {
+ int ch = next_real_char(t);
+ if (ch < 0) { error2(t, "stringdef"); return; }
+ {
+ int c0 = t->c-1;
+ while (true) {
+ ch = next_char(t);
+ if (white_space(t, ch) || ch < 0) break;
+ }
+ t->b2 = move_to_b(t->b2, t->c - c0 - 1, t->p + c0);
+ }
+ }
+
+ static int decimal_to_num(int ch) {
+ if ('0' <= ch && ch <= '9') return ch - '0';
+ return -1;
+ }
+
+ static int hex_to_num(int ch) {
+ if ('0' <= ch && ch <= '9') return ch - '0';
+ if ('a' <= ch && ch <= 'f') return ch - 'a' + 10;
+ if ('A' <= ch && ch <= 'F') return ch - 'A' + 10;
+ return -1;
+ }
+
+ static void convert_numeric_string(struct tokeniser * t, symbol * p, int base) {
+ int c = 0; int d = 0;
+ while (true) {
+ while (c < SIZE(p) && p[c] == ' ') c++;
+ if (c == SIZE(p)) break;
+ {
+ int number = 0;
+ while (c != SIZE(p)) {
+ int ch = p[c];
+ if (ch == ' ') break;
+ if (base == 10) {
+ ch = decimal_to_num(ch);
+ if (ch < 0) {
+ error1(t, "decimal string contains non-digits");
+ return;
+ }
+ } else {
+ ch = hex_to_num(ch);
+ if (ch < 0) {
+ error1(t, "hex string contains non-hex characters");
+ return;
+ }
+ }
+ number = base * number + ch;
+ c++;
+ }
+ if (t->encoding == ENC_SINGLEBYTE) {
+ if (number < 0 || number > 0xff) {
+ error1(t, "character values exceed 256");
+ return;
+ }
+ } else {
+ if (number < 0 || number > 0xffff) {
+ error1(t, "character values exceed 64K");
+ return;
+ }
+ }
+ if (t->encoding == ENC_UTF8)
+ d += put_utf8(number, p + d);
+ else
+ p[d++] = number;
+ }
+ }
+ SIZE(p) = d;
+ }
+
+ extern int read_token(struct tokeniser * t) {
+ symbol * p = t->p;
+ int held = t->token_held;
+ t->token_held = false;
+ if (held) return t->token;
+ while (true) {
+ int code = next_token(t);
+ switch (code) {
+ case c_comment1: /* slash-slash comment */
+ while (t->c < SIZE(p) && p[t->c] != '\n') t->c++;
+ continue;
+ case c_comment2: /* slash-star comment */
+ while (true) {
+ if (t->c >= SIZE(p)) {
+ error1(t, "/* comment not terminated");
+ t->token = -1;
+ return -1;
+ }
+ if (p[t->c] == '\n') t->line_number++;
+ if (eq_s(t, "*/")) break;
+ t->c++;
+ }
+ continue;
+ case c_stringescapes: {
+ int ch1 = next_real_char(t);
+ int ch2 = next_real_char(t);
+ if (ch2 < 0) {
+ error2(t, "stringescapes");
+ continue;
+ }
+ if (ch1 == '\'') {
+ error1(t, "first stringescape cannot be '");
+ continue;
+ }
+ t->m_start = ch1;
+ t->m_end = ch2;
+ continue;
+ }
+ case c_stringdef: {
+ int base = 0;
+ read_chars(t);
+ code = read_token(t);
+ if (code == c_hex) { base = 16; code = read_token(t); } else
+ if (code == c_decimal) { base = 10; code = read_token(t); }
+ if (code != c_literalstring) {
+ error1(t, "string omitted after stringdef");
+ continue;
+ }
+ if (base > 0) convert_numeric_string(t, t->b, base);
+ { NEW(m_pair, q);
+ q->next = t->m_pairs;
+ q->name = copy_b(t->b2);
+ q->value = copy_b(t->b);
+ t->m_pairs = q;
+ if (t->uplusmode != UPLUS_DEFINED &&
+ (SIZE(t->b2) >= 3 && t->b2[0] == 'U' && t->b2[1] == '+')) {
+ if (t->uplusmode == UPLUS_UNICODE) {
+ error1(t, "U+xxxx already used with implicit meaning");
+ } else {
+ t->uplusmode = UPLUS_DEFINED;
+ }
+ }
+ }
+ continue;
+ }
+ case c_get:
+ code = read_token(t);
+ if (code != c_literalstring) {
+ error1(t, "string omitted after get"); continue;
+ }
+ t->get_depth++;
+ if (t->get_depth > 10) {
+ error1(t, "get directives go 10 deep. Looping?");
+ exit(1);
+ }
+ {
+ NEW(input, q);
+ char * file = b_to_s(t->b);
+ symbol * u = get_input(file);
+ if (u == 0) {
+ struct include * r;
+ for (r = t->includes; r; r = r->next) {
+ symbol * b = copy_b(r->b);
+ b = add_to_b(b, SIZE(t->b), t->b);
+ free(file);
+ file = b_to_s(b);
+ u = get_input(file);
+ lose_b(b);
+ if (u != 0) break;
+ }
+ }
+ if (u == 0) {
+ error(t, "Can't get '", SIZE(t->b), t->b, "'");
+ exit(1);
+ }
+ memmove(q, t, sizeof(struct input));
+ t->next = q;
+ t->p = u;
+ t->c = 0;
+ t->file = file;
+ t->file_needs_freeing = true;
+ t->line_number = 1;
+ }
+ p = t->p;
+ continue;
+ case -1:
+ if (t->next) {
+ lose_b(p);
+ {
+ struct input * q = t->next;
+ memmove(t, q, sizeof(struct input)); p = t->p;
+ FREE(q);
+ }
+ t->get_depth--;
+ continue;
+ }
+ /* fall through */
+ default:
+ t->previous_token = t->token;
+ t->token = code;
+ return code;
+ }
+ }
+ }
+
+ extern const char * name_of_token(int code) {
+ int i;
+ for (i = 1; i < vocab->code; i++)
+ if ((vocab + i)->code == code) return (const char *)(vocab + i)->s;
+ switch (code) {
+ case c_mathassign: return "=";
+ case c_name: return "name";
+ case c_number: return "number";
+ case c_literalstring:return "literal";
+ case c_neg: return "neg";
+ case c_grouping: return "grouping";
+ case c_call: return "call";
+ case c_booltest: return "Boolean test";
+ case -2: return "start of text";
+ case -1: return "end of text";
+ default: return "?";
+ }
+ }
+
+ extern void disable_token(struct tokeniser * t, int code) {
+ t->token_disabled[code] = 1;
+ }
+
+ extern struct tokeniser * create_tokeniser(symbol * p, char * file) {
+ NEW(tokeniser, t);
+ t->next = 0;
+ t->p = p;
+ t->c = 0;
+ t->file = file;
+ t->file_needs_freeing = false;
+ t->line_number = 1;
+ t->b = create_b(0);
+ t->b2 = create_b(0);
+ t->m_start = -1;
+ t->m_pairs = 0;
+ t->get_depth = 0;
+ t->error_count = 0;
+ t->token_held = false;
+ t->token = -2;
+ t->previous_token = -2;
+ t->uplusmode = UPLUS_NONE;
+ memset(t->token_disabled, 0, sizeof(t->token_disabled));
+ return t;
+ }
+
+ extern void close_tokeniser(struct tokeniser * t) {
+ lose_b(t->b);
+ lose_b(t->b2);
+ {
+ struct m_pair * q = t->m_pairs;
+ while (q) {
+ struct m_pair * q_next = q->next;
+ lose_b(q->name);
+ lose_b(q->value);
+ FREE(q);
+ q = q_next;
+ }
+ }
+ {
+ struct input * q = t->next;
+ while (q) {
+ struct input * q_next = q->next;
+ FREE(q);
+ q = q_next;
+ }
+ }
+ if (t->file_needs_freeing) free(t->file);
+ FREE(t);
+ }
data/vendor/snowball/csharp/.gitignore
@@ -0,0 +1,8 @@
+ *.o
+ *.suo
+ *.user
+ *.GhostDoc.xml
+ bin/
+ obj/
+ TestResults/
+ TestResult.xml
data/vendor/snowball/csharp/Snowball/Algorithms/.gitignore
@@ -0,0 +1 @@
+ *.generated.cs