mittens 0.1.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/LICENSE.txt +1 -1
- data/README.md +4 -4
- data/lib/mittens/version.rb +1 -1
- data/mittens.gemspec +1 -1
- data/vendor/snowball/.github/workflows/ci.yml +216 -0
- data/vendor/snowball/CONTRIBUTING.rst +111 -62
- data/vendor/snowball/GNUmakefile +194 -136
- data/vendor/snowball/NEWS +798 -3
- data/vendor/snowball/README.rst +50 -1
- data/vendor/snowball/ada/src/stemmer.adb +25 -13
- data/vendor/snowball/ada/src/stemmer.ads +9 -9
- data/vendor/snowball/ada/stemmer_config.gpr +7 -7
- data/vendor/snowball/algorithms/basque.sbl +4 -19
- data/vendor/snowball/algorithms/catalan.sbl +2 -9
- data/vendor/snowball/algorithms/danish.sbl +1 -1
- data/vendor/snowball/algorithms/dutch.sbl +284 -122
- data/vendor/snowball/algorithms/dutch_porter.sbl +178 -0
- data/vendor/snowball/algorithms/english.sbl +52 -37
- data/vendor/snowball/algorithms/esperanto.sbl +157 -0
- data/vendor/snowball/algorithms/estonian.sbl +269 -0
- data/vendor/snowball/algorithms/finnish.sbl +2 -3
- data/vendor/snowball/algorithms/french.sbl +42 -16
- data/vendor/snowball/algorithms/german.sbl +35 -14
- data/vendor/snowball/algorithms/greek.sbl +76 -76
- data/vendor/snowball/algorithms/hungarian.sbl +8 -6
- data/vendor/snowball/algorithms/indonesian.sbl +14 -8
- data/vendor/snowball/algorithms/italian.sbl +11 -21
- data/vendor/snowball/algorithms/lithuanian.sbl +36 -37
- data/vendor/snowball/algorithms/lovins.sbl +0 -1
- data/vendor/snowball/algorithms/nepali.sbl +138 -37
- data/vendor/snowball/algorithms/norwegian.sbl +19 -5
- data/vendor/snowball/algorithms/porter.sbl +2 -2
- data/vendor/snowball/algorithms/portuguese.sbl +9 -13
- data/vendor/snowball/algorithms/romanian.sbl +17 -4
- data/vendor/snowball/algorithms/serbian.sbl +467 -468
- data/vendor/snowball/algorithms/spanish.sbl +5 -7
- data/vendor/snowball/algorithms/swedish.sbl +60 -6
- data/vendor/snowball/algorithms/tamil.sbl +207 -176
- data/vendor/snowball/algorithms/turkish.sbl +461 -445
- data/vendor/snowball/algorithms/yiddish.sbl +36 -38
- data/vendor/snowball/compiler/analyser.c +445 -192
- data/vendor/snowball/compiler/driver.c +109 -101
- data/vendor/snowball/compiler/generator.c +853 -464
- data/vendor/snowball/compiler/generator_ada.c +404 -366
- data/vendor/snowball/compiler/generator_csharp.c +297 -260
- data/vendor/snowball/compiler/generator_go.c +323 -254
- data/vendor/snowball/compiler/generator_java.c +326 -252
- data/vendor/snowball/compiler/generator_js.c +362 -252
- data/vendor/snowball/compiler/generator_pascal.c +349 -197
- data/vendor/snowball/compiler/generator_python.c +257 -240
- data/vendor/snowball/compiler/generator_rust.c +423 -251
- data/vendor/snowball/compiler/header.h +117 -71
- data/vendor/snowball/compiler/space.c +137 -68
- data/vendor/snowball/compiler/syswords.h +2 -2
- data/vendor/snowball/compiler/tokeniser.c +125 -107
- data/vendor/snowball/csharp/Snowball/Among.cs +14 -14
- data/vendor/snowball/csharp/Snowball/AssemblyInfo.cs +7 -7
- data/vendor/snowball/csharp/Snowball/Stemmer.cs +57 -37
- data/vendor/snowball/csharp/Stemwords/App.config +2 -2
- data/vendor/snowball/csharp/Stemwords/Program.cs +16 -12
- data/vendor/snowball/doc/libstemmer_c_README +7 -4
- data/vendor/snowball/doc/libstemmer_csharp_README +4 -1
- data/vendor/snowball/doc/libstemmer_java_README +12 -1
- data/vendor/snowball/doc/libstemmer_js_README +6 -4
- data/vendor/snowball/doc/libstemmer_python_README +9 -4
- data/vendor/snowball/examples/stemwords.c +12 -12
- data/vendor/snowball/go/env.go +107 -31
- data/vendor/snowball/go/util.go +0 -4
- data/vendor/snowball/include/libstemmer.h +4 -0
- data/vendor/snowball/java/org/tartarus/snowball/Among.java +32 -15
- data/vendor/snowball/java/org/tartarus/snowball/SnowballProgram.java +347 -261
- data/vendor/snowball/java/org/tartarus/snowball/SnowballStemmer.java +3 -0
- data/vendor/snowball/java/org/tartarus/snowball/TestApp.java +52 -37
- data/vendor/snowball/javascript/base-stemmer.js +186 -2
- data/vendor/snowball/javascript/stemwords.js +3 -6
- data/vendor/snowball/libstemmer/libstemmer_c.in +1 -1
- data/vendor/snowball/libstemmer/mkalgorithms.pl +6 -6
- data/vendor/snowball/libstemmer/mkmodules.pl +2 -2
- data/vendor/snowball/libstemmer/modules.txt +13 -10
- data/vendor/snowball/libstemmer/test.c +1 -1
- data/vendor/snowball/pascal/SnowballProgram.pas +84 -2
- data/vendor/snowball/pascal/generate.pl +13 -13
- data/vendor/snowball/python/create_init.py +4 -1
- data/vendor/snowball/python/setup.cfg +0 -3
- data/vendor/snowball/python/setup.py +8 -3
- data/vendor/snowball/python/snowballstemmer/basestemmer.py +20 -54
- data/vendor/snowball/python/stemwords.py +8 -12
- data/vendor/snowball/runtime/api.c +10 -5
- data/vendor/snowball/runtime/header.h +10 -9
- data/vendor/snowball/runtime/utilities.c +9 -9
- data/vendor/snowball/rust/build.rs +1 -1
- data/vendor/snowball/rust/src/snowball/snowball_env.rs +83 -5
- data/vendor/snowball/tests/stemtest.c +7 -4
- metadata +8 -12
- data/vendor/snowball/.travis.yml +0 -112
- data/vendor/snowball/algorithms/german2.sbl +0 -145
- data/vendor/snowball/algorithms/kraaij_pohlmann.sbl +0 -240
- data/vendor/snowball/compiler/syswords2.h +0 -13
@@ -12,7 +12,7 @@ struct system_word {
|
|
12
12
|
};
|
13
13
|
|
14
14
|
|
15
|
-
/* ASCII collating assumed in syswords.
|
15
|
+
/* ASCII collating assumed in syswords.h */
|
16
16
|
|
17
17
|
#include "syswords.h"
|
18
18
|
|
@@ -22,16 +22,16 @@ static int hex_to_num(int ch);
|
|
22
22
|
|
23
23
|
static int smaller(int a, int b) { return a < b ? a : b; }
|
24
24
|
|
25
|
-
extern
|
25
|
+
extern byte * get_input(const char * filename) {
|
26
26
|
FILE * input = fopen(filename, "r");
|
27
|
-
if (input ==
|
27
|
+
if (input == NULL) { return NULL; }
|
28
28
|
{
|
29
|
-
|
29
|
+
byte * u = create_s(INITIAL_INPUT_BUFFER_SIZE);
|
30
30
|
int size = 0;
|
31
31
|
while (true) {
|
32
32
|
int ch = getc(input);
|
33
33
|
if (ch == EOF) break;
|
34
|
-
if (size >= CAPACITY(u)) u =
|
34
|
+
if (size >= CAPACITY(u)) u = increase_capacity_s(u, size);
|
35
35
|
u[size++] = ch;
|
36
36
|
}
|
37
37
|
fclose(input);
|
@@ -40,7 +40,7 @@ extern symbol * get_input(const char * filename) {
|
|
40
40
|
}
|
41
41
|
}
|
42
42
|
|
43
|
-
static void error(struct tokeniser * t, const char * s1,
|
43
|
+
static void error(struct tokeniser * t, const char * s1, byte * p, int n, const char * s2) {
|
44
44
|
if (t->error_count == 20) { fprintf(stderr, "... etc\n"); exit(1); }
|
45
45
|
fprintf(stderr, "%s:%d: ", t->file, t->line_number);
|
46
46
|
if (s1) fprintf(stderr, "%s", s1);
|
@@ -54,25 +54,19 @@ static void error(struct tokeniser * t, const char * s1, int n, symbol * p, cons
|
|
54
54
|
}
|
55
55
|
|
56
56
|
static void error1(struct tokeniser * t, const char * s) {
|
57
|
-
error(t, s,
|
57
|
+
error(t, s, NULL, 0, NULL);
|
58
58
|
}
|
59
59
|
|
60
60
|
static void error2(struct tokeniser * t, const char * s) {
|
61
|
-
error(t, "unexpected end of text after ",
|
61
|
+
error(t, "unexpected end of text after ", NULL, 0, s);
|
62
62
|
}
|
63
63
|
|
64
|
-
static int compare_words(int m,
|
64
|
+
static int compare_words(int m, const byte * p, int n, const byte * q) {
|
65
65
|
if (m != n) return m - n;
|
66
|
-
|
67
|
-
int i; for (i = 0; i < n; i++) {
|
68
|
-
int diff = p[i] - q[i];
|
69
|
-
if (diff) return diff;
|
70
|
-
}
|
71
|
-
}
|
72
|
-
return 0;
|
66
|
+
return memcmp(p, q, n);
|
73
67
|
}
|
74
68
|
|
75
|
-
static int find_word(int n,
|
69
|
+
static int find_word(int n, const byte * p) {
|
76
70
|
int i = 0; int j = vocab->code;
|
77
71
|
do {
|
78
72
|
int k = i + (j - i)/2;
|
@@ -84,22 +78,6 @@ static int find_word(int n, symbol * p) {
|
|
84
78
|
return -1;
|
85
79
|
}
|
86
80
|
|
87
|
-
static int get_number(int n, symbol * p) {
|
88
|
-
int x = 0;
|
89
|
-
int i; for (i = 0; i < n; i++) x = 10*x + p[i] - '0';
|
90
|
-
return x;
|
91
|
-
}
|
92
|
-
|
93
|
-
static int eq_s(struct tokeniser * t, const char * s) {
|
94
|
-
int l = strlen(s);
|
95
|
-
if (SIZE(t->p) - t->c < l) return false;
|
96
|
-
{
|
97
|
-
int i;
|
98
|
-
for (i = 0; i < l; i++) if (t->p[t->c + i] != s[i]) return false;
|
99
|
-
}
|
100
|
-
t->c += l; return true;
|
101
|
-
}
|
102
|
-
|
103
81
|
static int white_space(struct tokeniser * t, int ch) {
|
104
82
|
switch (ch) {
|
105
83
|
case '\n':
|
@@ -113,61 +91,65 @@ static int white_space(struct tokeniser * t, int ch) {
|
|
113
91
|
return false;
|
114
92
|
}
|
115
93
|
|
116
|
-
static symbol * find_in_m(struct tokeniser * t, int n,
|
94
|
+
static symbol * find_in_m(struct tokeniser * t, int n, byte * p) {
|
117
95
|
struct m_pair * q;
|
118
96
|
for (q = t->m_pairs; q; q = q->next) {
|
119
|
-
|
120
|
-
if (n == SIZE(name) && memcmp(name, p, n
|
97
|
+
byte * name = q->name;
|
98
|
+
if (n == SIZE(name) && memcmp(name, p, n) == 0) return q->value;
|
121
99
|
}
|
122
|
-
return
|
100
|
+
return NULL;
|
123
101
|
}
|
124
102
|
|
125
103
|
static int read_literal_string(struct tokeniser * t, int c) {
|
126
|
-
|
104
|
+
byte * p = t->p;
|
127
105
|
int ch;
|
128
106
|
SIZE(t->b) = 0;
|
129
107
|
while (true) {
|
130
|
-
if (c >= SIZE(p)
|
108
|
+
if (c >= SIZE(p) || p[c] == '\n') {
|
109
|
+
error1(t, "string literal not terminated");
|
110
|
+
return c;
|
111
|
+
}
|
131
112
|
ch = p[c];
|
132
|
-
if (ch == '\n') { error1(t, "string not terminated"); return c; }
|
133
113
|
c++;
|
134
114
|
if (ch == t->m_start) {
|
135
115
|
/* Inside insert characters. */
|
136
116
|
int c0 = c;
|
137
117
|
int newlines = false; /* no newlines as yet */
|
138
|
-
int
|
118
|
+
int all_whitespace = true; /* no printing chars as yet */
|
139
119
|
while (true) {
|
140
|
-
if (c >= SIZE(p)
|
141
|
-
|
142
|
-
if (ch == t->m_end) break;
|
143
|
-
if (!white_space(t, ch)) black_found = true;
|
144
|
-
if (ch == '\n') newlines = true;
|
145
|
-
if (newlines && black_found) {
|
146
|
-
error1(t, "string not terminated");
|
120
|
+
if (c >= SIZE(p) || (p[c] == '\n' && !all_whitespace)) {
|
121
|
+
error1(t, "string literal not terminated");
|
147
122
|
return c;
|
148
123
|
}
|
124
|
+
ch = p[c];
|
125
|
+
if (ch == '\n') {
|
126
|
+
newlines = true;
|
127
|
+
}
|
128
|
+
c++;
|
129
|
+
if (ch == t->m_end) break;
|
130
|
+
if (!white_space(t, ch)) all_whitespace = false;
|
149
131
|
}
|
150
132
|
if (!newlines) {
|
151
133
|
int n = c - c0 - 1; /* macro size */
|
152
134
|
int firstch = p[c0];
|
153
135
|
symbol * q = find_in_m(t, n, p + c0);
|
154
|
-
if (q ==
|
136
|
+
if (q == NULL) {
|
155
137
|
if (n == 1 && (firstch == '\'' || firstch == t->m_start))
|
156
|
-
t->b =
|
138
|
+
t->b = add_symbol_to_b(t->b, p[c0]);
|
157
139
|
else if (n >= 3 && firstch == 'U' && p[c0 + 1] == '+') {
|
158
140
|
int codepoint = 0;
|
159
141
|
int x;
|
160
142
|
if (t->uplusmode == UPLUS_DEFINED) {
|
161
143
|
/* See if found with xxxx upper-cased. */
|
162
|
-
|
144
|
+
byte * uc = create_s(n);
|
163
145
|
int i;
|
164
146
|
for (i = 0; i != n; ++i) {
|
165
147
|
uc[i] = toupper(p[c0 + i]);
|
166
148
|
}
|
167
149
|
q = find_in_m(t, n, uc);
|
168
|
-
|
169
|
-
if (q !=
|
170
|
-
t->b = add_to_b(t->b,
|
150
|
+
lose_s(uc);
|
151
|
+
if (q != NULL) {
|
152
|
+
t->b = add_to_b(t->b, q, SIZE(q));
|
171
153
|
continue;
|
172
154
|
}
|
173
155
|
error1(t, "Some U+xxxx stringdefs seen but not this one");
|
@@ -189,15 +171,14 @@ static int read_literal_string(struct tokeniser * t, int c) {
|
|
189
171
|
/* Ensure there's enough space for a max length
|
190
172
|
* UTF-8 sequence. */
|
191
173
|
if (CAPACITY(t->b) < SIZE(t->b) + 3) {
|
192
|
-
t->b =
|
174
|
+
t->b = increase_capacity_b(t->b, 3);
|
193
175
|
}
|
194
176
|
SIZE(t->b) += put_utf8(codepoint, t->b + SIZE(t->b));
|
195
177
|
} else {
|
196
|
-
symbol sym;
|
197
178
|
if (t->encoding == ENC_SINGLEBYTE) {
|
198
179
|
/* Only ISO-8859-1 is handled this way - for
|
199
180
|
* other single-byte character sets you need
|
200
|
-
* stringdef all the U+xxxx codes you use
|
181
|
+
* to stringdef all the U+xxxx codes you use
|
201
182
|
* like - e.g.:
|
202
183
|
*
|
203
184
|
* stringdef U+0171 hex 'FB'
|
@@ -210,13 +191,14 @@ static int read_literal_string(struct tokeniser * t, int c) {
|
|
210
191
|
error1(t, "character values exceed 64K");
|
211
192
|
}
|
212
193
|
}
|
213
|
-
|
214
|
-
t->b = add_to_b(t->b, 1, &sym);
|
194
|
+
t->b = add_symbol_to_b(t->b, (symbol)codepoint);
|
215
195
|
}
|
216
|
-
} else
|
217
|
-
error(t, "string macro '",
|
218
|
-
|
219
|
-
|
196
|
+
} else {
|
197
|
+
error(t, "string macro '", p + c0, n, "' undeclared");
|
198
|
+
}
|
199
|
+
} else {
|
200
|
+
t->b = add_to_b(t->b, q, SIZE(q));
|
201
|
+
}
|
220
202
|
}
|
221
203
|
} else {
|
222
204
|
if (ch == '\'') return c;
|
@@ -226,7 +208,7 @@ static int read_literal_string(struct tokeniser * t, int c) {
|
|
226
208
|
* strings, but historically it's worked for single-byte
|
227
209
|
* and UTF-8 if the source encoding matches what the
|
228
210
|
* generated stemmer works in and it seems unfair to just
|
229
|
-
* suddenly make this a hard error
|
211
|
+
* suddenly make this a hard error.
|
230
212
|
*/
|
231
213
|
fprintf(stderr,
|
232
214
|
"%s:%d: warning: Non-ASCII literal strings aren't "
|
@@ -237,13 +219,13 @@ static int read_literal_string(struct tokeniser * t, int c) {
|
|
237
219
|
"portable - use stringdef instead");
|
238
220
|
}
|
239
221
|
}
|
240
|
-
t->b =
|
222
|
+
t->b = add_symbol_to_b(t->b, p[c - 1]);
|
241
223
|
}
|
242
224
|
}
|
243
225
|
}
|
244
226
|
|
245
227
|
static int next_token(struct tokeniser * t) {
|
246
|
-
|
228
|
+
byte * p = t->p;
|
247
229
|
int c = t->c;
|
248
230
|
int ch;
|
249
231
|
int code = -1;
|
@@ -256,21 +238,21 @@ static int next_token(struct tokeniser * t) {
|
|
256
238
|
while (c < SIZE(p) && (isalnum(p[c]) || p[c] == '_')) c++;
|
257
239
|
code = find_word(c - c0, p + c0);
|
258
240
|
if (code < 0 || t->token_disabled[code]) {
|
259
|
-
t->
|
241
|
+
SIZE(t->s) = 0;
|
242
|
+
t->s = add_s_to_s(t->s, (const char*)p + c0, c - c0);
|
260
243
|
code = c_name;
|
261
244
|
}
|
262
|
-
} else
|
263
|
-
|
264
|
-
|
265
|
-
|
266
|
-
|
245
|
+
} else if (isdigit(ch)) {
|
246
|
+
int value = ch - '0';
|
247
|
+
while (++c < SIZE(p) && isdigit(p[c])) {
|
248
|
+
value = 10 * value + (p[c] - '0');
|
249
|
+
}
|
250
|
+
t->number = value;
|
267
251
|
code = c_number;
|
268
|
-
} else
|
269
|
-
if (ch == '\'') {
|
252
|
+
} else if (ch == '\'') {
|
270
253
|
c = read_literal_string(t, c + 1);
|
271
254
|
code = c_literalstring;
|
272
|
-
} else
|
273
|
-
{
|
255
|
+
} else {
|
274
256
|
int lim = smaller(2, SIZE(p) - c);
|
275
257
|
int i;
|
276
258
|
for (i = lim; i > 0; i--) {
|
@@ -282,7 +264,7 @@ static int next_token(struct tokeniser * t) {
|
|
282
264
|
t->c = c;
|
283
265
|
return code;
|
284
266
|
}
|
285
|
-
error(t, "'",
|
267
|
+
error(t, "'", p + c, 1, "' unknown");
|
286
268
|
c++;
|
287
269
|
continue;
|
288
270
|
}
|
@@ -309,7 +291,8 @@ static void read_chars(struct tokeniser * t) {
|
|
309
291
|
ch = next_char(t);
|
310
292
|
if (white_space(t, ch) || ch < 0) break;
|
311
293
|
}
|
312
|
-
t->
|
294
|
+
SIZE(t->s) = 0;
|
295
|
+
t->s = add_s_to_s(t->s, (const char*)t->p + c0, t->c - c0 - 1);
|
313
296
|
}
|
314
297
|
}
|
315
298
|
|
@@ -372,28 +355,39 @@ static void convert_numeric_string(struct tokeniser * t, symbol * p, int base) {
|
|
372
355
|
}
|
373
356
|
|
374
357
|
extern int read_token(struct tokeniser * t) {
|
375
|
-
|
358
|
+
byte * p = t->p;
|
376
359
|
int held = t->token_held;
|
377
360
|
t->token_held = false;
|
378
361
|
if (held) return t->token;
|
362
|
+
t->token_reported_as_unexpected = false;
|
379
363
|
while (true) {
|
380
364
|
int code = next_token(t);
|
381
365
|
switch (code) {
|
382
366
|
case c_comment1: /* slash-slash comment */
|
383
367
|
while (t->c < SIZE(p) && p[t->c] != '\n') t->c++;
|
384
368
|
continue;
|
385
|
-
case c_comment2: /* slash-star comment */
|
369
|
+
case c_comment2: { /* slash-star comment */
|
370
|
+
// Scan for a '*' stopping one before the end since we need a
|
371
|
+
// '/' to follow it to close the comment.
|
372
|
+
int size_less_one = SIZE(p) - 1;
|
373
|
+
int c = t->c;
|
386
374
|
while (true) {
|
387
|
-
if (
|
375
|
+
if (c >= size_less_one) {
|
388
376
|
error1(t, "/* comment not terminated");
|
389
377
|
t->token = -1;
|
390
378
|
return -1;
|
391
379
|
}
|
392
|
-
if (p[
|
393
|
-
|
394
|
-
|
380
|
+
if (p[c] == '\n') {
|
381
|
+
t->line_number++;
|
382
|
+
} else if (p[c] == '*' && p[c + 1] == '/') {
|
383
|
+
// Found '*/' to end of comment.
|
384
|
+
t->c = c + 2;
|
385
|
+
break;
|
386
|
+
}
|
387
|
+
++c;
|
395
388
|
}
|
396
389
|
continue;
|
390
|
+
}
|
397
391
|
case c_stringescapes: {
|
398
392
|
int ch1 = next_real_char(t);
|
399
393
|
int ch2 = next_real_char(t);
|
@@ -422,11 +416,11 @@ extern int read_token(struct tokeniser * t) {
|
|
422
416
|
if (base > 0) convert_numeric_string(t, t->b, base);
|
423
417
|
{ NEW(m_pair, q);
|
424
418
|
q->next = t->m_pairs;
|
425
|
-
q->name =
|
419
|
+
q->name = copy_s(t->s);
|
426
420
|
q->value = copy_b(t->b);
|
427
421
|
t->m_pairs = q;
|
428
422
|
if (t->uplusmode != UPLUS_DEFINED &&
|
429
|
-
(SIZE(t->
|
423
|
+
(SIZE(t->s) >= 3 && t->s[0] == 'U' && t->s[1] == '+')) {
|
430
424
|
if (t->uplusmode == UPLUS_UNICODE) {
|
431
425
|
error1(t, "U+xxxx already used with implicit meaning");
|
432
426
|
} else {
|
@@ -448,22 +442,28 @@ extern int read_token(struct tokeniser * t) {
|
|
448
442
|
}
|
449
443
|
{
|
450
444
|
NEW(input, q);
|
451
|
-
char * file =
|
452
|
-
|
453
|
-
|
445
|
+
char * file = b_to_sz(t->b);
|
446
|
+
int file_owned = 1;
|
447
|
+
byte * u = get_input(file);
|
448
|
+
if (u == NULL) {
|
454
449
|
struct include * r;
|
455
450
|
for (r = t->includes; r; r = r->next) {
|
456
|
-
|
457
|
-
|
458
|
-
|
459
|
-
|
451
|
+
byte * s = copy_s(r->s);
|
452
|
+
s = add_sz_to_s(s, file);
|
453
|
+
s[SIZE(s)] = 0;
|
454
|
+
if (file_owned > 0) {
|
455
|
+
free(file);
|
456
|
+
} else {
|
457
|
+
lose_s((byte *)file);
|
458
|
+
}
|
459
|
+
file = (char*)s;
|
460
|
+
file_owned = -1;
|
460
461
|
u = get_input(file);
|
461
|
-
|
462
|
-
if (u != 0) break;
|
462
|
+
if (u != NULL) break;
|
463
463
|
}
|
464
464
|
}
|
465
|
-
if (u ==
|
466
|
-
error(t, "Can't get '",
|
465
|
+
if (u == NULL) {
|
466
|
+
error(t, "Can't get '", (byte *)file, strlen(file), "'");
|
467
467
|
exit(1);
|
468
468
|
}
|
469
469
|
memmove(q, t, sizeof(struct input));
|
@@ -471,14 +471,14 @@ extern int read_token(struct tokeniser * t) {
|
|
471
471
|
t->p = u;
|
472
472
|
t->c = 0;
|
473
473
|
t->file = file;
|
474
|
-
t->
|
474
|
+
t->file_owned = file_owned;
|
475
475
|
t->line_number = 1;
|
476
476
|
}
|
477
477
|
p = t->p;
|
478
478
|
continue;
|
479
479
|
case -1:
|
480
480
|
if (t->next) {
|
481
|
-
|
481
|
+
lose_s(p);
|
482
482
|
{
|
483
483
|
struct input * q = t->next;
|
484
484
|
memmove(t, q, sizeof(struct input)); p = t->p;
|
@@ -496,6 +496,12 @@ extern int read_token(struct tokeniser * t) {
|
|
496
496
|
}
|
497
497
|
}
|
498
498
|
|
499
|
+
extern int peek_token(struct tokeniser * t) {
|
500
|
+
int token = read_token(t);
|
501
|
+
t->token_held = true;
|
502
|
+
return token;
|
503
|
+
}
|
504
|
+
|
499
505
|
extern const char * name_of_token(int code) {
|
500
506
|
int i;
|
501
507
|
for (i = 1; i < vocab->code; i++)
|
@@ -509,6 +515,13 @@ extern const char * name_of_token(int code) {
|
|
509
515
|
case c_grouping: return "grouping";
|
510
516
|
case c_call: return "call";
|
511
517
|
case c_booltest: return "Boolean test";
|
518
|
+
case c_functionend: return "Function end";
|
519
|
+
case c_goto_grouping:
|
520
|
+
return "goto grouping";
|
521
|
+
case c_gopast_grouping:
|
522
|
+
return "gopast grouping";
|
523
|
+
case c_goto_non: return "goto non";
|
524
|
+
case c_gopast_non: return "gopast non";
|
512
525
|
case -2: return "start of text";
|
513
526
|
case -1: return "end of text";
|
514
527
|
default: return "?";
|
@@ -519,21 +532,22 @@ extern void disable_token(struct tokeniser * t, int code) {
|
|
519
532
|
t->token_disabled[code] = 1;
|
520
533
|
}
|
521
534
|
|
522
|
-
extern struct tokeniser * create_tokeniser(
|
535
|
+
extern struct tokeniser * create_tokeniser(byte * p, char * file) {
|
523
536
|
NEW(tokeniser, t);
|
524
|
-
t->next =
|
537
|
+
t->next = NULL;
|
525
538
|
t->p = p;
|
526
539
|
t->c = 0;
|
527
540
|
t->file = file;
|
528
|
-
t->
|
541
|
+
t->file_owned = 0;
|
529
542
|
t->line_number = 1;
|
530
543
|
t->b = create_b(0);
|
531
|
-
t->
|
544
|
+
t->s = create_s(0);
|
532
545
|
t->m_start = -1;
|
533
|
-
t->m_pairs =
|
546
|
+
t->m_pairs = NULL;
|
534
547
|
t->get_depth = 0;
|
535
548
|
t->error_count = 0;
|
536
549
|
t->token_held = false;
|
550
|
+
t->token_reported_as_unexpected = false;
|
537
551
|
t->token = -2;
|
538
552
|
t->previous_token = -2;
|
539
553
|
t->uplusmode = UPLUS_NONE;
|
@@ -543,12 +557,12 @@ extern struct tokeniser * create_tokeniser(symbol * p, char * file) {
|
|
543
557
|
|
544
558
|
extern void close_tokeniser(struct tokeniser * t) {
|
545
559
|
lose_b(t->b);
|
546
|
-
|
560
|
+
lose_s(t->s);
|
547
561
|
{
|
548
562
|
struct m_pair * q = t->m_pairs;
|
549
563
|
while (q) {
|
550
564
|
struct m_pair * q_next = q->next;
|
551
|
-
|
565
|
+
lose_s(q->name);
|
552
566
|
lose_b(q->value);
|
553
567
|
FREE(q);
|
554
568
|
q = q_next;
|
@@ -562,6 +576,10 @@ extern void close_tokeniser(struct tokeniser * t) {
|
|
562
576
|
q = q_next;
|
563
577
|
}
|
564
578
|
}
|
565
|
-
if (t->
|
579
|
+
if (t->file_owned > 0) {
|
580
|
+
free(t->file);
|
581
|
+
} else if (t->file_owned < 0) {
|
582
|
+
lose_s((byte *)t->file);
|
583
|
+
}
|
566
584
|
FREE(t);
|
567
585
|
}
|
@@ -2,10 +2,10 @@
|
|
2
2
|
// Copyright (c) 2002, Richard Boulton
|
3
3
|
// Copyright (c) 2015, Cesar Souza
|
4
4
|
// All rights reserved.
|
5
|
-
//
|
5
|
+
//
|
6
6
|
// Redistribution and use in source and binary forms, with or without
|
7
7
|
// modification, are permitted provided that the following conditions are met:
|
8
|
-
//
|
8
|
+
//
|
9
9
|
// * Redistributions of source code must retain the above copyright notice,
|
10
10
|
// * this list of conditions and the following disclaimer.
|
11
11
|
// * Redistributions in binary form must reproduce the above copyright
|
@@ -14,7 +14,7 @@
|
|
14
14
|
// * Neither the name of the copyright holders nor the names of its contributors
|
15
15
|
// * may be used to endorse or promote products derived from this software
|
16
16
|
// * without specific prior written permission.
|
17
|
-
//
|
17
|
+
//
|
18
18
|
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
19
19
|
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
20
20
|
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
@@ -34,41 +34,41 @@ namespace Snowball
|
|
34
34
|
/// <summary>
|
35
35
|
/// Snowball's among construction.
|
36
36
|
/// </summary>
|
37
|
-
///
|
37
|
+
///
|
38
38
|
public sealed class Among
|
39
39
|
{
|
40
40
|
/// <summary>
|
41
41
|
/// Search string.
|
42
42
|
/// </summary>
|
43
|
-
///
|
43
|
+
///
|
44
44
|
public string SearchString { get; private set; }
|
45
45
|
|
46
46
|
/// <summary>
|
47
47
|
/// Index to longest matching substring.
|
48
48
|
/// </summary>
|
49
|
-
///
|
49
|
+
///
|
50
50
|
public int MatchIndex { get; private set; }
|
51
51
|
|
52
52
|
/// <summary>
|
53
53
|
/// Result of the lookup.
|
54
54
|
/// </summary>
|
55
|
-
///
|
55
|
+
///
|
56
56
|
public int Result { get; private set; }
|
57
57
|
|
58
58
|
/// <summary>
|
59
59
|
/// Action to be invoked.
|
60
60
|
/// </summary>
|
61
|
-
///
|
61
|
+
///
|
62
62
|
public Func<bool> Action { get; private set; }
|
63
63
|
|
64
64
|
/// <summary>
|
65
65
|
/// Initializes a new instance of the <see cref="Among"/> class.
|
66
66
|
/// </summary>
|
67
|
-
///
|
67
|
+
///
|
68
68
|
/// <param name="str">The search string.</param>
|
69
69
|
/// <param name="index">The index to the longest matching substring.</param>
|
70
70
|
/// <param name="result">The result of the lookup.</param>
|
71
|
-
///
|
71
|
+
///
|
72
72
|
public Among(String str, int index, int result)
|
73
73
|
: this(str, index, result, null)
|
74
74
|
{
|
@@ -77,12 +77,12 @@ namespace Snowball
|
|
77
77
|
/// <summary>
|
78
78
|
/// Initializes a new instance of the <see cref="Among"/> class.
|
79
79
|
/// </summary>
|
80
|
-
///
|
80
|
+
///
|
81
81
|
/// <param name="str">The search string.</param>
|
82
82
|
/// <param name="index">The index to the longest matching substring.</param>
|
83
83
|
/// <param name="result">The result of the lookup.</param>
|
84
84
|
/// <param name="action">The action to be performed, if any.</param>
|
85
|
-
///
|
85
|
+
///
|
86
86
|
public Among(String str, int index, int result, Func<bool> action)
|
87
87
|
{
|
88
88
|
this.SearchString = str;
|
@@ -94,11 +94,11 @@ namespace Snowball
|
|
94
94
|
/// <summary>
|
95
95
|
/// Returns a <see cref="System.String" /> that represents this instance.
|
96
96
|
/// </summary>
|
97
|
-
///
|
97
|
+
///
|
98
98
|
/// <returns>
|
99
99
|
/// A <see cref="System.String" /> that represents this instance.
|
100
100
|
/// </returns>
|
101
|
-
///
|
101
|
+
///
|
102
102
|
public override string ToString()
|
103
103
|
{
|
104
104
|
return SearchString;
|
@@ -2,7 +2,7 @@
|
|
2
2
|
using System.Runtime.CompilerServices;
|
3
3
|
using System.Runtime.InteropServices;
|
4
4
|
|
5
|
-
// General Information about an assembly is controlled through the following
|
5
|
+
// General Information about an assembly is controlled through the following
|
6
6
|
// set of attributes. Change these attribute values to modify the information
|
7
7
|
// associated with an assembly.
|
8
8
|
[assembly: AssemblyTitle("Snowball")]
|
@@ -14,8 +14,8 @@ using System.Runtime.InteropServices;
|
|
14
14
|
[assembly: AssemblyTrademark("")]
|
15
15
|
[assembly: AssemblyCulture("")]
|
16
16
|
|
17
|
-
// Setting ComVisible to false makes the types in this assembly not visible
|
18
|
-
// to COM components. If you need to access a type in this assembly from
|
17
|
+
// Setting ComVisible to false makes the types in this assembly not visible
|
18
|
+
// to COM components. If you need to access a type in this assembly from
|
19
19
|
// COM, set the ComVisible attribute to true on that type.
|
20
20
|
[assembly: ComVisible(false)]
|
21
21
|
|
@@ -25,12 +25,12 @@ using System.Runtime.InteropServices;
|
|
25
25
|
// Version information for an assembly consists of the following four values:
|
26
26
|
//
|
27
27
|
// Major Version
|
28
|
-
// Minor Version
|
28
|
+
// Minor Version
|
29
29
|
// Build Number
|
30
30
|
// Revision
|
31
31
|
//
|
32
|
-
// You can specify all the values or you can default the Build and Revision Numbers
|
32
|
+
// You can specify all the values or you can default the Build and Revision Numbers
|
33
33
|
// by using the '*' as shown below:
|
34
34
|
// [assembly: AssemblyVersion("1.0.*")]
|
35
|
-
[assembly: AssemblyVersion(/*SNOWBALL_VERSION*/"
|
36
|
-
[assembly: AssemblyFileVersion(/*SNOWBALL_VERSION*/"
|
35
|
+
[assembly: AssemblyVersion(/*SNOWBALL_VERSION*/"3.0.1.0")]
|
36
|
+
[assembly: AssemblyFileVersion(/*SNOWBALL_VERSION*/"3.0.1.0")]
|