mittens 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/CHANGELOG.md +3 -0
- data/Gemfile +7 -0
- data/LICENSE.txt +30 -0
- data/README.md +62 -0
- data/Rakefile +21 -0
- data/ext/mittens/ext.c +96 -0
- data/ext/mittens/extconf.rb +12 -0
- data/lib/mittens/version.rb +3 -0
- data/lib/mittens.rb +7 -0
- data/mittens.gemspec +22 -0
- data/vendor/snowball/.gitignore +26 -0
- data/vendor/snowball/.travis.yml +112 -0
- data/vendor/snowball/AUTHORS +27 -0
- data/vendor/snowball/CONTRIBUTING.rst +216 -0
- data/vendor/snowball/COPYING +29 -0
- data/vendor/snowball/GNUmakefile +742 -0
- data/vendor/snowball/NEWS +754 -0
- data/vendor/snowball/README.rst +37 -0
- data/vendor/snowball/ada/README.md +74 -0
- data/vendor/snowball/ada/generate/generate.adb +83 -0
- data/vendor/snowball/ada/generate.gpr +21 -0
- data/vendor/snowball/ada/src/stemmer.adb +620 -0
- data/vendor/snowball/ada/src/stemmer.ads +219 -0
- data/vendor/snowball/ada/src/stemwords.adb +70 -0
- data/vendor/snowball/ada/stemmer_config.gpr +83 -0
- data/vendor/snowball/ada/stemwords.gpr +21 -0
- data/vendor/snowball/algorithms/arabic.sbl +558 -0
- data/vendor/snowball/algorithms/armenian.sbl +301 -0
- data/vendor/snowball/algorithms/basque.sbl +149 -0
- data/vendor/snowball/algorithms/catalan.sbl +202 -0
- data/vendor/snowball/algorithms/danish.sbl +93 -0
- data/vendor/snowball/algorithms/dutch.sbl +164 -0
- data/vendor/snowball/algorithms/english.sbl +229 -0
- data/vendor/snowball/algorithms/finnish.sbl +197 -0
- data/vendor/snowball/algorithms/french.sbl +254 -0
- data/vendor/snowball/algorithms/german.sbl +139 -0
- data/vendor/snowball/algorithms/german2.sbl +145 -0
- data/vendor/snowball/algorithms/greek.sbl +701 -0
- data/vendor/snowball/algorithms/hindi.sbl +323 -0
- data/vendor/snowball/algorithms/hungarian.sbl +241 -0
- data/vendor/snowball/algorithms/indonesian.sbl +192 -0
- data/vendor/snowball/algorithms/irish.sbl +149 -0
- data/vendor/snowball/algorithms/italian.sbl +202 -0
- data/vendor/snowball/algorithms/kraaij_pohlmann.sbl +240 -0
- data/vendor/snowball/algorithms/lithuanian.sbl +373 -0
- data/vendor/snowball/algorithms/lovins.sbl +208 -0
- data/vendor/snowball/algorithms/nepali.sbl +92 -0
- data/vendor/snowball/algorithms/norwegian.sbl +80 -0
- data/vendor/snowball/algorithms/porter.sbl +139 -0
- data/vendor/snowball/algorithms/portuguese.sbl +218 -0
- data/vendor/snowball/algorithms/romanian.sbl +236 -0
- data/vendor/snowball/algorithms/russian.sbl +221 -0
- data/vendor/snowball/algorithms/serbian.sbl +2379 -0
- data/vendor/snowball/algorithms/spanish.sbl +230 -0
- data/vendor/snowball/algorithms/swedish.sbl +72 -0
- data/vendor/snowball/algorithms/tamil.sbl +405 -0
- data/vendor/snowball/algorithms/turkish.sbl +470 -0
- data/vendor/snowball/algorithms/yiddish.sbl +460 -0
- data/vendor/snowball/charsets/ISO-8859-2.sbl +98 -0
- data/vendor/snowball/charsets/KOI8-R.sbl +74 -0
- data/vendor/snowball/charsets/cp850.sbl +130 -0
- data/vendor/snowball/compiler/analyser.c +1547 -0
- data/vendor/snowball/compiler/driver.c +615 -0
- data/vendor/snowball/compiler/generator.c +1748 -0
- data/vendor/snowball/compiler/generator_ada.c +1702 -0
- data/vendor/snowball/compiler/generator_csharp.c +1322 -0
- data/vendor/snowball/compiler/generator_go.c +1278 -0
- data/vendor/snowball/compiler/generator_java.c +1313 -0
- data/vendor/snowball/compiler/generator_js.c +1316 -0
- data/vendor/snowball/compiler/generator_pascal.c +1387 -0
- data/vendor/snowball/compiler/generator_python.c +1337 -0
- data/vendor/snowball/compiler/generator_rust.c +1295 -0
- data/vendor/snowball/compiler/header.h +418 -0
- data/vendor/snowball/compiler/space.c +286 -0
- data/vendor/snowball/compiler/syswords.h +86 -0
- data/vendor/snowball/compiler/syswords2.h +13 -0
- data/vendor/snowball/compiler/tokeniser.c +567 -0
- data/vendor/snowball/csharp/.gitignore +8 -0
- data/vendor/snowball/csharp/Snowball/Algorithms/.gitignore +1 -0
- data/vendor/snowball/csharp/Snowball/Among.cs +108 -0
- data/vendor/snowball/csharp/Snowball/AssemblyInfo.cs +36 -0
- data/vendor/snowball/csharp/Snowball/Stemmer.cs +660 -0
- data/vendor/snowball/csharp/Stemwords/App.config +6 -0
- data/vendor/snowball/csharp/Stemwords/Program.cs +114 -0
- data/vendor/snowball/doc/TODO +12 -0
- data/vendor/snowball/doc/libstemmer_c_README +148 -0
- data/vendor/snowball/doc/libstemmer_csharp_README +53 -0
- data/vendor/snowball/doc/libstemmer_java_README +67 -0
- data/vendor/snowball/doc/libstemmer_js_README +48 -0
- data/vendor/snowball/doc/libstemmer_python_README +113 -0
- data/vendor/snowball/examples/stemwords.c +204 -0
- data/vendor/snowball/go/README.md +55 -0
- data/vendor/snowball/go/among.go +16 -0
- data/vendor/snowball/go/env.go +403 -0
- data/vendor/snowball/go/stemwords/generate.go +68 -0
- data/vendor/snowball/go/stemwords/main.go +68 -0
- data/vendor/snowball/go/util.go +34 -0
- data/vendor/snowball/iconv.py +50 -0
- data/vendor/snowball/include/libstemmer.h +78 -0
- data/vendor/snowball/java/org/tartarus/snowball/Among.java +29 -0
- data/vendor/snowball/java/org/tartarus/snowball/SnowballProgram.java +381 -0
- data/vendor/snowball/java/org/tartarus/snowball/SnowballStemmer.java +8 -0
- data/vendor/snowball/java/org/tartarus/snowball/TestApp.java +75 -0
- data/vendor/snowball/javascript/base-stemmer.js +294 -0
- data/vendor/snowball/javascript/stemwords.js +106 -0
- data/vendor/snowball/libstemmer/libstemmer_c.in +96 -0
- data/vendor/snowball/libstemmer/mkalgorithms.pl +90 -0
- data/vendor/snowball/libstemmer/mkmodules.pl +267 -0
- data/vendor/snowball/libstemmer/modules.txt +63 -0
- data/vendor/snowball/libstemmer/test.c +34 -0
- data/vendor/snowball/pascal/.gitignore +4 -0
- data/vendor/snowball/pascal/SnowballProgram.pas +430 -0
- data/vendor/snowball/pascal/generate.pl +23 -0
- data/vendor/snowball/pascal/stemwords-template.dpr +78 -0
- data/vendor/snowball/python/MANIFEST.in +7 -0
- data/vendor/snowball/python/create_init.py +54 -0
- data/vendor/snowball/python/setup.cfg +6 -0
- data/vendor/snowball/python/setup.py +81 -0
- data/vendor/snowball/python/snowballstemmer/among.py +13 -0
- data/vendor/snowball/python/snowballstemmer/basestemmer.py +323 -0
- data/vendor/snowball/python/stemwords.py +101 -0
- data/vendor/snowball/python/testapp.py +28 -0
- data/vendor/snowball/runtime/api.c +58 -0
- data/vendor/snowball/runtime/api.h +32 -0
- data/vendor/snowball/runtime/header.h +61 -0
- data/vendor/snowball/runtime/utilities.c +513 -0
- data/vendor/snowball/rust/Cargo.toml +7 -0
- data/vendor/snowball/rust/build.rs +55 -0
- data/vendor/snowball/rust/rust-pre-1.27-compat.patch +30 -0
- data/vendor/snowball/rust/src/main.rs +102 -0
- data/vendor/snowball/rust/src/snowball/algorithms/mod.rs +2 -0
- data/vendor/snowball/rust/src/snowball/among.rs +6 -0
- data/vendor/snowball/rust/src/snowball/mod.rs +6 -0
- data/vendor/snowball/rust/src/snowball/snowball_env.rs +421 -0
- data/vendor/snowball/tests/stemtest.c +95 -0
- metadata +178 -0
@@ -0,0 +1,86 @@
|
|
1
|
+
static const struct system_word vocab[82+1] = {
|
2
|
+
{ 0, (const byte *)"", 82+1},
|
3
|
+
|
4
|
+
{ 1, (const byte *)"$", c_dollar },
|
5
|
+
{ 1, (const byte *)"(", c_bra },
|
6
|
+
{ 1, (const byte *)")", c_ket },
|
7
|
+
{ 1, (const byte *)"*", c_multiply },
|
8
|
+
{ 1, (const byte *)"+", c_plus },
|
9
|
+
{ 1, (const byte *)"-", c_minus },
|
10
|
+
{ 1, (const byte *)"/", c_divide },
|
11
|
+
{ 1, (const byte *)"<", c_ls },
|
12
|
+
{ 1, (const byte *)"=", c_assign },
|
13
|
+
{ 1, (const byte *)">", c_gr },
|
14
|
+
{ 1, (const byte *)"?", c_debug },
|
15
|
+
{ 1, (const byte *)"[", c_leftslice },
|
16
|
+
{ 1, (const byte *)"]", c_rightslice },
|
17
|
+
{ 2, (const byte *)"!=", c_ne },
|
18
|
+
{ 2, (const byte *)"*=", c_multiplyassign },
|
19
|
+
{ 2, (const byte *)"+=", c_plusassign },
|
20
|
+
{ 2, (const byte *)"-=", c_minusassign },
|
21
|
+
{ 2, (const byte *)"->", c_sliceto },
|
22
|
+
{ 2, (const byte *)"/*", c_comment2 },
|
23
|
+
{ 2, (const byte *)"//", c_comment1 },
|
24
|
+
{ 2, (const byte *)"/=", c_divideassign },
|
25
|
+
{ 2, (const byte *)"<+", c_insert },
|
26
|
+
{ 2, (const byte *)"<-", c_slicefrom },
|
27
|
+
{ 2, (const byte *)"<=", c_le },
|
28
|
+
{ 2, (const byte *)"==", c_eq },
|
29
|
+
{ 2, (const byte *)"=>", c_assignto },
|
30
|
+
{ 2, (const byte *)">=", c_ge },
|
31
|
+
{ 2, (const byte *)"as", c_as },
|
32
|
+
{ 2, (const byte *)"do", c_do },
|
33
|
+
{ 2, (const byte *)"or", c_or },
|
34
|
+
{ 3, (const byte *)"and", c_and },
|
35
|
+
{ 3, (const byte *)"for", c_for },
|
36
|
+
{ 3, (const byte *)"get", c_get },
|
37
|
+
{ 3, (const byte *)"hex", c_hex },
|
38
|
+
{ 3, (const byte *)"hop", c_hop },
|
39
|
+
{ 3, (const byte *)"len", c_len },
|
40
|
+
{ 3, (const byte *)"non", c_non },
|
41
|
+
{ 3, (const byte *)"not", c_not },
|
42
|
+
{ 3, (const byte *)"set", c_set },
|
43
|
+
{ 3, (const byte *)"try", c_try },
|
44
|
+
{ 4, (const byte *)"fail", c_fail },
|
45
|
+
{ 4, (const byte *)"goto", c_goto },
|
46
|
+
{ 4, (const byte *)"loop", c_loop },
|
47
|
+
{ 4, (const byte *)"next", c_next },
|
48
|
+
{ 4, (const byte *)"size", c_size },
|
49
|
+
{ 4, (const byte *)"test", c_test },
|
50
|
+
{ 4, (const byte *)"true", c_true },
|
51
|
+
{ 5, (const byte *)"among", c_among },
|
52
|
+
{ 5, (const byte *)"false", c_false },
|
53
|
+
{ 5, (const byte *)"lenof", c_lenof },
|
54
|
+
{ 5, (const byte *)"limit", c_limit },
|
55
|
+
{ 5, (const byte *)"unset", c_unset },
|
56
|
+
{ 6, (const byte *)"atmark", c_atmark },
|
57
|
+
{ 6, (const byte *)"attach", c_attach },
|
58
|
+
{ 6, (const byte *)"cursor", c_cursor },
|
59
|
+
{ 6, (const byte *)"define", c_define },
|
60
|
+
{ 6, (const byte *)"delete", c_delete },
|
61
|
+
{ 6, (const byte *)"gopast", c_gopast },
|
62
|
+
{ 6, (const byte *)"insert", c_insert },
|
63
|
+
{ 6, (const byte *)"maxint", c_maxint },
|
64
|
+
{ 6, (const byte *)"minint", c_minint },
|
65
|
+
{ 6, (const byte *)"repeat", c_repeat },
|
66
|
+
{ 6, (const byte *)"sizeof", c_sizeof },
|
67
|
+
{ 6, (const byte *)"tomark", c_tomark },
|
68
|
+
{ 7, (const byte *)"atleast", c_atleast },
|
69
|
+
{ 7, (const byte *)"atlimit", c_atlimit },
|
70
|
+
{ 7, (const byte *)"decimal", c_decimal },
|
71
|
+
{ 7, (const byte *)"reverse", c_reverse },
|
72
|
+
{ 7, (const byte *)"setmark", c_setmark },
|
73
|
+
{ 7, (const byte *)"strings", c_strings },
|
74
|
+
{ 7, (const byte *)"tolimit", c_tolimit },
|
75
|
+
{ 8, (const byte *)"booleans", c_booleans },
|
76
|
+
{ 8, (const byte *)"integers", c_integers },
|
77
|
+
{ 8, (const byte *)"routines", c_routines },
|
78
|
+
{ 8, (const byte *)"setlimit", c_setlimit },
|
79
|
+
{ 9, (const byte *)"backwards", c_backwards },
|
80
|
+
{ 9, (const byte *)"externals", c_externals },
|
81
|
+
{ 9, (const byte *)"groupings", c_groupings },
|
82
|
+
{ 9, (const byte *)"stringdef", c_stringdef },
|
83
|
+
{ 9, (const byte *)"substring", c_substring },
|
84
|
+
{ 12, (const byte *)"backwardmode", c_backwardmode },
|
85
|
+
{ 13, (const byte *)"stringescapes", c_stringescapes }
|
86
|
+
};
|
@@ -0,0 +1,13 @@
|
|
1
|
+
c_among = 4, c_and, c_as, c_assign, c_assignto, c_atleast,
|
2
|
+
c_atlimit, c_atmark, c_attach, c_backwardmode, c_backwards,
|
3
|
+
c_booleans, c_bra, c_comment1, c_comment2, c_cursor, c_debug,
|
4
|
+
c_decimal, c_define, c_delete, c_divide, c_divideassign, c_do,
|
5
|
+
c_dollar, c_eq, c_externals, c_fail, c_false, c_for, c_ge, c_get,
|
6
|
+
c_gopast, c_goto, c_gr, c_groupings, c_hex, c_hop, c_insert,
|
7
|
+
c_integers, c_ket, c_le, c_leftslice, c_len, c_lenof, c_limit, c_loop,
|
8
|
+
c_ls, c_maxint, c_minint, c_minus, c_minusassign, c_multiply,
|
9
|
+
c_multiplyassign, c_ne, c_next, c_non, c_not, c_or, c_plus,
|
10
|
+
c_plusassign, c_repeat, c_reverse, c_rightslice, c_routines,
|
11
|
+
c_set, c_setlimit, c_setmark, c_size, c_sizeof, c_slicefrom,
|
12
|
+
c_sliceto, c_stringdef, c_stringescapes, c_strings, c_substring,
|
13
|
+
c_test, c_tolimit, c_tomark, c_true, c_try, c_unset,
|
@@ -0,0 +1,567 @@
|
|
1
|
+
|
2
|
+
#include <stdio.h> /* stderr etc */
|
3
|
+
#include <stdlib.h> /* malloc free */
|
4
|
+
#include <string.h> /* strlen */
|
5
|
+
#include <ctype.h> /* isalpha etc */
|
6
|
+
#include "header.h"
|
7
|
+
|
8
|
+
struct system_word {
|
9
|
+
int s_size; /* size of system word */
|
10
|
+
const byte * s; /* pointer to the system word */
|
11
|
+
int code; /* its internal code */
|
12
|
+
};
|
13
|
+
|
14
|
+
|
15
|
+
/* ASCII collating assumed in syswords.h */
|
16
|
+
|
17
|
+
#include "syswords.h"
|
18
|
+
|
19
|
+
#define INITIAL_INPUT_BUFFER_SIZE 8192
|
20
|
+
|
21
|
+
static int hex_to_num(int ch);
|
22
|
+
|
23
|
+
/* Return the lesser of a and b (b when they are equal). */
static int smaller(int a, int b) {
    if (a < b) return a;
    return b;
}
|
24
|
+
|
25
|
+
extern symbol * get_input(const char * filename) {
|
26
|
+
FILE * input = fopen(filename, "r");
|
27
|
+
if (input == 0) { return 0; }
|
28
|
+
{
|
29
|
+
symbol * u = create_b(INITIAL_INPUT_BUFFER_SIZE);
|
30
|
+
int size = 0;
|
31
|
+
while (true) {
|
32
|
+
int ch = getc(input);
|
33
|
+
if (ch == EOF) break;
|
34
|
+
if (size >= CAPACITY(u)) u = increase_capacity(u, size);
|
35
|
+
u[size++] = ch;
|
36
|
+
}
|
37
|
+
fclose(input);
|
38
|
+
SIZE(u) = size;
|
39
|
+
return u;
|
40
|
+
}
|
41
|
+
}
|
42
|
+
|
43
|
+
/* Report an error at the tokeniser's current file and line on stderr.
 *
 * The message is s1, then the n symbols at p, then s2; any of s1, p and s2
 * may be null to omit that part.  Each call bumps t->error_count, and a
 * call made once 20 errors have been counted prints "... etc" and exits
 * with status 1 instead.
 */
static void error(struct tokeniser * t, const char * s1, int n, symbol * p, const char * s2) {
    if (t->error_count == 20) {
        fputs("... etc\n", stderr);
        exit(1);
    }

    fprintf(stderr, "%s:%d: ", t->file, t->line_number);
    if (s1) fputs(s1, stderr);
    if (p) {
        int k = 0;
        while (k < n) {
            fputc(p[k], stderr);
            k++;
        }
    }
    if (s2) fputs(s2, stderr);
    fputc('\n', stderr);

    t->error_count++;
}
|
55
|
+
|
56
|
+
/* Report an error consisting of just the fixed message s. */
static void error1(struct tokeniser * t, const char * s) {
    error(t, s, 0, NULL, NULL);
}
|
59
|
+
|
60
|
+
/* Report that the text ended unexpectedly after the construct named by s. */
static void error2(struct tokeniser * t, const char * s) {
    error(t, "unexpected end of text after ", 0, NULL, s);
}
|
63
|
+
|
64
|
+
/* Order the m symbols at p against the n bytes at q.
 *
 * Words of different length are ordered by length alone; words of equal
 * length by their first differing element.  Returns a negative, zero or
 * positive value in the manner of memcmp().
 */
static int compare_words(int m, symbol * p, int n, const byte * q) {
    int k;

    if (m != n) return m - n;

    for (k = 0; k < n; k++) {
        if (p[k] != q[k]) return p[k] - q[k];
    }
    return 0;
}
|
74
|
+
|
75
|
+
/* Look up the n symbols at p in the vocab table by binary search.
 *
 * Returns the token code of the matching entry, or -1 if there is none.
 * The search interval starts as (0, vocab[0].code) and is narrowed until
 * its two ends are adjacent.
 */
static int find_word(int n, symbol * p) {
    int lo = 0;
    int hi = vocab[0].code;

    for (;;) {
        int mid = lo + (hi - lo) / 2;
        const struct system_word * entry = &vocab[mid];
        int order = compare_words(n, p, entry->s_size, entry->s);

        if (order == 0) return entry->code;
        if (order < 0) {
            hi = mid;
        } else {
            lo = mid;
        }
        if (hi - lo == 1) break;
    }
    return -1;
}
|
86
|
+
|
87
|
+
/* Convert the n decimal digit symbols at p to an int.
 *
 * The caller is expected to pass digits only; nothing is validated here.
 * NOTE(review): there is no overflow check, so a very long digit string
 * overflows the int accumulator.
 */
static int get_number(int n, symbol * p) {
    int value = 0;
    int k = 0;

    while (k < n) {
        value = 10*value + p[k] - '0';
        k++;
    }
    return value;
}
|
92
|
+
|
93
|
+
static int eq_s(struct tokeniser * t, const char * s) {
|
94
|
+
int l = strlen(s);
|
95
|
+
if (SIZE(t->p) - t->c < l) return false;
|
96
|
+
{
|
97
|
+
int i;
|
98
|
+
for (i = 0; i < l; i++) if (t->p[t->c + i] != s[i]) return false;
|
99
|
+
}
|
100
|
+
t->c += l; return true;
|
101
|
+
}
|
102
|
+
|
103
|
+
static int white_space(struct tokeniser * t, int ch) {
|
104
|
+
switch (ch) {
|
105
|
+
case '\n':
|
106
|
+
t->line_number++;
|
107
|
+
/* fall through */
|
108
|
+
case '\r':
|
109
|
+
case '\t':
|
110
|
+
case ' ':
|
111
|
+
return true;
|
112
|
+
}
|
113
|
+
return false;
|
114
|
+
}
|
115
|
+
|
116
|
+
/* Search the list of string macros (t->m_pairs) for one whose name is
 * exactly the n symbols at p.
 *
 * Returns the macro's value buffer, or 0 if no macro has that name.
 */
static symbol * find_in_m(struct tokeniser * t, int n, symbol * p) {
    struct m_pair * pair = t->m_pairs;

    while (pair) {
        symbol * name = pair->name;
        if (n == SIZE(name) && memcmp(name, p, n * sizeof(symbol)) == 0) {
            return pair->value;
        }
        pair = pair->next;
    }
    return 0;
}
|
124
|
+
|
125
|
+
static int read_literal_string(struct tokeniser * t, int c) {
|
126
|
+
symbol * p = t->p;
|
127
|
+
int ch;
|
128
|
+
SIZE(t->b) = 0;
|
129
|
+
while (true) {
|
130
|
+
if (c >= SIZE(p)) { error2(t, "'"); return c; }
|
131
|
+
ch = p[c];
|
132
|
+
if (ch == '\n') { error1(t, "string not terminated"); return c; }
|
133
|
+
c++;
|
134
|
+
if (ch == t->m_start) {
|
135
|
+
/* Inside insert characters. */
|
136
|
+
int c0 = c;
|
137
|
+
int newlines = false; /* no newlines as yet */
|
138
|
+
int black_found = false; /* no printing chars as yet */
|
139
|
+
while (true) {
|
140
|
+
if (c >= SIZE(p)) { error2(t, "'"); return c; }
|
141
|
+
ch = p[c]; c++;
|
142
|
+
if (ch == t->m_end) break;
|
143
|
+
if (!white_space(t, ch)) black_found = true;
|
144
|
+
if (ch == '\n') newlines = true;
|
145
|
+
if (newlines && black_found) {
|
146
|
+
error1(t, "string not terminated");
|
147
|
+
return c;
|
148
|
+
}
|
149
|
+
}
|
150
|
+
if (!newlines) {
|
151
|
+
int n = c - c0 - 1; /* macro size */
|
152
|
+
int firstch = p[c0];
|
153
|
+
symbol * q = find_in_m(t, n, p + c0);
|
154
|
+
if (q == 0) {
|
155
|
+
if (n == 1 && (firstch == '\'' || firstch == t->m_start))
|
156
|
+
t->b = add_to_b(t->b, 1, p + c0);
|
157
|
+
else if (n >= 3 && firstch == 'U' && p[c0 + 1] == '+') {
|
158
|
+
int codepoint = 0;
|
159
|
+
int x;
|
160
|
+
if (t->uplusmode == UPLUS_DEFINED) {
|
161
|
+
/* See if found with xxxx upper-cased. */
|
162
|
+
symbol * uc = create_b(n);
|
163
|
+
int i;
|
164
|
+
for (i = 0; i != n; ++i) {
|
165
|
+
uc[i] = toupper(p[c0 + i]);
|
166
|
+
}
|
167
|
+
q = find_in_m(t, n, uc);
|
168
|
+
lose_b(uc);
|
169
|
+
if (q != 0) {
|
170
|
+
t->b = add_to_b(t->b, SIZE(q), q);
|
171
|
+
continue;
|
172
|
+
}
|
173
|
+
error1(t, "Some U+xxxx stringdefs seen but not this one");
|
174
|
+
} else {
|
175
|
+
t->uplusmode = UPLUS_UNICODE;
|
176
|
+
}
|
177
|
+
for (x = c0 + 2; x != c - 1; ++x) {
|
178
|
+
int hex = hex_to_num(p[x]);
|
179
|
+
if (hex < 0) {
|
180
|
+
error1(t, "Bad hex digit following U+");
|
181
|
+
break;
|
182
|
+
}
|
183
|
+
codepoint = (codepoint << 4) | hex;
|
184
|
+
}
|
185
|
+
if (t->encoding == ENC_UTF8) {
|
186
|
+
if (codepoint < 0 || codepoint > 0x01ffff) {
|
187
|
+
error1(t, "character values exceed 0x01ffff");
|
188
|
+
}
|
189
|
+
/* Ensure there's enough space for a max length
|
190
|
+
* UTF-8 sequence. */
|
191
|
+
if (CAPACITY(t->b) < SIZE(t->b) + 3) {
|
192
|
+
t->b = increase_capacity(t->b, 3);
|
193
|
+
}
|
194
|
+
SIZE(t->b) += put_utf8(codepoint, t->b + SIZE(t->b));
|
195
|
+
} else {
|
196
|
+
symbol sym;
|
197
|
+
if (t->encoding == ENC_SINGLEBYTE) {
|
198
|
+
/* Only ISO-8859-1 is handled this way - for
|
199
|
+
* other single-byte character sets you need
|
200
|
+
* stringdef all the U+xxxx codes you use
|
201
|
+
* like - e.g.:
|
202
|
+
*
|
203
|
+
* stringdef U+0171 hex 'FB'
|
204
|
+
*/
|
205
|
+
if (codepoint < 0 || codepoint > 0xff) {
|
206
|
+
error1(t, "character values exceed 256");
|
207
|
+
}
|
208
|
+
} else {
|
209
|
+
if (codepoint < 0 || codepoint > 0xffff) {
|
210
|
+
error1(t, "character values exceed 64K");
|
211
|
+
}
|
212
|
+
}
|
213
|
+
sym = codepoint;
|
214
|
+
t->b = add_to_b(t->b, 1, &sym);
|
215
|
+
}
|
216
|
+
} else
|
217
|
+
error(t, "string macro '", n, p + c0, "' undeclared");
|
218
|
+
} else
|
219
|
+
t->b = add_to_b(t->b, SIZE(q), q);
|
220
|
+
}
|
221
|
+
} else {
|
222
|
+
if (ch == '\'') return c;
|
223
|
+
if (ch < 0 || ch >= 0x80) {
|
224
|
+
if (t->encoding != ENC_WIDECHARS) {
|
225
|
+
/* We don't really want people using non-ASCII literal
|
226
|
+
* strings, but historically it's worked for single-byte
|
227
|
+
* and UTF-8 if the source encoding matches what the
|
228
|
+
* generated stemmer works in and it seems unfair to just
|
229
|
+
* suddenly make this a hard error.`
|
230
|
+
*/
|
231
|
+
fprintf(stderr,
|
232
|
+
"%s:%d: warning: Non-ASCII literal strings aren't "
|
233
|
+
"portable - use stringdef instead\n",
|
234
|
+
t->file, t->line_number);
|
235
|
+
} else {
|
236
|
+
error1(t, "Non-ASCII literal strings aren't "
|
237
|
+
"portable - use stringdef instead");
|
238
|
+
}
|
239
|
+
}
|
240
|
+
t->b = add_to_b(t->b, 1, p + c - 1);
|
241
|
+
}
|
242
|
+
}
|
243
|
+
}
|
244
|
+
|
245
|
+
static int next_token(struct tokeniser * t) {
|
246
|
+
symbol * p = t->p;
|
247
|
+
int c = t->c;
|
248
|
+
int ch;
|
249
|
+
int code = -1;
|
250
|
+
while (true) {
|
251
|
+
if (c >= SIZE(p)) { t->c = c; return -1; }
|
252
|
+
ch = p[c];
|
253
|
+
if (white_space(t, ch)) { c++; continue; }
|
254
|
+
if (isalpha(ch)) {
|
255
|
+
int c0 = c;
|
256
|
+
while (c < SIZE(p) && (isalnum(p[c]) || p[c] == '_')) c++;
|
257
|
+
code = find_word(c - c0, p + c0);
|
258
|
+
if (code < 0 || t->token_disabled[code]) {
|
259
|
+
t->b = move_to_b(t->b, c - c0, p + c0);
|
260
|
+
code = c_name;
|
261
|
+
}
|
262
|
+
} else
|
263
|
+
if (isdigit(ch)) {
|
264
|
+
int c0 = c;
|
265
|
+
while (c < SIZE(p) && isdigit(p[c])) c++;
|
266
|
+
t->number = get_number(c - c0, p + c0);
|
267
|
+
code = c_number;
|
268
|
+
} else
|
269
|
+
if (ch == '\'') {
|
270
|
+
c = read_literal_string(t, c + 1);
|
271
|
+
code = c_literalstring;
|
272
|
+
} else
|
273
|
+
{
|
274
|
+
int lim = smaller(2, SIZE(p) - c);
|
275
|
+
int i;
|
276
|
+
for (i = lim; i > 0; i--) {
|
277
|
+
code = find_word(i, p + c);
|
278
|
+
if (code >= 0) { c += i; break; }
|
279
|
+
}
|
280
|
+
}
|
281
|
+
if (code >= 0) {
|
282
|
+
t->c = c;
|
283
|
+
return code;
|
284
|
+
}
|
285
|
+
error(t, "'", 1, p + c, "' unknown");
|
286
|
+
c++;
|
287
|
+
continue;
|
288
|
+
}
|
289
|
+
}
|
290
|
+
|
291
|
+
static int next_char(struct tokeniser * t) {
|
292
|
+
if (t->c >= SIZE(t->p)) return -1;
|
293
|
+
return t->p[t->c++];
|
294
|
+
}
|
295
|
+
|
296
|
+
/* Skip whitespace and return the next non-whitespace character, or -1 if
 * the input ends first.
 */
static int next_real_char(struct tokeniser * t) {
    int ch;

    do {
        ch = next_char(t);
    } while (white_space(t, ch));
    return ch;
}
|
302
|
+
|
303
|
+
static void read_chars(struct tokeniser * t) {
|
304
|
+
int ch = next_real_char(t);
|
305
|
+
if (ch < 0) { error2(t, "stringdef"); return; }
|
306
|
+
{
|
307
|
+
int c0 = t->c-1;
|
308
|
+
while (true) {
|
309
|
+
ch = next_char(t);
|
310
|
+
if (white_space(t, ch) || ch < 0) break;
|
311
|
+
}
|
312
|
+
t->b2 = move_to_b(t->b2, t->c - c0 - 1, t->p + c0);
|
313
|
+
}
|
314
|
+
}
|
315
|
+
|
316
|
+
/* Return the value of decimal digit ch, or -1 if ch is not a digit. */
static int decimal_to_num(int ch) {
    if (ch < '0' || ch > '9') return -1;
    return ch - '0';
}
|
320
|
+
|
321
|
+
/* Return the value of hexadecimal digit ch (either case), or -1 if ch is
 * not a hex digit.
 */
static int hex_to_num(int ch) {
    if (ch >= '0' && ch <= '9') return ch - '0';
    if (ch >= 'a' && ch <= 'f') return 10 + (ch - 'a');
    if (ch >= 'A' && ch <= 'F') return 10 + (ch - 'A');
    return -1;
}
|
327
|
+
|
328
|
+
/* Convert, in place, a string of space-separated numbers into the
 * characters those numbers denote.
 *
 * p holds the text of a `hex '...'` or `decimal '...'` string; base is 16
 * or 10 accordingly.  Each number becomes one character: written as UTF-8
 * when t->encoding is ENC_UTF8, otherwise as a single symbol.  Values are
 * limited to 0xff for ENC_SINGLEBYTE and 0xffff for the other encodings.
 *
 * On success SIZE(p) is set to the converted length.  If a bad digit or
 * an out-of-range value is found, an error is reported and the function
 * returns at once without updating SIZE(p).
 */
static void convert_numeric_string(struct tokeniser * t, symbol * p, int base) {
    const int limit = (t->encoding == ENC_SINGLEBYTE) ? 0xff : 0xffff;
    int c = 0;   /* read position */
    int d = 0;   /* write position; never overtakes c */
    while (true) {
        while (c < SIZE(p) && p[c] == ' ') c++;
        if (c == SIZE(p)) break;
        {
            int number = 0;
            while (c != SIZE(p)) {
                int ch = p[c];
                if (ch == ' ') break;
                if (base == 10) {
                    ch = decimal_to_num(ch);
                    if (ch < 0) {
                        error1(t, "decimal string contains non-digits");
                        return;
                    }
                } else {
                    ch = hex_to_num(ch);
                    if (ch < 0) {
                        error1(t, "hex string contains non-hex characters");
                        return;
                    }
                }
                /* Stop accumulating once the value is already out of range:
                 * it stays out of range for the check below, and a long run
                 * of digits can no longer overflow the int (which would be
                 * undefined behaviour).
                 */
                if (number <= limit) number = base * number + ch;
                c++;
            }
            if (number > limit) {
                if (t->encoding == ENC_SINGLEBYTE) {
                    error1(t, "character values exceed 256");
                } else {
                    error1(t, "character values exceed 64K");
                }
                return;
            }
            if (t->encoding == ENC_UTF8)
                d += put_utf8(number, p + d);
            else
                p[d++] = number;
        }
    }
    SIZE(p) = d;
}
|
373
|
+
|
374
|
+
extern int read_token(struct tokeniser * t) {
|
375
|
+
symbol * p = t->p;
|
376
|
+
int held = t->token_held;
|
377
|
+
t->token_held = false;
|
378
|
+
if (held) return t->token;
|
379
|
+
while (true) {
|
380
|
+
int code = next_token(t);
|
381
|
+
switch (code) {
|
382
|
+
case c_comment1: /* slash-slash comment */
|
383
|
+
while (t->c < SIZE(p) && p[t->c] != '\n') t->c++;
|
384
|
+
continue;
|
385
|
+
case c_comment2: /* slash-star comment */
|
386
|
+
while (true) {
|
387
|
+
if (t->c >= SIZE(p)) {
|
388
|
+
error1(t, "/* comment not terminated");
|
389
|
+
t->token = -1;
|
390
|
+
return -1;
|
391
|
+
}
|
392
|
+
if (p[t->c] == '\n') t->line_number++;
|
393
|
+
if (eq_s(t, "*/")) break;
|
394
|
+
t->c++;
|
395
|
+
}
|
396
|
+
continue;
|
397
|
+
case c_stringescapes: {
|
398
|
+
int ch1 = next_real_char(t);
|
399
|
+
int ch2 = next_real_char(t);
|
400
|
+
if (ch2 < 0) {
|
401
|
+
error2(t, "stringescapes");
|
402
|
+
continue;
|
403
|
+
}
|
404
|
+
if (ch1 == '\'') {
|
405
|
+
error1(t, "first stringescape cannot be '");
|
406
|
+
continue;
|
407
|
+
}
|
408
|
+
t->m_start = ch1;
|
409
|
+
t->m_end = ch2;
|
410
|
+
continue;
|
411
|
+
}
|
412
|
+
case c_stringdef: {
|
413
|
+
int base = 0;
|
414
|
+
read_chars(t);
|
415
|
+
code = read_token(t);
|
416
|
+
if (code == c_hex) { base = 16; code = read_token(t); } else
|
417
|
+
if (code == c_decimal) { base = 10; code = read_token(t); }
|
418
|
+
if (code != c_literalstring) {
|
419
|
+
error1(t, "string omitted after stringdef");
|
420
|
+
continue;
|
421
|
+
}
|
422
|
+
if (base > 0) convert_numeric_string(t, t->b, base);
|
423
|
+
{ NEW(m_pair, q);
|
424
|
+
q->next = t->m_pairs;
|
425
|
+
q->name = copy_b(t->b2);
|
426
|
+
q->value = copy_b(t->b);
|
427
|
+
t->m_pairs = q;
|
428
|
+
if (t->uplusmode != UPLUS_DEFINED &&
|
429
|
+
(SIZE(t->b2) >= 3 && t->b2[0] == 'U' && t->b2[1] == '+')) {
|
430
|
+
if (t->uplusmode == UPLUS_UNICODE) {
|
431
|
+
error1(t, "U+xxxx already used with implicit meaning");
|
432
|
+
} else {
|
433
|
+
t->uplusmode = UPLUS_DEFINED;
|
434
|
+
}
|
435
|
+
}
|
436
|
+
}
|
437
|
+
continue;
|
438
|
+
}
|
439
|
+
case c_get:
|
440
|
+
code = read_token(t);
|
441
|
+
if (code != c_literalstring) {
|
442
|
+
error1(t, "string omitted after get"); continue;
|
443
|
+
}
|
444
|
+
t->get_depth++;
|
445
|
+
if (t->get_depth > 10) {
|
446
|
+
error1(t, "get directives go 10 deep. Looping?");
|
447
|
+
exit(1);
|
448
|
+
}
|
449
|
+
{
|
450
|
+
NEW(input, q);
|
451
|
+
char * file = b_to_s(t->b);
|
452
|
+
symbol * u = get_input(file);
|
453
|
+
if (u == 0) {
|
454
|
+
struct include * r;
|
455
|
+
for (r = t->includes; r; r = r->next) {
|
456
|
+
symbol * b = copy_b(r->b);
|
457
|
+
b = add_to_b(b, SIZE(t->b), t->b);
|
458
|
+
free(file);
|
459
|
+
file = b_to_s(b);
|
460
|
+
u = get_input(file);
|
461
|
+
lose_b(b);
|
462
|
+
if (u != 0) break;
|
463
|
+
}
|
464
|
+
}
|
465
|
+
if (u == 0) {
|
466
|
+
error(t, "Can't get '", SIZE(t->b), t->b, "'");
|
467
|
+
exit(1);
|
468
|
+
}
|
469
|
+
memmove(q, t, sizeof(struct input));
|
470
|
+
t->next = q;
|
471
|
+
t->p = u;
|
472
|
+
t->c = 0;
|
473
|
+
t->file = file;
|
474
|
+
t->file_needs_freeing = true;
|
475
|
+
t->line_number = 1;
|
476
|
+
}
|
477
|
+
p = t->p;
|
478
|
+
continue;
|
479
|
+
case -1:
|
480
|
+
if (t->next) {
|
481
|
+
lose_b(p);
|
482
|
+
{
|
483
|
+
struct input * q = t->next;
|
484
|
+
memmove(t, q, sizeof(struct input)); p = t->p;
|
485
|
+
FREE(q);
|
486
|
+
}
|
487
|
+
t->get_depth--;
|
488
|
+
continue;
|
489
|
+
}
|
490
|
+
/* fall through */
|
491
|
+
default:
|
492
|
+
t->previous_token = t->token;
|
493
|
+
t->token = code;
|
494
|
+
return code;
|
495
|
+
}
|
496
|
+
}
|
497
|
+
}
|
498
|
+
|
499
|
+
extern const char * name_of_token(int code) {
|
500
|
+
int i;
|
501
|
+
for (i = 1; i < vocab->code; i++)
|
502
|
+
if ((vocab + i)->code == code) return (const char *)(vocab + i)->s;
|
503
|
+
switch (code) {
|
504
|
+
case c_mathassign: return "=";
|
505
|
+
case c_name: return "name";
|
506
|
+
case c_number: return "number";
|
507
|
+
case c_literalstring:return "literal";
|
508
|
+
case c_neg: return "neg";
|
509
|
+
case c_grouping: return "grouping";
|
510
|
+
case c_call: return "call";
|
511
|
+
case c_booltest: return "Boolean test";
|
512
|
+
case -2: return "start of text";
|
513
|
+
case -1: return "end of text";
|
514
|
+
default: return "?";
|
515
|
+
}
|
516
|
+
}
|
517
|
+
|
518
|
+
extern void disable_token(struct tokeniser * t, int code) {
|
519
|
+
t->token_disabled[code] = 1;
|
520
|
+
}
|
521
|
+
|
522
|
+
/* Create a tokeniser reading from the symbol buffer p.
 *
 * file is the name used in error messages; the pointer is stored as given
 * and is not freed by close_tokeniser() (file_needs_freeing is false).
 * The tokeniser starts at line 1 with no stringescapes, no string macros,
 * no token held and no tokens disabled.
 */
extern struct tokeniser * create_tokeniser(symbol * p, char * file) {
    NEW(tokeniser, t);

    /* Input state. */
    t->next = 0;
    t->p = p;
    t->c = 0;
    t->file = file;
    t->file_needs_freeing = false;
    t->line_number = 1;
    t->get_depth = 0;

    /* Working buffers and string macro state. */
    t->b = create_b(0);
    t->b2 = create_b(0);
    t->m_start = -1;
    t->m_pairs = 0;
    t->uplusmode = UPLUS_NONE;

    /* Token state: -2 is reported as "start of text". */
    t->token_held = false;
    t->token = -2;
    t->previous_token = -2;
    memset(t->token_disabled, 0, sizeof(t->token_disabled));

    t->error_count = 0;
    return t;
}
|
543
|
+
|
544
|
+
extern void close_tokeniser(struct tokeniser * t) {
|
545
|
+
lose_b(t->b);
|
546
|
+
lose_b(t->b2);
|
547
|
+
{
|
548
|
+
struct m_pair * q = t->m_pairs;
|
549
|
+
while (q) {
|
550
|
+
struct m_pair * q_next = q->next;
|
551
|
+
lose_b(q->name);
|
552
|
+
lose_b(q->value);
|
553
|
+
FREE(q);
|
554
|
+
q = q_next;
|
555
|
+
}
|
556
|
+
}
|
557
|
+
{
|
558
|
+
struct input * q = t->next;
|
559
|
+
while (q) {
|
560
|
+
struct input * q_next = q->next;
|
561
|
+
FREE(q);
|
562
|
+
q = q_next;
|
563
|
+
}
|
564
|
+
}
|
565
|
+
if (t->file_needs_freeing) free(t->file);
|
566
|
+
FREE(t);
|
567
|
+
}
|
@@ -0,0 +1 @@
|
|
1
|
+
*.generated.cs
|