mittens 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +3 -0
- data/Gemfile +7 -0
- data/LICENSE.txt +30 -0
- data/README.md +62 -0
- data/Rakefile +21 -0
- data/ext/mittens/ext.c +96 -0
- data/ext/mittens/extconf.rb +12 -0
- data/lib/mittens/version.rb +3 -0
- data/lib/mittens.rb +7 -0
- data/mittens.gemspec +22 -0
- data/vendor/snowball/.gitignore +26 -0
- data/vendor/snowball/.travis.yml +112 -0
- data/vendor/snowball/AUTHORS +27 -0
- data/vendor/snowball/CONTRIBUTING.rst +216 -0
- data/vendor/snowball/COPYING +29 -0
- data/vendor/snowball/GNUmakefile +742 -0
- data/vendor/snowball/NEWS +754 -0
- data/vendor/snowball/README.rst +37 -0
- data/vendor/snowball/ada/README.md +74 -0
- data/vendor/snowball/ada/generate/generate.adb +83 -0
- data/vendor/snowball/ada/generate.gpr +21 -0
- data/vendor/snowball/ada/src/stemmer.adb +620 -0
- data/vendor/snowball/ada/src/stemmer.ads +219 -0
- data/vendor/snowball/ada/src/stemwords.adb +70 -0
- data/vendor/snowball/ada/stemmer_config.gpr +83 -0
- data/vendor/snowball/ada/stemwords.gpr +21 -0
- data/vendor/snowball/algorithms/arabic.sbl +558 -0
- data/vendor/snowball/algorithms/armenian.sbl +301 -0
- data/vendor/snowball/algorithms/basque.sbl +149 -0
- data/vendor/snowball/algorithms/catalan.sbl +202 -0
- data/vendor/snowball/algorithms/danish.sbl +93 -0
- data/vendor/snowball/algorithms/dutch.sbl +164 -0
- data/vendor/snowball/algorithms/english.sbl +229 -0
- data/vendor/snowball/algorithms/finnish.sbl +197 -0
- data/vendor/snowball/algorithms/french.sbl +254 -0
- data/vendor/snowball/algorithms/german.sbl +139 -0
- data/vendor/snowball/algorithms/german2.sbl +145 -0
- data/vendor/snowball/algorithms/greek.sbl +701 -0
- data/vendor/snowball/algorithms/hindi.sbl +323 -0
- data/vendor/snowball/algorithms/hungarian.sbl +241 -0
- data/vendor/snowball/algorithms/indonesian.sbl +192 -0
- data/vendor/snowball/algorithms/irish.sbl +149 -0
- data/vendor/snowball/algorithms/italian.sbl +202 -0
- data/vendor/snowball/algorithms/kraaij_pohlmann.sbl +240 -0
- data/vendor/snowball/algorithms/lithuanian.sbl +373 -0
- data/vendor/snowball/algorithms/lovins.sbl +208 -0
- data/vendor/snowball/algorithms/nepali.sbl +92 -0
- data/vendor/snowball/algorithms/norwegian.sbl +80 -0
- data/vendor/snowball/algorithms/porter.sbl +139 -0
- data/vendor/snowball/algorithms/portuguese.sbl +218 -0
- data/vendor/snowball/algorithms/romanian.sbl +236 -0
- data/vendor/snowball/algorithms/russian.sbl +221 -0
- data/vendor/snowball/algorithms/serbian.sbl +2379 -0
- data/vendor/snowball/algorithms/spanish.sbl +230 -0
- data/vendor/snowball/algorithms/swedish.sbl +72 -0
- data/vendor/snowball/algorithms/tamil.sbl +405 -0
- data/vendor/snowball/algorithms/turkish.sbl +470 -0
- data/vendor/snowball/algorithms/yiddish.sbl +460 -0
- data/vendor/snowball/charsets/ISO-8859-2.sbl +98 -0
- data/vendor/snowball/charsets/KOI8-R.sbl +74 -0
- data/vendor/snowball/charsets/cp850.sbl +130 -0
- data/vendor/snowball/compiler/analyser.c +1547 -0
- data/vendor/snowball/compiler/driver.c +615 -0
- data/vendor/snowball/compiler/generator.c +1748 -0
- data/vendor/snowball/compiler/generator_ada.c +1702 -0
- data/vendor/snowball/compiler/generator_csharp.c +1322 -0
- data/vendor/snowball/compiler/generator_go.c +1278 -0
- data/vendor/snowball/compiler/generator_java.c +1313 -0
- data/vendor/snowball/compiler/generator_js.c +1316 -0
- data/vendor/snowball/compiler/generator_pascal.c +1387 -0
- data/vendor/snowball/compiler/generator_python.c +1337 -0
- data/vendor/snowball/compiler/generator_rust.c +1295 -0
- data/vendor/snowball/compiler/header.h +418 -0
- data/vendor/snowball/compiler/space.c +286 -0
- data/vendor/snowball/compiler/syswords.h +86 -0
- data/vendor/snowball/compiler/syswords2.h +13 -0
- data/vendor/snowball/compiler/tokeniser.c +567 -0
- data/vendor/snowball/csharp/.gitignore +8 -0
- data/vendor/snowball/csharp/Snowball/Algorithms/.gitignore +1 -0
- data/vendor/snowball/csharp/Snowball/Among.cs +108 -0
- data/vendor/snowball/csharp/Snowball/AssemblyInfo.cs +36 -0
- data/vendor/snowball/csharp/Snowball/Stemmer.cs +660 -0
- data/vendor/snowball/csharp/Stemwords/App.config +6 -0
- data/vendor/snowball/csharp/Stemwords/Program.cs +114 -0
- data/vendor/snowball/doc/TODO +12 -0
- data/vendor/snowball/doc/libstemmer_c_README +148 -0
- data/vendor/snowball/doc/libstemmer_csharp_README +53 -0
- data/vendor/snowball/doc/libstemmer_java_README +67 -0
- data/vendor/snowball/doc/libstemmer_js_README +48 -0
- data/vendor/snowball/doc/libstemmer_python_README +113 -0
- data/vendor/snowball/examples/stemwords.c +204 -0
- data/vendor/snowball/go/README.md +55 -0
- data/vendor/snowball/go/among.go +16 -0
- data/vendor/snowball/go/env.go +403 -0
- data/vendor/snowball/go/stemwords/generate.go +68 -0
- data/vendor/snowball/go/stemwords/main.go +68 -0
- data/vendor/snowball/go/util.go +34 -0
- data/vendor/snowball/iconv.py +50 -0
- data/vendor/snowball/include/libstemmer.h +78 -0
- data/vendor/snowball/java/org/tartarus/snowball/Among.java +29 -0
- data/vendor/snowball/java/org/tartarus/snowball/SnowballProgram.java +381 -0
- data/vendor/snowball/java/org/tartarus/snowball/SnowballStemmer.java +8 -0
- data/vendor/snowball/java/org/tartarus/snowball/TestApp.java +75 -0
- data/vendor/snowball/javascript/base-stemmer.js +294 -0
- data/vendor/snowball/javascript/stemwords.js +106 -0
- data/vendor/snowball/libstemmer/libstemmer_c.in +96 -0
- data/vendor/snowball/libstemmer/mkalgorithms.pl +90 -0
- data/vendor/snowball/libstemmer/mkmodules.pl +267 -0
- data/vendor/snowball/libstemmer/modules.txt +63 -0
- data/vendor/snowball/libstemmer/test.c +34 -0
- data/vendor/snowball/pascal/.gitignore +4 -0
- data/vendor/snowball/pascal/SnowballProgram.pas +430 -0
- data/vendor/snowball/pascal/generate.pl +23 -0
- data/vendor/snowball/pascal/stemwords-template.dpr +78 -0
- data/vendor/snowball/python/MANIFEST.in +7 -0
- data/vendor/snowball/python/create_init.py +54 -0
- data/vendor/snowball/python/setup.cfg +6 -0
- data/vendor/snowball/python/setup.py +81 -0
- data/vendor/snowball/python/snowballstemmer/among.py +13 -0
- data/vendor/snowball/python/snowballstemmer/basestemmer.py +323 -0
- data/vendor/snowball/python/stemwords.py +101 -0
- data/vendor/snowball/python/testapp.py +28 -0
- data/vendor/snowball/runtime/api.c +58 -0
- data/vendor/snowball/runtime/api.h +32 -0
- data/vendor/snowball/runtime/header.h +61 -0
- data/vendor/snowball/runtime/utilities.c +513 -0
- data/vendor/snowball/rust/Cargo.toml +7 -0
- data/vendor/snowball/rust/build.rs +55 -0
- data/vendor/snowball/rust/rust-pre-1.27-compat.patch +30 -0
- data/vendor/snowball/rust/src/main.rs +102 -0
- data/vendor/snowball/rust/src/snowball/algorithms/mod.rs +2 -0
- data/vendor/snowball/rust/src/snowball/among.rs +6 -0
- data/vendor/snowball/rust/src/snowball/mod.rs +6 -0
- data/vendor/snowball/rust/src/snowball/snowball_env.rs +421 -0
- data/vendor/snowball/tests/stemtest.c +95 -0
- metadata +178 -0
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
static const struct system_word vocab[82+1] = {
|
|
2
|
+
{ 0, (const byte *)"", 82+1},
|
|
3
|
+
|
|
4
|
+
{ 1, (const byte *)"$", c_dollar },
|
|
5
|
+
{ 1, (const byte *)"(", c_bra },
|
|
6
|
+
{ 1, (const byte *)")", c_ket },
|
|
7
|
+
{ 1, (const byte *)"*", c_multiply },
|
|
8
|
+
{ 1, (const byte *)"+", c_plus },
|
|
9
|
+
{ 1, (const byte *)"-", c_minus },
|
|
10
|
+
{ 1, (const byte *)"/", c_divide },
|
|
11
|
+
{ 1, (const byte *)"<", c_ls },
|
|
12
|
+
{ 1, (const byte *)"=", c_assign },
|
|
13
|
+
{ 1, (const byte *)">", c_gr },
|
|
14
|
+
{ 1, (const byte *)"?", c_debug },
|
|
15
|
+
{ 1, (const byte *)"[", c_leftslice },
|
|
16
|
+
{ 1, (const byte *)"]", c_rightslice },
|
|
17
|
+
{ 2, (const byte *)"!=", c_ne },
|
|
18
|
+
{ 2, (const byte *)"*=", c_multiplyassign },
|
|
19
|
+
{ 2, (const byte *)"+=", c_plusassign },
|
|
20
|
+
{ 2, (const byte *)"-=", c_minusassign },
|
|
21
|
+
{ 2, (const byte *)"->", c_sliceto },
|
|
22
|
+
{ 2, (const byte *)"/*", c_comment2 },
|
|
23
|
+
{ 2, (const byte *)"//", c_comment1 },
|
|
24
|
+
{ 2, (const byte *)"/=", c_divideassign },
|
|
25
|
+
{ 2, (const byte *)"<+", c_insert },
|
|
26
|
+
{ 2, (const byte *)"<-", c_slicefrom },
|
|
27
|
+
{ 2, (const byte *)"<=", c_le },
|
|
28
|
+
{ 2, (const byte *)"==", c_eq },
|
|
29
|
+
{ 2, (const byte *)"=>", c_assignto },
|
|
30
|
+
{ 2, (const byte *)">=", c_ge },
|
|
31
|
+
{ 2, (const byte *)"as", c_as },
|
|
32
|
+
{ 2, (const byte *)"do", c_do },
|
|
33
|
+
{ 2, (const byte *)"or", c_or },
|
|
34
|
+
{ 3, (const byte *)"and", c_and },
|
|
35
|
+
{ 3, (const byte *)"for", c_for },
|
|
36
|
+
{ 3, (const byte *)"get", c_get },
|
|
37
|
+
{ 3, (const byte *)"hex", c_hex },
|
|
38
|
+
{ 3, (const byte *)"hop", c_hop },
|
|
39
|
+
{ 3, (const byte *)"len", c_len },
|
|
40
|
+
{ 3, (const byte *)"non", c_non },
|
|
41
|
+
{ 3, (const byte *)"not", c_not },
|
|
42
|
+
{ 3, (const byte *)"set", c_set },
|
|
43
|
+
{ 3, (const byte *)"try", c_try },
|
|
44
|
+
{ 4, (const byte *)"fail", c_fail },
|
|
45
|
+
{ 4, (const byte *)"goto", c_goto },
|
|
46
|
+
{ 4, (const byte *)"loop", c_loop },
|
|
47
|
+
{ 4, (const byte *)"next", c_next },
|
|
48
|
+
{ 4, (const byte *)"size", c_size },
|
|
49
|
+
{ 4, (const byte *)"test", c_test },
|
|
50
|
+
{ 4, (const byte *)"true", c_true },
|
|
51
|
+
{ 5, (const byte *)"among", c_among },
|
|
52
|
+
{ 5, (const byte *)"false", c_false },
|
|
53
|
+
{ 5, (const byte *)"lenof", c_lenof },
|
|
54
|
+
{ 5, (const byte *)"limit", c_limit },
|
|
55
|
+
{ 5, (const byte *)"unset", c_unset },
|
|
56
|
+
{ 6, (const byte *)"atmark", c_atmark },
|
|
57
|
+
{ 6, (const byte *)"attach", c_attach },
|
|
58
|
+
{ 6, (const byte *)"cursor", c_cursor },
|
|
59
|
+
{ 6, (const byte *)"define", c_define },
|
|
60
|
+
{ 6, (const byte *)"delete", c_delete },
|
|
61
|
+
{ 6, (const byte *)"gopast", c_gopast },
|
|
62
|
+
{ 6, (const byte *)"insert", c_insert },
|
|
63
|
+
{ 6, (const byte *)"maxint", c_maxint },
|
|
64
|
+
{ 6, (const byte *)"minint", c_minint },
|
|
65
|
+
{ 6, (const byte *)"repeat", c_repeat },
|
|
66
|
+
{ 6, (const byte *)"sizeof", c_sizeof },
|
|
67
|
+
{ 6, (const byte *)"tomark", c_tomark },
|
|
68
|
+
{ 7, (const byte *)"atleast", c_atleast },
|
|
69
|
+
{ 7, (const byte *)"atlimit", c_atlimit },
|
|
70
|
+
{ 7, (const byte *)"decimal", c_decimal },
|
|
71
|
+
{ 7, (const byte *)"reverse", c_reverse },
|
|
72
|
+
{ 7, (const byte *)"setmark", c_setmark },
|
|
73
|
+
{ 7, (const byte *)"strings", c_strings },
|
|
74
|
+
{ 7, (const byte *)"tolimit", c_tolimit },
|
|
75
|
+
{ 8, (const byte *)"booleans", c_booleans },
|
|
76
|
+
{ 8, (const byte *)"integers", c_integers },
|
|
77
|
+
{ 8, (const byte *)"routines", c_routines },
|
|
78
|
+
{ 8, (const byte *)"setlimit", c_setlimit },
|
|
79
|
+
{ 9, (const byte *)"backwards", c_backwards },
|
|
80
|
+
{ 9, (const byte *)"externals", c_externals },
|
|
81
|
+
{ 9, (const byte *)"groupings", c_groupings },
|
|
82
|
+
{ 9, (const byte *)"stringdef", c_stringdef },
|
|
83
|
+
{ 9, (const byte *)"substring", c_substring },
|
|
84
|
+
{ 12, (const byte *)"backwardmode", c_backwardmode },
|
|
85
|
+
{ 13, (const byte *)"stringescapes", c_stringescapes }
|
|
86
|
+
};
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
c_among = 4, c_and, c_as, c_assign, c_assignto, c_atleast,
|
|
2
|
+
c_atlimit, c_atmark, c_attach, c_backwardmode, c_backwards,
|
|
3
|
+
c_booleans, c_bra, c_comment1, c_comment2, c_cursor, c_debug,
|
|
4
|
+
c_decimal, c_define, c_delete, c_divide, c_divideassign, c_do,
|
|
5
|
+
c_dollar, c_eq, c_externals, c_fail, c_false, c_for, c_ge, c_get,
|
|
6
|
+
c_gopast, c_goto, c_gr, c_groupings, c_hex, c_hop, c_insert,
|
|
7
|
+
c_integers, c_ket, c_le, c_leftslice, c_len, c_lenof, c_limit, c_loop,
|
|
8
|
+
c_ls, c_maxint, c_minint, c_minus, c_minusassign, c_multiply,
|
|
9
|
+
c_multiplyassign, c_ne, c_next, c_non, c_not, c_or, c_plus,
|
|
10
|
+
c_plusassign, c_repeat, c_reverse, c_rightslice, c_routines,
|
|
11
|
+
c_set, c_setlimit, c_setmark, c_size, c_sizeof, c_slicefrom,
|
|
12
|
+
c_sliceto, c_stringdef, c_stringescapes, c_strings, c_substring,
|
|
13
|
+
c_test, c_tolimit, c_tomark, c_true, c_try, c_unset,
|
|
@@ -0,0 +1,567 @@
|
|
|
1
|
+
|
|
2
|
+
#include <stdio.h> /* stderr etc */
#include <stdlib.h> /* malloc free */
#include <string.h> /* strlen */
#include <ctype.h> /* isalpha etc */
#include <limits.h> /* INT_MAX */
#include "header.h"
|
|
7
|
+
|
|
8
|
+
struct system_word {
|
|
9
|
+
int s_size; /* size of system word */
|
|
10
|
+
const byte * s; /* pointer to the system word */
|
|
11
|
+
int code; /* its internal code */
|
|
12
|
+
};
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
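/* The table in syswords.h assumes ASCII collating order: its entries are
 * sorted by length, then by byte value, which is the order the binary
 * search in find_word() relies on. */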
|
|
16
|
+
|
|
17
|
+
#include "syswords.h"
|
|
18
|
+
|
|
19
|
+
#define INITIAL_INPUT_BUFFER_SIZE 8192
|
|
20
|
+
|
|
21
|
+
static int hex_to_num(int ch);
|
|
22
|
+
|
|
23
|
+
/* Return the lesser of a and b (b when they are equal). */
static int smaller(int a, int b) {
    if (a < b) return a;
    return b;
}
|
|
24
|
+
|
|
25
|
+
extern symbol * get_input(const char * filename) {
|
|
26
|
+
FILE * input = fopen(filename, "r");
|
|
27
|
+
if (input == 0) { return 0; }
|
|
28
|
+
{
|
|
29
|
+
symbol * u = create_b(INITIAL_INPUT_BUFFER_SIZE);
|
|
30
|
+
int size = 0;
|
|
31
|
+
while (true) {
|
|
32
|
+
int ch = getc(input);
|
|
33
|
+
if (ch == EOF) break;
|
|
34
|
+
if (size >= CAPACITY(u)) u = increase_capacity(u, size);
|
|
35
|
+
u[size++] = ch;
|
|
36
|
+
}
|
|
37
|
+
fclose(input);
|
|
38
|
+
SIZE(u) = size;
|
|
39
|
+
return u;
|
|
40
|
+
}
|
|
41
|
+
}
|
|
42
|
+
|
|
43
|
+
/*
 * Report a tokeniser error on stderr.
 *
 * The message is "<file>:<line>: " followed by s1, then the first n
 * symbols of p, then s2, then a newline.  s1, p and s2 may each be null,
 * in which case that part is omitted.  t->error_count is incremented.
 *
 * If 20 errors have already been reported, "... etc" is printed instead
 * and the program exits with status 1.
 */
static void error(struct tokeniser * t, const char * s1, int n, symbol * p, const char * s2) {
    if (t->error_count == 20) {
        fprintf(stderr, "... etc\n");
        exit(1);
    }

    fprintf(stderr, "%s:%d: ", t->file, t->line_number);
    if (s1 != 0) fputs(s1, stderr);
    if (p != 0) {
        int k = 0;
        while (k < n) {
            fprintf(stderr, "%c", p[k]);
            k++;
        }
    }
    if (s2 != 0) fputs(s2, stderr);
    fputc('\n', stderr);

    t->error_count++;
}
|
|
55
|
+
|
|
56
|
+
/* Report the fixed message s as an error at the current file and line. */
static void error1(struct tokeniser * t, const char * s) {
    error(t, s, 0, NULL, NULL);
}
|
|
59
|
+
|
|
60
|
+
/*
 * Report that the input ended unexpectedly; s names the construct that
 * was being read and is appended to the message.
 */
static void error2(struct tokeniser * t, const char * s) {
    error(t, "unexpected end of text after ", 0, NULL, s);
}
|
|
63
|
+
|
|
64
|
+
/*
 * Compare the m-symbol word at p with the n-byte word at q.
 *
 * Words of different length are ordered by length alone; words of equal
 * length are ordered by their first differing element.  Returns a negative
 * value, zero or a positive value when p sorts before, equal to or after q.
 */
static int compare_words(int m, symbol * p, int n, const byte * q) {
    int idx;

    if (m != n) return m - n;

    for (idx = 0; idx < n; idx++) {
        if (p[idx] != q[idx]) return p[idx] - q[idx];
    }
    return 0;
}
|
|
74
|
+
|
|
75
|
+
/*
 * Look up the n-symbol word at p in the system-word table by binary
 * search.
 *
 * The search keeps the half-open invariant "entry lo sorts before the
 * word, entry hi (if any) sorts after it"; lo starts on the sentinel at
 * index 0 and hi starts at the entry count stored in the sentinel's code.
 *
 * Returns the word's token code, or -1 if it is not a system word.
 */
static int find_word(int n, symbol * p) {
    int lo = 0;
    int hi = vocab[0].code;

    do {
        int mid = lo + (hi - lo)/2;
        const struct system_word * entry = &vocab[mid];
        int order = compare_words(n, p, entry->s_size, entry->s);

        if (order == 0) return entry->code;
        if (order < 0) {
            hi = mid;
        } else {
            lo = mid;
        }
    } while (hi - lo != 1);

    return -1;
}
|
|
86
|
+
|
|
87
|
+
/*
 * Convert the n decimal digits at p to an int.
 *
 * The only caller visible in this file (next_token) passes a run of
 * isdigit() characters, so every p[i] is assumed to be '0'..'9'.
 *
 * A value too large for an int saturates at INT_MAX rather than
 * overflowing, since signed overflow is undefined behaviour in C.
 */
static int get_number(int n, symbol * p) {
    int x = 0;
    int i;
    for (i = 0; i < n; i++) {
        int digit = p[i] - '0';
        /* True exactly when 10*x + digit would exceed INT_MAX. */
        if (x > (INT_MAX - digit) / 10) return INT_MAX;
        x = 10*x + digit;
    }
    return x;
}
|
|
92
|
+
|
|
93
|
+
static int eq_s(struct tokeniser * t, const char * s) {
|
|
94
|
+
int l = strlen(s);
|
|
95
|
+
if (SIZE(t->p) - t->c < l) return false;
|
|
96
|
+
{
|
|
97
|
+
int i;
|
|
98
|
+
for (i = 0; i < l; i++) if (t->p[t->c + i] != s[i]) return false;
|
|
99
|
+
}
|
|
100
|
+
t->c += l; return true;
|
|
101
|
+
}
|
|
102
|
+
|
|
103
|
+
static int white_space(struct tokeniser * t, int ch) {
|
|
104
|
+
switch (ch) {
|
|
105
|
+
case '\n':
|
|
106
|
+
t->line_number++;
|
|
107
|
+
/* fall through */
|
|
108
|
+
case '\r':
|
|
109
|
+
case '\t':
|
|
110
|
+
case ' ':
|
|
111
|
+
return true;
|
|
112
|
+
}
|
|
113
|
+
return false;
|
|
114
|
+
}
|
|
115
|
+
|
|
116
|
+
/*
 * Search the list t->m_pairs for an entry whose name is exactly the n
 * symbols at p.
 *
 * Returns that entry's value, or 0 if no entry matches.  The list is
 * searched from its head, so the first matching entry wins.
 */
static symbol * find_in_m(struct tokeniser * t, int n, symbol * p) {
    struct m_pair * pair = t->m_pairs;

    while (pair) {
        symbol * name = pair->name;
        if (n == SIZE(name) && memcmp(name, p, n * sizeof(symbol)) == 0) {
            return pair->value;
        }
        pair = pair->next;
    }
    return 0;
}
|
|
124
|
+
|
|
125
|
+
static int read_literal_string(struct tokeniser * t, int c) {
|
|
126
|
+
symbol * p = t->p;
|
|
127
|
+
int ch;
|
|
128
|
+
SIZE(t->b) = 0;
|
|
129
|
+
while (true) {
|
|
130
|
+
if (c >= SIZE(p)) { error2(t, "'"); return c; }
|
|
131
|
+
ch = p[c];
|
|
132
|
+
if (ch == '\n') { error1(t, "string not terminated"); return c; }
|
|
133
|
+
c++;
|
|
134
|
+
if (ch == t->m_start) {
|
|
135
|
+
/* Inside insert characters. */
|
|
136
|
+
int c0 = c;
|
|
137
|
+
int newlines = false; /* no newlines as yet */
|
|
138
|
+
int black_found = false; /* no printing chars as yet */
|
|
139
|
+
while (true) {
|
|
140
|
+
if (c >= SIZE(p)) { error2(t, "'"); return c; }
|
|
141
|
+
ch = p[c]; c++;
|
|
142
|
+
if (ch == t->m_end) break;
|
|
143
|
+
if (!white_space(t, ch)) black_found = true;
|
|
144
|
+
if (ch == '\n') newlines = true;
|
|
145
|
+
if (newlines && black_found) {
|
|
146
|
+
error1(t, "string not terminated");
|
|
147
|
+
return c;
|
|
148
|
+
}
|
|
149
|
+
}
|
|
150
|
+
if (!newlines) {
|
|
151
|
+
int n = c - c0 - 1; /* macro size */
|
|
152
|
+
int firstch = p[c0];
|
|
153
|
+
symbol * q = find_in_m(t, n, p + c0);
|
|
154
|
+
if (q == 0) {
|
|
155
|
+
if (n == 1 && (firstch == '\'' || firstch == t->m_start))
|
|
156
|
+
t->b = add_to_b(t->b, 1, p + c0);
|
|
157
|
+
else if (n >= 3 && firstch == 'U' && p[c0 + 1] == '+') {
|
|
158
|
+
int codepoint = 0;
|
|
159
|
+
int x;
|
|
160
|
+
if (t->uplusmode == UPLUS_DEFINED) {
|
|
161
|
+
/* See if found with xxxx upper-cased. */
|
|
162
|
+
symbol * uc = create_b(n);
|
|
163
|
+
int i;
|
|
164
|
+
for (i = 0; i != n; ++i) {
|
|
165
|
+
uc[i] = toupper(p[c0 + i]);
|
|
166
|
+
}
|
|
167
|
+
q = find_in_m(t, n, uc);
|
|
168
|
+
lose_b(uc);
|
|
169
|
+
if (q != 0) {
|
|
170
|
+
t->b = add_to_b(t->b, SIZE(q), q);
|
|
171
|
+
continue;
|
|
172
|
+
}
|
|
173
|
+
error1(t, "Some U+xxxx stringdefs seen but not this one");
|
|
174
|
+
} else {
|
|
175
|
+
t->uplusmode = UPLUS_UNICODE;
|
|
176
|
+
}
|
|
177
|
+
for (x = c0 + 2; x != c - 1; ++x) {
|
|
178
|
+
int hex = hex_to_num(p[x]);
|
|
179
|
+
if (hex < 0) {
|
|
180
|
+
error1(t, "Bad hex digit following U+");
|
|
181
|
+
break;
|
|
182
|
+
}
|
|
183
|
+
codepoint = (codepoint << 4) | hex;
|
|
184
|
+
}
|
|
185
|
+
if (t->encoding == ENC_UTF8) {
|
|
186
|
+
if (codepoint < 0 || codepoint > 0x01ffff) {
|
|
187
|
+
error1(t, "character values exceed 0x01ffff");
|
|
188
|
+
}
|
|
189
|
+
/* Ensure there's enough space for a max length
|
|
190
|
+
* UTF-8 sequence. */
|
|
191
|
+
if (CAPACITY(t->b) < SIZE(t->b) + 3) {
|
|
192
|
+
t->b = increase_capacity(t->b, 3);
|
|
193
|
+
}
|
|
194
|
+
SIZE(t->b) += put_utf8(codepoint, t->b + SIZE(t->b));
|
|
195
|
+
} else {
|
|
196
|
+
symbol sym;
|
|
197
|
+
if (t->encoding == ENC_SINGLEBYTE) {
|
|
198
|
+
/* Only ISO-8859-1 is handled this way - for
|
|
199
|
+
* other single-byte character sets you need
|
|
200
|
+
* stringdef all the U+xxxx codes you use
|
|
201
|
+
* like - e.g.:
|
|
202
|
+
*
|
|
203
|
+
* stringdef U+0171 hex 'FB'
|
|
204
|
+
*/
|
|
205
|
+
if (codepoint < 0 || codepoint > 0xff) {
|
|
206
|
+
error1(t, "character values exceed 256");
|
|
207
|
+
}
|
|
208
|
+
} else {
|
|
209
|
+
if (codepoint < 0 || codepoint > 0xffff) {
|
|
210
|
+
error1(t, "character values exceed 64K");
|
|
211
|
+
}
|
|
212
|
+
}
|
|
213
|
+
sym = codepoint;
|
|
214
|
+
t->b = add_to_b(t->b, 1, &sym);
|
|
215
|
+
}
|
|
216
|
+
} else
|
|
217
|
+
error(t, "string macro '", n, p + c0, "' undeclared");
|
|
218
|
+
} else
|
|
219
|
+
t->b = add_to_b(t->b, SIZE(q), q);
|
|
220
|
+
}
|
|
221
|
+
} else {
|
|
222
|
+
if (ch == '\'') return c;
|
|
223
|
+
if (ch < 0 || ch >= 0x80) {
|
|
224
|
+
if (t->encoding != ENC_WIDECHARS) {
|
|
225
|
+
/* We don't really want people using non-ASCII literal
|
|
226
|
+
* strings, but historically it's worked for single-byte
|
|
227
|
+
* and UTF-8 if the source encoding matches what the
|
|
228
|
+
* generated stemmer works in and it seems unfair to just
|
|
229
|
+
* suddenly make this a hard error.`
|
|
230
|
+
*/
|
|
231
|
+
fprintf(stderr,
|
|
232
|
+
"%s:%d: warning: Non-ASCII literal strings aren't "
|
|
233
|
+
"portable - use stringdef instead\n",
|
|
234
|
+
t->file, t->line_number);
|
|
235
|
+
} else {
|
|
236
|
+
error1(t, "Non-ASCII literal strings aren't "
|
|
237
|
+
"portable - use stringdef instead");
|
|
238
|
+
}
|
|
239
|
+
}
|
|
240
|
+
t->b = add_to_b(t->b, 1, p + c - 1);
|
|
241
|
+
}
|
|
242
|
+
}
|
|
243
|
+
}
|
|
244
|
+
|
|
245
|
+
static int next_token(struct tokeniser * t) {
|
|
246
|
+
symbol * p = t->p;
|
|
247
|
+
int c = t->c;
|
|
248
|
+
int ch;
|
|
249
|
+
int code = -1;
|
|
250
|
+
while (true) {
|
|
251
|
+
if (c >= SIZE(p)) { t->c = c; return -1; }
|
|
252
|
+
ch = p[c];
|
|
253
|
+
if (white_space(t, ch)) { c++; continue; }
|
|
254
|
+
if (isalpha(ch)) {
|
|
255
|
+
int c0 = c;
|
|
256
|
+
while (c < SIZE(p) && (isalnum(p[c]) || p[c] == '_')) c++;
|
|
257
|
+
code = find_word(c - c0, p + c0);
|
|
258
|
+
if (code < 0 || t->token_disabled[code]) {
|
|
259
|
+
t->b = move_to_b(t->b, c - c0, p + c0);
|
|
260
|
+
code = c_name;
|
|
261
|
+
}
|
|
262
|
+
} else
|
|
263
|
+
if (isdigit(ch)) {
|
|
264
|
+
int c0 = c;
|
|
265
|
+
while (c < SIZE(p) && isdigit(p[c])) c++;
|
|
266
|
+
t->number = get_number(c - c0, p + c0);
|
|
267
|
+
code = c_number;
|
|
268
|
+
} else
|
|
269
|
+
if (ch == '\'') {
|
|
270
|
+
c = read_literal_string(t, c + 1);
|
|
271
|
+
code = c_literalstring;
|
|
272
|
+
} else
|
|
273
|
+
{
|
|
274
|
+
int lim = smaller(2, SIZE(p) - c);
|
|
275
|
+
int i;
|
|
276
|
+
for (i = lim; i > 0; i--) {
|
|
277
|
+
code = find_word(i, p + c);
|
|
278
|
+
if (code >= 0) { c += i; break; }
|
|
279
|
+
}
|
|
280
|
+
}
|
|
281
|
+
if (code >= 0) {
|
|
282
|
+
t->c = c;
|
|
283
|
+
return code;
|
|
284
|
+
}
|
|
285
|
+
error(t, "'", 1, p + c, "' unknown");
|
|
286
|
+
c++;
|
|
287
|
+
continue;
|
|
288
|
+
}
|
|
289
|
+
}
|
|
290
|
+
|
|
291
|
+
static int next_char(struct tokeniser * t) {
|
|
292
|
+
if (t->c >= SIZE(t->p)) return -1;
|
|
293
|
+
return t->p[t->c++];
|
|
294
|
+
}
|
|
295
|
+
|
|
296
|
+
/*
 * Skip white space and return the next non-white-space symbol, consuming
 * it; returns -1 at end of input.
 */
static int next_real_char(struct tokeniser * t) {
    int ch;
    do {
        ch = next_char(t);
    } while (white_space(t, ch));
    return ch;
}
|
|
302
|
+
|
|
303
|
+
static void read_chars(struct tokeniser * t) {
|
|
304
|
+
int ch = next_real_char(t);
|
|
305
|
+
if (ch < 0) { error2(t, "stringdef"); return; }
|
|
306
|
+
{
|
|
307
|
+
int c0 = t->c-1;
|
|
308
|
+
while (true) {
|
|
309
|
+
ch = next_char(t);
|
|
310
|
+
if (white_space(t, ch) || ch < 0) break;
|
|
311
|
+
}
|
|
312
|
+
t->b2 = move_to_b(t->b2, t->c - c0 - 1, t->p + c0);
|
|
313
|
+
}
|
|
314
|
+
}
|
|
315
|
+
|
|
316
|
+
/* Return the value (0-9) of the decimal digit ch, or -1 if ch is not one. */
static int decimal_to_num(int ch) {
    if (ch < '0' || ch > '9') return -1;
    return ch - '0';
}
|
|
320
|
+
|
|
321
|
+
/*
 * Return the value (0-15) of the hexadecimal digit ch, accepting both
 * upper and lower case letters, or -1 if ch is not a hex digit.
 */
static int hex_to_num(int ch) {
    if (ch >= '0' && ch <= '9') return ch - '0';
    if (ch >= 'a' && ch <= 'f') return 10 + (ch - 'a');
    if (ch >= 'A' && ch <= 'F') return 10 + (ch - 'A');
    return -1;
}
|
|
327
|
+
|
|
328
|
+
/*
 * Convert, in place, a string of space-separated numbers written in the
 * given base (10, otherwise treated as hex) into the characters they
 * denote.
 *
 * Each number becomes one symbol, or a UTF-8 sequence when t->encoding is
 * ENC_UTF8.  The output is written over the start of p and SIZE(p) is set
 * to the output length.  On a bad digit or an out-of-range value an error
 * is reported and the function returns early without updating SIZE(p).
 */
static void convert_numeric_string(struct tokeniser * t, symbol * p, int base) {
    int in = 0;   /* read position */
    int out = 0;  /* write position */

    for (;;) {
        int value = 0;

        /* Skip the separators before the next number. */
        while (in < SIZE(p) && p[in] == ' ') in++;
        if (in == SIZE(p)) break;

        /* Accumulate digits up to the next space or the end. */
        for (; in != SIZE(p) && p[in] != ' '; in++) {
            int digit;
            if (base == 10) {
                digit = decimal_to_num(p[in]);
                if (digit < 0) {
                    error1(t, "decimal string contains non-digits");
                    return;
                }
            } else {
                digit = hex_to_num(p[in]);
                if (digit < 0) {
                    error1(t, "hex string contains non-hex characters");
                    return;
                }
            }
            value = base * value + digit;
        }

        if (t->encoding == ENC_SINGLEBYTE) {
            if (value < 0 || value > 0xff) {
                error1(t, "character values exceed 256");
                return;
            }
        } else {
            if (value < 0 || value > 0xffff) {
                error1(t, "character values exceed 64K");
                return;
            }
        }

        if (t->encoding == ENC_UTF8) {
            out += put_utf8(value, p + out);
        } else {
            p[out++] = value;
        }
    }

    SIZE(p) = out;
}
|
|
373
|
+
|
|
374
|
+
extern int read_token(struct tokeniser * t) {
|
|
375
|
+
symbol * p = t->p;
|
|
376
|
+
int held = t->token_held;
|
|
377
|
+
t->token_held = false;
|
|
378
|
+
if (held) return t->token;
|
|
379
|
+
while (true) {
|
|
380
|
+
int code = next_token(t);
|
|
381
|
+
switch (code) {
|
|
382
|
+
case c_comment1: /* slash-slash comment */
|
|
383
|
+
while (t->c < SIZE(p) && p[t->c] != '\n') t->c++;
|
|
384
|
+
continue;
|
|
385
|
+
case c_comment2: /* slash-star comment */
|
|
386
|
+
while (true) {
|
|
387
|
+
if (t->c >= SIZE(p)) {
|
|
388
|
+
error1(t, "/* comment not terminated");
|
|
389
|
+
t->token = -1;
|
|
390
|
+
return -1;
|
|
391
|
+
}
|
|
392
|
+
if (p[t->c] == '\n') t->line_number++;
|
|
393
|
+
if (eq_s(t, "*/")) break;
|
|
394
|
+
t->c++;
|
|
395
|
+
}
|
|
396
|
+
continue;
|
|
397
|
+
case c_stringescapes: {
|
|
398
|
+
int ch1 = next_real_char(t);
|
|
399
|
+
int ch2 = next_real_char(t);
|
|
400
|
+
if (ch2 < 0) {
|
|
401
|
+
error2(t, "stringescapes");
|
|
402
|
+
continue;
|
|
403
|
+
}
|
|
404
|
+
if (ch1 == '\'') {
|
|
405
|
+
error1(t, "first stringescape cannot be '");
|
|
406
|
+
continue;
|
|
407
|
+
}
|
|
408
|
+
t->m_start = ch1;
|
|
409
|
+
t->m_end = ch2;
|
|
410
|
+
continue;
|
|
411
|
+
}
|
|
412
|
+
case c_stringdef: {
|
|
413
|
+
int base = 0;
|
|
414
|
+
read_chars(t);
|
|
415
|
+
code = read_token(t);
|
|
416
|
+
if (code == c_hex) { base = 16; code = read_token(t); } else
|
|
417
|
+
if (code == c_decimal) { base = 10; code = read_token(t); }
|
|
418
|
+
if (code != c_literalstring) {
|
|
419
|
+
error1(t, "string omitted after stringdef");
|
|
420
|
+
continue;
|
|
421
|
+
}
|
|
422
|
+
if (base > 0) convert_numeric_string(t, t->b, base);
|
|
423
|
+
{ NEW(m_pair, q);
|
|
424
|
+
q->next = t->m_pairs;
|
|
425
|
+
q->name = copy_b(t->b2);
|
|
426
|
+
q->value = copy_b(t->b);
|
|
427
|
+
t->m_pairs = q;
|
|
428
|
+
if (t->uplusmode != UPLUS_DEFINED &&
|
|
429
|
+
(SIZE(t->b2) >= 3 && t->b2[0] == 'U' && t->b2[1] == '+')) {
|
|
430
|
+
if (t->uplusmode == UPLUS_UNICODE) {
|
|
431
|
+
error1(t, "U+xxxx already used with implicit meaning");
|
|
432
|
+
} else {
|
|
433
|
+
t->uplusmode = UPLUS_DEFINED;
|
|
434
|
+
}
|
|
435
|
+
}
|
|
436
|
+
}
|
|
437
|
+
continue;
|
|
438
|
+
}
|
|
439
|
+
case c_get:
|
|
440
|
+
code = read_token(t);
|
|
441
|
+
if (code != c_literalstring) {
|
|
442
|
+
error1(t, "string omitted after get"); continue;
|
|
443
|
+
}
|
|
444
|
+
t->get_depth++;
|
|
445
|
+
if (t->get_depth > 10) {
|
|
446
|
+
error1(t, "get directives go 10 deep. Looping?");
|
|
447
|
+
exit(1);
|
|
448
|
+
}
|
|
449
|
+
{
|
|
450
|
+
NEW(input, q);
|
|
451
|
+
char * file = b_to_s(t->b);
|
|
452
|
+
symbol * u = get_input(file);
|
|
453
|
+
if (u == 0) {
|
|
454
|
+
struct include * r;
|
|
455
|
+
for (r = t->includes; r; r = r->next) {
|
|
456
|
+
symbol * b = copy_b(r->b);
|
|
457
|
+
b = add_to_b(b, SIZE(t->b), t->b);
|
|
458
|
+
free(file);
|
|
459
|
+
file = b_to_s(b);
|
|
460
|
+
u = get_input(file);
|
|
461
|
+
lose_b(b);
|
|
462
|
+
if (u != 0) break;
|
|
463
|
+
}
|
|
464
|
+
}
|
|
465
|
+
if (u == 0) {
|
|
466
|
+
error(t, "Can't get '", SIZE(t->b), t->b, "'");
|
|
467
|
+
exit(1);
|
|
468
|
+
}
|
|
469
|
+
memmove(q, t, sizeof(struct input));
|
|
470
|
+
t->next = q;
|
|
471
|
+
t->p = u;
|
|
472
|
+
t->c = 0;
|
|
473
|
+
t->file = file;
|
|
474
|
+
t->file_needs_freeing = true;
|
|
475
|
+
t->line_number = 1;
|
|
476
|
+
}
|
|
477
|
+
p = t->p;
|
|
478
|
+
continue;
|
|
479
|
+
case -1:
|
|
480
|
+
if (t->next) {
|
|
481
|
+
lose_b(p);
|
|
482
|
+
{
|
|
483
|
+
struct input * q = t->next;
|
|
484
|
+
memmove(t, q, sizeof(struct input)); p = t->p;
|
|
485
|
+
FREE(q);
|
|
486
|
+
}
|
|
487
|
+
t->get_depth--;
|
|
488
|
+
continue;
|
|
489
|
+
}
|
|
490
|
+
/* fall through */
|
|
491
|
+
default:
|
|
492
|
+
t->previous_token = t->token;
|
|
493
|
+
t->token = code;
|
|
494
|
+
return code;
|
|
495
|
+
}
|
|
496
|
+
}
|
|
497
|
+
}
|
|
498
|
+
|
|
499
|
+
extern const char * name_of_token(int code) {
|
|
500
|
+
int i;
|
|
501
|
+
for (i = 1; i < vocab->code; i++)
|
|
502
|
+
if ((vocab + i)->code == code) return (const char *)(vocab + i)->s;
|
|
503
|
+
switch (code) {
|
|
504
|
+
case c_mathassign: return "=";
|
|
505
|
+
case c_name: return "name";
|
|
506
|
+
case c_number: return "number";
|
|
507
|
+
case c_literalstring:return "literal";
|
|
508
|
+
case c_neg: return "neg";
|
|
509
|
+
case c_grouping: return "grouping";
|
|
510
|
+
case c_call: return "call";
|
|
511
|
+
case c_booltest: return "Boolean test";
|
|
512
|
+
case -2: return "start of text";
|
|
513
|
+
case -1: return "end of text";
|
|
514
|
+
default: return "?";
|
|
515
|
+
}
|
|
516
|
+
}
|
|
517
|
+
|
|
518
|
+
extern void disable_token(struct tokeniser * t, int code) {
|
|
519
|
+
t->token_disabled[code] = 1;
|
|
520
|
+
}
|
|
521
|
+
|
|
522
|
+
/*
 * Create a tokeniser reading from the symbol buffer p, which came from
 * the file named `file` (used in error messages; not copied, and not
 * freed by the tokeniser).
 *
 * The tokeniser starts at line 1 with no string escapes, no stringdefs,
 * no disabled tokens and no held token.  t->encoding, t->includes and
 * t->number are not set here.
 * NOTE(review): presumably the caller assigns encoding and includes before
 * the first read_token() - confirm against the caller.
 */
extern struct tokeniser * create_tokeniser(symbol * p, char * file) {
    NEW(tokeniser, t);
    t->next = 0;
    t->p = p;
    t->c = 0;
    t->file = file;
    t->file_needs_freeing = false;
    t->line_number = 1;
    t->b = create_b(0);
    t->b2 = create_b(0);
    /* -1 matches no input symbol, so escapes are off until a
     * `stringescapes` directive sets both.  m_end is initialised as well
     * so that it never holds an indeterminate value. */
    t->m_start = -1;
    t->m_end = -1;
    t->m_pairs = 0;
    t->get_depth = 0;
    t->error_count = 0;
    t->token_held = false;
    t->token = -2;           /* name_of_token() calls -2 "start of text" */
    t->previous_token = -2;
    t->uplusmode = UPLUS_NONE;
    memset(t->token_disabled, 0, sizeof(t->token_disabled));
    return t;
}
|
|
543
|
+
|
|
544
|
+
extern void close_tokeniser(struct tokeniser * t) {
|
|
545
|
+
lose_b(t->b);
|
|
546
|
+
lose_b(t->b2);
|
|
547
|
+
{
|
|
548
|
+
struct m_pair * q = t->m_pairs;
|
|
549
|
+
while (q) {
|
|
550
|
+
struct m_pair * q_next = q->next;
|
|
551
|
+
lose_b(q->name);
|
|
552
|
+
lose_b(q->value);
|
|
553
|
+
FREE(q);
|
|
554
|
+
q = q_next;
|
|
555
|
+
}
|
|
556
|
+
}
|
|
557
|
+
{
|
|
558
|
+
struct input * q = t->next;
|
|
559
|
+
while (q) {
|
|
560
|
+
struct input * q_next = q->next;
|
|
561
|
+
FREE(q);
|
|
562
|
+
q = q_next;
|
|
563
|
+
}
|
|
564
|
+
}
|
|
565
|
+
if (t->file_needs_freeing) free(t->file);
|
|
566
|
+
FREE(t);
|
|
567
|
+
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
*.generated.cs
|