ruco-cpp 0.2.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/Rakefile +17 -0
- data/bin/console +14 -0
- data/bin/ruco +30 -0
- data/bin/setup +7 -0
- data/data/ruco/Parser.frame +359 -0
- data/data/ruco/Scanner.frame +896 -0
- data/data/ruco/picojson/Changes +14 -0
- data/data/ruco/picojson/LICENSE +25 -0
- data/data/ruco/picojson/Makefile +8 -0
- data/data/ruco/picojson/README.mkdn +183 -0
- data/data/ruco/picojson/examples/github-issues.cc +110 -0
- data/data/ruco/picojson/examples/iostream.cc +70 -0
- data/data/ruco/picojson/examples/streaming.cc +76 -0
- data/data/ruco/picojson/picojson.h +1299 -0
- data/ext/cocor/Action.cpp +81 -0
- data/ext/cocor/Action.h +59 -0
- data/ext/cocor/ArrayList.cpp +79 -0
- data/ext/cocor/ArrayList.h +52 -0
- data/ext/cocor/BitArray.cpp +156 -0
- data/ext/cocor/BitArray.h +68 -0
- data/ext/cocor/CharClass.cpp +42 -0
- data/ext/cocor/CharClass.h +48 -0
- data/ext/cocor/CharSet.cpp +166 -0
- data/ext/cocor/CharSet.h +68 -0
- data/ext/cocor/Coco.atg +528 -0
- data/ext/cocor/Coco.cpp +173 -0
- data/ext/cocor/Comment.cpp +45 -0
- data/ext/cocor/Comment.h +51 -0
- data/ext/cocor/Copyright.frame +27 -0
- data/ext/cocor/DFA.cpp +865 -0
- data/ext/cocor/DFA.h +132 -0
- data/ext/cocor/Generator.cpp +182 -0
- data/ext/cocor/Generator.h +61 -0
- data/ext/cocor/Graph.h +59 -0
- data/ext/cocor/HashTable.cpp +115 -0
- data/ext/cocor/HashTable.h +84 -0
- data/ext/cocor/Makefile +11 -0
- data/ext/cocor/Melted.cpp +39 -0
- data/ext/cocor/Melted.h +51 -0
- data/ext/cocor/Node.cpp +69 -0
- data/ext/cocor/Node.h +86 -0
- data/ext/cocor/Parser.cpp +925 -0
- data/ext/cocor/Parser.frame +326 -0
- data/ext/cocor/Parser.h +153 -0
- data/ext/cocor/ParserGen.cpp +486 -0
- data/ext/cocor/ParserGen.h +99 -0
- data/ext/cocor/Position.cpp +37 -0
- data/ext/cocor/Position.h +46 -0
- data/ext/cocor/README.md +12 -0
- data/ext/cocor/Scanner.cpp +833 -0
- data/ext/cocor/Scanner.frame +897 -0
- data/ext/cocor/Scanner.h +291 -0
- data/ext/cocor/Sets.h +84 -0
- data/ext/cocor/SortedList.cpp +141 -0
- data/ext/cocor/SortedList.h +68 -0
- data/ext/cocor/State.cpp +77 -0
- data/ext/cocor/State.h +55 -0
- data/ext/cocor/StringBuilder.cpp +88 -0
- data/ext/cocor/StringBuilder.h +29 -0
- data/ext/cocor/Symbol.cpp +61 -0
- data/ext/cocor/Symbol.h +70 -0
- data/ext/cocor/Tab.cpp +1248 -0
- data/ext/cocor/Tab.h +245 -0
- data/ext/cocor/Target.cpp +41 -0
- data/ext/cocor/Target.h +48 -0
- data/ext/cocor/build.bat +3 -0
- data/ext/cocor/build.sh +4 -0
- data/ext/cocor/coc.bat +1 -0
- data/ext/cocor/coc.sh +2 -0
- data/ext/cocor/cocor_ruby_ext.cpp +124 -0
- data/ext/cocor/cygBuild.bat +1 -0
- data/ext/cocor/extconf.rb +5 -0
- data/ext/cocor/mingwbuild.bat +2 -0
- data/ext/cocor/mkmf.log +57 -0
- data/ext/cocor/zipsources.bat +1 -0
- data/lib/cocor.rb +14 -0
- data/lib/ruco/version.rb +3 -0
- data/lib/ruco.rb +728 -0
- metadata +195 -0
@@ -0,0 +1,833 @@
|
|
1
|
+
/*----------------------------------------------------------------------
|
2
|
+
Compiler Generator Coco/R,
|
3
|
+
Copyright (c) 1990, 2004 Hanspeter Moessenboeck, University of Linz
|
4
|
+
extended by M. Loeberbauer & A. Woess, Univ. of Linz
|
5
|
+
ported to C++ by Csaba Balazs, University of Szeged
|
6
|
+
with improvements by Pat Terry, Rhodes University
|
7
|
+
|
8
|
+
This program is free software; you can redistribute it and/or modify it
|
9
|
+
under the terms of the GNU General Public License as published by the
|
10
|
+
Free Software Foundation; either version 2, or (at your option) any
|
11
|
+
later version.
|
12
|
+
|
13
|
+
This program is distributed in the hope that it will be useful, but
|
14
|
+
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
15
|
+
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
16
|
+
for more details.
|
17
|
+
|
18
|
+
You should have received a copy of the GNU General Public License along
|
19
|
+
with this program; if not, write to the Free Software Foundation, Inc.,
|
20
|
+
59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
21
|
+
|
22
|
+
As an exception, it is allowed to write an extension of Coco/R that is
|
23
|
+
used as a plugin in non-free software.
|
24
|
+
|
25
|
+
If not otherwise stated, any source code generated by Coco/R (other than
|
26
|
+
Coco/R itself) does not fall under the GNU General Public License.
|
27
|
+
-----------------------------------------------------------------------*/
|
28
|
+
|
29
|
+
|
30
|
+
#include <memory.h>
|
31
|
+
#include <string.h>
|
32
|
+
#include "Scanner.h"
|
33
|
+
|
34
|
+
namespace Coco {
|
35
|
+
|
36
|
+
|
37
|
+
|
38
|
+
// string handling, wide character
|
39
|
+
|
40
|
+
|
41
|
+
wchar_t* coco_string_create(const wchar_t* value) {
|
42
|
+
return coco_string_create(value, 0);
|
43
|
+
}
|
44
|
+
|
45
|
+
wchar_t* coco_string_create(const wchar_t *value, int startIndex) {
|
46
|
+
int valueLen = 0;
|
47
|
+
int len = 0;
|
48
|
+
|
49
|
+
if (value) {
|
50
|
+
valueLen = wcslen(value);
|
51
|
+
len = valueLen - startIndex;
|
52
|
+
}
|
53
|
+
|
54
|
+
return coco_string_create(value, startIndex, len);
|
55
|
+
}
|
56
|
+
|
57
|
+
// Creates a new NUL-terminated string holding `length` characters of `value`,
// beginning at `startIndex`.
//   value      - source string; NULL yields an empty string
//   startIndex - index of the first character to copy
//   length     - number of characters to copy; values <= 0 yield an empty string
// Returns a buffer allocated with new[] (release it with delete []).
wchar_t* coco_string_create(const wchar_t *value, int startIndex, int length) {
	// Copy only when there is a source and a positive amount to copy, so a
	// NULL `value` is never indexed or handed to wcsncpy.
	int len = (value && length > 0) ? length : 0;
	wchar_t* data = new wchar_t[len + 1];

	if (len > 0) { wcsncpy(data, value + startIndex, len); }
	data[len] = 0;

	return data;
}
|
68
|
+
|
69
|
+
// Returns a newly allocated copy of `data` in which the letters 'a'..'z' are
// replaced by 'A'..'Z'; every other character is copied unchanged.
// Returns NULL when `data` is NULL.
wchar_t* coco_string_create_upper(const wchar_t* data) {
	if (data == NULL) { return NULL; }

	const int count = wcslen(data);
	wchar_t* upper = new wchar_t[count + 1];

	for (int k = 0; k < count; ++k) {
		const wchar_t c = data[k];
		const bool isLower = (c >= L'a') && (c <= L'z');
		upper[k] = isLower ? (c + (L'A' - L'a')) : c;
	}
	upper[count] = L'\0';

	return upper;
}
|
87
|
+
|
88
|
+
wchar_t* coco_string_create_lower(const wchar_t* data) {
|
89
|
+
if (!data) { return NULL; }
|
90
|
+
int dataLen = wcslen(data);
|
91
|
+
return coco_string_create_lower(data, 0, dataLen);
|
92
|
+
}
|
93
|
+
|
94
|
+
// Returns a newly allocated, NUL-terminated lower-case copy of `dataLen`
// characters of `data`, starting at `startIndex`. Only 'A'..'Z' are mapped
// to 'a'..'z'; other characters are copied unchanged.
// Returns NULL when `data` is NULL.
wchar_t* coco_string_create_lower(const wchar_t* data, int startIndex, int dataLen) {
	if (!data) { return NULL; }

	wchar_t* newData = new wchar_t[dataLen + 1];

	// Stop before dataLen: the terminator is written explicitly below, so
	// data[startIndex + dataLen] (possibly outside the source) is never read.
	for (int i = 0; i < dataLen; i++) {
		wchar_t ch = data[startIndex + i];
		if ((L'A' <= ch) && (ch <= L'Z')) {
			newData[i] = ch - (L'A' - L'a');
		}
		else { newData[i] = ch; }
	}
	newData[dataLen] = L'\0';
	return newData;
}
|
109
|
+
|
110
|
+
// Returns a newly allocated string consisting of `data1` followed by `data2`.
// Either argument may be NULL, in which case it contributes nothing.
wchar_t* coco_string_create_append(const wchar_t* data1, const wchar_t* data2) {
	const int firstLen = data1 ? (int) wcslen(data1) : 0;
	const int secondLen = data2 ? (int) wcslen(data2) : 0;
	const int total = firstLen + secondLen;

	wchar_t* joined = new wchar_t[total + 1];

	if (data1) { wmemcpy(joined, data1, firstLen); }
	if (data2) { wmemcpy(joined + firstLen, data2, secondLen); }
	joined[total] = 0;

	return joined;
}
|
127
|
+
|
128
|
+
// Returns a newly allocated string consisting of `target` followed by the
// single character `appendix`.
//   target   - string to extend; NULL is treated as an empty string
//   appendix - character placed after the copied text
// Returns a buffer allocated with new[] (release it with delete []).
wchar_t* coco_string_create_append(const wchar_t *target, const wchar_t appendix) {
	// NULL counts as length 0, the same rule coco_string_length() applies.
	int targetLen = target ? (int) wcslen(target) : 0;
	wchar_t* data = new wchar_t[targetLen + 2];
	// Skip the copy for an empty/NULL target so wcsncpy never receives NULL.
	if (targetLen > 0) { wcsncpy(data, target, targetLen); }
	data[targetLen] = appendix;
	data[targetLen + 1] = 0;
	return data;
}
|
136
|
+
|
137
|
+
// Releases a string allocated with new[] and resets the caller's pointer
// to NULL.
void coco_string_delete(wchar_t* &data) {
	wchar_t* doomed = data;
	data = NULL;
	delete [] doomed;
}
|
141
|
+
|
142
|
+
// Returns the number of characters in `data`, or 0 when `data` is NULL.
int coco_string_length(const wchar_t* data) {
	return data ? (int) wcslen(data) : 0;
}
|
146
|
+
|
147
|
+
// Returns true when `data` ends with the string `end`.
// Both arguments must be non-NULL; they are passed straight to wcslen.
bool coco_string_endswith(const wchar_t* data, const wchar_t *end) {
	const int dataLen = wcslen(data);
	const int endLen = wcslen(end);

	if (endLen > dataLen) { return false; }

	const wchar_t* tail = data + dataLen - endLen;
	return wcscmp(tail, end) == 0;
}
|
152
|
+
|
153
|
+
// Returns the index of the first occurrence of `value` in `data`,
// or -1 when wcschr finds no match.
int coco_string_indexof(const wchar_t* data, const wchar_t value) {
	const wchar_t* hit = wcschr(data, value);
	if (hit == NULL) {
		return -1;
	}
	return (hit - data);
}
|
159
|
+
|
160
|
+
// Returns the index of the last occurrence of `value` in `data`,
// or -1 when wcsrchr finds no match.
int coco_string_lastindexof(const wchar_t* data, const wchar_t value) {
	const wchar_t* hit = wcsrchr(data, value);
	if (hit == NULL) {
		return -1;
	}
	return (hit - data);
}
|
166
|
+
|
167
|
+
void coco_string_merge(wchar_t* &target, const wchar_t* appendix) {
|
168
|
+
if (!appendix) { return; }
|
169
|
+
wchar_t* data = coco_string_create_append(target, appendix);
|
170
|
+
delete [] target;
|
171
|
+
target = data;
|
172
|
+
}
|
173
|
+
|
174
|
+
// Returns true when wcscmp reports the two strings as identical.
bool coco_string_equal(const wchar_t* data1, const wchar_t* data2) {
	const int difference = wcscmp(data1, data2);
	return difference == 0;
}
|
177
|
+
|
178
|
+
// Compares two strings with wcscmp and returns its result: zero when equal,
// negative when data1 sorts first, positive when data2 sorts first.
int coco_string_compareto(const wchar_t* data1, const wchar_t* data2) {
	const int order = wcscmp(data1, data2);
	return order;
}
|
181
|
+
|
182
|
+
// Computes a non-negative hash of `data` using h = (h * 7) ^ ch per character.
// Returns 0 for a NULL or empty string.
//
// The accumulation is done in unsigned arithmetic so that wrap-around is
// well defined; the original signed version overflowed (undefined behaviour)
// on long strings and could still return a negative value when the result
// was INT_MIN.
int coco_string_hash(const wchar_t *data) {
	if (!data) { return 0; }

	unsigned int h = 0;
	while (*data != 0) {
		h = (h * 7u) ^ (unsigned int) *data;
		++data;
	}

	// Mirror the former "if (h < 0) h = -h" on the two's-complement bit
	// pattern, then clear the sign bit so the INT_MIN pattern maps to 0.
	if (h > (unsigned int) INT_MAX) { h = 0u - h; }
	return (int) (h & (unsigned int) INT_MAX);
}
|
192
|
+
|
193
|
+
// string handling, ascii character
|
194
|
+
|
195
|
+
// Creates a newly allocated wide string from the narrow string `value`,
// widening each byte individually (no multi-byte decoding is performed).
//   value - NUL-terminated narrow string; NULL yields an empty wide string
// Returns a buffer allocated with new[] (release it with delete []).
wchar_t* coco_string_create(const char* value) {
	int len = 0;
	if (value) { len = strlen(value); }
	wchar_t* data = new wchar_t[len + 1];
	for (int i = 0; i < len; ++i) {
		// Go through unsigned char: where plain char is signed, bytes >= 0x80
		// would otherwise sign-extend into large or negative wide values.
		data[i] = (wchar_t) (unsigned char) value[i];
	}
	data[len] = 0;
	return data;
}
|
203
|
+
|
204
|
+
// Creates a newly allocated narrow string from `value` by casting each wide
// character to char. A NULL `value` yields an empty string.
char* coco_string_create_char(const wchar_t *value) {
	const int count = value ? (int) wcslen(value) : 0;
	char *narrow = new char[count + 1];

	char *out = narrow;
	for (int k = 0; k < count; ++k) {
		*out++ = (char) value[k];
	}
	*out = 0;

	return narrow;
}
|
211
|
+
|
212
|
+
// Releases a narrow string allocated with new[] and resets the caller's
// pointer to NULL.
void coco_string_delete(char* &data) {
	char* doomed = data;
	data = NULL;
	delete [] doomed;
}
|
216
|
+
|
217
|
+
|
218
|
+
// Constructs an empty token: all position fields are zero and the token has
// neither a value nor a successor.
Token::Token() {
	kind = 0;
	pos = 0;
	charPos = 0; // written and read by the scanner alongside pos; was left uninitialized
	col = 0;
	line = 0;
	val = NULL;
	next = NULL;
}
|
226
|
+
|
227
|
+
// Releases the token's value string with delete [] and clears the pointer.
Token::~Token() {
	delete [] val;
	val = NULL;
}
|
230
|
+
|
231
|
+
// Wraps the stdio stream `s` in a buffer.
//   s            - stream to read from
//   isUserStream - stored on the buffer; Close() only calls fclose() on the
//                  stream when this flag is false
// For a seekable stream the length is measured up front and up to
// COCO_MAX_BUFFER_LENGTH bytes are buffered; otherwise the buffer starts
// empty with COCO_MIN_BUFFER_LENGTH bytes of capacity.
Buffer::Buffer(FILE* s, bool isUserStream) {
// ensure binary read on windows
#if _MSC_VER >= 1300
	_setmode(_fileno(s), _O_BINARY);
#endif
	stream = s;
	this->isUserStream = isUserStream;

	if (!CanSeek()) {
		bufStart = 0;
		bufLen = 0;
		fileLen = 0;
	} else {
		// Measure the stream length by seeking to the end and back.
		fseek(s, 0, SEEK_END);
		fileLen = ftell(s);
		fseek(s, 0, SEEK_SET);
		bufLen = (fileLen < COCO_MAX_BUFFER_LENGTH) ? fileLen : COCO_MAX_BUFFER_LENGTH;
		bufStart = INT_MAX; // nothing in the buffer so far
	}

	bufCapacity = (bufLen > 0) ? bufLen : COCO_MIN_BUFFER_LENGTH;
	buf = new unsigned char[bufCapacity];

	if (fileLen > 0) {
		SetPos(0); // setup buffer to position 0 (start)
	} else {
		bufPos = 0; // index 0 is already after the file, thus Pos = 0 is invalid
	}

	// Once the buffer length equals the file length on a seekable stream,
	// the stream is closed (subject to the isUserStream check in Close()).
	if (bufLen == fileLen && CanSeek()) {
		Close();
	}
}
|
252
|
+
|
253
|
+
// Takes over the state of buffer `b`: copies its position and length fields,
// adopts its byte array and stream, and sets both pointers in `b` to NULL.
Buffer::Buffer(Buffer *b) {
	// Plain state.
	bufCapacity = b->bufCapacity;
	bufStart = b->bufStart;
	bufLen = b->bufLen;
	fileLen = b->fileLen;
	bufPos = b->bufPos;
	isUserStream = b->isUserStream;

	// Pointer members: adopt, then detach from the source buffer.
	buf = b->buf;
	stream = b->stream;
	b->buf = NULL;
	b->stream = NULL;
}
|
265
|
+
|
266
|
+
// Creates a buffer over an in-memory byte sequence.
//   buf - bytes to copy into the buffer's own storage
//   len - number of bytes in `buf`
// The buffer has no stream; the copied bytes are its whole content.
Buffer::Buffer(const unsigned char* buf, int len) {
	this->buf = new unsigned char[len];
	// Nothing to copy for an empty input (and `buf` may then be NULL).
	if (len > 0) { memcpy(this->buf, buf, len*sizeof(unsigned char)); }
	bufStart = 0;
	bufCapacity = bufLen = len;
	fileLen = len;
	bufPos = 0;
	stream = NULL;
	// Close() tests isUserStream before it tests stream, so the flag must
	// hold a defined value even though there is no stream to close.
	isUserStream = false;
}
|
275
|
+
|
276
|
+
// Closes the stream via Close() and frees the byte array.
Buffer::~Buffer() {
	Close();
	delete [] buf; // no-op when buf is already NULL
	buf = NULL;
}
|
283
|
+
|
284
|
+
void Buffer::Close() {
|
285
|
+
if (!isUserStream && stream != NULL) {
|
286
|
+
fclose(stream);
|
287
|
+
stream = NULL;
|
288
|
+
}
|
289
|
+
}
|
290
|
+
|
291
|
+
int Buffer::Read() {
|
292
|
+
if (bufPos < bufLen) {
|
293
|
+
return buf[bufPos++];
|
294
|
+
} else if (GetPos() < fileLen) {
|
295
|
+
SetPos(GetPos()); // shift buffer start to Pos
|
296
|
+
return buf[bufPos++];
|
297
|
+
} else if ((stream != NULL) && !CanSeek() && (ReadNextStreamChunk() > 0)) {
|
298
|
+
return buf[bufPos++];
|
299
|
+
} else {
|
300
|
+
return EoF;
|
301
|
+
}
|
302
|
+
}
|
303
|
+
|
304
|
+
int Buffer::Peek() {
|
305
|
+
int curPos = GetPos();
|
306
|
+
int ch = Read();
|
307
|
+
SetPos(curPos);
|
308
|
+
return ch;
|
309
|
+
}
|
310
|
+
|
311
|
+
// beg .. begin, zero-based, inclusive, in byte
|
312
|
+
// end .. end, zero-based, exclusive, in byte
|
313
|
+
wchar_t* Buffer::GetString(int beg, int end) {
|
314
|
+
int len = 0;
|
315
|
+
wchar_t *buf = new wchar_t[end - beg];
|
316
|
+
int oldPos = GetPos();
|
317
|
+
SetPos(beg);
|
318
|
+
while (GetPos() < end) buf[len++] = (wchar_t) Read();
|
319
|
+
SetPos(oldPos);
|
320
|
+
wchar_t *res = coco_string_create(buf, 0, len);
|
321
|
+
coco_string_delete(buf);
|
322
|
+
return res;
|
323
|
+
}
|
324
|
+
|
325
|
+
int Buffer::GetPos() {
|
326
|
+
return bufPos + bufStart;
|
327
|
+
}
|
328
|
+
|
329
|
+
void Buffer::SetPos(int value) {
|
330
|
+
if ((value >= fileLen) && (stream != NULL) && !CanSeek()) {
|
331
|
+
// Wanted position is after buffer and the stream
|
332
|
+
// is not seek-able e.g. network or console,
|
333
|
+
// thus we have to read the stream manually till
|
334
|
+
// the wanted position is in sight.
|
335
|
+
while ((value >= fileLen) && (ReadNextStreamChunk() > 0));
|
336
|
+
}
|
337
|
+
|
338
|
+
if ((value < 0) || (value > fileLen)) {
|
339
|
+
wprintf(L"--- buffer out of bounds access, position: %d\n", value);
|
340
|
+
exit(1);
|
341
|
+
}
|
342
|
+
|
343
|
+
if ((value >= bufStart) && (value < (bufStart + bufLen))) { // already in buffer
|
344
|
+
bufPos = value - bufStart;
|
345
|
+
} else if (stream != NULL) { // must be swapped in
|
346
|
+
fseek(stream, value, SEEK_SET);
|
347
|
+
bufLen = fread(buf, sizeof(unsigned char), bufCapacity, stream);
|
348
|
+
bufStart = value; bufPos = 0;
|
349
|
+
} else {
|
350
|
+
bufPos = fileLen - bufStart; // make Pos return fileLen
|
351
|
+
}
|
352
|
+
}
|
353
|
+
|
354
|
+
// Read the next chunk of bytes from the stream, increases the buffer
|
355
|
+
// if needed and updates the fields fileLen and bufLen.
|
356
|
+
// Returns the number of bytes read.
|
357
|
+
int Buffer::ReadNextStreamChunk() {
|
358
|
+
int free = bufCapacity - bufLen;
|
359
|
+
if (free == 0) {
|
360
|
+
// in the case of a growing input stream
|
361
|
+
// we can neither seek in the stream, nor can we
|
362
|
+
// foresee the maximum length, thus we must adapt
|
363
|
+
// the buffer size on demand.
|
364
|
+
bufCapacity = bufLen * 2;
|
365
|
+
unsigned char *newBuf = new unsigned char[bufCapacity];
|
366
|
+
memcpy(newBuf, buf, bufLen*sizeof(unsigned char));
|
367
|
+
delete [] buf;
|
368
|
+
buf = newBuf;
|
369
|
+
free = bufLen;
|
370
|
+
}
|
371
|
+
int read = fread(buf + bufLen, sizeof(unsigned char), free, stream);
|
372
|
+
if (read > 0) {
|
373
|
+
fileLen = bufLen = (bufLen + read);
|
374
|
+
return read;
|
375
|
+
}
|
376
|
+
// end of stream reached
|
377
|
+
return 0;
|
378
|
+
}
|
379
|
+
|
380
|
+
bool Buffer::CanSeek() {
|
381
|
+
return (stream != NULL) && (ftell(stream) != -1);
|
382
|
+
}
|
383
|
+
|
384
|
+
// Reads one UTF-8 encoded character and returns its code point, or EoF.
// Bytes that cannot start a sequence (10xxxxxx) are skipped first.
// Continuation bytes are not validated; each contributes its low six bits.
int UTF8Buffer::Read() {
	// Skip until a utf8 start byte (0xxxxxxx or 11xxxxxx) or EoF is found.
	int ch = Buffer::Read();
	while ((ch >= 128) && ((ch & 0xC0) != 0xC0) && (ch != EoF)) {
		ch = Buffer::Read();
	}

	// 0xxxxxxx or end of file: identical in ascii and utf8.
	if (ch < 128 || ch == EoF) {
		return ch;
	}

	if ((ch & 0xF0) == 0xF0) {
		// 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
		int c1 = ch & 0x07;
		int c2 = Buffer::Read() & 0x3F;
		int c3 = Buffer::Read() & 0x3F;
		int c4 = Buffer::Read() & 0x3F;
		return (((((c1 << 6) | c2) << 6) | c3) << 6) | c4;
	}

	if ((ch & 0xE0) == 0xE0) {
		// 1110xxxx 10xxxxxx 10xxxxxx
		int c1 = ch & 0x0F;
		int c2 = Buffer::Read() & 0x3F;
		int c3 = Buffer::Read() & 0x3F;
		return (((c1 << 6) | c2) << 6) | c3;
	}

	if ((ch & 0xC0) == 0xC0) {
		// 110xxxxx 10xxxxxx
		int c1 = ch & 0x1F;
		int c2 = Buffer::Read() & 0x3F;
		return (c1 << 6) | c2;
	}

	return ch;
}
|
414
|
+
|
415
|
+
// Creates a scanner over the `len` bytes at `buf`; the Buffer constructor
// copies the bytes.
Scanner::Scanner(const unsigned char* buf, int len) {
	Buffer* memoryBuffer = new Buffer(buf, len);
	buffer = memoryBuffer;
	Init();
}
|
419
|
+
|
420
|
+
// Creates a scanner that reads from the file named `fileName`, opened in
// binary mode. The stream is handed to the Buffer with isUserStream false.
// Prints a message and terminates with exit(1) when the file cannot be opened.
Scanner::Scanner(const wchar_t* fileName) {
	// fopen() needs a narrow file name.
	char *narrowName = coco_string_create_char(fileName);
	FILE* stream = fopen(narrowName, "rb");
	if (stream == NULL) {
		wprintf(L"--- Cannot open file %ls\n", fileName);
		exit(1);
	}
	coco_string_delete(narrowName);

	buffer = new Buffer(stream, false);
	Init();
}
|
431
|
+
|
432
|
+
// Creates a scanner that reads from the caller-supplied stream `s`.
// The stream is marked as a user stream, which Buffer::Close() never closes.
Scanner::Scanner(FILE* s) {
	Buffer* streamBuffer = new Buffer(s, true);
	buffer = streamBuffer;
	Init();
}
|
436
|
+
|
437
|
+
// Frees every heap block in the chain starting at firstHeap, then the token
// text array and the input buffer.
Scanner::~Scanner() {
	// Each block stores the pointer to its successor directly behind its
	// COCO_HEAP_BLOCK_SIZE bytes of payload.
	for (char* block = (char*) firstHeap; block != NULL; ) {
		char* following = *(char**) (block + COCO_HEAP_BLOCK_SIZE);
		free(firstHeap);
		firstHeap = following;
		block = following;
	}

	delete [] tval;
	delete buffer;
}
|
448
|
+
|
449
|
+
void Scanner::Init() {
|
450
|
+
EOL = '\n';
|
451
|
+
eofSym = 0;
|
452
|
+
maxT = 41;
|
453
|
+
noSym = 41;
|
454
|
+
int i;
|
455
|
+
for (i = 65; i <= 90; ++i) start.set(i, 1);
|
456
|
+
for (i = 95; i <= 95; ++i) start.set(i, 1);
|
457
|
+
for (i = 97; i <= 122; ++i) start.set(i, 1);
|
458
|
+
for (i = 48; i <= 57; ++i) start.set(i, 2);
|
459
|
+
start.set(34, 12);
|
460
|
+
start.set(39, 5);
|
461
|
+
start.set(36, 13);
|
462
|
+
start.set(61, 16);
|
463
|
+
start.set(46, 31);
|
464
|
+
start.set(43, 17);
|
465
|
+
start.set(45, 18);
|
466
|
+
start.set(60, 32);
|
467
|
+
start.set(62, 20);
|
468
|
+
start.set(124, 23);
|
469
|
+
start.set(40, 33);
|
470
|
+
start.set(41, 24);
|
471
|
+
start.set(91, 25);
|
472
|
+
start.set(93, 26);
|
473
|
+
start.set(123, 27);
|
474
|
+
start.set(125, 28);
|
475
|
+
start.set(Buffer::EoF, -1);
|
476
|
+
keywords.set(L"COMPILER", 6);
|
477
|
+
keywords.set(L"IGNORECASE", 7);
|
478
|
+
keywords.set(L"CHARACTERS", 8);
|
479
|
+
keywords.set(L"TOKENS", 9);
|
480
|
+
keywords.set(L"PRAGMAS", 10);
|
481
|
+
keywords.set(L"COMMENTS", 11);
|
482
|
+
keywords.set(L"FROM", 12);
|
483
|
+
keywords.set(L"TO", 13);
|
484
|
+
keywords.set(L"NESTED", 14);
|
485
|
+
keywords.set(L"IGNORE", 15);
|
486
|
+
keywords.set(L"PRODUCTIONS", 16);
|
487
|
+
keywords.set(L"END", 19);
|
488
|
+
keywords.set(L"ANY", 23);
|
489
|
+
keywords.set(L"WEAK", 29);
|
490
|
+
keywords.set(L"SYNC", 36);
|
491
|
+
keywords.set(L"IF", 37);
|
492
|
+
keywords.set(L"CONTEXT", 38);
|
493
|
+
|
494
|
+
|
495
|
+
tvalLength = 128;
|
496
|
+
tval = new wchar_t[tvalLength]; // text of current token
|
497
|
+
|
498
|
+
// COCO_HEAP_BLOCK_SIZE byte heap + pointer to next heap block
|
499
|
+
heap = malloc(COCO_HEAP_BLOCK_SIZE + sizeof(void*));
|
500
|
+
firstHeap = heap;
|
501
|
+
heapEnd = (void**) (((char*) heap) + COCO_HEAP_BLOCK_SIZE);
|
502
|
+
*heapEnd = 0;
|
503
|
+
heapTop = heap;
|
504
|
+
if (sizeof(Token) > COCO_HEAP_BLOCK_SIZE) {
|
505
|
+
wprintf(L"--- Too small COCO_HEAP_BLOCK_SIZE\n");
|
506
|
+
exit(1);
|
507
|
+
}
|
508
|
+
|
509
|
+
pos = -1; line = 1; col = 0; charPos = -1;
|
510
|
+
oldEols = 0;
|
511
|
+
NextCh();
|
512
|
+
if (ch == 0xEF) { // check optional byte order mark for UTF-8
|
513
|
+
NextCh(); int ch1 = ch;
|
514
|
+
NextCh(); int ch2 = ch;
|
515
|
+
if (ch1 != 0xBB || ch2 != 0xBF) {
|
516
|
+
wprintf(L"Illegal byte order mark at start of file");
|
517
|
+
exit(1);
|
518
|
+
}
|
519
|
+
Buffer *oldBuf = buffer;
|
520
|
+
buffer = new UTF8Buffer(buffer); col = 0; charPos = -1;
|
521
|
+
delete oldBuf; oldBuf = NULL;
|
522
|
+
NextCh();
|
523
|
+
}
|
524
|
+
|
525
|
+
|
526
|
+
pt = tokens = CreateToken(); // first token is a dummy
|
527
|
+
}
|
528
|
+
|
529
|
+
void Scanner::NextCh() {
|
530
|
+
if (oldEols > 0) { ch = EOL; oldEols--; }
|
531
|
+
else {
|
532
|
+
pos = buffer->GetPos();
|
533
|
+
// buffer reads unicode chars, if UTF8 has been detected
|
534
|
+
ch = buffer->Read(); col++; charPos++;
|
535
|
+
// replace isolated '\r' by '\n' in order to make
|
536
|
+
// eol handling uniform across Windows, Unix and Mac
|
537
|
+
if (ch == L'\r' && buffer->Peek() != L'\n') ch = EOL;
|
538
|
+
if (ch == EOL) { line++; col = 0; }
|
539
|
+
}
|
540
|
+
|
541
|
+
}
|
542
|
+
|
543
|
+
void Scanner::AddCh() {
|
544
|
+
if (tlen >= tvalLength) {
|
545
|
+
tvalLength *= 2;
|
546
|
+
wchar_t *newBuf = new wchar_t[tvalLength];
|
547
|
+
memcpy(newBuf, tval, tlen*sizeof(wchar_t));
|
548
|
+
delete [] tval;
|
549
|
+
tval = newBuf;
|
550
|
+
}
|
551
|
+
if (ch != Buffer::EoF) {
|
552
|
+
tval[tlen++] = ch;
|
553
|
+
NextCh();
|
554
|
+
}
|
555
|
+
}
|
556
|
+
|
557
|
+
|
558
|
+
bool Scanner::Comment0() {
|
559
|
+
int level = 1, pos0 = pos, line0 = line, col0 = col, charPos0 = charPos;
|
560
|
+
NextCh();
|
561
|
+
if (ch == L'/') {
|
562
|
+
NextCh();
|
563
|
+
for(;;) {
|
564
|
+
if (ch == 10) {
|
565
|
+
level--;
|
566
|
+
if (level == 0) { oldEols = line - line0; NextCh(); return true; }
|
567
|
+
NextCh();
|
568
|
+
} else if (ch == buffer->EoF) return false;
|
569
|
+
else NextCh();
|
570
|
+
}
|
571
|
+
} else {
|
572
|
+
buffer->SetPos(pos0); NextCh(); line = line0; col = col0; charPos = charPos0;
|
573
|
+
}
|
574
|
+
return false;
|
575
|
+
}
|
576
|
+
|
577
|
+
bool Scanner::Comment1() {
|
578
|
+
int level = 1, pos0 = pos, line0 = line, col0 = col, charPos0 = charPos;
|
579
|
+
NextCh();
|
580
|
+
if (ch == L'*') {
|
581
|
+
NextCh();
|
582
|
+
for(;;) {
|
583
|
+
if (ch == L'*') {
|
584
|
+
NextCh();
|
585
|
+
if (ch == L'/') {
|
586
|
+
level--;
|
587
|
+
if (level == 0) { oldEols = line - line0; NextCh(); return true; }
|
588
|
+
NextCh();
|
589
|
+
}
|
590
|
+
} else if (ch == L'/') {
|
591
|
+
NextCh();
|
592
|
+
if (ch == L'*') {
|
593
|
+
level++; NextCh();
|
594
|
+
}
|
595
|
+
} else if (ch == buffer->EoF) return false;
|
596
|
+
else NextCh();
|
597
|
+
}
|
598
|
+
} else {
|
599
|
+
buffer->SetPos(pos0); NextCh(); line = line0; col = col0; charPos = charPos0;
|
600
|
+
}
|
601
|
+
return false;
|
602
|
+
}
|
603
|
+
|
604
|
+
|
605
|
+
void Scanner::CreateHeapBlock() {
|
606
|
+
void* newHeap;
|
607
|
+
char* cur = (char*) firstHeap;
|
608
|
+
|
609
|
+
while(((char*) tokens < cur) || ((char*) tokens > (cur + COCO_HEAP_BLOCK_SIZE))) {
|
610
|
+
cur = *((char**) (cur + COCO_HEAP_BLOCK_SIZE));
|
611
|
+
free(firstHeap);
|
612
|
+
firstHeap = cur;
|
613
|
+
}
|
614
|
+
|
615
|
+
// COCO_HEAP_BLOCK_SIZE byte heap + pointer to next heap block
|
616
|
+
newHeap = malloc(COCO_HEAP_BLOCK_SIZE + sizeof(void*));
|
617
|
+
*heapEnd = newHeap;
|
618
|
+
heapEnd = (void**) (((char*) newHeap) + COCO_HEAP_BLOCK_SIZE);
|
619
|
+
*heapEnd = 0;
|
620
|
+
heap = newHeap;
|
621
|
+
heapTop = heap;
|
622
|
+
}
|
623
|
+
|
624
|
+
// Carves a Token out of the current heap block (switching to a new block
// when the current one has no room) and returns it with val and next set to
// NULL. The other fields are not initialized here.
Token* Scanner::CreateToken() {
	const bool blockExhausted =
		((char*) heapTop + (int) sizeof(Token)) >= (char*) heapEnd;
	if (blockExhausted) {
		CreateHeapBlock();
	}

	Token *fresh = (Token*) heapTop;
	heapTop = (void*) ((char*) heapTop + sizeof(Token));

	fresh->val = NULL;
	fresh->next = NULL;
	return fresh;
}
|
635
|
+
|
636
|
+
// Copies the current token text (tval, tlen characters) into the heap and
// stores the NUL-terminated copy in t->val. Switches to a new heap block
// when the current one has no room. Terminates the process with exit(1)
// when the text cannot fit into a single heap block.
void Scanner::AppendVal(Token *t) {
	const int reqMem = (tlen + 1) * sizeof(wchar_t);
	const bool blockExhausted = ((char*) heapTop + reqMem) >= (char*) heapEnd;

	if (blockExhausted && reqMem > COCO_HEAP_BLOCK_SIZE) {
		wprintf(L"--- Too long token value\n");
		exit(1);
	}
	if (blockExhausted) {
		CreateHeapBlock();
	}

	wchar_t* text = (wchar_t*) heapTop;
	heapTop = (void*) ((char*) heapTop + reqMem);
	t->val = text;

	wcsncpy(t->val, tval, tlen);
	t->val[tlen] = L'\0';
}
|
651
|
+
|
652
|
+
// Scans and returns the next token from the input.
// Whitespace and comments are skipped first. The token is then recognized by
// a state machine: the start state is looked up from the first character,
// and each case below consumes characters with AddCh() and jumps to the next
// state. recKind/recEnd remember the last accepting state so that case 0
// can fall back to it when no further transition is possible.
Token* Scanner::NextToken() {
	// Skip blanks and the control characters 9, 10 and 13.
	while (ch == ' ' ||
			(ch >= 9 && ch <= 10) || ch == 13
	) {
		NextCh();
	}

	// A '/' may open either comment form; after a comment, start over.
	if ((ch == L'/' && Comment0()) || (ch == L'/' && Comment1())) {
		return NextToken();
	}

	int recKind = noSym;
	int recEnd = pos;
	t = CreateToken();
	t->pos = pos;
	t->col = col;
	t->line = line;
	t->charPos = charPos;
	int state = start.state(ch);
	tlen = 0;
	AddCh();

	switch (state) {
		case -1: {
			t->kind = eofSym;
			break;
		} // NextCh already done
		case 0: {
			case_0:
			if (recKind != noSym) {
				// Fall back to the last accepted token end.
				tlen = recEnd - t->pos;
				SetScannerBehindT();
			}
			t->kind = recKind;
			break;
		} // NextCh already done
		case 1:
			case_1:
			recEnd = pos; recKind = 1;
			if ((ch >= L'0' && ch <= L'9') || (ch >= L'A' && ch <= L'Z') || ch == L'_' || (ch >= L'a' && ch <= L'z')) {
				AddCh();
				goto case_1;
			}
			else {
				// Kind 1 unless the text is registered in the keyword table.
				t->kind = 1;
				wchar_t *literal = coco_string_create(tval, 0, tlen);
				t->kind = keywords.get(literal, t->kind);
				coco_string_delete(literal);
				break;
			}
		case 2:
			case_2:
			recEnd = pos; recKind = 2;
			if ((ch >= L'0' && ch <= L'9')) {
				AddCh();
				goto case_2;
			}
			else {
				t->kind = 2;
				break;
			}
		case 3:
			case_3:
			{
				t->kind = 3;
				break;
			}
		case 4:
			case_4:
			{
				t->kind = 4;
				break;
			}
		case 5:
			if (ch <= 9 || (ch >= 11 && ch <= 12) || (ch >= 14 && ch <= L'&') || (ch >= L'(' && ch <= L'[') || (ch >= L']' && ch <= 65535)) {
				AddCh();
				goto case_6;
			}
			else if (ch == 92) {
				AddCh();
				goto case_7;
			}
			else {
				goto case_0;
			}
		case 6:
			case_6:
			if (ch == 39) {
				AddCh();
				goto case_9;
			}
			else {
				goto case_0;
			}
		case 7:
			case_7:
			if ((ch >= L' ' && ch <= L'~')) {
				AddCh();
				goto case_8;
			}
			else {
				goto case_0;
			}
		case 8:
			case_8:
			if ((ch >= L'0' && ch <= L'9') || (ch >= L'a' && ch <= L'f')) {
				AddCh();
				goto case_8;
			}
			else if (ch == 39) {
				AddCh();
				goto case_9;
			}
			else {
				goto case_0;
			}
		case 9:
			case_9:
			{
				t->kind = 5;
				break;
			}
		case 10:
			case_10:
			recEnd = pos; recKind = 42;
			if ((ch >= L'0' && ch <= L'9') || (ch >= L'A' && ch <= L'Z') || ch == L'_' || (ch >= L'a' && ch <= L'z')) {
				AddCh();
				goto case_10;
			}
			else {
				t->kind = 42;
				break;
			}
		case 11:
			case_11:
			recEnd = pos; recKind = 43;
			if ((ch >= L'-' && ch <= L'.') || (ch >= L'0' && ch <= L':') || (ch >= L'A' && ch <= L'Z') || ch == L'_' || (ch >= L'a' && ch <= L'z')) {
				AddCh();
				goto case_11;
			}
			else {
				t->kind = 43;
				break;
			}
		case 12:
			case_12:
			if (ch <= 9 || (ch >= 11 && ch <= 12) || (ch >= 14 && ch <= L'!') || (ch >= L'#' && ch <= L'[') || (ch >= L']' && ch <= 65535)) {
				AddCh();
				goto case_12;
			}
			else if (ch == 10 || ch == 13) {
				AddCh();
				goto case_4;
			}
			else if (ch == L'"') {
				AddCh();
				goto case_3;
			}
			else if (ch == 92) {
				AddCh();
				goto case_14;
			}
			else {
				goto case_0;
			}
		case 13:
			recEnd = pos; recKind = 42;
			if ((ch >= L'0' && ch <= L'9')) {
				AddCh();
				goto case_10;
			}
			else if ((ch >= L'A' && ch <= L'Z') || ch == L'_' || (ch >= L'a' && ch <= L'z')) {
				AddCh();
				goto case_15;
			}
			else {
				t->kind = 42;
				break;
			}
		case 14:
			case_14:
			if ((ch >= L' ' && ch <= L'~')) {
				AddCh();
				goto case_12;
			}
			else {
				goto case_0;
			}
		case 15:
			case_15:
			recEnd = pos; recKind = 42;
			if ((ch >= L'0' && ch <= L'9')) {
				AddCh();
				goto case_10;
			}
			else if ((ch >= L'A' && ch <= L'Z') || ch == L'_' || (ch >= L'a' && ch <= L'z')) {
				AddCh();
				goto case_15;
			}
			else if (ch == L'=') {
				AddCh();
				goto case_11;
			}
			else {
				t->kind = 42;
				break;
			}
		case 16:
			{
				t->kind = 17;
				break;
			}
		case 17:
			{
				t->kind = 20;
				break;
			}
		case 18:
			{
				t->kind = 21;
				break;
			}
		case 19:
			case_19:
			{
				t->kind = 22;
				break;
			}
		case 20:
			{
				t->kind = 25;
				break;
			}
		case 21:
			case_21:
			{
				t->kind = 26;
				break;
			}
		case 22:
			case_22:
			{
				t->kind = 27;
				break;
			}
		case 23:
			{
				t->kind = 28;
				break;
			}
		case 24:
			{
				t->kind = 31;
				break;
			}
		case 25:
			{
				t->kind = 32;
				break;
			}
		case 26:
			{
				t->kind = 33;
				break;
			}
		case 27:
			{
				t->kind = 34;
				break;
			}
		case 28:
			{
				t->kind = 35;
				break;
			}
		case 29:
			case_29:
			{
				t->kind = 39;
				break;
			}
		case 30:
			case_30:
			{
				t->kind = 40;
				break;
			}
		case 31:
			recEnd = pos; recKind = 18;
			if (ch == L'.') {
				AddCh();
				goto case_19;
			}
			else if (ch == L'>') {
				AddCh();
				goto case_22;
			}
			else if (ch == L')') {
				AddCh();
				goto case_30;
			}
			else {
				t->kind = 18;
				break;
			}
		case 32:
			recEnd = pos; recKind = 24;
			if (ch == L'.') {
				AddCh();
				goto case_21;
			}
			else {
				t->kind = 24;
				break;
			}
		case 33:
			recEnd = pos; recKind = 30;
			if (ch == L'.') {
				AddCh();
				goto case_29;
			}
			else {
				t->kind = 30;
				break;
			}

	}
	AppendVal(t);
	return t;
}
|
797
|
+
|
798
|
+
// Repositions the scanner directly behind the current token `t`: seeks back
// to the token's start, restores line/col/charPos from the token, and then
// advances by tlen characters.
void Scanner::SetScannerBehindT() {
	buffer->SetPos(t->pos);
	NextCh();
	line = t->line;
	col = t->col;
	charPos = t->charPos;

	int remaining = tlen;
	while (remaining > 0) {
		NextCh();
		--remaining;
	}
}
|
804
|
+
|
805
|
+
// get the next token (possibly a token already seen during peeking)
|
806
|
+
// Returns the next token and makes it the current one. A token already
// linked behind the current one (queued by Peek()) is used if present;
// otherwise a new token is scanned. The peek position is reset to it.
Token* Scanner::Scan() {
	Token* upcoming = tokens->next;
	if (upcoming == NULL) {
		upcoming = NextToken();
	}
	tokens = upcoming;
	pt = upcoming;
	return upcoming;
}
|
814
|
+
|
815
|
+
// peek for the next token, ignore pragmas
|
816
|
+
// Returns the token following the current peek position without consuming
// it, scanning and linking new tokens as needed. Tokens whose kind is
// greater than maxT (pragmas) are skipped.
Token* Scanner::Peek() {
	do {
		Token* following = pt->next;
		if (following == NULL) {
			following = NextToken();
			pt->next = following;
		}
		pt = following;
	} while (pt->kind > maxT); // skip pragmas

	return pt;
}
|
826
|
+
|
827
|
+
// make sure that peeking starts at the current scan position
|
828
|
+
void Scanner::ResetPeek() {
|
829
|
+
pt = tokens;
|
830
|
+
}
|
831
|
+
|
832
|
+
} // namespace
|
833
|
+
|