durable_rules 0.34.13 → 0.34.14
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/librb/durable.rb +9 -3
- data/librb/engine.rb +6 -6
- data/src/rules/Makefile +4 -3
- data/src/rules/events.c +13 -0
- data/src/rules/json.h +1 -0
- data/src/rules/net.c +90 -241
- data/src/rules/regex.c +1240 -0
- data/src/rules/regex.h +20 -0
- data/src/rules/rete.c +107 -26
- data/src/rules/rete.h +12 -0
- data/src/rules/rules.h +9 -1
- data/src/rulesrb/rules.c +6 -5
- metadata +4 -2
data/src/rules/regex.c
ADDED
@@ -0,0 +1,1240 @@
|
|
1
|
+
|
2
|
+
#include <stdio.h>
|
3
|
+
#include <stdlib.h>
|
4
|
+
#include <string.h>
|
5
|
+
#include "rules.h"
|
6
|
+
#include "rete.h"
|
7
|
+
#include "regex.h"
|
8
|
+
|
9
|
+
/* Token kinds stored in token.type by the tokenizer. */
#define REGEX_SYMBOL    0x00
#define REGEX_UNION     0x01
#define REGEX_STAR      0x02
#define REGEX_PLUS      0x03
#define REGEX_QUESTION  0x04
#define REGEX_INTERVAL  0x05
#define REGEX_REGEX     0x06

/* Pseudo symbol emitted for the '.' wildcard. */
#define REGEX_DOT       0xFFFE

/* Fixed capacities of the containers used while building a state machine. */
#define MAX_TRANSITIONS 4096    /* transitions per state, symbols per token */
#define MAX_QUEUE       1024    /* CREATE_QUEUE ring buffer */
#define MAX_STATES      4096    /* upper bound on state ids */
#define MAX_HSET        1024    /* CREATE_HASHSET slots */
#define MAX_SET         8192    /* CREATE_SET slots */
#define MAX_LIST        1024    /* ADD() list bound */
#define MAX_INTERVAL    100     /* largest accepted {n,m} bound */
|
25
|
+
|
26
|
+
|
27
|
+
/*
 * Declares a circular FIFO in the current scope: `queue` (MAX_QUEUE slots),
 * `first` (next slot to read) and `last` (next slot to write).
 * One slot always stays unused so that first == last means "empty".
 */
#define CREATE_QUEUE(type) \
    type queue[MAX_QUEUE]; \
    unsigned short first = 0; \
    unsigned short last = 0;

/*
 * Appends `value`. Makes the ENCLOSING FUNCTION return ERR_REGEX_QUEUE_FULL
 * when the ring is full. The fullness test wraps modulo MAX_QUEUE; without the
 * wrap, a ring with last == MAX_QUEUE - 1 and first == 0 was overwritten and
 * then appeared empty.
 */
#define ENQUEUE(value) do { \
    if (((last + 1) % MAX_QUEUE) == first) { \
        return ERR_REGEX_QUEUE_FULL; \
    } \
    queue[last] = (value); \
    last = (last + 1) % MAX_QUEUE; \
} while(0)

/*
 * Pops the oldest element into `*value`; stores 0 when the queue is empty
 * (callers use the zero/NULL result as the loop terminator).
 */
#define DEQUEUE(value) do { \
    if (first == last) { \
        *(value) = 0; \
    } else { \
        *(value) = queue[first]; \
        first = (first + 1) % MAX_QUEUE; \
    } \
} while(0)
|
48
|
+
|
49
|
+
/*
 * Declares `list` and its element count `top` in the current scope.
 * The array is sized with MAX_LIST, the same constant ADD() checks against
 * (it was previously sized with MAX_QUEUE).
 */
#define CREATE_LIST(type) \
    type list[MAX_LIST]; \
    unsigned short top = 0;

#define LIST_EMPTY() (!top)

/*
 * Inserts `value` (a state pointer) keeping `list` sorted by ascending id.
 * Makes the ENCLOSING FUNCTION return ERR_REGEX_LIST_FULL once only one free
 * slot remains. The loop index has a distinctive name so it cannot shadow a
 * caller's `i`.
 */
#define ADD(value) do { \
    if ((top + 1) == MAX_LIST) { \
        return ERR_REGEX_LIST_FULL; \
    } \
    list[top++] = (value); \
    for (unsigned short listPos = top - 1; (listPos > 0) && (list[listPos]->id < list[listPos - 1]->id); --listPos) { \
        state *temp = list[listPos]; list[listPos] = list[listPos - 1]; list[listPos - 1] = temp; \
    } \
} while(0)

/* Expands to the (array, length) argument pair expected by list consumers. */
#define LIST list, top
|
66
|
+
|
67
|
+
/* Declares a zeroed open-addressing table `hset` of MAX_HSET pointers. */
#define CREATE_HASHSET(type) \
    type hset[MAX_HSET] = {0};

/*
 * Stores `value` in the first free slot found by linear probing from
 * value->hash. Makes the ENCLOSING FUNCTION return ERR_REGEX_SET_FULL when
 * every slot is occupied.
 */
#define HSET(value) do { \
    unsigned int size = 0; \
    unsigned short index = (value)->hash % MAX_HSET; \
    while (hset[index]) { \
        index = (index + 1) % MAX_HSET; \
        ++size; \
        if (size == MAX_HSET) { \
            return ERR_REGEX_SET_FULL; \
        } \
    } \
    hset[index] = (value); \
} while(0)

/*
 * Looks up the first entry whose hash equals `valueHash`; `*value` is NULL
 * when there is none. The probe count is bounded by MAX_HSET so a completely
 * full table cannot make the lookup spin forever.
 * Entries are matched by hash alone.
 */
#define HGET(valueHash, value) do { \
    unsigned int probes = 0; \
    unsigned short index = (valueHash) % MAX_HSET; \
    *(value) = NULL; \
    while (probes < MAX_HSET && hset[index] && !*(value)) { \
        if (hset[index]->hash == (valueHash)) { \
            *(value) = hset[index]; \
        } \
        index = (index + 1) % MAX_HSET; \
        ++probes; \
    } \
} while(0)

#define HASHSET hset
|
95
|
+
|
96
|
+
/*
 * Declares a zeroed open-addressing set `set` of MAX_SET integers.
 * A slot value of 0 means "empty", so the value 0 itself cannot be stored.
 */
#define CREATE_SET(type) \
    type set[MAX_SET] = {0};

/*
 * Inserts `value` by linear probing. Makes the ENCLOSING FUNCTION return
 * ERR_REGEX_SET_FULL when every slot is occupied.
 */
#define SET(value) do { \
    unsigned int size = 0; \
    unsigned int setIndex = (value) % MAX_SET; \
    while (set[setIndex]) { \
        setIndex = (setIndex + 1) % MAX_SET; \
        ++size; \
        if (size == MAX_SET) { \
            return ERR_REGEX_SET_FULL; \
        } \
    } \
    set[setIndex] = (value); \
} while(0)

/*
 * Sets `*result` to 1 when `value` is in the set, 0 otherwise.
 * The probe wraps modulo MAX_SET, matching SET(); it previously wrapped modulo
 * MAX_HSET and so missed colliding entries. The probe count is also bounded so
 * a full set cannot loop forever.
 */
#define EXISTS(value, result) do { \
    unsigned int probes = 0; \
    unsigned int setIndex = (value) % MAX_SET; \
    *(result) = 0; \
    while (probes < MAX_SET && set[setIndex] && !*(result)) { \
        if (set[setIndex] == (value)) { \
            *(result) = 1; \
        } \
        setIndex = (setIndex + 1) % MAX_SET; \
        ++probes; \
    } \
} while(0)
|
122
|
+
|
123
|
+
/*
 * Calls createState() and makes the ENCLOSING FUNCTION return its status
 * if the call did not yield RULES_OK.
 */
#define CREATE_STATE(stateId, newState) do { \
    unsigned int createStatus = createState(stateId, newState); \
    if (createStatus != RULES_OK) { \
        return createStatus; \
    } \
} while (0)

/*
 * Calls linkStates() and makes the ENCLOSING FUNCTION return its status
 * if the call did not yield RULES_OK.
 */
#define LINK_STATES(previousState, nextState, tokenSymbol) do { \
    unsigned int linkStatus = linkStates(previousState, nextState, tokenSymbol); \
    if (linkStatus != RULES_OK) { \
        return linkStatus; \
    } \
} while (0)
|
136
|
+
|
137
|
+
struct state;
|
138
|
+
|
139
|
+
typedef struct transition {
|
140
|
+
unsigned int symbol;
|
141
|
+
struct state *next;
|
142
|
+
} transition;
|
143
|
+
|
144
|
+
typedef struct state {
|
145
|
+
unsigned int hash;
|
146
|
+
unsigned short refCount;
|
147
|
+
unsigned short id;
|
148
|
+
unsigned short transitionsLength;
|
149
|
+
unsigned char isAccept;
|
150
|
+
unsigned char isReject;
|
151
|
+
transition transitions[MAX_TRANSITIONS];
|
152
|
+
} state;
|
153
|
+
|
154
|
+
typedef struct token {
|
155
|
+
unsigned char type;
|
156
|
+
unsigned short low;
|
157
|
+
unsigned short high;
|
158
|
+
unsigned short symbolsLength;
|
159
|
+
unsigned int symbols[MAX_TRANSITIONS];
|
160
|
+
unsigned short inverseSymbolsLength;
|
161
|
+
unsigned int inverseSymbols[MAX_TRANSITIONS];
|
162
|
+
} token;
|
163
|
+
|
164
|
+
typedef struct symbolEntry {
|
165
|
+
unsigned int symbol;
|
166
|
+
unsigned short index;
|
167
|
+
} symbolEntry;
|
168
|
+
|
169
|
+
/*
 * Value subtracted from the accumulated bytes of a UTF-8 sequence, indexed by
 * the number of trailing bytes, to obtain the code point.
 */
static const unsigned int UTF8_OFFSETS[6] = {
    0x00000000UL,
    0x00003080UL,
    0x000E2080UL,
    0x03C82080UL,
    0xFA082080UL,
    0x82082080UL
};

/* Number of bytes that follow a given lead byte, indexed by the lead byte value. */
static const char UTF8_TRAILING[256] = {
    /* 0x00 - 0x1F */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x20 - 0x3F */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x40 - 0x5F */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x60 - 0x7F */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x80 - 0x9F */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xA0 - 0xBF */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xC0 - 0xDF */
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xE0 - 0xFF */
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
};

/* Symbol value of a transition that carries no symbol. */
static const unsigned int EMPTY = 0;
|
186
|
+
|
187
|
+
unsigned int utf8ToUnicode(char **first, char *last, unsigned int *result) {
|
188
|
+
unsigned char byteNumber = UTF8_TRAILING[(unsigned char)*first[0]];
|
189
|
+
if (*first + byteNumber >= last) {
|
190
|
+
return ERR_PARSE_REGEX;
|
191
|
+
}
|
192
|
+
|
193
|
+
*result = 0;
|
194
|
+
switch (byteNumber) {
|
195
|
+
case 3:
|
196
|
+
*result += (unsigned char)*first[0];
|
197
|
+
*result <<= 6;
|
198
|
+
++*first;
|
199
|
+
case 2:
|
200
|
+
*result += (unsigned char)*first[0];
|
201
|
+
*result <<= 6;
|
202
|
+
++*first;
|
203
|
+
case 1:
|
204
|
+
*result += (unsigned char)*first[0];
|
205
|
+
*result <<= 6;
|
206
|
+
++*first;
|
207
|
+
case 0:
|
208
|
+
*result += (unsigned char)*first[0];
|
209
|
+
++*first;
|
210
|
+
}
|
211
|
+
*result -= UTF8_OFFSETS[byteNumber];
|
212
|
+
return REGEX_PARSE_OK;
|
213
|
+
}
|
214
|
+
|
215
|
+
static unsigned int readInternalRange(char *first,
|
216
|
+
unsigned short *rangeLength,
|
217
|
+
unsigned int *range);
|
218
|
+
|
219
|
+
static unsigned int readEscapedSymbol(char **first,
|
220
|
+
char *last,
|
221
|
+
unsigned short *rangeLength,
|
222
|
+
unsigned int *range) {
|
223
|
+
++*first;
|
224
|
+
if (*first >= last) {
|
225
|
+
return ERR_PARSE_REGEX;
|
226
|
+
}
|
227
|
+
|
228
|
+
switch (*first[0]) {
|
229
|
+
case '.':
|
230
|
+
case '|':
|
231
|
+
case '?':
|
232
|
+
case '*':
|
233
|
+
case '+':
|
234
|
+
case '(':
|
235
|
+
case ')':
|
236
|
+
case '[':
|
237
|
+
case ']':
|
238
|
+
case '{':
|
239
|
+
case '}':
|
240
|
+
case '%':
|
241
|
+
range[*rangeLength] = *first[0];
|
242
|
+
++*rangeLength;
|
243
|
+
++*first;
|
244
|
+
return REGEX_PARSE_OK;
|
245
|
+
case 'a':
|
246
|
+
++*first;
|
247
|
+
return readInternalRange("[\x41-\x5A\x61-\x7A\xC3\x80-\xC3\x96\xC3\x98-\xC3\xB6\xC3\xB8-\xC3\xBF]", rangeLength, range);
|
248
|
+
case 'c':
|
249
|
+
++*first;
|
250
|
+
return readInternalRange("[\x00-\x1F\x7F\xC2\x80-\xC2\x9F]", rangeLength, range);
|
251
|
+
case 'd':
|
252
|
+
++*first;
|
253
|
+
return readInternalRange("[0-9]", rangeLength, range);
|
254
|
+
case 'g':
|
255
|
+
++*first;
|
256
|
+
return readInternalRange("[\x21-\x7E]", rangeLength, range);
|
257
|
+
case 'l':
|
258
|
+
++*first;
|
259
|
+
return readInternalRange("[\x61-\x7A\xC3\x9F-\xC3\xB6\xC3\xB8-\xC3\xBF]", rangeLength, range);
|
260
|
+
case 'p':
|
261
|
+
++*first;
|
262
|
+
return readInternalRange("[.,;:?!'\"()\xC2\xA1\xC2\xBF-]", rangeLength, range);
|
263
|
+
case 's':
|
264
|
+
++*first;
|
265
|
+
return readInternalRange("[\x09-\x0D\x20]", rangeLength, range);
|
266
|
+
case 'u':
|
267
|
+
++*first;
|
268
|
+
return readInternalRange("[\x41-\x5A\xC3\x80-\xC3\x96\xC3\x98-\xC3\x9E]", rangeLength, range);
|
269
|
+
case 'w':
|
270
|
+
++*first;
|
271
|
+
return readInternalRange("[A-Za-z0-9]", rangeLength, range);
|
272
|
+
case 'x':
|
273
|
+
++*first;
|
274
|
+
return readInternalRange("[0-9A-Fa-f]", rangeLength, range);
|
275
|
+
}
|
276
|
+
|
277
|
+
return ERR_PARSE_REGEX;
|
278
|
+
}
|
279
|
+
|
280
|
+
static unsigned int readRange(char **first,
|
281
|
+
char *last,
|
282
|
+
unsigned short *rangeLength,
|
283
|
+
unsigned int *range,
|
284
|
+
unsigned short *inverseRangeLength,
|
285
|
+
unsigned int *inverseRange) {
|
286
|
+
unsigned char parseBegin = 1;
|
287
|
+
unsigned int lastSymbol = 0;
|
288
|
+
unsigned int currentSymbol;
|
289
|
+
unsigned char inverse = 0;
|
290
|
+
unsigned int result;
|
291
|
+
*rangeLength = 0;
|
292
|
+
if (inverseRangeLength) {
|
293
|
+
*inverseRangeLength = 0;
|
294
|
+
}
|
295
|
+
|
296
|
+
++*first;
|
297
|
+
if (*first[0] == '^') {
|
298
|
+
if (*first == last) {
|
299
|
+
return ERR_PARSE_REGEX;
|
300
|
+
}
|
301
|
+
|
302
|
+
inverse = 1;
|
303
|
+
++*first;
|
304
|
+
}
|
305
|
+
|
306
|
+
if (*first[0] == ']') {
|
307
|
+
if (*first == last) {
|
308
|
+
return ERR_PARSE_REGEX;
|
309
|
+
}
|
310
|
+
|
311
|
+
if (inverse) {
|
312
|
+
inverseRange[*inverseRangeLength] = (unsigned int)']';
|
313
|
+
++*inverseRangeLength;
|
314
|
+
} else {
|
315
|
+
range[*rangeLength] = (unsigned int)']';
|
316
|
+
++*rangeLength;
|
317
|
+
}
|
318
|
+
}
|
319
|
+
|
320
|
+
while (*first[0] != ']') {
|
321
|
+
if (*first == last) {
|
322
|
+
return ERR_PARSE_REGEX;
|
323
|
+
}
|
324
|
+
|
325
|
+
if (!parseBegin) {
|
326
|
+
if (!lastSymbol) {
|
327
|
+
return ERR_PARSE_REGEX;
|
328
|
+
}
|
329
|
+
|
330
|
+
result = utf8ToUnicode(first, last, ¤tSymbol);
|
331
|
+
if (result != REGEX_PARSE_OK) {
|
332
|
+
return result;
|
333
|
+
}
|
334
|
+
|
335
|
+
while (currentSymbol != lastSymbol) {
|
336
|
+
if (inverse) {
|
337
|
+
inverseRange[*inverseRangeLength] = currentSymbol;
|
338
|
+
++*inverseRangeLength;
|
339
|
+
} else {
|
340
|
+
range[*rangeLength] = currentSymbol;
|
341
|
+
++*rangeLength;
|
342
|
+
}
|
343
|
+
|
344
|
+
if (currentSymbol > lastSymbol) {
|
345
|
+
--currentSymbol;
|
346
|
+
} else {
|
347
|
+
++currentSymbol;
|
348
|
+
}
|
349
|
+
}
|
350
|
+
parseBegin = 1;
|
351
|
+
} else {
|
352
|
+
if (*first[0] == '-') {
|
353
|
+
parseBegin = 0;
|
354
|
+
++*first;
|
355
|
+
} else {
|
356
|
+
if (*first[0] != '%') {
|
357
|
+
result = utf8ToUnicode(first, last, ¤tSymbol);
|
358
|
+
if (result != REGEX_PARSE_OK) {
|
359
|
+
return result;
|
360
|
+
}
|
361
|
+
|
362
|
+
if (inverse) {
|
363
|
+
inverseRange[*inverseRangeLength] = currentSymbol;
|
364
|
+
++*inverseRangeLength;
|
365
|
+
} else {
|
366
|
+
range[*rangeLength] = currentSymbol;
|
367
|
+
++*rangeLength;
|
368
|
+
}
|
369
|
+
lastSymbol = currentSymbol;
|
370
|
+
} else {
|
371
|
+
if (inverse) {
|
372
|
+
unsigned int result = readEscapedSymbol(first, last, inverseRangeLength, inverseRange);
|
373
|
+
if (result != REGEX_PARSE_OK) {
|
374
|
+
return result;
|
375
|
+
}
|
376
|
+
} else {
|
377
|
+
unsigned int result = readEscapedSymbol(first, last, rangeLength, range);
|
378
|
+
if (result != REGEX_PARSE_OK) {
|
379
|
+
return result;
|
380
|
+
}
|
381
|
+
}
|
382
|
+
lastSymbol = 0;
|
383
|
+
}
|
384
|
+
}
|
385
|
+
}
|
386
|
+
}
|
387
|
+
|
388
|
+
if (!parseBegin) {
|
389
|
+
if (inverse) {
|
390
|
+
inverseRange[*inverseRangeLength] = (unsigned int)'-';
|
391
|
+
++*inverseRangeLength;
|
392
|
+
} else {
|
393
|
+
range[*rangeLength] = (unsigned int)'-';
|
394
|
+
++*rangeLength;
|
395
|
+
}
|
396
|
+
}
|
397
|
+
|
398
|
+
++*first;
|
399
|
+
return REGEX_PARSE_OK;
|
400
|
+
}
|
401
|
+
|
402
|
+
/*
 * Expands a built-in bracket expression (a NUL-terminated "[...]" string)
 * into `range` by running it through readRange with no inverse buffer.
 * readRange resets *rangeLength, so earlier contents of `range` are replaced.
 */
static unsigned int readInternalRange(char *first,
                                      unsigned short *rangeLength,
                                      unsigned int *range) {
    unsigned int patternLength = strlen(first);
    /* readRange is handed the position of the final character as its bound. */
    char *finalCharacter = first + patternLength - 1;
    char *cursor = first;

    return readRange(&cursor, finalCharacter, rangeLength, range, NULL, NULL);
}
|
408
|
+
|
409
|
+
static unsigned int readInterval(char **first,
|
410
|
+
char *last,
|
411
|
+
unsigned short *low,
|
412
|
+
unsigned short *high) {
|
413
|
+
|
414
|
+
++*first;
|
415
|
+
unsigned char parseBegin = 1;
|
416
|
+
char *numberBegin = *first;
|
417
|
+
while (*first[0] != '}') {
|
418
|
+
if (*first == last) {
|
419
|
+
return ERR_PARSE_REGEX;
|
420
|
+
}
|
421
|
+
|
422
|
+
if (parseBegin) {
|
423
|
+
if (*first[0] == ',' && numberBegin != *first) {
|
424
|
+
parseBegin = 0;
|
425
|
+
*first[0] = '\0';
|
426
|
+
*low = atoi(numberBegin);
|
427
|
+
*first[0] = ',';
|
428
|
+
numberBegin = *first + 1;
|
429
|
+
} else if (*first[0] > '9' || *first[0] < 0) {
|
430
|
+
return ERR_PARSE_REGEX;
|
431
|
+
}
|
432
|
+
} else if (*first[0] > '9' || *first[0] < 0) {
|
433
|
+
return ERR_PARSE_REGEX;
|
434
|
+
}
|
435
|
+
|
436
|
+
++*first;
|
437
|
+
}
|
438
|
+
|
439
|
+
if (numberBegin == *first) {
|
440
|
+
*high = 0;
|
441
|
+
} else {
|
442
|
+
*first[0] = '\0';
|
443
|
+
*high = atoi(numberBegin);
|
444
|
+
*first[0] = '}';
|
445
|
+
|
446
|
+
if (parseBegin) {
|
447
|
+
*low = *high;
|
448
|
+
}
|
449
|
+
}
|
450
|
+
|
451
|
+
if ((*high && *low > *high) || *high > MAX_INTERVAL) {
|
452
|
+
return ERR_PARSE_REGEX;
|
453
|
+
}
|
454
|
+
|
455
|
+
++*first;
|
456
|
+
return REGEX_PARSE_OK;
|
457
|
+
}
|
458
|
+
|
459
|
+
static unsigned int readNextToken(char **first,
|
460
|
+
char *last,
|
461
|
+
token *nextToken) {
|
462
|
+
unsigned int result = REGEX_PARSE_OK;
|
463
|
+
if (*first >= last) {
|
464
|
+
return REGEX_PARSE_END;
|
465
|
+
}
|
466
|
+
|
467
|
+
switch (*first[0]) {
|
468
|
+
case '|':
|
469
|
+
nextToken->type = REGEX_UNION;
|
470
|
+
break;
|
471
|
+
case '?':
|
472
|
+
nextToken->type = REGEX_QUESTION;
|
473
|
+
break;
|
474
|
+
case '*':
|
475
|
+
nextToken->type = REGEX_STAR;
|
476
|
+
break;
|
477
|
+
case '+':
|
478
|
+
nextToken->type = REGEX_PLUS;
|
479
|
+
break;
|
480
|
+
case '(':
|
481
|
+
nextToken->type = REGEX_REGEX;
|
482
|
+
break;
|
483
|
+
case ')':
|
484
|
+
nextToken->type = REGEX_REGEX;
|
485
|
+
result = REGEX_PARSE_END;
|
486
|
+
break;
|
487
|
+
case '[':
|
488
|
+
nextToken->type = REGEX_SYMBOL;
|
489
|
+
return readRange(first, last, &nextToken->symbolsLength,
|
490
|
+
nextToken->symbols,
|
491
|
+
&nextToken->inverseSymbolsLength,
|
492
|
+
nextToken->inverseSymbols);
|
493
|
+
case '{':
|
494
|
+
nextToken->type = REGEX_INTERVAL;
|
495
|
+
return readInterval(first, last, &nextToken->low, &nextToken->high);
|
496
|
+
case '%':
|
497
|
+
nextToken->type = REGEX_SYMBOL;
|
498
|
+
return readEscapedSymbol(first, last, &nextToken->symbolsLength, nextToken->symbols);
|
499
|
+
case '.':
|
500
|
+
nextToken->type = REGEX_SYMBOL;
|
501
|
+
nextToken->symbolsLength = 1;
|
502
|
+
nextToken->symbols[0] = REGEX_DOT;
|
503
|
+
break;
|
504
|
+
default:
|
505
|
+
nextToken->type = REGEX_SYMBOL;
|
506
|
+
nextToken->symbolsLength = 1;
|
507
|
+
return utf8ToUnicode(first, last, &nextToken->symbols[0]);
|
508
|
+
}
|
509
|
+
|
510
|
+
++*first;
|
511
|
+
return result;
|
512
|
+
}
|
513
|
+
|
514
|
+
/*
 * Reserves a zeroed region for one state machine at the end of the ruleset's
 * state machine pool, growing (or creating) the pool as needed.
 *
 * The region holds 2 * vocabularyLength symbolEntry records, a
 * statesLength x vocabularyLength table of unsigned shorts, and statesLength
 * bytes.
 *
 * newStateMachine    - receives a pointer to the region (may be invalidated
 *                      by a later call that moves the pool).
 * stateMachineOffset - receives the region's offset within the pool.
 *
 * Returns RULES_OK or ERR_OUT_OF_MEMORY. On failure the existing pool is left
 * untouched; the previous code overwrote the pool pointer with realloc's NULL
 * result and lost the pool.
 */
static unsigned int storeRegexStateMachine(ruleset *tree,
                                           unsigned short vocabularyLength,
                                           unsigned short statesLength,
                                           void **newStateMachine,
                                           unsigned int *stateMachineOffset) {

    unsigned int stateMachinelength = sizeof(symbolEntry) * vocabularyLength * 2;
    stateMachinelength = stateMachinelength + sizeof(unsigned short) * statesLength * vocabularyLength;
    stateMachinelength = stateMachinelength + sizeof(unsigned char) * statesLength;

    /* With no pool yet, realloc(NULL, n) behaves like malloc(n). */
    unsigned int usedLength = tree->regexStateMachinePool ? tree->regexStateMachineOffset : 0;
    void *grownPool = realloc(tree->regexStateMachinePool, usedLength + stateMachinelength);
    if (!grownPool) {
        return ERR_OUT_OF_MEMORY;
    }

    tree->regexStateMachinePool = grownPool;
    memset(&tree->regexStateMachinePool[usedLength], 0, stateMachinelength);
    *stateMachineOffset = usedLength;
    *newStateMachine = &tree->regexStateMachinePool[usedLength];
    tree->regexStateMachineOffset = usedLength + stateMachinelength;

    return RULES_OK;
}
|
547
|
+
|
548
|
+
static unsigned int createState(unsigned short *stateId,
|
549
|
+
state **newState) {
|
550
|
+
if (*stateId == MAX_STATES) {
|
551
|
+
return ERR_REGEX_MAX_STATES;
|
552
|
+
}
|
553
|
+
*newState = malloc(sizeof(state));
|
554
|
+
if (*newState == NULL) {
|
555
|
+
return ERR_OUT_OF_MEMORY;
|
556
|
+
}
|
557
|
+
(*newState)->id = *stateId;
|
558
|
+
(*newState)->transitionsLength = 0;
|
559
|
+
(*newState)->refCount = 0;
|
560
|
+
(*newState)->isAccept = 0;
|
561
|
+
(*newState)->isReject = 0;
|
562
|
+
(*newState)->hash = 0;
|
563
|
+
++*stateId;
|
564
|
+
|
565
|
+
return RULES_OK;
|
566
|
+
}
|
567
|
+
|
568
|
+
/*
 * Adds a transition on tokenSymbol from previousState to nextState and takes
 * a reference on nextState. Does nothing when previousState already has a
 * transition with the same symbol to a state with the same id.
 *
 * Returns RULES_OK, or ERR_REGEX_MAX_TRANSITIONS when the insertion filled the
 * transition array (the transition has been stored when that is reported).
 */
static unsigned int linkStates(state *previousState,
                               state *nextState,
                               unsigned int tokenSymbol) {
    unsigned short count = previousState->transitionsLength;

    for (unsigned short slot = 0; slot < count; ++slot) {
        transition *existing = &previousState->transitions[slot];
        if (existing->symbol == tokenSymbol && existing->next->id == nextState->id) {
            return RULES_OK;
        }
    }

    transition *added = &previousState->transitions[count];
    added->symbol = tokenSymbol;
    added->next = nextState;
    previousState->transitionsLength = count + 1;
    ++nextState->refCount;

    if (previousState->transitionsLength == MAX_TRANSITIONS) {
        return ERR_REGEX_MAX_TRANSITIONS;
    }

    return RULES_OK;
}
|
588
|
+
|
589
|
+
/*
 * Removes transition `index` from previousState, closing the gap in the
 * array. The target loses one reference and is freed when none remain.
 *
 * NOTE(review): the target's own outgoing references are not released here,
 * and a target equal to previousState would be freed while still in use.
 */
static void deleteTransition(state *previousState, unsigned short index) {
    transition *slots = previousState->transitions;
    state *target = slots[index].next;

    --target->refCount;
    if (target->refCount == 0) {
        free(target);
    }

    /* Shift every later transition one slot towards the front. */
    for (unsigned short from = index + 1; from < previousState->transitionsLength; ++from) {
        slots[from - 1] = slots[from];
    }

    --previousState->transitionsLength;
}
|
602
|
+
|
603
|
+
/*
 * Removes every transition on tokenSymbol from previousState to a state with
 * nextState's id.
 *
 * The id is captured before any deletion, because deleteTransition may free
 * nextState when its last reference goes away; the previous loop kept reading
 * nextState->id afterwards. The index is not advanced after a deletion, so the
 * transition shifted into the freed slot is examined too.
 */
static void unlinkStates(state *previousState,
                         state *nextState,
                         unsigned int tokenSymbol) {
    const unsigned short targetId = nextState->id;
    unsigned short i = 0;

    while (i < previousState->transitionsLength) {
        transition *candidate = &previousState->transitions[i];
        if (candidate->symbol == tokenSymbol && candidate->next->id == targetId) {
            deleteTransition(previousState, i);
        } else {
            ++i;
        }
    }
}
|
613
|
+
|
614
|
+
#ifdef _PRINT
/*
 * Debug helper: prints every state reachable from `start`, breadth first,
 * with its accept/reject flags and outgoing transitions.
 *
 * Returns RULES_OK, or ERR_REGEX_QUEUE_FULL when the traversal queue overflows.
 * (Restores the '&' operators that had been mangled into a currency sign.)
 */
static unsigned int printGraph(state *start) {
    CREATE_QUEUE(state*);
    unsigned char visited[MAX_STATES] = {0};
    state *currentState = start;
    visited[currentState->id] = 1;
    while (currentState) {
        printf("State %d\n", currentState->id);
        if (currentState->isAccept) {
            printf("    Accept\n");
        }
        if (currentState->isReject) {
            printf("    Reject\n");
        }
        for (int i = 0; i < currentState->transitionsLength; ++ i) {
            transition *currentTransition = &currentState->transitions[i];
            printf("    transition %x to state %d\n", currentTransition->symbol, currentTransition->next->id);
            if (!visited[currentTransition->next->id]) {
                visited[currentTransition->next->id] = 1;
                ENQUEUE(currentTransition->next);
            }
        }

        /* Yields NULL when the queue is empty, which ends the loop. */
        DEQUEUE(&currentState);
    }

    return RULES_OK;
}
#endif
|
643
|
+
|
644
|
+
/*
 * Copies the sub graph reachable from startState, allocating new states with
 * ids taken from *id, and reports the copies of startState and endState.
 *
 * `visited` maps an original state's id to its copy and doubles as the
 * "already seen" marker of the breadth-first traversal.
 *
 * Returns RULES_OK or the first error raised by state creation, linking or the
 * traversal queue. States copied before an error are not released.
 * NOTE(review): *newEnd is NULL if endState is not reachable from startState.
 * (Restores the '&' operators that had been mangled into a currency sign.)
 */
static unsigned int cloneGraph(state *startState,
                               state *endState,
                               unsigned short *id,
                               state **newStart,
                               state **newEnd) {
    CREATE_QUEUE(state*);
    state *visited[MAX_STATES] = { NULL };
    state *currentState = startState;
    CREATE_STATE(id, &visited[currentState->id]);
    while (currentState) {
        if (currentState->isAccept) {
            visited[currentState->id]->isAccept = 1;
        }

        if (currentState->isReject) {
            visited[currentState->id]->isReject = 1;
        }

        for (int i = 0; i < currentState->transitionsLength; ++ i) {
            transition *currentTransition = &currentState->transitions[i];

            if (!visited[currentTransition->next->id]) {
                CREATE_STATE(id, &visited[currentTransition->next->id]);
                ENQUEUE(currentTransition->next);
            }

            LINK_STATES(visited[currentState->id], visited[currentTransition->next->id], currentTransition->symbol);
        }

        /* Yields NULL when the queue is empty, which ends the loop. */
        DEQUEUE(&currentState);
    }

    *newStart = visited[startState->id];
    *newEnd = visited[endState->id];
    return RULES_OK;
}
|
680
|
+
|
681
|
+
static unsigned int createGraph(char **first,
|
682
|
+
char *last,
|
683
|
+
unsigned short *id,
|
684
|
+
state **startState,
|
685
|
+
state **endState) {
|
686
|
+
CREATE_STATE(id, startState);
|
687
|
+
CREATE_STATE(id, endState);
|
688
|
+
state *previousState = *startState;
|
689
|
+
state *currentState = *startState;
|
690
|
+
|
691
|
+
token currentToken;
|
692
|
+
unsigned int result = readNextToken(first, last, ¤tToken);
|
693
|
+
while (result == REGEX_PARSE_OK) {
|
694
|
+
switch (currentToken.type) {
|
695
|
+
case REGEX_SYMBOL:
|
696
|
+
previousState = currentState;
|
697
|
+
if (currentToken.symbolsLength) {
|
698
|
+
CREATE_STATE(id, ¤tState);
|
699
|
+
for (unsigned short i = 0; i < currentToken.symbolsLength; ++i) {
|
700
|
+
LINK_STATES(previousState, currentState, currentToken.symbols[i]);
|
701
|
+
}
|
702
|
+
}
|
703
|
+
|
704
|
+
if (currentToken.inverseSymbolsLength) {
|
705
|
+
CREATE_STATE(id, ¤tState);
|
706
|
+
currentState->isReject = 1;
|
707
|
+
for (unsigned short i = 0; i < currentToken.inverseSymbolsLength; ++i) {
|
708
|
+
LINK_STATES(previousState, currentState, currentToken.inverseSymbols[i]);
|
709
|
+
}
|
710
|
+
|
711
|
+
CREATE_STATE(id, ¤tState);
|
712
|
+
LINK_STATES(previousState, currentState, REGEX_DOT);
|
713
|
+
}
|
714
|
+
|
715
|
+
break;
|
716
|
+
case REGEX_UNION:
|
717
|
+
LINK_STATES(currentState, *endState, EMPTY);
|
718
|
+
CREATE_STATE(id, ¤tState);
|
719
|
+
previousState = *startState;
|
720
|
+
LINK_STATES(previousState, currentState, EMPTY);
|
721
|
+
break;
|
722
|
+
case REGEX_STAR:
|
723
|
+
{
|
724
|
+
state *anchorState;
|
725
|
+
CREATE_STATE(id, &anchorState);
|
726
|
+
LINK_STATES(currentState, previousState, EMPTY);
|
727
|
+
LINK_STATES(currentState, anchorState, EMPTY);
|
728
|
+
LINK_STATES(previousState, anchorState, EMPTY);
|
729
|
+
previousState = currentState;
|
730
|
+
currentState = anchorState;
|
731
|
+
}
|
732
|
+
break;
|
733
|
+
case REGEX_PLUS:
|
734
|
+
{
|
735
|
+
state *anchorState;
|
736
|
+
CREATE_STATE(id, &anchorState);
|
737
|
+
LINK_STATES(currentState, previousState, EMPTY);
|
738
|
+
LINK_STATES(currentState, anchorState, EMPTY);
|
739
|
+
previousState = currentState;
|
740
|
+
currentState = anchorState;
|
741
|
+
}
|
742
|
+
break;
|
743
|
+
case REGEX_QUESTION:
|
744
|
+
{
|
745
|
+
state *anchorState;
|
746
|
+
CREATE_STATE(id, &anchorState);
|
747
|
+
LINK_STATES(currentState, anchorState, EMPTY);
|
748
|
+
LINK_STATES(previousState, anchorState, EMPTY);
|
749
|
+
previousState = currentState;
|
750
|
+
currentState = anchorState;
|
751
|
+
}
|
752
|
+
break;
|
753
|
+
case REGEX_REGEX:
|
754
|
+
{
|
755
|
+
state *subStart;
|
756
|
+
state *subEnd;
|
757
|
+
result = createGraph(first, last, id, &subStart, &subEnd);
|
758
|
+
if (result != REGEX_PARSE_OK) {
|
759
|
+
return result;
|
760
|
+
}
|
761
|
+
|
762
|
+
LINK_STATES(currentState, subStart, EMPTY);
|
763
|
+
previousState = currentState;
|
764
|
+
currentState = subEnd;
|
765
|
+
}
|
766
|
+
break;
|
767
|
+
case REGEX_INTERVAL:
|
768
|
+
{
|
769
|
+
state *newCurrent = NULL;
|
770
|
+
state *newPrevious = NULL;
|
771
|
+
state *subStart = previousState;
|
772
|
+
state *subEnd = currentState;
|
773
|
+
state *anchorState;
|
774
|
+
CREATE_STATE(id, &anchorState);
|
775
|
+
for (unsigned short i = 1; i < (!currentToken.high? currentToken.low: currentToken.high); ++i) {
|
776
|
+
result = cloneGraph(previousState, currentState, id, &subStart, &subEnd);
|
777
|
+
if (result != REGEX_PARSE_OK) {
|
778
|
+
return result;
|
779
|
+
}
|
780
|
+
|
781
|
+
if (newCurrent) {
|
782
|
+
LINK_STATES(newCurrent, subStart, EMPTY);
|
783
|
+
} else {
|
784
|
+
newPrevious = subStart;
|
785
|
+
}
|
786
|
+
|
787
|
+
if (i >= currentToken.low) {
|
788
|
+
LINK_STATES(subStart, anchorState, EMPTY);
|
789
|
+
}
|
790
|
+
|
791
|
+
newCurrent = subEnd;
|
792
|
+
}
|
793
|
+
|
794
|
+
if (!currentToken.high) {
|
795
|
+
LINK_STATES(subEnd, subStart, EMPTY);
|
796
|
+
}
|
797
|
+
|
798
|
+
if (!currentToken.low) {
|
799
|
+
LINK_STATES(previousState, anchorState, EMPTY);
|
800
|
+
}
|
801
|
+
|
802
|
+
if (!newPrevious) {
|
803
|
+
LINK_STATES(currentState, anchorState, EMPTY);
|
804
|
+
previousState = currentState;
|
805
|
+
} else {
|
806
|
+
LINK_STATES(currentState, newPrevious, EMPTY);
|
807
|
+
LINK_STATES(newCurrent, anchorState, EMPTY);
|
808
|
+
previousState = newCurrent;
|
809
|
+
}
|
810
|
+
currentState = anchorState;
|
811
|
+
}
|
812
|
+
break;
|
813
|
+
}
|
814
|
+
if (result == REGEX_PARSE_OK) {
|
815
|
+
result = readNextToken(first, last, ¤tToken);
|
816
|
+
}
|
817
|
+
}
|
818
|
+
|
819
|
+
LINK_STATES(currentState, *endState, EMPTY);
|
820
|
+
|
821
|
+
if (result == REGEX_PARSE_END) {
|
822
|
+
return REGEX_PARSE_OK;
|
823
|
+
}
|
824
|
+
|
825
|
+
return result;
|
826
|
+
}
|
827
|
+
|
828
|
+
static unsigned int validateGraph(char **first, char *last) {
|
829
|
+
token currentToken;
|
830
|
+
unsigned int result = readNextToken(first, last, ¤tToken);
|
831
|
+
while (result == REGEX_PARSE_OK) {
|
832
|
+
switch (currentToken.type) {
|
833
|
+
case REGEX_SYMBOL:
|
834
|
+
case REGEX_UNION:
|
835
|
+
case REGEX_STAR:
|
836
|
+
case REGEX_PLUS:
|
837
|
+
case REGEX_QUESTION:
|
838
|
+
break;
|
839
|
+
case REGEX_REGEX:
|
840
|
+
result = validateGraph(first, last);
|
841
|
+
if (result != REGEX_PARSE_OK) {
|
842
|
+
return result;
|
843
|
+
}
|
844
|
+
|
845
|
+
break;
|
846
|
+
}
|
847
|
+
|
848
|
+
if (result == REGEX_PARSE_OK) {
|
849
|
+
result = readNextToken(first, last, ¤tToken);
|
850
|
+
}
|
851
|
+
}
|
852
|
+
|
853
|
+
if (result == REGEX_PARSE_END) {
|
854
|
+
return REGEX_PARSE_OK;
|
855
|
+
}
|
856
|
+
|
857
|
+
return REGEX_PARSE_OK;
|
858
|
+
}
|
859
|
+
|
860
|
+
/*
 * Hashes the ids of the states in `list`, in order, starting from 5381 and
 * multiplying by 33 at each step. The result is truncated to the
 * unsigned short return type.
 */
static unsigned short calculateHash(state **list,
                                    unsigned short stateListLength) {
    unsigned int accumulator = 5381;
    state **end = list + stateListLength;

    for (state **cursor = list; cursor != end; ++cursor) {
        accumulator = accumulator * 33u + (*cursor)->id;
    }

    return (unsigned short)accumulator;
}
|
869
|
+
|
870
|
+
static unsigned int ensureState(unsigned short *id,
|
871
|
+
state **list,
|
872
|
+
unsigned short stateListLength,
|
873
|
+
state **newState) {
|
874
|
+
CREATE_STATE(id, newState);
|
875
|
+
for (unsigned short i = 0; i < stateListLength; ++i) {
|
876
|
+
state *targetState = list[i];
|
877
|
+
for (unsigned short ii = 0; ii < targetState->transitionsLength; ++ii) {
|
878
|
+
transition *targetTransition = &targetState->transitions[ii];
|
879
|
+
LINK_STATES(*newState, targetTransition->next, targetTransition->symbol);
|
880
|
+
}
|
881
|
+
|
882
|
+
if (targetState->isAccept) {
|
883
|
+
(*newState)->isAccept = 1;
|
884
|
+
}
|
885
|
+
|
886
|
+
if (targetState->isReject) {
|
887
|
+
(*newState)->isReject = 1;
|
888
|
+
}
|
889
|
+
|
890
|
+
if ((*newState)->isReject && (*newState)->isAccept) {
|
891
|
+
return ERR_REGEX_CONFLICT;
|
892
|
+
}
|
893
|
+
}
|
894
|
+
|
895
|
+
return RULES_OK;
|
896
|
+
}
|
897
|
+
|
898
|
+
/*
 * Removes the EMPTY (symbol 0) transitions of currentState. For each one, the
 * target's transitions and accept / reject flags are folded into currentState
 * and the EMPTY transition is deleted.
 *
 * `id` is not used by this function.
 *
 * Returns RULES_OK, an error from linking, or ERR_REGEX_CONFLICT when
 * currentState becomes both accepting and rejecting.
 * (Restores the '&' operator that had been mangled into a currency sign; the
 * logic is otherwise unchanged.)
 */
static unsigned int consolidateStates(state *currentState,
                                      unsigned short *id) {
    for (unsigned short i = 0; i < currentState->transitionsLength; ++i) {
        transition *currentTransition = &currentState->transitions[i];
        if (!currentTransition->symbol) {
            state *nextState = currentTransition->next;
            if (nextState != currentState) {
                for (unsigned short ii = 0; ii < nextState->transitionsLength; ++ii) {
                    transition *nextTransition = &nextState->transitions[ii];
                    LINK_STATES(currentState, nextTransition->next, nextTransition->symbol);
                    /* nextState is about to be freed by deleteTransition below,
                     * so drop the reference it holds on its target. */
                    if (nextState->refCount == 1) {
                        --nextTransition->next->refCount;
                    }
                }
            }

            if (nextState->isAccept) {
                currentState->isAccept = 1;
            }

            if (nextState->isReject) {
                currentState->isReject = 1;
            }

            if (currentState->isAccept && currentState->isReject) {
                return ERR_REGEX_CONFLICT;
            }

            deleteTransition(currentState, i);
            /* Re-examine the slot that now holds the next transition; at
             * i == 0 this wraps and the loop's ++i brings it back to 0. */
            --i;
        }
    }

    return RULES_OK;
}
|
933
|
+
|
934
|
+
// Merges transitions of currentState that can fire on the same input.
//
// Two transitions are grouped when their symbols are equal, or when one of
// them is REGEX_DOT and the other one's target is not a reject state. The
// targets of a group are collected in a list; a combined state for that list
// is looked up in hset by hash and created through ensureState when absent.
// After the scan the grouped transitions are unlinked from currentState and
// one transition per group is linked to the combined state.
//
// Returns RULES_OK on success, or the error produced by ensureState or by any
// of the helper macros (which may return from this function).
static unsigned int consolidateTransitions(state *currentState,
                                           unsigned short *id,
                                           state **hset) {
    transition oldTransitions[MAX_TRANSITIONS];
    unsigned short oldTransitionsLength = 0;
    transition newTransitions[MAX_TRANSITIONS];
    unsigned short newTransitionsLength = 0;
    CREATE_SET(unsigned int);

    for (unsigned short i = 0; i < currentState->transitionsLength; ++i) {
        transition *currentTransition = &currentState->transitions[i];
        CREATE_LIST(state*);
        unsigned int foundSymbol = 0;
        unsigned char symbolExists = 0;
        EXISTS(currentTransition->symbol, &symbolExists);
        if (symbolExists) {
            // This symbol was already handled as the head of a group.
            continue;
        }

        SET(currentTransition->symbol);
        for (unsigned short ii = i + 1; ii < currentState->transitionsLength; ++ii) {
            transition *targetTransition = &currentState->transitions[ii];
            if (!((currentTransition->symbol == targetTransition->symbol) ||
                  (currentTransition->symbol == REGEX_DOT && !targetTransition->next->isReject) ||
                  (targetTransition->symbol == REGEX_DOT && !currentTransition->next->isReject))) {
                continue;
            }

            // The merged transition carries the concrete symbol when one
            // side of the pair is the DOT wildcard.
            foundSymbol = (currentTransition->symbol == REGEX_DOT)
                              ? targetTransition->symbol
                              : currentTransition->symbol;

            if (LIST_EMPTY()) {
                // First match for this head: record the head itself once.
                ADD(currentTransition->next);
                oldTransitions[oldTransitionsLength].symbol = currentTransition->symbol;
                oldTransitions[oldTransitionsLength].next = currentTransition->next;
                ++oldTransitionsLength;
            }

            ADD(targetTransition->next);
            oldTransitions[oldTransitionsLength].symbol = targetTransition->symbol;
            oldTransitions[oldTransitionsLength].next = targetTransition->next;
            ++oldTransitionsLength;
        }

        if (LIST_EMPTY()) {
            // Nothing shares this symbol; the transition stays as it is.
            continue;
        }

        state *newState;
        unsigned int newStateHash = calculateHash(LIST);
        HGET(newStateHash, &newState);
        if (!newState) {
            unsigned int result = ensureState(id, LIST, &newState);
            if (result != REGEX_PARSE_OK) {
                return result;
            }

            newState->hash = newStateHash;
            HSET(newState);
        }

        newTransitions[newTransitionsLength].symbol = foundSymbol;
        newTransitions[newTransitionsLength].next = newState;
        ++newTransitionsLength;
    }

    // Apply the changes only after the scan so the transition array is not
    // modified while it is being iterated.
    for (unsigned short i = 0; i < oldTransitionsLength; ++i) {
        unlinkStates(currentState, oldTransitions[i].next, oldTransitions[i].symbol);
    }

    for (unsigned short i = 0; i < newTransitionsLength; ++i) {
        LINK_STATES(currentState, newTransitions[i].next, newTransitions[i].symbol);
    }

    return RULES_OK;
}
|
1006
|
+
|
1007
|
+
// Walks the graph breadth-first from nfa and, for every state reached, first
// folds its epsilon transitions (consolidateStates) and then merges its
// overlapping transitions (consolidateTransitions). States are tracked by id
// in a visited table so each one is processed once.
//
// Returns RULES_OK on success, the first error reported by either
// consolidation step, or ERR_REGEX_QUEUE_FULL when the work queue overflows.
static unsigned int transformToDFA(state *nfa,
                                   unsigned short *id) {

#ifdef _PRINT
    printf("*** NFA ***\n");
    printGraph(nfa);
#endif

    CREATE_HASHSET(state*);
    CREATE_QUEUE(state*);
    unsigned char visited[MAX_STATES] = {0};
    state *currentState = nfa;
    visited[currentState->id] = 1;
    while (currentState) {
        unsigned int result = consolidateStates(currentState, id);
        if (result != RULES_OK) {
            return result;
        }

        result = consolidateTransitions(currentState, id, HASHSET);
        if (result != REGEX_PARSE_OK) {
            return result;
        }

        // Schedule every successor that has not been seen yet.
        for (int t = 0; t < currentState->transitionsLength; ++t) {
            state *successor = currentState->transitions[t].next;
            if (visited[successor->id]) {
                continue;
            }

            visited[successor->id] = 1;
            ENQUEUE(successor);
        }

        // Yields 0 once the queue is empty, which ends the loop.
        DEQUEUE(&currentState);
    }

#ifdef _PRINT
    printf("*** DFA ***\n");
    printGraph(nfa);
#endif

    return RULES_OK;
}
|
1049
|
+
|
1050
|
+
// Counts the states reachable from start and the distinct transition symbols
// used between them, using a breadth-first walk.
//
// On return *statesLength holds the number of states visited and
// *vocabularyLength the number of distinct symbols seen. Returns RULES_OK, or
// an error raised by the queue/set macros (e.g. ERR_REGEX_QUEUE_FULL).
static unsigned int calculateGraphDimensions(state *start,
                                             unsigned short *vocabularyLength,
                                             unsigned short *statesLength) {
    *vocabularyLength = 0;
    *statesLength = 0;
    CREATE_QUEUE(state*);
    unsigned char visited[MAX_STATES] = {0};
    CREATE_SET(unsigned int);
    state *currentState = start;
    visited[currentState->id] = 1;
    while (currentState) {
        *statesLength += 1;
        for (int t = 0; t < currentState->transitionsLength; ++t) {
            transition *currentTransition = &currentState->transitions[t];

            // Count each symbol the first time it is encountered.
            unsigned char symbolExists = 0;
            EXISTS(currentTransition->symbol, &symbolExists);
            if (!symbolExists) {
                SET(currentTransition->symbol);
                *vocabularyLength += 1;
            }

            if (visited[currentTransition->next->id]) {
                continue;
            }

            visited[currentTransition->next->id] = 1;
            ENQUEUE(currentTransition->next);
        }

        // Yields 0 once the queue is empty, which ends the loop.
        DEQUEUE(&currentState);
    }

    return RULES_OK;
}
|
1082
|
+
|
1083
|
+
// Stores (symbol -> index) in an open-addressing table of
// vocabularyLength * 2 slots, probing linearly from symbol % slots until an
// empty slot (symbol == 0) is found.
//
// A zero vocabularyLength means the table has no slots; the call is then a
// no-op instead of computing a remainder by zero. Probing is bounded by the
// slot count, so a table with no free slot leaves the table unchanged rather
// than looping forever.
static void setIndex(symbolEntry *symbolHashSet, unsigned short vocabularyLength, unsigned int symbol, unsigned short index) {
    unsigned int max = vocabularyLength * 2u;
    if (!max) {
        return;
    }

    unsigned int i = symbol % max;
    for (unsigned int probes = 0; probes < max; ++probes) {
        if (!symbolHashSet[i].symbol) {
            symbolHashSet[i].symbol = symbol;
            symbolHashSet[i].index = index;
            return;
        }

        i = (i + 1) % max;
    }
}
|
1092
|
+
|
1093
|
+
// Looks up symbol in the open-addressing table of vocabularyLength * 2 slots
// and returns the index stored with it, or 0 when the symbol is absent.
// Probing starts at symbol % slots and stops at the first empty slot
// (symbol == 0).
//
// A zero vocabularyLength (empty table) returns 0 instead of computing a
// remainder by zero, and probing is bounded by the slot count so a table
// without an empty slot cannot cause an endless loop.
static unsigned short getIndex(symbolEntry *symbolHashSet, unsigned short vocabularyLength, unsigned int symbol) {
    unsigned int max = vocabularyLength * 2u;
    if (!max) {
        return 0;
    }

    unsigned int i = symbol % max;
    for (unsigned int probes = 0; probes < max && symbolHashSet[i].symbol; ++probes) {
        if (symbolHashSet[i].symbol == symbol) {
            return symbolHashSet[i].index;
        }

        i = (i + 1) % max;
    }

    return 0;
}
|
1105
|
+
|
1106
|
+
// Serializes the graph reachable from start into the buffer at stateMachine.
//
// The buffer is treated as three consecutive regions:
//   1. a symbol hash table of vocabularyLength * 2 symbolEntry slots,
//   2. a transition table of vocabularyLength * statesLength unsigned shorts,
//      addressed as [statesLength * (symbolNumber - 1) + (stateNumber - 1)],
//   3. an accept vector with one byte per state.
// States and symbols are numbered from 1 in the order the breadth-first walk
// meets them; 0 in the visited table means "not numbered yet".
//
// Returns RULES_OK, or ERR_REGEX_QUEUE_FULL when the work queue overflows.
static unsigned int packGraph(state *start,
                              void *stateMachine,
                              unsigned short vocabularyLength,
                              unsigned short statesLength) {
    CREATE_QUEUE(state*);
    unsigned short visited[MAX_STATES] = {0};
    symbolEntry *symbolHashSet = (symbolEntry *)stateMachine;
    unsigned short *stateTable = (unsigned short *)(symbolHashSet + vocabularyLength * 2);
    unsigned char *acceptVector = (unsigned char *)(stateTable + (vocabularyLength * statesLength));
    unsigned short stateNumber = 1;
    unsigned short vocabularyNumber = 1;
    state *node = start;
    visited[node->id] = stateNumber++;
    while (node) {
        unsigned short sourceNumber = visited[node->id];
        if (node->isAccept) {
            acceptVector[sourceNumber - 1] = 1;
        }

        for (int t = 0; t < node->transitionsLength; ++t) {
            transition *edge = &node->transitions[t];

            // Give the symbol a number the first time it shows up.
            if (!getIndex(symbolHashSet, vocabularyLength, edge->symbol)) {
                setIndex(symbolHashSet, vocabularyLength, edge->symbol, vocabularyNumber++);
            }

            // Number the target state and schedule it the first time it shows up.
            if (!visited[edge->next->id]) {
                visited[edge->next->id] = stateNumber++;
                ENQUEUE(edge->next);
            }

            unsigned short symbolNumber = getIndex(symbolHashSet, vocabularyLength, edge->symbol);
            stateTable[statesLength * (symbolNumber - 1) + (sourceNumber - 1)] = visited[edge->next->id];
        }

        // Yields 0 once the queue is empty, which ends the loop.
        DEQUEUE(&node);
    }

    return RULES_OK;
}
|
1149
|
+
|
1150
|
+
// Validates the regular expression text in [first, last) by delegating to
// validateGraph and returns its result unchanged.
unsigned int validateRegex(char *first,
                           char *last) {
    // validateGraph takes the read position by address; the caller's pointer
    // is never affected because only this local copy is handed over.
    char *cursor = first;
    return validateGraph(&cursor, last);
}
|
1154
|
+
|
1155
|
+
unsigned int compileRegex(void *tree,
|
1156
|
+
char *first,
|
1157
|
+
char *last,
|
1158
|
+
unsigned short *vocabularyLength,
|
1159
|
+
unsigned short *statesLength,
|
1160
|
+
unsigned int *regexStateMachineOffset) {
|
1161
|
+
state *start;
|
1162
|
+
state *end;
|
1163
|
+
unsigned short id = 0;
|
1164
|
+
unsigned int result = createGraph(&first, last, &id, &start, &end);
|
1165
|
+
if (result != RULES_OK) {
|
1166
|
+
return result;
|
1167
|
+
}
|
1168
|
+
end->isAccept = 1;
|
1169
|
+
++start->refCount;
|
1170
|
+
result = transformToDFA(start, &id);
|
1171
|
+
if (result != RULES_OK) {
|
1172
|
+
return result;
|
1173
|
+
}
|
1174
|
+
result = calculateGraphDimensions(start,
|
1175
|
+
vocabularyLength,
|
1176
|
+
statesLength);
|
1177
|
+
if (result != RULES_OK) {
|
1178
|
+
return result;
|
1179
|
+
}
|
1180
|
+
void *newStateMachine;
|
1181
|
+
result = storeRegexStateMachine((ruleset *)tree,
|
1182
|
+
*vocabularyLength,
|
1183
|
+
*statesLength,
|
1184
|
+
&newStateMachine,
|
1185
|
+
regexStateMachineOffset);
|
1186
|
+
if (result != RULES_OK) {
|
1187
|
+
return result;
|
1188
|
+
}
|
1189
|
+
return packGraph(start,
|
1190
|
+
newStateMachine,
|
1191
|
+
*vocabularyLength,
|
1192
|
+
*statesLength);
|
1193
|
+
}
|
1194
|
+
|
1195
|
+
unsigned char evaluateRegex(void *tree,
|
1196
|
+
char *first,
|
1197
|
+
unsigned short length,
|
1198
|
+
unsigned short vocabularyLength,
|
1199
|
+
unsigned short statesLength,
|
1200
|
+
unsigned int regexStateMachineOffset) {
|
1201
|
+
symbolEntry *symbolHashSet = (symbolEntry *)&((ruleset *)tree)->regexStateMachinePool[regexStateMachineOffset];
|
1202
|
+
unsigned short *stateTable = (unsigned short *)(symbolHashSet + vocabularyLength * 2);
|
1203
|
+
unsigned char *acceptVector = (unsigned char *)(stateTable + (vocabularyLength * statesLength));
|
1204
|
+
unsigned short currentState = 1;
|
1205
|
+
char *last = first + length;
|
1206
|
+
while (first < last) {
|
1207
|
+
unsigned int unicodeSymbol;
|
1208
|
+
if (utf8ToUnicode(&first, last, &unicodeSymbol) != REGEX_PARSE_OK) {
|
1209
|
+
return 0;
|
1210
|
+
} else {
|
1211
|
+
unsigned short currentSymbol = getIndex(symbolHashSet, vocabularyLength, unicodeSymbol);
|
1212
|
+
if (!currentSymbol) {
|
1213
|
+
currentSymbol = getIndex(symbolHashSet, vocabularyLength, REGEX_DOT);
|
1214
|
+
if (!currentSymbol) {
|
1215
|
+
return 0;
|
1216
|
+
}
|
1217
|
+
|
1218
|
+
currentState = stateTable[statesLength * (currentSymbol - 1) + (currentState - 1)];
|
1219
|
+
if (!currentState) {
|
1220
|
+
return 0;
|
1221
|
+
}
|
1222
|
+
} else {
|
1223
|
+
currentState = stateTable[statesLength * (currentSymbol - 1) + (currentState - 1)];
|
1224
|
+
if (!currentState) {
|
1225
|
+
currentSymbol = getIndex(symbolHashSet, vocabularyLength, REGEX_DOT);
|
1226
|
+
if (!currentSymbol) {
|
1227
|
+
return 0;
|
1228
|
+
}
|
1229
|
+
|
1230
|
+
currentState = stateTable[statesLength * (currentSymbol - 1) + (currentState - 1)];
|
1231
|
+
if (!currentState) {
|
1232
|
+
return 0;
|
1233
|
+
}
|
1234
|
+
}
|
1235
|
+
}
|
1236
|
+
}
|
1237
|
+
}
|
1238
|
+
|
1239
|
+
return acceptVector[currentState - 1];
|
1240
|
+
}
|