durable_rules 0.34.13 → 0.34.14

Sign up to get free protection for your applications and to get access to all the features.
data/src/rules/regex.c ADDED
@@ -0,0 +1,1240 @@
1
+
2
+ #include <stdio.h>
3
+ #include <stdlib.h>
4
+ #include <string.h>
5
+ #include "rules.h"
6
+ #include "rete.h"
7
+ #include "regex.h"
8
+
9
/* Token kinds stored in token.type by the tokenizer. */
#define REGEX_SYMBOL    0x00
#define REGEX_UNION     0x01
#define REGEX_STAR      0x02
#define REGEX_PLUS      0x03
#define REGEX_QUESTION  0x04
#define REGEX_INTERVAL  0x05
#define REGEX_REGEX     0x06

/* Symbol value used for the '.' token. */
#define REGEX_DOT       0xFFFE

/* Fixed capacities of the containers used while building a state machine. */
#define MAX_TRANSITIONS 4096    /* transitions per state, symbols per token */
#define MAX_QUEUE       1024    /* slots in the traversal ring buffer */
#define MAX_STATES      4096    /* upper bound on state ids */
#define MAX_HSET        1024    /* slots in the state hash set */
#define MAX_SET         8192    /* slots in the symbol set */
#define MAX_LIST        1024    /* entries in the sorted state list */
#define MAX_INTERVAL    100     /* largest accepted upper bound in {low,high} */
25
+
26
+
27
/*
 * Fixed-size ring buffer declared in the enclosing scope.
 * CREATE_QUEUE declares the locals `queue`, `first` and `last`; ENQUEUE and
 * DEQUEUE operate on those names, so they must be used in the same function.
 */
#define CREATE_QUEUE(type) \
    type queue[MAX_QUEUE]; \
    unsigned short first = 0; \
    unsigned short last = 0;

/*
 * Appends `value`. Makes the enclosing function return ERR_REGEX_QUEUE_FULL
 * when the ring is full. The successor of `last` is compared modulo MAX_QUEUE:
 * without the wrap, a full ring whose `first` is 0 went undetected and the
 * queue silently became empty again.
 */
#define ENQUEUE(value) do { \
    if (((last + 1) % MAX_QUEUE) == first) { \
        return ERR_REGEX_QUEUE_FULL; \
    } \
    queue[last] = (value); \
    last = (last + 1) % MAX_QUEUE; \
} while(0)

/* Pops the oldest element into *value, or stores 0 when the queue is empty. */
#define DEQUEUE(value) do { \
    if (first == last) { \
        *(value) = 0; \
    } else { \
        *(value) = queue[first]; \
        first = (first + 1) % MAX_QUEUE; \
    } \
} while(0)
48
+
49
/*
 * Sorted list of state pointers declared in the enclosing scope.
 * CREATE_LIST declares the locals `list` and `top`. The array is sized with
 * MAX_LIST, the same constant ADD uses for its bound check.
 */
#define CREATE_LIST(type) \
    type list[MAX_LIST]; \
    unsigned short top = 0;

/* True when nothing has been added yet; parenthesized so it composes safely. */
#define LIST_EMPTY() (!top)

/*
 * Inserts `value` keeping the list ordered by ascending `id`: the new element
 * is appended and then swapped backwards while its id is smaller than its
 * predecessor's. Makes the enclosing function return ERR_REGEX_LIST_FULL when
 * the list is full.
 */
#define ADD(value) do { \
    if ((top + 1) == MAX_LIST) { \
        return ERR_REGEX_LIST_FULL; \
    } \
    list[top++] = (value); \
    for (unsigned short i = top - 1; (i > 0) && (list[i]->id < list[i - 1]->id); --i) { \
        state *temp = list[i]; list[i] = list[i - 1]; list[i - 1] = temp; \
    } \
} while(0)

/* Expands to the (array, length) argument pair expected by list consumers. */
#define LIST list, top
66
+
67
/*
 * Open-addressing hash set of state pointers keyed by state->hash.
 * CREATE_HASHSET declares the zero-initialized local `hset`; a NULL slot is
 * an empty slot.
 */
#define CREATE_HASHSET(type) \
    type hset[MAX_HSET] = {0};

/*
 * Stores `value` in the first free slot at or after hash % MAX_HSET.
 * Makes the enclosing function return ERR_REGEX_SET_FULL after MAX_HSET
 * occupied slots have been probed.
 */
#define HSET(value) do { \
    unsigned int size = 0; \
    unsigned short index = (value)->hash % MAX_HSET; \
    while (hset[index]) { \
        index = (index + 1) % MAX_HSET; \
        ++size; \
        if (size == MAX_HSET) { \
            return ERR_REGEX_SET_FULL; \
        } \
    } \
    hset[index] = (value); \
} while(0)

/*
 * Looks up the first entry whose hash equals `valueHash` and stores it in
 * *value (NULL when absent). The probe is limited to MAX_HSET steps so the
 * lookup terminates even when every slot is occupied and none matches.
 */
#define HGET(valueHash, value) do { \
    unsigned int probes = 0; \
    unsigned short index = (valueHash) % MAX_HSET; \
    *(value) = NULL; \
    while (hset[index] && !*(value) && probes < MAX_HSET) { \
        if (hset[index]->hash == (valueHash)) { \
            *(value) = hset[index]; \
        } \
        index = (index + 1) % MAX_HSET; \
        ++probes; \
    } \
} while(0)

/* Name of the hash-set array, for passing it to helper functions. */
#define HASHSET hset
95
+
96
/*
 * Open-addressing set of non-zero integers.
 * CREATE_SET declares the zero-initialized local `set`; 0 marks an empty
 * slot, so the value 0 itself cannot be stored.
 */
#define CREATE_SET(type) \
    type set[MAX_SET] = {0};

/*
 * Stores `value` in the first free slot at or after value % MAX_SET.
 * Makes the enclosing function return ERR_REGEX_SET_FULL after MAX_SET
 * occupied slots have been probed.
 */
#define SET(value) do { \
    unsigned int size = 0; \
    unsigned int i = (value) % MAX_SET; \
    while (set[i]) { \
        i = (i + 1) % MAX_SET; \
        ++size; \
        if (size == MAX_SET) { \
            return ERR_REGEX_SET_FULL; \
        } \
    } \
    set[i] = (value); \
} while(0)

/*
 * Sets *result to 1 when `value` is in the set, 0 otherwise.
 * The probe wraps at MAX_SET, the same modulus SET uses; wrapping at
 * MAX_HSET made lookups miss entries that SET had placed after a collision.
 * The probe is also limited to MAX_SET steps so it terminates on a full set.
 */
#define EXISTS(value, result) do { \
    unsigned int probes = 0; \
    unsigned int i = (value) % MAX_SET; \
    *(result) = 0; \
    while (set[i] && !*(result) && probes < MAX_SET) { \
        if (set[i] == (value)) { \
            *(result) = 1; \
        } \
        i = (i + 1) % MAX_SET; \
        ++probes; \
    } \
} while(0)
122
+
123
/*
 * Calls createState and makes the enclosing function return its status
 * whenever that status is not RULES_OK.
 */
#define CREATE_STATE(stateId, newState) do { \
    unsigned int createStatus_ = createState(stateId, newState); \
    if (createStatus_ != RULES_OK) { \
        return createStatus_; \
    } \
} while (0)

/*
 * Calls linkStates and makes the enclosing function return its status
 * whenever that status is not RULES_OK.
 */
#define LINK_STATES(previousState, nextState, tokenSymbol) do { \
    unsigned int linkStatus_ = linkStates(previousState, nextState, tokenSymbol); \
    if (linkStatus_ != RULES_OK) { \
        return linkStatus_; \
    } \
} while (0)
136
+
137
+ struct state;
138
+
139
+ typedef struct transition {
140
+ unsigned int symbol;
141
+ struct state *next;
142
+ } transition;
143
+
144
+ typedef struct state {
145
+ unsigned int hash;
146
+ unsigned short refCount;
147
+ unsigned short id;
148
+ unsigned short transitionsLength;
149
+ unsigned char isAccept;
150
+ unsigned char isReject;
151
+ transition transitions[MAX_TRANSITIONS];
152
+ } state;
153
+
154
+ typedef struct token {
155
+ unsigned char type;
156
+ unsigned short low;
157
+ unsigned short high;
158
+ unsigned short symbolsLength;
159
+ unsigned int symbols[MAX_TRANSITIONS];
160
+ unsigned short inverseSymbolsLength;
161
+ unsigned int inverseSymbols[MAX_TRANSITIONS];
162
+ } token;
163
+
164
+ typedef struct symbolEntry {
165
+ unsigned int symbol;
166
+ unsigned short index;
167
+ } symbolEntry;
168
+
169
/*
 * Value subtracted from the accumulated bytes of a sequence, indexed by the
 * number of trailing bytes that follow the lead byte.
 */
static const unsigned int UTF8_OFFSETS[6] = {
    0x00000000UL,
    0x00003080UL,
    0x000E2080UL,
    0x03C82080UL,
    0xFA082080UL,
    0x82082080UL
};

/* Number of trailing bytes that follow each possible lead byte. */
static const char UTF8_TRAILING[256] = {
    /* 0x00 - 0x1F */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x20 - 0x3F */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x40 - 0x5F */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x60 - 0x7F */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x80 - 0x9F */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xA0 - 0xBF */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xC0 - 0xDF */
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xE0 - 0xEF, 0xF0 - 0xF7, 0xF8 - 0xFB, 0xFC - 0xFF */
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
};

/* Symbol value used for transitions that consume no input. */
static const unsigned int EMPTY = 0;
186
+
187
+ unsigned int utf8ToUnicode(char **first, char *last, unsigned int *result) {
188
+ unsigned char byteNumber = UTF8_TRAILING[(unsigned char)*first[0]];
189
+ if (*first + byteNumber >= last) {
190
+ return ERR_PARSE_REGEX;
191
+ }
192
+
193
+ *result = 0;
194
+ switch (byteNumber) {
195
+ case 3:
196
+ *result += (unsigned char)*first[0];
197
+ *result <<= 6;
198
+ ++*first;
199
+ case 2:
200
+ *result += (unsigned char)*first[0];
201
+ *result <<= 6;
202
+ ++*first;
203
+ case 1:
204
+ *result += (unsigned char)*first[0];
205
+ *result <<= 6;
206
+ ++*first;
207
+ case 0:
208
+ *result += (unsigned char)*first[0];
209
+ ++*first;
210
+ }
211
+ *result -= UTF8_OFFSETS[byteNumber];
212
+ return REGEX_PARSE_OK;
213
+ }
214
+
215
+ static unsigned int readInternalRange(char *first,
216
+ unsigned short *rangeLength,
217
+ unsigned int *range);
218
+
219
+ static unsigned int readEscapedSymbol(char **first,
220
+ char *last,
221
+ unsigned short *rangeLength,
222
+ unsigned int *range) {
223
+ ++*first;
224
+ if (*first >= last) {
225
+ return ERR_PARSE_REGEX;
226
+ }
227
+
228
+ switch (*first[0]) {
229
+ case '.':
230
+ case '|':
231
+ case '?':
232
+ case '*':
233
+ case '+':
234
+ case '(':
235
+ case ')':
236
+ case '[':
237
+ case ']':
238
+ case '{':
239
+ case '}':
240
+ case '%':
241
+ range[*rangeLength] = *first[0];
242
+ ++*rangeLength;
243
+ ++*first;
244
+ return REGEX_PARSE_OK;
245
+ case 'a':
246
+ ++*first;
247
+ return readInternalRange("[\x41-\x5A\x61-\x7A\xC3\x80-\xC3\x96\xC3\x98-\xC3\xB6\xC3\xB8-\xC3\xBF]", rangeLength, range);
248
+ case 'c':
249
+ ++*first;
250
+ return readInternalRange("[\x00-\x1F\x7F\xC2\x80-\xC2\x9F]", rangeLength, range);
251
+ case 'd':
252
+ ++*first;
253
+ return readInternalRange("[0-9]", rangeLength, range);
254
+ case 'g':
255
+ ++*first;
256
+ return readInternalRange("[\x21-\x7E]", rangeLength, range);
257
+ case 'l':
258
+ ++*first;
259
+ return readInternalRange("[\x61-\x7A\xC3\x9F-\xC3\xB6\xC3\xB8-\xC3\xBF]", rangeLength, range);
260
+ case 'p':
261
+ ++*first;
262
+ return readInternalRange("[.,;:?!'\"()\xC2\xA1\xC2\xBF-]", rangeLength, range);
263
+ case 's':
264
+ ++*first;
265
+ return readInternalRange("[\x09-\x0D\x20]", rangeLength, range);
266
+ case 'u':
267
+ ++*first;
268
+ return readInternalRange("[\x41-\x5A\xC3\x80-\xC3\x96\xC3\x98-\xC3\x9E]", rangeLength, range);
269
+ case 'w':
270
+ ++*first;
271
+ return readInternalRange("[A-Za-z0-9]", rangeLength, range);
272
+ case 'x':
273
+ ++*first;
274
+ return readInternalRange("[0-9A-Fa-f]", rangeLength, range);
275
+ }
276
+
277
+ return ERR_PARSE_REGEX;
278
+ }
279
+
280
+ static unsigned int readRange(char **first,
281
+ char *last,
282
+ unsigned short *rangeLength,
283
+ unsigned int *range,
284
+ unsigned short *inverseRangeLength,
285
+ unsigned int *inverseRange) {
286
+ unsigned char parseBegin = 1;
287
+ unsigned int lastSymbol = 0;
288
+ unsigned int currentSymbol;
289
+ unsigned char inverse = 0;
290
+ unsigned int result;
291
+ *rangeLength = 0;
292
+ if (inverseRangeLength) {
293
+ *inverseRangeLength = 0;
294
+ }
295
+
296
+ ++*first;
297
+ if (*first[0] == '^') {
298
+ if (*first == last) {
299
+ return ERR_PARSE_REGEX;
300
+ }
301
+
302
+ inverse = 1;
303
+ ++*first;
304
+ }
305
+
306
+ if (*first[0] == ']') {
307
+ if (*first == last) {
308
+ return ERR_PARSE_REGEX;
309
+ }
310
+
311
+ if (inverse) {
312
+ inverseRange[*inverseRangeLength] = (unsigned int)']';
313
+ ++*inverseRangeLength;
314
+ } else {
315
+ range[*rangeLength] = (unsigned int)']';
316
+ ++*rangeLength;
317
+ }
318
+ }
319
+
320
+ while (*first[0] != ']') {
321
+ if (*first == last) {
322
+ return ERR_PARSE_REGEX;
323
+ }
324
+
325
+ if (!parseBegin) {
326
+ if (!lastSymbol) {
327
+ return ERR_PARSE_REGEX;
328
+ }
329
+
330
+ result = utf8ToUnicode(first, last, &currentSymbol);
331
+ if (result != REGEX_PARSE_OK) {
332
+ return result;
333
+ }
334
+
335
+ while (currentSymbol != lastSymbol) {
336
+ if (inverse) {
337
+ inverseRange[*inverseRangeLength] = currentSymbol;
338
+ ++*inverseRangeLength;
339
+ } else {
340
+ range[*rangeLength] = currentSymbol;
341
+ ++*rangeLength;
342
+ }
343
+
344
+ if (currentSymbol > lastSymbol) {
345
+ --currentSymbol;
346
+ } else {
347
+ ++currentSymbol;
348
+ }
349
+ }
350
+ parseBegin = 1;
351
+ } else {
352
+ if (*first[0] == '-') {
353
+ parseBegin = 0;
354
+ ++*first;
355
+ } else {
356
+ if (*first[0] != '%') {
357
+ result = utf8ToUnicode(first, last, &currentSymbol);
358
+ if (result != REGEX_PARSE_OK) {
359
+ return result;
360
+ }
361
+
362
+ if (inverse) {
363
+ inverseRange[*inverseRangeLength] = currentSymbol;
364
+ ++*inverseRangeLength;
365
+ } else {
366
+ range[*rangeLength] = currentSymbol;
367
+ ++*rangeLength;
368
+ }
369
+ lastSymbol = currentSymbol;
370
+ } else {
371
+ if (inverse) {
372
+ unsigned int result = readEscapedSymbol(first, last, inverseRangeLength, inverseRange);
373
+ if (result != REGEX_PARSE_OK) {
374
+ return result;
375
+ }
376
+ } else {
377
+ unsigned int result = readEscapedSymbol(first, last, rangeLength, range);
378
+ if (result != REGEX_PARSE_OK) {
379
+ return result;
380
+ }
381
+ }
382
+ lastSymbol = 0;
383
+ }
384
+ }
385
+ }
386
+ }
387
+
388
+ if (!parseBegin) {
389
+ if (inverse) {
390
+ inverseRange[*inverseRangeLength] = (unsigned int)'-';
391
+ ++*inverseRangeLength;
392
+ } else {
393
+ range[*rangeLength] = (unsigned int)'-';
394
+ ++*rangeLength;
395
+ }
396
+ }
397
+
398
+ ++*first;
399
+ return REGEX_PARSE_OK;
400
+ }
401
+
402
/*
 * Parses a built-in, NUL-terminated bracket expression into `range`.
 * The limit handed to readRange is the last character of the string.
 */
static unsigned int readInternalRange(char *first,
                                      unsigned short *rangeLength,
                                      unsigned int *range) {
    char *cursor = first;
    unsigned int length = strlen(first);
    char *finalCharacter = first + length - 1;

    return readRange(&cursor, finalCharacter, rangeLength, range, NULL, NULL);
}
408
+
409
+ static unsigned int readInterval(char **first,
410
+ char *last,
411
+ unsigned short *low,
412
+ unsigned short *high) {
413
+
414
+ ++*first;
415
+ unsigned char parseBegin = 1;
416
+ char *numberBegin = *first;
417
+ while (*first[0] != '}') {
418
+ if (*first == last) {
419
+ return ERR_PARSE_REGEX;
420
+ }
421
+
422
+ if (parseBegin) {
423
+ if (*first[0] == ',' && numberBegin != *first) {
424
+ parseBegin = 0;
425
+ *first[0] = '\0';
426
+ *low = atoi(numberBegin);
427
+ *first[0] = ',';
428
+ numberBegin = *first + 1;
429
+ } else if (*first[0] > '9' || *first[0] < 0) {
430
+ return ERR_PARSE_REGEX;
431
+ }
432
+ } else if (*first[0] > '9' || *first[0] < 0) {
433
+ return ERR_PARSE_REGEX;
434
+ }
435
+
436
+ ++*first;
437
+ }
438
+
439
+ if (numberBegin == *first) {
440
+ *high = 0;
441
+ } else {
442
+ *first[0] = '\0';
443
+ *high = atoi(numberBegin);
444
+ *first[0] = '}';
445
+
446
+ if (parseBegin) {
447
+ *low = *high;
448
+ }
449
+ }
450
+
451
+ if ((*high && *low > *high) || *high > MAX_INTERVAL) {
452
+ return ERR_PARSE_REGEX;
453
+ }
454
+
455
+ ++*first;
456
+ return REGEX_PARSE_OK;
457
+ }
458
+
459
+ static unsigned int readNextToken(char **first,
460
+ char *last,
461
+ token *nextToken) {
462
+ unsigned int result = REGEX_PARSE_OK;
463
+ if (*first >= last) {
464
+ return REGEX_PARSE_END;
465
+ }
466
+
467
+ switch (*first[0]) {
468
+ case '|':
469
+ nextToken->type = REGEX_UNION;
470
+ break;
471
+ case '?':
472
+ nextToken->type = REGEX_QUESTION;
473
+ break;
474
+ case '*':
475
+ nextToken->type = REGEX_STAR;
476
+ break;
477
+ case '+':
478
+ nextToken->type = REGEX_PLUS;
479
+ break;
480
+ case '(':
481
+ nextToken->type = REGEX_REGEX;
482
+ break;
483
+ case ')':
484
+ nextToken->type = REGEX_REGEX;
485
+ result = REGEX_PARSE_END;
486
+ break;
487
+ case '[':
488
+ nextToken->type = REGEX_SYMBOL;
489
+ return readRange(first, last, &nextToken->symbolsLength,
490
+ nextToken->symbols,
491
+ &nextToken->inverseSymbolsLength,
492
+ nextToken->inverseSymbols);
493
+ case '{':
494
+ nextToken->type = REGEX_INTERVAL;
495
+ return readInterval(first, last, &nextToken->low, &nextToken->high);
496
+ case '%':
497
+ nextToken->type = REGEX_SYMBOL;
498
+ return readEscapedSymbol(first, last, &nextToken->symbolsLength, nextToken->symbols);
499
+ case '.':
500
+ nextToken->type = REGEX_SYMBOL;
501
+ nextToken->symbolsLength = 1;
502
+ nextToken->symbols[0] = REGEX_DOT;
503
+ break;
504
+ default:
505
+ nextToken->type = REGEX_SYMBOL;
506
+ nextToken->symbolsLength = 1;
507
+ return utf8ToUnicode(first, last, &nextToken->symbols[0]);
508
+ }
509
+
510
+ ++*first;
511
+ return result;
512
+ }
513
+
514
+ static unsigned int storeRegexStateMachine(ruleset *tree,
515
+ unsigned short vocabularyLength,
516
+ unsigned short statesLength,
517
+ void **newStateMachine,
518
+ unsigned int *stateMachineOffset) {
519
+
520
+ unsigned int stateMachinelength = sizeof(symbolEntry) * vocabularyLength * 2;
521
+ stateMachinelength = stateMachinelength + sizeof(unsigned short) * statesLength * vocabularyLength;
522
+ stateMachinelength = stateMachinelength + sizeof(unsigned char) * statesLength;
523
+ if (!tree->regexStateMachinePool) {
524
+ tree->regexStateMachinePool = malloc(stateMachinelength);
525
+ if (!tree->regexStateMachinePool) {
526
+ return ERR_OUT_OF_MEMORY;
527
+ }
528
+
529
+ memset(tree->regexStateMachinePool, 0, stateMachinelength);
530
+ *stateMachineOffset = 0;
531
+ *newStateMachine = &tree->regexStateMachinePool[0];
532
+ tree->regexStateMachineOffset = stateMachinelength;
533
+ } else {
534
+ tree->regexStateMachinePool = realloc(tree->regexStateMachinePool, tree->regexStateMachineOffset + stateMachinelength);
535
+ if (!tree->regexStateMachinePool) {
536
+ return ERR_OUT_OF_MEMORY;
537
+ }
538
+
539
+ memset(&tree->regexStateMachinePool[tree->regexStateMachineOffset], 0, stateMachinelength);
540
+ *stateMachineOffset = tree->regexStateMachineOffset;
541
+ *newStateMachine = &tree->regexStateMachinePool[tree->regexStateMachineOffset];
542
+ tree->regexStateMachineOffset = tree->regexStateMachineOffset + stateMachinelength;
543
+ }
544
+
545
+ return RULES_OK;
546
+ }
547
+
548
+ static unsigned int createState(unsigned short *stateId,
549
+ state **newState) {
550
+ if (*stateId == MAX_STATES) {
551
+ return ERR_REGEX_MAX_STATES;
552
+ }
553
+ *newState = malloc(sizeof(state));
554
+ if (*newState == NULL) {
555
+ return ERR_OUT_OF_MEMORY;
556
+ }
557
+ (*newState)->id = *stateId;
558
+ (*newState)->transitionsLength = 0;
559
+ (*newState)->refCount = 0;
560
+ (*newState)->isAccept = 0;
561
+ (*newState)->isReject = 0;
562
+ (*newState)->hash = 0;
563
+ ++*stateId;
564
+
565
+ return RULES_OK;
566
+ }
567
+
568
+ static unsigned int linkStates(state *previousState,
569
+ state *nextState,
570
+ unsigned int tokenSymbol) {
571
+ for (int i = 0; i < previousState->transitionsLength; ++i) {
572
+ if (previousState->transitions[i].symbol == tokenSymbol &&
573
+ previousState->transitions[i].next->id == nextState->id) {
574
+ return RULES_OK;
575
+ }
576
+ }
577
+
578
+ previousState->transitions[previousState->transitionsLength].symbol = tokenSymbol;
579
+ previousState->transitions[previousState->transitionsLength].next = nextState;
580
+ ++previousState->transitionsLength;
581
+ ++nextState->refCount;
582
+ if (previousState->transitionsLength == MAX_TRANSITIONS) {
583
+ return ERR_REGEX_MAX_TRANSITIONS;
584
+ }
585
+
586
+ return RULES_OK;
587
+ }
588
+
589
+ static void deleteTransition(state *previousState, unsigned short index) {
590
+ state *nextState = previousState->transitions[index].next;
591
+ --nextState->refCount;
592
+ if (!nextState->refCount) {
593
+ free(nextState);
594
+ }
595
+
596
+ for (unsigned short i = index + 1; i < previousState->transitionsLength; ++i) {
597
+ previousState->transitions[i - 1].symbol = previousState->transitions[i].symbol;
598
+ previousState->transitions[i - 1].next = previousState->transitions[i].next;
599
+ }
600
+ --previousState->transitionsLength;
601
+ }
602
+
603
+ static void unlinkStates(state *previousState,
604
+ state *nextState,
605
+ unsigned int tokenSymbol) {
606
+ for (int i = 0; i < previousState->transitionsLength; ++i) {
607
+ if (previousState->transitions[i].symbol == tokenSymbol &&
608
+ previousState->transitions[i].next->id == nextState->id) {
609
+ deleteTransition(previousState, i);
610
+ }
611
+ }
612
+ }
613
+
614
+ #ifdef _PRINT
615
+ static unsigned int printGraph(state *start) {
616
+ CREATE_QUEUE(state*);
617
+ unsigned char visited[MAX_STATES] = {0};
618
+ state *currentState = start;
619
+ visited[currentState->id] = 1;
620
+ while (currentState) {
621
+ printf("State %d\n", currentState->id);
622
+ if (currentState->isAccept) {
623
+ printf(" Accept\n");
624
+ }
625
+ if (currentState->isReject) {
626
+ printf(" Reject\n");
627
+ }
628
+ for (int i = 0; i < currentState->transitionsLength; ++ i) {
629
+ transition *currentTransition = &currentState->transitions[i];
630
+ printf(" transition %x to state %d\n", currentTransition->symbol, currentTransition->next->id);
631
+ if (!visited[currentTransition->next->id]) {
632
+ visited[currentTransition->next->id] = 1;
633
+ ENQUEUE(currentTransition->next);
634
+ }
635
+ }
636
+
637
+ DEQUEUE(&currentState);
638
+ }
639
+
640
+ return RULES_OK;
641
+ }
642
+ #endif
643
+
644
+ static unsigned int cloneGraph(state *startState,
645
+ state *endState,
646
+ unsigned short *id,
647
+ state **newStart,
648
+ state **newEnd) {
649
+ CREATE_QUEUE(state*);
650
+ state *visited[MAX_STATES] = { NULL };
651
+ state *currentState = startState;
652
+ CREATE_STATE(id, &visited[currentState->id]);
653
+ while (currentState) {
654
+ if (currentState->isAccept) {
655
+ visited[currentState->id]->isAccept = 1;
656
+ }
657
+
658
+ if (currentState->isReject) {
659
+ visited[currentState->id]->isReject = 1;
660
+ }
661
+
662
+ for (int i = 0; i < currentState->transitionsLength; ++ i) {
663
+ transition *currentTransition = &currentState->transitions[i];
664
+
665
+ if (!visited[currentTransition->next->id]) {
666
+ CREATE_STATE(id, &visited[currentTransition->next->id]);
667
+ ENQUEUE(currentTransition->next);
668
+ }
669
+
670
+ LINK_STATES(visited[currentState->id], visited[currentTransition->next->id], currentTransition->symbol);
671
+ }
672
+
673
+ DEQUEUE(&currentState);
674
+ }
675
+
676
+ *newStart = visited[startState->id];
677
+ *newEnd = visited[endState->id];
678
+ return RULES_OK;
679
+ }
680
+
681
+ static unsigned int createGraph(char **first,
682
+ char *last,
683
+ unsigned short *id,
684
+ state **startState,
685
+ state **endState) {
686
+ CREATE_STATE(id, startState);
687
+ CREATE_STATE(id, endState);
688
+ state *previousState = *startState;
689
+ state *currentState = *startState;
690
+
691
+ token currentToken;
692
+ unsigned int result = readNextToken(first, last, &currentToken);
693
+ while (result == REGEX_PARSE_OK) {
694
+ switch (currentToken.type) {
695
+ case REGEX_SYMBOL:
696
+ previousState = currentState;
697
+ if (currentToken.symbolsLength) {
698
+ CREATE_STATE(id, &currentState);
699
+ for (unsigned short i = 0; i < currentToken.symbolsLength; ++i) {
700
+ LINK_STATES(previousState, currentState, currentToken.symbols[i]);
701
+ }
702
+ }
703
+
704
+ if (currentToken.inverseSymbolsLength) {
705
+ CREATE_STATE(id, &currentState);
706
+ currentState->isReject = 1;
707
+ for (unsigned short i = 0; i < currentToken.inverseSymbolsLength; ++i) {
708
+ LINK_STATES(previousState, currentState, currentToken.inverseSymbols[i]);
709
+ }
710
+
711
+ CREATE_STATE(id, &currentState);
712
+ LINK_STATES(previousState, currentState, REGEX_DOT);
713
+ }
714
+
715
+ break;
716
+ case REGEX_UNION:
717
+ LINK_STATES(currentState, *endState, EMPTY);
718
+ CREATE_STATE(id, &currentState);
719
+ previousState = *startState;
720
+ LINK_STATES(previousState, currentState, EMPTY);
721
+ break;
722
+ case REGEX_STAR:
723
+ {
724
+ state *anchorState;
725
+ CREATE_STATE(id, &anchorState);
726
+ LINK_STATES(currentState, previousState, EMPTY);
727
+ LINK_STATES(currentState, anchorState, EMPTY);
728
+ LINK_STATES(previousState, anchorState, EMPTY);
729
+ previousState = currentState;
730
+ currentState = anchorState;
731
+ }
732
+ break;
733
+ case REGEX_PLUS:
734
+ {
735
+ state *anchorState;
736
+ CREATE_STATE(id, &anchorState);
737
+ LINK_STATES(currentState, previousState, EMPTY);
738
+ LINK_STATES(currentState, anchorState, EMPTY);
739
+ previousState = currentState;
740
+ currentState = anchorState;
741
+ }
742
+ break;
743
+ case REGEX_QUESTION:
744
+ {
745
+ state *anchorState;
746
+ CREATE_STATE(id, &anchorState);
747
+ LINK_STATES(currentState, anchorState, EMPTY);
748
+ LINK_STATES(previousState, anchorState, EMPTY);
749
+ previousState = currentState;
750
+ currentState = anchorState;
751
+ }
752
+ break;
753
+ case REGEX_REGEX:
754
+ {
755
+ state *subStart;
756
+ state *subEnd;
757
+ result = createGraph(first, last, id, &subStart, &subEnd);
758
+ if (result != REGEX_PARSE_OK) {
759
+ return result;
760
+ }
761
+
762
+ LINK_STATES(currentState, subStart, EMPTY);
763
+ previousState = currentState;
764
+ currentState = subEnd;
765
+ }
766
+ break;
767
+ case REGEX_INTERVAL:
768
+ {
769
+ state *newCurrent = NULL;
770
+ state *newPrevious = NULL;
771
+ state *subStart = previousState;
772
+ state *subEnd = currentState;
773
+ state *anchorState;
774
+ CREATE_STATE(id, &anchorState);
775
+ for (unsigned short i = 1; i < (!currentToken.high? currentToken.low: currentToken.high); ++i) {
776
+ result = cloneGraph(previousState, currentState, id, &subStart, &subEnd);
777
+ if (result != REGEX_PARSE_OK) {
778
+ return result;
779
+ }
780
+
781
+ if (newCurrent) {
782
+ LINK_STATES(newCurrent, subStart, EMPTY);
783
+ } else {
784
+ newPrevious = subStart;
785
+ }
786
+
787
+ if (i >= currentToken.low) {
788
+ LINK_STATES(subStart, anchorState, EMPTY);
789
+ }
790
+
791
+ newCurrent = subEnd;
792
+ }
793
+
794
+ if (!currentToken.high) {
795
+ LINK_STATES(subEnd, subStart, EMPTY);
796
+ }
797
+
798
+ if (!currentToken.low) {
799
+ LINK_STATES(previousState, anchorState, EMPTY);
800
+ }
801
+
802
+ if (!newPrevious) {
803
+ LINK_STATES(currentState, anchorState, EMPTY);
804
+ previousState = currentState;
805
+ } else {
806
+ LINK_STATES(currentState, newPrevious, EMPTY);
807
+ LINK_STATES(newCurrent, anchorState, EMPTY);
808
+ previousState = newCurrent;
809
+ }
810
+ currentState = anchorState;
811
+ }
812
+ break;
813
+ }
814
+ if (result == REGEX_PARSE_OK) {
815
+ result = readNextToken(first, last, &currentToken);
816
+ }
817
+ }
818
+
819
+ LINK_STATES(currentState, *endState, EMPTY);
820
+
821
+ if (result == REGEX_PARSE_END) {
822
+ return REGEX_PARSE_OK;
823
+ }
824
+
825
+ return result;
826
+ }
827
+
828
+ static unsigned int validateGraph(char **first, char *last) {
829
+ token currentToken;
830
+ unsigned int result = readNextToken(first, last, &currentToken);
831
+ while (result == REGEX_PARSE_OK) {
832
+ switch (currentToken.type) {
833
+ case REGEX_SYMBOL:
834
+ case REGEX_UNION:
835
+ case REGEX_STAR:
836
+ case REGEX_PLUS:
837
+ case REGEX_QUESTION:
838
+ break;
839
+ case REGEX_REGEX:
840
+ result = validateGraph(first, last);
841
+ if (result != REGEX_PARSE_OK) {
842
+ return result;
843
+ }
844
+
845
+ break;
846
+ }
847
+
848
+ if (result == REGEX_PARSE_OK) {
849
+ result = readNextToken(first, last, &currentToken);
850
+ }
851
+ }
852
+
853
+ if (result == REGEX_PARSE_END) {
854
+ return REGEX_PARSE_OK;
855
+ }
856
+
857
+ return REGEX_PARSE_OK;
858
+ }
859
+
860
+ static unsigned short calculateHash(state **list,
861
+ unsigned short stateListLength) {
862
+ unsigned int hash = 5381;
863
+ for (unsigned short i = 0; i < stateListLength; ++i) {
864
+ hash = ((hash << 5) + hash) + list[i]->id;
865
+ }
866
+
867
+ return hash;
868
+ }
869
+
870
+ static unsigned int ensureState(unsigned short *id,
871
+ state **list,
872
+ unsigned short stateListLength,
873
+ state **newState) {
874
+ CREATE_STATE(id, newState);
875
+ for (unsigned short i = 0; i < stateListLength; ++i) {
876
+ state *targetState = list[i];
877
+ for (unsigned short ii = 0; ii < targetState->transitionsLength; ++ii) {
878
+ transition *targetTransition = &targetState->transitions[ii];
879
+ LINK_STATES(*newState, targetTransition->next, targetTransition->symbol);
880
+ }
881
+
882
+ if (targetState->isAccept) {
883
+ (*newState)->isAccept = 1;
884
+ }
885
+
886
+ if (targetState->isReject) {
887
+ (*newState)->isReject = 1;
888
+ }
889
+
890
+ if ((*newState)->isReject && (*newState)->isAccept) {
891
+ return ERR_REGEX_CONFLICT;
892
+ }
893
+ }
894
+
895
+ return RULES_OK;
896
+ }
897
+
898
+ static unsigned int consolidateStates(state *currentState,
899
+ unsigned short *id) {
900
+ for (unsigned short i = 0; i < currentState->transitionsLength; ++i) {
901
+ transition *currentTransition = &currentState->transitions[i];
902
+ if (!currentTransition->symbol) {
903
+ state *nextState = currentTransition->next;
904
+ if (nextState != currentState) {
905
+ for (unsigned short ii = 0; ii < nextState->transitionsLength; ++ii) {
906
+ transition *nextTransition = &nextState->transitions[ii];
907
+ LINK_STATES(currentState, nextTransition->next, nextTransition->symbol);
908
+ if (nextState->refCount == 1) {
909
+ --nextTransition->next->refCount;
910
+ }
911
+ }
912
+ }
913
+
914
+ if (nextState->isAccept) {
915
+ currentState->isAccept = 1;
916
+ }
917
+
918
+ if (nextState->isReject) {
919
+ currentState->isReject = 1;
920
+ }
921
+
922
+ if (currentState->isAccept && currentState->isReject) {
923
+ return ERR_REGEX_CONFLICT;
924
+ }
925
+
926
+ deleteTransition(currentState, i);
927
+ --i;
928
+ }
929
+ }
930
+
931
+ return RULES_OK;
932
+ }
933
+
934
+ static unsigned int consolidateTransitions(state *currentState,
935
+ unsigned short *id,
936
+ state **hset) {
937
+ transition oldTransitions[MAX_TRANSITIONS];
938
+ unsigned short oldTransitionsLength = 0;
939
+ transition newTransitions[MAX_TRANSITIONS];
940
+ unsigned short newTransitionsLength = 0;
941
+ CREATE_SET(unsigned int);
942
+
943
+ for (unsigned short i = 0; i < currentState->transitionsLength; ++i) {
944
+ transition *currentTransition = &currentState->transitions[i];
945
+ CREATE_LIST(state*);
946
+ unsigned int foundSymbol = 0;
947
+ unsigned char symbolExists = 0;
948
+ EXISTS(currentTransition->symbol, &symbolExists);
949
+ if (!symbolExists) {
950
+ SET(currentTransition->symbol);
951
+ for (unsigned short ii = i + 1; ii < currentState->transitionsLength; ++ ii) {
952
+ transition *targetTransition = &currentState->transitions[ii];
953
+ if ((currentTransition->symbol == targetTransition->symbol) ||
954
+ (currentTransition->symbol == REGEX_DOT && !targetTransition->next->isReject) ||
955
+ (targetTransition->symbol == REGEX_DOT && !currentTransition->next->isReject)) {
956
+ foundSymbol = currentTransition->symbol;
957
+ if (foundSymbol == REGEX_DOT) {
958
+ foundSymbol = targetTransition->symbol;
959
+ }
960
+
961
+ if (LIST_EMPTY()) {
962
+ ADD(currentTransition->next);
963
+ oldTransitions[oldTransitionsLength].symbol = currentTransition->symbol;
964
+ oldTransitions[oldTransitionsLength].next = currentTransition->next;
965
+ ++oldTransitionsLength;
966
+ }
967
+
968
+ ADD(targetTransition->next);
969
+ oldTransitions[oldTransitionsLength].symbol = targetTransition->symbol;
970
+ oldTransitions[oldTransitionsLength].next = targetTransition->next;
971
+ ++oldTransitionsLength;
972
+ }
973
+ }
974
+
975
+ if (!LIST_EMPTY()) {
976
+ state *newState;
977
+ unsigned int newStateHash = calculateHash(LIST);
978
+ HGET(newStateHash, &newState);
979
+ if (!newState) {
980
+ unsigned int result = ensureState(id, LIST, &newState);
981
+ if (result != REGEX_PARSE_OK) {
982
+ return result;
983
+ }
984
+
985
+ newState->hash = newStateHash;
986
+ HSET(newState);
987
+ }
988
+
989
+ newTransitions[newTransitionsLength].symbol = foundSymbol;
990
+ newTransitions[newTransitionsLength].next = newState;
991
+ ++newTransitionsLength;
992
+ }
993
+ }
994
+ }
995
+
996
+ for (unsigned short i = 0; i < oldTransitionsLength; ++i) {
997
+ unlinkStates(currentState, oldTransitions[i].next, oldTransitions[i].symbol);
998
+ }
999
+
1000
+ for (unsigned short i = 0; i < newTransitionsLength; ++i) {
1001
+ LINK_STATES(currentState, newTransitions[i].next, newTransitions[i].symbol);
1002
+ }
1003
+
1004
+ return RULES_OK;
1005
+ }
1006
+
1007
+ static unsigned int transformToDFA(state *nfa,
1008
+ unsigned short *id) {
1009
+
1010
+ #ifdef _PRINT
1011
+ printf("*** NFA ***\n");
1012
+ printGraph(nfa);
1013
+ #endif
1014
+
1015
+ CREATE_HASHSET(state*);
1016
+ CREATE_QUEUE(state*);
1017
+ unsigned char visited[MAX_STATES] = {0};
1018
+ state *currentState = nfa;
1019
+ visited[currentState->id] = 1;
1020
+ while (currentState) {
1021
+ unsigned int result = consolidateStates(currentState, id);
1022
+ if (result != RULES_OK) {
1023
+ return result;
1024
+ }
1025
+
1026
+ result = consolidateTransitions(currentState, id, HASHSET);
1027
+ if (result != REGEX_PARSE_OK) {
1028
+ return result;
1029
+ }
1030
+
1031
+ for (int i = 0; i < currentState->transitionsLength; ++ i) {
1032
+ transition *currentTransition = &currentState->transitions[i];
1033
+ if (!visited[currentTransition->next->id]) {
1034
+ visited[currentTransition->next->id] = 1;
1035
+ ENQUEUE(currentTransition->next);
1036
+ }
1037
+ }
1038
+
1039
+ DEQUEUE(&currentState);
1040
+ }
1041
+
1042
+ #ifdef _PRINT
1043
+ printf("*** DFA ***\n");
1044
+ printGraph(nfa);
1045
+ #endif
1046
+
1047
+ return RULES_OK;
1048
+ }
1049
+
1050
+ static unsigned int calculateGraphDimensions(state *start,
1051
+ unsigned short *vocabularyLength,
1052
+ unsigned short *statesLength) {
1053
+ *vocabularyLength = 0;
1054
+ *statesLength = 0;
1055
+ CREATE_QUEUE(state*);
1056
+ unsigned char visited[MAX_STATES] = {0};
1057
+ CREATE_SET(unsigned int);
1058
+ state *currentState = start;
1059
+ visited[currentState->id] = 1;
1060
+ while (currentState) {
1061
+ ++*statesLength;
1062
+ for (int i = 0; i < currentState->transitionsLength; ++ i) {
1063
+ transition *currentTransition = &currentState->transitions[i];
1064
+ unsigned char symbolExists = 0;
1065
+ EXISTS(currentTransition->symbol, &symbolExists);
1066
+ if (!symbolExists) {
1067
+ SET(currentTransition->symbol);
1068
+ ++*vocabularyLength;
1069
+ }
1070
+
1071
+ if (!visited[currentTransition->next->id]) {
1072
+ visited[currentTransition->next->id] = 1;
1073
+ ENQUEUE(currentTransition->next);
1074
+ }
1075
+ }
1076
+
1077
+ DEQUEUE(&currentState);
1078
+ }
1079
+
1080
+ return RULES_OK;
1081
+ }
1082
+
1083
+ static void setIndex(symbolEntry *symbolHashSet, unsigned short vocabularyLength, unsigned int symbol, unsigned short index) {
1084
+ unsigned int max = vocabularyLength * 2;
1085
+ unsigned int i = symbol % max;
1086
+ while (symbolHashSet[i].symbol) {
1087
+ i = (i + 1) % max;
1088
+ }
1089
+ symbolHashSet[i].symbol = symbol;
1090
+ symbolHashSet[i].index = index;
1091
+ }
1092
+
1093
+ static unsigned short getIndex(symbolEntry *symbolHashSet, unsigned short vocabularyLength, unsigned int symbol) {
1094
+ unsigned int max = vocabularyLength * 2;
1095
+ unsigned int i = symbol % max;
1096
+ while (symbolHashSet[i].symbol) {
1097
+ if (symbolHashSet[i].symbol == symbol) {
1098
+ return symbolHashSet[i].index;
1099
+ }
1100
+ i = (i + 1) % max;
1101
+ }
1102
+
1103
+ return 0;
1104
+ }
1105
+
1106
+ static unsigned int packGraph(state *start,
1107
+ void *stateMachine,
1108
+ unsigned short vocabularyLength,
1109
+ unsigned short statesLength) {
1110
+ CREATE_QUEUE(state*);
1111
+ unsigned short visited[MAX_STATES] = {0};
1112
+ symbolEntry *symbolHashSet = (symbolEntry *)stateMachine;
1113
+ unsigned short *stateTable = (unsigned short *)(symbolHashSet + vocabularyLength * 2);
1114
+ unsigned char *acceptVector = (unsigned char *)(stateTable + (vocabularyLength * statesLength));
1115
+ unsigned short stateNumber = 1;
1116
+ unsigned short vocabularyNumber = 1;
1117
+ state *currentState = start;
1118
+ visited[currentState->id] = stateNumber;
1119
+ ++stateNumber;
1120
+ while (currentState) {
1121
+ unsigned short targetStateNumber = visited[currentState->id];
1122
+ if (currentState->isAccept) {
1123
+ acceptVector[targetStateNumber - 1] = 1;
1124
+ }
1125
+
1126
+ for (int i = 0; i < currentState->transitionsLength; ++ i) {
1127
+ transition *currentTransition = &currentState->transitions[i];
1128
+
1129
+ if (!getIndex(symbolHashSet, vocabularyLength, currentTransition->symbol)) {
1130
+ setIndex(symbolHashSet, vocabularyLength, currentTransition->symbol, vocabularyNumber);
1131
+ ++vocabularyNumber;
1132
+ }
1133
+
1134
+ if (!visited[currentTransition->next->id]) {
1135
+ visited[currentTransition->next->id] = stateNumber;
1136
+ ++stateNumber;
1137
+ ENQUEUE(currentTransition->next);
1138
+ }
1139
+
1140
+ unsigned short targetSymbolNumber = getIndex(symbolHashSet, vocabularyLength, currentTransition->symbol);
1141
+ stateTable[statesLength * (targetSymbolNumber - 1) + (targetStateNumber - 1)] = visited[currentTransition->next->id];
1142
+ }
1143
+
1144
+ DEQUEUE(&currentState);
1145
+ }
1146
+
1147
+ return RULES_OK;
1148
+ }
1149
+
1150
// Validates the pattern text delimited by `first` and `last` by delegating to
// validateGraph and returning its result unchanged.
// validateGraph receives the address of a local cursor, so any advancing it
// does is not visible to the caller.
unsigned int validateRegex(char *first,
                           char *last) {
    char *cursor = first;
    unsigned int status = validateGraph(&cursor, last);
    return status;
}
1154
+
1155
+ unsigned int compileRegex(void *tree,
1156
+ char *first,
1157
+ char *last,
1158
+ unsigned short *vocabularyLength,
1159
+ unsigned short *statesLength,
1160
+ unsigned int *regexStateMachineOffset) {
1161
+ state *start;
1162
+ state *end;
1163
+ unsigned short id = 0;
1164
+ unsigned int result = createGraph(&first, last, &id, &start, &end);
1165
+ if (result != RULES_OK) {
1166
+ return result;
1167
+ }
1168
+ end->isAccept = 1;
1169
+ ++start->refCount;
1170
+ result = transformToDFA(start, &id);
1171
+ if (result != RULES_OK) {
1172
+ return result;
1173
+ }
1174
+ result = calculateGraphDimensions(start,
1175
+ vocabularyLength,
1176
+ statesLength);
1177
+ if (result != RULES_OK) {
1178
+ return result;
1179
+ }
1180
+ void *newStateMachine;
1181
+ result = storeRegexStateMachine((ruleset *)tree,
1182
+ *vocabularyLength,
1183
+ *statesLength,
1184
+ &newStateMachine,
1185
+ regexStateMachineOffset);
1186
+ if (result != RULES_OK) {
1187
+ return result;
1188
+ }
1189
+ return packGraph(start,
1190
+ newStateMachine,
1191
+ *vocabularyLength,
1192
+ *statesLength);
1193
+ }
1194
+
1195
+ unsigned char evaluateRegex(void *tree,
1196
+ char *first,
1197
+ unsigned short length,
1198
+ unsigned short vocabularyLength,
1199
+ unsigned short statesLength,
1200
+ unsigned int regexStateMachineOffset) {
1201
+ symbolEntry *symbolHashSet = (symbolEntry *)&((ruleset *)tree)->regexStateMachinePool[regexStateMachineOffset];
1202
+ unsigned short *stateTable = (unsigned short *)(symbolHashSet + vocabularyLength * 2);
1203
+ unsigned char *acceptVector = (unsigned char *)(stateTable + (vocabularyLength * statesLength));
1204
+ unsigned short currentState = 1;
1205
+ char *last = first + length;
1206
+ while (first < last) {
1207
+ unsigned int unicodeSymbol;
1208
+ if (utf8ToUnicode(&first, last, &unicodeSymbol) != REGEX_PARSE_OK) {
1209
+ return 0;
1210
+ } else {
1211
+ unsigned short currentSymbol = getIndex(symbolHashSet, vocabularyLength, unicodeSymbol);
1212
+ if (!currentSymbol) {
1213
+ currentSymbol = getIndex(symbolHashSet, vocabularyLength, REGEX_DOT);
1214
+ if (!currentSymbol) {
1215
+ return 0;
1216
+ }
1217
+
1218
+ currentState = stateTable[statesLength * (currentSymbol - 1) + (currentState - 1)];
1219
+ if (!currentState) {
1220
+ return 0;
1221
+ }
1222
+ } else {
1223
+ currentState = stateTable[statesLength * (currentSymbol - 1) + (currentState - 1)];
1224
+ if (!currentState) {
1225
+ currentSymbol = getIndex(symbolHashSet, vocabularyLength, REGEX_DOT);
1226
+ if (!currentSymbol) {
1227
+ return 0;
1228
+ }
1229
+
1230
+ currentState = stateTable[statesLength * (currentSymbol - 1) + (currentState - 1)];
1231
+ if (!currentState) {
1232
+ return 0;
1233
+ }
1234
+ }
1235
+ }
1236
+ }
1237
+ }
1238
+
1239
+ return acceptVector[currentState - 1];
1240
+ }