durable_rules 0.34.13 → 0.34.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/src/rules/regex.c ADDED
@@ -0,0 +1,1240 @@
1
+
2
+ #include <stdio.h>
3
+ #include <stdlib.h>
4
+ #include <string.h>
5
+ #include "rules.h"
6
+ #include "rete.h"
7
+ #include "regex.h"
8
+
9
/* Token kinds produced by readNextToken and dispatched on by the graph builders. */
#define REGEX_SYMBOL   0x00   /* one or more literal symbols (also character classes) */
#define REGEX_UNION    0x01   /* '|' */
#define REGEX_STAR     0x02   /* '*' */
#define REGEX_PLUS     0x03   /* '+' */
#define REGEX_QUESTION 0x04   /* '?' */
#define REGEX_INTERVAL 0x05   /* '{low,high}' */
#define REGEX_REGEX    0x06   /* '(' or ')' : nested expression boundary */

/* Symbol value stored for the '.' wildcard. */
#define REGEX_DOT      0xFFFE
17
+
18
/* Fixed capacities used by the automaton builder. */
#define MAX_TRANSITIONS 4096   /* outgoing transitions per state / symbols per token */
#define MAX_QUEUE       1024   /* slots in the traversal queue (CREATE_QUEUE) */
#define MAX_STATES      4096   /* upper bound on state ids handed out by createState */
#define MAX_HSET        1024   /* slots in the state hash set (CREATE_HASHSET) */
#define MAX_SET         8192   /* slots in the symbol set (CREATE_SET) */
#define MAX_LIST        1024   /* bound checked by ADD on the sorted state list */
#define MAX_INTERVAL    100    /* largest accepted upper bound in '{low,high}' */
25
+
26
+
27
/*
 * Circular FIFO helpers. CREATE_QUEUE declares the locals `queue`, `first`
 * and `last` in the enclosing scope; ENQUEUE/DEQUEUE operate on those names.
 * One slot is always kept free so that `first == last` means "empty".
 */
#define CREATE_QUEUE(type) \
    type queue[MAX_QUEUE]; \
    unsigned short first = 0; \
    unsigned short last = 0;

/*
 * Appends `value`. Makes the ENCLOSING function return ERR_REGEX_QUEUE_FULL
 * when no free slot is left. The full test wraps around MAX_QUEUE: without
 * the modulo a queue whose `first` is 0 was never detected as full and the
 * write silently made the queue look empty again.
 */
#define ENQUEUE(value) do { \
    if (((last + 1) % MAX_QUEUE) == first) { \
        return ERR_REGEX_QUEUE_FULL; \
    } \
    queue[last] = (value); \
    last = (last + 1) % MAX_QUEUE; \
} while(0)

/*
 * Pops the oldest element into `*value`; stores 0 when the queue is empty,
 * which callers use as their loop terminator.
 */
#define DEQUEUE(value) do { \
    if (first == last) { \
        *(value) = 0; \
    } else { \
        *(value) = queue[first]; \
        first = (first + 1) % MAX_QUEUE; \
    } \
} while(0)
48
+
49
/*
 * Sorted state-list helpers. CREATE_LIST declares the locals `list` and `top`
 * in the enclosing scope. The backing array is sized with MAX_LIST, the same
 * constant ADD uses for its bound check (it was previously sized with
 * MAX_QUEUE, which only worked because both constants share a value).
 */
#define CREATE_LIST(type) \
    type list[MAX_LIST]; \
    unsigned short top = 0;

/* True when nothing has been added yet. */
#define LIST_EMPTY() (!top)

/*
 * Inserts `value` (a `state *`) keeping the list ordered by ascending id.
 * Makes the ENCLOSING function return ERR_REGEX_LIST_FULL when the list
 * cannot take another element.
 */
#define ADD(value) do { \
    if ((top + 1) == MAX_LIST) { \
        return ERR_REGEX_LIST_FULL; \
    } \
    list[top++] = (value); \
    for (unsigned short i = top - 1; (i > 0) && (list[i]->id < list[i - 1]->id); --i) { \
        state *temp = list[i]; list[i] = list[i - 1]; list[i - 1] = temp; \
    } \
} while(0)

/* Expands to the (array, length) argument pair expected by list consumers. */
#define LIST list, top
66
+
67
/*
 * Open-addressing hash set of pointers to objects carrying a `hash` field.
 * CREATE_HASHSET declares the zero-initialised local `hset`; HSET/HGET work
 * on whatever `hset` is in scope (a local or a parameter).
 */
#define CREATE_HASHSET(type) \
    type hset[MAX_HSET] = {0};

/*
 * Stores `value` in the first free slot starting at value->hash % MAX_HSET.
 * Makes the ENCLOSING function return ERR_REGEX_SET_FULL when every slot was
 * probed without finding a free one.
 */
#define HSET(value) do { \
    unsigned int size = 0; \
    unsigned short index = (value)->hash % MAX_HSET; \
    while (hset[index]) { \
        index = (index + 1) % MAX_HSET; \
        ++size; \
        if (size == MAX_HSET) { \
            return ERR_REGEX_SET_FULL; \
        } \
    } \
    hset[index] = (value); \
} while(0)

/*
 * Looks up the first entry whose hash equals `valueHash`; `*value` is NULL
 * when none is found. The probe count is bounded by MAX_HSET so a completely
 * full table without a match can no longer spin forever.
 */
#define HGET(valueHash, value) do { \
    unsigned int probes = 0; \
    unsigned short index = (valueHash) % MAX_HSET; \
    *(value) = NULL; \
    while (probes < MAX_HSET && hset[index] && !*(value)) { \
        if (hset[index]->hash == (valueHash)) { \
            *(value) = hset[index]; \
        } \
        index = (index + 1) % MAX_HSET; \
        ++probes; \
    } \
} while(0)

/* Name of the hash-set storage, for passing it to helpers. */
#define HASHSET hset
95
+
96
/*
 * Open-addressing set of non-zero integers (0 marks an empty slot).
 * CREATE_SET declares the zero-initialised local `set`.
 */
#define CREATE_SET(type) \
    type set[MAX_SET] = {0};

/*
 * Inserts `value` with linear probing from value % MAX_SET. Makes the
 * ENCLOSING function return ERR_REGEX_SET_FULL when all slots were probed.
 */
#define SET(value) do { \
    unsigned int size = 0; \
    unsigned int i = (value) % MAX_SET; \
    while (set[i]) { \
        i = (i + 1) % MAX_SET; \
        ++size; \
        if (size == MAX_SET) { \
            return ERR_REGEX_SET_FULL; \
        } \
    } \
    set[i] = (value); \
} while(0)

/*
 * Sets `*result` to 1 when `value` is in the set, 0 otherwise. The probe
 * sequence wraps at MAX_SET, matching SET (it previously wrapped at MAX_HSET
 * and therefore missed entries stored after a collision), and is bounded so
 * a full set cannot cause an endless scan.
 */
#define EXISTS(value, result) do { \
    unsigned int probes = 0; \
    unsigned int i = (value) % MAX_SET; \
    *(result) = 0; \
    while (probes < MAX_SET && set[i] && !*(result)) { \
        if (set[i] == (value)) { \
            *(result) = 1; \
        } \
        i = (i + 1) % MAX_SET; \
        ++probes; \
    } \
} while(0)
122
+
123
/*
 * Calls createState and makes the ENCLOSING function return the error code
 * when the call does not yield RULES_OK.
 */
#define CREATE_STATE(stateId, newState) do { \
    unsigned int createStateResult_ = createState(stateId, newState); \
    if (createStateResult_ != RULES_OK) { \
        return createStateResult_; \
    } \
} while (0)

/*
 * Calls linkStates and makes the ENCLOSING function return the error code
 * when the call does not yield RULES_OK.
 */
#define LINK_STATES(previousState, nextState, tokenSymbol) do { \
    unsigned int linkStatesResult_ = linkStates(previousState, nextState, tokenSymbol); \
    if (linkStatesResult_ != RULES_OK) { \
        return linkStatesResult_; \
    } \
} while (0)
136
+
137
/* Declared early because transitions point at states and states embed transitions. */
struct state;

/* One outgoing edge of an automaton state. */
struct transition {
    /* Symbol that triggers the edge; EMPTY (0) and REGEX_DOT are also stored here. */
    unsigned int symbol;
    /* State reached through this edge. */
    struct state *next;
};

typedef struct transition transition;
143
+
144
+ typedef struct state {
145
+ unsigned int hash;
146
+ unsigned short refCount;
147
+ unsigned short id;
148
+ unsigned short transitionsLength;
149
+ unsigned char isAccept;
150
+ unsigned char isReject;
151
+ transition transitions[MAX_TRANSITIONS];
152
+ } state;
153
+
154
+ typedef struct token {
155
+ unsigned char type;
156
+ unsigned short low;
157
+ unsigned short high;
158
+ unsigned short symbolsLength;
159
+ unsigned int symbols[MAX_TRANSITIONS];
160
+ unsigned short inverseSymbolsLength;
161
+ unsigned int inverseSymbols[MAX_TRANSITIONS];
162
+ } token;
163
+
164
/* Slot of the packed symbol table: maps a symbol to its column index. */
struct symbolEntry {
    /* Symbol value; 0 marks an unused slot in setIndex/getIndex. */
    unsigned int symbol;
    /* Index associated with the symbol. */
    unsigned short index;
};

typedef struct symbolEntry symbolEntry;
168
+
169
/*
 * Value subtracted by utf8ToUnicode after accumulating the raw bytes of a
 * sequence, indexed by the number of trailing bytes.
 */
static const unsigned int UTF8_OFFSETS[6] = {
    0x00000000u,
    0x00003080u,
    0x000E2080u,
    0x03C82080u,
    0xFA082080u,
    0x82082080u
};
173
+
174
/*
 * Number of bytes that follow a given lead byte in a UTF-8 sequence,
 * indexed by the lead byte value (16 entries per row).
 */
static const char UTF8_TRAILING[256] = {
    /* 0x00 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x10 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x20 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x30 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x40 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x50 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x60 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x70 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x80 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x90 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xA0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xB0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xC0 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xD0 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xE0 */ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    /* 0xF0 */ 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
};
184
+
185
/* Symbol value used for transitions that consume no input. */
static const unsigned int EMPTY = 0u;
186
+
187
+ unsigned int utf8ToUnicode(char **first, char *last, unsigned int *result) {
188
+ unsigned char byteNumber = UTF8_TRAILING[(unsigned char)*first[0]];
189
+ if (*first + byteNumber >= last) {
190
+ return ERR_PARSE_REGEX;
191
+ }
192
+
193
+ *result = 0;
194
+ switch (byteNumber) {
195
+ case 3:
196
+ *result += (unsigned char)*first[0];
197
+ *result <<= 6;
198
+ ++*first;
199
+ case 2:
200
+ *result += (unsigned char)*first[0];
201
+ *result <<= 6;
202
+ ++*first;
203
+ case 1:
204
+ *result += (unsigned char)*first[0];
205
+ *result <<= 6;
206
+ ++*first;
207
+ case 0:
208
+ *result += (unsigned char)*first[0];
209
+ ++*first;
210
+ }
211
+ *result -= UTF8_OFFSETS[byteNumber];
212
+ return REGEX_PARSE_OK;
213
+ }
214
+
215
+ static unsigned int readInternalRange(char *first,
216
+ unsigned short *rangeLength,
217
+ unsigned int *range);
218
+
219
+ static unsigned int readEscapedSymbol(char **first,
220
+ char *last,
221
+ unsigned short *rangeLength,
222
+ unsigned int *range) {
223
+ ++*first;
224
+ if (*first >= last) {
225
+ return ERR_PARSE_REGEX;
226
+ }
227
+
228
+ switch (*first[0]) {
229
+ case '.':
230
+ case '|':
231
+ case '?':
232
+ case '*':
233
+ case '+':
234
+ case '(':
235
+ case ')':
236
+ case '[':
237
+ case ']':
238
+ case '{':
239
+ case '}':
240
+ case '%':
241
+ range[*rangeLength] = *first[0];
242
+ ++*rangeLength;
243
+ ++*first;
244
+ return REGEX_PARSE_OK;
245
+ case 'a':
246
+ ++*first;
247
+ return readInternalRange("[\x41-\x5A\x61-\x7A\xC3\x80-\xC3\x96\xC3\x98-\xC3\xB6\xC3\xB8-\xC3\xBF]", rangeLength, range);
248
+ case 'c':
249
+ ++*first;
250
+ return readInternalRange("[\x00-\x1F\x7F\xC2\x80-\xC2\x9F]", rangeLength, range);
251
+ case 'd':
252
+ ++*first;
253
+ return readInternalRange("[0-9]", rangeLength, range);
254
+ case 'g':
255
+ ++*first;
256
+ return readInternalRange("[\x21-\x7E]", rangeLength, range);
257
+ case 'l':
258
+ ++*first;
259
+ return readInternalRange("[\x61-\x7A\xC3\x9F-\xC3\xB6\xC3\xB8-\xC3\xBF]", rangeLength, range);
260
+ case 'p':
261
+ ++*first;
262
+ return readInternalRange("[.,;:?!'\"()\xC2\xA1\xC2\xBF-]", rangeLength, range);
263
+ case 's':
264
+ ++*first;
265
+ return readInternalRange("[\x09-\x0D\x20]", rangeLength, range);
266
+ case 'u':
267
+ ++*first;
268
+ return readInternalRange("[\x41-\x5A\xC3\x80-\xC3\x96\xC3\x98-\xC3\x9E]", rangeLength, range);
269
+ case 'w':
270
+ ++*first;
271
+ return readInternalRange("[A-Za-z0-9]", rangeLength, range);
272
+ case 'x':
273
+ ++*first;
274
+ return readInternalRange("[0-9A-Fa-f]", rangeLength, range);
275
+ }
276
+
277
+ return ERR_PARSE_REGEX;
278
+ }
279
+
280
+ static unsigned int readRange(char **first,
281
+ char *last,
282
+ unsigned short *rangeLength,
283
+ unsigned int *range,
284
+ unsigned short *inverseRangeLength,
285
+ unsigned int *inverseRange) {
286
+ unsigned char parseBegin = 1;
287
+ unsigned int lastSymbol = 0;
288
+ unsigned int currentSymbol;
289
+ unsigned char inverse = 0;
290
+ unsigned int result;
291
+ *rangeLength = 0;
292
+ if (inverseRangeLength) {
293
+ *inverseRangeLength = 0;
294
+ }
295
+
296
+ ++*first;
297
+ if (*first[0] == '^') {
298
+ if (*first == last) {
299
+ return ERR_PARSE_REGEX;
300
+ }
301
+
302
+ inverse = 1;
303
+ ++*first;
304
+ }
305
+
306
+ if (*first[0] == ']') {
307
+ if (*first == last) {
308
+ return ERR_PARSE_REGEX;
309
+ }
310
+
311
+ if (inverse) {
312
+ inverseRange[*inverseRangeLength] = (unsigned int)']';
313
+ ++*inverseRangeLength;
314
+ } else {
315
+ range[*rangeLength] = (unsigned int)']';
316
+ ++*rangeLength;
317
+ }
318
+ }
319
+
320
+ while (*first[0] != ']') {
321
+ if (*first == last) {
322
+ return ERR_PARSE_REGEX;
323
+ }
324
+
325
+ if (!parseBegin) {
326
+ if (!lastSymbol) {
327
+ return ERR_PARSE_REGEX;
328
+ }
329
+
330
+ result = utf8ToUnicode(first, last, &currentSymbol);
331
+ if (result != REGEX_PARSE_OK) {
332
+ return result;
333
+ }
334
+
335
+ while (currentSymbol != lastSymbol) {
336
+ if (inverse) {
337
+ inverseRange[*inverseRangeLength] = currentSymbol;
338
+ ++*inverseRangeLength;
339
+ } else {
340
+ range[*rangeLength] = currentSymbol;
341
+ ++*rangeLength;
342
+ }
343
+
344
+ if (currentSymbol > lastSymbol) {
345
+ --currentSymbol;
346
+ } else {
347
+ ++currentSymbol;
348
+ }
349
+ }
350
+ parseBegin = 1;
351
+ } else {
352
+ if (*first[0] == '-') {
353
+ parseBegin = 0;
354
+ ++*first;
355
+ } else {
356
+ if (*first[0] != '%') {
357
+ result = utf8ToUnicode(first, last, &currentSymbol);
358
+ if (result != REGEX_PARSE_OK) {
359
+ return result;
360
+ }
361
+
362
+ if (inverse) {
363
+ inverseRange[*inverseRangeLength] = currentSymbol;
364
+ ++*inverseRangeLength;
365
+ } else {
366
+ range[*rangeLength] = currentSymbol;
367
+ ++*rangeLength;
368
+ }
369
+ lastSymbol = currentSymbol;
370
+ } else {
371
+ if (inverse) {
372
+ unsigned int result = readEscapedSymbol(first, last, inverseRangeLength, inverseRange);
373
+ if (result != REGEX_PARSE_OK) {
374
+ return result;
375
+ }
376
+ } else {
377
+ unsigned int result = readEscapedSymbol(first, last, rangeLength, range);
378
+ if (result != REGEX_PARSE_OK) {
379
+ return result;
380
+ }
381
+ }
382
+ lastSymbol = 0;
383
+ }
384
+ }
385
+ }
386
+ }
387
+
388
+ if (!parseBegin) {
389
+ if (inverse) {
390
+ inverseRange[*inverseRangeLength] = (unsigned int)'-';
391
+ ++*inverseRangeLength;
392
+ } else {
393
+ range[*rangeLength] = (unsigned int)'-';
394
+ ++*rangeLength;
395
+ }
396
+ }
397
+
398
+ ++*first;
399
+ return REGEX_PARSE_OK;
400
+ }
401
+
402
/*
 * Expands a built-in, NUL-terminated bracket expression (such as "[0-9]")
 * into `range`. The end bound handed to readRange is the address of the
 * expression's final character; no inverse buffers are supplied.
 */
static unsigned int readInternalRange(char *first,
                                      unsigned short *rangeLength,
                                      unsigned int *range) {
    unsigned int definitionLength = strlen(first);
    char *finalCharacter = first + definitionLength - 1;

    return readRange(&first, finalCharacter, rangeLength, range, NULL, NULL);
}
408
+
409
+ static unsigned int readInterval(char **first,
410
+ char *last,
411
+ unsigned short *low,
412
+ unsigned short *high) {
413
+
414
+ ++*first;
415
+ unsigned char parseBegin = 1;
416
+ char *numberBegin = *first;
417
+ while (*first[0] != '}') {
418
+ if (*first == last) {
419
+ return ERR_PARSE_REGEX;
420
+ }
421
+
422
+ if (parseBegin) {
423
+ if (*first[0] == ',' && numberBegin != *first) {
424
+ parseBegin = 0;
425
+ *first[0] = '\0';
426
+ *low = atoi(numberBegin);
427
+ *first[0] = ',';
428
+ numberBegin = *first + 1;
429
+ } else if (*first[0] > '9' || *first[0] < 0) {
430
+ return ERR_PARSE_REGEX;
431
+ }
432
+ } else if (*first[0] > '9' || *first[0] < 0) {
433
+ return ERR_PARSE_REGEX;
434
+ }
435
+
436
+ ++*first;
437
+ }
438
+
439
+ if (numberBegin == *first) {
440
+ *high = 0;
441
+ } else {
442
+ *first[0] = '\0';
443
+ *high = atoi(numberBegin);
444
+ *first[0] = '}';
445
+
446
+ if (parseBegin) {
447
+ *low = *high;
448
+ }
449
+ }
450
+
451
+ if ((*high && *low > *high) || *high > MAX_INTERVAL) {
452
+ return ERR_PARSE_REGEX;
453
+ }
454
+
455
+ ++*first;
456
+ return REGEX_PARSE_OK;
457
+ }
458
+
459
+ static unsigned int readNextToken(char **first,
460
+ char *last,
461
+ token *nextToken) {
462
+ unsigned int result = REGEX_PARSE_OK;
463
+ if (*first >= last) {
464
+ return REGEX_PARSE_END;
465
+ }
466
+
467
+ switch (*first[0]) {
468
+ case '|':
469
+ nextToken->type = REGEX_UNION;
470
+ break;
471
+ case '?':
472
+ nextToken->type = REGEX_QUESTION;
473
+ break;
474
+ case '*':
475
+ nextToken->type = REGEX_STAR;
476
+ break;
477
+ case '+':
478
+ nextToken->type = REGEX_PLUS;
479
+ break;
480
+ case '(':
481
+ nextToken->type = REGEX_REGEX;
482
+ break;
483
+ case ')':
484
+ nextToken->type = REGEX_REGEX;
485
+ result = REGEX_PARSE_END;
486
+ break;
487
+ case '[':
488
+ nextToken->type = REGEX_SYMBOL;
489
+ return readRange(first, last, &nextToken->symbolsLength,
490
+ nextToken->symbols,
491
+ &nextToken->inverseSymbolsLength,
492
+ nextToken->inverseSymbols);
493
+ case '{':
494
+ nextToken->type = REGEX_INTERVAL;
495
+ return readInterval(first, last, &nextToken->low, &nextToken->high);
496
+ case '%':
497
+ nextToken->type = REGEX_SYMBOL;
498
+ return readEscapedSymbol(first, last, &nextToken->symbolsLength, nextToken->symbols);
499
+ case '.':
500
+ nextToken->type = REGEX_SYMBOL;
501
+ nextToken->symbolsLength = 1;
502
+ nextToken->symbols[0] = REGEX_DOT;
503
+ break;
504
+ default:
505
+ nextToken->type = REGEX_SYMBOL;
506
+ nextToken->symbolsLength = 1;
507
+ return utf8ToUnicode(first, last, &nextToken->symbols[0]);
508
+ }
509
+
510
+ ++*first;
511
+ return result;
512
+ }
513
+
514
+ static unsigned int storeRegexStateMachine(ruleset *tree,
515
+ unsigned short vocabularyLength,
516
+ unsigned short statesLength,
517
+ void **newStateMachine,
518
+ unsigned int *stateMachineOffset) {
519
+
520
+ unsigned int stateMachinelength = sizeof(symbolEntry) * vocabularyLength * 2;
521
+ stateMachinelength = stateMachinelength + sizeof(unsigned short) * statesLength * vocabularyLength;
522
+ stateMachinelength = stateMachinelength + sizeof(unsigned char) * statesLength;
523
+ if (!tree->regexStateMachinePool) {
524
+ tree->regexStateMachinePool = malloc(stateMachinelength);
525
+ if (!tree->regexStateMachinePool) {
526
+ return ERR_OUT_OF_MEMORY;
527
+ }
528
+
529
+ memset(tree->regexStateMachinePool, 0, stateMachinelength);
530
+ *stateMachineOffset = 0;
531
+ *newStateMachine = &tree->regexStateMachinePool[0];
532
+ tree->regexStateMachineOffset = stateMachinelength;
533
+ } else {
534
+ tree->regexStateMachinePool = realloc(tree->regexStateMachinePool, tree->regexStateMachineOffset + stateMachinelength);
535
+ if (!tree->regexStateMachinePool) {
536
+ return ERR_OUT_OF_MEMORY;
537
+ }
538
+
539
+ memset(&tree->regexStateMachinePool[tree->regexStateMachineOffset], 0, stateMachinelength);
540
+ *stateMachineOffset = tree->regexStateMachineOffset;
541
+ *newStateMachine = &tree->regexStateMachinePool[tree->regexStateMachineOffset];
542
+ tree->regexStateMachineOffset = tree->regexStateMachineOffset + stateMachinelength;
543
+ }
544
+
545
+ return RULES_OK;
546
+ }
547
+
548
+ static unsigned int createState(unsigned short *stateId,
549
+ state **newState) {
550
+ if (*stateId == MAX_STATES) {
551
+ return ERR_REGEX_MAX_STATES;
552
+ }
553
+ *newState = malloc(sizeof(state));
554
+ if (*newState == NULL) {
555
+ return ERR_OUT_OF_MEMORY;
556
+ }
557
+ (*newState)->id = *stateId;
558
+ (*newState)->transitionsLength = 0;
559
+ (*newState)->refCount = 0;
560
+ (*newState)->isAccept = 0;
561
+ (*newState)->isReject = 0;
562
+ (*newState)->hash = 0;
563
+ ++*stateId;
564
+
565
+ return RULES_OK;
566
+ }
567
+
568
+ static unsigned int linkStates(state *previousState,
569
+ state *nextState,
570
+ unsigned int tokenSymbol) {
571
+ for (int i = 0; i < previousState->transitionsLength; ++i) {
572
+ if (previousState->transitions[i].symbol == tokenSymbol &&
573
+ previousState->transitions[i].next->id == nextState->id) {
574
+ return RULES_OK;
575
+ }
576
+ }
577
+
578
+ previousState->transitions[previousState->transitionsLength].symbol = tokenSymbol;
579
+ previousState->transitions[previousState->transitionsLength].next = nextState;
580
+ ++previousState->transitionsLength;
581
+ ++nextState->refCount;
582
+ if (previousState->transitionsLength == MAX_TRANSITIONS) {
583
+ return ERR_REGEX_MAX_TRANSITIONS;
584
+ }
585
+
586
+ return RULES_OK;
587
+ }
588
+
589
+ static void deleteTransition(state *previousState, unsigned short index) {
590
+ state *nextState = previousState->transitions[index].next;
591
+ --nextState->refCount;
592
+ if (!nextState->refCount) {
593
+ free(nextState);
594
+ }
595
+
596
+ for (unsigned short i = index + 1; i < previousState->transitionsLength; ++i) {
597
+ previousState->transitions[i - 1].symbol = previousState->transitions[i].symbol;
598
+ previousState->transitions[i - 1].next = previousState->transitions[i].next;
599
+ }
600
+ --previousState->transitionsLength;
601
+ }
602
+
603
+ static void unlinkStates(state *previousState,
604
+ state *nextState,
605
+ unsigned int tokenSymbol) {
606
+ for (int i = 0; i < previousState->transitionsLength; ++i) {
607
+ if (previousState->transitions[i].symbol == tokenSymbol &&
608
+ previousState->transitions[i].next->id == nextState->id) {
609
+ deleteTransition(previousState, i);
610
+ }
611
+ }
612
+ }
613
+
614
#ifdef _PRINT
/*
 * Debug helper: prints every state reachable from `start` in breadth-first
 * order, with its accept/reject flags and outgoing transitions.
 * Returns RULES_OK, or ERR_REGEX_QUEUE_FULL when the traversal queue fills up.
 */
static unsigned int printGraph(state *start) {
    CREATE_QUEUE(state*);
    unsigned char seen[MAX_STATES] = {0};
    state *cursor = start;

    seen[cursor->id] = 1;
    while (cursor) {
        printf("State %d\n", cursor->id);
        if (cursor->isAccept) {
            printf(" Accept\n");
        }

        if (cursor->isReject) {
            printf(" Reject\n");
        }

        for (int edgeIndex = 0; edgeIndex < cursor->transitionsLength; ++edgeIndex) {
            transition *edge = &cursor->transitions[edgeIndex];
            state *successor = edge->next;

            printf(" transition %x to state %d\n", edge->symbol, successor->id);
            if (!seen[successor->id]) {
                seen[successor->id] = 1;
                ENQUEUE(successor);
            }
        }

        /* Yields NULL once the queue is drained, which ends the loop. */
        DEQUEUE(&cursor);
    }

    return RULES_OK;
}
#endif
643
+
644
+ static unsigned int cloneGraph(state *startState,
645
+ state *endState,
646
+ unsigned short *id,
647
+ state **newStart,
648
+ state **newEnd) {
649
+ CREATE_QUEUE(state*);
650
+ state *visited[MAX_STATES] = { NULL };
651
+ state *currentState = startState;
652
+ CREATE_STATE(id, &visited[currentState->id]);
653
+ while (currentState) {
654
+ if (currentState->isAccept) {
655
+ visited[currentState->id]->isAccept = 1;
656
+ }
657
+
658
+ if (currentState->isReject) {
659
+ visited[currentState->id]->isReject = 1;
660
+ }
661
+
662
+ for (int i = 0; i < currentState->transitionsLength; ++ i) {
663
+ transition *currentTransition = &currentState->transitions[i];
664
+
665
+ if (!visited[currentTransition->next->id]) {
666
+ CREATE_STATE(id, &visited[currentTransition->next->id]);
667
+ ENQUEUE(currentTransition->next);
668
+ }
669
+
670
+ LINK_STATES(visited[currentState->id], visited[currentTransition->next->id], currentTransition->symbol);
671
+ }
672
+
673
+ DEQUEUE(&currentState);
674
+ }
675
+
676
+ *newStart = visited[startState->id];
677
+ *newEnd = visited[endState->id];
678
+ return RULES_OK;
679
+ }
680
+
681
+ static unsigned int createGraph(char **first,
682
+ char *last,
683
+ unsigned short *id,
684
+ state **startState,
685
+ state **endState) {
686
+ CREATE_STATE(id, startState);
687
+ CREATE_STATE(id, endState);
688
+ state *previousState = *startState;
689
+ state *currentState = *startState;
690
+
691
+ token currentToken;
692
+ unsigned int result = readNextToken(first, last, &currentToken);
693
+ while (result == REGEX_PARSE_OK) {
694
+ switch (currentToken.type) {
695
+ case REGEX_SYMBOL:
696
+ previousState = currentState;
697
+ if (currentToken.symbolsLength) {
698
+ CREATE_STATE(id, &currentState);
699
+ for (unsigned short i = 0; i < currentToken.symbolsLength; ++i) {
700
+ LINK_STATES(previousState, currentState, currentToken.symbols[i]);
701
+ }
702
+ }
703
+
704
+ if (currentToken.inverseSymbolsLength) {
705
+ CREATE_STATE(id, &currentState);
706
+ currentState->isReject = 1;
707
+ for (unsigned short i = 0; i < currentToken.inverseSymbolsLength; ++i) {
708
+ LINK_STATES(previousState, currentState, currentToken.inverseSymbols[i]);
709
+ }
710
+
711
+ CREATE_STATE(id, &currentState);
712
+ LINK_STATES(previousState, currentState, REGEX_DOT);
713
+ }
714
+
715
+ break;
716
+ case REGEX_UNION:
717
+ LINK_STATES(currentState, *endState, EMPTY);
718
+ CREATE_STATE(id, &currentState);
719
+ previousState = *startState;
720
+ LINK_STATES(previousState, currentState, EMPTY);
721
+ break;
722
+ case REGEX_STAR:
723
+ {
724
+ state *anchorState;
725
+ CREATE_STATE(id, &anchorState);
726
+ LINK_STATES(currentState, previousState, EMPTY);
727
+ LINK_STATES(currentState, anchorState, EMPTY);
728
+ LINK_STATES(previousState, anchorState, EMPTY);
729
+ previousState = currentState;
730
+ currentState = anchorState;
731
+ }
732
+ break;
733
+ case REGEX_PLUS:
734
+ {
735
+ state *anchorState;
736
+ CREATE_STATE(id, &anchorState);
737
+ LINK_STATES(currentState, previousState, EMPTY);
738
+ LINK_STATES(currentState, anchorState, EMPTY);
739
+ previousState = currentState;
740
+ currentState = anchorState;
741
+ }
742
+ break;
743
+ case REGEX_QUESTION:
744
+ {
745
+ state *anchorState;
746
+ CREATE_STATE(id, &anchorState);
747
+ LINK_STATES(currentState, anchorState, EMPTY);
748
+ LINK_STATES(previousState, anchorState, EMPTY);
749
+ previousState = currentState;
750
+ currentState = anchorState;
751
+ }
752
+ break;
753
+ case REGEX_REGEX:
754
+ {
755
+ state *subStart;
756
+ state *subEnd;
757
+ result = createGraph(first, last, id, &subStart, &subEnd);
758
+ if (result != REGEX_PARSE_OK) {
759
+ return result;
760
+ }
761
+
762
+ LINK_STATES(currentState, subStart, EMPTY);
763
+ previousState = currentState;
764
+ currentState = subEnd;
765
+ }
766
+ break;
767
+ case REGEX_INTERVAL:
768
+ {
769
+ state *newCurrent = NULL;
770
+ state *newPrevious = NULL;
771
+ state *subStart = previousState;
772
+ state *subEnd = currentState;
773
+ state *anchorState;
774
+ CREATE_STATE(id, &anchorState);
775
+ for (unsigned short i = 1; i < (!currentToken.high? currentToken.low: currentToken.high); ++i) {
776
+ result = cloneGraph(previousState, currentState, id, &subStart, &subEnd);
777
+ if (result != REGEX_PARSE_OK) {
778
+ return result;
779
+ }
780
+
781
+ if (newCurrent) {
782
+ LINK_STATES(newCurrent, subStart, EMPTY);
783
+ } else {
784
+ newPrevious = subStart;
785
+ }
786
+
787
+ if (i >= currentToken.low) {
788
+ LINK_STATES(subStart, anchorState, EMPTY);
789
+ }
790
+
791
+ newCurrent = subEnd;
792
+ }
793
+
794
+ if (!currentToken.high) {
795
+ LINK_STATES(subEnd, subStart, EMPTY);
796
+ }
797
+
798
+ if (!currentToken.low) {
799
+ LINK_STATES(previousState, anchorState, EMPTY);
800
+ }
801
+
802
+ if (!newPrevious) {
803
+ LINK_STATES(currentState, anchorState, EMPTY);
804
+ previousState = currentState;
805
+ } else {
806
+ LINK_STATES(currentState, newPrevious, EMPTY);
807
+ LINK_STATES(newCurrent, anchorState, EMPTY);
808
+ previousState = newCurrent;
809
+ }
810
+ currentState = anchorState;
811
+ }
812
+ break;
813
+ }
814
+ if (result == REGEX_PARSE_OK) {
815
+ result = readNextToken(first, last, &currentToken);
816
+ }
817
+ }
818
+
819
+ LINK_STATES(currentState, *endState, EMPTY);
820
+
821
+ if (result == REGEX_PARSE_END) {
822
+ return REGEX_PARSE_OK;
823
+ }
824
+
825
+ return result;
826
+ }
827
+
828
+ static unsigned int validateGraph(char **first, char *last) {
829
+ token currentToken;
830
+ unsigned int result = readNextToken(first, last, &currentToken);
831
+ while (result == REGEX_PARSE_OK) {
832
+ switch (currentToken.type) {
833
+ case REGEX_SYMBOL:
834
+ case REGEX_UNION:
835
+ case REGEX_STAR:
836
+ case REGEX_PLUS:
837
+ case REGEX_QUESTION:
838
+ break;
839
+ case REGEX_REGEX:
840
+ result = validateGraph(first, last);
841
+ if (result != REGEX_PARSE_OK) {
842
+ return result;
843
+ }
844
+
845
+ break;
846
+ }
847
+
848
+ if (result == REGEX_PARSE_OK) {
849
+ result = readNextToken(first, last, &currentToken);
850
+ }
851
+ }
852
+
853
+ if (result == REGEX_PARSE_END) {
854
+ return REGEX_PARSE_OK;
855
+ }
856
+
857
+ return REGEX_PARSE_OK;
858
+ }
859
+
860
+ static unsigned short calculateHash(state **list,
861
+ unsigned short stateListLength) {
862
+ unsigned int hash = 5381;
863
+ for (unsigned short i = 0; i < stateListLength; ++i) {
864
+ hash = ((hash << 5) + hash) + list[i]->id;
865
+ }
866
+
867
+ return hash;
868
+ }
869
+
870
+ static unsigned int ensureState(unsigned short *id,
871
+ state **list,
872
+ unsigned short stateListLength,
873
+ state **newState) {
874
+ CREATE_STATE(id, newState);
875
+ for (unsigned short i = 0; i < stateListLength; ++i) {
876
+ state *targetState = list[i];
877
+ for (unsigned short ii = 0; ii < targetState->transitionsLength; ++ii) {
878
+ transition *targetTransition = &targetState->transitions[ii];
879
+ LINK_STATES(*newState, targetTransition->next, targetTransition->symbol);
880
+ }
881
+
882
+ if (targetState->isAccept) {
883
+ (*newState)->isAccept = 1;
884
+ }
885
+
886
+ if (targetState->isReject) {
887
+ (*newState)->isReject = 1;
888
+ }
889
+
890
+ if ((*newState)->isReject && (*newState)->isAccept) {
891
+ return ERR_REGEX_CONFLICT;
892
+ }
893
+ }
894
+
895
+ return RULES_OK;
896
+ }
897
+
898
+ static unsigned int consolidateStates(state *currentState,
899
+ unsigned short *id) {
900
+ for (unsigned short i = 0; i < currentState->transitionsLength; ++i) {
901
+ transition *currentTransition = &currentState->transitions[i];
902
+ if (!currentTransition->symbol) {
903
+ state *nextState = currentTransition->next;
904
+ if (nextState != currentState) {
905
+ for (unsigned short ii = 0; ii < nextState->transitionsLength; ++ii) {
906
+ transition *nextTransition = &nextState->transitions[ii];
907
+ LINK_STATES(currentState, nextTransition->next, nextTransition->symbol);
908
+ if (nextState->refCount == 1) {
909
+ --nextTransition->next->refCount;
910
+ }
911
+ }
912
+ }
913
+
914
+ if (nextState->isAccept) {
915
+ currentState->isAccept = 1;
916
+ }
917
+
918
+ if (nextState->isReject) {
919
+ currentState->isReject = 1;
920
+ }
921
+
922
+ if (currentState->isAccept && currentState->isReject) {
923
+ return ERR_REGEX_CONFLICT;
924
+ }
925
+
926
+ deleteTransition(currentState, i);
927
+ --i;
928
+ }
929
+ }
930
+
931
+ return RULES_OK;
932
+ }
933
+
934
+ static unsigned int consolidateTransitions(state *currentState,
935
+ unsigned short *id,
936
+ state **hset) {
937
+ transition oldTransitions[MAX_TRANSITIONS];
938
+ unsigned short oldTransitionsLength = 0;
939
+ transition newTransitions[MAX_TRANSITIONS];
940
+ unsigned short newTransitionsLength = 0;
941
+ CREATE_SET(unsigned int);
942
+
943
+ for (unsigned short i = 0; i < currentState->transitionsLength; ++i) {
944
+ transition *currentTransition = &currentState->transitions[i];
945
+ CREATE_LIST(state*);
946
+ unsigned int foundSymbol = 0;
947
+ unsigned char symbolExists = 0;
948
+ EXISTS(currentTransition->symbol, &symbolExists);
949
+ if (!symbolExists) {
950
+ SET(currentTransition->symbol);
951
+ for (unsigned short ii = i + 1; ii < currentState->transitionsLength; ++ ii) {
952
+ transition *targetTransition = &currentState->transitions[ii];
953
+ if ((currentTransition->symbol == targetTransition->symbol) ||
954
+ (currentTransition->symbol == REGEX_DOT && !targetTransition->next->isReject) ||
955
+ (targetTransition->symbol == REGEX_DOT && !currentTransition->next->isReject)) {
956
+ foundSymbol = currentTransition->symbol;
957
+ if (foundSymbol == REGEX_DOT) {
958
+ foundSymbol = targetTransition->symbol;
959
+ }
960
+
961
+ if (LIST_EMPTY()) {
962
+ ADD(currentTransition->next);
963
+ oldTransitions[oldTransitionsLength].symbol = currentTransition->symbol;
964
+ oldTransitions[oldTransitionsLength].next = currentTransition->next;
965
+ ++oldTransitionsLength;
966
+ }
967
+
968
+ ADD(targetTransition->next);
969
+ oldTransitions[oldTransitionsLength].symbol = targetTransition->symbol;
970
+ oldTransitions[oldTransitionsLength].next = targetTransition->next;
971
+ ++oldTransitionsLength;
972
+ }
973
+ }
974
+
975
+ if (!LIST_EMPTY()) {
976
+ state *newState;
977
+ unsigned int newStateHash = calculateHash(LIST);
978
+ HGET(newStateHash, &newState);
979
+ if (!newState) {
980
+ unsigned int result = ensureState(id, LIST, &newState);
981
+ if (result != REGEX_PARSE_OK) {
982
+ return result;
983
+ }
984
+
985
+ newState->hash = newStateHash;
986
+ HSET(newState);
987
+ }
988
+
989
+ newTransitions[newTransitionsLength].symbol = foundSymbol;
990
+ newTransitions[newTransitionsLength].next = newState;
991
+ ++newTransitionsLength;
992
+ }
993
+ }
994
+ }
995
+
996
+ for (unsigned short i = 0; i < oldTransitionsLength; ++i) {
997
+ unlinkStates(currentState, oldTransitions[i].next, oldTransitions[i].symbol);
998
+ }
999
+
1000
+ for (unsigned short i = 0; i < newTransitionsLength; ++i) {
1001
+ LINK_STATES(currentState, newTransitions[i].next, newTransitions[i].symbol);
1002
+ }
1003
+
1004
+ return RULES_OK;
1005
+ }
1006
+
1007
+ static unsigned int transformToDFA(state *nfa,
1008
+ unsigned short *id) {
1009
+
1010
+ #ifdef _PRINT
1011
+ printf("*** NFA ***\n");
1012
+ printGraph(nfa);
1013
+ #endif
1014
+
1015
+ CREATE_HASHSET(state*);
1016
+ CREATE_QUEUE(state*);
1017
+ unsigned char visited[MAX_STATES] = {0};
1018
+ state *currentState = nfa;
1019
+ visited[currentState->id] = 1;
1020
+ while (currentState) {
1021
+ unsigned int result = consolidateStates(currentState, id);
1022
+ if (result != RULES_OK) {
1023
+ return result;
1024
+ }
1025
+
1026
+ result = consolidateTransitions(currentState, id, HASHSET);
1027
+ if (result != REGEX_PARSE_OK) {
1028
+ return result;
1029
+ }
1030
+
1031
+ for (int i = 0; i < currentState->transitionsLength; ++ i) {
1032
+ transition *currentTransition = &currentState->transitions[i];
1033
+ if (!visited[currentTransition->next->id]) {
1034
+ visited[currentTransition->next->id] = 1;
1035
+ ENQUEUE(currentTransition->next);
1036
+ }
1037
+ }
1038
+
1039
+ DEQUEUE(&currentState);
1040
+ }
1041
+
1042
+ #ifdef _PRINT
1043
+ printf("*** DFA ***\n");
1044
+ printGraph(nfa);
1045
+ #endif
1046
+
1047
+ return RULES_OK;
1048
+ }
1049
+
1050
+ static unsigned int calculateGraphDimensions(state *start,
1051
+ unsigned short *vocabularyLength,
1052
+ unsigned short *statesLength) {
1053
+ *vocabularyLength = 0;
1054
+ *statesLength = 0;
1055
+ CREATE_QUEUE(state*);
1056
+ unsigned char visited[MAX_STATES] = {0};
1057
+ CREATE_SET(unsigned int);
1058
+ state *currentState = start;
1059
+ visited[currentState->id] = 1;
1060
+ while (currentState) {
1061
+ ++*statesLength;
1062
+ for (int i = 0; i < currentState->transitionsLength; ++ i) {
1063
+ transition *currentTransition = &currentState->transitions[i];
1064
+ unsigned char symbolExists = 0;
1065
+ EXISTS(currentTransition->symbol, &symbolExists);
1066
+ if (!symbolExists) {
1067
+ SET(currentTransition->symbol);
1068
+ ++*vocabularyLength;
1069
+ }
1070
+
1071
+ if (!visited[currentTransition->next->id]) {
1072
+ visited[currentTransition->next->id] = 1;
1073
+ ENQUEUE(currentTransition->next);
1074
+ }
1075
+ }
1076
+
1077
+ DEQUEUE(&currentState);
1078
+ }
1079
+
1080
+ return RULES_OK;
1081
+ }
1082
+
1083
+ static void setIndex(symbolEntry *symbolHashSet, unsigned short vocabularyLength, unsigned int symbol, unsigned short index) {
1084
+ unsigned int max = vocabularyLength * 2;
1085
+ unsigned int i = symbol % max;
1086
+ while (symbolHashSet[i].symbol) {
1087
+ i = (i + 1) % max;
1088
+ }
1089
+ symbolHashSet[i].symbol = symbol;
1090
+ symbolHashSet[i].index = index;
1091
+ }
1092
+
1093
+ static unsigned short getIndex(symbolEntry *symbolHashSet, unsigned short vocabularyLength, unsigned int symbol) {
1094
+ unsigned int max = vocabularyLength * 2;
1095
+ unsigned int i = symbol % max;
1096
+ while (symbolHashSet[i].symbol) {
1097
+ if (symbolHashSet[i].symbol == symbol) {
1098
+ return symbolHashSet[i].index;
1099
+ }
1100
+ i = (i + 1) % max;
1101
+ }
1102
+
1103
+ return 0;
1104
+ }
1105
+
1106
+ static unsigned int packGraph(state *start,
1107
+ void *stateMachine,
1108
+ unsigned short vocabularyLength,
1109
+ unsigned short statesLength) {
1110
+ CREATE_QUEUE(state*);
1111
+ unsigned short visited[MAX_STATES] = {0};
1112
+ symbolEntry *symbolHashSet = (symbolEntry *)stateMachine;
1113
+ unsigned short *stateTable = (unsigned short *)(symbolHashSet + vocabularyLength * 2);
1114
+ unsigned char *acceptVector = (unsigned char *)(stateTable + (vocabularyLength * statesLength));
1115
+ unsigned short stateNumber = 1;
1116
+ unsigned short vocabularyNumber = 1;
1117
+ state *currentState = start;
1118
+ visited[currentState->id] = stateNumber;
1119
+ ++stateNumber;
1120
+ while (currentState) {
1121
+ unsigned short targetStateNumber = visited[currentState->id];
1122
+ if (currentState->isAccept) {
1123
+ acceptVector[targetStateNumber - 1] = 1;
1124
+ }
1125
+
1126
+ for (int i = 0; i < currentState->transitionsLength; ++ i) {
1127
+ transition *currentTransition = &currentState->transitions[i];
1128
+
1129
+ if (!getIndex(symbolHashSet, vocabularyLength, currentTransition->symbol)) {
1130
+ setIndex(symbolHashSet, vocabularyLength, currentTransition->symbol, vocabularyNumber);
1131
+ ++vocabularyNumber;
1132
+ }
1133
+
1134
+ if (!visited[currentTransition->next->id]) {
1135
+ visited[currentTransition->next->id] = stateNumber;
1136
+ ++stateNumber;
1137
+ ENQUEUE(currentTransition->next);
1138
+ }
1139
+
1140
+ unsigned short targetSymbolNumber = getIndex(symbolHashSet, vocabularyLength, currentTransition->symbol);
1141
+ stateTable[statesLength * (targetSymbolNumber - 1) + (targetStateNumber - 1)] = visited[currentTransition->next->id];
1142
+ }
1143
+
1144
+ DEQUEUE(&currentState);
1145
+ }
1146
+
1147
+ return RULES_OK;
1148
+ }
1149
+
1150
// Validates the regular expression text in [first, last) by delegating to
// validateGraph and returning its result unchanged.
unsigned int validateRegex(char *first,
                           char *last) {
    // validateGraph takes the cursor by address; the caller's pointer is not affected.
    char *cursor = first;
    unsigned int result = validateGraph(&cursor, last);
    return result;
}
1154
+
1155
+ unsigned int compileRegex(void *tree,
1156
+ char *first,
1157
+ char *last,
1158
+ unsigned short *vocabularyLength,
1159
+ unsigned short *statesLength,
1160
+ unsigned int *regexStateMachineOffset) {
1161
+ state *start;
1162
+ state *end;
1163
+ unsigned short id = 0;
1164
+ unsigned int result = createGraph(&first, last, &id, &start, &end);
1165
+ if (result != RULES_OK) {
1166
+ return result;
1167
+ }
1168
+ end->isAccept = 1;
1169
+ ++start->refCount;
1170
+ result = transformToDFA(start, &id);
1171
+ if (result != RULES_OK) {
1172
+ return result;
1173
+ }
1174
+ result = calculateGraphDimensions(start,
1175
+ vocabularyLength,
1176
+ statesLength);
1177
+ if (result != RULES_OK) {
1178
+ return result;
1179
+ }
1180
+ void *newStateMachine;
1181
+ result = storeRegexStateMachine((ruleset *)tree,
1182
+ *vocabularyLength,
1183
+ *statesLength,
1184
+ &newStateMachine,
1185
+ regexStateMachineOffset);
1186
+ if (result != RULES_OK) {
1187
+ return result;
1188
+ }
1189
+ return packGraph(start,
1190
+ newStateMachine,
1191
+ *vocabularyLength,
1192
+ *statesLength);
1193
+ }
1194
+
1195
+ unsigned char evaluateRegex(void *tree,
1196
+ char *first,
1197
+ unsigned short length,
1198
+ unsigned short vocabularyLength,
1199
+ unsigned short statesLength,
1200
+ unsigned int regexStateMachineOffset) {
1201
+ symbolEntry *symbolHashSet = (symbolEntry *)&((ruleset *)tree)->regexStateMachinePool[regexStateMachineOffset];
1202
+ unsigned short *stateTable = (unsigned short *)(symbolHashSet + vocabularyLength * 2);
1203
+ unsigned char *acceptVector = (unsigned char *)(stateTable + (vocabularyLength * statesLength));
1204
+ unsigned short currentState = 1;
1205
+ char *last = first + length;
1206
+ while (first < last) {
1207
+ unsigned int unicodeSymbol;
1208
+ if (utf8ToUnicode(&first, last, &unicodeSymbol) != REGEX_PARSE_OK) {
1209
+ return 0;
1210
+ } else {
1211
+ unsigned short currentSymbol = getIndex(symbolHashSet, vocabularyLength, unicodeSymbol);
1212
+ if (!currentSymbol) {
1213
+ currentSymbol = getIndex(symbolHashSet, vocabularyLength, REGEX_DOT);
1214
+ if (!currentSymbol) {
1215
+ return 0;
1216
+ }
1217
+
1218
+ currentState = stateTable[statesLength * (currentSymbol - 1) + (currentState - 1)];
1219
+ if (!currentState) {
1220
+ return 0;
1221
+ }
1222
+ } else {
1223
+ currentState = stateTable[statesLength * (currentSymbol - 1) + (currentState - 1)];
1224
+ if (!currentState) {
1225
+ currentSymbol = getIndex(symbolHashSet, vocabularyLength, REGEX_DOT);
1226
+ if (!currentSymbol) {
1227
+ return 0;
1228
+ }
1229
+
1230
+ currentState = stateTable[statesLength * (currentSymbol - 1) + (currentState - 1)];
1231
+ if (!currentState) {
1232
+ return 0;
1233
+ }
1234
+ }
1235
+ }
1236
+ }
1237
+ }
1238
+
1239
+ return acceptVector[currentState - 1];
1240
+ }