ruby_rnv 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/Gemfile +3 -0
- data/ext/rnv/extconf.rb +15 -0
- data/ext/rnv/ruby_rnv.c +742 -0
- data/ext/rnv/src/ary.c +78 -0
- data/ext/rnv/src/ary.h +10 -0
- data/ext/rnv/src/drv.c +472 -0
- data/ext/rnv/src/drv.h +35 -0
- data/ext/rnv/src/er.c +15 -0
- data/ext/rnv/src/er.h +16 -0
- data/ext/rnv/src/erbit.h +14 -0
- data/ext/rnv/src/ht.c +90 -0
- data/ext/rnv/src/ht.h +22 -0
- data/ext/rnv/src/ll.h +43 -0
- data/ext/rnv/src/m.c +60 -0
- data/ext/rnv/src/m.h +10 -0
- data/ext/rnv/src/rn.c +569 -0
- data/ext/rnv/src/rn.h +150 -0
- data/ext/rnv/src/rnc.c +1191 -0
- data/ext/rnv/src/rnc.h +68 -0
- data/ext/rnv/src/rnd.c +436 -0
- data/ext/rnv/src/rnd.h +25 -0
- data/ext/rnv/src/rnl.c +62 -0
- data/ext/rnv/src/rnl.h +18 -0
- data/ext/rnv/src/rnv.c +158 -0
- data/ext/rnv/src/rnv.h +30 -0
- data/ext/rnv/src/rnx.c +153 -0
- data/ext/rnv/src/rnx.h +16 -0
- data/ext/rnv/src/rx.c +749 -0
- data/ext/rnv/src/rx.h +43 -0
- data/ext/rnv/src/rx_cls_ranges.c +126 -0
- data/ext/rnv/src/rx_cls_u.c +262 -0
- data/ext/rnv/src/s.c +103 -0
- data/ext/rnv/src/s.h +32 -0
- data/ext/rnv/src/sc.c +62 -0
- data/ext/rnv/src/sc.h +26 -0
- data/ext/rnv/src/type.h +121 -0
- data/ext/rnv/src/u.c +88 -0
- data/ext/rnv/src/u.h +26 -0
- data/ext/rnv/src/xcl.c +472 -0
- data/ext/rnv/src/xmlc.c +20 -0
- data/ext/rnv/src/xmlc.h +16 -0
- data/ext/rnv/src/xsd.c +789 -0
- data/ext/rnv/src/xsd.h +27 -0
- data/ext/rnv/src/xsd_tm.c +100 -0
- data/ext/rnv/src/xsd_tm.h +15 -0
- data/lib/rnv.rb +2 -0
- data/lib/rnv/ox_sax_document.rb +84 -0
- data/lib/rnv/validator.rb +104 -0
- metadata +175 -0
data/ext/rnv/src/rnx.h
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
/* $Id: rnx.h,v 1.7 2004/02/18 12:53:42 dvd Exp $ */
|
2
|
+
|
3
|
+
#include "type.h"
|
4
|
+
|
5
|
+
#ifndef RNX_H
|
6
|
+
#define RNX_H 1
|
7
|
+
|
8
|
+
extern void rnx_init(rnv_t *rnv, rnx_st_t *rnx_st);
|
9
|
+
extern void rnx_clear(void);
|
10
|
+
|
11
|
+
extern void rnx_expected(rnv_t *rnv, rnx_st_t *rnx_st, int p,int req);
|
12
|
+
|
13
|
+
extern char *rnx_p2str(rnv_t *rnv, int p);
|
14
|
+
extern char *rnx_nc2str(rnv_t *rnv, int nc);
|
15
|
+
|
16
|
+
#endif
|
data/ext/rnv/src/rx.c
ADDED
@@ -0,0 +1,749 @@
|
|
1
|
+
#include "type.h"
|
2
|
+
|
3
|
+
/* $Id: rx.c,v 1.33 2004/02/25 00:00:32 dvd Exp $ */
|
4
|
+
|
5
|
+
#include <string.h> /*strlen,strcpy,strcmp*/
|
6
|
+
#include <assert.h>
#include <limits.h> /*INT_MAX*/
|
7
|
+
#include "u.h" /*u_get,u_strlen*/
|
8
|
+
#include "xmlc.h"
|
9
|
+
#include "m.h"
|
10
|
+
#include "s.h"
|
11
|
+
#include "ht.h"
|
12
|
+
#include "ll.h"
|
13
|
+
#include "er.h"
|
14
|
+
#include "rx.h"
|
15
|
+
#include "erbit.h"
|
16
|
+
|
17
|
+
#define LEN_P RX_LEN_P
|
18
|
+
#define PRIME_P RX_PRIME_P
|
19
|
+
#define LIM_P RX_LIM_P
|
20
|
+
#define LEN_2 RX_LEN_2
|
21
|
+
#define PRIME_2 RX_PRIME_2
|
22
|
+
#define LEN_R RX_LEN_R
|
23
|
+
#define PRIME_R RX_PRIME_R
|
24
|
+
|
25
|
+
#define R_AVG_SIZE 16
|
26
|
+
|
27
|
+
/* it is good to have few patterns when deltas are memoized */
|
28
|
+
#define P_ERROR 0
|
29
|
+
#define P_NOT_ALLOWED 1
|
30
|
+
#define P_EMPTY 2
|
31
|
+
#define P_CHOICE 3
|
32
|
+
#define P_GROUP 4
|
33
|
+
#define P_ONE_OR_MORE 5 /*+*/
|
34
|
+
#define P_EXCEPT 6 /*single-single*/
|
35
|
+
#define P_RANGE 7 /*lower,upper inclusive*/
|
36
|
+
#define P_CLASS 8 /*complement is .-*/
|
37
|
+
#define P_ANY 9
|
38
|
+
#define P_CHAR 10
|
39
|
+
|
40
|
+
#define P_SIZE 3
|
41
|
+
#define P_AVG_SIZE 2
|
42
|
+
|
43
|
+
static int p_size[]={1,1,1,3,3,2,3,3,2,1,2};
|
44
|
+
|
45
|
+
#define P_TYP(i) (rx_st->pattern[i]&0xF)
|
46
|
+
#define P_IS(i,x) (x==P_TYP(i))
|
47
|
+
#define P_CHK(i,x) assert(P_IS(i,x))
|
48
|
+
|
49
|
+
#define P_unop(TYP,p,p1) P_CHK(p,TYP); p1=rx_st->pattern[p+1]
|
50
|
+
#define P_binop(TYP,p,p1,p2) P_unop(TYP,p,p1); p2=rx_st->pattern[p+2]
|
51
|
+
/* Assert that pattern p is a notAllowed node (was checking the undefined name P_NotAllowed). */
#define NotAllowed(p) P_CHK(p,P_NOT_ALLOWED)
|
52
|
+
/* Assert that pattern p is an empty node (was checking the undefined name P_Empty). */
#define Empty(p) P_CHK(p,P_EMPTY)
|
53
|
+
/* Assert that pattern p is an any-character node (was checking the undefined name P_Any). */
#define Any(p) P_CHK(p,P_ANY)
|
54
|
+
#define Choice(p,p1,p2) P_binop(P_CHOICE,p,p1,p2)
|
55
|
+
#define Group(p,p1,p2) P_binop(P_GROUP,p,p1,p2)
|
56
|
+
#define OneOrMore(p,p1) P_unop(P_ONE_OR_MORE,p,p1)
|
57
|
+
#define Except(p,p1,p2) P_binop(P_EXCEPT,p,p1,p2)
|
58
|
+
#define Range(p,cf,cl) P_binop(P_RANGE,p,cf,cl)
|
59
|
+
#define Class(p,cn) P_unop(P_CLASS,p,cn)
|
60
|
+
#define Char(p,c) P_unop(P_CHAR,p,c)
|
61
|
+
|
62
|
+
#define P_NUL 0x100
|
63
|
+
|
64
|
+
/* Mark the pattern currently being built (at i_p) as nullable when x is true.
   Wrapped in do/while(0) so the macro is a single statement and cannot
   capture a following 'else'. */
#define setNullable(x) do { if(x) rx_st->pattern[rx_st->i_p]|=P_NUL; } while(0)
|
65
|
+
#define nullable(p) (rx_st->pattern[p]&P_NUL)
|
66
|
+
|
67
|
+
/* 'compact' does different things in drv and rx.
|
68
|
+
In drv, it limits the size of the table of memoized deltas. In rx, it limits the size
|
69
|
+
of the buffer for cached regular expressions; memoized deltas are always limited by LIM_M,
|
70
|
+
since the whole repertoire of unicode characters can blow up the buffer.
|
71
|
+
*/
|
72
|
+
|
73
|
+
/*
 * Commit the pattern that has been staged at pattern[i_p].
 * If the hash table already holds an equal pattern, its index is returned and
 * the staged cells are left to be overwritten. Otherwise the staged pattern is
 * registered, i_p moves past it, and the pattern array is grown so that at
 * least P_SIZE cells are available for the next staged pattern.
 */
static int accept_p(rx_st_t *rx_st) {
  int found=ht_get(&rx_st->ht_p,rx_st->i_p);
  if(found!=-1) return found;

  found=rx_st->i_p;
  ht_put(&rx_st->ht_p,found);
  rx_st->i_p+=p_size[P_TYP(rx_st->i_p)];
  if(rx_st->i_p+P_SIZE>rx_st->len_p) {
    rx_st->len_p=2*(rx_st->i_p+P_SIZE);
    rx_st->pattern=(int*)m_stretch(rx_st->pattern,rx_st->len_p,rx_st->i_p,sizeof(int));
  }
  return found;
}
|
82
|
+
|
83
|
+
#define P_NEW(x) (rx_st->pattern[rx_st->i_p]=x)
|
84
|
+
|
85
|
+
#define P_newunop(TYP,p1) P_NEW(TYP); rx_st->pattern[rx_st->i_p+1]=p1
|
86
|
+
#define P_newbinop(TYP,p1,p2) P_newunop(TYP,p1); rx_st->pattern[rx_st->i_p+2]=p2
|
87
|
+
/* Stage a notAllowed pattern and commit it; returns its index. */
static int newNotAllowed(rx_st_t *rx_st) {
  rx_st->pattern[rx_st->i_p]=P_NOT_ALLOWED;
  return accept_p(rx_st);
}
|
88
|
+
/* Stage an empty pattern (always nullable) and commit it; returns its index. */
static int newEmpty(rx_st_t *rx_st) {
  rx_st->pattern[rx_st->i_p]=P_EMPTY;
  rx_st->pattern[rx_st->i_p]|=P_NUL;
  return accept_p(rx_st);
}
|
89
|
+
/* Stage an any-character pattern and commit it; returns its index. */
static int newAny(rx_st_t *rx_st) {
  rx_st->pattern[rx_st->i_p]=P_ANY;
  return accept_p(rx_st);
}
|
90
|
+
/* Stage choice(p1,p2); it is nullable when either operand is. Returns the committed index. */
static int newChoice(rx_st_t *rx_st, int p1,int p2) {
  int *cell=rx_st->pattern+rx_st->i_p;
  cell[0]=P_CHOICE;
  cell[1]=p1;
  cell[2]=p2;
  if(nullable(p1)||nullable(p2)) cell[0]|=P_NUL;
  return accept_p(rx_st);
}
|
91
|
+
/* Stage group(p1,p2); it is nullable only when both operands are. Returns the committed index. */
static int newGroup(rx_st_t *rx_st, int p1,int p2) {
  int *cell=rx_st->pattern+rx_st->i_p;
  cell[0]=P_GROUP;
  cell[1]=p1;
  cell[2]=p2;
  if(nullable(p1)&&nullable(p2)) cell[0]|=P_NUL;
  return accept_p(rx_st);
}
|
92
|
+
/* Stage oneOrMore(p1); it inherits nullability from p1. Returns the committed index. */
static int newOneOrMore(rx_st_t *rx_st, int p1) {
  int *cell=rx_st->pattern+rx_st->i_p;
  cell[0]=P_ONE_OR_MORE;
  cell[1]=p1;
  if(nullable(p1)) cell[0]|=P_NUL;
  return accept_p(rx_st);
}
|
93
|
+
/* Stage except(p1,p2) and commit it; returns its index. */
static int newExcept(rx_st_t *rx_st, int p1,int p2) {
  int *cell=rx_st->pattern+rx_st->i_p;
  cell[0]=P_EXCEPT;
  cell[1]=p1;
  cell[2]=p2;
  return accept_p(rx_st);
}
|
94
|
+
/* Stage a character range with bounds cf and cl and commit it; returns its index. */
static int newRange(rx_st_t *rx_st, int cf,int cl) {
  int *cell=rx_st->pattern+rx_st->i_p;
  cell[0]=P_RANGE;
  cell[1]=cf;
  cell[2]=cl;
  return accept_p(rx_st);
}
|
95
|
+
/* Stage a character-class pattern for class number cn and commit it; returns its index. */
static int newClass(rx_st_t *rx_st, int cn) {
  int *cell=rx_st->pattern+rx_st->i_p;
  cell[0]=P_CLASS;
  cell[1]=cn;
  return accept_p(rx_st);
}
|
96
|
+
/* Stage a single-character pattern for c and commit it; returns its index. */
static int newChar(rx_st_t *rx_st, int c) {
  int *cell=rx_st->pattern+rx_st->i_p;
  cell[0]=P_CHAR;
  cell[1]=c;
  return accept_p(rx_st);
}
|
97
|
+
|
98
|
+
/* Simplifying constructor for p+ : empty and notAllowed are returned unchanged. */
static int one_or_more(rx_st_t *rx_st, int p) {
  switch(P_TYP(p)) {
  case P_EMPTY:
  case P_NOT_ALLOWED:
    return p;
  default:
    return newOneOrMore(rx_st, p);
  }
}
|
103
|
+
|
104
|
+
/*
 * Simplifying constructor for the sequence p1 p2:
 * a notAllowed operand makes the whole group notAllowed (p1 is checked first),
 * an empty operand is dropped, otherwise a new group node is built.
 */
static int group(rx_st_t *rx_st, int p1,int p2) {
  int t1=P_TYP(p1),t2=P_TYP(p2);
  if(t1==P_NOT_ALLOWED) return p1;
  if(t2==P_NOT_ALLOWED) return p2;
  if(t1==P_EMPTY) return p2;
  if(t2==P_EMPTY) return p1;
  return newGroup(rx_st, p1,p2);
}
|
111
|
+
|
112
|
+
/*
 * Return 1 when p2 already occurs in the chain of choices rooted at p1:
 * walks down the first operand of each choice node comparing the second
 * operand with p2, and finally compares the last non-choice node itself.
 */
static int samechoice(rx_st_t *rx_st, int p1,int p2) {
  while(P_IS(p1,P_CHOICE)) {
    int left,right; Choice(p1,left,right);
    if(right==p2) return 1;
    p1=left;
  }
  return p1==p2;
}
|
118
|
+
|
119
|
+
/*
 * Simplifying constructor for p1|p2.
 * notAllowed operands are dropped; a choice on the right is folded into the
 * left operand one alternative at a time; an alternative already present in
 * p1 is not added again; an empty alternative is absorbed by a nullable one.
 */
static int choice(rx_st_t *rx_st, int p1,int p2) {
  if(P_IS(p1,P_NOT_ALLOWED)) return p2;
  if(P_IS(p2,P_NOT_ALLOWED)) return p1;
  if(P_IS(p2,P_CHOICE)) {
    /* both operands are read before recursing, since recursion may grow the pattern array */
    int left,right; Choice(p2,left,right);
    return choice(rx_st, choice(rx_st, p1,left),right);
  }
  if(samechoice(rx_st, p1,p2)||(nullable(p1)&&P_IS(p2,P_EMPTY))) return p1;
  if(nullable(p2)&&P_IS(p1,P_EMPTY)) return p2;
  return newChoice(rx_st, p1,p2);
}
|
131
|
+
|
132
|
+
/*
 * Build the pattern for class number cn: 0 yields notAllowed, a positive
 * number the class itself, a negative number "any except class -cn".
 */
static int cls(rx_st_t *rx_st, int cn) {
  int inner;
  if(cn==0) return rx_st->notAllowed;
  if(cn>0) return newClass(rx_st, cn);
  inner=newClass(rx_st, -cn);
  return newExcept(rx_st, rx_st->any,inner);
}
|
137
|
+
|
138
|
+
static int equal_r(void *user, int r1,int r2) {
|
139
|
+
rx_st_t *rx_st = (rx_st_t *)user;
|
140
|
+
return strcmp(rx_st->regex+r1,rx_st->regex+r2)==0;
|
141
|
+
}
|
142
|
+
static int hash_r(void *user, int r) {
|
143
|
+
rx_st_t *rx_st = (rx_st_t *)user;
|
144
|
+
return s_hval(rx_st->regex+r);
|
145
|
+
}
|
146
|
+
|
147
|
+
static int equal_p(void *user, int p1,int p2) {
|
148
|
+
rx_st_t *rx_st = (rx_st_t *)user;
|
149
|
+
int *pp1=rx_st->pattern+p1,*pp2=rx_st->pattern+p2;
|
150
|
+
if(P_TYP(p1)!=P_TYP(p2)) return 0;
|
151
|
+
switch(p_size[P_TYP(p1)]) {
|
152
|
+
case 3: if(pp1[2]!=pp2[2]) return 0;
|
153
|
+
case 2: if(pp1[1]!=pp2[1]) return 0;
|
154
|
+
case 1: return 1;
|
155
|
+
default: assert(0);
|
156
|
+
}
|
157
|
+
return 0;
|
158
|
+
}
|
159
|
+
static int hash_p(void *user, int p) {
|
160
|
+
rx_st_t *rx_st = (rx_st_t *)user;
|
161
|
+
int *pp=rx_st->pattern+p; int h=0;
|
162
|
+
switch(p_size[P_TYP(p)]) {
|
163
|
+
case 1: h=pp[0]&0xF; break;
|
164
|
+
case 2: h=(pp[0]&0xF)|(pp[1]<<4); break;
|
165
|
+
case 3: h=(pp[0]&0xF)|((pp[1]^pp[2])<<4); break;
|
166
|
+
default: assert(0);
|
167
|
+
}
|
168
|
+
return h*PRIME_P;
|
169
|
+
}
|
170
|
+
|
171
|
+
static int equal_2(void *user, int x1,int x2) {
|
172
|
+
rx_st_t *rx_st = (rx_st_t *)user;
|
173
|
+
return rx_st->r2p[x1][0]==rx_st->r2p[x2][0];
|
174
|
+
}
|
175
|
+
static int hash_2(void *user, int x) {
|
176
|
+
rx_st_t *rx_st = (rx_st_t *)user;
|
177
|
+
return rx_st->r2p[x][0]*PRIME_2;
|
178
|
+
}
|
179
|
+
|
180
|
+
/*
 * Copy rx (terminator included) into the regex buffer at offset i_r, growing
 * the buffer first when needed. i_r itself is not advanced here; the number of
 * bytes copied is returned so the caller can advance it.
 */
static int add_r(rx_st_t *rx_st, char *rx) {
  int need=strlen(rx)+1;
  if(rx_st->i_r+need>rx_st->len_r) {
    rx_st->len_r=2*(rx_st->i_r+need);
    rx_st->regex=(char*)m_stretch(rx_st->regex,rx_st->len_r,rx_st->i_r,sizeof(char));
  }
  strcpy(&rx_st->regex[rx_st->i_r],rx);
  return need;
}
|
186
|
+
|
187
|
+
#define ERRPOS
|
188
|
+
|
189
|
+
#define err(msg) (*rnv->verror_handler)(rnv,erno|ERBIT_RX,msg" in \"%s\" at offset %i\n",ap)
|
190
|
+
/*
 * Default reporter for regular-expression errors.
 * Prints a fixed prefix through er_printf, then forwards a message chosen by
 * erno (with ERBIT_RX or-ed into the code) to rnv->verror_handler. The
 * variadic arguments in ap feed the "%s" / "%i" of the message built by err().
 */
void rx_default_verror_handler(rnv_t *rnv, int erno,va_list ap) {
  (*er_printf)("regular expressions: ");
  switch(erno) {
  case RX_ER_BADCH:
    err("bad character");
    break;
  case RX_ER_UNFIN:
    err("unfinished expression");
    break;
  case RX_ER_NOLSQ:
    err("'[' expected");
    break;
  case RX_ER_NORSQ:
    err("']' expected");
    break;
  case RX_ER_NOLCU:
    err("'{' expected");
    break;
  case RX_ER_NORCU:
    err("'}' expected");
    break;
  case RX_ER_NOLPA:
    err("'(' expected");
    break;
  case RX_ER_NORPA:
    err("')' expected");
    break;
  case RX_ER_BADCL:
    err("unknown class");
    break;
  case RX_ER_NODGT:
    err("digit expected");
    break;
  case RX_ER_DNUOB:
    err("reversed bounds");
    break;
  case RX_ER_NOTRC:
    err("range or class expected");
    break;
  default:
    assert(0);
  }
}
|
208
|
+
|
209
|
+
//void (*rx_verror_handler)(int erno,va_list ap)=&rx_default_verror_handler;
|
210
|
+
|
211
|
+
/* Variadic front end: packs the arguments after erno into a va_list and calls the installed rx_verror_handler. */
static void error_handler(rx_st_t *rx_st,int erno,...) {
  va_list ap;
  va_start(ap,erno);
  (*rx_st->rnv->rx_verror_handler)(rx_st->rnv, erno,ap);
  va_end(ap);
}
|
214
|
+
|
215
|
+
#define LEN_M RX_LEN_M
|
216
|
+
#define PRIME_M RX_PRIME_M
|
217
|
+
#define LIM_M RX_LIM_M
|
218
|
+
|
219
|
+
#define M_SIZE 3
|
220
|
+
|
221
|
+
#define M_SET(p) rx_st->memo[rx_st->i_m][M_SIZE-1]=p
|
222
|
+
#define M_RET(m) rx_st->memo[m][M_SIZE-1]
|
223
|
+
|
224
|
+
/*
 * Stage the key (p,c) in the memo slot at i_m and look it up.
 * Returns whatever ht_get reports for the staged key (callers in this file
 * treat -1 as "not present").
 * NOTE(review): ht_deli presumably drops any table entry that still refers to
 * slot i_m before the slot is overwritten -- confirm against ht.c.
 */
static int new_memo(rx_st_t *rx_st, int p,int c) {
  int *slot=rx_st->memo[rx_st->i_m];
  ht_deli(&rx_st->ht_m,rx_st->i_m);
  slot[0]=p;
  slot[1]=c;
  return ht_get(&rx_st->ht_m,rx_st->i_m);
}
|
230
|
+
|
231
|
+
static int equal_m(void *user,int m1,int m2) {
|
232
|
+
rx_st_t *rx_st = (rx_st_t *)user;
|
233
|
+
int *me1=rx_st->memo[m1],*me2=rx_st->memo[m2];
|
234
|
+
return (me1[0]==me2[0])&&(me1[1]==me2[1]);
|
235
|
+
}
|
236
|
+
static int hash_m(void *user,int m) {
|
237
|
+
rx_st_t *rx_st = (rx_st_t *)user;
|
238
|
+
int *me=rx_st->memo[m];
|
239
|
+
return (me[0]^me[1])*PRIME_M;
|
240
|
+
}
|
241
|
+
|
242
|
+
/*
 * Commit the memo entry staged at i_m: any existing table entry with an equal
 * key is removed first, the staged slot is registered, and i_m advances.
 * i_m wraps to 0 once it reaches LIM_M; otherwise the memo array is doubled
 * when i_m reaches its current length.
 */
static void accept_m(rx_st_t *rx_st) {
  if(ht_get(&rx_st->ht_m,rx_st->i_m)!=-1) {
    ht_del(&rx_st->ht_m,rx_st->i_m);
  }
  ht_put(&rx_st->ht_m,rx_st->i_m);
  ++rx_st->i_m;
  if(rx_st->i_m>=LIM_M) rx_st->i_m=0;
  if(rx_st->i_m==rx_st->len_m) {
    rx_st->len_m=rx_st->i_m*2;
    rx_st->memo=(int(*)[M_SIZE])m_stretch(rx_st->memo,rx_st->len_m,rx_st->i_m,sizeof(int[M_SIZE]));
  }
}
|
248
|
+
|
249
|
+
static void windup(rx_st_t *rx_st);
|
250
|
+
/*
 * Initialise a regex state: installs the default error handler on
 * rx_st->rnv (which must already be set by the caller), allocates the
 * pattern, regex-to-pattern, regex text and memo arrays, sets up the four
 * hash tables with rx_st as their user pointer, and creates the base patterns.
 */
void rx_init(rx_st_t *rx_st) {
  rx_st->rnv->rx_verror_handler=&rx_default_verror_handler;

  rx_st->len_p=P_AVG_SIZE*LEN_P;
  rx_st->pattern=(int *)m_alloc(rx_st->len_p,sizeof(int));
  rx_st->len_2=LEN_2;
  rx_st->r2p=(int (*)[2])m_alloc(rx_st->len_2,sizeof(int[2]));
  rx_st->len_r=R_AVG_SIZE*LEN_R;
  rx_st->regex=(char*)m_alloc(rx_st->len_r,sizeof(char));
  rx_st->len_m=LEN_M;
  rx_st->memo=(int (*)[M_SIZE])m_alloc(rx_st->len_m,sizeof(int[M_SIZE]));

  /* the hash/equal callbacks receive this pointer as their 'user' argument */
  rx_st->ht_p.user = rx_st;
  rx_st->ht_2.user = rx_st;
  rx_st->ht_r.user = rx_st;
  rx_st->ht_m.user = rx_st;

  ht_init(&rx_st->ht_p,LEN_P,&hash_p,&equal_p);
  ht_init(&rx_st->ht_2,LEN_2,&hash_2,&equal_2);
  ht_init(&rx_st->ht_r,LEN_R,&hash_r,&equal_r);
  ht_init(&rx_st->ht_m,LEN_M,&hash_m,&equal_m);

  windup(rx_st);
}
|
272
|
+
|
273
|
+
/* Forget all cached regexes, patterns and memo entries, then recreate the base patterns. */
void rx_clear(rx_st_t *rx_st) {
  ht_clear(&rx_st->ht_p);
  ht_clear(&rx_st->ht_2);
  ht_clear(&rx_st->ht_r);
  ht_clear(&rx_st->ht_m);
  windup(rx_st);
}
|
277
|
+
|
278
|
+
/*
 * Reset all fill indices to zero and create the fixed patterns in a fixed
 * order: the error pattern at index 0, then empty, notAllowed and any.
 */
static void windup(rx_st_t *rx_st) {
  rx_st->i_m=0;
  rx_st->i_2=0;
  rx_st->i_r=0;
  rx_st->i_p=0;

  rx_st->pattern[0]=P_ERROR;
  accept_p(rx_st);

  rx_st->empty=newEmpty(rx_st);
  rx_st->notAllowed=newNotAllowed(rx_st);
  rx_st->any=newAny(rx_st);
}
|
283
|
+
|
284
|
+
#define SYM_END 0
|
285
|
+
#define SYM_CLS 1
|
286
|
+
#define SYM_ESC 2
|
287
|
+
#define SYM_CHR 3
|
288
|
+
|
289
|
+
/*
 * Record a parse error. Only the first error of a regex is reported: the
 * handler receives the regex text and the offset of the current read
 * position, computed as the difference of the u_strlen values taken from the
 * start of the regex and from the current position.
 */
static void error(rx_st_t *rx_st, int erno) {
  if(rx_st->errors==0) {
    char *start=rx_st->regex+rx_st->r0;
    error_handler(rx_st, erno,start,u_strlen(start)-u_strlen(rx_st->regex+rx_st->ri));
  }
  ++rx_st->errors;
}
|
293
|
+
|
294
|
+
#include "rx_cls_u.c"
|
295
|
+
|
296
|
+
/*
 * Parse the "{Name}" part that follows \p or \P and return the class number
 * looked up in clstab. Returns 0 after reporting an error when '{' or '}' is
 * missing or the name is unknown. On success ri is left just past the '}'.
 */
static int chclass(rx_st_t *rx_st) {
  int u,cl,end;
  rx_st->ri+=u_get(&u,rx_st->regex+rx_st->ri);
  if(u!='{') {
    /* step back onto the terminator so later reads do not run past it */
    if(u=='\0') --rx_st->ri;
    error(rx_st, RX_ER_NOLCU);
    return 0;
  }
  end=rx_st->ri;
  while(rx_st->regex[end]!='}') {
    if(rx_st->regex[end]=='\0') {
      rx_st->ri=end;
      error(rx_st, RX_ER_NORCU);
      return 0;
    }
    ++end;
  }
  cl=s_ntab(rx_st->regex+rx_st->ri,end-rx_st->ri,clstab,NUM_CLS_U);
  if(cl==NUM_CLS_U) {
    error(rx_st, RX_ER_BADCL);
    cl=0;
  }
  rx_st->ri=end+1;
  return cl;
}
|
312
|
+
|
313
|
+
#define CLS_NL (NUM_CLS_U+1)
|
314
|
+
#define CLS_S (NUM_CLS_U+2)
|
315
|
+
#define CLS_I (NUM_CLS_U+3)
|
316
|
+
#define CLS_C (NUM_CLS_U+4)
|
317
|
+
#define CLS_W (NUM_CLS_U+5)
|
318
|
+
#define NUM_CLS (NUM_CLS_U+6)
|
319
|
+
|
320
|
+
/*
 * Read the next symbol of the regex at ri into sym/val.
 *   SYM_END - end of the string (also after a dangling backslash)
 *   SYM_CLS - a character class; val is the class number, negated for the
 *             complemented forms (\P \S \I \C \D \W and '.')
 *   SYM_ESC - an escaped character; val is its code
 *   SYM_CHR - any other character; val is its code
 */
static void getsym(rx_st_t *rx_st) {
  int u;
  int kind=SYM_CLS,value=0;

  if(rx_st->regex[rx_st->ri]=='\0') {
    rx_st->sym=SYM_END;
    return;
  }
  rx_st->ri+=u_get(&u,rx_st->regex+rx_st->ri);
  if(u!='\\') {
    if(u=='.') {
      rx_st->sym=SYM_CLS; rx_st->val=-CLS_NL;
    } else {
      rx_st->sym=SYM_CHR; rx_st->val=u;
    }
    return;
  }

  /* escape sequence: the character after the backslash selects the meaning */
  rx_st->ri+=u_get(&u,rx_st->regex+rx_st->ri);
  switch(u) {
  case '\0':
    --rx_st->ri;
    error(rx_st, RX_ER_UNFIN);
    rx_st->sym=SYM_END;
    return;
  case 'p':
    rx_st->sym=SYM_CLS; rx_st->val=chclass(rx_st);
    return;
  case 'P':
    rx_st->sym=SYM_CLS; rx_st->val=-chclass(rx_st);
    return;
  case 's': value=CLS_S; break;
  case 'S': value=-CLS_S; break;
  case 'i': value=CLS_I; break;
  case 'I': value=-CLS_I; break;
  case 'c': value=CLS_C; break;
  case 'C': value=-CLS_C; break;
  case 'd': value=CLS_U_Nd; break;
  case 'D': value=-CLS_U_Nd; break;
  case 'w': value=CLS_W; break;
  case 'W': value=-CLS_W; break;
  case 'n': kind=SYM_ESC; value=0xA; break;
  case 'r': kind=SYM_ESC; value=0xD; break;
  case 't': kind=SYM_ESC; value=0x9; break;
  case '\\': case '|': case '.': case '-': case '^': case '?': case '*': case '+':
  case '{': case '}': case '[': case ']': case '(': case ')':
    kind=SYM_ESC; value=u;
    break;
  default:
    /* unknown escape: report it, then treat the character as escaped */
    error(rx_st, RX_ER_BADCH);
    kind=SYM_ESC; value=u;
    break;
  }
  rx_st->sym=kind;
  rx_st->val=value;
}
|
356
|
+
|
357
|
+
/* Expect the plain character v as the current symbol (reporting erno otherwise), then advance in either case. */
static void chk_get(rx_st_t *rx_st, int v,int erno) {
  int matches=(rx_st->sym==SYM_CHR&&rx_st->val==v);
  if(!matches) error(rx_st, erno);
  getsym(rx_st);
}
|
358
|
+
|
359
|
+
|
360
|
+
#define chkrch(val) if((val)=='['||(val)==']'||(val)=='-') error(rx_st, RX_ER_NOTRC)
|
361
|
+
|
362
|
+
/*
 * Parse the members of a character group (single characters, ranges a-b and
 * classes) and return their choice. Parsing stops at an unescaped ']' or '-'
 * (the '-' introducing a subtraction), at "-[" right after a character, or at
 * the end of the regex (reported as a missing ']').
 */
static int chgroup(rx_st_t *rx_st) {
  int p=rx_st->notAllowed,c;
  for(;;) {
    switch(rx_st->sym) {
    case SYM_CHR:
      { chkrch(rx_st->val); }
      /* fallthrough */
    case SYM_ESC:
      c=rx_st->val;
      getsym(rx_st);
      if(!(rx_st->sym==SYM_CHR&&rx_st->val=='-')) {
        p=choice(rx_st, p,newChar(rx_st, c));
        break;
      }
      if(rx_st->regex[rx_st->ri]=='[') {
        /* "c-[" : c is a plain member, the '-' starts a subtraction */
        p=choice(rx_st, p,newChar(rx_st, c));
        return p;
      }
      getsym(rx_st);
      if(rx_st->sym==SYM_CHR||rx_st->sym==SYM_ESC) {
        if(rx_st->sym==SYM_CHR) { chkrch(rx_st->val); }
        p=choice(rx_st, p,newRange(rx_st, c,rx_st->val));
        getsym(rx_st);
      } else {
        error(rx_st, RX_ER_BADCH);
        getsym(rx_st);
      }
      break;
    case SYM_CLS:
      p=choice(rx_st, p,cls(rx_st, rx_st->val));
      getsym(rx_st);
      break;
    case SYM_END:
      error(rx_st, RX_ER_NORSQ);
      return p;
    default:
      assert(0);
    }
    if(rx_st->sym==SYM_CHR&&(rx_st->val==']'||rx_st->val=='-')) return p;
  }
}
|
393
|
+
|
394
|
+
/*
 * Parse the inside of a bracket expression: an optional leading '^' negates
 * the group, and a trailing "-[...]" subtracts a nested bracket expression.
 */
static int chexpr(rx_st_t *rx_st) {
  int p;
  int negated=(rx_st->sym==SYM_CHR&&rx_st->val=='^');
  if(negated) getsym(rx_st);
  p=chgroup(rx_st);
  if(negated) p=newExcept(rx_st, rx_st->any,p);

  if(rx_st->sym==SYM_CHR&&rx_st->val=='-') {
    int minus;
    getsym(rx_st);
    chk_get(rx_st, '[',RX_ER_NOLSQ);
    minus=chexpr(rx_st);
    p=newExcept(rx_st, p,minus);
    chk_get(rx_st, ']',RX_ER_NORSQ);
  }
  return p;
}
|
406
|
+
|
407
|
+
static int expression(rx_st_t *rx_st);
|
408
|
+
/*
 * Parse one atom: a bracket expression, a parenthesised expression, a class,
 * an escaped character or a plain character. A metacharacter in atom position
 * is reported as a bad character and yields pattern 0.
 */
static int atom(rx_st_t *rx_st) {
  int p=0;
  switch(rx_st->sym) {
  case SYM_ESC:
    p=newChar(rx_st, rx_st->val);
    break;
  case SYM_CLS:
    p=cls(rx_st, rx_st->val);
    break;
  case SYM_CHR:
    switch(rx_st->val) {
    case '[':
      getsym(rx_st);
      p=chexpr(rx_st);
      chk_get(rx_st, ']',RX_ER_NORSQ);
      return p;
    case '(':
      getsym(rx_st);
      p=expression(rx_st);
      chk_get(rx_st, ')',RX_ER_NORPA);
      return p;
    case '{': case '?': case '*': case '+': case '|':
    case ')': case ']': case '}':
      error(rx_st, RX_ER_BADCH);
      break;
    default:
      p=newChar(rx_st, rx_st->val);
      break;
    }
    break;
  default:
    error(rx_st, RX_ER_BADCH);
    break;
  }
  /* every remaining path consumes exactly one symbol */
  getsym(rx_st);
  return p;
}
|
426
|
+
|
427
|
+
/*
 * Parse a run of unescaped decimal digits starting at the current symbol and
 * return its value; returns 0 when the current symbol is not a digit. The
 * first non-digit symbol is left as the current symbol.
 *
 * The value saturates at INT_MAX: the previous n*10+m accumulated without a
 * bound, which is signed overflow (undefined behaviour) for long digit runs.
 */
static int number(rx_st_t *rx_st) {
  int n=0;
  while(rx_st->sym==SYM_CHR&&rx_st->val>='0'&&rx_st->val<='9') {
    int digit=rx_st->val-'0'; /* '0'..'9' are contiguous in every C character set */
    if(n>(INT_MAX-digit)/10) n=INT_MAX;
    else n=n*10+digit;
    getsym(rx_st);
  }
  return n;
}
|
450
|
+
|
451
|
+
/*
 * Parse the body of a {n}, {n,} or {n,m} quantifier (the '{' is already
 * consumed, the '}' is left for the caller) and expand it over p0:
 * n mandatory copies, followed by either an optional one-or-more ({n,}) or
 * m-n optional copies ({n,m}). m<n is reported as reversed bounds.
 */
static int quantifier(rx_st_t *rx_st, int p0) {
  int p=rx_st->empty;
  int lower=number(rx_st);
  int count;

  for(count=lower;count--;) p=group(rx_st, p,p0);

  if(rx_st->sym!=SYM_CHR) {
    error(rx_st, RX_ER_NODGT);
    return p;
  }
  if(rx_st->val!=',') return p;

  getsym(rx_st);
  if(rx_st->sym==SYM_CHR && rx_st->val=='}') {
    return group(rx_st, p,choice(rx_st, rx_st->empty,one_or_more(rx_st, p0)));
  }
  count=number(rx_st)-lower;
  if(count<0) {
    error(rx_st, RX_ER_DNUOB);
    count=0;
  }
  while(count--) p=group(rx_st, p,choice(rx_st, rx_st->empty,p0));
  return p;
}
|
468
|
+
|
469
|
+
/*
 * Parse an atom followed by an optional quantifier: {..}, ?, * or +.
 *
 * A quantifier that is not closed by '}' is reported as RX_ER_NORCU
 * ("'}' expected"); the previous code passed RX_ER_NOLCU, which produced the
 * misleading message "'{' expected".
 */
static int piece(rx_st_t *rx_st) {
  int p=atom(rx_st);
  if(rx_st->sym!=SYM_CHR) return p;
  switch(rx_st->val) {
  case '{':
    getsym(rx_st);
    p=quantifier(rx_st, p);
    chk_get(rx_st, '}',RX_ER_NORCU);
    break;
  case '?':
    getsym(rx_st);
    p=choice(rx_st, rx_st->empty,p);
    break;
  case '*':
    getsym(rx_st);
    p=choice(rx_st, rx_st->empty,one_or_more(rx_st, p));
    break;
  case '+':
    getsym(rx_st);
    p=one_or_more(rx_st, p);
    break;
  default:
    break;
  }
  return p;
}
|
483
|
+
|
484
|
+
/* Parse a sequence of pieces up to the end of the regex, an unescaped '|' or an unescaped ')'. */
static int branch(rx_st_t *rx_st) {
  int p=rx_st->empty;
  for(;;) {
    if(rx_st->sym==SYM_END) break;
    if(rx_st->sym==SYM_CHR&&(rx_st->val=='|'||rx_st->val==')')) break;
    p=group(rx_st, p,piece(rx_st));
  }
  return p;
}
|
490
|
+
|
491
|
+
/* Parse one or more branches separated by unescaped '|' and return their choice. */
static int expression(rx_st_t *rx_st) {
  int p=branch(rx_st);
  for(;;) {
    if(!(rx_st->sym==SYM_CHR&&rx_st->val=='|')) return p;
    getsym(rx_st);
    p=choice(rx_st, p,branch(rx_st));
  }
}
|
500
|
+
|
501
|
+
/* Point the parser at the regex stored at offset r, clear the error count and read the first symbol. */
static void bind(rx_st_t *rx_st, int r) {
  rx_st->r0=r;
  rx_st->ri=r;
  rx_st->sym=-1;
  rx_st->errors=0;
  getsym(rx_st);
}
|
505
|
+
|
506
|
+
/*
 * Return the pattern index for regex text rx, compiling it on first use.
 * The text is staged in the regex buffer and looked up; a hit returns the
 * pattern recorded in r2p. On a miss the text is kept, parsed, and the
 * (regex offset, pattern) pair is recorded. When rnv->rx_compact is set and
 * the pattern array has reached P_AVG_SIZE*LIM_P cells, the whole cache is
 * cleared before compiling.
 */
static int compile(rnv_t *rnv, rx_st_t *rx_st, char *rx) {
  int r=0,p=0,d_r;

  d_r=add_r(rx_st, rx);
  r=ht_get(&rx_st->ht_r,rx_st->i_r);
  if(r!=-1) {
    /* stage a probe entry at i_2 so ht_2 can be searched by regex offset */
    rx_st->r2p[rx_st->i_2][0]=r;
    return rx_st->r2p[ht_get(&rx_st->ht_2,rx_st->i_2)][1];
  }

  if(rnv->rx_compact&&rx_st->i_p>=P_AVG_SIZE*LIM_P) {
    rx_clear(rx_st);
    d_r=add_r(rx_st, rx); /* the staged copy was discarded by the clear */
  }
  r=rx_st->i_r;
  ht_put(&rx_st->ht_r,r);
  rx_st->i_r+=d_r;

  bind(rx_st, r);
  p=expression(rx_st);
  if(rx_st->sym!=SYM_END) error(rx_st, RX_ER_BADCH);

  rx_st->r2p[rx_st->i_2][0]=r;
  rx_st->r2p[rx_st->i_2][1]=p;
  ht_put(&rx_st->ht_2,rx_st->i_2);
  ++rx_st->i_2;
  if(rx_st->i_2==rx_st->len_2) {
    rx_st->len_2=2*rx_st->i_2;
    rx_st->r2p=(int(*)[2])m_stretch(rx_st->r2p,rx_st->len_2,rx_st->i_2,sizeof(int[2]));
  }
  return p;
}
|
523
|
+
|
524
|
+
#include "rx_cls_ranges.c"
|
525
|
+
|
526
|
+
static int in_class(int c,int cn) {
|
527
|
+
switch(cn) {
|
528
|
+
case 0: return 0;
|
529
|
+
case CLS_U_C: return in_class(c,CLS_U_Cc)||in_class(c,CLS_U_Cf)||in_class(c,CLS_U_Co);
|
530
|
+
case CLS_U_Cc: return u_in_ranges(c,CcRanges,sizeof(CcRanges)/sizeof(int[2]));
|
531
|
+
case CLS_U_Cf: return u_in_ranges(c,CfRanges,sizeof(CfRanges)/sizeof(int[2]));
|
532
|
+
case CLS_U_Co: return u_in_ranges(c,CoRanges,sizeof(CoRanges)/sizeof(int[2]));
|
533
|
+
case CLS_U_IsAlphabeticPresentationForms: return u_in_ranges(c,IsAlphabeticPresentationFormsRanges,sizeof(IsAlphabeticPresentationFormsRanges)/sizeof(int[2]));
|
534
|
+
case CLS_U_IsArabic: return u_in_ranges(c,IsArabicRanges,sizeof(IsArabicRanges)/sizeof(int[2]));
|
535
|
+
case CLS_U_IsArabicPresentationForms_A: return u_in_ranges(c,IsArabicPresentationForms_ARanges,sizeof(IsArabicPresentationForms_ARanges)/sizeof(int[2]));
|
536
|
+
case CLS_U_IsArabicPresentationForms_B: return u_in_ranges(c,IsArabicPresentationForms_BRanges,sizeof(IsArabicPresentationForms_BRanges)/sizeof(int[2]));
|
537
|
+
case CLS_U_IsArmenian: return u_in_ranges(c,IsArmenianRanges,sizeof(IsArmenianRanges)/sizeof(int[2]));
|
538
|
+
case CLS_U_IsArrows: return u_in_ranges(c,IsArrowsRanges,sizeof(IsArrowsRanges)/sizeof(int[2]));
|
539
|
+
case CLS_U_IsBasicLatin: return u_in_ranges(c,IsBasicLatinRanges,sizeof(IsBasicLatinRanges)/sizeof(int[2]));
|
540
|
+
case CLS_U_IsBengali: return u_in_ranges(c,IsBengaliRanges,sizeof(IsBengaliRanges)/sizeof(int[2]));
|
541
|
+
case CLS_U_IsBlockElements: return u_in_ranges(c,IsBlockElementsRanges,sizeof(IsBlockElementsRanges)/sizeof(int[2]));
|
542
|
+
case CLS_U_IsBopomofo: return u_in_ranges(c,IsBopomofoRanges,sizeof(IsBopomofoRanges)/sizeof(int[2]));
|
543
|
+
case CLS_U_IsBopomofoExtended: return u_in_ranges(c,IsBopomofoExtendedRanges,sizeof(IsBopomofoExtendedRanges)/sizeof(int[2]));
|
544
|
+
case CLS_U_IsBoxDrawing: return u_in_ranges(c,IsBoxDrawingRanges,sizeof(IsBoxDrawingRanges)/sizeof(int[2]));
|
545
|
+
case CLS_U_IsBraillePatterns: return u_in_ranges(c,IsBraillePatternsRanges,sizeof(IsBraillePatternsRanges)/sizeof(int[2]));
|
546
|
+
case CLS_U_IsByzantineMusicalSymbols: return u_in_ranges(c,IsByzantineMusicalSymbolsRanges,sizeof(IsByzantineMusicalSymbolsRanges)/sizeof(int[2]));
|
547
|
+
case CLS_U_IsCJKCompatibility: return u_in_ranges(c,IsCJKCompatibilityRanges,sizeof(IsCJKCompatibilityRanges)/sizeof(int[2]));
|
548
|
+
case CLS_U_IsCJKCompatibilityForms: return u_in_ranges(c,IsCJKCompatibilityFormsRanges,sizeof(IsCJKCompatibilityFormsRanges)/sizeof(int[2]));
|
549
|
+
case CLS_U_IsCJKCompatibilityIdeographs: return u_in_ranges(c,IsCJKCompatibilityIdeographsRanges,sizeof(IsCJKCompatibilityIdeographsRanges)/sizeof(int[2]));
|
550
|
+
case CLS_U_IsCJKCompatibilityIdeographsSupplement: return u_in_ranges(c,IsCJKCompatibilityIdeographsSupplementRanges,sizeof(IsCJKCompatibilityIdeographsSupplementRanges)/sizeof(int[2]));
|
551
|
+
case CLS_U_IsCJKRadicalsSupplement: return u_in_ranges(c,IsCJKRadicalsSupplementRanges,sizeof(IsCJKRadicalsSupplementRanges)/sizeof(int[2]));
|
552
|
+
case CLS_U_IsCJKSymbolsandPunctuation: return u_in_ranges(c,IsCJKSymbolsandPunctuationRanges,sizeof(IsCJKSymbolsandPunctuationRanges)/sizeof(int[2]));
|
553
|
+
case CLS_U_IsCJKUnifiedIdeographs: return u_in_ranges(c,IsCJKUnifiedIdeographsRanges,sizeof(IsCJKUnifiedIdeographsRanges)/sizeof(int[2]));
|
554
|
+
case CLS_U_IsCJKUnifiedIdeographsExtensionA: return u_in_ranges(c,IsCJKUnifiedIdeographsExtensionARanges,sizeof(IsCJKUnifiedIdeographsExtensionARanges)/sizeof(int[2]));
|
555
|
+
case CLS_U_IsCJKUnifiedIdeographsExtensionB: return u_in_ranges(c,IsCJKUnifiedIdeographsExtensionBRanges,sizeof(IsCJKUnifiedIdeographsExtensionBRanges)/sizeof(int[2]));
|
556
|
+
case CLS_U_IsCherokee: return u_in_ranges(c,IsCherokeeRanges,sizeof(IsCherokeeRanges)/sizeof(int[2]));
|
557
|
+
case CLS_U_IsCombiningDiacriticalMarks: return u_in_ranges(c,IsCombiningDiacriticalMarksRanges,sizeof(IsCombiningDiacriticalMarksRanges)/sizeof(int[2]));
|
558
|
+
case CLS_U_IsCombiningHalfMarks: return u_in_ranges(c,IsCombiningHalfMarksRanges,sizeof(IsCombiningHalfMarksRanges)/sizeof(int[2]));
|
559
|
+
case CLS_U_IsCombiningMarksforSymbols: return u_in_ranges(c,IsCombiningMarksforSymbolsRanges,sizeof(IsCombiningMarksforSymbolsRanges)/sizeof(int[2]));
|
560
|
+
case CLS_U_IsControlPictures: return u_in_ranges(c,IsControlPicturesRanges,sizeof(IsControlPicturesRanges)/sizeof(int[2]));
|
561
|
+
case CLS_U_IsCurrencySymbols: return u_in_ranges(c,IsCurrencySymbolsRanges,sizeof(IsCurrencySymbolsRanges)/sizeof(int[2]));
|
562
|
+
case CLS_U_IsCyrillic: return u_in_ranges(c,IsCyrillicRanges,sizeof(IsCyrillicRanges)/sizeof(int[2]));
|
563
|
+
case CLS_U_IsDeseret: return u_in_ranges(c,IsDeseretRanges,sizeof(IsDeseretRanges)/sizeof(int[2]));
|
564
|
+
case CLS_U_IsDevanagari: return u_in_ranges(c,IsDevanagariRanges,sizeof(IsDevanagariRanges)/sizeof(int[2]));
|
565
|
+
case CLS_U_IsDingbats: return u_in_ranges(c,IsDingbatsRanges,sizeof(IsDingbatsRanges)/sizeof(int[2]));
|
566
|
+
case CLS_U_IsEnclosedAlphanumerics: return u_in_ranges(c,IsEnclosedAlphanumericsRanges,sizeof(IsEnclosedAlphanumericsRanges)/sizeof(int[2]));
|
567
|
+
case CLS_U_IsEnclosedCJKLettersandMonths: return u_in_ranges(c,IsEnclosedCJKLettersandMonthsRanges,sizeof(IsEnclosedCJKLettersandMonthsRanges)/sizeof(int[2]));
|
568
|
+
case CLS_U_IsEthiopic: return u_in_ranges(c,IsEthiopicRanges,sizeof(IsEthiopicRanges)/sizeof(int[2]));
|
569
|
+
case CLS_U_IsGeneralPunctuation: return u_in_ranges(c,IsGeneralPunctuationRanges,sizeof(IsGeneralPunctuationRanges)/sizeof(int[2]));
|
570
|
+
case CLS_U_IsGeometricShapes: return u_in_ranges(c,IsGeometricShapesRanges,sizeof(IsGeometricShapesRanges)/sizeof(int[2]));
|
571
|
+
case CLS_U_IsGeorgian: return u_in_ranges(c,IsGeorgianRanges,sizeof(IsGeorgianRanges)/sizeof(int[2]));
|
572
|
+
case CLS_U_IsGothic: return u_in_ranges(c,IsGothicRanges,sizeof(IsGothicRanges)/sizeof(int[2]));
|
573
|
+
case CLS_U_IsGreek: return u_in_ranges(c,IsGreekRanges,sizeof(IsGreekRanges)/sizeof(int[2]));
|
574
|
+
case CLS_U_IsGreekExtended: return u_in_ranges(c,IsGreekExtendedRanges,sizeof(IsGreekExtendedRanges)/sizeof(int[2]));
|
575
|
+
case CLS_U_IsGujarati: return u_in_ranges(c,IsGujaratiRanges,sizeof(IsGujaratiRanges)/sizeof(int[2]));
|
576
|
+
case CLS_U_IsGurmukhi: return u_in_ranges(c,IsGurmukhiRanges,sizeof(IsGurmukhiRanges)/sizeof(int[2]));
|
577
|
+
case CLS_U_IsHalfwidthandFullwidthForms: return u_in_ranges(c,IsHalfwidthandFullwidthFormsRanges,sizeof(IsHalfwidthandFullwidthFormsRanges)/sizeof(int[2]));
|
578
|
+
case CLS_U_IsHangulCompatibilityJamo: return u_in_ranges(c,IsHangulCompatibilityJamoRanges,sizeof(IsHangulCompatibilityJamoRanges)/sizeof(int[2]));
|
579
|
+
case CLS_U_IsHangulJamo: return u_in_ranges(c,IsHangulJamoRanges,sizeof(IsHangulJamoRanges)/sizeof(int[2]));
|
580
|
+
case CLS_U_IsHangulSyllables: return u_in_ranges(c,IsHangulSyllablesRanges,sizeof(IsHangulSyllablesRanges)/sizeof(int[2]));
|
581
|
+
case CLS_U_IsHebrew: return u_in_ranges(c,IsHebrewRanges,sizeof(IsHebrewRanges)/sizeof(int[2]));
|
582
|
+
case CLS_U_IsHiragana: return u_in_ranges(c,IsHiraganaRanges,sizeof(IsHiraganaRanges)/sizeof(int[2]));
|
583
|
+
case CLS_U_IsIPAExtensions: return u_in_ranges(c,IsIPAExtensionsRanges,sizeof(IsIPAExtensionsRanges)/sizeof(int[2]));
|
584
|
+
case CLS_U_IsIdeographicDescriptionCharacters: return u_in_ranges(c,IsIdeographicDescriptionCharactersRanges,sizeof(IsIdeographicDescriptionCharactersRanges)/sizeof(int[2]));
|
585
|
+
case CLS_U_IsKanbun: return u_in_ranges(c,IsKanbunRanges,sizeof(IsKanbunRanges)/sizeof(int[2]));
|
586
|
+
case CLS_U_IsKangxiRadicals: return u_in_ranges(c,IsKangxiRadicalsRanges,sizeof(IsKangxiRadicalsRanges)/sizeof(int[2]));
|
587
|
+
case CLS_U_IsKannada: return u_in_ranges(c,IsKannadaRanges,sizeof(IsKannadaRanges)/sizeof(int[2]));
|
588
|
+
case CLS_U_IsKatakana: return u_in_ranges(c,IsKatakanaRanges,sizeof(IsKatakanaRanges)/sizeof(int[2]));
|
589
|
+
case CLS_U_IsKhmer: return u_in_ranges(c,IsKhmerRanges,sizeof(IsKhmerRanges)/sizeof(int[2]));
|
590
|
+
case CLS_U_IsLao: return u_in_ranges(c,IsLaoRanges,sizeof(IsLaoRanges)/sizeof(int[2]));
|
591
|
+
case CLS_U_IsLatin_1Supplement: return u_in_ranges(c,IsLatin_1SupplementRanges,sizeof(IsLatin_1SupplementRanges)/sizeof(int[2]));
|
592
|
+
case CLS_U_IsLatinExtended_A: return u_in_ranges(c,IsLatinExtended_ARanges,sizeof(IsLatinExtended_ARanges)/sizeof(int[2]));
|
593
|
+
case CLS_U_IsLatinExtended_B: return u_in_ranges(c,IsLatinExtended_BRanges,sizeof(IsLatinExtended_BRanges)/sizeof(int[2]));
|
594
|
+
case CLS_U_IsLatinExtendedAdditional: return u_in_ranges(c,IsLatinExtendedAdditionalRanges,sizeof(IsLatinExtendedAdditionalRanges)/sizeof(int[2]));
|
595
|
+
case CLS_U_IsLetterlikeSymbols: return u_in_ranges(c,IsLetterlikeSymbolsRanges,sizeof(IsLetterlikeSymbolsRanges)/sizeof(int[2]));
|
596
|
+
case CLS_U_IsMalayalam: return u_in_ranges(c,IsMalayalamRanges,sizeof(IsMalayalamRanges)/sizeof(int[2]));
|
597
|
+
case CLS_U_IsMathematicalAlphanumericSymbols: return u_in_ranges(c,IsMathematicalAlphanumericSymbolsRanges,sizeof(IsMathematicalAlphanumericSymbolsRanges)/sizeof(int[2]));
|
598
|
+
case CLS_U_IsMathematicalOperators: return u_in_ranges(c,IsMathematicalOperatorsRanges,sizeof(IsMathematicalOperatorsRanges)/sizeof(int[2]));
|
599
|
+
case CLS_U_IsMiscellaneousSymbols: return u_in_ranges(c,IsMiscellaneousSymbolsRanges,sizeof(IsMiscellaneousSymbolsRanges)/sizeof(int[2]));
|
600
|
+
case CLS_U_IsMiscellaneousTechnical: return u_in_ranges(c,IsMiscellaneousTechnicalRanges,sizeof(IsMiscellaneousTechnicalRanges)/sizeof(int[2]));
|
601
|
+
case CLS_U_IsMongolian: return u_in_ranges(c,IsMongolianRanges,sizeof(IsMongolianRanges)/sizeof(int[2]));
|
602
|
+
case CLS_U_IsMusicalSymbols: return u_in_ranges(c,IsMusicalSymbolsRanges,sizeof(IsMusicalSymbolsRanges)/sizeof(int[2]));
|
603
|
+
case CLS_U_IsMyanmar: return u_in_ranges(c,IsMyanmarRanges,sizeof(IsMyanmarRanges)/sizeof(int[2]));
|
604
|
+
case CLS_U_IsNumberForms: return u_in_ranges(c,IsNumberFormsRanges,sizeof(IsNumberFormsRanges)/sizeof(int[2]));
|
605
|
+
case CLS_U_IsOgham: return u_in_ranges(c,IsOghamRanges,sizeof(IsOghamRanges)/sizeof(int[2]));
|
606
|
+
case CLS_U_IsOldItalic: return u_in_ranges(c,IsOldItalicRanges,sizeof(IsOldItalicRanges)/sizeof(int[2]));
|
607
|
+
case CLS_U_IsOpticalCharacterRecognition: return u_in_ranges(c,IsOpticalCharacterRecognitionRanges,sizeof(IsOpticalCharacterRecognitionRanges)/sizeof(int[2]));
|
608
|
+
case CLS_U_IsOriya: return u_in_ranges(c,IsOriyaRanges,sizeof(IsOriyaRanges)/sizeof(int[2]));
|
609
|
+
case CLS_U_IsPrivateUse: return u_in_ranges(c,IsPrivateUseRanges,sizeof(IsPrivateUseRanges)/sizeof(int[2]));
|
610
|
+
case CLS_U_IsRunic: return u_in_ranges(c,IsRunicRanges,sizeof(IsRunicRanges)/sizeof(int[2]));
|
611
|
+
case CLS_U_IsSinhala: return u_in_ranges(c,IsSinhalaRanges,sizeof(IsSinhalaRanges)/sizeof(int[2]));
|
612
|
+
case CLS_U_IsSmallFormVariants: return u_in_ranges(c,IsSmallFormVariantsRanges,sizeof(IsSmallFormVariantsRanges)/sizeof(int[2]));
|
613
|
+
case CLS_U_IsSpacingModifierLetters: return u_in_ranges(c,IsSpacingModifierLettersRanges,sizeof(IsSpacingModifierLettersRanges)/sizeof(int[2]));
|
614
|
+
case CLS_U_IsSpecials: return u_in_ranges(c,IsSpecialsRanges,sizeof(IsSpecialsRanges)/sizeof(int[2]));
|
615
|
+
case CLS_U_IsSuperscriptsandSubscripts: return u_in_ranges(c,IsSuperscriptsandSubscriptsRanges,sizeof(IsSuperscriptsandSubscriptsRanges)/sizeof(int[2]));
|
616
|
+
case CLS_U_IsSyriac: return u_in_ranges(c,IsSyriacRanges,sizeof(IsSyriacRanges)/sizeof(int[2]));
|
617
|
+
case CLS_U_IsTags: return u_in_ranges(c,IsTagsRanges,sizeof(IsTagsRanges)/sizeof(int[2]));
|
618
|
+
case CLS_U_IsTamil: return u_in_ranges(c,IsTamilRanges,sizeof(IsTamilRanges)/sizeof(int[2]));
|
619
|
+
case CLS_U_IsTelugu: return u_in_ranges(c,IsTeluguRanges,sizeof(IsTeluguRanges)/sizeof(int[2]));
|
620
|
+
case CLS_U_IsThaana: return u_in_ranges(c,IsThaanaRanges,sizeof(IsThaanaRanges)/sizeof(int[2]));
|
621
|
+
case CLS_U_IsThai: return u_in_ranges(c,IsThaiRanges,sizeof(IsThaiRanges)/sizeof(int[2]));
|
622
|
+
case CLS_U_IsTibetan: return u_in_ranges(c,IsTibetanRanges,sizeof(IsTibetanRanges)/sizeof(int[2]));
|
623
|
+
case CLS_U_IsUnifiedCanadianAboriginalSyllabics: return u_in_ranges(c,IsUnifiedCanadianAboriginalSyllabicsRanges,sizeof(IsUnifiedCanadianAboriginalSyllabicsRanges)/sizeof(int[2]));
|
624
|
+
case CLS_U_IsYiRadicals: return u_in_ranges(c,IsYiRadicalsRanges,sizeof(IsYiRadicalsRanges)/sizeof(int[2]));
|
625
|
+
case CLS_U_IsYiSyllables: return u_in_ranges(c,IsYiSyllablesRanges,sizeof(IsYiSyllablesRanges)/sizeof(int[2]));
|
626
|
+
case CLS_U_L: return in_class(c,CLS_U_Ll)||in_class(c,CLS_U_Lm)||in_class(c,CLS_U_Lo)||in_class(c,CLS_U_Lt)||in_class(c,CLS_U_Lu);
|
627
|
+
case CLS_U_Ll: return u_in_ranges(c,LlRanges,sizeof(LlRanges)/sizeof(int[2]));
|
628
|
+
case CLS_U_Lm: return u_in_ranges(c,LmRanges,sizeof(LmRanges)/sizeof(int[2]));
|
629
|
+
case CLS_U_Lo: return u_in_ranges(c,LoRanges,sizeof(LoRanges)/sizeof(int[2]));
|
630
|
+
case CLS_U_Lt: return u_in_ranges(c,LtRanges,sizeof(LtRanges)/sizeof(int[2]));
|
631
|
+
case CLS_U_Lu: return u_in_ranges(c,LuRanges,sizeof(LuRanges)/sizeof(int[2]));
|
632
|
+
case CLS_U_M: return in_class(c,CLS_U_Mc)||in_class(c,CLS_U_Me)||in_class(c,CLS_U_Mn);
|
633
|
+
case CLS_U_Mc: return u_in_ranges(c,McRanges,sizeof(McRanges)/sizeof(int[2]));
|
634
|
+
case CLS_U_Me: return u_in_ranges(c,MeRanges,sizeof(MeRanges)/sizeof(int[2]));
|
635
|
+
case CLS_U_Mn: return u_in_ranges(c,MnRanges,sizeof(MnRanges)/sizeof(int[2]));
|
636
|
+
case CLS_U_N: return in_class(c,CLS_U_Nd)||in_class(c,CLS_U_Nl)||in_class(c,CLS_U_No);
|
637
|
+
case CLS_U_Nd: return u_in_ranges(c,NdRanges,sizeof(NdRanges)/sizeof(int[2]));
|
638
|
+
case CLS_U_Nl: return u_in_ranges(c,NlRanges,sizeof(NlRanges)/sizeof(int[2]));
|
639
|
+
case CLS_U_No: return u_in_ranges(c,NoRanges,sizeof(NoRanges)/sizeof(int[2]));
|
640
|
+
case CLS_U_P: return in_class(c,CLS_U_Pc)||in_class(c,CLS_U_Pd)||in_class(c,CLS_U_Pe)||in_class(c,CLS_U_Pf)||in_class(c,CLS_U_Pi)||in_class(c,CLS_U_Po)||in_class(c,CLS_U_Ps);
|
641
|
+
case CLS_U_Pc: return u_in_ranges(c,PcRanges,sizeof(PcRanges)/sizeof(int[2]));
|
642
|
+
case CLS_U_Pd: return u_in_ranges(c,PdRanges,sizeof(PdRanges)/sizeof(int[2]));
|
643
|
+
case CLS_U_Pe: return u_in_ranges(c,PeRanges,sizeof(PeRanges)/sizeof(int[2]));
|
644
|
+
case CLS_U_Pf: return u_in_ranges(c,PfRanges,sizeof(PfRanges)/sizeof(int[2]));
|
645
|
+
case CLS_U_Pi: return u_in_ranges(c,PiRanges,sizeof(PiRanges)/sizeof(int[2]));
|
646
|
+
case CLS_U_Po: return u_in_ranges(c,PoRanges,sizeof(PoRanges)/sizeof(int[2]));
|
647
|
+
case CLS_U_Ps: return u_in_ranges(c,PsRanges,sizeof(PsRanges)/sizeof(int[2]));
|
648
|
+
case CLS_U_S: return in_class(c,CLS_U_Sc)||in_class(c,CLS_U_Sk)||in_class(c,CLS_U_Sm)||in_class(c,CLS_U_So);
|
649
|
+
case CLS_U_Sc: return u_in_ranges(c,ScRanges,sizeof(ScRanges)/sizeof(int[2]));
|
650
|
+
case CLS_U_Sk: return u_in_ranges(c,SkRanges,sizeof(SkRanges)/sizeof(int[2]));
|
651
|
+
case CLS_U_Sm: return u_in_ranges(c,SmRanges,sizeof(SmRanges)/sizeof(int[2]));
|
652
|
+
case CLS_U_So: return u_in_ranges(c,SoRanges,sizeof(SoRanges)/sizeof(int[2]));
|
653
|
+
case CLS_U_Z: return in_class(c,CLS_U_Zl)||in_class(c,CLS_U_Zp)||in_class(c,CLS_U_Zs);
|
654
|
+
case CLS_U_Zl: return u_in_ranges(c,ZlRanges,sizeof(ZlRanges)/sizeof(int[2]));
|
655
|
+
case CLS_U_Zp: return u_in_ranges(c,ZpRanges,sizeof(ZpRanges)/sizeof(int[2]));
|
656
|
+
case CLS_U_Zs: return u_in_ranges(c,ZsRanges,sizeof(ZsRanges)/sizeof(int[2]));
|
657
|
+
case CLS_NL: return c=='\n'||c=='\r';
|
658
|
+
case CLS_S: return xmlc_white_space(c);
|
659
|
+
case CLS_I: return xmlc_base_char(c)||xmlc_ideographic(c)||c=='_'||c==':';
|
660
|
+
case CLS_C: return in_class(c,CLS_I)||xmlc_digit(c)||xmlc_combining_char(c)||xmlc_extender(c)||c=='.'||c=='-';
|
661
|
+
case CLS_W: return !(in_class(c,CLS_U_P)||in_class(c,CLS_U_Z)||in_class(c,CLS_U_C));
|
662
|
+
default: assert(0);
|
663
|
+
}
|
664
|
+
return 0;
|
665
|
+
}
|
666
|
+
|
667
|
+
|
668
|
+
/*
 * Derivative of pattern p with respect to character c: the pattern that
 * must match the rest of the input once c has been consumed.
 *
 * Results are memoized.  new_memo() is asked first; any answer other than
 * -1 is a memo index whose stored result is returned through M_RET().
 * Otherwise the derivative is computed per pattern type, stored with
 * M_SET() and committed with accept_m().
 *
 * p must not be an error pattern (asserted).
 */
static int drv(rx_st_t *rx_st, int p,int c) {
  int lhs,rhs,c_lo,c_hi,c_one,cls,d,hit;

  assert(!P_IS(p,P_ERROR));

  hit=new_memo(rx_st, p,c);
  if(hit!=-1) return M_RET(hit);

  switch(P_TYP(p)) {
  case P_NOT_ALLOWED:
  case P_EMPTY:
    d=rx_st->notAllowed;
    break;

  case P_CHOICE:
    Choice(p,lhs,rhs);
    d=choice(rx_st, drv(rx_st, lhs,c),drv(rx_st, rhs,c));
    break;

  case P_GROUP:
    Group(p,lhs,rhs);
    d=group(rx_st, drv(rx_st, lhs,c),rhs);
    /* a nullable head may be skipped, so c can also start the tail */
    if(nullable(lhs)) d=choice(rx_st, d,drv(rx_st, rhs,c));
    break;

  case P_ONE_OR_MORE:
    OneOrMore(p,lhs);
    d=group(rx_st, drv(rx_st, lhs,c),choice(rx_st, rx_st->empty,p));
    break;

  case P_EXCEPT:
    Except(p,lhs,rhs);
    if(nullable(drv(rx_st, lhs,c))&&!nullable(drv(rx_st, rhs,c))) {
      d=rx_st->empty;
    } else {
      d=rx_st->notAllowed;
    }
    break;

  case P_RANGE:
    Range(p,c_lo,c_hi);
    if(c_lo<=c&&c<=c_hi) d=rx_st->empty; else d=rx_st->notAllowed;
    break;

  case P_CLASS:
    Class(p,cls);
    if(in_class(c,cls)) d=rx_st->empty; else d=rx_st->notAllowed;
    break;

  case P_ANY:
    d=rx_st->empty;
    break;

  case P_CHAR:
    Char(p,c_one);
    if(c==c_one) d=rx_st->empty; else d=rx_st->notAllowed;
    break;

  default:
    d=0;
    assert(0);
  }

  /* NOTE(review): new_memo() is called a second time here, presumably to
   * re-stage the (p,c) key after the recursive calls above -- confirm. */
  new_memo(rx_st, p,c);
  M_SET(d);
  accept_m(rx_st);
  return d;
}
|
689
|
+
|
690
|
+
/*
 * Compile rx and report whether it is well-formed.
 * Returns 1 when rx_st->errors is zero after compile(), 0 otherwise.
 *
 * NOTE(review): errors is cleared in bind(), which compile() only reaches
 * for a text it has not seen before; for an already-known text the flag
 * appears to be whatever the most recent fresh compilation left -- confirm.
 */
int rx_check(rnv_t *rnv, rx_st_t *rx_st, char *rx) {
  (void)compile(rnv, rx_st, rx);
  if(rx_st->errors) return 0;
  return 1;
}
|
691
|
+
|
692
|
+
/*
 * Match the n bytes at s against the regular expression rx.
 *
 * The expression is compiled (or fetched from the cache) and then the
 * derivative of the pattern is taken for each decoded character in turn.
 *
 * Returns 1 when the whole input is accepted, 0 when it is rejected, when
 * rx did not compile cleanly (rx_st->errors non-zero), or when the input
 * cannot be decoded inside [s, s+n).
 *
 * The decode step is validated before it is applied.  The loop used to add
 * whatever u_get() returned and stop only on s==end, so a step of zero
 * never terminated and a step that jumped over end kept reading past the
 * buffer.  A step that is not positive, or that would cross end, is now
 * treated as a failed match.
 */
int rx_match(rnv_t *rnv, rx_st_t *rx_st, char *rx,char *s,int n) {
  int p=compile(rnv, rx_st, rx);
  char *end;
  int u,step;

  if(rx_st->errors) return 0;

  end=s+n;
  for(;;) {
    if(p==rx_st->notAllowed) return 0;
    if(s>=end) return nullable(p);
    step=u_get(&u,s);
    if(step<=0||step>end-s) return 0;  /* undecodable or truncated character */
    s+=step;
    p=drv(rx_st, p,u);
  }
}
|
705
|
+
|
706
|
+
/*
 * Like rx_match(), but every character for which xmlc_white_space() is true
 * is replaced by a plain ' ' before it is fed to the pattern.
 *
 * Returns 1 when the whole input is accepted, 0 when it is rejected, when
 * rx did not compile cleanly (rx_st->errors non-zero), or when the input
 * cannot be decoded inside [s, s+n).
 *
 * The decode step is validated before it is applied.  The loop used to add
 * whatever u_get() returned and stop only on s==end, so a step of zero
 * never terminated and a step that jumped over end kept reading past the
 * buffer.  A step that is not positive, or that would cross end, is now
 * treated as a failed match.
 */
int rx_rmatch(rnv_t *rnv, rx_st_t *rx_st, char *rx,char *s,int n) {
  int p=compile(rnv, rx_st, rx);
  char *end;
  int u,step;

  if(rx_st->errors) return 0;

  end=s+n;
  for(;;) {
    if(p==rx_st->notAllowed) return 0;
    if(s>=end) return nullable(p);
    step=u_get(&u,s);
    if(step<=0||step>end-s) return 0;  /* undecodable or truncated character */
    s+=step;
    if(xmlc_white_space(u)) u=' ';
    p=drv(rx_st, p,u);
  }
}
|
720
|
+
|
721
|
+
/*
 * Match the n bytes at s against rx with white space collapsed:
 * leading white space is skipped, and each later run of white space is
 * presented to the pattern as a single ' '.
 *
 * Returns 1 on a match, 0 on a mismatch, when rx did not compile cleanly
 * (rx_st->errors non-zero), or when the input cannot be decoded inside
 * [s, s+n).
 *
 * Every decode step is validated before it is applied.  The loops used to
 * add whatever u_get() returned and stop only on s==end, so a step of zero
 * never terminated and a step that jumped over end kept reading past the
 * buffer.  A step that is not positive, or that would cross end, is now
 * treated as a failed match.  Control flow is otherwise unchanged.
 */
int rx_cmatch(rnv_t *rnv, rx_st_t *rx_st, char *rx,char *s,int n) {
  int p=compile(rnv, rx_st, rx);
  char *end;
  int u,step;

  if(rx_st->errors) return 0;
  end=s+n;

SKIP_SPACE:
  /* skip a run of white space; stop on the first other character */
  for(;;) {
    if(s>=end) return nullable(p);
    step=u_get(&u,s);
    if(step<=0||step>end-s) return 0;
    s+=step;
    if(!xmlc_white_space(u)) break;
  }

  for(;;) {
    if(p==rx_st->notAllowed) return 0;

    if(xmlc_white_space(u)) {
      u=' ';
      p=drv(rx_st, p,u);
      if(p!=rx_st->notAllowed) goto SKIP_SPACE;

      /* The pattern does not take the space: succeed only if nothing but
       * white space remains.
       * NOTE(review): this path returns 1 without checking that the pattern
       * in effect before the space was nullable -- confirm intended. */
      for(;;) {
        if(s>=end) return 1;
        step=u_get(&u,s);
        if(step<=0||step>end-s) return 0;
        s+=step;
        if(!xmlc_white_space(u)) return 0;
      }
    }

    p=drv(rx_st, p,u);
    if(s>=end) goto SKIP_SPACE;          /* lands on the end-of-input check */
    step=u_get(&u,s);
    if(step<=0||step>end-s) return 0;
    s+=step;
  }
}
|
749
|
+
|