zig-pug 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +346 -0
  3. package/binding.c +375 -0
  4. package/binding.gyp +28 -0
  5. package/common.gypi +5 -0
  6. package/include/zigpug.h +135 -0
  7. package/index.js +205 -0
  8. package/package.json +87 -0
  9. package/vendor/mujs/COPYING +16 -0
  10. package/vendor/mujs/README +50 -0
  11. package/vendor/mujs/astnames.h +92 -0
  12. package/vendor/mujs/jsarray.c +832 -0
  13. package/vendor/mujs/jsboolean.c +38 -0
  14. package/vendor/mujs/jsbuiltin.c +249 -0
  15. package/vendor/mujs/jscompile.c +1428 -0
  16. package/vendor/mujs/jsdate.c +861 -0
  17. package/vendor/mujs/jsdtoa.c +749 -0
  18. package/vendor/mujs/jserror.c +139 -0
  19. package/vendor/mujs/jsfunction.c +231 -0
  20. package/vendor/mujs/jsgc.c +284 -0
  21. package/vendor/mujs/jsi.h +870 -0
  22. package/vendor/mujs/jsintern.c +137 -0
  23. package/vendor/mujs/jslex.c +878 -0
  24. package/vendor/mujs/jsmath.c +194 -0
  25. package/vendor/mujs/jsnumber.c +198 -0
  26. package/vendor/mujs/jsobject.c +560 -0
  27. package/vendor/mujs/json.c +422 -0
  28. package/vendor/mujs/jsparse.c +1065 -0
  29. package/vendor/mujs/jsproperty.c +341 -0
  30. package/vendor/mujs/jsregexp.c +232 -0
  31. package/vendor/mujs/jsrepr.c +285 -0
  32. package/vendor/mujs/jsrun.c +2096 -0
  33. package/vendor/mujs/jsstate.c +334 -0
  34. package/vendor/mujs/jsstring.c +852 -0
  35. package/vendor/mujs/jsvalue.c +708 -0
  36. package/vendor/mujs/libmujs.a +0 -0
  37. package/vendor/mujs/main.c +396 -0
  38. package/vendor/mujs/mujs.h +253 -0
  39. package/vendor/mujs/one.c +25 -0
  40. package/vendor/mujs/opnames.h +85 -0
  41. package/vendor/mujs/pp.c +980 -0
  42. package/vendor/mujs/regexp.c +1277 -0
  43. package/vendor/mujs/regexp.h +46 -0
  44. package/vendor/mujs/utf.c +305 -0
  45. package/vendor/mujs/utf.h +52 -0
  46. package/vendor/mujs/utfdata.h +2209 -0
@@ -0,0 +1,1277 @@
1
+ #include <stdlib.h>
2
+ #include <stdio.h>
3
+ #include <string.h>
4
+ #include <setjmp.h>
5
+ #include <limits.h>
6
+
7
+ #include "regexp.h"
8
+ #include "utf.h"
9
+
10
+ #define emit regemit
11
+ #define next regnext
12
+ #define accept regaccept
13
+
14
+ #define nelem(a) (int)(sizeof (a) / sizeof (a)[0])
15
+
16
+ #define REPINF 255
17
+ #ifndef REG_MAXPROG
18
+ #define REG_MAXPROG (32 << 10)
19
+ #endif
20
+ #ifndef REG_MAXREC
21
+ #define REG_MAXREC 1024
22
+ #endif
23
+ #ifndef REG_MAXSPAN
24
+ #define REG_MAXSPAN 64
25
+ #endif
26
+ #ifndef REG_MAXCLASS
27
+ #define REG_MAXCLASS 128
28
+ #endif
29
+
30
+ typedef struct Reclass Reclass;
31
+ typedef struct Renode Renode;
32
+ typedef struct Reinst Reinst;
33
+ typedef struct Rethread Rethread;
34
+
35
+ struct Reclass {
36
+ Rune *end;
37
+ Rune spans[REG_MAXSPAN];
38
+ };
39
+
40
+ struct Reprog {
41
+ Reinst *start, *end;
42
+ Reclass *cclass;
43
+ int flags;
44
+ int nsub;
45
+ };
46
+
47
+ struct cstate {
48
+ Reprog *prog;
49
+ Renode *pstart, *pend;
50
+
51
+ const char *source;
52
+ int ncclass;
53
+ int nsub;
54
+ Renode *sub[REG_MAXSUB];
55
+
56
+ int lookahead;
57
+ Rune yychar;
58
+ Reclass *yycc;
59
+ int yymin, yymax;
60
+
61
+ const char *error;
62
+ jmp_buf kaboom;
63
+
64
+ Reclass cclass[REG_MAXCLASS];
65
+ };
66
+
67
+ static void die(struct cstate *g, const char *message)
68
+ {
69
+ g->error = message;
70
+ longjmp(g->kaboom, 1);
71
+ }
72
+
73
+ static int canon(Rune c)
74
+ {
75
+ Rune u = toupperrune(c);
76
+ if (c >= 128 && u < 128)
77
+ return c;
78
+ return u;
79
+ }
80
+
81
+ /* Scan */
82
+
83
+ enum {
84
+ L_CHAR = 256,
85
+ L_CCLASS, /* character class */
86
+ L_NCCLASS, /* negative character class */
87
+ L_NC, /* "(?:" no capture */
88
+ L_PLA, /* "(?=" positive lookahead */
89
+ L_NLA, /* "(?!" negative lookahead */
90
+ L_WORD, /* "\b" word boundary */
91
+ L_NWORD, /* "\B" non-word boundary */
92
+ L_REF, /* "\1" back-reference */
93
+ L_COUNT, /* {M,N} */
94
+ };
95
+
96
+ static int hex(struct cstate *g, int c)
97
+ {
98
+ if (c >= '0' && c <= '9') return c - '0';
99
+ if (c >= 'a' && c <= 'f') return c - 'a' + 0xA;
100
+ if (c >= 'A' && c <= 'F') return c - 'A' + 0xA;
101
+ die(g, "invalid escape sequence");
102
+ return 0;
103
+ }
104
+
105
+ static int dec(struct cstate *g, int c)
106
+ {
107
+ if (c >= '0' && c <= '9') return c - '0';
108
+ die(g, "invalid quantifier");
109
+ return 0;
110
+ }
111
+
112
+ #define ESCAPES "BbDdSsWw^$\\.*+?()[]{}|-0123456789"
113
+
114
+ static int isunicodeletter(int c)
115
+ {
116
+ return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || isalpharune(c);
117
+ }
118
+
119
+ static int nextrune(struct cstate *g)
120
+ {
121
+ if (!*g->source) {
122
+ g->yychar = EOF;
123
+ return 0;
124
+ }
125
+ g->source += chartorune(&g->yychar, g->source);
126
+ if (g->yychar == '\\') {
127
+ if (!*g->source)
128
+ die(g, "unterminated escape sequence");
129
+ g->source += chartorune(&g->yychar, g->source);
130
+ switch (g->yychar) {
131
+ case 'f': g->yychar = '\f'; return 0;
132
+ case 'n': g->yychar = '\n'; return 0;
133
+ case 'r': g->yychar = '\r'; return 0;
134
+ case 't': g->yychar = '\t'; return 0;
135
+ case 'v': g->yychar = '\v'; return 0;
136
+ case 'c':
137
+ if (!g->source[0])
138
+ die(g, "unterminated escape sequence");
139
+ g->yychar = (*g->source++) & 31;
140
+ return 0;
141
+ case 'x':
142
+ if (!g->source[0] || !g->source[1])
143
+ die(g, "unterminated escape sequence");
144
+ g->yychar = hex(g, *g->source++) << 4;
145
+ g->yychar += hex(g, *g->source++);
146
+ if (g->yychar == 0) {
147
+ g->yychar = '0';
148
+ return 1;
149
+ }
150
+ return 1;
151
+ case 'u':
152
+ if (!g->source[0] || !g->source[1] || !g->source[2] || !g->source[3])
153
+ die(g, "unterminated escape sequence");
154
+ g->yychar = hex(g, *g->source++) << 12;
155
+ g->yychar += hex(g, *g->source++) << 8;
156
+ g->yychar += hex(g, *g->source++) << 4;
157
+ g->yychar += hex(g, *g->source++);
158
+ if (g->yychar == 0) {
159
+ g->yychar = '0';
160
+ return 1;
161
+ }
162
+ return 1;
163
+ case 0:
164
+ g->yychar = '0';
165
+ return 1;
166
+ }
167
+ if (strchr(ESCAPES, g->yychar))
168
+ return 1;
169
+ if (isunicodeletter(g->yychar) || g->yychar == '_') /* check identity escape */
170
+ die(g, "invalid escape character");
171
+ return 0;
172
+ }
173
+ return 0;
174
+ }
175
+
176
+ static int lexcount(struct cstate *g)
177
+ {
178
+ g->yychar = *g->source++;
179
+
180
+ g->yymin = dec(g, g->yychar);
181
+ g->yychar = *g->source++;
182
+ while (g->yychar != ',' && g->yychar != '}') {
183
+ g->yymin = g->yymin * 10 + dec(g, g->yychar);
184
+ g->yychar = *g->source++;
185
+ if (g->yymin >= REPINF)
186
+ die(g, "numeric overflow");
187
+ }
188
+
189
+ if (g->yychar == ',') {
190
+ g->yychar = *g->source++;
191
+ if (g->yychar == '}') {
192
+ g->yymax = REPINF;
193
+ } else {
194
+ g->yymax = dec(g, g->yychar);
195
+ g->yychar = *g->source++;
196
+ while (g->yychar != '}') {
197
+ g->yymax = g->yymax * 10 + dec(g, g->yychar);
198
+ g->yychar = *g->source++;
199
+ if (g->yymax >= REPINF)
200
+ die(g, "numeric overflow");
201
+ }
202
+ }
203
+ } else {
204
+ g->yymax = g->yymin;
205
+ }
206
+
207
+ return L_COUNT;
208
+ }
209
+
210
+ static void newcclass(struct cstate *g)
211
+ {
212
+ if (g->ncclass >= REG_MAXCLASS)
213
+ die(g, "too many character classes");
214
+ g->yycc = g->cclass + g->ncclass++;
215
+ g->yycc->end = g->yycc->spans;
216
+ }
217
+
218
+ static void addrange(struct cstate *g, Rune a, Rune b)
219
+ {
220
+ Reclass *cc = g->yycc;
221
+ Rune *p;
222
+
223
+ if (a > b)
224
+ die(g, "invalid character class range");
225
+
226
+ /* extend existing ranges if they overlap */
227
+ for (p = cc->spans; p < cc->end; p += 2) {
228
+ /* completely inside old range */
229
+ if (a >= p[0] && b <= p[1])
230
+ return;
231
+
232
+ /* completely swallows old range */
233
+ if (a < p[0] && b >= p[1]) {
234
+ p[0] = a;
235
+ p[1] = b;
236
+ return;
237
+ }
238
+
239
+ /* extend at start */
240
+ if (b >= p[0] - 1 && b <= p[1] && a < p[0]) {
241
+ p[0] = a;
242
+ return;
243
+ }
244
+
245
+ /* extend at end */
246
+ if (a >= p[0] && a <= p[1] + 1 && b > p[1]) {
247
+ p[1] = b;
248
+ return;
249
+ }
250
+ }
251
+
252
+ if (cc->end + 2 >= cc->spans + nelem(cc->spans))
253
+ die(g, "too many character class ranges");
254
+ *cc->end++ = a;
255
+ *cc->end++ = b;
256
+ }
257
+
258
+ static void addranges_d(struct cstate *g)
259
+ {
260
+ addrange(g, '0', '9');
261
+ }
262
+
263
+ static void addranges_D(struct cstate *g)
264
+ {
265
+ addrange(g, 0, '0'-1);
266
+ addrange(g, '9'+1, 0xFFFF);
267
+ }
268
+
269
+ static void addranges_s(struct cstate *g)
270
+ {
271
+ addrange(g, 0x9, 0xD);
272
+ addrange(g, 0x20, 0x20);
273
+ addrange(g, 0xA0, 0xA0);
274
+ addrange(g, 0x2028, 0x2029);
275
+ addrange(g, 0xFEFF, 0xFEFF);
276
+ }
277
+
278
+ static void addranges_S(struct cstate *g)
279
+ {
280
+ addrange(g, 0, 0x9-1);
281
+ addrange(g, 0xD+1, 0x20-1);
282
+ addrange(g, 0x20+1, 0xA0-1);
283
+ addrange(g, 0xA0+1, 0x2028-1);
284
+ addrange(g, 0x2029+1, 0xFEFF-1);
285
+ addrange(g, 0xFEFF+1, 0xFFFF);
286
+ }
287
+
288
+ static void addranges_w(struct cstate *g)
289
+ {
290
+ addrange(g, '0', '9');
291
+ addrange(g, 'A', 'Z');
292
+ addrange(g, '_', '_');
293
+ addrange(g, 'a', 'z');
294
+ }
295
+
296
+ static void addranges_W(struct cstate *g)
297
+ {
298
+ addrange(g, 0, '0'-1);
299
+ addrange(g, '9'+1, 'A'-1);
300
+ addrange(g, 'Z'+1, '_'-1);
301
+ addrange(g, '_'+1, 'a'-1);
302
+ addrange(g, 'z'+1, 0xFFFF);
303
+ }
304
+
305
+ static int lexclass(struct cstate *g)
306
+ {
307
+ int type = L_CCLASS;
308
+ int quoted, havesave, havedash;
309
+ Rune save = 0;
310
+
311
+ newcclass(g);
312
+
313
+ quoted = nextrune(g);
314
+ if (!quoted && g->yychar == '^') {
315
+ type = L_NCCLASS;
316
+ quoted = nextrune(g);
317
+ }
318
+
319
+ havesave = havedash = 0;
320
+ for (;;) {
321
+ if (g->yychar == EOF)
322
+ die(g, "unterminated character class");
323
+ if (!quoted && g->yychar == ']')
324
+ break;
325
+
326
+ if (!quoted && g->yychar == '-') {
327
+ if (havesave) {
328
+ if (havedash) {
329
+ addrange(g, save, '-');
330
+ havesave = havedash = 0;
331
+ } else {
332
+ havedash = 1;
333
+ }
334
+ } else {
335
+ save = '-';
336
+ havesave = 1;
337
+ }
338
+ } else if (quoted && strchr("DSWdsw", g->yychar)) {
339
+ if (havesave) {
340
+ addrange(g, save, save);
341
+ if (havedash)
342
+ addrange(g, '-', '-');
343
+ }
344
+ switch (g->yychar) {
345
+ case 'd': addranges_d(g); break;
346
+ case 's': addranges_s(g); break;
347
+ case 'w': addranges_w(g); break;
348
+ case 'D': addranges_D(g); break;
349
+ case 'S': addranges_S(g); break;
350
+ case 'W': addranges_W(g); break;
351
+ }
352
+ havesave = havedash = 0;
353
+ } else {
354
+ if (quoted) {
355
+ if (g->yychar == 'b')
356
+ g->yychar = '\b';
357
+ else if (g->yychar == '0')
358
+ g->yychar = 0;
359
+ /* else identity escape */
360
+ }
361
+ if (havesave) {
362
+ if (havedash) {
363
+ addrange(g, save, g->yychar);
364
+ havesave = havedash = 0;
365
+ } else {
366
+ addrange(g, save, save);
367
+ save = g->yychar;
368
+ }
369
+ } else {
370
+ save = g->yychar;
371
+ havesave = 1;
372
+ }
373
+ }
374
+
375
+ quoted = nextrune(g);
376
+ }
377
+
378
+ if (havesave) {
379
+ addrange(g, save, save);
380
+ if (havedash)
381
+ addrange(g, '-', '-');
382
+ }
383
+
384
+ return type;
385
+ }
386
+
387
+ static int lex(struct cstate *g)
388
+ {
389
+ int quoted = nextrune(g);
390
+ if (quoted) {
391
+ switch (g->yychar) {
392
+ case 'b': return L_WORD;
393
+ case 'B': return L_NWORD;
394
+ case 'd': newcclass(g); addranges_d(g); return L_CCLASS;
395
+ case 's': newcclass(g); addranges_s(g); return L_CCLASS;
396
+ case 'w': newcclass(g); addranges_w(g); return L_CCLASS;
397
+ case 'D': newcclass(g); addranges_d(g); return L_NCCLASS;
398
+ case 'S': newcclass(g); addranges_s(g); return L_NCCLASS;
399
+ case 'W': newcclass(g); addranges_w(g); return L_NCCLASS;
400
+ case '0': g->yychar = 0; return L_CHAR;
401
+ }
402
+ if (g->yychar >= '0' && g->yychar <= '9') {
403
+ g->yychar -= '0';
404
+ if (*g->source >= '0' && *g->source <= '9')
405
+ g->yychar = g->yychar * 10 + *g->source++ - '0';
406
+ return L_REF;
407
+ }
408
+ return L_CHAR;
409
+ }
410
+
411
+ switch (g->yychar) {
412
+ case EOF:
413
+ case '$': case ')': case '*': case '+':
414
+ case '.': case '?': case '^': case '|':
415
+ return g->yychar;
416
+ }
417
+
418
+ if (g->yychar == '{')
419
+ return lexcount(g);
420
+ if (g->yychar == '[')
421
+ return lexclass(g);
422
+ if (g->yychar == '(') {
423
+ if (g->source[0] == '?') {
424
+ if (g->source[1] == ':') {
425
+ g->source += 2;
426
+ return L_NC;
427
+ }
428
+ if (g->source[1] == '=') {
429
+ g->source += 2;
430
+ return L_PLA;
431
+ }
432
+ if (g->source[1] == '!') {
433
+ g->source += 2;
434
+ return L_NLA;
435
+ }
436
+ }
437
+ return '(';
438
+ }
439
+
440
+ return L_CHAR;
441
+ }
442
+
443
+ /* Parse */
444
+
445
+ enum {
446
+ P_CAT, P_ALT, P_REP,
447
+ P_BOL, P_EOL, P_WORD, P_NWORD,
448
+ P_PAR, P_PLA, P_NLA,
449
+ P_ANY, P_CHAR, P_CCLASS, P_NCCLASS,
450
+ P_REF,
451
+ };
452
+
453
+ struct Renode {
454
+ unsigned char type;
455
+ unsigned char ng, m, n;
456
+ Rune c;
457
+ int cc;
458
+ Renode *x;
459
+ Renode *y;
460
+ };
461
+
462
+ static Renode *newnode(struct cstate *g, int type)
463
+ {
464
+ Renode *node = g->pend++;
465
+ node->type = type;
466
+ node->cc = -1;
467
+ node->c = 0;
468
+ node->ng = 0;
469
+ node->m = 0;
470
+ node->n = 0;
471
+ node->x = node->y = NULL;
472
+ return node;
473
+ }
474
+
475
+ static int empty(Renode *node)
476
+ {
477
+ if (!node) return 1;
478
+ switch (node->type) {
479
+ default: return 1;
480
+ case P_CAT: return empty(node->x) && empty(node->y);
481
+ case P_ALT: return empty(node->x) || empty(node->y);
482
+ case P_REP: return empty(node->x) || node->m == 0;
483
+ case P_PAR: return empty(node->x);
484
+ case P_REF: return empty(node->x);
485
+ case P_ANY: case P_CHAR: case P_CCLASS: case P_NCCLASS: return 0;
486
+ }
487
+ }
488
+
489
+ static Renode *newrep(struct cstate *g, Renode *atom, int ng, int min, int max)
490
+ {
491
+ Renode *rep = newnode(g, P_REP);
492
+ if (max == REPINF && empty(atom))
493
+ die(g, "infinite loop matching the empty string");
494
+ rep->ng = ng;
495
+ rep->m = min;
496
+ rep->n = max;
497
+ rep->x = atom;
498
+ return rep;
499
+ }
500
+
501
+ static void next(struct cstate *g)
502
+ {
503
+ g->lookahead = lex(g);
504
+ }
505
+
506
+ static int accept(struct cstate *g, int t)
507
+ {
508
+ if (g->lookahead == t) {
509
+ next(g);
510
+ return 1;
511
+ }
512
+ return 0;
513
+ }
514
+
515
+ static Renode *parsealt(struct cstate *g);
516
+
517
+ static Renode *parseatom(struct cstate *g)
518
+ {
519
+ Renode *atom;
520
+ if (g->lookahead == L_CHAR) {
521
+ atom = newnode(g, P_CHAR);
522
+ atom->c = g->yychar;
523
+ next(g);
524
+ return atom;
525
+ }
526
+ if (g->lookahead == L_CCLASS) {
527
+ atom = newnode(g, P_CCLASS);
528
+ atom->cc = g->yycc - g->cclass;
529
+ next(g);
530
+ return atom;
531
+ }
532
+ if (g->lookahead == L_NCCLASS) {
533
+ atom = newnode(g, P_NCCLASS);
534
+ atom->cc = g->yycc - g->cclass;
535
+ next(g);
536
+ return atom;
537
+ }
538
+ if (g->lookahead == L_REF) {
539
+ atom = newnode(g, P_REF);
540
+ if (g->yychar == 0 || g->yychar >= g->nsub || !g->sub[g->yychar])
541
+ die(g, "invalid back-reference");
542
+ atom->n = g->yychar;
543
+ atom->x = g->sub[g->yychar];
544
+ next(g);
545
+ return atom;
546
+ }
547
+ if (accept(g, '.'))
548
+ return newnode(g, P_ANY);
549
+ if (accept(g, '(')) {
550
+ atom = newnode(g, P_PAR);
551
+ if (g->nsub == REG_MAXSUB)
552
+ die(g, "too many captures");
553
+ atom->n = g->nsub++;
554
+ atom->x = parsealt(g);
555
+ g->sub[atom->n] = atom;
556
+ if (!accept(g, ')'))
557
+ die(g, "unmatched '('");
558
+ return atom;
559
+ }
560
+ if (accept(g, L_NC)) {
561
+ atom = parsealt(g);
562
+ if (!accept(g, ')'))
563
+ die(g, "unmatched '('");
564
+ return atom;
565
+ }
566
+ if (accept(g, L_PLA)) {
567
+ atom = newnode(g, P_PLA);
568
+ atom->x = parsealt(g);
569
+ if (!accept(g, ')'))
570
+ die(g, "unmatched '('");
571
+ return atom;
572
+ }
573
+ if (accept(g, L_NLA)) {
574
+ atom = newnode(g, P_NLA);
575
+ atom->x = parsealt(g);
576
+ if (!accept(g, ')'))
577
+ die(g, "unmatched '('");
578
+ return atom;
579
+ }
580
+ die(g, "syntax error");
581
+ return NULL;
582
+ }
583
+
584
+ static Renode *parserep(struct cstate *g)
585
+ {
586
+ Renode *atom;
587
+
588
+ if (accept(g, '^')) return newnode(g, P_BOL);
589
+ if (accept(g, '$')) return newnode(g, P_EOL);
590
+ if (accept(g, L_WORD)) return newnode(g, P_WORD);
591
+ if (accept(g, L_NWORD)) return newnode(g, P_NWORD);
592
+
593
+ atom = parseatom(g);
594
+ if (g->lookahead == L_COUNT) {
595
+ int min = g->yymin, max = g->yymax;
596
+ next(g);
597
+ if (max < min)
598
+ die(g, "invalid quantifier");
599
+ return newrep(g, atom, accept(g, '?'), min, max);
600
+ }
601
+ if (accept(g, '*')) return newrep(g, atom, accept(g, '?'), 0, REPINF);
602
+ if (accept(g, '+')) return newrep(g, atom, accept(g, '?'), 1, REPINF);
603
+ if (accept(g, '?')) return newrep(g, atom, accept(g, '?'), 0, 1);
604
+ return atom;
605
+ }
606
+
607
+ static Renode *parsecat(struct cstate *g)
608
+ {
609
+ Renode *cat, *head, **tail;
610
+ if (g->lookahead != EOF && g->lookahead != '|' && g->lookahead != ')') {
611
+ /* Build a right-leaning tree by splicing in new 'cat' at the tail. */
612
+ head = parserep(g);
613
+ tail = &head;
614
+ while (g->lookahead != EOF && g->lookahead != '|' && g->lookahead != ')') {
615
+ cat = newnode(g, P_CAT);
616
+ cat->x = *tail;
617
+ cat->y = parserep(g);
618
+ *tail = cat;
619
+ tail = &cat->y;
620
+ }
621
+ return head;
622
+ }
623
+ return NULL;
624
+ }
625
+
626
+ static Renode *parsealt(struct cstate *g)
627
+ {
628
+ Renode *alt, *x;
629
+ alt = parsecat(g);
630
+ while (accept(g, '|')) {
631
+ x = alt;
632
+ alt = newnode(g, P_ALT);
633
+ alt->x = x;
634
+ alt->y = parsecat(g);
635
+ }
636
+ return alt;
637
+ }
638
+
639
+ /* Compile */
640
+
641
+ enum {
642
+ I_END, I_JUMP, I_SPLIT, I_PLA, I_NLA,
643
+ I_ANYNL, I_ANY, I_CHAR, I_CCLASS, I_NCCLASS, I_REF,
644
+ I_BOL, I_EOL, I_WORD, I_NWORD,
645
+ I_LPAR, I_RPAR
646
+ };
647
+
648
+ struct Reinst {
649
+ unsigned char opcode;
650
+ unsigned char n;
651
+ Rune c;
652
+ Reclass *cc;
653
+ Reinst *x;
654
+ Reinst *y;
655
+ };
656
+
657
+ static int count(struct cstate *g, Renode *node, int depth)
658
+ {
659
+ int min, max, n;
660
+ if (!node) return 0;
661
+ if (++depth > REG_MAXREC) die(g, "stack overflow");
662
+ switch (node->type) {
663
+ default: return 1;
664
+ case P_CAT: return count(g, node->x, depth) + count(g, node->y, depth);
665
+ case P_ALT: return count(g, node->x, depth) + count(g, node->y, depth) + 2;
666
+ case P_REP:
667
+ min = node->m;
668
+ max = node->n;
669
+ if (min == max) n = count(g, node->x, depth) * min;
670
+ else if (max < REPINF) n = count(g, node->x, depth) * max + (max - min);
671
+ else n = count(g, node->x, depth) * (min + 1) + 2;
672
+ if (n < 0 || n > REG_MAXPROG) die(g, "program too large");
673
+ return n;
674
+ case P_PAR: return count(g, node->x, depth) + 2;
675
+ case P_PLA: return count(g, node->x, depth) + 2;
676
+ case P_NLA: return count(g, node->x, depth) + 2;
677
+ }
678
+ }
679
+
680
+ static Reinst *emit(Reprog *prog, int opcode)
681
+ {
682
+ Reinst *inst = prog->end++;
683
+ inst->opcode = opcode;
684
+ inst->n = 0;
685
+ inst->c = 0;
686
+ inst->cc = NULL;
687
+ inst->x = inst->y = NULL;
688
+ return inst;
689
+ }
690
+
691
+ static void compile(Reprog *prog, Renode *node)
692
+ {
693
+ Reinst *inst, *split, *jump;
694
+ int i;
695
+
696
+ loop:
697
+ if (!node)
698
+ return;
699
+
700
+ switch (node->type) {
701
+ case P_CAT:
702
+ compile(prog, node->x);
703
+ node = node->y;
704
+ goto loop;
705
+
706
+ case P_ALT:
707
+ split = emit(prog, I_SPLIT);
708
+ compile(prog, node->x);
709
+ jump = emit(prog, I_JUMP);
710
+ compile(prog, node->y);
711
+ split->x = split + 1;
712
+ split->y = jump + 1;
713
+ jump->x = prog->end;
714
+ break;
715
+
716
+ case P_REP:
717
+ inst = NULL; /* silence compiler warning. assert(node->m > 0). */
718
+ for (i = 0; i < node->m; ++i) {
719
+ inst = prog->end;
720
+ compile(prog, node->x);
721
+ }
722
+ if (node->m == node->n)
723
+ break;
724
+ if (node->n < REPINF) {
725
+ for (i = node->m; i < node->n; ++i) {
726
+ split = emit(prog, I_SPLIT);
727
+ compile(prog, node->x);
728
+ if (node->ng) {
729
+ split->y = split + 1;
730
+ split->x = prog->end;
731
+ } else {
732
+ split->x = split + 1;
733
+ split->y = prog->end;
734
+ }
735
+ }
736
+ } else if (node->m == 0) {
737
+ split = emit(prog, I_SPLIT);
738
+ compile(prog, node->x);
739
+ jump = emit(prog, I_JUMP);
740
+ if (node->ng) {
741
+ split->y = split + 1;
742
+ split->x = prog->end;
743
+ } else {
744
+ split->x = split + 1;
745
+ split->y = prog->end;
746
+ }
747
+ jump->x = split;
748
+ } else {
749
+ split = emit(prog, I_SPLIT);
750
+ if (node->ng) {
751
+ split->y = inst;
752
+ split->x = prog->end;
753
+ } else {
754
+ split->x = inst;
755
+ split->y = prog->end;
756
+ }
757
+ }
758
+ break;
759
+
760
+ case P_BOL: emit(prog, I_BOL); break;
761
+ case P_EOL: emit(prog, I_EOL); break;
762
+ case P_WORD: emit(prog, I_WORD); break;
763
+ case P_NWORD: emit(prog, I_NWORD); break;
764
+
765
+ case P_PAR:
766
+ inst = emit(prog, I_LPAR);
767
+ inst->n = node->n;
768
+ compile(prog, node->x);
769
+ inst = emit(prog, I_RPAR);
770
+ inst->n = node->n;
771
+ break;
772
+ case P_PLA:
773
+ split = emit(prog, I_PLA);
774
+ compile(prog, node->x);
775
+ emit(prog, I_END);
776
+ split->x = split + 1;
777
+ split->y = prog->end;
778
+ break;
779
+ case P_NLA:
780
+ split = emit(prog, I_NLA);
781
+ compile(prog, node->x);
782
+ emit(prog, I_END);
783
+ split->x = split + 1;
784
+ split->y = prog->end;
785
+ break;
786
+
787
+ case P_ANY:
788
+ emit(prog, I_ANY);
789
+ break;
790
+ case P_CHAR:
791
+ inst = emit(prog, I_CHAR);
792
+ inst->c = (prog->flags & REG_ICASE) ? canon(node->c) : node->c;
793
+ break;
794
+ case P_CCLASS:
795
+ inst = emit(prog, I_CCLASS);
796
+ inst->cc = prog->cclass + node->cc;
797
+ break;
798
+ case P_NCCLASS:
799
+ inst = emit(prog, I_NCCLASS);
800
+ inst->cc = prog->cclass + node->cc;
801
+ break;
802
+ case P_REF:
803
+ inst = emit(prog, I_REF);
804
+ inst->n = node->n;
805
+ break;
806
+ }
807
+ }
808
+
809
+ #ifdef TEST
810
+ static void dumpnode(struct cstate *g, Renode *node)
811
+ {
812
+ Rune *p;
813
+ Reclass *cc;
814
+ if (!node) { printf("Empty"); return; }
815
+ switch (node->type) {
816
+ case P_CAT: printf("Cat("); dumpnode(g, node->x); printf(", "); dumpnode(g, node->y); printf(")"); break;
817
+ case P_ALT: printf("Alt("); dumpnode(g, node->x); printf(", "); dumpnode(g, node->y); printf(")"); break;
818
+ case P_REP:
819
+ printf(node->ng ? "NgRep(%d,%d," : "Rep(%d,%d,", node->m, node->n);
820
+ dumpnode(g, node->x);
821
+ printf(")");
822
+ break;
823
+ case P_BOL: printf("Bol"); break;
824
+ case P_EOL: printf("Eol"); break;
825
+ case P_WORD: printf("Word"); break;
826
+ case P_NWORD: printf("NotWord"); break;
827
+ case P_PAR: printf("Par(%d,", node->n); dumpnode(g, node->x); printf(")"); break;
828
+ case P_PLA: printf("PLA("); dumpnode(g, node->x); printf(")"); break;
829
+ case P_NLA: printf("NLA("); dumpnode(g, node->x); printf(")"); break;
830
+ case P_ANY: printf("Any"); break;
831
+ case P_CHAR: printf("Char(%c)", node->c); break;
832
+ case P_CCLASS:
833
+ printf("Class(");
834
+ cc = g->cclass + node->cc;
835
+ for (p = cc->spans; p < cc->end; p += 2) printf("%02X-%02X,", p[0], p[1]);
836
+ printf(")");
837
+ break;
838
+ case P_NCCLASS:
839
+ printf("NotClass(");
840
+ cc = g->cclass + node->cc;
841
+ for (p = cc->spans; p < cc->end; p += 2) printf("%02X-%02X,", p[0], p[1]);
842
+ printf(")");
843
+ break;
844
+ case P_REF: printf("Ref(%d)", node->n); break;
845
+ }
846
+ }
847
+
848
+ static void dumpcclass(Reclass *cc) {
849
+ Rune *p;
850
+ for (p = cc->spans; p < cc->end; p += 2) {
851
+ if (p[0] > 32 && p[0] < 127)
852
+ printf(" %c", p[0]);
853
+ else
854
+ printf(" \\x%02x", p[0]);
855
+ if (p[1] > 32 && p[1] < 127)
856
+ printf("-%c", p[1]);
857
+ else
858
+ printf("-\\x%02x", p[1]);
859
+ }
860
+ putchar('\n');
861
+ }
862
+
863
+ static void dumpprog(Reprog *prog)
864
+ {
865
+ Reinst *inst;
866
+ int i;
867
+ for (i = 0, inst = prog->start; inst < prog->end; ++i, ++inst) {
868
+ printf("% 5d: ", i);
869
+ switch (inst->opcode) {
870
+ case I_END: puts("end"); break;
871
+ case I_JUMP: printf("jump %d\n", (int)(inst->x - prog->start)); break;
872
+ case I_SPLIT: printf("split %d %d\n", (int)(inst->x - prog->start), (int)(inst->y - prog->start)); break;
873
+ case I_PLA: printf("pla %d %d\n", (int)(inst->x - prog->start), (int)(inst->y - prog->start)); break;
874
+ case I_NLA: printf("nla %d %d\n", (int)(inst->x - prog->start), (int)(inst->y - prog->start)); break;
875
+ case I_ANY: puts("any"); break;
876
+ case I_ANYNL: puts("anynl"); break;
877
+ case I_CHAR: printf(inst->c >= 32 && inst->c < 127 ? "char '%c'\n" : "char U+%04X\n", inst->c); break;
878
+ case I_CCLASS: printf("cclass"); dumpcclass(inst->cc); break;
879
+ case I_NCCLASS: printf("ncclass"); dumpcclass(inst->cc); break;
880
+ case I_REF: printf("ref %d\n", inst->n); break;
881
+ case I_BOL: puts("bol"); break;
882
+ case I_EOL: puts("eol"); break;
883
+ case I_WORD: puts("word"); break;
884
+ case I_NWORD: puts("nword"); break;
885
+ case I_LPAR: printf("lpar %d\n", inst->n); break;
886
+ case I_RPAR: printf("rpar %d\n", inst->n); break;
887
+ }
888
+ }
889
+ }
890
+ #endif
891
+
892
+ Reprog *regcompx(void *(*alloc)(void *ctx, void *p, int n), void *ctx,
893
+ const char *pattern, int cflags, const char **errorp)
894
+ {
895
+ struct cstate g;
896
+ Renode *node;
897
+ Reinst *split, *jump;
898
+ int i, n;
899
+
900
+ g.pstart = NULL;
901
+ g.prog = NULL;
902
+
903
+ if (setjmp(g.kaboom)) {
904
+ if (errorp) *errorp = g.error;
905
+ alloc(ctx, g.pstart, 0);
906
+ if (g.prog) {
907
+ alloc(ctx, g.prog->cclass, 0);
908
+ alloc(ctx, g.prog->start, 0);
909
+ alloc(ctx, g.prog, 0);
910
+ }
911
+ return NULL;
912
+ }
913
+
914
+ g.prog = alloc(ctx, NULL, sizeof (Reprog));
915
+ if (!g.prog)
916
+ die(&g, "cannot allocate regular expression");
917
+ g.prog->start = NULL;
918
+ g.prog->cclass = NULL;
919
+
920
+ n = strlen(pattern) * 2;
921
+ if (n > REG_MAXPROG)
922
+ die(&g, "program too large");
923
+ if (n > 0) {
924
+ g.pstart = g.pend = alloc(ctx, NULL, sizeof (Renode) * n);
925
+ if (!g.pstart)
926
+ die(&g, "cannot allocate regular expression parse list");
927
+ }
928
+
929
+ g.source = pattern;
930
+ g.ncclass = 0;
931
+ g.nsub = 1;
932
+ for (i = 0; i < REG_MAXSUB; ++i)
933
+ g.sub[i] = 0;
934
+
935
+ g.prog->flags = cflags;
936
+
937
+ next(&g);
938
+ node = parsealt(&g);
939
+ if (g.lookahead == ')')
940
+ die(&g, "unmatched ')'");
941
+ if (g.lookahead != EOF)
942
+ die(&g, "syntax error");
943
+
944
+ #ifdef TEST
945
+ dumpnode(&g, node);
946
+ putchar('\n');
947
+ #endif
948
+
949
+ n = 6 + count(&g, node, 0);
950
+ if (n < 0 || n > REG_MAXPROG)
951
+ die(&g, "program too large");
952
+
953
+ g.prog->nsub = g.nsub;
954
+ g.prog->start = g.prog->end = alloc(ctx, NULL, n * sizeof (Reinst));
955
+ if (!g.prog->start)
956
+ die(&g, "cannot allocate regular expression instruction list");
957
+
958
+ if (g.ncclass > 0) {
959
+ g.prog->cclass = alloc(ctx, NULL, g.ncclass * sizeof (Reclass));
960
+ if (!g.prog->cclass)
961
+ die(&g, "cannot allocate regular expression character class list");
962
+ memcpy(g.prog->cclass, g.cclass, g.ncclass * sizeof (Reclass));
963
+ for (i = 0; i < g.ncclass; ++i)
964
+ g.prog->cclass[i].end = g.prog->cclass[i].spans + (g.cclass[i].end - g.cclass[i].spans);
965
+ }
966
+
967
+ split = emit(g.prog, I_SPLIT);
968
+ split->x = split + 3;
969
+ split->y = split + 1;
970
+ emit(g.prog, I_ANYNL);
971
+ jump = emit(g.prog, I_JUMP);
972
+ jump->x = split;
973
+ emit(g.prog, I_LPAR);
974
+ compile(g.prog, node);
975
+ emit(g.prog, I_RPAR);
976
+ emit(g.prog, I_END);
977
+
978
+ #ifdef TEST
979
+ dumpprog(g.prog);
980
+ #endif
981
+
982
+ alloc(ctx, g.pstart, 0);
983
+
984
+ if (errorp) *errorp = NULL;
985
+ return g.prog;
986
+ }
987
+
988
+ void regfreex(void *(*alloc)(void *ctx, void *p, int n), void *ctx, Reprog *prog)
989
+ {
990
+ if (prog) {
991
+ if (prog->cclass)
992
+ alloc(ctx, prog->cclass, 0);
993
+ alloc(ctx, prog->start, 0);
994
+ alloc(ctx, prog, 0);
995
+ }
996
+ }
997
+
998
+ static void *default_alloc(void *ctx, void *p, int n)
999
+ {
1000
+ if (n == 0) {
1001
+ free(p);
1002
+ return NULL;
1003
+ }
1004
+ return realloc(p, (size_t)n);
1005
+ }
1006
+
1007
+ Reprog *regcomp(const char *pattern, int cflags, const char **errorp)
1008
+ {
1009
+ return regcompx(default_alloc, NULL, pattern, cflags, errorp);
1010
+ }
1011
+
1012
+ void regfree(Reprog *prog)
1013
+ {
1014
+ regfreex(default_alloc, NULL, prog);
1015
+ }
1016
+
1017
+ /* Match */
1018
+
1019
+ static int isnewline(int c)
1020
+ {
1021
+ return c == 0xA || c == 0xD || c == 0x2028 || c == 0x2029;
1022
+ }
1023
+
1024
+ static int iswordchar(int c)
1025
+ {
1026
+ return c == '_' ||
1027
+ (c >= 'a' && c <= 'z') ||
1028
+ (c >= 'A' && c <= 'Z') ||
1029
+ (c >= '0' && c <= '9');
1030
+ }
1031
+
1032
+ static int incclass(Reclass *cc, Rune c)
1033
+ {
1034
+ Rune *p;
1035
+ for (p = cc->spans; p < cc->end; p += 2)
1036
+ if (p[0] <= c && c <= p[1])
1037
+ return 1;
1038
+ return 0;
1039
+ }
1040
+
1041
+ static int incclasscanon(Reclass *cc, Rune c)
1042
+ {
1043
+ Rune *p, r;
1044
+ for (p = cc->spans; p < cc->end; p += 2)
1045
+ for (r = p[0]; r <= p[1]; ++r)
1046
+ if (c == canon(r))
1047
+ return 1;
1048
+ return 0;
1049
+ }
1050
+
1051
+ static int strncmpcanon(const char *a, const char *b, int n)
1052
+ {
1053
+ Rune ra, rb;
1054
+ int c;
1055
+ while (n--) {
1056
+ if (!*a) return -1;
1057
+ if (!*b) return 1;
1058
+ a += chartorune(&ra, a);
1059
+ b += chartorune(&rb, b);
1060
+ c = canon(ra) - canon(rb);
1061
+ if (c)
1062
+ return c;
1063
+ }
1064
+ return 0;
1065
+ }
1066
+
1067
+ static int match(Reinst *pc, const char *sp, const char *bol, int flags, Resub *out, int depth)
1068
+ {
1069
+ Resub scratch;
1070
+ int result;
1071
+ int i;
1072
+ Rune c;
1073
+
1074
+ /* stack overflow */
1075
+ if (depth > REG_MAXREC)
1076
+ return -1;
1077
+
1078
+ for (;;) {
1079
+ switch (pc->opcode) {
1080
+ case I_END:
1081
+ return 0;
1082
+ case I_JUMP:
1083
+ pc = pc->x;
1084
+ break;
1085
+ case I_SPLIT:
1086
+ scratch = *out;
1087
+ result = match(pc->x, sp, bol, flags, &scratch, depth+1);
1088
+ if (result == -1)
1089
+ return -1;
1090
+ if (result == 0) {
1091
+ *out = scratch;
1092
+ return 0;
1093
+ }
1094
+ pc = pc->y;
1095
+ break;
1096
+
1097
+ case I_PLA:
1098
+ result = match(pc->x, sp, bol, flags, out, depth+1);
1099
+ if (result == -1)
1100
+ return -1;
1101
+ if (result == 1)
1102
+ return 1;
1103
+ pc = pc->y;
1104
+ break;
1105
+ case I_NLA:
1106
+ scratch = *out;
1107
+ result = match(pc->x, sp, bol, flags, &scratch, depth+1);
1108
+ if (result == -1)
1109
+ return -1;
1110
+ if (result == 0)
1111
+ return 1;
1112
+ pc = pc->y;
1113
+ break;
1114
+
1115
+ case I_ANYNL:
1116
+ if (!*sp) return 1;
1117
+ sp += chartorune(&c, sp);
1118
+ pc = pc + 1;
1119
+ break;
1120
+ case I_ANY:
1121
+ if (!*sp) return 1;
1122
+ sp += chartorune(&c, sp);
1123
+ if (isnewline(c))
1124
+ return 1;
1125
+ pc = pc + 1;
1126
+ break;
1127
+ case I_CHAR:
1128
+ if (!*sp) return 1;
1129
+ sp += chartorune(&c, sp);
1130
+ if (flags & REG_ICASE)
1131
+ c = canon(c);
1132
+ if (c != pc->c)
1133
+ return 1;
1134
+ pc = pc + 1;
1135
+ break;
1136
+ case I_CCLASS:
1137
+ if (!*sp) return 1;
1138
+ sp += chartorune(&c, sp);
1139
+ if (flags & REG_ICASE) {
1140
+ if (!incclasscanon(pc->cc, canon(c)))
1141
+ return 1;
1142
+ } else {
1143
+ if (!incclass(pc->cc, c))
1144
+ return 1;
1145
+ }
1146
+ pc = pc + 1;
1147
+ break;
1148
+ case I_NCCLASS:
1149
+ if (!*sp) return 1;
1150
+ sp += chartorune(&c, sp);
1151
+ if (flags & REG_ICASE) {
1152
+ if (incclasscanon(pc->cc, canon(c)))
1153
+ return 1;
1154
+ } else {
1155
+ if (incclass(pc->cc, c))
1156
+ return 1;
1157
+ }
1158
+ pc = pc + 1;
1159
+ break;
1160
+ case I_REF:
1161
+ i = out->sub[pc->n].ep - out->sub[pc->n].sp;
1162
+ if (flags & REG_ICASE) {
1163
+ if (strncmpcanon(sp, out->sub[pc->n].sp, i))
1164
+ return 1;
1165
+ } else {
1166
+ if (strncmp(sp, out->sub[pc->n].sp, i))
1167
+ return 1;
1168
+ }
1169
+ if (i > 0)
1170
+ sp += i;
1171
+ pc = pc + 1;
1172
+ break;
1173
+
1174
+ case I_BOL:
1175
+ if (sp == bol && !(flags & REG_NOTBOL)) {
1176
+ pc = pc + 1;
1177
+ break;
1178
+ }
1179
+ if (flags & REG_NEWLINE) {
1180
+ if (sp > bol && isnewline(sp[-1])) {
1181
+ pc = pc + 1;
1182
+ break;
1183
+ }
1184
+ }
1185
+ return 1;
1186
+ case I_EOL:
1187
+ if (*sp == 0) {
1188
+ pc = pc + 1;
1189
+ break;
1190
+ }
1191
+ if (flags & REG_NEWLINE) {
1192
+ if (isnewline(*sp)) {
1193
+ pc = pc + 1;
1194
+ break;
1195
+ }
1196
+ }
1197
+ return 1;
1198
+ case I_WORD:
1199
+ i = sp > bol && iswordchar(sp[-1]);
1200
+ i ^= iswordchar(sp[0]);
1201
+ if (!i)
1202
+ return 1;
1203
+ pc = pc + 1;
1204
+ break;
1205
+ case I_NWORD:
1206
+ i = sp > bol && iswordchar(sp[-1]);
1207
+ i ^= iswordchar(sp[0]);
1208
+ if (i)
1209
+ return 1;
1210
+ pc = pc + 1;
1211
+ break;
1212
+
1213
+ case I_LPAR:
1214
+ out->sub[pc->n].sp = sp;
1215
+ pc = pc + 1;
1216
+ break;
1217
+ case I_RPAR:
1218
+ out->sub[pc->n].ep = sp;
1219
+ pc = pc + 1;
1220
+ break;
1221
+ default:
1222
+ return 1;
1223
+ }
1224
+ }
1225
+ }
1226
+
1227
+ int regexec(Reprog *prog, const char *sp, Resub *sub, int eflags)
1228
+ {
1229
+ Resub scratch;
1230
+ int i;
1231
+
1232
+ if (!sub)
1233
+ sub = &scratch;
1234
+
1235
+ sub->nsub = prog->nsub;
1236
+ for (i = 0; i < REG_MAXSUB; ++i)
1237
+ sub->sub[i].sp = sub->sub[i].ep = NULL;
1238
+
1239
+ return match(prog->start, sp, sp, prog->flags | eflags, sub, 0);
1240
+ }
1241
+
1242
+ #ifdef TEST
1243
+ int main(int argc, char **argv)
1244
+ {
1245
+ const char *error;
1246
+ const char *s;
1247
+ Reprog *p;
1248
+ Resub m;
1249
+ int i;
1250
+
1251
+ if (argc > 1) {
1252
+ p = regcomp(argv[1], 0, &error);
1253
+ if (!p) {
1254
+ fprintf(stderr, "regcomp: %s\n", error);
1255
+ return 1;
1256
+ }
1257
+
1258
+ if (argc > 2) {
1259
+ s = argv[2];
1260
+ printf("nsub = %d\n", p->nsub);
1261
+ if (!regexec(p, s, &m, 0)) {
1262
+ for (i = 0; i < m.nsub; ++i) {
1263
+ int n = m.sub[i].ep - m.sub[i].sp;
1264
+ if (n > 0)
1265
+ printf("match %d: s=%d e=%d n=%d '%.*s'\n", i, (int)(m.sub[i].sp - s), (int)(m.sub[i].ep - s), n, n, m.sub[i].sp);
1266
+ else
1267
+ printf("match %d: n=0 ''\n", i);
1268
+ }
1269
+ } else {
1270
+ printf("no match\n");
1271
+ }
1272
+ }
1273
+ }
1274
+
1275
+ return 0;
1276
+ }
1277
+ #endif