Mxx_ru 1.4.6 → 1.4.7

Sign up to get free protection for your applications and to get access to all the features.
data/tests/c/pcre/study.c DELETED
@@ -1,472 +0,0 @@
1
- /*************************************************
2
- * Perl-Compatible Regular Expressions *
3
- *************************************************/
4
-
5
- /*
6
- This is a library of functions to support regular expressions whose syntax
7
- and semantics are as close as possible to those of the Perl 5 language. See
8
- the file Tech.Notes for some information on the internals.
9
-
10
- Written by: Philip Hazel <ph10@cam.ac.uk>
11
-
12
- Copyright (c) 1997-2003 University of Cambridge
13
-
14
- -----------------------------------------------------------------------------
15
- Permission is granted to anyone to use this software for any purpose on any
16
- computer system, and to redistribute it freely, subject to the following
17
- restrictions:
18
-
19
- 1. This software is distributed in the hope that it will be useful,
20
- but WITHOUT ANY WARRANTY; without even the implied warranty of
21
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
22
-
23
- 2. The origin of this software must not be misrepresented, either by
24
- explicit claim or by omission.
25
-
26
- 3. Altered versions must be plainly marked as such, and must not be
27
- misrepresented as being the original software.
28
-
29
- 4. If PCRE is embedded in any software that is released under the GNU
30
- General Purpose Licence (GPL), then the terms of that licence shall
31
- supersede any condition above with which it is incompatible.
32
- -----------------------------------------------------------------------------
33
- */
34
-
35
-
36
- /* Include the internals header, which itself includes Standard C headers plus
37
- the external pcre header. */
38
-
39
- #include "internal.h"
40
-
41
-
42
-
43
- /*************************************************
44
- * Set a bit and maybe its alternate case *
45
- *************************************************/
46
-
47
- /* Given a character, set its bit in the table, and also the bit for the other
48
- version of a letter if we are caseless.
49
-
50
- Arguments:
51
- start_bits points to the bit map
52
- c is the character
53
- caseless the caseless flag
54
- cd the block with char table pointers
55
-
56
- Returns: nothing
57
- */
58
-
59
- static void
60
- set_bit(uschar *start_bits, int c, BOOL caseless, compile_data *cd)
61
- {
62
- start_bits[c/8] |= (1 << (c&7));
63
- if (caseless && (cd->ctypes[c] & ctype_letter) != 0)
64
- start_bits[cd->fcc[c]/8] |= (1 << (cd->fcc[c]&7));
65
- }
66
-
67
-
68
-
69
- /*************************************************
70
- * Create bitmap of starting chars *
71
- *************************************************/
72
-
73
- /* This function scans a compiled unanchored expression and attempts to build a
74
- bitmap of the set of initial characters. If it can't, it returns FALSE. As time
75
- goes by, we may be able to get more clever at doing this.
76
-
77
- Arguments:
78
- code points to an expression
79
- start_bits points to a 32-byte table, initialized to 0
80
- caseless the current state of the caseless flag
81
- utf8 TRUE if in UTF-8 mode
82
- cd the block with char table pointers
83
-
84
- Returns: TRUE if table built, FALSE otherwise
85
- */
86
-
87
- static BOOL
88
- set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,
89
- BOOL utf8, compile_data *cd)
90
- {
91
- register int c;
92
-
93
- /* This next statement and the later reference to dummy are here in order to
94
- trick the optimizer of the IBM C compiler for OS/2 into generating correct
95
- code. Apparently IBM isn't going to fix the problem, and we would rather not
96
- disable optimization (in this module it actually makes a big difference, and
97
- the pcre module can use all the optimization it can get). */
98
-
99
- volatile int dummy;
100
-
101
- do
102
- {
103
- const uschar *tcode = code + 1 + LINK_SIZE;
104
- BOOL try_next = TRUE;
105
-
106
- while (try_next)
107
- {
108
- /* If a branch starts with a bracket or a positive lookahead assertion,
109
- recurse to set bits from within them. That's all for this branch. */
110
-
111
- if ((int)*tcode >= OP_BRA || *tcode == OP_ASSERT)
112
- {
113
- if (!set_start_bits(tcode, start_bits, caseless, utf8, cd))
114
- return FALSE;
115
- try_next = FALSE;
116
- }
117
-
118
- else switch(*tcode)
119
- {
120
- default:
121
- return FALSE;
122
-
123
- /* Skip over callout */
124
-
125
- case OP_CALLOUT:
126
- tcode += 2;
127
- break;
128
-
129
- /* Skip over extended extraction bracket number */
130
-
131
- case OP_BRANUMBER:
132
- tcode += 3;
133
- break;
134
-
135
- /* Skip over lookbehind and negative lookahead assertions */
136
-
137
- case OP_ASSERT_NOT:
138
- case OP_ASSERTBACK:
139
- case OP_ASSERTBACK_NOT:
140
- do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
141
- tcode += 1+LINK_SIZE;
142
- break;
143
-
144
- /* Skip over an option setting, changing the caseless flag */
145
-
146
- case OP_OPT:
147
- caseless = (tcode[1] & PCRE_CASELESS) != 0;
148
- tcode += 2;
149
- break;
150
-
151
- /* BRAZERO does the bracket, but carries on. */
152
-
153
- case OP_BRAZERO:
154
- case OP_BRAMINZERO:
155
- if (!set_start_bits(++tcode, start_bits, caseless, utf8, cd))
156
- return FALSE;
157
- dummy = 1;
158
- do tcode += GET(tcode,1); while (*tcode == OP_ALT);
159
- tcode += 1+LINK_SIZE;
160
- break;
161
-
162
- /* Single-char * or ? sets the bit and tries the next item */
163
-
164
- case OP_STAR:
165
- case OP_MINSTAR:
166
- case OP_QUERY:
167
- case OP_MINQUERY:
168
- set_bit(start_bits, tcode[1], caseless, cd);
169
- tcode += 2;
170
- #ifdef SUPPORT_UTF8
171
- if (utf8) while ((*tcode & 0xc0) == 0x80) tcode++;
172
- #endif
173
- break;
174
-
175
- /* Single-char upto sets the bit and tries the next */
176
-
177
- case OP_UPTO:
178
- case OP_MINUPTO:
179
- set_bit(start_bits, tcode[3], caseless, cd);
180
- tcode += 4;
181
- #ifdef SUPPORT_UTF8
182
- if (utf8) while ((*tcode & 0xc0) == 0x80) tcode++;
183
- #endif
184
- break;
185
-
186
- /* At least one single char sets the bit and stops */
187
-
188
- case OP_EXACT: /* Fall through */
189
- tcode++;
190
-
191
- case OP_CHARS: /* Fall through */
192
- tcode++;
193
-
194
- case OP_PLUS:
195
- case OP_MINPLUS:
196
- set_bit(start_bits, tcode[1], caseless, cd);
197
- try_next = FALSE;
198
- break;
199
-
200
- /* Single character type sets the bits and stops */
201
-
202
- case OP_NOT_DIGIT:
203
- for (c = 0; c < 32; c++)
204
- start_bits[c] |= ~cd->cbits[c+cbit_digit];
205
- try_next = FALSE;
206
- break;
207
-
208
- case OP_DIGIT:
209
- for (c = 0; c < 32; c++)
210
- start_bits[c] |= cd->cbits[c+cbit_digit];
211
- try_next = FALSE;
212
- break;
213
-
214
- case OP_NOT_WHITESPACE:
215
- for (c = 0; c < 32; c++)
216
- start_bits[c] |= ~cd->cbits[c+cbit_space];
217
- try_next = FALSE;
218
- break;
219
-
220
- case OP_WHITESPACE:
221
- for (c = 0; c < 32; c++)
222
- start_bits[c] |= cd->cbits[c+cbit_space];
223
- try_next = FALSE;
224
- break;
225
-
226
- case OP_NOT_WORDCHAR:
227
- for (c = 0; c < 32; c++)
228
- start_bits[c] |= ~cd->cbits[c+cbit_word];
229
- try_next = FALSE;
230
- break;
231
-
232
- case OP_WORDCHAR:
233
- for (c = 0; c < 32; c++)
234
- start_bits[c] |= cd->cbits[c+cbit_word];
235
- try_next = FALSE;
236
- break;
237
-
238
- /* One or more character type fudges the pointer and restarts, knowing
239
- it will hit a single character type and stop there. */
240
-
241
- case OP_TYPEPLUS:
242
- case OP_TYPEMINPLUS:
243
- tcode++;
244
- break;
245
-
246
- case OP_TYPEEXACT:
247
- tcode += 3;
248
- break;
249
-
250
- /* Zero or more repeats of character types set the bits and then
251
- try again. */
252
-
253
- case OP_TYPEUPTO:
254
- case OP_TYPEMINUPTO:
255
- tcode += 2; /* Fall through */
256
-
257
- case OP_TYPESTAR:
258
- case OP_TYPEMINSTAR:
259
- case OP_TYPEQUERY:
260
- case OP_TYPEMINQUERY:
261
- switch(tcode[1])
262
- {
263
- case OP_ANY:
264
- return FALSE;
265
-
266
- case OP_NOT_DIGIT:
267
- for (c = 0; c < 32; c++)
268
- start_bits[c] |= ~cd->cbits[c+cbit_digit];
269
- break;
270
-
271
- case OP_DIGIT:
272
- for (c = 0; c < 32; c++)
273
- start_bits[c] |= cd->cbits[c+cbit_digit];
274
- break;
275
-
276
- case OP_NOT_WHITESPACE:
277
- for (c = 0; c < 32; c++)
278
- start_bits[c] |= ~cd->cbits[c+cbit_space];
279
- break;
280
-
281
- case OP_WHITESPACE:
282
- for (c = 0; c < 32; c++)
283
- start_bits[c] |= cd->cbits[c+cbit_space];
284
- break;
285
-
286
- case OP_NOT_WORDCHAR:
287
- for (c = 0; c < 32; c++)
288
- start_bits[c] |= ~cd->cbits[c+cbit_word];
289
- break;
290
-
291
- case OP_WORDCHAR:
292
- for (c = 0; c < 32; c++)
293
- start_bits[c] |= cd->cbits[c+cbit_word];
294
- break;
295
- }
296
-
297
- tcode += 2;
298
- break;
299
-
300
- /* Character class where all the information is in a bit map: set the
301
- bits and either carry on or not, according to the repeat count. If it was
302
- a negative class, and we are operating with UTF-8 characters, any byte
303
- with a value >= 0xc4 is a potentially valid starter because it starts a
304
- character with a value > 255. */
305
-
306
- case OP_NCLASS:
307
- if (utf8)
308
- {
309
- start_bits[24] |= 0xf0; /* Bits for 0xc4 - 0xc8 */
310
- memset(start_bits+25, 0xff, 7); /* Bits for 0xc9 - 0xff */
311
- }
312
- /* Fall through */
313
-
314
- case OP_CLASS:
315
- {
316
- tcode++;
317
-
318
- /* In UTF-8 mode, the bits in a bit map correspond to character
319
- values, not to byte values. However, the bit map we are constructing is
320
- for byte values. So we have to do a conversion for characters whose
321
- value is > 127. In fact, there are only two possible starting bytes for
322
- characters in the range 128 - 255. */
323
-
324
- if (utf8)
325
- {
326
- for (c = 0; c < 16; c++) start_bits[c] |= tcode[c];
327
- for (c = 128; c < 256; c++)
328
- {
329
- if ((tcode[c/8] && (1 << (c&7))) != 0)
330
- {
331
- int d = (c >> 6) | 0xc0; /* Set bit for this starter */
332
- start_bits[d/8] |= (1 << (d&7)); /* and then skip on to the */
333
- c = (c & 0xc0) + 0x40 - 1; /* next relevant character. */
334
- }
335
- }
336
- }
337
-
338
- /* In non-UTF-8 mode, the two bit maps are completely compatible. */
339
-
340
- else
341
- {
342
- for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
343
- }
344
-
345
- /* Advance past the bit map, and act on what follows */
346
-
347
- tcode += 32;
348
- switch (*tcode)
349
- {
350
- case OP_CRSTAR:
351
- case OP_CRMINSTAR:
352
- case OP_CRQUERY:
353
- case OP_CRMINQUERY:
354
- tcode++;
355
- break;
356
-
357
- case OP_CRRANGE:
358
- case OP_CRMINRANGE:
359
- if (((tcode[1] << 8) + tcode[2]) == 0) tcode += 5;
360
- else try_next = FALSE;
361
- break;
362
-
363
- default:
364
- try_next = FALSE;
365
- break;
366
- }
367
- }
368
- break; /* End of bitmap class handling */
369
-
370
- } /* End of switch */
371
- } /* End of try_next loop */
372
-
373
- code += GET(code, 1); /* Advance to next branch */
374
- }
375
- while (*code == OP_ALT);
376
- return TRUE;
377
- }
378
-
379
-
380
-
381
- /*************************************************
382
- * Study a compiled expression *
383
- *************************************************/
384
-
385
- /* This function is handed a compiled expression that it must study to produce
386
- information that will speed up the matching. It returns a pcre_extra block
387
- which then gets handed back to pcre_exec().
388
-
389
- Arguments:
390
- re points to the compiled expression
391
- options contains option bits
392
- errorptr points to where to place error messages;
393
- set NULL unless error
394
-
395
- Returns: pointer to a pcre_extra block, with study_data filled in and the
396
- appropriate flag set;
397
- NULL on error or if no optimization possible
398
- */
399
-
400
- EXPORT pcre_extra *
401
- pcre_study(const pcre *external_re, int options, const char **errorptr)
402
- {
403
- uschar start_bits[32];
404
- pcre_extra *extra;
405
- pcre_study_data *study;
406
- const real_pcre *re = (const real_pcre *)external_re;
407
- uschar *code = (uschar *)re + sizeof(real_pcre) +
408
- (re->name_count * re->name_entry_size);
409
- compile_data compile_block;
410
-
411
- *errorptr = NULL;
412
-
413
- if (re == NULL || re->magic_number != MAGIC_NUMBER)
414
- {
415
- *errorptr = "argument is not a compiled regular expression";
416
- return NULL;
417
- }
418
-
419
- if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
420
- {
421
- *errorptr = "unknown or incorrect option bit(s) set";
422
- return NULL;
423
- }
424
-
425
- /* For an anchored pattern, or an unanchored pattern that has a first char, or
426
- a multiline pattern that matches only at "line starts", no further processing
427
- at present. */
428
-
429
- if ((re->options & (PCRE_ANCHORED|PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)
430
- return NULL;
431
-
432
- /* Set the character tables in the block which is passed around */
433
-
434
- compile_block.lcc = re->tables + lcc_offset;
435
- compile_block.fcc = re->tables + fcc_offset;
436
- compile_block.cbits = re->tables + cbits_offset;
437
- compile_block.ctypes = re->tables + ctypes_offset;
438
-
439
- /* See if we can find a fixed set of initial characters for the pattern. */
440
-
441
- memset(start_bits, 0, 32 * sizeof(uschar));
442
- if (!set_start_bits(code, start_bits, (re->options & PCRE_CASELESS) != 0,
443
- (re->options & PCRE_UTF8) != 0, &compile_block)) return NULL;
444
-
445
- /* Get a pcre_extra block and a pcre_study_data block. The study data is put in
446
- the latter, which is pointed to by the former, which may also get additional
447
- data set later by the calling program. At the moment, the size of
448
- pcre_study_data is fixed. We nevertheless save it in a field for returning via
449
- the pcre_fullinfo() function so that if it becomes variable in the future, we
450
- don't have to change that code. */
451
-
452
- extra = (pcre_extra *)(pcre_malloc)
453
- (sizeof(pcre_extra) + sizeof(pcre_study_data));
454
-
455
- if (extra == NULL)
456
- {
457
- *errorptr = "failed to get memory";
458
- return NULL;
459
- }
460
-
461
- study = (pcre_study_data *)((char *)extra + sizeof(pcre_extra));
462
- extra->flags = PCRE_EXTRA_STUDY_DATA;
463
- extra->study_data = study;
464
-
465
- study->size = sizeof(pcre_study_data);
466
- study->options = PCRE_STUDY_MAPPED;
467
- memcpy(study->start_bits, start_bits, sizeof(start_bits));
468
-
469
- return extra;
470
- }
471
-
472
- /* End of study.c */