ctags.rb 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (138) hide show
  1. data/Gemfile +2 -0
  2. data/Rakefile +23 -0
  3. data/ctags.rb.gemspec +23 -0
  4. data/ext/.gitignore +3 -0
  5. data/ext/extconf.rb +15 -0
  6. data/ext/vendor/exuberant-ctags/.gitignore +6 -0
  7. data/ext/vendor/exuberant-ctags/.indent.pro +31 -0
  8. data/ext/vendor/exuberant-ctags/COPYING +340 -0
  9. data/ext/vendor/exuberant-ctags/EXTENDING.html +386 -0
  10. data/ext/vendor/exuberant-ctags/FAQ +371 -0
  11. data/ext/vendor/exuberant-ctags/INSTALL +215 -0
  12. data/ext/vendor/exuberant-ctags/INSTALL.oth +73 -0
  13. data/ext/vendor/exuberant-ctags/MAINTAINERS +88 -0
  14. data/ext/vendor/exuberant-ctags/Makefile.in +222 -0
  15. data/ext/vendor/exuberant-ctags/NEWS +871 -0
  16. data/ext/vendor/exuberant-ctags/README +73 -0
  17. data/ext/vendor/exuberant-ctags/ant.c +42 -0
  18. data/ext/vendor/exuberant-ctags/argproc.c +505 -0
  19. data/ext/vendor/exuberant-ctags/args.c +274 -0
  20. data/ext/vendor/exuberant-ctags/args.h +63 -0
  21. data/ext/vendor/exuberant-ctags/asm.c +387 -0
  22. data/ext/vendor/exuberant-ctags/asp.c +328 -0
  23. data/ext/vendor/exuberant-ctags/awk.c +81 -0
  24. data/ext/vendor/exuberant-ctags/basic.c +203 -0
  25. data/ext/vendor/exuberant-ctags/beta.c +321 -0
  26. data/ext/vendor/exuberant-ctags/c.c +2932 -0
  27. data/ext/vendor/exuberant-ctags/cobol.c +50 -0
  28. data/ext/vendor/exuberant-ctags/config.h.in +277 -0
  29. data/ext/vendor/exuberant-ctags/configure +7704 -0
  30. data/ext/vendor/exuberant-ctags/configure.ac +532 -0
  31. data/ext/vendor/exuberant-ctags/ctags.1 +1186 -0
  32. data/ext/vendor/exuberant-ctags/ctags.h +28 -0
  33. data/ext/vendor/exuberant-ctags/ctags.html +2087 -0
  34. data/ext/vendor/exuberant-ctags/ctags.spec +40 -0
  35. data/ext/vendor/exuberant-ctags/debug.c +113 -0
  36. data/ext/vendor/exuberant-ctags/debug.h +70 -0
  37. data/ext/vendor/exuberant-ctags/descrip.mms +68 -0
  38. data/ext/vendor/exuberant-ctags/dosbatch.c +42 -0
  39. data/ext/vendor/exuberant-ctags/e_amiga.h +24 -0
  40. data/ext/vendor/exuberant-ctags/e_djgpp.h +47 -0
  41. data/ext/vendor/exuberant-ctags/e_mac.h +143 -0
  42. data/ext/vendor/exuberant-ctags/e_msoft.h +76 -0
  43. data/ext/vendor/exuberant-ctags/e_os2.h +37 -0
  44. data/ext/vendor/exuberant-ctags/e_qdos.h +34 -0
  45. data/ext/vendor/exuberant-ctags/e_riscos.h +58 -0
  46. data/ext/vendor/exuberant-ctags/e_vms.h +31 -0
  47. data/ext/vendor/exuberant-ctags/eiffel.c +1352 -0
  48. data/ext/vendor/exuberant-ctags/entry.c +847 -0
  49. data/ext/vendor/exuberant-ctags/entry.h +103 -0
  50. data/ext/vendor/exuberant-ctags/erlang.c +189 -0
  51. data/ext/vendor/exuberant-ctags/flex.c +2243 -0
  52. data/ext/vendor/exuberant-ctags/fortran.c +2197 -0
  53. data/ext/vendor/exuberant-ctags/general.h +127 -0
  54. data/ext/vendor/exuberant-ctags/get.c +669 -0
  55. data/ext/vendor/exuberant-ctags/get.h +50 -0
  56. data/ext/vendor/exuberant-ctags/gnu_regex/.svn/all-wcprops +47 -0
  57. data/ext/vendor/exuberant-ctags/gnu_regex/.svn/entries +112 -0
  58. data/ext/vendor/exuberant-ctags/gnu_regex/.svn/text-base/README.txt.svn-base +5 -0
  59. data/ext/vendor/exuberant-ctags/gnu_regex/.svn/text-base/regcomp.c.svn-base +3818 -0
  60. data/ext/vendor/exuberant-ctags/gnu_regex/.svn/text-base/regex.c.svn-base +74 -0
  61. data/ext/vendor/exuberant-ctags/gnu_regex/.svn/text-base/regex.h.svn-base +575 -0
  62. data/ext/vendor/exuberant-ctags/gnu_regex/.svn/text-base/regex_internal.c.svn-base +1713 -0
  63. data/ext/vendor/exuberant-ctags/gnu_regex/.svn/text-base/regex_internal.h.svn-base +773 -0
  64. data/ext/vendor/exuberant-ctags/gnu_regex/.svn/text-base/regexec.c.svn-base +4338 -0
  65. data/ext/vendor/exuberant-ctags/gnu_regex/README.txt +5 -0
  66. data/ext/vendor/exuberant-ctags/gnu_regex/regcomp.c +3818 -0
  67. data/ext/vendor/exuberant-ctags/gnu_regex/regex.c +74 -0
  68. data/ext/vendor/exuberant-ctags/gnu_regex/regex.h +575 -0
  69. data/ext/vendor/exuberant-ctags/gnu_regex/regex_internal.c +1713 -0
  70. data/ext/vendor/exuberant-ctags/gnu_regex/regex_internal.h +773 -0
  71. data/ext/vendor/exuberant-ctags/gnu_regex/regexec.c +4338 -0
  72. data/ext/vendor/exuberant-ctags/html.c +49 -0
  73. data/ext/vendor/exuberant-ctags/jscript.c +1572 -0
  74. data/ext/vendor/exuberant-ctags/keyword.c +258 -0
  75. data/ext/vendor/exuberant-ctags/keyword.h +34 -0
  76. data/ext/vendor/exuberant-ctags/lisp.c +139 -0
  77. data/ext/vendor/exuberant-ctags/lregex.c +704 -0
  78. data/ext/vendor/exuberant-ctags/lua.c +133 -0
  79. data/ext/vendor/exuberant-ctags/mac.c +273 -0
  80. data/ext/vendor/exuberant-ctags/magic.diff +21 -0
  81. data/ext/vendor/exuberant-ctags/main.c +584 -0
  82. data/ext/vendor/exuberant-ctags/main.h +32 -0
  83. data/ext/vendor/exuberant-ctags/maintainer.mak +476 -0
  84. data/ext/vendor/exuberant-ctags/make.c +217 -0
  85. data/ext/vendor/exuberant-ctags/matlab.c +44 -0
  86. data/ext/vendor/exuberant-ctags/mk_bc3.mak +46 -0
  87. data/ext/vendor/exuberant-ctags/mk_bc5.mak +49 -0
  88. data/ext/vendor/exuberant-ctags/mk_djg.mak +18 -0
  89. data/ext/vendor/exuberant-ctags/mk_manx.mak +65 -0
  90. data/ext/vendor/exuberant-ctags/mk_mingw.mak +31 -0
  91. data/ext/vendor/exuberant-ctags/mk_mpw.mak +130 -0
  92. data/ext/vendor/exuberant-ctags/mk_mvc.mak +40 -0
  93. data/ext/vendor/exuberant-ctags/mk_os2.mak +104 -0
  94. data/ext/vendor/exuberant-ctags/mk_qdos.mak +100 -0
  95. data/ext/vendor/exuberant-ctags/mk_sas.mak +63 -0
  96. data/ext/vendor/exuberant-ctags/mkinstalldirs +40 -0
  97. data/ext/vendor/exuberant-ctags/ocaml.c +1842 -0
  98. data/ext/vendor/exuberant-ctags/options.c +1842 -0
  99. data/ext/vendor/exuberant-ctags/options.h +155 -0
  100. data/ext/vendor/exuberant-ctags/parse.c +677 -0
  101. data/ext/vendor/exuberant-ctags/parse.h +129 -0
  102. data/ext/vendor/exuberant-ctags/parsers.h +63 -0
  103. data/ext/vendor/exuberant-ctags/pascal.c +267 -0
  104. data/ext/vendor/exuberant-ctags/perl.c +382 -0
  105. data/ext/vendor/exuberant-ctags/php.c +237 -0
  106. data/ext/vendor/exuberant-ctags/python.c +771 -0
  107. data/ext/vendor/exuberant-ctags/qdos.c +106 -0
  108. data/ext/vendor/exuberant-ctags/read.c +569 -0
  109. data/ext/vendor/exuberant-ctags/read.h +116 -0
  110. data/ext/vendor/exuberant-ctags/readtags.c +959 -0
  111. data/ext/vendor/exuberant-ctags/readtags.h +252 -0
  112. data/ext/vendor/exuberant-ctags/rexx.c +39 -0
  113. data/ext/vendor/exuberant-ctags/routines.c +891 -0
  114. data/ext/vendor/exuberant-ctags/routines.h +134 -0
  115. data/ext/vendor/exuberant-ctags/ruby.c +408 -0
  116. data/ext/vendor/exuberant-ctags/scheme.c +111 -0
  117. data/ext/vendor/exuberant-ctags/sh.c +115 -0
  118. data/ext/vendor/exuberant-ctags/slang.c +41 -0
  119. data/ext/vendor/exuberant-ctags/sml.c +212 -0
  120. data/ext/vendor/exuberant-ctags/sort.c +230 -0
  121. data/ext/vendor/exuberant-ctags/sort.h +32 -0
  122. data/ext/vendor/exuberant-ctags/source.mak +122 -0
  123. data/ext/vendor/exuberant-ctags/sql.c +2112 -0
  124. data/ext/vendor/exuberant-ctags/strlist.c +281 -0
  125. data/ext/vendor/exuberant-ctags/strlist.h +54 -0
  126. data/ext/vendor/exuberant-ctags/tcl.c +116 -0
  127. data/ext/vendor/exuberant-ctags/tex.c +524 -0
  128. data/ext/vendor/exuberant-ctags/verilog.c +340 -0
  129. data/ext/vendor/exuberant-ctags/vhdl.c +835 -0
  130. data/ext/vendor/exuberant-ctags/vim.c +636 -0
  131. data/ext/vendor/exuberant-ctags/vstring.c +232 -0
  132. data/ext/vendor/exuberant-ctags/vstring.h +85 -0
  133. data/ext/vendor/exuberant-ctags/yacc.c +40 -0
  134. data/lib/ctags/exuberant.rb +45 -0
  135. data/lib/ctags/version.rb +3 -0
  136. data/lib/ctags.rb +6 -0
  137. data/test/test_ctags.rb +24 -0
  138. metadata +233 -0
@@ -0,0 +1,3818 @@
1
+ /* Extended regular expression matching and search library.
2
+ Copyright (C) 2002,2003,2004,2005,2006,2007,2009
3
+ Free Software Foundation, Inc.
4
+ This file is part of the GNU C Library.
5
+ Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6
+
7
+ The GNU C Library is free software; you can redistribute it and/or
8
+ modify it under the terms of the GNU Lesser General Public
9
+ License as published by the Free Software Foundation; either
10
+ version 2.1 of the License, or (at your option) any later version.
11
+
12
+ The GNU C Library is distributed in the hope that it will be useful,
13
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
+ Lesser General Public License for more details.
16
+
17
+ You should have received a copy of the GNU Lesser General Public
18
+ License along with the GNU C Library; if not, write to the Free
19
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
20
+ 02111-1307 USA. */
21
+
22
+ static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
23
+ size_t length, reg_syntax_t syntax);
24
+ static void re_compile_fastmap_iter (regex_t *bufp,
25
+ const re_dfastate_t *init_state,
26
+ char *fastmap);
27
+ static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
28
+ #ifdef RE_ENABLE_I18N
29
+ static void free_charset (re_charset_t *cset);
30
+ #endif /* RE_ENABLE_I18N */
31
+ static void free_workarea_compile (regex_t *preg);
32
+ static reg_errcode_t create_initial_state (re_dfa_t *dfa);
33
+ #ifdef RE_ENABLE_I18N
34
+ static void optimize_utf8 (re_dfa_t *dfa);
35
+ #endif
36
+ static reg_errcode_t analyze (regex_t *preg);
37
+ static reg_errcode_t preorder (bin_tree_t *root,
38
+ reg_errcode_t (fn (void *, bin_tree_t *)),
39
+ void *extra);
40
+ static reg_errcode_t postorder (bin_tree_t *root,
41
+ reg_errcode_t (fn (void *, bin_tree_t *)),
42
+ void *extra);
43
+ static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
44
+ static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
45
+ static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
46
+ bin_tree_t *node);
47
+ static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
48
+ static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
49
+ static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
50
+ static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
51
+ static int search_duplicated_node (const re_dfa_t *dfa, int org_node,
52
+ unsigned int constraint);
53
+ static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
54
+ static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
55
+ int node, int root);
56
+ static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
57
+ static int fetch_number (re_string_t *input, re_token_t *token,
58
+ reg_syntax_t syntax);
59
+ static int peek_token (re_token_t *token, re_string_t *input,
60
+ reg_syntax_t syntax) internal_function;
61
+ static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
62
+ reg_syntax_t syntax, reg_errcode_t *err);
63
+ static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
64
+ re_token_t *token, reg_syntax_t syntax,
65
+ int nest, reg_errcode_t *err);
66
+ static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
67
+ re_token_t *token, reg_syntax_t syntax,
68
+ int nest, reg_errcode_t *err);
69
+ static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
70
+ re_token_t *token, reg_syntax_t syntax,
71
+ int nest, reg_errcode_t *err);
72
+ static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
73
+ re_token_t *token, reg_syntax_t syntax,
74
+ int nest, reg_errcode_t *err);
75
+ static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
76
+ re_dfa_t *dfa, re_token_t *token,
77
+ reg_syntax_t syntax, reg_errcode_t *err);
78
+ static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
79
+ re_token_t *token, reg_syntax_t syntax,
80
+ reg_errcode_t *err);
81
+ static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
82
+ re_string_t *regexp,
83
+ re_token_t *token, int token_len,
84
+ re_dfa_t *dfa,
85
+ reg_syntax_t syntax,
86
+ int accept_hyphen);
87
+ static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
88
+ re_string_t *regexp,
89
+ re_token_t *token);
90
+ #ifdef RE_ENABLE_I18N
91
+ static reg_errcode_t build_equiv_class (bitset_t sbcset,
92
+ re_charset_t *mbcset,
93
+ int *equiv_class_alloc,
94
+ const unsigned char *name);
95
+ static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
96
+ bitset_t sbcset,
97
+ re_charset_t *mbcset,
98
+ int *char_class_alloc,
99
+ const unsigned char *class_name,
100
+ reg_syntax_t syntax);
101
+ #else /* not RE_ENABLE_I18N */
102
+ static reg_errcode_t build_equiv_class (bitset_t sbcset,
103
+ const unsigned char *name);
104
+ static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
105
+ bitset_t sbcset,
106
+ const unsigned char *class_name,
107
+ reg_syntax_t syntax);
108
+ #endif /* not RE_ENABLE_I18N */
109
+ static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
110
+ RE_TRANSLATE_TYPE trans,
111
+ const unsigned char *class_name,
112
+ const unsigned char *extra,
113
+ int non_match, reg_errcode_t *err);
114
+ static bin_tree_t *create_tree (re_dfa_t *dfa,
115
+ bin_tree_t *left, bin_tree_t *right,
116
+ re_token_type_t type);
117
+ static bin_tree_t *create_token_tree (re_dfa_t *dfa,
118
+ bin_tree_t *left, bin_tree_t *right,
119
+ const re_token_t *token);
120
+ static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
121
+ static void free_token (re_token_t *node);
122
+ static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
123
+ static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
124
+
125
+ /* This table gives an error message for each of the error codes listed
126
+ in regex.h. Obviously the order here has to be same as there.
127
+ POSIX doesn't require that we do anything for REG_NOERROR,
128
+ but why not be nice? */
129
+
130
+ const char __re_error_msgid[] attribute_hidden =
131
+ {
132
+ #define REG_NOERROR_IDX 0
133
+ gettext_noop ("Success") /* REG_NOERROR */
134
+ "\0"
135
+ #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
136
+ gettext_noop ("No match") /* REG_NOMATCH */
137
+ "\0"
138
+ #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
139
+ gettext_noop ("Invalid regular expression") /* REG_BADPAT */
140
+ "\0"
141
+ #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
142
+ gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
143
+ "\0"
144
+ #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
145
+ gettext_noop ("Invalid character class name") /* REG_ECTYPE */
146
+ "\0"
147
+ #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
148
+ gettext_noop ("Trailing backslash") /* REG_EESCAPE */
149
+ "\0"
150
+ #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
151
+ gettext_noop ("Invalid back reference") /* REG_ESUBREG */
152
+ "\0"
153
+ #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
154
+ gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
155
+ "\0"
156
+ #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
157
+ gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
158
+ "\0"
159
+ #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
160
+ gettext_noop ("Unmatched \\{") /* REG_EBRACE */
161
+ "\0"
162
+ #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
163
+ gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
164
+ "\0"
165
+ #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
166
+ gettext_noop ("Invalid range end") /* REG_ERANGE */
167
+ "\0"
168
+ #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
169
+ gettext_noop ("Memory exhausted") /* REG_ESPACE */
170
+ "\0"
171
+ #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
172
+ gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
173
+ "\0"
174
+ #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
175
+ gettext_noop ("Premature end of regular expression") /* REG_EEND */
176
+ "\0"
177
+ #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
178
+ gettext_noop ("Regular expression too big") /* REG_ESIZE */
179
+ "\0"
180
+ #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
181
+ gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
182
+ };
183
+
184
+ const size_t __re_error_msgid_idx[] attribute_hidden =
185
+ {
186
+ REG_NOERROR_IDX,
187
+ REG_NOMATCH_IDX,
188
+ REG_BADPAT_IDX,
189
+ REG_ECOLLATE_IDX,
190
+ REG_ECTYPE_IDX,
191
+ REG_EESCAPE_IDX,
192
+ REG_ESUBREG_IDX,
193
+ REG_EBRACK_IDX,
194
+ REG_EPAREN_IDX,
195
+ REG_EBRACE_IDX,
196
+ REG_BADBR_IDX,
197
+ REG_ERANGE_IDX,
198
+ REG_ESPACE_IDX,
199
+ REG_BADRPT_IDX,
200
+ REG_EEND_IDX,
201
+ REG_ESIZE_IDX,
202
+ REG_ERPAREN_IDX
203
+ };
204
+
205
+ /* Entry points for GNU code. */
206
+
207
+ /* re_compile_pattern is the GNU regular expression compiler: it
208
+ compiles PATTERN (of length LENGTH) and puts the result in BUFP.
209
+ Returns 0 if the pattern was valid, otherwise an error string.
210
+
211
+ Assumes the `allocated' (and perhaps `buffer') and `translate' fields
212
+ are set in BUFP on entry. */
213
+
214
+ const char *
215
+ re_compile_pattern (pattern, length, bufp)
216
+ const char *pattern;
217
+ size_t length;
218
+ struct re_pattern_buffer *bufp;
219
+ {
220
+ reg_errcode_t ret;
221
+
222
+ /* And GNU code determines whether or not to get register information
223
+ by passing null for the REGS argument to re_match, etc., not by
224
+ setting no_sub, unless RE_NO_SUB is set. */
225
+ bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
226
+
227
+ /* Match anchors at newline. */
228
+ bufp->newline_anchor = 1;
229
+
230
+ ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
231
+
232
+ if (!ret)
233
+ return NULL;
234
+ return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
235
+ }
236
+ #ifdef _LIBC
237
+ weak_alias (__re_compile_pattern, re_compile_pattern)
238
+ #endif
239
+
240
+ /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
241
+ also be assigned to arbitrarily: each pattern buffer stores its own
242
+ syntax, so it can be changed between regex compilations. */
243
+ /* This has no initializer because initialized variables in Emacs
244
+ become read-only after dumping. */
245
+ reg_syntax_t re_syntax_options;
246
+
247
+
248
+ /* Specify the precise syntax of regexps for compilation. This provides
249
+ for compatibility for various utilities which historically have
250
+ different, incompatible syntaxes.
251
+
252
+ The argument SYNTAX is a bit mask comprised of the various bits
253
+ defined in regex.h. We return the old syntax. */
254
+
255
+ reg_syntax_t
256
+ re_set_syntax (syntax)
257
+ reg_syntax_t syntax;
258
+ {
259
+ reg_syntax_t ret = re_syntax_options;
260
+
261
+ re_syntax_options = syntax;
262
+ return ret;
263
+ }
264
+ #ifdef _LIBC
265
+ weak_alias (__re_set_syntax, re_set_syntax)
266
+ #endif
267
+
268
+ int
269
+ re_compile_fastmap (bufp)
270
+ struct re_pattern_buffer *bufp;
271
+ {
272
+ re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
273
+ char *fastmap = bufp->fastmap;
274
+
275
+ memset (fastmap, '\0', sizeof (char) * SBC_MAX);
276
+ re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
277
+ if (dfa->init_state != dfa->init_state_word)
278
+ re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
279
+ if (dfa->init_state != dfa->init_state_nl)
280
+ re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
281
+ if (dfa->init_state != dfa->init_state_begbuf)
282
+ re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
283
+ bufp->fastmap_accurate = 1;
284
+ return 0;
285
+ }
286
+ #ifdef _LIBC
287
+ weak_alias (__re_compile_fastmap, re_compile_fastmap)
288
+ #endif
289
+
290
+ static inline void
291
+ __attribute ((always_inline))
292
+ re_set_fastmap (char *fastmap, int icase, int ch)
293
+ {
294
+ fastmap[ch] = 1;
295
+ if (icase)
296
+ fastmap[tolower (ch)] = 1;
297
+ }
298
+
299
+ /* Helper function for re_compile_fastmap.
300
+ Compile fastmap for the initial_state INIT_STATE. */
301
+
302
+ static void
303
+ re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
304
+ char *fastmap)
305
+ {
306
+ re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
307
+ int node_cnt;
308
+ int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
309
+ for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
310
+ {
311
+ int node = init_state->nodes.elems[node_cnt];
312
+ re_token_type_t type = dfa->nodes[node].type;
313
+
314
+ if (type == CHARACTER)
315
+ {
316
+ re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
317
+ #ifdef RE_ENABLE_I18N
318
+ if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
319
+ {
320
+ unsigned char *buf = alloca (dfa->mb_cur_max), *p;
321
+ wchar_t wc;
322
+ mbstate_t state;
323
+
324
+ p = buf;
325
+ *p++ = dfa->nodes[node].opr.c;
326
+ while (++node < dfa->nodes_len
327
+ && dfa->nodes[node].type == CHARACTER
328
+ && dfa->nodes[node].mb_partial)
329
+ *p++ = dfa->nodes[node].opr.c;
330
+ memset (&state, '\0', sizeof (state));
331
+ if (__mbrtowc (&wc, (const char *) buf, p - buf,
332
+ &state) == p - buf
333
+ && (__wcrtomb ((char *) buf, towlower (wc), &state)
334
+ != (size_t) -1))
335
+ re_set_fastmap (fastmap, 0, buf[0]);
336
+ }
337
+ #endif
338
+ }
339
+ else if (type == SIMPLE_BRACKET)
340
+ {
341
+ int i, ch;
342
+ for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
343
+ {
344
+ int j;
345
+ bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
346
+ for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
347
+ if (w & ((bitset_word_t) 1 << j))
348
+ re_set_fastmap (fastmap, icase, ch);
349
+ }
350
+ }
351
+ #ifdef RE_ENABLE_I18N
352
+ else if (type == COMPLEX_BRACKET)
353
+ {
354
+ re_charset_t *cset = dfa->nodes[node].opr.mbcset;
355
+ int i;
356
+
357
+ # ifdef _LIBC
358
+ /* See if we have to try all bytes which start multiple collation
359
+ elements.
360
+ e.g. In da_DK, we want to catch 'a' since "aa" is a valid
361
+ collation element, and don't catch 'b' since 'b' is
362
+ the only collation element which starts from 'b' (and
363
+ it is caught by SIMPLE_BRACKET). */
364
+ if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
365
+ && (cset->ncoll_syms || cset->nranges))
366
+ {
367
+ const int32_t *table = (const int32_t *)
368
+ _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
369
+ for (i = 0; i < SBC_MAX; ++i)
370
+ if (table[i] < 0)
371
+ re_set_fastmap (fastmap, icase, i);
372
+ }
373
+ # endif /* _LIBC */
374
+
375
+ /* See if we have to start the match at all multibyte characters,
376
+ i.e. where we would not find an invalid sequence. This only
377
+ applies to multibyte character sets; for single byte character
378
+ sets, the SIMPLE_BRACKET again suffices. */
379
+ if (dfa->mb_cur_max > 1
380
+ && (cset->nchar_classes || cset->non_match
381
+ # ifdef _LIBC
382
+ || cset->nequiv_classes
383
+ # endif /* _LIBC */
384
+ ))
385
+ {
386
+ unsigned char c = 0;
387
+ do
388
+ {
389
+ mbstate_t mbs;
390
+ memset (&mbs, 0, sizeof (mbs));
391
+ if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
392
+ re_set_fastmap (fastmap, false, (int) c);
393
+ }
394
+ while (++c != 0);
395
+ }
396
+
397
+ else
398
+ {
399
+ /* ... Else catch all bytes which can start the mbchars. */
400
+ for (i = 0; i < cset->nmbchars; ++i)
401
+ {
402
+ char buf[256];
403
+ mbstate_t state;
404
+ memset (&state, '\0', sizeof (state));
405
+ if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
406
+ re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
407
+ if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
408
+ {
409
+ if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
410
+ != (size_t) -1)
411
+ re_set_fastmap (fastmap, false, *(unsigned char *) buf);
412
+ }
413
+ }
414
+ }
415
+ }
416
+ #endif /* RE_ENABLE_I18N */
417
+ else if (type == OP_PERIOD
418
+ #ifdef RE_ENABLE_I18N
419
+ || type == OP_UTF8_PERIOD
420
+ #endif /* RE_ENABLE_I18N */
421
+ || type == END_OF_RE)
422
+ {
423
+ memset (fastmap, '\1', sizeof (char) * SBC_MAX);
424
+ if (type == END_OF_RE)
425
+ bufp->can_be_null = 1;
426
+ return;
427
+ }
428
+ }
429
+ }
430
+
431
+ /* Entry point for POSIX code. */
432
+ /* regcomp takes a regular expression as a string and compiles it.
433
+
434
+ PREG is a regex_t *. We do not expect any fields to be initialized,
435
+ since POSIX says we shouldn't. Thus, we set
436
+
437
+ `buffer' to the compiled pattern;
438
+ `used' to the length of the compiled pattern;
439
+ `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
440
+ REG_EXTENDED bit in CFLAGS is set; otherwise, to
441
+ RE_SYNTAX_POSIX_BASIC;
442
+ `newline_anchor' to REG_NEWLINE being set in CFLAGS;
443
+ `fastmap' to an allocated space for the fastmap;
444
+ `fastmap_accurate' to zero;
445
+ `re_nsub' to the number of subexpressions in PATTERN.
446
+
447
+ PATTERN is the address of the pattern string.
448
+
449
+ CFLAGS is a series of bits which affect compilation.
450
+
451
+ If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
452
+ use POSIX basic syntax.
453
+
454
+ If REG_NEWLINE is set, then . and [^...] don't match newline.
455
+ Also, regexec will try a match beginning after every newline.
456
+
457
+ If REG_ICASE is set, then we considers upper- and lowercase
458
+ versions of letters to be equivalent when matching.
459
+
460
+ If REG_NOSUB is set, then when PREG is passed to regexec, that
461
+ routine will report only success or failure, and nothing about the
462
+ registers.
463
+
464
+ It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
465
+ the return codes and their meanings.) */
466
+
467
+ int
468
+ regcomp (preg, pattern, cflags)
469
+ regex_t *__restrict preg;
470
+ const char *__restrict pattern;
471
+ int cflags;
472
+ {
473
+ reg_errcode_t ret;
474
+ reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
475
+ : RE_SYNTAX_POSIX_BASIC);
476
+
477
+ preg->buffer = NULL;
478
+ preg->allocated = 0;
479
+ preg->used = 0;
480
+
481
+ /* Try to allocate space for the fastmap. */
482
+ preg->fastmap = re_malloc (char, SBC_MAX);
483
+ if (BE (preg->fastmap == NULL, 0))
484
+ return REG_ESPACE;
485
+
486
+ syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
487
+
488
+ /* If REG_NEWLINE is set, newlines are treated differently. */
489
+ if (cflags & REG_NEWLINE)
490
+ { /* REG_NEWLINE implies neither . nor [^...] match newline. */
491
+ syntax &= ~RE_DOT_NEWLINE;
492
+ syntax |= RE_HAT_LISTS_NOT_NEWLINE;
493
+ /* It also changes the matching behavior. */
494
+ preg->newline_anchor = 1;
495
+ }
496
+ else
497
+ preg->newline_anchor = 0;
498
+ preg->no_sub = !!(cflags & REG_NOSUB);
499
+ preg->translate = NULL;
500
+
501
+ ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
502
+
503
+ /* POSIX doesn't distinguish between an unmatched open-group and an
504
+ unmatched close-group: both are REG_EPAREN. */
505
+ if (ret == REG_ERPAREN)
506
+ ret = REG_EPAREN;
507
+
508
+ /* We have already checked preg->fastmap != NULL. */
509
+ if (BE (ret == REG_NOERROR, 1))
510
+ /* Compute the fastmap now, since regexec cannot modify the pattern
511
+ buffer. This function never fails in this implementation. */
512
+ (void) re_compile_fastmap (preg);
513
+ else
514
+ {
515
+ /* Some error occurred while compiling the expression. */
516
+ re_free (preg->fastmap);
517
+ preg->fastmap = NULL;
518
+ }
519
+
520
+ return (int) ret;
521
+ }
522
+ #ifdef _LIBC
523
+ weak_alias (__regcomp, regcomp)
524
+ #endif
525
+
526
+ /* Returns a message corresponding to an error code, ERRCODE, returned
527
+ from either regcomp or regexec. We don't use PREG here. */
528
+
529
+ size_t
530
+ regerror (
531
+ int errcode,
532
+ const regex_t *__restrict preg,
533
+ char *__restrict errbuf,
534
+ size_t errbuf_size)
535
+ {
536
+ const char *msg;
537
+ size_t msg_size;
538
+
539
+ if (BE (errcode < 0
540
+ || errcode >= (int) (sizeof (__re_error_msgid_idx)
541
+ / sizeof (__re_error_msgid_idx[0])), 0))
542
+ /* Only error codes returned by the rest of the code should be passed
543
+ to this routine. If we are given anything else, or if other regex
544
+ code generates an invalid error code, then the program has a bug.
545
+ Dump core so we can fix it. */
546
+ abort ();
547
+
548
+ msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
549
+
550
+ msg_size = strlen (msg) + 1; /* Includes the null. */
551
+
552
+ if (BE (errbuf_size != 0, 1))
553
+ {
554
+ if (BE (msg_size > errbuf_size, 0))
555
+ {
556
+ #if defined HAVE_MEMPCPY || defined _LIBC
557
+ *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
558
+ #else
559
+ memcpy (errbuf, msg, errbuf_size - 1);
560
+ errbuf[errbuf_size - 1] = 0;
561
+ #endif
562
+ }
563
+ else
564
+ memcpy (errbuf, msg, msg_size);
565
+ }
566
+
567
+ return msg_size;
568
+ }
569
+ #ifdef _LIBC
570
+ weak_alias (__regerror, regerror)
571
+ #endif
572
+
573
+
574
+ #ifdef RE_ENABLE_I18N
575
+ /* This static array is used for the map to single-byte characters when
576
+ UTF-8 is used. Otherwise we would allocate memory just to initialize
577
+ it the same all the time. UTF-8 is the preferred encoding so this is
578
+ a worthwhile optimization. */
579
+ static const bitset_t utf8_sb_map =
580
+ {
581
+ /* Set the first 128 bits. */
582
+ [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
583
+ };
584
+ #endif
585
+
586
+
587
+ static void
588
+ free_dfa_content (re_dfa_t *dfa)
589
+ {
590
+ int i, j;
591
+
592
+ if (dfa->nodes)
593
+ for (i = 0; i < dfa->nodes_len; ++i)
594
+ free_token (dfa->nodes + i);
595
+ re_free (dfa->nexts);
596
+ for (i = 0; i < dfa->nodes_len; ++i)
597
+ {
598
+ if (dfa->eclosures != NULL)
599
+ re_node_set_free (dfa->eclosures + i);
600
+ if (dfa->inveclosures != NULL)
601
+ re_node_set_free (dfa->inveclosures + i);
602
+ if (dfa->edests != NULL)
603
+ re_node_set_free (dfa->edests + i);
604
+ }
605
+ re_free (dfa->edests);
606
+ re_free (dfa->eclosures);
607
+ re_free (dfa->inveclosures);
608
+ re_free (dfa->nodes);
609
+
610
+ if (dfa->state_table)
611
+ for (i = 0; i <= dfa->state_hash_mask; ++i)
612
+ {
613
+ struct re_state_table_entry *entry = dfa->state_table + i;
614
+ for (j = 0; j < entry->num; ++j)
615
+ {
616
+ re_dfastate_t *state = entry->array[j];
617
+ free_state (state);
618
+ }
619
+ re_free (entry->array);
620
+ }
621
+ re_free (dfa->state_table);
622
+ #ifdef RE_ENABLE_I18N
623
+ if (dfa->sb_char != utf8_sb_map)
624
+ re_free (dfa->sb_char);
625
+ #endif
626
+ re_free (dfa->subexp_map);
627
+ #ifdef DEBUG
628
+ re_free (dfa->re_str);
629
+ #endif
630
+
631
+ re_free (dfa);
632
+ }
633
+
634
+
635
+ /* Free dynamically allocated space used by PREG. */
636
+
637
+ void
638
+ regfree (preg)
639
+ regex_t *preg;
640
+ {
641
+ re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
642
+ if (BE (dfa != NULL, 1))
643
+ free_dfa_content (dfa);
644
+ preg->buffer = NULL;
645
+ preg->allocated = 0;
646
+
647
+ re_free (preg->fastmap);
648
+ preg->fastmap = NULL;
649
+
650
+ re_free (preg->translate);
651
+ preg->translate = NULL;
652
+ }
653
+ #ifdef _LIBC
654
+ weak_alias (__regfree, regfree)
655
+ #endif
656
+
657
+ /* Entry points compatible with 4.2 BSD regex library. We don't define
658
+ them unless specifically requested. */
659
+
660
+ #if defined _REGEX_RE_COMP || defined _LIBC
661
+
662
+ /* BSD has one and only one pattern buffer. */
663
+ static struct re_pattern_buffer re_comp_buf;
664
+
665
+ char *
666
+ # ifdef _LIBC
667
+ /* Make these definitions weak in libc, so POSIX programs can redefine
668
+ these names if they don't use our functions, and still use
669
+ regcomp/regexec above without link errors. */
670
+ weak_function
671
+ # endif
672
+ re_comp (s)
673
+ const char *s;
674
+ {
675
+ reg_errcode_t ret;
676
+ char *fastmap;
677
+
678
+ if (!s)
679
+ {
680
+ if (!re_comp_buf.buffer)
681
+ return gettext ("No previous regular expression");
682
+ return 0;
683
+ }
684
+
685
+ if (re_comp_buf.buffer)
686
+ {
687
+ fastmap = re_comp_buf.fastmap;
688
+ re_comp_buf.fastmap = NULL;
689
+ __regfree (&re_comp_buf);
690
+ memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
691
+ re_comp_buf.fastmap = fastmap;
692
+ }
693
+
694
+ if (re_comp_buf.fastmap == NULL)
695
+ {
696
+ re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
697
+ if (re_comp_buf.fastmap == NULL)
698
+ return (char *) gettext (__re_error_msgid
699
+ + __re_error_msgid_idx[(int) REG_ESPACE]);
700
+ }
701
+
702
+ /* Since `re_exec' always passes NULL for the `regs' argument, we
703
+ don't need to initialize the pattern buffer fields which affect it. */
704
+
705
+ /* Match anchors at newlines. */
706
+ re_comp_buf.newline_anchor = 1;
707
+
708
+ ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
709
+
710
+ if (!ret)
711
+ return NULL;
712
+
713
+ /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
714
+ return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
715
+ }
716
+
717
+ #ifdef _LIBC
718
+ libc_freeres_fn (free_mem)
719
+ {
720
+ __regfree (&re_comp_buf);
721
+ }
722
+ #endif
723
+
724
+ #endif /* _REGEX_RE_COMP */
725
+
726
+ /* Internal entry point.
727
+ Compile the regular expression PATTERN, whose length is LENGTH.
728
+ SYNTAX indicate regular expression's syntax. */
729
+
730
+ static reg_errcode_t
731
+ re_compile_internal (regex_t *preg, const char * pattern, size_t length,
732
+ reg_syntax_t syntax)
733
+ {
734
+ reg_errcode_t err = REG_NOERROR;
735
+ re_dfa_t *dfa;
736
+ re_string_t regexp;
737
+
738
+ /* Initialize the pattern buffer. */
739
+ preg->fastmap_accurate = 0;
740
+ preg->syntax = syntax;
741
+ preg->not_bol = preg->not_eol = 0;
742
+ preg->used = 0;
743
+ preg->re_nsub = 0;
744
+ preg->can_be_null = 0;
745
+ preg->regs_allocated = REGS_UNALLOCATED;
746
+
747
+ /* Initialize the dfa. */
748
+ dfa = (re_dfa_t *) preg->buffer;
749
+ if (BE (preg->allocated < sizeof (re_dfa_t), 0))
750
+ {
751
+ /* If zero allocated, but buffer is non-null, try to realloc
752
+ enough space. This loses if buffer's address is bogus, but
753
+ that is the user's responsibility. If ->buffer is NULL this
754
+ is a simple allocation. */
755
+ dfa = re_realloc (preg->buffer, re_dfa_t, 1);
756
+ if (dfa == NULL)
757
+ return REG_ESPACE;
758
+ preg->allocated = sizeof (re_dfa_t);
759
+ preg->buffer = (unsigned char *) dfa;
760
+ }
761
+ preg->used = sizeof (re_dfa_t);
762
+
763
+ err = init_dfa (dfa, length);
764
+ if (BE (err != REG_NOERROR, 0))
765
+ {
766
+ free_dfa_content (dfa);
767
+ preg->buffer = NULL;
768
+ preg->allocated = 0;
769
+ return err;
770
+ }
771
+ #ifdef DEBUG
772
+ /* Note: length+1 will not overflow since it is checked in init_dfa. */
773
+ dfa->re_str = re_malloc (char, length + 1);
774
+ strncpy (dfa->re_str, pattern, length + 1);
775
+ #endif
776
+
777
+ __libc_lock_init (dfa->lock);
778
+
779
+ err = re_string_construct (&regexp, pattern, length, preg->translate,
780
+ syntax & RE_ICASE, dfa);
781
+ if (BE (err != REG_NOERROR, 0))
782
+ {
783
+ re_compile_internal_free_return:
784
+ free_workarea_compile (preg);
785
+ re_string_destruct (&regexp);
786
+ free_dfa_content (dfa);
787
+ preg->buffer = NULL;
788
+ preg->allocated = 0;
789
+ return err;
790
+ }
791
+
792
+ /* Parse the regular expression, and build a structure tree. */
793
+ preg->re_nsub = 0;
794
+ dfa->str_tree = parse (&regexp, preg, syntax, &err);
795
+ if (BE (dfa->str_tree == NULL, 0))
796
+ goto re_compile_internal_free_return;
797
+
798
+ /* Analyze the tree and create the nfa. */
799
+ err = analyze (preg);
800
+ if (BE (err != REG_NOERROR, 0))
801
+ goto re_compile_internal_free_return;
802
+
803
+ #ifdef RE_ENABLE_I18N
804
+ /* If possible, do searching in single byte encoding to speed things up. */
805
+ if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
806
+ optimize_utf8 (dfa);
807
+ #endif
808
+
809
+ /* Then create the initial state of the dfa. */
810
+ err = create_initial_state (dfa);
811
+
812
+ /* Release work areas. */
813
+ free_workarea_compile (preg);
814
+ re_string_destruct (&regexp);
815
+
816
+ if (BE (err != REG_NOERROR, 0))
817
+ {
818
+ free_dfa_content (dfa);
819
+ preg->buffer = NULL;
820
+ preg->allocated = 0;
821
+ }
822
+
823
+ return err;
824
+ }
825
+
826
+ /* Initialize DFA. We use the length of the regular expression PAT_LEN
827
+ as the initial length of some arrays. */
828
+
829
+ static reg_errcode_t
830
+ init_dfa (re_dfa_t *dfa, size_t pat_len)
831
+ {
832
+ unsigned int table_size;
833
+ #ifndef _LIBC
834
+ char *codeset_name;
835
+ #endif
836
+
837
+ memset (dfa, '\0', sizeof (re_dfa_t));
838
+
839
+ /* Force allocation of str_tree_storage the first time. */
840
+ dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
841
+
842
+ /* Avoid overflows. */
843
+ if (pat_len == SIZE_MAX)
844
+ return REG_ESPACE;
845
+
846
+ dfa->nodes_alloc = pat_len + 1;
847
+ dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
848
+
849
+ /* table_size = 2 ^ ceil(log pat_len) */
850
+ for (table_size = 1; ; table_size <<= 1)
851
+ if (table_size > pat_len)
852
+ break;
853
+
854
+ dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
855
+ dfa->state_hash_mask = table_size - 1;
856
+
857
+ dfa->mb_cur_max = MB_CUR_MAX;
858
+ #ifdef _LIBC
859
+ if (dfa->mb_cur_max == 6
860
+ && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
861
+ dfa->is_utf8 = 1;
862
+ dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
863
+ != 0);
864
+ #else
865
+ # ifdef HAVE_LANGINFO_CODESET
866
+ codeset_name = nl_langinfo (CODESET);
867
+ # else
868
+ codeset_name = getenv ("LC_ALL");
869
+ if (codeset_name == NULL || codeset_name[0] == '\0')
870
+ codeset_name = getenv ("LC_CTYPE");
871
+ if (codeset_name == NULL || codeset_name[0] == '\0')
872
+ codeset_name = getenv ("LANG");
873
+ if (codeset_name == NULL)
874
+ codeset_name = "";
875
+ else if (strchr (codeset_name, '.') != NULL)
876
+ codeset_name = strchr (codeset_name, '.') + 1;
877
+ # endif
878
+
879
+ if (strcasecmp (codeset_name, "UTF-8") == 0
880
+ || strcasecmp (codeset_name, "UTF8") == 0)
881
+ dfa->is_utf8 = 1;
882
+
883
+ /* We check exhaustively in the loop below if this charset is a
884
+ superset of ASCII. */
885
+ dfa->map_notascii = 0;
886
+ #endif
887
+
888
+ #ifdef RE_ENABLE_I18N
889
+ if (dfa->mb_cur_max > 1)
890
+ {
891
+ if (dfa->is_utf8)
892
+ dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
893
+ else
894
+ {
895
+ int i, j, ch;
896
+
897
+ dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
898
+ if (BE (dfa->sb_char == NULL, 0))
899
+ return REG_ESPACE;
900
+
901
+ /* Set the bits corresponding to single byte chars. */
902
+ for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
903
+ for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
904
+ {
905
+ wint_t wch = __btowc (ch);
906
+ if (wch != WEOF)
907
+ dfa->sb_char[i] |= (bitset_word_t) 1 << j;
908
+ # ifndef _LIBC
909
+ if (isascii (ch) && wch != ch)
910
+ dfa->map_notascii = 1;
911
+ # endif
912
+ }
913
+ }
914
+ }
915
+ #endif
916
+
917
+ if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
918
+ return REG_ESPACE;
919
+ return REG_NOERROR;
920
+ }
921
+
922
+ /* Initialize WORD_CHAR table, which indicate which character is
923
+ "word". In this case "word" means that it is the word construction
924
+ character used by some operators like "\<", "\>", etc. */
925
+
926
+ static void
927
+ internal_function
928
+ init_word_char (re_dfa_t *dfa)
929
+ {
930
+ int i, j, ch;
931
+ dfa->word_ops_used = 1;
932
+ for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
933
+ for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
934
+ if (isalnum (ch) || ch == '_')
935
+ dfa->word_char[i] |= (bitset_word_t) 1 << j;
936
+ }
937
+
938
+ /* Free the work area which are only used while compiling. */
939
+
940
+ static void
941
+ free_workarea_compile (regex_t *preg)
942
+ {
943
+ re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
944
+ bin_tree_storage_t *storage, *next;
945
+ for (storage = dfa->str_tree_storage; storage; storage = next)
946
+ {
947
+ next = storage->next;
948
+ re_free (storage);
949
+ }
950
+ dfa->str_tree_storage = NULL;
951
+ dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
952
+ dfa->str_tree = NULL;
953
+ re_free (dfa->org_indices);
954
+ dfa->org_indices = NULL;
955
+ }
956
+
957
+ /* Create initial states for all contexts. */
958
+
959
+ static reg_errcode_t
960
+ create_initial_state (re_dfa_t *dfa)
961
+ {
962
+ int first, i;
963
+ reg_errcode_t err;
964
+ re_node_set init_nodes;
965
+
966
+ /* Initial states have the epsilon closure of the node which is
967
+ the first node of the regular expression. */
968
+ first = dfa->str_tree->first->node_idx;
969
+ dfa->init_node = first;
970
+ err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
971
+ if (BE (err != REG_NOERROR, 0))
972
+ return err;
973
+
974
+ /* The back-references which are in initial states can epsilon transit,
975
+ since in this case all of the subexpressions can be null.
976
+ Then we add epsilon closures of the nodes which are the next nodes of
977
+ the back-references. */
978
+ if (dfa->nbackref > 0)
979
+ for (i = 0; i < init_nodes.nelem; ++i)
980
+ {
981
+ int node_idx = init_nodes.elems[i];
982
+ re_token_type_t type = dfa->nodes[node_idx].type;
983
+
984
+ int clexp_idx;
985
+ if (type != OP_BACK_REF)
986
+ continue;
987
+ for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
988
+ {
989
+ re_token_t *clexp_node;
990
+ clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
991
+ if (clexp_node->type == OP_CLOSE_SUBEXP
992
+ && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
993
+ break;
994
+ }
995
+ if (clexp_idx == init_nodes.nelem)
996
+ continue;
997
+
998
+ if (type == OP_BACK_REF)
999
+ {
1000
+ int dest_idx = dfa->edests[node_idx].elems[0];
1001
+ if (!re_node_set_contains (&init_nodes, dest_idx))
1002
+ {
1003
+ re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1004
+ i = 0;
1005
+ }
1006
+ }
1007
+ }
1008
+
1009
+ /* It must be the first time to invoke acquire_state. */
1010
+ dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1011
+ /* We don't check ERR here, since the initial state must not be NULL. */
1012
+ if (BE (dfa->init_state == NULL, 0))
1013
+ return err;
1014
+ if (dfa->init_state->has_constraint)
1015
+ {
1016
+ dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1017
+ CONTEXT_WORD);
1018
+ dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1019
+ CONTEXT_NEWLINE);
1020
+ dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1021
+ &init_nodes,
1022
+ CONTEXT_NEWLINE
1023
+ | CONTEXT_BEGBUF);
1024
+ if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1025
+ || dfa->init_state_begbuf == NULL, 0))
1026
+ return err;
1027
+ }
1028
+ else
1029
+ dfa->init_state_word = dfa->init_state_nl
1030
+ = dfa->init_state_begbuf = dfa->init_state;
1031
+
1032
+ re_node_set_free (&init_nodes);
1033
+ return REG_NOERROR;
1034
+ }
1035
+
1036
+ #ifdef RE_ENABLE_I18N
1037
+ /* If it is possible to do searching in single byte encoding instead of UTF-8
1038
+ to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1039
+ DFA nodes where needed. */
1040
+
1041
+ static void
1042
+ optimize_utf8 (re_dfa_t *dfa)
1043
+ {
1044
+ int node, i, mb_chars = 0, has_period = 0;
1045
+
1046
+ for (node = 0; node < dfa->nodes_len; ++node)
1047
+ switch (dfa->nodes[node].type)
1048
+ {
1049
+ case CHARACTER:
1050
+ if (dfa->nodes[node].opr.c >= 0x80)
1051
+ mb_chars = 1;
1052
+ break;
1053
+ case ANCHOR:
1054
+ switch (dfa->nodes[node].opr.ctx_type)
1055
+ {
1056
+ case LINE_FIRST:
1057
+ case LINE_LAST:
1058
+ case BUF_FIRST:
1059
+ case BUF_LAST:
1060
+ break;
1061
+ default:
1062
+ /* Word anchors etc. cannot be handled. It's okay to test
1063
+ opr.ctx_type since constraints (for all DFA nodes) are
1064
+ created by ORing one or more opr.ctx_type values. */
1065
+ return;
1066
+ }
1067
+ break;
1068
+ case OP_PERIOD:
1069
+ has_period = 1;
1070
+ break;
1071
+ case OP_BACK_REF:
1072
+ case OP_ALT:
1073
+ case END_OF_RE:
1074
+ case OP_DUP_ASTERISK:
1075
+ case OP_OPEN_SUBEXP:
1076
+ case OP_CLOSE_SUBEXP:
1077
+ break;
1078
+ case COMPLEX_BRACKET:
1079
+ return;
1080
+ case SIMPLE_BRACKET:
1081
+ /* Just double check. The non-ASCII range starts at 0x80. */
1082
+ assert (0x80 % BITSET_WORD_BITS == 0);
1083
+ for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1084
+ if (dfa->nodes[node].opr.sbcset[i])
1085
+ return;
1086
+ break;
1087
+ default:
1088
+ abort ();
1089
+ }
1090
+
1091
+ if (mb_chars || has_period)
1092
+ for (node = 0; node < dfa->nodes_len; ++node)
1093
+ {
1094
+ if (dfa->nodes[node].type == CHARACTER
1095
+ && dfa->nodes[node].opr.c >= 0x80)
1096
+ dfa->nodes[node].mb_partial = 0;
1097
+ else if (dfa->nodes[node].type == OP_PERIOD)
1098
+ dfa->nodes[node].type = OP_UTF8_PERIOD;
1099
+ }
1100
+
1101
+ /* The search can be in single byte locale. */
1102
+ dfa->mb_cur_max = 1;
1103
+ dfa->is_utf8 = 0;
1104
+ dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1105
+ }
1106
+ #endif
1107
+
1108
+ /* Analyze the structure tree, and calculate "first", "next", "edest",
1109
+ "eclosure", and "inveclosure". */
1110
+
1111
+ static reg_errcode_t
1112
+ analyze (regex_t *preg)
1113
+ {
1114
+ re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1115
+ reg_errcode_t ret;
1116
+
1117
+ /* Allocate arrays. */
1118
+ dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1119
+ dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1120
+ dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1121
+ dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1122
+ if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1123
+ || dfa->eclosures == NULL, 0))
1124
+ return REG_ESPACE;
1125
+
1126
+ dfa->subexp_map = re_malloc (int, preg->re_nsub);
1127
+ if (dfa->subexp_map != NULL)
1128
+ {
1129
+ int i;
1130
+ for (i = 0; i < preg->re_nsub; i++)
1131
+ dfa->subexp_map[i] = i;
1132
+ preorder (dfa->str_tree, optimize_subexps, dfa);
1133
+ for (i = 0; i < preg->re_nsub; i++)
1134
+ if (dfa->subexp_map[i] != i)
1135
+ break;
1136
+ if (i == preg->re_nsub)
1137
+ {
1138
+ free (dfa->subexp_map);
1139
+ dfa->subexp_map = NULL;
1140
+ }
1141
+ }
1142
+
1143
+ ret = postorder (dfa->str_tree, lower_subexps, preg);
1144
+ if (BE (ret != REG_NOERROR, 0))
1145
+ return ret;
1146
+ ret = postorder (dfa->str_tree, calc_first, dfa);
1147
+ if (BE (ret != REG_NOERROR, 0))
1148
+ return ret;
1149
+ preorder (dfa->str_tree, calc_next, dfa);
1150
+ ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1151
+ if (BE (ret != REG_NOERROR, 0))
1152
+ return ret;
1153
+ ret = calc_eclosure (dfa);
1154
+ if (BE (ret != REG_NOERROR, 0))
1155
+ return ret;
1156
+
1157
+ /* We only need this during the prune_impossible_nodes pass in regexec.c;
1158
+ skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1159
+ if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1160
+ || dfa->nbackref)
1161
+ {
1162
+ dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1163
+ if (BE (dfa->inveclosures == NULL, 0))
1164
+ return REG_ESPACE;
1165
+ ret = calc_inveclosure (dfa);
1166
+ }
1167
+
1168
+ return ret;
1169
+ }
1170
+
1171
+ /* Our parse trees are very unbalanced, so we cannot use a stack to
1172
+ implement parse tree visits. Instead, we use parent pointers and
1173
+ some hairy code in these two functions. */
1174
+ static reg_errcode_t
1175
+ postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1176
+ void *extra)
1177
+ {
1178
+ bin_tree_t *node, *prev;
1179
+
1180
+ for (node = root; ; )
1181
+ {
1182
+ /* Descend down the tree, preferably to the left (or to the right
1183
+ if that's the only child). */
1184
+ while (node->left || node->right)
1185
+ if (node->left)
1186
+ node = node->left;
1187
+ else
1188
+ node = node->right;
1189
+
1190
+ do
1191
+ {
1192
+ reg_errcode_t err = fn (extra, node);
1193
+ if (BE (err != REG_NOERROR, 0))
1194
+ return err;
1195
+ if (node->parent == NULL)
1196
+ return REG_NOERROR;
1197
+ prev = node;
1198
+ node = node->parent;
1199
+ }
1200
+ /* Go up while we have a node that is reached from the right. */
1201
+ while (node->right == prev || node->right == NULL);
1202
+ node = node->right;
1203
+ }
1204
+ }
1205
+
1206
+ static reg_errcode_t
1207
+ preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1208
+ void *extra)
1209
+ {
1210
+ bin_tree_t *node;
1211
+
1212
+ for (node = root; ; )
1213
+ {
1214
+ reg_errcode_t err = fn (extra, node);
1215
+ if (BE (err != REG_NOERROR, 0))
1216
+ return err;
1217
+
1218
+ /* Go to the left node, or up and to the right. */
1219
+ if (node->left)
1220
+ node = node->left;
1221
+ else
1222
+ {
1223
+ bin_tree_t *prev = NULL;
1224
+ while (node->right == prev || node->right == NULL)
1225
+ {
1226
+ prev = node;
1227
+ node = node->parent;
1228
+ if (!node)
1229
+ return REG_NOERROR;
1230
+ }
1231
+ node = node->right;
1232
+ }
1233
+ }
1234
+ }
1235
+
1236
+ /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1237
+ re_search_internal to map the inner one's opr.idx to this one's. Adjust
1238
+ backreferences as well. Requires a preorder visit. */
1239
+ static reg_errcode_t
1240
+ optimize_subexps (void *extra, bin_tree_t *node)
1241
+ {
1242
+ re_dfa_t *dfa = (re_dfa_t *) extra;
1243
+
1244
+ if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1245
+ {
1246
+ int idx = node->token.opr.idx;
1247
+ node->token.opr.idx = dfa->subexp_map[idx];
1248
+ dfa->used_bkref_map |= 1 << node->token.opr.idx;
1249
+ }
1250
+
1251
+ else if (node->token.type == SUBEXP
1252
+ && node->left && node->left->token.type == SUBEXP)
1253
+ {
1254
+ int other_idx = node->left->token.opr.idx;
1255
+
1256
+ node->left = node->left->left;
1257
+ if (node->left)
1258
+ node->left->parent = node;
1259
+
1260
+ dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1261
+ if (other_idx < BITSET_WORD_BITS)
1262
+ dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1263
+ }
1264
+
1265
+ return REG_NOERROR;
1266
+ }
1267
+
1268
+ /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1269
+ of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1270
+ static reg_errcode_t
1271
+ lower_subexps (void *extra, bin_tree_t *node)
1272
+ {
1273
+ regex_t *preg = (regex_t *) extra;
1274
+ reg_errcode_t err = REG_NOERROR;
1275
+
1276
+ if (node->left && node->left->token.type == SUBEXP)
1277
+ {
1278
+ node->left = lower_subexp (&err, preg, node->left);
1279
+ if (node->left)
1280
+ node->left->parent = node;
1281
+ }
1282
+ if (node->right && node->right->token.type == SUBEXP)
1283
+ {
1284
+ node->right = lower_subexp (&err, preg, node->right);
1285
+ if (node->right)
1286
+ node->right->parent = node;
1287
+ }
1288
+
1289
+ return err;
1290
+ }
1291
+
1292
+ static bin_tree_t *
1293
+ lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1294
+ {
1295
+ re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1296
+ bin_tree_t *body = node->left;
1297
+ bin_tree_t *op, *cls, *tree1, *tree;
1298
+
1299
+ if (preg->no_sub
1300
+ /* We do not optimize empty subexpressions, because otherwise we may
1301
+ have bad CONCAT nodes with NULL children. This is obviously not
1302
+ very common, so we do not lose much. An example that triggers
1303
+ this case is the sed "script" /\(\)/x. */
1304
+ && node->left != NULL
1305
+ && (node->token.opr.idx >= BITSET_WORD_BITS
1306
+ || !(dfa->used_bkref_map
1307
+ & ((bitset_word_t) 1 << node->token.opr.idx))))
1308
+ return node->left;
1309
+
1310
+ /* Convert the SUBEXP node to the concatenation of an
1311
+ OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1312
+ op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1313
+ cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1314
+ tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1315
+ tree = create_tree (dfa, op, tree1, CONCAT);
1316
+ if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1317
+ {
1318
+ *err = REG_ESPACE;
1319
+ return NULL;
1320
+ }
1321
+
1322
+ op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1323
+ op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1324
+ return tree;
1325
+ }
1326
+
1327
+ /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1328
+ nodes. Requires a postorder visit. */
1329
+ static reg_errcode_t
1330
+ calc_first (void *extra, bin_tree_t *node)
1331
+ {
1332
+ re_dfa_t *dfa = (re_dfa_t *) extra;
1333
+ if (node->token.type == CONCAT)
1334
+ {
1335
+ node->first = node->left->first;
1336
+ node->node_idx = node->left->node_idx;
1337
+ }
1338
+ else
1339
+ {
1340
+ node->first = node;
1341
+ node->node_idx = re_dfa_add_node (dfa, node->token);
1342
+ if (BE (node->node_idx == -1, 0))
1343
+ return REG_ESPACE;
1344
+ if (node->token.type == ANCHOR)
1345
+ dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1346
+ }
1347
+ return REG_NOERROR;
1348
+ }
1349
+
1350
+ /* Pass 2: compute NEXT on the tree. Preorder visit. */
1351
+ static reg_errcode_t
1352
+ calc_next (void *extra, bin_tree_t *node)
1353
+ {
1354
+ switch (node->token.type)
1355
+ {
1356
+ case OP_DUP_ASTERISK:
1357
+ node->left->next = node;
1358
+ break;
1359
+ case CONCAT:
1360
+ node->left->next = node->right->first;
1361
+ node->right->next = node->next;
1362
+ break;
1363
+ default:
1364
+ if (node->left)
1365
+ node->left->next = node->next;
1366
+ if (node->right)
1367
+ node->right->next = node->next;
1368
+ break;
1369
+ }
1370
+ return REG_NOERROR;
1371
+ }
1372
+
1373
+ /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1374
+ static reg_errcode_t
1375
+ link_nfa_nodes (void *extra, bin_tree_t *node)
1376
+ {
1377
+ re_dfa_t *dfa = (re_dfa_t *) extra;
1378
+ int idx = node->node_idx;
1379
+ reg_errcode_t err = REG_NOERROR;
1380
+
1381
+ switch (node->token.type)
1382
+ {
1383
+ case CONCAT:
1384
+ break;
1385
+
1386
+ case END_OF_RE:
1387
+ assert (node->next == NULL);
1388
+ break;
1389
+
1390
+ case OP_DUP_ASTERISK:
1391
+ case OP_ALT:
1392
+ {
1393
+ int left, right;
1394
+ dfa->has_plural_match = 1;
1395
+ if (node->left != NULL)
1396
+ left = node->left->first->node_idx;
1397
+ else
1398
+ left = node->next->node_idx;
1399
+ if (node->right != NULL)
1400
+ right = node->right->first->node_idx;
1401
+ else
1402
+ right = node->next->node_idx;
1403
+ assert (left > -1);
1404
+ assert (right > -1);
1405
+ err = re_node_set_init_2 (dfa->edests + idx, left, right);
1406
+ }
1407
+ break;
1408
+
1409
+ case ANCHOR:
1410
+ case OP_OPEN_SUBEXP:
1411
+ case OP_CLOSE_SUBEXP:
1412
+ err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1413
+ break;
1414
+
1415
+ case OP_BACK_REF:
1416
+ dfa->nexts[idx] = node->next->node_idx;
1417
+ if (node->token.type == OP_BACK_REF)
1418
+ re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1419
+ break;
1420
+
1421
+ default:
1422
+ assert (!IS_EPSILON_NODE (node->token.type));
1423
+ dfa->nexts[idx] = node->next->node_idx;
1424
+ break;
1425
+ }
1426
+
1427
+ return err;
1428
+ }
1429
+
1430
+ /* Duplicate the epsilon closure of the node ROOT_NODE.
1431
+ Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1432
+ to their own constraint. */
1433
+
1434
+ static reg_errcode_t
1435
+ internal_function
1436
+ duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
1437
+ int root_node, unsigned int init_constraint)
1438
+ {
1439
+ int org_node, clone_node, ret;
1440
+ unsigned int constraint = init_constraint;
1441
+ for (org_node = top_org_node, clone_node = top_clone_node;;)
1442
+ {
1443
+ int org_dest, clone_dest;
1444
+ if (dfa->nodes[org_node].type == OP_BACK_REF)
1445
+ {
1446
+ /* If the back reference epsilon-transit, its destination must
1447
+ also have the constraint. Then duplicate the epsilon closure
1448
+ of the destination of the back reference, and store it in
1449
+ edests of the back reference. */
1450
+ org_dest = dfa->nexts[org_node];
1451
+ re_node_set_empty (dfa->edests + clone_node);
1452
+ clone_dest = duplicate_node (dfa, org_dest, constraint);
1453
+ if (BE (clone_dest == -1, 0))
1454
+ return REG_ESPACE;
1455
+ dfa->nexts[clone_node] = dfa->nexts[org_node];
1456
+ ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1457
+ if (BE (ret < 0, 0))
1458
+ return REG_ESPACE;
1459
+ }
1460
+ else if (dfa->edests[org_node].nelem == 0)
1461
+ {
1462
+ /* In case of the node can't epsilon-transit, don't duplicate the
1463
+ destination and store the original destination as the
1464
+ destination of the node. */
1465
+ dfa->nexts[clone_node] = dfa->nexts[org_node];
1466
+ break;
1467
+ }
1468
+ else if (dfa->edests[org_node].nelem == 1)
1469
+ {
1470
+ /* In case of the node can epsilon-transit, and it has only one
1471
+ destination. */
1472
+ org_dest = dfa->edests[org_node].elems[0];
1473
+ re_node_set_empty (dfa->edests + clone_node);
1474
+ /* If the node is root_node itself, it means the epsilon clsoure
1475
+ has a loop. Then tie it to the destination of the root_node. */
1476
+ if (org_node == root_node && clone_node != org_node)
1477
+ {
1478
+ ret = re_node_set_insert (dfa->edests + clone_node, org_dest);
1479
+ if (BE (ret < 0, 0))
1480
+ return REG_ESPACE;
1481
+ break;
1482
+ }
1483
+ /* In case of the node has another constraint, add it. */
1484
+ constraint |= dfa->nodes[org_node].constraint;
1485
+ clone_dest = duplicate_node (dfa, org_dest, constraint);
1486
+ if (BE (clone_dest == -1, 0))
1487
+ return REG_ESPACE;
1488
+ ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1489
+ if (BE (ret < 0, 0))
1490
+ return REG_ESPACE;
1491
+ }
1492
+ else /* dfa->edests[org_node].nelem == 2 */
1493
+ {
1494
+ /* In case of the node can epsilon-transit, and it has two
1495
+ destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1496
+ org_dest = dfa->edests[org_node].elems[0];
1497
+ re_node_set_empty (dfa->edests + clone_node);
1498
+ /* Search for a duplicated node which satisfies the constraint. */
1499
+ clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1500
+ if (clone_dest == -1)
1501
+ {
1502
+ /* There is no such duplicated node, create a new one. */
1503
+ reg_errcode_t err;
1504
+ clone_dest = duplicate_node (dfa, org_dest, constraint);
1505
+ if (BE (clone_dest == -1, 0))
1506
+ return REG_ESPACE;
1507
+ ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1508
+ if (BE (ret < 0, 0))
1509
+ return REG_ESPACE;
1510
+ err = duplicate_node_closure (dfa, org_dest, clone_dest,
1511
+ root_node, constraint);
1512
+ if (BE (err != REG_NOERROR, 0))
1513
+ return err;
1514
+ }
1515
+ else
1516
+ {
1517
+ /* There is a duplicated node which satisfies the constraint,
1518
+ use it to avoid infinite loop. */
1519
+ ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1520
+ if (BE (ret < 0, 0))
1521
+ return REG_ESPACE;
1522
+ }
1523
+
1524
+ org_dest = dfa->edests[org_node].elems[1];
1525
+ clone_dest = duplicate_node (dfa, org_dest, constraint);
1526
+ if (BE (clone_dest == -1, 0))
1527
+ return REG_ESPACE;
1528
+ ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1529
+ if (BE (ret < 0, 0))
1530
+ return REG_ESPACE;
1531
+ }
1532
+ org_node = org_dest;
1533
+ clone_node = clone_dest;
1534
+ }
1535
+ return REG_NOERROR;
1536
+ }
1537
+
1538
+ /* Search for a node which is duplicated from the node ORG_NODE, and
1539
+ satisfies the constraint CONSTRAINT. */
1540
+
1541
+ static int
1542
+ search_duplicated_node (const re_dfa_t *dfa, int org_node,
1543
+ unsigned int constraint)
1544
+ {
1545
+ int idx;
1546
+ for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1547
+ {
1548
+ if (org_node == dfa->org_indices[idx]
1549
+ && constraint == dfa->nodes[idx].constraint)
1550
+ return idx; /* Found. */
1551
+ }
1552
+ return -1; /* Not found. */
1553
+ }
1554
+
1555
+ /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1556
+ Return the index of the new node, or -1 if insufficient storage is
1557
+ available. */
1558
+
1559
+ static int
1560
+ duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
1561
+ {
1562
+ int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1563
+ if (BE (dup_idx != -1, 1))
1564
+ {
1565
+ dfa->nodes[dup_idx].constraint = constraint;
1566
+ dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1567
+ dfa->nodes[dup_idx].duplicated = 1;
1568
+
1569
+ /* Store the index of the original node. */
1570
+ dfa->org_indices[dup_idx] = org_idx;
1571
+ }
1572
+ return dup_idx;
1573
+ }
1574
+
1575
+ static reg_errcode_t
1576
+ calc_inveclosure (re_dfa_t *dfa)
1577
+ {
1578
+ int src, idx, ret;
1579
+ for (idx = 0; idx < dfa->nodes_len; ++idx)
1580
+ re_node_set_init_empty (dfa->inveclosures + idx);
1581
+
1582
+ for (src = 0; src < dfa->nodes_len; ++src)
1583
+ {
1584
+ int *elems = dfa->eclosures[src].elems;
1585
+ for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1586
+ {
1587
+ ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1588
+ if (BE (ret == -1, 0))
1589
+ return REG_ESPACE;
1590
+ }
1591
+ }
1592
+
1593
+ return REG_NOERROR;
1594
+ }
1595
+
1596
+ /* Calculate "eclosure" for all the node in DFA. */
1597
+
1598
+ static reg_errcode_t
1599
+ calc_eclosure (re_dfa_t *dfa)
1600
+ {
1601
+ int node_idx, incomplete;
1602
+ #ifdef DEBUG
1603
+ assert (dfa->nodes_len > 0);
1604
+ #endif
1605
+ incomplete = 0;
1606
+ /* For each nodes, calculate epsilon closure. */
1607
+ for (node_idx = 0; ; ++node_idx)
1608
+ {
1609
+ reg_errcode_t err;
1610
+ re_node_set eclosure_elem;
1611
+ if (node_idx == dfa->nodes_len)
1612
+ {
1613
+ if (!incomplete)
1614
+ break;
1615
+ incomplete = 0;
1616
+ node_idx = 0;
1617
+ }
1618
+
1619
+ #ifdef DEBUG
1620
+ assert (dfa->eclosures[node_idx].nelem != -1);
1621
+ #endif
1622
+
1623
+ /* If we have already calculated, skip it. */
1624
+ if (dfa->eclosures[node_idx].nelem != 0)
1625
+ continue;
1626
+ /* Calculate epsilon closure of `node_idx'. */
1627
+ err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1628
+ if (BE (err != REG_NOERROR, 0))
1629
+ return err;
1630
+
1631
+ if (dfa->eclosures[node_idx].nelem == 0)
1632
+ {
1633
+ incomplete = 1;
1634
+ re_node_set_free (&eclosure_elem);
1635
+ }
1636
+ }
1637
+ return REG_NOERROR;
1638
+ }
1639
+
1640
+ /* Calculate epsilon closure of NODE. */
1641
+
1642
+ static reg_errcode_t
1643
+ calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
1644
+ {
1645
+ reg_errcode_t err;
1646
+ int i, incomplete;
1647
+ re_node_set eclosure;
1648
+ incomplete = 0;
1649
+ err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1650
+ if (BE (err != REG_NOERROR, 0))
1651
+ return err;
1652
+
1653
+ /* This indicates that we are calculating this node now.
1654
+ We reference this value to avoid infinite loop. */
1655
+ dfa->eclosures[node].nelem = -1;
1656
+
1657
+ /* If the current node has constraints, duplicate all nodes
1658
+ since they must inherit the constraints. */
1659
+ if (dfa->nodes[node].constraint
1660
+ && dfa->edests[node].nelem
1661
+ && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1662
+ {
1663
+ err = duplicate_node_closure (dfa, node, node, node,
1664
+ dfa->nodes[node].constraint);
1665
+ if (BE (err != REG_NOERROR, 0))
1666
+ return err;
1667
+ }
1668
+
1669
+ /* Expand each epsilon destination nodes. */
1670
+ if (IS_EPSILON_NODE(dfa->nodes[node].type))
1671
+ for (i = 0; i < dfa->edests[node].nelem; ++i)
1672
+ {
1673
+ re_node_set eclosure_elem;
1674
+ int edest = dfa->edests[node].elems[i];
1675
+ /* If calculating the epsilon closure of `edest' is in progress,
1676
+ return intermediate result. */
1677
+ if (dfa->eclosures[edest].nelem == -1)
1678
+ {
1679
+ incomplete = 1;
1680
+ continue;
1681
+ }
1682
+ /* If we haven't calculated the epsilon closure of `edest' yet,
1683
+ calculate now. Otherwise use calculated epsilon closure. */
1684
+ if (dfa->eclosures[edest].nelem == 0)
1685
+ {
1686
+ err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1687
+ if (BE (err != REG_NOERROR, 0))
1688
+ return err;
1689
+ }
1690
+ else
1691
+ eclosure_elem = dfa->eclosures[edest];
1692
+ /* Merge the epsilon closure of `edest'. */
1693
+ re_node_set_merge (&eclosure, &eclosure_elem);
1694
+ /* If the epsilon closure of `edest' is incomplete,
1695
+ the epsilon closure of this node is also incomplete. */
1696
+ if (dfa->eclosures[edest].nelem == 0)
1697
+ {
1698
+ incomplete = 1;
1699
+ re_node_set_free (&eclosure_elem);
1700
+ }
1701
+ }
1702
+
1703
+ /* Epsilon closures include itself. */
1704
+ re_node_set_insert (&eclosure, node);
1705
+ if (incomplete && !root)
1706
+ dfa->eclosures[node].nelem = 0;
1707
+ else
1708
+ dfa->eclosures[node] = eclosure;
1709
+ *new_set = eclosure;
1710
+ return REG_NOERROR;
1711
+ }
1712
+
1713
+ /* Functions for token which are used in the parser. */
1714
+
1715
+ /* Fetch a token from INPUT.
1716
+ We must not use this function inside bracket expressions. */
1717
+
1718
+ static void
1719
+ internal_function
1720
+ fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1721
+ {
1722
+ re_string_skip_bytes (input, peek_token (result, input, syntax));
1723
+ }
1724
+
1725
+ /* Peek a token from INPUT, and return the length of the token.
1726
+ We must not use this function inside bracket expressions. */
1727
+
1728
+ static int
1729
+ internal_function
1730
+ peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1731
+ {
1732
+ unsigned char c;
1733
+
1734
+ if (re_string_eoi (input))
1735
+ {
1736
+ token->type = END_OF_RE;
1737
+ return 0;
1738
+ }
1739
+
1740
+ c = re_string_peek_byte (input, 0);
1741
+ token->opr.c = c;
1742
+
1743
+ token->word_char = 0;
1744
+ #ifdef RE_ENABLE_I18N
1745
+ token->mb_partial = 0;
1746
+ if (input->mb_cur_max > 1 &&
1747
+ !re_string_first_byte (input, re_string_cur_idx (input)))
1748
+ {
1749
+ token->type = CHARACTER;
1750
+ token->mb_partial = 1;
1751
+ return 1;
1752
+ }
1753
+ #endif
1754
+ if (c == '\\')
1755
+ {
1756
+ unsigned char c2;
1757
+ if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1758
+ {
1759
+ token->type = BACK_SLASH;
1760
+ return 1;
1761
+ }
1762
+
1763
+ c2 = re_string_peek_byte_case (input, 1);
1764
+ token->opr.c = c2;
1765
+ token->type = CHARACTER;
1766
+ #ifdef RE_ENABLE_I18N
1767
+ if (input->mb_cur_max > 1)
1768
+ {
1769
+ wint_t wc = re_string_wchar_at (input,
1770
+ re_string_cur_idx (input) + 1);
1771
+ token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1772
+ }
1773
+ else
1774
+ #endif
1775
+ token->word_char = IS_WORD_CHAR (c2) != 0;
1776
+
1777
+ switch (c2)
1778
+ {
1779
+ case '|':
1780
+ if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1781
+ token->type = OP_ALT;
1782
+ break;
1783
+ case '1': case '2': case '3': case '4': case '5':
1784
+ case '6': case '7': case '8': case '9':
1785
+ if (!(syntax & RE_NO_BK_REFS))
1786
+ {
1787
+ token->type = OP_BACK_REF;
1788
+ token->opr.idx = c2 - '1';
1789
+ }
1790
+ break;
1791
+ case '<':
1792
+ if (!(syntax & RE_NO_GNU_OPS))
1793
+ {
1794
+ token->type = ANCHOR;
1795
+ token->opr.ctx_type = WORD_FIRST;
1796
+ }
1797
+ break;
1798
+ case '>':
1799
+ if (!(syntax & RE_NO_GNU_OPS))
1800
+ {
1801
+ token->type = ANCHOR;
1802
+ token->opr.ctx_type = WORD_LAST;
1803
+ }
1804
+ break;
1805
+ case 'b':
1806
+ if (!(syntax & RE_NO_GNU_OPS))
1807
+ {
1808
+ token->type = ANCHOR;
1809
+ token->opr.ctx_type = WORD_DELIM;
1810
+ }
1811
+ break;
1812
+ case 'B':
1813
+ if (!(syntax & RE_NO_GNU_OPS))
1814
+ {
1815
+ token->type = ANCHOR;
1816
+ token->opr.ctx_type = NOT_WORD_DELIM;
1817
+ }
1818
+ break;
1819
+ case 'w':
1820
+ if (!(syntax & RE_NO_GNU_OPS))
1821
+ token->type = OP_WORD;
1822
+ break;
1823
+ case 'W':
1824
+ if (!(syntax & RE_NO_GNU_OPS))
1825
+ token->type = OP_NOTWORD;
1826
+ break;
1827
+ case 's':
1828
+ if (!(syntax & RE_NO_GNU_OPS))
1829
+ token->type = OP_SPACE;
1830
+ break;
1831
+ case 'S':
1832
+ if (!(syntax & RE_NO_GNU_OPS))
1833
+ token->type = OP_NOTSPACE;
1834
+ break;
1835
+ case '`':
1836
+ if (!(syntax & RE_NO_GNU_OPS))
1837
+ {
1838
+ token->type = ANCHOR;
1839
+ token->opr.ctx_type = BUF_FIRST;
1840
+ }
1841
+ break;
1842
+ case '\'':
1843
+ if (!(syntax & RE_NO_GNU_OPS))
1844
+ {
1845
+ token->type = ANCHOR;
1846
+ token->opr.ctx_type = BUF_LAST;
1847
+ }
1848
+ break;
1849
+ case '(':
1850
+ if (!(syntax & RE_NO_BK_PARENS))
1851
+ token->type = OP_OPEN_SUBEXP;
1852
+ break;
1853
+ case ')':
1854
+ if (!(syntax & RE_NO_BK_PARENS))
1855
+ token->type = OP_CLOSE_SUBEXP;
1856
+ break;
1857
+ case '+':
1858
+ if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1859
+ token->type = OP_DUP_PLUS;
1860
+ break;
1861
+ case '?':
1862
+ if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1863
+ token->type = OP_DUP_QUESTION;
1864
+ break;
1865
+ case '{':
1866
+ if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1867
+ token->type = OP_OPEN_DUP_NUM;
1868
+ break;
1869
+ case '}':
1870
+ if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1871
+ token->type = OP_CLOSE_DUP_NUM;
1872
+ break;
1873
+ default:
1874
+ break;
1875
+ }
1876
+ return 2;
1877
+ }
1878
+
1879
+ token->type = CHARACTER;
1880
+ #ifdef RE_ENABLE_I18N
1881
+ if (input->mb_cur_max > 1)
1882
+ {
1883
+ wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1884
+ token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1885
+ }
1886
+ else
1887
+ #endif
1888
+ token->word_char = IS_WORD_CHAR (token->opr.c);
1889
+
1890
+ switch (c)
1891
+ {
1892
+ case '\n':
1893
+ if (syntax & RE_NEWLINE_ALT)
1894
+ token->type = OP_ALT;
1895
+ break;
1896
+ case '|':
1897
+ if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1898
+ token->type = OP_ALT;
1899
+ break;
1900
+ case '*':
1901
+ token->type = OP_DUP_ASTERISK;
1902
+ break;
1903
+ case '+':
1904
+ if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1905
+ token->type = OP_DUP_PLUS;
1906
+ break;
1907
+ case '?':
1908
+ if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1909
+ token->type = OP_DUP_QUESTION;
1910
+ break;
1911
+ case '{':
1912
+ if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1913
+ token->type = OP_OPEN_DUP_NUM;
1914
+ break;
1915
+ case '}':
1916
+ if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1917
+ token->type = OP_CLOSE_DUP_NUM;
1918
+ break;
1919
+ case '(':
1920
+ if (syntax & RE_NO_BK_PARENS)
1921
+ token->type = OP_OPEN_SUBEXP;
1922
+ break;
1923
+ case ')':
1924
+ if (syntax & RE_NO_BK_PARENS)
1925
+ token->type = OP_CLOSE_SUBEXP;
1926
+ break;
1927
+ case '[':
1928
+ token->type = OP_OPEN_BRACKET;
1929
+ break;
1930
+ case '.':
1931
+ token->type = OP_PERIOD;
1932
+ break;
1933
+ case '^':
1934
+ if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1935
+ re_string_cur_idx (input) != 0)
1936
+ {
1937
+ char prev = re_string_peek_byte (input, -1);
1938
+ if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1939
+ break;
1940
+ }
1941
+ token->type = ANCHOR;
1942
+ token->opr.ctx_type = LINE_FIRST;
1943
+ break;
1944
+ case '$':
1945
+ if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1946
+ re_string_cur_idx (input) + 1 != re_string_length (input))
1947
+ {
1948
+ re_token_t next;
1949
+ re_string_skip_bytes (input, 1);
1950
+ peek_token (&next, input, syntax);
1951
+ re_string_skip_bytes (input, -1);
1952
+ if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1953
+ break;
1954
+ }
1955
+ token->type = ANCHOR;
1956
+ token->opr.ctx_type = LINE_LAST;
1957
+ break;
1958
+ default:
1959
+ break;
1960
+ }
1961
+ return 1;
1962
+ }
1963
+
1964
+ /* Peek a token from INPUT, and return the length of the token.
1965
+ We must not use this function out of bracket expressions. */
1966
+
1967
+ static int
1968
+ internal_function
1969
+ peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1970
+ {
1971
+ unsigned char c;
1972
+ if (re_string_eoi (input))
1973
+ {
1974
+ token->type = END_OF_RE;
1975
+ return 0;
1976
+ }
1977
+ c = re_string_peek_byte (input, 0);
1978
+ token->opr.c = c;
1979
+
1980
+ #ifdef RE_ENABLE_I18N
1981
+ if (input->mb_cur_max > 1 &&
1982
+ !re_string_first_byte (input, re_string_cur_idx (input)))
1983
+ {
1984
+ token->type = CHARACTER;
1985
+ return 1;
1986
+ }
1987
+ #endif /* RE_ENABLE_I18N */
1988
+
1989
+ if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
1990
+ && re_string_cur_idx (input) + 1 < re_string_length (input))
1991
+ {
1992
+ /* In this case, '\' escape a character. */
1993
+ unsigned char c2;
1994
+ re_string_skip_bytes (input, 1);
1995
+ c2 = re_string_peek_byte (input, 0);
1996
+ token->opr.c = c2;
1997
+ token->type = CHARACTER;
1998
+ return 1;
1999
+ }
2000
+ if (c == '[') /* '[' is a special char in a bracket exps. */
2001
+ {
2002
+ unsigned char c2;
2003
+ int token_len;
2004
+ if (re_string_cur_idx (input) + 1 < re_string_length (input))
2005
+ c2 = re_string_peek_byte (input, 1);
2006
+ else
2007
+ c2 = 0;
2008
+ token->opr.c = c2;
2009
+ token_len = 2;
2010
+ switch (c2)
2011
+ {
2012
+ case '.':
2013
+ token->type = OP_OPEN_COLL_ELEM;
2014
+ break;
2015
+ case '=':
2016
+ token->type = OP_OPEN_EQUIV_CLASS;
2017
+ break;
2018
+ case ':':
2019
+ if (syntax & RE_CHAR_CLASSES)
2020
+ {
2021
+ token->type = OP_OPEN_CHAR_CLASS;
2022
+ break;
2023
+ }
2024
+ /* else fall through. */
2025
+ default:
2026
+ token->type = CHARACTER;
2027
+ token->opr.c = c;
2028
+ token_len = 1;
2029
+ break;
2030
+ }
2031
+ return token_len;
2032
+ }
2033
+ switch (c)
2034
+ {
2035
+ case '-':
2036
+ token->type = OP_CHARSET_RANGE;
2037
+ break;
2038
+ case ']':
2039
+ token->type = OP_CLOSE_BRACKET;
2040
+ break;
2041
+ case '^':
2042
+ token->type = OP_NON_MATCH_LIST;
2043
+ break;
2044
+ default:
2045
+ token->type = CHARACTER;
2046
+ }
2047
+ return 1;
2048
+ }
2049
+
2050
+ /* Functions for parser. */
2051
+
2052
+ /* Entry point of the parser.
2053
+ Parse the regular expression REGEXP and return the structure tree.
2054
+ If an error is occured, ERR is set by error code, and return NULL.
2055
+ This function build the following tree, from regular expression <reg_exp>:
2056
+ CAT
2057
+ / \
2058
+ / \
2059
+ <reg_exp> EOR
2060
+
2061
+ CAT means concatenation.
2062
+ EOR means end of regular expression. */
2063
+
2064
+ static bin_tree_t *
2065
+ parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2066
+ reg_errcode_t *err)
2067
+ {
2068
+ re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2069
+ bin_tree_t *tree, *eor, *root;
2070
+ re_token_t current_token;
2071
+ dfa->syntax = syntax;
2072
+ fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2073
+ tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2074
+ if (BE (*err != REG_NOERROR && tree == NULL, 0))
2075
+ return NULL;
2076
+ eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2077
+ if (tree != NULL)
2078
+ root = create_tree (dfa, tree, eor, CONCAT);
2079
+ else
2080
+ root = eor;
2081
+ if (BE (eor == NULL || root == NULL, 0))
2082
+ {
2083
+ *err = REG_ESPACE;
2084
+ return NULL;
2085
+ }
2086
+ return root;
2087
+ }
2088
+
2089
+ /* This function build the following tree, from regular expression
2090
+ <branch1>|<branch2>:
2091
+ ALT
2092
+ / \
2093
+ / \
2094
+ <branch1> <branch2>
2095
+
2096
+ ALT means alternative, which represents the operator `|'. */
2097
+
2098
+ static bin_tree_t *
2099
+ parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2100
+ reg_syntax_t syntax, int nest, reg_errcode_t *err)
2101
+ {
2102
+ re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2103
+ bin_tree_t *tree, *branch = NULL;
2104
+ tree = parse_branch (regexp, preg, token, syntax, nest, err);
2105
+ if (BE (*err != REG_NOERROR && tree == NULL, 0))
2106
+ return NULL;
2107
+
2108
+ while (token->type == OP_ALT)
2109
+ {
2110
+ fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2111
+ if (token->type != OP_ALT && token->type != END_OF_RE
2112
+ && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2113
+ {
2114
+ branch = parse_branch (regexp, preg, token, syntax, nest, err);
2115
+ if (BE (*err != REG_NOERROR && branch == NULL, 0))
2116
+ return NULL;
2117
+ }
2118
+ else
2119
+ branch = NULL;
2120
+ tree = create_tree (dfa, tree, branch, OP_ALT);
2121
+ if (BE (tree == NULL, 0))
2122
+ {
2123
+ *err = REG_ESPACE;
2124
+ return NULL;
2125
+ }
2126
+ }
2127
+ return tree;
2128
+ }
2129
+
2130
+ /* This function build the following tree, from regular expression
2131
+ <exp1><exp2>:
2132
+ CAT
2133
+ / \
2134
+ / \
2135
+ <exp1> <exp2>
2136
+
2137
+ CAT means concatenation. */
2138
+
2139
+ static bin_tree_t *
2140
+ parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2141
+ reg_syntax_t syntax, int nest, reg_errcode_t *err)
2142
+ {
2143
+ bin_tree_t *tree, *exp;
2144
+ re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2145
+ tree = parse_expression (regexp, preg, token, syntax, nest, err);
2146
+ if (BE (*err != REG_NOERROR && tree == NULL, 0))
2147
+ return NULL;
2148
+
2149
+ while (token->type != OP_ALT && token->type != END_OF_RE
2150
+ && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2151
+ {
2152
+ exp = parse_expression (regexp, preg, token, syntax, nest, err);
2153
+ if (BE (*err != REG_NOERROR && exp == NULL, 0))
2154
+ {
2155
+ return NULL;
2156
+ }
2157
+ if (tree != NULL && exp != NULL)
2158
+ {
2159
+ tree = create_tree (dfa, tree, exp, CONCAT);
2160
+ if (tree == NULL)
2161
+ {
2162
+ *err = REG_ESPACE;
2163
+ return NULL;
2164
+ }
2165
+ }
2166
+ else if (tree == NULL)
2167
+ tree = exp;
2168
+ /* Otherwise exp == NULL, we don't need to create new tree. */
2169
+ }
2170
+ return tree;
2171
+ }
2172
+
2173
+ /* This function build the following tree, from regular expression a*:
2174
+ *
2175
+ |
2176
+ a
2177
+ */
2178
+
2179
+ static bin_tree_t *
2180
+ parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2181
+ reg_syntax_t syntax, int nest, reg_errcode_t *err)
2182
+ {
2183
+ re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2184
+ bin_tree_t *tree;
2185
+ switch (token->type)
2186
+ {
2187
+ case CHARACTER:
2188
+ tree = create_token_tree (dfa, NULL, NULL, token);
2189
+ if (BE (tree == NULL, 0))
2190
+ {
2191
+ *err = REG_ESPACE;
2192
+ return NULL;
2193
+ }
2194
+ #ifdef RE_ENABLE_I18N
2195
+ if (dfa->mb_cur_max > 1)
2196
+ {
2197
+ while (!re_string_eoi (regexp)
2198
+ && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2199
+ {
2200
+ bin_tree_t *mbc_remain;
2201
+ fetch_token (token, regexp, syntax);
2202
+ mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2203
+ tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2204
+ if (BE (mbc_remain == NULL || tree == NULL, 0))
2205
+ {
2206
+ *err = REG_ESPACE;
2207
+ return NULL;
2208
+ }
2209
+ }
2210
+ }
2211
+ #endif
2212
+ break;
2213
+ case OP_OPEN_SUBEXP:
2214
+ tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2215
+ if (BE (*err != REG_NOERROR && tree == NULL, 0))
2216
+ return NULL;
2217
+ break;
2218
+ case OP_OPEN_BRACKET:
2219
+ tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2220
+ if (BE (*err != REG_NOERROR && tree == NULL, 0))
2221
+ return NULL;
2222
+ break;
2223
+ case OP_BACK_REF:
2224
+ if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2225
+ {
2226
+ *err = REG_ESUBREG;
2227
+ return NULL;
2228
+ }
2229
+ dfa->used_bkref_map |= 1 << token->opr.idx;
2230
+ tree = create_token_tree (dfa, NULL, NULL, token);
2231
+ if (BE (tree == NULL, 0))
2232
+ {
2233
+ *err = REG_ESPACE;
2234
+ return NULL;
2235
+ }
2236
+ ++dfa->nbackref;
2237
+ dfa->has_mb_node = 1;
2238
+ break;
2239
+ case OP_OPEN_DUP_NUM:
2240
+ if (syntax & RE_CONTEXT_INVALID_DUP)
2241
+ {
2242
+ *err = REG_BADRPT;
2243
+ return NULL;
2244
+ }
2245
+ /* FALLTHROUGH */
2246
+ case OP_DUP_ASTERISK:
2247
+ case OP_DUP_PLUS:
2248
+ case OP_DUP_QUESTION:
2249
+ if (syntax & RE_CONTEXT_INVALID_OPS)
2250
+ {
2251
+ *err = REG_BADRPT;
2252
+ return NULL;
2253
+ }
2254
+ else if (syntax & RE_CONTEXT_INDEP_OPS)
2255
+ {
2256
+ fetch_token (token, regexp, syntax);
2257
+ return parse_expression (regexp, preg, token, syntax, nest, err);
2258
+ }
2259
+ /* else fall through */
2260
+ case OP_CLOSE_SUBEXP:
2261
+ if ((token->type == OP_CLOSE_SUBEXP) &&
2262
+ !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2263
+ {
2264
+ *err = REG_ERPAREN;
2265
+ return NULL;
2266
+ }
2267
+ /* else fall through */
2268
+ case OP_CLOSE_DUP_NUM:
2269
+ /* We treat it as a normal character. */
2270
+
2271
+ /* Then we can these characters as normal characters. */
2272
+ token->type = CHARACTER;
2273
+ /* mb_partial and word_char bits should be initialized already
2274
+ by peek_token. */
2275
+ tree = create_token_tree (dfa, NULL, NULL, token);
2276
+ if (BE (tree == NULL, 0))
2277
+ {
2278
+ *err = REG_ESPACE;
2279
+ return NULL;
2280
+ }
2281
+ break;
2282
+ case ANCHOR:
2283
+ if ((token->opr.ctx_type
2284
+ & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2285
+ && dfa->word_ops_used == 0)
2286
+ init_word_char (dfa);
2287
+ if (token->opr.ctx_type == WORD_DELIM
2288
+ || token->opr.ctx_type == NOT_WORD_DELIM)
2289
+ {
2290
+ bin_tree_t *tree_first, *tree_last;
2291
+ if (token->opr.ctx_type == WORD_DELIM)
2292
+ {
2293
+ token->opr.ctx_type = WORD_FIRST;
2294
+ tree_first = create_token_tree (dfa, NULL, NULL, token);
2295
+ token->opr.ctx_type = WORD_LAST;
2296
+ }
2297
+ else
2298
+ {
2299
+ token->opr.ctx_type = INSIDE_WORD;
2300
+ tree_first = create_token_tree (dfa, NULL, NULL, token);
2301
+ token->opr.ctx_type = INSIDE_NOTWORD;
2302
+ }
2303
+ tree_last = create_token_tree (dfa, NULL, NULL, token);
2304
+ tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2305
+ if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2306
+ {
2307
+ *err = REG_ESPACE;
2308
+ return NULL;
2309
+ }
2310
+ }
2311
+ else
2312
+ {
2313
+ tree = create_token_tree (dfa, NULL, NULL, token);
2314
+ if (BE (tree == NULL, 0))
2315
+ {
2316
+ *err = REG_ESPACE;
2317
+ return NULL;
2318
+ }
2319
+ }
2320
+ /* We must return here, since ANCHORs can't be followed
2321
+ by repetition operators.
2322
+ eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2323
+ it must not be "<ANCHOR(^)><REPEAT(*)>". */
2324
+ fetch_token (token, regexp, syntax);
2325
+ return tree;
2326
+ case OP_PERIOD:
2327
+ tree = create_token_tree (dfa, NULL, NULL, token);
2328
+ if (BE (tree == NULL, 0))
2329
+ {
2330
+ *err = REG_ESPACE;
2331
+ return NULL;
2332
+ }
2333
+ if (dfa->mb_cur_max > 1)
2334
+ dfa->has_mb_node = 1;
2335
+ break;
2336
+ case OP_WORD:
2337
+ case OP_NOTWORD:
2338
+ tree = build_charclass_op (dfa, regexp->trans,
2339
+ (const unsigned char *) "alnum",
2340
+ (const unsigned char *) "_",
2341
+ token->type == OP_NOTWORD, err);
2342
+ if (BE (*err != REG_NOERROR && tree == NULL, 0))
2343
+ return NULL;
2344
+ break;
2345
+ case OP_SPACE:
2346
+ case OP_NOTSPACE:
2347
+ tree = build_charclass_op (dfa, regexp->trans,
2348
+ (const unsigned char *) "space",
2349
+ (const unsigned char *) "",
2350
+ token->type == OP_NOTSPACE, err);
2351
+ if (BE (*err != REG_NOERROR && tree == NULL, 0))
2352
+ return NULL;
2353
+ break;
2354
+ case OP_ALT:
2355
+ case END_OF_RE:
2356
+ return NULL;
2357
+ case BACK_SLASH:
2358
+ *err = REG_EESCAPE;
2359
+ return NULL;
2360
+ default:
2361
+ /* Must not happen? */
2362
+ #ifdef DEBUG
2363
+ assert (0);
2364
+ #endif
2365
+ return NULL;
2366
+ }
2367
+ fetch_token (token, regexp, syntax);
2368
+
2369
+ while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2370
+ || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2371
+ {
2372
+ tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2373
+ if (BE (*err != REG_NOERROR && tree == NULL, 0))
2374
+ return NULL;
2375
+ /* In BRE consecutive duplications are not allowed. */
2376
+ if ((syntax & RE_CONTEXT_INVALID_DUP)
2377
+ && (token->type == OP_DUP_ASTERISK
2378
+ || token->type == OP_OPEN_DUP_NUM))
2379
+ {
2380
+ *err = REG_BADRPT;
2381
+ return NULL;
2382
+ }
2383
+ }
2384
+
2385
+ return tree;
2386
+ }
2387
+
2388
+ /* This function build the following tree, from regular expression
2389
+ (<reg_exp>):
2390
+ SUBEXP
2391
+ |
2392
+ <reg_exp>
2393
+ */
2394
+
2395
+ static bin_tree_t *
2396
+ parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2397
+ reg_syntax_t syntax, int nest, reg_errcode_t *err)
2398
+ {
2399
+ re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2400
+ bin_tree_t *tree;
2401
+ size_t cur_nsub;
2402
+ cur_nsub = preg->re_nsub++;
2403
+
2404
+ fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2405
+
2406
+ /* The subexpression may be a null string. */
2407
+ if (token->type == OP_CLOSE_SUBEXP)
2408
+ tree = NULL;
2409
+ else
2410
+ {
2411
+ tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2412
+ if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2413
+ *err = REG_EPAREN;
2414
+ if (BE (*err != REG_NOERROR, 0))
2415
+ return NULL;
2416
+ }
2417
+
2418
+ if (cur_nsub <= '9' - '1')
2419
+ dfa->completed_bkref_map |= 1 << cur_nsub;
2420
+
2421
+ tree = create_tree (dfa, tree, NULL, SUBEXP);
2422
+ if (BE (tree == NULL, 0))
2423
+ {
2424
+ *err = REG_ESPACE;
2425
+ return NULL;
2426
+ }
2427
+ tree->token.opr.idx = cur_nsub;
2428
+ return tree;
2429
+ }
2430
+
2431
+ /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2432
+
2433
+ static bin_tree_t *
2434
+ parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2435
+ re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2436
+ {
2437
+ bin_tree_t *tree = NULL, *old_tree = NULL;
2438
+ int i, start, end, start_idx = re_string_cur_idx (regexp);
2439
+ re_token_t start_token = *token;
2440
+
2441
+ if (token->type == OP_OPEN_DUP_NUM)
2442
+ {
2443
+ end = 0;
2444
+ start = fetch_number (regexp, token, syntax);
2445
+ if (start == -1)
2446
+ {
2447
+ if (token->type == CHARACTER && token->opr.c == ',')
2448
+ start = 0; /* We treat "{,m}" as "{0,m}". */
2449
+ else
2450
+ {
2451
+ *err = REG_BADBR; /* <re>{} is invalid. */
2452
+ return NULL;
2453
+ }
2454
+ }
2455
+ if (BE (start != -2, 1))
2456
+ {
2457
+ /* We treat "{n}" as "{n,n}". */
2458
+ end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2459
+ : ((token->type == CHARACTER && token->opr.c == ',')
2460
+ ? fetch_number (regexp, token, syntax) : -2));
2461
+ }
2462
+ if (BE (start == -2 || end == -2, 0))
2463
+ {
2464
+ /* Invalid sequence. */
2465
+ if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2466
+ {
2467
+ if (token->type == END_OF_RE)
2468
+ *err = REG_EBRACE;
2469
+ else
2470
+ *err = REG_BADBR;
2471
+
2472
+ return NULL;
2473
+ }
2474
+
2475
+ /* If the syntax bit is set, rollback. */
2476
+ re_string_set_index (regexp, start_idx);
2477
+ *token = start_token;
2478
+ token->type = CHARACTER;
2479
+ /* mb_partial and word_char bits should be already initialized by
2480
+ peek_token. */
2481
+ return elem;
2482
+ }
2483
+
2484
+ if (BE (end != -1 && start > end, 0))
2485
+ {
2486
+ /* First number greater than second. */
2487
+ *err = REG_BADBR;
2488
+ return NULL;
2489
+ }
2490
+ }
2491
+ else
2492
+ {
2493
+ start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2494
+ end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2495
+ }
2496
+
2497
+ fetch_token (token, regexp, syntax);
2498
+
2499
+ if (BE (elem == NULL, 0))
2500
+ return NULL;
2501
+ if (BE (start == 0 && end == 0, 0))
2502
+ {
2503
+ postorder (elem, free_tree, NULL);
2504
+ return NULL;
2505
+ }
2506
+
2507
+ /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2508
+ if (BE (start > 0, 0))
2509
+ {
2510
+ tree = elem;
2511
+ for (i = 2; i <= start; ++i)
2512
+ {
2513
+ elem = duplicate_tree (elem, dfa);
2514
+ tree = create_tree (dfa, tree, elem, CONCAT);
2515
+ if (BE (elem == NULL || tree == NULL, 0))
2516
+ goto parse_dup_op_espace;
2517
+ }
2518
+
2519
+ if (start == end)
2520
+ return tree;
2521
+
2522
+ /* Duplicate ELEM before it is marked optional. */
2523
+ elem = duplicate_tree (elem, dfa);
2524
+ old_tree = tree;
2525
+ }
2526
+ else
2527
+ old_tree = NULL;
2528
+
2529
+ if (elem->token.type == SUBEXP)
2530
+ postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2531
+
2532
+ tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2533
+ if (BE (tree == NULL, 0))
2534
+ goto parse_dup_op_espace;
2535
+
2536
+ /* This loop is actually executed only when end != -1,
2537
+ to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2538
+ already created the start+1-th copy. */
2539
+ for (i = start + 2; i <= end; ++i)
2540
+ {
2541
+ elem = duplicate_tree (elem, dfa);
2542
+ tree = create_tree (dfa, tree, elem, CONCAT);
2543
+ if (BE (elem == NULL || tree == NULL, 0))
2544
+ goto parse_dup_op_espace;
2545
+
2546
+ tree = create_tree (dfa, tree, NULL, OP_ALT);
2547
+ if (BE (tree == NULL, 0))
2548
+ goto parse_dup_op_espace;
2549
+ }
2550
+
2551
+ if (old_tree)
2552
+ tree = create_tree (dfa, old_tree, tree, CONCAT);
2553
+
2554
+ return tree;
2555
+
2556
+ parse_dup_op_espace:
2557
+ *err = REG_ESPACE;
2558
+ return NULL;
2559
+ }
2560
+
2561
+ /* Size of the names for collating symbol/equivalence_class/character_class.
2562
+ I'm not sure, but maybe enough. */
2563
+ #define BRACKET_NAME_BUF_SIZE 32
2564
+
2565
+ #ifndef _LIBC
2566
+ /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2567
+ Build the range expression which starts from START_ELEM, and ends
2568
+ at END_ELEM. The result are written to MBCSET and SBCSET.
2569
+ RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2570
+ mbcset->range_ends, is a pointer argument sinse we may
2571
+ update it. */
2572
+
2573
+ static reg_errcode_t
2574
+ internal_function
2575
+ # ifdef RE_ENABLE_I18N
2576
+ build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2577
+ bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2578
+ # else /* not RE_ENABLE_I18N */
2579
+ build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2580
+ bracket_elem_t *end_elem)
2581
+ # endif /* not RE_ENABLE_I18N */
2582
+ {
2583
+ unsigned int start_ch, end_ch;
2584
+ /* Equivalence Classes and Character Classes can't be a range start/end. */
2585
+ if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2586
+ || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2587
+ 0))
2588
+ return REG_ERANGE;
2589
+
2590
+ /* We can handle no multi character collating elements without libc
2591
+ support. */
2592
+ if (BE ((start_elem->type == COLL_SYM
2593
+ && strlen ((char *) start_elem->opr.name) > 1)
2594
+ || (end_elem->type == COLL_SYM
2595
+ && strlen ((char *) end_elem->opr.name) > 1), 0))
2596
+ return REG_ECOLLATE;
2597
+
2598
+ # ifdef RE_ENABLE_I18N
2599
+ {
2600
+ wchar_t wc;
2601
+ wint_t start_wc;
2602
+ wint_t end_wc;
2603
+ wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2604
+
2605
+ start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2606
+ : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2607
+ : 0));
2608
+ end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2609
+ : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2610
+ : 0));
2611
+ start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2612
+ ? __btowc (start_ch) : start_elem->opr.wch);
2613
+ end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2614
+ ? __btowc (end_ch) : end_elem->opr.wch);
2615
+ if (start_wc == WEOF || end_wc == WEOF)
2616
+ return REG_ECOLLATE;
2617
+ cmp_buf[0] = start_wc;
2618
+ cmp_buf[4] = end_wc;
2619
+ if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2620
+ return REG_ERANGE;
2621
+
2622
+ /* Got valid collation sequence values, add them as a new entry.
2623
+ However, for !_LIBC we have no collation elements: if the
2624
+ character set is single byte, the single byte character set
2625
+ that we build below suffices. parse_bracket_exp passes
2626
+ no MBCSET if dfa->mb_cur_max == 1. */
2627
+ if (mbcset)
2628
+ {
2629
+ /* Check the space of the arrays. */
2630
+ if (BE (*range_alloc == mbcset->nranges, 0))
2631
+ {
2632
+ /* There is not enough space, need realloc. */
2633
+ wchar_t *new_array_start, *new_array_end;
2634
+ int new_nranges;
2635
+
2636
+ /* +1 in case of mbcset->nranges is 0. */
2637
+ new_nranges = 2 * mbcset->nranges + 1;
2638
+ /* Use realloc since mbcset->range_starts and mbcset->range_ends
2639
+ are NULL if *range_alloc == 0. */
2640
+ new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2641
+ new_nranges);
2642
+ new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2643
+ new_nranges);
2644
+
2645
+ if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2646
+ return REG_ESPACE;
2647
+
2648
+ mbcset->range_starts = new_array_start;
2649
+ mbcset->range_ends = new_array_end;
2650
+ *range_alloc = new_nranges;
2651
+ }
2652
+
2653
+ mbcset->range_starts[mbcset->nranges] = start_wc;
2654
+ mbcset->range_ends[mbcset->nranges++] = end_wc;
2655
+ }
2656
+
2657
+ /* Build the table for single byte characters. */
2658
+ for (wc = 0; wc < SBC_MAX; ++wc)
2659
+ {
2660
+ cmp_buf[2] = wc;
2661
+ if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2662
+ && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2663
+ bitset_set (sbcset, wc);
2664
+ }
2665
+ }
2666
+ # else /* not RE_ENABLE_I18N */
2667
+ {
2668
+ unsigned int ch;
2669
+ start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2670
+ : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2671
+ : 0));
2672
+ end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2673
+ : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2674
+ : 0));
2675
+ if (start_ch > end_ch)
2676
+ return REG_ERANGE;
2677
+ /* Build the table for single byte characters. */
2678
+ for (ch = 0; ch < SBC_MAX; ++ch)
2679
+ if (start_ch <= ch && ch <= end_ch)
2680
+ bitset_set (sbcset, ch);
2681
+ }
2682
+ # endif /* not RE_ENABLE_I18N */
2683
+ return REG_NOERROR;
2684
+ }
2685
+ #endif /* not _LIBC */
2686
+
2687
+ #ifndef _LIBC
2688
+ /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2689
+ Build the collating element which is represented by NAME.
2690
+ The result are written to MBCSET and SBCSET.
2691
+ COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2692
+ pointer argument since we may update it. */
2693
+
2694
+ static reg_errcode_t
2695
+ internal_function
2696
+ # ifdef RE_ENABLE_I18N
2697
+ build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2698
+ int *coll_sym_alloc, const unsigned char *name)
2699
+ # else /* not RE_ENABLE_I18N */
2700
+ build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2701
+ # endif /* not RE_ENABLE_I18N */
2702
+ {
2703
+ size_t name_len = strlen ((const char *) name);
2704
+ if (BE (name_len != 1, 0))
2705
+ return REG_ECOLLATE;
2706
+ else
2707
+ {
2708
+ bitset_set (sbcset, name[0]);
2709
+ return REG_NOERROR;
2710
+ }
2711
+ }
2712
+ #endif /* not _LIBC */
2713
+
2714
+ /* This function parse bracket expression like "[abc]", "[a-c]",
2715
+ "[[.a-a.]]" etc. */
2716
+
2717
+ static bin_tree_t *
2718
+ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2719
+ reg_syntax_t syntax, reg_errcode_t *err)
2720
+ {
2721
+ #ifdef _LIBC
2722
+ const unsigned char *collseqmb;
2723
+ const char *collseqwc;
2724
+ uint32_t nrules;
2725
+ int32_t table_size;
2726
+ const int32_t *symb_table;
2727
+ const unsigned char *extra;
2728
+
2729
+ /* Local function for parse_bracket_exp used in _LIBC environement.
2730
+ Seek the collating symbol entry correspondings to NAME.
2731
+ Return the index of the symbol in the SYMB_TABLE. */
2732
+
2733
+ auto inline int32_t
2734
+ __attribute ((always_inline))
2735
+ seek_collating_symbol_entry (name, name_len)
2736
+ const unsigned char *name;
2737
+ size_t name_len;
2738
+ {
2739
+ int32_t hash = elem_hash ((const char *) name, name_len);
2740
+ int32_t elem = hash % table_size;
2741
+ if (symb_table[2 * elem] != 0)
2742
+ {
2743
+ int32_t second = hash % (table_size - 2) + 1;
2744
+
2745
+ do
2746
+ {
2747
+ /* First compare the hashing value. */
2748
+ if (symb_table[2 * elem] == hash
2749
+ /* Compare the length of the name. */
2750
+ && name_len == extra[symb_table[2 * elem + 1]]
2751
+ /* Compare the name. */
2752
+ && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2753
+ name_len) == 0)
2754
+ {
2755
+ /* Yep, this is the entry. */
2756
+ break;
2757
+ }
2758
+
2759
+ /* Next entry. */
2760
+ elem += second;
2761
+ }
2762
+ while (symb_table[2 * elem] != 0);
2763
+ }
2764
+ return elem;
2765
+ }
2766
+
2767
+ /* Local function for parse_bracket_exp used in _LIBC environment.
2768
+ Look up the collation sequence value of BR_ELEM.
2769
+ Return the value if succeeded, UINT_MAX otherwise. */
2770
+
2771
+ auto inline unsigned int
2772
+ __attribute ((always_inline))
2773
+ lookup_collation_sequence_value (br_elem)
2774
+ bracket_elem_t *br_elem;
2775
+ {
2776
+ if (br_elem->type == SB_CHAR)
2777
+ {
2778
+ /*
2779
+ if (MB_CUR_MAX == 1)
2780
+ */
2781
+ if (nrules == 0)
2782
+ return collseqmb[br_elem->opr.ch];
2783
+ else
2784
+ {
2785
+ wint_t wc = __btowc (br_elem->opr.ch);
2786
+ return __collseq_table_lookup (collseqwc, wc);
2787
+ }
2788
+ }
2789
+ else if (br_elem->type == MB_CHAR)
2790
+ {
2791
+ if (nrules != 0)
2792
+ return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2793
+ }
2794
+ else if (br_elem->type == COLL_SYM)
2795
+ {
2796
+ size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2797
+ if (nrules != 0)
2798
+ {
2799
+ int32_t elem, idx;
2800
+ elem = seek_collating_symbol_entry (br_elem->opr.name,
2801
+ sym_name_len);
2802
+ if (symb_table[2 * elem] != 0)
2803
+ {
2804
+ /* We found the entry. */
2805
+ idx = symb_table[2 * elem + 1];
2806
+ /* Skip the name of collating element name. */
2807
+ idx += 1 + extra[idx];
2808
+ /* Skip the byte sequence of the collating element. */
2809
+ idx += 1 + extra[idx];
2810
+ /* Adjust for the alignment. */
2811
+ idx = (idx + 3) & ~3;
2812
+ /* Skip the multibyte collation sequence value. */
2813
+ idx += sizeof (unsigned int);
2814
+ /* Skip the wide char sequence of the collating element. */
2815
+ idx += sizeof (unsigned int) *
2816
+ (1 + *(unsigned int *) (extra + idx));
2817
+ /* Return the collation sequence value. */
2818
+ return *(unsigned int *) (extra + idx);
2819
+ }
2820
+ else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2821
+ {
2822
+ /* No valid character. Match it as a single byte
2823
+ character. */
2824
+ return collseqmb[br_elem->opr.name[0]];
2825
+ }
2826
+ }
2827
+ else if (sym_name_len == 1)
2828
+ return collseqmb[br_elem->opr.name[0]];
2829
+ }
2830
+ return UINT_MAX;
2831
+ }
2832
+
2833
+ /* Local function for parse_bracket_exp used in _LIBC environement.
2834
+ Build the range expression which starts from START_ELEM, and ends
2835
+ at END_ELEM. The result are written to MBCSET and SBCSET.
2836
+ RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2837
+ mbcset->range_ends, is a pointer argument sinse we may
2838
+ update it. */
2839
+
2840
+ auto inline reg_errcode_t
2841
+ __attribute ((always_inline))
2842
+ build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2843
+ re_charset_t *mbcset;
2844
+ int *range_alloc;
2845
+ bitset_t sbcset;
2846
+ bracket_elem_t *start_elem, *end_elem;
2847
+ {
2848
+ unsigned int ch;
2849
+ uint32_t start_collseq;
2850
+ uint32_t end_collseq;
2851
+
2852
+ /* Equivalence Classes and Character Classes can't be a range
2853
+ start/end. */
2854
+ if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2855
+ || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2856
+ 0))
2857
+ return REG_ERANGE;
2858
+
2859
+ start_collseq = lookup_collation_sequence_value (start_elem);
2860
+ end_collseq = lookup_collation_sequence_value (end_elem);
2861
+ /* Check start/end collation sequence values. */
2862
+ if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2863
+ return REG_ECOLLATE;
2864
+ if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2865
+ return REG_ERANGE;
2866
+
2867
+ /* Got valid collation sequence values, add them as a new entry.
2868
+ However, if we have no collation elements, and the character set
2869
+ is single byte, the single byte character set that we
2870
+ build below suffices. */
2871
+ if (nrules > 0 || dfa->mb_cur_max > 1)
2872
+ {
2873
+ /* Check the space of the arrays. */
2874
+ if (BE (*range_alloc == mbcset->nranges, 0))
2875
+ {
2876
+ /* There is not enough space, need realloc. */
2877
+ uint32_t *new_array_start;
2878
+ uint32_t *new_array_end;
2879
+ int new_nranges;
2880
+
2881
+ /* +1 in case of mbcset->nranges is 0. */
2882
+ new_nranges = 2 * mbcset->nranges + 1;
2883
+ new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2884
+ new_nranges);
2885
+ new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2886
+ new_nranges);
2887
+
2888
+ if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2889
+ return REG_ESPACE;
2890
+
2891
+ mbcset->range_starts = new_array_start;
2892
+ mbcset->range_ends = new_array_end;
2893
+ *range_alloc = new_nranges;
2894
+ }
2895
+
2896
+ mbcset->range_starts[mbcset->nranges] = start_collseq;
2897
+ mbcset->range_ends[mbcset->nranges++] = end_collseq;
2898
+ }
2899
+
2900
+ /* Build the table for single byte characters. */
2901
+ for (ch = 0; ch < SBC_MAX; ch++)
2902
+ {
2903
+ uint32_t ch_collseq;
2904
+ /*
2905
+ if (MB_CUR_MAX == 1)
2906
+ */
2907
+ if (nrules == 0)
2908
+ ch_collseq = collseqmb[ch];
2909
+ else
2910
+ ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2911
+ if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2912
+ bitset_set (sbcset, ch);
2913
+ }
2914
+ return REG_NOERROR;
2915
+ }
2916
+
2917
+ /* Local function for parse_bracket_exp used in _LIBC environement.
2918
+ Build the collating element which is represented by NAME.
2919
+ The result are written to MBCSET and SBCSET.
2920
+ COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2921
+ pointer argument sinse we may update it. */
2922
+
2923
+ auto inline reg_errcode_t
2924
+ __attribute ((always_inline))
2925
+ build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2926
+ re_charset_t *mbcset;
2927
+ int *coll_sym_alloc;
2928
+ bitset_t sbcset;
2929
+ const unsigned char *name;
2930
+ {
2931
+ int32_t elem, idx;
2932
+ size_t name_len = strlen ((const char *) name);
2933
+ if (nrules != 0)
2934
+ {
2935
+ elem = seek_collating_symbol_entry (name, name_len);
2936
+ if (symb_table[2 * elem] != 0)
2937
+ {
2938
+ /* We found the entry. */
2939
+ idx = symb_table[2 * elem + 1];
2940
+ /* Skip the name of collating element name. */
2941
+ idx += 1 + extra[idx];
2942
+ }
2943
+ else if (symb_table[2 * elem] == 0 && name_len == 1)
2944
+ {
2945
+ /* No valid character, treat it as a normal
2946
+ character. */
2947
+ bitset_set (sbcset, name[0]);
2948
+ return REG_NOERROR;
2949
+ }
2950
+ else
2951
+ return REG_ECOLLATE;
2952
+
2953
+ /* Got valid collation sequence, add it as a new entry. */
2954
+ /* Check the space of the arrays. */
2955
+ if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
2956
+ {
2957
+ /* Not enough, realloc it. */
2958
+ /* +1 in case of mbcset->ncoll_syms is 0. */
2959
+ int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2960
+ /* Use realloc since mbcset->coll_syms is NULL
2961
+ if *alloc == 0. */
2962
+ int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2963
+ new_coll_sym_alloc);
2964
+ if (BE (new_coll_syms == NULL, 0))
2965
+ return REG_ESPACE;
2966
+ mbcset->coll_syms = new_coll_syms;
2967
+ *coll_sym_alloc = new_coll_sym_alloc;
2968
+ }
2969
+ mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
2970
+ return REG_NOERROR;
2971
+ }
2972
+ else
2973
+ {
2974
+ if (BE (name_len != 1, 0))
2975
+ return REG_ECOLLATE;
2976
+ else
2977
+ {
2978
+ bitset_set (sbcset, name[0]);
2979
+ return REG_NOERROR;
2980
+ }
2981
+ }
2982
+ }
2983
+ #endif
2984
+
2985
+ re_token_t br_token;
2986
+ re_bitset_ptr_t sbcset;
2987
+ #ifdef RE_ENABLE_I18N
2988
+ re_charset_t *mbcset;
2989
+ int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
2990
+ int equiv_class_alloc = 0, char_class_alloc = 0;
2991
+ #endif /* not RE_ENABLE_I18N */
2992
+ int non_match = 0;
2993
+ bin_tree_t *work_tree;
2994
+ int token_len;
2995
+ int first_round = 1;
2996
+ #ifdef _LIBC
2997
+ collseqmb = (const unsigned char *)
2998
+ _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
2999
+ nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3000
+ if (nrules)
3001
+ {
3002
+ /*
3003
+ if (MB_CUR_MAX > 1)
3004
+ */
3005
+ collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3006
+ table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3007
+ symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3008
+ _NL_COLLATE_SYMB_TABLEMB);
3009
+ extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3010
+ _NL_COLLATE_SYMB_EXTRAMB);
3011
+ }
3012
+ #endif
3013
+ sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3014
+ #ifdef RE_ENABLE_I18N
3015
+ mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3016
+ #endif /* RE_ENABLE_I18N */
3017
+ #ifdef RE_ENABLE_I18N
3018
+ if (BE (sbcset == NULL || mbcset == NULL, 0))
3019
+ #else
3020
+ if (BE (sbcset == NULL, 0))
3021
+ #endif /* RE_ENABLE_I18N */
3022
+ {
3023
+ *err = REG_ESPACE;
3024
+ return NULL;
3025
+ }
3026
+
3027
+ token_len = peek_token_bracket (token, regexp, syntax);
3028
+ if (BE (token->type == END_OF_RE, 0))
3029
+ {
3030
+ *err = REG_BADPAT;
3031
+ goto parse_bracket_exp_free_return;
3032
+ }
3033
+ if (token->type == OP_NON_MATCH_LIST)
3034
+ {
3035
+ #ifdef RE_ENABLE_I18N
3036
+ mbcset->non_match = 1;
3037
+ #endif /* not RE_ENABLE_I18N */
3038
+ non_match = 1;
3039
+ if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3040
+ bitset_set (sbcset, '\n');
3041
+ re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3042
+ token_len = peek_token_bracket (token, regexp, syntax);
3043
+ if (BE (token->type == END_OF_RE, 0))
3044
+ {
3045
+ *err = REG_BADPAT;
3046
+ goto parse_bracket_exp_free_return;
3047
+ }
3048
+ }
3049
+
3050
+ /* We treat the first ']' as a normal character. */
3051
+ if (token->type == OP_CLOSE_BRACKET)
3052
+ token->type = CHARACTER;
3053
+
3054
+ while (1)
3055
+ {
3056
+ bracket_elem_t start_elem, end_elem;
3057
+ unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3058
+ unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3059
+ reg_errcode_t ret;
3060
+ int token_len2 = 0, is_range_exp = 0;
3061
+ re_token_t token2;
3062
+
3063
+ start_elem.opr.name = start_name_buf;
3064
+ ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3065
+ syntax, first_round);
3066
+ if (BE (ret != REG_NOERROR, 0))
3067
+ {
3068
+ *err = ret;
3069
+ goto parse_bracket_exp_free_return;
3070
+ }
3071
+ first_round = 0;
3072
+
3073
+ /* Get information about the next token. We need it in any case. */
3074
+ token_len = peek_token_bracket (token, regexp, syntax);
3075
+
3076
+ /* Do not check for ranges if we know they are not allowed. */
3077
+ if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3078
+ {
3079
+ if (BE (token->type == END_OF_RE, 0))
3080
+ {
3081
+ *err = REG_EBRACK;
3082
+ goto parse_bracket_exp_free_return;
3083
+ }
3084
+ if (token->type == OP_CHARSET_RANGE)
3085
+ {
3086
+ re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3087
+ token_len2 = peek_token_bracket (&token2, regexp, syntax);
3088
+ if (BE (token2.type == END_OF_RE, 0))
3089
+ {
3090
+ *err = REG_EBRACK;
3091
+ goto parse_bracket_exp_free_return;
3092
+ }
3093
+ if (token2.type == OP_CLOSE_BRACKET)
3094
+ {
3095
+ /* We treat the last '-' as a normal character. */
3096
+ re_string_skip_bytes (regexp, -token_len);
3097
+ token->type = CHARACTER;
3098
+ }
3099
+ else
3100
+ is_range_exp = 1;
3101
+ }
3102
+ }
3103
+
3104
+ if (is_range_exp == 1)
3105
+ {
3106
+ end_elem.opr.name = end_name_buf;
3107
+ ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3108
+ dfa, syntax, 1);
3109
+ if (BE (ret != REG_NOERROR, 0))
3110
+ {
3111
+ *err = ret;
3112
+ goto parse_bracket_exp_free_return;
3113
+ }
3114
+
3115
+ token_len = peek_token_bracket (token, regexp, syntax);
3116
+
3117
+ #ifdef _LIBC
3118
+ *err = build_range_exp (sbcset, mbcset, &range_alloc,
3119
+ &start_elem, &end_elem);
3120
+ #else
3121
+ # ifdef RE_ENABLE_I18N
3122
+ *err = build_range_exp (sbcset,
3123
+ dfa->mb_cur_max > 1 ? mbcset : NULL,
3124
+ &range_alloc, &start_elem, &end_elem);
3125
+ # else
3126
+ *err = build_range_exp (sbcset, &start_elem, &end_elem);
3127
+ # endif
3128
+ #endif /* RE_ENABLE_I18N */
3129
+ if (BE (*err != REG_NOERROR, 0))
3130
+ goto parse_bracket_exp_free_return;
3131
+ }
3132
+ else
3133
+ {
3134
+ switch (start_elem.type)
3135
+ {
3136
+ case SB_CHAR:
3137
+ bitset_set (sbcset, start_elem.opr.ch);
3138
+ break;
3139
+ #ifdef RE_ENABLE_I18N
3140
+ case MB_CHAR:
3141
+ /* Check whether the array has enough space. */
3142
+ if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3143
+ {
3144
+ wchar_t *new_mbchars;
3145
+ /* Not enough, realloc it. */
3146
+ /* +1 in case of mbcset->nmbchars is 0. */
3147
+ mbchar_alloc = 2 * mbcset->nmbchars + 1;
3148
+ /* Use realloc since array is NULL if *alloc == 0. */
3149
+ new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3150
+ mbchar_alloc);
3151
+ if (BE (new_mbchars == NULL, 0))
3152
+ goto parse_bracket_exp_espace;
3153
+ mbcset->mbchars = new_mbchars;
3154
+ }
3155
+ mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3156
+ break;
3157
+ #endif /* RE_ENABLE_I18N */
3158
+ case EQUIV_CLASS:
3159
+ *err = build_equiv_class (sbcset,
3160
+ #ifdef RE_ENABLE_I18N
3161
+ mbcset, &equiv_class_alloc,
3162
+ #endif /* RE_ENABLE_I18N */
3163
+ start_elem.opr.name);
3164
+ if (BE (*err != REG_NOERROR, 0))
3165
+ goto parse_bracket_exp_free_return;
3166
+ break;
3167
+ case COLL_SYM:
3168
+ *err = build_collating_symbol (sbcset,
3169
+ #ifdef RE_ENABLE_I18N
3170
+ mbcset, &coll_sym_alloc,
3171
+ #endif /* RE_ENABLE_I18N */
3172
+ start_elem.opr.name);
3173
+ if (BE (*err != REG_NOERROR, 0))
3174
+ goto parse_bracket_exp_free_return;
3175
+ break;
3176
+ case CHAR_CLASS:
3177
+ *err = build_charclass (regexp->trans, sbcset,
3178
+ #ifdef RE_ENABLE_I18N
3179
+ mbcset, &char_class_alloc,
3180
+ #endif /* RE_ENABLE_I18N */
3181
+ start_elem.opr.name, syntax);
3182
+ if (BE (*err != REG_NOERROR, 0))
3183
+ goto parse_bracket_exp_free_return;
3184
+ break;
3185
+ default:
3186
+ assert (0);
3187
+ break;
3188
+ }
3189
+ }
3190
+ if (BE (token->type == END_OF_RE, 0))
3191
+ {
3192
+ *err = REG_EBRACK;
3193
+ goto parse_bracket_exp_free_return;
3194
+ }
3195
+ if (token->type == OP_CLOSE_BRACKET)
3196
+ break;
3197
+ }
3198
+
3199
+ re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3200
+
3201
+ /* If it is non-matching list. */
3202
+ if (non_match)
3203
+ bitset_not (sbcset);
3204
+
3205
+ #ifdef RE_ENABLE_I18N
3206
+ /* Ensure only single byte characters are set. */
3207
+ if (dfa->mb_cur_max > 1)
3208
+ bitset_mask (sbcset, dfa->sb_char);
3209
+
3210
+ if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3211
+ || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3212
+ || mbcset->non_match)))
3213
+ {
3214
+ bin_tree_t *mbc_tree;
3215
+ int sbc_idx;
3216
+ /* Build a tree for complex bracket. */
3217
+ dfa->has_mb_node = 1;
3218
+ br_token.type = COMPLEX_BRACKET;
3219
+ br_token.opr.mbcset = mbcset;
3220
+ mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3221
+ if (BE (mbc_tree == NULL, 0))
3222
+ goto parse_bracket_exp_espace;
3223
+ for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3224
+ if (sbcset[sbc_idx])
3225
+ break;
3226
+ /* If there are no bits set in sbcset, there is no point
3227
+ of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3228
+ if (sbc_idx < BITSET_WORDS)
3229
+ {
3230
+ /* Build a tree for simple bracket. */
3231
+ br_token.type = SIMPLE_BRACKET;
3232
+ br_token.opr.sbcset = sbcset;
3233
+ work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3234
+ if (BE (work_tree == NULL, 0))
3235
+ goto parse_bracket_exp_espace;
3236
+
3237
+ /* Then join them by ALT node. */
3238
+ work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3239
+ if (BE (work_tree == NULL, 0))
3240
+ goto parse_bracket_exp_espace;
3241
+ }
3242
+ else
3243
+ {
3244
+ re_free (sbcset);
3245
+ work_tree = mbc_tree;
3246
+ }
3247
+ }
3248
+ else
3249
+ #endif /* not RE_ENABLE_I18N */
3250
+ {
3251
+ #ifdef RE_ENABLE_I18N
3252
+ free_charset (mbcset);
3253
+ #endif
3254
+ /* Build a tree for simple bracket. */
3255
+ br_token.type = SIMPLE_BRACKET;
3256
+ br_token.opr.sbcset = sbcset;
3257
+ work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3258
+ if (BE (work_tree == NULL, 0))
3259
+ goto parse_bracket_exp_espace;
3260
+ }
3261
+ return work_tree;
3262
+
3263
+ parse_bracket_exp_espace:
3264
+ *err = REG_ESPACE;
3265
+ parse_bracket_exp_free_return:
3266
+ re_free (sbcset);
3267
+ #ifdef RE_ENABLE_I18N
3268
+ free_charset (mbcset);
3269
+ #endif /* RE_ENABLE_I18N */
3270
+ return NULL;
3271
+ }
3272
+
3273
+ /* Parse an element in the bracket expression. */
3274
+
3275
+ static reg_errcode_t
3276
+ parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3277
+ re_token_t *token, int token_len, re_dfa_t *dfa,
3278
+ reg_syntax_t syntax, int accept_hyphen)
3279
+ {
3280
+ #ifdef RE_ENABLE_I18N
3281
+ int cur_char_size;
3282
+ cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3283
+ if (cur_char_size > 1)
3284
+ {
3285
+ elem->type = MB_CHAR;
3286
+ elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3287
+ re_string_skip_bytes (regexp, cur_char_size);
3288
+ return REG_NOERROR;
3289
+ }
3290
+ #endif /* RE_ENABLE_I18N */
3291
+ re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3292
+ if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3293
+ || token->type == OP_OPEN_EQUIV_CLASS)
3294
+ return parse_bracket_symbol (elem, regexp, token);
3295
+ if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3296
+ {
3297
+ /* A '-' must only appear as anything but a range indicator before
3298
+ the closing bracket. Everything else is an error. */
3299
+ re_token_t token2;
3300
+ (void) peek_token_bracket (&token2, regexp, syntax);
3301
+ if (token2.type != OP_CLOSE_BRACKET)
3302
+ /* The actual error value is not standardized since this whole
3303
+ case is undefined. But ERANGE makes good sense. */
3304
+ return REG_ERANGE;
3305
+ }
3306
+ elem->type = SB_CHAR;
3307
+ elem->opr.ch = token->opr.c;
3308
+ return REG_NOERROR;
3309
+ }
3310
+
3311
+ /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3312
+ such as [:<character_class>:], [.<collating_element>.], and
3313
+ [=<equivalent_class>=]. */
3314
+
3315
+ static reg_errcode_t
3316
+ parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3317
+ re_token_t *token)
3318
+ {
3319
+ unsigned char ch, delim = token->opr.c;
3320
+ int i = 0;
3321
+ if (re_string_eoi(regexp))
3322
+ return REG_EBRACK;
3323
+ for (;; ++i)
3324
+ {
3325
+ if (i >= BRACKET_NAME_BUF_SIZE)
3326
+ return REG_EBRACK;
3327
+ if (token->type == OP_OPEN_CHAR_CLASS)
3328
+ ch = re_string_fetch_byte_case (regexp);
3329
+ else
3330
+ ch = re_string_fetch_byte (regexp);
3331
+ if (re_string_eoi(regexp))
3332
+ return REG_EBRACK;
3333
+ if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3334
+ break;
3335
+ elem->opr.name[i] = ch;
3336
+ }
3337
+ re_string_skip_bytes (regexp, 1);
3338
+ elem->opr.name[i] = '\0';
3339
+ switch (token->type)
3340
+ {
3341
+ case OP_OPEN_COLL_ELEM:
3342
+ elem->type = COLL_SYM;
3343
+ break;
3344
+ case OP_OPEN_EQUIV_CLASS:
3345
+ elem->type = EQUIV_CLASS;
3346
+ break;
3347
+ case OP_OPEN_CHAR_CLASS:
3348
+ elem->type = CHAR_CLASS;
3349
+ break;
3350
+ default:
3351
+ break;
3352
+ }
3353
+ return REG_NOERROR;
3354
+ }
3355
+
3356
+ /* Helper function for parse_bracket_exp.
3357
+ Build the equivalence class which is represented by NAME.
3358
+ The result are written to MBCSET and SBCSET.
3359
+ EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3360
+ is a pointer argument sinse we may update it. */
3361
+
3362
+ static reg_errcode_t
3363
+ #ifdef RE_ENABLE_I18N
3364
+ build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3365
+ int *equiv_class_alloc, const unsigned char *name)
3366
+ #else /* not RE_ENABLE_I18N */
3367
+ build_equiv_class (bitset_t sbcset, const unsigned char *name)
3368
+ #endif /* not RE_ENABLE_I18N */
3369
+ {
3370
+ #ifdef _LIBC
3371
+ uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3372
+ if (nrules != 0)
3373
+ {
3374
+ const int32_t *table, *indirect;
3375
+ const unsigned char *weights, *extra, *cp;
3376
+ unsigned char char_buf[2];
3377
+ int32_t idx1, idx2;
3378
+ unsigned int ch;
3379
+ size_t len;
3380
+ /* This #include defines a local function! */
3381
+ # include <locale/weight.h>
3382
+ /* Calculate the index for equivalence class. */
3383
+ cp = name;
3384
+ table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3385
+ weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3386
+ _NL_COLLATE_WEIGHTMB);
3387
+ extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3388
+ _NL_COLLATE_EXTRAMB);
3389
+ indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3390
+ _NL_COLLATE_INDIRECTMB);
3391
+ idx1 = findidx (&cp);
3392
+ if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3393
+ /* This isn't a valid character. */
3394
+ return REG_ECOLLATE;
3395
+
3396
+ /* Build single byte matcing table for this equivalence class. */
3397
+ char_buf[1] = (unsigned char) '\0';
3398
+ len = weights[idx1 & 0xffffff];
3399
+ for (ch = 0; ch < SBC_MAX; ++ch)
3400
+ {
3401
+ char_buf[0] = ch;
3402
+ cp = char_buf;
3403
+ idx2 = findidx (&cp);
3404
+ /*
3405
+ idx2 = table[ch];
3406
+ */
3407
+ if (idx2 == 0)
3408
+ /* This isn't a valid character. */
3409
+ continue;
3410
+ /* Compare only if the length matches and the collation rule
3411
+ index is the same. */
3412
+ if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3413
+ {
3414
+ int cnt = 0;
3415
+
3416
+ while (cnt <= len &&
3417
+ weights[(idx1 & 0xffffff) + 1 + cnt]
3418
+ == weights[(idx2 & 0xffffff) + 1 + cnt])
3419
+ ++cnt;
3420
+
3421
+ if (cnt > len)
3422
+ bitset_set (sbcset, ch);
3423
+ }
3424
+ }
3425
+ /* Check whether the array has enough space. */
3426
+ if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3427
+ {
3428
+ /* Not enough, realloc it. */
3429
+ /* +1 in case of mbcset->nequiv_classes is 0. */
3430
+ int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3431
+ /* Use realloc since the array is NULL if *alloc == 0. */
3432
+ int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3433
+ int32_t,
3434
+ new_equiv_class_alloc);
3435
+ if (BE (new_equiv_classes == NULL, 0))
3436
+ return REG_ESPACE;
3437
+ mbcset->equiv_classes = new_equiv_classes;
3438
+ *equiv_class_alloc = new_equiv_class_alloc;
3439
+ }
3440
+ mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3441
+ }
3442
+ else
3443
+ #endif /* _LIBC */
3444
+ {
3445
+ if (BE (strlen ((const char *) name) != 1, 0))
3446
+ return REG_ECOLLATE;
3447
+ bitset_set (sbcset, *name);
3448
+ }
3449
+ return REG_NOERROR;
3450
+ }
3451
+
3452
+ /* Helper function for parse_bracket_exp.
3453
+ Build the character class which is represented by NAME.
3454
+ The result are written to MBCSET and SBCSET.
3455
+ CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3456
+ is a pointer argument sinse we may update it. */
3457
+
3458
+ static reg_errcode_t
3459
+ #ifdef RE_ENABLE_I18N
3460
+ build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3461
+ re_charset_t *mbcset, int *char_class_alloc,
3462
+ const unsigned char *class_name, reg_syntax_t syntax)
3463
+ #else /* not RE_ENABLE_I18N */
3464
+ build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3465
+ const unsigned char *class_name, reg_syntax_t syntax)
3466
+ #endif /* not RE_ENABLE_I18N */
3467
+ {
3468
+ int i;
3469
+ const char *name = (const char *) class_name;
3470
+
3471
+ /* In case of REG_ICASE "upper" and "lower" match the both of
3472
+ upper and lower cases. */
3473
+ if ((syntax & RE_ICASE)
3474
+ && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3475
+ name = "alpha";
3476
+
3477
+ #ifdef RE_ENABLE_I18N
3478
+ /* Check the space of the arrays. */
3479
+ if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3480
+ {
3481
+ /* Not enough, realloc it. */
3482
+ /* +1 in case of mbcset->nchar_classes is 0. */
3483
+ int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3484
+ /* Use realloc since array is NULL if *alloc == 0. */
3485
+ wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3486
+ new_char_class_alloc);
3487
+ if (BE (new_char_classes == NULL, 0))
3488
+ return REG_ESPACE;
3489
+ mbcset->char_classes = new_char_classes;
3490
+ *char_class_alloc = new_char_class_alloc;
3491
+ }
3492
+ mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3493
+ #endif /* RE_ENABLE_I18N */
3494
+
3495
+ #define BUILD_CHARCLASS_LOOP(ctype_func) \
3496
+ do { \
3497
+ if (BE (trans != NULL, 0)) \
3498
+ { \
3499
+ for (i = 0; i < SBC_MAX; ++i) \
3500
+ if (ctype_func (i)) \
3501
+ bitset_set (sbcset, trans[i]); \
3502
+ } \
3503
+ else \
3504
+ { \
3505
+ for (i = 0; i < SBC_MAX; ++i) \
3506
+ if (ctype_func (i)) \
3507
+ bitset_set (sbcset, i); \
3508
+ } \
3509
+ } while (0)
3510
+
3511
+ if (strcmp (name, "alnum") == 0)
3512
+ BUILD_CHARCLASS_LOOP (isalnum);
3513
+ else if (strcmp (name, "cntrl") == 0)
3514
+ BUILD_CHARCLASS_LOOP (iscntrl);
3515
+ else if (strcmp (name, "lower") == 0)
3516
+ BUILD_CHARCLASS_LOOP (islower);
3517
+ else if (strcmp (name, "space") == 0)
3518
+ BUILD_CHARCLASS_LOOP (isspace);
3519
+ else if (strcmp (name, "alpha") == 0)
3520
+ BUILD_CHARCLASS_LOOP (isalpha);
3521
+ else if (strcmp (name, "digit") == 0)
3522
+ BUILD_CHARCLASS_LOOP (isdigit);
3523
+ else if (strcmp (name, "print") == 0)
3524
+ BUILD_CHARCLASS_LOOP (isprint);
3525
+ else if (strcmp (name, "upper") == 0)
3526
+ BUILD_CHARCLASS_LOOP (isupper);
3527
+ else if (strcmp (name, "blank") == 0)
3528
+ BUILD_CHARCLASS_LOOP (isblank);
3529
+ else if (strcmp (name, "graph") == 0)
3530
+ BUILD_CHARCLASS_LOOP (isgraph);
3531
+ else if (strcmp (name, "punct") == 0)
3532
+ BUILD_CHARCLASS_LOOP (ispunct);
3533
+ else if (strcmp (name, "xdigit") == 0)
3534
+ BUILD_CHARCLASS_LOOP (isxdigit);
3535
+ else
3536
+ return REG_ECTYPE;
3537
+
3538
+ return REG_NOERROR;
3539
+ }
3540
+
3541
+ static bin_tree_t *
3542
+ build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3543
+ const unsigned char *class_name,
3544
+ const unsigned char *extra, int non_match,
3545
+ reg_errcode_t *err)
3546
+ {
3547
+ re_bitset_ptr_t sbcset;
3548
+ #ifdef RE_ENABLE_I18N
3549
+ re_charset_t *mbcset;
3550
+ int alloc = 0;
3551
+ #endif /* not RE_ENABLE_I18N */
3552
+ reg_errcode_t ret;
3553
+ re_token_t br_token;
3554
+ bin_tree_t *tree;
3555
+
3556
+ sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3557
+ #ifdef RE_ENABLE_I18N
3558
+ mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3559
+ #endif /* RE_ENABLE_I18N */
3560
+
3561
+ #ifdef RE_ENABLE_I18N
3562
+ if (BE (sbcset == NULL || mbcset == NULL, 0))
3563
+ #else /* not RE_ENABLE_I18N */
3564
+ if (BE (sbcset == NULL, 0))
3565
+ #endif /* not RE_ENABLE_I18N */
3566
+ {
3567
+ *err = REG_ESPACE;
3568
+ return NULL;
3569
+ }
3570
+
3571
+ if (non_match)
3572
+ {
3573
+ #ifdef RE_ENABLE_I18N
3574
+ mbcset->non_match = 1;
3575
+ #endif /* not RE_ENABLE_I18N */
3576
+ }
3577
+
3578
+ /* We don't care the syntax in this case. */
3579
+ ret = build_charclass (trans, sbcset,
3580
+ #ifdef RE_ENABLE_I18N
3581
+ mbcset, &alloc,
3582
+ #endif /* RE_ENABLE_I18N */
3583
+ class_name, 0);
3584
+
3585
+ if (BE (ret != REG_NOERROR, 0))
3586
+ {
3587
+ re_free (sbcset);
3588
+ #ifdef RE_ENABLE_I18N
3589
+ free_charset (mbcset);
3590
+ #endif /* RE_ENABLE_I18N */
3591
+ *err = ret;
3592
+ return NULL;
3593
+ }
3594
+ /* \w match '_' also. */
3595
+ for (; *extra; extra++)
3596
+ bitset_set (sbcset, *extra);
3597
+
3598
+ /* If it is non-matching list. */
3599
+ if (non_match)
3600
+ bitset_not (sbcset);
3601
+
3602
+ #ifdef RE_ENABLE_I18N
3603
+ /* Ensure only single byte characters are set. */
3604
+ if (dfa->mb_cur_max > 1)
3605
+ bitset_mask (sbcset, dfa->sb_char);
3606
+ #endif
3607
+
3608
+ /* Build a tree for simple bracket. */
3609
+ br_token.type = SIMPLE_BRACKET;
3610
+ br_token.opr.sbcset = sbcset;
3611
+ tree = create_token_tree (dfa, NULL, NULL, &br_token);
3612
+ if (BE (tree == NULL, 0))
3613
+ goto build_word_op_espace;
3614
+
3615
+ #ifdef RE_ENABLE_I18N
3616
+ if (dfa->mb_cur_max > 1)
3617
+ {
3618
+ bin_tree_t *mbc_tree;
3619
+ /* Build a tree for complex bracket. */
3620
+ br_token.type = COMPLEX_BRACKET;
3621
+ br_token.opr.mbcset = mbcset;
3622
+ dfa->has_mb_node = 1;
3623
+ mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3624
+ if (BE (mbc_tree == NULL, 0))
3625
+ goto build_word_op_espace;
3626
+ /* Then join them by ALT node. */
3627
+ tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3628
+ if (BE (mbc_tree != NULL, 1))
3629
+ return tree;
3630
+ }
3631
+ else
3632
+ {
3633
+ free_charset (mbcset);
3634
+ return tree;
3635
+ }
3636
+ #else /* not RE_ENABLE_I18N */
3637
+ return tree;
3638
+ #endif /* not RE_ENABLE_I18N */
3639
+
3640
+ build_word_op_espace:
3641
+ re_free (sbcset);
3642
+ #ifdef RE_ENABLE_I18N
3643
+ free_charset (mbcset);
3644
+ #endif /* RE_ENABLE_I18N */
3645
+ *err = REG_ESPACE;
3646
+ return NULL;
3647
+ }
3648
+
3649
+ /* This is intended for the expressions like "a{1,3}".
3650
+ Fetch a number from `input', and return the number.
3651
+ Return -1, if the number field is empty like "{,1}".
3652
+ Return -2, If an error is occured. */
3653
+
3654
+ static int
3655
+ fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3656
+ {
3657
+ int num = -1;
3658
+ unsigned char c;
3659
+ while (1)
3660
+ {
3661
+ fetch_token (token, input, syntax);
3662
+ c = token->opr.c;
3663
+ if (BE (token->type == END_OF_RE, 0))
3664
+ return -2;
3665
+ if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3666
+ break;
3667
+ num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3668
+ ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3669
+ num = (num > RE_DUP_MAX) ? -2 : num;
3670
+ }
3671
+ return num;
3672
+ }
3673
+
3674
+ #ifdef RE_ENABLE_I18N
3675
+ static void
3676
+ free_charset (re_charset_t *cset)
3677
+ {
3678
+ re_free (cset->mbchars);
3679
+ # ifdef _LIBC
3680
+ re_free (cset->coll_syms);
3681
+ re_free (cset->equiv_classes);
3682
+ re_free (cset->range_starts);
3683
+ re_free (cset->range_ends);
3684
+ # endif
3685
+ re_free (cset->char_classes);
3686
+ re_free (cset);
3687
+ }
3688
+ #endif /* RE_ENABLE_I18N */
3689
+
3690
+ /* Functions for binary tree operation. */
3691
+
3692
+ /* Create a tree node. */
3693
+
3694
+ static bin_tree_t *
3695
+ create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3696
+ re_token_type_t type)
3697
+ {
3698
+ re_token_t t;
3699
+ t.type = type;
3700
+ return create_token_tree (dfa, left, right, &t);
3701
+ }
3702
+
3703
+ static bin_tree_t *
3704
+ create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3705
+ const re_token_t *token)
3706
+ {
3707
+ bin_tree_t *tree;
3708
+ if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3709
+ {
3710
+ bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3711
+
3712
+ if (storage == NULL)
3713
+ return NULL;
3714
+ storage->next = dfa->str_tree_storage;
3715
+ dfa->str_tree_storage = storage;
3716
+ dfa->str_tree_storage_idx = 0;
3717
+ }
3718
+ tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3719
+
3720
+ tree->parent = NULL;
3721
+ tree->left = left;
3722
+ tree->right = right;
3723
+ tree->token = *token;
3724
+ tree->token.duplicated = 0;
3725
+ tree->token.opt_subexp = 0;
3726
+ tree->first = NULL;
3727
+ tree->next = NULL;
3728
+ tree->node_idx = -1;
3729
+
3730
+ if (left != NULL)
3731
+ left->parent = tree;
3732
+ if (right != NULL)
3733
+ right->parent = tree;
3734
+ return tree;
3735
+ }
3736
+
3737
+ /* Mark the tree SRC as an optional subexpression.
3738
+ To be called from preorder or postorder. */
3739
+
3740
+ static reg_errcode_t
3741
+ mark_opt_subexp (void *extra, bin_tree_t *node)
3742
+ {
3743
+ int idx = (int) (long) extra;
3744
+ if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3745
+ node->token.opt_subexp = 1;
3746
+
3747
+ return REG_NOERROR;
3748
+ }
3749
+
3750
+ /* Free the allocated memory inside NODE. */
3751
+
3752
+ static void
3753
+ free_token (re_token_t *node)
3754
+ {
3755
+ #ifdef RE_ENABLE_I18N
3756
+ if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3757
+ free_charset (node->opr.mbcset);
3758
+ else
3759
+ #endif /* RE_ENABLE_I18N */
3760
+ if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3761
+ re_free (node->opr.sbcset);
3762
+ }
3763
+
3764
+ /* Worker function for tree walking. Free the allocated memory inside NODE
3765
+ and its children. */
3766
+
3767
+ static reg_errcode_t
3768
+ free_tree (void *extra, bin_tree_t *node)
3769
+ {
3770
+ free_token (&node->token);
3771
+ return REG_NOERROR;
3772
+ }
3773
+
3774
+
3775
+ /* Duplicate the node SRC, and return new node. This is a preorder
3776
+ visit similar to the one implemented by the generic visitor, but
3777
+ we need more infrastructure to maintain two parallel trees --- so,
3778
+ it's easier to duplicate. */
3779
+
3780
+ static bin_tree_t *
3781
+ duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3782
+ {
3783
+ const bin_tree_t *node;
3784
+ bin_tree_t *dup_root;
3785
+ bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3786
+
3787
+ for (node = root; ; )
3788
+ {
3789
+ /* Create a new tree and link it back to the current parent. */
3790
+ *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3791
+ if (*p_new == NULL)
3792
+ return NULL;
3793
+ (*p_new)->parent = dup_node;
3794
+ (*p_new)->token.duplicated = 1;
3795
+ dup_node = *p_new;
3796
+
3797
+ /* Go to the left node, or up and to the right. */
3798
+ if (node->left)
3799
+ {
3800
+ node = node->left;
3801
+ p_new = &dup_node->left;
3802
+ }
3803
+ else
3804
+ {
3805
+ const bin_tree_t *prev = NULL;
3806
+ while (node->right == prev || node->right == NULL)
3807
+ {
3808
+ prev = node;
3809
+ node = node->parent;
3810
+ dup_node = dup_node->parent;
3811
+ if (!node)
3812
+ return dup_root;
3813
+ }
3814
+ node = node->right;
3815
+ p_new = &dup_node->right;
3816
+ }
3817
+ }
3818
+ }