ox 2.14.14 → 2.14.17

Sign up to get free protection for your applications and to get access to all the features.
data/ext/ox/parse.c CHANGED
@@ -3,35 +3,35 @@
3
3
  * All rights reserved.
4
4
  */
5
5
 
6
- #include <stdlib.h>
7
6
  #include <errno.h>
8
7
  #include <stdbool.h>
9
8
  #include <stdio.h>
9
+ #include <stdlib.h>
10
10
  #include <string.h>
11
11
  #include <strings.h>
12
12
 
13
- #include "ruby.h"
14
- #include "ox.h"
15
- #include "err.h"
16
13
  #include "attr.h"
17
- #include "intern.h"
14
+ #include "err.h"
18
15
  #include "helper.h"
16
+ #include "intern.h"
17
+ #include "ox.h"
18
+ #include "ruby.h"
19
19
  #include "special.h"
20
20
 
21
- static void read_instruction(PInfo pi);
22
- static void read_doctype(PInfo pi);
23
- static void read_comment(PInfo pi);
24
- static char* read_element(PInfo pi);
25
- static void read_text(PInfo pi);
21
+ static void read_instruction(PInfo pi);
22
+ static void read_doctype(PInfo pi);
23
+ static void read_comment(PInfo pi);
24
+ static char *read_element(PInfo pi);
25
+ static void read_text(PInfo pi);
26
26
  /*static void read_reduced_text(PInfo pi); */
27
- static void read_cdata(PInfo pi);
28
- static char* read_name_token(PInfo pi);
29
- static char* read_quoted_value(PInfo pi);
30
- static char* read_hex_uint64(char *b, uint64_t *up);
31
- static char* read_10_uint64(char *b, uint64_t *up);
32
- static char* read_coded_chars(PInfo pi, char *text);
33
- static void next_non_white(PInfo pi);
34
- static int collapse_special(PInfo pi, char *str);
27
+ static void read_cdata(PInfo pi);
28
+ static char *read_name_token(PInfo pi);
29
+ static char *read_quoted_value(PInfo pi);
30
+ static char *read_hex_uint64(char *b, uint64_t *up);
31
+ static char *read_10_uint64(char *b, uint64_t *up);
32
+ static char *read_coded_chars(PInfo pi, char *text);
33
+ static void next_non_white(PInfo pi);
34
+ static int collapse_special(PInfo pi, char *str);
35
35
 
36
36
  /* This XML parser is a single pass, destructive, callback parser. It is a
37
37
  * single pass parse since it only make one pass over the characters in the
@@ -46,53 +46,43 @@ static int collapse_special(PInfo pi, char *str);
46
46
  * all cases to parse the string.
47
47
  */
48
48
 
49
- static char xml_valid_lower_chars[34] = "xxxxxxxxxooxxoxxxxxxxxxxxxxxxxxxo";
49
+ static char xml_valid_lower_chars[34] = "xxxxxxxxxooxxoxxxxxxxxxxxxxxxxxxo";
50
50
 
51
- inline static int
52
- is_white(char c) {
51
+ inline static int is_white(char c) {
53
52
  switch (c) {
54
53
  case ' ':
55
54
  case '\t':
56
55
  case '\f':
57
56
  case '\n':
58
- case '\r':
59
- return 1;
60
- default:
61
- return 0;
57
+ case '\r': return 1;
58
+ default: return 0;
62
59
  }
63
60
  }
64
61
 
65
-
66
- inline static void
67
- next_non_white(PInfo pi) {
62
+ inline static void next_non_white(PInfo pi) {
68
63
  for (; 1; pi->s++) {
69
- switch (*pi->s) {
70
- case ' ':
71
- case '\t':
72
- case '\f':
73
- case '\n':
74
- case '\r':
75
- break;
76
- default:
77
- return;
78
- }
64
+ switch (*pi->s) {
65
+ case ' ':
66
+ case '\t':
67
+ case '\f':
68
+ case '\n':
69
+ case '\r': break;
70
+ default: return;
71
+ }
79
72
  }
80
73
  }
81
74
 
82
- inline static void
83
- next_white(PInfo pi) {
75
+ inline static void next_white(PInfo pi) {
84
76
  for (; 1; pi->s++) {
85
- switch (*pi->s) {
86
- case ' ':
87
- case '\t':
88
- case '\f':
89
- case '\n':
90
- case '\r':
91
- case '\0':
92
- return;
93
- default:
94
- break;
95
- }
77
+ switch (*pi->s) {
78
+ case ' ':
79
+ case '\t':
80
+ case '\f':
81
+ case '\n':
82
+ case '\r':
83
+ case '\0': return;
84
+ default: break;
85
+ }
96
86
  }
97
87
  }
98
88
 
@@ -100,53 +90,52 @@ static void fix_newlines(char *buf) {
100
90
  #if HAVE_INDEX
101
91
  if (NULL != index(buf, '\r')) {
102
92
  #endif
103
- char *s = buf;
104
- char *d = buf;
105
-
106
- for (; '\0' != *s; s++) {
107
- if ('\r' == *s) {
108
- if ('\n' == *(s + 1)) {
109
- continue;
110
- }
111
- *d = '\n';
112
- } else if (d < s) {
113
- *d = *s;
114
- }
115
- d++;
116
- }
117
- *d = '\0';
93
+ char *s = buf;
94
+ char *d = buf;
95
+
96
+ for (; '\0' != *s; s++) {
97
+ if ('\r' == *s) {
98
+ if ('\n' == *(s + 1)) {
99
+ continue;
100
+ }
101
+ *d = '\n';
102
+ } else if (d < s) {
103
+ *d = *s;
104
+ }
105
+ d++;
106
+ }
107
+ *d = '\0';
118
108
  #if HAVE_INDEX
119
109
  }
120
110
  #endif
121
111
  }
122
112
 
123
- static void
124
- mark_pi_cb(void *ptr) {
113
+ static void mark_pi_cb(void *ptr) {
125
114
  if (NULL != ptr) {
126
- HelperStack stack = &((PInfo)ptr)->helpers;
127
- Helper h;
115
+ HelperStack stack = &((PInfo)ptr)->helpers;
116
+ Helper h;
128
117
 
129
- for (h = stack->head; h < stack->tail; h++) {
130
- if (NoCode != h->type) {
131
- rb_gc_mark(h->obj);
132
- }
133
- }
118
+ for (h = stack->head; h < stack->tail; h++) {
119
+ if (NoCode != h->type) {
120
+ rb_gc_mark(h->obj);
121
+ }
122
+ }
134
123
  }
135
124
  }
136
125
 
137
126
  VALUE
138
127
  ox_parse(char *xml, size_t len, ParseCallbacks pcb, char **endp, Options options, Err err) {
139
- struct _pInfo pi;
140
- int body_read = 0;
141
- int block_given = rb_block_given_p();
142
- volatile VALUE wrap;
128
+ struct _pInfo pi;
129
+ int body_read = 0;
130
+ int block_given = rb_block_given_p();
131
+ volatile VALUE wrap;
143
132
 
144
133
  if (0 == xml) {
145
- set_error(err, "Invalid arg, xml string can not be null", xml, 0);
146
- return Qnil;
134
+ set_error(err, "Invalid arg, xml string can not be null", xml, 0);
135
+ return Qnil;
147
136
  }
148
137
  if (DEBUG <= options->trace) {
149
- printf("Parsing xml:\n%s\n", xml);
138
+ printf("Parsing xml:\n%s\n", xml);
150
139
  }
151
140
  // initialize parse info
152
141
  helper_stack_init(&pi.helpers);
@@ -154,663 +143,637 @@ ox_parse(char *xml, size_t len, ParseCallbacks pcb, char **endp, Options options
154
143
  wrap = Data_Wrap_Struct(rb_cObject, mark_pi_cb, NULL, &pi);
155
144
 
156
145
  err_init(&pi.err);
157
- pi.str = xml;
158
- pi.end = pi.str + len;
159
- pi.s = xml;
160
- pi.pcb = pcb;
161
- pi.obj = Qnil;
146
+ pi.str = xml;
147
+ pi.end = pi.str + len;
148
+ pi.s = xml;
149
+ pi.pcb = pcb;
150
+ pi.obj = Qnil;
162
151
  pi.circ_array = 0;
163
- pi.options = options;
164
- pi.marked = NULL;
165
- pi.mark_size = 0;
166
- pi.mark_cnt = 0;
152
+ pi.options = options;
153
+ pi.marked = NULL;
154
+ pi.mark_size = 0;
155
+ pi.mark_cnt = 0;
167
156
  while (1) {
168
- next_non_white(&pi); // skip white space
169
- if ('\0' == *pi.s) {
170
- break;
171
- }
172
- if (body_read && 0 != endp) {
173
- *endp = pi.s;
174
- break;
175
- }
176
- if ('<' != *pi.s) { // all top level entities start with <
177
- set_error(err, "invalid format, expected <", pi.str, pi.s);
178
- helper_stack_cleanup(&pi.helpers);
179
- return Qnil;
180
- }
181
- pi.s++; // past <
182
- switch (*pi.s) {
183
- case '?': // processing instruction
184
- pi.s++;
185
- read_instruction(&pi);
186
- break;
187
- case '!': // comment or doctype
188
- pi.s++;
189
- if ('\0' == *pi.s) {
190
- set_error(err, "invalid format, DOCTYPE or comment not terminated", pi.str, pi.s);
191
- helper_stack_cleanup(&pi.helpers);
192
- return Qnil;
193
- } else if ('-' == *pi.s) {
194
- pi.s++; // skip -
195
- if ('-' != *pi.s) {
196
- set_error(err, "invalid format, bad comment format", pi.str, pi.s);
197
- helper_stack_cleanup(&pi.helpers);
198
- return Qnil;
199
- } else {
200
- pi.s++; // skip second -
201
- read_comment(&pi);
202
- }
203
- } else if ((TolerantEffort == options->effort) ? 0 == strncasecmp("DOCTYPE", pi.s, 7) : 0 == strncmp("DOCTYPE", pi.s, 7)) {
204
- pi.s += 7;
205
- read_doctype(&pi);
206
- } else {
207
- set_error(err, "invalid format, DOCTYPE or comment expected", pi.str, pi.s);
208
- helper_stack_cleanup(&pi.helpers);
209
- return Qnil;
210
- }
211
- break;
212
- case '\0':
213
- set_error(err, "invalid format, document not terminated", pi.str, pi.s);
214
- helper_stack_cleanup(&pi.helpers);
215
- return Qnil;
216
- default:
217
- read_element(&pi);
218
- body_read = 1;
219
- break;
220
- }
221
- if (err_has(&pi.err)) {
222
- *err = pi.err;
223
- helper_stack_cleanup(&pi.helpers);
224
- return Qnil;
225
- }
226
- if (block_given && Qnil != pi.obj && Qundef != pi.obj) {
227
- if (NULL != pcb->finish) {
228
- pcb->finish(&pi);
229
- }
230
- rb_yield(pi.obj);
231
- }
157
+ next_non_white(&pi); // skip white space
158
+ if ('\0' == *pi.s) {
159
+ break;
160
+ }
161
+ if (body_read && 0 != endp) {
162
+ *endp = pi.s;
163
+ break;
164
+ }
165
+ if ('<' != *pi.s) { // all top level entities start with <
166
+ set_error(err, "invalid format, expected <", pi.str, pi.s);
167
+ helper_stack_cleanup(&pi.helpers);
168
+ return Qnil;
169
+ }
170
+ pi.s++; // past <
171
+ switch (*pi.s) {
172
+ case '?': // processing instruction
173
+ pi.s++;
174
+ read_instruction(&pi);
175
+ break;
176
+ case '!': // comment or doctype
177
+ pi.s++;
178
+ if ('\0' == *pi.s) {
179
+ set_error(err, "invalid format, DOCTYPE or comment not terminated", pi.str, pi.s);
180
+ helper_stack_cleanup(&pi.helpers);
181
+ return Qnil;
182
+ } else if ('-' == *pi.s) {
183
+ pi.s++; // skip -
184
+ if ('-' != *pi.s) {
185
+ set_error(err, "invalid format, bad comment format", pi.str, pi.s);
186
+ helper_stack_cleanup(&pi.helpers);
187
+ return Qnil;
188
+ } else {
189
+ pi.s++; // skip second -
190
+ read_comment(&pi);
191
+ }
192
+ } else if ((TolerantEffort == options->effort) ? 0 == strncasecmp("DOCTYPE", pi.s, 7)
193
+ : 0 == strncmp("DOCTYPE", pi.s, 7)) {
194
+ pi.s += 7;
195
+ read_doctype(&pi);
196
+ } else {
197
+ set_error(err, "invalid format, DOCTYPE or comment expected", pi.str, pi.s);
198
+ helper_stack_cleanup(&pi.helpers);
199
+ return Qnil;
200
+ }
201
+ break;
202
+ case '\0':
203
+ set_error(err, "invalid format, document not terminated", pi.str, pi.s);
204
+ helper_stack_cleanup(&pi.helpers);
205
+ return Qnil;
206
+ default:
207
+ read_element(&pi);
208
+ body_read = 1;
209
+ break;
210
+ }
211
+ if (err_has(&pi.err)) {
212
+ *err = pi.err;
213
+ helper_stack_cleanup(&pi.helpers);
214
+ return Qnil;
215
+ }
216
+ if (block_given && Qnil != pi.obj && Qundef != pi.obj) {
217
+ if (NULL != pcb->finish) {
218
+ pcb->finish(&pi);
219
+ }
220
+ rb_yield(pi.obj);
221
+ }
232
222
  }
233
223
  DATA_PTR(wrap) = NULL;
234
224
  helper_stack_cleanup(&pi.helpers);
235
225
  if (NULL != pcb->finish) {
236
- pcb->finish(&pi);
226
+ pcb->finish(&pi);
237
227
  }
238
228
  return pi.obj;
239
229
  }
240
230
 
241
231
  // Entered after the "<?" sequence. Ready to read the rest.
242
- static void
243
- read_instruction(PInfo pi) {
244
- char content[256];
245
- char *content_ptr;
246
- struct _attrStack attrs;
247
- char *attr_name;
248
- char *attr_value;
249
- char *target;
250
- char *end;
251
- char c;
252
- char *cend;
253
- size_t size;
254
- bool attrs_ok = true;
232
+ static void read_instruction(PInfo pi) {
233
+ char content[256];
234
+ char *content_ptr;
235
+ struct _attrStack attrs;
236
+ char *attr_name;
237
+ char *attr_value;
238
+ char *target;
239
+ char *end;
240
+ char c;
241
+ char *cend;
242
+ size_t size;
243
+ bool attrs_ok = true;
255
244
 
256
245
  *content = '\0';
257
246
  attr_stack_init(&attrs);
258
247
  if (0 == (target = read_name_token(pi))) {
259
- return;
248
+ return;
260
249
  }
261
250
  end = pi->s;
262
251
  for (; true; pi->s++) {
263
252
  switch (*pi->s) {
264
253
  case '?':
265
254
  if ('>' == *(pi->s + 1)) {
266
- pi->s++;
267
- goto DONE;
255
+ pi->s++;
256
+ goto DONE;
268
257
  }
269
258
  break;
270
- case '\0':
271
- set_error(&pi->err, "processing instruction not terminated", pi->str, pi->s);
272
- return;
273
- default:
274
- break;
259
+ case '\0': set_error(&pi->err, "processing instruction not terminated", pi->str, pi->s); return;
260
+ default: break;
275
261
  }
276
262
  }
277
263
  DONE:
278
- cend = pi->s;
279
- size = cend - end - 1;
264
+ cend = pi->s;
265
+ size = cend - end - 1;
280
266
  pi->s = end;
281
267
  if (size < sizeof(content)) {
282
- content_ptr = content;
268
+ content_ptr = content;
283
269
  } else {
284
- content_ptr = ALLOC_N(char, size + 1);
270
+ content_ptr = ALLOC_N(char, size + 1);
285
271
  }
286
272
  memcpy(content_ptr, end, size);
287
273
  content_ptr[size] = '\0';
288
274
 
289
275
  next_non_white(pi);
290
- c = *pi->s;
291
- *end = '\0'; // terminate name
276
+ c = *pi->s;
277
+ *end = '\0'; // terminate name
292
278
  if ('?' != c) {
293
- while ('?' != c) {
294
- pi->last = 0;
295
- if ('\0' == *pi->s) {
296
- attr_stack_cleanup(&attrs);
297
- set_error(&pi->err, "invalid format, processing instruction not terminated", pi->str, pi->s);
298
- return;
299
- }
300
- next_non_white(pi);
301
- if (0 == (attr_name = read_name_token(pi))) {
302
- attr_stack_cleanup(&attrs);
303
- return;
304
- }
305
- end = pi->s;
306
- next_non_white(pi);
307
- if ('=' != *pi->s++) {
308
- attrs_ok = false;
309
- break;
310
- }
311
- *end = '\0'; // terminate name
312
- // read value
313
- next_non_white(pi);
314
- if (0 == (attr_value = read_quoted_value(pi))) {
315
- attr_stack_cleanup(&attrs);
316
- return;
317
- }
318
- attr_stack_push(&attrs, attr_name, attr_value);
319
- next_non_white(pi);
320
- if ('\0' == pi->last) {
321
- c = *pi->s;
322
- } else {
323
- c = pi->last;
324
- }
325
- }
326
- if ('?' == *pi->s) {
327
- pi->s++;
328
- }
279
+ while ('?' != c) {
280
+ pi->last = 0;
281
+ if ('\0' == *pi->s) {
282
+ attr_stack_cleanup(&attrs);
283
+ set_error(&pi->err, "invalid format, processing instruction not terminated", pi->str, pi->s);
284
+ return;
285
+ }
286
+ next_non_white(pi);
287
+ if (0 == (attr_name = read_name_token(pi))) {
288
+ attr_stack_cleanup(&attrs);
289
+ return;
290
+ }
291
+ end = pi->s;
292
+ next_non_white(pi);
293
+ if ('=' != *pi->s++) {
294
+ attrs_ok = false;
295
+ break;
296
+ }
297
+ *end = '\0'; // terminate name
298
+ // read value
299
+ next_non_white(pi);
300
+ if (0 == (attr_value = read_quoted_value(pi))) {
301
+ attr_stack_cleanup(&attrs);
302
+ return;
303
+ }
304
+ attr_stack_push(&attrs, attr_name, attr_value);
305
+ next_non_white(pi);
306
+ if ('\0' == pi->last) {
307
+ c = *pi->s;
308
+ } else {
309
+ c = pi->last;
310
+ }
311
+ }
312
+ if ('?' == *pi->s) {
313
+ pi->s++;
314
+ }
329
315
  } else {
330
- pi->s++;
316
+ pi->s++;
331
317
  }
332
318
  if (attrs_ok) {
333
- if ('>' != *pi->s++) {
334
- attr_stack_cleanup(&attrs);
335
- set_error(&pi->err, "invalid format, processing instruction not terminated", pi->str, pi->s);
336
- return;
337
- }
319
+ if ('>' != *pi->s++) {
320
+ attr_stack_cleanup(&attrs);
321
+ set_error(&pi->err, "invalid format, processing instruction not terminated", pi->str, pi->s);
322
+ return;
323
+ }
338
324
  } else {
339
- pi->s = cend + 1;
325
+ pi->s = cend + 1;
340
326
  }
341
327
  if (0 != pi->pcb->instruct) {
342
- if (attrs_ok) {
343
- pi->pcb->instruct(pi, target, attrs.head, 0);
344
- } else {
345
- pi->pcb->instruct(pi, target, attrs.head, content_ptr);
346
- }
328
+ if (attrs_ok) {
329
+ pi->pcb->instruct(pi, target, attrs.head, 0);
330
+ } else {
331
+ pi->pcb->instruct(pi, target, attrs.head, content_ptr);
332
+ }
347
333
  }
348
334
  attr_stack_cleanup(&attrs);
349
335
  if (content_ptr != content) {
350
- xfree(content_ptr);
336
+ xfree(content_ptr);
351
337
  }
352
338
  }
353
339
 
354
- static void
355
- read_delimited(PInfo pi, char end) {
356
- char c;
340
+ static void read_delimited(PInfo pi, char end) {
341
+ char c;
357
342
 
358
343
  if ('"' == end || '\'' == end) {
359
- for (c = *pi->s++; end != c; c = *pi->s++) {
360
- if ('\0' == c) {
361
- set_error(&pi->err, "invalid format, dectype not terminated", pi->str, pi->s);
362
- return;
363
- }
364
- }
344
+ for (c = *pi->s++; end != c; c = *pi->s++) {
345
+ if ('\0' == c) {
346
+ set_error(&pi->err, "invalid format, dectype not terminated", pi->str, pi->s);
347
+ return;
348
+ }
349
+ }
365
350
  } else {
366
- while (1) {
367
- c = *pi->s++;
368
- if (end == c) {
369
- return;
370
- }
371
- switch (c) {
372
- case '\0':
373
- set_error(&pi->err, "invalid format, dectype not terminated", pi->str, pi->s);
374
- return;
375
- case '"':
376
- read_delimited(pi, c);
377
- break;
378
- case '\'':
379
- read_delimited(pi, c);
380
- break;
381
- case '[':
382
- read_delimited(pi, ']');
383
- break;
384
- case '<':
385
- read_delimited(pi, '>');
386
- break;
387
- default:
388
- break;
389
- }
390
- }
351
+ while (1) {
352
+ c = *pi->s++;
353
+ if (end == c) {
354
+ return;
355
+ }
356
+ switch (c) {
357
+ case '\0': set_error(&pi->err, "invalid format, dectype not terminated", pi->str, pi->s); return;
358
+ case '"': read_delimited(pi, c); break;
359
+ case '\'': read_delimited(pi, c); break;
360
+ case '[': read_delimited(pi, ']'); break;
361
+ case '<': read_delimited(pi, '>'); break;
362
+ default: break;
363
+ }
364
+ }
391
365
  }
392
366
  }
393
367
 
394
368
  // Entered after the "<!DOCTYPE" sequence plus the first character after
395
369
  // that. Ready to read the rest.
396
- static void
397
- read_doctype(PInfo pi) {
398
- char *doctype;
370
+ static void read_doctype(PInfo pi) {
371
+ char *doctype;
399
372
 
400
373
  next_non_white(pi);
401
374
  doctype = pi->s;
402
375
  read_delimited(pi, '>');
403
376
  if (err_has(&pi->err)) {
404
- return;
377
+ return;
405
378
  }
406
379
  pi->s--;
407
380
  *pi->s = '\0';
408
381
  pi->s++;
409
382
  if (0 != pi->pcb->add_doctype) {
410
- fix_newlines(doctype);
411
- pi->pcb->add_doctype(pi, doctype);
383
+ fix_newlines(doctype);
384
+ pi->pcb->add_doctype(pi, doctype);
412
385
  }
413
386
  }
414
387
 
415
388
  // Entered after "<!--". Returns error code.
416
- static void
417
- read_comment(PInfo pi) {
418
- char *end;
419
- char *s;
420
- char *comment;
421
- int done = 0;
389
+ static void read_comment(PInfo pi) {
390
+ char *end;
391
+ char *s;
392
+ char *comment;
393
+ int done = 0;
422
394
 
423
395
  next_non_white(pi);
424
396
  comment = pi->s;
425
- end = strstr(pi->s, "-->");
397
+ end = strstr(pi->s, "-->");
426
398
  if (0 == end) {
427
- set_error(&pi->err, "invalid format, comment not terminated", pi->str, pi->s);
428
- return;
399
+ set_error(&pi->err, "invalid format, comment not terminated", pi->str, pi->s);
400
+ return;
429
401
  }
430
402
  for (s = end - 1; pi->s < s && !done; s--) {
431
- switch(*s) {
432
- case ' ':
433
- case '\t':
434
- case '\f':
435
- case '\n':
436
- case '\r':
437
- break;
438
- default:
439
- *(s + 1) = '\0';
440
- done = 1;
441
- break;
442
- }
403
+ switch (*s) {
404
+ case ' ':
405
+ case '\t':
406
+ case '\f':
407
+ case '\n':
408
+ case '\r': break;
409
+ default:
410
+ *(s + 1) = '\0';
411
+ done = 1;
412
+ break;
413
+ }
443
414
  }
444
- *end = '\0'; // in case the comment was blank
415
+ *end = '\0'; // in case the comment was blank
445
416
  pi->s = end + 3;
446
417
  if (0 != pi->pcb->add_comment) {
447
- fix_newlines(comment);
448
- pi->pcb->add_comment(pi, comment);
418
+ fix_newlines(comment);
419
+ pi->pcb->add_comment(pi, comment);
449
420
  }
450
421
  }
451
422
 
452
423
  // Entered after the '<' and the first character after that. Returns stat
453
424
  // code.
454
- static char*
455
- read_element(PInfo pi) {
456
- struct _attrStack attrs;
457
- const char *attr_name;
458
- const char *attr_value;
459
- char *name;
460
- char *ename;
461
- char *end;
462
- char c;
463
- long elen;
464
- int hasChildren = 0;
465
- int done = 0;
425
+ static char *read_element(PInfo pi) {
426
+ struct _attrStack attrs;
427
+ const char *attr_name;
428
+ const char *attr_value;
429
+ char *name;
430
+ char *ename;
431
+ char *end;
432
+ char c;
433
+ long elen;
434
+ int hasChildren = 0;
435
+ int done = 0;
466
436
 
467
437
  attr_stack_init(&attrs);
468
438
  if (0 == (ename = read_name_token(pi))) {
469
- return 0;
439
+ return 0;
470
440
  }
471
- end = pi->s;
441
+ end = pi->s;
472
442
  elen = end - ename;
473
443
  next_non_white(pi);
474
- c = *pi->s;
444
+ c = *pi->s;
475
445
  *end = '\0';
476
446
  if ('/' == c) {
477
- // empty element, no attributes and no children
478
- pi->s++;
479
- if ('>' != *pi->s) {
480
- attr_stack_cleanup(&attrs);
481
- set_error(&pi->err, "invalid format, element not closed", pi->str, pi->s);
482
- return 0;
483
- }
484
- pi->s++; /* past > */
485
- pi->pcb->add_element(pi, ename, attrs.head, hasChildren);
486
- pi->pcb->end_element(pi, ename);
447
+ // empty element, no attributes and no children
448
+ pi->s++;
449
+ if ('>' != *pi->s) {
450
+ attr_stack_cleanup(&attrs);
451
+ set_error(&pi->err, "invalid format, element not closed", pi->str, pi->s);
452
+ return 0;
453
+ }
454
+ pi->s++; /* past > */
455
+ pi->pcb->add_element(pi, ename, attrs.head, hasChildren);
456
+ pi->pcb->end_element(pi, ename);
487
457
 
488
- attr_stack_cleanup(&attrs);
489
- return 0;
458
+ attr_stack_cleanup(&attrs);
459
+ return 0;
490
460
  }
491
461
  /* read attribute names until the close (/ or >) is reached */
492
462
  while (!done) {
493
- if ('\0' == c) {
494
- if (pi->end <= pi->s) {
495
- break;
496
- }
497
- next_non_white(pi);
498
- c = *pi->s;
499
- }
500
- pi->last = 0;
501
- switch (c) {
502
- case '\0':
503
- attr_stack_cleanup(&attrs);
504
- set_error(&pi->err, "invalid format, document not terminated", pi->str, pi->s);
505
- return 0;
506
- case '/':
507
- /* Element with just attributes. */
508
- pi->s++;
509
- if ('>' != *pi->s) {
510
- attr_stack_cleanup(&attrs);
511
- set_error(&pi->err, "invalid format, element not closed", pi->str, pi->s);
512
- return 0;
513
- }
514
- pi->s++;
515
- pi->pcb->add_element(pi, ename, attrs.head, hasChildren);
516
- pi->pcb->end_element(pi, ename);
517
- attr_stack_cleanup(&attrs);
518
-
519
- return 0;
520
- case '>':
521
- /* has either children or a value */
522
- pi->s++;
523
- hasChildren = 1;
524
- done = 1;
525
- pi->pcb->add_element(pi, ename, attrs.head, hasChildren);
463
+ if ('\0' == c) {
464
+ if (pi->end <= pi->s) {
465
+ break;
466
+ }
467
+ next_non_white(pi);
468
+ c = *pi->s;
469
+ }
470
+ pi->last = 0;
471
+ switch (c) {
472
+ case '\0':
473
+ attr_stack_cleanup(&attrs);
474
+ set_error(&pi->err, "invalid format, document not terminated", pi->str, pi->s);
475
+ return 0;
476
+ case '/':
477
+ /* Element with just attributes. */
478
+ pi->s++;
479
+ if ('>' != *pi->s) {
480
+ attr_stack_cleanup(&attrs);
481
+ set_error(&pi->err, "invalid format, element not closed", pi->str, pi->s);
482
+ return 0;
483
+ }
484
+ pi->s++;
485
+ pi->pcb->add_element(pi, ename, attrs.head, hasChildren);
486
+ pi->pcb->end_element(pi, ename);
487
+ attr_stack_cleanup(&attrs);
488
+
489
+ return 0;
490
+ case '>':
491
+ /* has either children or a value */
492
+ pi->s++;
493
+ hasChildren = 1;
494
+ done = 1;
495
+ pi->pcb->add_element(pi, ename, attrs.head, hasChildren);
526
496
 
527
- break;
528
- default:
529
- /* Attribute name so it's an element and the attribute will be */
530
- /* added to it. */
531
- if (0 == (attr_name = read_name_token(pi))) {
532
- attr_stack_cleanup(&attrs);
533
- return 0;
534
- }
535
- end = pi->s;
536
- next_non_white(pi);
537
- if ('=' != *pi->s++) {
538
- if (TolerantEffort == pi->options->effort) {
539
- pi->s--;
540
- pi->last = *pi->s;
541
- *end = '\0'; /* terminate name */
542
- attr_value = "";
543
- attr_stack_push(&attrs, attr_name, attr_value);
544
- break;
545
- } else {
546
- attr_stack_cleanup(&attrs);
547
- set_error(&pi->err, "invalid format, no attribute value", pi->str, pi->s);
548
- return 0;
549
- }
550
- }
551
- *end = '\0'; /* terminate name */
552
- /* read value */
553
- next_non_white(pi);
554
- if (0 == (attr_value = read_quoted_value(pi))) {
555
- return 0;
556
- }
557
- if (pi->options->convert_special && 0 != strchr(attr_value, '&')) {
558
- if (0 != collapse_special(pi, (char*)attr_value) || err_has(&pi->err)) {
559
- attr_stack_cleanup(&attrs);
560
- return 0;
561
- }
562
- }
563
- attr_stack_push(&attrs, attr_name, attr_value);
564
- break;
565
- }
566
- if ('\0' == pi->last) {
567
- c = '\0';
568
- } else {
569
- c = pi->last;
570
- pi->last = '\0';
571
- }
497
+ break;
498
+ default:
499
+ /* Attribute name so it's an element and the attribute will be */
500
+ /* added to it. */
501
+ if (0 == (attr_name = read_name_token(pi))) {
502
+ attr_stack_cleanup(&attrs);
503
+ return 0;
504
+ }
505
+ end = pi->s;
506
+ next_non_white(pi);
507
+ if ('=' != *pi->s++) {
508
+ if (TolerantEffort == pi->options->effort) {
509
+ pi->s--;
510
+ pi->last = *pi->s;
511
+ *end = '\0'; /* terminate name */
512
+ attr_value = "";
513
+ attr_stack_push(&attrs, attr_name, attr_value);
514
+ break;
515
+ } else {
516
+ attr_stack_cleanup(&attrs);
517
+ set_error(&pi->err, "invalid format, no attribute value", pi->str, pi->s);
518
+ return 0;
519
+ }
520
+ }
521
+ *end = '\0'; /* terminate name */
522
+ /* read value */
523
+ next_non_white(pi);
524
+ if (0 == (attr_value = read_quoted_value(pi))) {
525
+ return 0;
526
+ }
527
+ if (pi->options->convert_special && 0 != strchr(attr_value, '&')) {
528
+ if (0 != collapse_special(pi, (char *)attr_value) || err_has(&pi->err)) {
529
+ attr_stack_cleanup(&attrs);
530
+ return 0;
531
+ }
532
+ }
533
+ attr_stack_push(&attrs, attr_name, attr_value);
534
+ break;
535
+ }
536
+ if ('\0' == pi->last) {
537
+ c = '\0';
538
+ } else {
539
+ c = pi->last;
540
+ pi->last = '\0';
541
+ }
572
542
  }
573
543
  if (hasChildren) {
574
- char *start;
575
- int first = 1;
576
-
577
- done = 0;
578
- /* read children */
579
- while (!done) {
580
- start = pi->s;
581
- next_non_white(pi);
582
- if (OffSkip == pi->options->skip && start < pi->s && '<' == *pi->s) {
583
- c = *pi->s;
584
- *pi->s = '\0';
585
- pi->pcb->add_text(pi, start, 1);
586
- *pi->s = c;
587
- }
588
- c = *pi->s++;
589
- if ('\0' == c) {
590
- attr_stack_cleanup(&attrs);
591
- set_error(&pi->err, "invalid format, document not terminated", pi->str, pi->s);
592
- return 0;
593
- }
594
- if ('<' == c) {
595
- char *slash;
596
-
597
- switch (*pi->s) {
598
- case '!': /* better be a comment or CDATA */
599
- pi->s++;
600
- if ('-' == *pi->s && '-' == *(pi->s + 1)) {
601
- pi->s += 2;
602
- read_comment(pi);
603
- } else if ((TolerantEffort == pi->options->effort) ?
604
- 0 == strncasecmp("[CDATA[", pi->s, 7) :
605
- 0 == strncmp("[CDATA[", pi->s, 7)) {
606
- pi->s += 7;
607
- read_cdata(pi);
608
- } else {
609
- attr_stack_cleanup(&attrs);
610
- set_error(&pi->err, "invalid format, invalid comment or CDATA format", pi->str, pi->s);
611
- return 0;
612
- }
613
- break;
614
- case '?': /* processing instruction */
615
- pi->s++;
616
- read_instruction(pi);
617
- break;
618
- case '/':
619
- slash = pi->s;
620
- pi->s++;
621
- if (0 == (name = read_name_token(pi))) {
622
- attr_stack_cleanup(&attrs);
623
- return 0;
624
- }
625
- end = pi->s;
626
- next_non_white(pi);
627
- c = *pi->s;
628
- *end = '\0';
629
- if (0 != ((TolerantEffort == pi->options->effort) ? strcasecmp(name, ename) : strcmp(name, ename))) {
630
- attr_stack_cleanup(&attrs);
631
- if (TolerantEffort == pi->options->effort) {
632
- pi->pcb->end_element(pi, ename);
633
- return name;
634
- } else {
635
- set_error(&pi->err, "invalid format, elements overlap", pi->str, pi->s);
636
- return 0;
637
- }
638
- }
639
- if ('>' != c) {
640
- attr_stack_cleanup(&attrs);
641
- set_error(&pi->err, "invalid format, element not closed", pi->str, pi->s);
642
- return 0;
643
- }
644
- if (first && start != slash - 1) {
645
- // Some white space between start and here so add as
646
- // text after checking skip.
647
- *(slash - 1) = '\0';
648
- switch (pi->options->skip) {
649
- case CrSkip: {
650
- char *s = start;
651
- char *e = start;
652
-
653
- for (; '\0' != *e; e++) {
654
- if ('\r' != *e) {
655
- *s++ = *e;
656
- }
657
- }
658
- *s = '\0';
659
- break;
660
- }
661
- case SpcSkip:
662
- *start = '\0';
663
- break;
664
- case NoSkip:
665
- case OffSkip:
666
- default:
667
- break;
668
- }
669
- if ('\0' != *start) {
670
- pi->pcb->add_text(pi, start, 1);
671
- }
672
- }
673
- pi->s++;
674
- pi->pcb->end_element(pi, ename);
675
- attr_stack_cleanup(&attrs);
676
- return 0;
677
- case '\0':
678
- attr_stack_cleanup(&attrs);
679
- if (TolerantEffort == pi->options->effort) {
680
- return 0;
681
- } else {
682
- set_error(&pi->err, "invalid format, document not terminated", pi->str, pi->s);
683
- return 0;
684
- }
685
- default:
686
- first = 0;
687
- /* a child element */
688
- // Child closed with mismatched name.
689
- if (0 != (name = read_element(pi))) {
690
- attr_stack_cleanup(&attrs);
691
-
692
- if (0 == ((TolerantEffort == pi->options->effort) ? strcasecmp(name, ename) : strcmp(name, ename))) {
693
- pi->s++;
694
- pi->pcb->end_element(pi, ename);
695
- return 0;
696
- } else { // not the correct element yet
697
- pi->pcb->end_element(pi, ename);
698
- return name;
699
- }
700
- } else if (err_has(&pi->err)) {
701
- return 0;
702
- }
703
- break;
704
- }
705
- } else { /* read as TEXT */
706
- pi->s = start;
707
- /*pi->s--; */
708
- read_text(pi);
709
- /*read_reduced_text(pi); */
710
-
711
- /* to exit read_text with no errors the next character must be < */
712
- if ('/' == *(pi->s + 1) &&
713
- 0 == ((TolerantEffort == pi->options->effort) ? strncasecmp(ename, pi->s + 2, elen) : strncmp(ename, pi->s + 2, elen)) &&
714
- '>' == *(pi->s + elen + 2)) {
715
-
716
- /* close tag after text so treat as a value */
717
- pi->s += elen + 3;
718
- pi->pcb->end_element(pi, ename);
719
- attr_stack_cleanup(&attrs);
720
- return 0;
721
- }
722
- }
723
- }
544
+ char *start;
545
+ int first = 1;
546
+
547
+ done = 0;
548
+ /* read children */
549
+ while (!done) {
550
+ start = pi->s;
551
+ next_non_white(pi);
552
+ if (OffSkip == pi->options->skip && start < pi->s && '<' == *pi->s) {
553
+ c = *pi->s;
554
+ *pi->s = '\0';
555
+ pi->pcb->add_text(pi, start, 1);
556
+ *pi->s = c;
557
+ }
558
+ c = *pi->s++;
559
+ if ('\0' == c) {
560
+ attr_stack_cleanup(&attrs);
561
+ set_error(&pi->err, "invalid format, document not terminated", pi->str, pi->s);
562
+ return 0;
563
+ }
564
+ if ('<' == c) {
565
+ char *slash;
566
+
567
+ switch (*pi->s) {
568
+ case '!': /* better be a comment or CDATA */
569
+ pi->s++;
570
+ if ('-' == *pi->s && '-' == *(pi->s + 1)) {
571
+ pi->s += 2;
572
+ read_comment(pi);
573
+ } else if ((TolerantEffort == pi->options->effort) ? 0 == strncasecmp("[CDATA[", pi->s, 7)
574
+ : 0 == strncmp("[CDATA[", pi->s, 7)) {
575
+ pi->s += 7;
576
+ read_cdata(pi);
577
+ } else {
578
+ attr_stack_cleanup(&attrs);
579
+ set_error(&pi->err, "invalid format, invalid comment or CDATA format", pi->str, pi->s);
580
+ return 0;
581
+ }
582
+ break;
583
+ case '?': /* processing instruction */
584
+ pi->s++;
585
+ read_instruction(pi);
586
+ break;
587
+ case '/':
588
+ slash = pi->s;
589
+ pi->s++;
590
+ if (0 == (name = read_name_token(pi))) {
591
+ attr_stack_cleanup(&attrs);
592
+ return 0;
593
+ }
594
+ end = pi->s;
595
+ next_non_white(pi);
596
+ c = *pi->s;
597
+ *end = '\0';
598
+ if (0 !=
599
+ ((TolerantEffort == pi->options->effort) ? strcasecmp(name, ename) : strcmp(name, ename))) {
600
+ attr_stack_cleanup(&attrs);
601
+ if (TolerantEffort == pi->options->effort) {
602
+ pi->pcb->end_element(pi, ename);
603
+ return name;
604
+ } else {
605
+ set_error(&pi->err, "invalid format, elements overlap", pi->str, pi->s);
606
+ return 0;
607
+ }
608
+ }
609
+ if ('>' != c) {
610
+ attr_stack_cleanup(&attrs);
611
+ set_error(&pi->err, "invalid format, element not closed", pi->str, pi->s);
612
+ return 0;
613
+ }
614
+ if (first && start != slash - 1) {
615
+ // Some white space between start and here so add as
616
+ // text after checking skip.
617
+ *(slash - 1) = '\0';
618
+ switch (pi->options->skip) {
619
+ case CrSkip: {
620
+ char *s = start;
621
+ char *e = start;
622
+
623
+ for (; '\0' != *e; e++) {
624
+ if ('\r' != *e) {
625
+ *s++ = *e;
626
+ }
627
+ }
628
+ *s = '\0';
629
+ break;
630
+ }
631
+ case SpcSkip: *start = '\0'; break;
632
+ case NoSkip:
633
+ case OffSkip:
634
+ default: break;
635
+ }
636
+ if ('\0' != *start) {
637
+ pi->pcb->add_text(pi, start, 1);
638
+ }
639
+ }
640
+ pi->s++;
641
+ pi->pcb->end_element(pi, ename);
642
+ attr_stack_cleanup(&attrs);
643
+ return 0;
644
+ case '\0':
645
+ attr_stack_cleanup(&attrs);
646
+ if (TolerantEffort == pi->options->effort) {
647
+ return 0;
648
+ } else {
649
+ set_error(&pi->err, "invalid format, document not terminated", pi->str, pi->s);
650
+ return 0;
651
+ }
652
+ default:
653
+ first = 0;
654
+ /* a child element */
655
+ // Child closed with mismatched name.
656
+ if (0 != (name = read_element(pi))) {
657
+ attr_stack_cleanup(&attrs);
658
+
659
+ if (0 ==
660
+ ((TolerantEffort == pi->options->effort) ? strcasecmp(name, ename) : strcmp(name, ename))) {
661
+ pi->s++;
662
+ pi->pcb->end_element(pi, ename);
663
+ return 0;
664
+ } else { // not the correct element yet
665
+ pi->pcb->end_element(pi, ename);
666
+ return name;
667
+ }
668
+ } else if (err_has(&pi->err)) {
669
+ return 0;
670
+ }
671
+ break;
672
+ }
673
+ } else { /* read as TEXT */
674
+ pi->s = start;
675
+ /*pi->s--; */
676
+ read_text(pi);
677
+ /*read_reduced_text(pi); */
678
+
679
+ /* to exit read_text with no errors the next character must be < */
680
+ if ('/' == *(pi->s + 1) &&
681
+ 0 == ((TolerantEffort == pi->options->effort) ? strncasecmp(ename, pi->s + 2, elen)
682
+ : strncmp(ename, pi->s + 2, elen)) &&
683
+ '>' == *(pi->s + elen + 2)) {
684
+ /* close tag after text so treat as a value */
685
+ pi->s += elen + 3;
686
+ pi->pcb->end_element(pi, ename);
687
+ attr_stack_cleanup(&attrs);
688
+ return 0;
689
+ }
690
+ }
691
+ }
724
692
  }
725
693
  attr_stack_cleanup(&attrs);
726
694
  return 0;
727
695
  }
728
696
 
729
- static void
730
- read_text(PInfo pi) {
731
- char buf[MAX_TEXT_LEN];
732
- char *b = buf;
733
- char *alloc_buf = 0;
734
- char *end = b + sizeof(buf) - 2;
735
- char c;
736
- int done = 0;
697
+ static void read_text(PInfo pi) {
698
+ char buf[MAX_TEXT_LEN];
699
+ char *b = buf;
700
+ char *alloc_buf = 0;
701
+ char *end = b + sizeof(buf) - 2;
702
+ char c;
703
+ int done = 0;
737
704
 
738
705
  while (!done) {
739
- c = *pi->s++;
740
- switch(c) {
741
- case '<':
742
- done = 1;
743
- pi->s--;
744
- break;
745
- case '\0':
746
- set_error(&pi->err, "invalid format, document not terminated", pi->str, pi->s);
747
- return;
748
- default:
749
- if (end <= (b + (('&' == c) ? 7 : 0))) { /* extra 8 for special just in case it is sequence of bytes */
750
- unsigned long size;
751
-
752
- if (0 == alloc_buf) {
753
- size = sizeof(buf) * 2;
754
- alloc_buf = ALLOC_N(char, size);
755
- memcpy(alloc_buf, buf, b - buf);
756
- b = alloc_buf + (b - buf);
757
- } else {
758
- unsigned long pos = b - alloc_buf;
759
-
760
- size = (end - alloc_buf) * 2;
761
- REALLOC_N(alloc_buf, char, size);
762
- b = alloc_buf + pos;
763
- }
764
- end = alloc_buf + size - 2;
765
- }
766
- if ('&' == c) {
767
- if (0 == (b = read_coded_chars(pi, b))) {
768
- return;
769
- }
770
- } else {
771
- if (0 <= c && c <= 0x20) {
772
- if (StrictEffort == pi->options->effort && 'x' == xml_valid_lower_chars[(unsigned char)c]) {
773
- set_error(&pi->err, "invalid character", pi->str, pi->s);
774
- return;
775
- }
776
- switch (pi->options->skip) {
777
- case CrSkip:
778
- if (buf != b && '\n' == c && '\r' == *(b - 1)) {
779
- *(b - 1) = '\n';
780
- } else {
781
- *b++ = c;
782
- }
783
- break;
784
- case SpcSkip:
785
- if (is_white(c)) {
786
- if (buf == b || ' ' != *(b - 1)) {
787
- *b++ = ' ';
788
- }
789
- } else {
790
- *b++ = c;
791
- }
792
- break;
793
- case NoSkip:
794
- case OffSkip:
795
- default:
796
- *b++ = c;
797
- break;
798
- }
799
- } else {
800
- *b++ = c;
801
- }
802
- }
803
- break;
804
- }
706
+ c = *pi->s++;
707
+ switch (c) {
708
+ case '<':
709
+ done = 1;
710
+ pi->s--;
711
+ break;
712
+ case '\0': set_error(&pi->err, "invalid format, document not terminated", pi->str, pi->s); return;
713
+ default:
714
+ if (end <= (b + (('&' == c) ? 7 : 0))) { /* extra 8 for special just in case it is sequence of bytes */
715
+ unsigned long size;
716
+
717
+ if (0 == alloc_buf) {
718
+ size = sizeof(buf) * 2;
719
+ alloc_buf = ALLOC_N(char, size);
720
+ memcpy(alloc_buf, buf, b - buf);
721
+ b = alloc_buf + (b - buf);
722
+ } else {
723
+ unsigned long pos = b - alloc_buf;
724
+
725
+ size = (end - alloc_buf) * 2;
726
+ REALLOC_N(alloc_buf, char, size);
727
+ b = alloc_buf + pos;
728
+ }
729
+ end = alloc_buf + size - 2;
730
+ }
731
+ if ('&' == c) {
732
+ if (0 == (b = read_coded_chars(pi, b))) {
733
+ return;
734
+ }
735
+ } else {
736
+ if (0 <= c && c <= 0x20) {
737
+ if (StrictEffort == pi->options->effort && 'x' == xml_valid_lower_chars[(unsigned char)c]) {
738
+ set_error(&pi->err, "invalid character", pi->str, pi->s);
739
+ return;
740
+ }
741
+ switch (pi->options->skip) {
742
+ case CrSkip:
743
+ if (buf != b && '\n' == c && '\r' == *(b - 1)) {
744
+ *(b - 1) = '\n';
745
+ } else {
746
+ *b++ = c;
747
+ }
748
+ break;
749
+ case SpcSkip:
750
+ if (is_white(c)) {
751
+ if (buf == b || ' ' != *(b - 1)) {
752
+ *b++ = ' ';
753
+ }
754
+ } else {
755
+ *b++ = c;
756
+ }
757
+ break;
758
+ case NoSkip:
759
+ case OffSkip:
760
+ default: *b++ = c; break;
761
+ }
762
+ } else {
763
+ *b++ = c;
764
+ }
765
+ }
766
+ break;
767
+ }
805
768
  }
806
769
  *b = '\0';
807
770
  if (0 != alloc_buf) {
808
- fix_newlines(alloc_buf);
809
- pi->pcb->add_text(pi, alloc_buf, ('/' == *(pi->s + 1)));
810
- xfree(alloc_buf);
771
+ fix_newlines(alloc_buf);
772
+ pi->pcb->add_text(pi, alloc_buf, ('/' == *(pi->s + 1)));
773
+ xfree(alloc_buf);
811
774
  } else {
812
- fix_newlines(buf);
813
- pi->pcb->add_text(pi, buf, ('/' == *(pi->s + 1)));
775
+ fix_newlines(buf);
776
+ pi->pcb->add_text(pi, buf, ('/' == *(pi->s + 1)));
814
777
  }
815
778
  }
816
779
 
@@ -886,323 +849,322 @@ read_reduced_text(PInfo pi) {
886
849
  }
887
850
  #endif
888
851
 
889
- static char*
890
- read_name_token(PInfo pi) {
891
- char *start;
852
+ static char *read_name_token(PInfo pi) {
853
+ char *start;
892
854
 
893
855
  next_non_white(pi);
894
856
  start = pi->s;
895
857
  for (; 1; pi->s++) {
896
- switch (*pi->s) {
897
- case ' ':
898
- case '\t':
899
- case '\f':
900
- case '?':
901
- case '=':
902
- case '/':
903
- case '>':
904
- case '\n':
905
- case '\r':
906
- return start;
907
- case '\0':
908
- /* documents never terminate after a name token */
909
- set_error(&pi->err, "invalid format, document not terminated", pi->str, pi->s);
910
- return 0;
911
- break; /* to avoid warnings */
912
- case ':':
913
- if ('\0' == *pi->options->strip_ns) {
914
- break;
915
- } else if ('*' == *pi->options->strip_ns && '\0' == pi->options->strip_ns[1]) {
916
- start = pi->s + 1;
917
- } else if (0 == strncmp(pi->options->strip_ns, start, pi->s - start)) {
918
- start = pi->s + 1;
919
- }
920
- break;
921
- default:
922
- break;
923
- }
858
+ switch (*pi->s) {
859
+ case ' ':
860
+ case '\t':
861
+ case '\f':
862
+ case '?':
863
+ case '=':
864
+ case '/':
865
+ case '>':
866
+ case '\n':
867
+ case '\r': return start;
868
+ case '\0':
869
+ /* documents never terminate after a name token */
870
+ set_error(&pi->err, "invalid format, document not terminated", pi->str, pi->s);
871
+ return 0;
872
+ break; /* to avoid warnings */
873
+ case ':':
874
+ if ('\0' == *pi->options->strip_ns) {
875
+ break;
876
+ } else if ('*' == *pi->options->strip_ns && '\0' == pi->options->strip_ns[1]) {
877
+ start = pi->s + 1;
878
+ } else if (0 == strncmp(pi->options->strip_ns, start, pi->s - start)) {
879
+ start = pi->s + 1;
880
+ }
881
+ break;
882
+ default: break;
883
+ }
924
884
  }
925
885
  return start;
926
886
  }
927
887
 
928
- static void
929
- read_cdata(PInfo pi) {
930
- char *start;
931
- char *end;
888
+ static void read_cdata(PInfo pi) {
889
+ char *start;
890
+ char *end;
932
891
 
933
892
  start = pi->s;
934
- end = strstr(pi->s, "]]>");
893
+ end = strstr(pi->s, "]]>");
935
894
  if (end == 0) {
936
- set_error(&pi->err, "invalid format, CDATA not terminated", pi->str, pi->s);
937
- return;
895
+ set_error(&pi->err, "invalid format, CDATA not terminated", pi->str, pi->s);
896
+ return;
938
897
  }
939
- *end = '\0';
898
+ *end = '\0';
940
899
  pi->s = end + 3;
941
900
  if (0 != pi->pcb->add_cdata) {
942
- fix_newlines(start);
943
- pi->pcb->add_cdata(pi, start, end - start);
901
+ fix_newlines(start);
902
+ pi->pcb->add_cdata(pi, start, end - start);
944
903
  }
945
904
  }
946
905
 
947
906
  /* Assume the value starts immediately and goes until the quote character is
948
907
  * reached again. Do not read the character after the terminating quote.
949
908
  */
950
- static char*
951
- read_quoted_value(PInfo pi) {
952
- char *value = 0;
909
+ static char *read_quoted_value(PInfo pi) {
910
+ char *value = 0;
953
911
 
954
912
  if ('"' == *pi->s || '\'' == *pi->s) {
955
- char term = *pi->s;
913
+ char term = *pi->s;
956
914
 
957
- pi->s++; /* skip quote character */
915
+ pi->s++; /* skip quote character */
958
916
  value = pi->s;
959
917
  for (; *pi->s != term; pi->s++) {
960
918
  if ('\0' == *pi->s) {
961
919
  set_error(&pi->err, "invalid format, document not terminated", pi->str, pi->s);
962
- return 0;
920
+ return 0;
963
921
  }
964
922
  }
965
- *pi->s = '\0'; /* terminate value */
966
- pi->s++; /* move past quote */
923
+ *pi->s = '\0'; /* terminate value */
924
+ pi->s++; /* move past quote */
967
925
  } else if (StrictEffort == pi->options->effort) {
968
- set_error(&pi->err, "invalid format, expected a quote character", pi->str, pi->s);
969
- return 0;
926
+ set_error(&pi->err, "invalid format, expected a quote character", pi->str, pi->s);
927
+ return 0;
970
928
  } else if (TolerantEffort == pi->options->effort) {
971
929
  value = pi->s;
972
930
  for (; 1; pi->s++) {
973
- switch (*pi->s) {
974
- case '\0':
975
- set_error(&pi->err, "invalid format, document not terminated", pi->str, pi->s);
976
- return 0;
977
- case ' ':
978
- case '/':
979
- case '>':
980
- case '?': // for instructions
981
- case '\t':
982
- case '\n':
983
- case '\r':
984
- pi->last = *pi->s;
985
- *pi->s = '\0'; /* terminate value */
986
- pi->s++;
987
- return value;
988
- default:
989
- break;
931
+ switch (*pi->s) {
932
+ case '\0': set_error(&pi->err, "invalid format, document not terminated", pi->str, pi->s); return 0;
933
+ case ' ':
934
+ case '/':
935
+ case '>':
936
+ case '?': // for instructions
937
+ case '\t':
938
+ case '\n':
939
+ case '\r':
940
+ pi->last = *pi->s;
941
+ *pi->s = '\0'; /* terminate value */
942
+ pi->s++;
943
+ return value;
944
+ default: break;
990
945
  }
991
946
  }
992
947
  } else {
993
948
  value = pi->s;
994
949
  next_white(pi);
995
- if ('\0' == *pi->s) {
996
- set_error(&pi->err, "invalid format, document not terminated", pi->str, pi->s);
997
- return 0;
950
+ if ('\0' == *pi->s) {
951
+ set_error(&pi->err, "invalid format, document not terminated", pi->str, pi->s);
952
+ return 0;
998
953
  }
999
954
  *pi->s++ = '\0'; /* terminate value */
1000
955
  }
1001
956
  return value;
1002
957
  }
1003
958
 
1004
/* Parses the hexadecimal digits at b, which must be terminated by ';'.
 *
 * b  - first digit (the character after the 'x' of "&#x...;")
 * up - receives the parsed value on success and is untouched on failure
 *
 * Returns a pointer to the terminating ';', or 0 when there are no digits, a
 * character other than a hex digit appears before the ';' (including the end
 * of the string), or the value does not fit in 64 bits.
 */
static char *read_hex_uint64(char *b, uint64_t *up) {
    uint64_t u         = 0;
    int      has_digit = 0;

    for (; ';' != *b; b++) {
        const char c = *b;
        uint64_t   nibble;

        if ('0' <= c && c <= '9') {
            nibble = (uint64_t)(c - '0');
        } else if ('a' <= c && c <= 'f') {
            nibble = (uint64_t)(c - 'a' + 10);
        } else if ('A' <= c && c <= 'F') {
            nibble = (uint64_t)(c - 'A' + 10);
        } else {
            return 0;
        }
        if (0 != (u >> 60)) {
            return 0; /* another shift would drop the top nibble */
        }
        u         = (u << 4) | nibble;
        has_digit = 1;
    }
    if (!has_digit) {
        return 0; /* "&#x;" is not a character reference */
    }
    *up = u;

    return b;
}
1025
979
 
1026
/* Parses the decimal digits at b, which must be terminated by ';'.
 *
 * b  - first digit (the character after the '#' of "&#...;")
 * up - receives the parsed value on success and is untouched on failure
 *
 * Returns a pointer to the terminating ';', or 0 when there are no digits, a
 * character other than a decimal digit appears before the ';' (including the
 * end of the string), or the value does not fit in 64 bits.
 */
static char *read_10_uint64(char *b, uint64_t *up) {
    const uint64_t max       = ~(uint64_t)0;
    uint64_t       u         = 0;
    int            has_digit = 0;

    for (; ';' != *b; b++) {
        const char c = *b;
        uint64_t   digit;

        if (c < '0' || '9' < c) {
            return 0;
        }
        digit = (uint64_t)(c - '0');
        if (u > (max - digit) / 10) {
            return 0; /* u * 10 + digit would wrap */
        }
        u         = (u * 10) + digit;
        has_digit = 1;
    }
    if (!has_digit) {
        return 0; /* "&#;" is not a character reference */
    }
    *up = u;

    return b;
}
1043
996
 
1044
- static char*
1045
- read_coded_chars(PInfo pi, char *text) {
1046
- char *b, buf[32];
1047
- char *end = buf + sizeof(buf) - 1;
1048
- char *s;
1049
- long blen = 0;
997
+ static char *read_coded_chars(PInfo pi, char *text) {
998
+ char *b, buf[32];
999
+ char *end = buf + sizeof(buf) - 1;
1000
+ char *s;
1001
+ long blen = 0;
1050
1002
 
1051
1003
  for (b = buf, s = pi->s; b < end; b++, s++) {
1052
- *b = *s;
1053
- if (';' == *s) {
1054
- *(b + 1) = '\0';
1055
- blen = b - buf;
1056
- s++;
1057
- break;
1058
- }
1004
+ *b = *s;
1005
+ if (';' == *s) {
1006
+ *(b + 1) = '\0';
1007
+ blen = b - buf;
1008
+ s++;
1009
+ break;
1010
+ }
1059
1011
  }
1060
1012
  if (b > end) {
1061
- *text++ = '&';
1013
+ *text++ = '&';
1062
1014
  } else if ('#' == *buf) {
1063
- uint64_t u = 0;
1015
+ uint64_t u = 0;
1064
1016
 
1065
- b = buf + 1;
1066
- if ('x' == *b || 'X' == *b) {
1067
- b = read_hex_uint64(b + 1, &u);
1068
- } else {
1069
- b = read_10_uint64(b, &u);
1070
- }
1071
- if (0 == b) {
1072
- *text++ = '&';
1073
- } else {
1074
- if (u <= 0x000000000000007FULL) {
1075
- *text++ = (char)u;
1076
- } else if (ox_utf8_encoding == pi->options->rb_enc) {
1077
- text = ox_ucs_to_utf8_chars(text, u);
1078
- } else if (0 == pi->options->rb_enc) {
1079
- pi->options->rb_enc = ox_utf8_encoding;
1080
- text = ox_ucs_to_utf8_chars(text, u);
1081
- } else if (TolerantEffort == pi->options->effort) {
1082
- *text++ = '&';
1083
- return text;
1084
- } else if (u <= 0x00000000000000FFULL) {
1085
- *text++ = (char)u;
1086
- } else {
1087
- /*set_error(&pi->err, "Invalid encoding, need UTF-8 or UTF-16 encoding to parse &#nnnn; character sequences.", pi->str, pi->s); */
1088
- set_error(&pi->err, "Invalid encoding, need UTF-8 encoding to parse &#nnnn; character sequences.", pi->str, pi->s);
1089
- return NULL;
1090
- }
1091
- pi->s = s;
1092
- }
1017
+ b = buf + 1;
1018
+ if ('x' == *b || 'X' == *b) {
1019
+ b = read_hex_uint64(b + 1, &u);
1020
+ } else {
1021
+ b = read_10_uint64(b, &u);
1022
+ }
1023
+ if (0 == b) {
1024
+ *text++ = '&';
1025
+ } else {
1026
+ if (u <= 0x000000000000007FULL) {
1027
+ *text++ = (char)u;
1028
+ } else if (ox_utf8_encoding == pi->options->rb_enc) {
1029
+ text = ox_ucs_to_utf8_chars(text, u);
1030
+ } else if (0 == pi->options->rb_enc) {
1031
+ pi->options->rb_enc = ox_utf8_encoding;
1032
+ text = ox_ucs_to_utf8_chars(text, u);
1033
+ } else if (TolerantEffort == pi->options->effort) {
1034
+ *text++ = '&';
1035
+ return text;
1036
+ } else if (u <= 0x00000000000000FFULL) {
1037
+ *text++ = (char)u;
1038
+ } else {
1039
+ /*set_error(&pi->err, "Invalid encoding, need UTF-8 or UTF-16 encoding to parse &#nnnn; character
1040
+ * sequences.", pi->str, pi->s); */
1041
+ set_error(&pi->err,
1042
+ "Invalid encoding, need UTF-8 encoding to parse &#nnnn; character sequences.",
1043
+ pi->str,
1044
+ pi->s);
1045
+ return NULL;
1046
+ }
1047
+ pi->s = s;
1048
+ }
1093
1049
  } else {
1094
- char *t2;
1095
-
1096
- buf[blen] = '\0';
1097
- if (NULL == (t2 = ox_entity_lookup(text, buf))) {
1098
- *text++ = '&';
1099
- } else {
1100
- text = t2;
1101
- pi->s = s;
1102
- }
1050
+ char *t2;
1051
+
1052
+ buf[blen] = '\0';
1053
+ if (NULL == (t2 = ox_entity_lookup(text, buf))) {
1054
+ *text++ = '&';
1055
+ } else {
1056
+ text = t2;
1057
+ pi->s = s;
1058
+ }
1103
1059
  }
1104
1060
  return text;
1105
1061
  }
1106
1062
 
1107
- static int
1108
- collapse_special(PInfo pi, char *str) {
1109
- char *s = str;
1110
- char *b = str;
1063
+ static int collapse_special(PInfo pi, char *str) {
1064
+ char *s = str;
1065
+ char *b = str;
1111
1066
 
1112
1067
  while ('\0' != *s) {
1113
- if ('&' == *s) {
1114
- int c;
1115
- char *end;
1116
-
1117
- s++;
1118
- if ('#' == *s) {
1119
- uint64_t u = 0;
1120
- char x;
1121
-
1122
- s++;
1123
- if ('x' == *s || 'X' == *s) {
1124
- x = *s;
1125
- s++;
1126
- end = read_hex_uint64(s, &u);
1127
- } else {
1128
- x = '\0';
1129
- end = read_10_uint64(s, &u);
1130
- }
1131
- if (0 == end) {
1132
- if (TolerantEffort == pi->options->effort) {
1133
- *b++ = '&';
1134
- *b++ = '#';
1135
- if ('\0' != x) {
1136
- *b++ = x;
1137
- }
1138
- continue;
1139
- }
1140
- return EDOM;
1141
- }
1142
- if (u <= 0x000000000000007FULL) {
1143
- *b++ = (char)u;
1144
- } else if (ox_utf8_encoding == pi->options->rb_enc) {
1145
- b = ox_ucs_to_utf8_chars(b, u);
1146
- /* TBD support UTF-16 */
1147
- } else if (0 == pi->options->rb_enc) {
1148
- pi->options->rb_enc = ox_utf8_encoding;
1149
- b = ox_ucs_to_utf8_chars(b, u);
1150
- } else {
1151
- /* set_error(&pi->err, "Invalid encoding, need UTF-8 or UTF-16 encoding to parse &#nnnn; character sequences.", pi->str, pi->s);*/
1152
- set_error(&pi->err, "Invalid encoding, need UTF-8 encoding to parse &#nnnn; character sequences.", pi->str, pi->s);
1153
- return 0;
1154
- }
1155
- s = end + 1;
1156
- } else {
1157
- if (0 == strncasecmp(s, "lt;", 3)) {
1158
- c = '<';
1159
- s += 3;
1160
- } else if (0 == strncasecmp(s, "gt;", 3)) {
1161
- c = '>';
1162
- s += 3;
1163
- } else if (0 == strncasecmp(s, "amp;", 4)) {
1164
- c = '&';
1165
- s += 4;
1166
- } else if (0 == strncasecmp(s, "quot;", 5)) {
1167
- c = '"';
1168
- s += 5;
1169
- } else if (0 == strncasecmp(s, "apos;", 5)) {
1170
- c = '\'';
1171
- s += 5;
1172
- } else if (TolerantEffort == pi->options->effort) {
1173
- *b++ = '&';
1174
- continue;
1175
- } else {
1176
- char key[16];
1177
- char *k = key;
1178
- char *kend = key + sizeof(key) - 1;
1179
-
1180
- *k++ = *s;
1181
- while (';' != *s++) {
1182
- if ('\0' == *s) {
1183
- set_error(&pi->err, "Invalid format, special character does not end with a semicolon", pi->str, pi->s);
1184
- return EDOM;
1185
- }
1186
- if (kend <= k) {
1187
- k = key;
1188
- break;
1189
- }
1190
- *k++ = *s;
1191
- }
1192
- k--;
1193
- *k = '\0';
1194
- if ('\0' == *key || NULL == (b = ox_entity_lookup(b, key))) {
1195
- set_error(&pi->err, "Invalid format, invalid special character sequence", pi->str, pi->s);
1196
- c = '?';
1197
- return 0;
1198
- }
1199
- continue;
1200
- }
1201
- *b++ = (char)c;
1202
- }
1203
- } else {
1204
- *b++ = *s++;
1205
- }
1068
+ if ('&' == *s) {
1069
+ int c;
1070
+ char *end;
1071
+
1072
+ s++;
1073
+ if ('#' == *s) {
1074
+ uint64_t u = 0;
1075
+ char x;
1076
+
1077
+ s++;
1078
+ if ('x' == *s || 'X' == *s) {
1079
+ x = *s;
1080
+ s++;
1081
+ end = read_hex_uint64(s, &u);
1082
+ } else {
1083
+ x = '\0';
1084
+ end = read_10_uint64(s, &u);
1085
+ }
1086
+ if (0 == end) {
1087
+ if (TolerantEffort == pi->options->effort) {
1088
+ *b++ = '&';
1089
+ *b++ = '#';
1090
+ if ('\0' != x) {
1091
+ *b++ = x;
1092
+ }
1093
+ continue;
1094
+ }
1095
+ return EDOM;
1096
+ }
1097
+ if (u <= 0x000000000000007FULL) {
1098
+ *b++ = (char)u;
1099
+ } else if (ox_utf8_encoding == pi->options->rb_enc) {
1100
+ b = ox_ucs_to_utf8_chars(b, u);
1101
+ /* TBD support UTF-16 */
1102
+ } else if (0 == pi->options->rb_enc) {
1103
+ pi->options->rb_enc = ox_utf8_encoding;
1104
+ b = ox_ucs_to_utf8_chars(b, u);
1105
+ } else {
1106
+ /* set_error(&pi->err, "Invalid encoding, need UTF-8 or UTF-16 encoding to parse &#nnnn; character
1107
+ * sequences.", pi->str, pi->s);*/
1108
+ set_error(&pi->err,
1109
+ "Invalid encoding, need UTF-8 encoding to parse &#nnnn; character sequences.",
1110
+ pi->str,
1111
+ pi->s);
1112
+ return 0;
1113
+ }
1114
+ s = end + 1;
1115
+ } else {
1116
+ if (0 == strncasecmp(s, "lt;", 3)) {
1117
+ c = '<';
1118
+ s += 3;
1119
+ } else if (0 == strncasecmp(s, "gt;", 3)) {
1120
+ c = '>';
1121
+ s += 3;
1122
+ } else if (0 == strncasecmp(s, "amp;", 4)) {
1123
+ c = '&';
1124
+ s += 4;
1125
+ } else if (0 == strncasecmp(s, "quot;", 5)) {
1126
+ c = '"';
1127
+ s += 5;
1128
+ } else if (0 == strncasecmp(s, "apos;", 5)) {
1129
+ c = '\'';
1130
+ s += 5;
1131
+ } else if (TolerantEffort == pi->options->effort) {
1132
+ *b++ = '&';
1133
+ continue;
1134
+ } else {
1135
+ char key[16];
1136
+ char *k = key;
1137
+ char *kend = key + sizeof(key) - 1;
1138
+
1139
+ *k++ = *s;
1140
+ while (';' != *s++) {
1141
+ if ('\0' == *s) {
1142
+ set_error(&pi->err,
1143
+ "Invalid format, special character does not end with a semicolon",
1144
+ pi->str,
1145
+ pi->s);
1146
+ return EDOM;
1147
+ }
1148
+ if (kend <= k) {
1149
+ k = key;
1150
+ break;
1151
+ }
1152
+ *k++ = *s;
1153
+ }
1154
+ k--;
1155
+ *k = '\0';
1156
+ if ('\0' == *key || NULL == (b = ox_entity_lookup(b, key))) {
1157
+ set_error(&pi->err, "Invalid format, invalid special character sequence", pi->str, pi->s);
1158
+ c = '?';
1159
+ return 0;
1160
+ }
1161
+ continue;
1162
+ }
1163
+ *b++ = (char)c;
1164
+ }
1165
+ } else {
1166
+ *b++ = *s++;
1167
+ }
1206
1168
  }
1207
1169
  *b = '\0';
1208
1170