ox 2.14.14 → 2.14.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/ext/ox/parse.c CHANGED
@@ -3,35 +3,35 @@
3
3
  * All rights reserved.
4
4
  */
5
5
 
6
- #include <stdlib.h>
7
6
  #include <errno.h>
8
7
  #include <stdbool.h>
9
8
  #include <stdio.h>
9
+ #include <stdlib.h>
10
10
  #include <string.h>
11
11
  #include <strings.h>
12
12
 
13
- #include "ruby.h"
14
- #include "ox.h"
15
- #include "err.h"
16
13
  #include "attr.h"
17
- #include "intern.h"
14
+ #include "err.h"
18
15
  #include "helper.h"
16
+ #include "intern.h"
17
+ #include "ox.h"
18
+ #include "ruby.h"
19
19
  #include "special.h"
20
20
 
21
- static void read_instruction(PInfo pi);
22
- static void read_doctype(PInfo pi);
23
- static void read_comment(PInfo pi);
24
- static char* read_element(PInfo pi);
25
- static void read_text(PInfo pi);
21
+ static void read_instruction(PInfo pi);
22
+ static void read_doctype(PInfo pi);
23
+ static void read_comment(PInfo pi);
24
+ static char *read_element(PInfo pi);
25
+ static void read_text(PInfo pi);
26
26
  /*static void read_reduced_text(PInfo pi); */
27
- static void read_cdata(PInfo pi);
28
- static char* read_name_token(PInfo pi);
29
- static char* read_quoted_value(PInfo pi);
30
- static char* read_hex_uint64(char *b, uint64_t *up);
31
- static char* read_10_uint64(char *b, uint64_t *up);
32
- static char* read_coded_chars(PInfo pi, char *text);
33
- static void next_non_white(PInfo pi);
34
- static int collapse_special(PInfo pi, char *str);
27
+ static void read_cdata(PInfo pi);
28
+ static char *read_name_token(PInfo pi);
29
+ static char *read_quoted_value(PInfo pi);
30
+ static char *read_hex_uint64(char *b, uint64_t *up);
31
+ static char *read_10_uint64(char *b, uint64_t *up);
32
+ static char *read_coded_chars(PInfo pi, char *text);
33
+ static void next_non_white(PInfo pi);
34
+ static int collapse_special(PInfo pi, char *str);
35
35
 
36
36
  /* This XML parser is a single pass, destructive, callback parser. It is a
37
37
  * single pass parser since it only makes one pass over the characters in the
@@ -46,53 +46,43 @@ static int collapse_special(PInfo pi, char *str);
46
46
  * all cases to parse the string.
47
47
  */
48
48
 
49
- static char xml_valid_lower_chars[34] = "xxxxxxxxxooxxoxxxxxxxxxxxxxxxxxxo";
49
+ static char xml_valid_lower_chars[34] = "xxxxxxxxxooxxoxxxxxxxxxxxxxxxxxxo";
50
50
 
51
- inline static int
52
- is_white(char c) {
51
+ inline static int is_white(char c) {
53
52
  switch (c) {
54
53
  case ' ':
55
54
  case '\t':
56
55
  case '\f':
57
56
  case '\n':
58
- case '\r':
59
- return 1;
60
- default:
61
- return 0;
57
+ case '\r': return 1;
58
+ default: return 0;
62
59
  }
63
60
  }
64
61
 
65
-
66
- inline static void
67
- next_non_white(PInfo pi) {
62
+ inline static void next_non_white(PInfo pi) {
68
63
  for (; 1; pi->s++) {
69
- switch (*pi->s) {
70
- case ' ':
71
- case '\t':
72
- case '\f':
73
- case '\n':
74
- case '\r':
75
- break;
76
- default:
77
- return;
78
- }
64
+ switch (*pi->s) {
65
+ case ' ':
66
+ case '\t':
67
+ case '\f':
68
+ case '\n':
69
+ case '\r': break;
70
+ default: return;
71
+ }
79
72
  }
80
73
  }
81
74
 
82
- inline static void
83
- next_white(PInfo pi) {
75
+ inline static void next_white(PInfo pi) {
84
76
  for (; 1; pi->s++) {
85
- switch (*pi->s) {
86
- case ' ':
87
- case '\t':
88
- case '\f':
89
- case '\n':
90
- case '\r':
91
- case '\0':
92
- return;
93
- default:
94
- break;
95
- }
77
+ switch (*pi->s) {
78
+ case ' ':
79
+ case '\t':
80
+ case '\f':
81
+ case '\n':
82
+ case '\r':
83
+ case '\0': return;
84
+ default: break;
85
+ }
96
86
  }
97
87
  }
98
88
 
@@ -100,53 +90,52 @@ static void fix_newlines(char *buf) {
100
90
  #if HAVE_INDEX
101
91
  if (NULL != index(buf, '\r')) {
102
92
  #endif
103
- char *s = buf;
104
- char *d = buf;
105
-
106
- for (; '\0' != *s; s++) {
107
- if ('\r' == *s) {
108
- if ('\n' == *(s + 1)) {
109
- continue;
110
- }
111
- *d = '\n';
112
- } else if (d < s) {
113
- *d = *s;
114
- }
115
- d++;
116
- }
117
- *d = '\0';
93
+ char *s = buf;
94
+ char *d = buf;
95
+
96
+ for (; '\0' != *s; s++) {
97
+ if ('\r' == *s) {
98
+ if ('\n' == *(s + 1)) {
99
+ continue;
100
+ }
101
+ *d = '\n';
102
+ } else if (d < s) {
103
+ *d = *s;
104
+ }
105
+ d++;
106
+ }
107
+ *d = '\0';
118
108
  #if HAVE_INDEX
119
109
  }
120
110
  #endif
121
111
  }
122
112
 
123
- static void
124
- mark_pi_cb(void *ptr) {
113
+ static void mark_pi_cb(void *ptr) {
125
114
  if (NULL != ptr) {
126
- HelperStack stack = &((PInfo)ptr)->helpers;
127
- Helper h;
115
+ HelperStack stack = &((PInfo)ptr)->helpers;
116
+ Helper h;
128
117
 
129
- for (h = stack->head; h < stack->tail; h++) {
130
- if (NoCode != h->type) {
131
- rb_gc_mark(h->obj);
132
- }
133
- }
118
+ for (h = stack->head; h < stack->tail; h++) {
119
+ if (NoCode != h->type) {
120
+ rb_gc_mark(h->obj);
121
+ }
122
+ }
134
123
  }
135
124
  }
136
125
 
137
126
  VALUE
138
127
  ox_parse(char *xml, size_t len, ParseCallbacks pcb, char **endp, Options options, Err err) {
139
- struct _pInfo pi;
140
- int body_read = 0;
141
- int block_given = rb_block_given_p();
142
- volatile VALUE wrap;
128
+ struct _pInfo pi;
129
+ int body_read = 0;
130
+ int block_given = rb_block_given_p();
131
+ volatile VALUE wrap;
143
132
 
144
133
  if (0 == xml) {
145
- set_error(err, "Invalid arg, xml string can not be null", xml, 0);
146
- return Qnil;
134
+ set_error(err, "Invalid arg, xml string can not be null", xml, 0);
135
+ return Qnil;
147
136
  }
148
137
  if (DEBUG <= options->trace) {
149
- printf("Parsing xml:\n%s\n", xml);
138
+ printf("Parsing xml:\n%s\n", xml);
150
139
  }
151
140
  // initialize parse info
152
141
  helper_stack_init(&pi.helpers);
@@ -154,663 +143,637 @@ ox_parse(char *xml, size_t len, ParseCallbacks pcb, char **endp, Options options
154
143
  wrap = Data_Wrap_Struct(rb_cObject, mark_pi_cb, NULL, &pi);
155
144
 
156
145
  err_init(&pi.err);
157
- pi.str = xml;
158
- pi.end = pi.str + len;
159
- pi.s = xml;
160
- pi.pcb = pcb;
161
- pi.obj = Qnil;
146
+ pi.str = xml;
147
+ pi.end = pi.str + len;
148
+ pi.s = xml;
149
+ pi.pcb = pcb;
150
+ pi.obj = Qnil;
162
151
  pi.circ_array = 0;
163
- pi.options = options;
164
- pi.marked = NULL;
165
- pi.mark_size = 0;
166
- pi.mark_cnt = 0;
152
+ pi.options = options;
153
+ pi.marked = NULL;
154
+ pi.mark_size = 0;
155
+ pi.mark_cnt = 0;
167
156
  while (1) {
168
- next_non_white(&pi); // skip white space
169
- if ('\0' == *pi.s) {
170
- break;
171
- }
172
- if (body_read && 0 != endp) {
173
- *endp = pi.s;
174
- break;
175
- }
176
- if ('<' != *pi.s) { // all top level entities start with <
177
- set_error(err, "invalid format, expected <", pi.str, pi.s);
178
- helper_stack_cleanup(&pi.helpers);
179
- return Qnil;
180
- }
181
- pi.s++; // past <
182
- switch (*pi.s) {
183
- case '?': // processing instruction
184
- pi.s++;
185
- read_instruction(&pi);
186
- break;
187
- case '!': // comment or doctype
188
- pi.s++;
189
- if ('\0' == *pi.s) {
190
- set_error(err, "invalid format, DOCTYPE or comment not terminated", pi.str, pi.s);
191
- helper_stack_cleanup(&pi.helpers);
192
- return Qnil;
193
- } else if ('-' == *pi.s) {
194
- pi.s++; // skip -
195
- if ('-' != *pi.s) {
196
- set_error(err, "invalid format, bad comment format", pi.str, pi.s);
197
- helper_stack_cleanup(&pi.helpers);
198
- return Qnil;
199
- } else {
200
- pi.s++; // skip second -
201
- read_comment(&pi);
202
- }
203
- } else if ((TolerantEffort == options->effort) ? 0 == strncasecmp("DOCTYPE", pi.s, 7) : 0 == strncmp("DOCTYPE", pi.s, 7)) {
204
- pi.s += 7;
205
- read_doctype(&pi);
206
- } else {
207
- set_error(err, "invalid format, DOCTYPE or comment expected", pi.str, pi.s);
208
- helper_stack_cleanup(&pi.helpers);
209
- return Qnil;
210
- }
211
- break;
212
- case '\0':
213
- set_error(err, "invalid format, document not terminated", pi.str, pi.s);
214
- helper_stack_cleanup(&pi.helpers);
215
- return Qnil;
216
- default:
217
- read_element(&pi);
218
- body_read = 1;
219
- break;
220
- }
221
- if (err_has(&pi.err)) {
222
- *err = pi.err;
223
- helper_stack_cleanup(&pi.helpers);
224
- return Qnil;
225
- }
226
- if (block_given && Qnil != pi.obj && Qundef != pi.obj) {
227
- if (NULL != pcb->finish) {
228
- pcb->finish(&pi);
229
- }
230
- rb_yield(pi.obj);
231
- }
157
+ next_non_white(&pi); // skip white space
158
+ if ('\0' == *pi.s) {
159
+ break;
160
+ }
161
+ if (body_read && 0 != endp) {
162
+ *endp = pi.s;
163
+ break;
164
+ }
165
+ if ('<' != *pi.s) { // all top level entities start with <
166
+ set_error(err, "invalid format, expected <", pi.str, pi.s);
167
+ helper_stack_cleanup(&pi.helpers);
168
+ return Qnil;
169
+ }
170
+ pi.s++; // past <
171
+ switch (*pi.s) {
172
+ case '?': // processing instruction
173
+ pi.s++;
174
+ read_instruction(&pi);
175
+ break;
176
+ case '!': // comment or doctype
177
+ pi.s++;
178
+ if ('\0' == *pi.s) {
179
+ set_error(err, "invalid format, DOCTYPE or comment not terminated", pi.str, pi.s);
180
+ helper_stack_cleanup(&pi.helpers);
181
+ return Qnil;
182
+ } else if ('-' == *pi.s) {
183
+ pi.s++; // skip -
184
+ if ('-' != *pi.s) {
185
+ set_error(err, "invalid format, bad comment format", pi.str, pi.s);
186
+ helper_stack_cleanup(&pi.helpers);
187
+ return Qnil;
188
+ } else {
189
+ pi.s++; // skip second -
190
+ read_comment(&pi);
191
+ }
192
+ } else if ((TolerantEffort == options->effort) ? 0 == strncasecmp("DOCTYPE", pi.s, 7)
193
+ : 0 == strncmp("DOCTYPE", pi.s, 7)) {
194
+ pi.s += 7;
195
+ read_doctype(&pi);
196
+ } else {
197
+ set_error(err, "invalid format, DOCTYPE or comment expected", pi.str, pi.s);
198
+ helper_stack_cleanup(&pi.helpers);
199
+ return Qnil;
200
+ }
201
+ break;
202
+ case '\0':
203
+ set_error(err, "invalid format, document not terminated", pi.str, pi.s);
204
+ helper_stack_cleanup(&pi.helpers);
205
+ return Qnil;
206
+ default:
207
+ read_element(&pi);
208
+ body_read = 1;
209
+ break;
210
+ }
211
+ if (err_has(&pi.err)) {
212
+ *err = pi.err;
213
+ helper_stack_cleanup(&pi.helpers);
214
+ return Qnil;
215
+ }
216
+ if (block_given && Qnil != pi.obj && Qundef != pi.obj) {
217
+ if (NULL != pcb->finish) {
218
+ pcb->finish(&pi);
219
+ }
220
+ rb_yield(pi.obj);
221
+ }
232
222
  }
233
223
  DATA_PTR(wrap) = NULL;
234
224
  helper_stack_cleanup(&pi.helpers);
235
225
  if (NULL != pcb->finish) {
236
- pcb->finish(&pi);
226
+ pcb->finish(&pi);
237
227
  }
238
228
  return pi.obj;
239
229
  }
240
230
 
241
231
  // Entered after the "<?" sequence. Ready to read the rest.
242
- static void
243
- read_instruction(PInfo pi) {
244
- char content[256];
245
- char *content_ptr;
246
- struct _attrStack attrs;
247
- char *attr_name;
248
- char *attr_value;
249
- char *target;
250
- char *end;
251
- char c;
252
- char *cend;
253
- size_t size;
254
- bool attrs_ok = true;
232
+ static void read_instruction(PInfo pi) {
233
+ char content[256];
234
+ char *content_ptr;
235
+ struct _attrStack attrs;
236
+ char *attr_name;
237
+ char *attr_value;
238
+ char *target;
239
+ char *end;
240
+ char c;
241
+ char *cend;
242
+ size_t size;
243
+ bool attrs_ok = true;
255
244
 
256
245
  *content = '\0';
257
246
  attr_stack_init(&attrs);
258
247
  if (0 == (target = read_name_token(pi))) {
259
- return;
248
+ return;
260
249
  }
261
250
  end = pi->s;
262
251
  for (; true; pi->s++) {
263
252
  switch (*pi->s) {
264
253
  case '?':
265
254
  if ('>' == *(pi->s + 1)) {
266
- pi->s++;
267
- goto DONE;
255
+ pi->s++;
256
+ goto DONE;
268
257
  }
269
258
  break;
270
- case '\0':
271
- set_error(&pi->err, "processing instruction not terminated", pi->str, pi->s);
272
- return;
273
- default:
274
- break;
259
+ case '\0': set_error(&pi->err, "processing instruction not terminated", pi->str, pi->s); return;
260
+ default: break;
275
261
  }
276
262
  }
277
263
  DONE:
278
- cend = pi->s;
279
- size = cend - end - 1;
264
+ cend = pi->s;
265
+ size = cend - end - 1;
280
266
  pi->s = end;
281
267
  if (size < sizeof(content)) {
282
- content_ptr = content;
268
+ content_ptr = content;
283
269
  } else {
284
- content_ptr = ALLOC_N(char, size + 1);
270
+ content_ptr = ALLOC_N(char, size + 1);
285
271
  }
286
272
  memcpy(content_ptr, end, size);
287
273
  content_ptr[size] = '\0';
288
274
 
289
275
  next_non_white(pi);
290
- c = *pi->s;
291
- *end = '\0'; // terminate name
276
+ c = *pi->s;
277
+ *end = '\0'; // terminate name
292
278
  if ('?' != c) {
293
- while ('?' != c) {
294
- pi->last = 0;
295
- if ('\0' == *pi->s) {
296
- attr_stack_cleanup(&attrs);
297
- set_error(&pi->err, "invalid format, processing instruction not terminated", pi->str, pi->s);
298
- return;
299
- }
300
- next_non_white(pi);
301
- if (0 == (attr_name = read_name_token(pi))) {
302
- attr_stack_cleanup(&attrs);
303
- return;
304
- }
305
- end = pi->s;
306
- next_non_white(pi);
307
- if ('=' != *pi->s++) {
308
- attrs_ok = false;
309
- break;
310
- }
311
- *end = '\0'; // terminate name
312
- // read value
313
- next_non_white(pi);
314
- if (0 == (attr_value = read_quoted_value(pi))) {
315
- attr_stack_cleanup(&attrs);
316
- return;
317
- }
318
- attr_stack_push(&attrs, attr_name, attr_value);
319
- next_non_white(pi);
320
- if ('\0' == pi->last) {
321
- c = *pi->s;
322
- } else {
323
- c = pi->last;
324
- }
325
- }
326
- if ('?' == *pi->s) {
327
- pi->s++;
328
- }
279
+ while ('?' != c) {
280
+ pi->last = 0;
281
+ if ('\0' == *pi->s) {
282
+ attr_stack_cleanup(&attrs);
283
+ set_error(&pi->err, "invalid format, processing instruction not terminated", pi->str, pi->s);
284
+ return;
285
+ }
286
+ next_non_white(pi);
287
+ if (0 == (attr_name = read_name_token(pi))) {
288
+ attr_stack_cleanup(&attrs);
289
+ return;
290
+ }
291
+ end = pi->s;
292
+ next_non_white(pi);
293
+ if ('=' != *pi->s++) {
294
+ attrs_ok = false;
295
+ break;
296
+ }
297
+ *end = '\0'; // terminate name
298
+ // read value
299
+ next_non_white(pi);
300
+ if (0 == (attr_value = read_quoted_value(pi))) {
301
+ attr_stack_cleanup(&attrs);
302
+ return;
303
+ }
304
+ attr_stack_push(&attrs, attr_name, attr_value);
305
+ next_non_white(pi);
306
+ if ('\0' == pi->last) {
307
+ c = *pi->s;
308
+ } else {
309
+ c = pi->last;
310
+ }
311
+ }
312
+ if ('?' == *pi->s) {
313
+ pi->s++;
314
+ }
329
315
  } else {
330
- pi->s++;
316
+ pi->s++;
331
317
  }
332
318
  if (attrs_ok) {
333
- if ('>' != *pi->s++) {
334
- attr_stack_cleanup(&attrs);
335
- set_error(&pi->err, "invalid format, processing instruction not terminated", pi->str, pi->s);
336
- return;
337
- }
319
+ if ('>' != *pi->s++) {
320
+ attr_stack_cleanup(&attrs);
321
+ set_error(&pi->err, "invalid format, processing instruction not terminated", pi->str, pi->s);
322
+ return;
323
+ }
338
324
  } else {
339
- pi->s = cend + 1;
325
+ pi->s = cend + 1;
340
326
  }
341
327
  if (0 != pi->pcb->instruct) {
342
- if (attrs_ok) {
343
- pi->pcb->instruct(pi, target, attrs.head, 0);
344
- } else {
345
- pi->pcb->instruct(pi, target, attrs.head, content_ptr);
346
- }
328
+ if (attrs_ok) {
329
+ pi->pcb->instruct(pi, target, attrs.head, 0);
330
+ } else {
331
+ pi->pcb->instruct(pi, target, attrs.head, content_ptr);
332
+ }
347
333
  }
348
334
  attr_stack_cleanup(&attrs);
349
335
  if (content_ptr != content) {
350
- xfree(content_ptr);
336
+ xfree(content_ptr);
351
337
  }
352
338
  }
353
339
 
354
- static void
355
- read_delimited(PInfo pi, char end) {
356
- char c;
340
+ static void read_delimited(PInfo pi, char end) {
341
+ char c;
357
342
 
358
343
  if ('"' == end || '\'' == end) {
359
- for (c = *pi->s++; end != c; c = *pi->s++) {
360
- if ('\0' == c) {
361
- set_error(&pi->err, "invalid format, dectype not terminated", pi->str, pi->s);
362
- return;
363
- }
364
- }
344
+ for (c = *pi->s++; end != c; c = *pi->s++) {
345
+ if ('\0' == c) {
346
+ set_error(&pi->err, "invalid format, dectype not terminated", pi->str, pi->s);
347
+ return;
348
+ }
349
+ }
365
350
  } else {
366
- while (1) {
367
- c = *pi->s++;
368
- if (end == c) {
369
- return;
370
- }
371
- switch (c) {
372
- case '\0':
373
- set_error(&pi->err, "invalid format, dectype not terminated", pi->str, pi->s);
374
- return;
375
- case '"':
376
- read_delimited(pi, c);
377
- break;
378
- case '\'':
379
- read_delimited(pi, c);
380
- break;
381
- case '[':
382
- read_delimited(pi, ']');
383
- break;
384
- case '<':
385
- read_delimited(pi, '>');
386
- break;
387
- default:
388
- break;
389
- }
390
- }
351
+ while (1) {
352
+ c = *pi->s++;
353
+ if (end == c) {
354
+ return;
355
+ }
356
+ switch (c) {
357
+ case '\0': set_error(&pi->err, "invalid format, dectype not terminated", pi->str, pi->s); return;
358
+ case '"': read_delimited(pi, c); break;
359
+ case '\'': read_delimited(pi, c); break;
360
+ case '[': read_delimited(pi, ']'); break;
361
+ case '<': read_delimited(pi, '>'); break;
362
+ default: break;
363
+ }
364
+ }
391
365
  }
392
366
  }
393
367
 
394
368
  // Entered after the "<!DOCTYPE" sequence plus the first character after
395
369
  // that. Ready to read the rest.
396
- static void
397
- read_doctype(PInfo pi) {
398
- char *doctype;
370
+ static void read_doctype(PInfo pi) {
371
+ char *doctype;
399
372
 
400
373
  next_non_white(pi);
401
374
  doctype = pi->s;
402
375
  read_delimited(pi, '>');
403
376
  if (err_has(&pi->err)) {
404
- return;
377
+ return;
405
378
  }
406
379
  pi->s--;
407
380
  *pi->s = '\0';
408
381
  pi->s++;
409
382
  if (0 != pi->pcb->add_doctype) {
410
- fix_newlines(doctype);
411
- pi->pcb->add_doctype(pi, doctype);
383
+ fix_newlines(doctype);
384
+ pi->pcb->add_doctype(pi, doctype);
412
385
  }
413
386
  }
414
387
 
415
388
  // Entered after "<!--". Returns nothing; failures are recorded in pi->err.
416
- static void
417
- read_comment(PInfo pi) {
418
- char *end;
419
- char *s;
420
- char *comment;
421
- int done = 0;
389
+ static void read_comment(PInfo pi) {
390
+ char *end;
391
+ char *s;
392
+ char *comment;
393
+ int done = 0;
422
394
 
423
395
  next_non_white(pi);
424
396
  comment = pi->s;
425
- end = strstr(pi->s, "-->");
397
+ end = strstr(pi->s, "-->");
426
398
  if (0 == end) {
427
- set_error(&pi->err, "invalid format, comment not terminated", pi->str, pi->s);
428
- return;
399
+ set_error(&pi->err, "invalid format, comment not terminated", pi->str, pi->s);
400
+ return;
429
401
  }
430
402
  for (s = end - 1; pi->s < s && !done; s--) {
431
- switch(*s) {
432
- case ' ':
433
- case '\t':
434
- case '\f':
435
- case '\n':
436
- case '\r':
437
- break;
438
- default:
439
- *(s + 1) = '\0';
440
- done = 1;
441
- break;
442
- }
403
+ switch (*s) {
404
+ case ' ':
405
+ case '\t':
406
+ case '\f':
407
+ case '\n':
408
+ case '\r': break;
409
+ default:
410
+ *(s + 1) = '\0';
411
+ done = 1;
412
+ break;
413
+ }
443
414
  }
444
- *end = '\0'; // in case the comment was blank
415
+ *end = '\0'; // in case the comment was blank
445
416
  pi->s = end + 3;
446
417
  if (0 != pi->pcb->add_comment) {
447
- fix_newlines(comment);
448
- pi->pcb->add_comment(pi, comment);
418
+ fix_newlines(comment);
419
+ pi->pcb->add_comment(pi, comment);
449
420
  }
450
421
  }
451
422
 
452
423
  // Entered after the '<' and the first character after that. Returns 0, or
453
424
  // (TolerantEffort only) the name from a close tag that did not match this element.
454
- static char*
455
- read_element(PInfo pi) {
456
- struct _attrStack attrs;
457
- const char *attr_name;
458
- const char *attr_value;
459
- char *name;
460
- char *ename;
461
- char *end;
462
- char c;
463
- long elen;
464
- int hasChildren = 0;
465
- int done = 0;
425
+ static char *read_element(PInfo pi) {
426
+ struct _attrStack attrs;
427
+ const char *attr_name;
428
+ const char *attr_value;
429
+ char *name;
430
+ char *ename;
431
+ char *end;
432
+ char c;
433
+ long elen;
434
+ int hasChildren = 0;
435
+ int done = 0;
466
436
 
467
437
  attr_stack_init(&attrs);
468
438
  if (0 == (ename = read_name_token(pi))) {
469
- return 0;
439
+ return 0;
470
440
  }
471
- end = pi->s;
441
+ end = pi->s;
472
442
  elen = end - ename;
473
443
  next_non_white(pi);
474
- c = *pi->s;
444
+ c = *pi->s;
475
445
  *end = '\0';
476
446
  if ('/' == c) {
477
- // empty element, no attributes and no children
478
- pi->s++;
479
- if ('>' != *pi->s) {
480
- attr_stack_cleanup(&attrs);
481
- set_error(&pi->err, "invalid format, element not closed", pi->str, pi->s);
482
- return 0;
483
- }
484
- pi->s++; /* past > */
485
- pi->pcb->add_element(pi, ename, attrs.head, hasChildren);
486
- pi->pcb->end_element(pi, ename);
447
+ // empty element, no attributes and no children
448
+ pi->s++;
449
+ if ('>' != *pi->s) {
450
+ attr_stack_cleanup(&attrs);
451
+ set_error(&pi->err, "invalid format, element not closed", pi->str, pi->s);
452
+ return 0;
453
+ }
454
+ pi->s++; /* past > */
455
+ pi->pcb->add_element(pi, ename, attrs.head, hasChildren);
456
+ pi->pcb->end_element(pi, ename);
487
457
 
488
- attr_stack_cleanup(&attrs);
489
- return 0;
458
+ attr_stack_cleanup(&attrs);
459
+ return 0;
490
460
  }
491
461
  /* read attribute names until the close (/ or >) is reached */
492
462
  while (!done) {
493
- if ('\0' == c) {
494
- if (pi->end <= pi->s) {
495
- break;
496
- }
497
- next_non_white(pi);
498
- c = *pi->s;
499
- }
500
- pi->last = 0;
501
- switch (c) {
502
- case '\0':
503
- attr_stack_cleanup(&attrs);
504
- set_error(&pi->err, "invalid format, document not terminated", pi->str, pi->s);
505
- return 0;
506
- case '/':
507
- /* Element with just attributes. */
508
- pi->s++;
509
- if ('>' != *pi->s) {
510
- attr_stack_cleanup(&attrs);
511
- set_error(&pi->err, "invalid format, element not closed", pi->str, pi->s);
512
- return 0;
513
- }
514
- pi->s++;
515
- pi->pcb->add_element(pi, ename, attrs.head, hasChildren);
516
- pi->pcb->end_element(pi, ename);
517
- attr_stack_cleanup(&attrs);
518
-
519
- return 0;
520
- case '>':
521
- /* has either children or a value */
522
- pi->s++;
523
- hasChildren = 1;
524
- done = 1;
525
- pi->pcb->add_element(pi, ename, attrs.head, hasChildren);
463
+ if ('\0' == c) {
464
+ if (pi->end <= pi->s) {
465
+ break;
466
+ }
467
+ next_non_white(pi);
468
+ c = *pi->s;
469
+ }
470
+ pi->last = 0;
471
+ switch (c) {
472
+ case '\0':
473
+ attr_stack_cleanup(&attrs);
474
+ set_error(&pi->err, "invalid format, document not terminated", pi->str, pi->s);
475
+ return 0;
476
+ case '/':
477
+ /* Element with just attributes. */
478
+ pi->s++;
479
+ if ('>' != *pi->s) {
480
+ attr_stack_cleanup(&attrs);
481
+ set_error(&pi->err, "invalid format, element not closed", pi->str, pi->s);
482
+ return 0;
483
+ }
484
+ pi->s++;
485
+ pi->pcb->add_element(pi, ename, attrs.head, hasChildren);
486
+ pi->pcb->end_element(pi, ename);
487
+ attr_stack_cleanup(&attrs);
488
+
489
+ return 0;
490
+ case '>':
491
+ /* has either children or a value */
492
+ pi->s++;
493
+ hasChildren = 1;
494
+ done = 1;
495
+ pi->pcb->add_element(pi, ename, attrs.head, hasChildren);
526
496
 
527
- break;
528
- default:
529
- /* Attribute name so it's an element and the attribute will be */
530
- /* added to it. */
531
- if (0 == (attr_name = read_name_token(pi))) {
532
- attr_stack_cleanup(&attrs);
533
- return 0;
534
- }
535
- end = pi->s;
536
- next_non_white(pi);
537
- if ('=' != *pi->s++) {
538
- if (TolerantEffort == pi->options->effort) {
539
- pi->s--;
540
- pi->last = *pi->s;
541
- *end = '\0'; /* terminate name */
542
- attr_value = "";
543
- attr_stack_push(&attrs, attr_name, attr_value);
544
- break;
545
- } else {
546
- attr_stack_cleanup(&attrs);
547
- set_error(&pi->err, "invalid format, no attribute value", pi->str, pi->s);
548
- return 0;
549
- }
550
- }
551
- *end = '\0'; /* terminate name */
552
- /* read value */
553
- next_non_white(pi);
554
- if (0 == (attr_value = read_quoted_value(pi))) {
555
- return 0;
556
- }
557
- if (pi->options->convert_special && 0 != strchr(attr_value, '&')) {
558
- if (0 != collapse_special(pi, (char*)attr_value) || err_has(&pi->err)) {
559
- attr_stack_cleanup(&attrs);
560
- return 0;
561
- }
562
- }
563
- attr_stack_push(&attrs, attr_name, attr_value);
564
- break;
565
- }
566
- if ('\0' == pi->last) {
567
- c = '\0';
568
- } else {
569
- c = pi->last;
570
- pi->last = '\0';
571
- }
497
+ break;
498
+ default:
499
+ /* Attribute name so it's an element and the attribute will be */
500
+ /* added to it. */
501
+ if (0 == (attr_name = read_name_token(pi))) {
502
+ attr_stack_cleanup(&attrs);
503
+ return 0;
504
+ }
505
+ end = pi->s;
506
+ next_non_white(pi);
507
+ if ('=' != *pi->s++) {
508
+ if (TolerantEffort == pi->options->effort) {
509
+ pi->s--;
510
+ pi->last = *pi->s;
511
+ *end = '\0'; /* terminate name */
512
+ attr_value = "";
513
+ attr_stack_push(&attrs, attr_name, attr_value);
514
+ break;
515
+ } else {
516
+ attr_stack_cleanup(&attrs);
517
+ set_error(&pi->err, "invalid format, no attribute value", pi->str, pi->s);
518
+ return 0;
519
+ }
520
+ }
521
+ *end = '\0'; /* terminate name */
522
+ /* read value */
523
+ next_non_white(pi);
524
+ if (0 == (attr_value = read_quoted_value(pi))) {
525
+ return 0;
526
+ }
527
+ if (pi->options->convert_special && 0 != strchr(attr_value, '&')) {
528
+ if (0 != collapse_special(pi, (char *)attr_value) || err_has(&pi->err)) {
529
+ attr_stack_cleanup(&attrs);
530
+ return 0;
531
+ }
532
+ }
533
+ attr_stack_push(&attrs, attr_name, attr_value);
534
+ break;
535
+ }
536
+ if ('\0' == pi->last) {
537
+ c = '\0';
538
+ } else {
539
+ c = pi->last;
540
+ pi->last = '\0';
541
+ }
572
542
  }
573
543
  if (hasChildren) {
574
- char *start;
575
- int first = 1;
576
-
577
- done = 0;
578
- /* read children */
579
- while (!done) {
580
- start = pi->s;
581
- next_non_white(pi);
582
- if (OffSkip == pi->options->skip && start < pi->s && '<' == *pi->s) {
583
- c = *pi->s;
584
- *pi->s = '\0';
585
- pi->pcb->add_text(pi, start, 1);
586
- *pi->s = c;
587
- }
588
- c = *pi->s++;
589
- if ('\0' == c) {
590
- attr_stack_cleanup(&attrs);
591
- set_error(&pi->err, "invalid format, document not terminated", pi->str, pi->s);
592
- return 0;
593
- }
594
- if ('<' == c) {
595
- char *slash;
596
-
597
- switch (*pi->s) {
598
- case '!': /* better be a comment or CDATA */
599
- pi->s++;
600
- if ('-' == *pi->s && '-' == *(pi->s + 1)) {
601
- pi->s += 2;
602
- read_comment(pi);
603
- } else if ((TolerantEffort == pi->options->effort) ?
604
- 0 == strncasecmp("[CDATA[", pi->s, 7) :
605
- 0 == strncmp("[CDATA[", pi->s, 7)) {
606
- pi->s += 7;
607
- read_cdata(pi);
608
- } else {
609
- attr_stack_cleanup(&attrs);
610
- set_error(&pi->err, "invalid format, invalid comment or CDATA format", pi->str, pi->s);
611
- return 0;
612
- }
613
- break;
614
- case '?': /* processing instruction */
615
- pi->s++;
616
- read_instruction(pi);
617
- break;
618
- case '/':
619
- slash = pi->s;
620
- pi->s++;
621
- if (0 == (name = read_name_token(pi))) {
622
- attr_stack_cleanup(&attrs);
623
- return 0;
624
- }
625
- end = pi->s;
626
- next_non_white(pi);
627
- c = *pi->s;
628
- *end = '\0';
629
- if (0 != ((TolerantEffort == pi->options->effort) ? strcasecmp(name, ename) : strcmp(name, ename))) {
630
- attr_stack_cleanup(&attrs);
631
- if (TolerantEffort == pi->options->effort) {
632
- pi->pcb->end_element(pi, ename);
633
- return name;
634
- } else {
635
- set_error(&pi->err, "invalid format, elements overlap", pi->str, pi->s);
636
- return 0;
637
- }
638
- }
639
- if ('>' != c) {
640
- attr_stack_cleanup(&attrs);
641
- set_error(&pi->err, "invalid format, element not closed", pi->str, pi->s);
642
- return 0;
643
- }
644
- if (first && start != slash - 1) {
645
- // Some white space between start and here so add as
646
- // text after checking skip.
647
- *(slash - 1) = '\0';
648
- switch (pi->options->skip) {
649
- case CrSkip: {
650
- char *s = start;
651
- char *e = start;
652
-
653
- for (; '\0' != *e; e++) {
654
- if ('\r' != *e) {
655
- *s++ = *e;
656
- }
657
- }
658
- *s = '\0';
659
- break;
660
- }
661
- case SpcSkip:
662
- *start = '\0';
663
- break;
664
- case NoSkip:
665
- case OffSkip:
666
- default:
667
- break;
668
- }
669
- if ('\0' != *start) {
670
- pi->pcb->add_text(pi, start, 1);
671
- }
672
- }
673
- pi->s++;
674
- pi->pcb->end_element(pi, ename);
675
- attr_stack_cleanup(&attrs);
676
- return 0;
677
- case '\0':
678
- attr_stack_cleanup(&attrs);
679
- if (TolerantEffort == pi->options->effort) {
680
- return 0;
681
- } else {
682
- set_error(&pi->err, "invalid format, document not terminated", pi->str, pi->s);
683
- return 0;
684
- }
685
- default:
686
- first = 0;
687
- /* a child element */
688
- // Child closed with mismatched name.
689
- if (0 != (name = read_element(pi))) {
690
- attr_stack_cleanup(&attrs);
691
-
692
- if (0 == ((TolerantEffort == pi->options->effort) ? strcasecmp(name, ename) : strcmp(name, ename))) {
693
- pi->s++;
694
- pi->pcb->end_element(pi, ename);
695
- return 0;
696
- } else { // not the correct element yet
697
- pi->pcb->end_element(pi, ename);
698
- return name;
699
- }
700
- } else if (err_has(&pi->err)) {
701
- return 0;
702
- }
703
- break;
704
- }
705
- } else { /* read as TEXT */
706
- pi->s = start;
707
- /*pi->s--; */
708
- read_text(pi);
709
- /*read_reduced_text(pi); */
710
-
711
- /* to exit read_text with no errors the next character must be < */
712
- if ('/' == *(pi->s + 1) &&
713
- 0 == ((TolerantEffort == pi->options->effort) ? strncasecmp(ename, pi->s + 2, elen) : strncmp(ename, pi->s + 2, elen)) &&
714
- '>' == *(pi->s + elen + 2)) {
715
-
716
- /* close tag after text so treat as a value */
717
- pi->s += elen + 3;
718
- pi->pcb->end_element(pi, ename);
719
- attr_stack_cleanup(&attrs);
720
- return 0;
721
- }
722
- }
723
- }
544
+ char *start;
545
+ int first = 1;
546
+
547
+ done = 0;
548
+ /* read children */
549
+ while (!done) {
550
+ start = pi->s;
551
+ next_non_white(pi);
552
+ if (OffSkip == pi->options->skip && start < pi->s && '<' == *pi->s) {
553
+ c = *pi->s;
554
+ *pi->s = '\0';
555
+ pi->pcb->add_text(pi, start, 1);
556
+ *pi->s = c;
557
+ }
558
+ c = *pi->s++;
559
+ if ('\0' == c) {
560
+ attr_stack_cleanup(&attrs);
561
+ set_error(&pi->err, "invalid format, document not terminated", pi->str, pi->s);
562
+ return 0;
563
+ }
564
+ if ('<' == c) {
565
+ char *slash;
566
+
567
+ switch (*pi->s) {
568
+ case '!': /* better be a comment or CDATA */
569
+ pi->s++;
570
+ if ('-' == *pi->s && '-' == *(pi->s + 1)) {
571
+ pi->s += 2;
572
+ read_comment(pi);
573
+ } else if ((TolerantEffort == pi->options->effort) ? 0 == strncasecmp("[CDATA[", pi->s, 7)
574
+ : 0 == strncmp("[CDATA[", pi->s, 7)) {
575
+ pi->s += 7;
576
+ read_cdata(pi);
577
+ } else {
578
+ attr_stack_cleanup(&attrs);
579
+ set_error(&pi->err, "invalid format, invalid comment or CDATA format", pi->str, pi->s);
580
+ return 0;
581
+ }
582
+ break;
583
+ case '?': /* processing instruction */
584
+ pi->s++;
585
+ read_instruction(pi);
586
+ break;
587
+ case '/':
588
+ slash = pi->s;
589
+ pi->s++;
590
+ if (0 == (name = read_name_token(pi))) {
591
+ attr_stack_cleanup(&attrs);
592
+ return 0;
593
+ }
594
+ end = pi->s;
595
+ next_non_white(pi);
596
+ c = *pi->s;
597
+ *end = '\0';
598
+ if (0 !=
599
+ ((TolerantEffort == pi->options->effort) ? strcasecmp(name, ename) : strcmp(name, ename))) {
600
+ attr_stack_cleanup(&attrs);
601
+ if (TolerantEffort == pi->options->effort) {
602
+ pi->pcb->end_element(pi, ename);
603
+ return name;
604
+ } else {
605
+ set_error(&pi->err, "invalid format, elements overlap", pi->str, pi->s);
606
+ return 0;
607
+ }
608
+ }
609
+ if ('>' != c) {
610
+ attr_stack_cleanup(&attrs);
611
+ set_error(&pi->err, "invalid format, element not closed", pi->str, pi->s);
612
+ return 0;
613
+ }
614
+ if (first && start != slash - 1) {
615
+ // Some white space between start and here so add as
616
+ // text after checking skip.
617
+ *(slash - 1) = '\0';
618
+ switch (pi->options->skip) {
619
+ case CrSkip: {
620
+ char *s = start;
621
+ char *e = start;
622
+
623
+ for (; '\0' != *e; e++) {
624
+ if ('\r' != *e) {
625
+ *s++ = *e;
626
+ }
627
+ }
628
+ *s = '\0';
629
+ break;
630
+ }
631
+ case SpcSkip: *start = '\0'; break;
632
+ case NoSkip:
633
+ case OffSkip:
634
+ default: break;
635
+ }
636
+ if ('\0' != *start) {
637
+ pi->pcb->add_text(pi, start, 1);
638
+ }
639
+ }
640
+ pi->s++;
641
+ pi->pcb->end_element(pi, ename);
642
+ attr_stack_cleanup(&attrs);
643
+ return 0;
644
+ case '\0':
645
+ attr_stack_cleanup(&attrs);
646
+ if (TolerantEffort == pi->options->effort) {
647
+ return 0;
648
+ } else {
649
+ set_error(&pi->err, "invalid format, document not terminated", pi->str, pi->s);
650
+ return 0;
651
+ }
652
+ default:
653
+ first = 0;
654
+ /* a child element */
655
+ // Child closed with mismatched name.
656
+ if (0 != (name = read_element(pi))) {
657
+ attr_stack_cleanup(&attrs);
658
+
659
+ if (0 ==
660
+ ((TolerantEffort == pi->options->effort) ? strcasecmp(name, ename) : strcmp(name, ename))) {
661
+ pi->s++;
662
+ pi->pcb->end_element(pi, ename);
663
+ return 0;
664
+ } else { // not the correct element yet
665
+ pi->pcb->end_element(pi, ename);
666
+ return name;
667
+ }
668
+ } else if (err_has(&pi->err)) {
669
+ return 0;
670
+ }
671
+ break;
672
+ }
673
+ } else { /* read as TEXT */
674
+ pi->s = start;
675
+ /*pi->s--; */
676
+ read_text(pi);
677
+ /*read_reduced_text(pi); */
678
+
679
+ /* to exit read_text with no errors the next character must be < */
680
+ if ('/' == *(pi->s + 1) &&
681
+ 0 == ((TolerantEffort == pi->options->effort) ? strncasecmp(ename, pi->s + 2, elen)
682
+ : strncmp(ename, pi->s + 2, elen)) &&
683
+ '>' == *(pi->s + elen + 2)) {
684
+ /* close tag after text so treat as a value */
685
+ pi->s += elen + 3;
686
+ pi->pcb->end_element(pi, ename);
687
+ attr_stack_cleanup(&attrs);
688
+ return 0;
689
+ }
690
+ }
691
+ }
724
692
  }
725
693
  attr_stack_cleanup(&attrs);
726
694
  return 0;
727
695
  }
728
696
 
729
- static void
730
- read_text(PInfo pi) {
731
- char buf[MAX_TEXT_LEN];
732
- char *b = buf;
733
- char *alloc_buf = 0;
734
- char *end = b + sizeof(buf) - 2;
735
- char c;
736
- int done = 0;
697
+ static void read_text(PInfo pi) {
698
+ char buf[MAX_TEXT_LEN];
699
+ char *b = buf;
700
+ char *alloc_buf = 0;
701
+ char *end = b + sizeof(buf) - 2;
702
+ char c;
703
+ int done = 0;
737
704
 
738
705
  while (!done) {
739
- c = *pi->s++;
740
- switch(c) {
741
- case '<':
742
- done = 1;
743
- pi->s--;
744
- break;
745
- case '\0':
746
- set_error(&pi->err, "invalid format, document not terminated", pi->str, pi->s);
747
- return;
748
- default:
749
- if (end <= (b + (('&' == c) ? 7 : 0))) { /* extra 8 for special just in case it is sequence of bytes */
750
- unsigned long size;
751
-
752
- if (0 == alloc_buf) {
753
- size = sizeof(buf) * 2;
754
- alloc_buf = ALLOC_N(char, size);
755
- memcpy(alloc_buf, buf, b - buf);
756
- b = alloc_buf + (b - buf);
757
- } else {
758
- unsigned long pos = b - alloc_buf;
759
-
760
- size = (end - alloc_buf) * 2;
761
- REALLOC_N(alloc_buf, char, size);
762
- b = alloc_buf + pos;
763
- }
764
- end = alloc_buf + size - 2;
765
- }
766
- if ('&' == c) {
767
- if (0 == (b = read_coded_chars(pi, b))) {
768
- return;
769
- }
770
- } else {
771
- if (0 <= c && c <= 0x20) {
772
- if (StrictEffort == pi->options->effort && 'x' == xml_valid_lower_chars[(unsigned char)c]) {
773
- set_error(&pi->err, "invalid character", pi->str, pi->s);
774
- return;
775
- }
776
- switch (pi->options->skip) {
777
- case CrSkip:
778
- if (buf != b && '\n' == c && '\r' == *(b - 1)) {
779
- *(b - 1) = '\n';
780
- } else {
781
- *b++ = c;
782
- }
783
- break;
784
- case SpcSkip:
785
- if (is_white(c)) {
786
- if (buf == b || ' ' != *(b - 1)) {
787
- *b++ = ' ';
788
- }
789
- } else {
790
- *b++ = c;
791
- }
792
- break;
793
- case NoSkip:
794
- case OffSkip:
795
- default:
796
- *b++ = c;
797
- break;
798
- }
799
- } else {
800
- *b++ = c;
801
- }
802
- }
803
- break;
804
- }
706
+ c = *pi->s++;
707
+ switch (c) {
708
+ case '<':
709
+ done = 1;
710
+ pi->s--;
711
+ break;
712
+ case '\0': set_error(&pi->err, "invalid format, document not terminated", pi->str, pi->s); return;
713
+ default:
714
+ if (end <= (b + (('&' == c) ? 7 : 0))) { /* extra 8 for special just in case it is sequence of bytes */
715
+ unsigned long size;
716
+
717
+ if (0 == alloc_buf) {
718
+ size = sizeof(buf) * 2;
719
+ alloc_buf = ALLOC_N(char, size);
720
+ memcpy(alloc_buf, buf, b - buf);
721
+ b = alloc_buf + (b - buf);
722
+ } else {
723
+ unsigned long pos = b - alloc_buf;
724
+
725
+ size = (end - alloc_buf) * 2;
726
+ REALLOC_N(alloc_buf, char, size);
727
+ b = alloc_buf + pos;
728
+ }
729
+ end = alloc_buf + size - 2;
730
+ }
731
+ if ('&' == c) {
732
+ if (0 == (b = read_coded_chars(pi, b))) {
733
+ return;
734
+ }
735
+ } else {
736
+ if (0 <= c && c <= 0x20) {
737
+ if (StrictEffort == pi->options->effort && 'x' == xml_valid_lower_chars[(unsigned char)c]) {
738
+ set_error(&pi->err, "invalid character", pi->str, pi->s);
739
+ return;
740
+ }
741
+ switch (pi->options->skip) {
742
+ case CrSkip:
743
+ if (buf != b && '\n' == c && '\r' == *(b - 1)) {
744
+ *(b - 1) = '\n';
745
+ } else {
746
+ *b++ = c;
747
+ }
748
+ break;
749
+ case SpcSkip:
750
+ if (is_white(c)) {
751
+ if (buf == b || ' ' != *(b - 1)) {
752
+ *b++ = ' ';
753
+ }
754
+ } else {
755
+ *b++ = c;
756
+ }
757
+ break;
758
+ case NoSkip:
759
+ case OffSkip:
760
+ default: *b++ = c; break;
761
+ }
762
+ } else {
763
+ *b++ = c;
764
+ }
765
+ }
766
+ break;
767
+ }
805
768
  }
806
769
  *b = '\0';
807
770
  if (0 != alloc_buf) {
808
- fix_newlines(alloc_buf);
809
- pi->pcb->add_text(pi, alloc_buf, ('/' == *(pi->s + 1)));
810
- xfree(alloc_buf);
771
+ fix_newlines(alloc_buf);
772
+ pi->pcb->add_text(pi, alloc_buf, ('/' == *(pi->s + 1)));
773
+ xfree(alloc_buf);
811
774
  } else {
812
- fix_newlines(buf);
813
- pi->pcb->add_text(pi, buf, ('/' == *(pi->s + 1)));
775
+ fix_newlines(buf);
776
+ pi->pcb->add_text(pi, buf, ('/' == *(pi->s + 1)));
814
777
  }
815
778
  }
816
779
 
@@ -886,323 +849,322 @@ read_reduced_text(PInfo pi) {
886
849
  }
887
850
  #endif
888
851
 
889
- static char*
890
- read_name_token(PInfo pi) {
891
- char *start;
852
+ static char *read_name_token(PInfo pi) {
853
+ char *start;
892
854
 
893
855
  next_non_white(pi);
894
856
  start = pi->s;
895
857
  for (; 1; pi->s++) {
896
- switch (*pi->s) {
897
- case ' ':
898
- case '\t':
899
- case '\f':
900
- case '?':
901
- case '=':
902
- case '/':
903
- case '>':
904
- case '\n':
905
- case '\r':
906
- return start;
907
- case '\0':
908
- /* documents never terminate after a name token */
909
- set_error(&pi->err, "invalid format, document not terminated", pi->str, pi->s);
910
- return 0;
911
- break; /* to avoid warnings */
912
- case ':':
913
- if ('\0' == *pi->options->strip_ns) {
914
- break;
915
- } else if ('*' == *pi->options->strip_ns && '\0' == pi->options->strip_ns[1]) {
916
- start = pi->s + 1;
917
- } else if (0 == strncmp(pi->options->strip_ns, start, pi->s - start)) {
918
- start = pi->s + 1;
919
- }
920
- break;
921
- default:
922
- break;
923
- }
858
+ switch (*pi->s) {
859
+ case ' ':
860
+ case '\t':
861
+ case '\f':
862
+ case '?':
863
+ case '=':
864
+ case '/':
865
+ case '>':
866
+ case '\n':
867
+ case '\r': return start;
868
+ case '\0':
869
+ /* documents never terminate after a name token */
870
+ set_error(&pi->err, "invalid format, document not terminated", pi->str, pi->s);
871
+ return 0;
872
+ break; /* to avoid warnings */
873
+ case ':':
874
+ if ('\0' == *pi->options->strip_ns) {
875
+ break;
876
+ } else if ('*' == *pi->options->strip_ns && '\0' == pi->options->strip_ns[1]) {
877
+ start = pi->s + 1;
878
+ } else if (0 == strncmp(pi->options->strip_ns, start, pi->s - start)) {
879
+ start = pi->s + 1;
880
+ }
881
+ break;
882
+ default: break;
883
+ }
924
884
  }
925
885
  return start;
926
886
  }
927
887
 
928
- static void
929
- read_cdata(PInfo pi) {
930
- char *start;
931
- char *end;
888
+ static void read_cdata(PInfo pi) {
889
+ char *start;
890
+ char *end;
932
891
 
933
892
  start = pi->s;
934
- end = strstr(pi->s, "]]>");
893
+ end = strstr(pi->s, "]]>");
935
894
  if (end == 0) {
936
- set_error(&pi->err, "invalid format, CDATA not terminated", pi->str, pi->s);
937
- return;
895
+ set_error(&pi->err, "invalid format, CDATA not terminated", pi->str, pi->s);
896
+ return;
938
897
  }
939
- *end = '\0';
898
+ *end = '\0';
940
899
  pi->s = end + 3;
941
900
  if (0 != pi->pcb->add_cdata) {
942
- fix_newlines(start);
943
- pi->pcb->add_cdata(pi, start, end - start);
901
+ fix_newlines(start);
902
+ pi->pcb->add_cdata(pi, start, end - start);
944
903
  }
945
904
  }
946
905
 
947
906
  /* Assume the value starts immediately and goes until the quote character is
948
907
  * reached again. Do not read the character after the terminating quote.
949
908
  */
950
- static char*
951
- read_quoted_value(PInfo pi) {
952
- char *value = 0;
909
+ static char *read_quoted_value(PInfo pi) {
910
+ char *value = 0;
953
911
 
954
912
  if ('"' == *pi->s || '\'' == *pi->s) {
955
- char term = *pi->s;
913
+ char term = *pi->s;
956
914
 
957
- pi->s++; /* skip quote character */
915
+ pi->s++; /* skip quote character */
958
916
  value = pi->s;
959
917
  for (; *pi->s != term; pi->s++) {
960
918
  if ('\0' == *pi->s) {
961
919
  set_error(&pi->err, "invalid format, document not terminated", pi->str, pi->s);
962
- return 0;
920
+ return 0;
963
921
  }
964
922
  }
965
- *pi->s = '\0'; /* terminate value */
966
- pi->s++; /* move past quote */
923
+ *pi->s = '\0'; /* terminate value */
924
+ pi->s++; /* move past quote */
967
925
  } else if (StrictEffort == pi->options->effort) {
968
- set_error(&pi->err, "invalid format, expected a quote character", pi->str, pi->s);
969
- return 0;
926
+ set_error(&pi->err, "invalid format, expected a quote character", pi->str, pi->s);
927
+ return 0;
970
928
  } else if (TolerantEffort == pi->options->effort) {
971
929
  value = pi->s;
972
930
  for (; 1; pi->s++) {
973
- switch (*pi->s) {
974
- case '\0':
975
- set_error(&pi->err, "invalid format, document not terminated", pi->str, pi->s);
976
- return 0;
977
- case ' ':
978
- case '/':
979
- case '>':
980
- case '?': // for instructions
981
- case '\t':
982
- case '\n':
983
- case '\r':
984
- pi->last = *pi->s;
985
- *pi->s = '\0'; /* terminate value */
986
- pi->s++;
987
- return value;
988
- default:
989
- break;
931
+ switch (*pi->s) {
932
+ case '\0': set_error(&pi->err, "invalid format, document not terminated", pi->str, pi->s); return 0;
933
+ case ' ':
934
+ case '/':
935
+ case '>':
936
+ case '?': // for instructions
937
+ case '\t':
938
+ case '\n':
939
+ case '\r':
940
+ pi->last = *pi->s;
941
+ *pi->s = '\0'; /* terminate value */
942
+ pi->s++;
943
+ return value;
944
+ default: break;
990
945
  }
991
946
  }
992
947
  } else {
993
948
  value = pi->s;
994
949
  next_white(pi);
995
- if ('\0' == *pi->s) {
996
- set_error(&pi->err, "invalid format, document not terminated", pi->str, pi->s);
997
- return 0;
950
+ if ('\0' == *pi->s) {
951
+ set_error(&pi->err, "invalid format, document not terminated", pi->str, pi->s);
952
+ return 0;
998
953
  }
999
954
  *pi->s++ = '\0'; /* terminate value */
1000
955
  }
1001
956
  return value;
1002
957
  }
1003
958
 
/* Parses a run of hexadecimal digits that must be terminated by ';'.
 *
 * b  - first character of the digit run (after any "x" marker)
 * up - receives the parsed value on success; left untouched on failure
 *
 * Returns a pointer to the terminating ';' on success. Returns 0 when a
 * character that is not a hex digit (including NUL) appears before the ';',
 * when there are no digits at all, or when the value does not fit in 64
 * bits.
 */
static char *read_hex_uint64(char *b, uint64_t *up) {
    char    *first = b;
    uint64_t u     = 0;

    for (; ';' != *b; b++) {
        char     c = *b;
        uint64_t nibble;

        if ('0' <= c && c <= '9') {
            nibble = (uint64_t)(c - '0');
        } else if ('a' <= c && c <= 'f') {
            nibble = (uint64_t)(c - 'a' + 10);
        } else if ('A' <= c && c <= 'F') {
            nibble = (uint64_t)(c - 'A' + 10);
        } else {
            return 0;
        }
        if (0 != (u >> 60)) {
            /* the next shift would push significant bits off the top */
            return 0;
        }
        u = (u << 4) | nibble;
    }
    if (first == b) {
        return 0; /* ";" with no digits is not a number */
    }
    *up = u;

    return b;
}
1025
979
 
/* Parses a run of decimal digits that must be terminated by ';'.
 *
 * b  - first character of the digit run
 * up - receives the parsed value on success; left untouched on failure
 *
 * Returns a pointer to the terminating ';' on success. Returns 0 when a
 * character that is not a decimal digit (including NUL) appears before the
 * ';', when there are no digits at all, or when the value does not fit in
 * 64 bits.
 */
static char *read_10_uint64(char *b, uint64_t *up) {
    const uint64_t max   = ~(uint64_t)0;
    char          *first = b;
    uint64_t       u     = 0;

    for (; ';' != *b; b++) {
        char     c = *b;
        uint64_t digit;

        if (c < '0' || '9' < c) {
            return 0;
        }
        digit = (uint64_t)(c - '0');
        if (u > (max - digit) / 10) {
            /* u * 10 + digit would wrap around */
            return 0;
        }
        u = (u * 10) + digit;
    }
    if (first == b) {
        return 0; /* ";" with no digits is not a number */
    }
    *up = u;

    return b;
}
1043
996
 
1044
- static char*
1045
- read_coded_chars(PInfo pi, char *text) {
1046
- char *b, buf[32];
1047
- char *end = buf + sizeof(buf) - 1;
1048
- char *s;
1049
- long blen = 0;
997
+ static char *read_coded_chars(PInfo pi, char *text) {
998
+ char *b, buf[32];
999
+ char *end = buf + sizeof(buf) - 1;
1000
+ char *s;
1001
+ long blen = 0;
1050
1002
 
1051
1003
  for (b = buf, s = pi->s; b < end; b++, s++) {
1052
- *b = *s;
1053
- if (';' == *s) {
1054
- *(b + 1) = '\0';
1055
- blen = b - buf;
1056
- s++;
1057
- break;
1058
- }
1004
+ *b = *s;
1005
+ if (';' == *s) {
1006
+ *(b + 1) = '\0';
1007
+ blen = b - buf;
1008
+ s++;
1009
+ break;
1010
+ }
1059
1011
  }
1060
1012
  if (b > end) {
1061
- *text++ = '&';
1013
+ *text++ = '&';
1062
1014
  } else if ('#' == *buf) {
1063
- uint64_t u = 0;
1015
+ uint64_t u = 0;
1064
1016
 
1065
- b = buf + 1;
1066
- if ('x' == *b || 'X' == *b) {
1067
- b = read_hex_uint64(b + 1, &u);
1068
- } else {
1069
- b = read_10_uint64(b, &u);
1070
- }
1071
- if (0 == b) {
1072
- *text++ = '&';
1073
- } else {
1074
- if (u <= 0x000000000000007FULL) {
1075
- *text++ = (char)u;
1076
- } else if (ox_utf8_encoding == pi->options->rb_enc) {
1077
- text = ox_ucs_to_utf8_chars(text, u);
1078
- } else if (0 == pi->options->rb_enc) {
1079
- pi->options->rb_enc = ox_utf8_encoding;
1080
- text = ox_ucs_to_utf8_chars(text, u);
1081
- } else if (TolerantEffort == pi->options->effort) {
1082
- *text++ = '&';
1083
- return text;
1084
- } else if (u <= 0x00000000000000FFULL) {
1085
- *text++ = (char)u;
1086
- } else {
1087
- /*set_error(&pi->err, "Invalid encoding, need UTF-8 or UTF-16 encoding to parse &#nnnn; character sequences.", pi->str, pi->s); */
1088
- set_error(&pi->err, "Invalid encoding, need UTF-8 encoding to parse &#nnnn; character sequences.", pi->str, pi->s);
1089
- return NULL;
1090
- }
1091
- pi->s = s;
1092
- }
1017
+ b = buf + 1;
1018
+ if ('x' == *b || 'X' == *b) {
1019
+ b = read_hex_uint64(b + 1, &u);
1020
+ } else {
1021
+ b = read_10_uint64(b, &u);
1022
+ }
1023
+ if (0 == b) {
1024
+ *text++ = '&';
1025
+ } else {
1026
+ if (u <= 0x000000000000007FULL) {
1027
+ *text++ = (char)u;
1028
+ } else if (ox_utf8_encoding == pi->options->rb_enc) {
1029
+ text = ox_ucs_to_utf8_chars(text, u);
1030
+ } else if (0 == pi->options->rb_enc) {
1031
+ pi->options->rb_enc = ox_utf8_encoding;
1032
+ text = ox_ucs_to_utf8_chars(text, u);
1033
+ } else if (TolerantEffort == pi->options->effort) {
1034
+ *text++ = '&';
1035
+ return text;
1036
+ } else if (u <= 0x00000000000000FFULL) {
1037
+ *text++ = (char)u;
1038
+ } else {
1039
+ /*set_error(&pi->err, "Invalid encoding, need UTF-8 or UTF-16 encoding to parse &#nnnn; character
1040
+ * sequences.", pi->str, pi->s); */
1041
+ set_error(&pi->err,
1042
+ "Invalid encoding, need UTF-8 encoding to parse &#nnnn; character sequences.",
1043
+ pi->str,
1044
+ pi->s);
1045
+ return NULL;
1046
+ }
1047
+ pi->s = s;
1048
+ }
1093
1049
  } else {
1094
- char *t2;
1095
-
1096
- buf[blen] = '\0';
1097
- if (NULL == (t2 = ox_entity_lookup(text, buf))) {
1098
- *text++ = '&';
1099
- } else {
1100
- text = t2;
1101
- pi->s = s;
1102
- }
1050
+ char *t2;
1051
+
1052
+ buf[blen] = '\0';
1053
+ if (NULL == (t2 = ox_entity_lookup(text, buf))) {
1054
+ *text++ = '&';
1055
+ } else {
1056
+ text = t2;
1057
+ pi->s = s;
1058
+ }
1103
1059
  }
1104
1060
  return text;
1105
1061
  }
1106
1062
 
1107
- static int
1108
- collapse_special(PInfo pi, char *str) {
1109
- char *s = str;
1110
- char *b = str;
1063
+ static int collapse_special(PInfo pi, char *str) {
1064
+ char *s = str;
1065
+ char *b = str;
1111
1066
 
1112
1067
  while ('\0' != *s) {
1113
- if ('&' == *s) {
1114
- int c;
1115
- char *end;
1116
-
1117
- s++;
1118
- if ('#' == *s) {
1119
- uint64_t u = 0;
1120
- char x;
1121
-
1122
- s++;
1123
- if ('x' == *s || 'X' == *s) {
1124
- x = *s;
1125
- s++;
1126
- end = read_hex_uint64(s, &u);
1127
- } else {
1128
- x = '\0';
1129
- end = read_10_uint64(s, &u);
1130
- }
1131
- if (0 == end) {
1132
- if (TolerantEffort == pi->options->effort) {
1133
- *b++ = '&';
1134
- *b++ = '#';
1135
- if ('\0' != x) {
1136
- *b++ = x;
1137
- }
1138
- continue;
1139
- }
1140
- return EDOM;
1141
- }
1142
- if (u <= 0x000000000000007FULL) {
1143
- *b++ = (char)u;
1144
- } else if (ox_utf8_encoding == pi->options->rb_enc) {
1145
- b = ox_ucs_to_utf8_chars(b, u);
1146
- /* TBD support UTF-16 */
1147
- } else if (0 == pi->options->rb_enc) {
1148
- pi->options->rb_enc = ox_utf8_encoding;
1149
- b = ox_ucs_to_utf8_chars(b, u);
1150
- } else {
1151
- /* set_error(&pi->err, "Invalid encoding, need UTF-8 or UTF-16 encoding to parse &#nnnn; character sequences.", pi->str, pi->s);*/
1152
- set_error(&pi->err, "Invalid encoding, need UTF-8 encoding to parse &#nnnn; character sequences.", pi->str, pi->s);
1153
- return 0;
1154
- }
1155
- s = end + 1;
1156
- } else {
1157
- if (0 == strncasecmp(s, "lt;", 3)) {
1158
- c = '<';
1159
- s += 3;
1160
- } else if (0 == strncasecmp(s, "gt;", 3)) {
1161
- c = '>';
1162
- s += 3;
1163
- } else if (0 == strncasecmp(s, "amp;", 4)) {
1164
- c = '&';
1165
- s += 4;
1166
- } else if (0 == strncasecmp(s, "quot;", 5)) {
1167
- c = '"';
1168
- s += 5;
1169
- } else if (0 == strncasecmp(s, "apos;", 5)) {
1170
- c = '\'';
1171
- s += 5;
1172
- } else if (TolerantEffort == pi->options->effort) {
1173
- *b++ = '&';
1174
- continue;
1175
- } else {
1176
- char key[16];
1177
- char *k = key;
1178
- char *kend = key + sizeof(key) - 1;
1179
-
1180
- *k++ = *s;
1181
- while (';' != *s++) {
1182
- if ('\0' == *s) {
1183
- set_error(&pi->err, "Invalid format, special character does not end with a semicolon", pi->str, pi->s);
1184
- return EDOM;
1185
- }
1186
- if (kend <= k) {
1187
- k = key;
1188
- break;
1189
- }
1190
- *k++ = *s;
1191
- }
1192
- k--;
1193
- *k = '\0';
1194
- if ('\0' == *key || NULL == (b = ox_entity_lookup(b, key))) {
1195
- set_error(&pi->err, "Invalid format, invalid special character sequence", pi->str, pi->s);
1196
- c = '?';
1197
- return 0;
1198
- }
1199
- continue;
1200
- }
1201
- *b++ = (char)c;
1202
- }
1203
- } else {
1204
- *b++ = *s++;
1205
- }
1068
+ if ('&' == *s) {
1069
+ int c;
1070
+ char *end;
1071
+
1072
+ s++;
1073
+ if ('#' == *s) {
1074
+ uint64_t u = 0;
1075
+ char x;
1076
+
1077
+ s++;
1078
+ if ('x' == *s || 'X' == *s) {
1079
+ x = *s;
1080
+ s++;
1081
+ end = read_hex_uint64(s, &u);
1082
+ } else {
1083
+ x = '\0';
1084
+ end = read_10_uint64(s, &u);
1085
+ }
1086
+ if (0 == end) {
1087
+ if (TolerantEffort == pi->options->effort) {
1088
+ *b++ = '&';
1089
+ *b++ = '#';
1090
+ if ('\0' != x) {
1091
+ *b++ = x;
1092
+ }
1093
+ continue;
1094
+ }
1095
+ return EDOM;
1096
+ }
1097
+ if (u <= 0x000000000000007FULL) {
1098
+ *b++ = (char)u;
1099
+ } else if (ox_utf8_encoding == pi->options->rb_enc) {
1100
+ b = ox_ucs_to_utf8_chars(b, u);
1101
+ /* TBD support UTF-16 */
1102
+ } else if (0 == pi->options->rb_enc) {
1103
+ pi->options->rb_enc = ox_utf8_encoding;
1104
+ b = ox_ucs_to_utf8_chars(b, u);
1105
+ } else {
1106
+ /* set_error(&pi->err, "Invalid encoding, need UTF-8 or UTF-16 encoding to parse &#nnnn; character
1107
+ * sequences.", pi->str, pi->s);*/
1108
+ set_error(&pi->err,
1109
+ "Invalid encoding, need UTF-8 encoding to parse &#nnnn; character sequences.",
1110
+ pi->str,
1111
+ pi->s);
1112
+ return 0;
1113
+ }
1114
+ s = end + 1;
1115
+ } else {
1116
+ if (0 == strncasecmp(s, "lt;", 3)) {
1117
+ c = '<';
1118
+ s += 3;
1119
+ } else if (0 == strncasecmp(s, "gt;", 3)) {
1120
+ c = '>';
1121
+ s += 3;
1122
+ } else if (0 == strncasecmp(s, "amp;", 4)) {
1123
+ c = '&';
1124
+ s += 4;
1125
+ } else if (0 == strncasecmp(s, "quot;", 5)) {
1126
+ c = '"';
1127
+ s += 5;
1128
+ } else if (0 == strncasecmp(s, "apos;", 5)) {
1129
+ c = '\'';
1130
+ s += 5;
1131
+ } else if (TolerantEffort == pi->options->effort) {
1132
+ *b++ = '&';
1133
+ continue;
1134
+ } else {
1135
+ char key[16];
1136
+ char *k = key;
1137
+ char *kend = key + sizeof(key) - 1;
1138
+
1139
+ *k++ = *s;
1140
+ while (';' != *s++) {
1141
+ if ('\0' == *s) {
1142
+ set_error(&pi->err,
1143
+ "Invalid format, special character does not end with a semicolon",
1144
+ pi->str,
1145
+ pi->s);
1146
+ return EDOM;
1147
+ }
1148
+ if (kend <= k) {
1149
+ k = key;
1150
+ break;
1151
+ }
1152
+ *k++ = *s;
1153
+ }
1154
+ k--;
1155
+ *k = '\0';
1156
+ if ('\0' == *key || NULL == (b = ox_entity_lookup(b, key))) {
1157
+ set_error(&pi->err, "Invalid format, invalid special character sequence", pi->str, pi->s);
1158
+ c = '?';
1159
+ return 0;
1160
+ }
1161
+ continue;
1162
+ }
1163
+ *b++ = (char)c;
1164
+ }
1165
+ } else {
1166
+ *b++ = *s++;
1167
+ }
1206
1168
  }
1207
1169
  *b = '\0';
1208
1170