ox 2.14.13 → 2.14.15
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +12 -0
- data/README.md +1 -1
- data/ext/ox/attr.h +33 -39
- data/ext/ox/base64.c +48 -42
- data/ext/ox/base64.h +4 -4
- data/ext/ox/buf.h +80 -86
- data/ext/ox/builder.c +378 -423
- data/ext/ox/cache.c +2 -2
- data/ext/ox/cache8.c +37 -40
- data/ext/ox/cache8.h +7 -7
- data/ext/ox/dump.c +838 -867
- data/ext/ox/err.c +16 -13
- data/ext/ox/err.h +11 -12
- data/ext/ox/extconf.rb +5 -5
- data/ext/ox/gen_load.c +135 -137
- data/ext/ox/hash_load.c +130 -148
- data/ext/ox/helper.h +32 -39
- data/ext/ox/intern.c +1 -2
- data/ext/ox/obj_load.c +590 -644
- data/ext/ox/ox.c +2 -2
- data/ext/ox/ox.h +5 -5
- data/ext/ox/parse.c +836 -874
- data/ext/ox/sax.c +38 -23
- data/ext/ox/sax.h +2 -2
- data/ext/ox/sax_as.c +78 -94
- data/ext/ox/sax_buf.c +85 -94
- data/ext/ox/sax_buf.h +101 -120
- data/ext/ox/sax_hint.c +175 -184
- data/ext/ox/sax_hint.h +19 -19
- data/ext/ox/sax_stack.h +59 -45
- data/ext/ox/slotcache.c +2 -2
- data/ext/ox/slotcache.h +4 -4
- data/ext/ox/special.c +320 -327
- data/ext/ox/special.h +2 -2
- data/ext/ox/type.h +19 -19
- data/lib/ox/bag.rb +13 -9
- data/lib/ox/cdata.rb +0 -2
- data/lib/ox/comment.rb +0 -2
- data/lib/ox/doctype.rb +0 -2
- data/lib/ox/document.rb +3 -5
- data/lib/ox/element.rb +41 -26
- data/lib/ox/error.rb +0 -3
- data/lib/ox/hasattrs.rb +7 -8
- data/lib/ox/instruct.rb +4 -6
- data/lib/ox/node.rb +3 -4
- data/lib/ox/raw.rb +0 -2
- data/lib/ox/sax.rb +20 -36
- data/lib/ox/version.rb +1 -2
- data/lib/ox/xmlrpc_adapter.rb +4 -6
- data/lib/ox.rb +15 -16
- metadata +6 -5
data/ext/ox/parse.c
CHANGED
@@ -3,35 +3,35 @@
|
|
3
3
|
* All rights reserved.
|
4
4
|
*/
|
5
5
|
|
6
|
-
#include <stdlib.h>
|
7
6
|
#include <errno.h>
|
8
7
|
#include <stdbool.h>
|
9
8
|
#include <stdio.h>
|
9
|
+
#include <stdlib.h>
|
10
10
|
#include <string.h>
|
11
11
|
#include <strings.h>
|
12
12
|
|
13
|
-
#include "ruby.h"
|
14
|
-
#include "ox.h"
|
15
|
-
#include "err.h"
|
16
13
|
#include "attr.h"
|
17
|
-
#include "
|
14
|
+
#include "err.h"
|
18
15
|
#include "helper.h"
|
16
|
+
#include "intern.h"
|
17
|
+
#include "ox.h"
|
18
|
+
#include "ruby.h"
|
19
19
|
#include "special.h"
|
20
20
|
|
21
|
-
static void
|
22
|
-
static void
|
23
|
-
static void
|
24
|
-
static char*
|
25
|
-
static void
|
21
|
+
static void read_instruction(PInfo pi);
|
22
|
+
static void read_doctype(PInfo pi);
|
23
|
+
static void read_comment(PInfo pi);
|
24
|
+
static char *read_element(PInfo pi);
|
25
|
+
static void read_text(PInfo pi);
|
26
26
|
/*static void read_reduced_text(PInfo pi); */
|
27
|
-
static void
|
28
|
-
static char*
|
29
|
-
static char*
|
30
|
-
static char*
|
31
|
-
static char*
|
32
|
-
static char*
|
33
|
-
static void
|
34
|
-
static int
|
27
|
+
static void read_cdata(PInfo pi);
|
28
|
+
static char *read_name_token(PInfo pi);
|
29
|
+
static char *read_quoted_value(PInfo pi);
|
30
|
+
static char *read_hex_uint64(char *b, uint64_t *up);
|
31
|
+
static char *read_10_uint64(char *b, uint64_t *up);
|
32
|
+
static char *read_coded_chars(PInfo pi, char *text);
|
33
|
+
static void next_non_white(PInfo pi);
|
34
|
+
static int collapse_special(PInfo pi, char *str);
|
35
35
|
|
36
36
|
/* This XML parser is a single pass, destructive, callback parser. It is a
|
37
37
|
* single pass parse since it only make one pass over the characters in the
|
@@ -46,53 +46,43 @@ static int collapse_special(PInfo pi, char *str);
|
|
46
46
|
* all cases to parse the string.
|
47
47
|
*/
|
48
48
|
|
49
|
-
static char
|
49
|
+
static char xml_valid_lower_chars[34] = "xxxxxxxxxooxxoxxxxxxxxxxxxxxxxxxo";
|
50
50
|
|
51
|
-
inline static int
|
52
|
-
is_white(char c) {
|
51
|
+
inline static int is_white(char c) {
|
53
52
|
switch (c) {
|
54
53
|
case ' ':
|
55
54
|
case '\t':
|
56
55
|
case '\f':
|
57
56
|
case '\n':
|
58
|
-
case '\r':
|
59
|
-
|
60
|
-
default:
|
61
|
-
return 0;
|
57
|
+
case '\r': return 1;
|
58
|
+
default: return 0;
|
62
59
|
}
|
63
60
|
}
|
64
61
|
|
65
|
-
|
66
|
-
inline static void
|
67
|
-
next_non_white(PInfo pi) {
|
62
|
+
inline static void next_non_white(PInfo pi) {
|
68
63
|
for (; 1; pi->s++) {
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
return;
|
78
|
-
}
|
64
|
+
switch (*pi->s) {
|
65
|
+
case ' ':
|
66
|
+
case '\t':
|
67
|
+
case '\f':
|
68
|
+
case '\n':
|
69
|
+
case '\r': break;
|
70
|
+
default: return;
|
71
|
+
}
|
79
72
|
}
|
80
73
|
}
|
81
74
|
|
82
|
-
inline static void
|
83
|
-
next_white(PInfo pi) {
|
75
|
+
inline static void next_white(PInfo pi) {
|
84
76
|
for (; 1; pi->s++) {
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
break;
|
95
|
-
}
|
77
|
+
switch (*pi->s) {
|
78
|
+
case ' ':
|
79
|
+
case '\t':
|
80
|
+
case '\f':
|
81
|
+
case '\n':
|
82
|
+
case '\r':
|
83
|
+
case '\0': return;
|
84
|
+
default: break;
|
85
|
+
}
|
96
86
|
}
|
97
87
|
}
|
98
88
|
|
@@ -100,53 +90,52 @@ static void fix_newlines(char *buf) {
|
|
100
90
|
#if HAVE_INDEX
|
101
91
|
if (NULL != index(buf, '\r')) {
|
102
92
|
#endif
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
93
|
+
char *s = buf;
|
94
|
+
char *d = buf;
|
95
|
+
|
96
|
+
for (; '\0' != *s; s++) {
|
97
|
+
if ('\r' == *s) {
|
98
|
+
if ('\n' == *(s + 1)) {
|
99
|
+
continue;
|
100
|
+
}
|
101
|
+
*d = '\n';
|
102
|
+
} else if (d < s) {
|
103
|
+
*d = *s;
|
104
|
+
}
|
105
|
+
d++;
|
106
|
+
}
|
107
|
+
*d = '\0';
|
118
108
|
#if HAVE_INDEX
|
119
109
|
}
|
120
110
|
#endif
|
121
111
|
}
|
122
112
|
|
123
|
-
static void
|
124
|
-
mark_pi_cb(void *ptr) {
|
113
|
+
static void mark_pi_cb(void *ptr) {
|
125
114
|
if (NULL != ptr) {
|
126
|
-
|
127
|
-
|
115
|
+
HelperStack stack = &((PInfo)ptr)->helpers;
|
116
|
+
Helper h;
|
128
117
|
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
118
|
+
for (h = stack->head; h < stack->tail; h++) {
|
119
|
+
if (NoCode != h->type) {
|
120
|
+
rb_gc_mark(h->obj);
|
121
|
+
}
|
122
|
+
}
|
134
123
|
}
|
135
124
|
}
|
136
125
|
|
137
126
|
VALUE
|
138
127
|
ox_parse(char *xml, size_t len, ParseCallbacks pcb, char **endp, Options options, Err err) {
|
139
|
-
struct _pInfo
|
140
|
-
int
|
141
|
-
int
|
142
|
-
volatile VALUE
|
128
|
+
struct _pInfo pi;
|
129
|
+
int body_read = 0;
|
130
|
+
int block_given = rb_block_given_p();
|
131
|
+
volatile VALUE wrap;
|
143
132
|
|
144
133
|
if (0 == xml) {
|
145
|
-
|
146
|
-
|
134
|
+
set_error(err, "Invalid arg, xml string can not be null", xml, 0);
|
135
|
+
return Qnil;
|
147
136
|
}
|
148
137
|
if (DEBUG <= options->trace) {
|
149
|
-
|
138
|
+
printf("Parsing xml:\n%s\n", xml);
|
150
139
|
}
|
151
140
|
// initialize parse info
|
152
141
|
helper_stack_init(&pi.helpers);
|
@@ -154,663 +143,637 @@ ox_parse(char *xml, size_t len, ParseCallbacks pcb, char **endp, Options options
|
|
154
143
|
wrap = Data_Wrap_Struct(rb_cObject, mark_pi_cb, NULL, &pi);
|
155
144
|
|
156
145
|
err_init(&pi.err);
|
157
|
-
pi.str
|
158
|
-
pi.end
|
159
|
-
pi.s
|
160
|
-
pi.pcb
|
161
|
-
pi.obj
|
146
|
+
pi.str = xml;
|
147
|
+
pi.end = pi.str + len;
|
148
|
+
pi.s = xml;
|
149
|
+
pi.pcb = pcb;
|
150
|
+
pi.obj = Qnil;
|
162
151
|
pi.circ_array = 0;
|
163
|
-
pi.options
|
164
|
-
pi.marked
|
165
|
-
pi.mark_size
|
166
|
-
pi.mark_cnt
|
152
|
+
pi.options = options;
|
153
|
+
pi.marked = NULL;
|
154
|
+
pi.mark_size = 0;
|
155
|
+
pi.mark_cnt = 0;
|
167
156
|
while (1) {
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
157
|
+
next_non_white(&pi); // skip white space
|
158
|
+
if ('\0' == *pi.s) {
|
159
|
+
break;
|
160
|
+
}
|
161
|
+
if (body_read && 0 != endp) {
|
162
|
+
*endp = pi.s;
|
163
|
+
break;
|
164
|
+
}
|
165
|
+
if ('<' != *pi.s) { // all top level entities start with <
|
166
|
+
set_error(err, "invalid format, expected <", pi.str, pi.s);
|
167
|
+
helper_stack_cleanup(&pi.helpers);
|
168
|
+
return Qnil;
|
169
|
+
}
|
170
|
+
pi.s++; // past <
|
171
|
+
switch (*pi.s) {
|
172
|
+
case '?': // processing instruction
|
173
|
+
pi.s++;
|
174
|
+
read_instruction(&pi);
|
175
|
+
break;
|
176
|
+
case '!': // comment or doctype
|
177
|
+
pi.s++;
|
178
|
+
if ('\0' == *pi.s) {
|
179
|
+
set_error(err, "invalid format, DOCTYPE or comment not terminated", pi.str, pi.s);
|
180
|
+
helper_stack_cleanup(&pi.helpers);
|
181
|
+
return Qnil;
|
182
|
+
} else if ('-' == *pi.s) {
|
183
|
+
pi.s++; // skip -
|
184
|
+
if ('-' != *pi.s) {
|
185
|
+
set_error(err, "invalid format, bad comment format", pi.str, pi.s);
|
186
|
+
helper_stack_cleanup(&pi.helpers);
|
187
|
+
return Qnil;
|
188
|
+
} else {
|
189
|
+
pi.s++; // skip second -
|
190
|
+
read_comment(&pi);
|
191
|
+
}
|
192
|
+
} else if ((TolerantEffort == options->effort) ? 0 == strncasecmp("DOCTYPE", pi.s, 7)
|
193
|
+
: 0 == strncmp("DOCTYPE", pi.s, 7)) {
|
194
|
+
pi.s += 7;
|
195
|
+
read_doctype(&pi);
|
196
|
+
} else {
|
197
|
+
set_error(err, "invalid format, DOCTYPE or comment expected", pi.str, pi.s);
|
198
|
+
helper_stack_cleanup(&pi.helpers);
|
199
|
+
return Qnil;
|
200
|
+
}
|
201
|
+
break;
|
202
|
+
case '\0':
|
203
|
+
set_error(err, "invalid format, document not terminated", pi.str, pi.s);
|
204
|
+
helper_stack_cleanup(&pi.helpers);
|
205
|
+
return Qnil;
|
206
|
+
default:
|
207
|
+
read_element(&pi);
|
208
|
+
body_read = 1;
|
209
|
+
break;
|
210
|
+
}
|
211
|
+
if (err_has(&pi.err)) {
|
212
|
+
*err = pi.err;
|
213
|
+
helper_stack_cleanup(&pi.helpers);
|
214
|
+
return Qnil;
|
215
|
+
}
|
216
|
+
if (block_given && Qnil != pi.obj && Qundef != pi.obj) {
|
217
|
+
if (NULL != pcb->finish) {
|
218
|
+
pcb->finish(&pi);
|
219
|
+
}
|
220
|
+
rb_yield(pi.obj);
|
221
|
+
}
|
232
222
|
}
|
233
223
|
DATA_PTR(wrap) = NULL;
|
234
224
|
helper_stack_cleanup(&pi.helpers);
|
235
225
|
if (NULL != pcb->finish) {
|
236
|
-
|
226
|
+
pcb->finish(&pi);
|
237
227
|
}
|
238
228
|
return pi.obj;
|
239
229
|
}
|
240
230
|
|
241
231
|
// Entered after the "<?" sequence. Ready to read the rest.
|
242
|
-
static void
|
243
|
-
|
244
|
-
char
|
245
|
-
|
246
|
-
|
247
|
-
char
|
248
|
-
char
|
249
|
-
char
|
250
|
-
char
|
251
|
-
char
|
252
|
-
|
253
|
-
|
254
|
-
bool attrs_ok = true;
|
232
|
+
static void read_instruction(PInfo pi) {
|
233
|
+
char content[256];
|
234
|
+
char *content_ptr;
|
235
|
+
struct _attrStack attrs;
|
236
|
+
char *attr_name;
|
237
|
+
char *attr_value;
|
238
|
+
char *target;
|
239
|
+
char *end;
|
240
|
+
char c;
|
241
|
+
char *cend;
|
242
|
+
size_t size;
|
243
|
+
bool attrs_ok = true;
|
255
244
|
|
256
245
|
*content = '\0';
|
257
246
|
attr_stack_init(&attrs);
|
258
247
|
if (0 == (target = read_name_token(pi))) {
|
259
|
-
|
248
|
+
return;
|
260
249
|
}
|
261
250
|
end = pi->s;
|
262
251
|
for (; true; pi->s++) {
|
263
252
|
switch (*pi->s) {
|
264
253
|
case '?':
|
265
254
|
if ('>' == *(pi->s + 1)) {
|
266
|
-
|
267
|
-
|
255
|
+
pi->s++;
|
256
|
+
goto DONE;
|
268
257
|
}
|
269
258
|
break;
|
270
|
-
case '\0':
|
271
|
-
|
272
|
-
return;
|
273
|
-
default:
|
274
|
-
break;
|
259
|
+
case '\0': set_error(&pi->err, "processing instruction not terminated", pi->str, pi->s); return;
|
260
|
+
default: break;
|
275
261
|
}
|
276
262
|
}
|
277
263
|
DONE:
|
278
|
-
cend
|
279
|
-
size
|
264
|
+
cend = pi->s;
|
265
|
+
size = cend - end - 1;
|
280
266
|
pi->s = end;
|
281
267
|
if (size < sizeof(content)) {
|
282
|
-
|
268
|
+
content_ptr = content;
|
283
269
|
} else {
|
284
|
-
|
270
|
+
content_ptr = ALLOC_N(char, size + 1);
|
285
271
|
}
|
286
272
|
memcpy(content_ptr, end, size);
|
287
273
|
content_ptr[size] = '\0';
|
288
274
|
|
289
275
|
next_non_white(pi);
|
290
|
-
c
|
291
|
-
*end = '\0';
|
276
|
+
c = *pi->s;
|
277
|
+
*end = '\0'; // terminate name
|
292
278
|
if ('?' != c) {
|
293
|
-
|
294
|
-
|
295
|
-
|
296
|
-
|
297
|
-
|
298
|
-
|
299
|
-
|
300
|
-
|
301
|
-
|
302
|
-
|
303
|
-
|
304
|
-
|
305
|
-
|
306
|
-
|
307
|
-
|
308
|
-
|
309
|
-
|
310
|
-
|
311
|
-
|
312
|
-
|
313
|
-
|
314
|
-
|
315
|
-
|
316
|
-
|
317
|
-
|
318
|
-
|
319
|
-
|
320
|
-
|
321
|
-
|
322
|
-
|
323
|
-
|
324
|
-
|
325
|
-
|
326
|
-
|
327
|
-
|
328
|
-
|
279
|
+
while ('?' != c) {
|
280
|
+
pi->last = 0;
|
281
|
+
if ('\0' == *pi->s) {
|
282
|
+
attr_stack_cleanup(&attrs);
|
283
|
+
set_error(&pi->err, "invalid format, processing instruction not terminated", pi->str, pi->s);
|
284
|
+
return;
|
285
|
+
}
|
286
|
+
next_non_white(pi);
|
287
|
+
if (0 == (attr_name = read_name_token(pi))) {
|
288
|
+
attr_stack_cleanup(&attrs);
|
289
|
+
return;
|
290
|
+
}
|
291
|
+
end = pi->s;
|
292
|
+
next_non_white(pi);
|
293
|
+
if ('=' != *pi->s++) {
|
294
|
+
attrs_ok = false;
|
295
|
+
break;
|
296
|
+
}
|
297
|
+
*end = '\0'; // terminate name
|
298
|
+
// read value
|
299
|
+
next_non_white(pi);
|
300
|
+
if (0 == (attr_value = read_quoted_value(pi))) {
|
301
|
+
attr_stack_cleanup(&attrs);
|
302
|
+
return;
|
303
|
+
}
|
304
|
+
attr_stack_push(&attrs, attr_name, attr_value);
|
305
|
+
next_non_white(pi);
|
306
|
+
if ('\0' == pi->last) {
|
307
|
+
c = *pi->s;
|
308
|
+
} else {
|
309
|
+
c = pi->last;
|
310
|
+
}
|
311
|
+
}
|
312
|
+
if ('?' == *pi->s) {
|
313
|
+
pi->s++;
|
314
|
+
}
|
329
315
|
} else {
|
330
|
-
|
316
|
+
pi->s++;
|
331
317
|
}
|
332
318
|
if (attrs_ok) {
|
333
|
-
|
334
|
-
|
335
|
-
|
336
|
-
|
337
|
-
|
319
|
+
if ('>' != *pi->s++) {
|
320
|
+
attr_stack_cleanup(&attrs);
|
321
|
+
set_error(&pi->err, "invalid format, processing instruction not terminated", pi->str, pi->s);
|
322
|
+
return;
|
323
|
+
}
|
338
324
|
} else {
|
339
|
-
|
325
|
+
pi->s = cend + 1;
|
340
326
|
}
|
341
327
|
if (0 != pi->pcb->instruct) {
|
342
|
-
|
343
|
-
|
344
|
-
|
345
|
-
|
346
|
-
|
328
|
+
if (attrs_ok) {
|
329
|
+
pi->pcb->instruct(pi, target, attrs.head, 0);
|
330
|
+
} else {
|
331
|
+
pi->pcb->instruct(pi, target, attrs.head, content_ptr);
|
332
|
+
}
|
347
333
|
}
|
348
334
|
attr_stack_cleanup(&attrs);
|
349
335
|
if (content_ptr != content) {
|
350
|
-
|
336
|
+
xfree(content_ptr);
|
351
337
|
}
|
352
338
|
}
|
353
339
|
|
354
|
-
static void
|
355
|
-
|
356
|
-
char c;
|
340
|
+
static void read_delimited(PInfo pi, char end) {
|
341
|
+
char c;
|
357
342
|
|
358
343
|
if ('"' == end || '\'' == end) {
|
359
|
-
|
360
|
-
|
361
|
-
|
362
|
-
|
363
|
-
|
364
|
-
|
344
|
+
for (c = *pi->s++; end != c; c = *pi->s++) {
|
345
|
+
if ('\0' == c) {
|
346
|
+
set_error(&pi->err, "invalid format, dectype not terminated", pi->str, pi->s);
|
347
|
+
return;
|
348
|
+
}
|
349
|
+
}
|
365
350
|
} else {
|
366
|
-
|
367
|
-
|
368
|
-
|
369
|
-
|
370
|
-
|
371
|
-
|
372
|
-
|
373
|
-
|
374
|
-
|
375
|
-
|
376
|
-
|
377
|
-
|
378
|
-
|
379
|
-
|
380
|
-
break;
|
381
|
-
case '[':
|
382
|
-
read_delimited(pi, ']');
|
383
|
-
break;
|
384
|
-
case '<':
|
385
|
-
read_delimited(pi, '>');
|
386
|
-
break;
|
387
|
-
default:
|
388
|
-
break;
|
389
|
-
}
|
390
|
-
}
|
351
|
+
while (1) {
|
352
|
+
c = *pi->s++;
|
353
|
+
if (end == c) {
|
354
|
+
return;
|
355
|
+
}
|
356
|
+
switch (c) {
|
357
|
+
case '\0': set_error(&pi->err, "invalid format, dectype not terminated", pi->str, pi->s); return;
|
358
|
+
case '"': read_delimited(pi, c); break;
|
359
|
+
case '\'': read_delimited(pi, c); break;
|
360
|
+
case '[': read_delimited(pi, ']'); break;
|
361
|
+
case '<': read_delimited(pi, '>'); break;
|
362
|
+
default: break;
|
363
|
+
}
|
364
|
+
}
|
391
365
|
}
|
392
366
|
}
|
393
367
|
|
394
368
|
// Entered after the "<!DOCTYPE" sequence plus the first character after
|
395
369
|
// that. Ready to read the rest.
|
396
|
-
static void
|
397
|
-
|
398
|
-
char *doctype;
|
370
|
+
static void read_doctype(PInfo pi) {
|
371
|
+
char *doctype;
|
399
372
|
|
400
373
|
next_non_white(pi);
|
401
374
|
doctype = pi->s;
|
402
375
|
read_delimited(pi, '>');
|
403
376
|
if (err_has(&pi->err)) {
|
404
|
-
|
377
|
+
return;
|
405
378
|
}
|
406
379
|
pi->s--;
|
407
380
|
*pi->s = '\0';
|
408
381
|
pi->s++;
|
409
382
|
if (0 != pi->pcb->add_doctype) {
|
410
|
-
|
411
|
-
|
383
|
+
fix_newlines(doctype);
|
384
|
+
pi->pcb->add_doctype(pi, doctype);
|
412
385
|
}
|
413
386
|
}
|
414
387
|
|
415
388
|
// Entered after "<!--". Returns error code.
|
416
|
-
static void
|
417
|
-
|
418
|
-
char
|
419
|
-
char
|
420
|
-
|
421
|
-
int done = 0;
|
389
|
+
static void read_comment(PInfo pi) {
|
390
|
+
char *end;
|
391
|
+
char *s;
|
392
|
+
char *comment;
|
393
|
+
int done = 0;
|
422
394
|
|
423
395
|
next_non_white(pi);
|
424
396
|
comment = pi->s;
|
425
|
-
end
|
397
|
+
end = strstr(pi->s, "-->");
|
426
398
|
if (0 == end) {
|
427
|
-
|
428
|
-
|
399
|
+
set_error(&pi->err, "invalid format, comment not terminated", pi->str, pi->s);
|
400
|
+
return;
|
429
401
|
}
|
430
402
|
for (s = end - 1; pi->s < s && !done; s--) {
|
431
|
-
|
432
|
-
|
433
|
-
|
434
|
-
|
435
|
-
|
436
|
-
|
437
|
-
|
438
|
-
|
439
|
-
|
440
|
-
|
441
|
-
|
442
|
-
}
|
403
|
+
switch (*s) {
|
404
|
+
case ' ':
|
405
|
+
case '\t':
|
406
|
+
case '\f':
|
407
|
+
case '\n':
|
408
|
+
case '\r': break;
|
409
|
+
default:
|
410
|
+
*(s + 1) = '\0';
|
411
|
+
done = 1;
|
412
|
+
break;
|
413
|
+
}
|
443
414
|
}
|
444
|
-
*end
|
415
|
+
*end = '\0'; // in case the comment was blank
|
445
416
|
pi->s = end + 3;
|
446
417
|
if (0 != pi->pcb->add_comment) {
|
447
|
-
|
448
|
-
|
418
|
+
fix_newlines(comment);
|
419
|
+
pi->pcb->add_comment(pi, comment);
|
449
420
|
}
|
450
421
|
}
|
451
422
|
|
452
423
|
// Entered after the '<' and the first character after that. Returns stat
|
453
424
|
// code.
|
454
|
-
static char*
|
455
|
-
|
456
|
-
|
457
|
-
const char
|
458
|
-
|
459
|
-
char
|
460
|
-
char
|
461
|
-
char
|
462
|
-
|
463
|
-
|
464
|
-
int
|
465
|
-
int done = 0;
|
425
|
+
static char *read_element(PInfo pi) {
|
426
|
+
struct _attrStack attrs;
|
427
|
+
const char *attr_name;
|
428
|
+
const char *attr_value;
|
429
|
+
char *name;
|
430
|
+
char *ename;
|
431
|
+
char *end;
|
432
|
+
char c;
|
433
|
+
long elen;
|
434
|
+
int hasChildren = 0;
|
435
|
+
int done = 0;
|
466
436
|
|
467
437
|
attr_stack_init(&attrs);
|
468
438
|
if (0 == (ename = read_name_token(pi))) {
|
469
|
-
|
439
|
+
return 0;
|
470
440
|
}
|
471
|
-
end
|
441
|
+
end = pi->s;
|
472
442
|
elen = end - ename;
|
473
443
|
next_non_white(pi);
|
474
|
-
c
|
444
|
+
c = *pi->s;
|
475
445
|
*end = '\0';
|
476
446
|
if ('/' == c) {
|
477
|
-
|
478
|
-
|
479
|
-
|
480
|
-
|
481
|
-
|
482
|
-
|
483
|
-
|
484
|
-
|
485
|
-
|
486
|
-
|
447
|
+
// empty element, no attributes and no children
|
448
|
+
pi->s++;
|
449
|
+
if ('>' != *pi->s) {
|
450
|
+
attr_stack_cleanup(&attrs);
|
451
|
+
set_error(&pi->err, "invalid format, element not closed", pi->str, pi->s);
|
452
|
+
return 0;
|
453
|
+
}
|
454
|
+
pi->s++; /* past > */
|
455
|
+
pi->pcb->add_element(pi, ename, attrs.head, hasChildren);
|
456
|
+
pi->pcb->end_element(pi, ename);
|
487
457
|
|
488
|
-
|
489
|
-
|
458
|
+
attr_stack_cleanup(&attrs);
|
459
|
+
return 0;
|
490
460
|
}
|
491
461
|
/* read attribute names until the close (/ or >) is reached */
|
492
462
|
while (!done) {
|
493
|
-
|
494
|
-
|
495
|
-
|
496
|
-
|
497
|
-
|
498
|
-
|
499
|
-
|
500
|
-
|
501
|
-
|
502
|
-
|
503
|
-
|
504
|
-
|
505
|
-
|
506
|
-
|
507
|
-
|
508
|
-
|
509
|
-
|
510
|
-
|
511
|
-
|
512
|
-
|
513
|
-
|
514
|
-
|
515
|
-
|
516
|
-
|
517
|
-
|
518
|
-
|
519
|
-
|
520
|
-
|
521
|
-
|
522
|
-
|
523
|
-
|
524
|
-
|
525
|
-
|
463
|
+
if ('\0' == c) {
|
464
|
+
if (pi->end <= pi->s) {
|
465
|
+
break;
|
466
|
+
}
|
467
|
+
next_non_white(pi);
|
468
|
+
c = *pi->s;
|
469
|
+
}
|
470
|
+
pi->last = 0;
|
471
|
+
switch (c) {
|
472
|
+
case '\0':
|
473
|
+
attr_stack_cleanup(&attrs);
|
474
|
+
set_error(&pi->err, "invalid format, document not terminated", pi->str, pi->s);
|
475
|
+
return 0;
|
476
|
+
case '/':
|
477
|
+
/* Element with just attributes. */
|
478
|
+
pi->s++;
|
479
|
+
if ('>' != *pi->s) {
|
480
|
+
attr_stack_cleanup(&attrs);
|
481
|
+
set_error(&pi->err, "invalid format, element not closed", pi->str, pi->s);
|
482
|
+
return 0;
|
483
|
+
}
|
484
|
+
pi->s++;
|
485
|
+
pi->pcb->add_element(pi, ename, attrs.head, hasChildren);
|
486
|
+
pi->pcb->end_element(pi, ename);
|
487
|
+
attr_stack_cleanup(&attrs);
|
488
|
+
|
489
|
+
return 0;
|
490
|
+
case '>':
|
491
|
+
/* has either children or a value */
|
492
|
+
pi->s++;
|
493
|
+
hasChildren = 1;
|
494
|
+
done = 1;
|
495
|
+
pi->pcb->add_element(pi, ename, attrs.head, hasChildren);
|
526
496
|
|
527
|
-
|
528
|
-
|
529
|
-
|
530
|
-
|
531
|
-
|
532
|
-
|
533
|
-
|
534
|
-
|
535
|
-
|
536
|
-
|
537
|
-
|
538
|
-
|
539
|
-
|
540
|
-
|
541
|
-
|
542
|
-
|
543
|
-
|
544
|
-
|
545
|
-
|
546
|
-
|
547
|
-
|
548
|
-
|
549
|
-
|
550
|
-
|
551
|
-
|
552
|
-
|
553
|
-
|
554
|
-
|
555
|
-
|
556
|
-
|
557
|
-
|
558
|
-
|
559
|
-
|
560
|
-
|
561
|
-
|
562
|
-
|
563
|
-
|
564
|
-
|
565
|
-
|
566
|
-
|
567
|
-
|
568
|
-
|
569
|
-
|
570
|
-
|
571
|
-
|
497
|
+
break;
|
498
|
+
default:
|
499
|
+
/* Attribute name so it's an element and the attribute will be */
|
500
|
+
/* added to it. */
|
501
|
+
if (0 == (attr_name = read_name_token(pi))) {
|
502
|
+
attr_stack_cleanup(&attrs);
|
503
|
+
return 0;
|
504
|
+
}
|
505
|
+
end = pi->s;
|
506
|
+
next_non_white(pi);
|
507
|
+
if ('=' != *pi->s++) {
|
508
|
+
if (TolerantEffort == pi->options->effort) {
|
509
|
+
pi->s--;
|
510
|
+
pi->last = *pi->s;
|
511
|
+
*end = '\0'; /* terminate name */
|
512
|
+
attr_value = "";
|
513
|
+
attr_stack_push(&attrs, attr_name, attr_value);
|
514
|
+
break;
|
515
|
+
} else {
|
516
|
+
attr_stack_cleanup(&attrs);
|
517
|
+
set_error(&pi->err, "invalid format, no attribute value", pi->str, pi->s);
|
518
|
+
return 0;
|
519
|
+
}
|
520
|
+
}
|
521
|
+
*end = '\0'; /* terminate name */
|
522
|
+
/* read value */
|
523
|
+
next_non_white(pi);
|
524
|
+
if (0 == (attr_value = read_quoted_value(pi))) {
|
525
|
+
return 0;
|
526
|
+
}
|
527
|
+
if (pi->options->convert_special && 0 != strchr(attr_value, '&')) {
|
528
|
+
if (0 != collapse_special(pi, (char *)attr_value) || err_has(&pi->err)) {
|
529
|
+
attr_stack_cleanup(&attrs);
|
530
|
+
return 0;
|
531
|
+
}
|
532
|
+
}
|
533
|
+
attr_stack_push(&attrs, attr_name, attr_value);
|
534
|
+
break;
|
535
|
+
}
|
536
|
+
if ('\0' == pi->last) {
|
537
|
+
c = '\0';
|
538
|
+
} else {
|
539
|
+
c = pi->last;
|
540
|
+
pi->last = '\0';
|
541
|
+
}
|
572
542
|
}
|
573
543
|
if (hasChildren) {
|
574
|
-
|
575
|
-
|
576
|
-
|
577
|
-
|
578
|
-
|
579
|
-
|
580
|
-
|
581
|
-
|
582
|
-
|
583
|
-
|
584
|
-
|
585
|
-
|
586
|
-
|
587
|
-
|
588
|
-
|
589
|
-
|
590
|
-
|
591
|
-
|
592
|
-
|
593
|
-
|
594
|
-
|
595
|
-
|
596
|
-
|
597
|
-
|
598
|
-
|
599
|
-
|
600
|
-
|
601
|
-
|
602
|
-
|
603
|
-
|
604
|
-
|
605
|
-
|
606
|
-
|
607
|
-
|
608
|
-
|
609
|
-
|
610
|
-
|
611
|
-
|
612
|
-
|
613
|
-
|
614
|
-
|
615
|
-
|
616
|
-
|
617
|
-
|
618
|
-
|
619
|
-
|
620
|
-
|
621
|
-
|
622
|
-
|
623
|
-
|
624
|
-
|
625
|
-
|
626
|
-
|
627
|
-
|
628
|
-
|
629
|
-
|
630
|
-
|
631
|
-
|
632
|
-
|
633
|
-
|
634
|
-
|
635
|
-
|
636
|
-
|
637
|
-
|
638
|
-
|
639
|
-
|
640
|
-
|
641
|
-
|
642
|
-
|
643
|
-
|
644
|
-
|
645
|
-
|
646
|
-
|
647
|
-
|
648
|
-
|
649
|
-
|
650
|
-
|
651
|
-
|
652
|
-
|
653
|
-
|
654
|
-
|
655
|
-
|
656
|
-
|
657
|
-
|
658
|
-
|
659
|
-
|
660
|
-
|
661
|
-
|
662
|
-
|
663
|
-
|
664
|
-
|
665
|
-
|
666
|
-
|
667
|
-
|
668
|
-
|
669
|
-
|
670
|
-
|
671
|
-
|
672
|
-
|
673
|
-
|
674
|
-
|
675
|
-
|
676
|
-
|
677
|
-
|
678
|
-
|
679
|
-
|
680
|
-
|
681
|
-
|
682
|
-
|
683
|
-
|
684
|
-
|
685
|
-
|
686
|
-
|
687
|
-
|
688
|
-
|
689
|
-
|
690
|
-
|
691
|
-
|
692
|
-
|
693
|
-
|
694
|
-
|
695
|
-
|
696
|
-
|
697
|
-
|
698
|
-
|
699
|
-
|
700
|
-
|
701
|
-
|
702
|
-
|
703
|
-
|
704
|
-
|
705
|
-
|
706
|
-
|
707
|
-
|
708
|
-
|
709
|
-
|
710
|
-
|
711
|
-
|
712
|
-
|
713
|
-
|
714
|
-
|
715
|
-
|
716
|
-
|
717
|
-
|
718
|
-
|
719
|
-
|
720
|
-
|
721
|
-
|
722
|
-
}
|
723
|
-
}
|
544
|
+
char *start;
|
545
|
+
int first = 1;
|
546
|
+
|
547
|
+
done = 0;
|
548
|
+
/* read children */
|
549
|
+
while (!done) {
|
550
|
+
start = pi->s;
|
551
|
+
next_non_white(pi);
|
552
|
+
if (OffSkip == pi->options->skip && start < pi->s && '<' == *pi->s) {
|
553
|
+
c = *pi->s;
|
554
|
+
*pi->s = '\0';
|
555
|
+
pi->pcb->add_text(pi, start, 1);
|
556
|
+
*pi->s = c;
|
557
|
+
}
|
558
|
+
c = *pi->s++;
|
559
|
+
if ('\0' == c) {
|
560
|
+
attr_stack_cleanup(&attrs);
|
561
|
+
set_error(&pi->err, "invalid format, document not terminated", pi->str, pi->s);
|
562
|
+
return 0;
|
563
|
+
}
|
564
|
+
if ('<' == c) {
|
565
|
+
char *slash;
|
566
|
+
|
567
|
+
switch (*pi->s) {
|
568
|
+
case '!': /* better be a comment or CDATA */
|
569
|
+
pi->s++;
|
570
|
+
if ('-' == *pi->s && '-' == *(pi->s + 1)) {
|
571
|
+
pi->s += 2;
|
572
|
+
read_comment(pi);
|
573
|
+
} else if ((TolerantEffort == pi->options->effort) ? 0 == strncasecmp("[CDATA[", pi->s, 7)
|
574
|
+
: 0 == strncmp("[CDATA[", pi->s, 7)) {
|
575
|
+
pi->s += 7;
|
576
|
+
read_cdata(pi);
|
577
|
+
} else {
|
578
|
+
attr_stack_cleanup(&attrs);
|
579
|
+
set_error(&pi->err, "invalid format, invalid comment or CDATA format", pi->str, pi->s);
|
580
|
+
return 0;
|
581
|
+
}
|
582
|
+
break;
|
583
|
+
case '?': /* processing instruction */
|
584
|
+
pi->s++;
|
585
|
+
read_instruction(pi);
|
586
|
+
break;
|
587
|
+
case '/':
|
588
|
+
slash = pi->s;
|
589
|
+
pi->s++;
|
590
|
+
if (0 == (name = read_name_token(pi))) {
|
591
|
+
attr_stack_cleanup(&attrs);
|
592
|
+
return 0;
|
593
|
+
}
|
594
|
+
end = pi->s;
|
595
|
+
next_non_white(pi);
|
596
|
+
c = *pi->s;
|
597
|
+
*end = '\0';
|
598
|
+
if (0 !=
|
599
|
+
((TolerantEffort == pi->options->effort) ? strcasecmp(name, ename) : strcmp(name, ename))) {
|
600
|
+
attr_stack_cleanup(&attrs);
|
601
|
+
if (TolerantEffort == pi->options->effort) {
|
602
|
+
pi->pcb->end_element(pi, ename);
|
603
|
+
return name;
|
604
|
+
} else {
|
605
|
+
set_error(&pi->err, "invalid format, elements overlap", pi->str, pi->s);
|
606
|
+
return 0;
|
607
|
+
}
|
608
|
+
}
|
609
|
+
if ('>' != c) {
|
610
|
+
attr_stack_cleanup(&attrs);
|
611
|
+
set_error(&pi->err, "invalid format, element not closed", pi->str, pi->s);
|
612
|
+
return 0;
|
613
|
+
}
|
614
|
+
if (first && start != slash - 1) {
|
615
|
+
// Some white space between start and here so add as
|
616
|
+
// text after checking skip.
|
617
|
+
*(slash - 1) = '\0';
|
618
|
+
switch (pi->options->skip) {
|
619
|
+
case CrSkip: {
|
620
|
+
char *s = start;
|
621
|
+
char *e = start;
|
622
|
+
|
623
|
+
for (; '\0' != *e; e++) {
|
624
|
+
if ('\r' != *e) {
|
625
|
+
*s++ = *e;
|
626
|
+
}
|
627
|
+
}
|
628
|
+
*s = '\0';
|
629
|
+
break;
|
630
|
+
}
|
631
|
+
case SpcSkip: *start = '\0'; break;
|
632
|
+
case NoSkip:
|
633
|
+
case OffSkip:
|
634
|
+
default: break;
|
635
|
+
}
|
636
|
+
if ('\0' != *start) {
|
637
|
+
pi->pcb->add_text(pi, start, 1);
|
638
|
+
}
|
639
|
+
}
|
640
|
+
pi->s++;
|
641
|
+
pi->pcb->end_element(pi, ename);
|
642
|
+
attr_stack_cleanup(&attrs);
|
643
|
+
return 0;
|
644
|
+
case '\0':
|
645
|
+
attr_stack_cleanup(&attrs);
|
646
|
+
if (TolerantEffort == pi->options->effort) {
|
647
|
+
return 0;
|
648
|
+
} else {
|
649
|
+
set_error(&pi->err, "invalid format, document not terminated", pi->str, pi->s);
|
650
|
+
return 0;
|
651
|
+
}
|
652
|
+
default:
|
653
|
+
first = 0;
|
654
|
+
/* a child element */
|
655
|
+
// Child closed with mismatched name.
|
656
|
+
if (0 != (name = read_element(pi))) {
|
657
|
+
attr_stack_cleanup(&attrs);
|
658
|
+
|
659
|
+
if (0 ==
|
660
|
+
((TolerantEffort == pi->options->effort) ? strcasecmp(name, ename) : strcmp(name, ename))) {
|
661
|
+
pi->s++;
|
662
|
+
pi->pcb->end_element(pi, ename);
|
663
|
+
return 0;
|
664
|
+
} else { // not the correct element yet
|
665
|
+
pi->pcb->end_element(pi, ename);
|
666
|
+
return name;
|
667
|
+
}
|
668
|
+
} else if (err_has(&pi->err)) {
|
669
|
+
return 0;
|
670
|
+
}
|
671
|
+
break;
|
672
|
+
}
|
673
|
+
} else { /* read as TEXT */
|
674
|
+
pi->s = start;
|
675
|
+
/*pi->s--; */
|
676
|
+
read_text(pi);
|
677
|
+
/*read_reduced_text(pi); */
|
678
|
+
|
679
|
+
/* to exit read_text with no errors the next character must be < */
|
680
|
+
if ('/' == *(pi->s + 1) &&
|
681
|
+
0 == ((TolerantEffort == pi->options->effort) ? strncasecmp(ename, pi->s + 2, elen)
|
682
|
+
: strncmp(ename, pi->s + 2, elen)) &&
|
683
|
+
'>' == *(pi->s + elen + 2)) {
|
684
|
+
/* close tag after text so treat as a value */
|
685
|
+
pi->s += elen + 3;
|
686
|
+
pi->pcb->end_element(pi, ename);
|
687
|
+
attr_stack_cleanup(&attrs);
|
688
|
+
return 0;
|
689
|
+
}
|
690
|
+
}
|
691
|
+
}
|
724
692
|
}
|
725
693
|
attr_stack_cleanup(&attrs);
|
726
694
|
return 0;
|
727
695
|
}
|
728
696
|
|
729
|
-
static void
|
730
|
-
|
731
|
-
char
|
732
|
-
char
|
733
|
-
char
|
734
|
-
char
|
735
|
-
|
736
|
-
int done = 0;
|
697
|
+
static void read_text(PInfo pi) {
|
698
|
+
char buf[MAX_TEXT_LEN];
|
699
|
+
char *b = buf;
|
700
|
+
char *alloc_buf = 0;
|
701
|
+
char *end = b + sizeof(buf) - 2;
|
702
|
+
char c;
|
703
|
+
int done = 0;
|
737
704
|
|
738
705
|
while (!done) {
|
739
|
-
|
740
|
-
|
741
|
-
|
742
|
-
|
743
|
-
|
744
|
-
|
745
|
-
|
746
|
-
|
747
|
-
|
748
|
-
|
749
|
-
|
750
|
-
|
751
|
-
|
752
|
-
|
753
|
-
|
754
|
-
|
755
|
-
|
756
|
-
|
757
|
-
|
758
|
-
|
759
|
-
|
760
|
-
|
761
|
-
|
762
|
-
|
763
|
-
|
764
|
-
|
765
|
-
|
766
|
-
|
767
|
-
|
768
|
-
|
769
|
-
|
770
|
-
|
771
|
-
|
772
|
-
|
773
|
-
|
774
|
-
|
775
|
-
|
776
|
-
|
777
|
-
|
778
|
-
|
779
|
-
|
780
|
-
|
781
|
-
|
782
|
-
|
783
|
-
|
784
|
-
|
785
|
-
|
786
|
-
|
787
|
-
|
788
|
-
|
789
|
-
|
790
|
-
|
791
|
-
|
792
|
-
|
793
|
-
|
794
|
-
|
795
|
-
|
796
|
-
|
797
|
-
|
798
|
-
|
799
|
-
|
800
|
-
|
801
|
-
}
|
802
|
-
}
|
803
|
-
break;
|
804
|
-
}
|
706
|
+
c = *pi->s++;
|
707
|
+
switch (c) {
|
708
|
+
case '<':
|
709
|
+
done = 1;
|
710
|
+
pi->s--;
|
711
|
+
break;
|
712
|
+
case '\0': set_error(&pi->err, "invalid format, document not terminated", pi->str, pi->s); return;
|
713
|
+
default:
|
714
|
+
if (end <= (b + (('&' == c) ? 7 : 0))) { /* extra 8 for special just in case it is sequence of bytes */
|
715
|
+
unsigned long size;
|
716
|
+
|
717
|
+
if (0 == alloc_buf) {
|
718
|
+
size = sizeof(buf) * 2;
|
719
|
+
alloc_buf = ALLOC_N(char, size);
|
720
|
+
memcpy(alloc_buf, buf, b - buf);
|
721
|
+
b = alloc_buf + (b - buf);
|
722
|
+
} else {
|
723
|
+
unsigned long pos = b - alloc_buf;
|
724
|
+
|
725
|
+
size = (end - alloc_buf) * 2;
|
726
|
+
REALLOC_N(alloc_buf, char, size);
|
727
|
+
b = alloc_buf + pos;
|
728
|
+
}
|
729
|
+
end = alloc_buf + size - 2;
|
730
|
+
}
|
731
|
+
if ('&' == c) {
|
732
|
+
if (0 == (b = read_coded_chars(pi, b))) {
|
733
|
+
return;
|
734
|
+
}
|
735
|
+
} else {
|
736
|
+
if (0 <= c && c <= 0x20) {
|
737
|
+
if (StrictEffort == pi->options->effort && 'x' == xml_valid_lower_chars[(unsigned char)c]) {
|
738
|
+
set_error(&pi->err, "invalid character", pi->str, pi->s);
|
739
|
+
return;
|
740
|
+
}
|
741
|
+
switch (pi->options->skip) {
|
742
|
+
case CrSkip:
|
743
|
+
if (buf != b && '\n' == c && '\r' == *(b - 1)) {
|
744
|
+
*(b - 1) = '\n';
|
745
|
+
} else {
|
746
|
+
*b++ = c;
|
747
|
+
}
|
748
|
+
break;
|
749
|
+
case SpcSkip:
|
750
|
+
if (is_white(c)) {
|
751
|
+
if (buf == b || ' ' != *(b - 1)) {
|
752
|
+
*b++ = ' ';
|
753
|
+
}
|
754
|
+
} else {
|
755
|
+
*b++ = c;
|
756
|
+
}
|
757
|
+
break;
|
758
|
+
case NoSkip:
|
759
|
+
case OffSkip:
|
760
|
+
default: *b++ = c; break;
|
761
|
+
}
|
762
|
+
} else {
|
763
|
+
*b++ = c;
|
764
|
+
}
|
765
|
+
}
|
766
|
+
break;
|
767
|
+
}
|
805
768
|
}
|
806
769
|
*b = '\0';
|
807
770
|
if (0 != alloc_buf) {
|
808
|
-
|
809
|
-
|
810
|
-
|
771
|
+
fix_newlines(alloc_buf);
|
772
|
+
pi->pcb->add_text(pi, alloc_buf, ('/' == *(pi->s + 1)));
|
773
|
+
xfree(alloc_buf);
|
811
774
|
} else {
|
812
|
-
|
813
|
-
|
775
|
+
fix_newlines(buf);
|
776
|
+
pi->pcb->add_text(pi, buf, ('/' == *(pi->s + 1)));
|
814
777
|
}
|
815
778
|
}
|
816
779
|
|
@@ -886,323 +849,322 @@ read_reduced_text(PInfo pi) {
|
|
886
849
|
}
|
887
850
|
#endif
|
888
851
|
|
889
|
-
static char*
|
890
|
-
|
891
|
-
char *start;
|
852
|
+
/* Scan one name token (element, attribute or instruction name).
 *
 * next_non_white() is called first (presumably to skip leading white space),
 * then pi->s is advanced until a delimiter is found: white space, '?', '=',
 * '/' or '>'. On return pi->s is left ON the delimiter; the token is not NUL
 * terminated here.
 *
 * Namespace handling, driven by pi->options->strip_ns:
 *   ""     - prefixes are kept,
 *   "*"    - every "prefix:" is dropped,
 *   other  - "prefix:" is dropped only when prefix equals strip_ns exactly.
 *
 * Returns a pointer to the first character of the (possibly stripped) name,
 * or 0 after recording an error when the document ends inside the token.
 */
static char *read_name_token(PInfo pi) {
    char *start;

    next_non_white(pi);
    start = pi->s;
    for (;; pi->s++) {
        switch (*pi->s) {
        case ' ':
        case '\t':
        case '\f':
        case '?':
        case '=':
        case '/':
        case '>':
        case '\n':
        case '\r': return start;
        case '\0':
            /* documents never terminate after a name token */
            set_error(&pi->err, "invalid format, document not terminated", pi->str, pi->s);
            return 0;
        case ':': {
            const char *ns  = pi->options->strip_ns;
            size_t      len = (size_t)(pi->s - start);

            if ('\0' == *ns) {
                break; /* namespace stripping is off */
            }
            /* strncmp() alone only proves the prefix matches the first len
             * bytes of strip_ns; the extra ns[len] check rejects prefixes that
             * are merely a leading part of strip_ns (or empty). ns[len] is
             * safe to read because the first len bytes of ns matched non-NUL
             * characters.
             */
            if (('*' == ns[0] && '\0' == ns[1]) || (0 == strncmp(ns, start, len) && '\0' == ns[len])) {
                start = pi->s + 1;
            }
            break;
        }
        default: break;
        }
    }
}
|
927
887
|
|
928
|
-
static void
|
929
|
-
|
930
|
-
char
|
931
|
-
char *end;
|
888
|
+
/* Consume a CDATA section body.
 *
 * Everything from pi->s up to the closing "]]>" is treated as the data
 * (NOTE(review): assumes the caller already consumed the CDATA opener -
 * confirm at the call site). The terminator is overwritten with a NUL so the
 * data can be handed out in place, pi->s is moved past "]]>", and the
 * add_cdata callback, when one is installed, receives the text after
 * fix_newlines() has been applied to it. An error is recorded and nothing is
 * consumed when no terminator exists.
 */
static void read_cdata(PInfo pi) {
    char *const text = pi->s;
    char *const term = strstr(text, "]]>");

    if (NULL == term) {
        set_error(&pi->err, "invalid format, CDATA not terminated", pi->str, pi->s);
        return;
    }
    *term = '\0';
    pi->s = term + 3; /* step over "]]>" */
    if (NULL == pi->pcb->add_cdata) {
        return; /* nobody is interested in the data */
    }
    fix_newlines(text);
    /* length is measured before any change fix_newlines() may have made */
    pi->pcb->add_cdata(pi, text, term - text);
}
|
946
905
|
|
947
906
|
/* Assume the value starts immediately and goes until the quote character is
|
948
907
|
* reached again. Do not read the character after the terminating quote.
|
949
908
|
*/
|
950
|
-
static char*
|
951
|
-
|
952
|
-
char *value = 0;
|
909
|
+
/* Read an attribute value in place and NUL terminate it.
 *
 * Three input shapes are handled:
 *   - quoted with ' or ": the value runs to the matching quote,
 *   - unquoted, TolerantEffort: the value runs to white space, '/', '>' or
 *     '?'; the delimiter that was overwritten is remembered in pi->last,
 *   - unquoted, any other non-strict effort: the value runs to wherever
 *     next_white() stops.
 * StrictEffort rejects an unquoted value outright.
 *
 * Returns the start of the value with pi->s one past the overwritten
 * delimiter, or 0 after recording an error.
 */
static char *read_quoted_value(PInfo pi) {
    char *value;

    if ('"' == *pi->s || '\'' == *pi->s) {
        const char quote = *pi->s;

        value = ++pi->s; /* value begins right after the opening quote */
        while (quote != *pi->s) {
            if ('\0' == *pi->s) {
                set_error(&pi->err, "invalid format, document not terminated", pi->str, pi->s);
                return 0;
            }
            pi->s++;
        }
        *pi->s++ = '\0'; /* closing quote becomes the terminator */
        return value;
    }
    if (StrictEffort == pi->options->effort) {
        set_error(&pi->err, "invalid format, expected a quote character", pi->str, pi->s);
        return 0;
    }
    value = pi->s;
    if (TolerantEffort != pi->options->effort) {
        next_white(pi);
        if ('\0' == *pi->s) {
            set_error(&pi->err, "invalid format, document not terminated", pi->str, pi->s);
            return 0;
        }
        *pi->s++ = '\0'; /* terminate value */
        return value;
    }
    /* tolerant: accept a bare value and stop at the first plausible delimiter */
    for (;; pi->s++) {
        const char c = *pi->s;

        if ('\0' == c) {
            set_error(&pi->err, "invalid format, document not terminated", pi->str, pi->s);
            return 0;
        }
        /* '?' is included so values inside instructions end correctly */
        if (' ' == c || '/' == c || '>' == c || '?' == c || '\t' == c || '\n' == c || '\r' == c) {
            pi->last = c; /* keep the delimiter that is about to be overwritten */
            *pi->s++ = '\0';
            return value;
        }
    }
}
|
1003
958
|
|
1004
|
-
static char*
|
1005
|
-
|
1006
|
-
|
1007
|
-
char c;
|
959
|
+
/* Parse a run of hexadecimal digits terminated by ';'.
 *
 * b  - first character after the "&#x" / "&#X" introducer.
 * up - receives the value; written only on success.
 *
 * Returns a pointer to the terminating ';', or 0 when the run is empty,
 * contains a non-hex character (including the NUL at the end of the string),
 * or does not fit in 64 bits.
 */
static char *read_hex_uint64(char *b, uint64_t *up) {
    const char *first = b;
    uint64_t    u     = 0;

    for (; ';' != *b; b++) {
        const char c = *b;
        uint64_t   digit;

        if ('0' <= c && c <= '9') {
            digit = (uint64_t)(c - '0');
        } else if ('a' <= c && c <= 'f') {
            digit = (uint64_t)(c - 'a' + 10);
        } else if ('A' <= c && c <= 'F') {
            digit = (uint64_t)(c - 'A' + 10);
        } else {
            return 0;
        }
        if (0 != (u >> 60)) {
            /* the top nibble is occupied, another shift would drop bits */
            return 0;
        }
        u = (u << 4) | digit;
    }
    if (first == b) {
        /* "&#x;" carries no digits and is not a character reference */
        return 0;
    }
    *up = u;

    return b;
}
|
1025
979
|
|
1026
|
-
static char*
|
1027
|
-
|
1028
|
-
|
1029
|
-
char c;
|
980
|
+
/* Parse a run of decimal digits terminated by ';'.
 *
 * b  - first character after the "&#" introducer.
 * up - receives the value; written only on success.
 *
 * Returns a pointer to the terminating ';', or 0 when the run is empty,
 * contains a non-digit (including the NUL at the end of the string), or does
 * not fit in 64 bits.
 */
static char *read_10_uint64(char *b, uint64_t *up) {
    const uint64_t max   = ~(uint64_t)0;
    const char    *first = b;
    uint64_t       u     = 0;

    for (; ';' != *b; b++) {
        const char c = *b;
        uint64_t   digit;

        if (c < '0' || '9' < c) {
            return 0;
        }
        digit = (uint64_t)(c - '0');
        if (u > (max - digit) / 10) {
            /* u * 10 + digit would wrap around */
            return 0;
        }
        u = (u * 10) + digit;
    }
    if (first == b) {
        /* "&#;" carries no digits and is not a character reference */
        return 0;
    }
    *up = u;

    return b;
}
|
1043
996
|
|
1044
|
-
static char*
|
1045
|
-
|
1046
|
-
char
|
1047
|
-
char
|
1048
|
-
|
1049
|
-
long blen = 0;
|
997
|
+
/* Decode one "&...;" sequence found at pi->s and append the result at text.
 *
 * NOTE(review): pi->s is expected to point just past the '&' - confirm at the
 * call sites.
 *
 * Up to sizeof(buf) - 1 characters are copied while looking for the closing
 * ';'. Three outcomes are possible:
 *   - decoded: the character(s) are written at text and pi->s is moved past
 *     the ';',
 *   - not decodable (no ';' in range, malformed number, unknown entity name,
 *     or a non UTF-8 encoding with TolerantEffort): a literal '&' is written
 *     and pi->s is left untouched so the caller copies the remaining
 *     characters as plain text,
 *   - a code point above 0xFF with a non UTF-8 encoding: an error is recorded
 *     and NULL is returned.
 *
 * When no encoding has been chosen yet, a code point above 0x7F switches
 * pi->options->rb_enc to UTF-8.
 *
 * Returns the new write position in the text buffer, or NULL on error.
 */
static char *read_coded_chars(PInfo pi, char *text) {
    char  buf[32];
    char *b;
    char *end  = buf + sizeof(buf) - 1;
    char *s    = pi->s;
    char *semi = NULL; /* position of the copied ';' inside buf, if any */

    /* Stop at the input's NUL as well so the source is never over-read. */
    for (b = buf; b < end && '\0' != *s; b++, s++) {
        *b = *s;
        if (';' == *s) {
            *(b + 1) = '\0';
            semi     = b;
            s++;
            break;
        }
    }
    if (NULL == semi) {
        /* no terminator in range, buf is not a complete sequence */
        *text++ = '&';
    } else if ('#' == *buf) {
        uint64_t u = 0;

        b = buf + 1;
        if ('x' == *b || 'X' == *b) {
            b = read_hex_uint64(b + 1, &u);
        } else {
            b = read_10_uint64(b, &u);
        }
        if (0 == b) {
            *text++ = '&';
        } else {
            if (u <= 0x000000000000007FULL) {
                *text++ = (char)u;
            } else if (ox_utf8_encoding == pi->options->rb_enc) {
                text = ox_ucs_to_utf8_chars(text, u);
            } else if (0 == pi->options->rb_enc) {
                pi->options->rb_enc = ox_utf8_encoding;
                text                = ox_ucs_to_utf8_chars(text, u);
            } else if (TolerantEffort == pi->options->effort) {
                *text++ = '&';
                return text;
            } else if (u <= 0x00000000000000FFULL) {
                *text++ = (char)u;
            } else {
                set_error(&pi->err,
                          "Invalid encoding, need UTF-8 encoding to parse &#nnnn; character sequences.",
                          pi->str,
                          pi->s);
                return NULL;
            }
            pi->s = s;
        }
    } else {
        char *t2;

        *semi = '\0'; /* drop the ';' so buf holds just the entity name */
        if (NULL == (t2 = ox_entity_lookup(text, buf))) {
            *text++ = '&';
        } else {
            text  = t2;
            pi->s = s;
        }
    }
    return text;
}
|
1106
1062
|
|
1107
|
-
static int
|
1108
|
-
|
1109
|
-
char
|
1110
|
-
char *b = str;
|
1063
|
+
static int collapse_special(PInfo pi, char *str) {
|
1064
|
+
char *s = str;
|
1065
|
+
char *b = str;
|
1111
1066
|
|
1112
1067
|
while ('\0' != *s) {
|
1113
|
-
|
1114
|
-
|
1115
|
-
|
1116
|
-
|
1117
|
-
|
1118
|
-
|
1119
|
-
|
1120
|
-
|
1121
|
-
|
1122
|
-
|
1123
|
-
|
1124
|
-
|
1125
|
-
|
1126
|
-
|
1127
|
-
|
1128
|
-
|
1129
|
-
|
1130
|
-
|
1131
|
-
|
1132
|
-
|
1133
|
-
|
1134
|
-
|
1135
|
-
|
1136
|
-
|
1137
|
-
|
1138
|
-
|
1139
|
-
|
1140
|
-
|
1141
|
-
|
1142
|
-
|
1143
|
-
|
1144
|
-
|
1145
|
-
|
1146
|
-
|
1147
|
-
|
1148
|
-
|
1149
|
-
|
1150
|
-
|
1151
|
-
|
1152
|
-
|
1153
|
-
|
1154
|
-
|
1155
|
-
|
1156
|
-
|
1157
|
-
|
1158
|
-
|
1159
|
-
|
1160
|
-
|
1161
|
-
|
1162
|
-
|
1163
|
-
|
1164
|
-
|
1165
|
-
|
1166
|
-
|
1167
|
-
|
1168
|
-
|
1169
|
-
|
1170
|
-
|
1171
|
-
|
1172
|
-
|
1173
|
-
|
1174
|
-
|
1175
|
-
|
1176
|
-
|
1177
|
-
|
1178
|
-
|
1179
|
-
|
1180
|
-
|
1181
|
-
|
1182
|
-
|
1183
|
-
|
1184
|
-
|
1185
|
-
|
1186
|
-
|
1187
|
-
|
1188
|
-
|
1189
|
-
|
1190
|
-
|
1191
|
-
|
1192
|
-
|
1193
|
-
|
1194
|
-
|
1195
|
-
|
1196
|
-
|
1197
|
-
|
1198
|
-
|
1199
|
-
|
1200
|
-
|
1201
|
-
|
1202
|
-
|
1203
|
-
|
1204
|
-
|
1205
|
-
|
1068
|
+
if ('&' == *s) {
|
1069
|
+
int c;
|
1070
|
+
char *end;
|
1071
|
+
|
1072
|
+
s++;
|
1073
|
+
if ('#' == *s) {
|
1074
|
+
uint64_t u = 0;
|
1075
|
+
char x;
|
1076
|
+
|
1077
|
+
s++;
|
1078
|
+
if ('x' == *s || 'X' == *s) {
|
1079
|
+
x = *s;
|
1080
|
+
s++;
|
1081
|
+
end = read_hex_uint64(s, &u);
|
1082
|
+
} else {
|
1083
|
+
x = '\0';
|
1084
|
+
end = read_10_uint64(s, &u);
|
1085
|
+
}
|
1086
|
+
if (0 == end) {
|
1087
|
+
if (TolerantEffort == pi->options->effort) {
|
1088
|
+
*b++ = '&';
|
1089
|
+
*b++ = '#';
|
1090
|
+
if ('\0' != x) {
|
1091
|
+
*b++ = x;
|
1092
|
+
}
|
1093
|
+
continue;
|
1094
|
+
}
|
1095
|
+
return EDOM;
|
1096
|
+
}
|
1097
|
+
if (u <= 0x000000000000007FULL) {
|
1098
|
+
*b++ = (char)u;
|
1099
|
+
} else if (ox_utf8_encoding == pi->options->rb_enc) {
|
1100
|
+
b = ox_ucs_to_utf8_chars(b, u);
|
1101
|
+
/* TBD support UTF-16 */
|
1102
|
+
} else if (0 == pi->options->rb_enc) {
|
1103
|
+
pi->options->rb_enc = ox_utf8_encoding;
|
1104
|
+
b = ox_ucs_to_utf8_chars(b, u);
|
1105
|
+
} else {
|
1106
|
+
/* set_error(&pi->err, "Invalid encoding, need UTF-8 or UTF-16 encoding to parse &#nnnn; character
|
1107
|
+
* sequences.", pi->str, pi->s);*/
|
1108
|
+
set_error(&pi->err,
|
1109
|
+
"Invalid encoding, need UTF-8 encoding to parse &#nnnn; character sequences.",
|
1110
|
+
pi->str,
|
1111
|
+
pi->s);
|
1112
|
+
return 0;
|
1113
|
+
}
|
1114
|
+
s = end + 1;
|
1115
|
+
} else {
|
1116
|
+
if (0 == strncasecmp(s, "lt;", 3)) {
|
1117
|
+
c = '<';
|
1118
|
+
s += 3;
|
1119
|
+
} else if (0 == strncasecmp(s, "gt;", 3)) {
|
1120
|
+
c = '>';
|
1121
|
+
s += 3;
|
1122
|
+
} else if (0 == strncasecmp(s, "amp;", 4)) {
|
1123
|
+
c = '&';
|
1124
|
+
s += 4;
|
1125
|
+
} else if (0 == strncasecmp(s, "quot;", 5)) {
|
1126
|
+
c = '"';
|
1127
|
+
s += 5;
|
1128
|
+
} else if (0 == strncasecmp(s, "apos;", 5)) {
|
1129
|
+
c = '\'';
|
1130
|
+
s += 5;
|
1131
|
+
} else if (TolerantEffort == pi->options->effort) {
|
1132
|
+
*b++ = '&';
|
1133
|
+
continue;
|
1134
|
+
} else {
|
1135
|
+
char key[16];
|
1136
|
+
char *k = key;
|
1137
|
+
char *kend = key + sizeof(key) - 1;
|
1138
|
+
|
1139
|
+
*k++ = *s;
|
1140
|
+
while (';' != *s++) {
|
1141
|
+
if ('\0' == *s) {
|
1142
|
+
set_error(&pi->err,
|
1143
|
+
"Invalid format, special character does not end with a semicolon",
|
1144
|
+
pi->str,
|
1145
|
+
pi->s);
|
1146
|
+
return EDOM;
|
1147
|
+
}
|
1148
|
+
if (kend <= k) {
|
1149
|
+
k = key;
|
1150
|
+
break;
|
1151
|
+
}
|
1152
|
+
*k++ = *s;
|
1153
|
+
}
|
1154
|
+
k--;
|
1155
|
+
*k = '\0';
|
1156
|
+
if ('\0' == *key || NULL == (b = ox_entity_lookup(b, key))) {
|
1157
|
+
set_error(&pi->err, "Invalid format, invalid special character sequence", pi->str, pi->s);
|
1158
|
+
c = '?';
|
1159
|
+
return 0;
|
1160
|
+
}
|
1161
|
+
continue;
|
1162
|
+
}
|
1163
|
+
*b++ = (char)c;
|
1164
|
+
}
|
1165
|
+
} else {
|
1166
|
+
*b++ = *s++;
|
1167
|
+
}
|
1206
1168
|
}
|
1207
1169
|
*b = '\0';
|
1208
1170
|
|