ox 2.14.14 → 2.14.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/README.md +1 -1
- data/ext/ox/attr.h +33 -39
- data/ext/ox/base64.c +48 -42
- data/ext/ox/base64.h +4 -4
- data/ext/ox/buf.h +80 -86
- data/ext/ox/builder.c +378 -423
- data/ext/ox/cache.c +2 -2
- data/ext/ox/cache8.c +37 -40
- data/ext/ox/cache8.h +7 -7
- data/ext/ox/dump.c +838 -867
- data/ext/ox/err.c +16 -13
- data/ext/ox/err.h +11 -12
- data/ext/ox/extconf.rb +5 -5
- data/ext/ox/gen_load.c +135 -137
- data/ext/ox/hash_load.c +130 -148
- data/ext/ox/helper.h +32 -39
- data/ext/ox/intern.c +1 -2
- data/ext/ox/obj_load.c +590 -644
- data/ext/ox/ox.c +2 -2
- data/ext/ox/ox.h +5 -5
- data/ext/ox/parse.c +836 -874
- data/ext/ox/sax.c +38 -23
- data/ext/ox/sax.h +2 -2
- data/ext/ox/sax_as.c +78 -94
- data/ext/ox/sax_buf.c +85 -94
- data/ext/ox/sax_buf.h +101 -120
- data/ext/ox/sax_hint.c +175 -184
- data/ext/ox/sax_hint.h +19 -19
- data/ext/ox/sax_stack.h +59 -45
- data/ext/ox/slotcache.c +2 -2
- data/ext/ox/slotcache.h +4 -4
- data/ext/ox/special.c +320 -327
- data/ext/ox/special.h +2 -2
- data/ext/ox/type.h +19 -19
- data/lib/ox/bag.rb +13 -9
- data/lib/ox/cdata.rb +0 -2
- data/lib/ox/comment.rb +0 -2
- data/lib/ox/doctype.rb +0 -2
- data/lib/ox/document.rb +3 -5
- data/lib/ox/element.rb +41 -26
- data/lib/ox/error.rb +0 -3
- data/lib/ox/hasattrs.rb +7 -8
- data/lib/ox/instruct.rb +4 -6
- data/lib/ox/node.rb +3 -4
- data/lib/ox/raw.rb +0 -2
- data/lib/ox/sax.rb +20 -36
- data/lib/ox/version.rb +1 -2
- data/lib/ox/xmlrpc_adapter.rb +4 -6
- data/lib/ox.rb +15 -16
- metadata +6 -5
data/ext/ox/parse.c
CHANGED
@@ -3,35 +3,35 @@
|
|
3
3
|
* All rights reserved.
|
4
4
|
*/
|
5
5
|
|
6
|
-
#include <stdlib.h>
|
7
6
|
#include <errno.h>
|
8
7
|
#include <stdbool.h>
|
9
8
|
#include <stdio.h>
|
9
|
+
#include <stdlib.h>
|
10
10
|
#include <string.h>
|
11
11
|
#include <strings.h>
|
12
12
|
|
13
|
-
#include "ruby.h"
|
14
|
-
#include "ox.h"
|
15
|
-
#include "err.h"
|
16
13
|
#include "attr.h"
|
17
|
-
#include "
|
14
|
+
#include "err.h"
|
18
15
|
#include "helper.h"
|
16
|
+
#include "intern.h"
|
17
|
+
#include "ox.h"
|
18
|
+
#include "ruby.h"
|
19
19
|
#include "special.h"
|
20
20
|
|
21
|
-
static void
|
22
|
-
static void
|
23
|
-
static void
|
24
|
-
static char*
|
25
|
-
static void
|
21
|
+
static void read_instruction(PInfo pi);
|
22
|
+
static void read_doctype(PInfo pi);
|
23
|
+
static void read_comment(PInfo pi);
|
24
|
+
static char *read_element(PInfo pi);
|
25
|
+
static void read_text(PInfo pi);
|
26
26
|
/*static void read_reduced_text(PInfo pi); */
|
27
|
-
static void
|
28
|
-
static char*
|
29
|
-
static char*
|
30
|
-
static char*
|
31
|
-
static char*
|
32
|
-
static char*
|
33
|
-
static void
|
34
|
-
static int
|
27
|
+
static void read_cdata(PInfo pi);
|
28
|
+
static char *read_name_token(PInfo pi);
|
29
|
+
static char *read_quoted_value(PInfo pi);
|
30
|
+
static char *read_hex_uint64(char *b, uint64_t *up);
|
31
|
+
static char *read_10_uint64(char *b, uint64_t *up);
|
32
|
+
static char *read_coded_chars(PInfo pi, char *text);
|
33
|
+
static void next_non_white(PInfo pi);
|
34
|
+
static int collapse_special(PInfo pi, char *str);
|
35
35
|
|
36
36
|
/* This XML parser is a single pass, destructive, callback parser. It is a
|
37
37
|
* single pass parse since it only make one pass over the characters in the
|
@@ -46,53 +46,43 @@ static int collapse_special(PInfo pi, char *str);
|
|
46
46
|
* all cases to parse the string.
|
47
47
|
*/
|
48
48
|
|
49
|
-
static char
|
49
|
+
static char xml_valid_lower_chars[34] = "xxxxxxxxxooxxoxxxxxxxxxxxxxxxxxxo";
|
50
50
|
|
51
|
-
inline static int
|
52
|
-
is_white(char c) {
|
51
|
+
inline static int is_white(char c) {
|
53
52
|
switch (c) {
|
54
53
|
case ' ':
|
55
54
|
case '\t':
|
56
55
|
case '\f':
|
57
56
|
case '\n':
|
58
|
-
case '\r':
|
59
|
-
|
60
|
-
default:
|
61
|
-
return 0;
|
57
|
+
case '\r': return 1;
|
58
|
+
default: return 0;
|
62
59
|
}
|
63
60
|
}
|
64
61
|
|
65
|
-
|
66
|
-
inline static void
|
67
|
-
next_non_white(PInfo pi) {
|
62
|
+
inline static void next_non_white(PInfo pi) {
|
68
63
|
for (; 1; pi->s++) {
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
return;
|
78
|
-
}
|
64
|
+
switch (*pi->s) {
|
65
|
+
case ' ':
|
66
|
+
case '\t':
|
67
|
+
case '\f':
|
68
|
+
case '\n':
|
69
|
+
case '\r': break;
|
70
|
+
default: return;
|
71
|
+
}
|
79
72
|
}
|
80
73
|
}
|
81
74
|
|
82
|
-
inline static void
|
83
|
-
next_white(PInfo pi) {
|
75
|
+
inline static void next_white(PInfo pi) {
|
84
76
|
for (; 1; pi->s++) {
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
break;
|
95
|
-
}
|
77
|
+
switch (*pi->s) {
|
78
|
+
case ' ':
|
79
|
+
case '\t':
|
80
|
+
case '\f':
|
81
|
+
case '\n':
|
82
|
+
case '\r':
|
83
|
+
case '\0': return;
|
84
|
+
default: break;
|
85
|
+
}
|
96
86
|
}
|
97
87
|
}
|
98
88
|
|
@@ -100,53 +90,52 @@ static void fix_newlines(char *buf) {
|
|
100
90
|
#if HAVE_INDEX
|
101
91
|
if (NULL != index(buf, '\r')) {
|
102
92
|
#endif
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
93
|
+
char *s = buf;
|
94
|
+
char *d = buf;
|
95
|
+
|
96
|
+
for (; '\0' != *s; s++) {
|
97
|
+
if ('\r' == *s) {
|
98
|
+
if ('\n' == *(s + 1)) {
|
99
|
+
continue;
|
100
|
+
}
|
101
|
+
*d = '\n';
|
102
|
+
} else if (d < s) {
|
103
|
+
*d = *s;
|
104
|
+
}
|
105
|
+
d++;
|
106
|
+
}
|
107
|
+
*d = '\0';
|
118
108
|
#if HAVE_INDEX
|
119
109
|
}
|
120
110
|
#endif
|
121
111
|
}
|
122
112
|
|
123
|
-
static void
|
124
|
-
mark_pi_cb(void *ptr) {
|
113
|
+
static void mark_pi_cb(void *ptr) {
|
125
114
|
if (NULL != ptr) {
|
126
|
-
|
127
|
-
|
115
|
+
HelperStack stack = &((PInfo)ptr)->helpers;
|
116
|
+
Helper h;
|
128
117
|
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
118
|
+
for (h = stack->head; h < stack->tail; h++) {
|
119
|
+
if (NoCode != h->type) {
|
120
|
+
rb_gc_mark(h->obj);
|
121
|
+
}
|
122
|
+
}
|
134
123
|
}
|
135
124
|
}
|
136
125
|
|
137
126
|
VALUE
|
138
127
|
ox_parse(char *xml, size_t len, ParseCallbacks pcb, char **endp, Options options, Err err) {
|
139
|
-
struct _pInfo
|
140
|
-
int
|
141
|
-
int
|
142
|
-
volatile VALUE
|
128
|
+
struct _pInfo pi;
|
129
|
+
int body_read = 0;
|
130
|
+
int block_given = rb_block_given_p();
|
131
|
+
volatile VALUE wrap;
|
143
132
|
|
144
133
|
if (0 == xml) {
|
145
|
-
|
146
|
-
|
134
|
+
set_error(err, "Invalid arg, xml string can not be null", xml, 0);
|
135
|
+
return Qnil;
|
147
136
|
}
|
148
137
|
if (DEBUG <= options->trace) {
|
149
|
-
|
138
|
+
printf("Parsing xml:\n%s\n", xml);
|
150
139
|
}
|
151
140
|
// initialize parse info
|
152
141
|
helper_stack_init(&pi.helpers);
|
@@ -154,663 +143,637 @@ ox_parse(char *xml, size_t len, ParseCallbacks pcb, char **endp, Options options
|
|
154
143
|
wrap = Data_Wrap_Struct(rb_cObject, mark_pi_cb, NULL, &pi);
|
155
144
|
|
156
145
|
err_init(&pi.err);
|
157
|
-
pi.str
|
158
|
-
pi.end
|
159
|
-
pi.s
|
160
|
-
pi.pcb
|
161
|
-
pi.obj
|
146
|
+
pi.str = xml;
|
147
|
+
pi.end = pi.str + len;
|
148
|
+
pi.s = xml;
|
149
|
+
pi.pcb = pcb;
|
150
|
+
pi.obj = Qnil;
|
162
151
|
pi.circ_array = 0;
|
163
|
-
pi.options
|
164
|
-
pi.marked
|
165
|
-
pi.mark_size
|
166
|
-
pi.mark_cnt
|
152
|
+
pi.options = options;
|
153
|
+
pi.marked = NULL;
|
154
|
+
pi.mark_size = 0;
|
155
|
+
pi.mark_cnt = 0;
|
167
156
|
while (1) {
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
157
|
+
next_non_white(&pi); // skip white space
|
158
|
+
if ('\0' == *pi.s) {
|
159
|
+
break;
|
160
|
+
}
|
161
|
+
if (body_read && 0 != endp) {
|
162
|
+
*endp = pi.s;
|
163
|
+
break;
|
164
|
+
}
|
165
|
+
if ('<' != *pi.s) { // all top level entities start with <
|
166
|
+
set_error(err, "invalid format, expected <", pi.str, pi.s);
|
167
|
+
helper_stack_cleanup(&pi.helpers);
|
168
|
+
return Qnil;
|
169
|
+
}
|
170
|
+
pi.s++; // past <
|
171
|
+
switch (*pi.s) {
|
172
|
+
case '?': // processing instruction
|
173
|
+
pi.s++;
|
174
|
+
read_instruction(&pi);
|
175
|
+
break;
|
176
|
+
case '!': // comment or doctype
|
177
|
+
pi.s++;
|
178
|
+
if ('\0' == *pi.s) {
|
179
|
+
set_error(err, "invalid format, DOCTYPE or comment not terminated", pi.str, pi.s);
|
180
|
+
helper_stack_cleanup(&pi.helpers);
|
181
|
+
return Qnil;
|
182
|
+
} else if ('-' == *pi.s) {
|
183
|
+
pi.s++; // skip -
|
184
|
+
if ('-' != *pi.s) {
|
185
|
+
set_error(err, "invalid format, bad comment format", pi.str, pi.s);
|
186
|
+
helper_stack_cleanup(&pi.helpers);
|
187
|
+
return Qnil;
|
188
|
+
} else {
|
189
|
+
pi.s++; // skip second -
|
190
|
+
read_comment(&pi);
|
191
|
+
}
|
192
|
+
} else if ((TolerantEffort == options->effort) ? 0 == strncasecmp("DOCTYPE", pi.s, 7)
|
193
|
+
: 0 == strncmp("DOCTYPE", pi.s, 7)) {
|
194
|
+
pi.s += 7;
|
195
|
+
read_doctype(&pi);
|
196
|
+
} else {
|
197
|
+
set_error(err, "invalid format, DOCTYPE or comment expected", pi.str, pi.s);
|
198
|
+
helper_stack_cleanup(&pi.helpers);
|
199
|
+
return Qnil;
|
200
|
+
}
|
201
|
+
break;
|
202
|
+
case '\0':
|
203
|
+
set_error(err, "invalid format, document not terminated", pi.str, pi.s);
|
204
|
+
helper_stack_cleanup(&pi.helpers);
|
205
|
+
return Qnil;
|
206
|
+
default:
|
207
|
+
read_element(&pi);
|
208
|
+
body_read = 1;
|
209
|
+
break;
|
210
|
+
}
|
211
|
+
if (err_has(&pi.err)) {
|
212
|
+
*err = pi.err;
|
213
|
+
helper_stack_cleanup(&pi.helpers);
|
214
|
+
return Qnil;
|
215
|
+
}
|
216
|
+
if (block_given && Qnil != pi.obj && Qundef != pi.obj) {
|
217
|
+
if (NULL != pcb->finish) {
|
218
|
+
pcb->finish(&pi);
|
219
|
+
}
|
220
|
+
rb_yield(pi.obj);
|
221
|
+
}
|
232
222
|
}
|
233
223
|
DATA_PTR(wrap) = NULL;
|
234
224
|
helper_stack_cleanup(&pi.helpers);
|
235
225
|
if (NULL != pcb->finish) {
|
236
|
-
|
226
|
+
pcb->finish(&pi);
|
237
227
|
}
|
238
228
|
return pi.obj;
|
239
229
|
}
|
240
230
|
|
241
231
|
// Entered after the "<?" sequence. Ready to read the rest.
|
242
|
-
static void
|
243
|
-
|
244
|
-
char
|
245
|
-
|
246
|
-
|
247
|
-
char
|
248
|
-
char
|
249
|
-
char
|
250
|
-
char
|
251
|
-
char
|
252
|
-
|
253
|
-
|
254
|
-
bool attrs_ok = true;
|
232
|
+
static void read_instruction(PInfo pi) {
|
233
|
+
char content[256];
|
234
|
+
char *content_ptr;
|
235
|
+
struct _attrStack attrs;
|
236
|
+
char *attr_name;
|
237
|
+
char *attr_value;
|
238
|
+
char *target;
|
239
|
+
char *end;
|
240
|
+
char c;
|
241
|
+
char *cend;
|
242
|
+
size_t size;
|
243
|
+
bool attrs_ok = true;
|
255
244
|
|
256
245
|
*content = '\0';
|
257
246
|
attr_stack_init(&attrs);
|
258
247
|
if (0 == (target = read_name_token(pi))) {
|
259
|
-
|
248
|
+
return;
|
260
249
|
}
|
261
250
|
end = pi->s;
|
262
251
|
for (; true; pi->s++) {
|
263
252
|
switch (*pi->s) {
|
264
253
|
case '?':
|
265
254
|
if ('>' == *(pi->s + 1)) {
|
266
|
-
|
267
|
-
|
255
|
+
pi->s++;
|
256
|
+
goto DONE;
|
268
257
|
}
|
269
258
|
break;
|
270
|
-
case '\0':
|
271
|
-
|
272
|
-
return;
|
273
|
-
default:
|
274
|
-
break;
|
259
|
+
case '\0': set_error(&pi->err, "processing instruction not terminated", pi->str, pi->s); return;
|
260
|
+
default: break;
|
275
261
|
}
|
276
262
|
}
|
277
263
|
DONE:
|
278
|
-
cend
|
279
|
-
size
|
264
|
+
cend = pi->s;
|
265
|
+
size = cend - end - 1;
|
280
266
|
pi->s = end;
|
281
267
|
if (size < sizeof(content)) {
|
282
|
-
|
268
|
+
content_ptr = content;
|
283
269
|
} else {
|
284
|
-
|
270
|
+
content_ptr = ALLOC_N(char, size + 1);
|
285
271
|
}
|
286
272
|
memcpy(content_ptr, end, size);
|
287
273
|
content_ptr[size] = '\0';
|
288
274
|
|
289
275
|
next_non_white(pi);
|
290
|
-
c
|
291
|
-
*end = '\0';
|
276
|
+
c = *pi->s;
|
277
|
+
*end = '\0'; // terminate name
|
292
278
|
if ('?' != c) {
|
293
|
-
|
294
|
-
|
295
|
-
|
296
|
-
|
297
|
-
|
298
|
-
|
299
|
-
|
300
|
-
|
301
|
-
|
302
|
-
|
303
|
-
|
304
|
-
|
305
|
-
|
306
|
-
|
307
|
-
|
308
|
-
|
309
|
-
|
310
|
-
|
311
|
-
|
312
|
-
|
313
|
-
|
314
|
-
|
315
|
-
|
316
|
-
|
317
|
-
|
318
|
-
|
319
|
-
|
320
|
-
|
321
|
-
|
322
|
-
|
323
|
-
|
324
|
-
|
325
|
-
|
326
|
-
|
327
|
-
|
328
|
-
|
279
|
+
while ('?' != c) {
|
280
|
+
pi->last = 0;
|
281
|
+
if ('\0' == *pi->s) {
|
282
|
+
attr_stack_cleanup(&attrs);
|
283
|
+
set_error(&pi->err, "invalid format, processing instruction not terminated", pi->str, pi->s);
|
284
|
+
return;
|
285
|
+
}
|
286
|
+
next_non_white(pi);
|
287
|
+
if (0 == (attr_name = read_name_token(pi))) {
|
288
|
+
attr_stack_cleanup(&attrs);
|
289
|
+
return;
|
290
|
+
}
|
291
|
+
end = pi->s;
|
292
|
+
next_non_white(pi);
|
293
|
+
if ('=' != *pi->s++) {
|
294
|
+
attrs_ok = false;
|
295
|
+
break;
|
296
|
+
}
|
297
|
+
*end = '\0'; // terminate name
|
298
|
+
// read value
|
299
|
+
next_non_white(pi);
|
300
|
+
if (0 == (attr_value = read_quoted_value(pi))) {
|
301
|
+
attr_stack_cleanup(&attrs);
|
302
|
+
return;
|
303
|
+
}
|
304
|
+
attr_stack_push(&attrs, attr_name, attr_value);
|
305
|
+
next_non_white(pi);
|
306
|
+
if ('\0' == pi->last) {
|
307
|
+
c = *pi->s;
|
308
|
+
} else {
|
309
|
+
c = pi->last;
|
310
|
+
}
|
311
|
+
}
|
312
|
+
if ('?' == *pi->s) {
|
313
|
+
pi->s++;
|
314
|
+
}
|
329
315
|
} else {
|
330
|
-
|
316
|
+
pi->s++;
|
331
317
|
}
|
332
318
|
if (attrs_ok) {
|
333
|
-
|
334
|
-
|
335
|
-
|
336
|
-
|
337
|
-
|
319
|
+
if ('>' != *pi->s++) {
|
320
|
+
attr_stack_cleanup(&attrs);
|
321
|
+
set_error(&pi->err, "invalid format, processing instruction not terminated", pi->str, pi->s);
|
322
|
+
return;
|
323
|
+
}
|
338
324
|
} else {
|
339
|
-
|
325
|
+
pi->s = cend + 1;
|
340
326
|
}
|
341
327
|
if (0 != pi->pcb->instruct) {
|
342
|
-
|
343
|
-
|
344
|
-
|
345
|
-
|
346
|
-
|
328
|
+
if (attrs_ok) {
|
329
|
+
pi->pcb->instruct(pi, target, attrs.head, 0);
|
330
|
+
} else {
|
331
|
+
pi->pcb->instruct(pi, target, attrs.head, content_ptr);
|
332
|
+
}
|
347
333
|
}
|
348
334
|
attr_stack_cleanup(&attrs);
|
349
335
|
if (content_ptr != content) {
|
350
|
-
|
336
|
+
xfree(content_ptr);
|
351
337
|
}
|
352
338
|
}
|
353
339
|
|
354
|
-
static void
|
355
|
-
|
356
|
-
char c;
|
340
|
+
static void read_delimited(PInfo pi, char end) {
|
341
|
+
char c;
|
357
342
|
|
358
343
|
if ('"' == end || '\'' == end) {
|
359
|
-
|
360
|
-
|
361
|
-
|
362
|
-
|
363
|
-
|
364
|
-
|
344
|
+
for (c = *pi->s++; end != c; c = *pi->s++) {
|
345
|
+
if ('\0' == c) {
|
346
|
+
set_error(&pi->err, "invalid format, dectype not terminated", pi->str, pi->s);
|
347
|
+
return;
|
348
|
+
}
|
349
|
+
}
|
365
350
|
} else {
|
366
|
-
|
367
|
-
|
368
|
-
|
369
|
-
|
370
|
-
|
371
|
-
|
372
|
-
|
373
|
-
|
374
|
-
|
375
|
-
|
376
|
-
|
377
|
-
|
378
|
-
|
379
|
-
|
380
|
-
break;
|
381
|
-
case '[':
|
382
|
-
read_delimited(pi, ']');
|
383
|
-
break;
|
384
|
-
case '<':
|
385
|
-
read_delimited(pi, '>');
|
386
|
-
break;
|
387
|
-
default:
|
388
|
-
break;
|
389
|
-
}
|
390
|
-
}
|
351
|
+
while (1) {
|
352
|
+
c = *pi->s++;
|
353
|
+
if (end == c) {
|
354
|
+
return;
|
355
|
+
}
|
356
|
+
switch (c) {
|
357
|
+
case '\0': set_error(&pi->err, "invalid format, dectype not terminated", pi->str, pi->s); return;
|
358
|
+
case '"': read_delimited(pi, c); break;
|
359
|
+
case '\'': read_delimited(pi, c); break;
|
360
|
+
case '[': read_delimited(pi, ']'); break;
|
361
|
+
case '<': read_delimited(pi, '>'); break;
|
362
|
+
default: break;
|
363
|
+
}
|
364
|
+
}
|
391
365
|
}
|
392
366
|
}
|
393
367
|
|
394
368
|
// Entered after the "<!DOCTYPE" sequence plus the first character after
|
395
369
|
// that. Ready to read the rest.
|
396
|
-
static void
|
397
|
-
|
398
|
-
char *doctype;
|
370
|
+
static void read_doctype(PInfo pi) {
|
371
|
+
char *doctype;
|
399
372
|
|
400
373
|
next_non_white(pi);
|
401
374
|
doctype = pi->s;
|
402
375
|
read_delimited(pi, '>');
|
403
376
|
if (err_has(&pi->err)) {
|
404
|
-
|
377
|
+
return;
|
405
378
|
}
|
406
379
|
pi->s--;
|
407
380
|
*pi->s = '\0';
|
408
381
|
pi->s++;
|
409
382
|
if (0 != pi->pcb->add_doctype) {
|
410
|
-
|
411
|
-
|
383
|
+
fix_newlines(doctype);
|
384
|
+
pi->pcb->add_doctype(pi, doctype);
|
412
385
|
}
|
413
386
|
}
|
414
387
|
|
415
388
|
// Entered after "<!--". Returns error code.
|
416
|
-
static void
|
417
|
-
|
418
|
-
char
|
419
|
-
char
|
420
|
-
|
421
|
-
int done = 0;
|
389
|
+
static void read_comment(PInfo pi) {
|
390
|
+
char *end;
|
391
|
+
char *s;
|
392
|
+
char *comment;
|
393
|
+
int done = 0;
|
422
394
|
|
423
395
|
next_non_white(pi);
|
424
396
|
comment = pi->s;
|
425
|
-
end
|
397
|
+
end = strstr(pi->s, "-->");
|
426
398
|
if (0 == end) {
|
427
|
-
|
428
|
-
|
399
|
+
set_error(&pi->err, "invalid format, comment not terminated", pi->str, pi->s);
|
400
|
+
return;
|
429
401
|
}
|
430
402
|
for (s = end - 1; pi->s < s && !done; s--) {
|
431
|
-
|
432
|
-
|
433
|
-
|
434
|
-
|
435
|
-
|
436
|
-
|
437
|
-
|
438
|
-
|
439
|
-
|
440
|
-
|
441
|
-
|
442
|
-
}
|
403
|
+
switch (*s) {
|
404
|
+
case ' ':
|
405
|
+
case '\t':
|
406
|
+
case '\f':
|
407
|
+
case '\n':
|
408
|
+
case '\r': break;
|
409
|
+
default:
|
410
|
+
*(s + 1) = '\0';
|
411
|
+
done = 1;
|
412
|
+
break;
|
413
|
+
}
|
443
414
|
}
|
444
|
-
*end
|
415
|
+
*end = '\0'; // in case the comment was blank
|
445
416
|
pi->s = end + 3;
|
446
417
|
if (0 != pi->pcb->add_comment) {
|
447
|
-
|
448
|
-
|
418
|
+
fix_newlines(comment);
|
419
|
+
pi->pcb->add_comment(pi, comment);
|
449
420
|
}
|
450
421
|
}
|
451
422
|
|
452
423
|
// Entered after the '<' and the first character after that. Returns stat
|
453
424
|
// code.
|
454
|
-
static char*
|
455
|
-
|
456
|
-
|
457
|
-
const char
|
458
|
-
|
459
|
-
char
|
460
|
-
char
|
461
|
-
char
|
462
|
-
|
463
|
-
|
464
|
-
int
|
465
|
-
int done = 0;
|
425
|
+
static char *read_element(PInfo pi) {
|
426
|
+
struct _attrStack attrs;
|
427
|
+
const char *attr_name;
|
428
|
+
const char *attr_value;
|
429
|
+
char *name;
|
430
|
+
char *ename;
|
431
|
+
char *end;
|
432
|
+
char c;
|
433
|
+
long elen;
|
434
|
+
int hasChildren = 0;
|
435
|
+
int done = 0;
|
466
436
|
|
467
437
|
attr_stack_init(&attrs);
|
468
438
|
if (0 == (ename = read_name_token(pi))) {
|
469
|
-
|
439
|
+
return 0;
|
470
440
|
}
|
471
|
-
end
|
441
|
+
end = pi->s;
|
472
442
|
elen = end - ename;
|
473
443
|
next_non_white(pi);
|
474
|
-
c
|
444
|
+
c = *pi->s;
|
475
445
|
*end = '\0';
|
476
446
|
if ('/' == c) {
|
477
|
-
|
478
|
-
|
479
|
-
|
480
|
-
|
481
|
-
|
482
|
-
|
483
|
-
|
484
|
-
|
485
|
-
|
486
|
-
|
447
|
+
// empty element, no attributes and no children
|
448
|
+
pi->s++;
|
449
|
+
if ('>' != *pi->s) {
|
450
|
+
attr_stack_cleanup(&attrs);
|
451
|
+
set_error(&pi->err, "invalid format, element not closed", pi->str, pi->s);
|
452
|
+
return 0;
|
453
|
+
}
|
454
|
+
pi->s++; /* past > */
|
455
|
+
pi->pcb->add_element(pi, ename, attrs.head, hasChildren);
|
456
|
+
pi->pcb->end_element(pi, ename);
|
487
457
|
|
488
|
-
|
489
|
-
|
458
|
+
attr_stack_cleanup(&attrs);
|
459
|
+
return 0;
|
490
460
|
}
|
491
461
|
/* read attribute names until the close (/ or >) is reached */
|
492
462
|
while (!done) {
|
493
|
-
|
494
|
-
|
495
|
-
|
496
|
-
|
497
|
-
|
498
|
-
|
499
|
-
|
500
|
-
|
501
|
-
|
502
|
-
|
503
|
-
|
504
|
-
|
505
|
-
|
506
|
-
|
507
|
-
|
508
|
-
|
509
|
-
|
510
|
-
|
511
|
-
|
512
|
-
|
513
|
-
|
514
|
-
|
515
|
-
|
516
|
-
|
517
|
-
|
518
|
-
|
519
|
-
|
520
|
-
|
521
|
-
|
522
|
-
|
523
|
-
|
524
|
-
|
525
|
-
|
463
|
+
if ('\0' == c) {
|
464
|
+
if (pi->end <= pi->s) {
|
465
|
+
break;
|
466
|
+
}
|
467
|
+
next_non_white(pi);
|
468
|
+
c = *pi->s;
|
469
|
+
}
|
470
|
+
pi->last = 0;
|
471
|
+
switch (c) {
|
472
|
+
case '\0':
|
473
|
+
attr_stack_cleanup(&attrs);
|
474
|
+
set_error(&pi->err, "invalid format, document not terminated", pi->str, pi->s);
|
475
|
+
return 0;
|
476
|
+
case '/':
|
477
|
+
/* Element with just attributes. */
|
478
|
+
pi->s++;
|
479
|
+
if ('>' != *pi->s) {
|
480
|
+
attr_stack_cleanup(&attrs);
|
481
|
+
set_error(&pi->err, "invalid format, element not closed", pi->str, pi->s);
|
482
|
+
return 0;
|
483
|
+
}
|
484
|
+
pi->s++;
|
485
|
+
pi->pcb->add_element(pi, ename, attrs.head, hasChildren);
|
486
|
+
pi->pcb->end_element(pi, ename);
|
487
|
+
attr_stack_cleanup(&attrs);
|
488
|
+
|
489
|
+
return 0;
|
490
|
+
case '>':
|
491
|
+
/* has either children or a value */
|
492
|
+
pi->s++;
|
493
|
+
hasChildren = 1;
|
494
|
+
done = 1;
|
495
|
+
pi->pcb->add_element(pi, ename, attrs.head, hasChildren);
|
526
496
|
|
527
|
-
|
528
|
-
|
529
|
-
|
530
|
-
|
531
|
-
|
532
|
-
|
533
|
-
|
534
|
-
|
535
|
-
|
536
|
-
|
537
|
-
|
538
|
-
|
539
|
-
|
540
|
-
|
541
|
-
|
542
|
-
|
543
|
-
|
544
|
-
|
545
|
-
|
546
|
-
|
547
|
-
|
548
|
-
|
549
|
-
|
550
|
-
|
551
|
-
|
552
|
-
|
553
|
-
|
554
|
-
|
555
|
-
|
556
|
-
|
557
|
-
|
558
|
-
|
559
|
-
|
560
|
-
|
561
|
-
|
562
|
-
|
563
|
-
|
564
|
-
|
565
|
-
|
566
|
-
|
567
|
-
|
568
|
-
|
569
|
-
|
570
|
-
|
571
|
-
|
497
|
+
break;
|
498
|
+
default:
|
499
|
+
/* Attribute name so it's an element and the attribute will be */
|
500
|
+
/* added to it. */
|
501
|
+
if (0 == (attr_name = read_name_token(pi))) {
|
502
|
+
attr_stack_cleanup(&attrs);
|
503
|
+
return 0;
|
504
|
+
}
|
505
|
+
end = pi->s;
|
506
|
+
next_non_white(pi);
|
507
|
+
if ('=' != *pi->s++) {
|
508
|
+
if (TolerantEffort == pi->options->effort) {
|
509
|
+
pi->s--;
|
510
|
+
pi->last = *pi->s;
|
511
|
+
*end = '\0'; /* terminate name */
|
512
|
+
attr_value = "";
|
513
|
+
attr_stack_push(&attrs, attr_name, attr_value);
|
514
|
+
break;
|
515
|
+
} else {
|
516
|
+
attr_stack_cleanup(&attrs);
|
517
|
+
set_error(&pi->err, "invalid format, no attribute value", pi->str, pi->s);
|
518
|
+
return 0;
|
519
|
+
}
|
520
|
+
}
|
521
|
+
*end = '\0'; /* terminate name */
|
522
|
+
/* read value */
|
523
|
+
next_non_white(pi);
|
524
|
+
if (0 == (attr_value = read_quoted_value(pi))) {
|
525
|
+
return 0;
|
526
|
+
}
|
527
|
+
if (pi->options->convert_special && 0 != strchr(attr_value, '&')) {
|
528
|
+
if (0 != collapse_special(pi, (char *)attr_value) || err_has(&pi->err)) {
|
529
|
+
attr_stack_cleanup(&attrs);
|
530
|
+
return 0;
|
531
|
+
}
|
532
|
+
}
|
533
|
+
attr_stack_push(&attrs, attr_name, attr_value);
|
534
|
+
break;
|
535
|
+
}
|
536
|
+
if ('\0' == pi->last) {
|
537
|
+
c = '\0';
|
538
|
+
} else {
|
539
|
+
c = pi->last;
|
540
|
+
pi->last = '\0';
|
541
|
+
}
|
572
542
|
}
|
573
543
|
if (hasChildren) {
|
574
|
-
|
575
|
-
|
576
|
-
|
577
|
-
|
578
|
-
|
579
|
-
|
580
|
-
|
581
|
-
|
582
|
-
|
583
|
-
|
584
|
-
|
585
|
-
|
586
|
-
|
587
|
-
|
588
|
-
|
589
|
-
|
590
|
-
|
591
|
-
|
592
|
-
|
593
|
-
|
594
|
-
|
595
|
-
|
596
|
-
|
597
|
-
|
598
|
-
|
599
|
-
|
600
|
-
|
601
|
-
|
602
|
-
|
603
|
-
|
604
|
-
|
605
|
-
|
606
|
-
|
607
|
-
|
608
|
-
|
609
|
-
|
610
|
-
|
611
|
-
|
612
|
-
|
613
|
-
|
614
|
-
|
615
|
-
|
616
|
-
|
617
|
-
|
618
|
-
|
619
|
-
|
620
|
-
|
621
|
-
|
622
|
-
|
623
|
-
|
624
|
-
|
625
|
-
|
626
|
-
|
627
|
-
|
628
|
-
|
629
|
-
|
630
|
-
|
631
|
-
|
632
|
-
|
633
|
-
|
634
|
-
|
635
|
-
|
636
|
-
|
637
|
-
|
638
|
-
|
639
|
-
|
640
|
-
|
641
|
-
|
642
|
-
|
643
|
-
|
644
|
-
|
645
|
-
|
646
|
-
|
647
|
-
|
648
|
-
|
649
|
-
|
650
|
-
|
651
|
-
|
652
|
-
|
653
|
-
|
654
|
-
|
655
|
-
|
656
|
-
|
657
|
-
|
658
|
-
|
659
|
-
|
660
|
-
|
661
|
-
|
662
|
-
|
663
|
-
|
664
|
-
|
665
|
-
|
666
|
-
|
667
|
-
|
668
|
-
|
669
|
-
|
670
|
-
|
671
|
-
|
672
|
-
|
673
|
-
|
674
|
-
|
675
|
-
|
676
|
-
|
677
|
-
|
678
|
-
|
679
|
-
|
680
|
-
|
681
|
-
|
682
|
-
|
683
|
-
|
684
|
-
|
685
|
-
|
686
|
-
|
687
|
-
|
688
|
-
|
689
|
-
|
690
|
-
|
691
|
-
|
692
|
-
|
693
|
-
|
694
|
-
|
695
|
-
|
696
|
-
|
697
|
-
|
698
|
-
|
699
|
-
|
700
|
-
|
701
|
-
|
702
|
-
|
703
|
-
|
704
|
-
|
705
|
-
|
706
|
-
|
707
|
-
|
708
|
-
|
709
|
-
|
710
|
-
|
711
|
-
|
712
|
-
|
713
|
-
|
714
|
-
|
715
|
-
|
716
|
-
|
717
|
-
|
718
|
-
|
719
|
-
|
720
|
-
|
721
|
-
|
722
|
-
}
|
723
|
-
}
|
544
|
+
char *start;
|
545
|
+
int first = 1;
|
546
|
+
|
547
|
+
done = 0;
|
548
|
+
/* read children */
|
549
|
+
while (!done) {
|
550
|
+
start = pi->s;
|
551
|
+
next_non_white(pi);
|
552
|
+
if (OffSkip == pi->options->skip && start < pi->s && '<' == *pi->s) {
|
553
|
+
c = *pi->s;
|
554
|
+
*pi->s = '\0';
|
555
|
+
pi->pcb->add_text(pi, start, 1);
|
556
|
+
*pi->s = c;
|
557
|
+
}
|
558
|
+
c = *pi->s++;
|
559
|
+
if ('\0' == c) {
|
560
|
+
attr_stack_cleanup(&attrs);
|
561
|
+
set_error(&pi->err, "invalid format, document not terminated", pi->str, pi->s);
|
562
|
+
return 0;
|
563
|
+
}
|
564
|
+
if ('<' == c) {
|
565
|
+
char *slash;
|
566
|
+
|
567
|
+
switch (*pi->s) {
|
568
|
+
case '!': /* better be a comment or CDATA */
|
569
|
+
pi->s++;
|
570
|
+
if ('-' == *pi->s && '-' == *(pi->s + 1)) {
|
571
|
+
pi->s += 2;
|
572
|
+
read_comment(pi);
|
573
|
+
} else if ((TolerantEffort == pi->options->effort) ? 0 == strncasecmp("[CDATA[", pi->s, 7)
|
574
|
+
: 0 == strncmp("[CDATA[", pi->s, 7)) {
|
575
|
+
pi->s += 7;
|
576
|
+
read_cdata(pi);
|
577
|
+
} else {
|
578
|
+
attr_stack_cleanup(&attrs);
|
579
|
+
set_error(&pi->err, "invalid format, invalid comment or CDATA format", pi->str, pi->s);
|
580
|
+
return 0;
|
581
|
+
}
|
582
|
+
break;
|
583
|
+
case '?': /* processing instruction */
|
584
|
+
pi->s++;
|
585
|
+
read_instruction(pi);
|
586
|
+
break;
|
587
|
+
case '/':
|
588
|
+
slash = pi->s;
|
589
|
+
pi->s++;
|
590
|
+
if (0 == (name = read_name_token(pi))) {
|
591
|
+
attr_stack_cleanup(&attrs);
|
592
|
+
return 0;
|
593
|
+
}
|
594
|
+
end = pi->s;
|
595
|
+
next_non_white(pi);
|
596
|
+
c = *pi->s;
|
597
|
+
*end = '\0';
|
598
|
+
if (0 !=
|
599
|
+
((TolerantEffort == pi->options->effort) ? strcasecmp(name, ename) : strcmp(name, ename))) {
|
600
|
+
attr_stack_cleanup(&attrs);
|
601
|
+
if (TolerantEffort == pi->options->effort) {
|
602
|
+
pi->pcb->end_element(pi, ename);
|
603
|
+
return name;
|
604
|
+
} else {
|
605
|
+
set_error(&pi->err, "invalid format, elements overlap", pi->str, pi->s);
|
606
|
+
return 0;
|
607
|
+
}
|
608
|
+
}
|
609
|
+
if ('>' != c) {
|
610
|
+
attr_stack_cleanup(&attrs);
|
611
|
+
set_error(&pi->err, "invalid format, element not closed", pi->str, pi->s);
|
612
|
+
return 0;
|
613
|
+
}
|
614
|
+
if (first && start != slash - 1) {
|
615
|
+
// Some white space between start and here so add as
|
616
|
+
// text after checking skip.
|
617
|
+
*(slash - 1) = '\0';
|
618
|
+
switch (pi->options->skip) {
|
619
|
+
case CrSkip: {
|
620
|
+
char *s = start;
|
621
|
+
char *e = start;
|
622
|
+
|
623
|
+
for (; '\0' != *e; e++) {
|
624
|
+
if ('\r' != *e) {
|
625
|
+
*s++ = *e;
|
626
|
+
}
|
627
|
+
}
|
628
|
+
*s = '\0';
|
629
|
+
break;
|
630
|
+
}
|
631
|
+
case SpcSkip: *start = '\0'; break;
|
632
|
+
case NoSkip:
|
633
|
+
case OffSkip:
|
634
|
+
default: break;
|
635
|
+
}
|
636
|
+
if ('\0' != *start) {
|
637
|
+
pi->pcb->add_text(pi, start, 1);
|
638
|
+
}
|
639
|
+
}
|
640
|
+
pi->s++;
|
641
|
+
pi->pcb->end_element(pi, ename);
|
642
|
+
attr_stack_cleanup(&attrs);
|
643
|
+
return 0;
|
644
|
+
case '\0':
|
645
|
+
attr_stack_cleanup(&attrs);
|
646
|
+
if (TolerantEffort == pi->options->effort) {
|
647
|
+
return 0;
|
648
|
+
} else {
|
649
|
+
set_error(&pi->err, "invalid format, document not terminated", pi->str, pi->s);
|
650
|
+
return 0;
|
651
|
+
}
|
652
|
+
default:
|
653
|
+
first = 0;
|
654
|
+
/* a child element */
|
655
|
+
// Child closed with mismatched name.
|
656
|
+
if (0 != (name = read_element(pi))) {
|
657
|
+
attr_stack_cleanup(&attrs);
|
658
|
+
|
659
|
+
if (0 ==
|
660
|
+
((TolerantEffort == pi->options->effort) ? strcasecmp(name, ename) : strcmp(name, ename))) {
|
661
|
+
pi->s++;
|
662
|
+
pi->pcb->end_element(pi, ename);
|
663
|
+
return 0;
|
664
|
+
} else { // not the correct element yet
|
665
|
+
pi->pcb->end_element(pi, ename);
|
666
|
+
return name;
|
667
|
+
}
|
668
|
+
} else if (err_has(&pi->err)) {
|
669
|
+
return 0;
|
670
|
+
}
|
671
|
+
break;
|
672
|
+
}
|
673
|
+
} else { /* read as TEXT */
|
674
|
+
pi->s = start;
|
675
|
+
/*pi->s--; */
|
676
|
+
read_text(pi);
|
677
|
+
/*read_reduced_text(pi); */
|
678
|
+
|
679
|
+
/* to exit read_text with no errors the next character must be < */
|
680
|
+
if ('/' == *(pi->s + 1) &&
|
681
|
+
0 == ((TolerantEffort == pi->options->effort) ? strncasecmp(ename, pi->s + 2, elen)
|
682
|
+
: strncmp(ename, pi->s + 2, elen)) &&
|
683
|
+
'>' == *(pi->s + elen + 2)) {
|
684
|
+
/* close tag after text so treat as a value */
|
685
|
+
pi->s += elen + 3;
|
686
|
+
pi->pcb->end_element(pi, ename);
|
687
|
+
attr_stack_cleanup(&attrs);
|
688
|
+
return 0;
|
689
|
+
}
|
690
|
+
}
|
691
|
+
}
|
724
692
|
}
|
725
693
|
attr_stack_cleanup(&attrs);
|
726
694
|
return 0;
|
727
695
|
}
|
728
696
|
|
729
|
-
static void
|
730
|
-
|
731
|
-
char
|
732
|
-
char
|
733
|
-
char
|
734
|
-
char
|
735
|
-
|
736
|
-
int done = 0;
|
697
|
+
/* Reads element text starting at pi->s and stops on (without consuming) the
 * next '<'. '&' sequences are decoded through read_coded_chars() and control
 * or whitespace characters (0x00-0x20) are handled according to
 * pi->options->skip. The NUL terminated result is passed through
 * fix_newlines() and then to pi->pcb->add_text(); the last argument of
 * add_text() is non-zero when the '<' that ended the text is followed by '/'.
 *
 * Characters are collected in a stack buffer first. When that fills, the
 * text moves to a heap buffer that is enlarged each time it fills again.
 *
 * On error (document ends before a '<', read_coded_chars() fails, or an
 * invalid character is seen with StrictEffort) the function returns without
 * calling add_text(). The heap buffer is released on those paths too; the
 * early returns used to skip the xfree() and leak it.
 */
static void read_text(PInfo pi) {
    char  buf[MAX_TEXT_LEN];
    char *b         = buf;
    char *alloc_buf = 0;
    char *end       = b + sizeof(buf) - 2;
    char *text;
    char  c;
    int   done = 0;

    while (!done) {
        c = *pi->s++;
        switch (c) {
        case '<':
            done = 1;
            pi->s--; /* leave the '<' for the caller */
            break;
        case '\0':
            set_error(&pi->err, "invalid format, document not terminated", pi->str, pi->s);
            goto fail;
        default:
            /* Make room before writing. A '&' sequence may expand to several
             * bytes so it asks for 7 bytes of extra headroom. */
            if (end <= (b + (('&' == c) ? 7 : 0))) {
                unsigned long size;

                if (0 == alloc_buf) {
                    /* first overflow: move from the stack to the heap */
                    size      = sizeof(buf) * 2;
                    alloc_buf = ALLOC_N(char, size);
                    memcpy(alloc_buf, buf, b - buf);
                    b = alloc_buf + (b - buf);
                } else {
                    unsigned long pos = b - alloc_buf;

                    size = (end - alloc_buf) * 2;
                    REALLOC_N(alloc_buf, char, size);
                    b = alloc_buf + pos;
                }
                end = alloc_buf + size - 2;
            }
            if ('&' == c) {
                if (0 == (b = read_coded_chars(pi, b))) {
                    goto fail;
                }
            } else if (0 <= c && c <= 0x20) {
                if (StrictEffort == pi->options->effort && 'x' == xml_valid_lower_chars[(unsigned char)c]) {
                    set_error(&pi->err, "invalid character", pi->str, pi->s);
                    goto fail;
                }
                switch (pi->options->skip) {
                case CrSkip:
                    /* fold a "\r\n" pair into a single '\n' */
                    if (buf != b && '\n' == c && '\r' == *(b - 1)) {
                        *(b - 1) = '\n';
                    } else {
                        *b++ = c;
                    }
                    break;
                case SpcSkip:
                    /* collapse runs of white space into one ' ' */
                    if (is_white(c)) {
                        if (buf == b || ' ' != *(b - 1)) {
                            *b++ = ' ';
                        }
                    } else {
                        *b++ = c;
                    }
                    break;
                case NoSkip:
                case OffSkip:
                default: *b++ = c; break;
                }
            } else {
                *b++ = c;
            }
            break;
        }
    }
    *b = '\0';
    text = (0 != alloc_buf) ? alloc_buf : buf;
    fix_newlines(text);
    pi->pcb->add_text(pi, text, ('/' == *(pi->s + 1)));
    if (0 != alloc_buf) {
        xfree(alloc_buf);
    }
    return;

fail:
    /* error already recorded in pi->err; only the heap buffer needs cleanup */
    if (0 != alloc_buf) {
        xfree(alloc_buf);
    }
}
|
816
779
|
|
@@ -886,323 +849,322 @@ read_reduced_text(PInfo pi) {
|
|
886
849
|
}
|
887
850
|
#endif
|
888
851
|
|
889
|
-
static char*
|
890
|
-
|
891
|
-
char *start;
|
852
|
+
/* Skips leading white space and scans a name token. On success pi->s is left
 * on the delimiter that ended the name (white space, '?', '=', '/' or '>')
 * and the start of the name is returned; the token is not NUL terminated
 * here. If the document ends before a delimiter is found an error is set on
 * pi->err and 0 is returned.
 *
 * Namespace stripping, driven by pi->options->strip_ns, happens at each ':':
 *   - ""     : nothing is stripped
 *   - "*"    : every prefix is stripped
 *   - other  : the prefix is stripped only when it equals strip_ns exactly
 *
 * The exact match is the fix over the previous code, which compared only the
 * first (pi->s - start) characters and therefore also stripped any prefix
 * that was merely a leading part of strip_ns (including an empty prefix).
 */
static char *read_name_token(PInfo pi) {
    char *start;

    next_non_white(pi);
    start = pi->s;
    for (;; pi->s++) {
        switch (*pi->s) {
        case ' ':
        case '\t':
        case '\f':
        case '?':
        case '=':
        case '/':
        case '>':
        case '\n':
        case '\r': return start;
        case '\0':
            /* documents never terminate after a name token */
            set_error(&pi->err, "invalid format, document not terminated", pi->str, pi->s);
            return 0;
        case ':': {
            const char *ns = pi->options->strip_ns;

            if ('\0' == *ns) {
                break;
            }
            if ('*' == ns[0] && '\0' == ns[1]) {
                start = pi->s + 1;
            } else {
                size_t plen = (size_t)(pi->s - start);

                /* The scanned prefix holds no NUL, so when the first plen
                 * characters match, ns[plen] is within the ns string. */
                if (0 == strncmp(ns, start, plen) && '\0' == ns[plen]) {
                    start = pi->s + 1;
                }
            }
            break;
        }
        default: break;
        }
    }
    return start; /* not reached, keeps compilers quiet */
}
|
927
887
|
|
928
|
-
static void
|
929
|
-
|
930
|
-
char
|
931
|
-
char *end;
|
888
|
+
/* Reads a CDATA body that begins at pi->s and runs up to the closing "]]>".
 * The body is NUL terminated in place and pi->s is moved just past the
 * "]]>". When the add_cdata callback is set the body is passed through
 * fix_newlines() and handed to the callback together with the distance
 * between the body start and the "]]>". A missing "]]>" sets an error on
 * pi->err and leaves pi->s untouched.
 */
static void read_cdata(PInfo pi) {
    char *body  = pi->s;
    char *close = strstr(body, "]]>");

    if (NULL == close) {
        set_error(&pi->err, "invalid format, CDATA not terminated", pi->str, pi->s);
        return;
    }
    *close = '\0';
    pi->s  = close + 3;
    if (NULL == pi->pcb->add_cdata) {
        return;
    }
    fix_newlines(body);
    pi->pcb->add_cdata(pi, body, close - body);
}
|
946
905
|
|
947
906
|
/* Assume the value starts immediately and goes until the quote character is
|
948
907
|
* reached again. Do not read the character after the terminating quote.
|
949
908
|
*/
|
950
|
-
static char*
|
951
|
-
|
952
|
-
char *value = 0;
|
909
|
+
static char *read_quoted_value(PInfo pi) {
|
910
|
+
char *value = 0;
|
953
911
|
|
954
912
|
if ('"' == *pi->s || '\'' == *pi->s) {
|
955
|
-
char
|
913
|
+
char term = *pi->s;
|
956
914
|
|
957
|
-
pi->s++;
|
915
|
+
pi->s++; /* skip quote character */
|
958
916
|
value = pi->s;
|
959
917
|
for (; *pi->s != term; pi->s++) {
|
960
918
|
if ('\0' == *pi->s) {
|
961
919
|
set_error(&pi->err, "invalid format, document not terminated", pi->str, pi->s);
|
962
|
-
|
920
|
+
return 0;
|
963
921
|
}
|
964
922
|
}
|
965
|
-
*pi->s = '\0';
|
966
|
-
pi->s++;
|
923
|
+
*pi->s = '\0'; /* terminate value */
|
924
|
+
pi->s++; /* move past quote */
|
967
925
|
} else if (StrictEffort == pi->options->effort) {
|
968
|
-
|
969
|
-
|
926
|
+
set_error(&pi->err, "invalid format, expected a quote character", pi->str, pi->s);
|
927
|
+
return 0;
|
970
928
|
} else if (TolerantEffort == pi->options->effort) {
|
971
929
|
value = pi->s;
|
972
930
|
for (; 1; pi->s++) {
|
973
|
-
|
974
|
-
|
975
|
-
|
976
|
-
|
977
|
-
|
978
|
-
|
979
|
-
|
980
|
-
|
981
|
-
|
982
|
-
|
983
|
-
|
984
|
-
|
985
|
-
|
986
|
-
|
987
|
-
return value;
|
988
|
-
default:
|
989
|
-
break;
|
931
|
+
switch (*pi->s) {
|
932
|
+
case '\0': set_error(&pi->err, "invalid format, document not terminated", pi->str, pi->s); return 0;
|
933
|
+
case ' ':
|
934
|
+
case '/':
|
935
|
+
case '>':
|
936
|
+
case '?': // for instructions
|
937
|
+
case '\t':
|
938
|
+
case '\n':
|
939
|
+
case '\r':
|
940
|
+
pi->last = *pi->s;
|
941
|
+
*pi->s = '\0'; /* terminate value */
|
942
|
+
pi->s++;
|
943
|
+
return value;
|
944
|
+
default: break;
|
990
945
|
}
|
991
946
|
}
|
992
947
|
} else {
|
993
948
|
value = pi->s;
|
994
949
|
next_white(pi);
|
995
|
-
|
996
|
-
|
997
|
-
|
950
|
+
if ('\0' == *pi->s) {
|
951
|
+
set_error(&pi->err, "invalid format, document not terminated", pi->str, pi->s);
|
952
|
+
return 0;
|
998
953
|
}
|
999
954
|
*pi->s++ = '\0'; /* terminate value */
|
1000
955
|
}
|
1001
956
|
return value;
|
1002
957
|
}
|
1003
958
|
|
1004
|
-
static char*
|
1005
|
-
|
1006
|
-
|
1007
|
-
char c;
|
959
|
+
/* Parses the hexadecimal digits of a character reference. b points at the
 * first digit and the digits must be followed by ';'.
 *
 * On success the value is stored in *up and a pointer to the ';' is
 * returned. 0 is returned, and *up is left untouched, when
 *   - a character other than a hex digit appears before the ';' (this
 *     includes the NUL at the end of a string),
 *   - there are no digits at all, or
 *   - the value does not fit in 64 bits.
 * The last two cases were previously accepted, yielding 0 or a silently
 * wrapped value.
 */
static char *read_hex_uint64(char *b, uint64_t *up) {
    uint64_t u     = 0;
    char    *first = b;
    char     c;

    for (; ';' != *b; b++) {
        uint64_t digit;

        c = *b;
        if ('0' <= c && c <= '9') {
            digit = (uint64_t)(c - '0');
        } else if ('a' <= c && c <= 'f') {
            digit = (uint64_t)(c - 'a' + 10);
        } else if ('A' <= c && c <= 'F') {
            digit = (uint64_t)(c - 'A' + 10);
        } else {
            return 0;
        }
        if (0 != (u >> 60)) {
            /* the top nibble is in use, another shift would drop bits */
            return 0;
        }
        u = (u << 4) | digit;
    }
    if (first == b) {
        return 0; /* ';' came first, there is no number */
    }
    *up = u;

    return b;
}
|
1025
979
|
|
1026
|
-
static char*
|
1027
|
-
|
1028
|
-
|
1029
|
-
char c;
|
980
|
+
/* Parses the decimal digits of a character reference. b points at the first
 * digit and the digits must be followed by ';'.
 *
 * On success the value is stored in *up and a pointer to the ';' is
 * returned. 0 is returned, and *up is left untouched, when
 *   - a character other than a decimal digit appears before the ';' (this
 *     includes the NUL at the end of a string),
 *   - there are no digits at all, or
 *   - the value does not fit in 64 bits.
 * The last two cases were previously accepted, yielding 0 or a silently
 * wrapped value.
 */
static char *read_10_uint64(char *b, uint64_t *up) {
    const uint64_t max   = ~(uint64_t)0;
    uint64_t       u     = 0;
    char          *first = b;
    char           c;

    for (; ';' != *b; b++) {
        uint64_t digit;

        c = *b;
        if (c < '0' || '9' < c) {
            return 0;
        }
        digit = (uint64_t)(c - '0');
        if ((max - digit) / 10 < u) {
            /* u * 10 + digit would not fit */
            return 0;
        }
        u = (u * 10) + digit;
    }
    if (first == b) {
        return 0; /* ';' came first, there is no number */
    }
    *up = u;

    return b;
}
|
1043
996
|
|
1044
|
-
static char*
|
1045
|
-
|
1046
|
-
char
|
1047
|
-
char
|
1048
|
-
|
1049
|
-
long blen = 0;
|
997
|
+
/* Decodes the '&' sequence whose body (the part after the '&') starts at
 * pi->s and appends the decoded bytes at text.
 *
 * Up to sizeof(buf) - 1 characters are copied into a scratch buffer while
 * looking for the closing ';'.
 *   - "#nnn;" and "#xhh;" are parsed with read_10_uint64() and
 *     read_hex_uint64(). Values up to 0x7F are written as one byte. Larger
 *     values are written with ox_ucs_to_utf8_chars() when the encoding is
 *     UTF-8 or not yet set (in which case it is set to UTF-8).
 *   - Anything else is looked up with ox_entity_lookup().
 * When the sequence is decoded pi->s is moved past the ';'. When it can not
 * be decoded a literal '&' is written and pi->s is left alone so the body is
 * read as ordinary text.
 *
 * Returns the position in text after what was written, or NULL with pi->err
 * set when a numeric value above 0xFF is met with a non UTF-8 encoding
 * outside of TolerantEffort.
 *
 * Two defects of the previous version are fixed. The "no ';' found" test was
 * `b > end`, which can never hold because the copy loop stops at b == end,
 * so an over long "#..." body was given to the digit readers without a
 * terminating ';' in the buffer. The copy loop also kept reading after the
 * NUL that ends the document.
 */
static char *read_coded_chars(PInfo pi, char *text) {
    char  buf[32];
    char *b;
    char *end = buf + sizeof(buf) - 1;
    char *s;
    long  blen       = 0;
    int   terminated = 0;

    for (b = buf, s = pi->s; b < end && '\0' != *s; b++, s++) {
        *b = *s;
        if (';' == *s) {
            *(b + 1)   = '\0';
            blen       = b - buf;
            terminated = 1;
            s++;
            break;
        }
    }
    if (!terminated) {
        /* too long or cut off by the end of the document: not an entity */
        *text++ = '&';
        return text;
    }
    if ('#' == *buf) {
        uint64_t u = 0;

        b = buf + 1;
        if ('x' == *b || 'X' == *b) {
            b = read_hex_uint64(b + 1, &u);
        } else {
            b = read_10_uint64(b, &u);
        }
        if (0 == b) {
            *text++ = '&';
        } else {
            if (u <= 0x000000000000007FULL) {
                *text++ = (char)u;
            } else if (ox_utf8_encoding == pi->options->rb_enc) {
                text = ox_ucs_to_utf8_chars(text, u);
            } else if (0 == pi->options->rb_enc) {
                /* no encoding chosen yet, settle on UTF-8 */
                pi->options->rb_enc = ox_utf8_encoding;
                text                = ox_ucs_to_utf8_chars(text, u);
            } else if (TolerantEffort == pi->options->effort) {
                *text++ = '&';
                return text;
            } else if (u <= 0x00000000000000FFULL) {
                *text++ = (char)u;
            } else {
                set_error(&pi->err,
                          "Invalid encoding, need UTF-8 encoding to parse &#nnnn; character sequences.",
                          pi->str,
                          pi->s);
                return NULL;
            }
            pi->s = s;
        }
    } else {
        char *t2;

        buf[blen] = '\0'; /* drop the ';' to leave just the entity name */
        if (NULL == (t2 = ox_entity_lookup(text, buf))) {
            *text++ = '&';
        } else {
            text  = t2;
            pi->s = s;
        }
    }
    return text;
}
|
1106
1062
|
|
1107
|
-
static int
|
1108
|
-
|
1109
|
-
char
|
1110
|
-
char *b = str;
|
1063
|
+
static int collapse_special(PInfo pi, char *str) {
|
1064
|
+
char *s = str;
|
1065
|
+
char *b = str;
|
1111
1066
|
|
1112
1067
|
while ('\0' != *s) {
|
1113
|
-
|
1114
|
-
|
1115
|
-
|
1116
|
-
|
1117
|
-
|
1118
|
-
|
1119
|
-
|
1120
|
-
|
1121
|
-
|
1122
|
-
|
1123
|
-
|
1124
|
-
|
1125
|
-
|
1126
|
-
|
1127
|
-
|
1128
|
-
|
1129
|
-
|
1130
|
-
|
1131
|
-
|
1132
|
-
|
1133
|
-
|
1134
|
-
|
1135
|
-
|
1136
|
-
|
1137
|
-
|
1138
|
-
|
1139
|
-
|
1140
|
-
|
1141
|
-
|
1142
|
-
|
1143
|
-
|
1144
|
-
|
1145
|
-
|
1146
|
-
|
1147
|
-
|
1148
|
-
|
1149
|
-
|
1150
|
-
|
1151
|
-
|
1152
|
-
|
1153
|
-
|
1154
|
-
|
1155
|
-
|
1156
|
-
|
1157
|
-
|
1158
|
-
|
1159
|
-
|
1160
|
-
|
1161
|
-
|
1162
|
-
|
1163
|
-
|
1164
|
-
|
1165
|
-
|
1166
|
-
|
1167
|
-
|
1168
|
-
|
1169
|
-
|
1170
|
-
|
1171
|
-
|
1172
|
-
|
1173
|
-
|
1174
|
-
|
1175
|
-
|
1176
|
-
|
1177
|
-
|
1178
|
-
|
1179
|
-
|
1180
|
-
|
1181
|
-
|
1182
|
-
|
1183
|
-
|
1184
|
-
|
1185
|
-
|
1186
|
-
|
1187
|
-
|
1188
|
-
|
1189
|
-
|
1190
|
-
|
1191
|
-
|
1192
|
-
|
1193
|
-
|
1194
|
-
|
1195
|
-
|
1196
|
-
|
1197
|
-
|
1198
|
-
|
1199
|
-
|
1200
|
-
|
1201
|
-
|
1202
|
-
|
1203
|
-
|
1204
|
-
|
1205
|
-
|
1068
|
+
if ('&' == *s) {
|
1069
|
+
int c;
|
1070
|
+
char *end;
|
1071
|
+
|
1072
|
+
s++;
|
1073
|
+
if ('#' == *s) {
|
1074
|
+
uint64_t u = 0;
|
1075
|
+
char x;
|
1076
|
+
|
1077
|
+
s++;
|
1078
|
+
if ('x' == *s || 'X' == *s) {
|
1079
|
+
x = *s;
|
1080
|
+
s++;
|
1081
|
+
end = read_hex_uint64(s, &u);
|
1082
|
+
} else {
|
1083
|
+
x = '\0';
|
1084
|
+
end = read_10_uint64(s, &u);
|
1085
|
+
}
|
1086
|
+
if (0 == end) {
|
1087
|
+
if (TolerantEffort == pi->options->effort) {
|
1088
|
+
*b++ = '&';
|
1089
|
+
*b++ = '#';
|
1090
|
+
if ('\0' != x) {
|
1091
|
+
*b++ = x;
|
1092
|
+
}
|
1093
|
+
continue;
|
1094
|
+
}
|
1095
|
+
return EDOM;
|
1096
|
+
}
|
1097
|
+
if (u <= 0x000000000000007FULL) {
|
1098
|
+
*b++ = (char)u;
|
1099
|
+
} else if (ox_utf8_encoding == pi->options->rb_enc) {
|
1100
|
+
b = ox_ucs_to_utf8_chars(b, u);
|
1101
|
+
/* TBD support UTF-16 */
|
1102
|
+
} else if (0 == pi->options->rb_enc) {
|
1103
|
+
pi->options->rb_enc = ox_utf8_encoding;
|
1104
|
+
b = ox_ucs_to_utf8_chars(b, u);
|
1105
|
+
} else {
|
1106
|
+
/* set_error(&pi->err, "Invalid encoding, need UTF-8 or UTF-16 encoding to parse &#nnnn; character
|
1107
|
+
* sequences.", pi->str, pi->s);*/
|
1108
|
+
set_error(&pi->err,
|
1109
|
+
"Invalid encoding, need UTF-8 encoding to parse &#nnnn; character sequences.",
|
1110
|
+
pi->str,
|
1111
|
+
pi->s);
|
1112
|
+
return 0;
|
1113
|
+
}
|
1114
|
+
s = end + 1;
|
1115
|
+
} else {
|
1116
|
+
if (0 == strncasecmp(s, "lt;", 3)) {
|
1117
|
+
c = '<';
|
1118
|
+
s += 3;
|
1119
|
+
} else if (0 == strncasecmp(s, "gt;", 3)) {
|
1120
|
+
c = '>';
|
1121
|
+
s += 3;
|
1122
|
+
} else if (0 == strncasecmp(s, "amp;", 4)) {
|
1123
|
+
c = '&';
|
1124
|
+
s += 4;
|
1125
|
+
} else if (0 == strncasecmp(s, "quot;", 5)) {
|
1126
|
+
c = '"';
|
1127
|
+
s += 5;
|
1128
|
+
} else if (0 == strncasecmp(s, "apos;", 5)) {
|
1129
|
+
c = '\'';
|
1130
|
+
s += 5;
|
1131
|
+
} else if (TolerantEffort == pi->options->effort) {
|
1132
|
+
*b++ = '&';
|
1133
|
+
continue;
|
1134
|
+
} else {
|
1135
|
+
char key[16];
|
1136
|
+
char *k = key;
|
1137
|
+
char *kend = key + sizeof(key) - 1;
|
1138
|
+
|
1139
|
+
*k++ = *s;
|
1140
|
+
while (';' != *s++) {
|
1141
|
+
if ('\0' == *s) {
|
1142
|
+
set_error(&pi->err,
|
1143
|
+
"Invalid format, special character does not end with a semicolon",
|
1144
|
+
pi->str,
|
1145
|
+
pi->s);
|
1146
|
+
return EDOM;
|
1147
|
+
}
|
1148
|
+
if (kend <= k) {
|
1149
|
+
k = key;
|
1150
|
+
break;
|
1151
|
+
}
|
1152
|
+
*k++ = *s;
|
1153
|
+
}
|
1154
|
+
k--;
|
1155
|
+
*k = '\0';
|
1156
|
+
if ('\0' == *key || NULL == (b = ox_entity_lookup(b, key))) {
|
1157
|
+
set_error(&pi->err, "Invalid format, invalid special character sequence", pi->str, pi->s);
|
1158
|
+
c = '?';
|
1159
|
+
return 0;
|
1160
|
+
}
|
1161
|
+
continue;
|
1162
|
+
}
|
1163
|
+
*b++ = (char)c;
|
1164
|
+
}
|
1165
|
+
} else {
|
1166
|
+
*b++ = *s++;
|
1167
|
+
}
|
1206
1168
|
}
|
1207
1169
|
*b = '\0';
|
1208
1170
|
|