ox-bundlecachetest 2.14.23
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +751 -0
- data/LICENSE +21 -0
- data/README.md +351 -0
- data/ext/ox/attr.h +78 -0
- data/ext/ox/base64.c +105 -0
- data/ext/ox/base64.h +18 -0
- data/ext/ox/buf.h +162 -0
- data/ext/ox/builder.c +948 -0
- data/ext/ox/cache.c +351 -0
- data/ext/ox/cache.h +21 -0
- data/ext/ox/cache8.c +106 -0
- data/ext/ox/cache8.h +23 -0
- data/ext/ox/dump.c +1260 -0
- data/ext/ox/err.c +46 -0
- data/ext/ox/err.h +36 -0
- data/ext/ox/extconf.rb +47 -0
- data/ext/ox/gen_load.c +342 -0
- data/ext/ox/hash_load.c +309 -0
- data/ext/ox/helper.h +84 -0
- data/ext/ox/intern.c +157 -0
- data/ext/ox/intern.h +25 -0
- data/ext/ox/obj_load.c +809 -0
- data/ext/ox/ox.c +1649 -0
- data/ext/ox/ox.h +245 -0
- data/ext/ox/parse.c +1197 -0
- data/ext/ox/sax.c +1570 -0
- data/ext/ox/sax.h +69 -0
- data/ext/ox/sax_as.c +270 -0
- data/ext/ox/sax_buf.c +209 -0
- data/ext/ox/sax_buf.h +204 -0
- data/ext/ox/sax_hint.c +207 -0
- data/ext/ox/sax_hint.h +40 -0
- data/ext/ox/sax_stack.h +113 -0
- data/ext/ox/slotcache.c +158 -0
- data/ext/ox/slotcache.h +19 -0
- data/ext/ox/special.c +390 -0
- data/ext/ox/special.h +14 -0
- data/ext/ox/type.h +39 -0
- data/lib/ox/bag.rb +103 -0
- data/lib/ox/cdata.rb +10 -0
- data/lib/ox/comment.rb +11 -0
- data/lib/ox/doctype.rb +11 -0
- data/lib/ox/document.rb +28 -0
- data/lib/ox/element.rb +464 -0
- data/lib/ox/error.rb +25 -0
- data/lib/ox/hasattrs.rb +54 -0
- data/lib/ox/instruct.rb +34 -0
- data/lib/ox/node.rb +23 -0
- data/lib/ox/raw.rb +12 -0
- data/lib/ox/sax.rb +97 -0
- data/lib/ox/version.rb +4 -0
- data/lib/ox/xmlrpc_adapter.rb +33 -0
- data/lib/ox.rb +79 -0
- metadata +128 -0
data/ext/ox/parse.c
ADDED
|
@@ -0,0 +1,1197 @@
|
|
|
1
|
+
/* parse.c
|
|
2
|
+
* Copyright (c) 2011, Peter Ohler
|
|
3
|
+
* All rights reserved.
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
#include <errno.h>
|
|
7
|
+
#include <stdbool.h>
|
|
8
|
+
#include <stdio.h>
|
|
9
|
+
#include <stdlib.h>
|
|
10
|
+
#include <string.h>
|
|
11
|
+
#include <strings.h>
|
|
12
|
+
|
|
13
|
+
#include "attr.h"
|
|
14
|
+
#include "err.h"
|
|
15
|
+
#include "helper.h"
|
|
16
|
+
#include "intern.h"
|
|
17
|
+
#include "ox.h"
|
|
18
|
+
#include "ruby.h"
|
|
19
|
+
#include "special.h"
|
|
20
|
+
|
|
21
|
+
static void mark_pi_cb(void *ptr);
|
|
22
|
+
static void read_instruction(PInfo pi);
|
|
23
|
+
static void read_doctype(PInfo pi);
|
|
24
|
+
static void read_comment(PInfo pi);
|
|
25
|
+
static char *read_element(PInfo pi);
|
|
26
|
+
static void read_text(PInfo pi);
|
|
27
|
+
/*static void read_reduced_text(PInfo pi); */
|
|
28
|
+
static void read_cdata(PInfo pi);
|
|
29
|
+
static char *read_name_token(PInfo pi);
|
|
30
|
+
static char *read_quoted_value(PInfo pi);
|
|
31
|
+
static char *read_hex_uint64(char *b, uint64_t *up);
|
|
32
|
+
static char *read_10_uint64(char *b, uint64_t *up);
|
|
33
|
+
static char *read_coded_chars(PInfo pi, char *text);
|
|
34
|
+
static void next_non_white(PInfo pi);
|
|
35
|
+
static int collapse_special(PInfo pi, char *str);
|
|
36
|
+
|
|
37
|
+
static const rb_data_type_t ox_wrap_type = {
|
|
38
|
+
"Object",
|
|
39
|
+
{
|
|
40
|
+
mark_pi_cb,
|
|
41
|
+
NULL,
|
|
42
|
+
NULL,
|
|
43
|
+
},
|
|
44
|
+
0,
|
|
45
|
+
0,
|
|
46
|
+
};
|
|
47
|
+
|
|
48
|
+
/* This XML parser is a single pass, destructive, callback parser. It is a
|
|
49
|
+
* single pass parse since it only makes one pass over the characters in the
|
|
50
|
+
* XML document string. It is destructive because it re-uses the content of
|
|
51
|
+
* the string for values in the callback and places \0 characters at various
|
|
52
|
+
* places to mark the end of tokens and strings. It is a callback parser like
|
|
53
|
+
* a SAX parser because it uses callbacks when document elements are
|
|
54
|
+
* encountered.
|
|
55
|
+
*
|
|
56
|
+
* Parsing is very tolerant. Lack of headers and even misspelled element
|
|
57
|
+
* endings are passed over without raising an error. A best attempt is made in
|
|
58
|
+
* all cases to parse the string.
|
|
59
|
+
*/
|
|
60
|
+
|
|
61
|
+
/* Lookup table indexed by character code for the low range that read_text()
 * inspects (it indexes this table with values 0 through 0x20). An 'x' entry
 * marks a code that read_text() rejects with "invalid character" when the
 * effort is StrictEffort; an 'o' entry is accepted.
 * NOTE(review): the 'o' entries appear to line up with tab, newline, carriage
 * return and space -- confirm against the XML character rules.
 */
static char xml_valid_lower_chars[34] = "xxxxxxxxxooxxoxxxxxxxxxxxxxxxxxxo";
|
|
62
|
+
|
|
63
|
+
/* Returns 1 when c is a space, tab, form feed, newline or carriage return,
 * otherwise 0.
 */
inline static int is_white(char c) {
    return (' ' == c || '\t' == c || '\f' == c || '\n' == c || '\r' == c) ? 1 : 0;
}
|
|
73
|
+
|
|
74
|
+
/* Advances pi->s past any run of space, tab, form feed, newline and carriage
 * return characters. The cursor is left on the first character that is not
 * one of those, which may be the terminating '\0'.
 */
inline static void next_non_white(PInfo pi) {
    while (' ' == *pi->s || '\t' == *pi->s || '\f' == *pi->s || '\n' == *pi->s || '\r' == *pi->s) {
        pi->s++;
    }
}
|
|
86
|
+
|
|
87
|
+
/* Advances pi->s until it rests on a space, tab, form feed, newline,
 * carriage return or the terminating '\0'. The cursor does not move if it is
 * already on one of those characters.
 */
inline static void next_white(PInfo pi) {
    for (;;) {
        const char ch = *pi->s;

        if ('\0' == ch || ' ' == ch || '\t' == ch || '\f' == ch || '\n' == ch || '\r' == ch) {
            return;
        }
        pi->s++;
    }
}
|
|
100
|
+
|
|
101
|
+
/* Normalizes line endings in place: every "\r\n" pair becomes "\n" and every
 * lone '\r' becomes '\n'. The result is never longer than the input, so the
 * rewrite happens inside buf and the string is re-terminated at its new end.
 *
 * buf must be a writable, NUL terminated string.
 *
 * The standard strchr() is used for the "nothing to do" check so the fast
 * path no longer depends on the legacy index() function being detected at
 * configure time.
 */
static void fix_newlines(char *buf) {
    char *src;
    char *dst;

    if (NULL == strchr(buf, '\r')) {
        return; /* no carriage returns, leave the buffer untouched */
    }
    dst = buf;
    for (src = buf; '\0' != *src; src++) {
        if ('\r' == *src) {
            if ('\n' == *(src + 1)) {
                /* drop the '\r' of a "\r\n" pair, the '\n' is copied next */
                continue;
            }
            *dst = '\n';
        } else if (dst < src) {
            /* only copy once the write position has fallen behind */
            *dst = *src;
        }
        dst++;
    }
    *dst = '\0';
}
|
|
124
|
+
|
|
125
|
+
static void mark_pi_cb(void *ptr) {
|
|
126
|
+
if (NULL != ptr) {
|
|
127
|
+
HelperStack stack = &((PInfo)ptr)->helpers;
|
|
128
|
+
Helper h;
|
|
129
|
+
|
|
130
|
+
for (h = stack->head; h < stack->tail; h++) {
|
|
131
|
+
if (NoCode != h->type) {
|
|
132
|
+
rb_gc_mark(h->obj);
|
|
133
|
+
}
|
|
134
|
+
}
|
|
135
|
+
}
|
|
136
|
+
}
|
|
137
|
+
|
|
138
|
+
VALUE
|
|
139
|
+
ox_parse(char *xml, size_t len, ParseCallbacks pcb, char **endp, Options options, Err err) {
|
|
140
|
+
struct _pInfo pi;
|
|
141
|
+
int body_read = 0;
|
|
142
|
+
int block_given = rb_block_given_p();
|
|
143
|
+
volatile VALUE wrap;
|
|
144
|
+
|
|
145
|
+
if (0 == xml) {
|
|
146
|
+
set_error(err, "Invalid arg, xml string can not be null", xml, 0);
|
|
147
|
+
return Qnil;
|
|
148
|
+
}
|
|
149
|
+
if (DEBUG <= options->trace) {
|
|
150
|
+
printf("Parsing xml:\n%s\n", xml);
|
|
151
|
+
}
|
|
152
|
+
// initialize parse info
|
|
153
|
+
helper_stack_init(&pi.helpers);
|
|
154
|
+
// Protect against GC
|
|
155
|
+
wrap = TypedData_Wrap_Struct(rb_cObject, &ox_wrap_type, &pi);
|
|
156
|
+
|
|
157
|
+
err_init(&pi.err);
|
|
158
|
+
pi.str = xml;
|
|
159
|
+
pi.end = pi.str + len;
|
|
160
|
+
pi.s = xml;
|
|
161
|
+
pi.pcb = pcb;
|
|
162
|
+
pi.obj = Qnil;
|
|
163
|
+
pi.circ_array = 0;
|
|
164
|
+
pi.options = options;
|
|
165
|
+
pi.marked = NULL;
|
|
166
|
+
pi.mark_size = 0;
|
|
167
|
+
pi.mark_cnt = 0;
|
|
168
|
+
while (1) {
|
|
169
|
+
next_non_white(&pi); // skip white space
|
|
170
|
+
if ('\0' == *pi.s) {
|
|
171
|
+
break;
|
|
172
|
+
}
|
|
173
|
+
if (body_read && 0 != endp) {
|
|
174
|
+
*endp = pi.s;
|
|
175
|
+
break;
|
|
176
|
+
}
|
|
177
|
+
if ('<' != *pi.s) { // all top level entities start with <
|
|
178
|
+
set_error(err, "invalid format, expected <", pi.str, pi.s);
|
|
179
|
+
helper_stack_cleanup(&pi.helpers);
|
|
180
|
+
return Qnil;
|
|
181
|
+
}
|
|
182
|
+
pi.s++; // past <
|
|
183
|
+
switch (*pi.s) {
|
|
184
|
+
case '?': // processing instruction
|
|
185
|
+
pi.s++;
|
|
186
|
+
read_instruction(&pi);
|
|
187
|
+
break;
|
|
188
|
+
case '!': // comment or doctype
|
|
189
|
+
pi.s++;
|
|
190
|
+
if ('\0' == *pi.s) {
|
|
191
|
+
set_error(err, "invalid format, DOCTYPE or comment not terminated", pi.str, pi.s);
|
|
192
|
+
helper_stack_cleanup(&pi.helpers);
|
|
193
|
+
return Qnil;
|
|
194
|
+
} else if ('-' == *pi.s) {
|
|
195
|
+
pi.s++; // skip -
|
|
196
|
+
if ('-' != *pi.s) {
|
|
197
|
+
set_error(err, "invalid format, bad comment format", pi.str, pi.s);
|
|
198
|
+
helper_stack_cleanup(&pi.helpers);
|
|
199
|
+
return Qnil;
|
|
200
|
+
} else {
|
|
201
|
+
pi.s++; // skip second -
|
|
202
|
+
read_comment(&pi);
|
|
203
|
+
}
|
|
204
|
+
} else if ((TolerantEffort == options->effort) ? 0 == strncasecmp("DOCTYPE", pi.s, 7)
|
|
205
|
+
: 0 == strncmp("DOCTYPE", pi.s, 7)) {
|
|
206
|
+
pi.s += 7;
|
|
207
|
+
read_doctype(&pi);
|
|
208
|
+
} else {
|
|
209
|
+
set_error(err, "invalid format, DOCTYPE or comment expected", pi.str, pi.s);
|
|
210
|
+
helper_stack_cleanup(&pi.helpers);
|
|
211
|
+
return Qnil;
|
|
212
|
+
}
|
|
213
|
+
break;
|
|
214
|
+
case '\0':
|
|
215
|
+
set_error(err, "invalid format, document not terminated", pi.str, pi.s);
|
|
216
|
+
helper_stack_cleanup(&pi.helpers);
|
|
217
|
+
return Qnil;
|
|
218
|
+
default:
|
|
219
|
+
read_element(&pi);
|
|
220
|
+
body_read = 1;
|
|
221
|
+
break;
|
|
222
|
+
}
|
|
223
|
+
if (err_has(&pi.err)) {
|
|
224
|
+
*err = pi.err;
|
|
225
|
+
helper_stack_cleanup(&pi.helpers);
|
|
226
|
+
return Qnil;
|
|
227
|
+
}
|
|
228
|
+
if (block_given && Qnil != pi.obj && Qundef != pi.obj) {
|
|
229
|
+
if (NULL != pcb->finish) {
|
|
230
|
+
pcb->finish(&pi);
|
|
231
|
+
}
|
|
232
|
+
rb_yield(pi.obj);
|
|
233
|
+
}
|
|
234
|
+
}
|
|
235
|
+
DATA_PTR(wrap) = NULL;
|
|
236
|
+
helper_stack_cleanup(&pi.helpers);
|
|
237
|
+
if (NULL != pcb->finish) {
|
|
238
|
+
pcb->finish(&pi);
|
|
239
|
+
}
|
|
240
|
+
return pi.obj;
|
|
241
|
+
}
|
|
242
|
+
|
|
243
|
+
// Entered after the "<?" sequence. Ready to read the rest.
|
|
244
|
+
static void read_instruction(PInfo pi) {
|
|
245
|
+
char content[256];
|
|
246
|
+
char *content_ptr;
|
|
247
|
+
struct _attrStack attrs;
|
|
248
|
+
char *attr_name;
|
|
249
|
+
char *attr_value;
|
|
250
|
+
char *target;
|
|
251
|
+
char *end;
|
|
252
|
+
char c;
|
|
253
|
+
char *cend;
|
|
254
|
+
size_t size;
|
|
255
|
+
bool attrs_ok = true;
|
|
256
|
+
|
|
257
|
+
*content = '\0';
|
|
258
|
+
attr_stack_init(&attrs);
|
|
259
|
+
if (0 == (target = read_name_token(pi))) {
|
|
260
|
+
return;
|
|
261
|
+
}
|
|
262
|
+
end = pi->s;
|
|
263
|
+
for (; true; pi->s++) {
|
|
264
|
+
switch (*pi->s) {
|
|
265
|
+
case '?':
|
|
266
|
+
if ('>' == *(pi->s + 1)) {
|
|
267
|
+
pi->s++;
|
|
268
|
+
goto DONE;
|
|
269
|
+
}
|
|
270
|
+
break;
|
|
271
|
+
case '\0': set_error(&pi->err, "processing instruction not terminated", pi->str, pi->s); return;
|
|
272
|
+
default: break;
|
|
273
|
+
}
|
|
274
|
+
}
|
|
275
|
+
DONE:
|
|
276
|
+
cend = pi->s;
|
|
277
|
+
size = cend - end - 1;
|
|
278
|
+
pi->s = end;
|
|
279
|
+
if (size < sizeof(content)) {
|
|
280
|
+
content_ptr = content;
|
|
281
|
+
} else {
|
|
282
|
+
content_ptr = ALLOC_N(char, size + 1);
|
|
283
|
+
}
|
|
284
|
+
memcpy(content_ptr, end, size);
|
|
285
|
+
content_ptr[size] = '\0';
|
|
286
|
+
|
|
287
|
+
next_non_white(pi);
|
|
288
|
+
c = *pi->s;
|
|
289
|
+
*end = '\0'; // terminate name
|
|
290
|
+
if ('?' != c) {
|
|
291
|
+
while ('?' != c) {
|
|
292
|
+
pi->last = 0;
|
|
293
|
+
if ('\0' == *pi->s) {
|
|
294
|
+
attr_stack_cleanup(&attrs);
|
|
295
|
+
set_error(&pi->err, "invalid format, processing instruction not terminated", pi->str, pi->s);
|
|
296
|
+
return;
|
|
297
|
+
}
|
|
298
|
+
next_non_white(pi);
|
|
299
|
+
if (0 == (attr_name = read_name_token(pi))) {
|
|
300
|
+
attr_stack_cleanup(&attrs);
|
|
301
|
+
return;
|
|
302
|
+
}
|
|
303
|
+
end = pi->s;
|
|
304
|
+
next_non_white(pi);
|
|
305
|
+
if ('=' != *pi->s++) {
|
|
306
|
+
attrs_ok = false;
|
|
307
|
+
break;
|
|
308
|
+
}
|
|
309
|
+
*end = '\0'; // terminate name
|
|
310
|
+
// read value
|
|
311
|
+
next_non_white(pi);
|
|
312
|
+
if (0 == (attr_value = read_quoted_value(pi))) {
|
|
313
|
+
attr_stack_cleanup(&attrs);
|
|
314
|
+
return;
|
|
315
|
+
}
|
|
316
|
+
attr_stack_push(&attrs, attr_name, attr_value);
|
|
317
|
+
next_non_white(pi);
|
|
318
|
+
if ('\0' == pi->last) {
|
|
319
|
+
c = *pi->s;
|
|
320
|
+
} else {
|
|
321
|
+
c = pi->last;
|
|
322
|
+
}
|
|
323
|
+
}
|
|
324
|
+
if ('?' == *pi->s) {
|
|
325
|
+
pi->s++;
|
|
326
|
+
}
|
|
327
|
+
} else {
|
|
328
|
+
pi->s++;
|
|
329
|
+
}
|
|
330
|
+
if (attrs_ok) {
|
|
331
|
+
if ('>' != *pi->s++) {
|
|
332
|
+
attr_stack_cleanup(&attrs);
|
|
333
|
+
set_error(&pi->err, "invalid format, processing instruction not terminated", pi->str, pi->s);
|
|
334
|
+
return;
|
|
335
|
+
}
|
|
336
|
+
} else {
|
|
337
|
+
pi->s = cend + 1;
|
|
338
|
+
}
|
|
339
|
+
if (0 != pi->pcb->instruct) {
|
|
340
|
+
if (attrs_ok) {
|
|
341
|
+
pi->pcb->instruct(pi, target, attrs.head, 0);
|
|
342
|
+
} else {
|
|
343
|
+
pi->pcb->instruct(pi, target, attrs.head, content_ptr);
|
|
344
|
+
}
|
|
345
|
+
} else {
|
|
346
|
+
for (Attr a = attrs.head; a < attrs.tail; a++) {
|
|
347
|
+
if (0 == strcasecmp(a->name, "encoding")) {
|
|
348
|
+
strncpy(pi->options->encoding, a->value, sizeof(pi->options->encoding) - 1);
|
|
349
|
+
pi->options->encoding[sizeof(pi->options->encoding) - 1] = '\0';
|
|
350
|
+
pi->options->rb_enc = rb_enc_find(a->value);
|
|
351
|
+
break;
|
|
352
|
+
}
|
|
353
|
+
}
|
|
354
|
+
}
|
|
355
|
+
attr_stack_cleanup(&attrs);
|
|
356
|
+
if (content_ptr != content) {
|
|
357
|
+
xfree(content_ptr);
|
|
358
|
+
}
|
|
359
|
+
}
|
|
360
|
+
|
|
361
|
+
// Advances pi->s to just past the next occurrence of the closing delimiter
// end.
//
// When end is a quote character everything up to the matching quote is
// skipped verbatim. For any other delimiter, nested quoted strings, [...]
// groups and <...> groups are skipped recursively so that an end character
// inside them does not close the outer region.
//
// Reaching the end of the input records an error in pi->err. In that case
// pi->s has already moved past the terminating '\0', so when a nested call
// fails this function returns at once rather than reading further, which the
// earlier version did.
static void read_delimited(PInfo pi, char end) {
    char c;

    if ('"' == end || '\'' == end) {
        for (c = *pi->s++; end != c; c = *pi->s++) {
            if ('\0' == c) {
                set_error(&pi->err, "invalid format, dectype not terminated", pi->str, pi->s);
                return;
            }
        }
        return;
    }
    while (1) {
        char nested = '\0';  // closing delimiter of a nested region, if any

        c = *pi->s++;
        if (end == c) {
            return;
        }
        switch (c) {
        case '\0': set_error(&pi->err, "invalid format, dectype not terminated", pi->str, pi->s); return;
        case '"':
        case '\'': nested = c; break;
        case '[': nested = ']'; break;
        case '<': nested = '>'; break;
        default: break;
        }
        if ('\0' != nested) {
            read_delimited(pi, nested);
            if (err_has(&pi->err)) {
                // input ended inside the nested region, nothing left to scan
                return;
            }
        }
    }
}
|
|
388
|
+
|
|
389
|
+
// Entered after the "<!DOCTYPE" sequence plus the first character after
|
|
390
|
+
// that. Ready to read the rest.
|
|
391
|
+
static void read_doctype(PInfo pi) {
|
|
392
|
+
char *doctype;
|
|
393
|
+
|
|
394
|
+
next_non_white(pi);
|
|
395
|
+
doctype = pi->s;
|
|
396
|
+
read_delimited(pi, '>');
|
|
397
|
+
if (err_has(&pi->err)) {
|
|
398
|
+
return;
|
|
399
|
+
}
|
|
400
|
+
pi->s--;
|
|
401
|
+
*pi->s = '\0';
|
|
402
|
+
pi->s++;
|
|
403
|
+
if (0 != pi->pcb->add_doctype) {
|
|
404
|
+
fix_newlines(doctype);
|
|
405
|
+
pi->pcb->add_doctype(pi, doctype);
|
|
406
|
+
}
|
|
407
|
+
}
|
|
408
|
+
|
|
409
|
+
// Entered after "<!--". Returns error code.
|
|
410
|
+
static void read_comment(PInfo pi) {
|
|
411
|
+
char *end;
|
|
412
|
+
char *s;
|
|
413
|
+
char *comment;
|
|
414
|
+
int done = 0;
|
|
415
|
+
|
|
416
|
+
next_non_white(pi);
|
|
417
|
+
comment = pi->s;
|
|
418
|
+
end = strstr(pi->s, "-->");
|
|
419
|
+
if (0 == end) {
|
|
420
|
+
set_error(&pi->err, "invalid format, comment not terminated", pi->str, pi->s);
|
|
421
|
+
return;
|
|
422
|
+
}
|
|
423
|
+
for (s = end - 1; pi->s < s && !done; s--) {
|
|
424
|
+
switch (*s) {
|
|
425
|
+
case ' ':
|
|
426
|
+
case '\t':
|
|
427
|
+
case '\f':
|
|
428
|
+
case '\n':
|
|
429
|
+
case '\r': break;
|
|
430
|
+
default:
|
|
431
|
+
*(s + 1) = '\0';
|
|
432
|
+
done = 1;
|
|
433
|
+
break;
|
|
434
|
+
}
|
|
435
|
+
}
|
|
436
|
+
*end = '\0'; // in case the comment was blank
|
|
437
|
+
pi->s = end + 3;
|
|
438
|
+
if (0 != pi->pcb->add_comment) {
|
|
439
|
+
fix_newlines(comment);
|
|
440
|
+
pi->pcb->add_comment(pi, comment);
|
|
441
|
+
}
|
|
442
|
+
}
|
|
443
|
+
|
|
444
|
+
// Entered after the '<' and the first character after that. Returns stat
|
|
445
|
+
// code.
|
|
446
|
+
static char *read_element(PInfo pi) {
|
|
447
|
+
struct _attrStack attrs;
|
|
448
|
+
const char *attr_name;
|
|
449
|
+
const char *attr_value;
|
|
450
|
+
char *name;
|
|
451
|
+
char *ename;
|
|
452
|
+
char *end;
|
|
453
|
+
char c;
|
|
454
|
+
long elen;
|
|
455
|
+
int hasChildren = 0;
|
|
456
|
+
int done = 0;
|
|
457
|
+
|
|
458
|
+
attr_stack_init(&attrs);
|
|
459
|
+
if (0 == (ename = read_name_token(pi))) {
|
|
460
|
+
return 0;
|
|
461
|
+
}
|
|
462
|
+
end = pi->s;
|
|
463
|
+
elen = end - ename;
|
|
464
|
+
next_non_white(pi);
|
|
465
|
+
c = *pi->s;
|
|
466
|
+
*end = '\0';
|
|
467
|
+
if ('/' == c) {
|
|
468
|
+
// empty element, no attributes and no children
|
|
469
|
+
pi->s++;
|
|
470
|
+
if ('>' != *pi->s) {
|
|
471
|
+
attr_stack_cleanup(&attrs);
|
|
472
|
+
set_error(&pi->err, "invalid format, element not closed", pi->str, pi->s);
|
|
473
|
+
return 0;
|
|
474
|
+
}
|
|
475
|
+
pi->s++; /* past > */
|
|
476
|
+
pi->pcb->add_element(pi, ename, attrs.head, hasChildren);
|
|
477
|
+
pi->pcb->end_element(pi, ename);
|
|
478
|
+
|
|
479
|
+
attr_stack_cleanup(&attrs);
|
|
480
|
+
return 0;
|
|
481
|
+
}
|
|
482
|
+
/* read attribute names until the close (/ or >) is reached */
|
|
483
|
+
while (!done) {
|
|
484
|
+
if ('\0' == c) {
|
|
485
|
+
if (pi->end <= pi->s) {
|
|
486
|
+
break;
|
|
487
|
+
}
|
|
488
|
+
next_non_white(pi);
|
|
489
|
+
c = *pi->s;
|
|
490
|
+
}
|
|
491
|
+
pi->last = 0;
|
|
492
|
+
switch (c) {
|
|
493
|
+
case '\0':
|
|
494
|
+
attr_stack_cleanup(&attrs);
|
|
495
|
+
set_error(&pi->err, "invalid format, document not terminated", pi->str, pi->s);
|
|
496
|
+
return 0;
|
|
497
|
+
case '/':
|
|
498
|
+
/* Element with just attributes. */
|
|
499
|
+
pi->s++;
|
|
500
|
+
if ('>' != *pi->s) {
|
|
501
|
+
attr_stack_cleanup(&attrs);
|
|
502
|
+
set_error(&pi->err, "invalid format, element not closed", pi->str, pi->s);
|
|
503
|
+
return 0;
|
|
504
|
+
}
|
|
505
|
+
pi->s++;
|
|
506
|
+
pi->pcb->add_element(pi, ename, attrs.head, hasChildren);
|
|
507
|
+
pi->pcb->end_element(pi, ename);
|
|
508
|
+
attr_stack_cleanup(&attrs);
|
|
509
|
+
|
|
510
|
+
return 0;
|
|
511
|
+
case '>':
|
|
512
|
+
/* has either children or a value */
|
|
513
|
+
pi->s++;
|
|
514
|
+
hasChildren = 1;
|
|
515
|
+
done = 1;
|
|
516
|
+
pi->pcb->add_element(pi, ename, attrs.head, hasChildren);
|
|
517
|
+
|
|
518
|
+
break;
|
|
519
|
+
default:
|
|
520
|
+
/* Attribute name so it's an element and the attribute will be */
|
|
521
|
+
/* added to it. */
|
|
522
|
+
if (0 == (attr_name = read_name_token(pi))) {
|
|
523
|
+
attr_stack_cleanup(&attrs);
|
|
524
|
+
return 0;
|
|
525
|
+
}
|
|
526
|
+
end = pi->s;
|
|
527
|
+
next_non_white(pi);
|
|
528
|
+
if ('=' != *pi->s++) {
|
|
529
|
+
if (TolerantEffort == pi->options->effort) {
|
|
530
|
+
pi->s--;
|
|
531
|
+
pi->last = *pi->s;
|
|
532
|
+
*end = '\0'; /* terminate name */
|
|
533
|
+
attr_value = "";
|
|
534
|
+
attr_stack_push(&attrs, attr_name, attr_value);
|
|
535
|
+
break;
|
|
536
|
+
} else {
|
|
537
|
+
attr_stack_cleanup(&attrs);
|
|
538
|
+
set_error(&pi->err, "invalid format, no attribute value", pi->str, pi->s);
|
|
539
|
+
return 0;
|
|
540
|
+
}
|
|
541
|
+
}
|
|
542
|
+
*end = '\0'; /* terminate name */
|
|
543
|
+
/* read value */
|
|
544
|
+
next_non_white(pi);
|
|
545
|
+
if (0 == (attr_value = read_quoted_value(pi))) {
|
|
546
|
+
return 0;
|
|
547
|
+
}
|
|
548
|
+
if (pi->options->convert_special && 0 != strchr(attr_value, '&')) {
|
|
549
|
+
if (0 != collapse_special(pi, (char *)attr_value) || err_has(&pi->err)) {
|
|
550
|
+
attr_stack_cleanup(&attrs);
|
|
551
|
+
return 0;
|
|
552
|
+
}
|
|
553
|
+
}
|
|
554
|
+
attr_stack_push(&attrs, attr_name, attr_value);
|
|
555
|
+
break;
|
|
556
|
+
}
|
|
557
|
+
if ('\0' == pi->last) {
|
|
558
|
+
c = '\0';
|
|
559
|
+
} else {
|
|
560
|
+
c = pi->last;
|
|
561
|
+
pi->last = '\0';
|
|
562
|
+
}
|
|
563
|
+
}
|
|
564
|
+
if (hasChildren) {
|
|
565
|
+
char *start;
|
|
566
|
+
int first = 1;
|
|
567
|
+
|
|
568
|
+
done = 0;
|
|
569
|
+
/* read children */
|
|
570
|
+
while (!done) {
|
|
571
|
+
start = pi->s;
|
|
572
|
+
next_non_white(pi);
|
|
573
|
+
if (OffSkip == pi->options->skip && start < pi->s && '<' == *pi->s) {
|
|
574
|
+
c = *pi->s;
|
|
575
|
+
*pi->s = '\0';
|
|
576
|
+
pi->pcb->add_text(pi, start, 1);
|
|
577
|
+
*pi->s = c;
|
|
578
|
+
}
|
|
579
|
+
c = *pi->s++;
|
|
580
|
+
if ('\0' == c) {
|
|
581
|
+
attr_stack_cleanup(&attrs);
|
|
582
|
+
set_error(&pi->err, "invalid format, document not terminated", pi->str, pi->s);
|
|
583
|
+
return 0;
|
|
584
|
+
}
|
|
585
|
+
if ('<' == c) {
|
|
586
|
+
char *slash;
|
|
587
|
+
|
|
588
|
+
switch (*pi->s) {
|
|
589
|
+
case '!': /* better be a comment or CDATA */
|
|
590
|
+
pi->s++;
|
|
591
|
+
if ('-' == *pi->s && '-' == *(pi->s + 1)) {
|
|
592
|
+
pi->s += 2;
|
|
593
|
+
read_comment(pi);
|
|
594
|
+
} else if ((TolerantEffort == pi->options->effort) ? 0 == strncasecmp("[CDATA[", pi->s, 7)
|
|
595
|
+
: 0 == strncmp("[CDATA[", pi->s, 7)) {
|
|
596
|
+
pi->s += 7;
|
|
597
|
+
read_cdata(pi);
|
|
598
|
+
} else {
|
|
599
|
+
attr_stack_cleanup(&attrs);
|
|
600
|
+
set_error(&pi->err, "invalid format, invalid comment or CDATA format", pi->str, pi->s);
|
|
601
|
+
return 0;
|
|
602
|
+
}
|
|
603
|
+
break;
|
|
604
|
+
case '?': /* processing instruction */
|
|
605
|
+
pi->s++;
|
|
606
|
+
read_instruction(pi);
|
|
607
|
+
break;
|
|
608
|
+
case '/':
|
|
609
|
+
slash = pi->s;
|
|
610
|
+
pi->s++;
|
|
611
|
+
if (0 == (name = read_name_token(pi))) {
|
|
612
|
+
attr_stack_cleanup(&attrs);
|
|
613
|
+
return 0;
|
|
614
|
+
}
|
|
615
|
+
end = pi->s;
|
|
616
|
+
next_non_white(pi);
|
|
617
|
+
c = *pi->s;
|
|
618
|
+
*end = '\0';
|
|
619
|
+
if (0 !=
|
|
620
|
+
((TolerantEffort == pi->options->effort) ? strcasecmp(name, ename) : strcmp(name, ename))) {
|
|
621
|
+
attr_stack_cleanup(&attrs);
|
|
622
|
+
if (TolerantEffort == pi->options->effort) {
|
|
623
|
+
pi->pcb->end_element(pi, ename);
|
|
624
|
+
return name;
|
|
625
|
+
} else {
|
|
626
|
+
set_error(&pi->err, "invalid format, elements overlap", pi->str, pi->s);
|
|
627
|
+
return 0;
|
|
628
|
+
}
|
|
629
|
+
}
|
|
630
|
+
if ('>' != c) {
|
|
631
|
+
attr_stack_cleanup(&attrs);
|
|
632
|
+
set_error(&pi->err, "invalid format, element not closed", pi->str, pi->s);
|
|
633
|
+
return 0;
|
|
634
|
+
}
|
|
635
|
+
if (first && start != slash - 1) {
|
|
636
|
+
// Some white space between start and here so add as
|
|
637
|
+
// text after checking skip.
|
|
638
|
+
*(slash - 1) = '\0';
|
|
639
|
+
switch (pi->options->skip) {
|
|
640
|
+
case CrSkip: {
|
|
641
|
+
char *s = start;
|
|
642
|
+
char *e = start;
|
|
643
|
+
|
|
644
|
+
for (; '\0' != *e; e++) {
|
|
645
|
+
if ('\r' != *e) {
|
|
646
|
+
*s++ = *e;
|
|
647
|
+
}
|
|
648
|
+
}
|
|
649
|
+
*s = '\0';
|
|
650
|
+
break;
|
|
651
|
+
}
|
|
652
|
+
case SpcSkip: *start = '\0'; break;
|
|
653
|
+
case NoSkip:
|
|
654
|
+
case OffSkip:
|
|
655
|
+
default: break;
|
|
656
|
+
}
|
|
657
|
+
if ('\0' != *start) {
|
|
658
|
+
pi->pcb->add_text(pi, start, 1);
|
|
659
|
+
}
|
|
660
|
+
}
|
|
661
|
+
pi->s++;
|
|
662
|
+
pi->pcb->end_element(pi, ename);
|
|
663
|
+
attr_stack_cleanup(&attrs);
|
|
664
|
+
return 0;
|
|
665
|
+
case '\0':
|
|
666
|
+
attr_stack_cleanup(&attrs);
|
|
667
|
+
if (TolerantEffort == pi->options->effort) {
|
|
668
|
+
return 0;
|
|
669
|
+
} else {
|
|
670
|
+
set_error(&pi->err, "invalid format, document not terminated", pi->str, pi->s);
|
|
671
|
+
return 0;
|
|
672
|
+
}
|
|
673
|
+
default:
|
|
674
|
+
first = 0;
|
|
675
|
+
/* a child element */
|
|
676
|
+
// Child closed with mismatched name.
|
|
677
|
+
if (0 != (name = read_element(pi))) {
|
|
678
|
+
attr_stack_cleanup(&attrs);
|
|
679
|
+
|
|
680
|
+
if (0 ==
|
|
681
|
+
((TolerantEffort == pi->options->effort) ? strcasecmp(name, ename) : strcmp(name, ename))) {
|
|
682
|
+
pi->s++;
|
|
683
|
+
pi->pcb->end_element(pi, ename);
|
|
684
|
+
return 0;
|
|
685
|
+
} else { // not the correct element yet
|
|
686
|
+
pi->pcb->end_element(pi, ename);
|
|
687
|
+
return name;
|
|
688
|
+
}
|
|
689
|
+
} else if (err_has(&pi->err)) {
|
|
690
|
+
return 0;
|
|
691
|
+
}
|
|
692
|
+
break;
|
|
693
|
+
}
|
|
694
|
+
} else { /* read as TEXT */
|
|
695
|
+
char prev = *(start - 1);
|
|
696
|
+
|
|
697
|
+
pi->s = start;
|
|
698
|
+
if ('>' != prev && (' ' <= prev || is_white(prev))) {
|
|
699
|
+
pi->s--;
|
|
700
|
+
}
|
|
701
|
+
read_text(pi);
|
|
702
|
+
/*read_reduced_text(pi); */
|
|
703
|
+
|
|
704
|
+
/* to exit read_text with no errors the next character must be < */
|
|
705
|
+
if ('/' == *(pi->s + 1) &&
|
|
706
|
+
0 == ((TolerantEffort == pi->options->effort) ? strncasecmp(ename, pi->s + 2, elen)
|
|
707
|
+
: strncmp(ename, pi->s + 2, elen)) &&
|
|
708
|
+
'>' == *(pi->s + elen + 2)) {
|
|
709
|
+
/* close tag after text so treat as a value */
|
|
710
|
+
pi->s += elen + 3;
|
|
711
|
+
pi->pcb->end_element(pi, ename);
|
|
712
|
+
attr_stack_cleanup(&attrs);
|
|
713
|
+
return 0;
|
|
714
|
+
}
|
|
715
|
+
}
|
|
716
|
+
}
|
|
717
|
+
}
|
|
718
|
+
attr_stack_cleanup(&attrs);
|
|
719
|
+
return 0;
|
|
720
|
+
}
|
|
721
|
+
|
|
722
|
+
/* Reads element text starting at pi->s up to, but not including, the next
 * '<' and passes it to the add_text callback.
 *
 * The text is assembled in a local buffer that is replaced by a growing heap
 * buffer when it fills up. '&' sequences are decoded by read_coded_chars().
 * Characters in the range 0 to 0x20 are checked against xml_valid_lower_chars
 * when the effort is StrictEffort and are then copied according to
 * pi->options->skip:
 *   CrSkip   a '\n' that directly follows a stored '\r' replaces that '\r'
 *   SpcSkip  a white space character is stored as a single ' ' unless the
 *            previous stored character already is one
 *   others   copied unchanged
 * The last argument to add_text tells whether a '/' follows the '<' that
 * ended the text.
 *
 * Errors are recorded in pi->err. All error exits release the heap buffer,
 * which previously leaked, and hitting the end of the input leaves pi->s on
 * the terminating '\0' instead of one past it.
 */
static void read_text(PInfo pi) {
    char  buf[MAX_TEXT_LEN];
    char *b         = buf;
    char *alloc_buf = 0;
    char *end       = b + sizeof(buf) - 2;
    char *text;
    char  c;
    int   done = 0;

    while (!done) {
        c = *pi->s++;
        switch (c) {
        case '<':
            done = 1;
            pi->s--;
            break;
        case '\0':
            set_error(&pi->err, "invalid format, document not terminated", pi->str, pi->s);
            pi->s--; /* stay on the terminator so callers do not run past it */
            goto fail;
        default:
            if (end <= (b + (('&' == c) ? 7 : 0))) { /* extra 8 for special just in case it is sequence of bytes */
                unsigned long size;

                if (0 == alloc_buf) {
                    /* first overflow, move from the stack buffer to the heap */
                    size      = sizeof(buf) * 2;
                    alloc_buf = ALLOC_N(char, size);
                    memcpy(alloc_buf, buf, b - buf);
                    b = alloc_buf + (b - buf);
                } else {
                    unsigned long pos = b - alloc_buf;

                    size = (end - alloc_buf) * 2;
                    REALLOC_N(alloc_buf, char, size);
                    b = alloc_buf + pos;
                }
                end = alloc_buf + size - 2;
            }
            if ('&' == c) {
                if (0 == (b = read_coded_chars(pi, b))) {
                    goto fail;
                }
            } else {
                if (0 <= c && c <= 0x20) {
                    if (StrictEffort == pi->options->effort && 'x' == xml_valid_lower_chars[(unsigned char)c]) {
                        set_error(&pi->err, "invalid character", pi->str, pi->s);
                        goto fail;
                    }
                    switch (pi->options->skip) {
                    case CrSkip:
                        if (buf != b && '\n' == c && '\r' == *(b - 1)) {
                            *(b - 1) = '\n';
                        } else {
                            *b++ = c;
                        }
                        break;
                    case SpcSkip:
                        if (is_white(c)) {
                            if (buf == b || ' ' != *(b - 1)) {
                                *b++ = ' ';
                            }
                        } else {
                            *b++ = c;
                        }
                        break;
                    case NoSkip:
                    case OffSkip:
                    default: *b++ = c; break;
                    }
                } else {
                    *b++ = c;
                }
            }
            break;
        }
    }
    *b   = '\0';
    text = (0 != alloc_buf) ? alloc_buf : buf;
    fix_newlines(text);
    pi->pcb->add_text(pi, text, ('/' == *(pi->s + 1)));
fail:
    if (0 != alloc_buf) {
        xfree(alloc_buf);
    }
}
|
|
804
|
+
|
|
805
|
+
#if 0
|
|
806
|
+
static void
|
|
807
|
+
read_reduced_text(PInfo pi) {
|
|
808
|
+
char buf[MAX_TEXT_LEN];
|
|
809
|
+
char *b = buf;
|
|
810
|
+
char *alloc_buf = 0;
|
|
811
|
+
char *end = b + sizeof(buf) - 2;
|
|
812
|
+
char c;
|
|
813
|
+
int spc = 0;
|
|
814
|
+
int done = 0;
|
|
815
|
+
|
|
816
|
+
while (!done) {
|
|
817
|
+
c = *pi->s++;
|
|
818
|
+
switch(c) {
|
|
819
|
+
case ' ':
|
|
820
|
+
case '\t':
|
|
821
|
+
case '\f':
|
|
822
|
+
case '\n':
|
|
823
|
+
case '\r':
|
|
824
|
+
spc = 1;
|
|
825
|
+
break;
|
|
826
|
+
case '<':
|
|
827
|
+
done = 1;
|
|
828
|
+
pi->s--;
|
|
829
|
+
break;
|
|
830
|
+
case '\0':
|
|
831
|
+
set_error(&pi->err, "invalid format, document not terminated", pi->str, pi->s);
|
|
832
|
+
return;
|
|
833
|
+
default:
|
|
834
|
+
if (end <= (b + spc + (('&' == c) ? 7 : 0))) { /* extra 8 for special just in case it is sequence of bytes */
|
|
835
|
+
unsigned long size;
|
|
836
|
+
|
|
837
|
+
if (0 == alloc_buf) {
|
|
838
|
+
size = sizeof(buf) * 2;
|
|
839
|
+
alloc_buf = ALLOC_N(char, size);
|
|
840
|
+
memcpy(alloc_buf, buf, b - buf);
|
|
841
|
+
b = alloc_buf + (b - buf);
|
|
842
|
+
} else {
|
|
843
|
+
unsigned long pos = b - alloc_buf;
|
|
844
|
+
|
|
845
|
+
size = (end - alloc_buf) * 2;
|
|
846
|
+
REALLOC(alloc_buf, char, size);
|
|
847
|
+
b = alloc_buf + pos;
|
|
848
|
+
}
|
|
849
|
+
end = alloc_buf + size - 2;
|
|
850
|
+
}
|
|
851
|
+
if (spc) {
|
|
852
|
+
*b++ = ' ';
|
|
853
|
+
}
|
|
854
|
+
spc = 0;
|
|
855
|
+
if ('&' == c) {
|
|
856
|
+
if (0 == (b = read_coded_chars(pi, b))) {
|
|
857
|
+
return;
|
|
858
|
+
}
|
|
859
|
+
} else {
|
|
860
|
+
*b++ = c;
|
|
861
|
+
}
|
|
862
|
+
break;
|
|
863
|
+
}
|
|
864
|
+
}
|
|
865
|
+
*b = '\0';
|
|
866
|
+
if (0 != alloc_buf) {
|
|
867
|
+
fix_newlines(alloc_buf);
|
|
868
|
+
pi->pcb->add_text(pi, alloc_buf, ('/' == *(pi->s + 1)));
|
|
869
|
+
xfree(alloc_buf);
|
|
870
|
+
} else {
|
|
871
|
+
fix_newlines(buf);
|
|
872
|
+
pi->pcb->add_text(pi, buf, ('/' == *(pi->s + 1)));
|
|
873
|
+
}
|
|
874
|
+
}
|
|
875
|
+
#endif
|
|
876
|
+
|
|
877
|
+
/* Skips leading white space and scans a name token. On return pi->s rests on
 * the delimiter that ended the name (white space, '?', '=', '/' or '>'); the
 * name is not terminated here, callers write the '\0' themselves.
 *
 * Returns the start of the name, or 0 after recording an error in pi->err
 * when the input ends inside the name.
 *
 * Namespace stripping is controlled by pi->options->strip_ns:
 *   ""     nothing is stripped
 *   "*"    everything up to and including each ':' is stripped
 *   other  the prefix is stripped only when it equals strip_ns exactly
 * The exact comparison also checks that strip_ns ends where the prefix ends.
 * Previously only the first (prefix length) characters were compared, so a
 * strip_ns of "foobar" also stripped a "foo:" prefix.
 */
static char *read_name_token(PInfo pi) {
    char *start;

    next_non_white(pi);
    start = pi->s;
    for (;; pi->s++) {
        switch (*pi->s) {
        case ' ':
        case '\t':
        case '\f':
        case '?':
        case '=':
        case '/':
        case '>':
        case '\n':
        case '\r': return start;
        case '\0':
            /* documents never terminate after a name token */
            set_error(&pi->err, "invalid format, document not terminated", pi->str, pi->s);
            return 0;
        case ':': {
            const char *ns   = pi->options->strip_ns;
            size_t      plen = (size_t)(pi->s - start);

            if ('\0' == *ns) {
                break;
            }
            if ('*' == ns[0] && '\0' == ns[1]) {
                start = pi->s + 1;
            } else if (0 == strncmp(ns, start, plen) && '\0' == ns[plen]) {
                /* ns[plen] is only read once the first plen characters matched */
                start = pi->s + 1;
            }
            break;
        }
        default: break;
        }
    }
}
|
|
912
|
+
|
|
913
|
+
/* Entered after "<![CDATA[". Finds the closing "]]>", terminates the data in
 * place and moves pi->s past the terminator. When an add_cdata callback is
 * set, the data is newline normalized and passed along with the distance
 * between its start and the "]]>" marker. A missing "]]>" is recorded in
 * pi->err.
 */
static void read_cdata(PInfo pi) {
    char *data  = pi->s;
    char *close = strstr(pi->s, "]]>");

    if (0 == close) {
        set_error(&pi->err, "invalid format, CDATA not terminated", pi->str, pi->s);
        return;
    }
    *close = '\0';
    pi->s  = close + 3;
    if (0 == pi->pcb->add_cdata) {
        return;
    }
    fix_newlines(data);
    /* the length is measured between the pointers, not with strlen() */
    pi->pcb->add_cdata(pi, data, close - data);
}
|
|
930
|
+
|
|
931
|
+
/* Assume the value starts immediately and goes until the quote character is
|
|
932
|
+
* reached again. Do not read the character after the terminating quote.
|
|
933
|
+
*/
|
|
934
|
+
static char *read_quoted_value(PInfo pi) {
|
|
935
|
+
char *value = 0;
|
|
936
|
+
|
|
937
|
+
if ('"' == *pi->s || '\'' == *pi->s) {
|
|
938
|
+
char term = *pi->s;
|
|
939
|
+
|
|
940
|
+
pi->s++; /* skip quote character */
|
|
941
|
+
value = pi->s;
|
|
942
|
+
for (; *pi->s != term; pi->s++) {
|
|
943
|
+
if ('\0' == *pi->s) {
|
|
944
|
+
set_error(&pi->err, "invalid format, document not terminated", pi->str, pi->s);
|
|
945
|
+
return 0;
|
|
946
|
+
}
|
|
947
|
+
}
|
|
948
|
+
*pi->s = '\0'; /* terminate value */
|
|
949
|
+
pi->s++; /* move past quote */
|
|
950
|
+
} else if (StrictEffort == pi->options->effort) {
|
|
951
|
+
set_error(&pi->err, "invalid format, expected a quote character", pi->str, pi->s);
|
|
952
|
+
return 0;
|
|
953
|
+
} else if (TolerantEffort == pi->options->effort) {
|
|
954
|
+
value = pi->s;
|
|
955
|
+
for (; 1; pi->s++) {
|
|
956
|
+
switch (*pi->s) {
|
|
957
|
+
case '\0': set_error(&pi->err, "invalid format, document not terminated", pi->str, pi->s); return 0;
|
|
958
|
+
case ' ':
|
|
959
|
+
case '/':
|
|
960
|
+
case '>':
|
|
961
|
+
case '?': // for instructions
|
|
962
|
+
case '\t':
|
|
963
|
+
case '\n':
|
|
964
|
+
case '\r':
|
|
965
|
+
pi->last = *pi->s;
|
|
966
|
+
*pi->s = '\0'; /* terminate value */
|
|
967
|
+
pi->s++;
|
|
968
|
+
return value;
|
|
969
|
+
default: break;
|
|
970
|
+
}
|
|
971
|
+
}
|
|
972
|
+
} else {
|
|
973
|
+
value = pi->s;
|
|
974
|
+
next_white(pi);
|
|
975
|
+
if ('\0' == *pi->s) {
|
|
976
|
+
set_error(&pi->err, "invalid format, document not terminated", pi->str, pi->s);
|
|
977
|
+
return 0;
|
|
978
|
+
}
|
|
979
|
+
*pi->s++ = '\0'; /* terminate value */
|
|
980
|
+
}
|
|
981
|
+
return value;
|
|
982
|
+
}
|
|
983
|
+
|
|
984
|
+
/* Parses the hexadecimal digits of a numeric character reference.
 *
 * b  - first digit; the digits must be followed by ';'
 * up - receives the parsed value, written only on success
 *
 * Returns a pointer to the terminating ';', or 0 when a non-hex character
 * (including the end of the string) is met before ';', when there are no
 * digits at all, or when the value does not fit in 64 bits.
 */
static char *read_hex_uint64(char *b, uint64_t *up) {
    uint64_t u     = 0;
    char    *first = b;

    for (; ';' != *b; b++) {
        char     c = *b;
        uint64_t nibble;

        if ('0' <= c && c <= '9') {
            nibble = (uint64_t)(c - '0');
        } else if ('a' <= c && c <= 'f') {
            nibble = (uint64_t)(c - 'a' + 10);
        } else if ('A' <= c && c <= 'F') {
            nibble = (uint64_t)(c - 'A' + 10);
        } else {
            return 0;
        }
        /* Shifting would push set bits out of the top nibble; reject instead
         * of silently wrapping to an unrelated code point.
         */
        if (0 != (u >> 60)) {
            return 0;
        }
        u = (u << 4) | nibble;
    }
    if (first == b) {
        /* "&#x;" carries no digits and is not a character reference */
        return 0;
    }
    *up = u;

    return b;
}
|
|
1004
|
+
|
|
1005
|
+
/* Parses the decimal digits of a numeric character reference.
 *
 * b  - first digit; the digits must be followed by ';'
 * up - receives the parsed value, written only on success
 *
 * Returns a pointer to the terminating ';', or 0 when a non-digit (including
 * the end of the string) is met before ';', when there are no digits at all,
 * or when the value does not fit in 64 bits.
 */
static char *read_10_uint64(char *b, uint64_t *up) {
    const uint64_t max   = ~(uint64_t)0;
    uint64_t       u     = 0;
    char          *first = b;

    for (; ';' != *b; b++) {
        char     c = *b;
        uint64_t digit;

        if (c < '0' || '9' < c) {
            return 0;
        }
        digit = (uint64_t)(c - '0');
        /* u * 10 + digit must stay within 64 bits; reject instead of
         * silently wrapping to an unrelated code point.
         */
        if ((max - digit) / 10 < u) {
            return 0;
        }
        u = (u * 10) + digit;
    }
    if (first == b) {
        /* "&#;" carries no digits and is not a character reference */
        return 0;
    }
    *up = u;

    return b;
}
|
|
1021
|
+
|
|
1022
|
+
/* Decodes one "&...;" reference whose body starts at pi->s (the '&' itself
 * has already been consumed by the caller) and appends the result at text.
 *
 * On a successful decode pi->s is advanced past the ';' and the position
 * after the written character(s) is returned. When the reference cannot be
 * decoded a literal '&' is written instead and pi->s is left where it was,
 * so the remaining characters are picked up as ordinary text. NULL is
 * returned, with an error recorded, when a code point above 0xFF is met
 * while a non UTF-8 encoding is in force and the effort is not tolerant.
 */
static char *read_coded_chars(PInfo pi, char *text) {
    char  buf[32];
    char *b          = buf;
    char *end        = buf + sizeof(buf) - 1;
    char *s          = pi->s;
    long  blen       = 0;
    int   terminated = 0;

    /* Copy the reference body, including the ';', into buf. Stop at the end
     * of the input as well so the scan never runs past the terminator.
     */
    for (; b < end && '\0' != *s; b++, s++) {
        *b = *s;
        if (';' == *s) {
            *(b + 1)   = '\0';
            blen       = b - buf;
            s++;
            terminated = 1;
            break;
        }
    }
    if (!terminated) {
        /* No ';' within reach: buf is not a complete reference and must not
         * be parsed.
         */
        *text++ = '&';
    } else if ('#' == *buf) {
        uint64_t u = 0;

        b = buf + 1;
        if ('x' == *b || 'X' == *b) {
            b = read_hex_uint64(b + 1, &u);
        } else {
            b = read_10_uint64(b, &u);
        }
        if (0 == b) {
            *text++ = '&';
        } else {
            if (u <= 0x000000000000007FULL) {
                *text++ = (char)u;
            } else if (ox_utf8_encoding == pi->options->rb_enc) {
                text = ox_ucs_to_utf8_chars(text, u);
            } else if (0 == pi->options->rb_enc) {
                /* no encoding chosen yet, settle on UTF-8 */
                pi->options->rb_enc = ox_utf8_encoding;
                text                = ox_ucs_to_utf8_chars(text, u);
            } else if (TolerantEffort == pi->options->effort) {
                *text++ = '&';
                return text;
            } else if (u <= 0x00000000000000FFULL) {
                *text++ = (char)u;
            } else {
                set_error(&pi->err,
                          "Invalid encoding, need UTF-8 encoding to parse &#nnnn; character sequences.",
                          pi->str,
                          pi->s);
                return NULL;
            }
            pi->s = s;
        }
    } else {
        char *t2;

        buf[blen] = '\0'; /* drop the trailing ';' to leave the bare name */
        if (NULL == (t2 = ox_entity_lookup(text, buf))) {
            *text++ = '&';
        } else {
            text  = t2;
            pi->s = s;
        }
    }
    return text;
}
|
|
1087
|
+
|
|
1088
|
+
/* Replaces "&...;" references in str with the characters they stand for,
 * rewriting the string in place and NUL terminating the result.
 *
 * Numeric references (&#NN; and &#xHH;) and the names lt, gt, amp, quot and
 * apos (matched case-insensitively) are handled directly. Under
 * TolerantEffort anything else is copied through unchanged; otherwise other
 * names are passed to ox_entity_lookup().
 *
 * Returns 0 on success and EDOM for a malformed reference.
 * NOTE(review): the "need UTF-8" and "invalid special character sequence"
 * failures record an error in pi->err but return 0, so callers presumably
 * have to inspect pi->err as well - confirm before relying on the return
 * value alone.
 */
static int collapse_special(PInfo pi, char *str) {
    char *s = str; /* read position */
    char *b = str; /* write position */

    while ('\0' != *s) {
        int c;

        if ('&' != *s) {
            *b++ = *s++;
            continue;
        }
        s++; /* skip '&' */
        if ('#' == *s) {
            uint64_t u = 0;
            char    *end;
            char     x = '\0'; /* the 'x' or 'X' marker when hexadecimal */

            s++;
            if ('x' == *s || 'X' == *s) {
                x = *s;
                s++;
                end = read_hex_uint64(s, &u);
            } else {
                end = read_10_uint64(s, &u);
            }
            if (0 == end) {
                if (TolerantEffort == pi->options->effort) {
                    /* put back what was consumed and carry on as text */
                    *b++ = '&';
                    *b++ = '#';
                    if ('\0' != x) {
                        *b++ = x;
                    }
                    continue;
                }
                return EDOM;
            }
            if (u <= 0x000000000000007FULL) {
                *b++ = (char)u;
            } else if (ox_utf8_encoding == pi->options->rb_enc) {
                b = ox_ucs_to_utf8_chars(b, u);
                /* TBD support UTF-16 */
            } else if (0 == pi->options->rb_enc) {
                /* no encoding chosen yet, settle on UTF-8 */
                pi->options->rb_enc = ox_utf8_encoding;
                b                   = ox_ucs_to_utf8_chars(b, u);
            } else {
                set_error(&pi->err,
                          "Invalid encoding, need UTF-8 encoding to parse &#nnnn; character sequences.",
                          pi->str,
                          pi->s);
                return 0;
            }
            s = end + 1; /* continue after the ';' */
            continue;
        }
        if (0 == strncasecmp(s, "lt;", 3)) {
            c = '<';
            s += 3;
        } else if (0 == strncasecmp(s, "gt;", 3)) {
            c = '>';
            s += 3;
        } else if (0 == strncasecmp(s, "amp;", 4)) {
            c = '&';
            s += 4;
        } else if (0 == strncasecmp(s, "quot;", 5)) {
            c = '"';
            s += 5;
        } else if (0 == strncasecmp(s, "apos;", 5)) {
            c = '\'';
            s += 5;
        } else if (TolerantEffort == pi->options->effort) {
            *b++ = '&';
            continue;
        } else {
            char   key[16];
            size_t klen = 0;

            /* Collect the entity name up to the ';'. The current character
             * is always checked before stepping on, so the scan cannot move
             * past the end of the string.
             */
            for (; ';' != *s; s++) {
                if ('\0' == *s) {
                    set_error(&pi->err,
                              "Invalid format, special character does not end with a semicolon",
                              pi->str,
                              pi->s);
                    return EDOM;
                }
                if (sizeof(key) - 1 <= klen) {
                    /* Too long for any known entity: empty the key so it is
                     * reported as invalid below.
                     */
                    klen = 0;
                    break;
                }
                key[klen++] = *s;
            }
            key[klen] = '\0';
            if (';' == *s) {
                s++; /* continue after the ';' */
            }
            if ('\0' == *key || NULL == (b = ox_entity_lookup(b, key))) {
                set_error(&pi->err, "Invalid format, invalid special character sequence", pi->str, pi->s);
                return 0;
            }
            continue;
        }
        *b++ = (char)c;
    }
    *b = '\0';

    return 0;
}
|