ox-bundlecachetest 2.14.23

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +751 -0
  3. data/LICENSE +21 -0
  4. data/README.md +351 -0
  5. data/ext/ox/attr.h +78 -0
  6. data/ext/ox/base64.c +105 -0
  7. data/ext/ox/base64.h +18 -0
  8. data/ext/ox/buf.h +162 -0
  9. data/ext/ox/builder.c +948 -0
  10. data/ext/ox/cache.c +351 -0
  11. data/ext/ox/cache.h +21 -0
  12. data/ext/ox/cache8.c +106 -0
  13. data/ext/ox/cache8.h +23 -0
  14. data/ext/ox/dump.c +1260 -0
  15. data/ext/ox/err.c +46 -0
  16. data/ext/ox/err.h +36 -0
  17. data/ext/ox/extconf.rb +47 -0
  18. data/ext/ox/gen_load.c +342 -0
  19. data/ext/ox/hash_load.c +309 -0
  20. data/ext/ox/helper.h +84 -0
  21. data/ext/ox/intern.c +157 -0
  22. data/ext/ox/intern.h +25 -0
  23. data/ext/ox/obj_load.c +809 -0
  24. data/ext/ox/ox.c +1649 -0
  25. data/ext/ox/ox.h +245 -0
  26. data/ext/ox/parse.c +1197 -0
  27. data/ext/ox/sax.c +1570 -0
  28. data/ext/ox/sax.h +69 -0
  29. data/ext/ox/sax_as.c +270 -0
  30. data/ext/ox/sax_buf.c +209 -0
  31. data/ext/ox/sax_buf.h +204 -0
  32. data/ext/ox/sax_hint.c +207 -0
  33. data/ext/ox/sax_hint.h +40 -0
  34. data/ext/ox/sax_stack.h +113 -0
  35. data/ext/ox/slotcache.c +158 -0
  36. data/ext/ox/slotcache.h +19 -0
  37. data/ext/ox/special.c +390 -0
  38. data/ext/ox/special.h +14 -0
  39. data/ext/ox/type.h +39 -0
  40. data/lib/ox/bag.rb +103 -0
  41. data/lib/ox/cdata.rb +10 -0
  42. data/lib/ox/comment.rb +11 -0
  43. data/lib/ox/doctype.rb +11 -0
  44. data/lib/ox/document.rb +28 -0
  45. data/lib/ox/element.rb +464 -0
  46. data/lib/ox/error.rb +25 -0
  47. data/lib/ox/hasattrs.rb +54 -0
  48. data/lib/ox/instruct.rb +34 -0
  49. data/lib/ox/node.rb +23 -0
  50. data/lib/ox/raw.rb +12 -0
  51. data/lib/ox/sax.rb +97 -0
  52. data/lib/ox/version.rb +4 -0
  53. data/lib/ox/xmlrpc_adapter.rb +33 -0
  54. data/lib/ox.rb +79 -0
  55. metadata +128 -0
data/ext/ox/parse.c ADDED
@@ -0,0 +1,1197 @@
1
+ /* parse.c
2
+ * Copyright (c) 2011, Peter Ohler
3
+ * All rights reserved.
4
+ */
5
+
6
+ #include <errno.h>
7
+ #include <stdbool.h>
8
+ #include <stdio.h>
9
+ #include <stdlib.h>
10
+ #include <string.h>
11
+ #include <strings.h>
12
+
13
+ #include "attr.h"
14
+ #include "err.h"
15
+ #include "helper.h"
16
+ #include "intern.h"
17
+ #include "ox.h"
18
+ #include "ruby.h"
19
+ #include "special.h"
20
+
21
+ static void mark_pi_cb(void *ptr);
22
+ static void read_instruction(PInfo pi);
23
+ static void read_doctype(PInfo pi);
24
+ static void read_comment(PInfo pi);
25
+ static char *read_element(PInfo pi);
26
+ static void read_text(PInfo pi);
27
+ /*static void read_reduced_text(PInfo pi); */
28
+ static void read_cdata(PInfo pi);
29
+ static char *read_name_token(PInfo pi);
30
+ static char *read_quoted_value(PInfo pi);
31
+ static char *read_hex_uint64(char *b, uint64_t *up);
32
+ static char *read_10_uint64(char *b, uint64_t *up);
33
+ static char *read_coded_chars(PInfo pi, char *text);
34
+ static void next_non_white(PInfo pi);
35
+ static int collapse_special(PInfo pi, char *str);
36
+
37
+ static const rb_data_type_t ox_wrap_type = {
38
+ "Object",
39
+ {
40
+ mark_pi_cb,
41
+ NULL,
42
+ NULL,
43
+ },
44
+ 0,
45
+ 0,
46
+ };
47
+
48
+ /* This XML parser is a single pass, destructive, callback parser. It is a
49
+ * single pass parse since it only makes one pass over the characters in the
50
+ * XML document string. It is destructive because it re-uses the content of
51
+ * the string for values in the callback and places \0 characters at various
52
+ * places to mark the end of tokens and strings. It is a callback parser like
53
+ * a SAX parser because it uses callbacks when document elements are
54
+ * encountered.
55
+ *
56
+ * Parsing is very tolerant. Lack of headers and even misspelled element
57
+ * endings are passed over without raising an error. A best attempt is made in
58
+ * all cases to parse the string.
59
+ */
60
+
61
/* Validity table for bytes 0x00..0x20, indexed by the byte value: 'o' marks a
 * byte that is accepted, 'x' one that read_text() rejects when the effort is
 * StrictEffort. The table is only ever read, so it is const. */
static const char xml_valid_lower_chars[34] = "xxxxxxxxxooxxoxxxxxxxxxxxxxxxxxxo";
62
+
63
/* Returns 1 when c is a space, tab, form feed, newline or carriage return,
 * otherwise 0. */
inline static int is_white(char c) {
    return ' ' == c || '\t' == c || '\f' == c || '\n' == c || '\r' == c;
}
73
+
74
+ inline static void next_non_white(PInfo pi) {
75
+ for (; 1; pi->s++) {
76
+ switch (*pi->s) {
77
+ case ' ':
78
+ case '\t':
79
+ case '\f':
80
+ case '\n':
81
+ case '\r': break;
82
+ default: return;
83
+ }
84
+ }
85
+ }
86
+
87
+ inline static void next_white(PInfo pi) {
88
+ for (; 1; pi->s++) {
89
+ switch (*pi->s) {
90
+ case ' ':
91
+ case '\t':
92
+ case '\f':
93
+ case '\n':
94
+ case '\r':
95
+ case '\0': return;
96
+ default: break;
97
+ }
98
+ }
99
+ }
100
+
101
/* Normalizes line endings in place: "\r\n" becomes "\n" and a lone '\r'
 * becomes '\n'. The result is never longer than the input and stays NUL
 * terminated.
 *
 * The early exit uses the standard strchr() instead of the legacy index(),
 * so the fast path no longer depends on the HAVE_INDEX build macro.
 */
static void fix_newlines(char *buf) {
    char *src;
    char *dst;

    /* Nothing to rewrite when the text holds no carriage return. */
    if (NULL == strchr(buf, '\r')) {
        return;
    }
    src = buf;
    dst = buf;
    for (; '\0' != *src; src++) {
        if ('\r' == *src) {
            if ('\n' == *(src + 1)) {
                /* Drop the '\r'; the following '\n' is copied next round. */
                continue;
            }
            *dst++ = '\n';
        } else {
            *dst++ = *src;
        }
    }
    *dst = '\0';
}
124
+
125
+ static void mark_pi_cb(void *ptr) {
126
+ if (NULL != ptr) {
127
+ HelperStack stack = &((PInfo)ptr)->helpers;
128
+ Helper h;
129
+
130
+ for (h = stack->head; h < stack->tail; h++) {
131
+ if (NoCode != h->type) {
132
+ rb_gc_mark(h->obj);
133
+ }
134
+ }
135
+ }
136
+ }
137
+
138
+ VALUE
139
+ ox_parse(char *xml, size_t len, ParseCallbacks pcb, char **endp, Options options, Err err) {
140
+ struct _pInfo pi;
141
+ int body_read = 0;
142
+ int block_given = rb_block_given_p();
143
+ volatile VALUE wrap;
144
+
145
+ if (0 == xml) {
146
+ set_error(err, "Invalid arg, xml string can not be null", xml, 0);
147
+ return Qnil;
148
+ }
149
+ if (DEBUG <= options->trace) {
150
+ printf("Parsing xml:\n%s\n", xml);
151
+ }
152
+ // initialize parse info
153
+ helper_stack_init(&pi.helpers);
154
+ // Protect against GC
155
+ wrap = TypedData_Wrap_Struct(rb_cObject, &ox_wrap_type, &pi);
156
+
157
+ err_init(&pi.err);
158
+ pi.str = xml;
159
+ pi.end = pi.str + len;
160
+ pi.s = xml;
161
+ pi.pcb = pcb;
162
+ pi.obj = Qnil;
163
+ pi.circ_array = 0;
164
+ pi.options = options;
165
+ pi.marked = NULL;
166
+ pi.mark_size = 0;
167
+ pi.mark_cnt = 0;
168
+ while (1) {
169
+ next_non_white(&pi); // skip white space
170
+ if ('\0' == *pi.s) {
171
+ break;
172
+ }
173
+ if (body_read && 0 != endp) {
174
+ *endp = pi.s;
175
+ break;
176
+ }
177
+ if ('<' != *pi.s) { // all top level entities start with <
178
+ set_error(err, "invalid format, expected <", pi.str, pi.s);
179
+ helper_stack_cleanup(&pi.helpers);
180
+ return Qnil;
181
+ }
182
+ pi.s++; // past <
183
+ switch (*pi.s) {
184
+ case '?': // processing instruction
185
+ pi.s++;
186
+ read_instruction(&pi);
187
+ break;
188
+ case '!': // comment or doctype
189
+ pi.s++;
190
+ if ('\0' == *pi.s) {
191
+ set_error(err, "invalid format, DOCTYPE or comment not terminated", pi.str, pi.s);
192
+ helper_stack_cleanup(&pi.helpers);
193
+ return Qnil;
194
+ } else if ('-' == *pi.s) {
195
+ pi.s++; // skip -
196
+ if ('-' != *pi.s) {
197
+ set_error(err, "invalid format, bad comment format", pi.str, pi.s);
198
+ helper_stack_cleanup(&pi.helpers);
199
+ return Qnil;
200
+ } else {
201
+ pi.s++; // skip second -
202
+ read_comment(&pi);
203
+ }
204
+ } else if ((TolerantEffort == options->effort) ? 0 == strncasecmp("DOCTYPE", pi.s, 7)
205
+ : 0 == strncmp("DOCTYPE", pi.s, 7)) {
206
+ pi.s += 7;
207
+ read_doctype(&pi);
208
+ } else {
209
+ set_error(err, "invalid format, DOCTYPE or comment expected", pi.str, pi.s);
210
+ helper_stack_cleanup(&pi.helpers);
211
+ return Qnil;
212
+ }
213
+ break;
214
+ case '\0':
215
+ set_error(err, "invalid format, document not terminated", pi.str, pi.s);
216
+ helper_stack_cleanup(&pi.helpers);
217
+ return Qnil;
218
+ default:
219
+ read_element(&pi);
220
+ body_read = 1;
221
+ break;
222
+ }
223
+ if (err_has(&pi.err)) {
224
+ *err = pi.err;
225
+ helper_stack_cleanup(&pi.helpers);
226
+ return Qnil;
227
+ }
228
+ if (block_given && Qnil != pi.obj && Qundef != pi.obj) {
229
+ if (NULL != pcb->finish) {
230
+ pcb->finish(&pi);
231
+ }
232
+ rb_yield(pi.obj);
233
+ }
234
+ }
235
+ DATA_PTR(wrap) = NULL;
236
+ helper_stack_cleanup(&pi.helpers);
237
+ if (NULL != pcb->finish) {
238
+ pcb->finish(&pi);
239
+ }
240
+ return pi.obj;
241
+ }
242
+
243
+ // Entered after the "<?" sequence. Ready to read the rest.
244
+ static void read_instruction(PInfo pi) {
245
+ char content[256];
246
+ char *content_ptr;
247
+ struct _attrStack attrs;
248
+ char *attr_name;
249
+ char *attr_value;
250
+ char *target;
251
+ char *end;
252
+ char c;
253
+ char *cend;
254
+ size_t size;
255
+ bool attrs_ok = true;
256
+
257
+ *content = '\0';
258
+ attr_stack_init(&attrs);
259
+ if (0 == (target = read_name_token(pi))) {
260
+ return;
261
+ }
262
+ end = pi->s;
263
+ for (; true; pi->s++) {
264
+ switch (*pi->s) {
265
+ case '?':
266
+ if ('>' == *(pi->s + 1)) {
267
+ pi->s++;
268
+ goto DONE;
269
+ }
270
+ break;
271
+ case '\0': set_error(&pi->err, "processing instruction not terminated", pi->str, pi->s); return;
272
+ default: break;
273
+ }
274
+ }
275
+ DONE:
276
+ cend = pi->s;
277
+ size = cend - end - 1;
278
+ pi->s = end;
279
+ if (size < sizeof(content)) {
280
+ content_ptr = content;
281
+ } else {
282
+ content_ptr = ALLOC_N(char, size + 1);
283
+ }
284
+ memcpy(content_ptr, end, size);
285
+ content_ptr[size] = '\0';
286
+
287
+ next_non_white(pi);
288
+ c = *pi->s;
289
+ *end = '\0'; // terminate name
290
+ if ('?' != c) {
291
+ while ('?' != c) {
292
+ pi->last = 0;
293
+ if ('\0' == *pi->s) {
294
+ attr_stack_cleanup(&attrs);
295
+ set_error(&pi->err, "invalid format, processing instruction not terminated", pi->str, pi->s);
296
+ return;
297
+ }
298
+ next_non_white(pi);
299
+ if (0 == (attr_name = read_name_token(pi))) {
300
+ attr_stack_cleanup(&attrs);
301
+ return;
302
+ }
303
+ end = pi->s;
304
+ next_non_white(pi);
305
+ if ('=' != *pi->s++) {
306
+ attrs_ok = false;
307
+ break;
308
+ }
309
+ *end = '\0'; // terminate name
310
+ // read value
311
+ next_non_white(pi);
312
+ if (0 == (attr_value = read_quoted_value(pi))) {
313
+ attr_stack_cleanup(&attrs);
314
+ return;
315
+ }
316
+ attr_stack_push(&attrs, attr_name, attr_value);
317
+ next_non_white(pi);
318
+ if ('\0' == pi->last) {
319
+ c = *pi->s;
320
+ } else {
321
+ c = pi->last;
322
+ }
323
+ }
324
+ if ('?' == *pi->s) {
325
+ pi->s++;
326
+ }
327
+ } else {
328
+ pi->s++;
329
+ }
330
+ if (attrs_ok) {
331
+ if ('>' != *pi->s++) {
332
+ attr_stack_cleanup(&attrs);
333
+ set_error(&pi->err, "invalid format, processing instruction not terminated", pi->str, pi->s);
334
+ return;
335
+ }
336
+ } else {
337
+ pi->s = cend + 1;
338
+ }
339
+ if (0 != pi->pcb->instruct) {
340
+ if (attrs_ok) {
341
+ pi->pcb->instruct(pi, target, attrs.head, 0);
342
+ } else {
343
+ pi->pcb->instruct(pi, target, attrs.head, content_ptr);
344
+ }
345
+ } else {
346
+ for (Attr a = attrs.head; a < attrs.tail; a++) {
347
+ if (0 == strcasecmp(a->name, "encoding")) {
348
+ strncpy(pi->options->encoding, a->value, sizeof(pi->options->encoding) - 1);
349
+ pi->options->encoding[sizeof(pi->options->encoding) - 1] = '\0';
350
+ pi->options->rb_enc = rb_enc_find(a->value);
351
+ break;
352
+ }
353
+ }
354
+ }
355
+ attr_stack_cleanup(&attrs);
356
+ if (content_ptr != content) {
357
+ xfree(content_ptr);
358
+ }
359
+ }
360
+
361
+ static void read_delimited(PInfo pi, char end) {
362
+ char c;
363
+
364
+ if ('"' == end || '\'' == end) {
365
+ for (c = *pi->s++; end != c; c = *pi->s++) {
366
+ if ('\0' == c) {
367
+ set_error(&pi->err, "invalid format, dectype not terminated", pi->str, pi->s);
368
+ return;
369
+ }
370
+ }
371
+ } else {
372
+ while (1) {
373
+ c = *pi->s++;
374
+ if (end == c) {
375
+ return;
376
+ }
377
+ switch (c) {
378
+ case '\0': set_error(&pi->err, "invalid format, dectype not terminated", pi->str, pi->s); return;
379
+ case '"': read_delimited(pi, c); break;
380
+ case '\'': read_delimited(pi, c); break;
381
+ case '[': read_delimited(pi, ']'); break;
382
+ case '<': read_delimited(pi, '>'); break;
383
+ default: break;
384
+ }
385
+ }
386
+ }
387
+ }
388
+
389
+ // Entered after the "<!DOCTYPE" sequence plus the first character after
390
+ // that. Ready to read the rest.
391
+ static void read_doctype(PInfo pi) {
392
+ char *doctype;
393
+
394
+ next_non_white(pi);
395
+ doctype = pi->s;
396
+ read_delimited(pi, '>');
397
+ if (err_has(&pi->err)) {
398
+ return;
399
+ }
400
+ pi->s--;
401
+ *pi->s = '\0';
402
+ pi->s++;
403
+ if (0 != pi->pcb->add_doctype) {
404
+ fix_newlines(doctype);
405
+ pi->pcb->add_doctype(pi, doctype);
406
+ }
407
+ }
408
+
409
+ // Entered after "<!--". Returns error code.
410
+ static void read_comment(PInfo pi) {
411
+ char *end;
412
+ char *s;
413
+ char *comment;
414
+ int done = 0;
415
+
416
+ next_non_white(pi);
417
+ comment = pi->s;
418
+ end = strstr(pi->s, "-->");
419
+ if (0 == end) {
420
+ set_error(&pi->err, "invalid format, comment not terminated", pi->str, pi->s);
421
+ return;
422
+ }
423
+ for (s = end - 1; pi->s < s && !done; s--) {
424
+ switch (*s) {
425
+ case ' ':
426
+ case '\t':
427
+ case '\f':
428
+ case '\n':
429
+ case '\r': break;
430
+ default:
431
+ *(s + 1) = '\0';
432
+ done = 1;
433
+ break;
434
+ }
435
+ }
436
+ *end = '\0'; // in case the comment was blank
437
+ pi->s = end + 3;
438
+ if (0 != pi->pcb->add_comment) {
439
+ fix_newlines(comment);
440
+ pi->pcb->add_comment(pi, comment);
441
+ }
442
+ }
443
+
444
+ // Entered after the '<' and the first character after that. Returns stat
445
+ // code.
446
+ static char *read_element(PInfo pi) {
447
+ struct _attrStack attrs;
448
+ const char *attr_name;
449
+ const char *attr_value;
450
+ char *name;
451
+ char *ename;
452
+ char *end;
453
+ char c;
454
+ long elen;
455
+ int hasChildren = 0;
456
+ int done = 0;
457
+
458
+ attr_stack_init(&attrs);
459
+ if (0 == (ename = read_name_token(pi))) {
460
+ return 0;
461
+ }
462
+ end = pi->s;
463
+ elen = end - ename;
464
+ next_non_white(pi);
465
+ c = *pi->s;
466
+ *end = '\0';
467
+ if ('/' == c) {
468
+ // empty element, no attributes and no children
469
+ pi->s++;
470
+ if ('>' != *pi->s) {
471
+ attr_stack_cleanup(&attrs);
472
+ set_error(&pi->err, "invalid format, element not closed", pi->str, pi->s);
473
+ return 0;
474
+ }
475
+ pi->s++; /* past > */
476
+ pi->pcb->add_element(pi, ename, attrs.head, hasChildren);
477
+ pi->pcb->end_element(pi, ename);
478
+
479
+ attr_stack_cleanup(&attrs);
480
+ return 0;
481
+ }
482
+ /* read attribute names until the close (/ or >) is reached */
483
+ while (!done) {
484
+ if ('\0' == c) {
485
+ if (pi->end <= pi->s) {
486
+ break;
487
+ }
488
+ next_non_white(pi);
489
+ c = *pi->s;
490
+ }
491
+ pi->last = 0;
492
+ switch (c) {
493
+ case '\0':
494
+ attr_stack_cleanup(&attrs);
495
+ set_error(&pi->err, "invalid format, document not terminated", pi->str, pi->s);
496
+ return 0;
497
+ case '/':
498
+ /* Element with just attributes. */
499
+ pi->s++;
500
+ if ('>' != *pi->s) {
501
+ attr_stack_cleanup(&attrs);
502
+ set_error(&pi->err, "invalid format, element not closed", pi->str, pi->s);
503
+ return 0;
504
+ }
505
+ pi->s++;
506
+ pi->pcb->add_element(pi, ename, attrs.head, hasChildren);
507
+ pi->pcb->end_element(pi, ename);
508
+ attr_stack_cleanup(&attrs);
509
+
510
+ return 0;
511
+ case '>':
512
+ /* has either children or a value */
513
+ pi->s++;
514
+ hasChildren = 1;
515
+ done = 1;
516
+ pi->pcb->add_element(pi, ename, attrs.head, hasChildren);
517
+
518
+ break;
519
+ default:
520
+ /* Attribute name so it's an element and the attribute will be */
521
+ /* added to it. */
522
+ if (0 == (attr_name = read_name_token(pi))) {
523
+ attr_stack_cleanup(&attrs);
524
+ return 0;
525
+ }
526
+ end = pi->s;
527
+ next_non_white(pi);
528
+ if ('=' != *pi->s++) {
529
+ if (TolerantEffort == pi->options->effort) {
530
+ pi->s--;
531
+ pi->last = *pi->s;
532
+ *end = '\0'; /* terminate name */
533
+ attr_value = "";
534
+ attr_stack_push(&attrs, attr_name, attr_value);
535
+ break;
536
+ } else {
537
+ attr_stack_cleanup(&attrs);
538
+ set_error(&pi->err, "invalid format, no attribute value", pi->str, pi->s);
539
+ return 0;
540
+ }
541
+ }
542
+ *end = '\0'; /* terminate name */
543
+ /* read value */
544
+ next_non_white(pi);
545
+ if (0 == (attr_value = read_quoted_value(pi))) {
546
+ return 0;
547
+ }
548
+ if (pi->options->convert_special && 0 != strchr(attr_value, '&')) {
549
+ if (0 != collapse_special(pi, (char *)attr_value) || err_has(&pi->err)) {
550
+ attr_stack_cleanup(&attrs);
551
+ return 0;
552
+ }
553
+ }
554
+ attr_stack_push(&attrs, attr_name, attr_value);
555
+ break;
556
+ }
557
+ if ('\0' == pi->last) {
558
+ c = '\0';
559
+ } else {
560
+ c = pi->last;
561
+ pi->last = '\0';
562
+ }
563
+ }
564
+ if (hasChildren) {
565
+ char *start;
566
+ int first = 1;
567
+
568
+ done = 0;
569
+ /* read children */
570
+ while (!done) {
571
+ start = pi->s;
572
+ next_non_white(pi);
573
+ if (OffSkip == pi->options->skip && start < pi->s && '<' == *pi->s) {
574
+ c = *pi->s;
575
+ *pi->s = '\0';
576
+ pi->pcb->add_text(pi, start, 1);
577
+ *pi->s = c;
578
+ }
579
+ c = *pi->s++;
580
+ if ('\0' == c) {
581
+ attr_stack_cleanup(&attrs);
582
+ set_error(&pi->err, "invalid format, document not terminated", pi->str, pi->s);
583
+ return 0;
584
+ }
585
+ if ('<' == c) {
586
+ char *slash;
587
+
588
+ switch (*pi->s) {
589
+ case '!': /* better be a comment or CDATA */
590
+ pi->s++;
591
+ if ('-' == *pi->s && '-' == *(pi->s + 1)) {
592
+ pi->s += 2;
593
+ read_comment(pi);
594
+ } else if ((TolerantEffort == pi->options->effort) ? 0 == strncasecmp("[CDATA[", pi->s, 7)
595
+ : 0 == strncmp("[CDATA[", pi->s, 7)) {
596
+ pi->s += 7;
597
+ read_cdata(pi);
598
+ } else {
599
+ attr_stack_cleanup(&attrs);
600
+ set_error(&pi->err, "invalid format, invalid comment or CDATA format", pi->str, pi->s);
601
+ return 0;
602
+ }
603
+ break;
604
+ case '?': /* processing instruction */
605
+ pi->s++;
606
+ read_instruction(pi);
607
+ break;
608
+ case '/':
609
+ slash = pi->s;
610
+ pi->s++;
611
+ if (0 == (name = read_name_token(pi))) {
612
+ attr_stack_cleanup(&attrs);
613
+ return 0;
614
+ }
615
+ end = pi->s;
616
+ next_non_white(pi);
617
+ c = *pi->s;
618
+ *end = '\0';
619
+ if (0 !=
620
+ ((TolerantEffort == pi->options->effort) ? strcasecmp(name, ename) : strcmp(name, ename))) {
621
+ attr_stack_cleanup(&attrs);
622
+ if (TolerantEffort == pi->options->effort) {
623
+ pi->pcb->end_element(pi, ename);
624
+ return name;
625
+ } else {
626
+ set_error(&pi->err, "invalid format, elements overlap", pi->str, pi->s);
627
+ return 0;
628
+ }
629
+ }
630
+ if ('>' != c) {
631
+ attr_stack_cleanup(&attrs);
632
+ set_error(&pi->err, "invalid format, element not closed", pi->str, pi->s);
633
+ return 0;
634
+ }
635
+ if (first && start != slash - 1) {
636
+ // Some white space between start and here so add as
637
+ // text after checking skip.
638
+ *(slash - 1) = '\0';
639
+ switch (pi->options->skip) {
640
+ case CrSkip: {
641
+ char *s = start;
642
+ char *e = start;
643
+
644
+ for (; '\0' != *e; e++) {
645
+ if ('\r' != *e) {
646
+ *s++ = *e;
647
+ }
648
+ }
649
+ *s = '\0';
650
+ break;
651
+ }
652
+ case SpcSkip: *start = '\0'; break;
653
+ case NoSkip:
654
+ case OffSkip:
655
+ default: break;
656
+ }
657
+ if ('\0' != *start) {
658
+ pi->pcb->add_text(pi, start, 1);
659
+ }
660
+ }
661
+ pi->s++;
662
+ pi->pcb->end_element(pi, ename);
663
+ attr_stack_cleanup(&attrs);
664
+ return 0;
665
+ case '\0':
666
+ attr_stack_cleanup(&attrs);
667
+ if (TolerantEffort == pi->options->effort) {
668
+ return 0;
669
+ } else {
670
+ set_error(&pi->err, "invalid format, document not terminated", pi->str, pi->s);
671
+ return 0;
672
+ }
673
+ default:
674
+ first = 0;
675
+ /* a child element */
676
+ // Child closed with mismatched name.
677
+ if (0 != (name = read_element(pi))) {
678
+ attr_stack_cleanup(&attrs);
679
+
680
+ if (0 ==
681
+ ((TolerantEffort == pi->options->effort) ? strcasecmp(name, ename) : strcmp(name, ename))) {
682
+ pi->s++;
683
+ pi->pcb->end_element(pi, ename);
684
+ return 0;
685
+ } else { // not the correct element yet
686
+ pi->pcb->end_element(pi, ename);
687
+ return name;
688
+ }
689
+ } else if (err_has(&pi->err)) {
690
+ return 0;
691
+ }
692
+ break;
693
+ }
694
+ } else { /* read as TEXT */
695
+ char prev = *(start - 1);
696
+
697
+ pi->s = start;
698
+ if ('>' != prev && (' ' <= prev || is_white(prev))) {
699
+ pi->s--;
700
+ }
701
+ read_text(pi);
702
+ /*read_reduced_text(pi); */
703
+
704
+ /* to exit read_text with no errors the next character must be < */
705
+ if ('/' == *(pi->s + 1) &&
706
+ 0 == ((TolerantEffort == pi->options->effort) ? strncasecmp(ename, pi->s + 2, elen)
707
+ : strncmp(ename, pi->s + 2, elen)) &&
708
+ '>' == *(pi->s + elen + 2)) {
709
+ /* close tag after text so treat as a value */
710
+ pi->s += elen + 3;
711
+ pi->pcb->end_element(pi, ename);
712
+ attr_stack_cleanup(&attrs);
713
+ return 0;
714
+ }
715
+ }
716
+ }
717
+ }
718
+ attr_stack_cleanup(&attrs);
719
+ return 0;
720
+ }
721
+
722
+ static void read_text(PInfo pi) {
723
+ char buf[MAX_TEXT_LEN];
724
+ char *b = buf;
725
+ char *alloc_buf = 0;
726
+ char *end = b + sizeof(buf) - 2;
727
+ char c;
728
+ int done = 0;
729
+
730
+ while (!done) {
731
+ c = *pi->s++;
732
+ switch (c) {
733
+ case '<':
734
+ done = 1;
735
+ pi->s--;
736
+ break;
737
+ case '\0': set_error(&pi->err, "invalid format, document not terminated", pi->str, pi->s); return;
738
+ default:
739
+ if (end <= (b + (('&' == c) ? 7 : 0))) { /* extra 8 for special just in case it is sequence of bytes */
740
+ unsigned long size;
741
+
742
+ if (0 == alloc_buf) {
743
+ size = sizeof(buf) * 2;
744
+ alloc_buf = ALLOC_N(char, size);
745
+ memcpy(alloc_buf, buf, b - buf);
746
+ b = alloc_buf + (b - buf);
747
+ } else {
748
+ unsigned long pos = b - alloc_buf;
749
+
750
+ size = (end - alloc_buf) * 2;
751
+ REALLOC_N(alloc_buf, char, size);
752
+ b = alloc_buf + pos;
753
+ }
754
+ end = alloc_buf + size - 2;
755
+ }
756
+ if ('&' == c) {
757
+ if (0 == (b = read_coded_chars(pi, b))) {
758
+ return;
759
+ }
760
+ } else {
761
+ if (0 <= c && c <= 0x20) {
762
+ if (StrictEffort == pi->options->effort && 'x' == xml_valid_lower_chars[(unsigned char)c]) {
763
+ set_error(&pi->err, "invalid character", pi->str, pi->s);
764
+ return;
765
+ }
766
+ switch (pi->options->skip) {
767
+ case CrSkip:
768
+ if (buf != b && '\n' == c && '\r' == *(b - 1)) {
769
+ *(b - 1) = '\n';
770
+ } else {
771
+ *b++ = c;
772
+ }
773
+ break;
774
+ case SpcSkip:
775
+ if (is_white(c)) {
776
+ if (buf == b || ' ' != *(b - 1)) {
777
+ *b++ = ' ';
778
+ }
779
+ } else {
780
+ *b++ = c;
781
+ }
782
+ break;
783
+ case NoSkip:
784
+ case OffSkip:
785
+ default: *b++ = c; break;
786
+ }
787
+ } else {
788
+ *b++ = c;
789
+ }
790
+ }
791
+ break;
792
+ }
793
+ }
794
+ *b = '\0';
795
+ if (0 != alloc_buf) {
796
+ fix_newlines(alloc_buf);
797
+ pi->pcb->add_text(pi, alloc_buf, ('/' == *(pi->s + 1)));
798
+ xfree(alloc_buf);
799
+ } else {
800
+ fix_newlines(buf);
801
+ pi->pcb->add_text(pi, buf, ('/' == *(pi->s + 1)));
802
+ }
803
+ }
804
+
805
+ #if 0
806
+ static void
807
+ read_reduced_text(PInfo pi) {
808
+ char buf[MAX_TEXT_LEN];
809
+ char *b = buf;
810
+ char *alloc_buf = 0;
811
+ char *end = b + sizeof(buf) - 2;
812
+ char c;
813
+ int spc = 0;
814
+ int done = 0;
815
+
816
+ while (!done) {
817
+ c = *pi->s++;
818
+ switch(c) {
819
+ case ' ':
820
+ case '\t':
821
+ case '\f':
822
+ case '\n':
823
+ case '\r':
824
+ spc = 1;
825
+ break;
826
+ case '<':
827
+ done = 1;
828
+ pi->s--;
829
+ break;
830
+ case '\0':
831
+ set_error(&pi->err, "invalid format, document not terminated", pi->str, pi->s);
832
+ return;
833
+ default:
834
+ if (end <= (b + spc + (('&' == c) ? 7 : 0))) { /* extra 8 for special just in case it is sequence of bytes */
835
+ unsigned long size;
836
+
837
+ if (0 == alloc_buf) {
838
+ size = sizeof(buf) * 2;
839
+ alloc_buf = ALLOC_N(char, size);
840
+ memcpy(alloc_buf, buf, b - buf);
841
+ b = alloc_buf + (b - buf);
842
+ } else {
843
+ unsigned long pos = b - alloc_buf;
844
+
845
+ size = (end - alloc_buf) * 2;
846
+ REALLOC(alloc_buf, char, size);
847
+ b = alloc_buf + pos;
848
+ }
849
+ end = alloc_buf + size - 2;
850
+ }
851
+ if (spc) {
852
+ *b++ = ' ';
853
+ }
854
+ spc = 0;
855
+ if ('&' == c) {
856
+ if (0 == (b = read_coded_chars(pi, b))) {
857
+ return;
858
+ }
859
+ } else {
860
+ *b++ = c;
861
+ }
862
+ break;
863
+ }
864
+ }
865
+ *b = '\0';
866
+ if (0 != alloc_buf) {
867
+ fix_newlines(alloc_buf);
868
+ pi->pcb->add_text(pi, alloc_buf, ('/' == *(pi->s + 1)));
869
+ xfree(alloc_buf);
870
+ } else {
871
+ fix_newlines(buf);
872
+ pi->pcb->add_text(pi, buf, ('/' == *(pi->s + 1)));
873
+ }
874
+ }
875
+ #endif
876
+
877
+ static char *read_name_token(PInfo pi) {
878
+ char *start;
879
+
880
+ next_non_white(pi);
881
+ start = pi->s;
882
+ for (; 1; pi->s++) {
883
+ switch (*pi->s) {
884
+ case ' ':
885
+ case '\t':
886
+ case '\f':
887
+ case '?':
888
+ case '=':
889
+ case '/':
890
+ case '>':
891
+ case '\n':
892
+ case '\r': return start;
893
+ case '\0':
894
+ /* documents never terminate after a name token */
895
+ set_error(&pi->err, "invalid format, document not terminated", pi->str, pi->s);
896
+ return 0;
897
+ break; /* to avoid warnings */
898
+ case ':':
899
+ if ('\0' == *pi->options->strip_ns) {
900
+ break;
901
+ } else if ('*' == *pi->options->strip_ns && '\0' == pi->options->strip_ns[1]) {
902
+ start = pi->s + 1;
903
+ } else if (0 == strncmp(pi->options->strip_ns, start, pi->s - start)) {
904
+ start = pi->s + 1;
905
+ }
906
+ break;
907
+ default: break;
908
+ }
909
+ }
910
+ return start;
911
+ }
912
+
913
+ static void read_cdata(PInfo pi) {
914
+ char *start;
915
+ char *end;
916
+
917
+ start = pi->s;
918
+ end = strstr(pi->s, "]]>");
919
+ if (end == 0) {
920
+ set_error(&pi->err, "invalid format, CDATA not terminated", pi->str, pi->s);
921
+ return;
922
+ }
923
+ *end = '\0';
924
+ pi->s = end + 3;
925
+ if (0 != pi->pcb->add_cdata) {
926
+ fix_newlines(start);
927
+ pi->pcb->add_cdata(pi, start, end - start);
928
+ }
929
+ }
930
+
931
+ /* Assume the value starts immediately and goes until the quote character is
932
+ * reached again. Do not read the character after the terminating quote.
933
+ */
934
+ static char *read_quoted_value(PInfo pi) {
935
+ char *value = 0;
936
+
937
+ if ('"' == *pi->s || '\'' == *pi->s) {
938
+ char term = *pi->s;
939
+
940
+ pi->s++; /* skip quote character */
941
+ value = pi->s;
942
+ for (; *pi->s != term; pi->s++) {
943
+ if ('\0' == *pi->s) {
944
+ set_error(&pi->err, "invalid format, document not terminated", pi->str, pi->s);
945
+ return 0;
946
+ }
947
+ }
948
+ *pi->s = '\0'; /* terminate value */
949
+ pi->s++; /* move past quote */
950
+ } else if (StrictEffort == pi->options->effort) {
951
+ set_error(&pi->err, "invalid format, expected a quote character", pi->str, pi->s);
952
+ return 0;
953
+ } else if (TolerantEffort == pi->options->effort) {
954
+ value = pi->s;
955
+ for (; 1; pi->s++) {
956
+ switch (*pi->s) {
957
+ case '\0': set_error(&pi->err, "invalid format, document not terminated", pi->str, pi->s); return 0;
958
+ case ' ':
959
+ case '/':
960
+ case '>':
961
+ case '?': // for instructions
962
+ case '\t':
963
+ case '\n':
964
+ case '\r':
965
+ pi->last = *pi->s;
966
+ *pi->s = '\0'; /* terminate value */
967
+ pi->s++;
968
+ return value;
969
+ default: break;
970
+ }
971
+ }
972
+ } else {
973
+ value = pi->s;
974
+ next_white(pi);
975
+ if ('\0' == *pi->s) {
976
+ set_error(&pi->err, "invalid format, document not terminated", pi->str, pi->s);
977
+ return 0;
978
+ }
979
+ *pi->s++ = '\0'; /* terminate value */
980
+ }
981
+ return value;
982
+ }
983
+
984
/* Accumulates the hexadecimal digits at b, which must be followed by ';'.
 * On success stores the value in *up and returns a pointer to the ';'. On
 * any other character returns 0 and leaves *up untouched. */
static char *read_hex_uint64(char *b, uint64_t *up) {
    uint64_t acc = 0;

    while (';' != *b) {
        const char ch = *b;
        uint64_t   nibble;

        if ('0' <= ch && ch <= '9') {
            nibble = (uint64_t)(ch - '0');
        } else if ('a' <= ch && ch <= 'f') {
            nibble = (uint64_t)(ch - 'a' + 10);
        } else if ('A' <= ch && ch <= 'F') {
            nibble = (uint64_t)(ch - 'A' + 10);
        } else {
            return 0;
        }
        acc = (acc << 4) | nibble;
        b++;
    }
    *up = acc;

    return b;
}
1004
+
1005
/* Accumulates the decimal digits at b, which must be followed by ';'. On
 * success stores the value in *up and returns a pointer to the ';'. On any
 * other character returns 0 and leaves *up untouched. */
static char *read_10_uint64(char *b, uint64_t *up) {
    uint64_t acc = 0;

    while (';' != *b) {
        const char ch = *b;

        if (ch < '0' || '9' < ch) {
            return 0;
        }
        acc = (acc * 10) + (uint64_t)(ch - '0');
        b++;
    }
    *up = acc;

    return b;
}
1021
+
1022
+ static char *read_coded_chars(PInfo pi, char *text) {
1023
+ char *b, buf[32];
1024
+ char *end = buf + sizeof(buf) - 1;
1025
+ char *s;
1026
+ long blen = 0;
1027
+
1028
+ for (b = buf, s = pi->s; b < end; b++, s++) {
1029
+ *b = *s;
1030
+ if (';' == *s) {
1031
+ *(b + 1) = '\0';
1032
+ blen = b - buf;
1033
+ s++;
1034
+ break;
1035
+ }
1036
+ }
1037
+ if (b > end) {
1038
+ *text++ = '&';
1039
+ } else if ('#' == *buf) {
1040
+ uint64_t u = 0;
1041
+
1042
+ b = buf + 1;
1043
+ if ('x' == *b || 'X' == *b) {
1044
+ b = read_hex_uint64(b + 1, &u);
1045
+ } else {
1046
+ b = read_10_uint64(b, &u);
1047
+ }
1048
+ if (0 == b) {
1049
+ *text++ = '&';
1050
+ } else {
1051
+ if (u <= 0x000000000000007FULL) {
1052
+ *text++ = (char)u;
1053
+ } else if (ox_utf8_encoding == pi->options->rb_enc) {
1054
+ text = ox_ucs_to_utf8_chars(text, u);
1055
+ } else if (0 == pi->options->rb_enc) {
1056
+ pi->options->rb_enc = ox_utf8_encoding;
1057
+ text = ox_ucs_to_utf8_chars(text, u);
1058
+ } else if (TolerantEffort == pi->options->effort) {
1059
+ *text++ = '&';
1060
+ return text;
1061
+ } else if (u <= 0x00000000000000FFULL) {
1062
+ *text++ = (char)u;
1063
+ } else {
1064
+ /*set_error(&pi->err, "Invalid encoding, need UTF-8 or UTF-16 encoding to parse &#nnnn; character
1065
+ * sequences.", pi->str, pi->s); */
1066
+ set_error(&pi->err,
1067
+ "Invalid encoding, need UTF-8 encoding to parse &#nnnn; character sequences.",
1068
+ pi->str,
1069
+ pi->s);
1070
+ return NULL;
1071
+ }
1072
+ pi->s = s;
1073
+ }
1074
+ } else {
1075
+ char *t2;
1076
+
1077
+ buf[blen] = '\0';
1078
+ if (NULL == (t2 = ox_entity_lookup(text, buf))) {
1079
+ *text++ = '&';
1080
+ } else {
1081
+ text = t2;
1082
+ pi->s = s;
1083
+ }
1084
+ }
1085
+ return text;
1086
+ }
1087
+
1088
+ static int collapse_special(PInfo pi, char *str) {
1089
+ char *s = str;
1090
+ char *b = str;
1091
+
1092
+ while ('\0' != *s) {
1093
+ if ('&' == *s) {
1094
+ int c;
1095
+ char *end;
1096
+
1097
+ s++;
1098
+ if ('#' == *s) {
1099
+ uint64_t u = 0;
1100
+ char x;
1101
+
1102
+ s++;
1103
+ if ('x' == *s || 'X' == *s) {
1104
+ x = *s;
1105
+ s++;
1106
+ end = read_hex_uint64(s, &u);
1107
+ } else {
1108
+ x = '\0';
1109
+ end = read_10_uint64(s, &u);
1110
+ }
1111
+ if (0 == end) {
1112
+ if (TolerantEffort == pi->options->effort) {
1113
+ *b++ = '&';
1114
+ *b++ = '#';
1115
+ if ('\0' != x) {
1116
+ *b++ = x;
1117
+ }
1118
+ continue;
1119
+ }
1120
+ return EDOM;
1121
+ }
1122
+ if (u <= 0x000000000000007FULL) {
1123
+ *b++ = (char)u;
1124
+ } else if (ox_utf8_encoding == pi->options->rb_enc) {
1125
+ b = ox_ucs_to_utf8_chars(b, u);
1126
+ /* TBD support UTF-16 */
1127
+ } else if (0 == pi->options->rb_enc) {
1128
+ pi->options->rb_enc = ox_utf8_encoding;
1129
+ b = ox_ucs_to_utf8_chars(b, u);
1130
+ } else {
1131
+ /* set_error(&pi->err, "Invalid encoding, need UTF-8 or UTF-16 encoding to parse &#nnnn; character
1132
+ * sequences.", pi->str, pi->s);*/
1133
+ set_error(&pi->err,
1134
+ "Invalid encoding, need UTF-8 encoding to parse &#nnnn; character sequences.",
1135
+ pi->str,
1136
+ pi->s);
1137
+ return 0;
1138
+ }
1139
+ s = end + 1;
1140
+ } else {
1141
+ if (0 == strncasecmp(s, "lt;", 3)) {
1142
+ c = '<';
1143
+ s += 3;
1144
+ } else if (0 == strncasecmp(s, "gt;", 3)) {
1145
+ c = '>';
1146
+ s += 3;
1147
+ } else if (0 == strncasecmp(s, "amp;", 4)) {
1148
+ c = '&';
1149
+ s += 4;
1150
+ } else if (0 == strncasecmp(s, "quot;", 5)) {
1151
+ c = '"';
1152
+ s += 5;
1153
+ } else if (0 == strncasecmp(s, "apos;", 5)) {
1154
+ c = '\'';
1155
+ s += 5;
1156
+ } else if (TolerantEffort == pi->options->effort) {
1157
+ *b++ = '&';
1158
+ continue;
1159
+ } else {
1160
+ char key[16];
1161
+ char *k = key;
1162
+ char *kend = key + sizeof(key) - 1;
1163
+
1164
+ *k++ = *s;
1165
+ while (';' != *s++) {
1166
+ if ('\0' == *s) {
1167
+ set_error(&pi->err,
1168
+ "Invalid format, special character does not end with a semicolon",
1169
+ pi->str,
1170
+ pi->s);
1171
+ return EDOM;
1172
+ }
1173
+ if (kend <= k) {
1174
+ k = key;
1175
+ break;
1176
+ }
1177
+ *k++ = *s;
1178
+ }
1179
+ k--;
1180
+ *k = '\0';
1181
+ if ('\0' == *key || NULL == (b = ox_entity_lookup(b, key))) {
1182
+ set_error(&pi->err, "Invalid format, invalid special character sequence", pi->str, pi->s);
1183
+ c = '?';
1184
+ return 0;
1185
+ }
1186
+ continue;
1187
+ }
1188
+ *b++ = (char)c;
1189
+ }
1190
+ } else {
1191
+ *b++ = *s++;
1192
+ }
1193
+ }
1194
+ *b = '\0';
1195
+
1196
+ return 0;
1197
+ }