ox 1.9.2 → 1.9.3
Sign up to get free protection for your applications and to get access to all the features.
Potentially problematic release.
This version of ox might be problematic. Click here for more details.
- data/README.md +15 -2
- data/ext/ox/extconf.rb +1 -1
- data/ext/ox/ox.c +8 -3
- data/ext/ox/ox.h +2 -2
- data/ext/ox/parse.c +104 -20
- data/ext/ox/sax.c +123 -35
- data/lib/ox/version.rb +1 -1
- metadata +14 -11
- checksums.yaml +0 -7
data/README.md
CHANGED
@@ -34,9 +34,22 @@ A fast XML parser and Object marshaller as a Ruby gem.
|
|
34
34
|
|
35
35
|
## <a name="release">Release Notes</a>
|
36
36
|
|
37
|
-
### Release 1.9.2
|
37
|
+
### Release 1.9.3
|
38
38
|
|
39
|
-
-
|
39
|
+
- mcarpenter fixed a compile problem with Cygwin.
|
40
|
+
|
41
|
+
- Now more tolerant when the :effort is set to :tolerant. Ox will let all sorts
|
42
|
+
of errors typical in HTML documents pass. The result may not be perfect but
|
43
|
+
at least parsed results are returned.
|
44
|
+
|
45
|
+
- Attribute values need not be quoted or they can be quoted with single
|
46
|
+
quotes or there can be no =value at all.
|
47
|
+
|
48
|
+
- Elements not terminated will be terminated by the next element
|
49
|
+
termination. This effect goes up until a match is found on the element
|
50
|
+
name.
|
51
|
+
|
52
|
+
- The SAX parser is also given a :tolerant option with the same tolerance as the string parser.
|
40
53
|
|
41
54
|
## <a name="description">Description</a>
|
42
55
|
|
data/ext/ox/extconf.rb
CHANGED
@@ -20,7 +20,7 @@ dflags = {
|
|
20
20
|
'HAS_RB_TIME_TIMESPEC' => ('ruby' == type && ('1.9.3' == RUBY_VERSION)) ? 1 : 0,
|
21
21
|
#'HAS_RB_TIME_TIMESPEC' => ('ruby' == type && ('1.9.3' == RUBY_VERSION || '2' <= version[0])) ? 1 : 0,
|
22
22
|
'HAS_TM_GMTOFF' => ('ruby' == type && (('1' == version[0] && '9' == version[1]) || '2' <= version[0]) &&
|
23
|
-
!(platform.include?('solaris') || platform.include?('linux') || RUBY_PLATFORM =~ /(win|w)32$/)) ? 1 : 0,
|
23
|
+
!(platform.include?('cygwin') || platform.include?('solaris') || platform.include?('linux') || RUBY_PLATFORM =~ /(win|w)32$/)) ? 1 : 0,
|
24
24
|
'HAS_ENCODING_SUPPORT' => (('ruby' == type || 'rubinius' == type) &&
|
25
25
|
(('1' == version[0] && '9' == version[1]) || '2' <= version[0])) ? 1 : 0,
|
26
26
|
'HAS_PRIVATE_ENCODING' => ('jruby' == type && '1' == version[0] && '9' == version[1]) ? 1 : 0,
|
data/ext/ox/ox.c
CHANGED
@@ -435,7 +435,7 @@ static VALUE
|
|
435
435
|
load(char *xml, int argc, VALUE *argv, VALUE self, VALUE encoding) {
|
436
436
|
VALUE obj;
|
437
437
|
struct _Options options = ox_default_options;
|
438
|
-
|
438
|
+
|
439
439
|
if (1 == argc && rb_cHash == rb_obj_class(*argv)) {
|
440
440
|
VALUE h = *argv;
|
441
441
|
VALUE v;
|
@@ -622,11 +622,13 @@ load_file(int argc, VALUE *argv, VALUE self) {
|
|
622
622
|
* @param [Ox::Sax] handler SAX (responds to OX::Sax methods) like handler
|
623
623
|
* @param [IO|String] io IO Object to read from
|
624
624
|
* @param [Hash] options parse options
|
625
|
-
* @param [true|false] :convert_special flag indicating special
|
625
|
+
* @param [true|false] :convert_special flag indicating special characters like < are converted
|
626
|
+
* @param [true|false] :tolerant flag indicating the parser should be tolerant of XML errors
|
626
627
|
*/
|
627
628
|
static VALUE
|
628
629
|
sax_parse(int argc, VALUE *argv, VALUE self) {
|
629
630
|
int convert = 0;
|
631
|
+
int tolerant = 0;
|
630
632
|
|
631
633
|
if (argc < 2) {
|
632
634
|
rb_raise(ox_parse_error_class, "Wrong number of arguments to sax_parse.\n");
|
@@ -638,8 +640,11 @@ sax_parse(int argc, VALUE *argv, VALUE self) {
|
|
638
640
|
if (Qnil != (v = rb_hash_lookup(h, convert_special_sym))) {
|
639
641
|
convert = (Qtrue == v);
|
640
642
|
}
|
643
|
+
if (Qnil != (v = rb_hash_lookup(h, tolerant_sym))) {
|
644
|
+
tolerant = (Qtrue == v);
|
645
|
+
}
|
641
646
|
}
|
642
|
-
ox_sax_parse(argv[0], argv[1], convert);
|
647
|
+
ox_sax_parse(argv[0], argv[1], convert, tolerant);
|
643
648
|
|
644
649
|
return Qnil;
|
645
650
|
}
|
data/ext/ox/ox.h
CHANGED
@@ -206,15 +206,15 @@ struct _PInfo {
|
|
206
206
|
CircArray circ_array;
|
207
207
|
unsigned long id; /* set for text types when cirs_array is set */
|
208
208
|
Options options;
|
209
|
+
char last; /* last character read, rarely set */
|
209
210
|
};
|
210
211
|
|
211
212
|
extern VALUE ox_parse(char *xml, ParseCallbacks pcb, char **endp, Options options);
|
212
213
|
extern void _ox_raise_error(const char *msg, const char *xml, const char *current, const char* file, int line);
|
213
214
|
|
214
|
-
extern void ox_sax_parse(VALUE handler, VALUE io, int convert);
|
215
|
+
extern void ox_sax_parse(VALUE handler, VALUE io, int convert, int tolerant);
|
215
216
|
extern void ox_sax_define(void);
|
216
217
|
|
217
|
-
|
218
218
|
extern char* ox_write_obj_to_str(VALUE obj, Options copts);
|
219
219
|
extern void ox_write_obj_to_file(VALUE obj, const char *path, Options copts);
|
220
220
|
|
data/ext/ox/parse.c
CHANGED
@@ -39,7 +39,7 @@
|
|
39
39
|
static void read_instruction(PInfo pi);
|
40
40
|
static void read_doctype(PInfo pi);
|
41
41
|
static void read_comment(PInfo pi);
|
42
|
-
static
|
42
|
+
static char* read_element(PInfo pi);
|
43
43
|
static void read_text(PInfo pi);
|
44
44
|
/*static void read_reduced_text(PInfo pi); */
|
45
45
|
static void read_cdata(PInfo pi);
|
@@ -147,7 +147,7 @@ ox_parse(char *xml, ParseCallbacks pcb, char **endp, Options options) {
|
|
147
147
|
pi.s++; /* skip second - */
|
148
148
|
read_comment(&pi);
|
149
149
|
}
|
150
|
-
} else if (0 == strncmp("DOCTYPE", pi.s, 7)) {
|
150
|
+
} else if ((TolerantEffort == options->effort) ? 0 == strncasecmp("DOCTYPE", pi.s, 7) : 0 == strncmp("DOCTYPE", pi.s, 7)) {
|
151
151
|
pi.s += 7;
|
152
152
|
read_doctype(&pi);
|
153
153
|
} else {
|
@@ -210,7 +210,8 @@ read_instruction(PInfo pi) {
|
|
210
210
|
c = *pi->s;
|
211
211
|
*end = '\0'; /* terminate name */
|
212
212
|
if ('?' != c) {
|
213
|
-
while ('?' !=
|
213
|
+
while ('?' != c) {
|
214
|
+
pi->last = 0;
|
214
215
|
if ('\0' == *pi->s) {
|
215
216
|
raise_error("invalid format, processing instruction not terminated", pi->str, pi->s);
|
216
217
|
}
|
@@ -232,6 +233,11 @@ read_instruction(PInfo pi) {
|
|
232
233
|
break;
|
233
234
|
}
|
234
235
|
next_non_white(pi);
|
236
|
+
if ('\0' == pi->last) {
|
237
|
+
c = *pi->s;
|
238
|
+
} else {
|
239
|
+
c = pi->last;
|
240
|
+
}
|
235
241
|
}
|
236
242
|
if ('?' == *pi->s) {
|
237
243
|
pi->s++;
|
@@ -326,7 +332,7 @@ read_comment(PInfo pi) {
|
|
326
332
|
/* Entered after the '<' and the first character after that. Returns status
|
327
333
|
* code.
|
328
334
|
*/
|
329
|
-
static
|
335
|
+
static char*
|
330
336
|
read_element(PInfo pi) {
|
331
337
|
struct _Attr attrs[MAX_ATTRS];
|
332
338
|
Attr ap = attrs;
|
@@ -356,7 +362,7 @@ read_element(PInfo pi) {
|
|
356
362
|
pi->pcb->add_element(pi, ename, attrs, hasChildren);
|
357
363
|
pi->pcb->end_element(pi, ename);
|
358
364
|
|
359
|
-
return;
|
365
|
+
return 0;
|
360
366
|
}
|
361
367
|
/* read attribute names until the close (/ or >) is reached */
|
362
368
|
while (!done) {
|
@@ -364,6 +370,7 @@ read_element(PInfo pi) {
|
|
364
370
|
next_non_white(pi);
|
365
371
|
c = *pi->s;
|
366
372
|
}
|
373
|
+
pi->last = 0;
|
367
374
|
switch (c) {
|
368
375
|
case '\0':
|
369
376
|
raise_error("invalid format, document not terminated", pi->str, pi->s);
|
@@ -378,7 +385,7 @@ read_element(PInfo pi) {
|
|
378
385
|
pi->pcb->add_element(pi, ename, attrs, hasChildren);
|
379
386
|
pi->pcb->end_element(pi, ename);
|
380
387
|
|
381
|
-
return;
|
388
|
+
return 0;
|
382
389
|
case '>':
|
383
390
|
/* has either children or a value */
|
384
391
|
pi->s++;
|
@@ -394,7 +401,19 @@ read_element(PInfo pi) {
|
|
394
401
|
end = pi->s;
|
395
402
|
next_non_white(pi);
|
396
403
|
if ('=' != *pi->s++) {
|
397
|
-
|
404
|
+
if (TolerantEffort == pi->options->effort) {
|
405
|
+
pi->s--;
|
406
|
+
pi->last = *pi->s;
|
407
|
+
*end = '\0'; /* terminate name */
|
408
|
+
ap->value = "";
|
409
|
+
ap++;
|
410
|
+
if (MAX_ATTRS <= (ap - attrs)) {
|
411
|
+
raise_error("too many attributes", pi->str, pi->s);
|
412
|
+
}
|
413
|
+
break;
|
414
|
+
} else {
|
415
|
+
raise_error("invalid format, no attribute value", pi->str, pi->s);
|
416
|
+
}
|
398
417
|
}
|
399
418
|
*end = '\0'; /* terminate name */
|
400
419
|
/* read value */
|
@@ -411,7 +430,12 @@ read_element(PInfo pi) {
|
|
411
430
|
}
|
412
431
|
break;
|
413
432
|
}
|
414
|
-
|
433
|
+
if ('\0' == pi->last) {
|
434
|
+
c = '\0';
|
435
|
+
} else {
|
436
|
+
c = pi->last;
|
437
|
+
pi->last = '\0';
|
438
|
+
}
|
415
439
|
}
|
416
440
|
if (hasChildren) {
|
417
441
|
char *start;
|
@@ -435,7 +459,9 @@ read_element(PInfo pi) {
|
|
435
459
|
if ('-' == *pi->s && '-' == *(pi->s + 1)) {
|
436
460
|
pi->s += 2;
|
437
461
|
read_comment(pi);
|
438
|
-
} else if (
|
462
|
+
} else if ((TolerantEffort == pi->options->effort) ?
|
463
|
+
0 == strncasecmp("[CDATA[", pi->s, 7) :
|
464
|
+
0 == strncmp("[CDATA[", pi->s, 7)) {
|
439
465
|
pi->s += 7;
|
440
466
|
read_cdata(pi);
|
441
467
|
} else {
|
@@ -455,7 +481,12 @@ read_element(PInfo pi) {
|
|
455
481
|
c = *pi->s;
|
456
482
|
*end = '\0';
|
457
483
|
if (0 != strcmp(name, ename)) {
|
458
|
-
|
484
|
+
if (TolerantEffort == pi->options->effort) {
|
485
|
+
pi->pcb->end_element(pi, ename);
|
486
|
+
return name;
|
487
|
+
} else {
|
488
|
+
raise_error("invalid format, elements overlap", pi->str, pi->s);
|
489
|
+
}
|
459
490
|
}
|
460
491
|
if ('>' != c) {
|
461
492
|
raise_error("invalid format, element not closed", pi->str, pi->s);
|
@@ -467,13 +498,27 @@ read_element(PInfo pi) {
|
|
467
498
|
}
|
468
499
|
pi->s++;
|
469
500
|
pi->pcb->end_element(pi, ename);
|
470
|
-
return;
|
501
|
+
return 0;
|
471
502
|
case '\0':
|
472
|
-
|
503
|
+
if (TolerantEffort == pi->options->effort) {
|
504
|
+
return 0;
|
505
|
+
} else {
|
506
|
+
raise_error("invalid format, document not terminated", pi->str, pi->s);
|
507
|
+
}
|
473
508
|
default:
|
474
509
|
first = 0;
|
475
510
|
/* a child element */
|
476
|
-
|
511
|
+
// Child closed with mismatched name.
|
512
|
+
if (0 != (name = read_element(pi))) {
|
513
|
+
if (0 == strcmp(name, ename)) {
|
514
|
+
pi->s++;
|
515
|
+
pi->pcb->end_element(pi, ename);
|
516
|
+
return 0;
|
517
|
+
} else { // not the correct element yet
|
518
|
+
pi->pcb->end_element(pi, ename);
|
519
|
+
return name;
|
520
|
+
}
|
521
|
+
}
|
477
522
|
break;
|
478
523
|
}
|
479
524
|
} else { /* read as TEXT */
|
@@ -489,11 +534,12 @@ read_element(PInfo pi) {
|
|
489
534
|
/* close tag after text so treat as a value */
|
490
535
|
pi->s += elen + 3;
|
491
536
|
pi->pcb->end_element(pi, ename);
|
492
|
-
return;
|
537
|
+
return 0;
|
493
538
|
}
|
494
539
|
}
|
495
540
|
}
|
496
541
|
}
|
542
|
+
return 0;
|
497
543
|
}
|
498
544
|
|
499
545
|
static void
|
@@ -697,10 +743,31 @@ read_quoted_value(PInfo pi) {
|
|
697
743
|
raise_error("invalid format, document not terminated", pi->str, pi->s);
|
698
744
|
}
|
699
745
|
}
|
700
|
-
*pi->s = '\0';
|
701
|
-
pi->s++;
|
746
|
+
*pi->s = '\0'; /* terminate value */
|
747
|
+
pi->s++; /* move past quote */
|
702
748
|
} else if (StrictEffort == pi->options->effort) {
|
703
749
|
raise_error("invalid format, expected a quote character", pi->str, pi->s);
|
750
|
+
} else if (TolerantEffort == pi->options->effort) {
|
751
|
+
value = pi->s;
|
752
|
+
for (; 1; pi->s++) {
|
753
|
+
switch (*pi->s) {
|
754
|
+
case '\0':
|
755
|
+
raise_error("invalid format, document not terminated", pi->str, pi->s);
|
756
|
+
case ' ':
|
757
|
+
case '/':
|
758
|
+
case '>':
|
759
|
+
case '?': // for instructions
|
760
|
+
case '\t':
|
761
|
+
case '\n':
|
762
|
+
case '\r':
|
763
|
+
pi->last = *pi->s;
|
764
|
+
*pi->s = '\0'; /* terminate value */
|
765
|
+
pi->s++;
|
766
|
+
return value;
|
767
|
+
default:
|
768
|
+
break;
|
769
|
+
}
|
770
|
+
}
|
704
771
|
} else {
|
705
772
|
value = pi->s;
|
706
773
|
next_white(pi);
|
@@ -812,7 +879,7 @@ read_coded_chars(PInfo pi, char *text) {
|
|
812
879
|
}
|
813
880
|
}
|
814
881
|
if (b > end) {
|
815
|
-
*text++ =
|
882
|
+
*text++ = '&';
|
816
883
|
} else if ('#' == *buf) {
|
817
884
|
uint64_t u = 0;
|
818
885
|
|
@@ -823,9 +890,8 @@ read_coded_chars(PInfo pi, char *text) {
|
|
823
890
|
b = read_10_uint64(b, &u);
|
824
891
|
}
|
825
892
|
if (0 == b) {
|
826
|
-
*text++ =
|
893
|
+
*text++ = '&';
|
827
894
|
} else {
|
828
|
-
pi->s = s;
|
829
895
|
if (u <= 0x000000000000007FULL) {
|
830
896
|
*text++ = (char)u;
|
831
897
|
#if HAS_PRIVATE_ENCODING
|
@@ -842,10 +908,14 @@ read_coded_chars(PInfo pi, char *text) {
|
|
842
908
|
#endif
|
843
909
|
pi->options->rb_enc = ox_utf8_encoding;
|
844
910
|
text = ucs_to_utf8_chars(text, u);
|
911
|
+
} else if (TolerantEffort == pi->options->effort) {
|
912
|
+
*text++ = '&';
|
913
|
+
return text;
|
845
914
|
} else {
|
846
915
|
/*raise_error("Invalid encoding, need UTF-8 or UTF-16 encoding to parse &#nnnn; character sequences.", pi->str, pi->s); */
|
847
916
|
raise_error("Invalid encoding, need UTF-8 encoding to parse &#nnnn; character sequences.", pi->str, pi->s);
|
848
917
|
}
|
918
|
+
pi->s = s;
|
849
919
|
}
|
850
920
|
} else if (0 == strcasecmp(buf, "nbsp;")) {
|
851
921
|
pi->s = s;
|
@@ -866,7 +936,7 @@ read_coded_chars(PInfo pi, char *text) {
|
|
866
936
|
pi->s = s;
|
867
937
|
*text++ = '\'';
|
868
938
|
} else {
|
869
|
-
*text++ =
|
939
|
+
*text++ = '&';
|
870
940
|
}
|
871
941
|
return text;
|
872
942
|
}
|
@@ -884,15 +954,26 @@ collapse_special(PInfo pi, char *str) {
|
|
884
954
|
s++;
|
885
955
|
if ('#' == *s) {
|
886
956
|
uint64_t u = 0;
|
957
|
+
char x;
|
887
958
|
|
888
959
|
s++;
|
889
960
|
if ('x' == *s || 'X' == *s) {
|
961
|
+
x = *s;
|
890
962
|
s++;
|
891
963
|
end = read_hex_uint64(s, &u);
|
892
964
|
} else {
|
965
|
+
x = '\0';
|
893
966
|
end = read_10_uint64(s, &u);
|
894
967
|
}
|
895
968
|
if (0 == end) {
|
969
|
+
if (TolerantEffort == pi->options->effort) {
|
970
|
+
*b++ = '&';
|
971
|
+
*b++ = '#';
|
972
|
+
if ('\0' != x) {
|
973
|
+
*b++ = x;
|
974
|
+
}
|
975
|
+
continue;
|
976
|
+
}
|
896
977
|
return EDOM;
|
897
978
|
}
|
898
979
|
if (u <= 0x000000000000007FULL) {
|
@@ -933,6 +1014,9 @@ collapse_special(PInfo pi, char *str) {
|
|
933
1014
|
} else if (0 == strncasecmp(s, "apos;", 5)) {
|
934
1015
|
c = '\'';
|
935
1016
|
s += 5;
|
1017
|
+
} else if (TolerantEffort == pi->options->effort) {
|
1018
|
+
*b++ = '&';
|
1019
|
+
continue;
|
936
1020
|
} else {
|
937
1021
|
c = '?';
|
938
1022
|
while (';' != *s++) {
|
data/ext/ox/sax.c
CHANGED
@@ -42,6 +42,8 @@
|
|
42
42
|
#include "ruby.h"
|
43
43
|
#include "ox.h"
|
44
44
|
|
45
|
+
#define NAME_MISMATCH 1
|
46
|
+
|
45
47
|
typedef struct _SaxDrive {
|
46
48
|
char base_buf[0x00010000];
|
47
49
|
char *buf;
|
@@ -55,6 +57,7 @@ typedef struct _SaxDrive {
|
|
55
57
|
VALUE value_obj;
|
56
58
|
int (*read_func)(struct _SaxDrive *dr);
|
57
59
|
int convert_special;
|
60
|
+
int tolerant;
|
58
61
|
union {
|
59
62
|
int fd;
|
60
63
|
VALUE io;
|
@@ -84,7 +87,7 @@ typedef struct _SaxDrive {
|
|
84
87
|
#ifdef NEEDS_STPCPY
|
85
88
|
char *stpncpy(char *dest, const char *src, size_t n);
|
86
89
|
#endif
|
87
|
-
static void sax_drive_init(SaxDrive dr, VALUE handler, VALUE io, int convert);
|
90
|
+
static void sax_drive_init(SaxDrive dr, VALUE handler, VALUE io, int convert, int tolerant);
|
88
91
|
static void sax_drive_cleanup(SaxDrive dr);
|
89
92
|
static int sax_drive_read(SaxDrive dr);
|
90
93
|
static void sax_drive_error(SaxDrive dr, const char *msg, int critical);
|
@@ -98,8 +101,8 @@ static int read_element(SaxDrive dr);
|
|
98
101
|
static int read_text(SaxDrive dr);
|
99
102
|
static const char* read_attrs(SaxDrive dr, char c, char termc, char term2, int is_xml);
|
100
103
|
static char read_name_token(SaxDrive dr);
|
101
|
-
static int read_quoted_value(SaxDrive dr);
|
102
|
-
static int collapse_special(char *str);
|
104
|
+
static int read_quoted_value(SaxDrive dr, char *last);
|
105
|
+
static int collapse_special(char *str, int tolerant);
|
103
106
|
|
104
107
|
static VALUE rescue_cb(VALUE rdr, VALUE err);
|
105
108
|
static VALUE io_cb(VALUE rdr);
|
@@ -251,10 +254,10 @@ str2sym(const char *str, SaxDrive dr, char **strp) {
|
|
251
254
|
|
252
255
|
|
253
256
|
void
|
254
|
-
ox_sax_parse(VALUE handler, VALUE io, int convert) {
|
257
|
+
ox_sax_parse(VALUE handler, VALUE io, int convert, int tolerant) {
|
255
258
|
struct _SaxDrive dr;
|
256
259
|
|
257
|
-
sax_drive_init(&dr, handler, io, convert);
|
260
|
+
sax_drive_init(&dr, handler, io, convert, tolerant);
|
258
261
|
#if 0
|
259
262
|
printf("*** sax_parse with these flags\n");
|
260
263
|
printf(" has_instruct = %s\n", dr.has_instruct ? "true" : "false");
|
@@ -293,7 +296,7 @@ respond_to(VALUE obj, ID method) {
|
|
293
296
|
}
|
294
297
|
|
295
298
|
static void
|
296
|
-
sax_drive_init(SaxDrive dr, VALUE handler, VALUE io, int convert) {
|
299
|
+
sax_drive_init(SaxDrive dr, VALUE handler, VALUE io, int convert, int tolerant) {
|
297
300
|
if (ox_stringio_class == rb_obj_class(io)) {
|
298
301
|
VALUE s = rb_funcall2(io, ox_string_id, 0, 0);
|
299
302
|
|
@@ -344,6 +347,7 @@ sax_drive_init(SaxDrive dr, VALUE handler, VALUE io, int convert) {
|
|
344
347
|
dr->value_obj = rb_data_object_alloc(sax_value_class, dr, 0, 0);
|
345
348
|
rb_gc_register_address(&dr->value_obj);
|
346
349
|
dr->convert_special = convert;
|
350
|
+
dr->tolerant = tolerant;
|
347
351
|
dr->has_instruct = respond_to(handler, ox_instruct_id);
|
348
352
|
dr->has_end_instruct = respond_to(handler, ox_end_instruct_id);
|
349
353
|
dr->has_attr = respond_to(handler, ox_attr_id);
|
@@ -485,7 +489,7 @@ read_children(SaxDrive dr, int first) {
|
|
485
489
|
if ('\0' == c || (is_white(c) && '\0' == (c = next_non_white(dr)))) {
|
486
490
|
if (!first) {
|
487
491
|
sax_drive_error(dr, "invalid format, element not terminated", 1);
|
488
|
-
err = 1;
|
492
|
+
err = -1;
|
489
493
|
}
|
490
494
|
break; /* normal completion if first */
|
491
495
|
}
|
@@ -525,12 +529,12 @@ read_children(SaxDrive dr, int first) {
|
|
525
529
|
for (i = 7; 0 < i; i--) {
|
526
530
|
sax_drive_get(dr);
|
527
531
|
}
|
528
|
-
if (0 == strncmp("DOCTYPE", dr->str, 7)) {
|
532
|
+
if ((dr->tolerant) ? 0 == strncasecmp("DOCTYPE", dr->str, 7) : 0 == strncmp("DOCTYPE", dr->str, 7)) {
|
529
533
|
if (element_read || !first) {
|
530
534
|
sax_drive_error(dr, "invalid format, DOCTYPE can not come after an element", 0);
|
531
535
|
}
|
532
536
|
err = read_doctype(dr);
|
533
|
-
} else if (0 == strncmp("[CDATA[", dr->str, 7)) {
|
537
|
+
} else if ((dr->tolerant) ? 0 == strncasecmp("[CDATA[", dr->str, 7) : 0 == strncmp("[CDATA[", dr->str, 7)) {
|
534
538
|
err = read_cdata(dr);
|
535
539
|
} else {
|
536
540
|
sax_drive_error(dr, "invalid format, DOCTYPE or comment expected", 1);
|
@@ -552,7 +556,7 @@ read_children(SaxDrive dr, int first) {
|
|
552
556
|
break;
|
553
557
|
default:
|
554
558
|
backup(dr); /* safe since no read occurred after getting last character */
|
555
|
-
if (first && element_read) {
|
559
|
+
if (first && element_read && !dr->tolerant) {
|
556
560
|
sax_drive_error(dr, "invalid format, multiple top level elements", 0);
|
557
561
|
}
|
558
562
|
err = read_element(dr);
|
@@ -632,7 +636,7 @@ read_instruction(SaxDrive dr) {
|
|
632
636
|
VALUE args[1];
|
633
637
|
|
634
638
|
if (dr->convert_special) {
|
635
|
-
if (0 != collapse_special(content)) {
|
639
|
+
if (0 != collapse_special(content, dr->tolerant)) {
|
636
640
|
sax_drive_error(dr, "invalid format, special character does not end with a semicolon", 0);
|
637
641
|
}
|
638
642
|
}
|
@@ -836,6 +840,7 @@ read_element(SaxDrive dr) {
|
|
836
840
|
int closed;
|
837
841
|
int line = dr->line;
|
838
842
|
int col = dr->col - 1;
|
843
|
+
int e;
|
839
844
|
|
840
845
|
if ('\0' == (c = read_name_token(dr))) {
|
841
846
|
return -1;
|
@@ -887,8 +892,28 @@ read_element(SaxDrive dr) {
|
|
887
892
|
rb_funcall2(dr->handler, ox_end_element_id, 1, args);
|
888
893
|
}
|
889
894
|
} else {
|
890
|
-
if (0 != read_children(dr, 0)) {
|
891
|
-
|
895
|
+
if (0 != (e = read_children(dr, 0))) {
|
896
|
+
if (NAME_MISMATCH == e) {
|
897
|
+
if (0 != dr->has_end_element) {
|
898
|
+
VALUE args[1];
|
899
|
+
|
900
|
+
if (dr->has_line) {
|
901
|
+
rb_ivar_set(dr->handler, ox_at_line_id, INT2FIX(line));
|
902
|
+
}
|
903
|
+
if (dr->has_column) {
|
904
|
+
rb_ivar_set(dr->handler, ox_at_column_id, INT2FIX(col - 2));
|
905
|
+
}
|
906
|
+
args[0] = name;
|
907
|
+
rb_funcall2(dr->handler, ox_end_element_id, 1, args);
|
908
|
+
}
|
909
|
+
if (0 == strcmp(dr->str, ename)) {
|
910
|
+
return 0;
|
911
|
+
} else {
|
912
|
+
return NAME_MISMATCH;
|
913
|
+
}
|
914
|
+
} else {
|
915
|
+
return -1;
|
916
|
+
}
|
892
917
|
}
|
893
918
|
line = dr->line;
|
894
919
|
col = dr->col;
|
@@ -902,8 +927,24 @@ read_element(SaxDrive dr) {
|
|
902
927
|
rb_ivar_set(dr->handler, ox_at_column_id, INT2FIX(col));
|
903
928
|
}
|
904
929
|
//printf("*** ename: %s close: %s\n", ename, dr->str);
|
905
|
-
|
906
|
-
|
930
|
+
if (dr->tolerant) {
|
931
|
+
if (0 != dr->has_end_element) {
|
932
|
+
VALUE args[1];
|
933
|
+
|
934
|
+
if (dr->has_line) {
|
935
|
+
rb_ivar_set(dr->handler, ox_at_line_id, INT2FIX(line));
|
936
|
+
}
|
937
|
+
if (dr->has_column) {
|
938
|
+
rb_ivar_set(dr->handler, ox_at_column_id, INT2FIX(col - 2));
|
939
|
+
}
|
940
|
+
args[0] = name;
|
941
|
+
rb_funcall2(dr->handler, ox_end_element_id, 1, args);
|
942
|
+
}
|
943
|
+
return NAME_MISMATCH; // dr->str is still the name
|
944
|
+
} else {
|
945
|
+
sax_drive_error(dr, "invalid format, element start and end names do not match", 1);
|
946
|
+
return -1;
|
947
|
+
}
|
907
948
|
}
|
908
949
|
if (0 != dr->has_end_element) {
|
909
950
|
VALUE args[1];
|
@@ -953,7 +994,7 @@ read_text(SaxDrive dr) {
|
|
953
994
|
VALUE args[1];
|
954
995
|
|
955
996
|
if (dr->convert_special) {
|
956
|
-
if (0 != collapse_special(dr->str) && 0 != strchr(dr->str, '&')) {
|
997
|
+
if (0 != collapse_special(dr->str, dr->tolerant) && 0 != strchr(dr->str, '&')) {
|
957
998
|
sax_drive_error(dr, "invalid format, special character does not end with a semicolon", 0);
|
958
999
|
}
|
959
1000
|
}
|
@@ -986,22 +1027,24 @@ read_attrs(SaxDrive dr, char c, char termc, char term2, int is_xml) {
|
|
986
1027
|
int is_encoding = 0;
|
987
1028
|
int line;
|
988
1029
|
int col;
|
1030
|
+
char last;
|
1031
|
+
char *attr_value;
|
989
1032
|
|
990
1033
|
dr->str = dr->cur; /* lock it down */
|
991
1034
|
if (is_white(c)) {
|
992
1035
|
c = next_non_white(dr);
|
993
1036
|
}
|
994
1037
|
while (termc != c && term2 != c) {
|
995
|
-
|
1038
|
+
backup(dr);
|
996
1039
|
line = dr->line;
|
997
1040
|
col = dr->col;
|
998
1041
|
if ('\0' == c) {
|
999
1042
|
return "invalid format, attributes not terminated";
|
1000
1043
|
}
|
1001
1044
|
if ('\0' == (c = read_name_token(dr))) {
|
1002
|
-
return "error reading
|
1045
|
+
return "error reading token";
|
1003
1046
|
}
|
1004
|
-
if (is_xml && 0 ==
|
1047
|
+
if (is_xml && 0 == strcasecmp("encoding", dr->str)) {
|
1005
1048
|
is_encoding = 1;
|
1006
1049
|
}
|
1007
1050
|
/* TBD use symbol cache */
|
@@ -1011,20 +1054,28 @@ read_attrs(SaxDrive dr, char c, char termc, char term2, int is_xml) {
|
|
1011
1054
|
if (is_white(c)) {
|
1012
1055
|
c = next_non_white(dr);
|
1013
1056
|
}
|
1057
|
+
last = '\0';
|
1014
1058
|
if ('=' != c) {
|
1015
|
-
|
1016
|
-
|
1017
|
-
|
1018
|
-
|
1019
|
-
|
1020
|
-
|
1059
|
+
if (dr->tolerant) {
|
1060
|
+
attr_value = (char*)"";
|
1061
|
+
last = c;
|
1062
|
+
} else {
|
1063
|
+
return "invalid format, no attribute value";
|
1064
|
+
}
|
1065
|
+
} else {
|
1066
|
+
if (0 != read_quoted_value(dr, &last)) {
|
1067
|
+
return "error reading quoted value";
|
1068
|
+
}
|
1069
|
+
attr_value = dr->str;
|
1070
|
+
if (is_encoding) {
|
1021
1071
|
#if HAS_ENCODING_SUPPORT
|
1022
|
-
|
1072
|
+
dr->encoding = rb_enc_find(dr->str);
|
1023
1073
|
#elif HAS_PRIVATE_ENCODING
|
1024
|
-
|
1074
|
+
dr->encoding = rb_str_new2(dr->str);
|
1025
1075
|
#endif
|
1026
|
-
|
1027
|
-
|
1076
|
+
is_encoding = 0;
|
1077
|
+
}
|
1078
|
+
}
|
1028
1079
|
if (dr->has_attr_value) {
|
1029
1080
|
VALUE args[2];
|
1030
1081
|
|
@@ -1041,10 +1092,10 @@ read_attrs(SaxDrive dr, char c, char termc, char term2, int is_xml) {
|
|
1041
1092
|
VALUE args[2];
|
1042
1093
|
|
1043
1094
|
args[0] = name;
|
1044
|
-
if (0 != collapse_special(dr->str) && 0 != strchr(dr->str, '&')) {
|
1095
|
+
if (0 != collapse_special(dr->str, dr->tolerant) && 0 != strchr(dr->str, '&')) {
|
1045
1096
|
sax_drive_error(dr, "invalid format, special character does not end with a semicolon", 0);
|
1046
1097
|
}
|
1047
|
-
args[1] = rb_str_new2(
|
1098
|
+
args[1] = rb_str_new2(attr_value);
|
1048
1099
|
#if HAS_ENCODING_SUPPORT
|
1049
1100
|
if (0 != dr->encoding) {
|
1050
1101
|
rb_enc_associate(args[1], dr->encoding);
|
@@ -1062,7 +1113,11 @@ read_attrs(SaxDrive dr, char c, char termc, char term2, int is_xml) {
|
|
1062
1113
|
}
|
1063
1114
|
rb_funcall2(dr->handler, ox_attr_id, 2, args);
|
1064
1115
|
}
|
1065
|
-
|
1116
|
+
if ('\0' != last) {
|
1117
|
+
c = last;
|
1118
|
+
} else {
|
1119
|
+
c = next_non_white(dr);
|
1120
|
+
}
|
1066
1121
|
}
|
1067
1122
|
dr->str = 0;
|
1068
1123
|
|
@@ -1105,7 +1160,7 @@ read_name_token(SaxDrive dr) {
|
|
1105
1160
|
}
|
1106
1161
|
|
1107
1162
|
static int
|
1108
|
-
read_quoted_value(SaxDrive dr) {
|
1163
|
+
read_quoted_value(SaxDrive dr, char *last) {
|
1109
1164
|
char c;
|
1110
1165
|
|
1111
1166
|
dr->str = dr->cur;
|
@@ -1123,6 +1178,26 @@ read_quoted_value(SaxDrive dr) {
|
|
1123
1178
|
return -1;
|
1124
1179
|
}
|
1125
1180
|
}
|
1181
|
+
} else if (dr->tolerant) {
|
1182
|
+
dr->str = dr->cur - 1;
|
1183
|
+
while ('\0' != (c = sax_drive_get(dr))) {
|
1184
|
+
switch (c) {
|
1185
|
+
case '\0':
|
1186
|
+
sax_drive_error(dr, "invalid format, non quoted value not terminated", 1);
|
1187
|
+
case ' ':
|
1188
|
+
case '/':
|
1189
|
+
case '>':
|
1190
|
+
case '?': // for instructions
|
1191
|
+
case '\t':
|
1192
|
+
case '\n':
|
1193
|
+
case '\r':
|
1194
|
+
*last = c;
|
1195
|
+
*(dr->cur - 1) = '\0'; /* terminate value */
|
1196
|
+
return 0;
|
1197
|
+
default:
|
1198
|
+
break;
|
1199
|
+
}
|
1200
|
+
}
|
1126
1201
|
} else {
|
1127
1202
|
dr->str = dr->cur - 1;
|
1128
1203
|
if ('\0' == (c = next_white(dr))) {
|
@@ -1240,7 +1315,7 @@ read_from_str(SaxDrive dr) {
|
|
1240
1315
|
}
|
1241
1316
|
|
1242
1317
|
static int
|
1243
|
-
collapse_special(char *str) {
|
1318
|
+
collapse_special(char *str, int tolerant) {
|
1244
1319
|
char *s = str;
|
1245
1320
|
char *b = str;
|
1246
1321
|
|
@@ -1248,17 +1323,27 @@ collapse_special(char *str) {
|
|
1248
1323
|
if ('&' == *s) {
|
1249
1324
|
int c;
|
1250
1325
|
char *end;
|
1326
|
+
int x = 0;
|
1251
1327
|
|
1252
1328
|
s++;
|
1253
1329
|
if ('#' == *s) {
|
1254
1330
|
s++;
|
1255
1331
|
if ('x' == *s || 'X' == *s) {
|
1256
1332
|
s++;
|
1333
|
+
x = 1;
|
1257
1334
|
c = (int)strtol(s, &end, 16);
|
1258
1335
|
} else {
|
1259
1336
|
c = (int)strtol(s, &end, 10);
|
1260
1337
|
}
|
1261
1338
|
if (';' != *end) {
|
1339
|
+
if (tolerant) {
|
1340
|
+
*b++ = '&';
|
1341
|
+
*b++ = '#';
|
1342
|
+
if (x) {
|
1343
|
+
*b++ = *(s - 1);
|
1344
|
+
}
|
1345
|
+
continue;
|
1346
|
+
}
|
1262
1347
|
return EDOM;
|
1263
1348
|
}
|
1264
1349
|
s = end + 1;
|
@@ -1277,6 +1362,9 @@ collapse_special(char *str) {
|
|
1277
1362
|
} else if (0 == strncasecmp(s, "apos;", 5)) {
|
1278
1363
|
c = '\'';
|
1279
1364
|
s += 5;
|
1365
|
+
} else if (tolerant) {
|
1366
|
+
*b++ = '&';
|
1367
|
+
continue;
|
1280
1368
|
} else {
|
1281
1369
|
c = '?';
|
1282
1370
|
while (';' != *s++) {
|
@@ -1397,7 +1485,7 @@ sax_value_as_s(VALUE self) {
|
|
1397
1485
|
return Qnil;
|
1398
1486
|
}
|
1399
1487
|
if (dr->convert_special) {
|
1400
|
-
if (0 != collapse_special(dr->str) && 0 != strchr(dr->str, '&')) {
|
1488
|
+
if (0 != collapse_special(dr->str, dr->tolerant) && 0 != strchr(dr->str, '&')) {
|
1401
1489
|
sax_drive_error(dr, "invalid format, special character does not end with a semicolon", 0);
|
1402
1490
|
}
|
1403
1491
|
}
|
data/lib/ox/version.rb
CHANGED
metadata
CHANGED
@@ -1,19 +1,21 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ox
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.9.2
|
4
|
+
version: 1.9.3
|
5
|
+
prerelease:
|
5
6
|
platform: ruby
|
6
7
|
authors:
|
7
8
|
- Peter Ohler
|
8
9
|
autorequire:
|
9
10
|
bindir: bin
|
10
11
|
cert_chain: []
|
11
|
-
date: 2013-03-
|
12
|
+
date: 2013-03-22 00:00:00.000000000 Z
|
12
13
|
dependencies: []
|
13
|
-
description: "A fast XML parser and object serializer that uses only standard C
|
14
|
-
\
|
15
|
-
optimized\nXML handling. It was designed to be an alternative to Nokogiri
|
16
|
-
Ruby\nXML parsers for generic XML parsing and as an alternative to Marshal
|
14
|
+
description: ! "A fast XML parser and object serializer that uses only standard C
|
15
|
+
lib.\n \nOptimized XML (Ox), as the name implies was written to provide
|
16
|
+
speed optimized\nXML handling. It was designed to be an alternative to Nokogiri
|
17
|
+
and other Ruby\nXML parsers for generic XML parsing and as an alternative to Marshal
|
18
|
+
for Object\nserialization. "
|
17
19
|
email: peter@ohler.com
|
18
20
|
executables: []
|
19
21
|
extensions:
|
@@ -54,7 +56,6 @@ files:
|
|
54
56
|
- README.md
|
55
57
|
homepage: http://www.ohler.com/ox
|
56
58
|
licenses: []
|
57
|
-
metadata: {}
|
58
59
|
post_install_message:
|
59
60
|
rdoc_options:
|
60
61
|
- --main
|
@@ -63,20 +64,22 @@ require_paths:
|
|
63
64
|
- lib
|
64
65
|
- ext
|
65
66
|
required_ruby_version: !ruby/object:Gem::Requirement
|
67
|
+
none: false
|
66
68
|
requirements:
|
67
|
-
- - '>='
|
69
|
+
- - ! '>='
|
68
70
|
- !ruby/object:Gem::Version
|
69
71
|
version: '0'
|
70
72
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
73
|
+
none: false
|
71
74
|
requirements:
|
72
|
-
- - '>='
|
75
|
+
- - ! '>='
|
73
76
|
- !ruby/object:Gem::Version
|
74
77
|
version: '0'
|
75
78
|
requirements: []
|
76
79
|
rubyforge_project: ox
|
77
|
-
rubygems_version:
|
80
|
+
rubygems_version: 1.8.23
|
78
81
|
signing_key:
|
79
|
-
specification_version:
|
82
|
+
specification_version: 3
|
80
83
|
summary: A fast XML parser and object serializer.
|
81
84
|
test_files: []
|
82
85
|
has_rdoc: true
|
checksums.yaml
DELETED
@@ -1,7 +0,0 @@
|
|
1
|
-
---
|
2
|
-
SHA1:
|
3
|
-
metadata.gz: be07680b4d502d3d808f0a3b7ad63e3a66685211
|
4
|
-
data.tar.gz: d83f1a8a18ab5f30c9aceb1d46aea0e51dc30298
|
5
|
-
SHA512:
|
6
|
-
metadata.gz: 63813c9d94cb42e86b47c57b66f6d0fc5b0d6150717b85cc43b1021b4cfa1729602f2d0f7b085767ddf28a609bd8ec182a5edd3a5090819cf85da3e46d518a7b
|
7
|
-
data.tar.gz: 82374d21d291afbc5222b6458f7ef1fce2f5e661c28a89a0b1633c9ea93e4659fc7100809722aa3c4bacc0a58387a5edc4481bd8fde6cddac21d31281ab0a625
|