ox 1.9.2 → 1.9.3

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of ox might be problematic. Click here for more details.

data/README.md CHANGED
@@ -34,9 +34,22 @@ A fast XML parser and Object marshaller as a Ruby gem.
34
34
 
35
35
  ## <a name="release">Release Notes</a>
36
36
 
37
- ### Release 1.9.2
37
+ ### Release 1.9.3
38
38
 
39
- - Fixed bug in the sax element name check that caused a memory write error.
39
+ - mcarpenter fixed a compile problem with Cygwin.
40
+
41
+ - Now more tolerant when the :effort is set to :tolerant. Ox will let all sorts
42
+ of errors typical in HTML documents pass. The result may not be perfect but
43
+ at least parsed results are returned.
44
+
45
+ - Attribute values need not be quoted or they can be quoted with single
46
+ quotes or there can be no =value at all.
47
+
48
+ - Elements not terminated will be terminated by the next element
49
+ termination. This effect goes up until a match is found on the element
50
+ name.
51
+
52
+ - SAX parser also given a :tolerant option with the same tolerance as the string parser.
40
53
 
41
54
  ## <a name="description">Description</a>
42
55
 
@@ -20,7 +20,7 @@ dflags = {
20
20
  'HAS_RB_TIME_TIMESPEC' => ('ruby' == type && ('1.9.3' == RUBY_VERSION)) ? 1 : 0,
21
21
  #'HAS_RB_TIME_TIMESPEC' => ('ruby' == type && ('1.9.3' == RUBY_VERSION || '2' <= version[0])) ? 1 : 0,
22
22
  'HAS_TM_GMTOFF' => ('ruby' == type && (('1' == version[0] && '9' == version[1]) || '2' <= version[0]) &&
23
- !(platform.include?('solaris') || platform.include?('linux') || RUBY_PLATFORM =~ /(win|w)32$/)) ? 1 : 0,
23
+ !(platform.include?('cygwin') || platform.include?('solaris') || platform.include?('linux') || RUBY_PLATFORM =~ /(win|w)32$/)) ? 1 : 0,
24
24
  'HAS_ENCODING_SUPPORT' => (('ruby' == type || 'rubinius' == type) &&
25
25
  (('1' == version[0] && '9' == version[1]) || '2' <= version[0])) ? 1 : 0,
26
26
  'HAS_PRIVATE_ENCODING' => ('jruby' == type && '1' == version[0] && '9' == version[1]) ? 1 : 0,
@@ -435,7 +435,7 @@ static VALUE
435
435
  load(char *xml, int argc, VALUE *argv, VALUE self, VALUE encoding) {
436
436
  VALUE obj;
437
437
  struct _Options options = ox_default_options;
438
-
438
+
439
439
  if (1 == argc && rb_cHash == rb_obj_class(*argv)) {
440
440
  VALUE h = *argv;
441
441
  VALUE v;
@@ -622,11 +622,13 @@ load_file(int argc, VALUE *argv, VALUE self) {
622
622
  * @param [Ox::Sax] handler SAX (responds to OX::Sax methods) like handler
623
623
  * @param [IO|String] io IO Object to read from
624
624
  * @param [Hash] options parse options
625
- * @param [true|false] :convert_special flag indicating special special characters like &lt; are converted
625
+ * @param [true|false] :convert_special flag indicating special characters like &lt; are converted
626
+ * @param [true|false] :tolerant flag indicating the parser should be tolerant of XML errors
626
627
  */
627
628
  static VALUE
628
629
  sax_parse(int argc, VALUE *argv, VALUE self) {
629
630
  int convert = 0;
631
+ int tolerant = 0;
630
632
 
631
633
  if (argc < 2) {
632
634
  rb_raise(ox_parse_error_class, "Wrong number of arguments to sax_parse.\n");
@@ -638,8 +640,11 @@ sax_parse(int argc, VALUE *argv, VALUE self) {
638
640
  if (Qnil != (v = rb_hash_lookup(h, convert_special_sym))) {
639
641
  convert = (Qtrue == v);
640
642
  }
643
+ if (Qnil != (v = rb_hash_lookup(h, tolerant_sym))) {
644
+ tolerant = (Qtrue == v);
645
+ }
641
646
  }
642
- ox_sax_parse(argv[0], argv[1], convert);
647
+ ox_sax_parse(argv[0], argv[1], convert, tolerant);
643
648
 
644
649
  return Qnil;
645
650
  }
@@ -206,15 +206,15 @@ struct _PInfo {
206
206
  CircArray circ_array;
207
207
  unsigned long id; /* set for text types when circ_array is set */
208
208
  Options options;
209
+ char last; /* last character read, rarely set */
209
210
  };
210
211
 
211
212
  extern VALUE ox_parse(char *xml, ParseCallbacks pcb, char **endp, Options options);
212
213
  extern void _ox_raise_error(const char *msg, const char *xml, const char *current, const char* file, int line);
213
214
 
214
- extern void ox_sax_parse(VALUE handler, VALUE io, int convert);
215
+ extern void ox_sax_parse(VALUE handler, VALUE io, int convert, int tolerant);
215
216
  extern void ox_sax_define(void);
216
217
 
217
-
218
218
  extern char* ox_write_obj_to_str(VALUE obj, Options copts);
219
219
  extern void ox_write_obj_to_file(VALUE obj, const char *path, Options copts);
220
220
 
@@ -39,7 +39,7 @@
39
39
  static void read_instruction(PInfo pi);
40
40
  static void read_doctype(PInfo pi);
41
41
  static void read_comment(PInfo pi);
42
- static void read_element(PInfo pi);
42
+ static char* read_element(PInfo pi);
43
43
  static void read_text(PInfo pi);
44
44
  /*static void read_reduced_text(PInfo pi); */
45
45
  static void read_cdata(PInfo pi);
@@ -147,7 +147,7 @@ ox_parse(char *xml, ParseCallbacks pcb, char **endp, Options options) {
147
147
  pi.s++; /* skip second - */
148
148
  read_comment(&pi);
149
149
  }
150
- } else if (0 == strncmp("DOCTYPE", pi.s, 7)) {
150
+ } else if ((TolerantEffort == options->effort) ? 0 == strncasecmp("DOCTYPE", pi.s, 7) : 0 == strncmp("DOCTYPE", pi.s, 7)) {
151
151
  pi.s += 7;
152
152
  read_doctype(&pi);
153
153
  } else {
@@ -210,7 +210,8 @@ read_instruction(PInfo pi) {
210
210
  c = *pi->s;
211
211
  *end = '\0'; /* terminate name */
212
212
  if ('?' != c) {
213
- while ('?' != *pi->s) {
213
+ while ('?' != c) {
214
+ pi->last = 0;
214
215
  if ('\0' == *pi->s) {
215
216
  raise_error("invalid format, processing instruction not terminated", pi->str, pi->s);
216
217
  }
@@ -232,6 +233,11 @@ read_instruction(PInfo pi) {
232
233
  break;
233
234
  }
234
235
  next_non_white(pi);
236
+ if ('\0' == pi->last) {
237
+ c = *pi->s;
238
+ } else {
239
+ c = pi->last;
240
+ }
235
241
  }
236
242
  if ('?' == *pi->s) {
237
243
  pi->s++;
@@ -326,7 +332,7 @@ read_comment(PInfo pi) {
326
332
  /* Entered after the '<' and the first character after that. Returns 0, or
327
333
  * in tolerant mode the name of a closing tag that did not match this element.
328
334
  */
329
- static void
335
+ static char*
330
336
  read_element(PInfo pi) {
331
337
  struct _Attr attrs[MAX_ATTRS];
332
338
  Attr ap = attrs;
@@ -356,7 +362,7 @@ read_element(PInfo pi) {
356
362
  pi->pcb->add_element(pi, ename, attrs, hasChildren);
357
363
  pi->pcb->end_element(pi, ename);
358
364
 
359
- return;
365
+ return 0;
360
366
  }
361
367
  /* read attribute names until the close (/ or >) is reached */
362
368
  while (!done) {
@@ -364,6 +370,7 @@ read_element(PInfo pi) {
364
370
  next_non_white(pi);
365
371
  c = *pi->s;
366
372
  }
373
+ pi->last = 0;
367
374
  switch (c) {
368
375
  case '\0':
369
376
  raise_error("invalid format, document not terminated", pi->str, pi->s);
@@ -378,7 +385,7 @@ read_element(PInfo pi) {
378
385
  pi->pcb->add_element(pi, ename, attrs, hasChildren);
379
386
  pi->pcb->end_element(pi, ename);
380
387
 
381
- return;
388
+ return 0;
382
389
  case '>':
383
390
  /* has either children or a value */
384
391
  pi->s++;
@@ -394,7 +401,19 @@ read_element(PInfo pi) {
394
401
  end = pi->s;
395
402
  next_non_white(pi);
396
403
  if ('=' != *pi->s++) {
397
- raise_error("invalid format, no attribute value", pi->str, pi->s);
404
+ if (TolerantEffort == pi->options->effort) {
405
+ pi->s--;
406
+ pi->last = *pi->s;
407
+ *end = '\0'; /* terminate name */
408
+ ap->value = "";
409
+ ap++;
410
+ if (MAX_ATTRS <= (ap - attrs)) {
411
+ raise_error("too many attributes", pi->str, pi->s);
412
+ }
413
+ break;
414
+ } else {
415
+ raise_error("invalid format, no attribute value", pi->str, pi->s);
416
+ }
398
417
  }
399
418
  *end = '\0'; /* terminate name */
400
419
  /* read value */
@@ -411,7 +430,12 @@ read_element(PInfo pi) {
411
430
  }
412
431
  break;
413
432
  }
414
- c = '\0';
433
+ if ('\0' == pi->last) {
434
+ c = '\0';
435
+ } else {
436
+ c = pi->last;
437
+ pi->last = '\0';
438
+ }
415
439
  }
416
440
  if (hasChildren) {
417
441
  char *start;
@@ -435,7 +459,9 @@ read_element(PInfo pi) {
435
459
  if ('-' == *pi->s && '-' == *(pi->s + 1)) {
436
460
  pi->s += 2;
437
461
  read_comment(pi);
438
- } else if (0 == strncmp("[CDATA[", pi->s, 7)) {
462
+ } else if ((TolerantEffort == pi->options->effort) ?
463
+ 0 == strncasecmp("[CDATA[", pi->s, 7) :
464
+ 0 == strncmp("[CDATA[", pi->s, 7)) {
439
465
  pi->s += 7;
440
466
  read_cdata(pi);
441
467
  } else {
@@ -455,7 +481,12 @@ read_element(PInfo pi) {
455
481
  c = *pi->s;
456
482
  *end = '\0';
457
483
  if (0 != strcmp(name, ename)) {
458
- raise_error("invalid format, elements overlap", pi->str, pi->s);
484
+ if (TolerantEffort == pi->options->effort) {
485
+ pi->pcb->end_element(pi, ename);
486
+ return name;
487
+ } else {
488
+ raise_error("invalid format, elements overlap", pi->str, pi->s);
489
+ }
459
490
  }
460
491
  if ('>' != c) {
461
492
  raise_error("invalid format, element not closed", pi->str, pi->s);
@@ -467,13 +498,27 @@ read_element(PInfo pi) {
467
498
  }
468
499
  pi->s++;
469
500
  pi->pcb->end_element(pi, ename);
470
- return;
501
+ return 0;
471
502
  case '\0':
472
- raise_error("invalid format, document not terminated", pi->str, pi->s);
503
+ if (TolerantEffort == pi->options->effort) {
504
+ return 0;
505
+ } else {
506
+ raise_error("invalid format, document not terminated", pi->str, pi->s);
507
+ }
473
508
  default:
474
509
  first = 0;
475
510
  /* a child element */
476
- read_element(pi);
511
+ // Child closed with mismatched name.
512
+ if (0 != (name = read_element(pi))) {
513
+ if (0 == strcmp(name, ename)) {
514
+ pi->s++;
515
+ pi->pcb->end_element(pi, ename);
516
+ return 0;
517
+ } else { // not the correct element yet
518
+ pi->pcb->end_element(pi, ename);
519
+ return name;
520
+ }
521
+ }
477
522
  break;
478
523
  }
479
524
  } else { /* read as TEXT */
@@ -489,11 +534,12 @@ read_element(PInfo pi) {
489
534
  /* close tag after text so treat as a value */
490
535
  pi->s += elen + 3;
491
536
  pi->pcb->end_element(pi, ename);
492
- return;
537
+ return 0;
493
538
  }
494
539
  }
495
540
  }
496
541
  }
542
+ return 0;
497
543
  }
498
544
 
499
545
  static void
@@ -697,10 +743,31 @@ read_quoted_value(PInfo pi) {
697
743
  raise_error("invalid format, document not terminated", pi->str, pi->s);
698
744
  }
699
745
  }
700
- *pi->s = '\0'; /* terminate value */
701
- pi->s++; /* move past quote */
746
+ *pi->s = '\0'; /* terminate value */
747
+ pi->s++; /* move past quote */
702
748
  } else if (StrictEffort == pi->options->effort) {
703
749
  raise_error("invalid format, expected a quote character", pi->str, pi->s);
750
+ } else if (TolerantEffort == pi->options->effort) {
751
+ value = pi->s;
752
+ for (; 1; pi->s++) {
753
+ switch (*pi->s) {
754
+ case '\0':
755
+ raise_error("invalid format, document not terminated", pi->str, pi->s);
756
+ case ' ':
757
+ case '/':
758
+ case '>':
759
+ case '?': // for instructions
760
+ case '\t':
761
+ case '\n':
762
+ case '\r':
763
+ pi->last = *pi->s;
764
+ *pi->s = '\0'; /* terminate value */
765
+ pi->s++;
766
+ return value;
767
+ default:
768
+ break;
769
+ }
770
+ }
704
771
  } else {
705
772
  value = pi->s;
706
773
  next_white(pi);
@@ -812,7 +879,7 @@ read_coded_chars(PInfo pi, char *text) {
812
879
  }
813
880
  }
814
881
  if (b > end) {
815
- *text++ = *pi->s;
882
+ *text++ = '&';
816
883
  } else if ('#' == *buf) {
817
884
  uint64_t u = 0;
818
885
 
@@ -823,9 +890,8 @@ read_coded_chars(PInfo pi, char *text) {
823
890
  b = read_10_uint64(b, &u);
824
891
  }
825
892
  if (0 == b) {
826
- *text++ = *pi->s;
893
+ *text++ = '&';
827
894
  } else {
828
- pi->s = s;
829
895
  if (u <= 0x000000000000007FULL) {
830
896
  *text++ = (char)u;
831
897
  #if HAS_PRIVATE_ENCODING
@@ -842,10 +908,14 @@ read_coded_chars(PInfo pi, char *text) {
842
908
  #endif
843
909
  pi->options->rb_enc = ox_utf8_encoding;
844
910
  text = ucs_to_utf8_chars(text, u);
911
+ } else if (TolerantEffort == pi->options->effort) {
912
+ *text++ = '&';
913
+ return text;
845
914
  } else {
846
915
  /*raise_error("Invalid encoding, need UTF-8 or UTF-16 encoding to parse &#nnnn; character sequences.", pi->str, pi->s); */
847
916
  raise_error("Invalid encoding, need UTF-8 encoding to parse &#nnnn; character sequences.", pi->str, pi->s);
848
917
  }
918
+ pi->s = s;
849
919
  }
850
920
  } else if (0 == strcasecmp(buf, "nbsp;")) {
851
921
  pi->s = s;
@@ -866,7 +936,7 @@ read_coded_chars(PInfo pi, char *text) {
866
936
  pi->s = s;
867
937
  *text++ = '\'';
868
938
  } else {
869
- *text++ = *pi->s;
939
+ *text++ = '&';
870
940
  }
871
941
  return text;
872
942
  }
@@ -884,15 +954,26 @@ collapse_special(PInfo pi, char *str) {
884
954
  s++;
885
955
  if ('#' == *s) {
886
956
  uint64_t u = 0;
957
+ char x;
887
958
 
888
959
  s++;
889
960
  if ('x' == *s || 'X' == *s) {
961
+ x = *s;
890
962
  s++;
891
963
  end = read_hex_uint64(s, &u);
892
964
  } else {
965
+ x = '\0';
893
966
  end = read_10_uint64(s, &u);
894
967
  }
895
968
  if (0 == end) {
969
+ if (TolerantEffort == pi->options->effort) {
970
+ *b++ = '&';
971
+ *b++ = '#';
972
+ if ('\0' != x) {
973
+ *b++ = x;
974
+ }
975
+ continue;
976
+ }
896
977
  return EDOM;
897
978
  }
898
979
  if (u <= 0x000000000000007FULL) {
@@ -933,6 +1014,9 @@ collapse_special(PInfo pi, char *str) {
933
1014
  } else if (0 == strncasecmp(s, "apos;", 5)) {
934
1015
  c = '\'';
935
1016
  s += 5;
1017
+ } else if (TolerantEffort == pi->options->effort) {
1018
+ *b++ = '&';
1019
+ continue;
936
1020
  } else {
937
1021
  c = '?';
938
1022
  while (';' != *s++) {
@@ -42,6 +42,8 @@
42
42
  #include "ruby.h"
43
43
  #include "ox.h"
44
44
 
45
+ #define NAME_MISMATCH 1
46
+
45
47
  typedef struct _SaxDrive {
46
48
  char base_buf[0x00010000];
47
49
  char *buf;
@@ -55,6 +57,7 @@ typedef struct _SaxDrive {
55
57
  VALUE value_obj;
56
58
  int (*read_func)(struct _SaxDrive *dr);
57
59
  int convert_special;
60
+ int tolerant;
58
61
  union {
59
62
  int fd;
60
63
  VALUE io;
@@ -84,7 +87,7 @@ typedef struct _SaxDrive {
84
87
  #ifdef NEEDS_STPCPY
85
88
  char *stpncpy(char *dest, const char *src, size_t n);
86
89
  #endif
87
- static void sax_drive_init(SaxDrive dr, VALUE handler, VALUE io, int convert);
90
+ static void sax_drive_init(SaxDrive dr, VALUE handler, VALUE io, int convert, int tolerant);
88
91
  static void sax_drive_cleanup(SaxDrive dr);
89
92
  static int sax_drive_read(SaxDrive dr);
90
93
  static void sax_drive_error(SaxDrive dr, const char *msg, int critical);
@@ -98,8 +101,8 @@ static int read_element(SaxDrive dr);
98
101
  static int read_text(SaxDrive dr);
99
102
  static const char* read_attrs(SaxDrive dr, char c, char termc, char term2, int is_xml);
100
103
  static char read_name_token(SaxDrive dr);
101
- static int read_quoted_value(SaxDrive dr);
102
- static int collapse_special(char *str);
104
+ static int read_quoted_value(SaxDrive dr, char *last);
105
+ static int collapse_special(char *str, int tolerant);
103
106
 
104
107
  static VALUE rescue_cb(VALUE rdr, VALUE err);
105
108
  static VALUE io_cb(VALUE rdr);
@@ -251,10 +254,10 @@ str2sym(const char *str, SaxDrive dr, char **strp) {
251
254
 
252
255
 
253
256
  void
254
- ox_sax_parse(VALUE handler, VALUE io, int convert) {
257
+ ox_sax_parse(VALUE handler, VALUE io, int convert, int tolerant) {
255
258
  struct _SaxDrive dr;
256
259
 
257
- sax_drive_init(&dr, handler, io, convert);
260
+ sax_drive_init(&dr, handler, io, convert, tolerant);
258
261
  #if 0
259
262
  printf("*** sax_parse with these flags\n");
260
263
  printf(" has_instruct = %s\n", dr.has_instruct ? "true" : "false");
@@ -293,7 +296,7 @@ respond_to(VALUE obj, ID method) {
293
296
  }
294
297
 
295
298
  static void
296
- sax_drive_init(SaxDrive dr, VALUE handler, VALUE io, int convert) {
299
+ sax_drive_init(SaxDrive dr, VALUE handler, VALUE io, int convert, int tolerant) {
297
300
  if (ox_stringio_class == rb_obj_class(io)) {
298
301
  VALUE s = rb_funcall2(io, ox_string_id, 0, 0);
299
302
 
@@ -344,6 +347,7 @@ sax_drive_init(SaxDrive dr, VALUE handler, VALUE io, int convert) {
344
347
  dr->value_obj = rb_data_object_alloc(sax_value_class, dr, 0, 0);
345
348
  rb_gc_register_address(&dr->value_obj);
346
349
  dr->convert_special = convert;
350
+ dr->tolerant = tolerant;
347
351
  dr->has_instruct = respond_to(handler, ox_instruct_id);
348
352
  dr->has_end_instruct = respond_to(handler, ox_end_instruct_id);
349
353
  dr->has_attr = respond_to(handler, ox_attr_id);
@@ -485,7 +489,7 @@ read_children(SaxDrive dr, int first) {
485
489
  if ('\0' == c || (is_white(c) && '\0' == (c = next_non_white(dr)))) {
486
490
  if (!first) {
487
491
  sax_drive_error(dr, "invalid format, element not terminated", 1);
488
- err = 1;
492
+ err = -1;
489
493
  }
490
494
  break; /* normal completion if first */
491
495
  }
@@ -525,12 +529,12 @@ read_children(SaxDrive dr, int first) {
525
529
  for (i = 7; 0 < i; i--) {
526
530
  sax_drive_get(dr);
527
531
  }
528
- if (0 == strncmp("DOCTYPE", dr->str, 7)) {
532
+ if ((dr->tolerant) ? 0 == strncasecmp("DOCTYPE", dr->str, 7) : 0 == strncmp("DOCTYPE", dr->str, 7)) {
529
533
  if (element_read || !first) {
530
534
  sax_drive_error(dr, "invalid format, DOCTYPE can not come after an element", 0);
531
535
  }
532
536
  err = read_doctype(dr);
533
- } else if (0 == strncmp("[CDATA[", dr->str, 7)) {
537
+ } else if ((dr->tolerant) ? 0 == strncasecmp("[CDATA[", dr->str, 7) : 0 == strncmp("[CDATA[", dr->str, 7)) {
534
538
  err = read_cdata(dr);
535
539
  } else {
536
540
  sax_drive_error(dr, "invalid format, DOCTYPE or comment expected", 1);
@@ -552,7 +556,7 @@ read_children(SaxDrive dr, int first) {
552
556
  break;
553
557
  default:
554
558
  backup(dr); /* safe since no read occurred after getting last character */
555
- if (first && element_read) {
559
+ if (first && element_read && !dr->tolerant) {
556
560
  sax_drive_error(dr, "invalid format, multiple top level elements", 0);
557
561
  }
558
562
  err = read_element(dr);
@@ -632,7 +636,7 @@ read_instruction(SaxDrive dr) {
632
636
  VALUE args[1];
633
637
 
634
638
  if (dr->convert_special) {
635
- if (0 != collapse_special(content)) {
639
+ if (0 != collapse_special(content, dr->tolerant)) {
636
640
  sax_drive_error(dr, "invalid format, special character does not end with a semicolon", 0);
637
641
  }
638
642
  }
@@ -836,6 +840,7 @@ read_element(SaxDrive dr) {
836
840
  int closed;
837
841
  int line = dr->line;
838
842
  int col = dr->col - 1;
843
+ int e;
839
844
 
840
845
  if ('\0' == (c = read_name_token(dr))) {
841
846
  return -1;
@@ -887,8 +892,28 @@ read_element(SaxDrive dr) {
887
892
  rb_funcall2(dr->handler, ox_end_element_id, 1, args);
888
893
  }
889
894
  } else {
890
- if (0 != read_children(dr, 0)) {
891
- return -1;
895
+ if (0 != (e = read_children(dr, 0))) {
896
+ if (NAME_MISMATCH == e) {
897
+ if (0 != dr->has_end_element) {
898
+ VALUE args[1];
899
+
900
+ if (dr->has_line) {
901
+ rb_ivar_set(dr->handler, ox_at_line_id, INT2FIX(line));
902
+ }
903
+ if (dr->has_column) {
904
+ rb_ivar_set(dr->handler, ox_at_column_id, INT2FIX(col - 2));
905
+ }
906
+ args[0] = name;
907
+ rb_funcall2(dr->handler, ox_end_element_id, 1, args);
908
+ }
909
+ if (0 == strcmp(dr->str, ename)) {
910
+ return 0;
911
+ } else {
912
+ return NAME_MISMATCH;
913
+ }
914
+ } else {
915
+ return -1;
916
+ }
892
917
  }
893
918
  line = dr->line;
894
919
  col = dr->col;
@@ -902,8 +927,24 @@ read_element(SaxDrive dr) {
902
927
  rb_ivar_set(dr->handler, ox_at_column_id, INT2FIX(col));
903
928
  }
904
929
  //printf("*** ename: %s close: %s\n", ename, dr->str);
905
- sax_drive_error(dr, "invalid format, element start and end names do not match", 1);
906
- return -1;
930
+ if (dr->tolerant) {
931
+ if (0 != dr->has_end_element) {
932
+ VALUE args[1];
933
+
934
+ if (dr->has_line) {
935
+ rb_ivar_set(dr->handler, ox_at_line_id, INT2FIX(line));
936
+ }
937
+ if (dr->has_column) {
938
+ rb_ivar_set(dr->handler, ox_at_column_id, INT2FIX(col - 2));
939
+ }
940
+ args[0] = name;
941
+ rb_funcall2(dr->handler, ox_end_element_id, 1, args);
942
+ }
943
+ return NAME_MISMATCH; // dr->str is still the name
944
+ } else {
945
+ sax_drive_error(dr, "invalid format, element start and end names do not match", 1);
946
+ return -1;
947
+ }
907
948
  }
908
949
  if (0 != dr->has_end_element) {
909
950
  VALUE args[1];
@@ -953,7 +994,7 @@ read_text(SaxDrive dr) {
953
994
  VALUE args[1];
954
995
 
955
996
  if (dr->convert_special) {
956
- if (0 != collapse_special(dr->str) && 0 != strchr(dr->str, '&')) {
997
+ if (0 != collapse_special(dr->str, dr->tolerant) && 0 != strchr(dr->str, '&')) {
957
998
  sax_drive_error(dr, "invalid format, special character does not end with a semicolon", 0);
958
999
  }
959
1000
  }
@@ -986,22 +1027,24 @@ read_attrs(SaxDrive dr, char c, char termc, char term2, int is_xml) {
986
1027
  int is_encoding = 0;
987
1028
  int line;
988
1029
  int col;
1030
+ char last;
1031
+ char *attr_value;
989
1032
 
990
1033
  dr->str = dr->cur; /* lock it down */
991
1034
  if (is_white(c)) {
992
1035
  c = next_non_white(dr);
993
1036
  }
994
1037
  while (termc != c && term2 != c) {
995
- backup(dr);
1038
+ backup(dr);
996
1039
  line = dr->line;
997
1040
  col = dr->col;
998
1041
  if ('\0' == c) {
999
1042
  return "invalid format, attributes not terminated";
1000
1043
  }
1001
1044
  if ('\0' == (c = read_name_token(dr))) {
1002
- return "error reading tolen";
1045
+ return "error reading token";
1003
1046
  }
1004
- if (is_xml && 0 == strcmp("encoding", dr->str)) {
1047
+ if (is_xml && 0 == strcasecmp("encoding", dr->str)) {
1005
1048
  is_encoding = 1;
1006
1049
  }
1007
1050
  /* TBD use symbol cache */
@@ -1011,20 +1054,28 @@ read_attrs(SaxDrive dr, char c, char termc, char term2, int is_xml) {
1011
1054
  if (is_white(c)) {
1012
1055
  c = next_non_white(dr);
1013
1056
  }
1057
+ last = '\0';
1014
1058
  if ('=' != c) {
1015
- return "invalid format, no attribute value";
1016
- }
1017
- if (0 != read_quoted_value(dr)) {
1018
- return "error reading quoted value";
1019
- }
1020
- if (is_encoding) {
1059
+ if (dr->tolerant) {
1060
+ attr_value = (char*)"";
1061
+ last = c;
1062
+ } else {
1063
+ return "invalid format, no attribute value";
1064
+ }
1065
+ } else {
1066
+ if (0 != read_quoted_value(dr, &last)) {
1067
+ return "error reading quoted value";
1068
+ }
1069
+ attr_value = dr->str;
1070
+ if (is_encoding) {
1021
1071
  #if HAS_ENCODING_SUPPORT
1022
- dr->encoding = rb_enc_find(dr->str);
1072
+ dr->encoding = rb_enc_find(dr->str);
1023
1073
  #elif HAS_PRIVATE_ENCODING
1024
- dr->encoding = rb_str_new2(dr->str);
1074
+ dr->encoding = rb_str_new2(dr->str);
1025
1075
  #endif
1026
- is_encoding = 0;
1027
- }
1076
+ is_encoding = 0;
1077
+ }
1078
+ }
1028
1079
  if (dr->has_attr_value) {
1029
1080
  VALUE args[2];
1030
1081
 
@@ -1041,10 +1092,10 @@ read_attrs(SaxDrive dr, char c, char termc, char term2, int is_xml) {
1041
1092
  VALUE args[2];
1042
1093
 
1043
1094
  args[0] = name;
1044
- if (0 != collapse_special(dr->str) && 0 != strchr(dr->str, '&')) {
1095
+ if (0 != collapse_special(dr->str, dr->tolerant) && 0 != strchr(dr->str, '&')) {
1045
1096
  sax_drive_error(dr, "invalid format, special character does not end with a semicolon", 0);
1046
1097
  }
1047
- args[1] = rb_str_new2(dr->str);
1098
+ args[1] = rb_str_new2(attr_value);
1048
1099
  #if HAS_ENCODING_SUPPORT
1049
1100
  if (0 != dr->encoding) {
1050
1101
  rb_enc_associate(args[1], dr->encoding);
@@ -1062,7 +1113,11 @@ read_attrs(SaxDrive dr, char c, char termc, char term2, int is_xml) {
1062
1113
  }
1063
1114
  rb_funcall2(dr->handler, ox_attr_id, 2, args);
1064
1115
  }
1065
- c = next_non_white(dr);
1116
+ if ('\0' != last) {
1117
+ c = last;
1118
+ } else {
1119
+ c = next_non_white(dr);
1120
+ }
1066
1121
  }
1067
1122
  dr->str = 0;
1068
1123
 
@@ -1105,7 +1160,7 @@ read_name_token(SaxDrive dr) {
1105
1160
  }
1106
1161
 
1107
1162
  static int
1108
- read_quoted_value(SaxDrive dr) {
1163
+ read_quoted_value(SaxDrive dr, char *last) {
1109
1164
  char c;
1110
1165
 
1111
1166
  dr->str = dr->cur;
@@ -1123,6 +1178,26 @@ read_quoted_value(SaxDrive dr) {
1123
1178
  return -1;
1124
1179
  }
1125
1180
  }
1181
+ } else if (dr->tolerant) {
1182
+ dr->str = dr->cur - 1;
1183
+ while ('\0' != (c = sax_drive_get(dr))) {
1184
+ switch (c) {
1185
+ case '\0':
1186
+ sax_drive_error(dr, "invalid format, non quoted value not terminated", 1);
1187
+ case ' ':
1188
+ case '/':
1189
+ case '>':
1190
+ case '?': // for instructions
1191
+ case '\t':
1192
+ case '\n':
1193
+ case '\r':
1194
+ *last = c;
1195
+ *(dr->cur - 1) = '\0'; /* terminate value */
1196
+ return 0;
1197
+ default:
1198
+ break;
1199
+ }
1200
+ }
1126
1201
  } else {
1127
1202
  dr->str = dr->cur - 1;
1128
1203
  if ('\0' == (c = next_white(dr))) {
@@ -1240,7 +1315,7 @@ read_from_str(SaxDrive dr) {
1240
1315
  }
1241
1316
 
1242
1317
  static int
1243
- collapse_special(char *str) {
1318
+ collapse_special(char *str, int tolerant) {
1244
1319
  char *s = str;
1245
1320
  char *b = str;
1246
1321
 
@@ -1248,17 +1323,27 @@ collapse_special(char *str) {
1248
1323
  if ('&' == *s) {
1249
1324
  int c;
1250
1325
  char *end;
1326
+ int x = 0;
1251
1327
 
1252
1328
  s++;
1253
1329
  if ('#' == *s) {
1254
1330
  s++;
1255
1331
  if ('x' == *s || 'X' == *s) {
1256
1332
  s++;
1333
+ x = 1;
1257
1334
  c = (int)strtol(s, &end, 16);
1258
1335
  } else {
1259
1336
  c = (int)strtol(s, &end, 10);
1260
1337
  }
1261
1338
  if (';' != *end) {
1339
+ if (tolerant) {
1340
+ *b++ = '&';
1341
+ *b++ = '#';
1342
+ if (x) {
1343
+ *b++ = *(s - 1);
1344
+ }
1345
+ continue;
1346
+ }
1262
1347
  return EDOM;
1263
1348
  }
1264
1349
  s = end + 1;
@@ -1277,6 +1362,9 @@ collapse_special(char *str) {
1277
1362
  } else if (0 == strncasecmp(s, "apos;", 5)) {
1278
1363
  c = '\'';
1279
1364
  s += 5;
1365
+ } else if (tolerant) {
1366
+ *b++ = '&';
1367
+ continue;
1280
1368
  } else {
1281
1369
  c = '?';
1282
1370
  while (';' != *s++) {
@@ -1397,7 +1485,7 @@ sax_value_as_s(VALUE self) {
1397
1485
  return Qnil;
1398
1486
  }
1399
1487
  if (dr->convert_special) {
1400
- if (0 != collapse_special(dr->str) && 0 != strchr(dr->str, '&')) {
1488
+ if (0 != collapse_special(dr->str, dr->tolerant) && 0 != strchr(dr->str, '&')) {
1401
1489
  sax_drive_error(dr, "invalid format, special character does not end with a semicolon", 0);
1402
1490
  }
1403
1491
  }
@@ -1,5 +1,5 @@
1
1
 
2
2
  module Ox
3
3
  # Current version of the module.
4
- VERSION = '1.9.2'
4
+ VERSION = '1.9.3'
5
5
  end
metadata CHANGED
@@ -1,19 +1,21 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ox
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.9.2
4
+ version: 1.9.3
5
+ prerelease:
5
6
  platform: ruby
6
7
  authors:
7
8
  - Peter Ohler
8
9
  autorequire:
9
10
  bindir: bin
10
11
  cert_chain: []
11
- date: 2013-03-09 00:00:00.000000000 Z
12
+ date: 2013-03-22 00:00:00.000000000 Z
12
13
  dependencies: []
13
- description: "A fast XML parser and object serializer that uses only standard C lib.\n
14
- \ \nOptimized XML (Ox), as the name implies was written to provide speed
15
- optimized\nXML handling. It was designed to be an alternative to Nokogiri and other
16
- Ruby\nXML parsers for generic XML parsing and as an alternative to Marshal for Object\nserialization. "
14
+ description: ! "A fast XML parser and object serializer that uses only standard C
15
+ lib.\n \nOptimized XML (Ox), as the name implies was written to provide
16
+ speed optimized\nXML handling. It was designed to be an alternative to Nokogiri
17
+ and other Ruby\nXML parsers for generic XML parsing and as an alternative to Marshal
18
+ for Object\nserialization. "
17
19
  email: peter@ohler.com
18
20
  executables: []
19
21
  extensions:
@@ -54,7 +56,6 @@ files:
54
56
  - README.md
55
57
  homepage: http://www.ohler.com/ox
56
58
  licenses: []
57
- metadata: {}
58
59
  post_install_message:
59
60
  rdoc_options:
60
61
  - --main
@@ -63,20 +64,22 @@ require_paths:
63
64
  - lib
64
65
  - ext
65
66
  required_ruby_version: !ruby/object:Gem::Requirement
67
+ none: false
66
68
  requirements:
67
- - - '>='
69
+ - - ! '>='
68
70
  - !ruby/object:Gem::Version
69
71
  version: '0'
70
72
  required_rubygems_version: !ruby/object:Gem::Requirement
73
+ none: false
71
74
  requirements:
72
- - - '>='
75
+ - - ! '>='
73
76
  - !ruby/object:Gem::Version
74
77
  version: '0'
75
78
  requirements: []
76
79
  rubyforge_project: ox
77
- rubygems_version: 2.0.0
80
+ rubygems_version: 1.8.23
78
81
  signing_key:
79
- specification_version: 4
82
+ specification_version: 3
80
83
  summary: A fast XML parser and object serializer.
81
84
  test_files: []
82
85
  has_rdoc: true
checksums.yaml DELETED
@@ -1,7 +0,0 @@
1
- ---
2
- SHA1:
3
- metadata.gz: be07680b4d502d3d808f0a3b7ad63e3a66685211
4
- data.tar.gz: d83f1a8a18ab5f30c9aceb1d46aea0e51dc30298
5
- SHA512:
6
- metadata.gz: 63813c9d94cb42e86b47c57b66f6d0fc5b0d6150717b85cc43b1021b4cfa1729602f2d0f7b085767ddf28a609bd8ec182a5edd3a5090819cf85da3e46d518a7b
7
- data.tar.gz: 82374d21d291afbc5222b6458f7ef1fce2f5e661c28a89a0b1633c9ea93e4659fc7100809722aa3c4bacc0a58387a5edc4481bd8fde6cddac21d31281ab0a625