ox 1.9.4 → 2.0.0

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of ox might be problematic. Click here for more details.

@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 31bc59694575361e4f784ffc8eb8b37fdbb08057
4
+ data.tar.gz: 9332c70a159d96aa167cc738ac0a0e5bbb2ff6e9
5
+ SHA512:
6
+ metadata.gz: 8ee7dccef40d6141d12b1d8eee8a5b92e5d0ad4b04a0c253086c4c333961da1894eb6a450031bf6e4476f6d62c0fb4392793786ae9fdcc2dd84db7597ac8b2bf
7
+ data.tar.gz: 22bed155c466594191d54ac1ed6db28d6a442c54b03836bf94378590af164fc2d2f67713d44ecd4cfedad1c2aec3bb06df5ca029eff746753b2f8715e2f6f8c9
data/README.md CHANGED
@@ -34,31 +34,29 @@ A fast XML parser and Object marshaller as a Ruby gem.
34
34
 
35
35
  ## <a name="release">Release Notes</a>
36
36
 
37
- ### Release 1.9.4
37
+ ### Release 2.0.0
38
38
 
39
- - SAX tolerant mode handle multiple elements in a document better.
39
+ - The SAX parser went through a significant re-write. The options have changed. It is now 15% faster on large files and
40
+ much better at recovering from errors. So much so that the tolerant option was removed and is now the default and
41
+ only behavior. A smart option was added, however. The smart option recognizes a file as an HTML file and will apply a
42
+ simple set of validation rules that allow the HTML to be parsed more reasonably. Errors will cause callbacks but the
43
+ parsing continues with the best guess as to how to recover. Rubymaniac has helped with testing and prompted the
44
+ rewrite to support parsing HTML pages.
40
45
 
41
- ### Release 1.9.3
46
+ - HTML is now supported with the SAX parser. The parser knows some tags like \<br\> or \<img\> do not have to be
47
+ closed. Other hints as to how to parse and when to raise errors are also included. The parser does its best to
48
+ continue parsing even after errors.
42
49
 
43
- - mcarpenter fixed a compile problem with Cygwin.
50
+ - Added symbolize option to the sax parser. This option, if set to false, will use strings instead of symbols for
51
+ element and attribute names.
44
52
 
45
- - Now more tolerant when the :effort is set to :tolerant. Ox will let all sorts
46
- of errors typical in HTML documents pass. The result may not be perfect but
47
- at least parsed results are returned.
48
-
49
- - Attribute values need not be quoted or they can be quoted with single
50
- quotes or there can be no =value are all.
51
-
52
- - Elements not terminated will be terminated by the next element
53
- termination. This effect goes up until a match is found on the element
54
- name.
55
-
56
- - SAX parser also given a :tolerant option with the same tolerance as the string parser.
53
+ - A contrib directory was added for people to submit useful bits of code that can be used with Ox. The first
54
+ contributor is Notezen with a nice way of building XML.
57
55
 
58
56
  ## <a name="description">Description</a>
59
57
 
60
58
  Optimized XML (Ox), as the name implies, was written to provide speed optimized
61
- XML handling. It was designed to be an alternative to Nokogiri and other Ruby
59
+ XML and now HTML handling. It was designed to be an alternative to Nokogiri and other Ruby
62
60
  XML parsers in generic XML parsing and as an alternative to Marshal for Object
63
61
  serialization.
64
62
 
@@ -99,6 +97,7 @@ Ox is compatible with Ruby 1.8.7, 1.9.2, JRuby, and RBX.
99
97
 
100
98
  ### Object Dump Sample:
101
99
 
100
+ ```ruby
102
101
  require 'ox'
103
102
 
104
103
  class Sample
@@ -117,9 +116,11 @@ Ox is compatible with Ruby 1.8.7, 1.9.2, JRuby, and RBX.
117
116
  xml = Ox.dump(obj)
118
117
  # Convert the object back into a Sample Object.
119
118
  obj2 = Ox.parse_obj(xml)
119
+ ```
120
120
 
121
121
  ### Generic XML Writing and Parsing:
122
122
 
123
+ ```ruby
123
124
  require 'ox'
124
125
 
125
126
  doc = Ox::Document.new(:version => '1.0')
@@ -148,9 +149,11 @@ Ox is compatible with Ruby 1.8.7, 1.9.2, JRuby, and RBX.
148
149
  doc2 = Ox.parse(xml)
149
150
  puts "Same? #{doc == doc2}"
150
151
  # true
152
+ ```
151
153
 
152
154
  ### SAX XML Parsing:
153
155
 
156
+ ```ruby
154
157
  require 'stringio'
155
158
  require 'ox'
156
159
 
@@ -181,6 +184,7 @@ Ox is compatible with Ruby 1.8.7, 1.9.2, JRuby, and RBX.
181
184
  # end: bottom
182
185
  # end: middle
183
186
  # end: top
187
+ ```
184
188
 
185
189
  ### Object XML format
186
190
 
@@ -63,7 +63,7 @@ ox_cache_new(Cache *cache) {
63
63
  *cache = ALLOC(struct _Cache);
64
64
  (*cache)->key = 0;
65
65
  (*cache)->value = Qundef;
66
- bzero((*cache)->slots, sizeof((*cache)->slots));
66
+ memset((*cache)->slots, 0, sizeof((*cache)->slots));
67
67
  }
68
68
 
69
69
  VALUE
@@ -131,7 +131,6 @@ ox_cache_get(Cache cache, const char *key, VALUE **slot, char **keyp) {
131
131
  *slot = &cache->value;
132
132
  if (0 != keyp) {
133
133
  if (0 == cache->key) {
134
- // TBD bug somewhere
135
134
  printf("*** Error: failed to set the key for %s\n", key);
136
135
  *keyp = 0;
137
136
  } else {
@@ -595,9 +595,9 @@ dump_obj(ID aid, VALUE obj, unsigned int depth, Out out) {
595
595
  e.closed = (0 >= cnt);
596
596
  out->w_start(out, &e);
597
597
  if (!e.closed) {
598
- VALUE *np = RARRAY_PTR(obj);
599
- int i;
600
- int d2 = depth + 1;
598
+ const VALUE *np = RARRAY_PTR(obj);
599
+ int i;
600
+ int d2 = depth + 1;
601
601
 
602
602
  for (i = cnt; 0 < i; i--, np++) {
603
603
  dump_obj(0, *np, d2, out);
@@ -835,10 +835,10 @@ dump_obj(ID aid, VALUE obj, unsigned int depth, Out out) {
835
835
  e.closed = (0 >= cnt);
836
836
  out->w_start(out, &e);
837
837
  if (0 < cnt) {
838
- VALUE *np = RARRAY_PTR(vars);
839
- ID vid;
840
- unsigned int od = out->depth;
841
- int i;
838
+ const VALUE *np = RARRAY_PTR(vars);
839
+ ID vid;
840
+ unsigned int od = out->depth;
841
+ int i;
842
842
 
843
843
  out->depth = depth + 1;
844
844
  for (i = cnt; 0 < i; i--, np++) {
@@ -1105,9 +1105,9 @@ dump_gen_nodes(VALUE obj, unsigned int depth, Out out) {
1105
1105
  int indent_needed = 1;
1106
1106
 
1107
1107
  if (0 < cnt) {
1108
- VALUE *np = RARRAY_PTR(obj);
1109
- VALUE clas;
1110
- int d2 = depth + 1;
1108
+ const VALUE *np = RARRAY_PTR(obj);
1109
+ VALUE clas;
1110
+ int d2 = depth + 1;
1111
1111
 
1112
1112
  for (; 0 < cnt; cnt--, np++) {
1113
1113
  clas = rb_obj_class(*np);
@@ -1117,7 +1117,7 @@ dump_gen_nodes(VALUE obj, unsigned int depth, Out out) {
1117
1117
  dump_gen_instruct(*np, d2, out);
1118
1118
  indent_needed = (1 == cnt) ? 0 : 1;
1119
1119
  } else if (rb_cString == clas) {
1120
- dump_str_value(out, StringValuePtr(*np), RSTRING_LEN(*np));
1120
+ dump_str_value(out, StringValuePtr(*(VALUE*)np), RSTRING_LEN(*np));
1121
1121
  indent_needed = (1 == cnt) ? 0 : 1;
1122
1122
  } else if (ox_comment_clas == clas) {
1123
1123
  dump_gen_val_node(*np, d2, "<!-- ", 5, " -->", 4, out);
@@ -4,7 +4,7 @@ extension_name = 'ox'
4
4
  dir_config(extension_name)
5
5
 
6
6
  parts = RUBY_DESCRIPTION.split(' ')
7
- type = parts[0]
7
+ type = parts[0].downcase()
8
8
  type = 'ree' if 'ruby' == type && RUBY_DESCRIPTION.include?('Ruby Enterprise Edition')
9
9
  platform = RUBY_PLATFORM
10
10
  version = RUBY_VERSION.split('.')
@@ -21,8 +21,10 @@ dflags = {
21
21
  #'HAS_RB_TIME_TIMESPEC' => ('ruby' == type && ('1.9.3' == RUBY_VERSION || '2' <= version[0])) ? 1 : 0,
22
22
  'HAS_TM_GMTOFF' => ('ruby' == type && (('1' == version[0] && '9' == version[1]) || '2' <= version[0]) &&
23
23
  !(platform.include?('cygwin') || platform.include?('solaris') || platform.include?('linux') || RUBY_PLATFORM =~ /(win|w)32$/)) ? 1 : 0,
24
- 'HAS_ENCODING_SUPPORT' => (('ruby' == type || 'rubinius' == type) &&
24
+ 'HAS_ENCODING_SUPPORT' => (('ruby' == type || 'rubinius' == type || 'macruby' == type) &&
25
25
  (('1' == version[0] && '9' == version[1]) || '2' <= version[0])) ? 1 : 0,
26
+ 'HAS_ONIG' => (('ruby' == type || 'jruby' == type || 'rubinius' == type) &&
27
+ (('1' == version[0] && '9' == version[1]) || '2' <= version[0])) ? 1 : 0,
26
28
  'HAS_PRIVATE_ENCODING' => ('jruby' == type && '1' == version[0] && '9' == version[1]) ? 1 : 0,
27
29
  'HAS_NANO_TIME' => ('ruby' == type && ('1' == version[0] && '9' == version[1]) || '2' <= version[0]) ? 1 : 0,
28
30
  'HAS_RSTRUCT' => ('ruby' == type || 'ree' == type) ? 1 : 0,
@@ -382,16 +382,7 @@ parse_regexp(const char *text) {
382
382
  int options = 0;
383
383
 
384
384
  te = text + strlen(text) - 1;
385
- #if HAS_ENCODING_SUPPORT
386
- for (; text < te && '/' != *te; te--) {
387
- switch (*te) {
388
- case 'i': options |= ONIG_OPTION_IGNORECASE; break;
389
- case 'm': options |= ONIG_OPTION_MULTILINE; break;
390
- case 'x': options |= ONIG_OPTION_EXTEND; break;
391
- default: break;
392
- }
393
- }
394
- #elif HAS_PRIVATE_ENCODING
385
+ #if HAS_ONIG
395
386
  for (; text < te && '/' != *te; te--) {
396
387
  switch (*te) {
397
388
  case 'i': options |= ONIG_OPTION_IGNORECASE; break;
@@ -35,6 +35,7 @@
35
35
 
36
36
  #include "ruby.h"
37
37
  #include "ox.h"
38
+ #include "sax.h"
38
39
 
39
40
  /* maximum to allocate on the stack, arbitrary limit */
40
41
  #define SMALL_XML 65536
@@ -128,8 +129,9 @@ static VALUE object_sym;
128
129
  static VALUE opt_format_sym;
129
130
  static VALUE optimized_sym;
130
131
  static VALUE strict_sym;
131
- static VALUE strict_sym;
132
+ static VALUE smart_sym;
132
133
  static VALUE symbolize_keys_sym;
134
+ static VALUE symbolize_sym;
133
135
  static VALUE tolerant_sym;
134
136
  static VALUE trace_sym;
135
137
  static VALUE with_dtd_sym;
@@ -306,7 +308,7 @@ set_def_opts(VALUE self, VALUE opts) {
306
308
  } else {
307
309
  Check_Type(v, T_STRING);
308
310
  strncpy(ox_default_options.encoding, StringValuePtr(v), sizeof(ox_default_options.encoding) - 1);
309
- #ifdef HAVE_RUBY_ENCODING_H
311
+ #if HAS_ENCODING_SUPPORT
310
312
  ox_default_options.rb_enc = rb_enc_find(ox_default_options.encoding);
311
313
  #elif HAS_PRIVATE_ENCODING
312
314
  ox_default_options.rb_enc = rb_str_new2(ox_default_options.encoding);
@@ -472,7 +474,7 @@ load(char *xml, int argc, VALUE *argv, VALUE self, VALUE encoding) {
472
474
  options.sym_keys = (Qfalse == v) ? No : Yes;
473
475
  }
474
476
  }
475
- #ifdef HAVE_RUBY_ENCODING_H
477
+ #if HAS_ENCODING_SUPPORT
476
478
  if ('\0' == *options.encoding) {
477
479
  if (Qnil != encoding) {
478
480
  options.rb_enc = rb_enc_from_index(rb_enc_get_index(encoding));
@@ -548,8 +550,12 @@ load_str(int argc, VALUE *argv, VALUE self) {
548
550
  } else {
549
551
  xml = ALLOCA_N(char, len);
550
552
  }
551
- #ifdef HAVE_RUBY_ENCODING_H
553
+ #if HAS_ENCODING_SUPPORT
554
+ #ifdef MACRUBY_RUBY
555
+ encoding = rb_funcall(*argv, rb_intern("encoding"), 0);
556
+ #else
552
557
  encoding = rb_obj_encoding(*argv);
558
+ #endif
553
559
  #elif HAS_PRIVATE_ENCODING
554
560
  encoding = rb_funcall(*argv, rb_intern("encoding"), 0);
555
561
  #else
@@ -623,12 +629,16 @@ load_file(int argc, VALUE *argv, VALUE self) {
623
629
  * @param [IO|String] io IO Object to read from
624
630
  * @param [Hash] options parse options
625
631
  * @param [true|false] :convert_special flag indicating special characters like &lt; are converted
626
- * @param [true|false] :tolerant flag indicating the parser should be tolerant of XML errors
632
+ * @param [true|false] :symbolize flag indicating the parser should symbolize element and attribute names
633
+ * @param [true|false] :smart flag indicating the parser should use hints if available (use with HTML)
627
634
  */
628
635
  static VALUE
629
636
  sax_parse(int argc, VALUE *argv, VALUE self) {
630
- int convert = 0;
631
- int tolerant = 0;
637
+ struct _SaxOptions options;
638
+
639
+ options.symbolize = 1;
640
+ options.convert_special = 0;
641
+ options.smart = 0;
632
642
 
633
643
  if (argc < 2) {
634
644
  rb_raise(ox_parse_error_class, "Wrong number of arguments to sax_parse.\n");
@@ -638,13 +648,16 @@ sax_parse(int argc, VALUE *argv, VALUE self) {
638
648
  VALUE v;
639
649
 
640
650
  if (Qnil != (v = rb_hash_lookup(h, convert_special_sym))) {
641
- convert = (Qtrue == v);
651
+ options.convert_special = (Qtrue == v);
652
+ }
653
+ if (Qnil != (v = rb_hash_lookup(h, smart_sym))) {
654
+ options.smart = (Qtrue == v);
642
655
  }
643
- if (Qnil != (v = rb_hash_lookup(h, tolerant_sym))) {
644
- tolerant = (Qtrue == v);
656
+ if (Qnil != (v = rb_hash_lookup(h, symbolize_sym))) {
657
+ options.symbolize = (Qtrue == v);
645
658
  }
646
659
  }
647
- ox_sax_parse(argv[0], argv[1], convert, tolerant);
660
+ ox_sax_parse(argv[0], argv[1], &options);
648
661
 
649
662
  return Qnil;
650
663
  }
@@ -875,8 +888,10 @@ void Init_ox() {
875
888
  opt_format_sym = ID2SYM(rb_intern("opt_format")); rb_gc_register_address(&opt_format_sym);
876
889
  optimized_sym = ID2SYM(rb_intern("optimized")); rb_gc_register_address(&optimized_sym);
877
890
  ox_encoding_sym = ID2SYM(rb_intern("encoding")); rb_gc_register_address(&ox_encoding_sym);
891
+ smart_sym = ID2SYM(rb_intern("smart")); rb_gc_register_address(&smart_sym);
878
892
  strict_sym = ID2SYM(rb_intern("strict")); rb_gc_register_address(&strict_sym);
879
893
  symbolize_keys_sym = ID2SYM(rb_intern("symbolize_keys")); rb_gc_register_address(&symbolize_keys_sym);
894
+ symbolize_sym = ID2SYM(rb_intern("symbolize")); rb_gc_register_address(&symbolize_sym);
880
895
  tolerant_sym = ID2SYM(rb_intern("tolerant")); rb_gc_register_address(&tolerant_sym);
881
896
  trace_sym = ID2SYM(rb_intern("trace")); rb_gc_register_address(&trace_sym);
882
897
  with_dtd_sym = ID2SYM(rb_intern("with_dtd")); rb_gc_register_address(&with_dtd_sym);
@@ -186,7 +186,7 @@ typedef struct _Options {
186
186
  char mode; /* LoadMode */
187
187
  char effort; /* Effort */
188
188
  char sym_keys; /* symbolize keys */
189
- #ifdef HAVE_RUBY_ENCODING_H
189
+ #if HAS_ENCODING_SUPPORT
190
190
  rb_encoding *rb_enc;
191
191
  #elif HAS_PRIVATE_ENCODING
192
192
  VALUE rb_enc;
@@ -212,7 +212,6 @@ struct _PInfo {
212
212
  extern VALUE ox_parse(char *xml, ParseCallbacks pcb, char **endp, Options options);
213
213
  extern void _ox_raise_error(const char *msg, const char *xml, const char *current, const char* file, int line);
214
214
 
215
- extern void ox_sax_parse(VALUE handler, VALUE io, int convert, int tolerant);
216
215
  extern void ox_sax_define(void);
217
216
 
218
217
  extern char* ox_write_obj_to_str(VALUE obj, Options copts);
@@ -1,21 +1,21 @@
1
1
  /* sax.c
2
2
  * Copyright (c) 2011, Peter Ohler
3
3
  * All rights reserved.
4
- *
4
+ *
5
5
  * Redistribution and use in source and binary forms, with or without
6
6
  * modification, are permitted provided that the following conditions are met:
7
- *
7
+ *
8
8
  * - Redistributions of source code must retain the above copyright notice, this
9
9
  * list of conditions and the following disclaimer.
10
- *
10
+ *
11
11
  * - Redistributions in binary form must reproduce the above copyright notice,
12
12
  * this list of conditions and the following disclaimer in the documentation
13
13
  * and/or other materials provided with the distribution.
14
- *
14
+ *
15
15
  * - Neither the name of Peter Ohler nor the names of its contributors may be
16
16
  * used to endorse or promote products derived from this software without
17
17
  * specific prior written permission.
18
- *
18
+ *
19
19
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
20
  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
21
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
@@ -34,93 +34,59 @@
34
34
  #include <strings.h>
35
35
  #include <sys/types.h>
36
36
  #if NEEDS_UIO
37
- #include <sys/uio.h>
37
+ #include <sys/uio.h>
38
38
  #endif
39
39
  #include <unistd.h>
40
40
  #include <time.h>
41
41
 
42
42
  #include "ruby.h"
43
43
  #include "ox.h"
44
+ #include "sax.h"
45
+ #include "sax_stack.h"
46
+ #include "sax_buf.h"
44
47
 
45
48
  #define NAME_MISMATCH 1
46
49
 
47
- typedef struct _SaxDrive {
48
- char base_buf[0x00010000];
49
- char *buf;
50
- char *buf_end;
51
- char *cur;
52
- char *read_end; /* one past last character read */
53
- char *str; /* start of current string being read */
54
- int line;
55
- int col;
56
- VALUE handler;
57
- VALUE value_obj;
58
- int (*read_func)(struct _SaxDrive *dr);
59
- int convert_special;
60
- int tolerant;
61
- union {
62
- int fd;
63
- VALUE io;
64
- const char *in_str;
65
- };
66
- int has_instruct;
67
- int has_end_instruct;
68
- int has_attr;
69
- int has_attr_value;
70
- int has_doctype;
71
- int has_comment;
72
- int has_cdata;
73
- int has_text;
74
- int has_value;
75
- int has_start_element;
76
- int has_end_element;
77
- int has_error;
78
- int has_line;
79
- int has_column;
80
- #if HAS_ENCODING_SUPPORT
81
- rb_encoding *encoding;
82
- #elif HAS_PRIVATE_ENCODING
83
- VALUE encoding;
84
- #endif
85
- } *SaxDrive;
86
-
87
- #ifdef NEEDS_STPCPY
88
- char *stpncpy(char *dest, const char *src, size_t n);
89
- #endif
90
- static void sax_drive_init(SaxDrive dr, VALUE handler, VALUE io, int convert, int tolerant);
91
- static void sax_drive_cleanup(SaxDrive dr);
92
- static int sax_drive_read(SaxDrive dr);
93
- static void sax_drive_error(SaxDrive dr, const char *msg, int critical);
94
-
95
- static int read_children(SaxDrive dr, int first);
96
- static int read_instruction(SaxDrive dr);
97
- static int read_doctype(SaxDrive dr);
98
- static int read_cdata(SaxDrive dr);
99
- static int read_comment(SaxDrive dr);
100
- static int read_element(SaxDrive dr);
101
- static int read_text(SaxDrive dr);
102
- static const char* read_attrs(SaxDrive dr, char c, char termc, char term2, int is_xml);
50
+ #define START_STATE 1
51
+ #define BODY_STATE 2
52
+ #define AFTER_STATE 3
53
+
54
+ // error prefixes
55
+ #define BAD_BOM "Bad BOM: "
56
+ #define NO_TERM "Not Terminated: "
57
+ #define INVALID_FORMAT "Invalid Format: "
58
+ #define CASE_ERROR "Case Error: "
59
+ #define OUT_OF_ORDER "Out of Order: "
60
+ #define WRONG_CHAR "Unexpected Character: "
61
+ #define EL_MISMATCH "Start End Mismatch: "
62
+ #define INV_ELEMENT "Invalid Element: "
63
+
64
+ static void sax_drive_init(SaxDrive dr, VALUE handler, VALUE io, SaxOptions options);
65
+ static void parse(SaxDrive dr);
66
+ // All read functions should return the next character after the 'thing' that was read and leave dr->cur one after that.
67
+ static char read_instruction(SaxDrive dr);
68
+ static char read_doctype(SaxDrive dr);
69
+ static char read_cdata(SaxDrive dr);
70
+ static char read_comment(SaxDrive dr);
71
+ static char read_element_start(SaxDrive dr);
72
+ static char read_element_end(SaxDrive dr);
73
+ static char read_text(SaxDrive dr);
74
+ static char read_attrs(SaxDrive dr, char c, char termc, char term2, int is_xml, int eq_req);
103
75
  static char read_name_token(SaxDrive dr);
104
- static int read_quoted_value(SaxDrive dr, char *last);
105
- static int collapse_special(char *str, int tolerant);
106
-
107
- static VALUE rescue_cb(VALUE rdr, VALUE err);
108
- static VALUE io_cb(VALUE rdr);
109
- static VALUE partial_io_cb(VALUE rdr);
110
- static int read_from_io(SaxDrive dr);
111
- #ifndef JRUBY_RUBY
112
- static int read_from_fd(SaxDrive dr);
113
- #endif
114
- static int read_from_io_partial(SaxDrive dr);
115
- static int read_from_str(SaxDrive dr);
76
+ static char read_quoted_value(SaxDrive dr);
116
77
 
117
- static VALUE sax_value_class;
78
+ static void end_element_cb(SaxDrive dr, VALUE name, int line, int col);
79
+
80
+ static void hint_clear_empty(SaxDrive dr);
81
+ static Nv hint_try_close(SaxDrive dr, const char *name);
82
+
83
+ VALUE ox_sax_value_class = Qnil;
118
84
 
119
85
  /* This is only for CentOS 5.4 with Ruby 1.9.3-p0 and for OS X 10.6 and Solaris 10. */
120
86
  #ifdef NEEDS_STPCPY
121
87
  char *stpncpy(char *dest, const char *src, size_t n) {
122
88
  size_t cnt = strlen(src) + 1;
123
-
89
+
124
90
  if (n < cnt) {
125
91
  cnt = n;
126
92
  }
@@ -130,246 +96,54 @@ char *stpncpy(char *dest, const char *src, size_t n) {
130
96
  }
131
97
  #endif
132
98
 
133
- static inline char
134
- sax_drive_get(SaxDrive dr) {
135
- if (dr->read_end <= dr->cur) {
136
- if (0 != sax_drive_read(dr)) {
137
- return 0;
138
- }
139
- }
140
- if ('\n' == *dr->cur) {
141
- dr->line++;
142
- dr->col = 0;
143
- }
144
- dr->col++;
145
-
146
- return *dr->cur++;
147
- }
148
-
149
- static inline void
150
- backup(SaxDrive dr) {
151
- dr->cur--;
152
- dr->col--; // should reverse wrap but not worth it
153
- }
154
-
155
- static inline void
156
- reset_reader(SaxDrive dr, char *cur, int line, int col) {
157
- dr->cur = cur;
158
- dr->line = line;
159
- dr->col = col;
160
- }
161
-
162
-
163
- /* Starts by reading a character so it is safe to use with an empty or
164
- * compacted buffer.
165
- */
166
- inline static char
167
- next_non_white(SaxDrive dr) {
168
- char c;
169
-
170
- while ('\0' != (c = sax_drive_get(dr))) {
171
- switch(c) {
172
- case ' ':
173
- case '\t':
174
- case '\f':
175
- case '\n':
176
- case '\r':
177
- break;
178
- default:
179
- return c;
180
- }
181
- }
182
- return '\0';
183
- }
184
-
185
- /* Starts by reading a character so it is safe to use with an empty or
186
- * compacted buffer.
187
- */
188
- inline static char
189
- next_white(SaxDrive dr) {
190
- char c;
191
-
192
- while ('\0' != (c = sax_drive_get(dr))) {
193
- switch(c) {
194
- case ' ':
195
- case '\t':
196
- case '\f':
197
- case '\n':
198
- case '\r':
199
- case '\0':
200
- return c;
201
- default:
202
- break;
203
- }
204
- }
205
- return '\0';
206
- }
207
-
208
- inline static int
209
- is_white(char c) {
210
- switch(c) {
211
- case ' ':
212
- case '\t':
213
- case '\f':
214
- case '\n':
215
- case '\r':
216
- return 1;
217
- default:
218
- break;
219
- }
220
- return 0;
221
- }
222
-
223
- inline static VALUE
224
- str2sym(const char *str, SaxDrive dr, char **strp) {
225
- VALUE *slot;
226
- VALUE sym;
227
-
228
- if (Qundef == (sym = ox_cache_get(ox_symbol_cache, str, &slot, strp))) {
229
- #if HAS_ENCODING_SUPPORT
230
- if (0 != dr->encoding) {
231
- VALUE rstr = rb_str_new2(str);
232
-
233
- rb_enc_associate(rstr, dr->encoding);
234
- sym = rb_funcall(rstr, ox_to_sym_id, 0);
235
- } else {
236
- sym = ID2SYM(rb_intern(str));
237
- }
238
- #elif HAS_PRIVATE_ENCODING
239
- if (Qnil != dr->encoding) {
240
- VALUE rstr = rb_str_new2(str);
241
-
242
- rb_funcall(rstr, ox_force_encoding_id, 1, dr->encoding);
243
- sym = rb_funcall(rstr, ox_to_sym_id, 0);
244
- } else {
245
- sym = ID2SYM(rb_intern(str));
246
- }
247
- #else
248
- sym = ID2SYM(rb_intern(str));
249
- #endif
250
- *slot = sym;
251
- }
252
- return sym;
253
- }
254
-
255
-
256
99
  void
257
- ox_sax_parse(VALUE handler, VALUE io, int convert, int tolerant) {
100
+ ox_sax_parse(VALUE handler, VALUE io, SaxOptions options) {
258
101
  struct _SaxDrive dr;
259
-
260
- sax_drive_init(&dr, handler, io, convert, tolerant);
102
+
103
+ sax_drive_init(&dr, handler, io, options);
261
104
  #if 0
262
105
  printf("*** sax_parse with these flags\n");
263
- printf(" has_instruct = %s\n", dr.has_instruct ? "true" : "false");
264
- printf(" has_end_instruct = %s\n", dr.has_end_instruct ? "true" : "false");
265
- printf(" has_attr = %s\n", dr.has_attr ? "true" : "false");
266
- printf(" has_attr_value = %s\n", dr.has_attr_value ? "true" : "false");
267
- printf(" has_doctype = %s\n", dr.has_doctype ? "true" : "false");
268
- printf(" has_comment = %s\n", dr.has_comment ? "true" : "false");
269
- printf(" has_cdata = %s\n", dr.has_cdata ? "true" : "false");
270
- printf(" has_text = %s\n", dr.has_text ? "true" : "false");
271
- printf(" has_value = %s\n", dr.has_value ? "true" : "false");
272
- printf(" has_start_element = %s\n", dr.has_start_element ? "true" : "false");
273
- printf(" has_end_element = %s\n", dr.has_end_element ? "true" : "false");
274
- printf(" has_error = %s\n", dr.has_error ? "true" : "false");
275
- printf(" has_line = %s\n", dr.has_line ? "true" : "false");
276
- printf(" has_column = %s\n", dr.has_column ? "true" : "false");
277
- #endif
278
- read_children(&dr, 1);
279
- sax_drive_cleanup(&dr);
280
- }
281
-
282
- inline static int
283
- respond_to(VALUE obj, ID method) {
284
- #ifdef JRUBY_RUBY
285
- /* There is a bug in JRuby where rb_respond_to() returns true (1) even if
286
- * a method is private. */
287
- {
288
- VALUE args[1];
289
-
290
- *args = ID2SYM(method);
291
- return (Qtrue == rb_funcall2(obj, rb_intern("respond_to?"), 1, args));
292
- }
293
- #else
294
- return rb_respond_to(obj, method);
106
+ printf(" has_instruct = %s\n", dr.has.instruct ? "true" : "false");
107
+ printf(" has_end_instruct = %s\n", dr.has.end_instruct ? "true" : "false");
108
+ printf(" has_attr = %s\n", dr.has.attr ? "true" : "false");
109
+ printf(" has_attr_value = %s\n", dr.has.attr_value ? "true" : "false");
110
+ printf(" has_doctype = %s\n", dr.has.doctype ? "true" : "false");
111
+ printf(" has_comment = %s\n", dr.has.comment ? "true" : "false");
112
+ printf(" has_cdata = %s\n", dr.has.cdata ? "true" : "false");
113
+ printf(" has_text = %s\n", dr.has.text ? "true" : "false");
114
+ printf(" has_value = %s\n", dr.has.value ? "true" : "false");
115
+ printf(" has_start_element = %s\n", dr.has.start_element ? "true" : "false");
116
+ printf(" has_end_element = %s\n", dr.has.end_element ? "true" : "false");
117
+ printf(" has_error = %s\n", dr.has.error ? "true" : "false");
118
+ printf(" has_line = %s\n", dr.has.line ? "true" : "false");
119
+ printf(" has_column = %s\n", dr.has.column ? "true" : "false");
295
120
  #endif
121
+ parse(&dr);
122
+ ox_sax_drive_cleanup(&dr);
296
123
  }
297
124
 
298
125
  static void
299
- sax_drive_init(SaxDrive dr, VALUE handler, VALUE io, int convert, int tolerant) {
300
- if (ox_stringio_class == rb_obj_class(io)) {
301
- VALUE s = rb_funcall2(io, ox_string_id, 0, 0);
302
-
303
- dr->read_func = read_from_str;
304
- dr->in_str = StringValuePtr(s);
305
- } else if (rb_respond_to(io, ox_readpartial_id)) {
306
- #ifdef JRUBY_RUBY
307
- dr->read_func = read_from_io_partial;
308
- dr->io = io;
309
- #else
310
- VALUE rfd;
311
-
312
- if (rb_respond_to(io, ox_fileno_id) && Qnil != (rfd = rb_funcall(io, ox_fileno_id, 0))) {
313
- dr->read_func = read_from_fd;
314
- dr->fd = FIX2INT(rfd);
315
- } else {
316
- dr->read_func = read_from_io_partial;
317
- dr->io = io;
318
- }
319
- #endif
320
- } else if (rb_respond_to(io, ox_read_id)) {
321
- #ifdef JRUBY_RUBY
322
- dr->read_func = read_from_io;
323
- dr->io = io;
324
- #else
325
- VALUE rfd;
326
-
327
- if (rb_respond_to(io, ox_fileno_id) && Qnil != (rfd = rb_funcall(io, ox_fileno_id, 0))) {
328
- dr->read_func = read_from_fd;
329
- dr->fd = FIX2INT(rfd);
330
- } else {
331
- dr->read_func = read_from_io;
332
- dr->io = io;
333
- }
334
- #endif
335
- } else {
336
- rb_raise(ox_arg_error_class, "sax_parser io argument must respond to readpartial() or read().\n");
337
- }
338
- dr->buf = dr->base_buf;
339
- *dr->buf = '\0';
340
- dr->buf_end = dr->buf + sizeof(dr->base_buf) - 1; /* 1 less to make debugging easier */
341
- dr->cur = dr->buf;
342
- dr->read_end = dr->buf;
343
- dr->str = 0;
344
- dr->line = 1;
345
- dr->col = 0;
126
+ sax_drive_init(SaxDrive dr, VALUE handler, VALUE io, SaxOptions options) {
127
+ ox_sax_buf_init(&dr->buf, io);
128
+ dr->buf.dr = dr;
129
+ stack_init(&dr->stack);
346
130
  dr->handler = handler;
347
- dr->value_obj = rb_data_object_alloc(sax_value_class, dr, 0, 0);
131
+ dr->value_obj = rb_data_object_alloc(ox_sax_value_class, dr, 0, 0);
348
132
  rb_gc_register_address(&dr->value_obj);
349
- dr->convert_special = convert;
350
- dr->tolerant = tolerant;
351
- dr->has_instruct = respond_to(handler, ox_instruct_id);
352
- dr->has_end_instruct = respond_to(handler, ox_end_instruct_id);
353
- dr->has_attr = respond_to(handler, ox_attr_id);
354
- dr->has_attr_value = respond_to(handler, ox_attr_value_id);
355
- dr->has_doctype = respond_to(handler, ox_doctype_id);
356
- dr->has_comment = respond_to(handler, ox_comment_id);
357
- dr->has_cdata = respond_to(handler, ox_cdata_id);
358
- dr->has_text = respond_to(handler, ox_text_id);
359
- dr->has_value = respond_to(handler, ox_value_id);
360
- dr->has_start_element = respond_to(handler, ox_start_element_id);
361
- dr->has_end_element = respond_to(handler, ox_end_element_id);
362
- dr->has_error = respond_to(handler, ox_error_id);
363
- dr->has_line = (Qtrue == rb_ivar_defined(handler, ox_at_line_id));
364
- dr->has_column = (Qtrue == rb_ivar_defined(handler, ox_at_column_id));
133
+ dr->options = *options;
134
+ dr->hints = 0;
135
+ dr->err = 0;
136
+ has_init(&dr->has, handler);
365
137
  #if HAS_ENCODING_SUPPORT
366
138
  if ('\0' == *ox_default_options.encoding) {
367
139
  VALUE encoding;
368
140
 
141
+ dr->encoding = 0;
369
142
  if (rb_respond_to(io, ox_external_encoding_id) && Qnil != (encoding = rb_funcall(io, ox_external_encoding_id, 0))) {
370
- dr->encoding = rb_enc_from_index(rb_enc_get_index(encoding));
371
- } else {
372
- dr->encoding = 0;
143
+ int e = rb_enc_get_index(encoding);
144
+ if (0 <= e) {
145
+ dr->encoding = rb_enc_from_index(e);
146
+ }
373
147
  }
374
148
  } else {
375
149
  dr->encoding = rb_enc_find(ox_default_options.encoding);
@@ -389,190 +163,178 @@ sax_drive_init(SaxDrive dr, VALUE handler, VALUE io, int convert, int tolerant)
389
163
  #endif
390
164
  }
391
165
 
392
- static void
393
- sax_drive_cleanup(SaxDrive dr) {
166
+ void
167
+ ox_sax_drive_cleanup(SaxDrive dr) {
394
168
  rb_gc_unregister_address(&dr->value_obj);
395
- if (dr->base_buf != dr->buf) {
396
- xfree(dr->buf);
397
- }
398
- }
399
-
400
- static int
401
- sax_drive_read(SaxDrive dr) {
402
- int err;
403
- size_t shift = 0;
404
-
405
- if (dr->buf < dr->cur) {
406
- if (0 == dr->str) {
407
- shift = dr->cur - dr->buf;
408
- } else {
409
- shift = dr->str - dr->buf;
410
- }
411
- /*printf("\n*** shift: %lu\n", shift); */
412
- if (0 == shift) { /* no space left so allocate more */
413
- char *old = dr->buf;
414
- size_t size = dr->buf_end - dr->buf;
415
-
416
- if (dr->buf == dr->base_buf) {
417
- dr->buf = ALLOC_N(char, size * 2);
418
- memcpy(dr->buf, old, size);
419
- } else {
420
- REALLOC_N(dr->buf, char, size * 2);
421
- }
422
- dr->buf_end = dr->buf + size * 2;
423
- dr->cur = dr->buf + (dr->cur - old);
424
- dr->read_end = dr->buf + (dr->read_end - old);
425
- if (0 != dr->str) {
426
- dr->str = dr->buf + (dr->str - old);
427
- }
428
- } else {
429
- memmove(dr->buf, dr->buf + shift, dr->read_end - (dr->buf + shift));
430
- dr->cur -= shift;
431
- dr->read_end -= shift;
432
- if (0 != dr->str) {
433
- dr->str -= shift;
434
- }
435
- }
436
- }
437
- err = dr->read_func(dr);
438
- *dr->read_end = '\0';
439
-
440
- return err;
169
+ buf_cleanup(&dr->buf);
170
+ stack_cleanup(&dr->stack);
441
171
  }
442
172
 
443
173
  static void
444
- sax_drive_error(SaxDrive dr, const char *msg, int critical) {
445
- if (dr->has_error) {
174
+ ox_sax_drive_error_at(SaxDrive dr, const char *msg, int line, int col) {
175
+ if (dr->has.error) {
446
176
  VALUE args[3];
447
177
 
448
178
  args[0] = rb_str_new2(msg);
449
- args[1] = INT2FIX(dr->line);
450
- args[2] = INT2FIX(dr->col);
451
- if (dr->has_line) {
179
+ args[1] = INT2FIX(line);
180
+ args[2] = INT2FIX(col);
181
+ if (dr->has.line) {
452
182
  rb_ivar_set(dr->handler, ox_at_line_id, args[1]);
453
183
  }
454
- if (dr->has_column) {
184
+ if (dr->has.column) {
455
185
  rb_ivar_set(dr->handler, ox_at_column_id, args[2]);
456
186
  }
457
187
  rb_funcall2(dr->handler, ox_error_id, 3, args);
458
- } else if (critical) {
459
- sax_drive_cleanup(dr);
460
- rb_raise(ox_parse_error_class, "%s at line %d, column %d\n", msg, dr->line, dr->col);
461
188
  }
462
189
  }
463
190
 
464
- static int
465
- read_children(SaxDrive dr, int first) {
466
- int err = 0;
467
- int element_read = !first;
468
- char c;
469
- int line;
470
- int col;
471
-
472
- while (!err) {
473
- dr->str = dr->cur; /* protect the start */
474
- c = sax_drive_get(dr);
475
- if (first) {
476
- if (0xEF == (uint8_t)c) { /* only UTF8 is supported */
477
- if (0xBB == (uint8_t)sax_drive_get(dr) && 0xBF == (uint8_t)sax_drive_get(dr)) {
191
+ void
192
+ ox_sax_drive_error(SaxDrive dr, const char *msg) {
193
+ ox_sax_drive_error_at(dr, msg, dr->buf.line, dr->buf.col);
194
+ }
195
+
196
+ static char
197
+ skipBOM(SaxDrive dr) {
198
+ char c = buf_get(&dr->buf);
199
+
200
+ if (0xEF == (uint8_t)c) { /* only UTF8 is supported */
201
+ if (0xBB == (uint8_t)buf_get(&dr->buf) && 0xBF == (uint8_t)buf_get(&dr->buf)) {
478
202
  #if HAS_ENCODING_SUPPORT
479
- dr->encoding = ox_utf8_encoding;
203
+ dr->encoding = ox_utf8_encoding;
480
204
  #elif HAS_PRIVATE_ENCODING
481
- dr->encoding = ox_utf8_encoding;
205
+ dr->encoding = ox_utf8_encoding;
482
206
  #endif
483
- c = sax_drive_get(dr);
484
- } else {
485
- sax_drive_error(dr, "invalid format, invalid BOM or a binary file.", 1);
486
- }
487
- }
207
+ c = buf_get(&dr->buf);
208
+ } else {
209
+ ox_sax_drive_error(dr, BAD_BOM "invalid BOM or a binary file.");
210
+ c = '\0';
488
211
  }
489
- if ('\0' == c || (is_white(c) && '\0' == (c = next_non_white(dr)))) {
490
- if (!first) {
491
- sax_drive_error(dr, "invalid format, element not terminated", 1);
492
- err = -1;
493
- }
494
- break; /* normal completion if first */
212
+ }
213
+ return c;
214
+ }
215
+
216
+ static void
217
+ parse(SaxDrive dr) {
218
+ char c = skipBOM(dr);
219
+ int state = START_STATE;
220
+
221
+ while ('\0' != c) {
222
+ buf_protect(&dr->buf);
223
+ if (is_white(c) && '\0' == (c = buf_next_non_white(&dr->buf))) {
224
+ break;
495
225
  }
496
- if ('<' != c) {
497
- if (first) { /* all top level entities start with < */
498
- sax_drive_error(dr, "invalid format, expected <", 1);
499
- break; /* unrecoverable */
500
- }
501
- if (0 != (err = read_text(dr))) { /* finished when < is reached */
502
- break;
503
- }
504
- }
505
- dr->str = dr->cur; /* protect the start for elements */
506
- c = sax_drive_get(dr);
507
- switch (c) {
508
- case '?': /* instructions (xml or otherwise) */
509
- err = read_instruction(dr);
510
- break;
511
- case '!': /* comment or doctype */
512
- dr->str = dr->cur;
513
- c = sax_drive_get(dr);
514
- if ('\0' == c) {
515
- sax_drive_error(dr, "invalid format, DOCTYPE or comment not terminated", 1);
516
- err = 1;
517
- } else if ('-' == c) {
518
- c = sax_drive_get(dr); /* skip first - and get next character */
519
- if ('-' != c) {
520
- sax_drive_error(dr, "invalid format, bad comment format", 1);
521
- err = 1;
226
+ if ('<' == c) {
227
+ c = buf_get(&dr->buf);
228
+ switch (c) {
229
+ case '?': /* instructions (xml or otherwise) */
230
+ c = read_instruction(dr);
231
+ break;
232
+ case '!': /* comment or doctype */
233
+ buf_protect(&dr->buf);
234
+ c = buf_get(&dr->buf);
235
+ if ('\0' == c) {
236
+ ox_sax_drive_error(dr, NO_TERM "DOCTYPE or comment not terminated");
237
+ goto DONE;
238
+ } else if ('-' == c) {
239
+ c = buf_get(&dr->buf); /* skip first - and get next character */
240
+ if ('-' != c) {
241
+ ox_sax_drive_error(dr, INVALID_FORMAT "bad comment format, expected <!--");
242
+ } else {
243
+ c = buf_get(&dr->buf); /* skip second - */
244
+ }
245
+ c = read_comment(dr);
522
246
  } else {
523
- c = sax_drive_get(dr); /* skip second - */
524
- err = read_comment(dr);
247
+ int i;
248
+ int spaced = 0;
249
+ int line = dr->buf.line;
250
+ int col = dr->buf.col;
251
+
252
+ if (is_white(c)) {
253
+ spaced = 1;
254
+ c = buf_next_non_white(&dr->buf);
255
+ }
256
+ dr->buf.str = dr->buf.tail - 1;
257
+ for (i = 7; 0 < i; i--) {
258
+ c = buf_get(&dr->buf);
259
+ }
260
+ if (0 == strncmp("DOCTYPE", dr->buf.str, 7)) {
261
+ if (spaced) {
262
+ ox_sax_drive_error_at(dr, WRONG_CHAR "<!DOCTYPE can not included spaces", line, col);
263
+ }
264
+ if (START_STATE != state) {
265
+ ox_sax_drive_error(dr, OUT_OF_ORDER "DOCTYPE can not come after an element");
266
+ }
267
+ c = read_doctype(dr);
268
+ } else if (0 == strncasecmp("DOCTYPE", dr->buf.str, 7)) {
269
+ ox_sax_drive_error(dr, CASE_ERROR "expected DOCTYPE all in caps");
270
+ if (START_STATE != state) {
271
+ ox_sax_drive_error(dr, OUT_OF_ORDER "DOCTYPE can not come after an element");
272
+ }
273
+ c = read_doctype(dr);
274
+ } else if (0 == strncmp("[CDATA[", dr->buf.str, 7)) {
275
+ if (spaced) {
276
+ ox_sax_drive_error_at(dr, WRONG_CHAR "<![CDATA[ can not included spaces", line, col);
277
+ }
278
+ c = read_cdata(dr);
279
+ } else if (0 == strncasecmp("[CDATA[", dr->buf.str, 7)) {
280
+ ox_sax_drive_error(dr, CASE_ERROR "expected CDATA all in caps");
281
+ c = read_cdata(dr);
282
+ } else {
283
+ ox_sax_drive_error_at(dr, WRONG_CHAR "DOCTYPE, CDATA, or comment expected", line, col);
284
+ c = read_name_token(dr);
285
+ if ('>' == c) {
286
+ c = buf_get(&dr->buf);
287
+ }
288
+ }
525
289
  }
526
- } else {
527
- int i;
528
-
529
- for (i = 7; 0 < i; i--) {
530
- sax_drive_get(dr);
531
- }
532
- if ((dr->tolerant) ? 0 == strncasecmp("DOCTYPE", dr->str, 7) : 0 == strncmp("DOCTYPE", dr->str, 7)) {
533
- if (element_read || !first) {
534
- sax_drive_error(dr, "invalid format, DOCTYPE can not come after an element", 0);
535
- }
536
- err = read_doctype(dr);
537
- } else if ((dr->tolerant) ? 0 == strncasecmp("[CDATA[", dr->str, 7) : 0 == strncmp("[CDATA[", dr->str, 7)) {
538
- err = read_cdata(dr);
539
- } else {
540
- sax_drive_error(dr, "invalid format, DOCTYPE or comment expected", 1);
541
- err = 1;
542
- }
543
- }
544
- break;
545
- case '/': /* element end */
546
- line = dr->line;
547
- col = dr->col;
548
- err = ('\0' == read_name_token(dr));
549
- dr->line = line;
550
- dr->col = col;
551
- if (first && dr->tolerant) {
552
- sax_drive_error(dr, "invalid format, unmatched element end", 0);
553
- } else {
554
- return err;
290
+ break;
291
+ case '/': /* element end */
292
+ c = read_element_end(dr);
293
+ if (0 == stack_peek(&dr->stack)) {
294
+ state = AFTER_STATE;
295
+ }
296
+ break;
297
+ case '\0':
298
+ goto DONE;
299
+ default:
300
+ buf_backup(&dr->buf);
301
+ if (AFTER_STATE == state) {
302
+ ox_sax_drive_error(dr, OUT_OF_ORDER "multiple top level elements");
303
+ }
304
+ state = BODY_STATE;
305
+ c = read_element_start(dr);
306
+ if (0 == stack_peek(&dr->stack)) {
307
+ state = AFTER_STATE;
308
+ }
309
+ break;
555
310
  }
556
- break;
557
- case '\0':
558
- sax_drive_error(dr, "invalid format, document not terminated", 1);
559
- err = 1;
560
- break;
561
- default:
562
- backup(dr); /* safe since no read occurred after getting last character */
563
- if (first && element_read && !dr->tolerant) {
564
- sax_drive_error(dr, "invalid format, multiple top level elements", 0);
565
- }
566
- err = read_element(dr);
567
- if (NAME_MISMATCH == err && dr->tolerant && first) {
568
- // must have been a end element with no matching start
569
- err = 0;
570
- }
571
- element_read = 1;
572
- break;
311
+ } else {
312
+ buf_reset(&dr->buf);
313
+ c = read_text(dr);
573
314
  }
574
315
  }
575
- return err;
316
+ DONE:
317
+ if (dr->stack.head < dr->stack.tail) {
318
+ char msg[256];
319
+ Nv sp;
320
+
321
+ if (dr->has.line) {
322
+ rb_ivar_set(dr->handler, ox_at_line_id, INT2FIX(dr->buf.line));
323
+ }
324
+ if (dr->has.column) {
325
+ rb_ivar_set(dr->handler, ox_at_column_id, INT2FIX(dr->buf.col));
326
+ }
327
+ for (sp = dr->stack.tail - 1; dr->stack.head <= sp; sp--) {
328
+ snprintf(msg, sizeof(msg) - 1, "%selement '%s' not closed", EL_MISMATCH, sp->name);
329
+ ox_sax_drive_error_at(dr, msg, dr->buf.line, dr->buf.col);
330
+ if (dr->has.end_element) {
331
+ VALUE args[1];
332
+
333
+ args[0] = sp->val;
334
+ rb_funcall2(dr->handler, ox_end_element_id, 1, args);
335
+ }
336
+ }
337
+ }
576
338
  }
577
339
 
578
340
  static void
@@ -580,13 +342,14 @@ read_content(SaxDrive dr, char *content, size_t len) {
580
342
  char c;
581
343
  char *end = content + len;
582
344
 
583
- while ('\0' != (c = sax_drive_get(dr))) {
345
+ while ('\0' != (c = buf_get(&dr->buf))) {
584
346
  if (end < content) {
585
- sax_drive_error(dr, "processing instruction content too large", 1);
347
+ ox_sax_drive_error(dr, "processing instruction content too large");
348
+ return;
586
349
  }
587
350
  if ('?' == c) {
588
- if ('\0' == (c = sax_drive_get(dr))) {
589
- sax_drive_error(dr, "invalid format, document not terminated", 1);
351
+ if ('\0' == (c = buf_get(&dr->buf))) {
352
+ ox_sax_drive_error(dr, NO_TERM "document not terminated");
590
353
  }
591
354
  if ('>' == c) {
592
355
  *content = '\0';
@@ -603,50 +366,50 @@ read_content(SaxDrive dr, char *content, size_t len) {
603
366
 
604
367
  /* Entered after the "<?" sequence. Ready to read the rest.
605
368
  */
606
- static int
369
+ static char
607
370
  read_instruction(SaxDrive dr) {
608
371
  char content[1024];
609
372
  char c;
610
373
  char *cend;
611
- const char *err;
612
374
  VALUE target = Qnil;
613
375
  int is_xml;
614
- int line = dr->line;
615
- int col = dr->col - 1;
376
+ int line = dr->buf.line;
377
+ int col = dr->buf.col - 1;
616
378
 
379
+ buf_protect(&dr->buf);
617
380
  if ('\0' == (c = read_name_token(dr))) {
618
- return -1;
381
+ return c;
619
382
  }
620
- is_xml = (0 == strcmp("xml", dr->str));
621
- if (dr->has_instruct || dr->has_end_instruct) {
622
- target = rb_str_new2(dr->str);
383
+ is_xml = (0 == strcmp("xml", dr->buf.str));
384
+ if (dr->has.instruct || dr->has.end_instruct) {
385
+ target = rb_str_new2(dr->buf.str);
623
386
  }
624
- if (dr->has_instruct) {
387
+ if (dr->has.instruct) {
625
388
  VALUE args[1];
626
389
 
627
- if (dr->has_line) {
390
+ if (dr->has.line) {
628
391
  rb_ivar_set(dr->handler, ox_at_line_id, INT2FIX(line));
629
392
  }
630
- if (dr->has_column) {
393
+ if (dr->has.column) {
631
394
  rb_ivar_set(dr->handler, ox_at_column_id, INT2FIX(col));
632
395
  }
633
396
  args[0] = target;
634
397
  rb_funcall2(dr->handler, ox_instruct_id, 1, args);
635
398
  }
636
- dr->str = dr->cur; /* make sure the start doesn't get compacted out */
637
- line = dr->line;
638
- col = dr->col;
399
+ buf_protect(&dr->buf);
400
+ line = dr->buf.line;
401
+ col = dr->buf.col;
639
402
  read_content(dr, content, sizeof(content) - 1);
640
- cend = dr->cur;
641
- reset_reader(dr, dr->str, line, col);
642
- if (0 != (err = read_attrs(dr, c, '?', '?', is_xml))) {
643
- if (dr->has_text) {
403
+ cend = dr->buf.tail;
404
+ buf_reset(&dr->buf);
405
+ dr->err = 0;
406
+ c = read_attrs(dr, c, '?', '?', is_xml, 1);
407
+ if (dr->err) {
408
+ if (dr->has.text) {
644
409
  VALUE args[1];
645
410
 
646
- if (dr->convert_special) {
647
- if (0 != collapse_special(content, dr->tolerant)) {
648
- sax_drive_error(dr, "invalid format, special character does not end with a semicolon", 0);
649
- }
411
+ if (dr->options.convert_special) {
412
+ ox_sax_collapse_special(dr, content, line, col);
650
413
  }
651
414
  args[0] = rb_str_new2(content);
652
415
  #if HAS_ENCODING_SUPPORT
@@ -658,106 +421,144 @@ read_instruction(SaxDrive dr) {
658
421
  rb_funcall(args[0], ox_force_encoding_id, 1, dr->encoding);
659
422
  }
660
423
  #endif
661
- if (dr->has_line) {
424
+ if (dr->has.line) {
662
425
  rb_ivar_set(dr->handler, ox_at_line_id, INT2FIX(line));
663
426
  }
664
- if (dr->has_column) {
427
+ if (dr->has.column) {
665
428
  rb_ivar_set(dr->handler, ox_at_column_id, INT2FIX(col));
666
429
  }
667
430
  rb_funcall2(dr->handler, ox_text_id, 1, args);
668
431
  }
669
- dr->cur = cend;
432
+ dr->buf.tail = cend;
433
+ c = buf_get(&dr->buf);
670
434
  } else {
671
- line = dr->line;
672
- col = dr->col;
673
- c = next_non_white(dr);
674
- if ('>' != c) {
675
- sax_drive_error(dr, "invalid format, instruction not terminated", 1);
676
- return -1;
435
+ line = dr->buf.line;
436
+ col = dr->buf.col;
437
+ c = buf_next_non_white(&dr->buf);
438
+ if ('>' == c) {
439
+ c = buf_get(&dr->buf);
440
+ } else {
441
+ ox_sax_drive_error_at(dr, NO_TERM "instruction not terminated", line, col);
442
+ if ('>' == c) {
443
+ c = buf_get(&dr->buf);
444
+ }
677
445
  }
678
446
  }
679
- if (dr->has_end_instruct) {
447
+ if (dr->has.end_instruct) {
680
448
  VALUE args[1];
681
449
 
682
- if (dr->has_line) {
450
+ if (dr->has.line) {
683
451
  rb_ivar_set(dr->handler, ox_at_line_id, INT2FIX(line));
684
452
  }
685
- if (dr->has_column) {
453
+ if (dr->has.column) {
686
454
  rb_ivar_set(dr->handler, ox_at_column_id, INT2FIX(col));
687
455
  }
688
456
  args[0] = target;
689
457
  rb_funcall2(dr->handler, ox_end_instruct_id, 1, args);
690
458
  }
691
- dr->str = 0;
459
+ dr->buf.str = 0;
692
460
 
693
- return 0;
461
+ return c;
694
462
  }
695
463
 
696
464
  /* Entered after the "<!DOCTYPE" sequence. Ready to read the rest.
697
465
  */
698
- static int
466
+ static char
699
467
  read_doctype(SaxDrive dr) {
700
468
  char c;
701
- int line = dr->line;
702
- int col = dr->col - 10;
469
+ int line = dr->buf.line;
470
+ int col = dr->buf.col - 10;
471
+ char *s;
703
472
 
704
- dr->str = dr->cur - 1; /* mark the start */
705
- while ('>' != (c = sax_drive_get(dr))) {
473
+ buf_backup(&dr->buf); /* back up to the start in case the doctype is empty */
474
+ buf_protect(&dr->buf);
475
+ while ('>' != (c = buf_get(&dr->buf))) {
706
476
  if ('\0' == c) {
707
- sax_drive_error(dr, "invalid format, doctype terminated unexpectedly", 1);
708
- return -1;
477
+ ox_sax_drive_error(dr, NO_TERM "doctype not terminated");
478
+ return c;
709
479
  }
710
480
  }
711
- *(dr->cur - 1) = '\0';
712
- if (dr->has_doctype) {
481
+ if (dr->options.smart && 0 == dr->hints) {
482
+ for (s = dr->buf.str; is_white(*s); s++) { }
483
+ if (0 == strncasecmp("HTML", s, 4)) {
484
+ dr->hints = ox_hints_html();
485
+ }
486
+ }
487
+ *(dr->buf.tail - 1) = '\0';
488
+ if (dr->has.doctype) {
713
489
  VALUE args[1];
714
490
 
715
- if (dr->has_line) {
491
+ if (dr->has.line) {
716
492
  rb_ivar_set(dr->handler, ox_at_line_id, INT2FIX(line));
717
493
  }
718
- if (dr->has_column) {
494
+ if (dr->has.column) {
719
495
  rb_ivar_set(dr->handler, ox_at_column_id, INT2FIX(col));
720
496
  }
721
- args[0] = rb_str_new2(dr->str);
497
+ args[0] = rb_str_new2(dr->buf.str);
722
498
  rb_funcall2(dr->handler, ox_doctype_id, 1, args);
723
499
  }
724
- dr->str = 0;
500
+ dr->buf.str = 0;
725
501
 
726
- return 0;
502
+ return buf_get(&dr->buf);
727
503
  }
728
504
 
729
505
  /* Entered after the "<![CDATA[" sequence. Ready to read the rest.
730
506
  */
731
- static int
507
+ static char
732
508
  read_cdata(SaxDrive dr) {
733
- char c;
734
- int end = 0;
735
- int line = dr->line;
736
- int col = dr->col - 10;
737
-
738
- backup(dr); /* back up to the start in case the cdata is empty */
739
- dr->str = dr->cur; /* mark the start */
509
+ char c;
510
+ int end = 0;
511
+ int line = dr->buf.line;
512
+ int col = dr->buf.col - 10;
513
+ struct _CheckPt cp = CHECK_PT_INIT;
514
+
515
+ buf_backup(&dr->buf); /* back up to the start in case the cdata is empty */
516
+ buf_protect(&dr->buf);
740
517
  while (1) {
741
- c = sax_drive_get(dr);
742
- if (']' == c) {
518
+ c = buf_get(&dr->buf);
519
+ switch (c) {
520
+ case ']':
743
521
  end++;
744
- } else if ('>' == c) {
522
+ break;
523
+ case '>':
745
524
  if (2 <= end) {
746
- *(dr->cur - 3) = '\0';
747
- break;
525
+ *(dr->buf.tail - 3) = '\0';
526
+ c = buf_get(&dr->buf);
527
+ goto CB;
748
528
  }
529
+ if (!buf_checkset(&cp)) {
530
+ buf_checkpoint(&dr->buf, &cp);
531
+ }
749
532
  end = 0;
750
- } else if ('\0' == c) {
751
- sax_drive_error(dr, "invalid format, cdata terminated unexpectedly", 1);
752
- return -1;
753
- } else {
754
- end = 0;
755
- }
533
+ break;
534
+ case '<':
535
+ if (!buf_checkset(&cp)) {
536
+ buf_checkpoint(&dr->buf, &cp);
537
+ }
538
+ end = 0;
539
+ break;
540
+ case '\0':
541
+ if (buf_checkset(&cp)) {
542
+ c = buf_checkback(&dr->buf, &cp);
543
+ ox_sax_drive_error(dr, NO_TERM "CDATA not terminated");
544
+ *(dr->buf.tail - 1) = '\0';
545
+ goto CB;
546
+ }
547
+ ox_sax_drive_error(dr, NO_TERM "CDATA not terminated");
548
+ return '\0';
549
+ default:
550
+ if (1 < end && !buf_checkset(&cp)) {
551
+ buf_checkpoint(&dr->buf, &cp);
552
+ }
553
+ end = 0;
554
+ break;
555
+ }
756
556
  }
757
- if (dr->has_cdata) {
557
+ CB:
558
+ if (dr->has.cdata) {
758
559
  VALUE args[1];
759
560
 
760
- args[0] = rb_str_new2(dr->str);
561
+ args[0] = rb_str_new2(dr->buf.str);
761
562
  #if HAS_ENCODING_SUPPORT
762
563
  if (0 != dr->encoding) {
763
564
  rb_enc_associate(args[0], dr->encoding);
@@ -767,53 +568,76 @@ read_cdata(SaxDrive dr) {
767
568
  rb_funcall(args[0], ox_force_encoding_id, 1, dr->encoding);
768
569
  }
769
570
  #endif
770
- if (dr->has_line) {
571
+ if (dr->has.line) {
771
572
  rb_ivar_set(dr->handler, ox_at_line_id, INT2FIX(line));
772
573
  }
773
- if (dr->has_column) {
574
+ if (dr->has.column) {
774
575
  rb_ivar_set(dr->handler, ox_at_column_id, INT2FIX(col));
775
576
  }
776
577
  rb_funcall2(dr->handler, ox_cdata_id, 1, args);
777
578
  }
778
- dr->str = 0;
579
+ dr->buf.str = 0;
779
580
 
780
- return 0;
581
+ return c;
781
582
  }
782
583
 
783
584
  /* Entered after the "<!--" sequence. Ready to read the rest.
784
585
  */
785
- static int
586
+ static char
786
587
  read_comment(SaxDrive dr) {
787
- char c;
788
- int end = 0;
789
- int line = dr->line;
790
- int col = dr->col - 5;
791
-
792
- dr->str = dr->cur - 1; /* mark the start */
588
+ char c;
589
+ int end = 0;
590
+ int line = dr->buf.line;
591
+ int col = dr->buf.col - 4;
592
+ struct _CheckPt cp = CHECK_PT_INIT;
593
+
594
+ buf_backup(&dr->buf); /* back up to the start in case the comment is empty */
595
+ buf_protect(&dr->buf);
793
596
  while (1) {
794
- c = sax_drive_get(dr);
795
- if ('-' == c) {
796
- if (end) {
797
- *(dr->cur - 2) = '\0';
798
- break;
799
- } else {
800
- end = 1;
597
+ c = buf_get(&dr->buf);
598
+ switch (c) {
599
+ case '-':
600
+ end++;
601
+ break;
602
+ case '>':
603
+ if (2 <= end) {
604
+ *(dr->buf.tail - 3) = '\0';
605
+ c = buf_get(&dr->buf);
606
+ goto CB;
801
607
  }
802
- } else if ('\0' == c) {
803
- sax_drive_error(dr, "invalid format, comment terminated unexpectedly", 1);
804
- return -1;
805
- } else {
608
+ if (!buf_checkset(&cp)) {
609
+ buf_checkpoint(&dr->buf, &cp);
610
+ }
806
611
  end = 0;
807
- }
808
- }
809
- c = sax_drive_get(dr);
810
- if ('>' != c) {
811
- sax_drive_error(dr, "invalid format, comment terminated unexpectedly", 1);
612
+ break;
613
+ case '<':
614
+ if (!buf_checkset(&cp)) {
615
+ buf_checkpoint(&dr->buf, &cp);
616
+ }
617
+ end = 0;
618
+ break;
619
+ case '\0':
620
+ if (buf_checkset(&cp)) {
621
+ c = buf_checkback(&dr->buf, &cp);
622
+ ox_sax_drive_error(dr, NO_TERM "comment not terminated");
623
+ *(dr->buf.tail - 1) = '\0';
624
+ goto CB;
625
+ }
626
+ ox_sax_drive_error(dr, NO_TERM "comment not terminated");
627
+ return '\0';
628
+ default:
629
+ if (1 < end && !buf_checkset(&cp)) {
630
+ buf_checkpoint(&dr->buf, &cp);
631
+ }
632
+ end = 0;
633
+ break;
634
+ }
812
635
  }
813
- if (dr->has_comment) {
636
+ CB:
637
+ if (dr->has.comment) {
814
638
  VALUE args[1];
815
639
 
816
- args[0] = rb_str_new2(dr->str);
640
+ args[0] = rb_str_new2(dr->buf.str);
817
641
  #if HAS_ENCODING_SUPPORT
818
642
  if (0 != dr->encoding) {
819
643
  rb_enc_associate(args[0], dr->encoding);
@@ -823,44 +647,91 @@ read_comment(SaxDrive dr) {
823
647
  rb_funcall(args[0], ox_force_encoding_id, 1, dr->encoding);
824
648
  }
825
649
  #endif
826
- if (dr->has_line) {
650
+ if (dr->has.line) {
827
651
  rb_ivar_set(dr->handler, ox_at_line_id, INT2FIX(line));
828
652
  }
829
- if (dr->has_column) {
653
+ if (dr->has.column) {
830
654
  rb_ivar_set(dr->handler, ox_at_column_id, INT2FIX(col));
831
655
  }
832
656
  rb_funcall2(dr->handler, ox_comment_id, 1, args);
833
657
  }
834
- dr->str = 0;
658
+ dr->buf.str = 0;
835
659
 
836
- return 0;
660
+ return c;
837
661
  }
838
662
 
839
663
  /* Entered after the '<' and the first character after that. Returns the
840
664
  * character the parse loop continues with ('\0' once input is exhausted).
841
665
  */
842
- static int
843
- read_element(SaxDrive dr) {
666
+ static char
667
+ read_element_start(SaxDrive dr) {
844
668
  char *ename = 0;
845
669
  VALUE name = Qnil;
846
- const char *err;
847
670
  char c;
848
671
  int closed;
849
- int line = dr->line;
850
- int col = dr->col - 1;
851
- int e;
672
+ int line = dr->buf.line;
673
+ int col = dr->buf.col - 1;
674
+ Hint h = 0;
675
+ int stackless = 0;
852
676
 
853
677
  if ('\0' == (c = read_name_token(dr))) {
854
- return -1;
678
+ return '\0';
679
+ }
680
+ if (dr->options.smart && 0 == dr->hints && stack_empty(&dr->stack) && 0 == strcasecmp("html", dr->buf.str)) {
681
+ dr->hints = ox_hints_html();
682
+ }
683
+ if (0 != dr->hints) {
684
+ hint_clear_empty(dr);
685
+ h = ox_hint_find(dr->hints, dr->buf.str);
686
+ if (0 == h) {
687
+ char msg[100];
688
+
689
+ sprintf(msg, "%s%s is not a valid element type for a %s document type.", INV_ELEMENT, dr->buf.str, dr->hints->name);
690
+ ox_sax_drive_error(dr, msg);
691
+ } else {
692
+ Nv top_nv = stack_peek(&dr->stack);
693
+
694
+ if (h->empty) {
695
+ stackless = 1;
696
+ }
697
+ if (0 != top_nv) {
698
+ char msg[256];
699
+
700
+ if (!h->nest && 0 == strcasecmp(top_nv->name, h->name)) {
701
+ snprintf(msg, sizeof(msg) - 1, "%s%s can not be nested in a %s document, closing previous.",
702
+ INV_ELEMENT, dr->buf.str, dr->hints->name);
703
+ ox_sax_drive_error(dr, msg);
704
+ stack_pop(&dr->stack);
705
+ end_element_cb(dr, top_nv->val, line, col);
706
+ top_nv = stack_peek(&dr->stack);
707
+ }
708
+ if (0 != h->parents) {
709
+ const char **p;
710
+ int ok = 0;
711
+
712
+ for (p = h->parents; 0 != *p; p++) {
713
+ if (0 == strcasecmp(*p, top_nv->name)) {
714
+ ok = 1;
715
+ break;
716
+ }
717
+ }
718
+ if (!ok) {
719
+ snprintf(msg, sizeof(msg) - 1, "%s%s can not be a child of a %s in a %s document.",
720
+ INV_ELEMENT, h->name, top_nv->name, dr->hints->name);
721
+ ox_sax_drive_error(dr, msg);
722
+ }
723
+ }
724
+ }
725
+ }
855
726
  }
856
- name = str2sym(dr->str, dr, &ename);
857
- if (dr->has_start_element) {
727
+ name = str2sym(dr, dr->buf.str, &ename);
728
+ if (dr->has.start_element) {
858
729
  VALUE args[1];
859
730
 
860
- if (dr->has_line) {
731
+ if (dr->has.line) {
861
732
  rb_ivar_set(dr->handler, ox_at_line_id, INT2FIX(line));
862
733
  }
863
- if (dr->has_column) {
734
+ if (dr->has.column) {
864
735
  rb_ivar_set(dr->handler, ox_at_column_id, INT2FIX(col));
865
736
  }
866
737
  args[0] = name;
@@ -871,142 +742,158 @@ read_element(SaxDrive dr) {
871
742
  } else if ('>' == c) {
872
743
  closed = 0;
873
744
  } else {
874
- if (0 != (err = read_attrs(dr, c, '/', '>', 0))) {
875
- sax_drive_error(dr, err, 1);
876
- return -1;
877
- }
878
- closed = ('/' == *(dr->cur - 1));
745
+ buf_protect(&dr->buf);
746
+ c = read_attrs(dr, c, '/', '>', 0, 0);
747
+ if (is_white(c)) {
748
+ c = buf_next_non_white(&dr->buf);
749
+ }
750
+ closed = ('/' == c);
879
751
  }
880
752
  if (closed) {
881
- c = next_non_white(dr);
882
- if ('>' != c) {
883
- sax_drive_error(dr, "invalid format, element not closed", 1);
884
- return -1;
885
- }
753
+ c = buf_next_non_white(&dr->buf);
754
+ line = dr->buf.line;
755
+ col = dr->buf.col - 1;
756
+ end_element_cb(dr, name, line, col);
757
+ } else if (stackless) {
758
+ end_element_cb(dr, name, line, col);
759
+ } else {
760
+ stack_push(&dr->stack, ename, name, h);
886
761
  }
887
- if (closed) {
888
- line = dr->line;
889
- col = dr->col - 1;
890
- if (dr->has_end_element) {
891
- VALUE args[1];
762
+ if ('>' != c) {
763
+ ox_sax_drive_error(dr, WRONG_CHAR "element not closed");
764
+ return c;
765
+ }
766
+ dr->buf.str = 0;
892
767
 
893
- if (dr->has_line) {
894
- rb_ivar_set(dr->handler, ox_at_line_id, INT2FIX(line));
895
- }
896
- if (dr->has_column) {
897
- rb_ivar_set(dr->handler, ox_at_column_id, INT2FIX(col));
898
- }
899
- args[0] = name;
900
- rb_funcall2(dr->handler, ox_end_element_id, 1, args);
901
- }
768
+ return buf_get(&dr->buf);
769
+ }
770
+
771
+ static Nv
772
+ stack_rev_find(NStack stack, const char *name) {
773
+ Nv nv;
774
+
775
+ for (nv = stack->tail - 1; stack->head <= nv; nv--) {
776
+ if (0 == strcmp(name, nv->name)) {
777
+ return nv;
778
+ }
779
+ }
780
+ return 0;
781
+ }
782
+
783
+ static char
784
+ read_element_end(SaxDrive dr) {
785
+ VALUE name = Qnil;
786
+ char c;
787
+ int line = dr->buf.line;
788
+ int col = dr->buf.col - 2;
789
+ Nv nv;
790
+
791
+ if ('\0' == (c = read_name_token(dr))) {
792
+ return '\0';
793
+ }
794
+ // c should be > and current is one past so read another char
795
+ c = buf_get(&dr->buf);
796
+ nv = stack_peek(&dr->stack);
797
+ if (0 != nv && 0 == strcmp(dr->buf.str, nv->name)) {
798
+ name = nv->val;
799
+ stack_pop(&dr->stack);
902
800
  } else {
903
- if (0 != (e = read_children(dr, 0))) {
904
- if (NAME_MISMATCH == e) {
905
- if (0 != dr->has_end_element) {
801
+ // Mismatched start and end
802
+ char msg[256];
803
+ Nv match = stack_rev_find(&dr->stack, dr->buf.str);
804
+
805
+ if (0 == match) {
806
+ // Not found so open and close element.
807
+ char *ename = 0;
808
+ Hint h = ox_hint_find(dr->hints, dr->buf.str);
809
+
810
+ if (0 != h && h->empty) {
811
+ // Just close normally
812
+ name = str2sym(dr, dr->buf.str, &ename);
813
+ snprintf(msg, sizeof(msg) - 1, "%selement '%s' should not have a separate close element", EL_MISMATCH, dr->buf.str);
814
+ ox_sax_drive_error_at(dr, msg, line, col);
815
+ return c;
816
+ } else {
817
+ snprintf(msg, sizeof(msg) - 1, "%selement '%s' closed but not opened", EL_MISMATCH, dr->buf.str);
818
+ ox_sax_drive_error_at(dr, msg, line, col);
819
+ name = str2sym(dr, dr->buf.str, &ename);
820
+ if (dr->has.start_element) {
906
821
  VALUE args[1];
907
822
 
908
- if (dr->has_line) {
823
+ if (dr->has.line) {
909
824
  rb_ivar_set(dr->handler, ox_at_line_id, INT2FIX(line));
910
825
  }
911
- if (dr->has_column) {
912
- rb_ivar_set(dr->handler, ox_at_column_id, INT2FIX(col - 2));
826
+ if (dr->has.column) {
827
+ rb_ivar_set(dr->handler, ox_at_column_id, INT2FIX(col));
913
828
  }
914
829
  args[0] = name;
915
- rb_funcall2(dr->handler, ox_end_element_id, 1, args);
830
+ rb_funcall2(dr->handler, ox_start_element_id, 1, args);
916
831
  }
917
- if (0 == strcmp(dr->str, ename)) {
918
- return 0;
919
- } else {
920
- return NAME_MISMATCH;
921
- }
922
- } else {
923
- return -1;
924
- }
925
- }
926
- line = dr->line;
927
- col = dr->col;
928
- // read_children reads up to the end of the terminating element name
929
- dr->col += (uint32_t)(dr->cur - dr->str);
930
- if (0 != ename && 0 != strcmp(ename, dr->str)) {
931
- if (dr->has_line) {
932
- rb_ivar_set(dr->handler, ox_at_line_id, INT2FIX(line));
933
- }
934
- if (dr->has_column) {
935
- rb_ivar_set(dr->handler, ox_at_column_id, INT2FIX(col));
936
832
  }
937
- sax_drive_error(dr, "invalid format, element start and end names do not match", 1);
938
- //printf("*** ename: %s close: %s\n", ename, dr->str);
939
- if (dr->tolerant) {
940
- if (0 != dr->has_end_element) {
941
- VALUE args[1];
833
+ } else {
834
+ // Found a match so close all up to the found element in stack.
835
+ Nv n2;
942
836
 
943
- if (dr->has_line) {
944
- rb_ivar_set(dr->handler, ox_at_line_id, INT2FIX(line));
945
- }
946
- if (dr->has_column) {
947
- rb_ivar_set(dr->handler, ox_at_column_id, INT2FIX(col - 2));
837
+ if (0 != (n2 = hint_try_close(dr, dr->buf.str))) {
838
+ name = n2->val;
839
+ } else {
840
+ snprintf(msg, sizeof(msg) - 1, "%selement '%s' close does not match '%s' open", EL_MISMATCH, dr->buf.str, nv->name);
841
+ ox_sax_drive_error_at(dr, msg, line, col);
842
+ if (dr->has.line) {
843
+ rb_ivar_set(dr->handler, ox_at_line_id, INT2FIX(line));
844
+ }
845
+ if (dr->has.column) {
846
+ rb_ivar_set(dr->handler, ox_at_column_id, INT2FIX(col));
847
+ }
848
+ for (nv = stack_pop(&dr->stack); match < nv; nv = stack_pop(&dr->stack)) {
849
+ if (dr->has.end_element) {
850
+ rb_funcall(dr->handler, ox_end_element_id, 1, nv->val);
948
851
  }
949
- args[0] = name;
950
- rb_funcall2(dr->handler, ox_end_element_id, 1, args);
951
852
  }
952
- return NAME_MISMATCH; // dr->str is still the name
953
- } else {
954
- return -1;
853
+ name = nv->val;
955
854
  }
956
855
  }
957
- if (0 != dr->has_end_element) {
958
- VALUE args[1];
959
-
960
- if (dr->has_line) {
961
- rb_ivar_set(dr->handler, ox_at_line_id, INT2FIX(line));
962
- }
963
- if (dr->has_column) {
964
- rb_ivar_set(dr->handler, ox_at_column_id, INT2FIX(col - 2));
965
- }
966
- args[0] = name;
967
- rb_funcall2(dr->handler, ox_end_element_id, 1, args);
968
- }
969
856
  }
970
- dr->str = 0;
857
+ end_element_cb(dr, name, line, col);
971
858
 
972
- return 0;
859
+ return c;
973
860
  }
974
861
 
975
- static int
862
+ static char
976
863
  read_text(SaxDrive dr) {
977
864
  char c;
978
- int line = dr->line;
979
- int col = dr->col - 1;
865
+ int line = dr->buf.line;
866
+ int col = dr->buf.col - 1;
980
867
 
981
- /* start marked in read_children */
982
- /*dr->str = dr->cur - 1; / * mark the start */
983
- while ('<' != (c = sax_drive_get(dr))) {
868
+ buf_backup(&dr->buf);
869
+ buf_protect(&dr->buf);
870
+ while ('<' != (c = buf_get(&dr->buf))) {
984
871
  if ('\0' == c) {
985
- sax_drive_error(dr, "invalid format, text terminated unexpectedly", 1);
986
- return -1;
872
+ ox_sax_drive_error(dr, NO_TERM "text not terminated");
873
+ break;
987
874
  }
988
875
  }
989
- *(dr->cur - 1) = '\0';
990
- if (dr->has_value) {
876
+ if ('\0' != c) {
877
+ *(dr->buf.tail - 1) = '\0';
878
+ }
879
+ if (dr->has.value) {
991
880
  VALUE args[1];
992
881
 
993
- if (dr->has_line) {
882
+ if (dr->has.line) {
994
883
  rb_ivar_set(dr->handler, ox_at_line_id, INT2FIX(line));
995
884
  }
996
- if (dr->has_column) {
885
+ if (dr->has.column) {
997
886
  rb_ivar_set(dr->handler, ox_at_column_id, INT2FIX(col));
998
887
  }
999
888
  *args = dr->value_obj;
1000
889
  rb_funcall2(dr->handler, ox_value_id, 1, args);
1001
- } else if (dr->has_text) {
890
+ } else if (dr->has.text) {
1002
891
  VALUE args[1];
1003
892
 
1004
- if (dr->convert_special) {
1005
- if (0 != collapse_special(dr->str, dr->tolerant) && 0 != strchr(dr->str, '&')) {
1006
- sax_drive_error(dr, "invalid format, special character does not end with a semicolon", 0);
1007
- }
893
+ if (dr->options.convert_special) {
894
+ ox_sax_collapse_special(dr, dr->buf.str, line, col);
1008
895
  }
1009
- args[0] = rb_str_new2(dr->str);
896
+ args[0] = rb_str_new2(dr->buf.str);
1010
897
  #if HAS_ENCODING_SUPPORT
1011
898
  if (0 != dr->encoding) {
1012
899
  rb_enc_associate(args[0], dr->encoding);
@@ -1016,93 +903,92 @@ read_text(SaxDrive dr) {
1016
903
  rb_funcall(args[0], ox_force_encoding_id, 1, dr->encoding);
1017
904
  }
1018
905
  #endif
1019
- if (dr->has_line) {
906
+ if (dr->has.line) {
1020
907
  rb_ivar_set(dr->handler, ox_at_line_id, INT2FIX(line));
1021
908
  }
1022
- if (dr->has_column) {
909
+ if (dr->has.column) {
1023
910
  rb_ivar_set(dr->handler, ox_at_column_id, INT2FIX(col));
1024
911
  }
1025
912
  rb_funcall2(dr->handler, ox_text_id, 1, args);
1026
913
  }
1027
- dr->str = 0;
914
+ dr->buf.str = 0;
1028
915
 
1029
- return 0;
916
+ return c;
1030
917
  }
1031
918
 
1032
- static const char*
1033
- read_attrs(SaxDrive dr, char c, char termc, char term2, int is_xml) {
919
+ static char
920
+ read_attrs(SaxDrive dr, char c, char termc, char term2, int is_xml, int eq_req) {
1034
921
  VALUE name = Qnil;
1035
922
  int is_encoding = 0;
1036
923
  int line;
1037
924
  int col;
1038
- char last;
1039
925
  char *attr_value;
1040
-
1041
- dr->str = dr->cur; /* lock it down */
926
+
927
+ // already protected by caller
928
+ dr->buf.str = dr->buf.tail;
1042
929
  if (is_white(c)) {
1043
- c = next_non_white(dr);
930
+ c = buf_next_non_white(&dr->buf);
1044
931
  }
1045
932
  while (termc != c && term2 != c) {
1046
- backup(dr);
1047
- line = dr->line;
1048
- col = dr->col;
933
+ buf_backup(&dr->buf);
1049
934
  if ('\0' == c) {
1050
- return "invalid format, attributes not terminated";
935
+ ox_sax_drive_error(dr, NO_TERM "attributes not terminated");
936
+ return '\0';
1051
937
  }
938
+ line = dr->buf.line;
939
+ col = dr->buf.col;
1052
940
  if ('\0' == (c = read_name_token(dr))) {
1053
- return "error reading token";
941
+ ox_sax_drive_error(dr, NO_TERM "error reading token");
942
+ return '\0';
1054
943
  }
1055
- if (is_xml && 0 == strcasecmp("encoding", dr->str)) {
944
+ if (is_xml && 0 == strcasecmp("encoding", dr->buf.str)) {
1056
945
  is_encoding = 1;
1057
946
  }
1058
- /* TBD use symbol cache */
1059
- if (dr->has_attr || dr->has_attr_value) {
1060
- name = str2sym(dr->str, dr, 0);
947
+ if (dr->has.attr || dr->has.attr_value) {
948
+ name = str2sym(dr, dr->buf.str, 0);
1061
949
  }
1062
950
  if (is_white(c)) {
1063
- c = next_non_white(dr);
951
+ c = buf_next_non_white(&dr->buf);
1064
952
  }
1065
- last = '\0';
1066
953
  if ('=' != c) {
1067
- if (dr->tolerant) {
1068
- attr_value = (char*)"";
1069
- last = c;
954
+ if (eq_req) {
955
+ dr->err = 1;
956
+ return c;
1070
957
  } else {
1071
- return "invalid format, no attribute value";
958
+ ox_sax_drive_error(dr, WRONG_CHAR "no attribute value");
959
+ attr_value = (char*)"";
1072
960
  }
1073
961
  } else {
1074
- if (0 != read_quoted_value(dr, &last)) {
1075
- return "error reading quoted value";
1076
- }
1077
- attr_value = dr->str;
962
+ line = dr->buf.line;
963
+ col = dr->buf.col;
964
+ c = read_quoted_value(dr);
965
+ attr_value = dr->buf.str;
1078
966
  if (is_encoding) {
1079
967
  #if HAS_ENCODING_SUPPORT
1080
- dr->encoding = rb_enc_find(dr->str);
968
+ dr->encoding = rb_enc_find(dr->buf.str);
1081
969
  #elif HAS_PRIVATE_ENCODING
1082
- dr->encoding = rb_str_new2(dr->str);
970
+ dr->encoding = rb_str_new2(dr->buf.str);
1083
971
  #endif
1084
972
  is_encoding = 0;
1085
973
  }
1086
974
  }
1087
- if (dr->has_attr_value) {
975
+ if (dr->has.attr_value) {
1088
976
  VALUE args[2];
1089
977
 
1090
- if (dr->has_line) {
978
+ if (dr->has.line) {
1091
979
  rb_ivar_set(dr->handler, ox_at_line_id, INT2FIX(line));
1092
980
  }
1093
- if (dr->has_column) {
981
+ if (dr->has.column) {
1094
982
  rb_ivar_set(dr->handler, ox_at_column_id, INT2FIX(col));
1095
983
  }
1096
984
  args[0] = name;
1097
985
  args[1] = dr->value_obj;
1098
986
  rb_funcall2(dr->handler, ox_attr_value_id, 2, args);
1099
- } else if (dr->has_attr) {
987
+ } else if (dr->has.attr) {
1100
988
  VALUE args[2];
1101
989
 
1102
990
  args[0] = name;
1103
- if (0 != collapse_special(dr->str, dr->tolerant) && 0 != strchr(dr->str, '&')) {
1104
- sax_drive_error(dr, "invalid format, special character does not end with a semicolon", 0);
1105
- }
991
+ ox_sax_collapse_special(dr, dr->buf.str, line, col);
1106
992
  args[1] = rb_str_new2(attr_value);
1107
993
  #if HAS_ENCODING_SUPPORT
1108
994
  if (0 != dr->encoding) {
@@ -1113,34 +999,35 @@ read_attrs(SaxDrive dr, char c, char termc, char term2, int is_xml) {
1113
999
  rb_funcall(args[1], ox_force_encoding_id, 1, dr->encoding);
1114
1000
  }
1115
1001
  #endif
1116
- if (dr->has_line) {
1002
+ if (dr->has.line) {
1117
1003
  rb_ivar_set(dr->handler, ox_at_line_id, INT2FIX(line));
1118
1004
  }
1119
- if (dr->has_column) {
1005
+ if (dr->has.column) {
1120
1006
  rb_ivar_set(dr->handler, ox_at_column_id, INT2FIX(col));
1121
1007
  }
1122
1008
  rb_funcall2(dr->handler, ox_attr_id, 2, args);
1123
1009
  }
1124
- if ('\0' != last) {
1125
- c = last;
1126
- } else {
1127
- c = next_non_white(dr);
1010
+ if (is_white(c)) {
1011
+ c = buf_next_non_white(&dr->buf);
1128
1012
  }
1129
1013
  }
1130
- dr->str = 0;
1014
+ dr->buf.str = 0;
1131
1015
 
1132
- return 0;
1016
+ return c;
1133
1017
  }
1134
1018
 
1019
+ /* The character after the word is returned. dr->buf.tail is one past that. dr->buf.str will point to the
1020
+ * token which will be '\0' terminated.
1021
+ */
1135
1022
  static char
1136
1023
  read_name_token(SaxDrive dr) {
1137
1024
  char c;
1138
1025
 
1139
- dr->str = dr->cur; /* make sure the start doesn't get compacted out */
1140
- c = sax_drive_get(dr);
1026
+ dr->buf.str = dr->buf.tail;
1027
+ c = buf_get(&dr->buf);
1141
1028
  if (is_white(c)) {
1142
- c = next_non_white(dr);
1143
- dr->str = dr->cur - 1;
1029
+ c = buf_next_non_white(&dr->buf);
1030
+ dr->buf.str = dr->buf.tail - 1;
1144
1031
  }
1145
1032
  while (1) {
1146
1033
  switch (c) {
@@ -1151,179 +1038,73 @@ read_name_token(SaxDrive dr) {
1151
1038
  case '=':
1152
1039
  case '/':
1153
1040
  case '>':
1041
+ case '<':
1154
1042
  case '\n':
1155
1043
  case '\r':
1156
- *(dr->cur - 1) = '\0';
1044
+ *(dr->buf.tail - 1) = '\0';
1157
1045
  return c;
1158
1046
  case '\0':
1159
1047
  /* documents never terminate after a name token */
1160
- sax_drive_error(dr, "invalid format, document not terminated", 1);
1048
+ ox_sax_drive_error(dr, NO_TERM "document not terminated");
1161
1049
  return '\0';
1162
1050
  default:
1163
1051
  break;
1164
1052
  }
1165
- c = sax_drive_get(dr);
1053
+ c = buf_get(&dr->buf);
1166
1054
  }
1167
1055
  return '\0';
1168
1056
  }
1169
1057
 
1170
- static int
1171
- read_quoted_value(SaxDrive dr, char *last) {
1172
- char c;
1058
+ /* The character after the quote or, if there is no quote, the character after the word, is returned. dr->buf.tail is one past
1059
+ * that. dr->buf.str will point to the token which will be '\0' terminated.
1060
+ */
1061
+ static char
1062
+ read_quoted_value(SaxDrive dr) {
1063
+ char c;
1173
1064
 
1174
- dr->str = dr->cur;
1175
- c = sax_drive_get(dr);
1065
+ c = buf_get(&dr->buf);
1176
1066
  if (is_white(c)) {
1177
- c = next_non_white(dr);
1067
+ c = buf_next_non_white(&dr->buf);
1178
1068
  }
1179
1069
  if ('"' == c || '\'' == c) {
1180
1070
  char term = c;
1181
1071
 
1182
- dr->str = dr->cur;
1183
- while (term != (c = sax_drive_get(dr))) {
1072
+ dr->buf.str = dr->buf.tail;
1073
+ while (term != (c = buf_get(&dr->buf))) {
1184
1074
  if ('\0' == c) {
1185
- sax_drive_error(dr, "invalid format, quoted value not terminated", 1);
1186
- return -1;
1075
+ ox_sax_drive_error(dr, NO_TERM "quoted value not terminated");
1076
+ return '\0';
1187
1077
  }
1188
1078
  }
1189
- } else if (dr->tolerant) {
1190
- dr->str = dr->cur - 1;
1191
- while ('\0' != (c = sax_drive_get(dr))) {
1192
- switch (c) {
1193
- case '\0':
1194
- sax_drive_error(dr, "invalid format, non quoted value not terminated", 1);
1195
- case ' ':
1196
- case '/':
1197
- case '>':
1198
- case '?': // for instructions
1199
- case '\t':
1200
- case '\n':
1201
- case '\r':
1202
- *last = c;
1203
- *(dr->cur - 1) = '\0'; /* terminate value */
1204
- return 0;
1205
- default:
1206
- break;
1207
- }
1208
- }
1209
- } else {
1210
- dr->str = dr->cur - 1;
1211
- if ('\0' == (c = next_white(dr))) {
1212
- sax_drive_error(dr, "invalid format, attibute value not in quotes", 1);
1213
- }
1214
- }
1215
- *(dr->cur - 1) = '\0'; /* terminate value */
1216
-
1217
- return 0;
1218
- }
1219
-
1220
- static VALUE
1221
- rescue_cb(VALUE rdr, VALUE err) {
1222
- #ifndef JRUBY_RUBY
1223
- /* JRuby seems to play by a different set if rules. It passes in an Fixnum
1224
- * instead of an error like other Rubies. For now assume all errors are
1225
- * EOF and deal with the results further down the line. */
1226
- #if (defined(RUBINIUS_RUBY) || (1 == RUBY_VERSION_MAJOR && 8 == RUBY_VERSION_MINOR))
1227
- if (rb_obj_class(err) != rb_eTypeError) {
1228
- #else
1229
- if (rb_obj_class(err) != rb_eEOFError) {
1230
- #endif
1231
- SaxDrive dr = (SaxDrive)rdr;
1232
-
1233
- sax_drive_cleanup(dr);
1234
- rb_raise(err, "at line %d, column %d\n", dr->line, dr->col);
1235
- }
1236
- #endif
1237
- return Qfalse;
1238
- }
1239
-
1240
- static VALUE
1241
- partial_io_cb(VALUE rdr) {
1242
- SaxDrive dr = (SaxDrive)rdr;
1243
- VALUE args[1];
1244
- VALUE rstr;
1245
- char *str;
1246
- size_t cnt;
1247
-
1248
- args[0] = ULONG2NUM(dr->buf_end - dr->cur);
1249
- rstr = rb_funcall2(dr->io, ox_readpartial_id, 1, args);
1250
- str = StringValuePtr(rstr);
1251
- cnt = strlen(str);
1252
- /*printf("*** read %lu bytes, str: '%s'\n", cnt, str); */
1253
- strcpy(dr->cur, str);
1254
- dr->read_end = dr->cur + cnt;
1255
-
1256
- return Qtrue;
1257
- }
1258
-
1259
- static VALUE
1260
- io_cb(VALUE rdr) {
1261
- SaxDrive dr = (SaxDrive)rdr;
1262
- VALUE args[1];
1263
- VALUE rstr;
1264
- char *str;
1265
- size_t cnt;
1266
-
1267
- args[0] = ULONG2NUM(dr->buf_end - dr->cur);
1268
- /*args[0] = SIZET2NUM(dr->buf_end - dr->cur); */
1269
- rstr = rb_funcall2(dr->io, ox_read_id, 1, args);
1270
- str = StringValuePtr(rstr);
1271
- cnt = strlen(str);
1272
- /*printf("*** read %lu bytes, str: '%s'\n", cnt, str); */
1273
- strcpy(dr->cur, str);
1274
- dr->read_end = dr->cur + cnt;
1275
-
1276
- return Qtrue;
1277
- }
1278
-
1279
- static int
1280
- read_from_io_partial(SaxDrive dr) {
1281
- return (Qfalse == rb_rescue(partial_io_cb, (VALUE)dr, rescue_cb, (VALUE)dr));
1282
- }
1283
-
1284
- static int
1285
- read_from_io(SaxDrive dr) {
1286
- return (Qfalse == rb_rescue(io_cb, (VALUE)dr, rescue_cb, (VALUE)dr));
1287
- }
1288
-
1289
- #ifndef JRUBY_RUBY
1290
- static int
1291
- read_from_fd(SaxDrive dr) {
1292
- ssize_t cnt;
1293
- size_t max = dr->buf_end - dr->cur;
1294
-
1295
- cnt = read(dr->fd, dr->cur, max);
1296
- if (cnt < 0) {
1297
- sax_drive_error(dr, "failed to read from file", 1);
1298
- return -1;
1299
- } else if (0 != cnt) {
1300
- dr->read_end = dr->cur + cnt;
1079
+ // dr->buf.tail is one past quote char
1080
+ *(dr->buf.tail - 1) = '\0'; /* terminate value */
1081
+ c = buf_get(&dr->buf);
1082
+ return c;
1301
1083
  }
1302
- return 0;
1303
- }
1304
- #endif
1305
-
1306
- static int
1307
- read_from_str(SaxDrive dr) {
1308
- size_t max = dr->buf_end - dr->cur - 1;
1309
- char *s;
1310
- long cnt;
1311
-
1312
- if ('\0' == *dr->in_str) {
1313
- /* done */
1314
- return -1;
1084
+ // not quoted, look for something that terminates the string
1085
+ dr->buf.str = dr->buf.tail - 1;
1086
+ ox_sax_drive_error(dr, WRONG_CHAR "attribute value not in quotes");
1087
+ while ('\0' != (c = buf_get(&dr->buf))) {
1088
+ switch (c) {
1089
+ case ' ':
1090
+ case '/':
1091
+ case '>':
1092
+ case '?': // for instructions
1093
+ case '\t':
1094
+ case '\n':
1095
+ case '\r':
1096
+ *(dr->buf.tail - 1) = '\0'; /* terminate value */
1097
+ // dr->buf.tail is in the correct position, one after the word terminator
1098
+ return c;
1099
+ default:
1100
+ break;
1101
+ }
1315
1102
  }
1316
- s = stpncpy(dr->cur, dr->in_str, max);
1317
- *s = '\0';
1318
- cnt = s - dr->cur;
1319
- dr->in_str += cnt;
1320
- dr->read_end = dr->cur + cnt;
1321
-
1322
- return 0;
1103
+ return '\0'; // should never get here
1323
1104
  }
1324
1105
 
1325
- static int
1326
- collapse_special(char *str, int tolerant) {
1106
+ int
1107
+ ox_sax_collapse_special(SaxDrive dr, char *str, int line, int col) {
1327
1108
  char *s = str;
1328
1109
  char *b = str;
1329
1110
 
@@ -1332,7 +1113,7 @@ collapse_special(char *str, int tolerant) {
1332
1113
  int c;
1333
1114
  char *end;
1334
1115
  int x = 0;
1335
-
1116
+
1336
1117
  s++;
1337
1118
  if ('#' == *s) {
1338
1119
  s++;
@@ -1344,46 +1125,47 @@ collapse_special(char *str, int tolerant) {
1344
1125
  c = (int)strtol(s, &end, 10);
1345
1126
  }
1346
1127
  if (';' != *end) {
1347
- if (tolerant) {
1348
- *b++ = '&';
1349
- *b++ = '#';
1350
- if (x) {
1351
- *b++ = *(s - 1);
1352
- }
1353
- continue;
1128
+ ox_sax_drive_error(dr, NO_TERM "special character does not end with a semicolon");
1129
+ *b++ = '&';
1130
+ *b++ = '#';
1131
+ if (x) {
1132
+ *b++ = *(s - 1);
1354
1133
  }
1355
- return EDOM;
1134
+ continue;
1356
1135
  }
1136
+ col += (int)(end - s);
1357
1137
  s = end + 1;
1358
1138
  } else if (0 == strncasecmp(s, "lt;", 3)) {
1359
1139
  c = '<';
1360
1140
  s += 3;
1141
+ col += 3;
1361
1142
  } else if (0 == strncasecmp(s, "gt;", 3)) {
1362
1143
  c = '>';
1363
1144
  s += 3;
1145
+ col += 3;
1364
1146
  } else if (0 == strncasecmp(s, "amp;", 4)) {
1365
1147
  c = '&';
1366
1148
  s += 4;
1149
+ col += 4;
1367
1150
  } else if (0 == strncasecmp(s, "quot;", 5)) {
1368
1151
  c = '"';
1369
1152
  s += 5;
1153
+ col += 5;
1370
1154
  } else if (0 == strncasecmp(s, "apos;", 5)) {
1371
1155
  c = '\'';
1372
1156
  s += 5;
1373
- } else if (tolerant) {
1374
- *b++ = '&';
1375
- continue;
1376
1157
  } else {
1377
- c = '?';
1378
- while (';' != *s++) {
1379
- if ('\0' == *s) {
1380
- return EDOM;
1381
- }
1382
- }
1383
- s++;
1158
+ ox_sax_drive_error_at(dr, NO_TERM "special character does not end with a semicolon", line, col);
1159
+ c = '&';
1384
1160
  }
1385
1161
  *b++ = (char)c;
1162
+ col++;
1386
1163
  } else {
1164
+ if ('\n' == *s) {
1165
+ line++;
1166
+ col = 0;
1167
+ }
1168
+ col++;
1387
1169
  *b++ = *s++;
1388
1170
  }
1389
1171
  }
@@ -1392,214 +1174,58 @@ collapse_special(char *str, int tolerant) {
1392
1174
  return 0;
1393
1175
  }
1394
1176
 
1395
- static VALUE
1396
- parse_double_time(const char *text) {
1397
- long v = 0;
1398
- long v2 = 0;
1399
- const char *dot = 0;
1400
- char c;
1401
-
1402
- for (; '.' != *text; text++) {
1403
- c = *text;
1404
- if (c < '0' || '9' < c) {
1405
- return Qnil;
1406
- }
1407
- v = 10 * v + (long)(c - '0');
1408
- }
1409
- dot = text++;
1410
- for (; '\0' != *text && text - dot <= 6; text++) {
1411
- c = *text;
1412
- if (c < '0' || '9' < c) {
1413
- return Qnil;
1414
- }
1415
- v2 = 10 * v2 + (long)(c - '0');
1416
- }
1417
- for (; text - dot <= 9; text++) {
1418
- v2 *= 10;
1419
- }
1420
- #if HAS_NANO_TIME
1421
- return rb_time_nano_new(v, v2);
1422
- #else
1423
- return rb_time_new(v, v2 / 1000);
1424
- #endif
1425
- }
1177
+ static void
1178
+ hint_clear_empty(SaxDrive dr) {
1179
+ Nv nv;
1426
1180
 
1427
- typedef struct _Tp {
1428
- int cnt;
1429
- char end;
1430
- char alt;
1431
- } *Tp;
1432
-
1433
- static VALUE
1434
- parse_xsd_time(const char *text) {
1435
- long cargs[10];
1436
- long *cp = cargs;
1437
- long v;
1438
- int i;
1439
- char c = '\0';
1440
- struct _Tp tpa[10] = { { 4, '-', '-' },
1441
- { 2, '-', '-' },
1442
- { 2, 'T', ' ' },
1443
- { 2, ':', ':' },
1444
- { 2, ':', ':' },
1445
- { 2, '.', '.' },
1446
- { 9, '+', '-' },
1447
- { 2, ':', ':' },
1448
- { 2, '\0', '\0' },
1449
- { 0, '\0', '\0' } };
1450
- Tp tp = tpa;
1451
- struct tm tm;
1452
-
1453
- memset(cargs, 0, sizeof(cargs));
1454
- for (; 0 != tp->cnt; tp++) {
1455
- for (i = tp->cnt, v = 0; 0 < i ; text++, i--) {
1456
- c = *text;
1457
- if (c < '0' || '9' < c) {
1458
- if ('\0' == c || tp->end == c || tp->alt == c) {
1459
- break;
1460
- }
1461
- return Qnil;
1462
- }
1463
- v = 10 * v + (long)(c - '0');
1464
- }
1465
- if ('\0' == c) {
1181
+ for (nv = stack_peek(&dr->stack); 0 != nv; nv = stack_peek(&dr->stack)) {
1182
+ if (0 == nv->hint) {
1466
1183
  break;
1467
1184
  }
1468
- c = *text++;
1469
- if (tp->end != c && tp->alt != c) {
1470
- return Qnil;
1471
- }
1472
- *cp++ = v;
1473
- }
1474
- tm.tm_year = (int)cargs[0] - 1900;
1475
- tm.tm_mon = (int)cargs[1] - 1;
1476
- tm.tm_mday = (int)cargs[2];
1477
- tm.tm_hour = (int)cargs[3];
1478
- tm.tm_min = (int)cargs[4];
1479
- tm.tm_sec = (int)cargs[5];
1480
- #if HAS_NANO_TIME
1481
- return rb_time_nano_new(mktime(&tm), cargs[6]);
1482
- #else
1483
- return rb_time_new(mktime(&tm), cargs[6] / 1000);
1484
- #endif
1485
- }
1486
-
1487
- static VALUE
1488
- sax_value_as_s(VALUE self) {
1489
- SaxDrive dr = DATA_PTR(self);
1490
- VALUE rs;
1491
-
1492
- if ('\0' == *dr->str) {
1493
- return Qnil;
1494
- }
1495
- if (dr->convert_special) {
1496
- if (0 != collapse_special(dr->str, dr->tolerant) && 0 != strchr(dr->str, '&')) {
1497
- sax_drive_error(dr, "invalid format, special character does not end with a semicolon", 0);
1185
+ if (nv->hint->empty) {
1186
+ end_element_cb(dr, nv->val, dr->buf.line, dr->buf.col);
1187
+ stack_pop(&dr->stack);
1188
+ } else {
1189
+ break;
1498
1190
  }
1499
1191
  }
1500
- rs = rb_str_new2(dr->str);
1501
- #if HAS_ENCODING_SUPPORT
1502
- if (0 != dr->encoding) {
1503
- rb_enc_associate(rs, dr->encoding);
1504
- }
1505
- #elif HAS_PRIVATE_ENCODING
1506
- if (Qnil != dr->encoding) {
1507
- rb_funcall(rs, ox_force_encoding_id, 1, dr->encoding);
1508
- }
1509
- #endif
1510
- return rs;
1511
1192
  }
1512
1193
 
1513
- static VALUE
1514
- sax_value_as_sym(VALUE self) {
1515
- SaxDrive dr = DATA_PTR(self);
1194
+ static Nv
1195
+ hint_try_close(SaxDrive dr, const char *name) {
1196
+ Hint h = ox_hint_find(dr->hints, name);
1197
+ Nv nv;
1516
1198
 
1517
- if ('\0' == *dr->str) {
1518
- return Qnil;
1199
+ if (0 == h) {
1200
+ return 0;
1519
1201
  }
1520
- return str2sym(dr->str, dr, 0);
1521
- }
1522
-
1523
- static VALUE
1524
- sax_value_as_f(VALUE self) {
1525
- SaxDrive dr = DATA_PTR(self);
1526
-
1527
- if ('\0' == *dr->str) {
1528
- return Qnil;
1529
- }
1530
- return rb_float_new(strtod(dr->str, 0));
1531
- }
1532
-
1533
- static VALUE
1534
- sax_value_as_i(VALUE self) {
1535
- SaxDrive dr = DATA_PTR(self);
1536
- const char *s = dr->str;
1537
- long n = 0;
1538
- int neg = 0;
1539
-
1540
- if ('\0' == *s) {
1541
- return Qnil;
1542
- }
1543
- if ('-' == *s) {
1544
- neg = 1;
1545
- s++;
1546
- } else if ('+' == *s) {
1547
- s++;
1548
- }
1549
- for (; '\0' != *s; s++) {
1550
- if ('0' <= *s && *s <= '9') {
1551
- n = n * 10 + (*s - '0');
1202
+ for (nv = stack_peek(&dr->stack); 0 != nv; nv = stack_peek(&dr->stack)) {
1203
+ if (0 == strcasecmp(name, nv->name)) {
1204
+ stack_pop(&dr->stack);
1205
+ return nv;
1206
+ }
1207
+ if (0 == nv->hint) {
1208
+ break;
1209
+ }
1210
+ if (nv->hint->empty) {
1211
+ end_element_cb(dr, nv->val, dr->buf.line, dr->buf.col);
1212
+ dr->stack.tail = nv;
1552
1213
  } else {
1553
- rb_raise(ox_arg_error_class, "Not a valid Fixnum.\n");
1214
+ break;
1554
1215
  }
1555
1216
  }
1556
- if (neg) {
1557
- n = -n;
1558
- }
1559
- return LONG2NUM(n);
1217
+ return 0;
1560
1218
  }
1561
1219
 
1562
- static VALUE
1563
- sax_value_as_time(VALUE self) {
1564
- SaxDrive dr = DATA_PTR(self);
1565
- const char *str = dr->str;
1566
- VALUE t;
1567
-
1568
- if ('\0' == *str) {
1569
- return Qnil;
1570
- }
1571
- if (Qnil == (t = parse_double_time(str)) &&
1572
- Qnil == (t = parse_xsd_time(str))) {
1573
- VALUE args[1];
1574
-
1575
- /*printf("**** time parse\n"); */
1576
- *args = rb_str_new2(str);
1577
- t = rb_funcall2(ox_time_class, ox_parse_id, 1, args);
1220
+ static void
1221
+ end_element_cb(SaxDrive dr, VALUE name, int line, int col) {
1222
+ if (dr->has.end_element) {
1223
+ if (dr->has.line) {
1224
+ rb_ivar_set(dr->handler, ox_at_line_id, INT2FIX(line));
1225
+ }
1226
+ if (dr->has.column) {
1227
+ rb_ivar_set(dr->handler, ox_at_column_id, INT2FIX(col));
1228
+ }
1229
+ rb_funcall(dr->handler, ox_end_element_id, 1, name);
1578
1230
  }
1579
- return t;
1580
- }
1581
-
1582
- static VALUE
1583
- sax_value_as_bool(VALUE self) {
1584
- return (0 == strcasecmp("true", ((SaxDrive)DATA_PTR(self))->str)) ? Qtrue : Qfalse;
1585
- }
1586
-
1587
- static VALUE
1588
- sax_value_empty(VALUE self) {
1589
- return ('\0' == *((SaxDrive)DATA_PTR(self))->str) ? Qtrue : Qfalse;
1590
- }
1591
-
1592
- void
1593
- ox_sax_define() {
1594
- VALUE sax_module = rb_const_get_at(Ox, rb_intern("Sax"));
1595
-
1596
- sax_value_class = rb_define_class_under(sax_module, "Value", rb_cObject);
1597
-
1598
- rb_define_method(sax_value_class, "as_s", sax_value_as_s, 0);
1599
- rb_define_method(sax_value_class, "as_sym", sax_value_as_sym, 0);
1600
- rb_define_method(sax_value_class, "as_i", sax_value_as_i, 0);
1601
- rb_define_method(sax_value_class, "as_f", sax_value_as_f, 0);
1602
- rb_define_method(sax_value_class, "as_time", sax_value_as_time, 0);
1603
- rb_define_method(sax_value_class, "as_bool", sax_value_as_bool, 0);
1604
- rb_define_method(sax_value_class, "empty?", sax_value_empty, 0);
1605
1231
  }