ox 1.2.15 → 1.3.0

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of ox might be problematic. Click here for more details.

@@ -1,17 +1,38 @@
1
- = Ox: A fast XML parser and Object marshaller.
1
+ # Ox gem
2
+ A fast XML parser and Object marshaller as a Ruby gem.
3
+
4
+ ## <a name="installation">Installation</a>
5
+ gem install ox
6
+
7
+ ## <a name="source">Source</a>
2
8
 
3
9
  *GitHub* *repo*: https://github.com/ohler55/ox
4
10
 
5
11
  *RubyGems* *repo*: https://rubygems.org/gems/ox
6
12
 
7
- === Description:
13
+ ## <a name="follow">Follow @oxgem on Twitter</a>
14
+
15
+ [Follow @oxgem on Twitter](http://twitter.com/#!/oxgem) for announcements and news about the Ox gem.
16
+
17
+ ## <a name="build_status">Build Status</a>
18
+
19
+ [![Build Status](http://travis-ci.org/ohler55/ox.png)](http://travis-ci.org/ohler55/ox)
20
+
21
+ ## <a name="release">Release Notes</a>
22
+
23
+ ### Release 1.3.0
24
+
25
+ - fixed Mutex dump bug
26
+ - added SAX parser, 30+ times faster than Nokogiri and 10+ times faster than LibXML
27
+
28
+ ## <a name="description">Description</a>
8
29
 
9
30
  Optimized XML (Ox), as the name implies, was written to provide speed-optimized
10
31
  XML handling. It was designed to be an alternative to Nokogiri in generic XML
11
32
  parsing and as an alternative to Marshal for Object serialization.
12
33
 
13
- Nokogiri relies on libXml while Ox is self contained. Ox uses nothing other
14
- than standard C libraries so version issues with libXml are not an issue.
34
+ Unlike Nokogiri, Ox is self-contained. Ox uses nothing other than standard C
35
+ libraries so version issues with libXml are not an issue.
15
36
 
16
37
  Marshal uses a binary format for serializing Objects. That binary format
17
38
  changes with releases making Marshal dumped Object incompatible between some
@@ -25,9 +46,9 @@ It is possible to write an XML serialization gem with Nokogiri but writing
25
46
  such a package in Ruby results in a module significantly slower than
26
47
  Marshal. This is what triggered the start of Ox development.
27
48
 
28
- Ox handles XML documents in two ways. It is a generic XML parser and writer as
29
- well as a fast Object / XML marshaller. Ox was written for speed as a
30
- replacement for Nokogiri and for Marshal.
49
+ Ox handles XML documents in three ways. It is a generic XML parser and writer,
50
+ a fast Object / XML marshaller, and a stream SAX parser. Ox was written for
51
+ speed as a replacement for Nokogiri, Ruby LibXML, and for Marshal.
31
52
 
32
53
  As an XML parser it is 2 or more times faster than Nokogiri and as a generic
33
54
  XML writer it is as much as 20 times faster than Nokogiri. Of course different
@@ -36,53 +57,60 @@ files may result in slightly different times.
36
57
  As an Object serializer Ox is up to 6 times faster than the standard Ruby
37
58
  Marshal.dump() and up to 3 times faster than Marshal.load().
38
59
 
39
-
40
- === Object Dump Sample:
41
-
42
- require 'ox'
43
-
44
- class Sample
45
- attr_accessor :a, :b, :c
46
-
47
- def initialize(a, b, c)
48
- @a = a
49
- @b = b
50
- @c = c
60
+ The SAX-like stream parser is over 30 times faster than Nokogiri and more than
61
+ 10 times faster than LibXML when using a trivial Ruby side set of
62
+ callbacks. Unlike Nokogiri and LibXML, Ox can be tuned to use only the SAX
63
+ callbacks that are of interest to the caller. (See the perf_sax.rb file for an
64
+ example.)
65
+
66
+ Ox is compatible with Ruby 1.8.7, 1.9.2, JRuby, and RBX.
67
+
68
+ ### Object Dump Sample:
69
+
70
+ require 'ox'
71
+
72
+ class Sample
73
+ attr_accessor :a, :b, :c
74
+
75
+ def initialize(a, b, c)
76
+ @a = a
77
+ @b = b
78
+ @c = c
79
+ end
51
80
  end
52
- end
53
-
54
- # Create Object
55
- obj = Sample.new(1, "bee", ['x', :y, 7.0])
56
- # Now dump the Object to an XML String.
57
- xml = Ox.dump(obj)
58
- # Convert the object back into a Sample Object.
59
- obj2 = Ox.parse_obj(xml)
60
-
61
- === Generic XML Writing and Parsing:
62
-
63
- require 'ox'
64
-
65
- doc = Ox::Document.new(:version => '1.0')
66
-
67
- top = Ox::Element.new('top')
68
- top[:name] = 'sample'
69
- doc << top
70
-
71
- mid = Ox::Element.new('middle')
72
- mid[:name] = 'second'
73
- top << mid
74
-
75
- bot = Ox::Element.new('bottom')
76
- bot[:name] = 'third'
77
- mid << bot
78
-
79
- xml = Ox.dump(doc)
80
- puts xml
81
- doc2 = Ox.parse(xml)
82
- puts "Same? #{doc == doc2}"
83
-
84
-
85
- === Object XML format
81
+
82
+ # Create Object
83
+ obj = Sample.new(1, "bee", ['x', :y, 7.0])
84
+ # Now dump the Object to an XML String.
85
+ xml = Ox.dump(obj)
86
+ # Convert the object back into a Sample Object.
87
+ obj2 = Ox.parse_obj(xml)
88
+
89
+ ### Generic XML Writing and Parsing:
90
+
91
+ require 'ox'
92
+
93
+ doc = Ox::Document.new(:version => '1.0')
94
+
95
+ top = Ox::Element.new('top')
96
+ top[:name] = 'sample'
97
+ doc << top
98
+
99
+ mid = Ox::Element.new('middle')
100
+ mid[:name] = 'second'
101
+ top << mid
102
+
103
+ bot = Ox::Element.new('bottom')
104
+ bot[:name] = 'third'
105
+ mid << bot
106
+
107
+ xml = Ox.dump(doc)
108
+ puts xml
109
+ doc2 = Ox.parse(xml)
110
+ puts "Same? #{doc == doc2}"
111
+
112
+
113
+ ### Object XML format
86
114
 
87
115
  The XML format used for Object encoding follows the structure of the
88
116
  Object. Each XML element is encoded so that the XML element name is a type
@@ -127,25 +155,25 @@ interpreter.)
127
155
  Values are encoded as the text portion of an element or in the sub-elements
128
156
  of the principal. For example, a Fixnum is encoded as:
129
157
 
130
- <i>123</i>
158
+ <i>123</i>
131
159
 
132
160
  An Array has sub-elements and is encoded similar to this example.
133
161
 
134
- <a>
135
- <i>1</i>
136
- <s>abc</s>
137
- </a>
162
+ <a>
163
+ <i>1</i>
164
+ <s>abc</s>
165
+ </a>
138
166
 
139
167
  A Hash is encoded with an even number of elements where the first element is
140
168
  the key and the second is the value. This is repeated for each entry in the
141
169
  Hash. An example of the { 1 => 'one', 2 => 'two' } encoding is:
142
170
 
143
- <h>
144
- <i>1</i>
145
- <s>one</s>
146
- <i>2</i>
147
- <s>two</s>
148
- </h>
171
+ <h>
172
+ <i>1</i>
173
+ <s>one</s>
174
+ <i>2</i>
175
+ <s>two</s>
176
+ </h>
149
177
 
150
178
  Strings with characters not allowed in XML are base64 encoded and will be
151
179
  converted back into a String when loaded.
data/ext/ox/dump.c CHANGED
@@ -829,14 +829,14 @@ static int
829
829
  dump_var(ID key, VALUE value, Out out) {
830
830
  if (T_DATA == rb_type(value) && rb_cTime != rb_obj_class(value)) {
831
831
  /* There is a secret recipe that keeps Exception mesg attributes as a
832
- * T_DATA until it is needed. StringValue() makes the value needed and
833
- * it is converted to a regular Ruby Object. It might seem reasonable
834
- * to expect that this would be done before calling the foreach
835
- * callback but it isn't. A slight hack fixes the inconsistency. If
836
- * the var is not something that can be represented as a String then
837
- * this will fail.
832
+ * T_DATA until it is needed. StringValue() or a safer method of
833
+ * calling to_s() makes the value needed and it is converted to a
834
+ * regular Ruby Object. It might seem reasonable to expect that this
835
+ * would be done before calling the foreach callback but it isn't. A
836
+ * slight hack fixes the inconsistency. If the var is not something
837
+ * that can be represented as a String then this will fail.
838
838
  */
839
- StringValue(value);
839
+ rb_funcall(value, to_s_id, 0);
840
840
  }
841
841
  dump_obj(key, value, out->depth, out);
842
842
 
data/ext/ox/ox.c CHANGED
@@ -48,15 +48,24 @@ VALUE Ox = Qnil;
48
48
  ID at_id;
49
49
  ID attributes_id;
50
50
  ID beg_id;
51
+ ID cdata_id;
52
+ ID comment_id;
51
53
  ID den_id;
54
+ ID doctype_id;
55
+ ID end_element_id;
52
56
  ID end_id;
57
+ ID error_id;
53
58
  ID excl_id;
54
59
  ID inspect_id;
60
+ ID instruct_id;
55
61
  ID keys_id;
56
62
  ID local_id;
57
63
  ID nodes_id;
58
64
  ID num_id;
59
65
  ID parse_id;
66
+ ID readpartial_id;
67
+ ID start_element_id;
68
+ ID text_id;
60
69
  ID to_c_id;
61
70
  ID to_s_id;
62
71
  ID tv_sec_id;
@@ -140,7 +149,7 @@ static VALUE
140
149
  get_def_opts(VALUE self) {
141
150
  VALUE opts = rb_hash_new();
142
151
  int elen = (int)strlen(default_options.encoding);
143
-
152
+
144
153
  rb_hash_aset(opts, encoding_sym, (0 == elen) ? Qnil : rb_str_new(default_options.encoding, elen));
145
154
  rb_hash_aset(opts, indent_sym, INT2FIX(default_options.indent));
146
155
  rb_hash_aset(opts, trace_sym, INT2FIX(default_options.trace));
@@ -434,6 +443,20 @@ load_file(int argc, VALUE *argv, VALUE self) {
434
443
  return load(xml, argc - 1, argv + 1, self);
435
444
  }
436
445
 
446
+ /* call-seq: sax_parse(handler, io)
447
+ *
448
+ * Parses an IO stream or file containing an XML document. Raises an exception
449
+ * if the XML is malformed or the classes specified are not valid.
450
+ * @param [Ox::Sax] handler SAX-like handler (responds to Ox::Sax methods)
451
+ * @param [IO|String] io IO Object to read from
452
+ */
453
+ static VALUE
454
+ sax_parse(VALUE self, VALUE handler, VALUE io) {
455
+ ox_sax_parse(handler, io);
456
+
457
+ return Qnil;
458
+ }
459
+
437
460
  static void
438
461
  parse_dump_options(VALUE ropts, Options copts) {
439
462
  struct _YesNoOpt ynos[] = {
@@ -583,6 +606,7 @@ void Init_ox() {
583
606
  rb_define_module_function(Ox, "parse_obj", to_obj, 1);
584
607
  rb_define_module_function(Ox, "parse", to_gen, 1);
585
608
  rb_define_module_function(Ox, "load", load_str, -1);
609
+ rb_define_module_function(Ox, "sax_parse", sax_parse, 2);
586
610
 
587
611
  rb_define_module_function(Ox, "to_xml", dump, -1);
588
612
  rb_define_module_function(Ox, "dump", dump, -1);
@@ -591,23 +615,32 @@ void Init_ox() {
591
615
  rb_define_module_function(Ox, "to_file", to_file, -1);
592
616
 
593
617
  rb_require("time");
594
- parse_id = rb_intern("parse");
595
- local_id = rb_intern("local");
596
618
  at_id = rb_intern("at");
597
- inspect_id = rb_intern("inspect");
619
+ attributes_id = rb_intern("@attributes");
598
620
  beg_id = rb_intern("@beg");
599
- end_id = rb_intern("@end");
621
+ cdata_id = rb_intern("cdata");
622
+ comment_id = rb_intern("comment");
600
623
  den_id = rb_intern("@den");
624
+ doctype_id = rb_intern("doctype");
625
+ end_element_id = rb_intern("end_element");
626
+ end_id = rb_intern("@end");
627
+ error_id = rb_intern("error");
601
628
  excl_id = rb_intern("@excl");
602
- value_id = rb_intern("@value");
629
+ inspect_id = rb_intern("inspect");
630
+ instruct_id = rb_intern("instruct");
631
+ keys_id = rb_intern("keys");
632
+ local_id = rb_intern("local");
603
633
  nodes_id = rb_intern("@nodes");
604
634
  num_id = rb_intern("@num");
605
- attributes_id = rb_intern("@attributes");
606
- keys_id = rb_intern("keys");
607
- tv_sec_id = rb_intern("tv_sec");
608
- tv_usec_id = rb_intern("tv_usec");
635
+ parse_id = rb_intern("parse");
636
+ readpartial_id = rb_intern("readpartial");
637
+ start_element_id = rb_intern("start_element");
638
+ text_id = rb_intern("text");
609
639
  to_c_id = rb_intern("to_c");
610
640
  to_s_id = rb_intern("to_s");
641
+ tv_sec_id = rb_intern("tv_sec");
642
+ tv_usec_id = rb_intern("tv_usec");
643
+ value_id = rb_intern("@value");
611
644
 
612
645
  time_class = rb_const_get(rb_cObject, rb_intern("Time"));
613
646
  struct_class = rb_const_get(rb_cObject, rb_intern("Struct"));
data/ext/ox/ox.h CHANGED
@@ -198,6 +198,8 @@ typedef struct _Options {
198
198
  extern VALUE parse(char *xml, ParseCallbacks pcb, char **endp, int trace, Effort effort);
199
199
  extern void _raise_error(const char *msg, const char *xml, const char *current, const char* file, int line);
200
200
 
201
+ extern void ox_sax_parse(VALUE handler, VALUE io);
202
+
201
203
  extern char* write_obj_to_str(VALUE obj, Options copts);
202
204
  extern void write_obj_to_file(VALUE obj, const char *path, Options copts);
203
205
 
@@ -206,15 +208,24 @@ extern VALUE Ox;
206
208
  extern ID at_id;
207
209
  extern ID attributes_id;
208
210
  extern ID beg_id;
211
+ extern ID cdata_id;
212
+ extern ID comment_id;
209
213
  extern ID den_id;
214
+ extern ID doctype_id;
215
+ extern ID end_element_id;
210
216
  extern ID end_id;
217
+ extern ID error_id;
211
218
  extern ID excl_id;
212
219
  extern ID inspect_id;
220
+ extern ID instruct_id;
213
221
  extern ID keys_id;
214
222
  extern ID local_id;
215
223
  extern ID nodes_id;
216
224
  extern ID num_id;
217
225
  extern ID parse_id;
226
+ extern ID readpartial_id;
227
+ extern ID start_element_id;
228
+ extern ID text_id;
218
229
  extern ID to_c_id;
219
230
  extern ID to_s_id;
220
231
  extern ID tv_sec_id;
data/ext/ox/sax.c ADDED
@@ -0,0 +1,758 @@
1
+ /* sax.c
2
+ * Copyright (c) 2011, Peter Ohler
3
+ * All rights reserved.
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * - Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * - Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * - Neither the name of Peter Ohler nor the names of its contributors may be
16
+ * used to endorse or promote products derived from this software without
17
+ * specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ */
30
+
31
+ #include <stdlib.h>
32
+ #include <errno.h>
33
+ #include <stdio.h>
34
+ #include <string.h>
35
+ #include <sys/types.h>
36
+ #include <sys/uio.h>
37
+ #include <unistd.h>
38
+
39
+ #include "ruby.h"
40
+ #include "ox.h"
41
+
42
+ typedef struct _SaxDrive {
43
+ char base_buf[0x00010000];
44
+ //char base_buf[0x00000010];
45
+ char *buf;
46
+ char *buf_end;
47
+ char *cur;
48
+ char *read_end; // one past last character read
49
+ char *str; // start of current string being read
50
+ int line;
51
+ int col;
52
+ VALUE handler;
53
+ int (*read_func)(struct _SaxDrive *dr);
54
+ union {
55
+ int fd;
56
+ VALUE io;
57
+ };
58
+ int has_instruct;
59
+ int has_doctype;
60
+ int has_comment;
61
+ int has_cdata;
62
+ int has_text;
63
+ int has_start_element;
64
+ int has_end_element;
65
+ int has_error;
66
+ rb_encoding *encoding;
67
+ } *SaxDrive;
68
+
69
+ static void sax_drive_init(SaxDrive dr, VALUE handler, VALUE io);
70
+ static void sax_drive_cleanup(SaxDrive dr);
71
+ static int sax_drive_read(SaxDrive dr);
72
+ static void sax_drive_error(SaxDrive dr, const char *msg, int critical);
73
+
74
+ static int read_children(SaxDrive dr, int first);
75
+ static int read_instruction(SaxDrive dr);
76
+ static int read_doctype(SaxDrive dr);
77
+ static int read_cdata(SaxDrive dr);
78
+ static int read_comment(SaxDrive dr);
79
+ static int read_element(SaxDrive dr);
80
+ static int read_text(SaxDrive dr);
81
+ static int read_attrs(SaxDrive dr, VALUE *attrs, char c, char termc, char term2);
82
+ static char read_name_token(SaxDrive dr);
83
+ static int read_quoted_value(SaxDrive dr);
84
+
85
+ static VALUE io_cb(VALUE rdr);
86
+ static int read_from_io(SaxDrive dr);
87
+ static int read_from_fd(SaxDrive dr);
88
+
89
+ static inline char
90
+ sax_drive_get(SaxDrive dr) {
91
+ if (dr->read_end <= dr->cur) {
92
+ if (0 != sax_drive_read(dr)) {
93
+ return 0;
94
+ }
95
+ }
96
+ if ('\n' == *dr->cur) {
97
+ dr->line++;
98
+ dr->col = 0;
99
+ }
100
+ dr->col++;
101
+
102
+ return *dr->cur++;
103
+ }
104
+
105
+ /* Starts by reading a character so it is safe to use with an empty or
106
+ * compacted buffer.
107
+ */
108
+ inline static char
109
+ next_non_white(SaxDrive dr) {
110
+ char c;
111
+
112
+ while ('\0' != (c = sax_drive_get(dr))) {
113
+ switch(c) {
114
+ case ' ':
115
+ case '\t':
116
+ case '\f':
117
+ case '\n':
118
+ case '\r':
119
+ break;
120
+ default:
121
+ return c;
122
+ }
123
+ }
124
+ return '\0';
125
+ }
126
+
127
+ inline static int
128
+ is_white(char c) {
129
+ switch(c) {
130
+ case ' ':
131
+ case '\t':
132
+ case '\f':
133
+ case '\n':
134
+ case '\r':
135
+ return 1;
136
+ default:
137
+ break;
138
+ }
139
+ return 0;
140
+ }
141
+
142
+ inline static VALUE
143
+ str2sym(const char *str) {
144
+ VALUE *slot;
145
+ VALUE sym;
146
+
147
+ if (Qundef == (sym = ox_cache_get(symbol_cache, str, &slot))) {
148
+ sym = ID2SYM(rb_intern(str));
149
+ *slot = sym;
150
+ }
151
+ return sym;
152
+ }
153
+
154
+
155
+ void
156
+ ox_sax_parse(VALUE handler, VALUE io) {
157
+ struct _SaxDrive dr;
158
+
159
+ sax_drive_init(&dr, handler, io);
160
+ #if 0
161
+ printf("*** sax_parse with these flags\n");
162
+ printf(" has_instruct = %s\n", dr.has_instruct ? "true" : "false");
163
+ printf(" has_doctype = %s\n", dr.has_doctype ? "true" : "false");
164
+ printf(" has_comment = %s\n", dr.has_comment ? "true" : "false");
165
+ printf(" has_cdata = %s\n", dr.has_cdata ? "true" : "false");
166
+ printf(" has_text = %s\n", dr.has_text ? "true" : "false");
167
+ printf(" has_start_element = %s\n", dr.has_start_element ? "true" : "false");
168
+ printf(" has_end_element = %s\n", dr.has_end_element ? "true" : "false");
169
+ printf(" has_error = %s\n", dr.has_error ? "true" : "false");
170
+ #endif
171
+ read_children(&dr, 1);
172
+ sax_drive_cleanup(&dr);
173
+ }
174
+
175
+ static void
176
+ sax_drive_init(SaxDrive dr, VALUE handler, VALUE io) {
177
+ if (rb_respond_to(io, readpartial_id)) {
178
+ VALUE rfd;
179
+
180
+ if (rb_respond_to(io, rb_intern("fileno")) && Qnil != (rfd = rb_funcall(io, rb_intern("fileno"), 0))) {
181
+ dr->read_func = read_from_fd;
182
+ dr->fd = FIX2INT(rfd);
183
+ } else {
184
+ dr->read_func = read_from_io;
185
+ dr->io = io;
186
+ }
187
+ } else {
188
+ rb_raise(rb_eArgError, "sax_parser io argument must respond to readpartial().\n");
189
+ }
190
+ dr->buf = dr->base_buf;
191
+ *dr->buf = '\0';
192
+ dr->buf_end = dr->buf + sizeof(dr->base_buf) - 1; // 1 less to make debugging easier
193
+ dr->cur = dr->buf;
194
+ dr->read_end = dr->buf;
195
+ dr->str = 0;
196
+ dr->line = 1;
197
+ dr->col = 0;
198
+ dr->handler = handler;
199
+ dr->has_instruct = rb_respond_to(handler, instruct_id);
200
+ dr->has_doctype = rb_respond_to(handler, doctype_id);
201
+ dr->has_comment = rb_respond_to(handler, comment_id);
202
+ dr->has_cdata = rb_respond_to(handler, cdata_id);
203
+ dr->has_text = rb_respond_to(handler, text_id);
204
+ dr->has_start_element = rb_respond_to(handler, start_element_id);
205
+ dr->has_end_element = rb_respond_to(handler, end_element_id);
206
+ dr->has_error = rb_respond_to(handler, error_id);
207
+ dr->encoding = 0;
208
+ }
209
+
210
+ static void
211
+ sax_drive_cleanup(SaxDrive dr) {
212
+ if (dr->base_buf != dr->buf) {
213
+ free(dr->buf);
214
+ }
215
+ }
216
+
217
+ static int
218
+ sax_drive_read(SaxDrive dr) {
219
+ int err;
220
+ size_t shift = 0;
221
+
222
+ if (dr->buf < dr->cur) {
223
+ if (0 == dr->str) {
224
+ shift = dr->cur - dr->buf;
225
+ } else {
226
+ shift = dr->str - dr->buf;
227
+ }
228
+ //printf("\n*** shift: %lu\n", shift);
229
+ if (0 == shift) { // no space left so allocate more
230
+ char *old = dr->buf;
231
+ size_t size = dr->buf_end - dr->buf;
232
+
233
+ if (dr->buf == dr->base_buf) {
234
+ if (0 == (dr->buf = (char*)malloc(size * 2))) {
235
+ rb_raise(rb_eNoMemError, "Could not allocate memory for large element.\n");
236
+ }
237
+ memcpy(dr->buf, old, size);
238
+ } else {
239
+ if (0 == (dr->buf = (char*)realloc(dr->buf, size * 2))) {
240
+ rb_raise(rb_eNoMemError, "Could not allocate memory for large element.\n");
241
+ }
242
+ }
243
+ dr->buf_end = dr->buf + size * 2;
244
+ dr->cur = dr->buf + (dr->cur - old);
245
+ dr->read_end = dr->buf + (dr->read_end - old);
246
+ if (0 != dr->str) {
247
+ dr->str = dr->buf + (dr->str - old);
248
+ }
249
+ } else {
250
+ memmove(dr->buf, dr->buf + shift, dr->read_end - (dr->buf + shift));
251
+ dr->cur -= shift;
252
+ dr->read_end -= shift;
253
+ if (0 != dr->str) {
254
+ dr->str -= shift;
255
+ }
256
+ }
257
+ }
258
+ err = dr->read_func(dr);
259
+ *dr->read_end = '\0';
260
+
261
+ return err;
262
+ }
263
+
264
+ static void
265
+ sax_drive_error(SaxDrive dr, const char *msg, int critical) {
266
+ if (dr->has_error) {
267
+ VALUE args[3];
268
+
269
+ args[0] = rb_str_new2(msg);
270
+ args[1] = INT2FIX(dr->line);
271
+ args[2] = INT2FIX(dr->col);
272
+ rb_funcall2(dr->handler, error_id, 3, args);
273
+ } else if (critical) {
274
+ sax_drive_cleanup(dr);
275
+ rb_raise(rb_eSyntaxError, "%s at line %d, column %d\n", msg, dr->line, dr->col);
276
+ }
277
+ }
278
+
279
+ static int
280
+ read_children(SaxDrive dr, int first) {
281
+ int err = 0;
282
+ int element_read = !first;
283
+ int doctype_read = !first;
284
+ char c;
285
+
286
+ while (!err) {
287
+ dr->str = dr->cur; // protect the start
288
+ if ('\0' == (c = next_non_white(dr))) {
289
+ if (!first) {
290
+ sax_drive_error(dr, "invalid format, element not terminated", 1);
291
+ err = 1;
292
+ }
293
+ break; // normal completion if first
294
+ }
295
+ if ('<' != c) {
296
+ if (first) { // all top level entities start with <
297
+ sax_drive_error(dr, "invalid format, expected <", 1);
298
+ break; // unrecoverable
299
+ }
300
+ if (0 != (err = read_text(dr))) { // finished when < is reached
301
+ break;
302
+ }
303
+ }
304
+ dr->str = dr->cur; // protect the start for elements
305
+ c = sax_drive_get(dr);
306
+ switch (c) {
307
+ case '?': // instructions (xml or otherwise)
308
+ if (!first || element_read || doctype_read) {
309
+ sax_drive_error(dr, "invalid format, instruction must come before elements", 0);
310
+ }
311
+ err = read_instruction(dr);
312
+ break;
313
+ case '!': // comment or doctype
314
+ dr->str = dr->cur;
315
+ c = sax_drive_get(dr);
316
+ if ('\0' == c) {
317
+ sax_drive_error(dr, "invalid format, DOCTYPE or comment not terminated", 1);
318
+ err = 1;
319
+ } else if ('-' == c) {
320
+ c = sax_drive_get(dr); // skip first - and get next character
321
+ if ('-' != c) {
322
+ sax_drive_error(dr, "invalid format, bad comment format", 1);
323
+ err = 1;
324
+ } else {
325
+ c = sax_drive_get(dr); // skip second -
326
+ err = read_comment(dr);
327
+ }
328
+ } else {
329
+ int i;
330
+
331
+ for (i = 7; 0 < i; i--) {
332
+ sax_drive_get(dr);
333
+ }
334
+ if (0 == strncmp("DOCTYPE", dr->str, 7)) {
335
+ if (element_read || !first) {
336
+ sax_drive_error(dr, "invalid format, DOCTYPE can not come after an element", 0);
337
+ }
338
+ doctype_read = 1;
339
+ err = read_doctype(dr);
340
+ } else if (0 == strncmp("[CDATA[", dr->str, 7)) {
341
+ err = read_cdata(dr);
342
+ } else {
343
+ sax_drive_error(dr, "invalid format, DOCTYPE or comment expected", 1);
344
+ err = 1;
345
+ }
346
+ }
347
+ break;
348
+ case '/': // element end
349
+ return ('\0' == read_name_token(dr));
350
+ break;
351
+ case '\0':
352
+ sax_drive_error(dr, "invalid format, document not terminated", 1);
353
+ err = 1;
354
+ break;
355
+ default:
356
+ dr->cur--; // safe since no read occurred after getting last character
357
+ if (first && element_read) {
358
+ sax_drive_error(dr, "invalid format, multiple top level elements", 0);
359
+ }
360
+ err = read_element(dr);
361
+ element_read = 1;
362
+ break;
363
+ }
364
+ }
365
+ return err;
366
+ }
367
+
368
+ /* Entered after the "<?" sequence. Ready to read the rest.
369
+ */
370
+ static int
371
+ read_instruction(SaxDrive dr) {
372
+ VALUE target = Qnil;
373
+ VALUE attrs = Qnil;
374
+ char c;
375
+
376
+ if ('\0' == (c = read_name_token(dr))) {
377
+ return -1;
378
+ }
379
+ if (dr->has_instruct) {
380
+ target = rb_str_new2(dr->str);
381
+ }
382
+ if (0 != read_attrs(dr, &attrs, c, '?', '?')) {
383
+ return -1;
384
+ }
385
+ c = next_non_white(dr);
386
+ if ('>' != c) {
387
+ sax_drive_error(dr, "invalid format, instruction not terminated", 1);
388
+ return -1;
389
+ }
390
+ if (0 != dr->has_instruct) {
391
+ VALUE args[2];
392
+
393
+ args[0] = target;
394
+ args[1] = attrs;
395
+ rb_funcall2(dr->handler, instruct_id, 2, args);
396
+ }
397
+ dr->str = 0;
398
+
399
+ return 0;
400
+ }
401
+
402
+ /* Entered after the "<!DOCTYPE" sequence. Ready to read the rest.
403
+ */
404
+ static int
405
+ read_doctype(SaxDrive dr) {
406
+ char c;
407
+
408
+ dr->str = dr->cur - 1; // mark the start
409
+ while ('>' != (c = sax_drive_get(dr))) {
410
+ if ('\0' == c) {
411
+ sax_drive_error(dr, "invalid format, doctype terminated unexpectedly", 1);
412
+ return -1;
413
+ }
414
+ }
415
+ *(dr->cur - 1) = '\0';
416
+ if (dr->has_doctype) {
417
+ VALUE args[1];
418
+
419
+ args[0] = rb_str_new2(dr->str);
420
+ rb_funcall2(dr->handler, doctype_id, 1, args);
421
+ }
422
+ dr->str = 0;
423
+
424
+ return 0;
425
+ }
426
+
427
+ /* Entered after the "<![CDATA[" sequence. Ready to read the rest.
428
+ */
429
+ static int
430
+ read_cdata(SaxDrive dr) {
431
+ char c;
432
+ int end = 0;
433
+
434
+ dr->str = dr->cur - 1; // mark the start
435
+ while (1) {
436
+ c = sax_drive_get(dr);
437
+ if (']' == c) {
438
+ end++;
439
+ } else if ('>' == c) {
440
+ if (2 <= end) {
441
+ *(dr->cur - 3) = '\0';
442
+ break;
443
+ }
444
+ end = 0;
445
+ } else if ('\0' == c) {
446
+ sax_drive_error(dr, "invalid format, cdata terminated unexpectedly", 1);
447
+ return -1;
448
+ } else {
449
+ end = 0;
450
+ }
451
+ }
452
+ if (dr->has_cdata) {
453
+ VALUE args[1];
454
+
455
+ args[0] = rb_str_new2(dr->str);
456
+ #ifdef HAVE_RUBY_ENCODING_H
457
+ if (0 != dr->encoding) {
458
+ rb_enc_associate(args[0], dr->encoding);
459
+ }
460
+ #endif
461
+ rb_funcall2(dr->handler, cdata_id, 1, args);
462
+ }
463
+ dr->str = 0;
464
+
465
+ return 0;
466
+ }
467
+
468
+ /* Entered after the "<!--" sequence. Ready to read the rest.
469
+ */
470
+ static int
471
+ read_comment(SaxDrive dr) {
472
+ char c;
473
+ int end = 0;
474
+
475
+ dr->str = dr->cur - 1; // mark the start
476
+ while (1) {
477
+ c = sax_drive_get(dr);
478
+ if ('-' == c) {
479
+ if (end) {
480
+ *(dr->cur - 2) = '\0';
481
+ break;
482
+ } else {
483
+ end = 1;
484
+ }
485
+ } else if ('\0' == c) {
486
+ sax_drive_error(dr, "invalid format, comment terminated unexpectedly", 1);
487
+ return -1;
488
+ } else {
489
+ end = 0;
490
+ }
491
+ }
492
+ c = sax_drive_get(dr);
493
+ if ('>' != c) {
494
+ sax_drive_error(dr, "invalid format, comment terminated unexpectedly", 1);
495
+ }
496
+ if (dr->has_comment) {
497
+ VALUE args[1];
498
+
499
+ args[0] = rb_str_new2(dr->str);
500
+ #ifdef HAVE_RUBY_ENCODING_H
501
+ if (0 != dr->encoding) {
502
+ rb_enc_associate(args[0], dr->encoding);
503
+ }
504
+ #endif
505
+ rb_funcall2(dr->handler, comment_id, 1, args);
506
+ }
507
+ dr->str = 0;
508
+
509
+ return 0;
510
+ }
511
+
512
+ /* Entered after the '<' and the first character after that. Returns status
513
+ * code.
514
+ */
515
+ static int
516
+ read_element(SaxDrive dr) {
517
+ VALUE name = Qnil;
518
+ VALUE attrs = Qnil;
519
+ char c;
520
+ int closed;
521
+
522
+ if ('\0' == (c = read_name_token(dr))) {
523
+ return -1;
524
+ }
525
+ name = str2sym(dr->str);
526
+ if ('/' == c) {
527
+ closed = 1;
528
+ } else if ('>' == c) {
529
+ closed = 0;
530
+ } else {
531
+ if (0 != read_attrs(dr, &attrs, c, '/', '>')) {
532
+ return -1;
533
+ }
534
+ closed = ('/' == *(dr->cur - 1));
535
+ }
536
+ if (closed) {
537
+ c = next_non_white(dr);
538
+ if ('>' != c) {
539
+ sax_drive_error(dr, "invalid format, element not closed", 1);
540
+ return -1;
541
+ }
542
+ }
543
+ if (0 != dr->has_start_element) {
544
+ VALUE args[2];
545
+
546
+ args[0] = name;
547
+ args[1] = attrs;
548
+ rb_funcall2(dr->handler, start_element_id, 2, args);
549
+ if (closed && dr->has_end_element) {
550
+ rb_funcall2(dr->handler, end_element_id, 1, args);
551
+ }
552
+ }
553
+ if (!closed) {
554
+ if (0 != read_children(dr, 0)) {
555
+ return -1;
556
+ }
557
+ if (0 != strcmp(dr->str, rb_id2name(SYM2ID(name)))) {
558
+ sax_drive_error(dr, "invalid format, element start and end names do not match", 1);
559
+ return -1;
560
+ }
561
+ if (0 != dr->has_end_element) {
562
+ VALUE args[1];
563
+
564
+ args[0] = name;
565
+ rb_funcall2(dr->handler, end_element_id, 1, args);
566
+ }
567
+ }
568
+ dr->str = 0;
569
+
570
+ return 0;
571
+ }
572
+
573
+ static int
574
+ read_text(SaxDrive dr) {
575
+ char c;
576
+
577
+ dr->str = dr->cur - 1; // mark the start
578
+ while ('<' != (c = sax_drive_get(dr))) {
579
+ if ('\0' == c) {
580
+ sax_drive_error(dr, "invalid format, text terminated unexpectedly", 1);
581
+ return -1;
582
+ }
583
+ }
584
+ *(dr->cur - 1) = '\0';
585
+ if (dr->has_text) {
586
+ VALUE args[1];
587
+
588
+ args[0] = rb_str_new2(dr->str);
589
+ #ifdef HAVE_RUBY_ENCODING_H
590
+ if (0 != dr->encoding) {
591
+ rb_enc_associate(args[0], dr->encoding);
592
+ }
593
+ #endif
594
+ rb_funcall2(dr->handler, text_id, 1, args);
595
+ }
596
+ return 0;
597
+ }
598
+
599
+ static int
600
+ read_attrs(SaxDrive dr, VALUE *attrs, char c, char termc, char term2) {
601
+ VALUE name = Qnil;
602
+ int is_encoding = 0;
603
+
604
+ dr->str = dr->cur; // lock it down
605
+ if (is_white(c)) {
606
+ c = next_non_white(dr);
607
+ }
608
+ while (termc != c && term2 != c) {
609
+ dr->cur--;
610
+ if ('\0' == c) {
611
+ sax_drive_error(dr, "invalid format, processing instruction not terminated", 1);
612
+ return -1;
613
+ }
614
+ if ('\0' == (c = read_name_token(dr))) {
615
+ return -1;
616
+ }
617
+ if ('?' == termc && 0 == strcmp("encoding", dr->str)) {
618
+ is_encoding = 1;
619
+ }
620
+ if (dr->has_instruct) {
621
+ name = str2sym(dr->str);
622
+ }
623
+ if (is_white(c)) {
624
+ c = next_non_white(dr);
625
+ }
626
+ if ('=' != c) {
627
+ sax_drive_error(dr, "invalid format, no attribute value", 1);
628
+ return -1;
629
+ }
630
+ if (0 != read_quoted_value(dr)) {
631
+ return -1;
632
+ }
633
+ #ifdef HAVE_RUBY_ENCODING_H
634
+ if (is_encoding) {
635
+ dr->encoding = rb_enc_find(dr->str);
636
+ }
637
+ #endif
638
+ if (dr->has_instruct) {
639
+ VALUE rstr = rb_str_new2(dr->str);
640
+
641
+ if (Qnil == *attrs) {
642
+ *attrs = rb_hash_new();
643
+ }
644
+ #ifdef HAVE_RUBY_ENCODING_H
645
+ if (0 != dr->encoding) {
646
+ rb_enc_associate(rstr, dr->encoding);
647
+ }
648
+ #endif
649
+ rb_hash_aset(*attrs, name, rstr);
650
+ }
651
+ c = next_non_white(dr);
652
+ }
653
+ return 0;
654
+ }
655
+
656
+ static char
657
+ read_name_token(SaxDrive dr) {
658
+ char c;
659
+
660
+ dr->str = dr->cur; // make sure the start doesn't get compacted out
661
+ c = sax_drive_get(dr);
662
+ if (is_white(c)) {
663
+ c = next_non_white(dr);
664
+ dr->str = dr->cur - 1;
665
+ }
666
+ while (1) {
667
+ switch (c) {
668
+ case ' ':
669
+ case '\t':
670
+ case '\f':
671
+ case '?':
672
+ case '=':
673
+ case '/':
674
+ case '>':
675
+ case '\n':
676
+ case '\r':
677
+ *(dr->cur - 1) = '\0';
678
+ return c;
679
+ case '\0':
680
+ // documents never terminate after a name token
681
+ sax_drive_error(dr, "invalid format, document not terminated", 1);
682
+ return '\0';
683
+ default:
684
+ break;
685
+ }
686
+ c = sax_drive_get(dr);
687
+ }
688
+ return '\0';
689
+ }
690
+
691
+ static int
692
+ read_quoted_value(SaxDrive dr) {
693
+ char c;
694
+
695
+ dr->str = dr->cur;
696
+ c = sax_drive_get(dr);
697
+ if (is_white(c)) {
698
+ c = next_non_white(dr);
699
+ }
700
+ if ('"' != c) {
701
+ sax_drive_error(dr, "invalid format, attibute value not in quotes", 1);
702
+ return -1;
703
+ }
704
+ dr->str = dr->cur;
705
+ while ('"' != (c = sax_drive_get(dr))) {
706
+ if ('\0' == c) {
707
+ sax_drive_error(dr, "invalid format, quoted value not terminated", 1);
708
+ return -1;
709
+ }
710
+ }
711
+ *(dr->cur - 1) = '\0'; // terminate value
712
+
713
+ return 0;
714
+ }
715
+
716
+ static int
717
+ read_from_io(SaxDrive dr) {
718
+ int ex = 0;
719
+
720
+ rb_protect(io_cb, (VALUE)dr, &ex);
721
+ // printf("*** io_cb exception = %d\n", ex);
722
+ // An error code of 6 is always returned no matter what kind of Exception is raised.
723
+ return ex;
724
+ }
725
+
726
+ static VALUE
727
+ io_cb(VALUE rdr) {
728
+ SaxDrive dr = (SaxDrive)rdr;
729
+ VALUE args[1];
730
+ VALUE rstr;
731
+ char *str;
732
+ size_t cnt;
733
+
734
+ args[0] = SIZET2NUM(dr->buf_end - dr->cur);
735
+ rstr = rb_funcall2(dr->io, readpartial_id, 1, args);
736
+ str = StringValuePtr(rstr);
737
+ cnt = strlen(str);
738
+ //printf("*** read %lu bytes, str: '%s'\n", cnt, str);
739
+ strcpy(dr->cur, str);
740
+ dr->read_end = dr->cur + cnt;
741
+
742
+ return Qnil;
743
+ }
744
+
745
+ static int
746
+ read_from_fd(SaxDrive dr) {
747
+ ssize_t cnt;
748
+ size_t max = dr->buf_end - dr->cur;
749
+
750
+ cnt = read(dr->fd, dr->cur, max);
751
+ if (cnt < 0) {
752
+ sax_drive_error(dr, "failed to read from file", 1);
753
+ return -1;
754
+ } else if (0 != cnt) {
755
+ dr->read_end = dr->cur + cnt;
756
+ }
757
+ return 0;
758
+ }