ox 2.11.0 → 2.13.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +46 -0
- data/README.md +55 -7
- data/ext/ox/builder.c +13 -7
- data/ext/ox/dump.c +31 -25
- data/ext/ox/extconf.rb +16 -34
- data/ext/ox/gen_load.c +18 -96
- data/ext/ox/hash_load.c +62 -26
- data/ext/ox/obj_load.c +8 -45
- data/ext/ox/ox.c +59 -48
- data/ext/ox/ox.h +33 -38
- data/ext/ox/parse.c +59 -67
- data/ext/ox/sax.c +84 -134
- data/ext/ox/sax.h +2 -4
- data/ext/ox/sax_as.c +2 -6
- data/ext/ox/sax_buf.c +1 -1
- data/ext/ox/special.c +346 -0
- data/ext/ox/special.h +1 -0
- data/lib/ox/element.rb +1 -0
- data/lib/ox/version.rb +1 -1
- metadata +7 -8
- data/ext/ox/encode.h +0 -26
data/ext/ox/ox.c
CHANGED
@@ -127,6 +127,7 @@ static VALUE limited_sym;
|
|
127
127
|
static VALUE margin_sym;
|
128
128
|
static VALUE mode_sym;
|
129
129
|
static VALUE nest_ok_sym;
|
130
|
+
static VALUE no_empty_sym;
|
130
131
|
static VALUE object_sym;
|
131
132
|
static VALUE off_sym;
|
132
133
|
static VALUE opt_format_sym;
|
@@ -153,10 +154,8 @@ static VALUE element_key_mod_sym;
|
|
153
154
|
static ID encoding_id;
|
154
155
|
static ID has_key_id;
|
155
156
|
|
156
|
-
#if
|
157
|
+
#if HAVE_RB_ENC_ASSOCIATE
|
157
158
|
rb_encoding *ox_utf8_encoding = 0;
|
158
|
-
#elif HAS_PRIVATE_ENCODING
|
159
|
-
VALUE ox_utf8_encoding = Qnil;
|
160
159
|
#else
|
161
160
|
void *ox_utf8_encoding = 0;
|
162
161
|
#endif
|
@@ -177,18 +176,15 @@ struct _options ox_default_options = {
|
|
177
176
|
Yes, // sym_keys
|
178
177
|
SpcSkip, // skip
|
179
178
|
No, // smart
|
180
|
-
|
179
|
+
true, // convert_special
|
181
180
|
No, // allow_invalid
|
181
|
+
false, // no_empty
|
182
182
|
{ '\0' }, // inv_repl
|
183
183
|
{ '\0' }, // strip_ns
|
184
184
|
NULL, // html_hints
|
185
185
|
Qnil, // attr_key_mod;
|
186
186
|
Qnil, // element_key_mod;
|
187
|
-
#if HAS_PRIVATE_ENCODING
|
188
|
-
Qnil // rb_enc
|
189
|
-
#else
|
190
187
|
0 // rb_enc
|
191
|
-
#endif
|
192
188
|
};
|
193
189
|
|
194
190
|
extern ParseCallbacks ox_obj_callbacks;
|
@@ -293,6 +289,7 @@ hints_to_overlay(Hints hints) {
|
|
293
289
|
* - _:smart_ [true|false|nil] flag indicating the SAX parser uses hints if available (use with html)
|
294
290
|
* - _:convert_special_ [true|false|nil] flag indicating special characters like < are converted with the SAX parser
|
295
291
|
* - _:invalid_replace_ [nil|String] replacement string for invalid XML characters on dump. nil indicates include anyway as hex. A string, limited to 10 characters will replace the invalid character with the replace.
|
292
|
+
* - _:no_empty_ [true|false|nil] flag indicating there should be no empty elements in a dump
|
296
293
|
* - _:strip_namespace_ [String|true|false] false or "" results in no namespace stripping. A string of "*" or true will strip all namespaces. Any other non-empty string indicates that matching namespaces will be stripped.
|
297
294
|
* - _:overlay_ [Hash] a Hash of keys that match html element names and values that are one of
|
298
295
|
* - _:active_ - make the normal callback for the element
|
@@ -326,6 +323,7 @@ get_def_opts(VALUE self) {
|
|
326
323
|
rb_hash_aset(opts, element_key_mod_sym, ox_default_options.element_key_mod);
|
327
324
|
rb_hash_aset(opts, smart_sym, (Yes == ox_default_options.smart) ? Qtrue : ((No == ox_default_options.smart) ? Qfalse : Qnil));
|
328
325
|
rb_hash_aset(opts, convert_special_sym, (ox_default_options.convert_special) ? Qtrue : Qfalse);
|
326
|
+
rb_hash_aset(opts, no_empty_sym, (ox_default_options.no_empty) ? Qtrue : Qfalse);
|
329
327
|
switch (ox_default_options.mode) {
|
330
328
|
case ObjMode: rb_hash_aset(opts, mode_sym, object_sym); break;
|
331
329
|
case GenMode: rb_hash_aset(opts, mode_sym, generic_sym); break;
|
@@ -466,11 +464,8 @@ set_def_opts(VALUE self, VALUE opts) {
|
|
466
464
|
} else {
|
467
465
|
Check_Type(v, T_STRING);
|
468
466
|
strncpy(ox_default_options.encoding, StringValuePtr(v), sizeof(ox_default_options.encoding) - 1);
|
469
|
-
#if
|
467
|
+
#if HAVE_RB_ENC_FIND
|
470
468
|
ox_default_options.rb_enc = rb_enc_find(ox_default_options.encoding);
|
471
|
-
#elif HAS_PRIVATE_ENCODING
|
472
|
-
ox_default_options.rb_enc = rb_str_new2(ox_default_options.encoding);
|
473
|
-
rb_gc_register_address(&ox_default_options.rb_enc);
|
474
469
|
#endif
|
475
470
|
}
|
476
471
|
|
@@ -542,6 +537,17 @@ set_def_opts(VALUE self, VALUE opts) {
|
|
542
537
|
rb_raise(ox_parse_error_class, ":convert_special must be true or false.\n");
|
543
538
|
}
|
544
539
|
|
540
|
+
v = rb_hash_lookup(opts, no_empty_sym);
|
541
|
+
if (Qnil == v) {
|
542
|
+
// no change
|
543
|
+
} else if (Qtrue == v) {
|
544
|
+
ox_default_options.no_empty = 1;
|
545
|
+
} else if (Qfalse == v) {
|
546
|
+
ox_default_options.no_empty = 0;
|
547
|
+
} else {
|
548
|
+
rb_raise(ox_parse_error_class, ":no_empty must be true or false.\n");
|
549
|
+
}
|
550
|
+
|
545
551
|
v = rb_hash_aref(opts, invalid_replace_sym);
|
546
552
|
if (Qnil == v) {
|
547
553
|
ox_default_options.allow_invalid = Yes;
|
@@ -659,14 +665,14 @@ to_obj(VALUE self, VALUE ruby_xml) {
|
|
659
665
|
xml = ALLOCA_N(char, len);
|
660
666
|
}
|
661
667
|
memcpy(xml, x, len);
|
662
|
-
#
|
668
|
+
#ifdef RB_GC_GUARD
|
663
669
|
rb_gc_disable();
|
664
670
|
#endif
|
665
671
|
obj = ox_parse(xml, len - 1, ox_obj_callbacks, 0, &options, &err);
|
666
672
|
if (SMALL_XML < len) {
|
667
673
|
xfree(xml);
|
668
674
|
}
|
669
|
-
#
|
675
|
+
#ifdef RB_GC_GUARD
|
670
676
|
RB_GC_GUARD(obj);
|
671
677
|
rb_gc_enable();
|
672
678
|
#endif
|
@@ -777,6 +783,9 @@ load(char *xml, size_t len, int argc, VALUE *argv, VALUE self, VALUE encoding, E
|
|
777
783
|
if (Qnil != (v = rb_hash_lookup(h, convert_special_sym))) {
|
778
784
|
options.convert_special = (Qfalse != v);
|
779
785
|
}
|
786
|
+
if (Qnil != (v = rb_hash_lookup(h, no_empty_sym))) {
|
787
|
+
options.no_empty = (Qfalse != v);
|
788
|
+
}
|
780
789
|
|
781
790
|
v = rb_hash_lookup(h, invalid_replace_sym);
|
782
791
|
if (Qnil == v) {
|
@@ -830,7 +839,7 @@ load(char *xml, size_t len, int argc, VALUE *argv, VALUE self, VALUE encoding, E
|
|
830
839
|
options.margin_len = strlen(options.margin);
|
831
840
|
}
|
832
841
|
}
|
833
|
-
#if
|
842
|
+
#if HAVE_RB_ENC_FIND
|
834
843
|
if ('\0' == *options.encoding) {
|
835
844
|
if (Qnil != encoding) {
|
836
845
|
options.rb_enc = rb_enc_from_index(rb_enc_get_index(encoding));
|
@@ -840,26 +849,15 @@ load(char *xml, size_t len, int argc, VALUE *argv, VALUE self, VALUE encoding, E
|
|
840
849
|
} else if (0 == options.rb_enc) {
|
841
850
|
options.rb_enc = rb_enc_find(options.encoding);
|
842
851
|
}
|
843
|
-
#elif HAS_PRIVATE_ENCODING
|
844
|
-
if ('\0' == *options.encoding) {
|
845
|
-
if (Qnil != encoding) {
|
846
|
-
options.rb_enc = encoding;
|
847
|
-
} else {
|
848
|
-
options.rb_enc = Qnil;
|
849
|
-
}
|
850
|
-
} else if (0 == options.rb_enc) {
|
851
|
-
options.rb_enc = rb_str_new2(options.encoding);
|
852
|
-
rb_gc_register_address(&options.rb_enc);
|
853
|
-
}
|
854
852
|
#endif
|
855
853
|
xml = defuse_bom(xml, &options);
|
856
854
|
switch (options.mode) {
|
857
855
|
case ObjMode:
|
858
|
-
#
|
856
|
+
#ifdef RB_GC_GUARD
|
859
857
|
rb_gc_disable();
|
860
858
|
#endif
|
861
859
|
obj = ox_parse(xml, len, ox_obj_callbacks, 0, &options, err);
|
862
|
-
#
|
860
|
+
#ifdef RB_GC_GUARD
|
863
861
|
RB_GC_GUARD(obj);
|
864
862
|
rb_gc_enable();
|
865
863
|
#endif
|
@@ -928,14 +926,8 @@ load_str(int argc, VALUE *argv, VALUE self) {
|
|
928
926
|
} else {
|
929
927
|
xml = ALLOCA_N(char, len);
|
930
928
|
}
|
931
|
-
#if
|
932
|
-
#ifdef MACRUBY_RUBY
|
933
|
-
encoding = rb_funcall(*argv, encoding_id, 0);
|
934
|
-
#else
|
929
|
+
#if HAVE_RB_OBJ_ENCODING
|
935
930
|
encoding = rb_obj_encoding(*argv);
|
936
|
-
#endif
|
937
|
-
#elif HAS_PRIVATE_ENCODING
|
938
|
-
encoding = rb_funcall(*argv, encoding_id, 0);
|
939
931
|
#else
|
940
932
|
encoding = Qnil;
|
941
933
|
#endif
|
@@ -996,7 +988,7 @@ load_file(int argc, VALUE *argv, VALUE self) {
|
|
996
988
|
xml = ALLOCA_N(char, len + 1);
|
997
989
|
}
|
998
990
|
fseek(f, 0, SEEK_SET);
|
999
|
-
if (len != fread(xml, 1, len, f)) {
|
991
|
+
if ((size_t)len != fread(xml, 1, len, f)) {
|
1000
992
|
ox_err_set(&err, rb_eLoadError, "Failed to read %ld bytes from %s.\n", (long)len, path);
|
1001
993
|
obj = Qnil;
|
1002
994
|
} else {
|
@@ -1208,6 +1200,9 @@ parse_dump_options(VALUE ropts, Options copts) {
|
|
1208
1200
|
}
|
1209
1201
|
strncpy(copts->encoding, StringValuePtr(v), sizeof(copts->encoding) - 1);
|
1210
1202
|
}
|
1203
|
+
if (Qnil != (v = rb_hash_lookup(ropts, no_empty_sym))) {
|
1204
|
+
copts->no_empty = (v == Qtrue);
|
1205
|
+
}
|
1211
1206
|
if (Qnil != (v = rb_hash_lookup(ropts, effort_sym))) {
|
1212
1207
|
if (auto_define_sym == v) {
|
1213
1208
|
copts->effort = AutoEffort;
|
@@ -1275,6 +1270,7 @@ parse_dump_options(VALUE ropts, Options copts) {
|
|
1275
1270
|
* - +obj+ [Object] Object to serialize as an XML document String
|
1276
1271
|
* - +options+ [Hash] formating options
|
1277
1272
|
* - *:indent* [Fixnum] format expected
|
1273
|
+
* - *:no_empty* [true|false] if true don't output empty elements
|
1278
1274
|
* - *:xsd_date* [true|false] use XSD date format if true, default: false
|
1279
1275
|
* - *:circular* [true|false] allow circular references, default: false
|
1280
1276
|
* - *:strict|:tolerant]* [ :effort effort to use when an undumpable object (e.g., IO) is encountered, default: :strict
|
@@ -1297,21 +1293,38 @@ dump(int argc, VALUE *argv, VALUE self) {
|
|
1297
1293
|
rb_raise(rb_eNoMemError, "Not enough memory.\n");
|
1298
1294
|
}
|
1299
1295
|
rstr = rb_str_new2(xml);
|
1300
|
-
#if
|
1296
|
+
#if HAVE_RB_ENC_ASSOCIATE
|
1301
1297
|
if ('\0' != *copts.encoding) {
|
1302
1298
|
rb_enc_associate(rstr, rb_enc_find(copts.encoding));
|
1303
1299
|
}
|
1304
|
-
#elif HAS_PRIVATE_ENCODING
|
1305
|
-
if ('\0' != *copts.encoding) {
|
1306
|
-
rb_funcall(rstr, ox_force_encoding_id, 1, rb_str_new2(copts.encoding));
|
1307
|
-
}
|
1308
1300
|
#endif
|
1309
1301
|
xfree(xml);
|
1310
1302
|
|
1311
1303
|
return rstr;
|
1312
1304
|
}
|
1313
1305
|
|
1314
|
-
/* call-seq:
|
1306
|
+
/* call-seq: to_xml(obj, options) => xml-string
|
1307
|
+
*
|
1308
|
+
* Dumps an Object (obj) to a string.
|
1309
|
+
* - +obj+ [Object] Object to serialize as an XML document String
|
1310
|
+
* - +options+ [Hash] formating options
|
1311
|
+
* - *:indent* [Fixnum] format expected
|
1312
|
+
* - *:no_empty* [true|false] if true don't output empty elements
|
1313
|
+
* - *:xsd_date* [true|false] use XSD date format if true, default: false
|
1314
|
+
* - *:circular* [true|false] allow circular references, default: false
|
1315
|
+
* - *:strict|:tolerant]* [ :effort effort to use when an undumpable object (e.g., IO) is encountered, default: :strict
|
1316
|
+
* - _:strict_ - raise an NotImplementedError if an undumpable object is encountered
|
1317
|
+
* - _:tolerant_ - replaces undumplable objects with nil
|
1318
|
+
*
|
1319
|
+
* Note that an indent of less than zero will result in a tight one line output
|
1320
|
+
* unless the text in the XML fields contain new line characters.
|
1321
|
+
*/
|
1322
|
+
static VALUE
|
1323
|
+
to_xml(int argc, VALUE *argv, VALUE self) {
|
1324
|
+
return dump(argc, argv, self);
|
1325
|
+
}
|
1326
|
+
|
1327
|
+
/* call-seq: to_file(file_path, obj, options) => Object
|
1315
1328
|
*
|
1316
1329
|
* Dumps an Object to the specified file.
|
1317
1330
|
* - +file_path+ [String] file path to write the XML document to
|
@@ -1370,7 +1383,7 @@ void Init_ox() {
|
|
1370
1383
|
rb_define_module_function(Ox, "sax_parse", sax_parse, -1);
|
1371
1384
|
rb_define_module_function(Ox, "sax_html", sax_html, -1);
|
1372
1385
|
|
1373
|
-
rb_define_module_function(Ox, "to_xml",
|
1386
|
+
rb_define_module_function(Ox, "to_xml", to_xml, -1);
|
1374
1387
|
rb_define_module_function(Ox, "dump", dump, -1);
|
1375
1388
|
|
1376
1389
|
rb_define_module_function(Ox, "load_file", load_file, -1);
|
@@ -1480,6 +1493,7 @@ void Init_ox() {
|
|
1480
1493
|
margin_sym = ID2SYM(rb_intern("margin")); rb_gc_register_address(&margin_sym);
|
1481
1494
|
mode_sym = ID2SYM(rb_intern("mode")); rb_gc_register_address(&mode_sym);
|
1482
1495
|
nest_ok_sym = ID2SYM(rb_intern("nest_ok")); rb_gc_register_address(&nest_ok_sym);
|
1496
|
+
no_empty_sym = ID2SYM(rb_intern("no_empty")); rb_gc_register_address(&no_empty_sym);
|
1483
1497
|
object_sym = ID2SYM(rb_intern("object")); rb_gc_register_address(&object_sym);
|
1484
1498
|
off_sym = ID2SYM(rb_intern("off")); rb_gc_register_address(&off_sym);
|
1485
1499
|
opt_format_sym = ID2SYM(rb_intern("opt_format")); rb_gc_register_address(&opt_format_sym);
|
@@ -1532,11 +1546,8 @@ void Init_ox() {
|
|
1532
1546
|
rb_define_module_function(Ox, "cache8_test", cache8_test, 0);
|
1533
1547
|
#endif
|
1534
1548
|
|
1535
|
-
#if
|
1549
|
+
#if HAVE_RB_ENC_FIND
|
1536
1550
|
ox_utf8_encoding = rb_enc_find("UTF-8");
|
1537
|
-
#elif HAS_PRIVATE_ENCODING
|
1538
|
-
ox_utf8_encoding = rb_str_new2("UTF-8");
|
1539
|
-
rb_gc_register_address(&ox_utf8_encoding);
|
1540
1551
|
#endif
|
1541
1552
|
}
|
1542
1553
|
|
@@ -1557,7 +1568,7 @@ _ox_raise_error(const char *msg, const char *xml, const char *current, const cha
|
|
1557
1568
|
xline++;
|
1558
1569
|
}
|
1559
1570
|
}
|
1560
|
-
#
|
1571
|
+
#ifdef RB_GC_GUARD
|
1561
1572
|
rb_gc_enable();
|
1562
1573
|
#endif
|
1563
1574
|
rb_raise(ox_parse_error_class, "%s at line %d, column %d [%s:%d]\n", msg, xline, col, file, line);
|
data/ext/ox/ox.h
CHANGED
@@ -16,20 +16,15 @@ extern "C" {
|
|
16
16
|
#define RSTRING_NOT_MODIFIED
|
17
17
|
|
18
18
|
#include "ruby.h"
|
19
|
-
#if
|
19
|
+
#if HAVE_RB_ENC_ASSOCIATE
|
20
20
|
#include "ruby/encoding.h"
|
21
21
|
#endif
|
22
22
|
|
23
|
-
#
|
24
|
-
#
|
25
|
-
enum st_retval {ST_CONTINUE = 0, ST_STOP = 1, ST_DELETE = 2, ST_CHECK};
|
23
|
+
#if HAVE_RUBY_ST_H
|
24
|
+
#include "ruby/st.h"
|
26
25
|
#else
|
27
|
-
|
28
|
-
/* Only on travis, local is where it is for all others. Seems to vary depending on the travis machine picked up. */
|
26
|
+
// Only on travis, local is where it is for all others. Seems to vary depending on the travis machine picked up.
|
29
27
|
#include "st.h"
|
30
|
-
#else
|
31
|
-
#include "ruby/st.h"
|
32
|
-
#endif
|
33
28
|
#endif
|
34
29
|
|
35
30
|
#include "cache.h"
|
@@ -123,49 +118,51 @@ typedef struct _circArray {
|
|
123
118
|
} *CircArray;
|
124
119
|
|
125
120
|
typedef struct _options {
|
126
|
-
char encoding[64];
|
127
|
-
char margin[128];
|
128
|
-
int indent;
|
129
|
-
int trace;
|
130
|
-
char margin_len;
|
131
|
-
char with_dtd;
|
132
|
-
char with_xml;
|
133
|
-
char with_instruct;
|
134
|
-
char circular;
|
135
|
-
char xsd_date;
|
136
|
-
char mode;
|
137
|
-
char effort;
|
138
|
-
char sym_keys;
|
139
|
-
char skip;
|
140
|
-
char smart;
|
141
|
-
char convert_special
|
142
|
-
char allow_invalid;
|
143
|
-
char
|
144
|
-
char
|
145
|
-
|
121
|
+
char encoding[64]; // encoding, stored in the option to avoid GC invalidation in default values
|
122
|
+
char margin[128]; // left margin for dumping
|
123
|
+
int indent; // indention for dump, default 2
|
124
|
+
int trace; // trace level
|
125
|
+
char margin_len; // margin length
|
126
|
+
char with_dtd; // YesNo
|
127
|
+
char with_xml; // YesNo
|
128
|
+
char with_instruct; // YesNo
|
129
|
+
char circular; // YesNo
|
130
|
+
char xsd_date; // YesNo
|
131
|
+
char mode; // LoadMode
|
132
|
+
char effort; // Effort
|
133
|
+
char sym_keys; // symbolize keys
|
134
|
+
char skip; // skip mode
|
135
|
+
char smart; // YesNo sax smart mode
|
136
|
+
char convert_special;// boolean true or false
|
137
|
+
char allow_invalid; // YesNo
|
138
|
+
char no_empty; // boolean - no empty elements when dumping
|
139
|
+
char inv_repl[12]; // max 10 valid characters, first character is the length
|
140
|
+
char strip_ns[64]; // namespace to strip, \0 is no-strip, \* is all, else only matches
|
141
|
+
struct _hints *html_hints; // html hints
|
146
142
|
VALUE attr_key_mod;
|
147
143
|
VALUE element_key_mod;
|
148
|
-
#if
|
144
|
+
#if HAVE_RB_ENC_ASSOCIATE
|
149
145
|
rb_encoding *rb_enc;
|
150
|
-
#elif HAS_PRIVATE_ENCODING
|
151
|
-
VALUE rb_enc;
|
152
146
|
#else
|
153
147
|
void *rb_enc;
|
154
148
|
#endif
|
155
149
|
} *Options;
|
156
150
|
|
157
|
-
|
151
|
+
// parse information structure
|
158
152
|
struct _pInfo {
|
159
153
|
struct _helperStack helpers;
|
160
154
|
struct _err err;
|
161
|
-
char *str; //buffer being read from
|
155
|
+
char *str; // buffer being read from
|
162
156
|
char *end; // end of original string
|
163
157
|
char *s; // current position in buffer
|
164
158
|
VALUE obj;
|
165
159
|
ParseCallbacks pcb;
|
166
160
|
CircArray circ_array;
|
167
|
-
unsigned long id; //set for text types when cirs_array is set
|
161
|
+
unsigned long id; // set for text types when cirs_array is set
|
168
162
|
Options options;
|
163
|
+
VALUE *marked;
|
164
|
+
int mark_size; // allocated size
|
165
|
+
int mark_cnt;
|
169
166
|
char last; // last character read, rarely set
|
170
167
|
};
|
171
168
|
|
@@ -232,10 +229,8 @@ extern ID ox_tv_nsec_id;
|
|
232
229
|
extern ID ox_tv_usec_id;
|
233
230
|
extern ID ox_value_id;
|
234
231
|
|
235
|
-
#if
|
232
|
+
#if HAVE_RB_ENC_ASSOCIATE
|
236
233
|
extern rb_encoding *ox_utf8_encoding;
|
237
|
-
#elif HAS_PRIVATE_ENCODING
|
238
|
-
extern VALUE ox_utf8_encoding;
|
239
234
|
#else
|
240
235
|
extern void *ox_utf8_encoding;
|
241
236
|
#endif
|
data/ext/ox/parse.c
CHANGED
@@ -121,7 +121,7 @@ ox_parse(char *xml, size_t len, ParseCallbacks pcb, char **endp, Options options
|
|
121
121
|
if (DEBUG <= options->trace) {
|
122
122
|
printf("Parsing xml:\n%s\n", xml);
|
123
123
|
}
|
124
|
-
|
124
|
+
// initialize parse info
|
125
125
|
helper_stack_init(&pi.helpers);
|
126
126
|
// Protect against GC
|
127
127
|
wrap = Data_Wrap_Struct(rb_cObject, mark_pi_cb, NULL, &pi);
|
@@ -134,8 +134,11 @@ ox_parse(char *xml, size_t len, ParseCallbacks pcb, char **endp, Options options
|
|
134
134
|
pi.obj = Qnil;
|
135
135
|
pi.circ_array = 0;
|
136
136
|
pi.options = options;
|
137
|
+
pi.marked = NULL;
|
138
|
+
pi.mark_size = 0;
|
139
|
+
pi.mark_cnt = 0;
|
137
140
|
while (1) {
|
138
|
-
next_non_white(&pi);
|
141
|
+
next_non_white(&pi); // skip white space
|
139
142
|
if ('\0' == *pi.s) {
|
140
143
|
break;
|
141
144
|
}
|
@@ -143,31 +146,31 @@ ox_parse(char *xml, size_t len, ParseCallbacks pcb, char **endp, Options options
|
|
143
146
|
*endp = pi.s;
|
144
147
|
break;
|
145
148
|
}
|
146
|
-
if ('<' != *pi.s) {
|
149
|
+
if ('<' != *pi.s) { // all top level entities start with <
|
147
150
|
set_error(err, "invalid format, expected <", pi.str, pi.s);
|
148
151
|
helper_stack_cleanup(&pi.helpers);
|
149
152
|
return Qnil;
|
150
153
|
}
|
151
|
-
pi.s++;
|
154
|
+
pi.s++; // past <
|
152
155
|
switch (*pi.s) {
|
153
|
-
case '?':
|
156
|
+
case '?': // processing instruction
|
154
157
|
pi.s++;
|
155
158
|
read_instruction(&pi);
|
156
159
|
break;
|
157
|
-
case '!':
|
160
|
+
case '!': // comment or doctype
|
158
161
|
pi.s++;
|
159
162
|
if ('\0' == *pi.s) {
|
160
163
|
set_error(err, "invalid format, DOCTYPE or comment not terminated", pi.str, pi.s);
|
161
164
|
helper_stack_cleanup(&pi.helpers);
|
162
165
|
return Qnil;
|
163
166
|
} else if ('-' == *pi.s) {
|
164
|
-
pi.s++;
|
167
|
+
pi.s++; // skip -
|
165
168
|
if ('-' != *pi.s) {
|
166
169
|
set_error(err, "invalid format, bad comment format", pi.str, pi.s);
|
167
170
|
helper_stack_cleanup(&pi.helpers);
|
168
171
|
return Qnil;
|
169
172
|
} else {
|
170
|
-
pi.s++;
|
173
|
+
pi.s++; // skip second -
|
171
174
|
read_comment(&pi);
|
172
175
|
}
|
173
176
|
} else if ((TolerantEffort == options->effort) ? 0 == strncasecmp("DOCTYPE", pi.s, 7) : 0 == strncmp("DOCTYPE", pi.s, 7)) {
|
@@ -229,8 +232,7 @@ gather_content(const char *src, char *content, size_t len) {
|
|
229
232
|
return 0;
|
230
233
|
}
|
231
234
|
|
232
|
-
|
233
|
-
*/
|
235
|
+
// Entered after the "<?" sequence. Ready to read the rest.
|
234
236
|
static void
|
235
237
|
read_instruction(PInfo pi) {
|
236
238
|
char content[1024];
|
@@ -255,7 +257,7 @@ read_instruction(PInfo pi) {
|
|
255
257
|
}
|
256
258
|
next_non_white(pi);
|
257
259
|
c = *pi->s;
|
258
|
-
*end = '\0';
|
260
|
+
*end = '\0'; // terminate name
|
259
261
|
if ('?' != c) {
|
260
262
|
while ('?' != c) {
|
261
263
|
pi->last = 0;
|
@@ -275,8 +277,8 @@ read_instruction(PInfo pi) {
|
|
275
277
|
attrs_ok = 0;
|
276
278
|
break;
|
277
279
|
}
|
278
|
-
*end = '\0';
|
279
|
-
|
280
|
+
*end = '\0'; // terminate name
|
281
|
+
// read value
|
280
282
|
next_non_white(pi);
|
281
283
|
if (0 == (attr_value = read_quoted_value(pi))) {
|
282
284
|
attr_stack_cleanup(&attrs);
|
@@ -355,9 +357,8 @@ read_delimited(PInfo pi, char end) {
|
|
355
357
|
}
|
356
358
|
}
|
357
359
|
|
358
|
-
|
359
|
-
|
360
|
-
*/
|
360
|
+
// Entered after the "<!DOCTYPE" sequence plus the first character after
|
361
|
+
// that. Ready to read the rest.
|
361
362
|
static void
|
362
363
|
read_doctype(PInfo pi) {
|
363
364
|
char *docType;
|
@@ -376,8 +377,7 @@ read_doctype(PInfo pi) {
|
|
376
377
|
}
|
377
378
|
}
|
378
379
|
|
379
|
-
|
380
|
-
*/
|
380
|
+
// Entered after "<!--". Returns error code.
|
381
381
|
static void
|
382
382
|
read_comment(PInfo pi) {
|
383
383
|
char *end;
|
@@ -406,16 +406,15 @@ read_comment(PInfo pi) {
|
|
406
406
|
break;
|
407
407
|
}
|
408
408
|
}
|
409
|
-
*end = '\0';
|
409
|
+
*end = '\0'; // in case the comment was blank
|
410
410
|
pi->s = end + 3;
|
411
411
|
if (0 != pi->pcb->add_comment) {
|
412
412
|
pi->pcb->add_comment(pi, comment);
|
413
413
|
}
|
414
414
|
}
|
415
415
|
|
416
|
-
|
417
|
-
|
418
|
-
*/
|
416
|
+
// Entered after the '<' and the first character after that. Returns stat
|
417
|
+
// code.
|
419
418
|
static char*
|
420
419
|
read_element(PInfo pi) {
|
421
420
|
struct _attrStack attrs;
|
@@ -439,10 +438,9 @@ read_element(PInfo pi) {
|
|
439
438
|
c = *pi->s;
|
440
439
|
*end = '\0';
|
441
440
|
if ('/' == c) {
|
442
|
-
|
441
|
+
// empty element, no attributes and no children
|
443
442
|
pi->s++;
|
444
443
|
if ('>' != *pi->s) {
|
445
|
-
/*printf("*** '%s' ***\n", pi->s); */
|
446
444
|
attr_stack_cleanup(&attrs);
|
447
445
|
set_error(&pi->err, "invalid format, element not closed", pi->str, pi->s);
|
448
446
|
return 0;
|
@@ -480,8 +478,8 @@ read_element(PInfo pi) {
|
|
480
478
|
pi->s++;
|
481
479
|
pi->pcb->add_element(pi, ename, attrs.head, hasChildren);
|
482
480
|
pi->pcb->end_element(pi, ename);
|
483
|
-
|
484
481
|
attr_stack_cleanup(&attrs);
|
482
|
+
|
485
483
|
return 0;
|
486
484
|
case '>':
|
487
485
|
/* has either children or a value */
|
@@ -545,6 +543,12 @@ read_element(PInfo pi) {
|
|
545
543
|
while (!done) {
|
546
544
|
start = pi->s;
|
547
545
|
next_non_white(pi);
|
546
|
+
if (OffSkip == pi->options->skip && start < pi->s && '<' == *pi->s) {
|
547
|
+
c = *pi->s;
|
548
|
+
*pi->s = '\0';
|
549
|
+
pi->pcb->add_text(pi, start, 1);
|
550
|
+
*pi->s = c;
|
551
|
+
}
|
548
552
|
c = *pi->s++;
|
549
553
|
if ('\0' == c) {
|
550
554
|
attr_stack_cleanup(&attrs);
|
@@ -1001,11 +1005,13 @@ read_coded_chars(PInfo pi, char *text) {
|
|
1001
1005
|
char *b, buf[32];
|
1002
1006
|
char *end = buf + sizeof(buf) - 1;
|
1003
1007
|
char *s;
|
1008
|
+
long blen = 0;
|
1004
1009
|
|
1005
1010
|
for (b = buf, s = pi->s; b < end; b++, s++) {
|
1006
1011
|
*b = *s;
|
1007
1012
|
if (';' == *s) {
|
1008
1013
|
*(b + 1) = '\0';
|
1014
|
+
blen = b - buf;
|
1009
1015
|
s++;
|
1010
1016
|
break;
|
1011
1017
|
}
|
@@ -1026,18 +1032,9 @@ read_coded_chars(PInfo pi, char *text) {
|
|
1026
1032
|
} else {
|
1027
1033
|
if (u <= 0x000000000000007FULL) {
|
1028
1034
|
*text++ = (char)u;
|
1029
|
-
#if HAS_PRIVATE_ENCODING
|
1030
|
-
} else if (ox_utf8_encoding == pi->options->rb_enc ||
|
1031
|
-
0 == strcasecmp(rb_str_ptr(rb_String(ox_utf8_encoding)), rb_str_ptr(rb_String(pi->options->rb_enc)))) {
|
1032
|
-
#else
|
1033
1035
|
} else if (ox_utf8_encoding == pi->options->rb_enc) {
|
1034
|
-
#endif
|
1035
1036
|
text = ox_ucs_to_utf8_chars(text, u);
|
1036
|
-
#if HAS_PRIVATE_ENCODING
|
1037
|
-
} else if (Qnil == pi->options->rb_enc) {
|
1038
|
-
#else
|
1039
1037
|
} else if (0 == pi->options->rb_enc) {
|
1040
|
-
#endif
|
1041
1038
|
pi->options->rb_enc = ox_utf8_encoding;
|
1042
1039
|
text = ox_ucs_to_utf8_chars(text, u);
|
1043
1040
|
} else if (TolerantEffort == pi->options->effort) {
|
@@ -1048,30 +1045,20 @@ read_coded_chars(PInfo pi, char *text) {
|
|
1048
1045
|
} else {
|
1049
1046
|
/*set_error(&pi->err, "Invalid encoding, need UTF-8 or UTF-16 encoding to parse &#nnnn; character sequences.", pi->str, pi->s); */
|
1050
1047
|
set_error(&pi->err, "Invalid encoding, need UTF-8 encoding to parse &#nnnn; character sequences.", pi->str, pi->s);
|
1051
|
-
return
|
1048
|
+
return NULL;
|
1052
1049
|
}
|
1053
1050
|
pi->s = s;
|
1054
1051
|
}
|
1055
|
-
} else if (0 == strcasecmp(buf, "nbsp;")) {
|
1056
|
-
pi->s = s;
|
1057
|
-
*text++ = ' ';
|
1058
|
-
} else if (0 == strcasecmp(buf, "lt;")) {
|
1059
|
-
pi->s = s;
|
1060
|
-
*text++ = '<';
|
1061
|
-
} else if (0 == strcasecmp(buf, "gt;")) {
|
1062
|
-
pi->s = s;
|
1063
|
-
*text++ = '>';
|
1064
|
-
} else if (0 == strcasecmp(buf, "amp;")) {
|
1065
|
-
pi->s = s;
|
1066
|
-
*text++ = '&';
|
1067
|
-
} else if (0 == strcasecmp(buf, "quot;")) {
|
1068
|
-
pi->s = s;
|
1069
|
-
*text++ = '"';
|
1070
|
-
} else if (0 == strcasecmp(buf, "apos;")) {
|
1071
|
-
pi->s = s;
|
1072
|
-
*text++ = '\'';
|
1073
1052
|
} else {
|
1074
|
-
*
|
1053
|
+
char *t2;
|
1054
|
+
|
1055
|
+
buf[blen] = '\0';
|
1056
|
+
if (NULL == (t2 = ox_entity_lookup(text, buf))) {
|
1057
|
+
*text++ = '&';
|
1058
|
+
} else {
|
1059
|
+
text = t2;
|
1060
|
+
pi->s = s;
|
1061
|
+
}
|
1075
1062
|
}
|
1076
1063
|
return text;
|
1077
1064
|
}
|
@@ -1113,19 +1100,10 @@ collapse_special(PInfo pi, char *str) {
|
|
1113
1100
|
}
|
1114
1101
|
if (u <= 0x000000000000007FULL) {
|
1115
1102
|
*b++ = (char)u;
|
1116
|
-
#if HAS_PRIVATE_ENCODING
|
1117
|
-
} else if (ox_utf8_encoding == pi->options->rb_enc ||
|
1118
|
-
0 == strcasecmp(rb_str_ptr(rb_String(ox_utf8_encoding)), rb_str_ptr(rb_String(pi->options->rb_enc)))) {
|
1119
|
-
#else
|
1120
1103
|
} else if (ox_utf8_encoding == pi->options->rb_enc) {
|
1121
|
-
#endif
|
1122
1104
|
b = ox_ucs_to_utf8_chars(b, u);
|
1123
1105
|
/* TBD support UTF-16 */
|
1124
|
-
#if HAS_PRIVATE_ENCODING
|
1125
|
-
} else if (Qnil == pi->options->rb_enc) {
|
1126
|
-
#else
|
1127
1106
|
} else if (0 == pi->options->rb_enc) {
|
1128
|
-
#endif
|
1129
1107
|
pi->options->rb_enc = ox_utf8_encoding;
|
1130
1108
|
b = ox_ucs_to_utf8_chars(b, u);
|
1131
1109
|
} else {
|
@@ -1154,16 +1132,30 @@ collapse_special(PInfo pi, char *str) {
|
|
1154
1132
|
*b++ = '&';
|
1155
1133
|
continue;
|
1156
1134
|
} else {
|
1157
|
-
|
1135
|
+
char key[16];
|
1136
|
+
char *k = key;
|
1137
|
+
char *kend = key + sizeof(key) - 1;
|
1138
|
+
|
1139
|
+
*k++ = *s;
|
1158
1140
|
while (';' != *s++) {
|
1159
1141
|
if ('\0' == *s) {
|
1160
1142
|
set_error(&pi->err, "Invalid format, special character does not end with a semicolon", pi->str, pi->s);
|
1161
1143
|
return EDOM;
|
1162
1144
|
}
|
1145
|
+
if (kend <= k) {
|
1146
|
+
k = key;
|
1147
|
+
break;
|
1148
|
+
}
|
1149
|
+
*k++ = *s;
|
1163
1150
|
}
|
1164
|
-
|
1165
|
-
|
1166
|
-
|
1151
|
+
k--;
|
1152
|
+
*k = '\0';
|
1153
|
+
if ('\0' == *key || NULL == (b = ox_entity_lookup(b, key))) {
|
1154
|
+
set_error(&pi->err, "Invalid format, invalid special character sequence", pi->str, pi->s);
|
1155
|
+
c = '?';
|
1156
|
+
return 0;
|
1157
|
+
}
|
1158
|
+
continue;
|
1167
1159
|
}
|
1168
1160
|
*b++ = (char)c;
|
1169
1161
|
}
|