ox 2.11.0 → 2.13.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -127,6 +127,7 @@ static VALUE limited_sym;
127
127
  static VALUE margin_sym;
128
128
  static VALUE mode_sym;
129
129
  static VALUE nest_ok_sym;
130
+ static VALUE no_empty_sym;
130
131
  static VALUE object_sym;
131
132
  static VALUE off_sym;
132
133
  static VALUE opt_format_sym;
@@ -153,10 +154,8 @@ static VALUE element_key_mod_sym;
153
154
  static ID encoding_id;
154
155
  static ID has_key_id;
155
156
 
156
- #if HAS_ENCODING_SUPPORT
157
+ #if HAVE_RB_ENC_ASSOCIATE
157
158
  rb_encoding *ox_utf8_encoding = 0;
158
- #elif HAS_PRIVATE_ENCODING
159
- VALUE ox_utf8_encoding = Qnil;
160
159
  #else
161
160
  void *ox_utf8_encoding = 0;
162
161
  #endif
@@ -177,18 +176,15 @@ struct _options ox_default_options = {
177
176
  Yes, // sym_keys
178
177
  SpcSkip, // skip
179
178
  No, // smart
180
- 1, // convert_special
179
+ true, // convert_special
181
180
  No, // allow_invalid
181
+ false, // no_empty
182
182
  { '\0' }, // inv_repl
183
183
  { '\0' }, // strip_ns
184
184
  NULL, // html_hints
185
185
  Qnil, // attr_key_mod;
186
186
  Qnil, // element_key_mod;
187
- #if HAS_PRIVATE_ENCODING
188
- Qnil // rb_enc
189
- #else
190
187
  0 // rb_enc
191
- #endif
192
188
  };
193
189
 
194
190
  extern ParseCallbacks ox_obj_callbacks;
@@ -293,6 +289,7 @@ hints_to_overlay(Hints hints) {
293
289
  * - _:smart_ [true|false|nil] flag indicating the SAX parser uses hints if available (use with html)
294
290
  * - _:convert_special_ [true|false|nil] flag indicating special characters like < are converted with the SAX parser
295
291
  * - _:invalid_replace_ [nil|String] replacement string for invalid XML characters on dump. nil indicates the invalid characters are included anyway as hex. A string, limited to 10 characters, replaces each invalid character.
292
+ * - _:no_empty_ [true|false|nil] flag indicating there should be no empty elements in a dump
296
293
  * - _:strip_namespace_ [String|true|false] false or "" results in no namespace stripping. A string of "*" or true will strip all namespaces. Any other non-empty string indicates that matching namespaces will be stripped.
297
294
  * - _:overlay_ [Hash] a Hash of keys that match html element names and values that are one of
298
295
  * - _:active_ - make the normal callback for the element
@@ -326,6 +323,7 @@ get_def_opts(VALUE self) {
326
323
  rb_hash_aset(opts, element_key_mod_sym, ox_default_options.element_key_mod);
327
324
  rb_hash_aset(opts, smart_sym, (Yes == ox_default_options.smart) ? Qtrue : ((No == ox_default_options.smart) ? Qfalse : Qnil));
328
325
  rb_hash_aset(opts, convert_special_sym, (ox_default_options.convert_special) ? Qtrue : Qfalse);
326
+ rb_hash_aset(opts, no_empty_sym, (ox_default_options.no_empty) ? Qtrue : Qfalse);
329
327
  switch (ox_default_options.mode) {
330
328
  case ObjMode: rb_hash_aset(opts, mode_sym, object_sym); break;
331
329
  case GenMode: rb_hash_aset(opts, mode_sym, generic_sym); break;
@@ -466,11 +464,8 @@ set_def_opts(VALUE self, VALUE opts) {
466
464
  } else {
467
465
  Check_Type(v, T_STRING);
468
466
  strncpy(ox_default_options.encoding, StringValuePtr(v), sizeof(ox_default_options.encoding) - 1);
469
- #if HAS_ENCODING_SUPPORT
467
+ #if HAVE_RB_ENC_FIND
470
468
  ox_default_options.rb_enc = rb_enc_find(ox_default_options.encoding);
471
- #elif HAS_PRIVATE_ENCODING
472
- ox_default_options.rb_enc = rb_str_new2(ox_default_options.encoding);
473
- rb_gc_register_address(&ox_default_options.rb_enc);
474
469
  #endif
475
470
  }
476
471
 
@@ -542,6 +537,17 @@ set_def_opts(VALUE self, VALUE opts) {
542
537
  rb_raise(ox_parse_error_class, ":convert_special must be true or false.\n");
543
538
  }
544
539
 
540
+ v = rb_hash_lookup(opts, no_empty_sym);
541
+ if (Qnil == v) {
542
+ // no change
543
+ } else if (Qtrue == v) {
544
+ ox_default_options.no_empty = 1;
545
+ } else if (Qfalse == v) {
546
+ ox_default_options.no_empty = 0;
547
+ } else {
548
+ rb_raise(ox_parse_error_class, ":no_empty must be true or false.\n");
549
+ }
550
+
545
551
  v = rb_hash_aref(opts, invalid_replace_sym);
546
552
  if (Qnil == v) {
547
553
  ox_default_options.allow_invalid = Yes;
@@ -659,14 +665,14 @@ to_obj(VALUE self, VALUE ruby_xml) {
659
665
  xml = ALLOCA_N(char, len);
660
666
  }
661
667
  memcpy(xml, x, len);
662
- #if HAS_GC_GUARD
668
+ #ifdef RB_GC_GUARD
663
669
  rb_gc_disable();
664
670
  #endif
665
671
  obj = ox_parse(xml, len - 1, ox_obj_callbacks, 0, &options, &err);
666
672
  if (SMALL_XML < len) {
667
673
  xfree(xml);
668
674
  }
669
- #if HAS_GC_GUARD
675
+ #ifdef RB_GC_GUARD
670
676
  RB_GC_GUARD(obj);
671
677
  rb_gc_enable();
672
678
  #endif
@@ -777,6 +783,9 @@ load(char *xml, size_t len, int argc, VALUE *argv, VALUE self, VALUE encoding, E
777
783
  if (Qnil != (v = rb_hash_lookup(h, convert_special_sym))) {
778
784
  options.convert_special = (Qfalse != v);
779
785
  }
786
+ if (Qnil != (v = rb_hash_lookup(h, no_empty_sym))) {
787
+ options.no_empty = (Qfalse != v);
788
+ }
780
789
 
781
790
  v = rb_hash_lookup(h, invalid_replace_sym);
782
791
  if (Qnil == v) {
@@ -830,7 +839,7 @@ load(char *xml, size_t len, int argc, VALUE *argv, VALUE self, VALUE encoding, E
830
839
  options.margin_len = strlen(options.margin);
831
840
  }
832
841
  }
833
- #if HAS_ENCODING_SUPPORT
842
+ #if HAVE_RB_ENC_FIND
834
843
  if ('\0' == *options.encoding) {
835
844
  if (Qnil != encoding) {
836
845
  options.rb_enc = rb_enc_from_index(rb_enc_get_index(encoding));
@@ -840,26 +849,15 @@ load(char *xml, size_t len, int argc, VALUE *argv, VALUE self, VALUE encoding, E
840
849
  } else if (0 == options.rb_enc) {
841
850
  options.rb_enc = rb_enc_find(options.encoding);
842
851
  }
843
- #elif HAS_PRIVATE_ENCODING
844
- if ('\0' == *options.encoding) {
845
- if (Qnil != encoding) {
846
- options.rb_enc = encoding;
847
- } else {
848
- options.rb_enc = Qnil;
849
- }
850
- } else if (0 == options.rb_enc) {
851
- options.rb_enc = rb_str_new2(options.encoding);
852
- rb_gc_register_address(&options.rb_enc);
853
- }
854
852
  #endif
855
853
  xml = defuse_bom(xml, &options);
856
854
  switch (options.mode) {
857
855
  case ObjMode:
858
- #if HAS_GC_GUARD
856
+ #ifdef RB_GC_GUARD
859
857
  rb_gc_disable();
860
858
  #endif
861
859
  obj = ox_parse(xml, len, ox_obj_callbacks, 0, &options, err);
862
- #if HAS_GC_GUARD
860
+ #ifdef RB_GC_GUARD
863
861
  RB_GC_GUARD(obj);
864
862
  rb_gc_enable();
865
863
  #endif
@@ -928,14 +926,8 @@ load_str(int argc, VALUE *argv, VALUE self) {
928
926
  } else {
929
927
  xml = ALLOCA_N(char, len);
930
928
  }
931
- #if HAS_ENCODING_SUPPORT
932
- #ifdef MACRUBY_RUBY
933
- encoding = rb_funcall(*argv, encoding_id, 0);
934
- #else
929
+ #if HAVE_RB_OBJ_ENCODING
935
930
  encoding = rb_obj_encoding(*argv);
936
- #endif
937
- #elif HAS_PRIVATE_ENCODING
938
- encoding = rb_funcall(*argv, encoding_id, 0);
939
931
  #else
940
932
  encoding = Qnil;
941
933
  #endif
@@ -996,7 +988,7 @@ load_file(int argc, VALUE *argv, VALUE self) {
996
988
  xml = ALLOCA_N(char, len + 1);
997
989
  }
998
990
  fseek(f, 0, SEEK_SET);
999
- if (len != fread(xml, 1, len, f)) {
991
+ if ((size_t)len != fread(xml, 1, len, f)) {
1000
992
  ox_err_set(&err, rb_eLoadError, "Failed to read %ld bytes from %s.\n", (long)len, path);
1001
993
  obj = Qnil;
1002
994
  } else {
@@ -1208,6 +1200,9 @@ parse_dump_options(VALUE ropts, Options copts) {
1208
1200
  }
1209
1201
  strncpy(copts->encoding, StringValuePtr(v), sizeof(copts->encoding) - 1);
1210
1202
  }
1203
+ if (Qnil != (v = rb_hash_lookup(ropts, no_empty_sym))) {
1204
+ copts->no_empty = (v == Qtrue);
1205
+ }
1211
1206
  if (Qnil != (v = rb_hash_lookup(ropts, effort_sym))) {
1212
1207
  if (auto_define_sym == v) {
1213
1208
  copts->effort = AutoEffort;
@@ -1275,6 +1270,7 @@ parse_dump_options(VALUE ropts, Options copts) {
1275
1270
  * - +obj+ [Object] Object to serialize as an XML document String
1276
1271
  * - +options+ [Hash] formatting options
1277
1272
  * - *:indent* [Fixnum] format expected
1273
+ * - *:no_empty* [true|false] if true don't output empty elements
1278
1274
  * - *:xsd_date* [true|false] use XSD date format if true, default: false
1279
1275
  * - *:circular* [true|false] allow circular references, default: false
1280
1276
  * - *:effort* [:strict|:tolerant] effort to use when an undumpable object (e.g., IO) is encountered, default: :strict
@@ -1297,21 +1293,38 @@ dump(int argc, VALUE *argv, VALUE self) {
1297
1293
  rb_raise(rb_eNoMemError, "Not enough memory.\n");
1298
1294
  }
1299
1295
  rstr = rb_str_new2(xml);
1300
- #if HAS_ENCODING_SUPPORT
1296
+ #if HAVE_RB_ENC_ASSOCIATE
1301
1297
  if ('\0' != *copts.encoding) {
1302
1298
  rb_enc_associate(rstr, rb_enc_find(copts.encoding));
1303
1299
  }
1304
- #elif HAS_PRIVATE_ENCODING
1305
- if ('\0' != *copts.encoding) {
1306
- rb_funcall(rstr, ox_force_encoding_id, 1, rb_str_new2(copts.encoding));
1307
- }
1308
1300
  #endif
1309
1301
  xfree(xml);
1310
1302
 
1311
1303
  return rstr;
1312
1304
  }
1313
1305
 
1314
- /* call-seq: to_file(file_path, obj, options)
1306
+ /* call-seq: to_xml(obj, options) => xml-string
1307
+ *
1308
+ * Dumps an Object (obj) to a string.
1309
+ * - +obj+ [Object] Object to serialize as an XML document String
1310
+ * - +options+ [Hash] formatting options
1311
+ * - *:indent* [Fixnum] format expected
1312
+ * - *:no_empty* [true|false] if true don't output empty elements
1313
+ * - *:xsd_date* [true|false] use XSD date format if true, default: false
1314
+ * - *:circular* [true|false] allow circular references, default: false
1315
+ * - *:effort* [:strict|:tolerant] effort to use when an undumpable object (e.g., IO) is encountered, default: :strict
1316
+ * - _:strict_ - raise a NotImplementedError if an undumpable object is encountered
1317
+ * - _:tolerant_ - replaces undumpable objects with nil
1318
+ *
1319
+ * Note that an indent of less than zero will result in a tight one line output
1320
+ * unless the text in the XML fields contain new line characters.
1321
+ */
1322
+ static VALUE
1323
+ to_xml(int argc, VALUE *argv, VALUE self) {
1324
+ return dump(argc, argv, self);
1325
+ }
1326
+
1327
+ /* call-seq: to_file(file_path, obj, options) => Object
1315
1328
  *
1316
1329
  * Dumps an Object to the specified file.
1317
1330
  * - +file_path+ [String] file path to write the XML document to
@@ -1370,7 +1383,7 @@ void Init_ox() {
1370
1383
  rb_define_module_function(Ox, "sax_parse", sax_parse, -1);
1371
1384
  rb_define_module_function(Ox, "sax_html", sax_html, -1);
1372
1385
 
1373
- rb_define_module_function(Ox, "to_xml", dump, -1);
1386
+ rb_define_module_function(Ox, "to_xml", to_xml, -1);
1374
1387
  rb_define_module_function(Ox, "dump", dump, -1);
1375
1388
 
1376
1389
  rb_define_module_function(Ox, "load_file", load_file, -1);
@@ -1480,6 +1493,7 @@ void Init_ox() {
1480
1493
  margin_sym = ID2SYM(rb_intern("margin")); rb_gc_register_address(&margin_sym);
1481
1494
  mode_sym = ID2SYM(rb_intern("mode")); rb_gc_register_address(&mode_sym);
1482
1495
  nest_ok_sym = ID2SYM(rb_intern("nest_ok")); rb_gc_register_address(&nest_ok_sym);
1496
+ no_empty_sym = ID2SYM(rb_intern("no_empty")); rb_gc_register_address(&no_empty_sym);
1483
1497
  object_sym = ID2SYM(rb_intern("object")); rb_gc_register_address(&object_sym);
1484
1498
  off_sym = ID2SYM(rb_intern("off")); rb_gc_register_address(&off_sym);
1485
1499
  opt_format_sym = ID2SYM(rb_intern("opt_format")); rb_gc_register_address(&opt_format_sym);
@@ -1532,11 +1546,8 @@ void Init_ox() {
1532
1546
  rb_define_module_function(Ox, "cache8_test", cache8_test, 0);
1533
1547
  #endif
1534
1548
 
1535
- #if HAS_ENCODING_SUPPORT
1549
+ #if HAVE_RB_ENC_FIND
1536
1550
  ox_utf8_encoding = rb_enc_find("UTF-8");
1537
- #elif HAS_PRIVATE_ENCODING
1538
- ox_utf8_encoding = rb_str_new2("UTF-8");
1539
- rb_gc_register_address(&ox_utf8_encoding);
1540
1551
  #endif
1541
1552
  }
1542
1553
 
@@ -1557,7 +1568,7 @@ _ox_raise_error(const char *msg, const char *xml, const char *current, const cha
1557
1568
  xline++;
1558
1569
  }
1559
1570
  }
1560
- #if HAS_GC_GUARD
1571
+ #ifdef RB_GC_GUARD
1561
1572
  rb_gc_enable();
1562
1573
  #endif
1563
1574
  rb_raise(ox_parse_error_class, "%s at line %d, column %d [%s:%d]\n", msg, xline, col, file, line);
@@ -16,20 +16,15 @@ extern "C" {
16
16
  #define RSTRING_NOT_MODIFIED
17
17
 
18
18
  #include "ruby.h"
19
- #if HAS_ENCODING_SUPPORT
19
+ #if HAVE_RB_ENC_ASSOCIATE
20
20
  #include "ruby/encoding.h"
21
21
  #endif
22
22
 
23
- #ifdef RUBINIUS_RUBY
24
- #undef T_COMPLEX
25
- enum st_retval {ST_CONTINUE = 0, ST_STOP = 1, ST_DELETE = 2, ST_CHECK};
23
+ #if HAVE_RUBY_ST_H
24
+ #include "ruby/st.h"
26
25
  #else
27
- #if HAS_TOP_LEVEL_ST_H
28
- /* Only on travis, local is where it is for all others. Seems to vary depending on the travis machine picked up. */
26
+ // Only on travis, local is where it is for all others. Seems to vary depending on the travis machine picked up.
29
27
  #include "st.h"
30
- #else
31
- #include "ruby/st.h"
32
- #endif
33
28
  #endif
34
29
 
35
30
  #include "cache.h"
@@ -123,49 +118,51 @@ typedef struct _circArray {
123
118
  } *CircArray;
124
119
 
125
120
  typedef struct _options {
126
- char encoding[64]; /* encoding, stored in the option to avoid GC invalidation in default values */
127
- char margin[128]; /* left margin for dumping */
128
- int indent; /* indention for dump, default 2 */
129
- int trace; /* trace level */
130
- char margin_len; /* margin length */
131
- char with_dtd; /* YesNo */
132
- char with_xml; /* YesNo */
133
- char with_instruct; /* YesNo */
134
- char circular; /* YesNo */
135
- char xsd_date; /* YesNo */
136
- char mode; /* LoadMode */
137
- char effort; /* Effort */
138
- char sym_keys; /* symbolize keys */
139
- char skip; /* skip mode */
140
- char smart; /* YesNo sax smart mode */
141
- char convert_special;/* boolean true or false */
142
- char allow_invalid; /* YesNo */
143
- char inv_repl[12]; /* max 10 valid characters, first character is the length */
144
- char strip_ns[64]; /* namespace to strip, \0 is no-strip, \* is all, else only matches */
145
- struct _hints *html_hints; /* html hints */
121
+ char encoding[64]; // encoding, stored in the option to avoid GC invalidation in default values
122
+ char margin[128]; // left margin for dumping
123
+ int indent; // indentation for dump, default 2
124
+ int trace; // trace level
125
+ char margin_len; // margin length
126
+ char with_dtd; // YesNo
127
+ char with_xml; // YesNo
128
+ char with_instruct; // YesNo
129
+ char circular; // YesNo
130
+ char xsd_date; // YesNo
131
+ char mode; // LoadMode
132
+ char effort; // Effort
133
+ char sym_keys; // symbolize keys
134
+ char skip; // skip mode
135
+ char smart; // YesNo sax smart mode
136
+ char convert_special;// boolean true or false
137
+ char allow_invalid; // YesNo
138
+ char no_empty; // boolean - no empty elements when dumping
139
+ char inv_repl[12]; // max 10 valid characters, first character is the length
140
+ char strip_ns[64]; // namespace to strip, \0 is no-strip, \* is all, else only matches
141
+ struct _hints *html_hints; // html hints
146
142
  VALUE attr_key_mod;
147
143
  VALUE element_key_mod;
148
- #if HAS_ENCODING_SUPPORT
144
+ #if HAVE_RB_ENC_ASSOCIATE
149
145
  rb_encoding *rb_enc;
150
- #elif HAS_PRIVATE_ENCODING
151
- VALUE rb_enc;
152
146
  #else
153
147
  void *rb_enc;
154
148
  #endif
155
149
  } *Options;
156
150
 
157
- /* parse information structure */
151
+ // parse information structure
158
152
  struct _pInfo {
159
153
  struct _helperStack helpers;
160
154
  struct _err err;
161
- char *str; //buffer being read from
155
+ char *str; // buffer being read from
162
156
  char *end; // end of original string
163
157
  char *s; // current position in buffer
164
158
  VALUE obj;
165
159
  ParseCallbacks pcb;
166
160
  CircArray circ_array;
167
- unsigned long id; //set for text types when cirs_array is set
161
+ unsigned long id; // set for text types when circ_array is set
168
162
  Options options;
163
+ VALUE *marked;
164
+ int mark_size; // allocated size
165
+ int mark_cnt;
169
166
  char last; // last character read, rarely set
170
167
  };
171
168
 
@@ -232,10 +229,8 @@ extern ID ox_tv_nsec_id;
232
229
  extern ID ox_tv_usec_id;
233
230
  extern ID ox_value_id;
234
231
 
235
- #if HAS_ENCODING_SUPPORT
232
+ #if HAVE_RB_ENC_ASSOCIATE
236
233
  extern rb_encoding *ox_utf8_encoding;
237
- #elif HAS_PRIVATE_ENCODING
238
- extern VALUE ox_utf8_encoding;
239
234
  #else
240
235
  extern void *ox_utf8_encoding;
241
236
  #endif
@@ -121,7 +121,7 @@ ox_parse(char *xml, size_t len, ParseCallbacks pcb, char **endp, Options options
121
121
  if (DEBUG <= options->trace) {
122
122
  printf("Parsing xml:\n%s\n", xml);
123
123
  }
124
- /* initialize parse info */
124
+ // initialize parse info
125
125
  helper_stack_init(&pi.helpers);
126
126
  // Protect against GC
127
127
  wrap = Data_Wrap_Struct(rb_cObject, mark_pi_cb, NULL, &pi);
@@ -134,8 +134,11 @@ ox_parse(char *xml, size_t len, ParseCallbacks pcb, char **endp, Options options
134
134
  pi.obj = Qnil;
135
135
  pi.circ_array = 0;
136
136
  pi.options = options;
137
+ pi.marked = NULL;
138
+ pi.mark_size = 0;
139
+ pi.mark_cnt = 0;
137
140
  while (1) {
138
- next_non_white(&pi); /* skip white space */
141
+ next_non_white(&pi); // skip white space
139
142
  if ('\0' == *pi.s) {
140
143
  break;
141
144
  }
@@ -143,31 +146,31 @@ ox_parse(char *xml, size_t len, ParseCallbacks pcb, char **endp, Options options
143
146
  *endp = pi.s;
144
147
  break;
145
148
  }
146
- if ('<' != *pi.s) { /* all top level entities start with < */
149
+ if ('<' != *pi.s) { // all top level entities start with <
147
150
  set_error(err, "invalid format, expected <", pi.str, pi.s);
148
151
  helper_stack_cleanup(&pi.helpers);
149
152
  return Qnil;
150
153
  }
151
- pi.s++; /* past < */
154
+ pi.s++; // past <
152
155
  switch (*pi.s) {
153
- case '?': /* processing instruction */
156
+ case '?': // processing instruction
154
157
  pi.s++;
155
158
  read_instruction(&pi);
156
159
  break;
157
- case '!': /* comment or doctype */
160
+ case '!': // comment or doctype
158
161
  pi.s++;
159
162
  if ('\0' == *pi.s) {
160
163
  set_error(err, "invalid format, DOCTYPE or comment not terminated", pi.str, pi.s);
161
164
  helper_stack_cleanup(&pi.helpers);
162
165
  return Qnil;
163
166
  } else if ('-' == *pi.s) {
164
- pi.s++; /* skip - */
167
+ pi.s++; // skip -
165
168
  if ('-' != *pi.s) {
166
169
  set_error(err, "invalid format, bad comment format", pi.str, pi.s);
167
170
  helper_stack_cleanup(&pi.helpers);
168
171
  return Qnil;
169
172
  } else {
170
- pi.s++; /* skip second - */
173
+ pi.s++; // skip second -
171
174
  read_comment(&pi);
172
175
  }
173
176
  } else if ((TolerantEffort == options->effort) ? 0 == strncasecmp("DOCTYPE", pi.s, 7) : 0 == strncmp("DOCTYPE", pi.s, 7)) {
@@ -229,8 +232,7 @@ gather_content(const char *src, char *content, size_t len) {
229
232
  return 0;
230
233
  }
231
234
 
232
- /* Entered after the "<?" sequence. Ready to read the rest.
233
- */
235
+ // Entered after the "<?" sequence. Ready to read the rest.
234
236
  static void
235
237
  read_instruction(PInfo pi) {
236
238
  char content[1024];
@@ -255,7 +257,7 @@ read_instruction(PInfo pi) {
255
257
  }
256
258
  next_non_white(pi);
257
259
  c = *pi->s;
258
- *end = '\0'; /* terminate name */
260
+ *end = '\0'; // terminate name
259
261
  if ('?' != c) {
260
262
  while ('?' != c) {
261
263
  pi->last = 0;
@@ -275,8 +277,8 @@ read_instruction(PInfo pi) {
275
277
  attrs_ok = 0;
276
278
  break;
277
279
  }
278
- *end = '\0'; /* terminate name */
279
- /* read value */
280
+ *end = '\0'; // terminate name
281
+ // read value
280
282
  next_non_white(pi);
281
283
  if (0 == (attr_value = read_quoted_value(pi))) {
282
284
  attr_stack_cleanup(&attrs);
@@ -355,9 +357,8 @@ read_delimited(PInfo pi, char end) {
355
357
  }
356
358
  }
357
359
 
358
- /* Entered after the "<!DOCTYPE" sequence plus the first character after
359
- * that. Ready to read the rest.
360
- */
360
+ // Entered after the "<!DOCTYPE" sequence plus the first character after
361
+ // that. Ready to read the rest.
361
362
  static void
362
363
  read_doctype(PInfo pi) {
363
364
  char *docType;
@@ -376,8 +377,7 @@ read_doctype(PInfo pi) {
376
377
  }
377
378
  }
378
379
 
379
- /* Entered after "<!--". Returns error code.
380
- */
380
+ // Entered after "<!--". Returns error code.
381
381
  static void
382
382
  read_comment(PInfo pi) {
383
383
  char *end;
@@ -406,16 +406,15 @@ read_comment(PInfo pi) {
406
406
  break;
407
407
  }
408
408
  }
409
- *end = '\0'; /* in case the comment was blank */
409
+ *end = '\0'; // in case the comment was blank
410
410
  pi->s = end + 3;
411
411
  if (0 != pi->pcb->add_comment) {
412
412
  pi->pcb->add_comment(pi, comment);
413
413
  }
414
414
  }
415
415
 
416
- /* Entered after the '<' and the first character after that. Returns status
417
- * code.
418
- */
416
+ // Entered after the '<' and the first character after that. Returns status
417
+ // code.
419
418
  static char*
420
419
  read_element(PInfo pi) {
421
420
  struct _attrStack attrs;
@@ -439,10 +438,9 @@ read_element(PInfo pi) {
439
438
  c = *pi->s;
440
439
  *end = '\0';
441
440
  if ('/' == c) {
442
- /* empty element, no attributes and no children */
441
+ // empty element, no attributes and no children
443
442
  pi->s++;
444
443
  if ('>' != *pi->s) {
445
- /*printf("*** '%s' ***\n", pi->s); */
446
444
  attr_stack_cleanup(&attrs);
447
445
  set_error(&pi->err, "invalid format, element not closed", pi->str, pi->s);
448
446
  return 0;
@@ -480,8 +478,8 @@ read_element(PInfo pi) {
480
478
  pi->s++;
481
479
  pi->pcb->add_element(pi, ename, attrs.head, hasChildren);
482
480
  pi->pcb->end_element(pi, ename);
483
-
484
481
  attr_stack_cleanup(&attrs);
482
+
485
483
  return 0;
486
484
  case '>':
487
485
  /* has either children or a value */
@@ -545,6 +543,12 @@ read_element(PInfo pi) {
545
543
  while (!done) {
546
544
  start = pi->s;
547
545
  next_non_white(pi);
546
+ if (OffSkip == pi->options->skip && start < pi->s && '<' == *pi->s) {
547
+ c = *pi->s;
548
+ *pi->s = '\0';
549
+ pi->pcb->add_text(pi, start, 1);
550
+ *pi->s = c;
551
+ }
548
552
  c = *pi->s++;
549
553
  if ('\0' == c) {
550
554
  attr_stack_cleanup(&attrs);
@@ -1001,11 +1005,13 @@ read_coded_chars(PInfo pi, char *text) {
1001
1005
  char *b, buf[32];
1002
1006
  char *end = buf + sizeof(buf) - 1;
1003
1007
  char *s;
1008
+ long blen = 0;
1004
1009
 
1005
1010
  for (b = buf, s = pi->s; b < end; b++, s++) {
1006
1011
  *b = *s;
1007
1012
  if (';' == *s) {
1008
1013
  *(b + 1) = '\0';
1014
+ blen = b - buf;
1009
1015
  s++;
1010
1016
  break;
1011
1017
  }
@@ -1026,18 +1032,9 @@ read_coded_chars(PInfo pi, char *text) {
1026
1032
  } else {
1027
1033
  if (u <= 0x000000000000007FULL) {
1028
1034
  *text++ = (char)u;
1029
- #if HAS_PRIVATE_ENCODING
1030
- } else if (ox_utf8_encoding == pi->options->rb_enc ||
1031
- 0 == strcasecmp(rb_str_ptr(rb_String(ox_utf8_encoding)), rb_str_ptr(rb_String(pi->options->rb_enc)))) {
1032
- #else
1033
1035
  } else if (ox_utf8_encoding == pi->options->rb_enc) {
1034
- #endif
1035
1036
  text = ox_ucs_to_utf8_chars(text, u);
1036
- #if HAS_PRIVATE_ENCODING
1037
- } else if (Qnil == pi->options->rb_enc) {
1038
- #else
1039
1037
  } else if (0 == pi->options->rb_enc) {
1040
- #endif
1041
1038
  pi->options->rb_enc = ox_utf8_encoding;
1042
1039
  text = ox_ucs_to_utf8_chars(text, u);
1043
1040
  } else if (TolerantEffort == pi->options->effort) {
@@ -1048,30 +1045,20 @@ read_coded_chars(PInfo pi, char *text) {
1048
1045
  } else {
1049
1046
  /*set_error(&pi->err, "Invalid encoding, need UTF-8 or UTF-16 encoding to parse &#nnnn; character sequences.", pi->str, pi->s); */
1050
1047
  set_error(&pi->err, "Invalid encoding, need UTF-8 encoding to parse &#nnnn; character sequences.", pi->str, pi->s);
1051
- return 0;
1048
+ return NULL;
1052
1049
  }
1053
1050
  pi->s = s;
1054
1051
  }
1055
- } else if (0 == strcasecmp(buf, "nbsp;")) {
1056
- pi->s = s;
1057
- *text++ = ' ';
1058
- } else if (0 == strcasecmp(buf, "lt;")) {
1059
- pi->s = s;
1060
- *text++ = '<';
1061
- } else if (0 == strcasecmp(buf, "gt;")) {
1062
- pi->s = s;
1063
- *text++ = '>';
1064
- } else if (0 == strcasecmp(buf, "amp;")) {
1065
- pi->s = s;
1066
- *text++ = '&';
1067
- } else if (0 == strcasecmp(buf, "quot;")) {
1068
- pi->s = s;
1069
- *text++ = '"';
1070
- } else if (0 == strcasecmp(buf, "apos;")) {
1071
- pi->s = s;
1072
- *text++ = '\'';
1073
1052
  } else {
1074
- *text++ = '&';
1053
+ char *t2;
1054
+
1055
+ buf[blen] = '\0';
1056
+ if (NULL == (t2 = ox_entity_lookup(text, buf))) {
1057
+ *text++ = '&';
1058
+ } else {
1059
+ text = t2;
1060
+ pi->s = s;
1061
+ }
1075
1062
  }
1076
1063
  return text;
1077
1064
  }
@@ -1113,19 +1100,10 @@ collapse_special(PInfo pi, char *str) {
1113
1100
  }
1114
1101
  if (u <= 0x000000000000007FULL) {
1115
1102
  *b++ = (char)u;
1116
- #if HAS_PRIVATE_ENCODING
1117
- } else if (ox_utf8_encoding == pi->options->rb_enc ||
1118
- 0 == strcasecmp(rb_str_ptr(rb_String(ox_utf8_encoding)), rb_str_ptr(rb_String(pi->options->rb_enc)))) {
1119
- #else
1120
1103
  } else if (ox_utf8_encoding == pi->options->rb_enc) {
1121
- #endif
1122
1104
  b = ox_ucs_to_utf8_chars(b, u);
1123
1105
  /* TBD support UTF-16 */
1124
- #if HAS_PRIVATE_ENCODING
1125
- } else if (Qnil == pi->options->rb_enc) {
1126
- #else
1127
1106
  } else if (0 == pi->options->rb_enc) {
1128
- #endif
1129
1107
  pi->options->rb_enc = ox_utf8_encoding;
1130
1108
  b = ox_ucs_to_utf8_chars(b, u);
1131
1109
  } else {
@@ -1154,16 +1132,30 @@ collapse_special(PInfo pi, char *str) {
1154
1132
  *b++ = '&';
1155
1133
  continue;
1156
1134
  } else {
1157
- c = '?';
1135
+ char key[16];
1136
+ char *k = key;
1137
+ char *kend = key + sizeof(key) - 1;
1138
+
1139
+ *k++ = *s;
1158
1140
  while (';' != *s++) {
1159
1141
  if ('\0' == *s) {
1160
1142
  set_error(&pi->err, "Invalid format, special character does not end with a semicolon", pi->str, pi->s);
1161
1143
  return EDOM;
1162
1144
  }
1145
+ if (kend <= k) {
1146
+ k = key;
1147
+ break;
1148
+ }
1149
+ *k++ = *s;
1163
1150
  }
1164
- s++;
1165
- set_error(&pi->err, "Invalid format, invalid special character sequence", pi->str, pi->s);
1166
- return 0;
1151
+ k--;
1152
+ *k = '\0';
1153
+ if ('\0' == *key || NULL == (b = ox_entity_lookup(b, key))) {
1154
+ set_error(&pi->err, "Invalid format, invalid special character sequence", pi->str, pi->s);
1155
+ c = '?';
1156
+ return 0;
1157
+ }
1158
+ continue;
1167
1159
  }
1168
1160
  *b++ = (char)c;
1169
1161
  }