ox 2.12.0 → 2.13.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -16,20 +16,15 @@ extern "C" {
16
16
  #define RSTRING_NOT_MODIFIED
17
17
 
18
18
  #include "ruby.h"
19
- #if HAS_ENCODING_SUPPORT
19
+ #if HAVE_RB_ENC_ASSOCIATE
20
20
  #include "ruby/encoding.h"
21
21
  #endif
22
22
 
23
- #ifdef RUBINIUS_RUBY
24
- #undef T_COMPLEX
25
- enum st_retval {ST_CONTINUE = 0, ST_STOP = 1, ST_DELETE = 2, ST_CHECK};
23
+ #if HAVE_RUBY_ST_H
24
+ #include "ruby/st.h"
26
25
  #else
27
- #if HAS_TOP_LEVEL_ST_H
28
- /* Only on travis, local is where it is for all others. Seems to vary depending on the travis machine picked up. */
26
+ // Only on travis, local is where it is for all others. Seems to vary depending on the travis machine picked up.
29
27
  #include "st.h"
30
- #else
31
- #include "ruby/st.h"
32
- #endif
33
28
  #endif
34
29
 
35
30
  #include "cache.h"
@@ -146,16 +141,14 @@ typedef struct _options {
146
141
  struct _hints *html_hints; // html hints
147
142
  VALUE attr_key_mod;
148
143
  VALUE element_key_mod;
149
- #if HAS_ENCODING_SUPPORT
144
+ #if HAVE_RB_ENC_ASSOCIATE
150
145
  rb_encoding *rb_enc;
151
- #elif HAS_PRIVATE_ENCODING
152
- VALUE rb_enc;
153
146
  #else
154
147
  void *rb_enc;
155
148
  #endif
156
149
  } *Options;
157
150
 
158
- /* parse information structure */
151
+ // parse information structure
159
152
  struct _pInfo {
160
153
  struct _helperStack helpers;
161
154
  struct _err err;
@@ -167,6 +160,9 @@ struct _pInfo {
167
160
  CircArray circ_array;
168
161
  unsigned long id; // set for text types when circ_array is set
169
162
  Options options;
163
+ VALUE *marked;
164
+ int mark_size; // allocated size
165
+ int mark_cnt;
170
166
  char last; // last character read, rarely set
171
167
  };
172
168
 
@@ -233,10 +229,8 @@ extern ID ox_tv_nsec_id;
233
229
  extern ID ox_tv_usec_id;
234
230
  extern ID ox_value_id;
235
231
 
236
- #if HAS_ENCODING_SUPPORT
232
+ #if HAVE_RB_ENC_ASSOCIATE
237
233
  extern rb_encoding *ox_utf8_encoding;
238
- #elif HAS_PRIVATE_ENCODING
239
- extern VALUE ox_utf8_encoding;
240
234
  #else
241
235
  extern void *ox_utf8_encoding;
242
236
  #endif
@@ -121,7 +121,7 @@ ox_parse(char *xml, size_t len, ParseCallbacks pcb, char **endp, Options options
121
121
  if (DEBUG <= options->trace) {
122
122
  printf("Parsing xml:\n%s\n", xml);
123
123
  }
124
- /* initialize parse info */
124
+ // initialize parse info
125
125
  helper_stack_init(&pi.helpers);
126
126
  // Protect against GC
127
127
  wrap = Data_Wrap_Struct(rb_cObject, mark_pi_cb, NULL, &pi);
@@ -134,8 +134,11 @@ ox_parse(char *xml, size_t len, ParseCallbacks pcb, char **endp, Options options
134
134
  pi.obj = Qnil;
135
135
  pi.circ_array = 0;
136
136
  pi.options = options;
137
+ pi.marked = NULL;
138
+ pi.mark_size = 0;
139
+ pi.mark_cnt = 0;
137
140
  while (1) {
138
- next_non_white(&pi); /* skip white space */
141
+ next_non_white(&pi); // skip white space
139
142
  if ('\0' == *pi.s) {
140
143
  break;
141
144
  }
@@ -143,31 +146,31 @@ ox_parse(char *xml, size_t len, ParseCallbacks pcb, char **endp, Options options
143
146
  *endp = pi.s;
144
147
  break;
145
148
  }
146
- if ('<' != *pi.s) { /* all top level entities start with < */
149
+ if ('<' != *pi.s) { // all top level entities start with <
147
150
  set_error(err, "invalid format, expected <", pi.str, pi.s);
148
151
  helper_stack_cleanup(&pi.helpers);
149
152
  return Qnil;
150
153
  }
151
- pi.s++; /* past < */
154
+ pi.s++; // past <
152
155
  switch (*pi.s) {
153
- case '?': /* processing instruction */
156
+ case '?': // processing instruction
154
157
  pi.s++;
155
158
  read_instruction(&pi);
156
159
  break;
157
- case '!': /* comment or doctype */
160
+ case '!': // comment or doctype
158
161
  pi.s++;
159
162
  if ('\0' == *pi.s) {
160
163
  set_error(err, "invalid format, DOCTYPE or comment not terminated", pi.str, pi.s);
161
164
  helper_stack_cleanup(&pi.helpers);
162
165
  return Qnil;
163
166
  } else if ('-' == *pi.s) {
164
- pi.s++; /* skip - */
167
+ pi.s++; // skip -
165
168
  if ('-' != *pi.s) {
166
169
  set_error(err, "invalid format, bad comment format", pi.str, pi.s);
167
170
  helper_stack_cleanup(&pi.helpers);
168
171
  return Qnil;
169
172
  } else {
170
- pi.s++; /* skip second - */
173
+ pi.s++; // skip second -
171
174
  read_comment(&pi);
172
175
  }
173
176
  } else if ((TolerantEffort == options->effort) ? 0 == strncasecmp("DOCTYPE", pi.s, 7) : 0 == strncmp("DOCTYPE", pi.s, 7)) {
@@ -229,8 +232,7 @@ gather_content(const char *src, char *content, size_t len) {
229
232
  return 0;
230
233
  }
231
234
 
232
- /* Entered after the "<?" sequence. Ready to read the rest.
233
- */
235
+ // Entered after the "<?" sequence. Ready to read the rest.
234
236
  static void
235
237
  read_instruction(PInfo pi) {
236
238
  char content[1024];
@@ -255,7 +257,7 @@ read_instruction(PInfo pi) {
255
257
  }
256
258
  next_non_white(pi);
257
259
  c = *pi->s;
258
- *end = '\0'; /* terminate name */
260
+ *end = '\0'; // terminate name
259
261
  if ('?' != c) {
260
262
  while ('?' != c) {
261
263
  pi->last = 0;
@@ -275,8 +277,8 @@ read_instruction(PInfo pi) {
275
277
  attrs_ok = 0;
276
278
  break;
277
279
  }
278
- *end = '\0'; /* terminate name */
279
- /* read value */
280
+ *end = '\0'; // terminate name
281
+ // read value
280
282
  next_non_white(pi);
281
283
  if (0 == (attr_value = read_quoted_value(pi))) {
282
284
  attr_stack_cleanup(&attrs);
@@ -355,9 +357,8 @@ read_delimited(PInfo pi, char end) {
355
357
  }
356
358
  }
357
359
 
358
- /* Entered after the "<!DOCTYPE" sequence plus the first character after
359
- * that. Ready to read the rest.
360
- */
360
+ // Entered after the "<!DOCTYPE" sequence plus the first character after
361
+ // that. Ready to read the rest.
361
362
  static void
362
363
  read_doctype(PInfo pi) {
363
364
  char *docType;
@@ -376,8 +377,7 @@ read_doctype(PInfo pi) {
376
377
  }
377
378
  }
378
379
 
379
- /* Entered after "<!--". Returns error code.
380
- */
380
+ // Entered after "<!--". Returns error code.
381
381
  static void
382
382
  read_comment(PInfo pi) {
383
383
  char *end;
@@ -406,16 +406,15 @@ read_comment(PInfo pi) {
406
406
  break;
407
407
  }
408
408
  }
409
- *end = '\0'; /* in case the comment was blank */
409
+ *end = '\0'; // in case the comment was blank
410
410
  pi->s = end + 3;
411
411
  if (0 != pi->pcb->add_comment) {
412
412
  pi->pcb->add_comment(pi, comment);
413
413
  }
414
414
  }
415
415
 
416
- /* Entered after the '<' and the first character after that. Returns status
417
- * code.
418
- */
416
+ // Entered after the '<' and the first character after that. Returns status
417
+ // code.
419
418
  static char*
420
419
  read_element(PInfo pi) {
421
420
  struct _attrStack attrs;
@@ -439,10 +438,9 @@ read_element(PInfo pi) {
439
438
  c = *pi->s;
440
439
  *end = '\0';
441
440
  if ('/' == c) {
442
- /* empty element, no attributes and no children */
441
+ // empty element, no attributes and no children
443
442
  pi->s++;
444
443
  if ('>' != *pi->s) {
445
- /*printf("*** '%s' ***\n", pi->s); */
446
444
  attr_stack_cleanup(&attrs);
447
445
  set_error(&pi->err, "invalid format, element not closed", pi->str, pi->s);
448
446
  return 0;
@@ -480,8 +478,8 @@ read_element(PInfo pi) {
480
478
  pi->s++;
481
479
  pi->pcb->add_element(pi, ename, attrs.head, hasChildren);
482
480
  pi->pcb->end_element(pi, ename);
483
-
484
481
  attr_stack_cleanup(&attrs);
482
+
485
483
  return 0;
486
484
  case '>':
487
485
  /* has either children or a value */
@@ -545,6 +543,12 @@ read_element(PInfo pi) {
545
543
  while (!done) {
546
544
  start = pi->s;
547
545
  next_non_white(pi);
546
+ if (OffSkip == pi->options->skip && start < pi->s && '<' == *pi->s) {
547
+ c = *pi->s;
548
+ *pi->s = '\0';
549
+ pi->pcb->add_text(pi, start, 1);
550
+ *pi->s = c;
551
+ }
548
552
  c = *pi->s++;
549
553
  if ('\0' == c) {
550
554
  attr_stack_cleanup(&attrs);
@@ -1001,11 +1005,13 @@ read_coded_chars(PInfo pi, char *text) {
1001
1005
  char *b, buf[32];
1002
1006
  char *end = buf + sizeof(buf) - 1;
1003
1007
  char *s;
1008
+ long blen = 0;
1004
1009
 
1005
1010
  for (b = buf, s = pi->s; b < end; b++, s++) {
1006
1011
  *b = *s;
1007
1012
  if (';' == *s) {
1008
1013
  *(b + 1) = '\0';
1014
+ blen = b - buf;
1009
1015
  s++;
1010
1016
  break;
1011
1017
  }
@@ -1026,18 +1032,9 @@ read_coded_chars(PInfo pi, char *text) {
1026
1032
  } else {
1027
1033
  if (u <= 0x000000000000007FULL) {
1028
1034
  *text++ = (char)u;
1029
- #if HAS_PRIVATE_ENCODING
1030
- } else if (ox_utf8_encoding == pi->options->rb_enc ||
1031
- 0 == strcasecmp(rb_str_ptr(rb_String(ox_utf8_encoding)), rb_str_ptr(rb_String(pi->options->rb_enc)))) {
1032
- #else
1033
1035
  } else if (ox_utf8_encoding == pi->options->rb_enc) {
1034
- #endif
1035
1036
  text = ox_ucs_to_utf8_chars(text, u);
1036
- #if HAS_PRIVATE_ENCODING
1037
- } else if (Qnil == pi->options->rb_enc) {
1038
- #else
1039
1037
  } else if (0 == pi->options->rb_enc) {
1040
- #endif
1041
1038
  pi->options->rb_enc = ox_utf8_encoding;
1042
1039
  text = ox_ucs_to_utf8_chars(text, u);
1043
1040
  } else if (TolerantEffort == pi->options->effort) {
@@ -1048,30 +1045,20 @@ read_coded_chars(PInfo pi, char *text) {
1048
1045
  } else {
1049
1046
  /*set_error(&pi->err, "Invalid encoding, need UTF-8 or UTF-16 encoding to parse &#nnnn; character sequences.", pi->str, pi->s); */
1050
1047
  set_error(&pi->err, "Invalid encoding, need UTF-8 encoding to parse &#nnnn; character sequences.", pi->str, pi->s);
1051
- return 0;
1048
+ return NULL;
1052
1049
  }
1053
1050
  pi->s = s;
1054
1051
  }
1055
- } else if (0 == strcasecmp(buf, "nbsp;")) {
1056
- pi->s = s;
1057
- *text++ = ' ';
1058
- } else if (0 == strcasecmp(buf, "lt;")) {
1059
- pi->s = s;
1060
- *text++ = '<';
1061
- } else if (0 == strcasecmp(buf, "gt;")) {
1062
- pi->s = s;
1063
- *text++ = '>';
1064
- } else if (0 == strcasecmp(buf, "amp;")) {
1065
- pi->s = s;
1066
- *text++ = '&';
1067
- } else if (0 == strcasecmp(buf, "quot;")) {
1068
- pi->s = s;
1069
- *text++ = '"';
1070
- } else if (0 == strcasecmp(buf, "apos;")) {
1071
- pi->s = s;
1072
- *text++ = '\'';
1073
1052
  } else {
1074
- *text++ = '&';
1053
+ char *t2;
1054
+
1055
+ buf[blen] = '\0';
1056
+ if (NULL == (t2 = ox_entity_lookup(text, buf))) {
1057
+ *text++ = '&';
1058
+ } else {
1059
+ text = t2;
1060
+ pi->s = s;
1061
+ }
1075
1062
  }
1076
1063
  return text;
1077
1064
  }
@@ -1113,19 +1100,10 @@ collapse_special(PInfo pi, char *str) {
1113
1100
  }
1114
1101
  if (u <= 0x000000000000007FULL) {
1115
1102
  *b++ = (char)u;
1116
- #if HAS_PRIVATE_ENCODING
1117
- } else if (ox_utf8_encoding == pi->options->rb_enc ||
1118
- 0 == strcasecmp(rb_str_ptr(rb_String(ox_utf8_encoding)), rb_str_ptr(rb_String(pi->options->rb_enc)))) {
1119
- #else
1120
1103
  } else if (ox_utf8_encoding == pi->options->rb_enc) {
1121
- #endif
1122
1104
  b = ox_ucs_to_utf8_chars(b, u);
1123
1105
  /* TBD support UTF-16 */
1124
- #if HAS_PRIVATE_ENCODING
1125
- } else if (Qnil == pi->options->rb_enc) {
1126
- #else
1127
1106
  } else if (0 == pi->options->rb_enc) {
1128
- #endif
1129
1107
  pi->options->rb_enc = ox_utf8_encoding;
1130
1108
  b = ox_ucs_to_utf8_chars(b, u);
1131
1109
  } else {
@@ -1154,16 +1132,30 @@ collapse_special(PInfo pi, char *str) {
1154
1132
  *b++ = '&';
1155
1133
  continue;
1156
1134
  } else {
1157
- c = '?';
1135
+ char key[16];
1136
+ char *k = key;
1137
+ char *kend = key + sizeof(key) - 1;
1138
+
1139
+ *k++ = *s;
1158
1140
  while (';' != *s++) {
1159
1141
  if ('\0' == *s) {
1160
1142
  set_error(&pi->err, "Invalid format, special character does not end with a semicolon", pi->str, pi->s);
1161
1143
  return EDOM;
1162
1144
  }
1145
+ if (kend <= k) {
1146
+ k = key;
1147
+ break;
1148
+ }
1149
+ *k++ = *s;
1163
1150
  }
1164
- s++;
1165
- set_error(&pi->err, "Invalid format, invalid special character sequence", pi->str, pi->s);
1166
- return 0;
1151
+ k--;
1152
+ *k = '\0';
1153
+ if ('\0' == *key || NULL == (b = ox_entity_lookup(b, key))) {
1154
+ set_error(&pi->err, "Invalid format, invalid special character sequence", pi->str, pi->s);
1155
+ c = '?';
1156
+ return 0;
1157
+ }
1158
+ continue;
1167
1159
  }
1168
1160
  *b++ = (char)c;
1169
1161
  }
@@ -9,13 +9,16 @@
9
9
  #include <stdio.h>
10
10
  #include <strings.h>
11
11
  #include <sys/types.h>
12
- #if NEEDS_UIO
12
+ #if HAVE_SYS_UIO_H
13
13
  #include <sys/uio.h>
14
14
  #endif
15
15
  #include <unistd.h>
16
16
  #include <time.h>
17
17
 
18
18
  #include "ruby.h"
19
+ #if HAVE_RB_ENC_ASSOCIATE
20
+ #include "ruby/encoding.h"
21
+ #endif
19
22
  #include "ox.h"
20
23
  #include "sax.h"
21
24
  #include "sax_stack.h"
@@ -55,7 +58,7 @@ static char read_attrs(SaxDrive dr, char c, char termc, char term2, int is_xml,
55
58
  static char read_name_token(SaxDrive dr);
56
59
  static char read_quoted_value(SaxDrive dr);
57
60
 
58
- static void end_element_cb(SaxDrive dr, VALUE name, int pos, int line, int col, Hint h);
61
+ static void end_element_cb(SaxDrive dr, VALUE name, long pos, long line, long col, Hint h);
59
62
 
60
63
  static void hint_clear_empty(SaxDrive dr);
61
64
  static Nv hint_try_close(SaxDrive dr, const char *name);
@@ -68,9 +71,9 @@ static VALUE protect_parse(VALUE drp) {
68
71
  return Qnil;
69
72
  }
70
73
 
71
- #if HAS_ENCODING_SUPPORT || HAS_PRIVATE_ENCODING
74
+ #if HAVE_RB_ENC_ASSOCIATE
72
75
  static int
73
- strIsAscii(const char *s) {
76
+ str_is_ascii(const char *s) {
74
77
  for (; '\0' != *s; s++) {
75
78
  if (*s < ' ' || '~' < *s) {
76
79
  return 0;
@@ -87,8 +90,8 @@ str2sym(SaxDrive dr, const char *str, const char **strp) {
87
90
 
88
91
  if (dr->options.symbolize) {
89
92
  if (Qundef == (sym = ox_cache_get(ox_symbol_cache, str, &slot, strp))) {
90
- #if HAS_ENCODING_SUPPORT
91
- if (0 != dr->encoding && !strIsAscii(str)) {
93
+ #if HAVE_RB_ENC_ASSOCIATE
94
+ if (0 != dr->encoding && !str_is_ascii(str)) {
92
95
  VALUE rstr = rb_str_new2(str);
93
96
 
94
97
  // TBD if sym can be pinned down then use this all the time
@@ -99,20 +102,6 @@ str2sym(SaxDrive dr, const char *str, const char **strp) {
99
102
  sym = ID2SYM(rb_intern(str));
100
103
  *slot = sym;
101
104
  }
102
- #elif HAS_PRIVATE_ENCODING
103
- if (Qnil != dr->encoding && !strIsAscii(str)) {
104
- VALUE rstr = rb_str_new2(str);
105
-
106
- rb_funcall(rstr, ox_force_encoding_id, 1, dr->encoding);
107
- sym = rb_funcall(rstr, ox_to_sym_id, 0);
108
- // Needed for Ruby 2.2 to get around the GC of symbols created
109
- // with to_sym which is needed for encoded symbols.
110
- rb_ary_push(ox_sym_bank, sym);
111
- *slot = Qundef;
112
- } else {
113
- sym = ID2SYM(rb_intern(str));
114
- *slot = sym;
115
- }
116
105
  #else
117
106
  sym = ID2SYM(rb_intern(str));
118
107
  *slot = sym;
@@ -120,14 +109,10 @@ str2sym(SaxDrive dr, const char *str, const char **strp) {
120
109
  }
121
110
  } else {
122
111
  sym = rb_str_new2(str);
123
- #if HAS_ENCODING_SUPPORT
112
+ #if HAVE_RB_ENC_ASSOCIATE
124
113
  if (0 != dr->encoding) {
125
114
  rb_enc_associate(sym, dr->encoding);
126
115
  }
127
- #elif HAS_PRIVATE_ENCODING
128
- if (Qnil != dr->encoding) {
129
- rb_funcall(sym, ox_force_encoding_id, 1, dr->encoding);
130
- }
131
116
  #endif
132
117
  if (0 != strp) {
133
118
  *strp = StringValuePtr(sym);
@@ -182,7 +167,7 @@ sax_drive_init(SaxDrive dr, VALUE handler, VALUE io, SaxOptions options) {
182
167
  dr->blocked = 0;
183
168
  dr->abort = false;
184
169
  has_init(&dr->has, handler);
185
- #if HAS_ENCODING_SUPPORT
170
+ #if HAVE_RB_ENC_FIND
186
171
  if ('\0' == *ox_default_options.encoding) {
187
172
  VALUE encoding;
188
173
 
@@ -196,18 +181,6 @@ sax_drive_init(SaxDrive dr, VALUE handler, VALUE io, SaxOptions options) {
196
181
  } else {
197
182
  dr->encoding = rb_enc_find(ox_default_options.encoding);
198
183
  }
199
- #elif HAS_PRIVATE_ENCODING
200
- if ('\0' == *ox_default_options.encoding) {
201
- VALUE encoding;
202
-
203
- if (rb_respond_to(io, ox_external_encoding_id) && Qnil != (encoding = rb_funcall(io, ox_external_encoding_id, 0))) {
204
- dr->encoding = encoding;
205
- } else {
206
- dr->encoding = Qnil;
207
- }
208
- } else {
209
- dr->encoding = rb_str_new2(ox_default_options.encoding);
210
- }
211
184
  #else
212
185
  dr->encoding = 0;
213
186
  #endif
@@ -221,7 +194,7 @@ ox_sax_drive_cleanup(SaxDrive dr) {
221
194
  }
222
195
 
223
196
  static void
224
- ox_sax_drive_error_at(SaxDrive dr, const char *msg, int pos, int line, int col) {
197
+ ox_sax_drive_error_at(SaxDrive dr, const char *msg, off_t pos, off_t line, off_t col) {
225
198
  if (dr->has.error) {
226
199
  VALUE args[3];
227
200
 
@@ -255,9 +228,7 @@ skipBOM(SaxDrive dr) {
255
228
 
256
229
  if (0xEF == (uint8_t)c) { /* only UTF8 is supported */
257
230
  if (0xBB == (uint8_t)buf_get(&dr->buf) && 0xBF == (uint8_t)buf_get(&dr->buf)) {
258
- #if HAS_ENCODING_SUPPORT
259
- dr->encoding = ox_utf8_encoding;
260
- #elif HAS_PRIVATE_ENCODING
231
+ #if HAVE_RB_ENC_FIND
261
232
  dr->encoding = ox_utf8_encoding;
262
233
  #else
263
234
  dr->encoding = UTF8_STR;
@@ -301,11 +272,11 @@ parse(SaxDrive dr) {
301
272
  }
302
273
  c = read_comment(dr);
303
274
  } else {
304
- int i;
305
- int spaced = 0;
306
- int pos = dr->buf.pos + 1;
307
- int line = dr->buf.line;
308
- int col = dr->buf.col + 1;
275
+ int i;
276
+ int spaced = 0;
277
+ off_t pos = dr->buf.pos + 1;
278
+ off_t line = dr->buf.line;
279
+ off_t col = dr->buf.col + 1;
309
280
 
310
281
  if (is_white(c)) {
311
282
  spaced = 1;
@@ -359,19 +330,15 @@ parse(SaxDrive dr) {
359
330
  parent = stack_peek(&dr->stack);
360
331
  if (0 != parent && 0 == parent->childCnt && dr->has.text && !dr->blocked) {
361
332
  VALUE args[1];
362
- int pos = dr->buf.pos;
363
- int line = dr->buf.line;
364
- int col = dr->buf.col - 1;
333
+ off_t pos = dr->buf.pos;
334
+ off_t line = dr->buf.line;
335
+ off_t col = dr->buf.col - 1;
365
336
 
366
337
  args[0] = rb_str_new2("");
367
- #if HAS_ENCODING_SUPPORT
338
+ #if HAVE_RB_ENC_ASSOCIATE
368
339
  if (0 != dr->encoding) {
369
340
  rb_enc_associate(args[0], dr->encoding);
370
341
  }
371
- #elif HAS_PRIVATE_ENCODING
372
- if (Qnil != dr->encoding) {
373
- rb_funcall(args[0], ox_force_encoding_id, 1, dr->encoding);
374
- }
375
342
  #endif
376
343
  if (dr->has.pos) {
377
344
  rb_ivar_set(dr->handler, ox_at_pos_id, LONG2NUM(pos));
@@ -479,9 +446,9 @@ read_instruction(SaxDrive dr) {
479
446
  int coff;
480
447
  VALUE target = Qnil;
481
448
  int is_xml;
482
- int pos = dr->buf.pos - 1;
483
- int line = dr->buf.line;
484
- int col = dr->buf.col - 1;
449
+ off_t pos = dr->buf.pos - 1;
450
+ off_t line = dr->buf.line;
451
+ off_t col = dr->buf.col - 1;
485
452
 
486
453
  buf_protect(&dr->buf);
487
454
  if ('\0' == (c = read_name_token(dr))) {
@@ -511,7 +478,7 @@ read_instruction(SaxDrive dr) {
511
478
  line = dr->buf.line;
512
479
  col = dr->buf.col;
513
480
  read_content(dr, content, sizeof(content) - 1);
514
- coff = dr->buf.tail - dr->buf.head;
481
+ coff = (int)(dr->buf.tail - dr->buf.head);
515
482
  buf_reset(&dr->buf);
516
483
  dr->err = 0;
517
484
  c = read_attrs(dr, c, '?', '?', is_xml, 1, NULL);
@@ -523,17 +490,13 @@ read_instruction(SaxDrive dr) {
523
490
  VALUE args[1];
524
491
 
525
492
  if (dr->options.convert_special) {
526
- ox_sax_collapse_special(dr, content, pos, line, col);
493
+ ox_sax_collapse_special(dr, content, (int)pos, (int)line, (int)col);
527
494
  }
528
495
  args[0] = rb_str_new2(content);
529
- #if HAS_ENCODING_SUPPORT
496
+ #if HAVE_RB_ENC_ASSOCIATE
530
497
  if (0 != dr->encoding) {
531
498
  rb_enc_associate(args[0], dr->encoding);
532
499
  }
533
- #elif HAS_PRIVATE_ENCODING
534
- if (Qnil != dr->encoding) {
535
- rb_funcall(args[0], ox_force_encoding_id, 1, dr->encoding);
536
- }
537
500
  #endif
538
501
  if (dr->has.line) {
539
502
  rb_ivar_set(dr->handler, ox_at_line_id, LONG2NUM(line));
@@ -627,9 +590,9 @@ read_delimited(SaxDrive dr, char end) {
627
590
  */
628
591
  static char
629
592
  read_doctype(SaxDrive dr) {
630
- int pos = dr->buf.pos - 9;
631
- int line = dr->buf.line;
632
- int col = dr->buf.col - 9;
593
+ long pos = (long)(dr->buf.pos - 9);
594
+ long line = (long)(dr->buf.line);
595
+ long col = (long)(dr->buf.col - 9);
633
596
  char *s;
634
597
  Nv parent = stack_peek(&dr->stack);
635
598
 
@@ -673,9 +636,9 @@ read_cdata(SaxDrive dr) {
673
636
  char c;
674
637
  char zero = '\0';
675
638
  int end = 0;
676
- int pos = dr->buf.pos - 9;
677
- int line = dr->buf.line;
678
- int col = dr->buf.col - 9;
639
+ long pos = (long)(dr->buf.pos - 9);
640
+ long line = (long)(dr->buf.line);
641
+ long col = (long)(dr->buf.col - 9);
679
642
  struct _checkPt cp = CHECK_PT_INIT;
680
643
  Nv parent = stack_peek(&dr->stack);
681
644
 
@@ -732,14 +695,10 @@ read_cdata(SaxDrive dr) {
732
695
  VALUE args[1];
733
696
 
734
697
  args[0] = rb_str_new2(dr->buf.str);
735
- #if HAS_ENCODING_SUPPORT
698
+ #if HAVE_RB_ENC_ASSOCIATE
736
699
  if (0 != dr->encoding) {
737
700
  rb_enc_associate(args[0], dr->encoding);
738
701
  }
739
- #elif HAS_PRIVATE_ENCODING
740
- if (Qnil != dr->encoding) {
741
- rb_funcall(args[0], ox_force_encoding_id, 1, dr->encoding);
742
- }
743
702
  #endif
744
703
  if (dr->has.pos) {
745
704
  rb_ivar_set(dr->handler, ox_at_pos_id, LONG2NUM(pos));
@@ -768,9 +727,9 @@ read_comment(SaxDrive dr) {
768
727
  char c;
769
728
  char zero = '\0';
770
729
  int end = 0;
771
- int pos = dr->buf.pos - 4;
772
- int line = dr->buf.line;
773
- int col = dr->buf.col - 4;
730
+ long pos = (long)(dr->buf.pos - 4);
731
+ long line = (long)(dr->buf.line);
732
+ long col = (long)(dr->buf.col - 4);
774
733
  struct _checkPt cp = CHECK_PT_INIT;
775
734
 
776
735
  buf_backup(&dr->buf); /* back up to the start in case the comment is empty */
@@ -826,14 +785,10 @@ read_comment(SaxDrive dr) {
826
785
  (NULL != h && (ActiveOverlay == h->overlay || ActiveOverlay == h->overlay))) {
827
786
 
828
787
  args[0] = rb_str_new2(dr->buf.str);
829
- #if HAS_ENCODING_SUPPORT
788
+ #if HAVE_RB_ENC_ASSOCIATE
830
789
  if (0 != dr->encoding) {
831
790
  rb_enc_associate(args[0], dr->encoding);
832
791
  }
833
- #elif HAS_PRIVATE_ENCODING
834
- if (Qnil != dr->encoding) {
835
- rb_funcall(args[0], ox_force_encoding_id, 1, dr->encoding);
836
- }
837
792
  #endif
838
793
  if (dr->has.pos) {
839
794
  rb_ivar_set(dr->handler, ox_at_pos_id, LONG2NUM(pos));
@@ -864,9 +819,9 @@ read_element_start(SaxDrive dr) {
864
819
  volatile VALUE name = Qnil;
865
820
  char c;
866
821
  int closed;
867
- int pos = dr->buf.pos;
868
- int line = dr->buf.line;
869
- int col = dr->buf.col;
822
+ long pos = (long)(dr->buf.pos);
823
+ long line = (long)(dr->buf.line);
824
+ long col = (long)(dr->buf.col);
870
825
  Hint h = NULL;
871
826
  int stackless = 0;
872
827
  Nv parent = stack_peek(&dr->stack);
@@ -1020,9 +975,9 @@ static char
1020
975
  read_element_end(SaxDrive dr) {
1021
976
  VALUE name = Qnil;
1022
977
  char c;
1023
- int pos = dr->buf.pos - 1;
1024
- int line = dr->buf.line;
1025
- int col = dr->buf.col - 1;
978
+ long pos = (long)(dr->buf.pos - 1);
979
+ long line = (long)(dr->buf.line);
980
+ long col = (long)(dr->buf.col - 1);
1026
981
  Nv nv;
1027
982
  Hint h = NULL;
1028
983
 
@@ -1118,9 +1073,9 @@ static char
1118
1073
  read_text(SaxDrive dr) {
1119
1074
  VALUE args[1];
1120
1075
  char c;
1121
- int pos = dr->buf.pos;
1122
- int line = dr->buf.line;
1123
- int col = dr->buf.col - 1;
1076
+ long pos = (long)(dr->buf.pos);
1077
+ long line = (long)(dr->buf.line);
1078
+ long col = (long)(dr->buf.col - 1);
1124
1079
  Nv parent = stack_peek(&dr->stack);
1125
1080
  int allWhite = 1;
1126
1081
 
@@ -1158,14 +1113,10 @@ read_text(SaxDrive dr) {
1158
1113
  ((NoSkip == dr->options.skip && !isEnd) ||
1159
1114
  (OffSkip == dr->options.skip))) {
1160
1115
  args[0] = rb_str_new2(dr->buf.str);
1161
- #if HAS_ENCODING_SUPPORT
1116
+ #if HAVE_RB_ENC_ASSOCIATE
1162
1117
  if (0 != dr->encoding) {
1163
1118
  rb_enc_associate(args[0], dr->encoding);
1164
1119
  }
1165
- #elif HAS_PRIVATE_ENCODING
1166
- if (Qnil != dr->encoding) {
1167
- rb_funcall(args[0], ox_force_encoding_id, 1, dr->encoding);
1168
- }
1169
1120
  #endif
1170
1121
  if (dr->has.pos) {
1171
1122
  rb_ivar_set(dr->handler, ox_at_pos_id, LONG2NUM(pos));
@@ -1213,14 +1164,10 @@ read_text(SaxDrive dr) {
1213
1164
  break;
1214
1165
  }
1215
1166
  args[0] = rb_str_new2(dr->buf.str);
1216
- #if HAS_ENCODING_SUPPORT
1167
+ #if HAVE_RB_ENC_ASSOCIATE
1217
1168
  if (0 != dr->encoding) {
1218
1169
  rb_enc_associate(args[0], dr->encoding);
1219
1170
  }
1220
- #elif HAS_PRIVATE_ENCODING
1221
- if (Qnil != dr->encoding) {
1222
- rb_funcall(args[0], ox_force_encoding_id, 1, dr->encoding);
1223
- }
1224
1171
  #endif
1225
1172
  if (dr->has.pos) {
1226
1173
  rb_ivar_set(dr->handler, ox_at_pos_id, LONG2NUM(pos));
@@ -1266,9 +1213,9 @@ static char
1266
1213
  read_jump(SaxDrive dr, const char *pat) {
1267
1214
  VALUE args[1];
1268
1215
  char c;
1269
- int pos = dr->buf.pos;
1270
- int line = dr->buf.line;
1271
- int col = dr->buf.col - 1;
1216
+ long pos = (long)(dr->buf.pos);
1217
+ long line = (long)(dr->buf.line);
1218
+ long col = (long)(dr->buf.col - 1);
1272
1219
  Nv parent = stack_peek(&dr->stack);
1273
1220
 
1274
1221
  buf_protect(&dr->buf);
@@ -1299,14 +1246,10 @@ read_jump(SaxDrive dr, const char *pat) {
1299
1246
  // TBD check parent overlay
1300
1247
  if (dr->has.text && !dr->blocked) {
1301
1248
  args[0] = rb_str_new2(dr->buf.str);
1302
- #if HAS_ENCODING_SUPPORT
1249
+ #if HAVE_RB_ENC_ASSOCIATE
1303
1250
  if (0 != dr->encoding) {
1304
1251
  rb_enc_associate(args[0], dr->encoding);
1305
1252
  }
1306
- #elif HAS_PRIVATE_ENCODING
1307
- if (Qnil != dr->encoding) {
1308
- rb_funcall(args[0], ox_force_encoding_id, 1, dr->encoding);
1309
- }
1310
1253
  #endif
1311
1254
  if (dr->has.pos) {
1312
1255
  rb_ivar_set(dr->handler, ox_at_pos_id, LONG2NUM(pos));
@@ -1330,9 +1273,9 @@ static char
1330
1273
  read_attrs(SaxDrive dr, char c, char termc, char term2, int is_xml, int eq_req, Hint h) {
1331
1274
  VALUE name = Qnil;
1332
1275
  int is_encoding = 0;
1333
- int pos;
1334
- int line;
1335
- int col;
1276
+ off_t pos;
1277
+ off_t line;
1278
+ off_t col;
1336
1279
  char *attr_value;
1337
1280
 
1338
1281
  // already protected by caller
@@ -1377,10 +1320,8 @@ read_attrs(SaxDrive dr, char c, char termc, char term2, int is_xml, int eq_req,
1377
1320
  c = read_quoted_value(dr);
1378
1321
  attr_value = dr->buf.str;
1379
1322
  if (is_encoding) {
1380
- #if HAS_ENCODING_SUPPORT
1323
+ #if HAVE_RB_ENC_FIND
1381
1324
  dr->encoding = rb_enc_find(dr->buf.str);
1382
- #elif HAS_PRIVATE_ENCODING
1383
- dr->encoding = rb_str_new2(dr->buf.str);
1384
1325
  #else
1385
1326
  dr->encoding = dr->buf.str;
1386
1327
  #endif
@@ -1411,14 +1352,10 @@ read_attrs(SaxDrive dr, char c, char termc, char term2, int is_xml, int eq_req,
1411
1352
  ox_sax_collapse_special(dr, dr->buf.str, pos, line, col);
1412
1353
  }
1413
1354
  args[1] = rb_str_new2(attr_value);
1414
- #if HAS_ENCODING_SUPPORT
1355
+ #if HAVE_RB_ENC_ASSOCIATE
1415
1356
  if (0 != dr->encoding) {
1416
1357
  rb_enc_associate(args[1], dr->encoding);
1417
1358
  }
1418
- #elif HAS_PRIVATE_ENCODING
1419
- if (Qnil != dr->encoding) {
1420
- rb_funcall(args[1], ox_force_encoding_id, 1, dr->encoding);
1421
- }
1422
1359
  #endif
1423
1360
  if (dr->has.pos) {
1424
1361
  rb_ivar_set(dr->handler, ox_at_pos_id, LONG2NUM(pos));
@@ -1580,7 +1517,7 @@ read_10_uint64(char *b, uint64_t *up) {
1580
1517
  }
1581
1518
 
1582
1519
  int
1583
- ox_sax_collapse_special(SaxDrive dr, char *str, int pos, int line, int col) {
1520
+ ox_sax_collapse_special(SaxDrive dr, char *str, long pos, long line, long col) {
1584
1521
  char *s = str;
1585
1522
  char *b = str;
1586
1523
 
@@ -1614,19 +1551,12 @@ ox_sax_collapse_special(SaxDrive dr, char *str, int pos, int line, int col) {
1614
1551
  }
1615
1552
  if (u <= 0x000000000000007FULL) {
1616
1553
  *b++ = (char)u;
1617
- #if HAS_ENCODING_SUPPORT
1554
+ #if HAVE_RB_ENC_FIND
1618
1555
  } else if (ox_utf8_encoding == dr->encoding) {
1619
1556
  b = ox_ucs_to_utf8_chars(b, u);
1620
1557
  } else if (0 == dr->encoding) {
1621
1558
  dr->encoding = ox_utf8_encoding;
1622
1559
  b = ox_ucs_to_utf8_chars(b, u);
1623
- #elif HAS_PRIVATE_ENCODING
1624
- } else if (ox_utf8_encoding == dr->encoding ||
1625
- 0 == strcasecmp(rb_str_ptr(rb_String(ox_utf8_encoding)), rb_str_ptr(rb_String(dr->encoding)))) {
1626
- b = ox_ucs_to_utf8_chars(b, u);
1627
- } else if (Qnil == dr->encoding) {
1628
- dr->encoding = ox_utf8_encoding;
1629
- b = ox_ucs_to_utf8_chars(b, u);
1630
1560
  #else
1631
1561
  } else if (0 == dr->encoding) {
1632
1562
  dr->encoding = UTF8_STR;
@@ -1668,8 +1598,28 @@ ox_sax_collapse_special(SaxDrive dr, char *str, int pos, int line, int col) {
1668
1598
  c = '\'';
1669
1599
  s += 5;
1670
1600
  } else {
1671
- ox_sax_drive_error_at(dr, INVALID_FORMAT "Invalid special character sequence", pos, line, col);
1672
- c = '&';
1601
+ char key[16];
1602
+ char *k = key;
1603
+ char *kend = key + sizeof(key) - 1;
1604
+ char *bn;
1605
+ char *s2 = s;
1606
+
1607
+ for (; ';' != *s2 && '\0' != *s2; s2++, k++) {
1608
+ if (kend <= k) {
1609
+ k = key;
1610
+ break;
1611
+ }
1612
+ *k = *s2;
1613
+ }
1614
+ *k = '\0';
1615
+ if ('\0' == *key || NULL == (bn = ox_entity_lookup(b, key))) {
1616
+ ox_sax_drive_error_at(dr, INVALID_FORMAT "Invalid special character sequence", pos, line, col);
1617
+ c = '&';
1618
+ } else {
1619
+ b = bn;
1620
+ s = s2 + 1;
1621
+ continue;
1622
+ }
1673
1623
  }
1674
1624
  *b++ = (char)c;
1675
1625
  col++;
@@ -1731,7 +1681,7 @@ hint_try_close(SaxDrive dr, const char *name) {
1731
1681
  }
1732
1682
 
1733
1683
  static void
1734
- end_element_cb(SaxDrive dr, VALUE name, int pos, int line, int col, Hint h) {
1684
+ end_element_cb(SaxDrive dr, VALUE name, long pos, long line, long col, Hint h) {
1735
1685
  if (dr->has.end_element && 0 >= dr->blocked && (NULL == h || ActiveOverlay == h->overlay || NestOverlay == h->overlay)) {
1736
1686
  if (dr->has.pos) {
1737
1687
  rb_ivar_set(dr->handler, ox_at_pos_id, LONG2NUM(pos));