ox 2.12.0 → 2.13.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +48 -0
- data/README.md +31 -0
- data/ext/ox/builder.c +13 -7
- data/ext/ox/dump.c +18 -24
- data/ext/ox/extconf.rb +16 -34
- data/ext/ox/gen_load.c +18 -96
- data/ext/ox/hash_load.c +62 -26
- data/ext/ox/obj_load.c +14 -46
- data/ext/ox/ox.c +34 -46
- data/ext/ox/ox.h +10 -16
- data/ext/ox/parse.c +59 -67
- data/ext/ox/sax.c +84 -134
- data/ext/ox/sax.h +2 -4
- data/ext/ox/sax_as.c +2 -6
- data/ext/ox/sax_buf.c +1 -1
- data/ext/ox/special.c +346 -0
- data/ext/ox/special.h +1 -0
- data/lib/ox/element.rb +1 -0
- data/lib/ox/version.rb +1 -1
- metadata +7 -7
data/ext/ox/ox.h
CHANGED
@@ -16,20 +16,15 @@ extern "C" {
|
|
16
16
|
#define RSTRING_NOT_MODIFIED
|
17
17
|
|
18
18
|
#include "ruby.h"
|
19
|
-
#if
|
19
|
+
#if HAVE_RB_ENC_ASSOCIATE
|
20
20
|
#include "ruby/encoding.h"
|
21
21
|
#endif
|
22
22
|
|
23
|
-
#
|
24
|
-
#
|
25
|
-
enum st_retval {ST_CONTINUE = 0, ST_STOP = 1, ST_DELETE = 2, ST_CHECK};
|
23
|
+
#if HAVE_RUBY_ST_H
|
24
|
+
#include "ruby/st.h"
|
26
25
|
#else
|
27
|
-
|
28
|
-
/* Only on travis, local is where it is for all others. Seems to vary depending on the travis machine picked up. */
|
26
|
+
// Only on travis, local is where it is for all others. Seems to vary depending on the travis machine picked up.
|
29
27
|
#include "st.h"
|
30
|
-
#else
|
31
|
-
#include "ruby/st.h"
|
32
|
-
#endif
|
33
28
|
#endif
|
34
29
|
|
35
30
|
#include "cache.h"
|
@@ -146,16 +141,14 @@ typedef struct _options {
|
|
146
141
|
struct _hints *html_hints; // html hints
|
147
142
|
VALUE attr_key_mod;
|
148
143
|
VALUE element_key_mod;
|
149
|
-
#if
|
144
|
+
#if HAVE_RB_ENC_ASSOCIATE
|
150
145
|
rb_encoding *rb_enc;
|
151
|
-
#elif HAS_PRIVATE_ENCODING
|
152
|
-
VALUE rb_enc;
|
153
146
|
#else
|
154
147
|
void *rb_enc;
|
155
148
|
#endif
|
156
149
|
} *Options;
|
157
150
|
|
158
|
-
|
151
|
+
// parse information structure
|
159
152
|
struct _pInfo {
|
160
153
|
struct _helperStack helpers;
|
161
154
|
struct _err err;
|
@@ -167,6 +160,9 @@ struct _pInfo {
|
|
167
160
|
CircArray circ_array;
|
168
161
|
unsigned long id; // set for text types when circ_array is set
|
169
162
|
Options options;
|
163
|
+
VALUE *marked;
|
164
|
+
int mark_size; // allocated size
|
165
|
+
int mark_cnt;
|
170
166
|
char last; // last character read, rarely set
|
171
167
|
};
|
172
168
|
|
@@ -233,10 +229,8 @@ extern ID ox_tv_nsec_id;
|
|
233
229
|
extern ID ox_tv_usec_id;
|
234
230
|
extern ID ox_value_id;
|
235
231
|
|
236
|
-
#if
|
232
|
+
#if HAVE_RB_ENC_ASSOCIATE
|
237
233
|
extern rb_encoding *ox_utf8_encoding;
|
238
|
-
#elif HAS_PRIVATE_ENCODING
|
239
|
-
extern VALUE ox_utf8_encoding;
|
240
234
|
#else
|
241
235
|
extern void *ox_utf8_encoding;
|
242
236
|
#endif
|
data/ext/ox/parse.c
CHANGED
@@ -121,7 +121,7 @@ ox_parse(char *xml, size_t len, ParseCallbacks pcb, char **endp, Options options
|
|
121
121
|
if (DEBUG <= options->trace) {
|
122
122
|
printf("Parsing xml:\n%s\n", xml);
|
123
123
|
}
|
124
|
-
|
124
|
+
// initialize parse info
|
125
125
|
helper_stack_init(&pi.helpers);
|
126
126
|
// Protect against GC
|
127
127
|
wrap = Data_Wrap_Struct(rb_cObject, mark_pi_cb, NULL, &pi);
|
@@ -134,8 +134,11 @@ ox_parse(char *xml, size_t len, ParseCallbacks pcb, char **endp, Options options
|
|
134
134
|
pi.obj = Qnil;
|
135
135
|
pi.circ_array = 0;
|
136
136
|
pi.options = options;
|
137
|
+
pi.marked = NULL;
|
138
|
+
pi.mark_size = 0;
|
139
|
+
pi.mark_cnt = 0;
|
137
140
|
while (1) {
|
138
|
-
next_non_white(&pi);
|
141
|
+
next_non_white(&pi); // skip white space
|
139
142
|
if ('\0' == *pi.s) {
|
140
143
|
break;
|
141
144
|
}
|
@@ -143,31 +146,31 @@ ox_parse(char *xml, size_t len, ParseCallbacks pcb, char **endp, Options options
|
|
143
146
|
*endp = pi.s;
|
144
147
|
break;
|
145
148
|
}
|
146
|
-
if ('<' != *pi.s) {
|
149
|
+
if ('<' != *pi.s) { // all top level entities start with <
|
147
150
|
set_error(err, "invalid format, expected <", pi.str, pi.s);
|
148
151
|
helper_stack_cleanup(&pi.helpers);
|
149
152
|
return Qnil;
|
150
153
|
}
|
151
|
-
pi.s++;
|
154
|
+
pi.s++; // past <
|
152
155
|
switch (*pi.s) {
|
153
|
-
case '?':
|
156
|
+
case '?': // processing instruction
|
154
157
|
pi.s++;
|
155
158
|
read_instruction(&pi);
|
156
159
|
break;
|
157
|
-
case '!':
|
160
|
+
case '!': // comment or doctype
|
158
161
|
pi.s++;
|
159
162
|
if ('\0' == *pi.s) {
|
160
163
|
set_error(err, "invalid format, DOCTYPE or comment not terminated", pi.str, pi.s);
|
161
164
|
helper_stack_cleanup(&pi.helpers);
|
162
165
|
return Qnil;
|
163
166
|
} else if ('-' == *pi.s) {
|
164
|
-
pi.s++;
|
167
|
+
pi.s++; // skip -
|
165
168
|
if ('-' != *pi.s) {
|
166
169
|
set_error(err, "invalid format, bad comment format", pi.str, pi.s);
|
167
170
|
helper_stack_cleanup(&pi.helpers);
|
168
171
|
return Qnil;
|
169
172
|
} else {
|
170
|
-
pi.s++;
|
173
|
+
pi.s++; // skip second -
|
171
174
|
read_comment(&pi);
|
172
175
|
}
|
173
176
|
} else if ((TolerantEffort == options->effort) ? 0 == strncasecmp("DOCTYPE", pi.s, 7) : 0 == strncmp("DOCTYPE", pi.s, 7)) {
|
@@ -229,8 +232,7 @@ gather_content(const char *src, char *content, size_t len) {
|
|
229
232
|
return 0;
|
230
233
|
}
|
231
234
|
|
232
|
-
|
233
|
-
*/
|
235
|
+
// Entered after the "<?" sequence. Ready to read the rest.
|
234
236
|
static void
|
235
237
|
read_instruction(PInfo pi) {
|
236
238
|
char content[1024];
|
@@ -255,7 +257,7 @@ read_instruction(PInfo pi) {
|
|
255
257
|
}
|
256
258
|
next_non_white(pi);
|
257
259
|
c = *pi->s;
|
258
|
-
*end = '\0';
|
260
|
+
*end = '\0'; // terminate name
|
259
261
|
if ('?' != c) {
|
260
262
|
while ('?' != c) {
|
261
263
|
pi->last = 0;
|
@@ -275,8 +277,8 @@ read_instruction(PInfo pi) {
|
|
275
277
|
attrs_ok = 0;
|
276
278
|
break;
|
277
279
|
}
|
278
|
-
*end = '\0';
|
279
|
-
|
280
|
+
*end = '\0'; // terminate name
|
281
|
+
// read value
|
280
282
|
next_non_white(pi);
|
281
283
|
if (0 == (attr_value = read_quoted_value(pi))) {
|
282
284
|
attr_stack_cleanup(&attrs);
|
@@ -355,9 +357,8 @@ read_delimited(PInfo pi, char end) {
|
|
355
357
|
}
|
356
358
|
}
|
357
359
|
|
358
|
-
|
359
|
-
|
360
|
-
*/
|
360
|
+
// Entered after the "<!DOCTYPE" sequence plus the first character after
|
361
|
+
// that. Ready to read the rest.
|
361
362
|
static void
|
362
363
|
read_doctype(PInfo pi) {
|
363
364
|
char *docType;
|
@@ -376,8 +377,7 @@ read_doctype(PInfo pi) {
|
|
376
377
|
}
|
377
378
|
}
|
378
379
|
|
379
|
-
|
380
|
-
*/
|
380
|
+
// Entered after "<!--". Reads the comment and hands it to the add_comment callback when one is set.
|
381
381
|
static void
|
382
382
|
read_comment(PInfo pi) {
|
383
383
|
char *end;
|
@@ -406,16 +406,15 @@ read_comment(PInfo pi) {
|
|
406
406
|
break;
|
407
407
|
}
|
408
408
|
}
|
409
|
-
*end = '\0';
|
409
|
+
*end = '\0'; // in case the comment was blank
|
410
410
|
pi->s = end + 3;
|
411
411
|
if (0 != pi->pcb->add_comment) {
|
412
412
|
pi->pcb->add_comment(pi, comment);
|
413
413
|
}
|
414
414
|
}
|
415
415
|
|
416
|
-
|
417
|
-
|
418
|
-
*/
|
416
|
+
// Entered after the '<' and the first character after that. Returns stat
|
417
|
+
// code.
|
419
418
|
static char*
|
420
419
|
read_element(PInfo pi) {
|
421
420
|
struct _attrStack attrs;
|
@@ -439,10 +438,9 @@ read_element(PInfo pi) {
|
|
439
438
|
c = *pi->s;
|
440
439
|
*end = '\0';
|
441
440
|
if ('/' == c) {
|
442
|
-
|
441
|
+
// empty element, no attributes and no children
|
443
442
|
pi->s++;
|
444
443
|
if ('>' != *pi->s) {
|
445
|
-
/*printf("*** '%s' ***\n", pi->s); */
|
446
444
|
attr_stack_cleanup(&attrs);
|
447
445
|
set_error(&pi->err, "invalid format, element not closed", pi->str, pi->s);
|
448
446
|
return 0;
|
@@ -480,8 +478,8 @@ read_element(PInfo pi) {
|
|
480
478
|
pi->s++;
|
481
479
|
pi->pcb->add_element(pi, ename, attrs.head, hasChildren);
|
482
480
|
pi->pcb->end_element(pi, ename);
|
483
|
-
|
484
481
|
attr_stack_cleanup(&attrs);
|
482
|
+
|
485
483
|
return 0;
|
486
484
|
case '>':
|
487
485
|
/* has either children or a value */
|
@@ -545,6 +543,12 @@ read_element(PInfo pi) {
|
|
545
543
|
while (!done) {
|
546
544
|
start = pi->s;
|
547
545
|
next_non_white(pi);
|
546
|
+
if (OffSkip == pi->options->skip && start < pi->s && '<' == *pi->s) {
|
547
|
+
c = *pi->s;
|
548
|
+
*pi->s = '\0';
|
549
|
+
pi->pcb->add_text(pi, start, 1);
|
550
|
+
*pi->s = c;
|
551
|
+
}
|
548
552
|
c = *pi->s++;
|
549
553
|
if ('\0' == c) {
|
550
554
|
attr_stack_cleanup(&attrs);
|
@@ -1001,11 +1005,13 @@ read_coded_chars(PInfo pi, char *text) {
|
|
1001
1005
|
char *b, buf[32];
|
1002
1006
|
char *end = buf + sizeof(buf) - 1;
|
1003
1007
|
char *s;
|
1008
|
+
long blen = 0;
|
1004
1009
|
|
1005
1010
|
for (b = buf, s = pi->s; b < end; b++, s++) {
|
1006
1011
|
*b = *s;
|
1007
1012
|
if (';' == *s) {
|
1008
1013
|
*(b + 1) = '\0';
|
1014
|
+
blen = b - buf;
|
1009
1015
|
s++;
|
1010
1016
|
break;
|
1011
1017
|
}
|
@@ -1026,18 +1032,9 @@ read_coded_chars(PInfo pi, char *text) {
|
|
1026
1032
|
} else {
|
1027
1033
|
if (u <= 0x000000000000007FULL) {
|
1028
1034
|
*text++ = (char)u;
|
1029
|
-
#if HAS_PRIVATE_ENCODING
|
1030
|
-
} else if (ox_utf8_encoding == pi->options->rb_enc ||
|
1031
|
-
0 == strcasecmp(rb_str_ptr(rb_String(ox_utf8_encoding)), rb_str_ptr(rb_String(pi->options->rb_enc)))) {
|
1032
|
-
#else
|
1033
1035
|
} else if (ox_utf8_encoding == pi->options->rb_enc) {
|
1034
|
-
#endif
|
1035
1036
|
text = ox_ucs_to_utf8_chars(text, u);
|
1036
|
-
#if HAS_PRIVATE_ENCODING
|
1037
|
-
} else if (Qnil == pi->options->rb_enc) {
|
1038
|
-
#else
|
1039
1037
|
} else if (0 == pi->options->rb_enc) {
|
1040
|
-
#endif
|
1041
1038
|
pi->options->rb_enc = ox_utf8_encoding;
|
1042
1039
|
text = ox_ucs_to_utf8_chars(text, u);
|
1043
1040
|
} else if (TolerantEffort == pi->options->effort) {
|
@@ -1048,30 +1045,20 @@ read_coded_chars(PInfo pi, char *text) {
|
|
1048
1045
|
} else {
|
1049
1046
|
/*set_error(&pi->err, "Invalid encoding, need UTF-8 or UTF-16 encoding to parse &#nnnn; character sequences.", pi->str, pi->s); */
|
1050
1047
|
set_error(&pi->err, "Invalid encoding, need UTF-8 encoding to parse &#nnnn; character sequences.", pi->str, pi->s);
|
1051
|
-
return
|
1048
|
+
return NULL;
|
1052
1049
|
}
|
1053
1050
|
pi->s = s;
|
1054
1051
|
}
|
1055
|
-
} else if (0 == strcasecmp(buf, "nbsp;")) {
|
1056
|
-
pi->s = s;
|
1057
|
-
*text++ = ' ';
|
1058
|
-
} else if (0 == strcasecmp(buf, "lt;")) {
|
1059
|
-
pi->s = s;
|
1060
|
-
*text++ = '<';
|
1061
|
-
} else if (0 == strcasecmp(buf, "gt;")) {
|
1062
|
-
pi->s = s;
|
1063
|
-
*text++ = '>';
|
1064
|
-
} else if (0 == strcasecmp(buf, "amp;")) {
|
1065
|
-
pi->s = s;
|
1066
|
-
*text++ = '&';
|
1067
|
-
} else if (0 == strcasecmp(buf, "quot;")) {
|
1068
|
-
pi->s = s;
|
1069
|
-
*text++ = '"';
|
1070
|
-
} else if (0 == strcasecmp(buf, "apos;")) {
|
1071
|
-
pi->s = s;
|
1072
|
-
*text++ = '\'';
|
1073
1052
|
} else {
|
1074
|
-
*
|
1053
|
+
char *t2;
|
1054
|
+
|
1055
|
+
buf[blen] = '\0';
|
1056
|
+
if (NULL == (t2 = ox_entity_lookup(text, buf))) {
|
1057
|
+
*text++ = '&';
|
1058
|
+
} else {
|
1059
|
+
text = t2;
|
1060
|
+
pi->s = s;
|
1061
|
+
}
|
1075
1062
|
}
|
1076
1063
|
return text;
|
1077
1064
|
}
|
@@ -1113,19 +1100,10 @@ collapse_special(PInfo pi, char *str) {
|
|
1113
1100
|
}
|
1114
1101
|
if (u <= 0x000000000000007FULL) {
|
1115
1102
|
*b++ = (char)u;
|
1116
|
-
#if HAS_PRIVATE_ENCODING
|
1117
|
-
} else if (ox_utf8_encoding == pi->options->rb_enc ||
|
1118
|
-
0 == strcasecmp(rb_str_ptr(rb_String(ox_utf8_encoding)), rb_str_ptr(rb_String(pi->options->rb_enc)))) {
|
1119
|
-
#else
|
1120
1103
|
} else if (ox_utf8_encoding == pi->options->rb_enc) {
|
1121
|
-
#endif
|
1122
1104
|
b = ox_ucs_to_utf8_chars(b, u);
|
1123
1105
|
/* TBD support UTF-16 */
|
1124
|
-
#if HAS_PRIVATE_ENCODING
|
1125
|
-
} else if (Qnil == pi->options->rb_enc) {
|
1126
|
-
#else
|
1127
1106
|
} else if (0 == pi->options->rb_enc) {
|
1128
|
-
#endif
|
1129
1107
|
pi->options->rb_enc = ox_utf8_encoding;
|
1130
1108
|
b = ox_ucs_to_utf8_chars(b, u);
|
1131
1109
|
} else {
|
@@ -1154,16 +1132,30 @@ collapse_special(PInfo pi, char *str) {
|
|
1154
1132
|
*b++ = '&';
|
1155
1133
|
continue;
|
1156
1134
|
} else {
|
1157
|
-
|
1135
|
+
char key[16];
|
1136
|
+
char *k = key;
|
1137
|
+
char *kend = key + sizeof(key) - 1;
|
1138
|
+
|
1139
|
+
*k++ = *s;
|
1158
1140
|
while (';' != *s++) {
|
1159
1141
|
if ('\0' == *s) {
|
1160
1142
|
set_error(&pi->err, "Invalid format, special character does not end with a semicolon", pi->str, pi->s);
|
1161
1143
|
return EDOM;
|
1162
1144
|
}
|
1145
|
+
if (kend <= k) {
|
1146
|
+
k = key;
|
1147
|
+
break;
|
1148
|
+
}
|
1149
|
+
*k++ = *s;
|
1163
1150
|
}
|
1164
|
-
|
1165
|
-
|
1166
|
-
|
1151
|
+
k--;
|
1152
|
+
*k = '\0';
|
1153
|
+
if ('\0' == *key || NULL == (b = ox_entity_lookup(b, key))) {
|
1154
|
+
set_error(&pi->err, "Invalid format, invalid special character sequence", pi->str, pi->s);
|
1155
|
+
c = '?';
|
1156
|
+
return 0;
|
1157
|
+
}
|
1158
|
+
continue;
|
1167
1159
|
}
|
1168
1160
|
*b++ = (char)c;
|
1169
1161
|
}
|
data/ext/ox/sax.c
CHANGED
@@ -9,13 +9,16 @@
|
|
9
9
|
#include <stdio.h>
|
10
10
|
#include <strings.h>
|
11
11
|
#include <sys/types.h>
|
12
|
-
#if
|
12
|
+
#if HAVE_SYS_UIO_H
|
13
13
|
#include <sys/uio.h>
|
14
14
|
#endif
|
15
15
|
#include <unistd.h>
|
16
16
|
#include <time.h>
|
17
17
|
|
18
18
|
#include "ruby.h"
|
19
|
+
#if HAVE_RB_ENC_ASSOCIATE
|
20
|
+
#include "ruby/encoding.h"
|
21
|
+
#endif
|
19
22
|
#include "ox.h"
|
20
23
|
#include "sax.h"
|
21
24
|
#include "sax_stack.h"
|
@@ -55,7 +58,7 @@ static char read_attrs(SaxDrive dr, char c, char termc, char term2, int is_xml,
|
|
55
58
|
static char read_name_token(SaxDrive dr);
|
56
59
|
static char read_quoted_value(SaxDrive dr);
|
57
60
|
|
58
|
-
static void end_element_cb(SaxDrive dr, VALUE name,
|
61
|
+
static void end_element_cb(SaxDrive dr, VALUE name, long pos, long line, long col, Hint h);
|
59
62
|
|
60
63
|
static void hint_clear_empty(SaxDrive dr);
|
61
64
|
static Nv hint_try_close(SaxDrive dr, const char *name);
|
@@ -68,9 +71,9 @@ static VALUE protect_parse(VALUE drp) {
|
|
68
71
|
return Qnil;
|
69
72
|
}
|
70
73
|
|
71
|
-
#if
|
74
|
+
#if HAVE_RB_ENC_ASSOCIATE
|
72
75
|
static int
|
73
|
-
|
76
|
+
str_is_ascii(const char *s) {
|
74
77
|
for (; '\0' != *s; s++) {
|
75
78
|
if (*s < ' ' || '~' < *s) {
|
76
79
|
return 0;
|
@@ -87,8 +90,8 @@ str2sym(SaxDrive dr, const char *str, const char **strp) {
|
|
87
90
|
|
88
91
|
if (dr->options.symbolize) {
|
89
92
|
if (Qundef == (sym = ox_cache_get(ox_symbol_cache, str, &slot, strp))) {
|
90
|
-
#if
|
91
|
-
if (0 != dr->encoding && !
|
93
|
+
#if HAVE_RB_ENC_ASSOCIATE
|
94
|
+
if (0 != dr->encoding && !str_is_ascii(str)) {
|
92
95
|
VALUE rstr = rb_str_new2(str);
|
93
96
|
|
94
97
|
// TBD if sym can be pinned down then use this all the time
|
@@ -99,20 +102,6 @@ str2sym(SaxDrive dr, const char *str, const char **strp) {
|
|
99
102
|
sym = ID2SYM(rb_intern(str));
|
100
103
|
*slot = sym;
|
101
104
|
}
|
102
|
-
#elif HAS_PRIVATE_ENCODING
|
103
|
-
if (Qnil != dr->encoding && !strIsAscii(str)) {
|
104
|
-
VALUE rstr = rb_str_new2(str);
|
105
|
-
|
106
|
-
rb_funcall(rstr, ox_force_encoding_id, 1, dr->encoding);
|
107
|
-
sym = rb_funcall(rstr, ox_to_sym_id, 0);
|
108
|
-
// Needed for Ruby 2.2 to get around the GC of symbols created
|
109
|
-
// with to_sym which is needed for encoded symbols.
|
110
|
-
rb_ary_push(ox_sym_bank, sym);
|
111
|
-
*slot = Qundef;
|
112
|
-
} else {
|
113
|
-
sym = ID2SYM(rb_intern(str));
|
114
|
-
*slot = sym;
|
115
|
-
}
|
116
105
|
#else
|
117
106
|
sym = ID2SYM(rb_intern(str));
|
118
107
|
*slot = sym;
|
@@ -120,14 +109,10 @@ str2sym(SaxDrive dr, const char *str, const char **strp) {
|
|
120
109
|
}
|
121
110
|
} else {
|
122
111
|
sym = rb_str_new2(str);
|
123
|
-
#if
|
112
|
+
#if HAVE_RB_ENC_ASSOCIATE
|
124
113
|
if (0 != dr->encoding) {
|
125
114
|
rb_enc_associate(sym, dr->encoding);
|
126
115
|
}
|
127
|
-
#elif HAS_PRIVATE_ENCODING
|
128
|
-
if (Qnil != dr->encoding) {
|
129
|
-
rb_funcall(sym, ox_force_encoding_id, 1, dr->encoding);
|
130
|
-
}
|
131
116
|
#endif
|
132
117
|
if (0 != strp) {
|
133
118
|
*strp = StringValuePtr(sym);
|
@@ -182,7 +167,7 @@ sax_drive_init(SaxDrive dr, VALUE handler, VALUE io, SaxOptions options) {
|
|
182
167
|
dr->blocked = 0;
|
183
168
|
dr->abort = false;
|
184
169
|
has_init(&dr->has, handler);
|
185
|
-
#if
|
170
|
+
#if HAVE_RB_ENC_FIND
|
186
171
|
if ('\0' == *ox_default_options.encoding) {
|
187
172
|
VALUE encoding;
|
188
173
|
|
@@ -196,18 +181,6 @@ sax_drive_init(SaxDrive dr, VALUE handler, VALUE io, SaxOptions options) {
|
|
196
181
|
} else {
|
197
182
|
dr->encoding = rb_enc_find(ox_default_options.encoding);
|
198
183
|
}
|
199
|
-
#elif HAS_PRIVATE_ENCODING
|
200
|
-
if ('\0' == *ox_default_options.encoding) {
|
201
|
-
VALUE encoding;
|
202
|
-
|
203
|
-
if (rb_respond_to(io, ox_external_encoding_id) && Qnil != (encoding = rb_funcall(io, ox_external_encoding_id, 0))) {
|
204
|
-
dr->encoding = encoding;
|
205
|
-
} else {
|
206
|
-
dr->encoding = Qnil;
|
207
|
-
}
|
208
|
-
} else {
|
209
|
-
dr->encoding = rb_str_new2(ox_default_options.encoding);
|
210
|
-
}
|
211
184
|
#else
|
212
185
|
dr->encoding = 0;
|
213
186
|
#endif
|
@@ -221,7 +194,7 @@ ox_sax_drive_cleanup(SaxDrive dr) {
|
|
221
194
|
}
|
222
195
|
|
223
196
|
static void
|
224
|
-
ox_sax_drive_error_at(SaxDrive dr, const char *msg,
|
197
|
+
ox_sax_drive_error_at(SaxDrive dr, const char *msg, off_t pos, off_t line, off_t col) {
|
225
198
|
if (dr->has.error) {
|
226
199
|
VALUE args[3];
|
227
200
|
|
@@ -255,9 +228,7 @@ skipBOM(SaxDrive dr) {
|
|
255
228
|
|
256
229
|
if (0xEF == (uint8_t)c) { /* only UTF8 is supported */
|
257
230
|
if (0xBB == (uint8_t)buf_get(&dr->buf) && 0xBF == (uint8_t)buf_get(&dr->buf)) {
|
258
|
-
#if
|
259
|
-
dr->encoding = ox_utf8_encoding;
|
260
|
-
#elif HAS_PRIVATE_ENCODING
|
231
|
+
#if HAVE_RB_ENC_FIND
|
261
232
|
dr->encoding = ox_utf8_encoding;
|
262
233
|
#else
|
263
234
|
dr->encoding = UTF8_STR;
|
@@ -301,11 +272,11 @@ parse(SaxDrive dr) {
|
|
301
272
|
}
|
302
273
|
c = read_comment(dr);
|
303
274
|
} else {
|
304
|
-
int
|
305
|
-
int
|
306
|
-
|
307
|
-
|
308
|
-
|
275
|
+
int i;
|
276
|
+
int spaced = 0;
|
277
|
+
off_t pos = dr->buf.pos + 1;
|
278
|
+
off_t line = dr->buf.line;
|
279
|
+
off_t col = dr->buf.col + 1;
|
309
280
|
|
310
281
|
if (is_white(c)) {
|
311
282
|
spaced = 1;
|
@@ -359,19 +330,15 @@ parse(SaxDrive dr) {
|
|
359
330
|
parent = stack_peek(&dr->stack);
|
360
331
|
if (0 != parent && 0 == parent->childCnt && dr->has.text && !dr->blocked) {
|
361
332
|
VALUE args[1];
|
362
|
-
|
363
|
-
|
364
|
-
|
333
|
+
off_t pos = dr->buf.pos;
|
334
|
+
off_t line = dr->buf.line;
|
335
|
+
off_t col = dr->buf.col - 1;
|
365
336
|
|
366
337
|
args[0] = rb_str_new2("");
|
367
|
-
#if
|
338
|
+
#if HAVE_RB_ENC_ASSOCIATE
|
368
339
|
if (0 != dr->encoding) {
|
369
340
|
rb_enc_associate(args[0], dr->encoding);
|
370
341
|
}
|
371
|
-
#elif HAS_PRIVATE_ENCODING
|
372
|
-
if (Qnil != dr->encoding) {
|
373
|
-
rb_funcall(args[0], ox_force_encoding_id, 1, dr->encoding);
|
374
|
-
}
|
375
342
|
#endif
|
376
343
|
if (dr->has.pos) {
|
377
344
|
rb_ivar_set(dr->handler, ox_at_pos_id, LONG2NUM(pos));
|
@@ -479,9 +446,9 @@ read_instruction(SaxDrive dr) {
|
|
479
446
|
int coff;
|
480
447
|
VALUE target = Qnil;
|
481
448
|
int is_xml;
|
482
|
-
|
483
|
-
|
484
|
-
|
449
|
+
off_t pos = dr->buf.pos - 1;
|
450
|
+
off_t line = dr->buf.line;
|
451
|
+
off_t col = dr->buf.col - 1;
|
485
452
|
|
486
453
|
buf_protect(&dr->buf);
|
487
454
|
if ('\0' == (c = read_name_token(dr))) {
|
@@ -511,7 +478,7 @@ read_instruction(SaxDrive dr) {
|
|
511
478
|
line = dr->buf.line;
|
512
479
|
col = dr->buf.col;
|
513
480
|
read_content(dr, content, sizeof(content) - 1);
|
514
|
-
coff = dr->buf.tail - dr->buf.head;
|
481
|
+
coff = (int)(dr->buf.tail - dr->buf.head);
|
515
482
|
buf_reset(&dr->buf);
|
516
483
|
dr->err = 0;
|
517
484
|
c = read_attrs(dr, c, '?', '?', is_xml, 1, NULL);
|
@@ -523,17 +490,13 @@ read_instruction(SaxDrive dr) {
|
|
523
490
|
VALUE args[1];
|
524
491
|
|
525
492
|
if (dr->options.convert_special) {
|
526
|
-
ox_sax_collapse_special(dr, content, pos, line, col);
|
493
|
+
ox_sax_collapse_special(dr, content, (int)pos, (int)line, (int)col);
|
527
494
|
}
|
528
495
|
args[0] = rb_str_new2(content);
|
529
|
-
#if
|
496
|
+
#if HAVE_RB_ENC_ASSOCIATE
|
530
497
|
if (0 != dr->encoding) {
|
531
498
|
rb_enc_associate(args[0], dr->encoding);
|
532
499
|
}
|
533
|
-
#elif HAS_PRIVATE_ENCODING
|
534
|
-
if (Qnil != dr->encoding) {
|
535
|
-
rb_funcall(args[0], ox_force_encoding_id, 1, dr->encoding);
|
536
|
-
}
|
537
500
|
#endif
|
538
501
|
if (dr->has.line) {
|
539
502
|
rb_ivar_set(dr->handler, ox_at_line_id, LONG2NUM(line));
|
@@ -627,9 +590,9 @@ read_delimited(SaxDrive dr, char end) {
|
|
627
590
|
*/
|
628
591
|
static char
|
629
592
|
read_doctype(SaxDrive dr) {
|
630
|
-
|
631
|
-
|
632
|
-
|
593
|
+
long pos = (long)(dr->buf.pos - 9);
|
594
|
+
long line = (long)(dr->buf.line);
|
595
|
+
long col = (long)(dr->buf.col - 9);
|
633
596
|
char *s;
|
634
597
|
Nv parent = stack_peek(&dr->stack);
|
635
598
|
|
@@ -673,9 +636,9 @@ read_cdata(SaxDrive dr) {
|
|
673
636
|
char c;
|
674
637
|
char zero = '\0';
|
675
638
|
int end = 0;
|
676
|
-
|
677
|
-
|
678
|
-
|
639
|
+
long pos = (long)(dr->buf.pos - 9);
|
640
|
+
long line = (long)(dr->buf.line);
|
641
|
+
long col = (long)(dr->buf.col - 9);
|
679
642
|
struct _checkPt cp = CHECK_PT_INIT;
|
680
643
|
Nv parent = stack_peek(&dr->stack);
|
681
644
|
|
@@ -732,14 +695,10 @@ read_cdata(SaxDrive dr) {
|
|
732
695
|
VALUE args[1];
|
733
696
|
|
734
697
|
args[0] = rb_str_new2(dr->buf.str);
|
735
|
-
#if
|
698
|
+
#if HAVE_RB_ENC_ASSOCIATE
|
736
699
|
if (0 != dr->encoding) {
|
737
700
|
rb_enc_associate(args[0], dr->encoding);
|
738
701
|
}
|
739
|
-
#elif HAS_PRIVATE_ENCODING
|
740
|
-
if (Qnil != dr->encoding) {
|
741
|
-
rb_funcall(args[0], ox_force_encoding_id, 1, dr->encoding);
|
742
|
-
}
|
743
702
|
#endif
|
744
703
|
if (dr->has.pos) {
|
745
704
|
rb_ivar_set(dr->handler, ox_at_pos_id, LONG2NUM(pos));
|
@@ -768,9 +727,9 @@ read_comment(SaxDrive dr) {
|
|
768
727
|
char c;
|
769
728
|
char zero = '\0';
|
770
729
|
int end = 0;
|
771
|
-
|
772
|
-
|
773
|
-
|
730
|
+
long pos = (long)(dr->buf.pos - 4);
|
731
|
+
long line = (long)(dr->buf.line);
|
732
|
+
long col = (long)(dr->buf.col - 4);
|
774
733
|
struct _checkPt cp = CHECK_PT_INIT;
|
775
734
|
|
776
735
|
buf_backup(&dr->buf); /* back up to the start in case the comment is empty */
|
@@ -826,14 +785,10 @@ read_comment(SaxDrive dr) {
|
|
826
785
|
(NULL != h && (ActiveOverlay == h->overlay || ActiveOverlay == h->overlay))) {
|
827
786
|
|
828
787
|
args[0] = rb_str_new2(dr->buf.str);
|
829
|
-
#if
|
788
|
+
#if HAVE_RB_ENC_ASSOCIATE
|
830
789
|
if (0 != dr->encoding) {
|
831
790
|
rb_enc_associate(args[0], dr->encoding);
|
832
791
|
}
|
833
|
-
#elif HAS_PRIVATE_ENCODING
|
834
|
-
if (Qnil != dr->encoding) {
|
835
|
-
rb_funcall(args[0], ox_force_encoding_id, 1, dr->encoding);
|
836
|
-
}
|
837
792
|
#endif
|
838
793
|
if (dr->has.pos) {
|
839
794
|
rb_ivar_set(dr->handler, ox_at_pos_id, LONG2NUM(pos));
|
@@ -864,9 +819,9 @@ read_element_start(SaxDrive dr) {
|
|
864
819
|
volatile VALUE name = Qnil;
|
865
820
|
char c;
|
866
821
|
int closed;
|
867
|
-
|
868
|
-
|
869
|
-
|
822
|
+
long pos = (long)(dr->buf.pos);
|
823
|
+
long line = (long)(dr->buf.line);
|
824
|
+
long col = (long)(dr->buf.col);
|
870
825
|
Hint h = NULL;
|
871
826
|
int stackless = 0;
|
872
827
|
Nv parent = stack_peek(&dr->stack);
|
@@ -1020,9 +975,9 @@ static char
|
|
1020
975
|
read_element_end(SaxDrive dr) {
|
1021
976
|
VALUE name = Qnil;
|
1022
977
|
char c;
|
1023
|
-
|
1024
|
-
|
1025
|
-
|
978
|
+
long pos = (long)(dr->buf.pos - 1);
|
979
|
+
long line = (long)(dr->buf.line);
|
980
|
+
long col = (long)(dr->buf.col - 1);
|
1026
981
|
Nv nv;
|
1027
982
|
Hint h = NULL;
|
1028
983
|
|
@@ -1118,9 +1073,9 @@ static char
|
|
1118
1073
|
read_text(SaxDrive dr) {
|
1119
1074
|
VALUE args[1];
|
1120
1075
|
char c;
|
1121
|
-
|
1122
|
-
|
1123
|
-
|
1076
|
+
long pos = (long)(dr->buf.pos);
|
1077
|
+
long line = (long)(dr->buf.line);
|
1078
|
+
long col = (long)(dr->buf.col - 1);
|
1124
1079
|
Nv parent = stack_peek(&dr->stack);
|
1125
1080
|
int allWhite = 1;
|
1126
1081
|
|
@@ -1158,14 +1113,10 @@ read_text(SaxDrive dr) {
|
|
1158
1113
|
((NoSkip == dr->options.skip && !isEnd) ||
|
1159
1114
|
(OffSkip == dr->options.skip))) {
|
1160
1115
|
args[0] = rb_str_new2(dr->buf.str);
|
1161
|
-
#if
|
1116
|
+
#if HAVE_RB_ENC_ASSOCIATE
|
1162
1117
|
if (0 != dr->encoding) {
|
1163
1118
|
rb_enc_associate(args[0], dr->encoding);
|
1164
1119
|
}
|
1165
|
-
#elif HAS_PRIVATE_ENCODING
|
1166
|
-
if (Qnil != dr->encoding) {
|
1167
|
-
rb_funcall(args[0], ox_force_encoding_id, 1, dr->encoding);
|
1168
|
-
}
|
1169
1120
|
#endif
|
1170
1121
|
if (dr->has.pos) {
|
1171
1122
|
rb_ivar_set(dr->handler, ox_at_pos_id, LONG2NUM(pos));
|
@@ -1213,14 +1164,10 @@ read_text(SaxDrive dr) {
|
|
1213
1164
|
break;
|
1214
1165
|
}
|
1215
1166
|
args[0] = rb_str_new2(dr->buf.str);
|
1216
|
-
#if
|
1167
|
+
#if HAVE_RB_ENC_ASSOCIATE
|
1217
1168
|
if (0 != dr->encoding) {
|
1218
1169
|
rb_enc_associate(args[0], dr->encoding);
|
1219
1170
|
}
|
1220
|
-
#elif HAS_PRIVATE_ENCODING
|
1221
|
-
if (Qnil != dr->encoding) {
|
1222
|
-
rb_funcall(args[0], ox_force_encoding_id, 1, dr->encoding);
|
1223
|
-
}
|
1224
1171
|
#endif
|
1225
1172
|
if (dr->has.pos) {
|
1226
1173
|
rb_ivar_set(dr->handler, ox_at_pos_id, LONG2NUM(pos));
|
@@ -1266,9 +1213,9 @@ static char
|
|
1266
1213
|
read_jump(SaxDrive dr, const char *pat) {
|
1267
1214
|
VALUE args[1];
|
1268
1215
|
char c;
|
1269
|
-
|
1270
|
-
|
1271
|
-
|
1216
|
+
long pos = (long)(dr->buf.pos);
|
1217
|
+
long line = (long)(dr->buf.line);
|
1218
|
+
long col = (long)(dr->buf.col - 1);
|
1272
1219
|
Nv parent = stack_peek(&dr->stack);
|
1273
1220
|
|
1274
1221
|
buf_protect(&dr->buf);
|
@@ -1299,14 +1246,10 @@ read_jump(SaxDrive dr, const char *pat) {
|
|
1299
1246
|
// TBD check parent overlay
|
1300
1247
|
if (dr->has.text && !dr->blocked) {
|
1301
1248
|
args[0] = rb_str_new2(dr->buf.str);
|
1302
|
-
#if
|
1249
|
+
#if HAVE_RB_ENC_ASSOCIATE
|
1303
1250
|
if (0 != dr->encoding) {
|
1304
1251
|
rb_enc_associate(args[0], dr->encoding);
|
1305
1252
|
}
|
1306
|
-
#elif HAS_PRIVATE_ENCODING
|
1307
|
-
if (Qnil != dr->encoding) {
|
1308
|
-
rb_funcall(args[0], ox_force_encoding_id, 1, dr->encoding);
|
1309
|
-
}
|
1310
1253
|
#endif
|
1311
1254
|
if (dr->has.pos) {
|
1312
1255
|
rb_ivar_set(dr->handler, ox_at_pos_id, LONG2NUM(pos));
|
@@ -1330,9 +1273,9 @@ static char
|
|
1330
1273
|
read_attrs(SaxDrive dr, char c, char termc, char term2, int is_xml, int eq_req, Hint h) {
|
1331
1274
|
VALUE name = Qnil;
|
1332
1275
|
int is_encoding = 0;
|
1333
|
-
|
1334
|
-
|
1335
|
-
|
1276
|
+
off_t pos;
|
1277
|
+
off_t line;
|
1278
|
+
off_t col;
|
1336
1279
|
char *attr_value;
|
1337
1280
|
|
1338
1281
|
// already protected by caller
|
@@ -1377,10 +1320,8 @@ read_attrs(SaxDrive dr, char c, char termc, char term2, int is_xml, int eq_req,
|
|
1377
1320
|
c = read_quoted_value(dr);
|
1378
1321
|
attr_value = dr->buf.str;
|
1379
1322
|
if (is_encoding) {
|
1380
|
-
#if
|
1323
|
+
#if HAVE_RB_ENC_FIND
|
1381
1324
|
dr->encoding = rb_enc_find(dr->buf.str);
|
1382
|
-
#elif HAS_PRIVATE_ENCODING
|
1383
|
-
dr->encoding = rb_str_new2(dr->buf.str);
|
1384
1325
|
#else
|
1385
1326
|
dr->encoding = dr->buf.str;
|
1386
1327
|
#endif
|
@@ -1411,14 +1352,10 @@ read_attrs(SaxDrive dr, char c, char termc, char term2, int is_xml, int eq_req,
|
|
1411
1352
|
ox_sax_collapse_special(dr, dr->buf.str, pos, line, col);
|
1412
1353
|
}
|
1413
1354
|
args[1] = rb_str_new2(attr_value);
|
1414
|
-
#if
|
1355
|
+
#if HAVE_RB_ENC_ASSOCIATE
|
1415
1356
|
if (0 != dr->encoding) {
|
1416
1357
|
rb_enc_associate(args[1], dr->encoding);
|
1417
1358
|
}
|
1418
|
-
#elif HAS_PRIVATE_ENCODING
|
1419
|
-
if (Qnil != dr->encoding) {
|
1420
|
-
rb_funcall(args[1], ox_force_encoding_id, 1, dr->encoding);
|
1421
|
-
}
|
1422
1359
|
#endif
|
1423
1360
|
if (dr->has.pos) {
|
1424
1361
|
rb_ivar_set(dr->handler, ox_at_pos_id, LONG2NUM(pos));
|
@@ -1580,7 +1517,7 @@ read_10_uint64(char *b, uint64_t *up) {
|
|
1580
1517
|
}
|
1581
1518
|
|
1582
1519
|
int
|
1583
|
-
ox_sax_collapse_special(SaxDrive dr, char *str,
|
1520
|
+
ox_sax_collapse_special(SaxDrive dr, char *str, long pos, long line, long col) {
|
1584
1521
|
char *s = str;
|
1585
1522
|
char *b = str;
|
1586
1523
|
|
@@ -1614,19 +1551,12 @@ ox_sax_collapse_special(SaxDrive dr, char *str, int pos, int line, int col) {
|
|
1614
1551
|
}
|
1615
1552
|
if (u <= 0x000000000000007FULL) {
|
1616
1553
|
*b++ = (char)u;
|
1617
|
-
#if
|
1554
|
+
#if HAVE_RB_ENC_FIND
|
1618
1555
|
} else if (ox_utf8_encoding == dr->encoding) {
|
1619
1556
|
b = ox_ucs_to_utf8_chars(b, u);
|
1620
1557
|
} else if (0 == dr->encoding) {
|
1621
1558
|
dr->encoding = ox_utf8_encoding;
|
1622
1559
|
b = ox_ucs_to_utf8_chars(b, u);
|
1623
|
-
#elif HAS_PRIVATE_ENCODING
|
1624
|
-
} else if (ox_utf8_encoding == dr->encoding ||
|
1625
|
-
0 == strcasecmp(rb_str_ptr(rb_String(ox_utf8_encoding)), rb_str_ptr(rb_String(dr->encoding)))) {
|
1626
|
-
b = ox_ucs_to_utf8_chars(b, u);
|
1627
|
-
} else if (Qnil == dr->encoding) {
|
1628
|
-
dr->encoding = ox_utf8_encoding;
|
1629
|
-
b = ox_ucs_to_utf8_chars(b, u);
|
1630
1560
|
#else
|
1631
1561
|
} else if (0 == dr->encoding) {
|
1632
1562
|
dr->encoding = UTF8_STR;
|
@@ -1668,8 +1598,28 @@ ox_sax_collapse_special(SaxDrive dr, char *str, int pos, int line, int col) {
|
|
1668
1598
|
c = '\'';
|
1669
1599
|
s += 5;
|
1670
1600
|
} else {
|
1671
|
-
|
1672
|
-
|
1601
|
+
char key[16];
|
1602
|
+
char *k = key;
|
1603
|
+
char *kend = key + sizeof(key) - 1;
|
1604
|
+
char *bn;
|
1605
|
+
char *s2 = s;
|
1606
|
+
|
1607
|
+
for (; ';' != *s2 && '\0' != *s2; s2++, k++) {
|
1608
|
+
if (kend <= k) {
|
1609
|
+
k = key;
|
1610
|
+
break;
|
1611
|
+
}
|
1612
|
+
*k = *s2;
|
1613
|
+
}
|
1614
|
+
*k = '\0';
|
1615
|
+
if ('\0' == *key || NULL == (bn = ox_entity_lookup(b, key))) {
|
1616
|
+
ox_sax_drive_error_at(dr, INVALID_FORMAT "Invalid special character sequence", pos, line, col);
|
1617
|
+
c = '&';
|
1618
|
+
} else {
|
1619
|
+
b = bn;
|
1620
|
+
s = s2 + 1;
|
1621
|
+
continue;
|
1622
|
+
}
|
1673
1623
|
}
|
1674
1624
|
*b++ = (char)c;
|
1675
1625
|
col++;
|
@@ -1731,7 +1681,7 @@ hint_try_close(SaxDrive dr, const char *name) {
|
|
1731
1681
|
}
|
1732
1682
|
|
1733
1683
|
static void
|
1734
|
-
end_element_cb(SaxDrive dr, VALUE name,
|
1684
|
+
end_element_cb(SaxDrive dr, VALUE name, long pos, long line, long col, Hint h) {
|
1735
1685
|
if (dr->has.end_element && 0 >= dr->blocked && (NULL == h || ActiveOverlay == h->overlay || NestOverlay == h->overlay)) {
|
1736
1686
|
if (dr->has.pos) {
|
1737
1687
|
rb_ivar_set(dr->handler, ox_at_pos_id, LONG2NUM(pos));
|