ox 2.14.6 → 2.14.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +26 -0
- data/ext/ox/builder.c +0 -4
- data/ext/ox/cache.c +309 -131
- data/ext/ox/cache.h +10 -10
- data/ext/ox/dump.c +2 -2
- data/ext/ox/extconf.rb +4 -2
- data/ext/ox/gen_load.c +5 -73
- data/ext/ox/hash_load.c +0 -4
- data/ext/ox/intern.c +153 -0
- data/ext/ox/intern.h +25 -0
- data/ext/ox/obj_load.c +14 -86
- data/ext/ox/ox.c +1015 -935
- data/ext/ox/ox.h +186 -210
- data/ext/ox/parse.c +72 -31
- data/ext/ox/sax.c +1100 -1276
- data/ext/ox/sax.h +45 -31
- data/ext/ox/sax_as.c +3 -5
- data/ext/ox/sax_buf.c +7 -16
- data/ext/ox/slotcache.c +158 -0
- data/ext/ox/slotcache.h +19 -0
- data/lib/ox/version.rb +1 -1
- metadata +7 -4
- data/ext/ox/sax_has.h +0 -53
data/ext/ox/sax.c
CHANGED
@@ -4,66 +4,63 @@
|
|
4
4
|
*/
|
5
5
|
|
6
6
|
#include <ctype.h>
|
7
|
-
#include <stdlib.h>
|
8
7
|
#include <errno.h>
|
9
8
|
#include <stdio.h>
|
9
|
+
#include <stdlib.h>
|
10
10
|
#include <strings.h>
|
11
11
|
#include <sys/types.h>
|
12
12
|
#if HAVE_SYS_UIO_H
|
13
13
|
#include <sys/uio.h>
|
14
14
|
#endif
|
15
|
-
#include <unistd.h>
|
16
15
|
#include <time.h>
|
16
|
+
#include <unistd.h>
|
17
17
|
|
18
|
+
#include "intern.h"
|
19
|
+
#include "ox.h"
|
18
20
|
#include "ruby.h"
|
19
|
-
#if HAVE_RB_ENC_ASSOCIATE
|
20
21
|
#include "ruby/encoding.h"
|
21
|
-
#endif
|
22
|
-
#include "ox.h"
|
23
22
|
#include "sax.h"
|
24
|
-
#include "sax_stack.h"
|
25
23
|
#include "sax_buf.h"
|
24
|
+
#include "sax_stack.h"
|
26
25
|
#include "special.h"
|
27
26
|
|
28
|
-
#define NAME_MISMATCH
|
27
|
+
#define NAME_MISMATCH 1
|
29
28
|
|
30
|
-
#define START_STATE
|
31
|
-
#define BODY_STATE
|
32
|
-
#define AFTER_STATE
|
29
|
+
#define START_STATE 1
|
30
|
+
#define BODY_STATE 2
|
31
|
+
#define AFTER_STATE 3
|
33
32
|
|
34
33
|
// error prefixes
|
35
|
-
#define BAD_BOM
|
36
|
-
#define NO_TERM
|
37
|
-
#define INVALID_FORMAT
|
38
|
-
#define CASE_ERROR
|
39
|
-
#define OUT_OF_ORDER
|
40
|
-
#define WRONG_CHAR
|
41
|
-
#define EL_MISMATCH
|
42
|
-
#define INV_ELEMENT
|
43
|
-
|
44
|
-
#define UTF8_STR
|
45
|
-
|
46
|
-
static void
|
47
|
-
static void
|
34
|
+
#define BAD_BOM "Bad BOM: "
|
35
|
+
#define NO_TERM "Not Terminated: "
|
36
|
+
#define INVALID_FORMAT "Invalid Format: "
|
37
|
+
#define CASE_ERROR "Case Error: "
|
38
|
+
#define OUT_OF_ORDER "Out of Order: "
|
39
|
+
#define WRONG_CHAR "Unexpected Character: "
|
40
|
+
#define EL_MISMATCH "Start End Mismatch: "
|
41
|
+
#define INV_ELEMENT "Invalid Element: "
|
42
|
+
|
43
|
+
#define UTF8_STR "UTF-8"
|
44
|
+
|
45
|
+
static void sax_drive_init(SaxDrive dr, VALUE handler, VALUE io, SaxOptions options);
|
46
|
+
static void parse(SaxDrive dr);
|
48
47
|
// All read functions should return the next character after the 'thing' that was read and leave dr->cur one after that.
|
49
|
-
static char
|
50
|
-
static char
|
51
|
-
static char
|
52
|
-
static char
|
53
|
-
static char
|
54
|
-
static char
|
55
|
-
static char
|
56
|
-
static char
|
57
|
-
static char
|
58
|
-
static char
|
59
|
-
static char
|
60
|
-
|
61
|
-
static void
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
VALUE ox_sax_value_class = Qnil;
|
48
|
+
static char read_instruction(SaxDrive dr);
|
49
|
+
static char read_doctype(SaxDrive dr);
|
50
|
+
static char read_cdata(SaxDrive dr);
|
51
|
+
static char read_comment(SaxDrive dr);
|
52
|
+
static char read_element_start(SaxDrive dr);
|
53
|
+
static char read_element_end(SaxDrive dr);
|
54
|
+
static char read_text(SaxDrive dr);
|
55
|
+
static char read_jump(SaxDrive dr, const char *pat);
|
56
|
+
static char read_attrs(SaxDrive dr, char c, char termc, char term2, int is_xml, int eq_req, Hint h);
|
57
|
+
static char read_name_token(SaxDrive dr);
|
58
|
+
static char read_quoted_value(SaxDrive dr);
|
59
|
+
|
60
|
+
static void hint_clear_empty(SaxDrive dr);
|
61
|
+
static Nv hint_try_close(SaxDrive dr, const char *name);
|
62
|
+
|
63
|
+
VALUE ox_sax_value_class = Qnil;
|
67
64
|
|
68
65
|
static VALUE protect_parse(VALUE drp) {
|
69
66
|
parse((SaxDrive)drp);
|
@@ -71,562 +68,561 @@ static VALUE protect_parse(VALUE drp) {
|
|
71
68
|
return Qnil;
|
72
69
|
}
|
73
70
|
|
74
|
-
#if HAVE_RB_ENC_ASSOCIATE
|
75
|
-
static int
|
76
|
-
str_is_ascii(const char *s) {
|
77
|
-
for (; '\0' != *s; s++) {
|
78
|
-
if (*s < ' ' || '~' < *s) {
|
79
|
-
return 0;
|
80
|
-
}
|
81
|
-
}
|
82
|
-
return 1;
|
83
|
-
}
|
84
|
-
#endif
|
85
|
-
|
86
71
|
VALUE
|
87
|
-
str2sym(SaxDrive dr, const char *str, const char **strp) {
|
88
|
-
VALUE
|
89
|
-
VALUE sym;
|
72
|
+
str2sym(SaxDrive dr, const char *str, size_t len, const char **strp) {
|
73
|
+
VALUE sym;
|
90
74
|
|
91
75
|
if (dr->options.symbolize) {
|
92
|
-
|
93
|
-
#if HAVE_RB_ENC_ASSOCIATE
|
94
|
-
if (0 != dr->encoding && !str_is_ascii(str)) {
|
95
|
-
VALUE rstr = rb_str_new2(str);
|
96
|
-
|
97
|
-
// TBD if sym can be pinned down then use this all the time
|
98
|
-
rb_enc_associate(rstr, dr->encoding);
|
99
|
-
sym = rb_funcall(rstr, ox_to_sym_id, 0);
|
100
|
-
*slot = Qundef;
|
101
|
-
} else {
|
102
|
-
sym = ID2SYM(rb_intern(str));
|
103
|
-
*slot = sym;
|
104
|
-
}
|
105
|
-
#else
|
106
|
-
sym = ID2SYM(rb_intern(str));
|
107
|
-
*slot = sym;
|
108
|
-
#endif
|
109
|
-
}
|
76
|
+
sym = ox_sym_intern(str, len, strp);
|
110
77
|
} else {
|
111
|
-
|
112
|
-
#if HAVE_RB_ENC_ASSOCIATE
|
113
|
-
if (0 != dr->encoding) {
|
114
|
-
rb_enc_associate(sym, dr->encoding);
|
115
|
-
}
|
116
|
-
#endif
|
117
|
-
if (0 != strp) {
|
118
|
-
*strp = StringValuePtr(sym);
|
119
|
-
}
|
78
|
+
sym = dr->get_name(str, len, dr->encoding, strp);
|
120
79
|
}
|
121
80
|
return sym;
|
122
81
|
}
|
123
82
|
|
124
|
-
void
|
125
|
-
ox_sax_parse(VALUE handler, VALUE io, SaxOptions options) {
|
83
|
+
// Entry point for a SAX parse: builds a drive on the stack, runs parse() under
// rb_protect() so the drive is always cleaned up, then re-raises whatever
// non-local exit (if any) interrupted the parse.
void ox_sax_parse(VALUE handler, VALUE io, SaxOptions options) {
#if HAVE_RB_EXT_RACTOR_SAFE
    rb_ext_ractor_safe(true);
#endif
    struct _saxDrive drive;
    int              state = 0;  // rb_protect() status; non-zero means the parse was interrupted

    sax_drive_init(&drive, handler, io, options);
    rb_protect(protect_parse, (VALUE)&drive, &state);
    // Cleanup runs before the jump so buffers and the GC registration are released.
    ox_sax_drive_cleanup(&drive);
    if (0 == state) {
        return;
    }
    rb_jump_tag(state);
}
|
97
|
+
|
98
|
+
// Placeholder installed for set_pos/set_line/set_col when the handler does not
// define the matching instance variable. Intentionally does nothing.
static void set_long_noop(VALUE handler, long pos) {
    (void)handler;
    (void)pos;
}
|
100
|
+
|
101
|
+
// Stores pos on the handler in the instance variable identified by ox_at_pos_id.
static void set_pos(VALUE handler, long pos) {
    VALUE boxed = LONG2NUM(pos);

    rb_ivar_set(handler, ox_at_pos_id, boxed);
}
|
104
|
+
|
105
|
+
// Stores line on the handler in the instance variable identified by ox_at_line_id.
static void set_line(VALUE handler, long line) {
    VALUE boxed = LONG2NUM(line);

    rb_ivar_set(handler, ox_at_line_id, boxed);
}
|
108
|
+
|
109
|
+
// Stores col on the handler in the instance variable identified by ox_at_column_id.
static void set_col(VALUE handler, long col) {
    VALUE boxed = LONG2NUM(col);

    rb_ivar_set(handler, ox_at_column_id, boxed);
}
|
112
|
+
|
113
|
+
// Attribute callback installed when the handler responds to neither the
// attr_value nor the attr method. Intentionally does nothing.
static void attr_noop(SaxDrive dr, VALUE name, char *value, long pos, long line, long col) {
    (void)dr;
    (void)name;
    (void)value;
    (void)pos;
    (void)line;
    (void)col;
}
|
115
|
+
|
116
|
+
// Attribute callback that reports the attribute as (name, String) through the
// handler method identified by ox_attr_id.
//
// value is modified in place by ox_sax_collapse_special() when the
// convert_special option is set. The resulting string is tagged with the
// drive's encoding when one is known.
static void attr_text(SaxDrive dr, VALUE name, char *value, long pos, long line, long col) {
    VALUE rstr;
    VALUE args[2];

    if (dr->options.convert_special) {
        ox_sax_collapse_special(dr, value, pos, line, col);
    }
    rstr = rb_str_new2(value);
    if (0 != dr->encoding) {
        rb_enc_associate(rstr, dr->encoding);
    }
    args[0] = name;
    args[1] = rstr;
    // Publish the location before invoking the handler so it can read it.
    dr->set_pos(dr->handler, pos);
    dr->set_line(dr->handler, line);
    dr->set_col(dr->handler, col);
    rb_funcall2(dr->handler, ox_attr_id, 2, args);
}
|
132
|
+
|
133
|
+
// Attribute callback that reports the attribute as (name, value object) through
// the handler method identified by ox_attr_value_id. The raw value argument is
// not used here; the handler receives dr->value_obj instead.
static void attr_value(SaxDrive dr, VALUE name, char *value, long pos, long line, long col) {
    VALUE args[2] = {name, dr->value_obj};

    (void)value;
    dr->set_pos(dr->handler, pos);
    dr->set_line(dr->handler, line);
    dr->set_col(dr->handler, col);
    rb_funcall2(dr->handler, ox_attr_value_id, 2, args);
}
|
143
|
+
|
144
|
+
// Installed when the handler has no attrs_done method. Intentionally does nothing.
static void attrs_done_noop(VALUE handler) {
    (void)handler;
}
|
146
|
+
|
147
|
+
// Notifies the handler, with no arguments, through the method identified by
// ox_attrs_done_id.
static void attrs_done(VALUE handler) {
    (void)rb_funcall(handler, ox_attrs_done_id, 0);
}
|
150
|
+
|
151
|
+
// Instruction-start callback used when the handler wants neither instruct nor
// end_instruct notifications. Always yields Qnil as the target.
static VALUE instruct_noop(SaxDrive dr, const char *target, long pos, long line, long col) {
    (void)dr;
    (void)target;
    (void)pos;
    (void)line;
    (void)col;

    return Qnil;
}
|
154
|
+
|
155
|
+
// Instruction-start callback: publishes the location, calls the handler method
// identified by ox_instruct_id with the target name as a String, and returns
// that String so the caller can pass it on to the end_instruct callback.
static VALUE instruct(SaxDrive dr, const char *target, long pos, long line, long col) {
    VALUE name = rb_str_new2(target);

    dr->set_pos(dr->handler, pos);
    dr->set_line(dr->handler, line);
    dr->set_col(dr->handler, col);
    rb_funcall(dr->handler, ox_instruct_id, 1, name);

    return name;
}
|
165
|
+
|
166
|
+
// Instruction-start callback for handlers that implement end_instruct but not
// instruct: nothing is called, but the target String is still built so it can
// be handed to end_instruct later.
static VALUE instruct_just_value(SaxDrive dr, const char *target, long pos, long line, long col) {
    VALUE name = rb_str_new2(target);

    (void)dr;
    (void)pos;
    (void)line;
    (void)col;

    return name;
}
|
169
|
+
|
170
|
+
// Installed when the handler has no end_instruct method. Intentionally does nothing.
static void end_instruct_noop(SaxDrive dr, VALUE target, long pos, long line, long col) {
    (void)dr;
    (void)target;
    (void)pos;
    (void)line;
    (void)col;
}
|
172
|
+
|
173
|
+
// Instruction-end callback: publishes the location and calls the handler method
// identified by ox_end_instruct_id with the target value.
static void end_instruct(SaxDrive dr, VALUE target, long pos, long line, long col) {
    VALUE handler = dr->handler;

    dr->set_pos(handler, pos);
    dr->set_line(handler, line);
    dr->set_col(handler, col);
    rb_funcall(handler, ox_end_instruct_id, 1, target);
}
|
179
|
+
|
180
|
+
// Shared do-nothing callback for the doctype, comment and cdata slots when the
// handler lacks the corresponding method.
static void dr_loc_noop(SaxDrive dr, long pos, long line, long col) {
    (void)dr;
    (void)pos;
    (void)line;
    (void)col;
}
|
182
|
+
|
183
|
+
// Comment callback: passes the text in dr->buf.str to the handler method
// identified by ox_comment_id.
//
// Nothing is reported while the drive is blocked. Otherwise the comment is
// reported unless the enclosing element's hint has overlay OffOverlay, in which
// case it is still reported if the "!--" hint is ActiveOverlay.
static void comment(SaxDrive dr, long pos, long line, long col) {
    Nv    parent;
    Hint  h;
    VALUE arg;

    if (dr->blocked) {
        return;
    }
    parent = stack_peek(&dr->stack);
    h      = ox_hint_find(dr->options.hints, "!--");

    // The original test read
    //   ActiveOverlay == h->overlay || ActiveOverlay == h->overlay
    // i.e. the same comparison twice. It is collapsed here with identical behavior.
    // NOTE(review): end_element_cb() accepts ActiveOverlay or NestOverlay; the
    // second operand was possibly meant to be NestOverlay - confirm before changing.
    if (NULL != parent && NULL != parent->hint && OffOverlay == parent->hint->overlay &&
        (NULL == h || ActiveOverlay != h->overlay)) {
        return;
    }
    arg = rb_str_new2(dr->buf.str);
    if (0 != dr->encoding) {
        rb_enc_associate(arg, dr->encoding);
    }
    dr->set_pos(dr->handler, pos);
    dr->set_line(dr->handler, line);
    dr->set_col(dr->handler, col);
    rb_funcall(dr->handler, ox_comment_id, 1, arg);
}
|
202
|
+
|
203
|
+
// CDATA callback: passes the text in dr->buf.str to the handler method
// identified by ox_cdata_id, unless the drive is blocked or the enclosing
// element's hint has overlay OffOverlay.
static void cdata(SaxDrive dr, long pos, long line, long col) {
    Nv    parent = stack_peek(&dr->stack);
    VALUE text;

    if (dr->blocked) {
        return;
    }
    if (NULL != parent && NULL != parent->hint && OffOverlay == parent->hint->overlay) {
        return;
    }
    text = rb_str_new2(dr->buf.str);
    if (0 != dr->encoding) {
        rb_enc_associate(text, dr->encoding);
    }
    dr->set_pos(dr->handler, pos);
    dr->set_line(dr->handler, line);
    dr->set_col(dr->handler, col);
    rb_funcall(dr->handler, ox_cdata_id, 1, text);
}
|
218
|
+
|
219
|
+
// DOCTYPE callback: publishes the location and passes the text in dr->buf.str
// to the handler method identified by ox_doctype_id. Unlike the comment and
// cdata callbacks, no encoding is associated with the string here.
static void doctype(SaxDrive dr, long pos, long line, long col) {
    VALUE handler = dr->handler;

    dr->set_pos(handler, pos);
    dr->set_line(handler, line);
    dr->set_col(handler, col);
    // The String is created after the location is set, as in the original call order.
    rb_funcall(handler, ox_doctype_id, 1, rb_str_new2(dr->buf.str));
}
|
225
|
+
|
226
|
+
// Installed when the handler has no error method. Errors are silently dropped.
static void error_noop(SaxDrive dr, const char *msg, long pos, long line, long col) {
    (void)dr;
    (void)msg;
    (void)pos;
    (void)line;
    (void)col;
}
|
228
|
+
|
229
|
+
// Error callback: calls the handler method identified by ox_error_id with
// (message, line, column) after publishing pos/line/col on the handler.
static void error(SaxDrive dr, const char *msg, long pos, long line, long col) {
    // Argument values are created before the location setters run, as in the original.
    VALUE args[3] = {rb_str_new2(msg), LONG2NUM(line), LONG2NUM(col)};

    dr->set_pos(dr->handler, pos);
    dr->set_line(dr->handler, line);
    dr->set_col(dr->handler, col);
    rb_funcall2(dr->handler, ox_error_id, 3, args);
}
|
240
|
+
|
241
|
+
// Reports the end of an element and maintains the blocked counter.
//
// The handler's end_element method is called only when the handler has one, the
// drive is not blocked, and the hint h is absent or has overlay ActiveOverlay
// or NestOverlay. Independently of that, closing an element whose hint is
// BlockOverlay decrements dr->blocked (never below zero).
static void end_element_cb(SaxDrive dr, VALUE name, long pos, long line, long col, Hint h) {
    int visible = (NULL == h || ActiveOverlay == h->overlay || NestOverlay == h->overlay);

    if (dr->has_end_element && 0 >= dr->blocked && visible) {
        dr->set_pos(dr->handler, pos);
        dr->set_line(dr->handler, line);
        dr->set_col(dr->handler, col);
        rb_funcall(dr->handler, ox_end_element_id, 1, name);
    }
    if (NULL == h || BlockOverlay != h->overlay) {
        return;
    }
    if (0 < dr->blocked) {
        dr->blocked--;
    }
}
|
159
253
|
|
160
|
-
static void
|
161
|
-
sax_drive_init(SaxDrive dr, VALUE handler, VALUE io, SaxOptions options) {
|
254
|
+
// Prepares a drive for parsing io on behalf of handler.
//
// Initializes the buffer and element stack, copies options, resets the
// error/blocked/abort state, and then selects a callback for every event
// according to what the handler defines, so the parser can call the function
// pointers unconditionally:
//  - set_pos/set_line/set_col write to the handler only if the matching
//    instance variable is already defined on it;
//  - attr_cb prefers attr_value over attr; want_attr_name records whether
//    either is present;
//  - instruct still builds the target String when only end_instruct is defined.
// Finally the encoding is resolved (default options first, else the io's
// external encoding) and get_name is picked to match encoding and symbolize.
//
// NOTE(review): value_obj is registered with the GC here; it is unregistered in
// ox_sax_drive_cleanup(), so callers must always pair the two.
static void sax_drive_init(SaxDrive dr, VALUE handler, VALUE io, SaxOptions options) {
    int has_instruct;
    int has_end_instruct;

    ox_sax_buf_init(&dr->buf, io);
    dr->buf.dr = dr;
    stack_init(&dr->stack);
    dr->handler   = handler;
    dr->value_obj = Data_Wrap_Struct(ox_sax_value_class, 0, 0, dr);
    rb_gc_register_address(&dr->value_obj);
    dr->options = *options;
    dr->err     = 0;
    dr->blocked = 0;
    dr->abort   = false;

    dr->set_pos  = (Qtrue == rb_ivar_defined(handler, ox_at_pos_id)) ? set_pos : set_long_noop;
    dr->set_line = (Qtrue == rb_ivar_defined(handler, ox_at_line_id)) ? set_line : set_long_noop;
    dr->set_col  = (Qtrue == rb_ivar_defined(handler, ox_at_column_id)) ? set_col : set_long_noop;
    if (rb_respond_to(handler, ox_attr_value_id)) {
        dr->attr_cb        = attr_value;
        dr->want_attr_name = true;
    } else if (rb_respond_to(handler, ox_attr_id)) {
        dr->attr_cb        = attr_text;
        dr->want_attr_name = true;
    } else {
        dr->attr_cb        = attr_noop;
        dr->want_attr_name = false;
    }
    dr->attrs_done = rb_respond_to(handler, ox_attrs_done_id) ? attrs_done : attrs_done_noop;

    // Probe each instruction capability once and derive both callbacks from the
    // results instead of assigning dr->instruct and then overriding it.
    has_instruct     = rb_respond_to(handler, ox_instruct_id);
    has_end_instruct = rb_respond_to(handler, ox_end_instruct_id);
    if (has_instruct) {
        dr->instruct = instruct;
    } else if (has_end_instruct) {
        // end_instruct needs the target String even though instruct is not called.
        dr->instruct = instruct_just_value;
    } else {
        dr->instruct = instruct_noop;
    }
    dr->end_instruct = has_end_instruct ? end_instruct : end_instruct_noop;

    dr->doctype = rb_respond_to(handler, ox_doctype_id) ? doctype : dr_loc_noop;
    dr->comment = rb_respond_to(handler, ox_comment_id) ? comment : dr_loc_noop;
    dr->cdata   = rb_respond_to(handler, ox_cdata_id) ? cdata : dr_loc_noop;
    dr->error   = rb_respond_to(handler, ox_error_id) ? error : error_noop;

    dr->has_text          = rb_respond_to(handler, ox_text_id);
    dr->has_value         = rb_respond_to(handler, ox_value_id);
    dr->has_start_element = rb_respond_to(handler, ox_start_element_id);
    dr->has_end_element   = rb_respond_to(handler, ox_end_element_id);

    if ('\0' == *ox_default_options.encoding) {
        VALUE encoding;

        // No default encoding configured: fall back to what the io reports, if anything.
        dr->encoding = 0;
        if (rb_respond_to(io, ox_external_encoding_id) &&
            Qnil != (encoding = rb_funcall(io, ox_external_encoding_id, 0))) {
            int e = rb_enc_get_index(encoding);

            if (0 <= e) {
                dr->encoding = rb_enc_from_index(e);
            }
        }
    } else {
        dr->encoding = rb_enc_find(ox_default_options.encoding);
    }
    // An unknown encoding is treated as UTF-8. The predicate is evaluated once
    // and reused for choosing the name factory.
    dr->utf8 = (NULL == dr->encoding || rb_utf8_encoding() == dr->encoding);
    if (dr->utf8) {
        dr->get_name = dr->options.symbolize ? ox_utf8_sym : ox_utf8_name;  // TBD UTF8 sym?
    } else {
        dr->get_name = dr->options.symbolize ? ox_enc_sym : ox_enc_name;
    }
}
|
191
316
|
|
192
|
-
void
|
193
|
-
ox_sax_drive_cleanup(SaxDrive dr) {
|
317
|
+
// Releases what sax_drive_init() set up: drops the GC registration of the
// drive's value object, then cleans up the read buffer and the element stack.
void ox_sax_drive_cleanup(SaxDrive dr) {
    VALUE *registered = &dr->value_obj;

    rb_gc_unregister_address(registered);
    buf_cleanup(&dr->buf);
    stack_cleanup(&dr->stack);
}
|
198
322
|
|
199
|
-
static void
|
200
|
-
|
201
|
-
if (dr->has.error) {
|
202
|
-
VALUE args[3];
|
203
|
-
|
204
|
-
args[0] = rb_str_new2(msg);
|
205
|
-
args[1] = LONG2NUM(line);
|
206
|
-
args[2] = LONG2NUM(col);
|
207
|
-
if (dr->has.pos) {
|
208
|
-
rb_ivar_set(dr->handler, ox_at_pos_id, LONG2NUM(pos));
|
209
|
-
}
|
210
|
-
if (dr->has.pos) {
|
211
|
-
rb_ivar_set(dr->handler, ox_at_pos_id, LONG2NUM(pos));
|
212
|
-
}
|
213
|
-
if (dr->has.line) {
|
214
|
-
rb_ivar_set(dr->handler, ox_at_line_id, args[1]);
|
215
|
-
}
|
216
|
-
if (dr->has.column) {
|
217
|
-
rb_ivar_set(dr->handler, ox_at_column_id, args[2]);
|
218
|
-
}
|
219
|
-
rb_funcall2(dr->handler, ox_error_id, 3, args);
|
220
|
-
}
|
323
|
+
// Reports msg at an explicit location through the drive's error callback
// (error() or error_noop(), as selected in sax_drive_init()). The off_t
// coordinates are narrowed to the long parameters the callback takes.
static void ox_sax_drive_error_at(SaxDrive dr, const char *msg, off_t pos, off_t line, off_t col) {
    long lpos  = (long)pos;
    long lline = (long)line;
    long lcol  = (long)col;

    dr->error(dr, msg, lpos, lline, lcol);
}
|
222
326
|
|
223
|
-
void
|
224
|
-
ox_sax_drive_error(SaxDrive dr, const char *msg) {
|
327
|
+
// Reports msg at the buffer's current position, line and column.
void ox_sax_drive_error(SaxDrive dr, const char *msg) {
    off_t pos  = dr->buf.pos;
    off_t line = dr->buf.line;
    off_t col  = dr->buf.col;

    ox_sax_drive_error_at(dr, msg, pos, line, col);
}
|
227
330
|
|
228
|
-
static char
|
229
|
-
|
230
|
-
char c = buf_get(&dr->buf);
|
331
|
+
// Reads the first character of the input and skips a UTF-8 byte order mark if
// one is present.
//
// Returns the first character after the BOM (or the first character of the
// input when there is no BOM). If the input starts with 0xEF but the next two
// bytes are not 0xBB 0xBF, an error is reported and '\0' is returned, which
// ends the parse loop.
//
// NOTE(review): dr->encoding is switched to UTF-8 here, after sax_drive_init()
// has already derived dr->utf8 and dr->get_name from the previous encoding -
// confirm those cannot be left stale.
static char skipBOM(SaxDrive dr) {
    char first = buf_get(&dr->buf);

    if (0xEF != (uint8_t)first) { /* only UTF8 is supported */
        return first;
    }
    // The third byte is only read when the second one matched.
    if (0xBB != (uint8_t)buf_get(&dr->buf) || 0xBF != (uint8_t)buf_get(&dr->buf)) {
        ox_sax_drive_error(dr, BAD_BOM "invalid BOM or a binary file.");
        return '\0';
    }
    dr->encoding = ox_utf8_encoding;

    return buf_get(&dr->buf);
}
|
247
345
|
|
248
|
-
static void
|
249
|
-
|
250
|
-
|
251
|
-
|
252
|
-
Nv parent;
|
346
|
+
// Main SAX loop: reads the document character by character and dispatches to
// the read_* functions, each of which returns the next character to examine.
//
// state tracks where we are relative to the root element (START_STATE before
// it, BODY_STATE inside it, AFTER_STATE once the element stack is empty again)
// so that a late DOCTYPE and multiple top level elements can be reported.
// When input ends with elements still open, an error and an end-element
// callback are issued for each, innermost first. Nothing further is reported
// once dr->abort is set.
static void parse(SaxDrive dr) {
    char c     = skipBOM(dr);
    int  state = START_STATE;
    Nv   parent;

    while ('\0' != c) {
        buf_protect(&dr->buf);
        if ('<' == c) {
            c = buf_get(&dr->buf);
            switch (c) {
            case '?': /* instructions (xml or otherwise) */ c = read_instruction(dr); break;
            case '!': /* comment or doctype */
                buf_protect(&dr->buf);
                c = buf_get(&dr->buf);
                if ('\0' == c) {
                    ox_sax_drive_error(dr, NO_TERM "DOCTYPE or comment not terminated");

                    goto DONE;
                } else if ('-' == c) {
                    c = buf_get(&dr->buf); /* skip first - and get next character */
                    if ('-' != c) {
                        ox_sax_drive_error(dr, INVALID_FORMAT "bad comment format, expected <!--");
                    } else {
                        c = buf_get(&dr->buf); /* skip second - */
                    }
                    c = read_comment(dr);
                } else {
                    int   i;
                    int   spaced = 0;
                    off_t pos    = dr->buf.pos + 1;
                    off_t line   = dr->buf.line;
                    off_t col    = dr->buf.col + 1;

                    if (is_white(c)) {
                        spaced = 1;
                        c      = buf_next_non_white(&dr->buf);
                    }
                    // Mark the start of the keyword, then pull in 7 characters so
                    // it can be compared against "DOCTYPE" / "[CDATA[".
                    dr->buf.str = dr->buf.tail - 1;
                    for (i = 7; 0 < i; i--) {
                        c = buf_get(&dr->buf);
                    }
                    if (0 == strncmp("DOCTYPE", dr->buf.str, 7)) {
                        if (spaced) {
                            ox_sax_drive_error_at(dr, WRONG_CHAR "<!DOCTYPE can not included spaces", pos, line, col);
                        }
                        if (START_STATE != state) {
                            ox_sax_drive_error(dr, OUT_OF_ORDER "DOCTYPE can not come after an element");
                        }
                        c = read_doctype(dr);
                    } else if (0 == strncasecmp("DOCTYPE", dr->buf.str, 7)) {
                        // Wrong case is tolerated silently only in smart mode.
                        if (!dr->options.smart) {
                            ox_sax_drive_error(dr, CASE_ERROR "expected DOCTYPE all in caps");
                        }
                        if (START_STATE != state) {
                            ox_sax_drive_error(dr, OUT_OF_ORDER "DOCTYPE can not come after an element");
                        }
                        c = read_doctype(dr);
                    } else if (0 == strncmp("[CDATA[", dr->buf.str, 7)) {
                        if (spaced) {
                            ox_sax_drive_error_at(dr, WRONG_CHAR "<![CDATA[ can not included spaces", pos, line, col);
                        }
                        c = read_cdata(dr);
                    } else if (0 == strncasecmp("[CDATA[", dr->buf.str, 7)) {
                        if (!dr->options.smart) {
                            ox_sax_drive_error(dr, CASE_ERROR "expected CDATA all in caps");
                        }
                        c = read_cdata(dr);
                    } else {
                        // Reuses the function-level variable; the original declared a
                        // second `parent` here that shadowed it.
                        parent = stack_peek(&dr->stack);
                        if (0 != parent) {
                            parent->childCnt++;
                        }
                        ox_sax_drive_error_at(dr, WRONG_CHAR "DOCTYPE, CDATA, or comment expected", pos, line, col);
                        c = read_name_token(dr);
                        if ('>' == c) {
                            c = buf_get(&dr->buf);
                        }
                    }
                }
                break;
            case '/': /* element end */
                parent = stack_peek(&dr->stack);
                // An element closed without any children gets an empty text event first.
                if (0 != parent && 0 == parent->childCnt && dr->has_text && !dr->blocked) {
                    VALUE args[1];

                    args[0] = rb_str_new2("");
                    if (0 != dr->encoding) {
                        rb_enc_associate(args[0], dr->encoding);
                    }
                    dr->set_pos(dr->handler, dr->buf.pos);
                    dr->set_line(dr->handler, dr->buf.line);
                    dr->set_col(dr->handler, dr->buf.col);
                    rb_funcall2(dr->handler, ox_text_id, 1, args);
                }
                c = read_element_end(dr);
                if (0 == stack_peek(&dr->stack)) {
                    state = AFTER_STATE;
                }
                break;
            case '\0': goto DONE;
            default:
                buf_backup(&dr->buf);
                if (AFTER_STATE == state) {
                    ox_sax_drive_error(dr, OUT_OF_ORDER "multiple top level elements");
                }
                state = BODY_STATE;
                c     = read_element_start(dr);
                if (0 == stack_peek(&dr->stack)) {
                    state = AFTER_STATE;
                }
                break;
            }
        } else {
            buf_reset(&dr->buf);
            c = read_text(dr);
        }
    }
DONE:
    if (dr->abort) {
        return;
    }
    if (dr->stack.head < dr->stack.tail) {
        char msg[256];
        Nv   sp;

        // Close whatever is still open, innermost element first.
        for (sp = dr->stack.tail - 1; dr->stack.head <= sp; sp--) {
            // snprintf() already reserves room for the terminator, so the full
            // buffer size is passed.
            snprintf(msg, sizeof(msg), "%selement '%s' not closed", EL_MISMATCH, sp->name);
            ox_sax_drive_error_at(dr, msg, dr->buf.pos, dr->buf.line, dr->buf.col);
            end_element_cb(dr, sp->val, dr->buf.pos, dr->buf.line, dr->buf.col, sp->hint);
        }
    }
}
|
414
478
|
|
415
|
-
static void
|
416
|
-
|
417
|
-
char
|
418
|
-
char *end = content + len;
|
479
|
+
// Copies the raw content of a processing instruction into content, stopping at
// the closing "?>" (which is consumed but not stored).
//
// content must have room for len characters plus a terminating '\0'; the
// result is always NUL terminated. An error is reported, and copying stops, if
// the content does not fit or the input ends right after a '?'.
//
// A '?' that is not followed by '>' is part of the content and is kept. The
// character after it is examined again, so a sequence such as "??>" ends the
// instruction with a single '?' stored.
static void read_content(SaxDrive dr, char *content, size_t len) {
    char *end = content + len;
    char  c   = buf_get(&dr->buf);

    while ('\0' != c) {
        if (end <= content) {
            *content = '\0';
            ox_sax_drive_error(dr, "processing instruction content too large");
            return;
        }
        if ('?' == c) {
            char next = buf_get(&dr->buf);

            if ('>' == next) {
                *content = '\0';
                return;
            }
            *content++ = '?';
            if ('\0' == next) {
                ox_sax_drive_error(dr, NO_TERM "document not terminated");
                break;
            }
            // Re-examine the lookahead character: it may itself be a '?'.
            c = next;
            continue;
        }
        *content++ = c;
        c          = buf_get(&dr->buf);
    }
    *content = '\0';
}
|
442
505
|
|
443
506
|
/* Entered after the "<?" sequence. Ready to read the rest.
|
444
507
|
*/
|
445
|
-
static char
|
446
|
-
|
447
|
-
char
|
448
|
-
|
449
|
-
|
450
|
-
|
451
|
-
|
452
|
-
off_t
|
453
|
-
off_t
|
454
|
-
off_t col = dr->buf.col - 1;
|
508
|
+
/* Entered after the "<?" sequence. Ready to read the rest.
 *
 * Reads the target name and reports it through dr->instruct. The instruction
 * body is then read twice: first as raw content (kept in a local buffer), then,
 * after rewinding the buffer, as attributes. If attribute parsing sets dr->err
 * the body is not attribute-like, so the raw content is delivered as a text
 * event (when the handler has a text method) and reading resumes after the
 * content. Otherwise the closing '>' is expected next. dr->end_instruct is
 * called in both cases.
 *
 * Returns the next character to be processed, or '\0' if the input ended while
 * reading the target name.
 */
static char read_instruction(SaxDrive dr) {
    char  content[4096];
    char  c;
    int   coff;
    VALUE target = Qnil;
    int   is_xml;
    off_t pos  = dr->buf.pos - 1;
    off_t line = dr->buf.line;
    off_t col  = dr->buf.col - 1;

    buf_protect(&dr->buf);
    if ('\0' == (c = read_name_token(dr))) {
        return c;
    }
    is_xml = (0 == (dr->options.smart ? strcasecmp("xml", dr->buf.str) : strcmp("xml", dr->buf.str)));

    target = dr->instruct(dr, dr->buf.str, pos, line, col);
    buf_protect(&dr->buf);
    pos  = dr->buf.pos;
    line = dr->buf.line;
    col  = dr->buf.col;
    read_content(dr, content, sizeof(content) - 1);
    // Remember where the content ended so reading can resume there if the
    // attribute pass below fails.
    coff = (int)(dr->buf.tail - dr->buf.head);
    buf_reset(&dr->buf);
    dr->err = 0;
    c       = read_attrs(dr, c, '?', '?', is_xml, 1, NULL);
    dr->attrs_done(dr->handler);
    if (dr->err) {
        if (dr->has_text) {
            VALUE args[1];

            if (dr->options.convert_special) {
                ox_sax_collapse_special(dr, content, (int)pos, (int)line, (int)col);
            }
            args[0] = rb_str_new2(content);
            if (0 != dr->encoding) {
                rb_enc_associate(args[0], dr->encoding);
            }
            dr->set_pos(dr->handler, pos);
            dr->set_line(dr->handler, line);
            dr->set_col(dr->handler, col);
            rb_funcall2(dr->handler, ox_text_id, 1, args);
        }
        dr->buf.tail = dr->buf.head + coff;
        c            = buf_get(&dr->buf);
    } else {
        pos  = dr->buf.pos;
        line = dr->buf.line;
        col  = dr->buf.col;
        c    = buf_next_non_white(&dr->buf);
        if ('>' == c) {
            c = buf_get(&dr->buf);
        } else {
            // The original re-tested '>' == c inside this branch; that test could
            // never succeed here, so it has been removed.
            ox_sax_drive_error_at(dr, NO_TERM "instruction not terminated", pos, line, col);
        }
    }
    dr->end_instruct(dr, target, pos, line, col);
    dr->buf.str = NULL;

    return c;
}
|
550
572
|
|
551
|
-
static char
|
552
|
-
|
553
|
-
char c;
|
573
|
+
static char read_delimited(SaxDrive dr, char end) {
|
574
|
+
char c;
|
554
575
|
|
555
576
|
if ('"' == end || '\'' == end) {
|
556
|
-
|
557
|
-
|
558
|
-
|
559
|
-
|
560
|
-
|
561
|
-
|
577
|
+
while (end != (c = buf_get(&dr->buf))) {
|
578
|
+
if ('\0' == c) {
|
579
|
+
ox_sax_drive_error(dr, NO_TERM "doctype not terminated");
|
580
|
+
return c;
|
581
|
+
}
|
582
|
+
}
|
562
583
|
} else {
|
563
|
-
|
564
|
-
|
565
|
-
|
566
|
-
|
567
|
-
|
568
|
-
|
569
|
-
|
570
|
-
|
571
|
-
|
572
|
-
|
573
|
-
|
574
|
-
|
575
|
-
|
576
|
-
|
577
|
-
break;
|
578
|
-
case '[':
|
579
|
-
c = read_delimited(dr, ']');
|
580
|
-
break;
|
581
|
-
case '<':
|
582
|
-
c = read_delimited(dr, '>');
|
583
|
-
break;
|
584
|
-
default:
|
585
|
-
break;
|
586
|
-
}
|
587
|
-
}
|
584
|
+
while (1) {
|
585
|
+
c = buf_get(&dr->buf);
|
586
|
+
if (end == c) {
|
587
|
+
return c;
|
588
|
+
}
|
589
|
+
switch (c) {
|
590
|
+
case '\0': ox_sax_drive_error(dr, NO_TERM "doctype not terminated"); return c;
|
591
|
+
case '"': c = read_delimited(dr, c); break;
|
592
|
+
case '\'': c = read_delimited(dr, c); break;
|
593
|
+
case '[': c = read_delimited(dr, ']'); break;
|
594
|
+
case '<': c = read_delimited(dr, '>'); break;
|
595
|
+
default: break;
|
596
|
+
}
|
597
|
+
}
|
588
598
|
}
|
589
599
|
return c;
|
590
600
|
}
|
591
601
|
|
592
602
|
/* Entered after the "<!DOCTYPE " sequence. Ready to read the rest.
|
593
603
|
*/
|
594
|
-
static char
|
595
|
-
|
596
|
-
long
|
597
|
-
long
|
598
|
-
|
599
|
-
|
600
|
-
Nv parent = stack_peek(&dr->stack);
|
604
|
+
static char read_doctype(SaxDrive dr) {
|
605
|
+
long pos = (long)(dr->buf.pos - 9);
|
606
|
+
long line = (long)(dr->buf.line);
|
607
|
+
long col = (long)(dr->buf.col - 9);
|
608
|
+
char *s;
|
609
|
+
Nv parent = stack_peek(&dr->stack);
|
601
610
|
|
602
611
|
buf_backup(&dr->buf); /* back up to the start in case the doctype is empty */
|
603
612
|
buf_protect(&dr->buf);
|
604
613
|
read_delimited(dr, '>');
|
605
614
|
if (dr->options.smart && 0 == dr->options.hints) {
|
606
|
-
|
607
|
-
|
608
|
-
|
609
|
-
|
615
|
+
for (s = dr->buf.str; is_white(*s); s++) {
|
616
|
+
}
|
617
|
+
if (0 == strncasecmp("HTML", s, 4)) {
|
618
|
+
dr->options.hints = ox_hints_html();
|
619
|
+
}
|
610
620
|
}
|
611
621
|
*(dr->buf.tail - 1) = '\0';
|
612
622
|
if (0 != parent) {
|
613
|
-
|
614
|
-
}
|
615
|
-
if (dr->has.doctype) {
|
616
|
-
VALUE args[1];
|
617
|
-
|
618
|
-
if (dr->has.pos) {
|
619
|
-
rb_ivar_set(dr->handler, ox_at_pos_id, LONG2NUM(pos));
|
620
|
-
}
|
621
|
-
if (dr->has.line) {
|
622
|
-
rb_ivar_set(dr->handler, ox_at_line_id, LONG2NUM(line));
|
623
|
-
}
|
624
|
-
if (dr->has.column) {
|
625
|
-
rb_ivar_set(dr->handler, ox_at_column_id, LONG2NUM(col));
|
626
|
-
}
|
627
|
-
args[0] = rb_str_new2(dr->buf.str);
|
628
|
-
rb_funcall2(dr->handler, ox_doctype_id, 1, args);
|
623
|
+
parent->childCnt++;
|
629
624
|
}
|
625
|
+
dr->doctype(dr, pos, line, col);
|
630
626
|
dr->buf.str = 0;
|
631
627
|
|
632
628
|
return buf_get(&dr->buf);
|
@@ -634,89 +630,65 @@ read_doctype(SaxDrive dr) {
|
|
634
630
|
|
635
631
|
/* Entered after the "<![CDATA[" sequence. Ready to read the rest.
|
636
632
|
*/
|
637
|
-
static char
|
638
|
-
|
639
|
-
char
|
640
|
-
|
641
|
-
|
642
|
-
long
|
643
|
-
long
|
644
|
-
|
645
|
-
|
646
|
-
Nv parent = stack_peek(&dr->stack);
|
633
|
+
static char read_cdata(SaxDrive dr) {
|
634
|
+
char c;
|
635
|
+
char zero = '\0';
|
636
|
+
int end = 0;
|
637
|
+
long pos = (long)(dr->buf.pos - 9);
|
638
|
+
long line = (long)(dr->buf.line);
|
639
|
+
long col = (long)(dr->buf.col - 9);
|
640
|
+
struct _checkPt cp = CHECK_PT_INIT;
|
641
|
+
Nv parent = stack_peek(&dr->stack);
|
647
642
|
|
648
643
|
// TBD check parent overlay
|
649
644
|
if (0 != parent) {
|
650
|
-
|
645
|
+
parent->childCnt++;
|
651
646
|
}
|
652
647
|
buf_backup(&dr->buf); /* back up to the start in case the cdata is empty */
|
653
648
|
buf_protect(&dr->buf);
|
654
649
|
while (1) {
|
655
650
|
c = buf_get(&dr->buf);
|
656
|
-
|
657
|
-
|
658
|
-
|
659
|
-
break;
|
660
|
-
case '>':
|
651
|
+
switch (c) {
|
652
|
+
case ']': end++; break;
|
653
|
+
case '>':
|
661
654
|
if (2 <= end) {
|
662
655
|
*(dr->buf.tail - 3) = '\0';
|
663
|
-
|
656
|
+
c = buf_get(&dr->buf);
|
664
657
|
goto CB;
|
665
658
|
}
|
666
|
-
|
667
|
-
|
668
|
-
|
659
|
+
if (!buf_checkset(&cp)) {
|
660
|
+
buf_checkpoint(&dr->buf, &cp);
|
661
|
+
}
|
669
662
|
end = 0;
|
670
|
-
|
671
|
-
|
672
|
-
|
673
|
-
|
674
|
-
|
675
|
-
|
676
|
-
|
677
|
-
|
678
|
-
|
679
|
-
|
680
|
-
|
681
|
-
|
682
|
-
|
683
|
-
|
684
|
-
|
663
|
+
break;
|
664
|
+
case '<':
|
665
|
+
if (!buf_checkset(&cp)) {
|
666
|
+
buf_checkpoint(&dr->buf, &cp);
|
667
|
+
}
|
668
|
+
end = 0;
|
669
|
+
break;
|
670
|
+
case '\0':
|
671
|
+
if (buf_checkset(&cp)) {
|
672
|
+
c = buf_checkback(&dr->buf, &cp);
|
673
|
+
ox_sax_drive_error(dr, NO_TERM "CDATA not terminated");
|
674
|
+
zero = c;
|
675
|
+
*(dr->buf.tail - 1) = '\0';
|
676
|
+
goto CB;
|
677
|
+
}
|
685
678
|
ox_sax_drive_error(dr, NO_TERM "CDATA not terminated");
|
686
679
|
return '\0';
|
687
|
-
|
688
|
-
|
689
|
-
|
690
|
-
|
691
|
-
|
692
|
-
|
693
|
-
|
694
|
-
}
|
695
|
-
CB:
|
696
|
-
if (!dr->blocked && (NULL == parent || NULL == parent->hint || OffOverlay != parent->hint->overlay)) {
|
697
|
-
if (dr->has.cdata) {
|
698
|
-
VALUE args[1];
|
699
|
-
|
700
|
-
args[0] = rb_str_new2(dr->buf.str);
|
701
|
-
#if HAVE_RB_ENC_ASSOCIATE
|
702
|
-
if (0 != dr->encoding) {
|
703
|
-
rb_enc_associate(args[0], dr->encoding);
|
704
|
-
}
|
705
|
-
#endif
|
706
|
-
if (dr->has.pos) {
|
707
|
-
rb_ivar_set(dr->handler, ox_at_pos_id, LONG2NUM(pos));
|
708
|
-
}
|
709
|
-
if (dr->has.line) {
|
710
|
-
rb_ivar_set(dr->handler, ox_at_line_id, LONG2NUM(line));
|
711
|
-
}
|
712
|
-
if (dr->has.column) {
|
713
|
-
rb_ivar_set(dr->handler, ox_at_column_id, LONG2NUM(col));
|
714
|
-
}
|
715
|
-
rb_funcall2(dr->handler, ox_cdata_id, 1, args);
|
716
|
-
}
|
680
|
+
default:
|
681
|
+
if (1 < end && !buf_checkset(&cp)) {
|
682
|
+
buf_checkpoint(&dr->buf, &cp);
|
683
|
+
}
|
684
|
+
end = 0;
|
685
|
+
break;
|
686
|
+
}
|
717
687
|
}
|
688
|
+
CB:
|
689
|
+
dr->cdata(dr, pos, line, col);
|
718
690
|
if ('\0' != zero) {
|
719
|
-
|
691
|
+
*(dr->buf.tail - 1) = zero;
|
720
692
|
}
|
721
693
|
dr->buf.str = 0;
|
722
694
|
|
@@ -725,88 +697,60 @@ read_cdata(SaxDrive dr) {
|
|
725
697
|
|
726
698
|
/* Entered after the "<!--" sequence. Ready to read the rest.
|
727
699
|
*/
|
728
|
-
static char
|
729
|
-
|
730
|
-
char
|
731
|
-
|
732
|
-
|
733
|
-
long
|
734
|
-
long
|
735
|
-
|
736
|
-
struct _checkPt cp = CHECK_PT_INIT;
|
700
|
+
static char read_comment(SaxDrive dr) {
|
701
|
+
char c;
|
702
|
+
char zero = '\0';
|
703
|
+
int end = 0;
|
704
|
+
long pos = (long)(dr->buf.pos - 4);
|
705
|
+
long line = (long)(dr->buf.line);
|
706
|
+
long col = (long)(dr->buf.col - 4);
|
707
|
+
struct _checkPt cp = CHECK_PT_INIT;
|
737
708
|
|
738
709
|
buf_backup(&dr->buf); /* back up to the start in case the cdata is empty */
|
739
710
|
buf_protect(&dr->buf);
|
740
711
|
while (1) {
|
741
712
|
c = buf_get(&dr->buf);
|
742
|
-
|
743
|
-
|
744
|
-
|
745
|
-
break;
|
746
|
-
case '>':
|
713
|
+
switch (c) {
|
714
|
+
case '-': end++; break;
|
715
|
+
case '>':
|
747
716
|
if (2 <= end) {
|
748
717
|
*(dr->buf.tail - 3) = '\0';
|
749
|
-
|
718
|
+
c = buf_get(&dr->buf);
|
750
719
|
goto CB;
|
751
720
|
}
|
752
|
-
|
753
|
-
|
754
|
-
|
721
|
+
if (!buf_checkset(&cp)) {
|
722
|
+
buf_checkpoint(&dr->buf, &cp);
|
723
|
+
}
|
755
724
|
end = 0;
|
756
|
-
|
757
|
-
|
758
|
-
|
759
|
-
|
760
|
-
|
761
|
-
|
762
|
-
|
763
|
-
|
764
|
-
|
765
|
-
|
766
|
-
|
767
|
-
|
768
|
-
|
769
|
-
|
770
|
-
|
725
|
+
break;
|
726
|
+
case '<':
|
727
|
+
if (!buf_checkset(&cp)) {
|
728
|
+
buf_checkpoint(&dr->buf, &cp);
|
729
|
+
}
|
730
|
+
end = 0;
|
731
|
+
break;
|
732
|
+
case '\0':
|
733
|
+
if (buf_checkset(&cp)) {
|
734
|
+
c = buf_checkback(&dr->buf, &cp);
|
735
|
+
ox_sax_drive_error(dr, NO_TERM "comment not terminated");
|
736
|
+
zero = c;
|
737
|
+
*(dr->buf.tail - 1) = '\0';
|
738
|
+
goto CB;
|
739
|
+
}
|
771
740
|
ox_sax_drive_error(dr, NO_TERM "comment not terminated");
|
772
741
|
return '\0';
|
773
|
-
|
774
|
-
|
775
|
-
|
776
|
-
|
777
|
-
|
778
|
-
|
779
|
-
|
780
|
-
}
|
781
|
-
CB:
|
782
|
-
if (dr->has.comment && !dr->blocked) {
|
783
|
-
VALUE args[1];
|
784
|
-
Nv parent = stack_peek(&dr->stack);
|
785
|
-
Hint h = ox_hint_find(dr->options.hints, "!--");
|
786
|
-
|
787
|
-
if (NULL == parent || NULL == parent->hint || OffOverlay != parent->hint->overlay ||
|
788
|
-
(NULL != h && (ActiveOverlay == h->overlay || ActiveOverlay == h->overlay))) {
|
789
|
-
|
790
|
-
args[0] = rb_str_new2(dr->buf.str);
|
791
|
-
#if HAVE_RB_ENC_ASSOCIATE
|
792
|
-
if (0 != dr->encoding) {
|
793
|
-
rb_enc_associate(args[0], dr->encoding);
|
794
|
-
}
|
795
|
-
#endif
|
796
|
-
if (dr->has.pos) {
|
797
|
-
rb_ivar_set(dr->handler, ox_at_pos_id, LONG2NUM(pos));
|
798
|
-
}
|
799
|
-
if (dr->has.line) {
|
800
|
-
rb_ivar_set(dr->handler, ox_at_line_id, LONG2NUM(line));
|
801
|
-
}
|
802
|
-
if (dr->has.column) {
|
803
|
-
rb_ivar_set(dr->handler, ox_at_column_id, LONG2NUM(col));
|
804
|
-
}
|
805
|
-
rb_funcall2(dr->handler, ox_comment_id, 1, args);
|
806
|
-
}
|
742
|
+
default:
|
743
|
+
if (1 < end && !buf_checkset(&cp)) {
|
744
|
+
buf_checkpoint(&dr->buf, &cp);
|
745
|
+
}
|
746
|
+
end = 0;
|
747
|
+
break;
|
748
|
+
}
|
807
749
|
}
|
750
|
+
CB:
|
751
|
+
dr->comment(dr, pos, line, col);
|
808
752
|
if ('\0' != zero) {
|
809
|
-
|
753
|
+
*(dr->buf.tail - 1) = zero;
|
810
754
|
}
|
811
755
|
dr->buf.str = 0;
|
812
756
|
|
@@ -816,106 +760,115 @@ read_comment(SaxDrive dr) {
|
|
816
760
|
/* Entered after the '<' and the first character after that. Returns status
|
817
761
|
* code.
|
818
762
|
*/
|
819
|
-
static char
|
820
|
-
|
821
|
-
|
822
|
-
|
823
|
-
|
824
|
-
|
825
|
-
long
|
826
|
-
long
|
827
|
-
|
828
|
-
|
829
|
-
|
830
|
-
Nv parent = stack_peek(&dr->stack);
|
763
|
+
static char read_element_start(SaxDrive dr) {
|
764
|
+
const char *ename = 0;
|
765
|
+
volatile VALUE name = Qnil;
|
766
|
+
char c;
|
767
|
+
int closed;
|
768
|
+
long pos = (long)(dr->buf.pos);
|
769
|
+
long line = (long)(dr->buf.line);
|
770
|
+
long col = (long)(dr->buf.col);
|
771
|
+
Hint h = NULL;
|
772
|
+
int stackless = 0;
|
773
|
+
Nv parent = stack_peek(&dr->stack);
|
831
774
|
|
832
775
|
if ('\0' == (c = read_name_token(dr))) {
|
833
776
|
return '\0';
|
834
777
|
}
|
835
778
|
if ('\0' == *dr->buf.str) {
|
836
|
-
|
779
|
+
char msg[256];
|
837
780
|
|
838
|
-
|
839
|
-
|
781
|
+
snprintf(msg, sizeof(msg) - 1, "%sempty element", INVALID_FORMAT);
|
782
|
+
ox_sax_drive_error_at(dr, msg, pos, line, col);
|
840
783
|
|
841
|
-
|
784
|
+
return buf_get(&dr->buf);
|
842
785
|
}
|
843
786
|
if (0 != parent) {
|
844
|
-
|
787
|
+
parent->childCnt++;
|
845
788
|
}
|
846
|
-
if (dr->options.smart && 0 == dr->options.hints && stack_empty(&dr->stack) &&
|
847
|
-
|
789
|
+
if (dr->options.smart && 0 == dr->options.hints && stack_empty(&dr->stack) &&
|
790
|
+
0 == strcasecmp("html", dr->buf.str)) {
|
791
|
+
dr->options.hints = ox_hints_html();
|
848
792
|
}
|
849
793
|
if (NULL != dr->options.hints) {
|
850
|
-
|
851
|
-
|
852
|
-
|
853
|
-
|
854
|
-
|
855
|
-
|
856
|
-
|
857
|
-
|
858
|
-
|
859
|
-
|
860
|
-
|
861
|
-
|
862
|
-
|
863
|
-
|
864
|
-
|
865
|
-
|
866
|
-
|
867
|
-
|
868
|
-
|
869
|
-
|
870
|
-
|
871
|
-
|
872
|
-
|
873
|
-
|
874
|
-
|
875
|
-
|
876
|
-
|
877
|
-
|
878
|
-
|
879
|
-
|
880
|
-
|
881
|
-
|
882
|
-
|
883
|
-
|
884
|
-
|
885
|
-
|
886
|
-
|
887
|
-
|
888
|
-
|
889
|
-
|
890
|
-
|
891
|
-
|
892
|
-
|
893
|
-
|
894
|
-
|
895
|
-
|
896
|
-
|
897
|
-
|
898
|
-
|
899
|
-
|
900
|
-
|
901
|
-
|
902
|
-
|
903
|
-
|
904
|
-
|
905
|
-
|
906
|
-
|
907
|
-
|
908
|
-
|
909
|
-
|
910
|
-
|
911
|
-
|
912
|
-
|
913
|
-
|
914
|
-
|
915
|
-
|
916
|
-
|
917
|
-
|
918
|
-
|
794
|
+
hint_clear_empty(dr);
|
795
|
+
h = ox_hint_find(dr->options.hints, dr->buf.str);
|
796
|
+
if (NULL == h) {
|
797
|
+
char msg[256];
|
798
|
+
|
799
|
+
snprintf(msg,
|
800
|
+
sizeof(msg),
|
801
|
+
"%s%s is not a valid element type for a %s document type.",
|
802
|
+
INV_ELEMENT,
|
803
|
+
dr->buf.str,
|
804
|
+
dr->options.hints->name);
|
805
|
+
ox_sax_drive_error(dr, msg);
|
806
|
+
} else {
|
807
|
+
Nv top_nv = stack_peek(&dr->stack);
|
808
|
+
|
809
|
+
if (AbortOverlay == h->overlay) {
|
810
|
+
if (rb_respond_to(dr->handler, ox_abort_id)) {
|
811
|
+
VALUE args[1];
|
812
|
+
|
813
|
+
args[0] = str2sym(dr, dr->buf.str, dr->buf.tail - dr->buf.str - 1, NULL);
|
814
|
+
rb_funcall2(dr->handler, ox_abort_id, 1, args);
|
815
|
+
}
|
816
|
+
dr->abort = true;
|
817
|
+
return '\0';
|
818
|
+
}
|
819
|
+
if (BlockOverlay == h->overlay) {
|
820
|
+
dr->blocked++;
|
821
|
+
}
|
822
|
+
if (h->empty) {
|
823
|
+
stackless = 1;
|
824
|
+
}
|
825
|
+
if (0 != top_nv) {
|
826
|
+
char msg[256];
|
827
|
+
|
828
|
+
if (!h->nest && NestOverlay != h->overlay && 0 == strcasecmp(top_nv->name, h->name)) {
|
829
|
+
snprintf(msg,
|
830
|
+
sizeof(msg) - 1,
|
831
|
+
"%s%s can not be nested in a %s document, closing previous.",
|
832
|
+
INV_ELEMENT,
|
833
|
+
dr->buf.str,
|
834
|
+
dr->options.hints->name);
|
835
|
+
ox_sax_drive_error(dr, msg);
|
836
|
+
stack_pop(&dr->stack);
|
837
|
+
end_element_cb(dr, top_nv->val, pos, line, col, top_nv->hint);
|
838
|
+
top_nv = stack_peek(&dr->stack);
|
839
|
+
}
|
840
|
+
if (NULL != top_nv && 0 != h->parents && NestOverlay != h->overlay) {
|
841
|
+
const char **p;
|
842
|
+
int ok = 0;
|
843
|
+
|
844
|
+
for (p = h->parents; 0 != *p; p++) {
|
845
|
+
if (0 == strcasecmp(*p, top_nv->name)) {
|
846
|
+
ok = 1;
|
847
|
+
break;
|
848
|
+
}
|
849
|
+
}
|
850
|
+
if (!ok) {
|
851
|
+
snprintf(msg,
|
852
|
+
sizeof(msg) - 1,
|
853
|
+
"%s%s can not be a child of a %s in a %s document.",
|
854
|
+
INV_ELEMENT,
|
855
|
+
h->name,
|
856
|
+
top_nv->name,
|
857
|
+
dr->options.hints->name);
|
858
|
+
ox_sax_drive_error(dr, msg);
|
859
|
+
}
|
860
|
+
}
|
861
|
+
}
|
862
|
+
}
|
863
|
+
}
|
864
|
+
name = str2sym(dr, dr->buf.str, dr->buf.tail - dr->buf.str - 1, &ename);
|
865
|
+
if (dr->has_start_element && 0 >= dr->blocked &&
|
866
|
+
(NULL == h || ActiveOverlay == h->overlay || NestOverlay == h->overlay)) {
|
867
|
+
VALUE args[1];
|
868
|
+
|
869
|
+
dr->set_pos(dr->handler, pos);
|
870
|
+
dr->set_line(dr->handler, line);
|
871
|
+
dr->set_col(dr->handler, col);
|
919
872
|
args[0] = name;
|
920
873
|
rb_funcall2(dr->handler, ox_start_element_id, 1, args);
|
921
874
|
}
|
@@ -924,362 +877,302 @@ read_element_start(SaxDrive dr) {
|
|
924
877
|
} else if ('>' == c) {
|
925
878
|
closed = 0;
|
926
879
|
} else {
|
927
|
-
|
880
|
+
buf_protect(&dr->buf);
|
928
881
|
c = read_attrs(dr, c, '/', '>', 0, 0, h);
|
929
|
-
|
930
|
-
|
931
|
-
|
932
|
-
|
882
|
+
if (is_white(c)) {
|
883
|
+
c = buf_next_non_white(&dr->buf);
|
884
|
+
}
|
885
|
+
closed = ('/' == c);
|
933
886
|
}
|
934
|
-
if (
|
935
|
-
|
887
|
+
if (0 >= dr->blocked && (NULL == h || ActiveOverlay == h->overlay || NestOverlay == h->overlay)) {
|
888
|
+
dr->attrs_done(dr->handler);
|
936
889
|
}
|
937
890
|
if (closed) {
|
938
|
-
|
939
|
-
|
940
|
-
|
941
|
-
col = dr->buf.col;
|
942
|
-
end_element_cb(dr, name, pos, line, col, h);
|
891
|
+
c = buf_next_non_white(&dr->buf);
|
892
|
+
|
893
|
+
end_element_cb(dr, name, dr->buf.pos, dr->buf.line, dr->buf.col, h);
|
943
894
|
} else if (stackless) {
|
944
|
-
|
895
|
+
end_element_cb(dr, name, pos, line, col, h);
|
945
896
|
} else if (NULL != h && h->jump) {
|
946
|
-
|
947
|
-
|
948
|
-
|
949
|
-
|
950
|
-
|
951
|
-
|
952
|
-
|
897
|
+
stack_push(&dr->stack, ename, name, h);
|
898
|
+
if ('>' != c) {
|
899
|
+
ox_sax_drive_error(dr, WRONG_CHAR "element not closed");
|
900
|
+
return c;
|
901
|
+
}
|
902
|
+
read_jump(dr, h->name);
|
903
|
+
return '<';
|
953
904
|
} else {
|
954
|
-
|
905
|
+
stack_push(&dr->stack, ename, name, h);
|
955
906
|
}
|
956
907
|
if ('>' != c) {
|
957
|
-
|
958
|
-
|
908
|
+
ox_sax_drive_error(dr, WRONG_CHAR "element not closed");
|
909
|
+
return c;
|
959
910
|
}
|
960
911
|
dr->buf.str = 0;
|
961
912
|
|
962
913
|
return buf_get(&dr->buf);
|
963
914
|
}
|
964
915
|
|
965
|
-
static Nv
|
966
|
-
|
967
|
-
Nv nv;
|
916
|
+
static Nv stack_rev_find(SaxDrive dr, const char *name) {
|
917
|
+
Nv nv;
|
968
918
|
|
969
919
|
for (nv = dr->stack.tail - 1; dr->stack.head <= nv; nv--) {
|
970
|
-
|
971
|
-
|
972
|
-
|
920
|
+
if (0 == (dr->options.smart ? strcasecmp(name, nv->name) : strcmp(name, nv->name))) {
|
921
|
+
return nv;
|
922
|
+
}
|
973
923
|
}
|
974
924
|
return 0;
|
975
925
|
}
|
976
926
|
|
977
|
-
static char
|
978
|
-
|
979
|
-
|
980
|
-
|
981
|
-
long
|
982
|
-
long
|
983
|
-
|
984
|
-
|
985
|
-
Hint h = NULL;
|
927
|
+
static char read_element_end(SaxDrive dr) {
|
928
|
+
VALUE name = Qnil;
|
929
|
+
char c;
|
930
|
+
long pos = (long)(dr->buf.pos - 1);
|
931
|
+
long line = (long)(dr->buf.line);
|
932
|
+
long col = (long)(dr->buf.col - 1);
|
933
|
+
Nv nv;
|
934
|
+
Hint h = NULL;
|
986
935
|
|
987
936
|
if ('\0' == (c = read_name_token(dr))) {
|
988
937
|
return '\0';
|
989
938
|
}
|
990
939
|
if (is_white(c)) {
|
991
|
-
|
940
|
+
c = buf_next_non_white(&dr->buf);
|
992
941
|
}
|
993
942
|
// c should be > and current is one past so read another char
|
994
|
-
c
|
943
|
+
c = buf_get(&dr->buf);
|
995
944
|
nv = stack_peek(&dr->stack);
|
996
|
-
if (0 != nv &&
|
997
|
-
|
998
|
-
|
999
|
-
|
1000
|
-
stack_pop(&dr->stack);
|
945
|
+
if (0 != nv && 0 == (dr->options.smart ? strcasecmp(dr->buf.str, nv->name) : strcmp(dr->buf.str, nv->name))) {
|
946
|
+
name = nv->val;
|
947
|
+
h = nv->hint;
|
948
|
+
stack_pop(&dr->stack);
|
1001
949
|
} else {
|
1002
|
-
|
1003
|
-
|
1004
|
-
|
1005
|
-
|
1006
|
-
|
1007
|
-
|
1008
|
-
|
1009
|
-
|
1010
|
-
|
1011
|
-
|
1012
|
-
|
1013
|
-
|
1014
|
-
|
1015
|
-
|
1016
|
-
|
1017
|
-
|
1018
|
-
|
1019
|
-
|
1020
|
-
|
1021
|
-
|
1022
|
-
|
1023
|
-
|
1024
|
-
|
1025
|
-
|
1026
|
-
|
1027
|
-
|
1028
|
-
|
1029
|
-
|
1030
|
-
|
1031
|
-
|
1032
|
-
|
1033
|
-
|
1034
|
-
|
1035
|
-
|
1036
|
-
|
1037
|
-
|
1038
|
-
|
1039
|
-
|
1040
|
-
|
1041
|
-
|
1042
|
-
|
1043
|
-
|
1044
|
-
|
1045
|
-
|
1046
|
-
|
1047
|
-
|
1048
|
-
|
1049
|
-
|
1050
|
-
|
1051
|
-
|
1052
|
-
|
1053
|
-
|
1054
|
-
|
1055
|
-
|
1056
|
-
|
1057
|
-
|
1058
|
-
|
1059
|
-
rb_funcall(dr->handler, ox_end_element_id, 1, nv->val);
|
1060
|
-
}
|
1061
|
-
if (NULL != nv->hint && BlockOverlay == nv->hint->overlay && 0 < dr->blocked) {
|
1062
|
-
dr->blocked--;
|
1063
|
-
}
|
1064
|
-
}
|
1065
|
-
name = nv->val;
|
1066
|
-
h = nv->hint;
|
1067
|
-
}
|
1068
|
-
}
|
950
|
+
// Mismatched start and end
|
951
|
+
char msg[256];
|
952
|
+
Nv match = stack_rev_find(dr, dr->buf.str);
|
953
|
+
|
954
|
+
if (0 == match) {
|
955
|
+
// Not found so open and close element.
|
956
|
+
h = ox_hint_find(dr->options.hints, dr->buf.str);
|
957
|
+
if (NULL != h && h->empty) {
|
958
|
+
// Just close normally
|
959
|
+
name = str2sym(dr, dr->buf.str, dr->buf.tail - dr->buf.str - 2, 0);
|
960
|
+
snprintf(msg,
|
961
|
+
sizeof(msg) - 1,
|
962
|
+
"%selement '%s' should not have a separate close element",
|
963
|
+
EL_MISMATCH,
|
964
|
+
dr->buf.str);
|
965
|
+
ox_sax_drive_error_at(dr, msg, pos, line, col);
|
966
|
+
return c;
|
967
|
+
} else {
|
968
|
+
snprintf(msg, sizeof(msg) - 1, "%selement '%s' closed but not opened", EL_MISMATCH, dr->buf.str);
|
969
|
+
ox_sax_drive_error_at(dr, msg, pos, line, col);
|
970
|
+
name = str2sym(dr, dr->buf.str, dr->buf.tail - dr->buf.str - 2, 0);
|
971
|
+
if (dr->has_start_element && 0 >= dr->blocked &&
|
972
|
+
(NULL == h || ActiveOverlay == h->overlay || NestOverlay == h->overlay)) {
|
973
|
+
VALUE args[1];
|
974
|
+
|
975
|
+
dr->set_pos(dr->handler, pos);
|
976
|
+
dr->set_line(dr->handler, line);
|
977
|
+
dr->set_col(dr->handler, col);
|
978
|
+
args[0] = name;
|
979
|
+
rb_funcall2(dr->handler, ox_start_element_id, 1, args);
|
980
|
+
}
|
981
|
+
if (NULL != h && BlockOverlay == h->overlay && 0 < dr->blocked) {
|
982
|
+
dr->blocked--;
|
983
|
+
}
|
984
|
+
}
|
985
|
+
} else {
|
986
|
+
// Found a match so close all up to the found element in stack.
|
987
|
+
Nv n2;
|
988
|
+
|
989
|
+
if (0 != (n2 = hint_try_close(dr, dr->buf.str))) {
|
990
|
+
name = n2->val;
|
991
|
+
h = n2->hint;
|
992
|
+
} else {
|
993
|
+
snprintf(msg,
|
994
|
+
sizeof(msg) - 1,
|
995
|
+
"%selement '%s' close does not match '%s' open",
|
996
|
+
EL_MISMATCH,
|
997
|
+
dr->buf.str,
|
998
|
+
nv->name);
|
999
|
+
ox_sax_drive_error_at(dr, msg, pos, line, col);
|
1000
|
+
for (nv = stack_pop(&dr->stack); match < nv; nv = stack_pop(&dr->stack)) {
|
1001
|
+
end_element_cb(dr, nv->val, pos, line, col, nv->hint);
|
1002
|
+
}
|
1003
|
+
name = nv->val;
|
1004
|
+
h = nv->hint;
|
1005
|
+
}
|
1006
|
+
}
|
1069
1007
|
}
|
1070
1008
|
end_element_cb(dr, name, pos, line, col, h);
|
1071
1009
|
|
1072
1010
|
return c;
|
1073
1011
|
}
|
1074
1012
|
|
1075
|
-
static char
|
1076
|
-
|
1077
|
-
|
1078
|
-
|
1079
|
-
long
|
1080
|
-
long
|
1081
|
-
|
1082
|
-
|
1083
|
-
int allWhite = 1;
|
1013
|
+
static char read_text(SaxDrive dr) {
|
1014
|
+
VALUE args[1];
|
1015
|
+
char c;
|
1016
|
+
long pos = (long)(dr->buf.pos);
|
1017
|
+
long line = (long)(dr->buf.line);
|
1018
|
+
long col = (long)(dr->buf.col - 1);
|
1019
|
+
Nv parent = stack_peek(&dr->stack);
|
1020
|
+
int allWhite = 1;
|
1084
1021
|
|
1085
1022
|
buf_backup(&dr->buf);
|
1086
1023
|
buf_protect(&dr->buf);
|
1087
1024
|
while ('<' != (c = buf_get(&dr->buf))) {
|
1088
|
-
|
1089
|
-
|
1090
|
-
|
1091
|
-
|
1092
|
-
|
1093
|
-
|
1094
|
-
|
1095
|
-
|
1096
|
-
|
1097
|
-
|
1098
|
-
}
|
1025
|
+
switch (c) {
|
1026
|
+
case ' ':
|
1027
|
+
case '\t':
|
1028
|
+
case '\f':
|
1029
|
+
case '\n':
|
1030
|
+
case '\r': break;
|
1031
|
+
case '\0':
|
1032
|
+
if (allWhite) {
|
1033
|
+
return c;
|
1034
|
+
}
|
1099
1035
|
ox_sax_drive_error(dr, NO_TERM "text not terminated");
|
1100
|
-
|
1101
|
-
|
1102
|
-
|
1103
|
-
|
1104
|
-
break;
|
1105
|
-
}
|
1036
|
+
goto END_OF_BUF;
|
1037
|
+
break;
|
1038
|
+
default: allWhite = 0; break;
|
1039
|
+
}
|
1106
1040
|
}
|
1107
|
-
|
1041
|
+
END_OF_BUF:
|
1108
1042
|
if ('\0' != c) {
|
1109
|
-
|
1043
|
+
*(dr->buf.tail - 1) = '\0';
|
1110
1044
|
}
|
1111
1045
|
if (allWhite) {
|
1112
|
-
|
1113
|
-
|
1114
|
-
|
1115
|
-
|
1116
|
-
|
1117
|
-
|
1118
|
-
|
1119
|
-
|
1120
|
-
|
1121
|
-
|
1122
|
-
|
1123
|
-
|
1124
|
-
|
1125
|
-
|
1126
|
-
|
1127
|
-
|
1128
|
-
rb_ivar_set(dr->handler, ox_at_line_id, LONG2NUM(line));
|
1129
|
-
}
|
1130
|
-
if (dr->has.column) {
|
1131
|
-
rb_ivar_set(dr->handler, ox_at_column_id, LONG2NUM(col));
|
1132
|
-
}
|
1133
|
-
rb_funcall2(dr->handler, ox_text_id, 1, args);
|
1134
|
-
}
|
1135
|
-
if (!isEnd || 0 == parent || 0 < parent->childCnt) {
|
1136
|
-
return c;
|
1137
|
-
}
|
1046
|
+
int isEnd = ('/' == buf_get(&dr->buf));
|
1047
|
+
|
1048
|
+
buf_backup(&dr->buf);
|
1049
|
+
if (dr->has_text && ((NoSkip == dr->options.skip && !isEnd) || (OffSkip == dr->options.skip))) {
|
1050
|
+
args[0] = rb_str_new2(dr->buf.str);
|
1051
|
+
if (0 != dr->encoding) {
|
1052
|
+
rb_enc_associate(args[0], dr->encoding);
|
1053
|
+
}
|
1054
|
+
dr->set_pos(dr->handler, pos);
|
1055
|
+
dr->set_line(dr->handler, line);
|
1056
|
+
dr->set_col(dr->handler, col);
|
1057
|
+
rb_funcall2(dr->handler, ox_text_id, 1, args);
|
1058
|
+
}
|
1059
|
+
if (!isEnd || 0 == parent || 0 < parent->childCnt) {
|
1060
|
+
return c;
|
1061
|
+
}
|
1138
1062
|
}
|
1139
1063
|
if (0 != parent) {
|
1140
|
-
|
1064
|
+
parent->childCnt++;
|
1141
1065
|
}
|
1142
1066
|
if (!dr->blocked && (NULL == parent || NULL == parent->hint || OffOverlay != parent->hint->overlay)) {
|
1143
|
-
|
1144
|
-
|
1145
|
-
|
1146
|
-
|
1147
|
-
|
1148
|
-
|
1149
|
-
|
1150
|
-
|
1151
|
-
|
1152
|
-
|
1153
|
-
|
1154
|
-
|
1155
|
-
|
1156
|
-
|
1157
|
-
|
1158
|
-
|
1159
|
-
|
1160
|
-
|
1161
|
-
|
1162
|
-
|
1163
|
-
|
1164
|
-
|
1165
|
-
|
1166
|
-
|
1167
|
-
break;
|
1168
|
-
}
|
1169
|
-
args[0] = rb_str_new2(dr->buf.str);
|
1170
|
-
#if HAVE_RB_ENC_ASSOCIATE
|
1171
|
-
if (0 != dr->encoding) {
|
1172
|
-
rb_enc_associate(args[0], dr->encoding);
|
1173
|
-
}
|
1174
|
-
#endif
|
1175
|
-
if (dr->has.pos) {
|
1176
|
-
rb_ivar_set(dr->handler, ox_at_pos_id, LONG2NUM(pos));
|
1177
|
-
}
|
1178
|
-
if (dr->has.line) {
|
1179
|
-
rb_ivar_set(dr->handler, ox_at_line_id, LONG2NUM(line));
|
1180
|
-
}
|
1181
|
-
if (dr->has.column) {
|
1182
|
-
rb_ivar_set(dr->handler, ox_at_column_id, LONG2NUM(col));
|
1183
|
-
}
|
1184
|
-
rb_funcall2(dr->handler, ox_text_id, 1, args);
|
1185
|
-
}
|
1067
|
+
if (dr->has_value) {
|
1068
|
+
dr->set_pos(dr->handler, pos);
|
1069
|
+
dr->set_line(dr->handler, line);
|
1070
|
+
dr->set_col(dr->handler, col);
|
1071
|
+
*args = dr->value_obj;
|
1072
|
+
rb_funcall2(dr->handler, ox_value_id, 1, args);
|
1073
|
+
} else if (dr->has_text) {
|
1074
|
+
if (dr->options.convert_special) {
|
1075
|
+
ox_sax_collapse_special(dr, dr->buf.str, pos, line, col);
|
1076
|
+
}
|
1077
|
+
switch (dr->options.skip) {
|
1078
|
+
case CrSkip: buf_collapse_return(dr->buf.str); break;
|
1079
|
+
case SpcSkip: buf_collapse_white(dr->buf.str); break;
|
1080
|
+
default: break;
|
1081
|
+
}
|
1082
|
+
args[0] = rb_str_new2(dr->buf.str);
|
1083
|
+
if (0 != dr->encoding) {
|
1084
|
+
rb_enc_associate(args[0], dr->encoding);
|
1085
|
+
}
|
1086
|
+
dr->set_pos(dr->handler, pos);
|
1087
|
+
dr->set_line(dr->handler, line);
|
1088
|
+
dr->set_col(dr->handler, col);
|
1089
|
+
rb_funcall2(dr->handler, ox_text_id, 1, args);
|
1090
|
+
}
|
1186
1091
|
}
|
1187
1092
|
dr->buf.str = 0;
|
1188
1093
|
|
1189
1094
|
return c;
|
1190
1095
|
}
|
1191
1096
|
|
1192
|
-
static int
|
1193
|
-
|
1194
|
-
struct _checkPt cp;
|
1097
|
+
static int read_jump_term(Buf buf, const char *pat) {
|
1098
|
+
struct _checkPt cp;
|
1195
1099
|
|
1196
|
-
buf_checkpoint(buf, &cp);
|
1100
|
+
buf_checkpoint(buf, &cp); // right after <
|
1197
1101
|
if ('/' != buf_next_non_white(buf)) {
|
1198
|
-
|
1102
|
+
return 0;
|
1199
1103
|
}
|
1200
1104
|
if (*pat != tolower(buf_next_non_white(buf))) {
|
1201
|
-
|
1105
|
+
return 0;
|
1202
1106
|
}
|
1203
1107
|
for (pat++; '\0' != *pat; pat++) {
|
1204
|
-
|
1205
|
-
|
1206
|
-
|
1108
|
+
if (*pat != tolower(buf_get(buf))) {
|
1109
|
+
return 0;
|
1110
|
+
}
|
1207
1111
|
}
|
1208
1112
|
if ('>' != buf_next_non_white(buf)) {
|
1209
|
-
|
1113
|
+
return 0;
|
1210
1114
|
}
|
1211
1115
|
buf_checkback(buf, &cp);
|
1212
1116
|
return 1;
|
1213
1117
|
}
|
1214
1118
|
|
1215
|
-
static char
|
1216
|
-
|
1217
|
-
|
1218
|
-
|
1219
|
-
long
|
1220
|
-
long
|
1221
|
-
|
1222
|
-
Nv parent = stack_peek(&dr->stack);
|
1119
|
+
static char read_jump(SaxDrive dr, const char *pat) {
|
1120
|
+
VALUE args[1];
|
1121
|
+
char c;
|
1122
|
+
long pos = (long)(dr->buf.pos);
|
1123
|
+
long line = (long)(dr->buf.line);
|
1124
|
+
long col = (long)(dr->buf.col - 1);
|
1125
|
+
Nv parent = stack_peek(&dr->stack);
|
1223
1126
|
|
1224
1127
|
buf_protect(&dr->buf);
|
1225
1128
|
while (1) {
|
1226
|
-
|
1227
|
-
|
1228
|
-
|
1229
|
-
|
1230
|
-
|
1231
|
-
|
1232
|
-
|
1233
|
-
|
1234
|
-
|
1129
|
+
c = buf_get(&dr->buf);
|
1130
|
+
switch (c) {
|
1131
|
+
case '<':
|
1132
|
+
if (read_jump_term(&dr->buf, pat)) {
|
1133
|
+
goto END_OF_BUF;
|
1134
|
+
break;
|
1135
|
+
}
|
1136
|
+
break;
|
1137
|
+
case '\0':
|
1235
1138
|
ox_sax_drive_error(dr, NO_TERM "not terminated");
|
1236
|
-
|
1237
|
-
|
1238
|
-
|
1239
|
-
|
1240
|
-
}
|
1139
|
+
goto END_OF_BUF;
|
1140
|
+
break;
|
1141
|
+
default: break;
|
1142
|
+
}
|
1241
1143
|
}
|
1242
|
-
|
1144
|
+
END_OF_BUF:
|
1243
1145
|
if ('\0' != c) {
|
1244
|
-
|
1146
|
+
*(dr->buf.tail - 1) = '\0';
|
1245
1147
|
}
|
1246
1148
|
if (0 != parent) {
|
1247
|
-
|
1149
|
+
parent->childCnt++;
|
1248
1150
|
}
|
1249
1151
|
// TBD check parent overlay
|
1250
|
-
if (dr->
|
1152
|
+
if (dr->has_text && !dr->blocked) {
|
1251
1153
|
args[0] = rb_str_new2(dr->buf.str);
|
1252
|
-
#if HAVE_RB_ENC_ASSOCIATE
|
1253
1154
|
if (0 != dr->encoding) {
|
1254
1155
|
rb_enc_associate(args[0], dr->encoding);
|
1255
1156
|
}
|
1256
|
-
|
1257
|
-
|
1258
|
-
|
1259
|
-
}
|
1260
|
-
if (dr->has.line) {
|
1261
|
-
rb_ivar_set(dr->handler, ox_at_line_id, LONG2NUM(line));
|
1262
|
-
}
|
1263
|
-
if (dr->has.column) {
|
1264
|
-
rb_ivar_set(dr->handler, ox_at_column_id, LONG2NUM(col));
|
1265
|
-
}
|
1157
|
+
dr->set_pos(dr->handler, pos);
|
1158
|
+
dr->set_line(dr->handler, line);
|
1159
|
+
dr->set_col(dr->handler, col);
|
1266
1160
|
rb_funcall2(dr->handler, ox_text_id, 1, args);
|
1267
1161
|
}
|
1268
1162
|
dr->buf.str = 0;
|
1269
1163
|
if ('\0' != c) {
|
1270
|
-
|
1164
|
+
*(dr->buf.tail - 1) = '<';
|
1271
1165
|
}
|
1272
1166
|
return c;
|
1273
1167
|
}
|
1274
1168
|
|
1275
|
-
static char
|
1276
|
-
|
1277
|
-
|
1278
|
-
|
1279
|
-
off_t
|
1280
|
-
off_t
|
1281
|
-
|
1282
|
-
char *attr_value;
|
1169
|
+
static char read_attrs(SaxDrive dr, char c, char termc, char term2, int is_xml, int eq_req, Hint h) {
|
1170
|
+
VALUE name = Qnil;
|
1171
|
+
int is_encoding = 0;
|
1172
|
+
off_t pos;
|
1173
|
+
off_t line;
|
1174
|
+
off_t col;
|
1175
|
+
char *attr_value;
|
1283
1176
|
|
1284
1177
|
// already protected by caller
|
1285
1178
|
dr->buf.str = dr->buf.tail;
|
@@ -1287,94 +1180,52 @@ read_attrs(SaxDrive dr, char c, char termc, char term2, int is_xml, int eq_req,
|
|
1287
1180
|
c = buf_next_non_white(&dr->buf);
|
1288
1181
|
}
|
1289
1182
|
while (termc != c && term2 != c) {
|
1290
|
-
|
1183
|
+
buf_backup(&dr->buf);
|
1291
1184
|
if ('\0' == c) {
|
1292
|
-
|
1293
|
-
|
1185
|
+
ox_sax_drive_error(dr, NO_TERM "attributes not terminated");
|
1186
|
+
return '\0';
|
1294
1187
|
}
|
1295
|
-
|
1296
|
-
|
1297
|
-
|
1188
|
+
pos = dr->buf.pos + 1;
|
1189
|
+
line = dr->buf.line;
|
1190
|
+
col = dr->buf.col + 1;
|
1298
1191
|
if ('\0' == (c = read_name_token(dr))) {
|
1299
|
-
|
1300
|
-
|
1192
|
+
ox_sax_drive_error(dr, NO_TERM "error reading token");
|
1193
|
+
return '\0';
|
1301
1194
|
}
|
1302
1195
|
if (is_xml && 0 == strcasecmp("encoding", dr->buf.str)) {
|
1303
1196
|
is_encoding = 1;
|
1304
1197
|
}
|
1305
|
-
if (dr->
|
1306
|
-
name = str2sym(dr, dr->buf.str, 0);
|
1198
|
+
if (dr->want_attr_name) {
|
1199
|
+
name = str2sym(dr, dr->buf.str, dr->buf.tail - dr->buf.str - 1, 0);
|
1307
1200
|
}
|
1308
1201
|
if (is_white(c)) {
|
1309
1202
|
c = buf_next_non_white(&dr->buf);
|
1310
1203
|
}
|
1311
1204
|
if ('=' != c) {
|
1312
|
-
|
1313
|
-
|
1314
|
-
|
1315
|
-
|
1316
|
-
|
1317
|
-
|
1318
|
-
|
1205
|
+
if (eq_req) {
|
1206
|
+
dr->err = 1;
|
1207
|
+
return c;
|
1208
|
+
} else {
|
1209
|
+
ox_sax_drive_error(dr, WRONG_CHAR "no attribute value");
|
1210
|
+
attr_value = (char *)"";
|
1211
|
+
}
|
1319
1212
|
} else {
|
1320
|
-
|
1321
|
-
|
1322
|
-
|
1323
|
-
|
1324
|
-
|
1325
|
-
|
1326
|
-
|
1327
|
-
|
1328
|
-
|
1329
|
-
|
1330
|
-
|
1331
|
-
|
1332
|
-
|
1333
|
-
|
1334
|
-
|
1335
|
-
|
1336
|
-
VALUE args[2];
|
1337
|
-
|
1338
|
-
if (dr->has.pos) {
|
1339
|
-
rb_ivar_set(dr->handler, ox_at_pos_id, LONG2NUM(pos));
|
1340
|
-
}
|
1341
|
-
if (dr->has.line) {
|
1342
|
-
rb_ivar_set(dr->handler, ox_at_line_id, LONG2NUM(line));
|
1343
|
-
}
|
1344
|
-
if (dr->has.column) {
|
1345
|
-
rb_ivar_set(dr->handler, ox_at_column_id, LONG2NUM(col));
|
1346
|
-
}
|
1347
|
-
args[0] = name;
|
1348
|
-
args[1] = dr->value_obj;
|
1349
|
-
rb_funcall2(dr->handler, ox_attr_value_id, 2, args);
|
1350
|
-
} else if (dr->has.attr) {
|
1351
|
-
VALUE args[2];
|
1352
|
-
|
1353
|
-
args[0] = name;
|
1354
|
-
if (dr->options.convert_special) {
|
1355
|
-
ox_sax_collapse_special(dr, dr->buf.str, pos, line, col);
|
1356
|
-
}
|
1357
|
-
args[1] = rb_str_new2(attr_value);
|
1358
|
-
#if HAVE_RB_ENC_ASSOCIATE
|
1359
|
-
if (0 != dr->encoding) {
|
1360
|
-
rb_enc_associate(args[1], dr->encoding);
|
1361
|
-
}
|
1362
|
-
#endif
|
1363
|
-
if (dr->has.pos) {
|
1364
|
-
rb_ivar_set(dr->handler, ox_at_pos_id, LONG2NUM(pos));
|
1365
|
-
}
|
1366
|
-
if (dr->has.line) {
|
1367
|
-
rb_ivar_set(dr->handler, ox_at_line_id, LONG2NUM(line));
|
1368
|
-
}
|
1369
|
-
if (dr->has.column) {
|
1370
|
-
rb_ivar_set(dr->handler, ox_at_column_id, LONG2NUM(col));
|
1371
|
-
}
|
1372
|
-
rb_funcall2(dr->handler, ox_attr_id, 2, args);
|
1373
|
-
}
|
1374
|
-
}
|
1375
|
-
if (is_white(c)) {
|
1376
|
-
c = buf_next_non_white(&dr->buf);
|
1377
|
-
}
|
1213
|
+
pos = dr->buf.pos + 1;
|
1214
|
+
line = dr->buf.line;
|
1215
|
+
col = dr->buf.col + 1;
|
1216
|
+
c = read_quoted_value(dr);
|
1217
|
+
attr_value = dr->buf.str;
|
1218
|
+
if (is_encoding) {
|
1219
|
+
dr->encoding = rb_enc_find(dr->buf.str);
|
1220
|
+
is_encoding = 0;
|
1221
|
+
}
|
1222
|
+
}
|
1223
|
+
if (0 >= dr->blocked && (NULL == h || ActiveOverlay == h->overlay || NestOverlay == h->overlay)) {
|
1224
|
+
dr->attr_cb(dr, name, attr_value, pos, line, col);
|
1225
|
+
}
|
1226
|
+
if (is_white(c)) {
|
1227
|
+
c = buf_next_non_white(&dr->buf);
|
1228
|
+
}
|
1378
1229
|
}
|
1379
1230
|
dr->buf.str = 0;
|
1380
1231
|
|
@@ -1384,66 +1235,62 @@ read_attrs(SaxDrive dr, char c, char termc, char term2, int is_xml, int eq_req,
|
|
1384
1235
|
/* The character after the word is returned. dr->buf.tail is one past
|
1385
1236
|
* that. dr->buf.str will point to the token which will be '\0' terminated.
|
1386
1237
|
*/
|
1387
|
-
static char
|
1388
|
-
|
1389
|
-
char c;
|
1238
|
+
static char read_name_token(SaxDrive dr) {
|
1239
|
+
char c;
|
1390
1240
|
|
1391
1241
|
dr->buf.str = dr->buf.tail;
|
1392
|
-
c
|
1242
|
+
c = buf_get(&dr->buf);
|
1393
1243
|
if (is_white(c)) {
|
1394
|
-
c
|
1244
|
+
c = buf_next_non_white(&dr->buf);
|
1395
1245
|
dr->buf.str = dr->buf.tail - 1;
|
1396
1246
|
}
|
1397
1247
|
while (1) {
|
1398
|
-
|
1399
|
-
|
1400
|
-
|
1401
|
-
|
1402
|
-
|
1403
|
-
|
1404
|
-
|
1405
|
-
|
1406
|
-
|
1407
|
-
|
1408
|
-
|
1409
|
-
|
1410
|
-
return c;
|
1411
|
-
case '\0':
|
1248
|
+
switch (c) {
|
1249
|
+
case ' ':
|
1250
|
+
case '\t':
|
1251
|
+
case '\f':
|
1252
|
+
case '?':
|
1253
|
+
case '=':
|
1254
|
+
case '/':
|
1255
|
+
case '>':
|
1256
|
+
case '<':
|
1257
|
+
case '\n':
|
1258
|
+
case '\r': *(dr->buf.tail - 1) = '\0'; return c;
|
1259
|
+
case '\0':
|
1412
1260
|
/* documents never terminate after a name token */
|
1413
1261
|
ox_sax_drive_error(dr, NO_TERM "document not terminated");
|
1414
1262
|
return '\0';
|
1415
|
-
|
1416
|
-
|
1417
|
-
|
1418
|
-
|
1419
|
-
|
1420
|
-
|
1421
|
-
|
1422
|
-
|
1423
|
-
|
1424
|
-
|
1425
|
-
|
1426
|
-
|
1427
|
-
|
1428
|
-
|
1263
|
+
case ':':
|
1264
|
+
if ('\0' == *dr->options.strip_ns) {
|
1265
|
+
break;
|
1266
|
+
} else if ('*' == *dr->options.strip_ns && '\0' == dr->options.strip_ns[1]) {
|
1267
|
+
dr->buf.str = dr->buf.tail;
|
1268
|
+
} else if (dr->options.smart &&
|
1269
|
+
0 == strncasecmp(dr->options.strip_ns, dr->buf.str, dr->buf.tail - dr->buf.str - 1)) {
|
1270
|
+
dr->buf.str = dr->buf.tail;
|
1271
|
+
} else if (0 == strncmp(dr->options.strip_ns, dr->buf.str, dr->buf.tail - dr->buf.str - 1)) {
|
1272
|
+
dr->buf.str = dr->buf.tail;
|
1273
|
+
}
|
1274
|
+
break;
|
1275
|
+
default: break;
|
1276
|
+
}
|
1429
1277
|
c = buf_get(&dr->buf);
|
1430
1278
|
}
|
1431
1279
|
return '\0';
|
1432
1280
|
}
|
1433
1281
|
|
1434
|
-
/* The character after the quote or if there is no quote, the character after the word is returned. dr->buf.tail is one
|
1435
|
-
* that. dr->buf.str will point to the token which will be '\0' terminated.
|
1282
|
+
/* The character after the quote or if there is no quote, the character after the word is returned. dr->buf.tail is one
|
1283
|
+
* past that. dr->buf.str will point to the token which will be '\0' terminated.
|
1436
1284
|
*/
|
1437
|
-
static char
|
1438
|
-
|
1439
|
-
char c;
|
1285
|
+
static char read_quoted_value(SaxDrive dr) {
|
1286
|
+
char c;
|
1440
1287
|
|
1441
1288
|
c = buf_get(&dr->buf);
|
1442
1289
|
if (is_white(c)) {
|
1443
1290
|
c = buf_next_non_white(&dr->buf);
|
1444
1291
|
}
|
1445
1292
|
if ('"' == c || '\'' == c) {
|
1446
|
-
|
1293
|
+
char term = c;
|
1447
1294
|
|
1448
1295
|
dr->buf.str = dr->buf.tail;
|
1449
1296
|
while (term != (c = buf_get(&dr->buf))) {
|
@@ -1452,187 +1299,185 @@ read_quoted_value(SaxDrive dr) {
|
|
1452
1299
|
return '\0';
|
1453
1300
|
}
|
1454
1301
|
}
|
1455
|
-
|
1456
|
-
|
1457
|
-
|
1458
|
-
|
1302
|
+
// dr->buf.tail is one past quote char
|
1303
|
+
*(dr->buf.tail - 1) = '\0'; /* terminate value */
|
1304
|
+
c = buf_get(&dr->buf);
|
1305
|
+
return c;
|
1459
1306
|
}
|
1460
1307
|
// not quoted, look for something that terminates the string
|
1461
1308
|
dr->buf.str = dr->buf.tail - 1;
|
1462
1309
|
ox_sax_drive_error(dr, WRONG_CHAR "attribute value not in quotes");
|
1463
1310
|
while ('\0' != (c = buf_get(&dr->buf))) {
|
1464
|
-
|
1465
|
-
|
1466
|
-
|
1467
|
-
|
1468
|
-
|
1469
|
-
|
1470
|
-
|
1471
|
-
|
1472
|
-
|
1473
|
-
|
1474
|
-
|
1475
|
-
|
1476
|
-
|
1477
|
-
}
|
1311
|
+
switch (c) {
|
1312
|
+
case ' ':
|
1313
|
+
// case '/':
|
1314
|
+
case '>':
|
1315
|
+
case '?': // for instructions
|
1316
|
+
case '\t':
|
1317
|
+
case '\n':
|
1318
|
+
case '\r':
|
1319
|
+
*(dr->buf.tail - 1) = '\0'; /* terminate value */
|
1320
|
+
// dr->buf.tail is in the correct position, one after the word terminator
|
1321
|
+
return c;
|
1322
|
+
default: break;
|
1323
|
+
}
|
1478
1324
|
}
|
1479
|
-
return '\0';
|
1325
|
+
return '\0'; // should never get here
|
1480
1326
|
}
|
1481
1327
|
|
1482
|
-
static char*
|
1483
|
-
|
1484
|
-
|
1485
|
-
char c;
|
1328
|
+
/* Parses the hexadecimal digits of a character reference.
 *
 * b points at the first digit and parsing continues up to a ';'.
 *
 * On success the value is stored in *up and a pointer to the ';' is returned.
 * NULL is returned, and *up is left untouched, when a character that is not a
 * hex digit is met before the ';' (this includes the end of the string), when
 * there are no digits at all, or when the value does not fit in 64 bits.
 */
static char *read_hex_uint64(char *b, uint64_t *up) {
    uint64_t u     = 0;
    char    *start = b;

    for (; ';' != *b; b++) {
        char     c = *b;
        uint64_t digit;

        if ('0' <= c && c <= '9') {
            digit = (uint64_t)(c - '0');
        } else if ('a' <= c && c <= 'f') {
            digit = (uint64_t)(c - 'a' + 10);
        } else if ('A' <= c && c <= 'F') {
            digit = (uint64_t)(c - 'A' + 10);
        } else {
            return NULL;
        }
        if (0 != (u >> 60)) {
            /* The top nibble is in use, another shift would drop bits. */
            return NULL;
        }
        u = (u << 4) | digit;
    }
    if (start == b) {
        /* A bare ';' is not a number; reporting 0 would make the caller emit a NUL. */
        return NULL;
    }
    *up = u;

    return b;
}
|
1503
1348
|
|
1504
|
-
static char*
|
1505
|
-
|
1506
|
-
|
1507
|
-
char c;
|
1349
|
+
/* Parses the decimal digits of a character reference.
 *
 * b points at the first digit and parsing continues up to a ';'.
 *
 * On success the value is stored in *up and a pointer to the ';' is returned.
 * NULL is returned, and *up is left untouched, when a character that is not a
 * decimal digit is met before the ';' (this includes the end of the string),
 * when there are no digits at all, or when the value does not fit in 64 bits.
 */
static char *read_10_uint64(char *b, uint64_t *up) {
    const uint64_t max   = ~(uint64_t)0;
    uint64_t       u     = 0;
    char          *start = b;

    for (; ';' != *b; b++) {
        char     c = *b;
        uint64_t digit;

        if (c < '0' || '9' < c) {
            return NULL;
        }
        digit = (uint64_t)(c - '0');
        if ((max - digit) / 10 < u) {
            /* u * 10 + digit would wrap around. */
            return NULL;
        }
        u = (u * 10) + digit;
    }
    if (start == b) {
        /* A bare ';' is not a number; reporting 0 would make the caller emit a NUL. */
        return NULL;
    }
    *up = u;

    return b;
}
|
1521
1365
|
|
1522
|
-
int
|
1523
|
-
|
1524
|
-
char
|
1525
|
-
char *b = str;
|
1366
|
+
/* Collapses character and entity references in str, in place.
 *
 * The string is rewritten from its start:
 *  - "&#nnn;" and "&#xhhh;" become the character itself; values above 0x7F
 *    are written with ox_ucs_to_utf8_chars() and, if the drive has no encoding
 *    yet, the encoding is set to ox_utf8_encoding.
 *  - "&lt;", "&gt;", "&amp;", "&quot;" and "&apos;" (compared ignoring case)
 *    become the single character they stand for.
 *  - any other "&name;" is passed to ox_entity_lookup(); when that returns
 *    NULL an error is reported and the '&' is kept as is.
 *  - "\r\n" and a lone "\r" both become "\n".
 *
 * pos, line and col describe where str starts and are only used to report
 * errors; line and col are advanced locally while scanning.
 *
 * Always returns 0.
 */
int ox_sax_collapse_special(SaxDrive dr, char *str, long pos, long line, long col) {
    char *s = str; /* read position */
    char *b = str; /* write position */

    while ('\0' != *s) {
        switch (*s) {
        case '&': {
            int   c = 0;
            char *end;

            s++;
            if ('#' == *s) {
                uint64_t u = 0;
                char     x;

                s++;
                if ('x' == *s || 'X' == *s) {
                    x = *s;
                    s++;
                    end = read_hex_uint64(s, &u);
                } else {
                    x   = '\0';
                    end = read_10_uint64(s, &u);
                }
                if (0 == end) {
                    /* Keep the text that was consumed so far and carry on with
                     * whatever follows as ordinary characters.
                     */
                    ox_sax_drive_error(dr, NO_TERM "special character does not end with a semicolon");
                    *b++ = '&';
                    *b++ = '#';
                    if ('\0' != x) {
                        *b++ = x;
                    }
                    continue;
                }
                if (u <= 0x000000000000007FULL) {
                    *b++ = (char)u;
                } else if (ox_utf8_encoding == dr->encoding) {
                    b = ox_ucs_to_utf8_chars(b, u);
                } else if (0 == dr->encoding) {
                    dr->encoding = ox_utf8_encoding;
                    b            = ox_ucs_to_utf8_chars(b, u);
                } else {
                    /* Some other encoding is set: the character is still written as UTF-8. */
                    b = ox_ucs_to_utf8_chars(b, u);
                }
                s = end + 1;
                continue;
            } else if (0 == strncasecmp(s, "lt;", 3)) {
                c = '<';
                s += 3;
                col += 3;
            } else if (0 == strncasecmp(s, "gt;", 3)) {
                c = '>';
                s += 3;
                col += 3;
            } else if (0 == strncasecmp(s, "amp;", 4)) {
                c = '&';
                s += 4;
                col += 4;
            } else if (0 == strncasecmp(s, "quot;", 5)) {
                c = '"';
                s += 5;
                col += 5;
            } else if (0 == strncasecmp(s, "apos;", 5)) {
                c = '\'';
                s += 5;
                /* Advance the column like the other named references do. */
                col += 5;
            } else {
                char  key[16];
                char *k    = key;
                char *kend = key + sizeof(key) - 1;
                char *bn;
                char *s2 = s;

                /* Copy the name up to ';'. A name that does not fit in key is
                 * turned into an empty key so it is reported as invalid.
                 */
                for (; ';' != *s2 && '\0' != *s2; s2++, k++) {
                    if (kend <= k) {
                        k = key;
                        break;
                    }
                    *k = *s2;
                }
                *k = '\0';
                if ('\0' == *key || NULL == (bn = ox_entity_lookup(b, key))) {
                    ox_sax_drive_error_at(dr, INVALID_FORMAT "Invalid special character sequence", pos, line, col);
                    c = '&';
                } else {
                    b = bn;
                    s = s2 + 1;
                    continue;
                }
            }
            *b++ = (char)c;
            col++;
            break;
        }
        case '\r':
            s++;
            if ('\n' == *s) {
                /* "\r\n": drop the '\r', the '\n' is handled on the next pass. */
                continue;
            }
            line++;
            col  = 1;
            *b++ = '\n';
            break;
        case '\n':
            line++;
            col = 0;
            // fall through
        default:
            col++;
            *b++ = *s++;
            break;
        }
    }
    *b = '\0';

    return 0;
}
|
1642
1487
|
|
1643
|
-
static void
|
1644
|
-
|
1645
|
-
Nv nv;
|
1488
|
+
/* Pops elements off the top of the drive's stack for as long as the top
 * element has a hint and that hint is flagged empty, calling end_element_cb()
 * for each one before it is popped. Stops at the first element without a hint
 * or whose hint is not flagged empty, or when the stack runs out.
 */
static void hint_clear_empty(SaxDrive dr) {
    Nv top;

    while (0 != (top = stack_peek(&dr->stack))) {
        if (0 == top->hint) {
            return;
        }
        if (!top->hint->empty) {
            return;
        }
        end_element_cb(dr, top->val, dr->buf.pos, dr->buf.line, dr->buf.col, top->hint);
        stack_pop(&dr->stack);
    }
}
|
1659
1503
|
|
1660
|
-
static Nv
|
1661
|
-
|
1662
|
-
|
1663
|
-
Nv nv;
|
1504
|
+
/* Looks for an open element called name (compared ignoring case) at the top of
 * the drive's stack.
 *
 * Nothing is done and 0 is returned unless ox_hint_find() knows a hint for
 * name. Otherwise the stack is walked from the top: a matching element is
 * popped and returned; an element whose hint is flagged empty gets an
 * end_element_cb() call and is removed by moving the stack tail back to it;
 * any other element stops the search and 0 is returned.
 */
static Nv hint_try_close(SaxDrive dr, const char *name) {
    Nv top;

    if (0 == ox_hint_find(dr->options.hints, name)) {
        return 0;
    }
    while (0 != (top = stack_peek(&dr->stack))) {
        if (0 == strcasecmp(name, top->name)) {
            stack_pop(&dr->stack);
            return top;
        }
        if (0 == top->hint || !top->hint->empty) {
            break;
        }
        end_element_cb(dr, top->val, dr->buf.pos, dr->buf.line, dr->buf.col, top->hint);
        dr->stack.tail = top;
    }
    return 0;
}
|
1685
|
-
|
1686
|
-
static void
|
1687
|
-
end_element_cb(SaxDrive dr, VALUE name, long pos, long line, long col, Hint h) {
|
1688
|
-
if (dr->has.end_element && 0 >= dr->blocked && (NULL == h || ActiveOverlay == h->overlay || NestOverlay == h->overlay)) {
|
1689
|
-
if (dr->has.pos) {
|
1690
|
-
rb_ivar_set(dr->handler, ox_at_pos_id, LONG2NUM(pos));
|
1691
|
-
}
|
1692
|
-
if (dr->has.line) {
|
1693
|
-
rb_ivar_set(dr->handler, ox_at_line_id, LONG2NUM(line));
|
1694
|
-
}
|
1695
|
-
if (dr->has.column) {
|
1696
|
-
rb_ivar_set(dr->handler, ox_at_column_id, LONG2NUM(col));
|
1697
|
-
}
|
1698
|
-
rb_funcall(dr->handler, ox_end_element_id, 1, name);
|
1699
|
-
}
|
1700
|
-
if (NULL != h && BlockOverlay == h->overlay && 0 < dr->blocked) {
|
1701
|
-
dr->blocked--;
|
1702
|
-
}
|
1703
|
-
}
|