ox 2.14.3 → 2.14.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +33 -1
- data/README.md +1 -1
- data/ext/ox/builder.c +8 -8
- data/ext/ox/cache.c +320 -131
- data/ext/ox/cache.h +15 -13
- data/ext/ox/dump.c +2 -2
- data/ext/ox/extconf.rb +5 -2
- data/ext/ox/gen_load.c +8 -76
- data/ext/ox/hash_load.c +0 -4
- data/ext/ox/intern.c +158 -0
- data/ext/ox/intern.h +25 -0
- data/ext/ox/obj_load.c +12 -85
- data/ext/ox/ox.c +1018 -931
- data/ext/ox/ox.h +188 -210
- data/ext/ox/oxcache.c +160 -0
- data/ext/ox/oxcache.h +19 -0
- data/ext/ox/parse.c +72 -31
- data/ext/ox/sax.c +1093 -1279
- data/ext/ox/sax.h +45 -31
- data/ext/ox/sax_as.c +3 -5
- data/ext/ox/sax_buf.c +7 -16
- data/lib/ox/version.rb +1 -1
- metadata +11 -5
- data/ext/ox/sax_has.h +0 -53
data/ext/ox/sax.c
CHANGED
@@ -4,66 +4,63 @@
|
|
4
4
|
*/
|
5
5
|
|
6
6
|
#include <ctype.h>
|
7
|
-
#include <stdlib.h>
|
8
7
|
#include <errno.h>
|
9
8
|
#include <stdio.h>
|
9
|
+
#include <stdlib.h>
|
10
10
|
#include <strings.h>
|
11
11
|
#include <sys/types.h>
|
12
12
|
#if HAVE_SYS_UIO_H
|
13
13
|
#include <sys/uio.h>
|
14
14
|
#endif
|
15
|
-
#include <unistd.h>
|
16
15
|
#include <time.h>
|
16
|
+
#include <unistd.h>
|
17
17
|
|
18
|
+
#include "intern.h"
|
19
|
+
#include "ox.h"
|
18
20
|
#include "ruby.h"
|
19
|
-
#if HAVE_RB_ENC_ASSOCIATE
|
20
21
|
#include "ruby/encoding.h"
|
21
|
-
#endif
|
22
|
-
#include "ox.h"
|
23
22
|
#include "sax.h"
|
24
|
-
#include "sax_stack.h"
|
25
23
|
#include "sax_buf.h"
|
24
|
+
#include "sax_stack.h"
|
26
25
|
#include "special.h"
|
27
26
|
|
28
|
-
#define NAME_MISMATCH
|
27
|
+
#define NAME_MISMATCH 1
|
29
28
|
|
30
|
-
#define START_STATE
|
31
|
-
#define BODY_STATE
|
32
|
-
#define AFTER_STATE
|
29
|
+
#define START_STATE 1
|
30
|
+
#define BODY_STATE 2
|
31
|
+
#define AFTER_STATE 3
|
33
32
|
|
34
33
|
// error prefixes
|
35
|
-
#define BAD_BOM
|
36
|
-
#define NO_TERM
|
37
|
-
#define INVALID_FORMAT
|
38
|
-
#define CASE_ERROR
|
39
|
-
#define OUT_OF_ORDER
|
40
|
-
#define WRONG_CHAR
|
41
|
-
#define EL_MISMATCH
|
42
|
-
#define INV_ELEMENT
|
43
|
-
|
44
|
-
#define UTF8_STR
|
45
|
-
|
46
|
-
static void
|
47
|
-
static void
|
34
|
+
#define BAD_BOM "Bad BOM: "
|
35
|
+
#define NO_TERM "Not Terminated: "
|
36
|
+
#define INVALID_FORMAT "Invalid Format: "
|
37
|
+
#define CASE_ERROR "Case Error: "
|
38
|
+
#define OUT_OF_ORDER "Out of Order: "
|
39
|
+
#define WRONG_CHAR "Unexpected Character: "
|
40
|
+
#define EL_MISMATCH "Start End Mismatch: "
|
41
|
+
#define INV_ELEMENT "Invalid Element: "
|
42
|
+
|
43
|
+
#define UTF8_STR "UTF-8"
|
44
|
+
|
45
|
+
static void sax_drive_init(SaxDrive dr, VALUE handler, VALUE io, SaxOptions options);
|
46
|
+
static void parse(SaxDrive dr);
|
48
47
|
// All read functions should return the next character after the 'thing' that was read and leave dr->cur one after that.
|
49
|
-
static char
|
50
|
-
static char
|
51
|
-
static char
|
52
|
-
static char
|
53
|
-
static char
|
54
|
-
static char
|
55
|
-
static char
|
56
|
-
static char
|
57
|
-
static char
|
58
|
-
static char
|
59
|
-
static char
|
60
|
-
|
61
|
-
static void
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
VALUE ox_sax_value_class = Qnil;
|
48
|
+
static char read_instruction(SaxDrive dr);
|
49
|
+
static char read_doctype(SaxDrive dr);
|
50
|
+
static char read_cdata(SaxDrive dr);
|
51
|
+
static char read_comment(SaxDrive dr);
|
52
|
+
static char read_element_start(SaxDrive dr);
|
53
|
+
static char read_element_end(SaxDrive dr);
|
54
|
+
static char read_text(SaxDrive dr);
|
55
|
+
static char read_jump(SaxDrive dr, const char *pat);
|
56
|
+
static char read_attrs(SaxDrive dr, char c, char termc, char term2, int is_xml, int eq_req, Hint h);
|
57
|
+
static char read_name_token(SaxDrive dr);
|
58
|
+
static char read_quoted_value(SaxDrive dr);
|
59
|
+
|
60
|
+
static void hint_clear_empty(SaxDrive dr);
|
61
|
+
static Nv hint_try_close(SaxDrive dr, const char *name);
|
62
|
+
|
63
|
+
VALUE ox_sax_value_class = Qnil;
|
67
64
|
|
68
65
|
static VALUE protect_parse(VALUE drp) {
|
69
66
|
parse((SaxDrive)drp);
|
@@ -71,559 +68,561 @@ static VALUE protect_parse(VALUE drp) {
|
|
71
68
|
return Qnil;
|
72
69
|
}
|
73
70
|
|
74
|
-
#if HAVE_RB_ENC_ASSOCIATE
|
75
|
-
static int
|
76
|
-
str_is_ascii(const char *s) {
|
77
|
-
for (; '\0' != *s; s++) {
|
78
|
-
if (*s < ' ' || '~' < *s) {
|
79
|
-
return 0;
|
80
|
-
}
|
81
|
-
}
|
82
|
-
return 1;
|
83
|
-
}
|
84
|
-
#endif
|
85
|
-
|
86
71
|
VALUE
|
87
|
-
str2sym(SaxDrive dr, const char *str, const char **strp) {
|
88
|
-
VALUE
|
89
|
-
VALUE sym;
|
72
|
+
str2sym(SaxDrive dr, const char *str, size_t len, const char **strp) {
|
73
|
+
VALUE sym;
|
90
74
|
|
91
75
|
if (dr->options.symbolize) {
|
92
|
-
|
93
|
-
#if HAVE_RB_ENC_ASSOCIATE
|
94
|
-
if (0 != dr->encoding && !str_is_ascii(str)) {
|
95
|
-
VALUE rstr = rb_str_new2(str);
|
96
|
-
|
97
|
-
// TBD if sym can be pinned down then use this all the time
|
98
|
-
rb_enc_associate(rstr, dr->encoding);
|
99
|
-
sym = rb_funcall(rstr, ox_to_sym_id, 0);
|
100
|
-
*slot = Qundef;
|
101
|
-
} else {
|
102
|
-
sym = ID2SYM(rb_intern(str));
|
103
|
-
*slot = sym;
|
104
|
-
}
|
105
|
-
#else
|
106
|
-
sym = ID2SYM(rb_intern(str));
|
107
|
-
*slot = sym;
|
108
|
-
#endif
|
109
|
-
}
|
76
|
+
sym = ox_sym_intern(str, len, strp);
|
110
77
|
} else {
|
111
|
-
|
112
|
-
#if HAVE_RB_ENC_ASSOCIATE
|
113
|
-
if (0 != dr->encoding) {
|
114
|
-
rb_enc_associate(sym, dr->encoding);
|
115
|
-
}
|
116
|
-
#endif
|
117
|
-
if (0 != strp) {
|
118
|
-
*strp = StringValuePtr(sym);
|
119
|
-
}
|
78
|
+
sym = dr->get_name(str, len, dr->encoding, strp);
|
120
79
|
}
|
121
80
|
return sym;
|
122
81
|
}
|
123
82
|
|
124
|
-
void
|
125
|
-
|
126
|
-
|
127
|
-
|
83
|
+
void ox_sax_parse(VALUE handler, VALUE io, SaxOptions options) {
|
84
|
+
#if HAVE_RB_EXT_RACTOR_SAFE
|
85
|
+
rb_ext_ractor_safe(true);
|
86
|
+
#endif
|
87
|
+
struct _saxDrive dr;
|
88
|
+
int line = 0;
|
128
89
|
|
129
90
|
sax_drive_init(&dr, handler, io, options);
|
130
|
-
#if 0
|
131
|
-
printf("*** sax_parse with these flags\n");
|
132
|
-
printf(" has_instruct = %s\n", dr.has.instruct ? "true" : "false");
|
133
|
-
printf(" has_end_instruct = %s\n", dr.has.end_instruct ? "true" : "false");
|
134
|
-
printf(" has_attr = %s\n", dr.has.attr ? "true" : "false");
|
135
|
-
printf(" has_attr_value = %s\n", dr.has.attr_value ? "true" : "false");
|
136
|
-
printf(" has_attrs_done = %s\n", dr.has.attrs_done ? "true" : "false");
|
137
|
-
printf(" has_doctype = %s\n", dr.has.doctype ? "true" : "false");
|
138
|
-
printf(" has_comment = %s\n", dr.has.comment ? "true" : "false");
|
139
|
-
printf(" has_cdata = %s\n", dr.has.cdata ? "true" : "false");
|
140
|
-
printf(" has_text = %s\n", dr.has.text ? "true" : "false");
|
141
|
-
printf(" has_value = %s\n", dr.has.value ? "true" : "false");
|
142
|
-
printf(" has_start_element = %s\n", dr.has.start_element ? "true" : "false");
|
143
|
-
printf(" has_end_element = %s\n", dr.has.end_element ? "true" : "false");
|
144
|
-
printf(" has_error = %s\n", dr.has.error ? "true" : "false");
|
145
|
-
printf(" has_pos = %s\n", dr.has.pos ? "true" : "false");
|
146
|
-
printf(" has_line = %s\n", dr.has.line ? "true" : "false");
|
147
|
-
printf(" has_column = %s\n", dr.has.column ? "true" : "false");
|
148
|
-
#endif
|
149
|
-
//parse(&dr);
|
150
91
|
rb_protect(protect_parse, (VALUE)&dr, &line);
|
151
92
|
ox_sax_drive_cleanup(&dr);
|
152
93
|
if (0 != line) {
|
153
|
-
|
94
|
+
rb_jump_tag(line);
|
95
|
+
}
|
96
|
+
}
|
97
|
+
|
98
|
+
static void set_long_noop(VALUE handler, long pos) {
|
99
|
+
}
|
100
|
+
|
101
|
+
static void set_pos(VALUE handler, long pos) {
|
102
|
+
rb_ivar_set(handler, ox_at_pos_id, LONG2NUM(pos));
|
103
|
+
}
|
104
|
+
|
105
|
+
static void set_line(VALUE handler, long line) {
|
106
|
+
rb_ivar_set(handler, ox_at_line_id, LONG2NUM(line));
|
107
|
+
}
|
108
|
+
|
109
|
+
static void set_col(VALUE handler, long col) {
|
110
|
+
rb_ivar_set(handler, ox_at_column_id, LONG2NUM(col));
|
111
|
+
}
|
112
|
+
|
113
|
+
static void attr_noop(SaxDrive dr, VALUE name, char *value, long pos, long line, long col) {
|
114
|
+
}
|
115
|
+
|
116
|
+
static void attr_text(SaxDrive dr, VALUE name, char *value, long pos, long line, long col) {
|
117
|
+
VALUE args[2];
|
118
|
+
|
119
|
+
args[0] = name;
|
120
|
+
if (dr->options.convert_special) {
|
121
|
+
ox_sax_collapse_special(dr, value, pos, line, col);
|
122
|
+
}
|
123
|
+
args[1] = rb_str_new2(value);
|
124
|
+
if (0 != dr->encoding) {
|
125
|
+
rb_enc_associate(args[1], dr->encoding);
|
126
|
+
}
|
127
|
+
dr->set_pos(dr->handler, pos);
|
128
|
+
dr->set_line(dr->handler, line);
|
129
|
+
dr->set_col(dr->handler, col);
|
130
|
+
rb_funcall2(dr->handler, ox_attr_id, 2, args);
|
131
|
+
}
|
132
|
+
|
133
|
+
static void attr_value(SaxDrive dr, VALUE name, char *value, long pos, long line, long col) {
|
134
|
+
VALUE args[2];
|
135
|
+
|
136
|
+
dr->set_pos(dr->handler, pos);
|
137
|
+
dr->set_line(dr->handler, line);
|
138
|
+
dr->set_col(dr->handler, col);
|
139
|
+
args[0] = name;
|
140
|
+
args[1] = dr->value_obj;
|
141
|
+
rb_funcall2(dr->handler, ox_attr_value_id, 2, args);
|
142
|
+
}
|
143
|
+
|
144
|
+
static void attrs_done_noop(VALUE handler) {
|
145
|
+
}
|
146
|
+
|
147
|
+
static void attrs_done(VALUE handler) {
|
148
|
+
rb_funcall(handler, ox_attrs_done_id, 0);
|
149
|
+
}
|
150
|
+
|
151
|
+
static VALUE instruct_noop(SaxDrive dr, const char *target, long pos, long line, long col) {
|
152
|
+
return Qnil;
|
153
|
+
}
|
154
|
+
|
155
|
+
static VALUE instruct(SaxDrive dr, const char *target, long pos, long line, long col) {
|
156
|
+
VALUE arg = rb_str_new2(target);
|
157
|
+
|
158
|
+
dr->set_pos(dr->handler, pos);
|
159
|
+
dr->set_line(dr->handler, line);
|
160
|
+
dr->set_col(dr->handler, col);
|
161
|
+
rb_funcall(dr->handler, ox_instruct_id, 1, arg);
|
162
|
+
|
163
|
+
return arg;
|
164
|
+
}
|
165
|
+
|
166
|
+
static VALUE instruct_just_value(SaxDrive dr, const char *target, long pos, long line, long col) {
|
167
|
+
return rb_str_new2(target);
|
168
|
+
}
|
169
|
+
|
170
|
+
static void end_instruct_noop(SaxDrive dr, VALUE target, long pos, long line, long col) {
|
171
|
+
}
|
172
|
+
|
173
|
+
static void end_instruct(SaxDrive dr, VALUE target, long pos, long line, long col) {
|
174
|
+
dr->set_pos(dr->handler, pos);
|
175
|
+
dr->set_line(dr->handler, line);
|
176
|
+
dr->set_col(dr->handler, col);
|
177
|
+
rb_funcall(dr->handler, ox_end_instruct_id, 1, target);
|
178
|
+
}
|
179
|
+
|
180
|
+
static void dr_loc_noop(SaxDrive dr, long pos, long line, long col) {
|
181
|
+
}
|
182
|
+
|
183
|
+
static void comment(SaxDrive dr, long pos, long line, long col) {
|
184
|
+
if (!dr->blocked) {
|
185
|
+
Nv parent = stack_peek(&dr->stack);
|
186
|
+
Hint h = ox_hint_find(dr->options.hints, "!--");
|
187
|
+
|
188
|
+
if (NULL == parent || NULL == parent->hint || OffOverlay != parent->hint->overlay ||
|
189
|
+
(NULL != h && (ActiveOverlay == h->overlay || ActiveOverlay == h->overlay))) {
|
190
|
+
VALUE arg = rb_str_new2(dr->buf.str);
|
191
|
+
|
192
|
+
if (0 != dr->encoding) {
|
193
|
+
rb_enc_associate(arg, dr->encoding);
|
194
|
+
}
|
195
|
+
dr->set_pos(dr->handler, pos);
|
196
|
+
dr->set_line(dr->handler, line);
|
197
|
+
dr->set_col(dr->handler, col);
|
198
|
+
rb_funcall(dr->handler, ox_comment_id, 1, arg);
|
199
|
+
}
|
200
|
+
}
|
201
|
+
}
|
202
|
+
|
203
|
+
static void cdata(SaxDrive dr, long pos, long line, long col) {
|
204
|
+
Nv parent = stack_peek(&dr->stack);
|
205
|
+
|
206
|
+
if (!dr->blocked && (NULL == parent || NULL == parent->hint || OffOverlay != parent->hint->overlay)) {
|
207
|
+
VALUE arg = rb_str_new2(dr->buf.str);
|
208
|
+
|
209
|
+
if (0 != dr->encoding) {
|
210
|
+
rb_enc_associate(arg, dr->encoding);
|
211
|
+
}
|
212
|
+
dr->set_pos(dr->handler, pos);
|
213
|
+
dr->set_line(dr->handler, line);
|
214
|
+
dr->set_col(dr->handler, col);
|
215
|
+
rb_funcall(dr->handler, ox_cdata_id, 1, arg);
|
216
|
+
}
|
217
|
+
}
|
218
|
+
|
219
|
+
static void doctype(SaxDrive dr, long pos, long line, long col) {
|
220
|
+
dr->set_pos(dr->handler, pos);
|
221
|
+
dr->set_line(dr->handler, line);
|
222
|
+
dr->set_col(dr->handler, col);
|
223
|
+
rb_funcall(dr->handler, ox_doctype_id, 1, rb_str_new2(dr->buf.str));
|
224
|
+
}
|
225
|
+
|
226
|
+
static void error_noop(SaxDrive dr, const char *msg, long pos, long line, long col) {
|
227
|
+
}
|
228
|
+
|
229
|
+
static void error(SaxDrive dr, const char *msg, long pos, long line, long col) {
|
230
|
+
VALUE args[3];
|
231
|
+
|
232
|
+
args[0] = rb_str_new2(msg);
|
233
|
+
args[1] = LONG2NUM(line);
|
234
|
+
args[2] = LONG2NUM(col);
|
235
|
+
dr->set_pos(dr->handler, pos);
|
236
|
+
dr->set_line(dr->handler, line);
|
237
|
+
dr->set_col(dr->handler, col);
|
238
|
+
rb_funcall2(dr->handler, ox_error_id, 3, args);
|
239
|
+
}
|
240
|
+
|
241
|
+
static void end_element_cb(SaxDrive dr, VALUE name, long pos, long line, long col, Hint h) {
|
242
|
+
if (dr->has_end_element && 0 >= dr->blocked &&
|
243
|
+
(NULL == h || ActiveOverlay == h->overlay || NestOverlay == h->overlay)) {
|
244
|
+
dr->set_pos(dr->handler, pos);
|
245
|
+
dr->set_line(dr->handler, line);
|
246
|
+
dr->set_col(dr->handler, col);
|
247
|
+
rb_funcall(dr->handler, ox_end_element_id, 1, name);
|
248
|
+
}
|
249
|
+
if (NULL != h && BlockOverlay == h->overlay && 0 < dr->blocked) {
|
250
|
+
dr->blocked--;
|
154
251
|
}
|
155
252
|
}
|
156
253
|
|
157
|
-
static void
|
158
|
-
sax_drive_init(SaxDrive dr, VALUE handler, VALUE io, SaxOptions options) {
|
254
|
+
static void sax_drive_init(SaxDrive dr, VALUE handler, VALUE io, SaxOptions options) {
|
159
255
|
ox_sax_buf_init(&dr->buf, io);
|
160
256
|
dr->buf.dr = dr;
|
161
257
|
stack_init(&dr->stack);
|
162
|
-
dr->handler
|
258
|
+
dr->handler = handler;
|
163
259
|
dr->value_obj = Data_Wrap_Struct(ox_sax_value_class, 0, 0, dr);
|
164
260
|
rb_gc_register_address(&dr->value_obj);
|
165
261
|
dr->options = *options;
|
166
|
-
dr->err
|
262
|
+
dr->err = 0;
|
167
263
|
dr->blocked = 0;
|
168
|
-
dr->abort
|
169
|
-
|
170
|
-
|
264
|
+
dr->abort = false;
|
265
|
+
|
266
|
+
dr->set_pos = (Qtrue == rb_ivar_defined(handler, ox_at_pos_id)) ? set_pos : set_long_noop;
|
267
|
+
dr->set_line = (Qtrue == rb_ivar_defined(handler, ox_at_line_id)) ? set_line : set_long_noop;
|
268
|
+
dr->set_col = (Qtrue == rb_ivar_defined(handler, ox_at_column_id)) ? set_col : set_long_noop;
|
269
|
+
if (rb_respond_to(handler, ox_attr_value_id)) {
|
270
|
+
dr->attr_cb = attr_value;
|
271
|
+
dr->want_attr_name = true;
|
272
|
+
} else if (rb_respond_to(handler, ox_attr_id)) {
|
273
|
+
dr->attr_cb = attr_text;
|
274
|
+
dr->want_attr_name = true;
|
275
|
+
} else {
|
276
|
+
dr->attr_cb = attr_noop;
|
277
|
+
dr->want_attr_name = false;
|
278
|
+
}
|
279
|
+
dr->attrs_done = rb_respond_to(handler, ox_attrs_done_id) ? attrs_done : attrs_done_noop;
|
280
|
+
dr->instruct = rb_respond_to(handler, ox_instruct_id) ? instruct : instruct_noop;
|
281
|
+
dr->end_instruct = rb_respond_to(handler, ox_end_instruct_id) ? end_instruct : end_instruct_noop;
|
282
|
+
if (rb_respond_to(handler, ox_end_instruct_id) && !rb_respond_to(handler, ox_instruct_id)) {
|
283
|
+
dr->instruct = instruct_just_value;
|
284
|
+
}
|
285
|
+
dr->doctype = rb_respond_to(handler, ox_doctype_id) ? doctype : dr_loc_noop;
|
286
|
+
dr->comment = rb_respond_to(handler, ox_comment_id) ? comment : dr_loc_noop;
|
287
|
+
dr->cdata = rb_respond_to(handler, ox_cdata_id) ? cdata : dr_loc_noop;
|
288
|
+
dr->error = rb_respond_to(handler, ox_error_id) ? error : error_noop;
|
289
|
+
|
290
|
+
dr->has_text = rb_respond_to(handler, ox_text_id);
|
291
|
+
dr->has_value = rb_respond_to(handler, ox_value_id);
|
292
|
+
dr->has_start_element = rb_respond_to(handler, ox_start_element_id);
|
293
|
+
dr->has_end_element = rb_respond_to(handler, ox_end_element_id);
|
294
|
+
|
171
295
|
if ('\0' == *ox_default_options.encoding) {
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
296
|
+
VALUE encoding;
|
297
|
+
|
298
|
+
dr->encoding = 0;
|
299
|
+
if (rb_respond_to(io, ox_external_encoding_id) &&
|
300
|
+
Qnil != (encoding = rb_funcall(io, ox_external_encoding_id, 0))) {
|
301
|
+
int e = rb_enc_get_index(encoding);
|
302
|
+
if (0 <= e) {
|
303
|
+
dr->encoding = rb_enc_from_index(e);
|
304
|
+
}
|
305
|
+
}
|
181
306
|
} else {
|
182
307
|
dr->encoding = rb_enc_find(ox_default_options.encoding);
|
183
308
|
}
|
184
|
-
|
185
|
-
dr->encoding
|
186
|
-
|
309
|
+
dr->utf8 = (NULL == dr->encoding || rb_utf8_encoding() == dr->encoding);
|
310
|
+
if (NULL == dr->encoding || rb_utf8_encoding() == dr->encoding) { // UTF-8
|
311
|
+
dr->get_name = dr->options.symbolize ? ox_utf8_sym : ox_utf8_name; // TBD UTF8 sym?
|
312
|
+
} else {
|
313
|
+
dr->get_name = dr->options.symbolize ? ox_enc_sym : ox_enc_name;
|
314
|
+
}
|
187
315
|
}
|
188
316
|
|
189
|
-
void
|
190
|
-
ox_sax_drive_cleanup(SaxDrive dr) {
|
317
|
+
void ox_sax_drive_cleanup(SaxDrive dr) {
|
191
318
|
rb_gc_unregister_address(&dr->value_obj);
|
192
319
|
buf_cleanup(&dr->buf);
|
193
320
|
stack_cleanup(&dr->stack);
|
194
321
|
}
|
195
322
|
|
196
|
-
static void
|
197
|
-
|
198
|
-
if (dr->has.error) {
|
199
|
-
VALUE args[3];
|
200
|
-
|
201
|
-
args[0] = rb_str_new2(msg);
|
202
|
-
args[1] = LONG2NUM(line);
|
203
|
-
args[2] = LONG2NUM(col);
|
204
|
-
if (dr->has.pos) {
|
205
|
-
rb_ivar_set(dr->handler, ox_at_pos_id, LONG2NUM(pos));
|
206
|
-
}
|
207
|
-
if (dr->has.pos) {
|
208
|
-
rb_ivar_set(dr->handler, ox_at_pos_id, LONG2NUM(pos));
|
209
|
-
}
|
210
|
-
if (dr->has.line) {
|
211
|
-
rb_ivar_set(dr->handler, ox_at_line_id, args[1]);
|
212
|
-
}
|
213
|
-
if (dr->has.column) {
|
214
|
-
rb_ivar_set(dr->handler, ox_at_column_id, args[2]);
|
215
|
-
}
|
216
|
-
rb_funcall2(dr->handler, ox_error_id, 3, args);
|
217
|
-
}
|
323
|
+
static void ox_sax_drive_error_at(SaxDrive dr, const char *msg, off_t pos, off_t line, off_t col) {
|
324
|
+
dr->error(dr, msg, pos, line, col);
|
218
325
|
}
|
219
326
|
|
220
|
-
void
|
221
|
-
ox_sax_drive_error(SaxDrive dr, const char *msg) {
|
327
|
+
void ox_sax_drive_error(SaxDrive dr, const char *msg) {
|
222
328
|
ox_sax_drive_error_at(dr, msg, dr->buf.pos, dr->buf.line, dr->buf.col);
|
223
329
|
}
|
224
330
|
|
225
|
-
static char
|
226
|
-
|
227
|
-
char c = buf_get(&dr->buf);
|
331
|
+
static char skipBOM(SaxDrive dr) {
|
332
|
+
char c = buf_get(&dr->buf);
|
228
333
|
|
229
334
|
if (0xEF == (uint8_t)c) { /* only UTF8 is supported */
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
} else {
|
238
|
-
ox_sax_drive_error(dr, BAD_BOM "invalid BOM or a binary file.");
|
239
|
-
c = '\0';
|
240
|
-
}
|
335
|
+
if (0xBB == (uint8_t)buf_get(&dr->buf) && 0xBF == (uint8_t)buf_get(&dr->buf)) {
|
336
|
+
dr->encoding = ox_utf8_encoding;
|
337
|
+
c = buf_get(&dr->buf);
|
338
|
+
} else {
|
339
|
+
ox_sax_drive_error(dr, BAD_BOM "invalid BOM or a binary file.");
|
340
|
+
c = '\0';
|
341
|
+
}
|
241
342
|
}
|
242
343
|
return c;
|
243
344
|
}
|
244
345
|
|
245
|
-
static void
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
Nv parent;
|
346
|
+
static void parse(SaxDrive dr) {
|
347
|
+
char c = skipBOM(dr);
|
348
|
+
int state = START_STATE;
|
349
|
+
Nv parent;
|
250
350
|
|
251
351
|
while ('\0' != c) {
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
|
261
|
-
|
262
|
-
|
263
|
-
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
|
291
|
-
|
292
|
-
|
293
|
-
|
294
|
-
|
295
|
-
|
296
|
-
|
297
|
-
|
298
|
-
|
299
|
-
|
300
|
-
|
301
|
-
|
302
|
-
|
303
|
-
|
304
|
-
|
305
|
-
|
306
|
-
|
307
|
-
|
308
|
-
|
309
|
-
|
310
|
-
|
311
|
-
|
312
|
-
|
313
|
-
|
314
|
-
|
315
|
-
|
316
|
-
|
317
|
-
|
318
|
-
|
319
|
-
|
320
|
-
|
321
|
-
|
322
|
-
|
323
|
-
|
324
|
-
|
325
|
-
|
326
|
-
|
327
|
-
|
328
|
-
|
329
|
-
|
330
|
-
|
331
|
-
|
332
|
-
|
333
|
-
|
334
|
-
|
335
|
-
|
336
|
-
|
337
|
-
|
338
|
-
|
339
|
-
|
340
|
-
|
341
|
-
|
342
|
-
|
343
|
-
|
344
|
-
|
345
|
-
|
346
|
-
|
347
|
-
|
348
|
-
|
349
|
-
|
350
|
-
|
351
|
-
|
352
|
-
|
353
|
-
|
354
|
-
|
355
|
-
|
356
|
-
|
357
|
-
|
358
|
-
|
359
|
-
|
360
|
-
|
361
|
-
|
362
|
-
|
363
|
-
|
364
|
-
ox_sax_drive_error(dr, OUT_OF_ORDER "multiple top level elements");
|
365
|
-
}
|
366
|
-
state = BODY_STATE;
|
367
|
-
c = read_element_start(dr);
|
368
|
-
if (0 == stack_peek(&dr->stack)) {
|
369
|
-
state = AFTER_STATE;
|
370
|
-
}
|
371
|
-
break;
|
372
|
-
}
|
373
|
-
} else {
|
374
|
-
buf_reset(&dr->buf);
|
375
|
-
c = read_text(dr);
|
376
|
-
}
|
377
|
-
}
|
378
|
-
DONE:
|
352
|
+
buf_protect(&dr->buf);
|
353
|
+
if ('<' == c) {
|
354
|
+
c = buf_get(&dr->buf);
|
355
|
+
switch (c) {
|
356
|
+
case '?': /* instructions (xml or otherwise) */ c = read_instruction(dr); break;
|
357
|
+
case '!': /* comment or doctype */
|
358
|
+
buf_protect(&dr->buf);
|
359
|
+
c = buf_get(&dr->buf);
|
360
|
+
if ('\0' == c) {
|
361
|
+
ox_sax_drive_error(dr, NO_TERM "DOCTYPE or comment not terminated");
|
362
|
+
|
363
|
+
goto DONE;
|
364
|
+
} else if ('-' == c) {
|
365
|
+
c = buf_get(&dr->buf); /* skip first - and get next character */
|
366
|
+
if ('-' != c) {
|
367
|
+
ox_sax_drive_error(dr, INVALID_FORMAT "bad comment format, expected <!--");
|
368
|
+
} else {
|
369
|
+
c = buf_get(&dr->buf); /* skip second - */
|
370
|
+
}
|
371
|
+
c = read_comment(dr);
|
372
|
+
} else {
|
373
|
+
int i;
|
374
|
+
int spaced = 0;
|
375
|
+
off_t pos = dr->buf.pos + 1;
|
376
|
+
off_t line = dr->buf.line;
|
377
|
+
off_t col = dr->buf.col + 1;
|
378
|
+
|
379
|
+
if (is_white(c)) {
|
380
|
+
spaced = 1;
|
381
|
+
c = buf_next_non_white(&dr->buf);
|
382
|
+
}
|
383
|
+
dr->buf.str = dr->buf.tail - 1;
|
384
|
+
for (i = 7; 0 < i; i--) {
|
385
|
+
c = buf_get(&dr->buf);
|
386
|
+
}
|
387
|
+
if (0 == strncmp("DOCTYPE", dr->buf.str, 7)) {
|
388
|
+
if (spaced) {
|
389
|
+
ox_sax_drive_error_at(dr, WRONG_CHAR "<!DOCTYPE can not included spaces", pos, line, col);
|
390
|
+
}
|
391
|
+
if (START_STATE != state) {
|
392
|
+
ox_sax_drive_error(dr, OUT_OF_ORDER "DOCTYPE can not come after an element");
|
393
|
+
}
|
394
|
+
c = read_doctype(dr);
|
395
|
+
} else if (0 == strncasecmp("DOCTYPE", dr->buf.str, 7)) {
|
396
|
+
if (!dr->options.smart) {
|
397
|
+
ox_sax_drive_error(dr, CASE_ERROR "expected DOCTYPE all in caps");
|
398
|
+
}
|
399
|
+
if (START_STATE != state) {
|
400
|
+
ox_sax_drive_error(dr, OUT_OF_ORDER "DOCTYPE can not come after an element");
|
401
|
+
}
|
402
|
+
c = read_doctype(dr);
|
403
|
+
} else if (0 == strncmp("[CDATA[", dr->buf.str, 7)) {
|
404
|
+
if (spaced) {
|
405
|
+
ox_sax_drive_error_at(dr, WRONG_CHAR "<![CDATA[ can not included spaces", pos, line, col);
|
406
|
+
}
|
407
|
+
c = read_cdata(dr);
|
408
|
+
} else if (0 == strncasecmp("[CDATA[", dr->buf.str, 7)) {
|
409
|
+
if (!dr->options.smart) {
|
410
|
+
ox_sax_drive_error(dr, CASE_ERROR "expected CDATA all in caps");
|
411
|
+
}
|
412
|
+
c = read_cdata(dr);
|
413
|
+
} else {
|
414
|
+
Nv parent = stack_peek(&dr->stack);
|
415
|
+
|
416
|
+
if (0 != parent) {
|
417
|
+
parent->childCnt++;
|
418
|
+
}
|
419
|
+
ox_sax_drive_error_at(dr, WRONG_CHAR "DOCTYPE, CDATA, or comment expected", pos, line, col);
|
420
|
+
c = read_name_token(dr);
|
421
|
+
if ('>' == c) {
|
422
|
+
c = buf_get(&dr->buf);
|
423
|
+
}
|
424
|
+
}
|
425
|
+
}
|
426
|
+
break;
|
427
|
+
case '/': /* element end */
|
428
|
+
parent = stack_peek(&dr->stack);
|
429
|
+
if (0 != parent && 0 == parent->childCnt && dr->has_text && !dr->blocked) {
|
430
|
+
VALUE args[1];
|
431
|
+
args[0] = rb_str_new2("");
|
432
|
+
if (0 != dr->encoding) {
|
433
|
+
rb_enc_associate(args[0], dr->encoding);
|
434
|
+
}
|
435
|
+
dr->set_pos(dr->handler, dr->buf.pos);
|
436
|
+
dr->set_line(dr->handler, dr->buf.line);
|
437
|
+
dr->set_col(dr->handler, dr->buf.col);
|
438
|
+
rb_funcall2(dr->handler, ox_text_id, 1, args);
|
439
|
+
}
|
440
|
+
c = read_element_end(dr);
|
441
|
+
if (0 == stack_peek(&dr->stack)) {
|
442
|
+
state = AFTER_STATE;
|
443
|
+
}
|
444
|
+
break;
|
445
|
+
case '\0': goto DONE;
|
446
|
+
default:
|
447
|
+
buf_backup(&dr->buf);
|
448
|
+
if (AFTER_STATE == state) {
|
449
|
+
ox_sax_drive_error(dr, OUT_OF_ORDER "multiple top level elements");
|
450
|
+
}
|
451
|
+
state = BODY_STATE;
|
452
|
+
c = read_element_start(dr);
|
453
|
+
if (0 == stack_peek(&dr->stack)) {
|
454
|
+
state = AFTER_STATE;
|
455
|
+
}
|
456
|
+
break;
|
457
|
+
}
|
458
|
+
} else {
|
459
|
+
buf_reset(&dr->buf);
|
460
|
+
c = read_text(dr);
|
461
|
+
}
|
462
|
+
}
|
463
|
+
DONE:
|
379
464
|
if (dr->abort) {
|
380
|
-
|
465
|
+
return;
|
381
466
|
}
|
382
467
|
if (dr->stack.head < dr->stack.tail) {
|
383
|
-
|
384
|
-
|
385
|
-
|
386
|
-
|
387
|
-
|
388
|
-
|
389
|
-
|
390
|
-
rb_ivar_set(dr->handler, ox_at_line_id, LONG2NUM(dr->buf.line));
|
391
|
-
}
|
392
|
-
if (dr->has.column) {
|
393
|
-
rb_ivar_set(dr->handler, ox_at_column_id, LONG2NUM(dr->buf.col));
|
394
|
-
}
|
395
|
-
for (sp = dr->stack.tail - 1; dr->stack.head <= sp; sp--) {
|
396
|
-
snprintf(msg, sizeof(msg) - 1, "%selement '%s' not closed", EL_MISMATCH, sp->name);
|
397
|
-
ox_sax_drive_error_at(dr, msg, dr->buf.pos, dr->buf.line, dr->buf.col);
|
398
|
-
if (dr->has.end_element && 0 >= dr->blocked &&
|
399
|
-
(NULL == sp->hint || ActiveOverlay == sp->hint->overlay || NestOverlay == sp->hint->overlay)) {
|
400
|
-
VALUE args[1];
|
401
|
-
|
402
|
-
args[0] = sp->val;
|
403
|
-
rb_funcall2(dr->handler, ox_end_element_id, 1, args);
|
404
|
-
}
|
405
|
-
if (dr->blocked && NULL != sp->hint && BlockOverlay == sp->hint->overlay) {
|
406
|
-
dr->blocked--;
|
407
|
-
}
|
468
|
+
char msg[256];
|
469
|
+
Nv sp;
|
470
|
+
|
471
|
+
for (sp = dr->stack.tail - 1; dr->stack.head <= sp; sp--) {
|
472
|
+
snprintf(msg, sizeof(msg) - 1, "%selement '%s' not closed", EL_MISMATCH, sp->name);
|
473
|
+
ox_sax_drive_error_at(dr, msg, dr->buf.pos, dr->buf.line, dr->buf.col);
|
474
|
+
end_element_cb(dr, sp->val, dr->buf.pos, dr->buf.line, dr->buf.col, sp->hint);
|
408
475
|
}
|
409
476
|
}
|
410
477
|
}
|
411
478
|
|
412
|
-
static void
|
413
|
-
|
414
|
-
char
|
415
|
-
char *end = content + len;
|
479
|
+
static void read_content(SaxDrive dr, char *content, size_t len) {
|
480
|
+
char c;
|
481
|
+
char *end = content + len;
|
416
482
|
|
417
483
|
while ('\0' != (c = buf_get(&dr->buf))) {
|
418
|
-
|
419
|
-
|
420
|
-
|
421
|
-
|
422
|
-
|
423
|
-
|
424
|
-
|
425
|
-
|
426
|
-
|
427
|
-
|
428
|
-
|
429
|
-
|
430
|
-
|
431
|
-
|
432
|
-
|
433
|
-
|
434
|
-
|
435
|
-
|
484
|
+
if (end <= content) {
|
485
|
+
*content = '\0';
|
486
|
+
ox_sax_drive_error(dr, "processing instruction content too large");
|
487
|
+
return;
|
488
|
+
}
|
489
|
+
if ('?' == c) {
|
490
|
+
if ('\0' == (c = buf_get(&dr->buf))) {
|
491
|
+
ox_sax_drive_error(dr, NO_TERM "document not terminated");
|
492
|
+
}
|
493
|
+
if ('>' == c) {
|
494
|
+
*content = '\0';
|
495
|
+
return;
|
496
|
+
} else {
|
497
|
+
*content++ = c;
|
498
|
+
}
|
499
|
+
} else {
|
500
|
+
*content++ = c;
|
501
|
+
}
|
436
502
|
}
|
437
503
|
*content = '\0';
|
438
504
|
}
|
439
505
|
|
440
506
|
/* Entered after the "<?" sequence. Ready to read the rest.
|
441
507
|
*/
|
442
|
-
static char
|
443
|
-
|
444
|
-
char
|
445
|
-
|
446
|
-
|
447
|
-
|
448
|
-
|
449
|
-
off_t
|
450
|
-
off_t
|
451
|
-
off_t col = dr->buf.col - 1;
|
508
|
+
static char read_instruction(SaxDrive dr) {
|
509
|
+
char content[4096];
|
510
|
+
char c;
|
511
|
+
int coff;
|
512
|
+
VALUE target = Qnil;
|
513
|
+
int is_xml;
|
514
|
+
off_t pos = dr->buf.pos - 1;
|
515
|
+
off_t line = dr->buf.line;
|
516
|
+
off_t col = dr->buf.col - 1;
|
452
517
|
|
453
518
|
buf_protect(&dr->buf);
|
454
519
|
if ('\0' == (c = read_name_token(dr))) {
|
455
520
|
return c;
|
456
521
|
}
|
457
522
|
is_xml = (0 == (dr->options.smart ? strcasecmp("xml", dr->buf.str) : strcmp("xml", dr->buf.str)));
|
458
|
-
|
459
|
-
|
460
|
-
}
|
461
|
-
if (dr->has.instruct) {
|
462
|
-
VALUE args[1];
|
463
|
-
|
464
|
-
if (dr->has.pos) {
|
465
|
-
rb_ivar_set(dr->handler, ox_at_pos_id, LONG2NUM(pos));
|
466
|
-
}
|
467
|
-
if (dr->has.line) {
|
468
|
-
rb_ivar_set(dr->handler, ox_at_line_id, LONG2NUM(line));
|
469
|
-
}
|
470
|
-
if (dr->has.column) {
|
471
|
-
rb_ivar_set(dr->handler, ox_at_column_id, LONG2NUM(col));
|
472
|
-
}
|
473
|
-
args[0] = target;
|
474
|
-
rb_funcall2(dr->handler, ox_instruct_id, 1, args);
|
475
|
-
}
|
523
|
+
|
524
|
+
target = dr->instruct(dr, dr->buf.str, pos, line, col);
|
476
525
|
buf_protect(&dr->buf);
|
477
|
-
pos
|
526
|
+
pos = dr->buf.pos;
|
478
527
|
line = dr->buf.line;
|
479
|
-
col
|
528
|
+
col = dr->buf.col;
|
480
529
|
read_content(dr, content, sizeof(content) - 1);
|
481
530
|
coff = (int)(dr->buf.tail - dr->buf.head);
|
482
531
|
buf_reset(&dr->buf);
|
483
532
|
dr->err = 0;
|
484
|
-
c
|
485
|
-
|
486
|
-
rb_funcall(dr->handler, ox_attrs_done_id, 0);
|
487
|
-
}
|
533
|
+
c = read_attrs(dr, c, '?', '?', is_xml, 1, NULL);
|
534
|
+
dr->attrs_done(dr->handler);
|
488
535
|
if (dr->err) {
|
489
|
-
|
490
|
-
|
491
|
-
|
492
|
-
|
493
|
-
|
494
|
-
|
495
|
-
|
496
|
-
|
497
|
-
|
498
|
-
|
499
|
-
|
500
|
-
|
501
|
-
|
502
|
-
|
503
|
-
|
504
|
-
|
505
|
-
|
506
|
-
}
|
507
|
-
if (dr->has.column) {
|
508
|
-
rb_ivar_set(dr->handler, ox_at_column_id, LONG2NUM(col));
|
509
|
-
}
|
510
|
-
rb_funcall2(dr->handler, ox_text_id, 1, args);
|
511
|
-
}
|
512
|
-
dr->buf.tail = dr->buf.head + coff;
|
513
|
-
c = buf_get(&dr->buf);
|
536
|
+
if (dr->has_text) {
|
537
|
+
VALUE args[1];
|
538
|
+
|
539
|
+
if (dr->options.convert_special) {
|
540
|
+
ox_sax_collapse_special(dr, content, (int)pos, (int)line, (int)col);
|
541
|
+
}
|
542
|
+
args[0] = rb_str_new2(content);
|
543
|
+
if (0 != dr->encoding) {
|
544
|
+
rb_enc_associate(args[0], dr->encoding);
|
545
|
+
}
|
546
|
+
dr->set_pos(dr->handler, pos);
|
547
|
+
dr->set_line(dr->handler, line);
|
548
|
+
dr->set_col(dr->handler, col);
|
549
|
+
rb_funcall2(dr->handler, ox_text_id, 1, args);
|
550
|
+
}
|
551
|
+
dr->buf.tail = dr->buf.head + coff;
|
552
|
+
c = buf_get(&dr->buf);
|
514
553
|
} else {
|
515
|
-
|
516
|
-
|
517
|
-
|
518
|
-
|
519
|
-
|
520
|
-
|
521
|
-
|
522
|
-
|
523
|
-
|
524
|
-
|
525
|
-
|
526
|
-
|
527
|
-
}
|
528
|
-
if (dr->has.end_instruct) {
|
529
|
-
VALUE args[1];
|
530
|
-
|
531
|
-
if (dr->has.pos) {
|
532
|
-
rb_ivar_set(dr->handler, ox_at_pos_id, LONG2NUM(pos));
|
533
|
-
}
|
534
|
-
if (dr->has.line) {
|
535
|
-
rb_ivar_set(dr->handler, ox_at_line_id, LONG2NUM(line));
|
536
|
-
}
|
537
|
-
if (dr->has.column) {
|
538
|
-
rb_ivar_set(dr->handler, ox_at_column_id, LONG2NUM(col));
|
539
|
-
}
|
540
|
-
args[0] = target;
|
541
|
-
rb_funcall2(dr->handler, ox_end_instruct_id, 1, args);
|
554
|
+
pos = dr->buf.pos;
|
555
|
+
line = dr->buf.line;
|
556
|
+
col = dr->buf.col;
|
557
|
+
c = buf_next_non_white(&dr->buf);
|
558
|
+
if ('>' == c) {
|
559
|
+
c = buf_get(&dr->buf);
|
560
|
+
} else {
|
561
|
+
ox_sax_drive_error_at(dr, NO_TERM "instruction not terminated", pos, line, col);
|
562
|
+
if ('>' == c) {
|
563
|
+
c = buf_get(&dr->buf);
|
564
|
+
}
|
565
|
+
}
|
542
566
|
}
|
543
|
-
dr->
|
567
|
+
dr->end_instruct(dr, target, pos, line, col);
|
568
|
+
dr->buf.str = NULL;
|
544
569
|
|
545
570
|
return c;
|
546
571
|
}
|
547
572
|
|
548
|
-
static char
|
549
|
-
|
550
|
-
char c;
|
573
|
+
static char read_delimited(SaxDrive dr, char end) {
|
574
|
+
char c;
|
551
575
|
|
552
576
|
if ('"' == end || '\'' == end) {
|
553
|
-
|
554
|
-
|
555
|
-
|
556
|
-
|
557
|
-
|
558
|
-
|
577
|
+
while (end != (c = buf_get(&dr->buf))) {
|
578
|
+
if ('\0' == c) {
|
579
|
+
ox_sax_drive_error(dr, NO_TERM "doctype not terminated");
|
580
|
+
return c;
|
581
|
+
}
|
582
|
+
}
|
559
583
|
} else {
|
560
|
-
|
561
|
-
|
562
|
-
|
563
|
-
|
564
|
-
|
565
|
-
|
566
|
-
|
567
|
-
|
568
|
-
|
569
|
-
|
570
|
-
|
571
|
-
|
572
|
-
|
573
|
-
|
574
|
-
break;
|
575
|
-
case '[':
|
576
|
-
c = read_delimited(dr, ']');
|
577
|
-
break;
|
578
|
-
case '<':
|
579
|
-
c = read_delimited(dr, '>');
|
580
|
-
break;
|
581
|
-
default:
|
582
|
-
break;
|
583
|
-
}
|
584
|
-
}
|
584
|
+
while (1) {
|
585
|
+
c = buf_get(&dr->buf);
|
586
|
+
if (end == c) {
|
587
|
+
return c;
|
588
|
+
}
|
589
|
+
switch (c) {
|
590
|
+
case '\0': ox_sax_drive_error(dr, NO_TERM "doctype not terminated"); return c;
|
591
|
+
case '"': c = read_delimited(dr, c); break;
|
592
|
+
case '\'': c = read_delimited(dr, c); break;
|
593
|
+
case '[': c = read_delimited(dr, ']'); break;
|
594
|
+
case '<': c = read_delimited(dr, '>'); break;
|
595
|
+
default: break;
|
596
|
+
}
|
597
|
+
}
|
585
598
|
}
|
586
599
|
return c;
|
587
600
|
}
|
588
601
|
|
589
602
|
/* Entered after the "<!DOCTYPE " sequence. Ready to read the rest.
|
590
603
|
*/
|
591
|
-
static char
|
592
|
-
|
593
|
-
long
|
594
|
-
long
|
595
|
-
|
596
|
-
|
597
|
-
Nv parent = stack_peek(&dr->stack);
|
604
|
+
static char read_doctype(SaxDrive dr) {
|
605
|
+
long pos = (long)(dr->buf.pos - 9);
|
606
|
+
long line = (long)(dr->buf.line);
|
607
|
+
long col = (long)(dr->buf.col - 9);
|
608
|
+
char *s;
|
609
|
+
Nv parent = stack_peek(&dr->stack);
|
598
610
|
|
599
611
|
buf_backup(&dr->buf); /* back up to the start in case the doctype is empty */
|
600
612
|
buf_protect(&dr->buf);
|
601
613
|
read_delimited(dr, '>');
|
602
614
|
if (dr->options.smart && 0 == dr->options.hints) {
|
603
|
-
|
604
|
-
|
605
|
-
|
606
|
-
|
615
|
+
for (s = dr->buf.str; is_white(*s); s++) {
|
616
|
+
}
|
617
|
+
if (0 == strncasecmp("HTML", s, 4)) {
|
618
|
+
dr->options.hints = ox_hints_html();
|
619
|
+
}
|
607
620
|
}
|
608
621
|
*(dr->buf.tail - 1) = '\0';
|
609
622
|
if (0 != parent) {
|
610
|
-
|
611
|
-
}
|
612
|
-
if (dr->has.doctype) {
|
613
|
-
VALUE args[1];
|
614
|
-
|
615
|
-
if (dr->has.pos) {
|
616
|
-
rb_ivar_set(dr->handler, ox_at_pos_id, LONG2NUM(pos));
|
617
|
-
}
|
618
|
-
if (dr->has.line) {
|
619
|
-
rb_ivar_set(dr->handler, ox_at_line_id, LONG2NUM(line));
|
620
|
-
}
|
621
|
-
if (dr->has.column) {
|
622
|
-
rb_ivar_set(dr->handler, ox_at_column_id, LONG2NUM(col));
|
623
|
-
}
|
624
|
-
args[0] = rb_str_new2(dr->buf.str);
|
625
|
-
rb_funcall2(dr->handler, ox_doctype_id, 1, args);
|
623
|
+
parent->childCnt++;
|
626
624
|
}
|
625
|
+
dr->doctype(dr, pos, line, col);
|
627
626
|
dr->buf.str = 0;
|
628
627
|
|
629
628
|
return buf_get(&dr->buf);
|
@@ -631,89 +630,65 @@ read_doctype(SaxDrive dr) {
|
|
631
630
|
|
632
631
|
/* Entered after the "<![CDATA[" sequence. Ready to read the rest.
|
633
632
|
*/
|
634
|
-
static char
|
635
|
-
|
636
|
-
char
|
637
|
-
|
638
|
-
|
639
|
-
long
|
640
|
-
long
|
641
|
-
|
642
|
-
|
643
|
-
Nv parent = stack_peek(&dr->stack);
|
633
|
+
static char read_cdata(SaxDrive dr) {
|
634
|
+
char c;
|
635
|
+
char zero = '\0';
|
636
|
+
int end = 0;
|
637
|
+
long pos = (long)(dr->buf.pos - 9);
|
638
|
+
long line = (long)(dr->buf.line);
|
639
|
+
long col = (long)(dr->buf.col - 9);
|
640
|
+
struct _checkPt cp = CHECK_PT_INIT;
|
641
|
+
Nv parent = stack_peek(&dr->stack);
|
644
642
|
|
645
643
|
// TBD check parent overlay
|
646
644
|
if (0 != parent) {
|
647
|
-
|
645
|
+
parent->childCnt++;
|
648
646
|
}
|
649
647
|
buf_backup(&dr->buf); /* back up to the start in case the cdata is empty */
|
650
648
|
buf_protect(&dr->buf);
|
651
649
|
while (1) {
|
652
650
|
c = buf_get(&dr->buf);
|
653
|
-
|
654
|
-
|
655
|
-
|
656
|
-
break;
|
657
|
-
case '>':
|
651
|
+
switch (c) {
|
652
|
+
case ']': end++; break;
|
653
|
+
case '>':
|
658
654
|
if (2 <= end) {
|
659
655
|
*(dr->buf.tail - 3) = '\0';
|
660
|
-
|
656
|
+
c = buf_get(&dr->buf);
|
661
657
|
goto CB;
|
662
658
|
}
|
663
|
-
|
664
|
-
|
665
|
-
|
659
|
+
if (!buf_checkset(&cp)) {
|
660
|
+
buf_checkpoint(&dr->buf, &cp);
|
661
|
+
}
|
662
|
+
end = 0;
|
663
|
+
break;
|
664
|
+
case '<':
|
665
|
+
if (!buf_checkset(&cp)) {
|
666
|
+
buf_checkpoint(&dr->buf, &cp);
|
667
|
+
}
|
666
668
|
end = 0;
|
667
|
-
|
668
|
-
|
669
|
-
|
670
|
-
|
671
|
-
|
672
|
-
|
673
|
-
|
674
|
-
|
675
|
-
|
676
|
-
c = buf_checkback(&dr->buf, &cp);
|
677
|
-
ox_sax_drive_error(dr, NO_TERM "CDATA not terminated");
|
678
|
-
zero = c;
|
679
|
-
*(dr->buf.tail - 1) = '\0';
|
680
|
-
goto CB;
|
681
|
-
}
|
669
|
+
break;
|
670
|
+
case '\0':
|
671
|
+
if (buf_checkset(&cp)) {
|
672
|
+
c = buf_checkback(&dr->buf, &cp);
|
673
|
+
ox_sax_drive_error(dr, NO_TERM "CDATA not terminated");
|
674
|
+
zero = c;
|
675
|
+
*(dr->buf.tail - 1) = '\0';
|
676
|
+
goto CB;
|
677
|
+
}
|
682
678
|
ox_sax_drive_error(dr, NO_TERM "CDATA not terminated");
|
683
679
|
return '\0';
|
684
|
-
|
685
|
-
|
686
|
-
|
687
|
-
|
688
|
-
|
689
|
-
|
690
|
-
|
691
|
-
}
|
692
|
-
CB:
|
693
|
-
if (!dr->blocked && (NULL == parent || NULL == parent->hint || OffOverlay != parent->hint->overlay)) {
|
694
|
-
if (dr->has.cdata) {
|
695
|
-
VALUE args[1];
|
696
|
-
|
697
|
-
args[0] = rb_str_new2(dr->buf.str);
|
698
|
-
#if HAVE_RB_ENC_ASSOCIATE
|
699
|
-
if (0 != dr->encoding) {
|
700
|
-
rb_enc_associate(args[0], dr->encoding);
|
701
|
-
}
|
702
|
-
#endif
|
703
|
-
if (dr->has.pos) {
|
704
|
-
rb_ivar_set(dr->handler, ox_at_pos_id, LONG2NUM(pos));
|
705
|
-
}
|
706
|
-
if (dr->has.line) {
|
707
|
-
rb_ivar_set(dr->handler, ox_at_line_id, LONG2NUM(line));
|
708
|
-
}
|
709
|
-
if (dr->has.column) {
|
710
|
-
rb_ivar_set(dr->handler, ox_at_column_id, LONG2NUM(col));
|
711
|
-
}
|
712
|
-
rb_funcall2(dr->handler, ox_cdata_id, 1, args);
|
713
|
-
}
|
680
|
+
default:
|
681
|
+
if (1 < end && !buf_checkset(&cp)) {
|
682
|
+
buf_checkpoint(&dr->buf, &cp);
|
683
|
+
}
|
684
|
+
end = 0;
|
685
|
+
break;
|
686
|
+
}
|
714
687
|
}
|
688
|
+
CB:
|
689
|
+
dr->cdata(dr, pos, line, col);
|
715
690
|
if ('\0' != zero) {
|
716
|
-
|
691
|
+
*(dr->buf.tail - 1) = zero;
|
717
692
|
}
|
718
693
|
dr->buf.str = 0;
|
719
694
|
|
@@ -722,88 +697,60 @@ read_cdata(SaxDrive dr) {
|
|
722
697
|
|
723
698
|
/* Entered after the "<!--" sequence. Ready to read the rest.
|
724
699
|
*/
|
725
|
-
static char
|
726
|
-
|
727
|
-
char
|
728
|
-
|
729
|
-
|
730
|
-
long
|
731
|
-
long
|
732
|
-
|
733
|
-
struct _checkPt cp = CHECK_PT_INIT;
|
700
|
+
static char read_comment(SaxDrive dr) {
|
701
|
+
char c;
|
702
|
+
char zero = '\0';
|
703
|
+
int end = 0;
|
704
|
+
long pos = (long)(dr->buf.pos - 4);
|
705
|
+
long line = (long)(dr->buf.line);
|
706
|
+
long col = (long)(dr->buf.col - 4);
|
707
|
+
struct _checkPt cp = CHECK_PT_INIT;
|
734
708
|
|
735
709
|
buf_backup(&dr->buf); /* back up to the start in case the cdata is empty */
|
736
710
|
buf_protect(&dr->buf);
|
737
711
|
while (1) {
|
738
712
|
c = buf_get(&dr->buf);
|
739
|
-
|
740
|
-
|
741
|
-
|
742
|
-
break;
|
743
|
-
case '>':
|
713
|
+
switch (c) {
|
714
|
+
case '-': end++; break;
|
715
|
+
case '>':
|
744
716
|
if (2 <= end) {
|
745
717
|
*(dr->buf.tail - 3) = '\0';
|
746
|
-
|
718
|
+
c = buf_get(&dr->buf);
|
747
719
|
goto CB;
|
748
720
|
}
|
749
|
-
|
750
|
-
|
751
|
-
|
721
|
+
if (!buf_checkset(&cp)) {
|
722
|
+
buf_checkpoint(&dr->buf, &cp);
|
723
|
+
}
|
752
724
|
end = 0;
|
753
|
-
|
754
|
-
|
755
|
-
|
756
|
-
|
757
|
-
|
758
|
-
|
759
|
-
|
760
|
-
|
761
|
-
|
762
|
-
|
763
|
-
|
764
|
-
|
765
|
-
|
766
|
-
|
767
|
-
|
725
|
+
break;
|
726
|
+
case '<':
|
727
|
+
if (!buf_checkset(&cp)) {
|
728
|
+
buf_checkpoint(&dr->buf, &cp);
|
729
|
+
}
|
730
|
+
end = 0;
|
731
|
+
break;
|
732
|
+
case '\0':
|
733
|
+
if (buf_checkset(&cp)) {
|
734
|
+
c = buf_checkback(&dr->buf, &cp);
|
735
|
+
ox_sax_drive_error(dr, NO_TERM "comment not terminated");
|
736
|
+
zero = c;
|
737
|
+
*(dr->buf.tail - 1) = '\0';
|
738
|
+
goto CB;
|
739
|
+
}
|
768
740
|
ox_sax_drive_error(dr, NO_TERM "comment not terminated");
|
769
741
|
return '\0';
|
770
|
-
|
771
|
-
|
772
|
-
|
773
|
-
|
774
|
-
|
775
|
-
|
776
|
-
|
777
|
-
}
|
778
|
-
CB:
|
779
|
-
if (dr->has.comment && !dr->blocked) {
|
780
|
-
VALUE args[1];
|
781
|
-
Nv parent = stack_peek(&dr->stack);
|
782
|
-
Hint h = ox_hint_find(dr->options.hints, "!--");
|
783
|
-
|
784
|
-
if (NULL == parent || NULL == parent->hint || OffOverlay != parent->hint->overlay ||
|
785
|
-
(NULL != h && (ActiveOverlay == h->overlay || ActiveOverlay == h->overlay))) {
|
786
|
-
|
787
|
-
args[0] = rb_str_new2(dr->buf.str);
|
788
|
-
#if HAVE_RB_ENC_ASSOCIATE
|
789
|
-
if (0 != dr->encoding) {
|
790
|
-
rb_enc_associate(args[0], dr->encoding);
|
791
|
-
}
|
792
|
-
#endif
|
793
|
-
if (dr->has.pos) {
|
794
|
-
rb_ivar_set(dr->handler, ox_at_pos_id, LONG2NUM(pos));
|
795
|
-
}
|
796
|
-
if (dr->has.line) {
|
797
|
-
rb_ivar_set(dr->handler, ox_at_line_id, LONG2NUM(line));
|
798
|
-
}
|
799
|
-
if (dr->has.column) {
|
800
|
-
rb_ivar_set(dr->handler, ox_at_column_id, LONG2NUM(col));
|
801
|
-
}
|
802
|
-
rb_funcall2(dr->handler, ox_comment_id, 1, args);
|
803
|
-
}
|
742
|
+
default:
|
743
|
+
if (1 < end && !buf_checkset(&cp)) {
|
744
|
+
buf_checkpoint(&dr->buf, &cp);
|
745
|
+
}
|
746
|
+
end = 0;
|
747
|
+
break;
|
748
|
+
}
|
804
749
|
}
|
750
|
+
CB:
|
751
|
+
dr->comment(dr, pos, line, col);
|
805
752
|
if ('\0' != zero) {
|
806
|
-
|
753
|
+
*(dr->buf.tail - 1) = zero;
|
807
754
|
}
|
808
755
|
dr->buf.str = 0;
|
809
756
|
|
@@ -813,106 +760,115 @@ read_comment(SaxDrive dr) {
|
|
813
760
|
/* Entered after the '<' and the first character after that. Returns status
|
814
761
|
* code.
|
815
762
|
*/
|
816
|
-
static char
|
817
|
-
|
818
|
-
|
819
|
-
|
820
|
-
|
821
|
-
|
822
|
-
long
|
823
|
-
long
|
824
|
-
|
825
|
-
|
826
|
-
|
827
|
-
Nv parent = stack_peek(&dr->stack);
|
763
|
+
static char read_element_start(SaxDrive dr) {
|
764
|
+
const char *ename = 0;
|
765
|
+
volatile VALUE name = Qnil;
|
766
|
+
char c;
|
767
|
+
int closed;
|
768
|
+
long pos = (long)(dr->buf.pos);
|
769
|
+
long line = (long)(dr->buf.line);
|
770
|
+
long col = (long)(dr->buf.col);
|
771
|
+
Hint h = NULL;
|
772
|
+
int stackless = 0;
|
773
|
+
Nv parent = stack_peek(&dr->stack);
|
828
774
|
|
829
775
|
if ('\0' == (c = read_name_token(dr))) {
|
830
776
|
return '\0';
|
831
777
|
}
|
832
778
|
if ('\0' == *dr->buf.str) {
|
833
|
-
|
779
|
+
char msg[256];
|
834
780
|
|
835
|
-
|
836
|
-
|
781
|
+
snprintf(msg, sizeof(msg) - 1, "%sempty element", INVALID_FORMAT);
|
782
|
+
ox_sax_drive_error_at(dr, msg, pos, line, col);
|
837
783
|
|
838
|
-
|
784
|
+
return buf_get(&dr->buf);
|
839
785
|
}
|
840
786
|
if (0 != parent) {
|
841
|
-
|
787
|
+
parent->childCnt++;
|
842
788
|
}
|
843
|
-
if (dr->options.smart && 0 == dr->options.hints && stack_empty(&dr->stack) &&
|
844
|
-
|
789
|
+
if (dr->options.smart && 0 == dr->options.hints && stack_empty(&dr->stack) &&
|
790
|
+
0 == strcasecmp("html", dr->buf.str)) {
|
791
|
+
dr->options.hints = ox_hints_html();
|
845
792
|
}
|
846
793
|
if (NULL != dr->options.hints) {
|
847
|
-
|
848
|
-
|
849
|
-
|
850
|
-
|
851
|
-
|
852
|
-
|
853
|
-
|
854
|
-
|
855
|
-
|
856
|
-
|
857
|
-
|
858
|
-
|
859
|
-
|
860
|
-
|
861
|
-
|
862
|
-
|
863
|
-
|
864
|
-
|
865
|
-
|
866
|
-
|
867
|
-
|
868
|
-
|
869
|
-
|
870
|
-
|
871
|
-
|
872
|
-
|
873
|
-
|
874
|
-
|
875
|
-
|
876
|
-
|
877
|
-
|
878
|
-
|
879
|
-
|
880
|
-
|
881
|
-
|
882
|
-
|
883
|
-
|
884
|
-
|
885
|
-
|
886
|
-
|
887
|
-
|
888
|
-
|
889
|
-
|
890
|
-
|
891
|
-
|
892
|
-
|
893
|
-
|
894
|
-
|
895
|
-
|
896
|
-
|
897
|
-
|
898
|
-
|
899
|
-
|
900
|
-
|
901
|
-
|
902
|
-
|
903
|
-
|
904
|
-
|
905
|
-
|
906
|
-
|
907
|
-
|
908
|
-
|
909
|
-
|
910
|
-
|
911
|
-
|
912
|
-
|
913
|
-
|
914
|
-
|
915
|
-
|
794
|
+
hint_clear_empty(dr);
|
795
|
+
h = ox_hint_find(dr->options.hints, dr->buf.str);
|
796
|
+
if (NULL == h) {
|
797
|
+
char msg[256];
|
798
|
+
|
799
|
+
snprintf(msg,
|
800
|
+
sizeof(msg),
|
801
|
+
"%s%s is not a valid element type for a %s document type.",
|
802
|
+
INV_ELEMENT,
|
803
|
+
dr->buf.str,
|
804
|
+
dr->options.hints->name);
|
805
|
+
ox_sax_drive_error(dr, msg);
|
806
|
+
} else {
|
807
|
+
Nv top_nv = stack_peek(&dr->stack);
|
808
|
+
|
809
|
+
if (AbortOverlay == h->overlay) {
|
810
|
+
if (rb_respond_to(dr->handler, ox_abort_id)) {
|
811
|
+
VALUE args[1];
|
812
|
+
|
813
|
+
args[0] = str2sym(dr, dr->buf.str, dr->buf.tail - dr->buf.str - 1, NULL);
|
814
|
+
rb_funcall2(dr->handler, ox_abort_id, 1, args);
|
815
|
+
}
|
816
|
+
dr->abort = true;
|
817
|
+
return '\0';
|
818
|
+
}
|
819
|
+
if (BlockOverlay == h->overlay) {
|
820
|
+
dr->blocked++;
|
821
|
+
}
|
822
|
+
if (h->empty) {
|
823
|
+
stackless = 1;
|
824
|
+
}
|
825
|
+
if (0 != top_nv) {
|
826
|
+
char msg[256];
|
827
|
+
|
828
|
+
if (!h->nest && NestOverlay != h->overlay && 0 == strcasecmp(top_nv->name, h->name)) {
|
829
|
+
snprintf(msg,
|
830
|
+
sizeof(msg) - 1,
|
831
|
+
"%s%s can not be nested in a %s document, closing previous.",
|
832
|
+
INV_ELEMENT,
|
833
|
+
dr->buf.str,
|
834
|
+
dr->options.hints->name);
|
835
|
+
ox_sax_drive_error(dr, msg);
|
836
|
+
stack_pop(&dr->stack);
|
837
|
+
end_element_cb(dr, top_nv->val, pos, line, col, top_nv->hint);
|
838
|
+
top_nv = stack_peek(&dr->stack);
|
839
|
+
}
|
840
|
+
if (NULL != top_nv && 0 != h->parents && NestOverlay != h->overlay) {
|
841
|
+
const char **p;
|
842
|
+
int ok = 0;
|
843
|
+
|
844
|
+
for (p = h->parents; 0 != *p; p++) {
|
845
|
+
if (0 == strcasecmp(*p, top_nv->name)) {
|
846
|
+
ok = 1;
|
847
|
+
break;
|
848
|
+
}
|
849
|
+
}
|
850
|
+
if (!ok) {
|
851
|
+
snprintf(msg,
|
852
|
+
sizeof(msg) - 1,
|
853
|
+
"%s%s can not be a child of a %s in a %s document.",
|
854
|
+
INV_ELEMENT,
|
855
|
+
h->name,
|
856
|
+
top_nv->name,
|
857
|
+
dr->options.hints->name);
|
858
|
+
ox_sax_drive_error(dr, msg);
|
859
|
+
}
|
860
|
+
}
|
861
|
+
}
|
862
|
+
}
|
863
|
+
}
|
864
|
+
name = str2sym(dr, dr->buf.str, dr->buf.tail - dr->buf.str - 1, &ename);
|
865
|
+
if (dr->has_start_element && 0 >= dr->blocked &&
|
866
|
+
(NULL == h || ActiveOverlay == h->overlay || NestOverlay == h->overlay)) {
|
867
|
+
VALUE args[1];
|
868
|
+
|
869
|
+
dr->set_pos(dr->handler, pos);
|
870
|
+
dr->set_line(dr->handler, line);
|
871
|
+
dr->set_col(dr->handler, col);
|
916
872
|
args[0] = name;
|
917
873
|
rb_funcall2(dr->handler, ox_start_element_id, 1, args);
|
918
874
|
}
|
@@ -921,362 +877,302 @@ read_element_start(SaxDrive dr) {
|
|
921
877
|
} else if ('>' == c) {
|
922
878
|
closed = 0;
|
923
879
|
} else {
|
924
|
-
|
880
|
+
buf_protect(&dr->buf);
|
925
881
|
c = read_attrs(dr, c, '/', '>', 0, 0, h);
|
926
|
-
|
927
|
-
|
928
|
-
|
929
|
-
|
882
|
+
if (is_white(c)) {
|
883
|
+
c = buf_next_non_white(&dr->buf);
|
884
|
+
}
|
885
|
+
closed = ('/' == c);
|
930
886
|
}
|
931
|
-
if (
|
932
|
-
|
887
|
+
if (0 >= dr->blocked && (NULL == h || ActiveOverlay == h->overlay || NestOverlay == h->overlay)) {
|
888
|
+
dr->attrs_done(dr->handler);
|
933
889
|
}
|
934
890
|
if (closed) {
|
935
|
-
|
936
|
-
|
937
|
-
|
938
|
-
col = dr->buf.col;
|
939
|
-
end_element_cb(dr, name, pos, line, col, h);
|
891
|
+
c = buf_next_non_white(&dr->buf);
|
892
|
+
|
893
|
+
end_element_cb(dr, name, dr->buf.pos, dr->buf.line, dr->buf.col, h);
|
940
894
|
} else if (stackless) {
|
941
|
-
|
895
|
+
end_element_cb(dr, name, pos, line, col, h);
|
942
896
|
} else if (NULL != h && h->jump) {
|
943
|
-
|
944
|
-
|
945
|
-
|
946
|
-
|
947
|
-
|
948
|
-
|
949
|
-
|
897
|
+
stack_push(&dr->stack, ename, name, h);
|
898
|
+
if ('>' != c) {
|
899
|
+
ox_sax_drive_error(dr, WRONG_CHAR "element not closed");
|
900
|
+
return c;
|
901
|
+
}
|
902
|
+
read_jump(dr, h->name);
|
903
|
+
return '<';
|
950
904
|
} else {
|
951
|
-
|
905
|
+
stack_push(&dr->stack, ename, name, h);
|
952
906
|
}
|
953
907
|
if ('>' != c) {
|
954
|
-
|
955
|
-
|
908
|
+
ox_sax_drive_error(dr, WRONG_CHAR "element not closed");
|
909
|
+
return c;
|
956
910
|
}
|
957
911
|
dr->buf.str = 0;
|
958
912
|
|
959
913
|
return buf_get(&dr->buf);
|
960
914
|
}
|
961
915
|
|
962
|
-
static Nv
|
963
|
-
|
964
|
-
Nv nv;
|
916
|
+
static Nv stack_rev_find(SaxDrive dr, const char *name) {
|
917
|
+
Nv nv;
|
965
918
|
|
966
919
|
for (nv = dr->stack.tail - 1; dr->stack.head <= nv; nv--) {
|
967
|
-
|
968
|
-
|
969
|
-
|
920
|
+
if (0 == (dr->options.smart ? strcasecmp(name, nv->name) : strcmp(name, nv->name))) {
|
921
|
+
return nv;
|
922
|
+
}
|
970
923
|
}
|
971
924
|
return 0;
|
972
925
|
}
|
973
926
|
|
974
|
-
static char
|
975
|
-
|
976
|
-
|
977
|
-
|
978
|
-
long
|
979
|
-
long
|
980
|
-
|
981
|
-
|
982
|
-
Hint h = NULL;
|
927
|
+
static char read_element_end(SaxDrive dr) {
|
928
|
+
VALUE name = Qnil;
|
929
|
+
char c;
|
930
|
+
long pos = (long)(dr->buf.pos - 1);
|
931
|
+
long line = (long)(dr->buf.line);
|
932
|
+
long col = (long)(dr->buf.col - 1);
|
933
|
+
Nv nv;
|
934
|
+
Hint h = NULL;
|
983
935
|
|
984
936
|
if ('\0' == (c = read_name_token(dr))) {
|
985
937
|
return '\0';
|
986
938
|
}
|
987
939
|
if (is_white(c)) {
|
988
|
-
|
940
|
+
c = buf_next_non_white(&dr->buf);
|
989
941
|
}
|
990
942
|
// c should be > and current is one past so read another char
|
991
|
-
c
|
943
|
+
c = buf_get(&dr->buf);
|
992
944
|
nv = stack_peek(&dr->stack);
|
993
|
-
if (0 != nv &&
|
994
|
-
|
995
|
-
|
996
|
-
|
997
|
-
stack_pop(&dr->stack);
|
945
|
+
if (0 != nv && 0 == (dr->options.smart ? strcasecmp(dr->buf.str, nv->name) : strcmp(dr->buf.str, nv->name))) {
|
946
|
+
name = nv->val;
|
947
|
+
h = nv->hint;
|
948
|
+
stack_pop(&dr->stack);
|
998
949
|
} else {
|
999
|
-
|
1000
|
-
|
1001
|
-
|
1002
|
-
|
1003
|
-
|
1004
|
-
|
1005
|
-
|
1006
|
-
|
1007
|
-
|
1008
|
-
|
1009
|
-
|
1010
|
-
|
1011
|
-
|
1012
|
-
|
1013
|
-
|
1014
|
-
|
1015
|
-
|
1016
|
-
|
1017
|
-
|
1018
|
-
|
1019
|
-
|
1020
|
-
|
1021
|
-
|
1022
|
-
|
1023
|
-
|
1024
|
-
|
1025
|
-
|
1026
|
-
|
1027
|
-
|
1028
|
-
|
1029
|
-
|
1030
|
-
|
1031
|
-
|
1032
|
-
|
1033
|
-
|
1034
|
-
|
1035
|
-
|
1036
|
-
|
1037
|
-
|
1038
|
-
|
1039
|
-
|
1040
|
-
|
1041
|
-
|
1042
|
-
|
1043
|
-
|
1044
|
-
|
1045
|
-
|
1046
|
-
|
1047
|
-
|
1048
|
-
|
1049
|
-
|
1050
|
-
|
1051
|
-
|
1052
|
-
|
1053
|
-
|
1054
|
-
|
1055
|
-
|
1056
|
-
rb_funcall(dr->handler, ox_end_element_id, 1, nv->val);
|
1057
|
-
}
|
1058
|
-
if (NULL != nv->hint && BlockOverlay == nv->hint->overlay && 0 < dr->blocked) {
|
1059
|
-
dr->blocked--;
|
1060
|
-
}
|
1061
|
-
}
|
1062
|
-
name = nv->val;
|
1063
|
-
h = nv->hint;
|
1064
|
-
}
|
1065
|
-
}
|
950
|
+
// Mismatched start and end
|
951
|
+
char msg[256];
|
952
|
+
Nv match = stack_rev_find(dr, dr->buf.str);
|
953
|
+
|
954
|
+
if (0 == match) {
|
955
|
+
// Not found so open and close element.
|
956
|
+
h = ox_hint_find(dr->options.hints, dr->buf.str);
|
957
|
+
if (NULL != h && h->empty) {
|
958
|
+
// Just close normally
|
959
|
+
name = str2sym(dr, dr->buf.str, dr->buf.tail - dr->buf.str - 2, 0);
|
960
|
+
snprintf(msg,
|
961
|
+
sizeof(msg) - 1,
|
962
|
+
"%selement '%s' should not have a separate close element",
|
963
|
+
EL_MISMATCH,
|
964
|
+
dr->buf.str);
|
965
|
+
ox_sax_drive_error_at(dr, msg, pos, line, col);
|
966
|
+
return c;
|
967
|
+
} else {
|
968
|
+
snprintf(msg, sizeof(msg) - 1, "%selement '%s' closed but not opened", EL_MISMATCH, dr->buf.str);
|
969
|
+
ox_sax_drive_error_at(dr, msg, pos, line, col);
|
970
|
+
name = str2sym(dr, dr->buf.str, dr->buf.tail - dr->buf.str - 2, 0);
|
971
|
+
if (dr->has_start_element && 0 >= dr->blocked &&
|
972
|
+
(NULL == h || ActiveOverlay == h->overlay || NestOverlay == h->overlay)) {
|
973
|
+
VALUE args[1];
|
974
|
+
|
975
|
+
dr->set_pos(dr->handler, pos);
|
976
|
+
dr->set_line(dr->handler, line);
|
977
|
+
dr->set_col(dr->handler, col);
|
978
|
+
args[0] = name;
|
979
|
+
rb_funcall2(dr->handler, ox_start_element_id, 1, args);
|
980
|
+
}
|
981
|
+
if (NULL != h && BlockOverlay == h->overlay && 0 < dr->blocked) {
|
982
|
+
dr->blocked--;
|
983
|
+
}
|
984
|
+
}
|
985
|
+
} else {
|
986
|
+
// Found a match so close all up to the found element in stack.
|
987
|
+
Nv n2;
|
988
|
+
|
989
|
+
if (0 != (n2 = hint_try_close(dr, dr->buf.str))) {
|
990
|
+
name = n2->val;
|
991
|
+
h = n2->hint;
|
992
|
+
} else {
|
993
|
+
snprintf(msg,
|
994
|
+
sizeof(msg) - 1,
|
995
|
+
"%selement '%s' close does not match '%s' open",
|
996
|
+
EL_MISMATCH,
|
997
|
+
dr->buf.str,
|
998
|
+
nv->name);
|
999
|
+
ox_sax_drive_error_at(dr, msg, pos, line, col);
|
1000
|
+
for (nv = stack_pop(&dr->stack); match < nv; nv = stack_pop(&dr->stack)) {
|
1001
|
+
end_element_cb(dr, nv->val, pos, line, col, nv->hint);
|
1002
|
+
}
|
1003
|
+
name = nv->val;
|
1004
|
+
h = nv->hint;
|
1005
|
+
}
|
1006
|
+
}
|
1066
1007
|
}
|
1067
1008
|
end_element_cb(dr, name, pos, line, col, h);
|
1068
1009
|
|
1069
1010
|
return c;
|
1070
1011
|
}
|
1071
1012
|
|
1072
|
-
static char
|
1073
|
-
|
1074
|
-
|
1075
|
-
|
1076
|
-
long
|
1077
|
-
long
|
1078
|
-
|
1079
|
-
|
1080
|
-
int allWhite = 1;
|
1013
|
+
static char read_text(SaxDrive dr) {
|
1014
|
+
VALUE args[1];
|
1015
|
+
char c;
|
1016
|
+
long pos = (long)(dr->buf.pos);
|
1017
|
+
long line = (long)(dr->buf.line);
|
1018
|
+
long col = (long)(dr->buf.col - 1);
|
1019
|
+
Nv parent = stack_peek(&dr->stack);
|
1020
|
+
int allWhite = 1;
|
1081
1021
|
|
1082
1022
|
buf_backup(&dr->buf);
|
1083
1023
|
buf_protect(&dr->buf);
|
1084
1024
|
while ('<' != (c = buf_get(&dr->buf))) {
|
1085
|
-
|
1086
|
-
|
1087
|
-
|
1088
|
-
|
1089
|
-
|
1090
|
-
|
1091
|
-
|
1092
|
-
|
1093
|
-
|
1094
|
-
|
1095
|
-
}
|
1025
|
+
switch (c) {
|
1026
|
+
case ' ':
|
1027
|
+
case '\t':
|
1028
|
+
case '\f':
|
1029
|
+
case '\n':
|
1030
|
+
case '\r': break;
|
1031
|
+
case '\0':
|
1032
|
+
if (allWhite) {
|
1033
|
+
return c;
|
1034
|
+
}
|
1096
1035
|
ox_sax_drive_error(dr, NO_TERM "text not terminated");
|
1097
|
-
|
1098
|
-
|
1099
|
-
|
1100
|
-
|
1101
|
-
|
1102
|
-
|
1103
|
-
}
|
1104
|
-
END_OF_BUF:
|
1036
|
+
goto END_OF_BUF;
|
1037
|
+
break;
|
1038
|
+
default: allWhite = 0; break;
|
1039
|
+
}
|
1040
|
+
}
|
1041
|
+
END_OF_BUF:
|
1105
1042
|
if ('\0' != c) {
|
1106
|
-
|
1043
|
+
*(dr->buf.tail - 1) = '\0';
|
1107
1044
|
}
|
1108
1045
|
if (allWhite) {
|
1109
|
-
|
1110
|
-
|
1111
|
-
|
1112
|
-
|
1113
|
-
|
1114
|
-
|
1115
|
-
|
1116
|
-
|
1117
|
-
|
1118
|
-
|
1119
|
-
|
1120
|
-
|
1121
|
-
|
1122
|
-
|
1123
|
-
|
1124
|
-
|
1125
|
-
rb_ivar_set(dr->handler, ox_at_line_id, LONG2NUM(line));
|
1126
|
-
}
|
1127
|
-
if (dr->has.column) {
|
1128
|
-
rb_ivar_set(dr->handler, ox_at_column_id, LONG2NUM(col));
|
1129
|
-
}
|
1130
|
-
rb_funcall2(dr->handler, ox_text_id, 1, args);
|
1131
|
-
}
|
1132
|
-
if (!isEnd || 0 == parent || 0 < parent->childCnt) {
|
1133
|
-
return c;
|
1134
|
-
}
|
1046
|
+
int isEnd = ('/' == buf_get(&dr->buf));
|
1047
|
+
|
1048
|
+
buf_backup(&dr->buf);
|
1049
|
+
if (dr->has_text && ((NoSkip == dr->options.skip && !isEnd) || (OffSkip == dr->options.skip))) {
|
1050
|
+
args[0] = rb_str_new2(dr->buf.str);
|
1051
|
+
if (0 != dr->encoding) {
|
1052
|
+
rb_enc_associate(args[0], dr->encoding);
|
1053
|
+
}
|
1054
|
+
dr->set_pos(dr->handler, pos);
|
1055
|
+
dr->set_line(dr->handler, line);
|
1056
|
+
dr->set_col(dr->handler, col);
|
1057
|
+
rb_funcall2(dr->handler, ox_text_id, 1, args);
|
1058
|
+
}
|
1059
|
+
if (!isEnd || 0 == parent || 0 < parent->childCnt) {
|
1060
|
+
return c;
|
1061
|
+
}
|
1135
1062
|
}
|
1136
1063
|
if (0 != parent) {
|
1137
|
-
|
1064
|
+
parent->childCnt++;
|
1138
1065
|
}
|
1139
1066
|
if (!dr->blocked && (NULL == parent || NULL == parent->hint || OffOverlay != parent->hint->overlay)) {
|
1140
|
-
|
1141
|
-
|
1142
|
-
|
1143
|
-
|
1144
|
-
|
1145
|
-
|
1146
|
-
|
1147
|
-
|
1148
|
-
|
1149
|
-
|
1150
|
-
|
1151
|
-
|
1152
|
-
|
1153
|
-
|
1154
|
-
|
1155
|
-
|
1156
|
-
|
1157
|
-
|
1158
|
-
|
1159
|
-
|
1160
|
-
|
1161
|
-
|
1162
|
-
|
1163
|
-
|
1164
|
-
break;
|
1165
|
-
}
|
1166
|
-
args[0] = rb_str_new2(dr->buf.str);
|
1167
|
-
#if HAVE_RB_ENC_ASSOCIATE
|
1168
|
-
if (0 != dr->encoding) {
|
1169
|
-
rb_enc_associate(args[0], dr->encoding);
|
1170
|
-
}
|
1171
|
-
#endif
|
1172
|
-
if (dr->has.pos) {
|
1173
|
-
rb_ivar_set(dr->handler, ox_at_pos_id, LONG2NUM(pos));
|
1174
|
-
}
|
1175
|
-
if (dr->has.line) {
|
1176
|
-
rb_ivar_set(dr->handler, ox_at_line_id, LONG2NUM(line));
|
1177
|
-
}
|
1178
|
-
if (dr->has.column) {
|
1179
|
-
rb_ivar_set(dr->handler, ox_at_column_id, LONG2NUM(col));
|
1180
|
-
}
|
1181
|
-
rb_funcall2(dr->handler, ox_text_id, 1, args);
|
1182
|
-
}
|
1067
|
+
if (dr->has_value) {
|
1068
|
+
dr->set_pos(dr->handler, pos);
|
1069
|
+
dr->set_line(dr->handler, line);
|
1070
|
+
dr->set_col(dr->handler, col);
|
1071
|
+
*args = dr->value_obj;
|
1072
|
+
rb_funcall2(dr->handler, ox_value_id, 1, args);
|
1073
|
+
} else if (dr->has_text) {
|
1074
|
+
if (dr->options.convert_special) {
|
1075
|
+
ox_sax_collapse_special(dr, dr->buf.str, pos, line, col);
|
1076
|
+
}
|
1077
|
+
switch (dr->options.skip) {
|
1078
|
+
case CrSkip: buf_collapse_return(dr->buf.str); break;
|
1079
|
+
case SpcSkip: buf_collapse_white(dr->buf.str); break;
|
1080
|
+
default: break;
|
1081
|
+
}
|
1082
|
+
args[0] = rb_str_new2(dr->buf.str);
|
1083
|
+
if (0 != dr->encoding) {
|
1084
|
+
rb_enc_associate(args[0], dr->encoding);
|
1085
|
+
}
|
1086
|
+
dr->set_pos(dr->handler, pos);
|
1087
|
+
dr->set_line(dr->handler, line);
|
1088
|
+
dr->set_col(dr->handler, col);
|
1089
|
+
rb_funcall2(dr->handler, ox_text_id, 1, args);
|
1090
|
+
}
|
1183
1091
|
}
|
1184
1092
|
dr->buf.str = 0;
|
1185
1093
|
|
1186
1094
|
return c;
|
1187
1095
|
}
|
1188
1096
|
|
1189
|
-
static int
|
1190
|
-
|
1191
|
-
struct _checkPt cp;
|
1097
|
+
static int read_jump_term(Buf buf, const char *pat) {
|
1098
|
+
struct _checkPt cp;
|
1192
1099
|
|
1193
|
-
buf_checkpoint(buf, &cp);
|
1100
|
+
buf_checkpoint(buf, &cp); // right after <
|
1194
1101
|
if ('/' != buf_next_non_white(buf)) {
|
1195
|
-
|
1102
|
+
return 0;
|
1196
1103
|
}
|
1197
1104
|
if (*pat != tolower(buf_next_non_white(buf))) {
|
1198
|
-
|
1105
|
+
return 0;
|
1199
1106
|
}
|
1200
1107
|
for (pat++; '\0' != *pat; pat++) {
|
1201
|
-
|
1202
|
-
|
1203
|
-
|
1108
|
+
if (*pat != tolower(buf_get(buf))) {
|
1109
|
+
return 0;
|
1110
|
+
}
|
1204
1111
|
}
|
1205
1112
|
if ('>' != buf_next_non_white(buf)) {
|
1206
|
-
|
1113
|
+
return 0;
|
1207
1114
|
}
|
1208
1115
|
buf_checkback(buf, &cp);
|
1209
1116
|
return 1;
|
1210
1117
|
}
|
1211
1118
|
|
1212
|
-
static char
|
1213
|
-
|
1214
|
-
|
1215
|
-
|
1216
|
-
long
|
1217
|
-
long
|
1218
|
-
|
1219
|
-
Nv parent = stack_peek(&dr->stack);
|
1119
|
+
static char read_jump(SaxDrive dr, const char *pat) {
|
1120
|
+
VALUE args[1];
|
1121
|
+
char c;
|
1122
|
+
long pos = (long)(dr->buf.pos);
|
1123
|
+
long line = (long)(dr->buf.line);
|
1124
|
+
long col = (long)(dr->buf.col - 1);
|
1125
|
+
Nv parent = stack_peek(&dr->stack);
|
1220
1126
|
|
1221
1127
|
buf_protect(&dr->buf);
|
1222
1128
|
while (1) {
|
1223
|
-
|
1224
|
-
|
1225
|
-
|
1226
|
-
|
1227
|
-
|
1228
|
-
|
1229
|
-
|
1230
|
-
|
1231
|
-
|
1129
|
+
c = buf_get(&dr->buf);
|
1130
|
+
switch (c) {
|
1131
|
+
case '<':
|
1132
|
+
if (read_jump_term(&dr->buf, pat)) {
|
1133
|
+
goto END_OF_BUF;
|
1134
|
+
break;
|
1135
|
+
}
|
1136
|
+
break;
|
1137
|
+
case '\0':
|
1232
1138
|
ox_sax_drive_error(dr, NO_TERM "not terminated");
|
1233
|
-
|
1234
|
-
|
1235
|
-
|
1236
|
-
|
1237
|
-
}
|
1139
|
+
goto END_OF_BUF;
|
1140
|
+
break;
|
1141
|
+
default: break;
|
1142
|
+
}
|
1238
1143
|
}
|
1239
|
-
|
1144
|
+
END_OF_BUF:
|
1240
1145
|
if ('\0' != c) {
|
1241
|
-
|
1146
|
+
*(dr->buf.tail - 1) = '\0';
|
1242
1147
|
}
|
1243
1148
|
if (0 != parent) {
|
1244
|
-
|
1149
|
+
parent->childCnt++;
|
1245
1150
|
}
|
1246
1151
|
// TBD check parent overlay
|
1247
|
-
if (dr->
|
1152
|
+
if (dr->has_text && !dr->blocked) {
|
1248
1153
|
args[0] = rb_str_new2(dr->buf.str);
|
1249
|
-
#if HAVE_RB_ENC_ASSOCIATE
|
1250
1154
|
if (0 != dr->encoding) {
|
1251
1155
|
rb_enc_associate(args[0], dr->encoding);
|
1252
1156
|
}
|
1253
|
-
|
1254
|
-
|
1255
|
-
|
1256
|
-
}
|
1257
|
-
if (dr->has.line) {
|
1258
|
-
rb_ivar_set(dr->handler, ox_at_line_id, LONG2NUM(line));
|
1259
|
-
}
|
1260
|
-
if (dr->has.column) {
|
1261
|
-
rb_ivar_set(dr->handler, ox_at_column_id, LONG2NUM(col));
|
1262
|
-
}
|
1157
|
+
dr->set_pos(dr->handler, pos);
|
1158
|
+
dr->set_line(dr->handler, line);
|
1159
|
+
dr->set_col(dr->handler, col);
|
1263
1160
|
rb_funcall2(dr->handler, ox_text_id, 1, args);
|
1264
1161
|
}
|
1265
1162
|
dr->buf.str = 0;
|
1266
1163
|
if ('\0' != c) {
|
1267
|
-
|
1164
|
+
*(dr->buf.tail - 1) = '<';
|
1268
1165
|
}
|
1269
1166
|
return c;
|
1270
1167
|
}
|
1271
1168
|
|
1272
|
-
static char
|
1273
|
-
|
1274
|
-
|
1275
|
-
|
1276
|
-
off_t
|
1277
|
-
off_t
|
1278
|
-
|
1279
|
-
char *attr_value;
|
1169
|
+
static char read_attrs(SaxDrive dr, char c, char termc, char term2, int is_xml, int eq_req, Hint h) {
|
1170
|
+
VALUE name = Qnil;
|
1171
|
+
int is_encoding = 0;
|
1172
|
+
off_t pos;
|
1173
|
+
off_t line;
|
1174
|
+
off_t col;
|
1175
|
+
char *attr_value;
|
1280
1176
|
|
1281
1177
|
// already protected by caller
|
1282
1178
|
dr->buf.str = dr->buf.tail;
|
@@ -1284,94 +1180,52 @@ read_attrs(SaxDrive dr, char c, char termc, char term2, int is_xml, int eq_req,
|
|
1284
1180
|
c = buf_next_non_white(&dr->buf);
|
1285
1181
|
}
|
1286
1182
|
while (termc != c && term2 != c) {
|
1287
|
-
|
1183
|
+
buf_backup(&dr->buf);
|
1288
1184
|
if ('\0' == c) {
|
1289
|
-
|
1290
|
-
|
1185
|
+
ox_sax_drive_error(dr, NO_TERM "attributes not terminated");
|
1186
|
+
return '\0';
|
1291
1187
|
}
|
1292
|
-
|
1293
|
-
|
1294
|
-
|
1188
|
+
pos = dr->buf.pos + 1;
|
1189
|
+
line = dr->buf.line;
|
1190
|
+
col = dr->buf.col + 1;
|
1295
1191
|
if ('\0' == (c = read_name_token(dr))) {
|
1296
|
-
|
1297
|
-
|
1192
|
+
ox_sax_drive_error(dr, NO_TERM "error reading token");
|
1193
|
+
return '\0';
|
1298
1194
|
}
|
1299
1195
|
if (is_xml && 0 == strcasecmp("encoding", dr->buf.str)) {
|
1300
1196
|
is_encoding = 1;
|
1301
1197
|
}
|
1302
|
-
if (dr->
|
1303
|
-
name = str2sym(dr, dr->buf.str, 0);
|
1198
|
+
if (dr->want_attr_name) {
|
1199
|
+
name = str2sym(dr, dr->buf.str, dr->buf.tail - dr->buf.str - 1, 0);
|
1304
1200
|
}
|
1305
1201
|
if (is_white(c)) {
|
1306
1202
|
c = buf_next_non_white(&dr->buf);
|
1307
1203
|
}
|
1308
1204
|
if ('=' != c) {
|
1309
|
-
|
1310
|
-
|
1311
|
-
|
1312
|
-
|
1313
|
-
|
1314
|
-
|
1315
|
-
|
1205
|
+
if (eq_req) {
|
1206
|
+
dr->err = 1;
|
1207
|
+
return c;
|
1208
|
+
} else {
|
1209
|
+
ox_sax_drive_error(dr, WRONG_CHAR "no attribute value");
|
1210
|
+
attr_value = (char *)"";
|
1211
|
+
}
|
1316
1212
|
} else {
|
1317
|
-
|
1318
|
-
|
1319
|
-
|
1320
|
-
|
1321
|
-
|
1322
|
-
|
1323
|
-
|
1324
|
-
|
1325
|
-
|
1326
|
-
|
1327
|
-
|
1328
|
-
|
1329
|
-
|
1330
|
-
|
1331
|
-
|
1332
|
-
|
1333
|
-
VALUE args[2];
|
1334
|
-
|
1335
|
-
if (dr->has.pos) {
|
1336
|
-
rb_ivar_set(dr->handler, ox_at_pos_id, LONG2NUM(pos));
|
1337
|
-
}
|
1338
|
-
if (dr->has.line) {
|
1339
|
-
rb_ivar_set(dr->handler, ox_at_line_id, LONG2NUM(line));
|
1340
|
-
}
|
1341
|
-
if (dr->has.column) {
|
1342
|
-
rb_ivar_set(dr->handler, ox_at_column_id, LONG2NUM(col));
|
1343
|
-
}
|
1344
|
-
args[0] = name;
|
1345
|
-
args[1] = dr->value_obj;
|
1346
|
-
rb_funcall2(dr->handler, ox_attr_value_id, 2, args);
|
1347
|
-
} else if (dr->has.attr) {
|
1348
|
-
VALUE args[2];
|
1349
|
-
|
1350
|
-
args[0] = name;
|
1351
|
-
if (dr->options.convert_special) {
|
1352
|
-
ox_sax_collapse_special(dr, dr->buf.str, pos, line, col);
|
1353
|
-
}
|
1354
|
-
args[1] = rb_str_new2(attr_value);
|
1355
|
-
#if HAVE_RB_ENC_ASSOCIATE
|
1356
|
-
if (0 != dr->encoding) {
|
1357
|
-
rb_enc_associate(args[1], dr->encoding);
|
1358
|
-
}
|
1359
|
-
#endif
|
1360
|
-
if (dr->has.pos) {
|
1361
|
-
rb_ivar_set(dr->handler, ox_at_pos_id, LONG2NUM(pos));
|
1362
|
-
}
|
1363
|
-
if (dr->has.line) {
|
1364
|
-
rb_ivar_set(dr->handler, ox_at_line_id, LONG2NUM(line));
|
1365
|
-
}
|
1366
|
-
if (dr->has.column) {
|
1367
|
-
rb_ivar_set(dr->handler, ox_at_column_id, LONG2NUM(col));
|
1368
|
-
}
|
1369
|
-
rb_funcall2(dr->handler, ox_attr_id, 2, args);
|
1370
|
-
}
|
1371
|
-
}
|
1372
|
-
if (is_white(c)) {
|
1373
|
-
c = buf_next_non_white(&dr->buf);
|
1374
|
-
}
|
1213
|
+
pos = dr->buf.pos + 1;
|
1214
|
+
line = dr->buf.line;
|
1215
|
+
col = dr->buf.col + 1;
|
1216
|
+
c = read_quoted_value(dr);
|
1217
|
+
attr_value = dr->buf.str;
|
1218
|
+
if (is_encoding) {
|
1219
|
+
dr->encoding = rb_enc_find(dr->buf.str);
|
1220
|
+
is_encoding = 0;
|
1221
|
+
}
|
1222
|
+
}
|
1223
|
+
if (0 >= dr->blocked && (NULL == h || ActiveOverlay == h->overlay || NestOverlay == h->overlay)) {
|
1224
|
+
dr->attr_cb(dr, name, attr_value, pos, line, col);
|
1225
|
+
}
|
1226
|
+
if (is_white(c)) {
|
1227
|
+
c = buf_next_non_white(&dr->buf);
|
1228
|
+
}
|
1375
1229
|
}
|
1376
1230
|
dr->buf.str = 0;
|
1377
1231
|
|
@@ -1381,66 +1235,62 @@ read_attrs(SaxDrive dr, char c, char termc, char term2, int is_xml, int eq_req,
|
|
1381
1235
|
/* The character after the word is returned. dr->buf.tail is one past
|
1382
1236
|
* that. dr->buf.str will point to the token which will be '\0' terminated.
|
1383
1237
|
*/
|
1384
|
-
static char
|
1385
|
-
|
1386
|
-
char c;
|
1238
|
+
static char read_name_token(SaxDrive dr) {
|
1239
|
+
char c;
|
1387
1240
|
|
1388
1241
|
dr->buf.str = dr->buf.tail;
|
1389
|
-
c
|
1242
|
+
c = buf_get(&dr->buf);
|
1390
1243
|
if (is_white(c)) {
|
1391
|
-
c
|
1244
|
+
c = buf_next_non_white(&dr->buf);
|
1392
1245
|
dr->buf.str = dr->buf.tail - 1;
|
1393
1246
|
}
|
1394
1247
|
while (1) {
|
1395
|
-
|
1396
|
-
|
1397
|
-
|
1398
|
-
|
1399
|
-
|
1400
|
-
|
1401
|
-
|
1402
|
-
|
1403
|
-
|
1404
|
-
|
1405
|
-
|
1406
|
-
|
1407
|
-
return c;
|
1408
|
-
case '\0':
|
1248
|
+
switch (c) {
|
1249
|
+
case ' ':
|
1250
|
+
case '\t':
|
1251
|
+
case '\f':
|
1252
|
+
case '?':
|
1253
|
+
case '=':
|
1254
|
+
case '/':
|
1255
|
+
case '>':
|
1256
|
+
case '<':
|
1257
|
+
case '\n':
|
1258
|
+
case '\r': *(dr->buf.tail - 1) = '\0'; return c;
|
1259
|
+
case '\0':
|
1409
1260
|
/* documents never terminate after a name token */
|
1410
1261
|
ox_sax_drive_error(dr, NO_TERM "document not terminated");
|
1411
1262
|
return '\0';
|
1412
|
-
|
1413
|
-
|
1414
|
-
|
1415
|
-
|
1416
|
-
|
1417
|
-
|
1418
|
-
|
1419
|
-
|
1420
|
-
|
1421
|
-
|
1422
|
-
|
1423
|
-
|
1424
|
-
|
1425
|
-
|
1263
|
+
case ':':
|
1264
|
+
if ('\0' == *dr->options.strip_ns) {
|
1265
|
+
break;
|
1266
|
+
} else if ('*' == *dr->options.strip_ns && '\0' == dr->options.strip_ns[1]) {
|
1267
|
+
dr->buf.str = dr->buf.tail;
|
1268
|
+
} else if (dr->options.smart &&
|
1269
|
+
0 == strncasecmp(dr->options.strip_ns, dr->buf.str, dr->buf.tail - dr->buf.str - 1)) {
|
1270
|
+
dr->buf.str = dr->buf.tail;
|
1271
|
+
} else if (0 == strncmp(dr->options.strip_ns, dr->buf.str, dr->buf.tail - dr->buf.str - 1)) {
|
1272
|
+
dr->buf.str = dr->buf.tail;
|
1273
|
+
}
|
1274
|
+
break;
|
1275
|
+
default: break;
|
1276
|
+
}
|
1426
1277
|
c = buf_get(&dr->buf);
|
1427
1278
|
}
|
1428
1279
|
return '\0';
|
1429
1280
|
}
|
1430
1281
|
|
1431
|
-
/* The character after the quote or if there is no quote, the character after the word is returned. dr->buf.tail is one
|
1432
|
-
* that. dr->buf.str will point to the token which will be '\0' terminated.
|
1282
|
+
/* The character after the quote or if there is no quote, the character after the word is returned. dr->buf.tail is one
|
1283
|
+
* past that. dr->buf.str will point to the token which will be '\0' terminated.
|
1433
1284
|
*/
|
1434
|
-
static char
|
1435
|
-
|
1436
|
-
char c;
|
1285
|
+
static char read_quoted_value(SaxDrive dr) {
|
1286
|
+
char c;
|
1437
1287
|
|
1438
1288
|
c = buf_get(&dr->buf);
|
1439
1289
|
if (is_white(c)) {
|
1440
1290
|
c = buf_next_non_white(&dr->buf);
|
1441
1291
|
}
|
1442
1292
|
if ('"' == c || '\'' == c) {
|
1443
|
-
|
1293
|
+
char term = c;
|
1444
1294
|
|
1445
1295
|
dr->buf.str = dr->buf.tail;
|
1446
1296
|
while (term != (c = buf_get(&dr->buf))) {
|
@@ -1449,186 +1299,171 @@ read_quoted_value(SaxDrive dr) {
|
|
1449
1299
|
return '\0';
|
1450
1300
|
}
|
1451
1301
|
}
|
1452
|
-
|
1453
|
-
|
1454
|
-
|
1455
|
-
|
1302
|
+
// dr->buf.tail is one past quote char
|
1303
|
+
*(dr->buf.tail - 1) = '\0'; /* terminate value */
|
1304
|
+
c = buf_get(&dr->buf);
|
1305
|
+
return c;
|
1456
1306
|
}
|
1457
1307
|
// not quoted, look for something that terminates the string
|
1458
1308
|
dr->buf.str = dr->buf.tail - 1;
|
1459
1309
|
ox_sax_drive_error(dr, WRONG_CHAR "attribute value not in quotes");
|
1460
1310
|
while ('\0' != (c = buf_get(&dr->buf))) {
|
1461
|
-
|
1462
|
-
|
1463
|
-
|
1464
|
-
|
1465
|
-
|
1466
|
-
|
1467
|
-
|
1468
|
-
|
1469
|
-
|
1470
|
-
|
1471
|
-
|
1472
|
-
|
1473
|
-
|
1474
|
-
|
1475
|
-
|
1476
|
-
return '\0'; // should never get here
|
1311
|
+
switch (c) {
|
1312
|
+
case ' ':
|
1313
|
+
// case '/':
|
1314
|
+
case '>':
|
1315
|
+
case '?': // for instructions
|
1316
|
+
case '\t':
|
1317
|
+
case '\n':
|
1318
|
+
case '\r':
|
1319
|
+
*(dr->buf.tail - 1) = '\0'; /* terminate value */
|
1320
|
+
// dr->buf.tail is in the correct position, one after the word terminator
|
1321
|
+
return c;
|
1322
|
+
default: break;
|
1323
|
+
}
|
1324
|
+
}
|
1325
|
+
return '\0'; // should never get here
|
1477
1326
|
}
|
1478
1327
|
|
1479
|
-
static char*
|
1480
|
-
|
1481
|
-
|
1482
|
-
char c;
|
1328
|
+
static char *read_hex_uint64(char *b, uint64_t *up) {
|
1329
|
+
uint64_t u = 0;
|
1330
|
+
char c;
|
1483
1331
|
|
1484
1332
|
for (; ';' != *b; b++) {
|
1485
|
-
|
1486
|
-
|
1487
|
-
|
1488
|
-
|
1489
|
-
|
1490
|
-
|
1491
|
-
|
1492
|
-
|
1493
|
-
|
1494
|
-
|
1333
|
+
c = *b;
|
1334
|
+
if ('0' <= c && c <= '9') {
|
1335
|
+
u = (u << 4) | (uint64_t)(c - '0');
|
1336
|
+
} else if ('a' <= c && c <= 'f') {
|
1337
|
+
u = (u << 4) | (uint64_t)(c - 'a' + 10);
|
1338
|
+
} else if ('A' <= c && c <= 'F') {
|
1339
|
+
u = (u << 4) | (uint64_t)(c - 'A' + 10);
|
1340
|
+
} else {
|
1341
|
+
return 0;
|
1342
|
+
}
|
1495
1343
|
}
|
1496
1344
|
*up = u;
|
1497
1345
|
|
1498
1346
|
return b;
|
1499
1347
|
}
|
1500
1348
|
|
1501
|
-
static char*
|
1502
|
-
|
1503
|
-
|
1504
|
-
char c;
|
1349
|
+
static char *read_10_uint64(char *b, uint64_t *up) {
|
1350
|
+
uint64_t u = 0;
|
1351
|
+
char c;
|
1505
1352
|
|
1506
1353
|
for (; ';' != *b; b++) {
|
1507
|
-
|
1508
|
-
|
1509
|
-
|
1510
|
-
|
1511
|
-
|
1512
|
-
|
1354
|
+
c = *b;
|
1355
|
+
if ('0' <= c && c <= '9') {
|
1356
|
+
u = (u * 10) + (uint64_t)(c - '0');
|
1357
|
+
} else {
|
1358
|
+
return 0;
|
1359
|
+
}
|
1513
1360
|
}
|
1514
1361
|
*up = u;
|
1515
1362
|
|
1516
1363
|
return b;
|
1517
1364
|
}
|
1518
1365
|
|
1519
|
-
int
|
1520
|
-
|
1521
|
-
char
|
1522
|
-
char *b = str;
|
1366
|
+
int ox_sax_collapse_special(SaxDrive dr, char *str, long pos, long line, long col) {
|
1367
|
+
char *s = str;
|
1368
|
+
char *b = str;
|
1523
1369
|
|
1524
1370
|
while ('\0' != *s) {
|
1525
1371
|
if ('&' == *s) {
|
1526
|
-
int
|
1527
|
-
char
|
1372
|
+
int c = 0;
|
1373
|
+
char *end;
|
1528
1374
|
|
1529
1375
|
s++;
|
1530
1376
|
if ('#' == *s) {
|
1531
|
-
|
1532
|
-
|
1533
|
-
|
1534
|
-
|
1535
|
-
|
1536
|
-
|
1537
|
-
|
1538
|
-
|
1539
|
-
|
1540
|
-
|
1541
|
-
|
1542
|
-
|
1543
|
-
|
1544
|
-
|
1545
|
-
|
1546
|
-
|
1547
|
-
|
1548
|
-
|
1549
|
-
|
1550
|
-
|
1551
|
-
|
1552
|
-
|
1553
|
-
|
1554
|
-
|
1555
|
-
|
1556
|
-
|
1557
|
-
|
1558
|
-
|
1559
|
-
|
1560
|
-
|
1561
|
-
|
1562
|
-
|
1563
|
-
|
1564
|
-
|
1565
|
-
|
1566
|
-
|
1567
|
-
|
1568
|
-
|
1569
|
-
|
1570
|
-
ox_sax_drive_error(dr, NO_TERM "Invalid encoding, need UTF-8 encoding to parse &#nnnn; character sequences.");
|
1571
|
-
*b++ = '&';
|
1572
|
-
*b++ = '#';
|
1573
|
-
if ('\0' != x) {
|
1574
|
-
*b++ = x;
|
1575
|
-
}
|
1576
|
-
continue;
|
1577
|
-
*/
|
1578
|
-
}
|
1579
|
-
s = end + 1;
|
1580
|
-
continue;
|
1377
|
+
uint64_t u = 0;
|
1378
|
+
char x;
|
1379
|
+
|
1380
|
+
s++;
|
1381
|
+
if ('x' == *s || 'X' == *s) {
|
1382
|
+
x = *s;
|
1383
|
+
s++;
|
1384
|
+
end = read_hex_uint64(s, &u);
|
1385
|
+
} else {
|
1386
|
+
x = '\0';
|
1387
|
+
end = read_10_uint64(s, &u);
|
1388
|
+
}
|
1389
|
+
if (0 == end) {
|
1390
|
+
ox_sax_drive_error(dr, NO_TERM "special character does not end with a semicolon");
|
1391
|
+
*b++ = '&';
|
1392
|
+
*b++ = '#';
|
1393
|
+
if ('\0' != x) {
|
1394
|
+
*b++ = x;
|
1395
|
+
}
|
1396
|
+
continue;
|
1397
|
+
}
|
1398
|
+
if (u <= 0x000000000000007FULL) {
|
1399
|
+
*b++ = (char)u;
|
1400
|
+
} else if (ox_utf8_encoding == dr->encoding) {
|
1401
|
+
b = ox_ucs_to_utf8_chars(b, u);
|
1402
|
+
} else if (0 == dr->encoding) {
|
1403
|
+
dr->encoding = ox_utf8_encoding;
|
1404
|
+
b = ox_ucs_to_utf8_chars(b, u);
|
1405
|
+
} else {
|
1406
|
+
b = ox_ucs_to_utf8_chars(b, u);
|
1407
|
+
/*
|
1408
|
+
ox_sax_drive_error(dr, NO_TERM "Invalid encoding, need UTF-8 encoding to parse &#nnnn; character
|
1409
|
+
sequences."); *b++ = '&'; *b++ = '#'; if ('\0' != x) { *b++ = x;
|
1410
|
+
}
|
1411
|
+
continue;
|
1412
|
+
*/
|
1413
|
+
}
|
1414
|
+
s = end + 1;
|
1415
|
+
continue;
|
1581
1416
|
} else if (0 == strncasecmp(s, "lt;", 3)) {
|
1582
1417
|
c = '<';
|
1583
1418
|
s += 3;
|
1584
|
-
|
1419
|
+
col += 3;
|
1585
1420
|
} else if (0 == strncasecmp(s, "gt;", 3)) {
|
1586
1421
|
c = '>';
|
1587
1422
|
s += 3;
|
1588
|
-
|
1423
|
+
col += 3;
|
1589
1424
|
} else if (0 == strncasecmp(s, "amp;", 4)) {
|
1590
1425
|
c = '&';
|
1591
1426
|
s += 4;
|
1592
|
-
|
1427
|
+
col += 4;
|
1593
1428
|
} else if (0 == strncasecmp(s, "quot;", 5)) {
|
1594
1429
|
c = '"';
|
1595
1430
|
s += 5;
|
1596
|
-
|
1431
|
+
col += 5;
|
1597
1432
|
} else if (0 == strncasecmp(s, "apos;", 5)) {
|
1598
1433
|
c = '\'';
|
1599
1434
|
s += 5;
|
1600
1435
|
} else {
|
1601
|
-
|
1602
|
-
|
1603
|
-
|
1604
|
-
|
1605
|
-
|
1606
|
-
|
1607
|
-
|
1608
|
-
|
1609
|
-
|
1610
|
-
|
1611
|
-
|
1612
|
-
|
1613
|
-
|
1614
|
-
|
1615
|
-
|
1616
|
-
|
1617
|
-
|
1618
|
-
|
1619
|
-
|
1620
|
-
|
1621
|
-
|
1622
|
-
|
1436
|
+
char key[16];
|
1437
|
+
char *k = key;
|
1438
|
+
char *kend = key + sizeof(key) - 1;
|
1439
|
+
char *bn;
|
1440
|
+
char *s2 = s;
|
1441
|
+
|
1442
|
+
for (; ';' != *s2 && '\0' != *s2; s2++, k++) {
|
1443
|
+
if (kend <= k) {
|
1444
|
+
k = key;
|
1445
|
+
break;
|
1446
|
+
}
|
1447
|
+
*k = *s2;
|
1448
|
+
}
|
1449
|
+
*k = '\0';
|
1450
|
+
if ('\0' == *key || NULL == (bn = ox_entity_lookup(b, key))) {
|
1451
|
+
ox_sax_drive_error_at(dr, INVALID_FORMAT "Invalid special character sequence", pos, line, col);
|
1452
|
+
c = '&';
|
1453
|
+
} else {
|
1454
|
+
b = bn;
|
1455
|
+
s = s2 + 1;
|
1456
|
+
continue;
|
1457
|
+
}
|
1623
1458
|
}
|
1624
1459
|
*b++ = (char)c;
|
1625
|
-
|
1460
|
+
col++;
|
1626
1461
|
} else {
|
1627
|
-
|
1628
|
-
|
1629
|
-
|
1630
|
-
|
1631
|
-
|
1462
|
+
if ('\n' == *s) {
|
1463
|
+
line++;
|
1464
|
+
col = 0;
|
1465
|
+
}
|
1466
|
+
col++;
|
1632
1467
|
*b++ = *s++;
|
1633
1468
|
}
|
1634
1469
|
}
|
@@ -1637,64 +1472,43 @@ ox_sax_collapse_special(SaxDrive dr, char *str, long pos, long line, long col) {
|
|
1637
1472
|
return 0;
|
1638
1473
|
}
|
1639
1474
|
|
1640
|
-
static void
|
1641
|
-
|
1642
|
-
Nv nv;
|
1475
|
+
static void hint_clear_empty(SaxDrive dr) {
|
1476
|
+
Nv nv;
|
1643
1477
|
|
1644
1478
|
for (nv = stack_peek(&dr->stack); 0 != nv; nv = stack_peek(&dr->stack)) {
|
1645
|
-
|
1646
|
-
|
1647
|
-
|
1648
|
-
|
1649
|
-
|
1650
|
-
|
1651
|
-
|
1652
|
-
|
1653
|
-
|
1479
|
+
if (0 == nv->hint) {
|
1480
|
+
break;
|
1481
|
+
}
|
1482
|
+
if (nv->hint->empty) {
|
1483
|
+
end_element_cb(dr, nv->val, dr->buf.pos, dr->buf.line, dr->buf.col, nv->hint);
|
1484
|
+
stack_pop(&dr->stack);
|
1485
|
+
} else {
|
1486
|
+
break;
|
1487
|
+
}
|
1654
1488
|
}
|
1655
1489
|
}
|
1656
1490
|
|
1657
|
-
static Nv
|
1658
|
-
|
1659
|
-
|
1660
|
-
Nv nv;
|
1491
|
+
static Nv hint_try_close(SaxDrive dr, const char *name) {
|
1492
|
+
Hint h = ox_hint_find(dr->options.hints, name);
|
1493
|
+
Nv nv;
|
1661
1494
|
|
1662
1495
|
if (0 == h) {
|
1663
|
-
|
1496
|
+
return 0;
|
1664
1497
|
}
|
1665
1498
|
for (nv = stack_peek(&dr->stack); 0 != nv; nv = stack_peek(&dr->stack)) {
|
1666
|
-
|
1667
|
-
|
1668
|
-
|
1669
|
-
|
1670
|
-
|
1671
|
-
|
1672
|
-
|
1673
|
-
|
1674
|
-
|
1675
|
-
|
1676
|
-
|
1677
|
-
|
1678
|
-
|
1499
|
+
if (0 == strcasecmp(name, nv->name)) {
|
1500
|
+
stack_pop(&dr->stack);
|
1501
|
+
return nv;
|
1502
|
+
}
|
1503
|
+
if (0 == nv->hint) {
|
1504
|
+
break;
|
1505
|
+
}
|
1506
|
+
if (nv->hint->empty) {
|
1507
|
+
end_element_cb(dr, nv->val, dr->buf.pos, dr->buf.line, dr->buf.col, nv->hint);
|
1508
|
+
dr->stack.tail = nv;
|
1509
|
+
} else {
|
1510
|
+
break;
|
1511
|
+
}
|
1679
1512
|
}
|
1680
1513
|
return 0;
|
1681
1514
|
}
|
1682
|
-
|
1683
|
-
static void
|
1684
|
-
end_element_cb(SaxDrive dr, VALUE name, long pos, long line, long col, Hint h) {
|
1685
|
-
if (dr->has.end_element && 0 >= dr->blocked && (NULL == h || ActiveOverlay == h->overlay || NestOverlay == h->overlay)) {
|
1686
|
-
if (dr->has.pos) {
|
1687
|
-
rb_ivar_set(dr->handler, ox_at_pos_id, LONG2NUM(pos));
|
1688
|
-
}
|
1689
|
-
if (dr->has.line) {
|
1690
|
-
rb_ivar_set(dr->handler, ox_at_line_id, LONG2NUM(line));
|
1691
|
-
}
|
1692
|
-
if (dr->has.column) {
|
1693
|
-
rb_ivar_set(dr->handler, ox_at_column_id, LONG2NUM(col));
|
1694
|
-
}
|
1695
|
-
rb_funcall(dr->handler, ox_end_element_id, 1, name);
|
1696
|
-
}
|
1697
|
-
if (NULL != h && BlockOverlay == h->overlay && 0 < dr->blocked) {
|
1698
|
-
dr->blocked--;
|
1699
|
-
}
|
1700
|
-
}
|