ox-bundlecachetest 2.14.23
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +751 -0
- data/LICENSE +21 -0
- data/README.md +351 -0
- data/ext/ox/attr.h +78 -0
- data/ext/ox/base64.c +105 -0
- data/ext/ox/base64.h +18 -0
- data/ext/ox/buf.h +162 -0
- data/ext/ox/builder.c +948 -0
- data/ext/ox/cache.c +351 -0
- data/ext/ox/cache.h +21 -0
- data/ext/ox/cache8.c +106 -0
- data/ext/ox/cache8.h +23 -0
- data/ext/ox/dump.c +1260 -0
- data/ext/ox/err.c +46 -0
- data/ext/ox/err.h +36 -0
- data/ext/ox/extconf.rb +47 -0
- data/ext/ox/gen_load.c +342 -0
- data/ext/ox/hash_load.c +309 -0
- data/ext/ox/helper.h +84 -0
- data/ext/ox/intern.c +157 -0
- data/ext/ox/intern.h +25 -0
- data/ext/ox/obj_load.c +809 -0
- data/ext/ox/ox.c +1649 -0
- data/ext/ox/ox.h +245 -0
- data/ext/ox/parse.c +1197 -0
- data/ext/ox/sax.c +1570 -0
- data/ext/ox/sax.h +69 -0
- data/ext/ox/sax_as.c +270 -0
- data/ext/ox/sax_buf.c +209 -0
- data/ext/ox/sax_buf.h +204 -0
- data/ext/ox/sax_hint.c +207 -0
- data/ext/ox/sax_hint.h +40 -0
- data/ext/ox/sax_stack.h +113 -0
- data/ext/ox/slotcache.c +158 -0
- data/ext/ox/slotcache.h +19 -0
- data/ext/ox/special.c +390 -0
- data/ext/ox/special.h +14 -0
- data/ext/ox/type.h +39 -0
- data/lib/ox/bag.rb +103 -0
- data/lib/ox/cdata.rb +10 -0
- data/lib/ox/comment.rb +11 -0
- data/lib/ox/doctype.rb +11 -0
- data/lib/ox/document.rb +28 -0
- data/lib/ox/element.rb +464 -0
- data/lib/ox/error.rb +25 -0
- data/lib/ox/hasattrs.rb +54 -0
- data/lib/ox/instruct.rb +34 -0
- data/lib/ox/node.rb +23 -0
- data/lib/ox/raw.rb +12 -0
- data/lib/ox/sax.rb +97 -0
- data/lib/ox/version.rb +4 -0
- data/lib/ox/xmlrpc_adapter.rb +33 -0
- data/lib/ox.rb +79 -0
- metadata +128 -0
data/ext/ox/sax.c
ADDED
|
@@ -0,0 +1,1570 @@
|
|
|
1
|
+
/* sax.c
|
|
2
|
+
* Copyright (c) 2011, Peter Ohler
|
|
3
|
+
* All rights reserved.
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
#include <ctype.h>
|
|
7
|
+
#include <errno.h>
|
|
8
|
+
#include <stdio.h>
|
|
9
|
+
#include <stdlib.h>
|
|
10
|
+
#include <strings.h>
|
|
11
|
+
#include <sys/types.h>
|
|
12
|
+
#if HAVE_SYS_UIO_H
|
|
13
|
+
#include <sys/uio.h>
|
|
14
|
+
#endif
|
|
15
|
+
#include <time.h>
|
|
16
|
+
#include <unistd.h>
|
|
17
|
+
|
|
18
|
+
#include "intern.h"
|
|
19
|
+
#include "ox.h"
|
|
20
|
+
#include "ruby.h"
|
|
21
|
+
#include "ruby/encoding.h"
|
|
22
|
+
#include "sax.h"
|
|
23
|
+
#include "sax_buf.h"
|
|
24
|
+
#include "sax_stack.h"
|
|
25
|
+
#include "special.h"
|
|
26
|
+
|
|
27
|
+
#define NAME_MISMATCH 1

/* Document-level states used by parse(): nothing but prolog seen so far, an
 * element has been started, and the element stack has become empty again. */
#define START_STATE 1
#define BODY_STATE  2
#define AFTER_STATE 3

/* Category prefixes glued (by string literal concatenation or "%s") onto the
 * messages handed to the error callback. */
#define BAD_BOM        "Bad BOM: "
#define NO_TERM        "Not Terminated: "
#define INVALID_FORMAT "Invalid Format: "
#define CASE_ERROR     "Case Error: "
#define OUT_OF_ORDER   "Out of Order: "
#define WRONG_CHAR     "Unexpected Character: "
#define EL_MISMATCH    "Start End Mismatch: "
#define INV_ELEMENT    "Invalid Element: "

#define UTF8_STR "UTF-8"
|
|
44
|
+
|
|
45
|
+
static void sax_drive_init(SaxDrive dr, VALUE handler, VALUE io, SaxOptions options);
|
|
46
|
+
static void parse(SaxDrive dr);
|
|
47
|
+
// All read functions should return the next character after the 'thing' that was read and leave dr->cur one after that.
|
|
48
|
+
static char read_instruction(SaxDrive dr);
|
|
49
|
+
static char read_doctype(SaxDrive dr);
|
|
50
|
+
static char read_cdata(SaxDrive dr);
|
|
51
|
+
static char read_comment(SaxDrive dr);
|
|
52
|
+
static char read_element_start(SaxDrive dr);
|
|
53
|
+
static char read_element_end(SaxDrive dr);
|
|
54
|
+
static char read_text(SaxDrive dr);
|
|
55
|
+
static char read_jump(SaxDrive dr, const char *pat);
|
|
56
|
+
static char read_attrs(SaxDrive dr, char c, char termc, char term2, int is_xml, int eq_req, Hint h);
|
|
57
|
+
static char read_name_token(SaxDrive dr);
|
|
58
|
+
static char read_quoted_value(SaxDrive dr, bool inst);
|
|
59
|
+
|
|
60
|
+
static void hint_clear_empty(SaxDrive dr);
|
|
61
|
+
static Nv hint_try_close(SaxDrive dr, const char *name);
|
|
62
|
+
|
|
63
|
+
VALUE ox_sax_value_class = Qnil;
|
|
64
|
+
|
|
65
|
+
const rb_data_type_t ox_sax_value_type = {
|
|
66
|
+
"Ox/Sax/Value",
|
|
67
|
+
{
|
|
68
|
+
NULL,
|
|
69
|
+
NULL,
|
|
70
|
+
NULL,
|
|
71
|
+
},
|
|
72
|
+
0,
|
|
73
|
+
0,
|
|
74
|
+
};
|
|
75
|
+
|
|
76
|
+
/* Trampoline handed to rb_protect(): the drive pointer travels disguised as a
 * VALUE, is cast back here, and parse() is run on it. Always returns Qnil. */
static VALUE protect_parse(VALUE drp) {
    SaxDrive drive = (SaxDrive)drp;

    parse(drive);

    return Qnil;
}
|
|
81
|
+
|
|
82
|
+
VALUE
|
|
83
|
+
str2sym(SaxDrive dr, const char *str, size_t len, const char **strp) {
|
|
84
|
+
VALUE sym;
|
|
85
|
+
|
|
86
|
+
if (dr->options.symbolize) {
|
|
87
|
+
sym = ox_sym_intern(str, len, strp);
|
|
88
|
+
} else {
|
|
89
|
+
sym = dr->get_name(str, len, dr->encoding, strp);
|
|
90
|
+
}
|
|
91
|
+
return sym;
|
|
92
|
+
}
|
|
93
|
+
|
|
94
|
+
/* Entry point for a SAX parse of io, reporting to handler.
 *
 * A drive is built on the stack, the parse is run under rb_protect() so the
 * drive is always cleaned up, and only then is any pending Ruby jump
 * (exception, throw, ...) resumed with rb_jump_tag().
 */
void ox_sax_parse(VALUE handler, VALUE io, SaxOptions options) {
#if HAVE_RB_EXT_RACTOR_SAFE
    rb_ext_ractor_safe(true);
#endif
    struct _saxDrive drive;
    int              status = 0;

    sax_drive_init(&drive, handler, io, options);
    rb_protect(protect_parse, (VALUE)&drive, &status);
    ox_sax_drive_cleanup(&drive);
    if (0 == status) {
        return;
    }
    /* parse() was left through a non-local exit; continue it now. */
    rb_jump_tag(status);
}
|
|
108
|
+
|
|
109
|
+
/* Location setter installed when the handler lacks the matching instance
 * variable: deliberately does nothing. */
static void set_long_noop(VALUE handler, long pos) {
    (void)handler;
    (void)pos;
}
|
|
111
|
+
|
|
112
|
+
/* Stores pos in the handler instance variable identified by ox_at_pos_id. */
static void set_pos(VALUE handler, long pos) {
    VALUE boxed = LONG2NUM(pos);

    rb_ivar_set(handler, ox_at_pos_id, boxed);
}
|
|
115
|
+
|
|
116
|
+
/* Stores line in the handler instance variable identified by ox_at_line_id. */
static void set_line(VALUE handler, long line) {
    VALUE boxed = LONG2NUM(line);

    rb_ivar_set(handler, ox_at_line_id, boxed);
}
|
|
119
|
+
|
|
120
|
+
/* Stores col in the handler instance variable identified by ox_at_column_id. */
static void set_col(VALUE handler, long col) {
    VALUE boxed = LONG2NUM(col);

    rb_ivar_set(handler, ox_at_column_id, boxed);
}
|
|
123
|
+
|
|
124
|
+
/* Attribute callback used when the handler implements neither attribute
 * method: deliberately does nothing. */
static void attr_noop(SaxDrive dr, VALUE name, char *value, long pos, long line, long col) {
    (void)dr;
    (void)name;
    (void)value;
    (void)pos;
    (void)line;
    (void)col;
}
|
|
126
|
+
|
|
127
|
+
/* Attribute callback that delivers the value as a Ruby String.
 *
 * When convert_special is enabled and the value is not empty, the value is
 * first run through ox_sax_collapse_special() (value is passed as a mutable
 * buffer). The String is tagged with the drive encoding when one is known,
 * the location setters are updated, and the handler method identified by
 * ox_attr_id is called with (name, string).
 */
static void attr_text(SaxDrive dr, VALUE name, char *value, long pos, long line, long col) {
    VALUE args[2];

    if ('\0' != value[0] && dr->options.convert_special) {
        ox_sax_collapse_special(dr, value, pos, line, col);
    }
    args[0] = name;
    args[1] = rb_str_new2(value);
    if (0 != dr->encoding) {
        rb_enc_associate(args[1], dr->encoding);
    }
    dr->set_pos(dr->handler, pos);
    dr->set_line(dr->handler, line);
    dr->set_col(dr->handler, col);
    rb_funcall2(dr->handler, ox_attr_id, 2, args);
}
|
|
143
|
+
|
|
144
|
+
/* Attribute callback that delivers the drive's value object instead of a
 * String: updates the location setters, then calls the handler method
 * identified by ox_attr_value_id with (name, dr->value_obj). The raw value
 * text is not used here. */
static void attr_value(SaxDrive dr, VALUE name, char *value, long pos, long line, long col) {
    VALUE args[2] = {name, dr->value_obj};

    (void)value;
    dr->set_pos(dr->handler, pos);
    dr->set_line(dr->handler, line);
    dr->set_col(dr->handler, col);
    rb_funcall2(dr->handler, ox_attr_value_id, 2, args);
}
|
|
154
|
+
|
|
155
|
+
/* attrs_done callback used when the handler does not respond to the
 * attrs_done method: deliberately does nothing. */
static void attrs_done_noop(VALUE handler) {
    (void)handler;
}
|
|
157
|
+
|
|
158
|
+
/* Tells the handler, through the method identified by ox_attrs_done_id and
 * with no arguments, that the attribute list has been fully read. */
static void attrs_done(VALUE handler) {
    rb_funcall(handler, ox_attrs_done_id, 0);
}
|
|
161
|
+
|
|
162
|
+
/* Instruction callback used when the handler is not interested in processing
 * instructions: does nothing and yields Qnil as the target. */
static VALUE instruct_noop(SaxDrive dr, const char *target, long pos, long line, long col) {
    (void)dr;
    (void)target;
    (void)pos;
    (void)line;
    (void)col;

    return Qnil;
}
|
|
165
|
+
|
|
166
|
+
/* Reports the start of a processing instruction.
 *
 * Builds a Ruby String from target, updates the location setters, and calls
 * the handler method identified by ox_instruct_id with it. The same String is
 * returned so the caller can hand it to the end_instruct callback later.
 */
static VALUE instruct(SaxDrive dr, const char *target, long pos, long line, long col) {
    VALUE name = rb_str_new2(target);

    dr->set_pos(dr->handler, pos);
    dr->set_line(dr->handler, line);
    dr->set_col(dr->handler, col);
    rb_funcall(dr->handler, ox_instruct_id, 1, name);

    return name;
}
|
|
176
|
+
|
|
177
|
+
/* Instruction callback for handlers that implement end_instruct but not
 * instruct: no handler call is made, the target is merely turned into a Ruby
 * String so it can be passed to end_instruct later. */
static VALUE instruct_just_value(SaxDrive dr, const char *target, long pos, long line, long col) {
    (void)dr;
    (void)pos;
    (void)line;
    (void)col;

    return rb_str_new2(target);
}
|
|
180
|
+
|
|
181
|
+
/* end_instruct callback used when the handler does not implement
 * end_instruct: deliberately does nothing. */
static void end_instruct_noop(SaxDrive dr, VALUE target, long pos, long line, long col) {
    (void)dr;
    (void)target;
    (void)pos;
    (void)line;
    (void)col;
}
|
|
183
|
+
|
|
184
|
+
/* Reports the end of a processing instruction: updates the location setters
 * and calls the handler method identified by ox_end_instruct_id with the
 * target value produced by the instruct callback. */
static void end_instruct(SaxDrive dr, VALUE target, long pos, long line, long col) {
    VALUE handler = dr->handler;

    dr->set_pos(handler, pos);
    dr->set_line(handler, line);
    dr->set_col(handler, col);
    rb_funcall(handler, ox_end_instruct_id, 1, target);
}
|
|
190
|
+
|
|
191
|
+
/* Stand-in for the doctype, comment and cdata callbacks when the handler does
 * not implement the corresponding method: deliberately does nothing. */
static void dr_loc_noop(SaxDrive dr, long pos, long line, long col) {
    (void)dr;
    (void)pos;
    (void)line;
    (void)col;
}
|
|
193
|
+
|
|
194
|
+
/* Reports a comment, whose text is in dr->buf.str, to the handler.
 *
 * Nothing is reported while the drive is blocked. Otherwise the comment is
 * suppressed only when the enclosing element has a hint whose overlay is
 * OffOverlay and the "!--" hint is missing or not ActiveOverlay. The String
 * is tagged with the drive encoding when one is known and passed to the
 * handler method identified by ox_comment_id after the location setters have
 * been updated.
 */
static void comment(SaxDrive dr, long pos, long line, long col) {
    Nv    parent;
    Hint  h;
    VALUE arg;

    if (dr->blocked) {
        return;
    }
    parent = stack_peek(&dr->stack);
    h      = ox_hint_find(dr->options.hints, "!--");

    /* NOTE(review): the original condition tested ActiveOverlay == h->overlay
     * twice. The duplicate is dropped here without changing behavior; if the
     * second operand was meant to be a different overlay (NestOverlay is what
     * end_element_cb pairs with ActiveOverlay) that needs confirming. */
    if (NULL != parent && NULL != parent->hint && OffOverlay == parent->hint->overlay &&
        (NULL == h || ActiveOverlay != h->overlay)) {
        return;
    }
    arg = rb_str_new2(dr->buf.str);
    if (0 != dr->encoding) {
        rb_enc_associate(arg, dr->encoding);
    }
    dr->set_pos(dr->handler, pos);
    dr->set_line(dr->handler, line);
    dr->set_col(dr->handler, col);
    rb_funcall(dr->handler, ox_comment_id, 1, arg);
}
|
|
213
|
+
|
|
214
|
+
/* Reports a CDATA section, whose text is in dr->buf.str, to the handler.
 *
 * The callback is skipped while the drive is blocked or when the enclosing
 * element carries a hint whose overlay is OffOverlay. The String is tagged
 * with the drive encoding when one is known and passed to the handler method
 * identified by ox_cdata_id after the location setters have been updated.
 */
static void cdata(SaxDrive dr, long pos, long line, long col) {
    Nv    parent = stack_peek(&dr->stack);
    VALUE arg;

    if (dr->blocked) {
        return;
    }
    if (NULL != parent && NULL != parent->hint && OffOverlay == parent->hint->overlay) {
        return;
    }
    arg = rb_str_new2(dr->buf.str);
    if (0 != dr->encoding) {
        rb_enc_associate(arg, dr->encoding);
    }
    dr->set_pos(dr->handler, pos);
    dr->set_line(dr->handler, line);
    dr->set_col(dr->handler, col);
    rb_funcall(dr->handler, ox_cdata_id, 1, arg);
}
|
|
229
|
+
|
|
230
|
+
/* Reports a DOCTYPE declaration, whose text is in dr->buf.str, to the handler
 * method identified by ox_doctype_id after updating the location setters.
 *
 * The String is tagged with the drive encoding when one is known, the same
 * way the comment, cdata and attribute callbacks tag theirs; previously the
 * doctype text was the one value delivered without an encoding.
 */
static void doctype(SaxDrive dr, long pos, long line, long col) {
    VALUE arg = rb_str_new2(dr->buf.str);

    if (0 != dr->encoding) {
        rb_enc_associate(arg, dr->encoding);
    }
    dr->set_pos(dr->handler, pos);
    dr->set_line(dr->handler, line);
    dr->set_col(dr->handler, col);
    rb_funcall(dr->handler, ox_doctype_id, 1, arg);
}
|
|
236
|
+
|
|
237
|
+
/* Error callback used when the handler does not implement the error method:
 * the problem is silently ignored. */
static void error_noop(SaxDrive dr, const char *msg, long pos, long line, long col) {
    (void)dr;
    (void)msg;
    (void)pos;
    (void)line;
    (void)col;
}
|
|
239
|
+
|
|
240
|
+
/* Reports a parse problem to the handler.
 *
 * The handler method identified by ox_error_id receives (message, line,
 * column); pos is only published through the location setters, which are
 * updated before the call.
 */
static void error(SaxDrive dr, const char *msg, long pos, long line, long col) {
    VALUE args[3];

    args[0] = rb_str_new2(msg);
    args[1] = LONG2NUM(line);
    args[2] = LONG2NUM(col);

    dr->set_pos(dr->handler, pos);
    dr->set_line(dr->handler, line);
    dr->set_col(dr->handler, col);
    rb_funcall2(dr->handler, ox_error_id, 3, args);
}
|
|
251
|
+
|
|
252
|
+
/* Handles the close of an element.
 *
 * The handler method identified by ox_end_element_id is called with name when
 * the handler implements it, the drive is not blocked, and the element either
 * has no hint or a hint with an ActiveOverlay or NestOverlay overlay.
 * Independently of that, closing an element whose hint overlay is
 * BlockOverlay decrements the blocked counter, never taking it below zero.
 */
static void end_element_cb(SaxDrive dr, VALUE name, long pos, long line, long col, Hint h) {
    int visible = (NULL == h || ActiveOverlay == h->overlay || NestOverlay == h->overlay);

    if (dr->has_end_element && dr->blocked <= 0 && visible) {
        dr->set_pos(dr->handler, pos);
        dr->set_line(dr->handler, line);
        dr->set_col(dr->handler, col);
        rb_funcall(dr->handler, ox_end_element_id, 1, name);
    }
    if (NULL == h || BlockOverlay != h->overlay) {
        return;
    }
    if (0 < dr->blocked) {
        dr->blocked--;
    }
}
|
|
264
|
+
|
|
265
|
+
/* Prepares dr for a parse of io that reports to handler.
 *
 * Sets up the read buffer and element stack, wraps the drive in a value
 * object (registered with the GC until ox_sax_drive_cleanup()), copies the
 * options, and then picks every callback once, up front, according to which
 * instance variables and methods the handler has. Finally the encoding is
 * chosen: the default options encoding when it is set, otherwise the io's
 * external encoding when it reports one, otherwise none.
 */
static void sax_drive_init(SaxDrive dr, VALUE handler, VALUE io, SaxOptions options) {
    int is_utf8;

    ox_sax_buf_init(&dr->buf, io);
    dr->buf.dr = dr;
    stack_init(&dr->stack);
    dr->handler   = handler;
    dr->value_obj = TypedData_Wrap_Struct(ox_sax_value_class, &ox_sax_value_type, dr);
    rb_gc_register_address(&dr->value_obj);
    dr->options = *options;
    dr->err     = 0;
    dr->blocked = 0;
    dr->abort   = false;

    /* Location setters are live only if the handler already has the ivar. */
    dr->set_pos  = (Qtrue == rb_ivar_defined(handler, ox_at_pos_id)) ? set_pos : set_long_noop;
    dr->set_line = (Qtrue == rb_ivar_defined(handler, ox_at_line_id)) ? set_line : set_long_noop;
    dr->set_col  = (Qtrue == rb_ivar_defined(handler, ox_at_column_id)) ? set_col : set_long_noop;

    /* The attr_value method wins over the attr method when both exist. */
    dr->attr_cb        = attr_noop;
    dr->want_attr_name = false;
    if (rb_respond_to(handler, ox_attr_value_id)) {
        dr->attr_cb        = attr_value;
        dr->want_attr_name = true;
    } else if (rb_respond_to(handler, ox_attr_id)) {
        dr->attr_cb        = attr_text;
        dr->want_attr_name = true;
    }
    dr->attrs_done   = rb_respond_to(handler, ox_attrs_done_id) ? attrs_done : attrs_done_noop;
    dr->instruct     = rb_respond_to(handler, ox_instruct_id) ? instruct : instruct_noop;
    dr->end_instruct = rb_respond_to(handler, ox_end_instruct_id) ? end_instruct : end_instruct_noop;
    /* end_instruct still needs the target String even without instruct. */
    if (rb_respond_to(handler, ox_end_instruct_id) && !rb_respond_to(handler, ox_instruct_id)) {
        dr->instruct = instruct_just_value;
    }
    dr->doctype = rb_respond_to(handler, ox_doctype_id) ? doctype : dr_loc_noop;
    dr->comment = rb_respond_to(handler, ox_comment_id) ? comment : dr_loc_noop;
    dr->cdata   = rb_respond_to(handler, ox_cdata_id) ? cdata : dr_loc_noop;
    dr->error   = rb_respond_to(handler, ox_error_id) ? error : error_noop;

    dr->has_text          = rb_respond_to(handler, ox_text_id);
    dr->has_value         = rb_respond_to(handler, ox_value_id);
    dr->has_start_element = rb_respond_to(handler, ox_start_element_id);
    dr->has_end_element   = rb_respond_to(handler, ox_end_element_id);

    if ('\0' != *ox_default_options.encoding) {
        dr->encoding = rb_enc_find(ox_default_options.encoding);
    } else {
        VALUE ext;

        dr->encoding = 0;
        if (rb_respond_to(io, ox_external_encoding_id) &&
            Qnil != (ext = rb_funcall(io, ox_external_encoding_id, 0))) {
            int idx = rb_enc_get_index(ext);

            if (0 <= idx) {
                dr->encoding = rb_enc_from_index(idx);
            }
        }
    }
    /* No encoding at all is treated like UTF-8. */
    is_utf8  = (NULL == dr->encoding || rb_utf8_encoding() == dr->encoding);
    dr->utf8 = is_utf8;
    if (is_utf8) {
        dr->get_name = dr->options.symbolize ? ox_utf8_sym : ox_utf8_name;  // TBD UTF8 sym?
    } else {
        dr->get_name = dr->options.symbolize ? ox_enc_sym : ox_enc_name;
    }
}
|
|
327
|
+
|
|
328
|
+
/* Undoes sax_drive_init(): unregisters the value object address from the GC,
 * then releases the read buffer and the element stack. */
void ox_sax_drive_cleanup(SaxDrive dr) {
    VALUE *registered = &dr->value_obj;

    rb_gc_unregister_address(registered);
    buf_cleanup(&dr->buf);
    stack_cleanup(&dr->stack);
}
|
|
333
|
+
|
|
334
|
+
/* Reports msg through the drive's error callback at an explicit location.
 * The off_t values are converted to the callback's parameter types on the
 * call. */
static void ox_sax_drive_error_at(SaxDrive dr, const char *msg, off_t pos, off_t line, off_t col) {
    (dr->error)(dr, msg, pos, line, col);
}
|
|
337
|
+
|
|
338
|
+
/* Reports msg through the drive's error callback at the buffer's current
 * position, line and column. */
void ox_sax_drive_error(SaxDrive dr, const char *msg) {
    off_t pos  = dr->buf.pos;
    off_t line = dr->buf.line;
    off_t col  = dr->buf.col;

    ox_sax_drive_error_at(dr, msg, pos, line, col);
}
|
|
341
|
+
|
|
342
|
+
/* Reads the first character of the input and steps over a UTF-8 byte order
 * mark (EF BB BF) if one is there.
 *
 * Returns the first character after the BOM (the drive encoding is switched
 * to ox_utf8_encoding in that case), the first character itself when it does
 * not start a BOM, or '\0' after reporting an error when 0xEF is not followed
 * by the rest of the mark.
 */
static char skipBOM(SaxDrive dr) {
    char first = buf_get(&dr->buf);

    if (0xEF != (uint8_t)first) {
        return first;
    }
    /* only UTF8 is supported */
    if (0xBB == (uint8_t)buf_get(&dr->buf) && 0xBF == (uint8_t)buf_get(&dr->buf)) {
        dr->encoding = ox_utf8_encoding;
        return buf_get(&dr->buf);
    }
    ox_sax_drive_error(dr, BAD_BOM "invalid BOM or a binary file.");

    return '\0';
}
|
|
356
|
+
|
|
357
|
+
/* Main loop of the SAX parser.
 *
 * After an optional BOM, every construct is dispatched on the character(s)
 * that follow a '<': "?" instruction, "!--" comment, "!DOCTYPE", "![CDATA[",
 * "/" element end, anything else an element start. Input that does not begin
 * with '<' is read as text. Each reader returns the next character to look
 * at; '\0' ends the loop.
 *
 * state records whether an element has been seen yet (START_STATE), is open
 * (BODY_STATE) or the element stack has emptied again (AFTER_STATE); it is
 * only used to report a DOCTYPE after an element and multiple top level
 * elements. When the input ends without an abort, every element still on the
 * stack is reported as not closed and gets an end element callback.
 */
static void parse(SaxDrive dr) {
    char c     = skipBOM(dr);
    int  state = START_STATE;

    while ('\0' != c) {
        buf_protect(&dr->buf);
        if ('<' != c) {
            buf_reset(&dr->buf);
            c = read_text(dr);
            continue;
        }
        c = buf_get(&dr->buf);
        switch (c) {
        case '?': /* instructions (xml or otherwise) */
            c = read_instruction(dr);
            break;
        case '!': /* comment, doctype or cdata */
            buf_protect(&dr->buf);
            c = buf_get(&dr->buf);
            if ('\0' == c) {
                ox_sax_drive_error(dr, NO_TERM "DOCTYPE or comment not terminated");

                goto DONE;
            }
            if ('-' == c) {
                c = buf_get(&dr->buf); /* skip first - and get next character */
                if ('-' == c) {
                    c = buf_get(&dr->buf); /* skip second - */
                } else {
                    ox_sax_drive_error(dr, INVALID_FORMAT "bad comment format, expected <!--");
                }
                /* a malformed opener is reported but still read as a comment */
                c = read_comment(dr);
            } else {
                int   n;
                int   spaced = 0;
                off_t pos    = dr->buf.pos + 1;
                off_t line   = dr->buf.line;
                off_t col    = dr->buf.col + 1;

                if (is_white(c)) {
                    spaced = 1;
                    c      = buf_next_non_white(&dr->buf);
                }
                /* The keyword starts at the character just read; pull in 7
                 * more characters so the 7 character keyword can be compared. */
                dr->buf.str = dr->buf.tail - 1;
                for (n = 0; n < 7; n++) {
                    c = buf_get(&dr->buf);
                }
                if (0 == strncasecmp("DOCTYPE", dr->buf.str, 7)) {
                    if (0 == strncmp("DOCTYPE", dr->buf.str, 7)) {
                        if (spaced) {
                            ox_sax_drive_error_at(dr, WRONG_CHAR "<!DOCTYPE can not included spaces", pos, line, col);
                        }
                    } else if (!dr->options.smart) {
                        ox_sax_drive_error(dr, CASE_ERROR "expected DOCTYPE all in caps");
                    }
                    if (START_STATE != state) {
                        ox_sax_drive_error(dr, OUT_OF_ORDER "DOCTYPE can not come after an element");
                    }
                    c = read_doctype(dr);
                } else if (0 == strncasecmp("[CDATA[", dr->buf.str, 7)) {
                    if (0 == strncmp("[CDATA[", dr->buf.str, 7)) {
                        if (spaced) {
                            ox_sax_drive_error_at(dr, WRONG_CHAR "<![CDATA[ can not included spaces", pos, line, col);
                        }
                    } else if (!dr->options.smart) {
                        ox_sax_drive_error(dr, CASE_ERROR "expected CDATA all in caps");
                    }
                    c = read_cdata(dr);
                } else {
                    Nv top = stack_peek(&dr->stack);

                    if (0 != top) {
                        top->childCnt++;
                    }
                    ox_sax_drive_error_at(dr, WRONG_CHAR "DOCTYPE, CDATA, or comment expected", pos, line, col);
                    c = read_name_token(dr);
                    if ('>' == c) {
                        c = buf_get(&dr->buf);
                    }
                }
            }
            break;
        case '/': { /* element end */
            Nv parent = stack_peek(&dr->stack);

            /* An element closed without any children still gets one (empty)
             * text callback. */
            if (0 != parent && 0 == parent->childCnt && dr->has_text && !dr->blocked) {
                VALUE args[1];

                args[0] = rb_str_new2("");
                if (0 != dr->encoding) {
                    rb_enc_associate(args[0], dr->encoding);
                }
                dr->set_pos(dr->handler, dr->buf.pos);
                dr->set_line(dr->handler, dr->buf.line);
                dr->set_col(dr->handler, dr->buf.col);
                rb_funcall2(dr->handler, ox_text_id, 1, args);
            }
            c = read_element_end(dr);
            if (0 == stack_peek(&dr->stack)) {
                state = AFTER_STATE;
            }
            break;
        }
        case '\0': goto DONE;
        default: /* element start */
            buf_backup(&dr->buf);
            if (AFTER_STATE == state) {
                ox_sax_drive_error(dr, OUT_OF_ORDER "multiple top level elements");
            }
            state = BODY_STATE;
            c     = read_element_start(dr);
            if (0 == stack_peek(&dr->stack)) {
                state = AFTER_STATE;
            }
            break;
        }
    }
DONE:
    if (dr->abort) {
        return;
    }
    if (dr->stack.head < dr->stack.tail) {
        char msg[256];
        Nv   open;

        /* Walk the stack from the innermost open element outwards. */
        for (open = dr->stack.tail - 1; dr->stack.head <= open; open--) {
            snprintf(msg, sizeof(msg) - 1, "%selement '%s' not closed", EL_MISMATCH, nv_name(open));
            ox_sax_drive_error_at(dr, msg, dr->buf.pos, dr->buf.line, dr->buf.col);
            end_element_cb(dr, open->val, dr->buf.pos, dr->buf.line, dr->buf.col, open->hint);
        }
    }
}
|
|
489
|
+
|
|
490
|
+
/* Copies the body of a processing instruction into content, up to but not
 * including the closing "?>", and NUL terminates it.
 *
 * content must have room for len characters plus the terminator. When more
 * than len characters precede the "?>" an error is reported and the copy
 * stops with what fits. When the input ends first, the text collected so far
 * is kept; a NO_TERM error is reported only if the input ends right after a
 * '?'.
 *
 * A '?' that is not followed by '>' is part of the content and is copied; the
 * character after it is then examined again, so "??>" terminates correctly.
 * (Previously such a '?' was dropped, the character after it was stored
 * without being checked, and a NUL could be stored before reading on.)
 */
static void read_content(SaxDrive dr, char *content, size_t len) {
    char *end = content + len;
    char  c   = buf_get(&dr->buf);

    while ('\0' != c) {
        if (end <= content) {
            *content = '\0';
            ox_sax_drive_error(dr, "processing instruction content too large");
            return;
        }
        if ('?' != c) {
            *content++ = c;
            c          = buf_get(&dr->buf);
            continue;
        }
        c = buf_get(&dr->buf);
        if ('>' == c) {
            *content = '\0';
            return;
        }
        /* The '?' was ordinary content. There is room for it: the bounds
         * check above ran for this iteration and nothing has been stored
         * since. c is left for the next iteration to inspect. */
        *content++ = '?';
        if ('\0' == c) {
            ox_sax_drive_error(dr, NO_TERM "document not terminated");
        }
    }
    *content = '\0';
}
|
|
516
|
+
|
|
517
|
+
/* Entered after the "<?" sequence. Ready to read the rest.
|
|
518
|
+
*/
|
|
519
|
+
static char read_instruction(SaxDrive dr) {
|
|
520
|
+
char content[4096];
|
|
521
|
+
char c;
|
|
522
|
+
int coff;
|
|
523
|
+
VALUE target = Qnil;
|
|
524
|
+
int is_xml;
|
|
525
|
+
off_t pos = dr->buf.pos - 1;
|
|
526
|
+
off_t line = dr->buf.line;
|
|
527
|
+
off_t col = dr->buf.col - 1;
|
|
528
|
+
|
|
529
|
+
buf_protect(&dr->buf);
|
|
530
|
+
if ('\0' == (c = read_name_token(dr))) {
|
|
531
|
+
return c;
|
|
532
|
+
}
|
|
533
|
+
is_xml = (0 == (dr->options.smart ? strcasecmp("xml", dr->buf.str) : strcmp("xml", dr->buf.str)));
|
|
534
|
+
|
|
535
|
+
target = dr->instruct(dr, dr->buf.str, pos, line, col);
|
|
536
|
+
buf_protect(&dr->buf);
|
|
537
|
+
pos = dr->buf.pos;
|
|
538
|
+
line = dr->buf.line;
|
|
539
|
+
col = dr->buf.col;
|
|
540
|
+
read_content(dr, content, sizeof(content) - 1);
|
|
541
|
+
coff = (int)(dr->buf.tail - dr->buf.head);
|
|
542
|
+
buf_reset(&dr->buf);
|
|
543
|
+
dr->err = 0;
|
|
544
|
+
c = read_attrs(dr, c, '?', '?', is_xml, 1, NULL);
|
|
545
|
+
dr->attrs_done(dr->handler);
|
|
546
|
+
if (dr->err) {
|
|
547
|
+
if (dr->has_text) {
|
|
548
|
+
VALUE args[1];
|
|
549
|
+
|
|
550
|
+
if (dr->options.convert_special) {
|
|
551
|
+
ox_sax_collapse_special(dr, content, (int)pos, (int)line, (int)col);
|
|
552
|
+
}
|
|
553
|
+
args[0] = rb_str_new2(content);
|
|
554
|
+
if (0 != dr->encoding) {
|
|
555
|
+
rb_enc_associate(args[0], dr->encoding);
|
|
556
|
+
}
|
|
557
|
+
dr->set_pos(dr->handler, pos);
|
|
558
|
+
dr->set_line(dr->handler, line);
|
|
559
|
+
dr->set_col(dr->handler, col);
|
|
560
|
+
rb_funcall2(dr->handler, ox_text_id, 1, args);
|
|
561
|
+
}
|
|
562
|
+
dr->buf.tail = dr->buf.head + coff;
|
|
563
|
+
c = buf_get(&dr->buf);
|
|
564
|
+
} else {
|
|
565
|
+
pos = dr->buf.pos;
|
|
566
|
+
line = dr->buf.line;
|
|
567
|
+
col = dr->buf.col;
|
|
568
|
+
c = buf_next_non_white(&dr->buf);
|
|
569
|
+
if ('>' == c) {
|
|
570
|
+
c = buf_get(&dr->buf);
|
|
571
|
+
} else {
|
|
572
|
+
ox_sax_drive_error_at(dr, NO_TERM "instruction not terminated", pos, line, col);
|
|
573
|
+
if ('>' == c) {
|
|
574
|
+
c = buf_get(&dr->buf);
|
|
575
|
+
}
|
|
576
|
+
}
|
|
577
|
+
}
|
|
578
|
+
dr->end_instruct(dr, target, pos, line, col);
|
|
579
|
+
dr->buf.str = NULL;
|
|
580
|
+
|
|
581
|
+
return c;
|
|
582
|
+
}
|
|
583
|
+
|
|
584
|
+
/* Consumes input up to and including the character end and returns it, or
 * reports an error and returns '\0' if the input ends first.
 *
 * When end is a quote character everything up to the matching quote is
 * skipped blindly. For any other terminator, quoted strings and nested
 * "[...]" and "<...>" groups met on the way are skipped recursively so a
 * terminator inside them is not mistaken for the real one.
 */
static char read_delimited(SaxDrive dr, char end) {
    char c;

    if ('"' == end || '\'' == end) {
        for (;;) {
            c = buf_get(&dr->buf);
            if (end == c) {
                return c;
            }
            if ('\0' == c) {
                ox_sax_drive_error(dr, NO_TERM "doctype not terminated");
                return c;
            }
        }
    }
    for (;;) {
        c = buf_get(&dr->buf);
        if (end == c) {
            return c;
        }
        if ('\0' == c) {
            ox_sax_drive_error(dr, NO_TERM "doctype not terminated");
            return c;
        }
        if ('"' == c || '\'' == c) {
            c = read_delimited(dr, c);
        } else if ('[' == c) {
            c = read_delimited(dr, ']');
        } else if ('<' == c) {
            c = read_delimited(dr, '>');
        }
    }
}
|
|
612
|
+
|
|
613
|
+
/* Entered after the "<!DOCTYPE " sequence. Ready to read the rest.
|
|
614
|
+
*/
|
|
615
|
+
static char read_doctype(SaxDrive dr) {
|
|
616
|
+
long pos = (long)(dr->buf.pos - 9);
|
|
617
|
+
long line = (long)(dr->buf.line);
|
|
618
|
+
long col = (long)(dr->buf.col - 9);
|
|
619
|
+
char *s;
|
|
620
|
+
Nv parent = stack_peek(&dr->stack);
|
|
621
|
+
|
|
622
|
+
buf_backup(&dr->buf); /* back up to the start in case the doctype is empty */
|
|
623
|
+
buf_protect(&dr->buf);
|
|
624
|
+
read_delimited(dr, '>');
|
|
625
|
+
if (dr->options.smart && 0 == dr->options.hints) {
|
|
626
|
+
for (s = dr->buf.str; is_white(*s); s++) {
|
|
627
|
+
}
|
|
628
|
+
if (0 == strncasecmp("HTML", s, 4)) {
|
|
629
|
+
dr->options.hints = ox_hints_html();
|
|
630
|
+
}
|
|
631
|
+
}
|
|
632
|
+
*(dr->buf.tail - 1) = '\0';
|
|
633
|
+
if (0 != parent) {
|
|
634
|
+
parent->childCnt++;
|
|
635
|
+
}
|
|
636
|
+
dr->doctype(dr, pos, line, col);
|
|
637
|
+
dr->buf.str = 0;
|
|
638
|
+
|
|
639
|
+
return buf_get(&dr->buf);
|
|
640
|
+
}
|
|
641
|
+
|
|
642
|
+
/* Entered after the "<![CDATA[" sequence. Ready to read the rest.
|
|
643
|
+
*/
|
|
644
|
+
static char read_cdata(SaxDrive dr) {
|
|
645
|
+
char c;
|
|
646
|
+
char zero = '\0';
|
|
647
|
+
int end = 0;
|
|
648
|
+
long pos = (long)(dr->buf.pos - 9);
|
|
649
|
+
long line = (long)(dr->buf.line);
|
|
650
|
+
long col = (long)(dr->buf.col - 9);
|
|
651
|
+
struct _checkPt cp = CHECK_PT_INIT;
|
|
652
|
+
Nv parent = stack_peek(&dr->stack);
|
|
653
|
+
|
|
654
|
+
// TBD check parent overlay
|
|
655
|
+
if (0 != parent) {
|
|
656
|
+
parent->childCnt++;
|
|
657
|
+
}
|
|
658
|
+
buf_backup(&dr->buf); /* back up to the start in case the cdata is empty */
|
|
659
|
+
buf_protect(&dr->buf);
|
|
660
|
+
while (1) {
|
|
661
|
+
c = buf_get(&dr->buf);
|
|
662
|
+
switch (c) {
|
|
663
|
+
case ']': end++; break;
|
|
664
|
+
case '>':
|
|
665
|
+
if (2 <= end) {
|
|
666
|
+
*(dr->buf.tail - 3) = '\0';
|
|
667
|
+
c = buf_get(&dr->buf);
|
|
668
|
+
goto CB;
|
|
669
|
+
}
|
|
670
|
+
if (!buf_checkset(&cp)) {
|
|
671
|
+
buf_checkpoint(&dr->buf, &cp);
|
|
672
|
+
}
|
|
673
|
+
end = 0;
|
|
674
|
+
break;
|
|
675
|
+
case '<':
|
|
676
|
+
if (!buf_checkset(&cp)) {
|
|
677
|
+
buf_checkpoint(&dr->buf, &cp);
|
|
678
|
+
}
|
|
679
|
+
end = 0;
|
|
680
|
+
break;
|
|
681
|
+
case '\0':
|
|
682
|
+
if (buf_checkset(&cp)) {
|
|
683
|
+
c = buf_checkback(&dr->buf, &cp);
|
|
684
|
+
ox_sax_drive_error(dr, NO_TERM "CDATA not terminated");
|
|
685
|
+
zero = c;
|
|
686
|
+
*(dr->buf.tail - 1) = '\0';
|
|
687
|
+
goto CB;
|
|
688
|
+
}
|
|
689
|
+
ox_sax_drive_error(dr, NO_TERM "CDATA not terminated");
|
|
690
|
+
return '\0';
|
|
691
|
+
default:
|
|
692
|
+
if (1 < end && !buf_checkset(&cp)) {
|
|
693
|
+
buf_checkpoint(&dr->buf, &cp);
|
|
694
|
+
}
|
|
695
|
+
end = 0;
|
|
696
|
+
break;
|
|
697
|
+
}
|
|
698
|
+
}
|
|
699
|
+
CB:
|
|
700
|
+
dr->cdata(dr, pos, line, col);
|
|
701
|
+
if ('\0' != zero) {
|
|
702
|
+
*(dr->buf.tail - 1) = zero;
|
|
703
|
+
}
|
|
704
|
+
dr->buf.str = 0;
|
|
705
|
+
|
|
706
|
+
return c;
|
|
707
|
+
}
|
|
708
|
+
|
|
709
|
+
/* Entered after the "<!--" sequence. Ready to read the rest.
|
|
710
|
+
*/
|
|
711
|
+
static char read_comment(SaxDrive dr) {
|
|
712
|
+
char c;
|
|
713
|
+
char zero = '\0';
|
|
714
|
+
int end = 0;
|
|
715
|
+
long pos = (long)(dr->buf.pos - 4);
|
|
716
|
+
long line = (long)(dr->buf.line);
|
|
717
|
+
long col = (long)(dr->buf.col - 4);
|
|
718
|
+
struct _checkPt cp = CHECK_PT_INIT;
|
|
719
|
+
|
|
720
|
+
buf_backup(&dr->buf); /* back up to the start in case the cdata is empty */
|
|
721
|
+
buf_protect(&dr->buf);
|
|
722
|
+
while (1) {
|
|
723
|
+
c = buf_get(&dr->buf);
|
|
724
|
+
switch (c) {
|
|
725
|
+
case '-': end++; break;
|
|
726
|
+
case '>':
|
|
727
|
+
if (2 <= end) {
|
|
728
|
+
*(dr->buf.tail - 3) = '\0';
|
|
729
|
+
c = buf_get(&dr->buf);
|
|
730
|
+
goto CB;
|
|
731
|
+
}
|
|
732
|
+
if (!buf_checkset(&cp)) {
|
|
733
|
+
buf_checkpoint(&dr->buf, &cp);
|
|
734
|
+
}
|
|
735
|
+
end = 0;
|
|
736
|
+
break;
|
|
737
|
+
case '<':
|
|
738
|
+
if (!buf_checkset(&cp)) {
|
|
739
|
+
buf_checkpoint(&dr->buf, &cp);
|
|
740
|
+
}
|
|
741
|
+
end = 0;
|
|
742
|
+
break;
|
|
743
|
+
case '\0':
|
|
744
|
+
if (buf_checkset(&cp)) {
|
|
745
|
+
c = buf_checkback(&dr->buf, &cp);
|
|
746
|
+
ox_sax_drive_error(dr, NO_TERM "comment not terminated");
|
|
747
|
+
zero = c;
|
|
748
|
+
*(dr->buf.tail - 1) = '\0';
|
|
749
|
+
goto CB;
|
|
750
|
+
}
|
|
751
|
+
ox_sax_drive_error(dr, NO_TERM "comment not terminated");
|
|
752
|
+
return '\0';
|
|
753
|
+
default:
|
|
754
|
+
if (1 < end && !buf_checkset(&cp)) {
|
|
755
|
+
buf_checkpoint(&dr->buf, &cp);
|
|
756
|
+
}
|
|
757
|
+
end = 0;
|
|
758
|
+
break;
|
|
759
|
+
}
|
|
760
|
+
}
|
|
761
|
+
CB:
|
|
762
|
+
dr->comment(dr, pos, line, col);
|
|
763
|
+
if ('\0' != zero) {
|
|
764
|
+
*(dr->buf.tail - 1) = zero;
|
|
765
|
+
}
|
|
766
|
+
dr->buf.str = 0;
|
|
767
|
+
|
|
768
|
+
return c;
|
|
769
|
+
}
|
|
770
|
+
|
|
771
|
+
/* Entered after the '<' and the first character after that. Returns status
|
|
772
|
+
* code.
|
|
773
|
+
*/
|
|
774
|
+
static char read_element_start(SaxDrive dr) {
|
|
775
|
+
const char *ename = NULL;
|
|
776
|
+
char ebuf[128];
|
|
777
|
+
size_t nlen;
|
|
778
|
+
volatile VALUE name = Qnil;
|
|
779
|
+
char c;
|
|
780
|
+
long pos = (long)(dr->buf.pos);
|
|
781
|
+
long line = (long)(dr->buf.line);
|
|
782
|
+
long col = (long)(dr->buf.col);
|
|
783
|
+
Hint h = NULL;
|
|
784
|
+
int stackless = 0;
|
|
785
|
+
Nv parent = stack_peek(&dr->stack);
|
|
786
|
+
bool closed;
|
|
787
|
+
bool efree = false;
|
|
788
|
+
|
|
789
|
+
if ('\0' == (c = read_name_token(dr))) {
|
|
790
|
+
return '\0';
|
|
791
|
+
}
|
|
792
|
+
if ('\0' == *dr->buf.str) {
|
|
793
|
+
char msg[256];
|
|
794
|
+
|
|
795
|
+
snprintf(msg, sizeof(msg) - 1, "%sempty element", INVALID_FORMAT);
|
|
796
|
+
ox_sax_drive_error_at(dr, msg, pos, line, col);
|
|
797
|
+
|
|
798
|
+
return buf_get(&dr->buf);
|
|
799
|
+
}
|
|
800
|
+
if (0 != parent) {
|
|
801
|
+
parent->childCnt++;
|
|
802
|
+
}
|
|
803
|
+
if (dr->options.smart && 0 == dr->options.hints && stack_empty(&dr->stack) &&
|
|
804
|
+
0 == strcasecmp("html", dr->buf.str)) {
|
|
805
|
+
dr->options.hints = ox_hints_html();
|
|
806
|
+
}
|
|
807
|
+
nlen = dr->buf.tail - dr->buf.str - 1;
|
|
808
|
+
if (NULL != dr->options.hints) {
|
|
809
|
+
hint_clear_empty(dr);
|
|
810
|
+
h = ox_hint_find(dr->options.hints, dr->buf.str);
|
|
811
|
+
if (NULL == h) {
|
|
812
|
+
char msg[256];
|
|
813
|
+
|
|
814
|
+
snprintf(msg,
|
|
815
|
+
sizeof(msg),
|
|
816
|
+
"%s%s is not a valid element type for a %s document type.",
|
|
817
|
+
INV_ELEMENT,
|
|
818
|
+
dr->buf.str,
|
|
819
|
+
dr->options.hints->name);
|
|
820
|
+
ox_sax_drive_error(dr, msg);
|
|
821
|
+
} else {
|
|
822
|
+
Nv top_nv = stack_peek(&dr->stack);
|
|
823
|
+
|
|
824
|
+
if (AbortOverlay == h->overlay) {
|
|
825
|
+
if (rb_respond_to(dr->handler, ox_abort_id)) {
|
|
826
|
+
VALUE args[1];
|
|
827
|
+
|
|
828
|
+
args[0] = str2sym(dr, dr->buf.str, nlen, NULL);
|
|
829
|
+
rb_funcall2(dr->handler, ox_abort_id, 1, args);
|
|
830
|
+
}
|
|
831
|
+
dr->abort = true;
|
|
832
|
+
return '\0';
|
|
833
|
+
}
|
|
834
|
+
if (BlockOverlay == h->overlay) {
|
|
835
|
+
dr->blocked++;
|
|
836
|
+
}
|
|
837
|
+
if (h->empty) {
|
|
838
|
+
stackless = 1;
|
|
839
|
+
}
|
|
840
|
+
if (0 != top_nv) {
|
|
841
|
+
char msg[256];
|
|
842
|
+
|
|
843
|
+
if (!h->nest && NestOverlay != h->overlay && nv_same_name(top_nv, h->name, true)) {
|
|
844
|
+
snprintf(msg,
|
|
845
|
+
sizeof(msg) - 1,
|
|
846
|
+
"%s%s can not be nested in a %s document, closing previous.",
|
|
847
|
+
INV_ELEMENT,
|
|
848
|
+
dr->buf.str,
|
|
849
|
+
dr->options.hints->name);
|
|
850
|
+
ox_sax_drive_error(dr, msg);
|
|
851
|
+
stack_pop(&dr->stack);
|
|
852
|
+
end_element_cb(dr, top_nv->val, pos, line, col, top_nv->hint);
|
|
853
|
+
top_nv = stack_peek(&dr->stack);
|
|
854
|
+
}
|
|
855
|
+
if (NULL != top_nv && 0 != h->parents && NestOverlay != h->overlay) {
|
|
856
|
+
const char **p;
|
|
857
|
+
int ok = 0;
|
|
858
|
+
|
|
859
|
+
for (p = h->parents; 0 != *p; p++) {
|
|
860
|
+
if (nv_same_name(top_nv, *p, true)) {
|
|
861
|
+
ok = 1;
|
|
862
|
+
break;
|
|
863
|
+
}
|
|
864
|
+
}
|
|
865
|
+
if (!ok) {
|
|
866
|
+
snprintf(msg,
|
|
867
|
+
sizeof(msg) - 1,
|
|
868
|
+
"%s%s can not be a child of a %s in a %s document.",
|
|
869
|
+
INV_ELEMENT,
|
|
870
|
+
h->name,
|
|
871
|
+
nv_name(top_nv),
|
|
872
|
+
dr->options.hints->name);
|
|
873
|
+
ox_sax_drive_error(dr, msg);
|
|
874
|
+
}
|
|
875
|
+
}
|
|
876
|
+
}
|
|
877
|
+
}
|
|
878
|
+
}
|
|
879
|
+
name = str2sym(dr, dr->buf.str, nlen, &ename);
|
|
880
|
+
if (NULL == ename) {
|
|
881
|
+
if (sizeof(ebuf) <= nlen) {
|
|
882
|
+
ename = ox_strndup(dr->buf.str, nlen);
|
|
883
|
+
efree = true;
|
|
884
|
+
} else {
|
|
885
|
+
memcpy(ebuf, dr->buf.str, nlen);
|
|
886
|
+
ebuf[nlen] = '\0';
|
|
887
|
+
ename = ebuf;
|
|
888
|
+
}
|
|
889
|
+
}
|
|
890
|
+
if (dr->has_start_element && 0 >= dr->blocked &&
|
|
891
|
+
(NULL == h || ActiveOverlay == h->overlay || NestOverlay == h->overlay)) {
|
|
892
|
+
VALUE args[1];
|
|
893
|
+
|
|
894
|
+
dr->set_pos(dr->handler, pos);
|
|
895
|
+
dr->set_line(dr->handler, line);
|
|
896
|
+
dr->set_col(dr->handler, col);
|
|
897
|
+
args[0] = name;
|
|
898
|
+
rb_funcall2(dr->handler, ox_start_element_id, 1, args);
|
|
899
|
+
}
|
|
900
|
+
if ('/' == c) {
|
|
901
|
+
closed = true;
|
|
902
|
+
} else if ('>' == c) {
|
|
903
|
+
closed = false;
|
|
904
|
+
} else {
|
|
905
|
+
buf_protect(&dr->buf);
|
|
906
|
+
c = read_attrs(dr, c, '/', '>', 0, 0, h);
|
|
907
|
+
if (is_white(c)) {
|
|
908
|
+
c = buf_next_non_white(&dr->buf);
|
|
909
|
+
}
|
|
910
|
+
closed = ('/' == c);
|
|
911
|
+
}
|
|
912
|
+
if (0 >= dr->blocked && (NULL == h || ActiveOverlay == h->overlay || NestOverlay == h->overlay)) {
|
|
913
|
+
dr->attrs_done(dr->handler);
|
|
914
|
+
}
|
|
915
|
+
if (closed) {
|
|
916
|
+
c = buf_next_non_white(&dr->buf);
|
|
917
|
+
|
|
918
|
+
end_element_cb(dr, name, dr->buf.pos, dr->buf.line, dr->buf.col, h);
|
|
919
|
+
} else if (stackless) {
|
|
920
|
+
end_element_cb(dr, name, pos, line, col, h);
|
|
921
|
+
} else if (NULL != h && h->jump) {
|
|
922
|
+
stack_push(&dr->stack, ename, nlen, name, h);
|
|
923
|
+
if ('>' != c) {
|
|
924
|
+
ox_sax_drive_error(dr, WRONG_CHAR "element not closed");
|
|
925
|
+
return c;
|
|
926
|
+
}
|
|
927
|
+
read_jump(dr, h->name);
|
|
928
|
+
return '<';
|
|
929
|
+
} else {
|
|
930
|
+
stack_push(&dr->stack, ename, nlen, name, h);
|
|
931
|
+
}
|
|
932
|
+
if (efree) {
|
|
933
|
+
free((char *)ename);
|
|
934
|
+
}
|
|
935
|
+
if ('>' != c) {
|
|
936
|
+
ox_sax_drive_error(dr, WRONG_CHAR "element not closed");
|
|
937
|
+
return c;
|
|
938
|
+
}
|
|
939
|
+
dr->buf.str = NULL;
|
|
940
|
+
|
|
941
|
+
return buf_get(&dr->buf);
|
|
942
|
+
}
|
|
943
|
+
|
|
944
|
+
/* Search dr->stack from the most recently pushed entry down to the oldest for
 * an entry whose name matches name. The comparison is made with
 * nv_same_name() using dr->options.smart as its third argument.
 *
 * Returns the matching entry or 0 when there is none.
 */
static Nv stack_rev_find(SaxDrive dr, const char *name) {
    Nv cur = dr->stack.tail;

    while (dr->stack.head < cur) {
        cur--;
        if (nv_same_name(cur, name, dr->options.smart)) {
            return cur;
        }
    }
    return 0;
}
|
|
954
|
+
|
|
955
|
+
/* Parse an element end tag and invoke the end element callback.
 *
 * When the name matches the top of dr->stack that entry is popped. Otherwise
 * the mismatch is reported and recovery depends on whether the name is found
 * deeper in the stack:
 *  - found: hint_try_close() is tried first, otherwise every entry above the
 *    match is closed and then the match itself.
 *  - not found: an element hinted as empty is only reported, any other
 *    element gets a start element callback before the closing one.
 *
 * Returns '\0' if the name token could not be read, otherwise the character
 * read after the tag.
 */
static char read_element_end(SaxDrive dr) {
    VALUE name = Qnil;
    Hint  h    = NULL;
    long  pos  = (long)(dr->buf.pos - 1);
    long  line = (long)(dr->buf.line);
    long  col  = (long)(dr->buf.col - 1);
    char  c    = read_name_token(dr);
    Nv    top;

    if ('\0' == c) {
        return '\0';
    }
    if (is_white(c)) {
        c = buf_next_non_white(&dr->buf);
    }
    // c should be > and current is one past so read another char
    c   = buf_get(&dr->buf);
    top = stack_peek(&dr->stack);
    if (0 != top && nv_same_name(top, dr->buf.str, dr->options.smart)) {
        name = top->val;
        h    = top->hint;
        stack_pop(&dr->stack);
    } else {
        // Mismatched start and end
        char emsg[256];
        Nv   found = stack_rev_find(dr, dr->buf.str);

        if (0 != found) {
            // Found a match so close all up to the found element in stack.
            Nv closed = hint_try_close(dr, dr->buf.str);

            if (0 != closed) {
                name = closed->val;
                h    = closed->hint;
            } else {
                snprintf(emsg,
                         sizeof(emsg) - 1,
                         "%selement '%s' close does not match '%s' open",
                         EL_MISMATCH,
                         dr->buf.str,
                         nv_name(top));
                ox_sax_drive_error_at(dr, emsg, pos, line, col);
                top = stack_pop(&dr->stack);
                while (found < top) {
                    end_element_cb(dr, top->val, pos, line, col, top->hint);
                    top = stack_pop(&dr->stack);
                }
                name = top->val;
                h    = top->hint;
            }
        } else {
            // Not found so open and close element.
            h = ox_hint_find(dr->options.hints, dr->buf.str);
            if (NULL != h && h->empty) {
                // Only report the stray close tag, no callbacks are made.
                name = str2sym(dr, dr->buf.str, dr->buf.tail - dr->buf.str - 2, 0);
                snprintf(emsg,
                         sizeof(emsg) - 1,
                         "%selement '%s' should not have a separate close element",
                         EL_MISMATCH,
                         dr->buf.str);
                ox_sax_drive_error_at(dr, emsg, pos, line, col);
                return c;
            } else {
                // One less character is trimmed when the byte at the read
                // position is a terminator.
                int trim;

                snprintf(emsg, sizeof(emsg) - 1, "%selement '%s' closed but not opened", EL_MISMATCH, dr->buf.str);
                ox_sax_drive_error_at(dr, emsg, pos, line, col);
                trim = ('\x00' == *dr->buf.tail) ? 1 : 2;
                name = str2sym(dr, dr->buf.str, dr->buf.tail - dr->buf.str - trim, 0);
                if (dr->has_start_element && 0 >= dr->blocked &&
                    (NULL == h || ActiveOverlay == h->overlay || NestOverlay == h->overlay)) {
                    VALUE args[1];

                    dr->set_pos(dr->handler, pos);
                    dr->set_line(dr->handler, line);
                    dr->set_col(dr->handler, col);
                    args[0] = name;
                    rb_funcall2(dr->handler, ox_start_element_id, 1, args);
                }
                if (NULL != h && BlockOverlay == h->overlay && 0 < dr->blocked) {
                    dr->blocked--;
                }
            }
        }
    }
    end_element_cb(dr, name, pos, line, col, h);

    return c;
}
|
|
1044
|
+
|
|
1045
|
+
/* Read character data up to the next '<' and hand it to the handler.
 *
 * The buffer is backed up by one character and protected before scanning. The
 * '<' that ends the text is overwritten with a terminator so dr->buf.str can
 * be used as a C string.
 *
 * Text made up of only white space (space, tab, form feed, newline, return)
 * is reported to the text callback according to dr->options.skip and, unless
 * it is the sole content of an element that is about to be closed, nothing
 * more is done with it. All other text goes to the value callback when
 * dr->has_value is set, or else to the text callback after optional special
 * character collapsing and white space collapsing.
 *
 * Returns the character that ended the scan, '<' or '\0'.
 */
static char read_text(SaxDrive dr) {
    VALUE args[1];
    char  c;
    long  pos    = (long)(dr->buf.pos);
    long  line   = (long)(dr->buf.line);
    long  col    = (long)(dr->buf.col - 1);
    Nv    parent = stack_peek(&dr->stack);
    int   blank  = 1;

    buf_backup(&dr->buf);
    buf_protect(&dr->buf);
    for (;;) {
        c = buf_get(&dr->buf);
        if ('<' == c) {
            break;
        }
        if ('\0' == c) {
            if (blank) {
                // Trailing white space at the end of the input is ignored.
                return c;
            }
            ox_sax_drive_error(dr, NO_TERM "text not terminated");
            break;
        }
        if (' ' != c && '\t' != c && '\f' != c && '\n' != c && '\r' != c) {
            blank = 0;
        }
    }
    if ('\0' != c) {
        *(dr->buf.tail - 1) = '\0';
    }
    if (blank) {
        // Peek at the character after the '<' to see if an end tag follows.
        int at_end = ('/' == buf_get(&dr->buf));

        buf_backup(&dr->buf);
        if (dr->has_text && ((NoSkip == dr->options.skip && !at_end) || (OffSkip == dr->options.skip))) {
            args[0] = rb_str_new2(dr->buf.str);
            if (0 != dr->encoding) {
                rb_enc_associate(args[0], dr->encoding);
            }
            dr->set_pos(dr->handler, pos);
            dr->set_line(dr->handler, line);
            dr->set_col(dr->handler, col);
            rb_funcall2(dr->handler, ox_text_id, 1, args);
        }
        if (!at_end || 0 == parent || 0 < parent->childCnt) {
            return c;
        }
    }
    if (0 != parent) {
        parent->childCnt++;
    }
    if (!dr->blocked && (NULL == parent || NULL == parent->hint || OffOverlay != parent->hint->overlay)) {
        if (dr->has_value) {
            dr->set_pos(dr->handler, pos);
            dr->set_line(dr->handler, line);
            dr->set_col(dr->handler, col);
            *args = dr->value_obj;
            rb_funcall2(dr->handler, ox_value_id, 1, args);
        } else if (dr->has_text) {
            if (dr->options.convert_special) {
                ox_sax_collapse_special(dr, dr->buf.str, pos, line, col);
            }
            if (CrSkip == dr->options.skip) {
                buf_collapse_return(dr->buf.str);
            } else if (SpcSkip == dr->options.skip) {
                buf_collapse_white(dr->buf.str);
            }
            args[0] = rb_str_new2(dr->buf.str);
            if (0 != dr->encoding) {
                rb_enc_associate(args[0], dr->encoding);
            }
            dr->set_pos(dr->handler, pos);
            dr->set_line(dr->handler, line);
            dr->set_col(dr->handler, col);
            rb_funcall2(dr->handler, ox_text_id, 1, args);
        }
    }
    dr->buf.str = 0;

    return c;
}
|
|
1128
|
+
|
|
1129
|
+
/* Check whether the input that follows a '<' is the close tag for pat.
 *
 * Expected after optional white space: '/', optional white space, the
 * characters of pat compared after lower casing the input, optional white
 * space and then '>'. pat is expected to be lower case already since only the
 * input is folded.
 *
 * The read position is put back to where it was on entry, right after the
 * '<', whether or not the close tag matched. Restoring on a failed match
 * keeps characters consumed while probing, such as a second '<', from being
 * skipped by the caller.
 *
 * Returns 1 on a match and 0 otherwise.
 */
static int read_jump_term(Buf buf, const char *pat) {
    struct _checkPt cp;
    int             matched = 0;

    buf_checkpoint(buf, &cp);  // right after <
    // tolower() needs a value representable as unsigned char, a plain char
    // may be negative for bytes above 0x7F.
    if ('/' == buf_next_non_white(buf) &&
        (unsigned char)*pat == tolower((unsigned char)buf_next_non_white(buf))) {
        matched = 1;
        for (pat++; '\0' != *pat; pat++) {
            if ((unsigned char)*pat != tolower((unsigned char)buf_get(buf))) {
                matched = 0;
                break;
            }
        }
        if (matched && '>' != buf_next_non_white(buf)) {
            matched = 0;
        }
    }
    buf_checkback(buf, &cp);

    return matched;
}
|
|
1150
|
+
|
|
1151
|
+
/* Skip over raw content until the close tag for pat is found, as detected by
 * read_jump_term(), or until the input ends. The skipped content is passed to
 * the text callback when the handler has one and nothing is blocked.
 *
 * The '<' that starts the close tag is temporarily replaced by a terminator
 * so the content can be turned into a Ruby string and is put back before
 * returning.
 *
 * Returns '<' when the close tag was found or '\0' if the input ended first.
 */
static char read_jump(SaxDrive dr, const char *pat) {
    VALUE args[1];
    char  c;
    long  pos    = (long)(dr->buf.pos);
    long  line   = (long)(dr->buf.line);
    long  col    = (long)(dr->buf.col - 1);
    Nv    parent = stack_peek(&dr->stack);

    buf_protect(&dr->buf);
    for (;;) {
        c = buf_get(&dr->buf);
        if ('\0' == c) {
            ox_sax_drive_error(dr, NO_TERM "not terminated");
            break;
        }
        if ('<' == c && read_jump_term(&dr->buf, pat)) {
            break;
        }
    }
    if ('\0' != c) {
        *(dr->buf.tail - 1) = '\0';
    }
    if (0 != parent) {
        parent->childCnt++;
    }
    // TBD check parent overlay
    if (dr->has_text && !dr->blocked) {
        args[0] = rb_str_new2(dr->buf.str);
        if (0 != dr->encoding) {
            rb_enc_associate(args[0], dr->encoding);
        }
        dr->set_pos(dr->handler, pos);
        dr->set_line(dr->handler, line);
        dr->set_col(dr->handler, col);
        rb_funcall2(dr->handler, ox_text_id, 1, args);
    }
    dr->buf.str = 0;
    if ('\0' != c) {
        // Restore the '<' so the close tag can be parsed by the caller.
        *(dr->buf.tail - 1) = '<';
    }
    return c;
}
|
|
1200
|
+
|
|
1201
|
+
/* Read attributes until one of the two terminating characters is reached.
 *
 * dr     - the parser state, the buffer must already be protected
 * c      - the character most recently read by the caller
 * termc  - first terminating character; when it is '?' an unquoted value
 *          also ends at a '?'
 * term2  - second terminating character
 * is_xml - when non-zero an attribute named encoding (any case) sets
 *          dr->encoding from its value
 * eq_req - when non-zero a name without an '=' sets dr->err and ends the
 *          read, otherwise it is reported and given an empty value
 * h      - hint for the element or NULL, used to decide whether dr->attr_cb
 *          is called
 *
 * Returns the terminating character, the character found where an '=' was
 * required, or '\0' if the input ended.
 */
static char read_attrs(SaxDrive dr, char c, char termc, char term2, int is_xml, int eq_req, Hint h) {
    VALUE name     = Qnil;
    int   want_enc = 0;
    off_t pos;
    off_t line;
    off_t col;
    char *value;

    // already protected by caller
    dr->buf.str = dr->buf.tail;
    if (is_white(c)) {
        c = buf_next_non_white(&dr->buf);
    }
    while (!(termc == c || term2 == c)) {
        buf_backup(&dr->buf);
        if ('\0' == c) {
            ox_sax_drive_error(dr, NO_TERM "attributes not terminated");
            return '\0';
        }
        // Position of the attribute name, used if there is no value.
        pos  = dr->buf.pos + 1;
        line = dr->buf.line;
        col  = dr->buf.col + 1;
        c    = read_name_token(dr);
        if ('\0' == c) {
            ox_sax_drive_error(dr, NO_TERM "error reading token");
            return '\0';
        }
        if (is_xml && 0 == strcasecmp("encoding", dr->buf.str)) {
            want_enc = 1;
        }
        if (dr->want_attr_name) {
            name = str2sym(dr, dr->buf.str, dr->buf.tail - dr->buf.str - 1, 0);
        }
        if (is_white(c)) {
            c = buf_next_non_white(&dr->buf);
        }
        if ('=' == c) {
            // Position of the value replaces the position of the name.
            pos   = dr->buf.pos + 1;
            line  = dr->buf.line;
            col   = dr->buf.col + 1;
            c     = read_quoted_value(dr, '?' == termc);
            value = dr->buf.str;

            if (want_enc) {
                dr->encoding = rb_enc_find(dr->buf.str);
                want_enc     = 0;
            }
        } else if (eq_req) {
            dr->err = 1;
            return c;
        } else {
            // TBD allow in smart mode
            ox_sax_drive_error(dr, WRONG_CHAR "no attribute value");
            value = (char *)"";
        }
        if (0 >= dr->blocked && (NULL == h || ActiveOverlay == h->overlay || NestOverlay == h->overlay)) {
            dr->attr_cb(dr, name, value, pos, line, col);
        }
        if (is_white(c)) {
            c = buf_next_non_white(&dr->buf);
        }
    }
    dr->buf.str = 0;

    return c;
}
|
|
1268
|
+
|
|
1269
|
+
/* The character after the word is returned. dr->buf.tail is one past
|
|
1270
|
+
* that. dr->buf.str will point to the token which will be '\0' terminated.
|
|
1271
|
+
*/
|
|
1272
|
+
static char read_name_token(SaxDrive dr) {
|
|
1273
|
+
char c;
|
|
1274
|
+
|
|
1275
|
+
dr->buf.str = dr->buf.tail;
|
|
1276
|
+
c = buf_get(&dr->buf);
|
|
1277
|
+
if (is_white(c)) {
|
|
1278
|
+
c = buf_next_non_white(&dr->buf);
|
|
1279
|
+
dr->buf.str = dr->buf.tail - 1;
|
|
1280
|
+
}
|
|
1281
|
+
while (1) {
|
|
1282
|
+
switch (c) {
|
|
1283
|
+
case ' ':
|
|
1284
|
+
case '\t':
|
|
1285
|
+
case '\f':
|
|
1286
|
+
case '?':
|
|
1287
|
+
case '=':
|
|
1288
|
+
case '/':
|
|
1289
|
+
case '>':
|
|
1290
|
+
case '<':
|
|
1291
|
+
case '\n':
|
|
1292
|
+
case '\r': *(dr->buf.tail - 1) = '\0'; return c;
|
|
1293
|
+
case '\0':
|
|
1294
|
+
/* documents never terminate after a name token */
|
|
1295
|
+
ox_sax_drive_error(dr, NO_TERM "document not terminated");
|
|
1296
|
+
return '\0';
|
|
1297
|
+
case ':':
|
|
1298
|
+
if ('\0' == *dr->options.strip_ns) {
|
|
1299
|
+
break;
|
|
1300
|
+
} else if ('*' == *dr->options.strip_ns && '\0' == dr->options.strip_ns[1]) {
|
|
1301
|
+
dr->buf.str = dr->buf.tail;
|
|
1302
|
+
} else if (dr->options.smart &&
|
|
1303
|
+
0 == strncasecmp(dr->options.strip_ns, dr->buf.str, dr->buf.tail - dr->buf.str - 1)) {
|
|
1304
|
+
dr->buf.str = dr->buf.tail;
|
|
1305
|
+
} else if (0 == strncmp(dr->options.strip_ns, dr->buf.str, dr->buf.tail - dr->buf.str - 1)) {
|
|
1306
|
+
dr->buf.str = dr->buf.tail;
|
|
1307
|
+
}
|
|
1308
|
+
break;
|
|
1309
|
+
default: break;
|
|
1310
|
+
}
|
|
1311
|
+
c = buf_get(&dr->buf);
|
|
1312
|
+
}
|
|
1313
|
+
return '\0';
|
|
1314
|
+
}
|
|
1315
|
+
|
|
1316
|
+
/* The character after the quote or if there is no quote, the character after
|
|
1317
|
+
* the word is returned. dr->buf.tail is one past that. dr->buf.str will point
|
|
1318
|
+
* to the token which will be '\0' terminated.
|
|
1319
|
+
*/
|
|
1320
|
+
static char read_quoted_value(SaxDrive dr, bool inst) {
|
|
1321
|
+
char c;
|
|
1322
|
+
|
|
1323
|
+
c = buf_get(&dr->buf);
|
|
1324
|
+
if (is_white(c)) {
|
|
1325
|
+
c = buf_next_non_white(&dr->buf);
|
|
1326
|
+
}
|
|
1327
|
+
if ('"' == c || '\'' == c) {
|
|
1328
|
+
char term = c;
|
|
1329
|
+
|
|
1330
|
+
dr->buf.str = dr->buf.tail;
|
|
1331
|
+
while (term != (c = buf_get(&dr->buf))) {
|
|
1332
|
+
if ('\0' == c) {
|
|
1333
|
+
ox_sax_drive_error(dr, NO_TERM "quoted value not terminated");
|
|
1334
|
+
return '\0';
|
|
1335
|
+
}
|
|
1336
|
+
}
|
|
1337
|
+
// dr->buf.tail is one past quote char
|
|
1338
|
+
*(dr->buf.tail - 1) = '\0'; /* terminate value */
|
|
1339
|
+
c = buf_get(&dr->buf);
|
|
1340
|
+
return c;
|
|
1341
|
+
}
|
|
1342
|
+
// not quoted, look for something that terminates the string
|
|
1343
|
+
dr->buf.str = dr->buf.tail - 1;
|
|
1344
|
+
// TBD if smart or html then no error
|
|
1345
|
+
if (!(dr->options.smart && ox_hints_html() != dr->options.hints)) {
|
|
1346
|
+
ox_sax_drive_error(dr, WRONG_CHAR "attribute value not in quotes");
|
|
1347
|
+
}
|
|
1348
|
+
while ('\0' != (c = buf_get(&dr->buf))) {
|
|
1349
|
+
switch (c) {
|
|
1350
|
+
case ' ':
|
|
1351
|
+
// case '/':
|
|
1352
|
+
case '>':
|
|
1353
|
+
case '\t':
|
|
1354
|
+
case '\n':
|
|
1355
|
+
case '\r':
|
|
1356
|
+
*(dr->buf.tail - 1) = '\0'; /* terminate value */
|
|
1357
|
+
// dr->buf.tail is in the correct position, one after the word terminator
|
|
1358
|
+
return c;
|
|
1359
|
+
case '?':
|
|
1360
|
+
if (inst) {
|
|
1361
|
+
*(dr->buf.tail - 1) = '\0'; /* terminate value */
|
|
1362
|
+
return c;
|
|
1363
|
+
}
|
|
1364
|
+
break;
|
|
1365
|
+
default: break;
|
|
1366
|
+
}
|
|
1367
|
+
}
|
|
1368
|
+
return '\0'; // should never get here
|
|
1369
|
+
}
|
|
1370
|
+
|
|
1371
|
+
/* Parse hexadecimal digits up to a terminating ';'.
 *
 * b  - first character after the "&#x" of a character reference
 * up - receives the parsed value, left untouched on failure
 *
 * Returns a pointer to the ';' on success. Returns 0 if a character that is
 * not a hexadecimal digit is found before the ';' (which includes the end of
 * the string), if there are no digits at all, or if the value does not fit in
 * 64 bits.
 */
static char *read_hex_uint64(char *b, uint64_t *up) {
    const char *start = b;
    uint64_t    u     = 0;

    for (; ';' != *b; b++) {
        char     c = *b;
        uint64_t digit;

        if ('0' <= c && c <= '9') {
            digit = (uint64_t)(c - '0');
        } else if ('a' <= c && c <= 'f') {
            digit = (uint64_t)(c - 'a' + 10);
        } else if ('A' <= c && c <= 'F') {
            digit = (uint64_t)(c - 'A' + 10);
        } else {
            return 0;
        }
        if (0 != (u >> 60)) {
            // Another shift by four would push set bits off the top.
            return 0;
        }
        u = (u << 4) | digit;
    }
    if (start == b) {
        // "&#x;" has no digits and is not a valid reference.
        return 0;
    }
    *up = u;

    return b;
}
|
|
1391
|
+
|
|
1392
|
+
/* Parse decimal digits up to a terminating ';'.
 *
 * b  - first character after the "&#" of a character reference
 * up - receives the parsed value, left untouched on failure
 *
 * Returns a pointer to the ';' on success. Returns 0 if a character that is
 * not a decimal digit is found before the ';' (which includes the end of the
 * string), if there are no digits at all, or if the value does not fit in 64
 * bits.
 */
static char *read_10_uint64(char *b, uint64_t *up) {
    const char *start = b;
    uint64_t    u     = 0;

    for (; ';' != *b; b++) {
        char     c = *b;
        uint64_t digit;

        if (c < '0' || '9' < c) {
            return 0;
        }
        digit = (uint64_t)(c - '0');
        if ((UINT64_MAX - digit) / 10 < u) {
            // u * 10 + digit would wrap around.
            return 0;
        }
        u = (u * 10) + digit;
    }
    if (start == b) {
        // "&#;" has no digits and is not a valid reference.
        return 0;
    }
    *up = u;

    return b;
}
|
|
1408
|
+
|
|
1409
|
+
/* Replace character and entity references in str with the characters they
 * stand for and convert "\r\n" and lone "\r" to "\n". The conversion is done
 * in place, the result is never longer than the input.
 *
 * dr   - the parser state, used for error reporting and for dr->encoding
 *        which is set to UTF-8 when a reference above 0x7F is found and no
 *        encoding has been set
 * str  - '\0' terminated text to convert in place
 * pos  - position passed on unchanged with error reports
 * line - line of the start of str, advanced as newlines are seen
 * col  - column of the start of str, advanced as characters are seen
 *
 * References that can not be converted are reported as errors and left in
 * the text. A named reference must be terminated by a ';' to be looked up.
 *
 * Always returns 0.
 */
int ox_sax_collapse_special(SaxDrive dr, char *str, long pos, long line, long col) {
    char *s = str;  // read position
    char *b = str;  // write position, never ahead of s

    while ('\0' != *s) {
        switch (*s) {
        case '&': {
            int   c = 0;
            char *end;

            s++;
            if ('#' == *s) {
                uint64_t u = 0;
                char     x;

                s++;
                if ('x' == *s || 'X' == *s) {
                    x = *s;
                    s++;
                    end = read_hex_uint64(s, &u);
                } else {
                    x   = '\0';
                    end = read_10_uint64(s, &u);
                }
                if (0 == end) {
                    // Keep the text that was consumed and carry on with what
                    // follows as regular characters.
                    ox_sax_drive_error(dr, NO_TERM "special character does not end with a semicolon");
                    *b++ = '&';
                    *b++ = '#';
                    if ('\0' != x) {
                        *b++ = x;
                    }
                    continue;
                }
                if (u <= 0x000000000000007FULL) {
                    *b++ = (char)u;
                } else {
                    // The character is written as UTF-8 whatever the current
                    // encoding is. The encoding is only changed when none
                    // has been set.
                    if (0 == dr->encoding) {
                        dr->encoding = ox_utf8_encoding;
                    }
                    b = ox_ucs_to_utf8_chars(b, u);
                }
                s = end + 1;
                continue;
            } else if (0 == strncasecmp(s, "lt;", 3)) {
                c = '<';
                s += 3;
                col += 3;
            } else if (0 == strncasecmp(s, "gt;", 3)) {
                c = '>';
                s += 3;
                col += 3;
            } else if (0 == strncasecmp(s, "amp;", 4)) {
                c = '&';
                s += 4;
                col += 4;
            } else if (0 == strncasecmp(s, "quot;", 5)) {
                c = '"';
                s += 5;
                col += 5;
            } else if (0 == strncasecmp(s, "apos;", 5)) {
                c = '\'';
                s += 5;
                col += 5;
            } else {
                char  key[16];
                char *k    = key;
                char *kend = key + sizeof(key) - 1;
                char *bn;
                char *s2 = s;

                for (; ';' != *s2 && '\0' != *s2; s2++, k++) {
                    if (kend <= k) {
                        // Too long to be a known entity, force a failed lookup.
                        k = key;
                        break;
                    }
                    *k = *s2;
                }
                *k = '\0';
                // The ';' check keeps a name that runs to the end of the
                // string from being accepted. Accepting it would move s past
                // the terminator of str.
                if (';' != *s2 || '\0' == *key || NULL == (bn = ox_entity_lookup(b, key))) {
                    ox_sax_drive_error_at(dr, INVALID_FORMAT "Invalid special character sequence", pos, line, col);
                    c = '&';
                } else {
                    b = bn;
                    s = s2 + 1;
                    continue;
                }
            }
            *b++ = (char)c;
            col++;
            break;
        }
        case '\r':
            s++;
            if ('\n' == *s) {
                // Drop the return of a "\r\n" pair, the newline is copied on
                // the next pass.
                continue;
            }
            line++;
            col  = 1;
            *b++ = '\n';
            break;
        case '\n':
            line++;
            col = 0;
            // fall through
        default:
            col++;
            *b++ = *s++;
            break;
        }
    }
    *b = '\0';

    return 0;
}
|
|
1530
|
+
|
|
1531
|
+
/* Close and pop entries from the top of dr->stack for as long as the top
 * entry has a hint and that hint is marked as empty. The end element callback
 * is made for each entry with the current buffer position.
 */
static void hint_clear_empty(SaxDrive dr) {
    Nv top = stack_peek(&dr->stack);

    while (0 != top && 0 != top->hint && top->hint->empty) {
        end_element_cb(dr, top->val, dr->buf.pos, dr->buf.line, dr->buf.col, top->hint);
        stack_pop(&dr->stack);
        top = stack_peek(&dr->stack);
    }
}
|
|
1546
|
+
|
|
1547
|
+
/* Try to close the element called name by closing only hinted empty elements
 * that sit above it on dr->stack.
 *
 * Nothing is done unless name has a hint. Starting at the top of the stack,
 * an entry with a matching name (compared with nv_same_name() and true as the
 * third argument) is popped and returned. An entry whose hint is marked empty
 * gets the end element callback and is removed so the search can go on. Any
 * other entry stops the search.
 *
 * Returns the popped entry or 0 if the element could not be closed this way.
 */
static Nv hint_try_close(SaxDrive dr, const char *name) {
    Nv top;

    if (0 == ox_hint_find(dr->options.hints, name)) {
        return 0;
    }
    while (0 != (top = stack_peek(&dr->stack))) {
        if (nv_same_name(top, name, true)) {
            stack_pop(&dr->stack);
            return top;
        }
        if (0 == top->hint || !top->hint->empty) {
            break;
        }
        end_element_cb(dr, top->val, dr->buf.pos, dr->buf.line, dr->buf.col, top->hint);
        // Drop the entry just closed by moving the tail down onto it.
        dr->stack.tail = top;
    }
    return 0;
}
|