ox 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
Potentially problematic release.
This version of ox might be problematic. Click here for more details.
- data/LICENSE +27 -0
- data/README +153 -0
- data/ext/ox/base64.c +123 -0
- data/ext/ox/base64.h +44 -0
- data/ext/ox/cache.c +148 -0
- data/ext/ox/cache.h +43 -0
- data/ext/ox/cache8.c +80 -0
- data/ext/ox/cache8.h +43 -0
- data/ext/ox/cache8_test.c +69 -0
- data/ext/ox/cache_test.c +69 -0
- data/ext/ox/dump.c +901 -0
- data/ext/ox/extconf.rb +7 -0
- data/ext/ox/gen_load.c +196 -0
- data/ext/ox/obj_load.c +802 -0
- data/ext/ox/ox.c +456 -0
- data/ext/ox/ox.h +190 -0
- data/ext/ox/parse.c +629 -0
- data/lib/ox.rb +97 -0
- data/lib/ox/cdata.rb +12 -0
- data/lib/ox/comment.rb +13 -0
- data/lib/ox/doctype.rb +13 -0
- data/lib/ox/document.rb +20 -0
- data/lib/ox/element.rb +67 -0
- data/lib/ox/node.rb +24 -0
- data/test/Sample.graffle +2318 -0
- data/test/cache16_test.rb +17 -0
- data/test/cache8_test.rb +17 -0
- data/test/cache_test.rb +17 -0
- data/test/files.rb +34 -0
- data/test/func.rb +228 -0
- data/test/gen_sample.rb +22 -0
- data/test/obj_sample.rb +19 -0
- data/test/ox/change.rb +16 -0
- data/test/ox/dir.rb +21 -0
- data/test/ox/doc.rb +39 -0
- data/test/ox/file.rb +33 -0
- data/test/ox/group.rb +18 -0
- data/test/ox/hasprops.rb +18 -0
- data/test/ox/layer.rb +14 -0
- data/test/ox/line.rb +22 -0
- data/test/ox/oval.rb +12 -0
- data/test/ox/rect.rb +12 -0
- data/test/ox/shape.rb +37 -0
- data/test/ox/text.rb +23 -0
- data/test/perf_gen.rb +193 -0
- data/test/perf_mars.rb +97 -0
- data/test/perf_obj.rb +201 -0
- data/test/perf_pod.rb +88 -0
- data/test/perf_write.rb +80 -0
- data/test/sample.rb +62 -0
- data/test/test.rb +70 -0
- metadata +106 -0
data/ext/ox/ox.h
ADDED
@@ -0,0 +1,190 @@
|
|
1
|
+
/* ox.h
|
2
|
+
* Copyright (c) 2011, Peter Ohler
|
3
|
+
* All rights reserved.
|
4
|
+
*
|
5
|
+
* Redistribution and use in source and binary forms, with or without
|
6
|
+
* modification, are permitted provided that the following conditions are met:
|
7
|
+
*
|
8
|
+
* - Redistributions of source code must retain the above copyright notice, this
|
9
|
+
* list of conditions and the following disclaimer.
|
10
|
+
*
|
11
|
+
* - Redistributions in binary form must reproduce the above copyright notice,
|
12
|
+
* this list of conditions and the following disclaimer in the documentation
|
13
|
+
* and/or other materials provided with the distribution.
|
14
|
+
*
|
15
|
+
* - Neither the name of Peter Ohler nor the names of its contributors may be
|
16
|
+
* used to endorse or promote products derived from this software without
|
17
|
+
* specific prior written permission.
|
18
|
+
*
|
19
|
+
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
20
|
+
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
21
|
+
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
22
|
+
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
23
|
+
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
24
|
+
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
25
|
+
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
26
|
+
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
27
|
+
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
28
|
+
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
29
|
+
*/
|
30
|
+
|
31
|
+
#ifndef __OX_H__
|
32
|
+
#define __OX_H__
|
33
|
+
|
34
|
+
#if defined(__cplusplus)
|
35
|
+
extern "C" {
|
36
|
+
#if 0
|
37
|
+
} /* satisfy cc-mode */
|
38
|
+
#endif
|
39
|
+
#endif
|
40
|
+
|
41
|
+
#include "ruby/encoding.h"
|
42
|
+
#include "cache.h"
|
43
|
+
|
44
|
+
/* Report a parse error; the call site's source file and line are passed
 * through to _raise_error() automatically.
 */
#define raise_error(msg, xml, current) _raise_error((msg), (xml), (current), __FILE__, __LINE__)

/* Fixed sizes used by the parser. */
#define MAX_TEXT_LEN	4096	/* size of the on-stack text buffer in read_text() */
#define MAX_ATTRS	1024	/* size of the attribute array in read_element() */
#define MAX_DEPTH	1024	/* number of Helper slots in struct _PInfo */

/* Trace levels; parse() echoes the input when the level is DEBUG or above. */
#define SILENT		0
#define TRACE		1
#define DEBUG		2
|
53
|
+
|
54
|
+
/* Usage tags. Values start at 1 and increase by one per entry, so the order
 * of the members below is significant.
 */
typedef enum {
    UseObj = 1,
    UseAttr,		/* 2 */
    UseAttrSet,		/* 3 */
    UseArray,		/* 4 */
    UseAMember,		/* 5 */
    UseHash,		/* 6 */
    UseHashKey,		/* 7 */
    UseHashVal,		/* 8 */
    UseRange,		/* 9 */
    UseRangeAttr,	/* 10 */
    UseRaw		/* 11 */
} Use;
|
67
|
+
|
68
|
+
/* Single-character type codes. Each code other than NoCode is the ASCII
 * value of a lower case letter.
 */
typedef enum {
    NoCode		= 0,
    ArrayCode		= 'a',
    Base64Code		= 'b',
    ClassCode		= 'c',
    FloatCode		= 'f',
    RegexpCode		= 'g',
    HashCode		= 'h',
    FixnumCode		= 'i',
    BignumCode		= 'j',
    KeyCode		= 'k',	/* indicates the value is a hash key, kind of a hack */
    RationalCode	= 'l',
    SymbolCode		= 'm',
    FalseClassCode	= 'n',
    ObjectCode		= 'o',
    RefCode		= 'p',
    RangeCode		= 'r',
    StringCode		= 's',
    TimeCode		= 't',
    StructCode		= 'u',
    ComplexCode		= 'v',
    RawCode		= 'x',
    TrueClassCode	= 'y',
    NilClassCode	= 'z'
} Type;
|
93
|
+
|
94
|
+
/* One attribute as a name/value pair of C strings. read_element() hands the
 * add_element callback an array of these terminated by an entry whose name
 * is 0.
 */
struct _Attr {
    const char	*name;
    const char	*value;
};
typedef struct _Attr	*Attr;
|
98
|
+
|
99
|
+
typedef struct _Helper {
|
100
|
+
ID var; /* Object var ID */
|
101
|
+
VALUE obj; /* object created or Qundef if not appropriate */
|
102
|
+
Type type; /* type of object in obj */
|
103
|
+
} *Helper;
|
104
|
+
|
105
|
+
typedef struct _PInfo *PInfo;
|
106
|
+
|
107
|
+
typedef struct _ParseCallbacks {
|
108
|
+
void (*add_prolog)(PInfo pi, const char *version, const char *encoding, const char *standalone);
|
109
|
+
void (*add_doctype)(PInfo pi, const char *docType);
|
110
|
+
void (*add_comment)(PInfo pi, const char *comment);
|
111
|
+
void (*add_cdata)(PInfo pi, const char *cdata, size_t len);
|
112
|
+
void (*add_text)(PInfo pi, char *text, int closed);
|
113
|
+
void (*add_element)(PInfo pi, const char *ename, Attr attrs, int hasChildren);
|
114
|
+
void (*end_element)(PInfo pi, const char *ename);
|
115
|
+
} *ParseCallbacks;
|
116
|
+
|
117
|
+
typedef struct _CircArray {
|
118
|
+
VALUE obj_array[1024];
|
119
|
+
VALUE *objs;
|
120
|
+
unsigned long size; // allocated size or initial array size
|
121
|
+
unsigned long cnt;
|
122
|
+
} *CircArray;
|
123
|
+
|
124
|
+
/* parse information structure */
|
125
|
+
struct _PInfo {
|
126
|
+
struct _Helper helpers[MAX_DEPTH];
|
127
|
+
Helper h; /* current helper or 0 if not set */
|
128
|
+
char *str; /* buffer being read from */
|
129
|
+
char *s; /* current position in buffer */
|
130
|
+
VALUE obj;
|
131
|
+
ParseCallbacks pcb;
|
132
|
+
CircArray circ_array;
|
133
|
+
rb_encoding *encoding;
|
134
|
+
unsigned long id; /* set for text types when cirs_array is set */
|
135
|
+
int trace;
|
136
|
+
int best_effort;
|
137
|
+
};
|
138
|
+
|
139
|
+
extern VALUE parse(char *xml, ParseCallbacks pcb, char **endp, int trace, int best_effort);
|
140
|
+
extern void _raise_error(const char *msg, const char *xml, const char *current, const char* file, int line);
|
141
|
+
|
142
|
+
extern char* write_obj_to_str(VALUE obj, int indent, int xsd_date, int circular);
|
143
|
+
extern void write_obj_to_file(VALUE obj, const char *path, int indent, int xsd_date, int circular);
|
144
|
+
|
145
|
+
extern VALUE Ox;
|
146
|
+
|
147
|
+
extern ID at_id;
|
148
|
+
extern ID attributes_id;
|
149
|
+
extern ID beg_id;
|
150
|
+
extern ID den_id;
|
151
|
+
extern ID end_id;
|
152
|
+
extern ID excl_id;
|
153
|
+
extern ID inspect_id;
|
154
|
+
extern ID keys_id;
|
155
|
+
extern ID local_id;
|
156
|
+
extern ID nodes_id;
|
157
|
+
extern ID num_id;
|
158
|
+
extern ID parse_id;
|
159
|
+
extern ID to_c_id;
|
160
|
+
extern ID to_s_id;
|
161
|
+
extern ID tv_sec_id;
|
162
|
+
extern ID tv_usec_id;
|
163
|
+
extern ID value_id;
|
164
|
+
|
165
|
+
extern VALUE empty_string;
|
166
|
+
extern VALUE encoding_sym;
|
167
|
+
extern VALUE standalone_sym;
|
168
|
+
extern VALUE struct_class;
|
169
|
+
extern VALUE time_class;
|
170
|
+
extern VALUE version_sym;
|
171
|
+
extern VALUE zero_fixnum;
|
172
|
+
|
173
|
+
extern VALUE ox_document_clas;
|
174
|
+
extern VALUE ox_element_clas;
|
175
|
+
extern VALUE ox_text_clas;
|
176
|
+
extern VALUE ox_comment_clas;
|
177
|
+
extern VALUE ox_doctype_clas;
|
178
|
+
extern VALUE ox_cdata_clas;
|
179
|
+
|
180
|
+
extern Cache symbol_cache;
|
181
|
+
extern Cache class_cache;
|
182
|
+
extern Cache attr_cache;
|
183
|
+
|
184
|
+
#if defined(__cplusplus)
|
185
|
+
#if 0
|
186
|
+
{ /* satisfy cc-mode */
|
187
|
+
#endif
|
188
|
+
} /* extern "C" { */
|
189
|
+
#endif
|
190
|
+
#endif /* __OX_H__ */
|
data/ext/ox/parse.c
ADDED
@@ -0,0 +1,629 @@
|
|
1
|
+
/* parse.c
|
2
|
+
* Copyright (c) 2011, Peter Ohler
|
3
|
+
* All rights reserved.
|
4
|
+
*
|
5
|
+
* Redistribution and use in source and binary forms, with or without
|
6
|
+
* modification, are permitted provided that the following conditions are met:
|
7
|
+
*
|
8
|
+
* - Redistributions of source code must retain the above copyright notice, this
|
9
|
+
* list of conditions and the following disclaimer.
|
10
|
+
*
|
11
|
+
* - Redistributions in binary form must reproduce the above copyright notice,
|
12
|
+
* this list of conditions and the following disclaimer in the documentation
|
13
|
+
* and/or other materials provided with the distribution.
|
14
|
+
*
|
15
|
+
* - Neither the name of Peter Ohler nor the names of its contributors may be
|
16
|
+
* used to endorse or promote products derived from this software without
|
17
|
+
* specific prior written permission.
|
18
|
+
*
|
19
|
+
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
20
|
+
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
21
|
+
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
22
|
+
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
23
|
+
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
24
|
+
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
25
|
+
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
26
|
+
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
27
|
+
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
28
|
+
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
29
|
+
*/
|
30
|
+
|
31
|
+
#include <stdlib.h>
|
32
|
+
#include <errno.h>
|
33
|
+
#include <stdio.h>
|
34
|
+
#include <string.h>
|
35
|
+
|
36
|
+
#include "ruby.h"
|
37
|
+
#include "ox.h"
|
38
|
+
|
39
|
+
/* Skips white space. */
static void	next_non_white(PInfo pi);

/* Readers for each kind of document part. */
static void	read_cdata(PInfo pi);
static void	read_comment(PInfo pi);
static void	read_doctype(PInfo pi);
static void	read_element(PInfo pi);
static void	read_prolog(PInfo pi);
static void	read_text(PInfo pi);

/* Token level readers. */
static int	read_coded_char(PInfo pi);
static char*	read_name_token(PInfo pi);
static char*	read_quoted_value(PInfo pi);

/* When non-zero read_prolog() rejects unsupported version and standalone
 * values.
 */
static int	validateProlog = 1;
|
51
|
+
|
52
|
+
/* This XML parser is a single pass, destructive, callback parser. It is a
|
53
|
+
 * single pass parser since it only makes one pass over the characters in the
|
54
|
+
* XML document string. It is destructive because it re-uses the content of
|
55
|
+
* the string for values in the callback and places \0 characters at various
|
56
|
+
* places to mark the end of tokens and strings. It is a callback parser like
|
57
|
+
 * a SAX parser because it uses callbacks when document elements are
|
58
|
+
* encountered.
|
59
|
+
*
|
60
|
+
 * Parsing is very tolerant. Lack of headers and even misspelled element
|
61
|
+
* endings are passed over without raising an error. A best attempt is made in
|
62
|
+
* all cases to parse the string.
|
63
|
+
*/
|
64
|
+
|
65
|
+
inline static void
|
66
|
+
next_non_white(PInfo pi) {
|
67
|
+
for (; 1; pi->s++) {
|
68
|
+
switch(*pi->s) {
|
69
|
+
case ' ':
|
70
|
+
case '\t':
|
71
|
+
case '\f':
|
72
|
+
case '\n':
|
73
|
+
case '\r':
|
74
|
+
break;
|
75
|
+
default:
|
76
|
+
return;
|
77
|
+
}
|
78
|
+
}
|
79
|
+
}
|
80
|
+
|
81
|
+
VALUE
|
82
|
+
parse(char *xml, ParseCallbacks pcb, char **endp, int trace, int best_effort) {
|
83
|
+
struct _PInfo pi;
|
84
|
+
int body_read = 0;
|
85
|
+
|
86
|
+
if (0 == xml) {
|
87
|
+
raise_error("Invalid arg, xml string can not be null", xml, 0);
|
88
|
+
}
|
89
|
+
if (DEBUG <= trace) {
|
90
|
+
printf("Parsing xml:\n%s\n", xml);
|
91
|
+
}
|
92
|
+
/* initialize parse info */
|
93
|
+
pi.str = xml;
|
94
|
+
pi.s = xml;
|
95
|
+
pi.h = 0;
|
96
|
+
pi.pcb = pcb;
|
97
|
+
pi.obj = Qnil;
|
98
|
+
pi.circ_array = 0;
|
99
|
+
pi.encoding = 0;
|
100
|
+
pi.trace = trace;
|
101
|
+
pi.best_effort = best_effort;
|
102
|
+
while (1) {
|
103
|
+
next_non_white(&pi); // skip white space
|
104
|
+
if ('\0' == *pi.s) {
|
105
|
+
break;
|
106
|
+
}
|
107
|
+
if (body_read && 0 != endp) {
|
108
|
+
*endp = pi.s;
|
109
|
+
break;
|
110
|
+
}
|
111
|
+
if ('<' != *pi.s) { // all top level entities start with <
|
112
|
+
raise_error("invalid format, expected <", pi.str, pi.s);
|
113
|
+
}
|
114
|
+
pi.s++; // past <
|
115
|
+
switch (*pi.s) {
|
116
|
+
case '?': // prolog
|
117
|
+
pi.s++;
|
118
|
+
read_prolog(&pi);
|
119
|
+
break;
|
120
|
+
case '!': /* comment or doctype */
|
121
|
+
pi.s++;
|
122
|
+
if ('\0' == *pi.s) {
|
123
|
+
raise_error("invalid format, DOCTYPE or comment not terminated", pi.str, pi.s);
|
124
|
+
} else if ('-' == *pi.s) {
|
125
|
+
pi.s++; // skip -
|
126
|
+
if ('-' != *pi.s) {
|
127
|
+
raise_error("invalid format, bad comment format", pi.str, pi.s);
|
128
|
+
} else {
|
129
|
+
pi.s++; // skip second -
|
130
|
+
read_comment(&pi);
|
131
|
+
}
|
132
|
+
} else if (0 == strncmp("DOCTYPE", pi.s, 7)) {
|
133
|
+
pi.s += 7;
|
134
|
+
read_doctype(&pi);
|
135
|
+
} else {
|
136
|
+
raise_error("invalid format, DOCTYPE or comment expected", pi.str, pi.s);
|
137
|
+
}
|
138
|
+
break;
|
139
|
+
case '\0':
|
140
|
+
raise_error("invalid format, document not terminated", pi.str, pi.s);
|
141
|
+
default:
|
142
|
+
read_element(&pi);
|
143
|
+
body_read = 1;
|
144
|
+
break;
|
145
|
+
}
|
146
|
+
}
|
147
|
+
return pi.obj;
|
148
|
+
}
|
149
|
+
|
150
|
+
/* Entered after the "<?" sequence. Ready to read the rest.
|
151
|
+
*/
|
152
|
+
static void
|
153
|
+
read_prolog(PInfo pi) {
|
154
|
+
char *version = 0;
|
155
|
+
char *encoding = 0;
|
156
|
+
char *standalone = 0;
|
157
|
+
char *name;
|
158
|
+
char *end;
|
159
|
+
char c;
|
160
|
+
|
161
|
+
// skip xml string
|
162
|
+
if (0 != strncasecmp("xml", pi->s, 3)) {
|
163
|
+
raise_error("invalid format, expected 'xml'", pi->str, pi->s);
|
164
|
+
}
|
165
|
+
pi->s += 3; // past xml
|
166
|
+
/* looking for ?> to terminate the prolog */
|
167
|
+
while ('?' != *pi->s) {
|
168
|
+
if ('\0' == *pi->s) {
|
169
|
+
raise_error("invalid format, prolog not terminated", pi->str, pi->s);
|
170
|
+
}
|
171
|
+
name = read_name_token(pi);
|
172
|
+
end = pi->s;
|
173
|
+
next_non_white(pi);
|
174
|
+
c = *pi->s;
|
175
|
+
*end = '\0'; // terminate name
|
176
|
+
if ('=' == c) {
|
177
|
+
// Figure out what the token is, read a value for it, and check
|
178
|
+
// against supported values.
|
179
|
+
pi->s++;
|
180
|
+
next_non_white(pi);
|
181
|
+
if (0 == strcasecmp("version", name)) {
|
182
|
+
version = read_quoted_value(pi);
|
183
|
+
if (validateProlog &&
|
184
|
+
(0 != strcmp("1.0", version) &&
|
185
|
+
0 != strcmp("1.1", version))) {
|
186
|
+
raise_error("invalid format, wrong XML version", pi->str, pi->s);
|
187
|
+
}
|
188
|
+
} else if (0 == strcasecmp("encoding", name)) {
|
189
|
+
encoding = read_quoted_value(pi);
|
190
|
+
/*
|
191
|
+
if (validateProlog && 0 != strcasecmp("UTF-8", encoding)) {
|
192
|
+
raise_error("invalid format, only UTF-8 supported", pi->str, pi->s);
|
193
|
+
}
|
194
|
+
*/
|
195
|
+
} else if (0 == strcasecmp("standalone", name)) {
|
196
|
+
standalone = read_quoted_value(pi);
|
197
|
+
if (validateProlog && 0 != strcmp("yes", standalone)) {
|
198
|
+
raise_error("invalid format, only standalone XML supported", pi->str, pi->s);
|
199
|
+
}
|
200
|
+
} else {
|
201
|
+
raise_error("invalid format, unknown prolog attribute", pi->str, pi->s);
|
202
|
+
}
|
203
|
+
} else if ('?' == c) {
|
204
|
+
pi->s++;
|
205
|
+
if ('>' != *pi->s++) {
|
206
|
+
raise_error("invalid format, prolog not terminated", pi->str, pi->s);
|
207
|
+
}
|
208
|
+
return;
|
209
|
+
} else {
|
210
|
+
raise_error("invalid format, prolog format error", pi->str, pi->s);
|
211
|
+
}
|
212
|
+
}
|
213
|
+
if ('\0' == pi->s) {
|
214
|
+
raise_error("invalid format, prolog not terminated", pi->str, pi->s);
|
215
|
+
}
|
216
|
+
if ('?' == *pi->s) {
|
217
|
+
pi->s++;
|
218
|
+
}
|
219
|
+
if ('>' != *pi->s++) {
|
220
|
+
raise_error("invalid format, prolog not terminated", pi->str, pi->s);
|
221
|
+
}
|
222
|
+
if (0 != pi->pcb->add_prolog) {
|
223
|
+
pi->pcb->add_prolog(pi, version, encoding, standalone);
|
224
|
+
}
|
225
|
+
}
|
226
|
+
|
227
|
+
/* Entered after the "<!DOCTYPE" sequence plus the first character after
|
228
|
+
* that. Ready to read the rest. Returns error code.
|
229
|
+
*/
|
230
|
+
static void
|
231
|
+
read_doctype(PInfo pi) {
|
232
|
+
char *docType;
|
233
|
+
int depth = 1;
|
234
|
+
char c;
|
235
|
+
|
236
|
+
next_non_white(pi);
|
237
|
+
docType = pi->s;
|
238
|
+
while (1) {
|
239
|
+
c = *pi->s++;
|
240
|
+
if ('\0' == c) {
|
241
|
+
raise_error("invalid format, prolog not terminated", pi->str, pi->s);
|
242
|
+
} else if ('<' == c) {
|
243
|
+
depth++;
|
244
|
+
} else if ('>' == c) {
|
245
|
+
depth--;
|
246
|
+
if (0 == depth) { /* done, at the end */
|
247
|
+
break;
|
248
|
+
}
|
249
|
+
}
|
250
|
+
}
|
251
|
+
*pi->s = '\0';
|
252
|
+
pi->s++;
|
253
|
+
if (0 != pi->pcb->add_doctype) {
|
254
|
+
pi->pcb->add_doctype(pi, docType);
|
255
|
+
}
|
256
|
+
}
|
257
|
+
|
258
|
+
/* Entered after "<!--". Returns error code.
|
259
|
+
*/
|
260
|
+
static void
|
261
|
+
read_comment(PInfo pi) {
|
262
|
+
char *end;
|
263
|
+
char *s;
|
264
|
+
char *comment;
|
265
|
+
int done = 0;
|
266
|
+
|
267
|
+
next_non_white(pi);
|
268
|
+
comment = pi->s;
|
269
|
+
end = strstr(pi->s, "-->");
|
270
|
+
if (0 == end) {
|
271
|
+
raise_error("invalid format, comment not terminated", pi->str, pi->s);
|
272
|
+
}
|
273
|
+
for (s = end - 1; pi->s < s && !done; s--) {
|
274
|
+
switch(*s) {
|
275
|
+
case ' ':
|
276
|
+
case '\t':
|
277
|
+
case '\f':
|
278
|
+
case '\n':
|
279
|
+
case '\r':
|
280
|
+
break;
|
281
|
+
default:
|
282
|
+
*(s + 1) = '\0';
|
283
|
+
done = 1;
|
284
|
+
break;
|
285
|
+
}
|
286
|
+
}
|
287
|
+
*end = '\0'; // in case the comment was blank
|
288
|
+
pi->s = end + 3;
|
289
|
+
if (0 != pi->pcb->add_comment) {
|
290
|
+
pi->pcb->add_comment(pi, comment);
|
291
|
+
}
|
292
|
+
}
|
293
|
+
|
294
|
+
/* Entered after the '<' and the first character after that. Returns status
|
295
|
+
* code.
|
296
|
+
*/
|
297
|
+
static void
|
298
|
+
read_element(PInfo pi) {
|
299
|
+
struct _Attr attrs[MAX_ATTRS];
|
300
|
+
Attr ap = attrs;
|
301
|
+
char *name;
|
302
|
+
char *ename;
|
303
|
+
char *end;
|
304
|
+
char c;
|
305
|
+
long elen;
|
306
|
+
int hasChildren = 0;
|
307
|
+
int done = 0;
|
308
|
+
|
309
|
+
ename = read_name_token(pi);
|
310
|
+
end = pi->s;
|
311
|
+
elen = end - ename;
|
312
|
+
next_non_white(pi);
|
313
|
+
c = *pi->s;
|
314
|
+
*end = '\0';
|
315
|
+
if ('/' == c) {
|
316
|
+
/* empty element, no attributes and no children */
|
317
|
+
pi->s++;
|
318
|
+
if ('>' != *pi->s) {
|
319
|
+
printf("*** '%s'***\n", pi->s);
|
320
|
+
raise_error("invalid format, element not closed", pi->str, pi->s);
|
321
|
+
}
|
322
|
+
pi->s++; /* past > */
|
323
|
+
ap->name = 0;
|
324
|
+
pi->pcb->add_element(pi, ename, attrs, hasChildren);
|
325
|
+
pi->pcb->end_element(pi, ename);
|
326
|
+
|
327
|
+
return;
|
328
|
+
}
|
329
|
+
/* read attribute names until the close (/ or >) is reached */
|
330
|
+
while (!done) {
|
331
|
+
if ('\0' == c) {
|
332
|
+
next_non_white(pi);
|
333
|
+
c = *pi->s;
|
334
|
+
}
|
335
|
+
switch (c) {
|
336
|
+
case '\0':
|
337
|
+
raise_error("invalid format, document not terminated", pi->str, pi->s);
|
338
|
+
case '/':
|
339
|
+
// Element with just attributes.
|
340
|
+
pi->s++;
|
341
|
+
if ('>' != *pi->s) {
|
342
|
+
raise_error("invalid format, element not closed", pi->str, pi->s);
|
343
|
+
}
|
344
|
+
pi->s++;
|
345
|
+
ap->name = 0;
|
346
|
+
pi->pcb->add_element(pi, ename, attrs, hasChildren);
|
347
|
+
pi->pcb->end_element(pi, ename);
|
348
|
+
|
349
|
+
return;
|
350
|
+
case '>':
|
351
|
+
// has either children or a value
|
352
|
+
pi->s++;
|
353
|
+
hasChildren = 1;
|
354
|
+
done = 1;
|
355
|
+
ap->name = 0;
|
356
|
+
pi->pcb->add_element(pi, ename, attrs, hasChildren);
|
357
|
+
break;
|
358
|
+
default:
|
359
|
+
// Attribute name so it's an element and the attribute will be
|
360
|
+
// added to it.
|
361
|
+
ap->name = read_name_token(pi);
|
362
|
+
end = pi->s;
|
363
|
+
next_non_white(pi);
|
364
|
+
if ('=' != *pi->s++) {
|
365
|
+
raise_error("invalid format, no attribute value", pi->str, pi->s);
|
366
|
+
}
|
367
|
+
*end = '\0'; // terminate name
|
368
|
+
// read value
|
369
|
+
next_non_white(pi);
|
370
|
+
ap->value = read_quoted_value(pi);
|
371
|
+
ap++;
|
372
|
+
if (MAX_ATTRS <= (ap - attrs)) {
|
373
|
+
raise_error("too many attributes", pi->str, pi->s);
|
374
|
+
}
|
375
|
+
break;
|
376
|
+
}
|
377
|
+
c = '\0';
|
378
|
+
}
|
379
|
+
if (hasChildren) {
|
380
|
+
char *start;
|
381
|
+
|
382
|
+
done = 0;
|
383
|
+
// read children
|
384
|
+
while (!done) {
|
385
|
+
start = pi->s;
|
386
|
+
next_non_white(pi);
|
387
|
+
c = *pi->s++;
|
388
|
+
if ('\0' == c) {
|
389
|
+
raise_error("invalid format, document not terminated", pi->str, pi->s);
|
390
|
+
}
|
391
|
+
if ('<' == c) {
|
392
|
+
switch (*pi->s) {
|
393
|
+
case '!': /* better be a comment or CDATA */
|
394
|
+
pi->s++;
|
395
|
+
if ('-' == *pi->s && '-' == *(pi->s + 1)) {
|
396
|
+
pi->s += 2;
|
397
|
+
read_comment(pi);
|
398
|
+
} else if (0 == strncmp("[CDATA[", pi->s, 7)) {
|
399
|
+
pi->s += 7;
|
400
|
+
read_cdata(pi);
|
401
|
+
} else {
|
402
|
+
raise_error("invalid format, invalid comment or CDATA format", pi->str, pi->s);
|
403
|
+
}
|
404
|
+
break;
|
405
|
+
case '/':
|
406
|
+
pi->s++;
|
407
|
+
name = read_name_token(pi);
|
408
|
+
end = pi->s;
|
409
|
+
next_non_white(pi);
|
410
|
+
c = *pi->s;
|
411
|
+
*end = '\0';
|
412
|
+
if (0 != strcmp(name, ename)) {
|
413
|
+
raise_error("invalid format, elements overlap", pi->str, pi->s);
|
414
|
+
}
|
415
|
+
if ('>' != c) {
|
416
|
+
raise_error("invalid format, element not closed", pi->str, pi->s);
|
417
|
+
}
|
418
|
+
pi->s++;
|
419
|
+
pi->pcb->end_element(pi, ename);
|
420
|
+
return;
|
421
|
+
case '\0':
|
422
|
+
raise_error("invalid format, document not terminated", pi->str, pi->s);
|
423
|
+
default:
|
424
|
+
// a child element
|
425
|
+
read_element(pi);
|
426
|
+
break;
|
427
|
+
}
|
428
|
+
} else { // read as TEXT
|
429
|
+
pi->s = start;
|
430
|
+
//pi->s--;
|
431
|
+
read_text(pi);
|
432
|
+
// to exit read_text with no errors the next character must be <
|
433
|
+
if ('/' == *(pi->s + 1) &&
|
434
|
+
0 == strncmp(ename, pi->s + 2, elen) &&
|
435
|
+
'>' == *(pi->s + elen + 2)) {
|
436
|
+
// close tag after text so treat as a value
|
437
|
+
pi->s += elen + 3;
|
438
|
+
pi->pcb->end_element(pi, ename);
|
439
|
+
return;
|
440
|
+
}
|
441
|
+
}
|
442
|
+
}
|
443
|
+
}
|
444
|
+
}
|
445
|
+
|
446
|
+
static void
|
447
|
+
read_text(PInfo pi) {
|
448
|
+
char buf[MAX_TEXT_LEN];
|
449
|
+
char *b = buf;
|
450
|
+
char *alloc_buf = 0;
|
451
|
+
char *end = b + sizeof(buf) - 2;
|
452
|
+
char c;
|
453
|
+
int spc = 0;
|
454
|
+
int done = 0;
|
455
|
+
|
456
|
+
while (!done) {
|
457
|
+
c = *pi->s++;
|
458
|
+
switch(c) {
|
459
|
+
case ' ':
|
460
|
+
case '\t':
|
461
|
+
case '\f':
|
462
|
+
case '\n':
|
463
|
+
case '\r':
|
464
|
+
spc = 1;
|
465
|
+
break;
|
466
|
+
case '<':
|
467
|
+
done = 1;
|
468
|
+
pi->s--;
|
469
|
+
break;
|
470
|
+
case '\0':
|
471
|
+
raise_error("invalid format, document not terminated", pi->str, pi->s);
|
472
|
+
default:
|
473
|
+
if ('&' == c) {
|
474
|
+
c = read_coded_char(pi);
|
475
|
+
}
|
476
|
+
if (end <= b + spc) {
|
477
|
+
unsigned long size;
|
478
|
+
|
479
|
+
if (0 != alloc_buf) {
|
480
|
+
size = sizeof(buf) * 2;
|
481
|
+
if (0 == (alloc_buf = (char*)malloc(size))) {
|
482
|
+
raise_error("text too long", pi->str, pi->s);
|
483
|
+
}
|
484
|
+
memcpy(alloc_buf, buf, b - buf);
|
485
|
+
b = alloc_buf + (b - buf);
|
486
|
+
} else {
|
487
|
+
unsigned long pos = b - alloc_buf;
|
488
|
+
|
489
|
+
size = (end - alloc_buf) * 2;
|
490
|
+
if (0 == (alloc_buf = (char*)realloc(alloc_buf, size))) {
|
491
|
+
raise_error("text too long", pi->str, pi->s);
|
492
|
+
}
|
493
|
+
b = alloc_buf + pos;
|
494
|
+
}
|
495
|
+
end = alloc_buf + size;
|
496
|
+
}
|
497
|
+
if (spc) {
|
498
|
+
*b++ = ' ';
|
499
|
+
}
|
500
|
+
spc = 0;
|
501
|
+
*b++ = c;
|
502
|
+
break;
|
503
|
+
}
|
504
|
+
}
|
505
|
+
*b = '\0';
|
506
|
+
pi->pcb->add_text(pi, buf, ('/' == *(pi->s + 1)));
|
507
|
+
if (0 != alloc_buf) {
|
508
|
+
free(alloc_buf);
|
509
|
+
}
|
510
|
+
}
|
511
|
+
|
512
|
+
static char*
|
513
|
+
read_name_token(PInfo pi) {
|
514
|
+
char *start;
|
515
|
+
|
516
|
+
next_non_white(pi);
|
517
|
+
start = pi->s;
|
518
|
+
for (; 1; pi->s++) {
|
519
|
+
switch (*pi->s) {
|
520
|
+
case ' ':
|
521
|
+
case '\t':
|
522
|
+
case '\f':
|
523
|
+
case '?':
|
524
|
+
case '=':
|
525
|
+
case '/':
|
526
|
+
case '>':
|
527
|
+
case '\n':
|
528
|
+
case '\r':
|
529
|
+
return start;
|
530
|
+
case '\0':
|
531
|
+
// documents never terminate after a name token
|
532
|
+
raise_error("invalid format, document not terminated", pi->str, pi->s);
|
533
|
+
break; // to avoid warnings
|
534
|
+
default:
|
535
|
+
break;
|
536
|
+
}
|
537
|
+
}
|
538
|
+
return start;
|
539
|
+
}
|
540
|
+
|
541
|
+
static void
|
542
|
+
read_cdata(PInfo pi) {
|
543
|
+
char *start;
|
544
|
+
char *end;
|
545
|
+
|
546
|
+
start = pi->s;
|
547
|
+
end = strstr(pi->s, "]]>");
|
548
|
+
if (end == 0) {
|
549
|
+
raise_error("invalid format, CDATA not terminated", pi->str, pi->s);
|
550
|
+
}
|
551
|
+
*end = '\0';
|
552
|
+
pi->s = end + 3;
|
553
|
+
if (0 != pi->pcb->add_cdata) {
|
554
|
+
pi->pcb->add_cdata(pi, start, end - start);
|
555
|
+
}
|
556
|
+
}
|
557
|
+
|
558
|
+
/* Assume the value starts immediately and goes until the quote character is
|
559
|
+
* reached again. Do not read the character after the terminating quote.
|
560
|
+
*/
|
561
|
+
static char*
|
562
|
+
read_quoted_value(PInfo pi) {
|
563
|
+
char *value;
|
564
|
+
|
565
|
+
if ('"' != *pi->s) {
|
566
|
+
raise_error("invalid format, expected a quote character", pi->str, pi->s);
|
567
|
+
}
|
568
|
+
pi->s++; // skip quote character
|
569
|
+
value = pi->s;
|
570
|
+
for (; *pi->s != '"'; pi->s++) {
|
571
|
+
if ('\0' == *pi->s) {
|
572
|
+
raise_error("invalid format, document not terminated", pi->str, pi->s);
|
573
|
+
}
|
574
|
+
}
|
575
|
+
*pi->s = '\0'; // terminate value
|
576
|
+
pi->s++; // move past quote
|
577
|
+
|
578
|
+
return value;
|
579
|
+
}
|
580
|
+
|
581
|
+
static int
|
582
|
+
read_coded_char(PInfo pi) {
|
583
|
+
char *b, buf[8];
|
584
|
+
char *end = buf + sizeof(buf);
|
585
|
+
char *s;
|
586
|
+
int c;
|
587
|
+
|
588
|
+
for (b = buf, s = pi->s; b < end; b++, s++) {
|
589
|
+
if (';' == *s) {
|
590
|
+
*b = '\0';
|
591
|
+
s++;
|
592
|
+
break;
|
593
|
+
}
|
594
|
+
*b = *s;
|
595
|
+
}
|
596
|
+
if (b > end) {
|
597
|
+
return *pi->s;
|
598
|
+
}
|
599
|
+
if ('#' == *buf) {
|
600
|
+
c = (int)strtol(buf + 1, &end, 10);
|
601
|
+
if (0 >= c || '\0' != *end) {
|
602
|
+
return *pi->s;
|
603
|
+
}
|
604
|
+
pi->s = s;
|
605
|
+
|
606
|
+
return c;
|
607
|
+
}
|
608
|
+
if (0 == strcasecmp(buf, "nbsp")) {
|
609
|
+
pi->s = s;
|
610
|
+
return ' ';
|
611
|
+
} else if (0 == strcasecmp(buf, "lt")) {
|
612
|
+
pi->s = s;
|
613
|
+
return '<';
|
614
|
+
} else if (0 == strcasecmp(buf, "gt")) {
|
615
|
+
pi->s = s;
|
616
|
+
return '>';
|
617
|
+
} else if (0 == strcasecmp(buf, "amp")) {
|
618
|
+
pi->s = s;
|
619
|
+
return '&';
|
620
|
+
} else if (0 == strcasecmp(buf, "quot")) {
|
621
|
+
pi->s = s;
|
622
|
+
return '"';
|
623
|
+
} else if (0 == strcasecmp(buf, "apos")) {
|
624
|
+
pi->s = s;
|
625
|
+
return '\'';
|
626
|
+
}
|
627
|
+
return *pi->s;
|
628
|
+
}
|
629
|
+
|